├── .gitignore
├── dune-project
├── src
    ├── test
    │   ├── dune
    │   └── test.ml
    ├── lib
    │   ├── dune
    │   ├── EMA.mli
    │   ├── EMA.ml
    │   ├── Uniformize.mli
    │   └── Uniformize.ml
    └── bin
    │   ├── dune
    │   └── main.ml
├── Makefile
├── README.md
├── uniformize.opam
├── .ocp-indent
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | _opam
3 | .merlin
4 | 


--------------------------------------------------------------------------------
/dune-project:
--------------------------------------------------------------------------------
1 | (lang dune 2.6)
2 | (name uniformization)
3 | 


--------------------------------------------------------------------------------
/src/test/dune:
--------------------------------------------------------------------------------
1 | (executable
2 |   (name test)
3 |   (libraries uniformize alcotest)
4 | )
5 | 


--------------------------------------------------------------------------------
/src/lib/dune:
--------------------------------------------------------------------------------
1 | (library
2 |   (public_name uniformize)
3 |   (name uniformize)
4 |   (libraries )
5 | )
6 | 


--------------------------------------------------------------------------------
/src/bin/dune:
--------------------------------------------------------------------------------
1 | (executable
2 |   (public_name uniformize)
3 |   (name main)
4 |   (libraries
5 |     uniformize
6 |     cmdliner
7 |   )
8 | )
9 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: build
 2 | build:
 3 | 	dune build
 4 | 
 5 | test:
 6 | 	dune exec src/test/test.exe
 7 | 
 8 | .PHONY: install
 9 | install:
10 | 	dune install
11 | 
12 | .PHONY: clean
13 | clean:
14 | 	 git clean -dfX
15 | 


--------------------------------------------------------------------------------
/src/test/test.ml:
--------------------------------------------------------------------------------
 1 | (*
 2 |    Entrypoint to run the unit tests from the command line.
 3 | *)
 4 | 
 5 | let make_alcotest_suite tests_list : unit Alcotest.test list =
 6 |   List.map (fun (name, tests) ->
 7 |     name, List.map (fun (name, f) -> (name, `Quick, f)) tests
 8 |   ) tests_list
 9 | 
10 | let test_suites =
11 |   make_alcotest_suite [
12 |     "EMA", Uniformize.ema_tests;
13 |     "Uniformize", Uniformize.tests;
14 |   ]
15 | 
16 | let main () = Alcotest.run "uniformize" test_suites
17 | 
18 | let () = main ()
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Dynamic Signal Uniformization
 2 | 
 3 | This is an OCaml library and command-line executable for mapping a
 4 | signal into a uniform distribution over [0, 1].
 5 | 
 6 | It is a complete, tested implementation of the heuristic outlined in
 7 | this article:
 8 | 
 9 | Martin Jambon, [Dynamic Signal
10 | Uniformization](https://mjambon.com/2020-06-28-dynamic-signal-uniformization/),
11 | June 2020.
12 | 
13 | Installation
14 | --
15 | 
16 | Install [opam](https://opam.ocaml.org/doc/Install.html), then run
17 | ```
18 | opam install .
19 | ```
20 | 
21 | This installs a `uniformize` executable as well as the OCaml library
22 | of the same name.
23 | 
24 | Project status
25 | --
26 | 
27 | This source code was extracted from a private project in the hope that
28 | it will be useful. See [license](LICENSE) for use and redistribution rights.
29 | 


--------------------------------------------------------------------------------
/uniformize.opam:
--------------------------------------------------------------------------------
 1 | opam-version: "2.0"
 2 | maintainer: "martin@mjambon.com"
 3 | authors: ["Martin Jambon"]
 4 | homepage: "https://github.com/mjambon/uniformization"
 5 | bug-reports: "https://github.com/mjambon/uniformization/issues"
 6 | dev-repo: "git+https://github.com/mjambon/uniformization.git"
 7 | license: "BSD-3-Clause"
 8 | 
 9 | build: [
10 |   ["dune" "build" "-p" name "-j" jobs]
11 | ]
12 | 
13 | depends: [
14 |   "alcotest"
15 |   "cmdliner"
16 |   "dune" {>= "2.1"}
17 |   "ocaml"
18 | ]
19 | 
20 | synopsis: "Show dependency graph of a multi-component dune project"
21 | 
22 | description: """
23 | This library implements dynamic signal uniformization. It transforms an
24 | input signal of unknown distribution into a signal that is uniformly
25 | distributed.
26 | """
27 | 
28 | url {
29 |   src: "git+https://github.com/mjambon/uniformization"
30 | }
31 | 


--------------------------------------------------------------------------------
/src/lib/EMA.mli:
--------------------------------------------------------------------------------
 1 | (*
 2 |    Exponential moving average.
 3 | 
 4 |    See https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
 5 | *)
 6 | 
 7 | type param
 8 | 
 9 | type t = private {
10 |   param: param;
11 | 
12 |   mutable m: float;
13 |     (* Current estimate of the average, initially nan. *)
14 | 
15 |   mutable age: int;
16 |     (* Number of observations. *)
17 | }
18 | 
19 | val default_alpha : float
20 | val create_param : ?alpha:float -> unit -> param
21 | val default_param : param
22 | 
23 | val init : ?param:param -> unit -> t
24 | 
25 | val update : t -> float -> unit
26 | 
27 | val get : t -> float
28 |   (* Get the current average. *)
29 | 
30 | val get_obs_count : t -> int
31 |   (* Return the number of observations. *)
32 | 
33 | val of_list : ?alpha:float -> float list -> float
34 |   (* Shorthand for getting the EMA from a list of numbers.
35 |      This is for evaluation purposes. *)
36 | 
37 | val tests : (string * (unit -> unit)) list
38 | 


--------------------------------------------------------------------------------
/.ocp-indent:
--------------------------------------------------------------------------------
 1 | # See https://github.com/OCamlPro/ocp-indent/blob/master/.ocp-indent for more
 2 | 
 3 | # Indent for clauses inside a pattern-match (after the arrow):
 4 | #    match foo with
 5 | #    | _ ->
 6 | #    ^^^^bar
 7 | # the default is 2, which aligns the pattern and the expression
 8 | match_clause = 4
 9 | 
10 | # When nesting expressions on the same line, their indentation are in
11 | # some cases stacked, so that it remains correct if you close them one
12 | # at a line. This may lead to large indents in complex code though, so
13 | # this parameter can be used to set a maximum value. Note that it only
14 | # affects indentation after function arrows and opening parens at end
15 | # of line.
16 | #
17 | # for example (left: `none`; right: `4`)
18 | #    let f = g (h (i (fun x ->     #    let f = g (h (i (fun x ->
19 | #          x)                      #        x)
20 | #        )                         #      )
21 | #      )                           #    )
22 | max_indent = 2
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2020, Martin Jambon
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/src/lib/EMA.ml:
--------------------------------------------------------------------------------
 1 | (* Exponential moving average. See .mli *)
 2 | 
 3 | type param = {
 4 |   alpha: float;
 5 |   age_min: int; (* derived from alpha *)
 6 | }
 7 | 
 8 | type t = {
 9 |   param: param;
10 |   mutable m: float;
11 |   mutable age: int;
12 | }
13 | 
14 | let default_alpha = 0.1
15 | 
16 | let create_param ?(alpha = default_alpha) () =
17 |   if not (alpha >= 0. && alpha <= 1.) then
18 |     invalid_arg "EMA.create_param: invalid alpha";
19 |   let age_min = truncate (ceil (1. /. alpha)) in
20 |   {
21 |     alpha;
22 |     age_min;
23 |   }
24 | 
25 | let default_param = create_param ()
26 | 
27 | let init ?(param = default_param) () =
28 |   {
29 |     param;
30 |     m = nan;
31 |     age = 0
32 |   }
33 | 
34 | let update_age state =
35 |   let age = state.age in
36 |   if age >= 0 then
37 |     state.age <- age + 1
38 | 
39 | let update state x =
40 |   if x <> x then
41 |     invalid_arg "EMA.update: not a number";
42 |   update_age state;
43 |   let param = state.param in
44 |   let alpha =
45 |     if state.age > param.age_min then
46 |       param.alpha
47 |     else (
48 |       if state.age = 1 then
49 |         (* replace nan to avoid contamination when it's multiplied by 0 *)
50 |         state.m <- 0.;
51 |       (* arithmetic mean until we reach 1/r observations *)
52 |       1. /. float state.age
53 |     )
54 |   in
55 |   state.m <- (1. -. alpha) *. state.m +. alpha *. x
56 | 
57 | let get state = state.m
58 | 
59 | let get_obs_count state = state.age
60 | 
61 | let of_list ?(alpha = default_alpha) data =
62 |   let param = create_param ~alpha () in
63 |   let state = init ~param () in
64 |   List.iter (update state) data;
65 |   get state
66 | 
67 | (* Equality within some absolute error 'err'. *)
68 | let float_eq ?(err = 1e-6) a b =
69 |   if not (err >= 0.) then
70 |     invalid_arg "Float.eq: invalid 'err' parameter";
71 |   abs_float (a -. b) <= err
72 | 
73 | let test_initial_samples () =
74 |   let ( =~ ) = float_eq ~err:1e-6 in
75 |   let ema l = of_list ~alpha:0.5 l in
76 |   assert (ema [10.] =~ 10.);
77 |   assert (ema [10.; 20.] =~ 15.);
78 |   assert (ema [10.; 20.; 25.] =~ 20.)
79 | 
80 | let test_convergence () =
81 |   let ( =~ ) = float_eq ~err:1e-3 in
82 |   let ema l = of_list ~alpha:0.5 l in
83 |   assert (ema [0.; 1.] =~ 0.5);
84 |   assert (ema [0.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1. ] =~ 1.)
85 | 
86 | let tests = [
87 |   "initial samples", test_initial_samples;
88 |   "convergence", test_convergence;
89 | ]
90 | 


--------------------------------------------------------------------------------
/src/bin/main.ml:
--------------------------------------------------------------------------------
 1 | (*
 2 |    Entrypoint for command line normalizer.
 3 | *)
 4 | 
 5 | open Printf
 6 | open Cmdliner
 7 | 
 8 | type config = {
 9 |   ema_alpha: float;
10 |   num_bins: int;
11 |   flush_line: bool;
12 | }
13 | 
14 | let map ~flush_line tracker ic oc =
15 |   try
16 |     while true do
17 |       let x = input_line ic |> float_of_string in
18 |       let y = Uniformize.map tracker x in
19 |       printf "%g\n" y;
20 |       if flush_line then
21 |         flush oc
22 |     done;
23 |     assert false
24 |   with End_of_file ->
25 |     ()
26 | 
27 | let run config ic oc =
28 |   let tracker =
29 |     Uniformize.create
30 |       ~ema_alpha:config.ema_alpha
31 |       ~num_bins:config.num_bins
32 |       ()
33 |   in
34 |   map ~flush_line:config.flush_line tracker ic oc
35 | 
36 | let ema_alpha_term =
37 |   let info =
38 |     Arg.info ["ema-alpha"; "a"]
39 |       ~docv:"NUM"
40 |       ~doc:"$(docv) specifies the alpha parameter used by the exponential
41 |             moving average (EMA). It must be a value within (0, 1).
42 |             A greater value gives a greater weight to the recent samples."
43 |   in
44 |   Arg.value (Arg.opt Arg.float Uniformize.default_ema_alpha info)
45 | 
46 | let num_bins_term =
47 |   let info =
48 |     Arg.info ["num-bins"; "n"]
49 |       ~docv:"N"
50 |       ~doc:"$(docv) specifies the number of points to use to model the
51 |             input distribution.
52 |             A greater value increases the accuracy with which the
53 |             original distribution is modeled but increases memory usage
54 |             accordingly."
55 |   in
56 |   Arg.value (Arg.opt Arg.int Uniformize.default_num_bins info)
57 | 
58 | let flush_line_term =
59 |   let info =
60 |     Arg.info ["flush-line"; "f"]
61 |      ~doc:"Print each line of output as soon as it is available."
62 |   in
63 |   Arg.value (Arg.flag info)
64 | 
65 | let cmdline_term =
66 |   let combine ema_alpha num_bins flush_line =
67 |     { ema_alpha; num_bins; flush_line }
68 |   in
69 |   Term.(const combine
70 |         $ ema_alpha_term
71 |         $ num_bins_term
72 |         $ flush_line_term
73 |        )
74 | 
75 | let doc =
76 |   "make an input signal uniformly distributed over [0, 1]"
77 | 
78 | let parse_command_line () =
79 |   let info =
80 |     Term.info
81 |       ~doc
82 |       "uniformize"
83 |   in
84 |   match Term.eval (cmdline_term, info) with
85 |   | `Error _ -> exit 1
86 |   | `Version | `Help -> exit 0
87 |   | `Ok config -> config
88 | 
89 | let main () =
90 |   let config = parse_command_line () in
91 |   run config stdin stdout
92 | 
93 | let () = main ()
94 | 


--------------------------------------------------------------------------------
/src/lib/Uniformize.mli:
--------------------------------------------------------------------------------
 1 | (**
 2 |    Dynamic Signal Uniformization.
 3 | 
 4 |    An algorithm for tracking a distribution and estimating the rank of
 5 |    sample values.
 6 | 
 7 |    Algorithm outline
 8 | 
 9 |    Tracker update cycle:
10 |    1. Create an array of n bins.
11 |    2. Accumulate n sample points (floats) into a buffer.
12 |    3. Sort the buffer of n sample points.
13 |    4. Add the data of the sorted buffer to the corresponding bin, which
14 |       tracks an exponential moving average of the samples it receives.
15 |    5. Go to 2.
16 | 
17 |    Rank estimation:
18 |    Given a query, we use binary search to find the two best bins whose value
19 |    can be used to estimate the rank of the query using linear interpolation.
20 |    The rank is normalized to range from 0 to 1.
21 | 
22 |    Queries that fall below the value of the lowest bin and above the value of
23 |    the greatest bin use linear interpolation based on the first two or last
24 |    two bins. If this results in a estimated rank outside of the allowed range
25 |    [0, 1], it is rounded to 0 to 1. Therefore, two different extreme values
26 |    may result in the same rank of 0 or 1. So the following doesn't hold
27 |    for extreme values but holds elsewhere:
28 | 
29 |      a < b => rank(a) < rank(b)
30 | 
31 |    If this turns out to be a critical property, it can be implemented using
32 |    a non-linear function (for example, an exponential) that converges smoothly
33 |    toward 0 (resp. 1) without reaching it. (See scanned figure in personal
34 |    notes).
35 | 
36 |    This implementation dates back from April 2019 and was referred to
37 |    privately as rank-normalization. It was published in June 2020.
38 |    Copyright (c) 2020 Martin Jambon
39 | *)
40 | 
41 | (**
42 |    The type of a distribution tracker.
43 | *)
44 | type t
45 | 
46 | val default_ema_alpha : float
47 | val default_num_bins : int
48 | 
49 | (**
50 |    Create a rank tracker.
51 | 
52 |    @param num_bins specifies the number of bins to use to model the
53 |    distribution.
54 | 
55 |    @param ema_alpha specifies the alpha parameter used when updating the
56 |    exponential moving averages used to represent each bin.
57 | *)
58 | val create : ?ema_alpha:float -> ?num_bins:int -> unit -> t
59 | 
60 | val get_num_bins : t -> int
61 | 
62 | (**
63 |    Add a sample to the tracker. This is a write-only operation.
64 | *)
65 | val add : t -> float -> unit
66 | 
67 | (**
68 |    Estimate the rank of a sample. The result ranges from [0.] to [1.].
69 |    Half of the samples matching the lowest bin will result in exactly [0.]
70 |    and half of the samples matching the greatest bin will result in exactly
71 |    [1.].
72 | 
73 |    This is a read-only operation.
74 | *)
75 | val normalize : t -> float -> float
76 | 
77 | (**
78 |    Add an input sample to the tracker and return its estimated rank.
79 | 
80 |    This the read-write operation that combines [add] and [normalize].
81 | *)
82 | val map : t -> float -> float
83 | 
84 | val ema_tests : (string * (unit -> unit)) list
85 | val tests : (string * (unit -> unit)) list
86 | 


--------------------------------------------------------------------------------
/src/lib/Uniformize.ml:
--------------------------------------------------------------------------------
  1 | (* see mli *)
  2 | 
  3 | open Printf
  4 | 
  5 | let debug = false
  6 | 
  7 | type buffer = {
  8 |   buf_array : float array;
  9 |   mutable buf_count : int;
 10 | }
 11 | 
 12 | type t = {
 13 |   num_bins : int;
 14 |   mutable num_rounds : int;
 15 |   averages : EMA.t array;
 16 |   buffer : buffer;
 17 | }
 18 | 
 19 | let default_ema_alpha = 0.01
 20 | let default_num_bins = 101
 21 | 
 22 | let get_num_bins t = t.num_bins
 23 | 
 24 | let create_buffer capacity =
 25 |   {
 26 |     buf_array = Array.make capacity nan;
 27 |     buf_count = 0;
 28 |   }
 29 | 
 30 | let create
 31 |     ?(ema_alpha = default_ema_alpha)
 32 |     ?(num_bins = default_num_bins)
 33 |     () =
 34 | 
 35 |   if num_bins <= 0 then
 36 |     invalid_arg (
 37 |       sprintf "Rank_normalize.create: num_bins=%i must be >= 1"
 38 |         num_bins
 39 |     );
 40 |   let ema_param = EMA.create_param ~alpha:ema_alpha () in
 41 |   let averages =
 42 |     Array.init num_bins (fun _ -> EMA.init ~param:ema_param ())
 43 |   in
 44 |   {
 45 |     num_bins;
 46 |     num_rounds = 0;
 47 |     averages;
 48 |     buffer = create_buffer num_bins;
 49 |   }
 50 | 
 51 | let add_array averages samples =
 52 |   Array.sort (compare : float -> float -> int) samples;
 53 |   Array.iteri (fun i x ->
 54 |     EMA.update averages.(i) x
 55 |   ) samples
 56 | 
 57 | let add t x =
 58 |   if not (Float.is_finite x) then
 59 |     invalid_arg (sprintf "Rank_normalize.add: %g" x);
 60 |   let n = t.num_bins in
 61 |   let b = t.buffer in
 62 |   let pos = b.buf_count in
 63 |   b.buf_array.(pos) <- x;
 64 |   let count = pos + 1 in
 65 |   if count = n then (
 66 |     add_array t.averages b.buf_array;
 67 |     b.buf_count <- 0;
 68 |     t.num_rounds <- t.num_rounds + 1;
 69 |   )
 70 |   else
 71 |     b.buf_count <- count
 72 | 
 73 | (*
 74 |    "Procedure for finding the leftmost element" from Wikipedia.
 75 | 
 76 |    Returns a whole rank, as you would obtain if you were inserting the
 77 |    performance of a contestant and calculate their rank. Therefore,
 78 |    this cannot return a rank less than 0 (first) but can return a rank of
 79 |    n (new dead last among n+1). In case of a tie, the most favorable (lowest)
 80 |    rank is returned.
 81 | 
 82 |    This assumes a sorted array and returns the leftmost position such that
 83 |    all elements to its left are strictly lower than the query.
 84 |    See the tests for examples.
 85 | 
 86 |   function binary_search_leftmost(A, n, T):
 87 |     L := 0
 88 |     R := n
 89 |     while L < R:
 90 |         m := floor((L + R) / 2)
 91 |         if A[m] < T:
 92 |             L := m + 1
 93 |         else:
 94 |             R := m
 95 |     return L
 96 | *)
 97 | let find_rank_from_left get a x =
 98 |   let n = Array.length a in
 99 |   let left = ref 0 in
100 |   let right = ref n in
101 |   while !left < !right do
102 |     let m = (!left + !right) / 2 in
103 |     if get a m < x then
104 |       left := m + 1
105 |     else
106 |       right := m
107 |   done;
108 |   !left
109 | 
110 | let string_of_array a =
111 |    a |> Array.map string_of_float |> Array.to_list |> String.concat " "
112 | 
113 | let test_find_rank_from_left () =
114 |   let f a x =
115 |     let rank = find_rank_from_left Array.get a x in
116 |     if debug then
117 |       printf "[%s] query: %g -> rank: %i\n" (string_of_array a) x rank;
118 |     rank
119 |   in
120 |   assert (f [| 1. |] 2. = 1);
121 |   assert (f [| 1. |] 1. = 0);
122 |   assert (f [| 1. |] 0. = 0);
123 |   assert (f [| 1.; 2. |] 1.5 = 1);
124 |   assert (f [| 1.; 2. |] 2.5 = 2);
125 |   assert (f [| 1.; 2.; 3. |] 2.5 = 2);
126 |   assert (f [| 1.; 2.; 2.; 3. |] 2. = 1);
127 |   assert (f [| 1.; 2.; 2.; 3. |] 2.5 = 3);
128 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 1);
129 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 2.5 = 4)
130 | 
131 | (*
132 |    Assuming a sorted array and a query, find the position of the most suitable
133 |    lower value to be used in linear interpolation.
134 | 
135 |    [ 3; 4 ], query = 3.5 -> 0
136 |    [ 3; 4 ], query = 3   -> 0
137 |    [ 3; 4 ], query = 1   -> -1   (out of bounds on purpose)
138 |    [ 3; 4 ], query = 6   -> 1
139 | 
140 |    See the tests for examples.
141 | *)
142 | let find_inf get a x =
143 |   if Array.length a = 0 then
144 |     invalid_arg "Rank_normalize.find_inf: empty array";
145 |   let rank = find_rank_from_left get a x in
146 |   if rank = Array.length a || x < get a rank then
147 |     rank - 1
148 |   else
149 |     rank
150 | 
151 | let test_find_inf () =
152 |   let inf a x =
153 |     let inf = find_inf Array.get a x in
154 |     printf "[%s] query: %g -> inf: %i\n" (string_of_array a) x inf;
155 |     inf
156 |   in
157 |   assert (inf [| 1. |] 2. = 0);
158 |   assert (inf [| 1. |] 1. = 0);
159 |   assert (inf [| 1. |] 0. = -1);
160 |   assert (inf [| 1.; 2. |] 1.5 = 0);
161 |   assert (inf [| 1.; 2. |] 2.5 = 1);
162 |   assert (inf [| 1.; 2.; 3. |] 2.5 = 1);
163 |   assert (inf [| 1.; 2.; 2.; 3. |] 2. = 1);
164 |   assert (inf [| 1.; 2.; 2.; 3. |] 2.5 = 2);
165 |   assert (inf [| 1.; 2.; 2.; 2.; 3. |] 2. = 1);
166 |   assert (inf [| 1.; 2.; 2.; 2.; 3. |] 2.5 = 3)
167 | 
168 | (*
169 |    "Procedure for finding the rightmost element" from Wikipedia.
170 |    Same as find_rank_from_left, but determines the rank from the last position.
171 | 
172 |    See the tests for examples.
173 | 
174 |    Original code from Wikipedia:
175 | 
176 |    function binary_search_rightmost(A, n, T):
177 |     L := 0
178 |     R := n
179 |     while L < R:
180 |         m := floor((L + R) / 2)
181 |         if A[m] <= T:
182 |             L := m + 1
183 |         else:
184 |             R := m
185 |     return L - 1
186 | *)
187 | let find_rank_from_right get a x =
188 |   let n = Array.length a in
189 |   let left = ref 0 in
190 |   let right = ref n in
191 |   while !left < !right do
192 |     let m = (!left + !right) / 2 in
193 |     if get a m <= x then
194 |       left := m + 1
195 |     else
196 |       right := m
197 |   done;
198 |   !right - 1
199 | 
200 | let test_find_rank_from_right () =
201 |   let f a x =
202 |     let rank = find_rank_from_right Array.get a x in
203 |     printf "[%s] query: %g -> rank: %i\n" (string_of_array a) x rank;
204 |     rank
205 |   in
206 |   assert (f [| 1. |] 2. = 0);
207 |   assert (f [| 1. |] 1. = 0);
208 |   assert (f [| 1. |] 0. = -1);
209 |   assert (f [| 1.; 2. |] 1.5 = 0);
210 |   assert (f [| 1.; 2. |] 0.5 = -1);
211 |   assert (f [| 0.; 1.; 2.; |] 0.5 = 0);
212 |   assert (f [| 1.; 2.; 2.; 3. |] 2. = 2);
213 |   assert (f [| 1.; 2.; 2.; 3. |] 1.5 = 0);
214 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 3);
215 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 1.5 = 0)
216 | 
217 | (*
218 |    Assuming a sorted array and a query, find the position of the most suitable
219 |    higher value to be used in linear interpolation.
220 | 
221 |    [ 3; 4 ], query = 3.5 -> 1
222 |    [ 3; 4 ], query = 3   -> 0
223 |    [ 3; 4 ], query = 1   -> 0
224 |    [ 3; 4 ], query = 6   -> 2  (out of bounds on purpose)
225 | 
226 |    See the tests for examples.
227 | *)
228 | let find_sup get a x =
229 |   if Array.length a = 0 then
230 |     invalid_arg "Rank_normalize.find_sup: empty array";
231 |   let rank = find_rank_from_right get a x in
232 |   if rank = -1 || x > get a rank then
233 |     rank + 1
234 |   else
235 |     rank
236 | 
237 | let test_find_sup () =
238 |   let f a x =
239 |     let sup = find_sup Array.get a x in
240 |     printf "[%s] query: %g -> sup: %i\n" (string_of_array a) x sup;
241 |     sup
242 |   in
243 |   assert (f [| 1. |] 2. = 1);
244 |   assert (f [| 1. |] 1. = 0);
245 |   assert (f [| 1. |] 0. = 0);
246 |   assert (f [| 1.; 2. |] 1.5 = 1);
247 |   assert (f [| 1.; 2. |] 0.5 = 0);
248 |   assert (f [| 0.; 1.; 2.; |] 0.5 = 1);
249 |   assert (f [| 1.; 2.; 2.; 3. |] 2. = 2);
250 |   assert (f [| 1.; 2.; 2.; 3. |] 1.5 = 1);
251 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 3);
252 |   assert (f [| 1.; 2.; 2.; 2.; 3. |] 1.5 = 1)
253 | 
254 | (*
255 |    Estimate the rank of a query in an unsorted array by scanning the whole
256 |    array.
257 |    This is used in the initialization phase, before num_bins have been added
258 |    to the tracker.
259 | *)
260 | let find_rank_in_buffer b x =
261 |   let n = b.buf_count in
262 |   match n with
263 |   | 0 | 1 -> 0.5
264 |   | _ ->
265 |       (*
266 |          left_count and right_count are the number of elements lower than
267 |          and greater than the query.
268 |          We don't keep track of which elements are closest to the query,
269 |          so don't do any interpolation to obtain a non-discrete rank.
270 |       *)
271 |       let left_count = ref 0 in
272 |       let right_count = ref 0 in
273 |       let a = b.buf_array in
274 |       for i = 0 to n - 1 do
275 |         let c = compare a.(i) x in
276 |         if c < 0 then
277 |           incr left_count
278 |         else if c > 0 then
279 |           incr right_count
280 |       done;
281 |       assert (!left_count + !right_count <= n);
282 |       (* inf and sup are the indices, possibly out of bounds, of the positions
283 |          of the elements surrounding the query in a sorted array:
284 |          query = 2
285 |          [| 3; 4.5 |] -> inf = -1, sup = 0
286 |          [| 0; 2; 2; 3.1 |] -> inf = 1, sup = 2
287 |          [| 0; 0 |] -> inf = 1, sup = 2
288 |       *)
289 |       let inf = !left_count - 1 in
290 |       let sup = n - !right_count in
291 |       assert (inf < sup);
292 |       (*
293 |          Index i represents the bin whose ranks span [i, i+1] and of average
294 |          rank i + 0.5.
295 |          Hence, we add 0.5 to inf and 0.5 to sup to obtain the average rank
296 |          of the corresponding bins.
297 |          The rank of the query is the average of the ranks of the bins
298 |          inf and sup.
299 |       *)
300 |       let rank = 0.5 *. float (inf + sup + 1) in
301 |       rank /. float n
302 | 
303 | let test_find_rank_in_buffer () =
304 |   let f array query =
305 |     let b = {
306 |       buf_array = Array.copy array;
307 |       buf_count = Array.length array;
308 |     } in
309 |     let rank = find_rank_in_buffer b query in
310 |     printf "[%s] query: %g -> rank = %g\n" (string_of_array array) query rank;
311 |     rank
312 |   in
313 |   assert (f [| |] 2. = 0.5);
314 |   assert (f [| 3. |] 2. = 0.5);
315 |   assert (f [| 3. |] 4. = 0.5);
316 |   assert (f [| 3.; 4. |] 1. = 0.);
317 |   assert (f [| 3.; 4. |] 3. = 0.25);
318 |   assert (f [| 4.; 3. |] 3. = 0.25);
319 |   assert (f [| 3.; 4. |] 3.5 = 0.5);
320 |   assert (f [| 3.; 4. |] 3.1 = 0.5);
321 |   assert (f [| 3.; 4. |] 4. = 0.75);
322 |   assert (f [| 3.; 4. |] 5. = 1.)
323 | 
324 | (*
325 |    Linear interpolation: find y such that (x, y) lies on the line that goes
326 |    through (x1, y1) and (x2, y2).
327 | *)
328 | let interpolate x1 y1 x2 y2 x =
329 |   if debug then
330 |     printf "(x1=%g, y1=%i) (x2=%g, y2=%i) x=%g\n"
331 |       x1 y1 x2 y2 x;
332 |   if x1 = x2 then
333 |     if x < x1 then
334 |       float y1
335 |     else if x > x2 then
336 |       float y2
337 |     else (* x = x1 = x2 *)
338 |       0.5 *. (float y1 +. float y2)
339 |   else
340 |     let slope = float (y2 - y1) /. (x2 -. x1) in
341 |     float y1 +. slope *. (x -. x1)
342 | 
343 | let test_interpolate () =
344 |   let f x1 y1 x2 y2 x =
345 |     let y = interpolate x1 y1 x2 y2 x in
346 |     printf "(x1=%g, y1=%i) (x2=%g, y2=%i) x=%g -> y=%g\n"
347 |       x1 y1 x2 y2 x y;
348 |     y
349 |   in
350 |   let ( =~ ) a b = abs_float (a -. b) < 1e-6 in
351 |   assert (f 0. 0 1. 1 0.5 =~ 0.5);
352 |   assert (f 0. 0 1. 1 0. =~ 0.);
353 |   assert (f 0. 0 1. 1 1. =~ 1.);
354 |   assert (f 0. 0 1. 1 (-1.) =~ -1.);
355 |   assert (f 10. 10 1. 1 0.5 =~ 0.5);
356 |   assert (f 10. 10 1. 1 0. =~ 0.);
357 |   assert (f 10. 10 1. 1 1. =~ 1.);
358 |   assert (f 10. 10 1. 1 (-1.) =~ -1.);
359 |   assert (f 2. 0 3. 1 3.5 =~ 1.5);
360 | 
361 |   (* x1 = x2 *)
362 |   assert (f 0. 10 0. 20 0. =~ 15.);
363 |   assert (f 0. 10 0. 20 (-1.) =~ 10.);
364 |   assert (f 0. 10 0. 20 1. =~ 20.)
365 | 
366 | let normalize t x =
367 |   match t.num_rounds with
368 |   | 0 ->
369 |       (* The bins haven't been initialized yet, fall back to scanning the
370 |          buffer. *)
371 |       let buffer = t.buffer in
372 |       find_rank_in_buffer buffer x
373 |   | _ ->
374 |       (*
375 |          The bins have been initialized. We use binary search and linear
376 |          interpolation to estimate the rank such that
377 | 
378 |               a < b => rank(a) < rank(b) or rank(b) = 0 or rank(a) = 1
379 |               a = b => rank(a) = rank(b)
380 |               For all x, rank(x) >= 0 and rank(x) <= 1
381 | 
382 |               (assuming no rounding errors)
383 | 
384 |          - If the query is equal to the values in one or more bins, the average
385 |            rank of these bins is returned.
386 |          - If the query falls to the left of the minimum (value in bin 0),
387 |            then the values in bins 0 and 1 are used for linear interpolation.
388 |          - If the query falls to the right of the maximum (value in bin n-1),
389 |            then the values in bins n-2 and n-1 are used for linear
390 |            interpolation.
391 |          - If the query falls between the values of two (consecutive) bins,
392 |            the values in these two bins are used for linear interpolation.
393 |       *)
394 |       match t.num_bins with
395 |       | 0 | 1 -> 0.5
396 |       | n ->
397 |           let a = t.averages in
398 |           let get a i = EMA.get a.(i) in
399 |           let inf = find_inf get a x in
400 |           let sup = find_sup get a x in
401 |           assert (inf >= -1);
402 |           assert (inf <= sup);
403 |           assert (sup <= n);
404 |           let rank =
405 |             if inf = -1 then
406 |               let inf = inf + 1 in
407 |               let sup = sup + 1 in
408 |               max 0.
409 |                 (0.5 +. interpolate (get a inf) inf (get a sup) sup x)
410 |             else if sup = n then
411 |               let inf = inf - 1 in
412 |               let sup = sup - 1 in
413 |               min (float n)
414 |                 (0.5 +. interpolate (get a inf) inf (get a sup) sup x)
415 |             else
416 |               let a_inf = get a inf in
417 |               let a_sup = get a sup in
418 |               if x = a_inf then (
419 |                 assert (x = a_sup);
420 |                 0.5 *. float (inf + sup + 1)
421 |               )
422 |               else (
423 |                 0.5 +. interpolate a_inf inf a_sup sup x
424 |               )
425 |           in
426 |           rank /. float n
427 | 
428 | let map t x =
429 |   add t x;
430 |   normalize t x
431 | 
432 | let string_of_list l =
433 |    l |> List.map string_of_float |> String.concat " "
434 | 
435 | let test_normalize () =
436 |   let f n samples query =
437 |     let t = create ~num_bins:n () in
438 |     List.iter (add t) samples;
439 |     let r = normalize t query in
440 |     printf "n=%i [%s] query=%g -> rank=%g\n"
441 |       n (string_of_list samples) query r;
442 |     r
443 |   in
444 |   let ( =~ ) a b = abs_float (a -. b) < 1e-6 in
445 |   (* Test initialization phase *)
446 |   assert (f 1 [] 0. =~ 0.5);
447 |   assert (f 5 [] 0. =~ 0.5);
448 |   assert (f 5 [3.] 0. =~ 0.5);
449 |   assert (f 5 [3.; 2.] 0. =~ 0.);
450 |   assert (f 5 [3.; 2.] 1.5 =~ 0.);
451 |   assert (f 5 [3.; 2.] 1.75 =~ 0.);
452 |   assert (f 5 [3.; 2.] 2. =~ 0.25);
453 |   assert (f 5 [3.; 2.] 2.25 =~ 0.5);
454 |   assert (f 5 [3.; 2.] 2.5 =~ 0.5);
455 |   assert (f 5 [3.; 2.] 2.75 =~ 0.5);
456 |   assert (f 5 [3.; 2.] 3. =~ 0.75);
457 |   assert (f 5 [3.; 2.] 3.5 =~ 1.);
458 |   assert (f 5 [3.; 2.] 4. =~ 1.);
459 |   (* Test normal operation *)
460 |   assert (f 2 [3.; 2.; 10.] 0. =~ 0.);
461 |   assert (f 2 [3.; 2.; 10.] 1.5 =~ 0.);
462 |   assert (f 2 [3.; 2.; 10.] 1.75 =~ 0.125);
463 |   assert (f 2 [3.; 2.; 10.] 2. =~ 0.25);
464 |   assert (f 2 [3.; 2.; 10.] 2.25 =~ 0.375);
465 |   assert (f 2 [3.; 2.; 10.] 2.5 =~ 0.5);
466 |   assert (f 2 [3.; 2.; 10.] 2.75 =~ 0.625);
467 |   assert (f 2 [3.; 2.; 10.] 3. =~ 0.75);
468 |   assert (f 2 [3.; 2.; 10.] 3.5 =~ 1.);
469 |   assert (f 2 [3.; 2.; 10.] 4. =~ 1.);
470 |   (* Test after two rounds of identical samples *)
471 |   assert (f 2 [3.; 2.; 2.; 3.] 0. =~ 0.);
472 |   assert (f 2 [3.; 2.; 2.; 3.] 1.5 =~ 0.);
473 |   assert (f 2 [3.; 2.; 2.; 3.] 1.75 =~ 0.125);
474 |   assert (f 2 [3.; 2.; 2.; 3.] 2. =~ 0.25);
475 |   assert (f 2 [3.; 2.; 2.; 3.] 2.25 =~ 0.375);
476 |   assert (f 2 [3.; 2.; 2.; 3.] 2.5 =~ 0.5);
477 |   assert (f 2 [3.; 2.; 2.; 3.] 2.75 =~ 0.625);
478 |   assert (f 2 [3.; 2.; 2.; 3.] 3. =~ 0.75);
479 |   assert (f 2 [3.; 2.; 2.; 3.] 3.5 =~ 1.);
480 |   assert (f 2 [3.; 2.; 2.; 3.] 4. =~ 1.);
481 |   (* Test after two rounds of different samples, shifting avg. values by 1. *)
482 |   assert (f 2 [3.; 2.; 4.; 5.] 1. =~ 0.);
483 |   assert (f 2 [3.; 2.; 4.; 5.] 2.5 =~ 0.);
484 |   assert (f 2 [3.; 2.; 4.; 5.] 2.75 =~ 0.125);
485 |   assert (f 2 [3.; 2.; 4.; 5.] 3. =~ 0.25);
486 |   assert (f 2 [3.; 2.; 4.; 5.] 3.25 =~ 0.375);
487 |   assert (f 2 [3.; 2.; 4.; 5.] 3.5 =~ 0.5);
488 |   assert (f 2 [3.; 2.; 4.; 5.] 3.75 =~ 0.625);
489 |   assert (f 2 [3.; 2.; 4.; 5.] 4. =~ 0.75);
490 |   assert (f 2 [3.; 2.; 4.; 5.] 4.5 =~ 1.);
491 |   assert (f 2 [3.; 2.; 4.; 5.] 5. =~ 1.)
492 | 
493 | let ema_tests = EMA.tests
494 | 
495 | let tests = [
496 |   "find_rank_from_left", test_find_rank_from_left;
497 |   "find_inf", test_find_inf;
498 |   "find_rank_from_right", test_find_rank_from_right;
499 |   "find_sup", test_find_sup;
500 |   "find_rank_in_buffer", test_find_rank_in_buffer;
501 |   "interpolate", test_interpolate;
502 |   "normalize", test_normalize;
503 | ]
504 | 


--------------------------------------------------------------------------------