├── .gitignore ├── dune-project ├── src ├── test │ ├── dune │ └── test.ml ├── lib │ ├── dune │ ├── EMA.mli │ ├── EMA.ml │ ├── Uniformize.mli │ └── Uniformize.ml └── bin │ ├── dune │ └── main.ml ├── Makefile ├── README.md ├── uniformize.opam ├── .ocp-indent └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _opam 3 | .merlin 4 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 2.6) 2 | (name uniformization) 3 | -------------------------------------------------------------------------------- /src/test/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name test) 3 | (libraries uniformize alcotest) 4 | ) 5 | -------------------------------------------------------------------------------- /src/lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (public_name uniformize) 3 | (name uniformize) 4 | (libraries ) 5 | ) 6 | -------------------------------------------------------------------------------- /src/bin/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (public_name uniformize) 3 | (name main) 4 | (libraries 5 | uniformize 6 | cmdliner 7 | ) 8 | ) 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build 2 | build: 3 | dune build 4 | 5 | test: 6 | dune exec src/test/test.exe 7 | 8 | .PHONY: install 9 | install: 10 | dune install 11 | 12 | .PHONY: clean 13 | clean: 14 | git clean -dfX 15 | -------------------------------------------------------------------------------- /src/test/test.ml: -------------------------------------------------------------------------------- 1 | (* 2 | Entrypoint to run the unit tests from the command line. 3 | *) 4 | 5 | let make_alcotest_suite tests_list : unit Alcotest.test list = 6 | List.map (fun (name, tests) -> 7 | name, List.map (fun (name, f) -> (name, `Quick, f)) tests 8 | ) tests_list 9 | 10 | let test_suites = 11 | make_alcotest_suite [ 12 | "EMA", Uniformize.ema_tests; 13 | "Uniformize", Uniformize.tests; 14 | ] 15 | 16 | let main () = Alcotest.run "uniformize" test_suites 17 | 18 | let () = main () 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Signal Uniformization 2 | 3 | This is an OCaml library and command-line executable for mapping a 4 | signal into a uniform distribution over [0, 1]. 5 | 6 | It is a complete, tested implementation of the heuristic outlined in 7 | this article: 8 | 9 | Martin Jambon, [Dynamic Signal 10 | Uniformization](https://mjambon.com/2020-06-28-dynamic-signal-uniformization/), 11 | June 2020. 12 | 13 | Installation 14 | -- 15 | 16 | Install [opam](https://opam.ocaml.org/doc/Install.html), then run 17 | ``` 18 | opam install . 19 | ``` 20 | 21 | This installs a `uniformize` executable as well as the OCaml library 22 | of the same name. 23 | 24 | Project status 25 | -- 26 | 27 | This source code was extracted from a private project in the hope that 28 | it will be useful. See [license](LICENSE) for use and redistribution rights. 29 | -------------------------------------------------------------------------------- /uniformize.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "martin@mjambon.com" 3 | authors: ["Martin Jambon"] 4 | homepage: "https://github.com/mjambon/uniformization" 5 | bug-reports: "https://github.com/mjambon/uniformization/issues" 6 | dev-repo: "git+https://github.com/mjambon/uniformization.git" 7 | license: "BSD-3-Clause" 8 | 9 | build: [ 10 | ["dune" "build" "-p" name "-j" jobs] 11 | ] 12 | 13 | depends: [ 14 | "alcotest" 15 | "cmdliner" 16 | "dune" {>= "2.1"} 17 | "ocaml" 18 | ] 19 | 20 | synopsis: "Show dependency graph of a multi-component dune project" 21 | 22 | description: """ 23 | This library implements dynamic signal uniformization. It transforms an 24 | input signal of unknown distribution into a signal that is uniformly 25 | distributed. 26 | """ 27 | 28 | url { 29 | src: "git+https://github.com/mjambon/uniformization" 30 | } 31 | -------------------------------------------------------------------------------- /src/lib/EMA.mli: -------------------------------------------------------------------------------- 1 | (* 2 | Exponential moving average. 3 | 4 | See https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average 5 | *) 6 | 7 | type param 8 | 9 | type t = private { 10 | param: param; 11 | 12 | mutable m: float; 13 | (* Current estimate of the average, initially nan. *) 14 | 15 | mutable age: int; 16 | (* Number of observations. *) 17 | } 18 | 19 | val default_alpha : float 20 | val create_param : ?alpha:float -> unit -> param 21 | val default_param : param 22 | 23 | val init : ?param:param -> unit -> t 24 | 25 | val update : t -> float -> unit 26 | 27 | val get : t -> float 28 | (* Get the current average. *) 29 | 30 | val get_obs_count : t -> int 31 | (* Return the number of observations. *) 32 | 33 | val of_list : ?alpha:float -> float list -> float 34 | (* Shorthand for getting the EMA from a list of numbers. 35 | This is for evaluation purposes. *) 36 | 37 | val tests : (string * (unit -> unit)) list 38 | -------------------------------------------------------------------------------- /.ocp-indent: -------------------------------------------------------------------------------- 1 | # See https://github.com/OCamlPro/ocp-indent/blob/master/.ocp-indent for more 2 | 3 | # Indent for clauses inside a pattern-match (after the arrow): 4 | # match foo with 5 | # | _ -> 6 | # ^^^^bar 7 | # the default is 2, which aligns the pattern and the expression 8 | match_clause = 4 9 | 10 | # When nesting expressions on the same line, their indentation are in 11 | # some cases stacked, so that it remains correct if you close them one 12 | # at a line. This may lead to large indents in complex code though, so 13 | # this parameter can be used to set a maximum value. Note that it only 14 | # affects indentation after function arrows and opening parens at end 15 | # of line. 16 | # 17 | # for example (left: `none`; right: `4`) 18 | # let f = g (h (i (fun x -> # let f = g (h (i (fun x -> 19 | # x) # x) 20 | # ) # ) 21 | # ) # ) 22 | max_indent = 2 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, Martin Jambon 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /src/lib/EMA.ml: -------------------------------------------------------------------------------- 1 | (* Exponential moving average. See .mli *) 2 | 3 | type param = { 4 | alpha: float; 5 | age_min: int; (* derived from alpha *) 6 | } 7 | 8 | type t = { 9 | param: param; 10 | mutable m: float; 11 | mutable age: int; 12 | } 13 | 14 | let default_alpha = 0.1 15 | 16 | let create_param ?(alpha = default_alpha) () = 17 | if not (alpha >= 0. && alpha <= 1.) then 18 | invalid_arg "EMA.create_param: invalid alpha"; 19 | let age_min = truncate (ceil (1. /. alpha)) in 20 | { 21 | alpha; 22 | age_min; 23 | } 24 | 25 | let default_param = create_param () 26 | 27 | let init ?(param = default_param) () = 28 | { 29 | param; 30 | m = nan; 31 | age = 0 32 | } 33 | 34 | let update_age state = 35 | let age = state.age in 36 | if age >= 0 then 37 | state.age <- age + 1 38 | 39 | let update state x = 40 | if x <> x then 41 | invalid_arg "EMA.update: not a number"; 42 | update_age state; 43 | let param = state.param in 44 | let alpha = 45 | if state.age > param.age_min then 46 | param.alpha 47 | else ( 48 | if state.age = 1 then 49 | (* replace nan to avoid contamination when it's multiplied by 0 *) 50 | state.m <- 0.; 51 | (* arithmetic mean until we reach 1/r observations *) 52 | 1. /. float state.age 53 | ) 54 | in 55 | state.m <- (1. -. alpha) *. state.m +. alpha *. x 56 | 57 | let get state = state.m 58 | 59 | let get_obs_count state = state.age 60 | 61 | let of_list ?(alpha = default_alpha) data = 62 | let param = create_param ~alpha () in 63 | let state = init ~param () in 64 | List.iter (update state) data; 65 | get state 66 | 67 | (* Equality within some absolute error 'err'. *) 68 | let float_eq ?(err = 1e-6) a b = 69 | if not (err >= 0.) then 70 | invalid_arg "Float.eq: invalid 'err' parameter"; 71 | abs_float (a -. b) <= err 72 | 73 | let test_initial_samples () = 74 | let ( =~ ) = float_eq ~err:1e-6 in 75 | let ema l = of_list ~alpha:0.5 l in 76 | assert (ema [10.] =~ 10.); 77 | assert (ema [10.; 20.] =~ 15.); 78 | assert (ema [10.; 20.; 25.] =~ 20.) 79 | 80 | let test_convergence () = 81 | let ( =~ ) = float_eq ~err:1e-3 in 82 | let ema l = of_list ~alpha:0.5 l in 83 | assert (ema [0.; 1.] =~ 0.5); 84 | assert (ema [0.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1.; 1. ] =~ 1.) 85 | 86 | let tests = [ 87 | "initial samples", test_initial_samples; 88 | "convergence", test_convergence; 89 | ] 90 | -------------------------------------------------------------------------------- /src/bin/main.ml: -------------------------------------------------------------------------------- 1 | (* 2 | Entrypoint for command line normalizer. 3 | *) 4 | 5 | open Printf 6 | open Cmdliner 7 | 8 | type config = { 9 | ema_alpha: float; 10 | num_bins: int; 11 | flush_line: bool; 12 | } 13 | 14 | let map ~flush_line tracker ic oc = 15 | try 16 | while true do 17 | let x = input_line ic |> float_of_string in 18 | let y = Uniformize.map tracker x in 19 | printf "%g\n" y; 20 | if flush_line then 21 | flush oc 22 | done; 23 | assert false 24 | with End_of_file -> 25 | () 26 | 27 | let run config ic oc = 28 | let tracker = 29 | Uniformize.create 30 | ~ema_alpha:config.ema_alpha 31 | ~num_bins:config.num_bins 32 | () 33 | in 34 | map ~flush_line:config.flush_line tracker ic oc 35 | 36 | let ema_alpha_term = 37 | let info = 38 | Arg.info ["ema-alpha"; "a"] 39 | ~docv:"NUM" 40 | ~doc:"$(docv) specifies the alpha parameter used by the exponential 41 | moving average (EMA). It must be a value within (0, 1). 42 | A greater value gives a greater weight to the recent samples." 43 | in 44 | Arg.value (Arg.opt Arg.float Uniformize.default_ema_alpha info) 45 | 46 | let num_bins_term = 47 | let info = 48 | Arg.info ["num-bins"; "n"] 49 | ~docv:"N" 50 | ~doc:"$(docv) specifies the number of points to use to model the 51 | input distribution. 52 | A greater value increases the accuracy with which the 53 | original distribution is modeled but increases memory usage 54 | accordingly." 55 | in 56 | Arg.value (Arg.opt Arg.int Uniformize.default_num_bins info) 57 | 58 | let flush_line_term = 59 | let info = 60 | Arg.info ["flush-line"; "f"] 61 | ~doc:"Print each line of output as soon as it is available." 62 | in 63 | Arg.value (Arg.flag info) 64 | 65 | let cmdline_term = 66 | let combine ema_alpha num_bins flush_line = 67 | { ema_alpha; num_bins; flush_line } 68 | in 69 | Term.(const combine 70 | $ ema_alpha_term 71 | $ num_bins_term 72 | $ flush_line_term 73 | ) 74 | 75 | let doc = 76 | "make an input signal uniformly distributed over [0, 1]" 77 | 78 | let parse_command_line () = 79 | let info = 80 | Term.info 81 | ~doc 82 | "uniformize" 83 | in 84 | match Term.eval (cmdline_term, info) with 85 | | `Error _ -> exit 1 86 | | `Version | `Help -> exit 0 87 | | `Ok config -> config 88 | 89 | let main () = 90 | let config = parse_command_line () in 91 | run config stdin stdout 92 | 93 | let () = main () 94 | -------------------------------------------------------------------------------- /src/lib/Uniformize.mli: -------------------------------------------------------------------------------- 1 | (** 2 | Dynamic Signal Uniformization. 3 | 4 | An algorithm for tracking a distribution and estimating the rank of 5 | sample values. 6 | 7 | Algorithm outline 8 | 9 | Tracker update cycle: 10 | 1. Create an array of n bins. 11 | 2. Accumulate n sample points (floats) into a buffer. 12 | 3. Sort the buffer of n sample points. 13 | 4. Add the data of the sorted buffer to the corresponding bin, which 14 | tracks an exponential moving average of the samples it receives. 15 | 5. Go to 2. 16 | 17 | Rank estimation: 18 | Given a query, we use binary search to find the two best bins whose value 19 | can be used to estimate the rank of the query using linear interpolation. 20 | The rank is normalized to range from 0 to 1. 21 | 22 | Queries that fall below the value of the lowest bin and above the value of 23 | the greatest bin use linear interpolation based on the first two or last 24 | two bins. If this results in a estimated rank outside of the allowed range 25 | [0, 1], it is rounded to 0 to 1. Therefore, two different extreme values 26 | may result in the same rank of 0 or 1. So the following doesn't hold 27 | for extreme values but holds elsewhere: 28 | 29 | a < b => rank(a) < rank(b) 30 | 31 | If this turns out to be a critical property, it can be implemented using 32 | a non-linear function (for example, an exponential) that converges smoothly 33 | toward 0 (resp. 1) without reaching it. (See scanned figure in personal 34 | notes). 35 | 36 | This implementation dates back from April 2019 and was referred to 37 | privately as rank-normalization. It was published in June 2020. 38 | Copyright (c) 2020 Martin Jambon 39 | *) 40 | 41 | (** 42 | The type of a distribution tracker. 43 | *) 44 | type t 45 | 46 | val default_ema_alpha : float 47 | val default_num_bins : int 48 | 49 | (** 50 | Create a rank tracker. 51 | 52 | @param num_bins specifies the number of bins to use to model the 53 | distribution. 54 | 55 | @param ema_alpha specifies the alpha parameter used when updating the 56 | exponential moving averages used to represent each bin. 57 | *) 58 | val create : ?ema_alpha:float -> ?num_bins:int -> unit -> t 59 | 60 | val get_num_bins : t -> int 61 | 62 | (** 63 | Add a sample to the tracker. This is a write-only operation. 64 | *) 65 | val add : t -> float -> unit 66 | 67 | (** 68 | Estimate the rank of a sample. The result ranges from [0.] to [1.]. 69 | Half of the samples matching the lowest bin will result in exactly [0.] 70 | and half of the samples matching the greatest bin will result in exactly 71 | [1.]. 72 | 73 | This is a read-only operation. 74 | *) 75 | val normalize : t -> float -> float 76 | 77 | (** 78 | Add an input sample to the tracker and return its estimated rank. 79 | 80 | This the read-write operation that combines [add] and [normalize]. 81 | *) 82 | val map : t -> float -> float 83 | 84 | val ema_tests : (string * (unit -> unit)) list 85 | val tests : (string * (unit -> unit)) list 86 | -------------------------------------------------------------------------------- /src/lib/Uniformize.ml: -------------------------------------------------------------------------------- 1 | (* see mli *) 2 | 3 | open Printf 4 | 5 | let debug = false 6 | 7 | type buffer = { 8 | buf_array : float array; 9 | mutable buf_count : int; 10 | } 11 | 12 | type t = { 13 | num_bins : int; 14 | mutable num_rounds : int; 15 | averages : EMA.t array; 16 | buffer : buffer; 17 | } 18 | 19 | let default_ema_alpha = 0.01 20 | let default_num_bins = 101 21 | 22 | let get_num_bins t = t.num_bins 23 | 24 | let create_buffer capacity = 25 | { 26 | buf_array = Array.make capacity nan; 27 | buf_count = 0; 28 | } 29 | 30 | let create 31 | ?(ema_alpha = default_ema_alpha) 32 | ?(num_bins = default_num_bins) 33 | () = 34 | 35 | if num_bins <= 0 then 36 | invalid_arg ( 37 | sprintf "Rank_normalize.create: num_bins=%i must be >= 1" 38 | num_bins 39 | ); 40 | let ema_param = EMA.create_param ~alpha:ema_alpha () in 41 | let averages = 42 | Array.init num_bins (fun _ -> EMA.init ~param:ema_param ()) 43 | in 44 | { 45 | num_bins; 46 | num_rounds = 0; 47 | averages; 48 | buffer = create_buffer num_bins; 49 | } 50 | 51 | let add_array averages samples = 52 | Array.sort (compare : float -> float -> int) samples; 53 | Array.iteri (fun i x -> 54 | EMA.update averages.(i) x 55 | ) samples 56 | 57 | let add t x = 58 | if not (Float.is_finite x) then 59 | invalid_arg (sprintf "Rank_normalize.add: %g" x); 60 | let n = t.num_bins in 61 | let b = t.buffer in 62 | let pos = b.buf_count in 63 | b.buf_array.(pos) <- x; 64 | let count = pos + 1 in 65 | if count = n then ( 66 | add_array t.averages b.buf_array; 67 | b.buf_count <- 0; 68 | t.num_rounds <- t.num_rounds + 1; 69 | ) 70 | else 71 | b.buf_count <- count 72 | 73 | (* 74 | "Procedure for finding the leftmost element" from Wikipedia. 75 | 76 | Returns a whole rank, as you would obtain if you were inserting the 77 | performance of a contestant and calculate their rank. Therefore, 78 | this cannot return a rank less than 0 (first) but can return a rank of 79 | n (new dead last among n+1). In case of a tie, the most favorable (lowest) 80 | rank is returned. 81 | 82 | This assumes a sorted array and returns the leftmost position such that 83 | all elements to its left are strictly lower than the query. 84 | See the tests for examples. 85 | 86 | function binary_search_leftmost(A, n, T): 87 | L := 0 88 | R := n 89 | while L < R: 90 | m := floor((L + R) / 2) 91 | if A[m] < T: 92 | L := m + 1 93 | else: 94 | R := m 95 | return L 96 | *) 97 | let find_rank_from_left get a x = 98 | let n = Array.length a in 99 | let left = ref 0 in 100 | let right = ref n in 101 | while !left < !right do 102 | let m = (!left + !right) / 2 in 103 | if get a m < x then 104 | left := m + 1 105 | else 106 | right := m 107 | done; 108 | !left 109 | 110 | let string_of_array a = 111 | a |> Array.map string_of_float |> Array.to_list |> String.concat " " 112 | 113 | let test_find_rank_from_left () = 114 | let f a x = 115 | let rank = find_rank_from_left Array.get a x in 116 | if debug then 117 | printf "[%s] query: %g -> rank: %i\n" (string_of_array a) x rank; 118 | rank 119 | in 120 | assert (f [| 1. |] 2. = 1); 121 | assert (f [| 1. |] 1. = 0); 122 | assert (f [| 1. |] 0. = 0); 123 | assert (f [| 1.; 2. |] 1.5 = 1); 124 | assert (f [| 1.; 2. |] 2.5 = 2); 125 | assert (f [| 1.; 2.; 3. |] 2.5 = 2); 126 | assert (f [| 1.; 2.; 2.; 3. |] 2. = 1); 127 | assert (f [| 1.; 2.; 2.; 3. |] 2.5 = 3); 128 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 1); 129 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 2.5 = 4) 130 | 131 | (* 132 | Assuming a sorted array and a query, find the position of the most suitable 133 | lower value to be used in linear interpolation. 134 | 135 | [ 3; 4 ], query = 3.5 -> 0 136 | [ 3; 4 ], query = 3 -> 0 137 | [ 3; 4 ], query = 1 -> -1 (out of bounds on purpose) 138 | [ 3; 4 ], query = 6 -> 1 139 | 140 | See the tests for examples. 141 | *) 142 | let find_inf get a x = 143 | if Array.length a = 0 then 144 | invalid_arg "Rank_normalize.find_inf: empty array"; 145 | let rank = find_rank_from_left get a x in 146 | if rank = Array.length a || x < get a rank then 147 | rank - 1 148 | else 149 | rank 150 | 151 | let test_find_inf () = 152 | let inf a x = 153 | let inf = find_inf Array.get a x in 154 | printf "[%s] query: %g -> inf: %i\n" (string_of_array a) x inf; 155 | inf 156 | in 157 | assert (inf [| 1. |] 2. = 0); 158 | assert (inf [| 1. |] 1. = 0); 159 | assert (inf [| 1. |] 0. = -1); 160 | assert (inf [| 1.; 2. |] 1.5 = 0); 161 | assert (inf [| 1.; 2. |] 2.5 = 1); 162 | assert (inf [| 1.; 2.; 3. |] 2.5 = 1); 163 | assert (inf [| 1.; 2.; 2.; 3. |] 2. = 1); 164 | assert (inf [| 1.; 2.; 2.; 3. |] 2.5 = 2); 165 | assert (inf [| 1.; 2.; 2.; 2.; 3. |] 2. = 1); 166 | assert (inf [| 1.; 2.; 2.; 2.; 3. |] 2.5 = 3) 167 | 168 | (* 169 | "Procedure for finding the rightmost element" from Wikipedia. 170 | Same as find_rank_from_left, but determines the rank from the last position. 171 | 172 | See the tests for examples. 173 | 174 | Original code from Wikipedia: 175 | 176 | function binary_search_rightmost(A, n, T): 177 | L := 0 178 | R := n 179 | while L < R: 180 | m := floor((L + R) / 2) 181 | if A[m] <= T: 182 | L := m + 1 183 | else: 184 | R := m 185 | return L - 1 186 | *) 187 | let find_rank_from_right get a x = 188 | let n = Array.length a in 189 | let left = ref 0 in 190 | let right = ref n in 191 | while !left < !right do 192 | let m = (!left + !right) / 2 in 193 | if get a m <= x then 194 | left := m + 1 195 | else 196 | right := m 197 | done; 198 | !right - 1 199 | 200 | let test_find_rank_from_right () = 201 | let f a x = 202 | let rank = find_rank_from_right Array.get a x in 203 | printf "[%s] query: %g -> rank: %i\n" (string_of_array a) x rank; 204 | rank 205 | in 206 | assert (f [| 1. |] 2. = 0); 207 | assert (f [| 1. |] 1. = 0); 208 | assert (f [| 1. |] 0. = -1); 209 | assert (f [| 1.; 2. |] 1.5 = 0); 210 | assert (f [| 1.; 2. |] 0.5 = -1); 211 | assert (f [| 0.; 1.; 2.; |] 0.5 = 0); 212 | assert (f [| 1.; 2.; 2.; 3. |] 2. = 2); 213 | assert (f [| 1.; 2.; 2.; 3. |] 1.5 = 0); 214 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 3); 215 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 1.5 = 0) 216 | 217 | (* 218 | Assuming a sorted array and a query, find the position of the most suitable 219 | higher value to be used in linear interpolation. 220 | 221 | [ 3; 4 ], query = 3.5 -> 1 222 | [ 3; 4 ], query = 3 -> 0 223 | [ 3; 4 ], query = 1 -> 0 224 | [ 3; 4 ], query = 6 -> 2 (out of bounds on purpose) 225 | 226 | See the tests for examples. 227 | *) 228 | let find_sup get a x = 229 | if Array.length a = 0 then 230 | invalid_arg "Rank_normalize.find_sup: empty array"; 231 | let rank = find_rank_from_right get a x in 232 | if rank = -1 || x > get a rank then 233 | rank + 1 234 | else 235 | rank 236 | 237 | let test_find_sup () = 238 | let f a x = 239 | let sup = find_sup Array.get a x in 240 | printf "[%s] query: %g -> sup: %i\n" (string_of_array a) x sup; 241 | sup 242 | in 243 | assert (f [| 1. |] 2. = 1); 244 | assert (f [| 1. |] 1. = 0); 245 | assert (f [| 1. |] 0. = 0); 246 | assert (f [| 1.; 2. |] 1.5 = 1); 247 | assert (f [| 1.; 2. |] 0.5 = 0); 248 | assert (f [| 0.; 1.; 2.; |] 0.5 = 1); 249 | assert (f [| 1.; 2.; 2.; 3. |] 2. = 2); 250 | assert (f [| 1.; 2.; 2.; 3. |] 1.5 = 1); 251 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 2. = 3); 252 | assert (f [| 1.; 2.; 2.; 2.; 3. |] 1.5 = 1) 253 | 254 | (* 255 | Estimate the rank of a query in an unsorted array by scanning the whole 256 | array. 257 | This is used in the initialization phase, before num_bins have been added 258 | to the tracker. 259 | *) 260 | let find_rank_in_buffer b x = 261 | let n = b.buf_count in 262 | match n with 263 | | 0 | 1 -> 0.5 264 | | _ -> 265 | (* 266 | left_count and right_count are the number of elements lower than 267 | and greater than the query. 268 | We don't keep track of which elements are closest to the query, 269 | so don't do any interpolation to obtain a non-discrete rank. 270 | *) 271 | let left_count = ref 0 in 272 | let right_count = ref 0 in 273 | let a = b.buf_array in 274 | for i = 0 to n - 1 do 275 | let c = compare a.(i) x in 276 | if c < 0 then 277 | incr left_count 278 | else if c > 0 then 279 | incr right_count 280 | done; 281 | assert (!left_count + !right_count <= n); 282 | (* inf and sup are the indices, possibly out of bounds, of the positions 283 | of the elements surrounding the query in a sorted array: 284 | query = 2 285 | [| 3; 4.5 |] -> inf = -1, sup = 0 286 | [| 0; 2; 2; 3.1 |] -> inf = 1, sup = 2 287 | [| 0; 0 |] -> inf = 1, sup = 2 288 | *) 289 | let inf = !left_count - 1 in 290 | let sup = n - !right_count in 291 | assert (inf < sup); 292 | (* 293 | Index i represents the bin whose ranks span [i, i+1] and of average 294 | rank i + 0.5. 295 | Hence, we add 0.5 to inf and 0.5 to sup to obtain the average rank 296 | of the corresponding bins. 297 | The rank of the query is the average of the ranks of the bins 298 | inf and sup. 299 | *) 300 | let rank = 0.5 *. float (inf + sup + 1) in 301 | rank /. float n 302 | 303 | let test_find_rank_in_buffer () = 304 | let f array query = 305 | let b = { 306 | buf_array = Array.copy array; 307 | buf_count = Array.length array; 308 | } in 309 | let rank = find_rank_in_buffer b query in 310 | printf "[%s] query: %g -> rank = %g\n" (string_of_array array) query rank; 311 | rank 312 | in 313 | assert (f [| |] 2. = 0.5); 314 | assert (f [| 3. |] 2. = 0.5); 315 | assert (f [| 3. |] 4. = 0.5); 316 | assert (f [| 3.; 4. |] 1. = 0.); 317 | assert (f [| 3.; 4. |] 3. = 0.25); 318 | assert (f [| 4.; 3. |] 3. = 0.25); 319 | assert (f [| 3.; 4. |] 3.5 = 0.5); 320 | assert (f [| 3.; 4. |] 3.1 = 0.5); 321 | assert (f [| 3.; 4. |] 4. = 0.75); 322 | assert (f [| 3.; 4. |] 5. = 1.) 323 | 324 | (* 325 | Linear interpolation: find y such that (x, y) lies on the line that goes 326 | through (x1, y1) and (x2, y2). 327 | *) 328 | let interpolate x1 y1 x2 y2 x = 329 | if debug then 330 | printf "(x1=%g, y1=%i) (x2=%g, y2=%i) x=%g\n" 331 | x1 y1 x2 y2 x; 332 | if x1 = x2 then 333 | if x < x1 then 334 | float y1 335 | else if x > x2 then 336 | float y2 337 | else (* x = x1 = x2 *) 338 | 0.5 *. (float y1 +. float y2) 339 | else 340 | let slope = float (y2 - y1) /. (x2 -. x1) in 341 | float y1 +. slope *. (x -. x1) 342 | 343 | let test_interpolate () = 344 | let f x1 y1 x2 y2 x = 345 | let y = interpolate x1 y1 x2 y2 x in 346 | printf "(x1=%g, y1=%i) (x2=%g, y2=%i) x=%g -> y=%g\n" 347 | x1 y1 x2 y2 x y; 348 | y 349 | in 350 | let ( =~ ) a b = abs_float (a -. b) < 1e-6 in 351 | assert (f 0. 0 1. 1 0.5 =~ 0.5); 352 | assert (f 0. 0 1. 1 0. =~ 0.); 353 | assert (f 0. 0 1. 1 1. =~ 1.); 354 | assert (f 0. 0 1. 1 (-1.) =~ -1.); 355 | assert (f 10. 10 1. 1 0.5 =~ 0.5); 356 | assert (f 10. 10 1. 1 0. =~ 0.); 357 | assert (f 10. 10 1. 1 1. =~ 1.); 358 | assert (f 10. 10 1. 1 (-1.) =~ -1.); 359 | assert (f 2. 0 3. 1 3.5 =~ 1.5); 360 | 361 | (* x1 = x2 *) 362 | assert (f 0. 10 0. 20 0. =~ 15.); 363 | assert (f 0. 10 0. 20 (-1.) =~ 10.); 364 | assert (f 0. 10 0. 20 1. =~ 20.) 365 | 366 | let normalize t x = 367 | match t.num_rounds with 368 | | 0 -> 369 | (* The bins haven't been initialized yet, fall back to scanning the 370 | buffer. *) 371 | let buffer = t.buffer in 372 | find_rank_in_buffer buffer x 373 | | _ -> 374 | (* 375 | The bins have been initialized. We use binary search and linear 376 | interpolation to estimate the rank such that 377 | 378 | a < b => rank(a) < rank(b) or rank(b) = 0 or rank(a) = 1 379 | a = b => rank(a) = rank(b) 380 | For all x, rank(x) >= 0 and rank(x) <= 1 381 | 382 | (assuming no rounding errors) 383 | 384 | - If the query is equal to the values in one or more bins, the average 385 | rank of these bins is returned. 386 | - If the query falls to the left of the minimum (value in bin 0), 387 | then the values in bins 0 and 1 are used for linear interpolation. 388 | - If the query falls to the right of the maximum (value in bin n-1), 389 | then the values in bins n-2 and n-1 are used for linear 390 | interpolation. 391 | - If the query falls between the values of two (consecutive) bins, 392 | the values in these two bins are used for linear interpolation. 393 | *) 394 | match t.num_bins with 395 | | 0 | 1 -> 0.5 396 | | n -> 397 | let a = t.averages in 398 | let get a i = EMA.get a.(i) in 399 | let inf = find_inf get a x in 400 | let sup = find_sup get a x in 401 | assert (inf >= -1); 402 | assert (inf <= sup); 403 | assert (sup <= n); 404 | let rank = 405 | if inf = -1 then 406 | let inf = inf + 1 in 407 | let sup = sup + 1 in 408 | max 0. 409 | (0.5 +. interpolate (get a inf) inf (get a sup) sup x) 410 | else if sup = n then 411 | let inf = inf - 1 in 412 | let sup = sup - 1 in 413 | min (float n) 414 | (0.5 +. interpolate (get a inf) inf (get a sup) sup x) 415 | else 416 | let a_inf = get a inf in 417 | let a_sup = get a sup in 418 | if x = a_inf then ( 419 | assert (x = a_sup); 420 | 0.5 *. float (inf + sup + 1) 421 | ) 422 | else ( 423 | 0.5 +. interpolate a_inf inf a_sup sup x 424 | ) 425 | in 426 | rank /. float n 427 | 428 | let map t x = 429 | add t x; 430 | normalize t x 431 | 432 | let string_of_list l = 433 | l |> List.map string_of_float |> String.concat " " 434 | 435 | let test_normalize () = 436 | let f n samples query = 437 | let t = create ~num_bins:n () in 438 | List.iter (add t) samples; 439 | let r = normalize t query in 440 | printf "n=%i [%s] query=%g -> rank=%g\n" 441 | n (string_of_list samples) query r; 442 | r 443 | in 444 | let ( =~ ) a b = abs_float (a -. b) < 1e-6 in 445 | (* Test initialization phase *) 446 | assert (f 1 [] 0. =~ 0.5); 447 | assert (f 5 [] 0. =~ 0.5); 448 | assert (f 5 [3.] 0. =~ 0.5); 449 | assert (f 5 [3.; 2.] 0. =~ 0.); 450 | assert (f 5 [3.; 2.] 1.5 =~ 0.); 451 | assert (f 5 [3.; 2.] 1.75 =~ 0.); 452 | assert (f 5 [3.; 2.] 2. =~ 0.25); 453 | assert (f 5 [3.; 2.] 2.25 =~ 0.5); 454 | assert (f 5 [3.; 2.] 2.5 =~ 0.5); 455 | assert (f 5 [3.; 2.] 2.75 =~ 0.5); 456 | assert (f 5 [3.; 2.] 3. =~ 0.75); 457 | assert (f 5 [3.; 2.] 3.5 =~ 1.); 458 | assert (f 5 [3.; 2.] 4. =~ 1.); 459 | (* Test normal operation *) 460 | assert (f 2 [3.; 2.; 10.] 0. =~ 0.); 461 | assert (f 2 [3.; 2.; 10.] 1.5 =~ 0.); 462 | assert (f 2 [3.; 2.; 10.] 1.75 =~ 0.125); 463 | assert (f 2 [3.; 2.; 10.] 2. =~ 0.25); 464 | assert (f 2 [3.; 2.; 10.] 2.25 =~ 0.375); 465 | assert (f 2 [3.; 2.; 10.] 2.5 =~ 0.5); 466 | assert (f 2 [3.; 2.; 10.] 2.75 =~ 0.625); 467 | assert (f 2 [3.; 2.; 10.] 3. =~ 0.75); 468 | assert (f 2 [3.; 2.; 10.] 3.5 =~ 1.); 469 | assert (f 2 [3.; 2.; 10.] 4. =~ 1.); 470 | (* Test after two rounds of identical samples *) 471 | assert (f 2 [3.; 2.; 2.; 3.] 0. =~ 0.); 472 | assert (f 2 [3.; 2.; 2.; 3.] 1.5 =~ 0.); 473 | assert (f 2 [3.; 2.; 2.; 3.] 1.75 =~ 0.125); 474 | assert (f 2 [3.; 2.; 2.; 3.] 2. =~ 0.25); 475 | assert (f 2 [3.; 2.; 2.; 3.] 2.25 =~ 0.375); 476 | assert (f 2 [3.; 2.; 2.; 3.] 2.5 =~ 0.5); 477 | assert (f 2 [3.; 2.; 2.; 3.] 2.75 =~ 0.625); 478 | assert (f 2 [3.; 2.; 2.; 3.] 3. =~ 0.75); 479 | assert (f 2 [3.; 2.; 2.; 3.] 3.5 =~ 1.); 480 | assert (f 2 [3.; 2.; 2.; 3.] 4. =~ 1.); 481 | (* Test after two rounds of different samples, shifting avg. values by 1. *) 482 | assert (f 2 [3.; 2.; 4.; 5.] 1. =~ 0.); 483 | assert (f 2 [3.; 2.; 4.; 5.] 2.5 =~ 0.); 484 | assert (f 2 [3.; 2.; 4.; 5.] 2.75 =~ 0.125); 485 | assert (f 2 [3.; 2.; 4.; 5.] 3. =~ 0.25); 486 | assert (f 2 [3.; 2.; 4.; 5.] 3.25 =~ 0.375); 487 | assert (f 2 [3.; 2.; 4.; 5.] 3.5 =~ 0.5); 488 | assert (f 2 [3.; 2.; 4.; 5.] 3.75 =~ 0.625); 489 | assert (f 2 [3.; 2.; 4.; 5.] 4. =~ 0.75); 490 | assert (f 2 [3.; 2.; 4.; 5.] 4.5 =~ 1.); 491 | assert (f 2 [3.; 2.; 4.; 5.] 5. =~ 1.) 492 | 493 | let ema_tests = EMA.tests 494 | 495 | let tests = [ 496 | "find_rank_from_left", test_find_rank_from_left; 497 | "find_inf", test_find_inf; 498 | "find_rank_from_right", test_find_rank_from_right; 499 | "find_sup", test_find_sup; 500 | "find_rank_in_buffer", test_find_rank_in_buffer; 501 | "interpolate", test_interpolate; 502 | "normalize", test_normalize; 503 | ] 504 | --------------------------------------------------------------------------------