├── LICENSE ├── README.md ├── clustering.opam ├── lib ├── agglomerative.ml ├── agglomerative.mli ├── clustering.ml ├── k_means.ml ├── k_means.mli ├── k_medoids.ml ├── k_medoids.mli ├── multiStart.ml └── multiStart.mli ├── lib_test └── jbuild └── wiki ├── accuracy.png ├── accuracy_multistart.png └── first_example.png /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Ilias Garnier ilias.gar@gmail.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Deprecation notice 2 | ################## 3 | 4 | NB: this library is deprecated by https://github.com/igarnier/prbnmcn-clustering 5 | 6 | Functorial k-clustering implementation in OCaml. 
7 | 8 | Provided algorithms: 9 | * k-means (voronoi iteration + various initialisation methods) 10 | * k-medoids (voronoi iteration or PAM + various initialisation methods) 11 | * agglomerative hierarchical clustering 12 | 13 | Some example code is available here: 14 | 15 | https://github.com/igarnier/clustering/wiki/An-example-with-k-means 16 | 17 | For some simple, typical distance measures used in maths, look at the gromov library. 18 | 19 | Some examples are provided in lib_test/gaussian.ml (relies on owl, vplot and gromov). 20 | The library also provides sequential (and soon parallel) multi-start wrappers, 21 | as well as ways to assess the tradeoff between the quality of clustering 22 | and overfitting using provided cost functions. 23 | 24 | BUILDING & INSTALLING 25 | 26 | Building and installing the package relies on having the opam package manager 27 | installed. 28 | 1. Do `opam install jbuilder batteries parmap`. 29 | 2. Type `jbuilder build` at the root to build the package. 30 | 3. Type `jbuilder install` to install the package in your .opam directory. 31 | 4. (optional) 32 | To build the example in lib_test/gaussian.ml, you will need to `opam install owl`. Then, 33 | `jbuilder runtest` should build and run the example. The results should be available 34 | in _build/default/lib_test/dataset.png (to see the clusters as they have been generated) 35 | and _build/default/lib_test/result.png (to see the clusters as inferred by k-means) 36 | 37 | TODOS: 38 | 1. Parmap doesn't work super well: all child processes run on the same CPU. Try with 39 | Netmcore. 40 | 2. 
Compare quality of clustering as well as performance with scikit-learn's implementation 41 | -------------------------------------------------------------------------------- /clustering.opam: -------------------------------------------------------------------------------- 1 | opam-version: "1.2" 2 | maintainer: "ilias.gar@gmail.com" 3 | authors: ["Ilias Garnier"] 4 | homepage: "https://github.com/igarnier/clustering" 5 | bug-reports: "https://github.com/igarnier/clustering/issues" 6 | dev-repo: "git+https://github.com/igarnier/clustering.git" 7 | license: "MIT" 8 | build: [ 9 | ["dune" "build" "-p" name "-j" jobs] 10 | ] 11 | depends: [ 12 | "dune" {build} 13 | "batteries" 14 | ] 15 | available: [ ocaml-version >= "4.04.1" ] 16 | descr: " 17 | k-means, k-medoids and agglomerative clustering in OCaml 18 | " 19 | -------------------------------------------------------------------------------- /lib/agglomerative.ml: -------------------------------------------------------------------------------- 1 | open Batteries 2 | 3 | module type Element = 4 | sig 5 | 6 | type t 7 | 8 | val dist : t -> t -> float 9 | 10 | end 11 | 12 | module type ElementSet = 13 | sig 14 | 15 | type t 16 | type elt 17 | 18 | val singleton : elt -> t 19 | 20 | val dist : t -> t -> float 21 | 22 | val join : t -> t -> t 23 | 24 | end 25 | 26 | module Make = functor (E : Element) (S : ElementSet with type elt = E.t) -> 27 | struct 28 | 29 | type cluster = 30 | { 31 | set : S.t; 32 | tree : tree; 33 | uid : int 34 | } 35 | and tree = 36 | | Node of cluster * cluster 37 | | Leaf 38 | 39 | let uid = 40 | let x = ref (-1) in 41 | fun () -> incr x; !x 42 | 43 | let mkcluster set tree = 44 | { 45 | set; tree; uid = uid () 46 | } 47 | 48 | (* Hash distance computation between clusters. 
*) 49 | module Table = Hashtbl.Make( 50 | struct 51 | type t = (cluster * cluster) 52 | 53 | let equal (c1,c2) (c1',c2') = 54 | (c1.uid = c1'.uid && c2.uid = c2'.uid) 55 | || (c1.uid = c2'.uid && c2.uid = c1'.uid) 56 | 57 | let hash (c1,c2) = 58 | if c1.uid < c2.uid then 59 | Hashtbl.hash (c1.uid, c2.uid) 60 | else 61 | Hashtbl.hash (c2.uid, c1.uid) 62 | end) 63 | 64 | let dist sz = 65 | let table = Table.create sz in 66 | fun c1 c2 -> 67 | if c1.uid = c2.uid then 0.0 68 | else 69 | match Table.find_option table (c1, c2) with 70 | | Some dist -> dist 71 | | None -> 72 | let dist = S.dist c1.set c2.set in 73 | Table.add table (c1, c2) dist; 74 | dist 75 | 76 | let minimum_pairwise_distance (dist : cluster -> cluster -> float) clusters = 77 | match clusters with 78 | | [] -> invalid_arg "empty cluster list" 79 | | [c] -> 80 | (0.0, c, c) 81 | | c :: c' :: _tl -> 82 | List.fold_lefti (fun acc i c -> 83 | List.fold_lefti (fun acc j c' -> 84 | if j > i then 85 | let (best_d, _, _) = acc in 86 | let d = dist c c' in 87 | if d < best_d then 88 | (d, c, c') 89 | else 90 | acc 91 | else 92 | acc 93 | ) acc clusters 94 | ) (dist c c', c, c') clusters 95 | 96 | let rec iterate dist clusters = 97 | match clusters with 98 | | [] -> invalid_arg "empty cluster list" 99 | | [c] -> c 100 | | _ -> 101 | let (_d, c, c') = minimum_pairwise_distance dist clusters in 102 | let clusters = List.filter (fun c0 -> c0.uid <> c.uid && c0.uid <> c'.uid) clusters in 103 | let joined = mkcluster (S.join c.set c'.set) (Node(c,c')) in 104 | iterate dist (joined :: clusters) 105 | 106 | let cluster elements = 107 | let len = List.length elements in 108 | let dist = dist len in 109 | let clusters = 110 | List.map (fun x -> mkcluster (S.singleton x) Leaf) elements 111 | in 112 | iterate dist clusters 113 | 114 | let truncate cluster depth = 115 | let rec truncate { set; tree; _ } depth queue acc = 116 | match tree with 117 | | Leaf -> 118 | (if depth > 0 then 119 | invalid_arg "truncate: tree too 
short" 120 | else 121 | let acc = set :: acc in 122 | match queue with 123 | | [] -> acc 124 | | (next, d) :: tl -> 125 | truncate next d tl acc) 126 | | Node(l, r) -> 127 | if depth = 0 then 128 | let acc = set :: acc in 129 | match queue with 130 | | [] -> acc 131 | | (next, d) :: tl -> 132 | truncate next d tl acc 133 | else 134 | truncate l (depth-1) ((r,depth-1) :: queue) acc 135 | in 136 | truncate cluster depth [] [] 137 | 138 | let all_clusters cluster = 139 | let rec fold { set; tree; _ } depth acc = 140 | match tree with 141 | | Leaf -> (set, depth) :: acc 142 | | Node(l, r) -> 143 | fold r (depth+1) (fold l (depth+1) ((set, depth) :: acc)) 144 | in 145 | fold cluster 0 [] 146 | 147 | end 148 | -------------------------------------------------------------------------------- /lib/agglomerative.mli: -------------------------------------------------------------------------------- 1 | (** Agglomerative clustering functor. *) 2 | 3 | module type Element = 4 | sig 5 | 6 | (** [t] is the type of elements to be clustered. *) 7 | type t 8 | 9 | 10 | (** [dist] should be a distance function: symmetric, zero on the diagonal and verifying 11 | the triangular inequality. *) 12 | val dist : t -> t -> float 13 | 14 | end 15 | 16 | module type ElementSet = 17 | sig 18 | 19 | (** [t] is the type of (multi)-sets of elements. *) 20 | type t 21 | 22 | (** [elt] is the type of elements to be clustered. *) 23 | type elt 24 | 25 | (** [singleton x] is the cluster containing [x] as only element. *) 26 | val singleton : elt -> t 27 | 28 | (** The user should provide [dist], aa distance function on clusters. 29 | A typical (costly) choice is the Hausdorff distance (see e.g. the [gromov] package). 30 | Other, non-metric choices are sometimes used. 31 | *) 32 | val dist : t -> t -> float 33 | 34 | (** One should be able to "join" clusters. 
This can be multiset union, set union 35 | or any sensible overapproximation - the algorithm will work anyway (think 36 | for instance of convex hulls in R^n) *) 37 | val join : t -> t -> t 38 | 39 | end 40 | 41 | (** [Make] takes as first argument a module [E : Element] of elements 42 | admitting the structure of a metric space. The second argument of 43 | [Make] is a module endowing sets of elements with the structure of 44 | a metric space. *) 45 | module Make : functor (E : Element) (S : ElementSet with type elt = E.t) -> 46 | sig 47 | 48 | type cluster = 49 | { 50 | set : S.t; 51 | tree : tree; 52 | uid : int 53 | } 54 | and tree = 55 | | Node of cluster * cluster 56 | | Leaf 57 | 58 | val cluster : E.t list -> cluster 59 | 60 | (** [truncate c depth] returns all the sub-clusters at depth [depth]. 61 | The depth of the root is 0. *) 62 | val truncate : cluster -> int -> S.t list 63 | 64 | (** Returns all clusters along their depths. *) 65 | val all_clusters : cluster -> (S.t * int) list 66 | 67 | end 68 | -------------------------------------------------------------------------------- /lib/clustering.ml: -------------------------------------------------------------------------------- 1 | 2 | module K_means = K_means 3 | module K_medoids = K_medoids 4 | module Agglomerative = Agglomerative 5 | module MultiStart = MultiStart 6 | -------------------------------------------------------------------------------- /lib/k_means.ml: -------------------------------------------------------------------------------- 1 | open Batteries 2 | 3 | (* We require elements to have the structure of a 4 | metric space and to support the computing of geometric means. *) 5 | module type Element = 6 | sig 7 | 8 | type t 9 | 10 | (* This should be a proper distance function (symmetric, zero on the diagonal, 11 | verifying the triangular inequality). 
*) 12 | val dist : t -> t -> float 13 | 14 | val mean : t array -> t 15 | 16 | end 17 | 18 | type init = 19 | [ `Forgy (* Selects k elements at random (without replacement) *) 20 | | `RandomPartition (* Select a random partition *) 21 | | `KmeansPP ] (* K-means++ *) 22 | 23 | 24 | type termination = 25 | [ `Num_iter of int 26 | | `Threshold of float 27 | | `Min of constraints ] 28 | and constraints = { max_iter : int; threshold : float } 29 | 30 | exception KmeansError of string 31 | 32 | module Make(E : Element) = 33 | struct 34 | 35 | type elt = E.t 36 | 37 | (* [closest elements centroid elti] returns the pair (m,d) such that 38 | [centroids.(m)] is closest to [elements.(elti)], and the distance 39 | is equal to d. *) 40 | let closest elements centroids elti = 41 | let m = ref 0 in 42 | let d = ref max_float in 43 | for i = 0 to Array.length centroids - 1 do 44 | let dist = E.dist elements.(elti) centroids.(i) in 45 | if dist < !d then 46 | (d := dist; m := i) 47 | done; 48 | !m, !d 49 | 50 | (* [compute_classes centroids elements] computes for each centroid 51 | the set of elements which are closest to it, i.e. it computes the 52 | voronoi partition of [elements] according to [centroids]. *) 53 | let compute_classes centroids elements = 54 | let classes : int list array = 55 | Array.create (Array.length centroids) [] 56 | in 57 | Array.iteri (fun elti _ -> 58 | let k, _ = closest elements centroids elti in 59 | classes.(k) <- elti :: classes.(k) 60 | ) elements; 61 | let classes = Array.filter (function [] -> false | _ -> true) classes in 62 | Array.map Array.of_list classes 63 | 64 | (* [compute_centroids], given a partition [classes] of [elements], returns 65 | the centroid of each class. 
*)
  let compute_centroids (elements : elt array) (classes : int array array) =
    Array.map (fun arr -> E.mean (Array.map (fun i -> elements.(i)) arr)) classes

  (* Total displacement between the previous centroids and the freshly
     recomputed ones; used as the convergence measure. [centroids'] can be
     shorter than [centroids] because [compute_classes] drops empty classes,
     so we iterate over the (possibly shorter) new array and index into the
     old one. NOTE(review): when a class is dropped, the positional
     correspondence between old and new centroids is only approximate —
     confirm this is acceptable for the termination criterion. *)
  let centroid_displacement (centroids : E.t array) (centroids' : E.t array) =
    Array.mapi (fun i c' -> E.dist centroids.(i) c') centroids'
    |> Array.fsum

  (* Voronoi iteration: partition according to the centroids, then update the
     centroids, etc., until either the iteration budget is exhausted or the
     centroids do not move collectively more than [threshold]. *)
  let rec iterate (centroids : E.t array) (elements : E.t array) niter (termination : termination) =
    let classes = compute_classes centroids elements in
    let centroids' = compute_centroids elements classes in
    let terminate =
      match termination with
      | `Num_iter max_iter -> niter >= max_iter
      | `Threshold threshold ->
        (* Bug fix: compare the OLD centroids against the NEW ones. The
           original code compared [centroids'] against itself, which always
           yields a null displacement and hence immediate termination. *)
        centroid_displacement centroids centroids' < threshold
      | `Min { max_iter; threshold } ->
        niter >= max_iter
        || centroid_displacement centroids centroids' < threshold
    in
    if terminate then
      classes
    else
      (* Bug fix: recurse with the UPDATED centroids; the original code
         recursed with the old ones, so the clustering never made progress
         under [`Num_iter] termination. *)
      iterate centroids' elements (niter + 1) termination

  (* [forgy_init] picks [k] initial centroids uniformly at random. *)
  let forgy_init k elements =
    Array.of_enum (Random.multi_choice k (Array.enum elements))

  (* [random_partition_init] picks [k] initial centroids by selecting a
     partition in [k] classes at random, and then computing the centroid
     of each class.
*) 103 | let random_partition_init k elements = 104 | let classes = Array.create k [] in 105 | Array.iteri (fun elti _ -> 106 | let i = Random.int k in 107 | classes.(i) <- elti :: classes.(i) 108 | ) elements; 109 | let classes = Array.map Array.of_list classes in 110 | compute_centroids elements classes 111 | 112 | let pick_uniformly arr = 113 | let c = Array.length arr in 114 | if c = 0 then 115 | raise (KmeansError "pick_uniformly: empty array - bug found, please report") 116 | else 117 | arr.(Random.int c) 118 | 119 | (* Given a discrete probability distribution stored in [arr], pick an index according 120 | to that distribution. *) 121 | (* Note that the distance to a point to itself is 0, so the probability for a centroid 122 | to pick itself is also zero. *) 123 | let pick_proportional arr = 124 | let total = Array.fsum arr in 125 | let r = Random.float total in 126 | let rec loop i acc = 127 | if acc <= arr.(i) then 128 | i 129 | else 130 | loop (i+1) (acc -. arr.(i)) 131 | in 132 | loop 0 r 133 | 134 | (* [kmeanspp_iter] selects [k] centroids iteratively: the first one is taken uniformly at 135 | random, and in the inductive step the next one is picked with a probability proportional 136 | to its squared distance to the closest centroid. *) 137 | let rec kmeanspp_iter k centroids elements = 138 | if k = 0 then centroids 139 | else 140 | let dists = Array.mapi (fun elti _ -> let _, d = closest elements centroids elti in d *. 
d) elements in 141 | let i = pick_proportional dists in 142 | let centroids = Array.concat [ centroids; [| elements.(i) |] ] in 143 | kmeanspp_iter (k-1) centroids elements 144 | 145 | let kmeanspp_init k elements = 146 | if k < 1 then 147 | raise (KmeansError "kmeanspp_init: k < 1, error") 148 | else 149 | let elt = pick_uniformly elements in 150 | kmeanspp_iter (k-1) [| elt |] elements 151 | 152 | let k_means_internal ~k ~(init : init) ~elements ~(termination : termination) = 153 | let centroids = 154 | match init with 155 | | `Forgy -> 156 | forgy_init k elements 157 | | `RandomPartition -> 158 | random_partition_init k elements 159 | | `KmeansPP -> 160 | kmeanspp_init k elements 161 | in 162 | iterate centroids elements 0 termination 163 | 164 | let k_means ~k ~init ~elements ~termination = 165 | if Array.length elements = 0 then 166 | raise (KmeansError "k_means: empty elements array") 167 | else 168 | let classes = k_means_internal ~k ~init ~elements ~termination in 169 | Array.map (Array.map (fun i -> elements.(i))) classes 170 | 171 | (* 2 x cluster_radius overapproximates the diameter, hopefully tightly *) 172 | (* let cluster_radius elements = 173 | * let mean = E.mean elements in 174 | * Array.fold_left (fun maxdist elt -> 175 | * max maxdist (E.dist elt mean) 176 | * ) (~-. max_float) elements *) 177 | 178 | (* let sum_of_cluster_radius classes = 179 | * Array.fsum (Array.map cluster_radius classes) *) 180 | 181 | let total_squared_dist_to_mean elements = 182 | let mean = E.mean elements in 183 | Array.fold_left (fun acc elt -> 184 | let d = E.dist elt mean in 185 | acc +. d *. d 186 | ) 0.0 elements 187 | 188 | let cost ~classes = 189 | Array.fsum (Array.map total_squared_dist_to_mean classes) 190 | 191 | end 192 | -------------------------------------------------------------------------------- /lib/k_means.mli: -------------------------------------------------------------------------------- 1 | 2 | (** K-means functor. 
*) 3 | 4 | module type Element = 5 | sig 6 | 7 | (** [t] is the type of elements to be clustered. *) 8 | type t 9 | 10 | (** In principle, [dist] should be a distance function: symmetric, zero on the diagonal and verifying 11 | the triangular inequality. If these assumptions are violated, the algorithm will 12 | still terminate with a "clustering" though. *) 13 | val dist : t -> t -> float 14 | 15 | (** Elements of type [t] should support taking arithmetic means. The function 16 | [mean] provides this. *) 17 | val mean : t array -> t 18 | 19 | end 20 | 21 | (** K-means is rather sensitive to the initial choice of centroids. 22 | This implementation provides several initialization algorithms, 23 | the standard one being Kmeans++ (KmeansPP) *) 24 | type init = 25 | [ 26 | | `Forgy 27 | (** [Forgy] selects k elements at random (without replacement) as initial centroids. *) 28 | 29 | | `RandomPartition 30 | (** Assigns each point to a random cluster, and computes the corresponding centroid. 31 | Note that these centroids do not necessarily belong to the dataset, which might 32 | cause robustness issues. *) 33 | 34 | | `KmeansPP 35 | (** [KmeansPP] selects initial centroids iteratively with probabilities proportional 36 | to their squared distance to the previously selected centroids. This intuitively 37 | allows to spread them well. *) 38 | ] 39 | 40 | (** Termination of the algorithm can be either specified as: 41 | 1) an /exact/ number of iterations [Num_iter], 42 | 2) a [Threshold] giving the biggest delta-[cost] decrease under which we stop iterating, or 43 | 3) as the minimum of the above to, i.e. stop iterating when the [cost]-decrease is 44 | under [threshold] or when we reach [max_iter]. 
*) 45 | type termination = 46 | [ 47 | | `Num_iter of int 48 | | `Threshold of float 49 | | `Min of constraints ] 50 | and constraints = { max_iter : int; threshold : float } 51 | 52 | 53 | (** Exception thrown by [k_means] in case something goes awry.*) 54 | exception KmeansError of string 55 | 56 | module Make : functor (E : Element) -> 57 | sig 58 | 59 | (** [k_means] performs the clustering using to the provided initialization method. *) 60 | val k_means : k:int -> init:init -> elements:E.t array -> termination:termination -> E.t array array 61 | 62 | (** [cost] returns the sum over all classes of the sum of squared distances from 63 | the mean of the class to all elements of the class. This quantity will decrease 64 | monotonically as [k] increases, and the usual method to determine [k] is to select 65 | the one where this quantity's decrease has its first inflection. *) 66 | val cost : classes:E.t array array -> float 67 | 68 | end 69 | -------------------------------------------------------------------------------- /lib/k_medoids.ml: -------------------------------------------------------------------------------- 1 | open Batteries 2 | 3 | (* K-medoids only requires that elements are endowed with the structure 4 | of a metric space. *) 5 | module type Element = 6 | sig 7 | 8 | type t 9 | 10 | (* This should be a proper distance function (symmetric, zero on the diagonal, 11 | verifying the triangular inequality). *) 12 | val dist : t -> t -> float 13 | 14 | end 15 | 16 | type init = 17 | [ `Forgy (* Selects k elements at random (without replacement) *) 18 | | `KmedoidsPP ] (* K-medoids++, cf K-means++ *) 19 | 20 | type algorithm = 21 | [ `PAM (* Partition Around Medoids - the classical greedy algorithm. Costly. *) 22 | | `VoronoiIteration ] (* Another heuristic, less costly but perhaps less reliable. 
*)

type termination =
  [ `Num_iter of int
  | `Threshold of float
  | `Min of constraints ]
and constraints = { max_iter : int; threshold : float }

exception KmedoidsError of string


module Make(E : Element) =
struct

  (* Avoid polymorphic refs for efficiency (perhaps a case of premature
     optimization, but well). *)
  type fref = {
    mutable c : float
  }

  let fref x = { c = x }

  (* [closest dist elt medoids] returns the pair (m, d) such that
     [medoids.(m)] is closest to [elt] according to the distance function
     [dist], and the distance is [d]. *)
  let closest dist elt medoids =
    let m = ref 0 in
    let d = fref max_float in
    for i = 0 to Array.length medoids - 1 do
      let dist = dist elt medoids.(i) in
      if dist < d.c then
        (d.c <- dist;
         m := i)
    done;
    !m, d.c

  (* Internal function to compute the cost of a choice of medoids:
     sum over all elements of the distance to their closest medoid. *)
  let cost_ dist elements medoids =
    Array.fold_lefti (fun acc _i elt ->
        let _, dist_to_closest = closest dist elt medoids in
        acc +. dist_to_closest
      ) 0.0 elements

  (* One step of the PAM algorithm. For each medoid slot, evaluate the cost
     of the configuration where each element replaces the current medoid,
     and keep whichever candidate (including the incumbent) yields the
     lowest cost. *)
  let pam_step dist elements medoids =
    for mi = 0 to Array.length medoids - 1 do
      let current_cost = fref (cost_ dist elements medoids) in
      (* Bug fix: remember the best candidate seen so far. The original code
         restored the INITIAL medoid whenever a candidate failed to improve
         the cost, discarding any strictly better candidate found earlier in
         the scan and leaving [current_cost] inconsistent with the medoid
         actually stored in [medoids.(mi)]. *)
      let best = ref medoids.(mi) in
      for ei = 0 to Array.length elements - 1 do
        let e = elements.(ei) in
        medoids.(mi) <- e;
        let new_cost = cost_ dist elements medoids in
        if new_cost < current_cost.c then
          (current_cost.c <- new_cost;
           best := e);
        (* Restore the best-known medoid before trying the next candidate. *)
        medoids.(mi) <- !best
      done
    done

  (* [produce_clusters dist elements medoids] computes the voronoi
     partition induced by the choice of [medoids], according to [dist].
*) 83 | let produce_clusters dist elements medoids = 84 | let buckets = Array.make (Array.length medoids) [] in 85 | Array.iter (fun elt -> 86 | let closest_idx, _ = closest dist elt medoids in 87 | buckets.(closest_idx) <- elt :: buckets.(closest_idx) 88 | ) elements; 89 | Array.map Array.of_list buckets 90 | 91 | (* computes most central element of a subset of elements *) 92 | let compute_medoid_of_class dist cls = 93 | let centralities = 94 | Array.map (fun elt -> 95 | let dists = Array.map (dist elt) cls in 96 | let centrality = Array.fsum dists in 97 | (elt, centrality) 98 | ) cls 99 | in 100 | Array.sort (fun (_, c) (_, c') -> Float.compare c c') centralities; 101 | let (elt, cost) = centralities.(0) in 102 | elt, cost 103 | 104 | let compute_medoids dist classes = 105 | let result = Array.map (compute_medoid_of_class dist) classes in 106 | let medoids = Array.map fst result in 107 | let costs = Array.fsum (Array.map snd result) in 108 | (medoids, costs) 109 | 110 | (* One step of the voronoi iteration algorithm.*) 111 | let voronoi_iteration_step dist elements medoids = 112 | let classes = produce_clusters dist elements medoids in 113 | Array.modifyi (fun k _ -> 114 | fst (compute_medoid_of_class dist classes.(k)) 115 | ) medoids 116 | 117 | (* Initialization stuff (see k_means.ml) *) 118 | let pick_uniformly arr = 119 | let c = Array.length arr in 120 | if c = 0 then 121 | raise (KmedoidsError "pick_uniformly: empty array - bug found, please report") 122 | else 123 | arr.(Random.int c) 124 | 125 | let pick_proportional arr = 126 | let total = Array.fsum arr in 127 | let r = Random.float total in 128 | let rec loop i acc = 129 | if acc <= arr.(i) then 130 | i 131 | else 132 | loop (i+1) (acc -. 
arr.(i)) 133 | in 134 | loop 0 r 135 | 136 | let cost ~classes = 137 | snd (compute_medoids E.dist classes) 138 | 139 | let forgy_init k elements = 140 | Array.of_enum (Random.multi_choice k (Array.enum elements)) 141 | 142 | let rec kmedoidspp_iter dist k medoids elements = 143 | if k = 0 then medoids 144 | else 145 | let dists = Array.map (fun elt -> 146 | let _, d = closest dist elt medoids in 147 | d 148 | ) elements in 149 | let i = pick_proportional dists in 150 | let medoids = Array.concat [ medoids; [| elements.(i) |] ] in 151 | kmedoidspp_iter dist (k-1) medoids elements 152 | 153 | let kmedoidspp_init dist k elements = 154 | if k < 1 then 155 | raise (KmedoidsError "kmedoidspp_init: k < 1, error") 156 | else 157 | let elt = pick_uniformly elements in 158 | kmedoidspp_iter dist (k-1) [| elt |] elements 159 | 160 | 161 | 162 | (* The [iterate_?] functions are parameterised by a step function [step]. Each will 163 | destructively update [medoids] and [elements] until termination. *) 164 | 165 | let iterate_n dist elements medoids step n = 166 | for _ = 1 to n do 167 | step dist elements medoids 168 | done 169 | 170 | let iterate_threshold dist elements medoids step threshold = 171 | let cost = fref (cost_ dist elements medoids) in 172 | let loop = ref true in 173 | while !loop do 174 | step dist elements medoids; 175 | let new_cost = cost_ dist elements medoids in 176 | let delta = cost.c -. new_cost in 177 | if delta >= 0.0 && delta < threshold then 178 | loop := false 179 | else 180 | cost.c <- new_cost 181 | done 182 | 183 | let iterate_min dist elements medoids step n threshold = 184 | let cost = fref (cost_ dist elements medoids) in 185 | let exception Break in 186 | try 187 | for _ = 1 to n do 188 | step dist elements medoids; 189 | let new_cost = cost_ dist elements medoids in 190 | let delta = cost.c -. 
new_cost in 191 | if delta >= 0.0 && delta < threshold then 192 | raise Break 193 | else 194 | cost.c <- new_cost 195 | done 196 | with Break -> () 197 | 198 | let k_medoids_internal dist elements k (init : init) algorithm (termination : termination) = 199 | let medoids = 200 | (* Initialize medoids *) 201 | match init with 202 | | `KmedoidsPP -> 203 | kmedoidspp_init dist k elements 204 | | `Forgy -> 205 | forgy_init k elements 206 | in 207 | (* Select algorithm used for iteration step *) 208 | let step = match algorithm with 209 | | `PAM -> pam_step 210 | | `VoronoiIteration -> voronoi_iteration_step 211 | in 212 | (match termination with 213 | | `Num_iter n -> 214 | iterate_n dist elements medoids step n 215 | | `Threshold threshold -> 216 | iterate_threshold dist elements medoids step threshold 217 | | `Min { max_iter = n; threshold } -> 218 | iterate_min dist elements medoids step n threshold 219 | ); 220 | produce_clusters dist elements medoids 221 | 222 | let k_medoids ~precompute ~elements = 223 | if precompute then 224 | let len = Array.length elements in 225 | let mat = Array.init len (fun i -> 226 | Array.init len (fun j -> E.dist elements.(i) elements.(j)) 227 | ) 228 | in 229 | let dist i j = mat.(i).(j) in 230 | let elements_indices = Array.init len (fun i -> i) in 231 | fun ~k ~init ~algorithm ~termination -> 232 | let clusters = k_medoids_internal dist elements_indices k init algorithm termination in 233 | Array.map (Array.map (fun i -> elements.(i))) clusters 234 | else 235 | fun ~k ~init ~algorithm ~termination -> 236 | k_medoids_internal E.dist elements k init algorithm termination 237 | 238 | end 239 | -------------------------------------------------------------------------------- /lib/k_medoids.mli: -------------------------------------------------------------------------------- 1 | (** K-medoids functor. *) 2 | 3 | module type Element = 4 | sig 5 | 6 | (** [t] is the type of elements to be clustered. 
*) 7 | type t 8 | 9 | (** [dist] should be a distance function: symmetric, zero and the diagonal and verifying 10 | the triangular inequality. *) 11 | val dist : t -> t -> float 12 | 13 | end 14 | 15 | 16 | (** Initial choice of medoids. 17 | This implementation provides several initialization algorithms, 18 | the standard one being Kmedoids++, identical to Kmeans++ *) 19 | type init = 20 | [ 21 | | `Forgy 22 | (** [Forgy] selects k elements at random (without replacement) as initial centroids. *) 23 | 24 | | `KmedoidsPP 25 | (** [KmedoidsPP] selects initial medoids iteratively with probabilities proportional 26 | to their distance to the previously selected centroids. This intuitively 27 | allows to spread them well. *) 28 | ] 29 | 30 | (** Algorithm used to perform partitioning. *) 31 | type algorithm = 32 | [ 33 | | `PAM 34 | (** [PAM] stands for Partition Around Medoids - the classical greedy algorithm. Costly. *) 35 | 36 | | `VoronoiIteration 37 | (** Another heuristic, proceeding similarly to Lloyd's algorithm for Kmeans. Less costly 38 | (but still more than Kmeans) but perhaps less precise. *) 39 | ] 40 | 41 | (** See [K_means]. *) 42 | type termination = 43 | [ `Num_iter of int 44 | | `Threshold of float 45 | | `Min of constraints ] 46 | and constraints = { max_iter : int; threshold : float } 47 | 48 | 49 | (** Exception thrown by [k_medoids] in case something goes awry.*) 50 | exception KmedoidsError of string 51 | 52 | module Make : functor (E : Element) -> 53 | sig 54 | 55 | (* In contrast with [k_means], k_medoids never has to produce new elements, 56 | and the distance function is only evaluated on the given set of elements. 57 | It might be worthwhile to precompute the distance on those elements, 58 | especially if the distance function is costly and if one wants to compute a clustering for several [k] 59 | or try different algorithms with the same dataset. 
This is done by partially evaluating [k_medoids] and 60 | setting precompute to true, as in: 61 | 62 | let cluster_function = k_medoids ~precompute:true ~elements in 63 | let res1 = cluster_function ~k:10 ~init:KmedoidsPP ~algorithm:PAM ~threshold:0.1 in 64 | let res2 = cluster_function ~k:15 ~init:KmedoidsPP ~algorithm:VoronoiIteration ~threshold:0.1 in 65 | ... 66 | *) 67 | 68 | (** [k_means] performs the clustering using the provided initialization method. *) 69 | val k_medoids : precompute:bool -> elements:E.t array -> k:int -> init:init -> algorithm:algorithm -> termination:termination -> E.t array array 70 | 71 | (** [cost] returns the sum over all classes of the sum of distances from 72 | the medoid of the class to all elements of the class. *) 73 | val cost : classes:E.t array array -> float 74 | 75 | end 76 | -------------------------------------------------------------------------------- /lib/multiStart.ml: -------------------------------------------------------------------------------- 1 | open Batteries 2 | 3 | let multi_start ~f ~nstarts = 4 | let results = Array.init nstarts (fun _ -> f ()) in 5 | let cost, result = 6 | Array.fold_left (fun (min_cost, min_cost_sol) (cost, sol) -> 7 | if cost < min_cost then 8 | (cost, sol) 9 | else 10 | (min_cost, min_cost_sol) 11 | ) results.(0) results 12 | in 13 | (cost : float), result 14 | 15 | (* let multi_start_parallel ~f ~nstarts ~ncores = *) 16 | (* let results = *) 17 | (* Parmap.array_parmap *) 18 | (* ~ncores *) 19 | (* (fun _ -> *) 20 | (* Random.set_state (Random.State.make_self_init ()); *) 21 | (* f () *) 22 | (* ) (Array.create nstarts ()) *) 23 | (* in *) 24 | (* let cost, result = *) 25 | (* Array.fold_left (fun (min_cost, min_cost_sol) (cost, sol) -> *) 26 | (* if cost < min_cost then *) 27 | (* (cost, sol) *) 28 | (* else *) 29 | (* (min_cost, min_cost_sol) *) 30 | (* ) results.(0) results *) 31 | (* in *) 32 | (* (cost : float), result *) 33 | 34 | 
-------------------------------------------------------------------------------- /lib/multiStart.mli: -------------------------------------------------------------------------------- 1 | 2 | 3 | (** [multi_start] performs [nstarts] independent calls to the given function [f], which 4 | must return some outcome together with its cost. [multi_start] then returns the result 5 | that has minimal cost. [multi_start_parallel] does the same by forking several 6 | processes (in the hope of getting some parallelism). Note: for the parallel version, 7 | the random state is reinitizalized using Random.self_init per process. 8 | *) 9 | val multi_start : f:(unit -> float * 'a) -> nstarts:int -> float * 'a 10 | 11 | (* val multi_start_parallel : f:(unit -> float * 'a) -> nstarts:int -> ncores:int -> float * 'a *) 12 | -------------------------------------------------------------------------------- /lib_test/jbuild: -------------------------------------------------------------------------------- 1 | (jbuild_version 1) 2 | 3 | (executables 4 | ((names (benchmark)) 5 | (libraries (vlayout vplot clustering owl)))) 6 | 7 | (alias 8 | ((name runtest) 9 | (deps (benchmark.exe)) 10 | (action (run ${<})))) 11 | -------------------------------------------------------------------------------- /wiki/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igarnier/clustering/4c20f9a253abeeeb6a0f05430d5d72957942ef1e/wiki/accuracy.png -------------------------------------------------------------------------------- /wiki/accuracy_multistart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igarnier/clustering/4c20f9a253abeeeb6a0f05430d5d72957942ef1e/wiki/accuracy_multistart.png -------------------------------------------------------------------------------- /wiki/first_example.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/igarnier/clustering/4c20f9a253abeeeb6a0f05430d5d72957942ef1e/wiki/first_example.png --------------------------------------------------------------------------------