├── README.md ├── Setup.hs ├── .gitignore ├── src ├── Clustering.hs └── Clustering │ └── Hierarchical.hs ├── clustering.cabal └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | .virthualenv 8 | -------------------------------------------------------------------------------- /src/Clustering.hs: -------------------------------------------------------------------------------- 1 | {------------------------------------------------------------------------------- 2 | Clustering 3 | 4 | (c) 2009 Jan Snajder 5 | 6 | -------------------------------------------------------------------------------} 7 | 8 | module Clustering where 9 | 10 | type DistMeasure a = a -> a -> Double 11 | type DistMeasureM m a = a -> a -> m Double 12 | 13 | -------------------------------------------------------------------------------- /clustering.cabal: -------------------------------------------------------------------------------- 1 | -- Initial clustering.cabal generated by cabal init. For further 2 | -- documentation, see http://haskell.org/cabal/users-guide/ 3 | 4 | name: clustering 5 | version: 0.1.0.0 6 | -- synopsis: 7 | -- description: 8 | license: BSD3 9 | license-file: LICENSE 10 | author: Jan Snajder 11 | maintainer: jan.snajder@fer.hr 12 | -- copyright: 13 | -- category: 14 | build-type: Simple 15 | cabal-version: >=1.8 16 | 17 | library 18 | exposed-modules: Clustering, Clustering.Hierarchical 19 | -- other-modules: 20 | build-depends: base ==4.6.*, containers ==0.5.*, 21 | mtl ==2.1.* 22 | hs-source-dirs: src 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Jan Snajder 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Jan Snajder nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /src/Clustering/Hierarchical.hs: -------------------------------------------------------------------------------- 1 | {------------------------------------------------------------------------------- 2 | Clustering.Hierarchical 3 | Hierarchical agglomerative clustering. 4 | 5 | (c) 2009 Jan Snajder 6 | 7 | -------------------------------------------------------------------------------} 8 | 9 | -- | Hierarchical agglomerative clustering. 10 | module Clustering.Hierarchical 11 | (DistMeasure, 12 | DistMeasureM, 13 | Linkage(..), 14 | Dendrogram, 15 | cluster, 16 | clusterM, 17 | distanceCut, 18 | numberCut, 19 | clusterAt, 20 | clusterInto, 21 | distances, 22 | scale, 23 | normalize, 24 | clusterPartition, 25 | clusterPartitionM, 26 | clusterPartitionCb, 27 | clusterPartitionCbM) where 28 | 29 | import Clustering 30 | 31 | import Data.List (insertBy,sortBy,partition) 32 | import Data.Maybe 33 | import Control.Monad (liftM,zipWithM) 34 | import Data.Ord (comparing) 35 | import qualified Data.Map as M 36 | import Control.Applicative 37 | import Control.Monad.Trans 38 | 39 | type Distances = [((Int,Int),Double)] 40 | type Clusters a = M.Map Int (Int, Dendrogram a) 41 | data Linkage = Single | Complete | Average 42 | deriving (Eq,Show) 43 | data Dendrogram a = Item a | Merge Double (Dendrogram a) (Dendrogram a) 44 | deriving (Show,Read,Eq) 45 | 46 | ------------------------------------------------------------------------------ 47 | -- Operations on dendrograms 48 | ------------------------------------------------------------------------------ 49 | 50 | -- | Converts cluster map into a dendrogram. 51 | toDendrogram :: Clusters a -> Dendrogram a 52 | toDendrogram = snd . head . M.elems 53 | 54 | -- | Cuts a dendrogram at a specified distance level. 55 | distanceCut :: Double -> Dendrogram a -> [[a]] 56 | distanceCut _ (Item x) = [[x]] 57 | distanceCut t (Merge d l r) 58 | | d<=t = merge (distanceCut t l ++ distanceCut t r) 59 | | otherwise = distanceCut t l ++ distanceCut t r 60 | where merge = (:[]) . concat 61 | 62 | infty = 1/0 63 | 64 | -- | Returns a reverse-sorted list of finite dendrogram distance levels, 65 | -- ignoring the infinite distances. 66 | distances :: Dendrogram a -> [Double] 67 | distances = dropWhile (==infty) . sortBy (flip compare) . dist 68 | where dist (Merge d l r) = d : dist l ++ dist r 69 | dist _ = [] 70 | 71 | -- | Scales dendrogram distances by a specified factor, 72 | -- thereby ignoring the infinite distances. 73 | scale :: Double -> Dendrogram a -> Dendrogram a 74 | scale _ i@(Item _ ) = i 75 | scale x dg@(Merge d l r) = Merge (d/x) (scale x l) (scale x r) 76 | 77 | -- | Normalizes dendrogram distances to [0,1] interval, 78 | -- thereby ignoring the infinite distances. 79 | -- Defined as: 80 | -- @ normalize dg = scale (head $ distances dg) dg @ 81 | normalize :: Dendrogram a -> Dendrogram a 82 | normalize dg = scale (head $ distances dg) dg 83 | 84 | -- | Cuts a dendrogram at a specified number of clusters. 85 | numberCut :: Int -> Dendrogram a -> [[a]] 86 | numberCut n dg = distanceCut ((distances dg ++ repeat 0.0)!!n) dg 87 | 88 | -- | Merges two dendrograms (merging distance is set to infinity). 89 | merge :: Dendrogram a -> Dendrogram a -> Dendrogram a 90 | merge dg1 dg2 = Merge infty dg1 dg2 91 | 92 | -- | Concatenates a list of dendrograms (undefined for empty list). 93 | concatDendrograms :: [Dendrogram a] -> Dendrogram a 94 | concatDendrograms [] = error "empty list" 95 | concatDendrograms dgs = foldr1 merge dgs 96 | 97 | ------------------------------------------------------------------------------ 98 | -- Clustering algorithm 99 | ------------------------------------------------------------------------------ 100 | 101 | -- | Creates initial clusters from a list of items. 102 | initClusters :: [a] -> Clusters a 103 | initClusters xs = M.fromList . zip [0..] $ [(1,Item x) | x <- xs] 104 | 105 | -- | Merges two clusters in the map of clusters. 106 | mergeClusters :: Clusters a -> ((Int,Int),Double) -> Clusters a 107 | mergeClusters cs ((i,j),d) = 108 | M.delete j . M.update (\(ni,ci) -> Just (ni+nj,Merge d ci cj)) i $ cs 109 | where Just (nj,cj) = M.lookup j cs 110 | 111 | -- | Returns the initial distances list between singleton clusters. 112 | initDist :: Clusters a -> DistMeasure a -> Distances 113 | initDist cs dm = sortBy (comparing snd) dl 114 | where n = M.size cs 115 | dl = map calcDist [(i,j) | i <- [1..n-1], j <- [0..i-1]] 116 | calcDist ij@(i,j) = (ij,dm ei ej) 117 | where Just (_, Item ei) = M.lookup i cs 118 | Just (_, Item ej) = M.lookup j cs 119 | 120 | -- | Monadic version of the above. 121 | initDistM :: Monad m => Clusters a -> DistMeasureM m a -> m Distances 122 | initDistM cs dm = do 123 | dl <- mapM calcDist [(i,j) | i <- [1..n-1], j <- [0..i-1]] 124 | return $ sortBy (comparing snd) dl 125 | where n = M.size cs 126 | calcDist ij@(i,j) = do 127 | let Just (_, Item ei) = M.lookup i cs 128 | Just (_, Item ej) = M.lookup j cs 129 | d <- dm ei ej 130 | return (ij,d) 131 | 132 | -- | Returns the size of a specified cluster. 133 | clusterSize :: Clusters a -> Int -> Int 134 | clusterSize cs i = case M.lookup i cs of 135 | Just (n,_) -> n 136 | 137 | -- | Updates distance list after merging of clusters i and j. 138 | -- (Distances to cluster i are updated, distances to cluster j are removed.) 139 | updateDistances :: Clusters a -> Linkage -> Distances -> Distances 140 | updateDistances cs l (((i,j),_):ds) = mergeBy (comparing snd) new rest 141 | where (aff,rest) = partition mustUpdate ds 142 | (del,upd) = partition mustDelete aff 143 | new = sortBy (comparing snd) . map update $ upd 144 | mustUpdate ((x,y),_) = x==i || x==j || y==i || y==j 145 | mustDelete ((x,y),_) = x==j || y==j 146 | update ((x,y),di) | l==Complete = ((x,y),max di dj) 147 | | l==Single = ((x,y),min di dj) 148 | | l==Average = ((x,y),(ni*di+nj*dj)/(ni+nj)) 149 | where (Just dj) = lookup pair del 150 | pair = if (z>j) then (z,j) else (j,z) 151 | z = if x==i then y else x 152 | ni = fromInteger.toInteger $ clusterSize cs i 153 | nj = fromInteger.toInteger $ clusterSize cs j 154 | 155 | -- | Merges two sorted lists using a user-specified comparison function. 156 | mergeBy :: (Ord a) => (a -> a -> Ordering) -> [a] -> [a] -> [a] 157 | mergeBy _ [] ys = ys 158 | mergeBy _ xs [] = xs 159 | mergeBy p (x:xs) (y:ys) 160 | | p x y == LT = x:mergeBy p xs (y:ys) 161 | | otherwise = y:mergeBy p (x:xs) ys 162 | 163 | -- | Clusters the elements of a list using a given distance measure 164 | -- and linkage type. Returns a 'Dendrogram', which can be cut at 165 | -- arbitrary levels using 'distanceCut' and 'numberCut'. 166 | cluster :: DistMeasure a -> Linkage -> [a] -> Dendrogram a 167 | cluster dm l xs = toDendrogram $ cluster' cs ds 168 | where cs = initClusters xs 169 | ds = initDist cs dm 170 | cluster' cs [] = cs 171 | cluster' cs ds@(d:_) = 172 | cluster' (mergeClusters cs d) (updateDistances cs l ds) 173 | 174 | -- | Monadic version of the above. 175 | clusterM :: Monad m => DistMeasureM m a -> Linkage -> [a] -> m (Dendrogram a) 176 | clusterM dm l xs = do 177 | let cs = initClusters xs 178 | ds <- initDistM cs dm 179 | return . toDendrogram $ cluster' cs ds 180 | where cluster' cs [] = cs 181 | cluster' cs ds@(d:_) = 182 | cluster' (mergeClusters cs d) (updateDistances cs l ds) 183 | 184 | ------------------------------------------------------------------------------ 185 | -- Additional flavors 186 | ------------------------------------------------------------------------------ 187 | 188 | -- | Clusters until distances are above a specified threshold. 189 | -- Defined as: 190 | -- @ clusterAt t dm l = distanceCut t . cluster dm l @ 191 | clusterAt :: Double -> DistMeasure a -> Linkage -> [a] -> [[a]] 192 | clusterAt t dm l = distanceCut t . cluster dm l 193 | 194 | -- | Clusters into a specified number of clusters (if possible). 195 | -- Defined as: 196 | -- @ clusterInto n dm l = numberCut n . cluster dm l @ 197 | clusterInto :: Int -> DistMeasure a -> Linkage -> [a] -> [[a]] 198 | clusterInto n dm l = numberCut n . cluster dm l 199 | 200 | -- | Clusters pre-partitioned elements and produces a single 201 | -- dendrogram. 202 | clusterPartition :: DistMeasure a -> Linkage -> [[a]] -> Dendrogram a 203 | clusterPartition dm l = concatDendrograms . map (cluster dm l) 204 | 205 | -- | Monadic version of the above. 206 | clusterPartitionM :: Monad m => 207 | DistMeasureM m a -> Linkage -> [[a]] -> m (Dendrogram a) 208 | clusterPartitionM dm l xs = concatDendrograms `liftM` mapM (clusterM dm l) xs 209 | 210 | -- | A call-back version of 'clusterPartition'. The call-back function is 211 | -- invoked for each partition and the argument of the 212 | -- function is the number of the partition being clustered. 213 | clusterPartitionCb :: (Int -> IO t) -> DistMeasure a -> Linkage -> [[a]] -> IO (Dendrogram a) 214 | clusterPartitionCb cb dm l xss = concatDendrograms `liftM` zipWithM step xss [1..] 215 | where step xs i = cb i >> let r = cluster dm l xs in r `seq` return r 216 | 217 | -- | Monadic version of the above. 218 | clusterPartitionCbM :: MonadIO m => 219 | (Int -> IO t) -> DistMeasureM m a -> Linkage -> [[a]] -> m (Dendrogram a) 220 | clusterPartitionCbM cb dm l xss = 221 | concatDendrograms `liftM` zipWithM step xss [1..] 222 | where step xs i = liftIO (cb i) >> clusterM dm l xs 223 | 224 | --------------------------------------------------------------------------------