├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── birch.R ├── clusteringdatasets-package.r └── sklearn.R ├── README-asets-1.png ├── README-birch-1.png ├── README-highd1-1.png ├── README-mopsi-1.png ├── README-neuralgas-1.png ├── README-nonconvex-1.png ├── README-shapesets-1.png ├── README-showkdcupbio-1.png ├── README-ssets-1.png ├── README-t48k-1.png ├── README.Rmd ├── README.md ├── data ├── Aggregation.rda ├── Circle.rda ├── Complex1.rda ├── Complex2.rda ├── Complex3.rda ├── Complex4.rda ├── Compound.rda ├── D31.rda ├── Discrete.rda ├── HiLoDensity.rda ├── JumpingRectangle.rda ├── MovingJumpingRectangle.rda ├── MovingRectangle.rda ├── R15.rda ├── RMouseRectangle.rda ├── Rectangle.rda ├── Ring.rda ├── a1.rda ├── a2.rda ├── a3.rda ├── birch1.rda ├── birch2.rda ├── birch3.rda ├── breast.rda ├── centroids.rda ├── cross.rda ├── d4.rda ├── dim032.rda ├── dim064.rda ├── dim1024.rda ├── dim128.rda ├── dim256.rda ├── dim512.rda ├── face.rda ├── flame.rda ├── glass.rda ├── jain.rda ├── kddcup04bio.rda ├── mopsifinland.rda ├── mopsijoensu.rda ├── nm.rda ├── pathbased.rda ├── pie.rda ├── ring2.rda ├── s1.rda ├── s2.rda ├── s3.rda ├── s4.rda ├── sincos.rda ├── spiral.rda ├── t48k.rda ├── t58k.rda ├── t710k.rda ├── t88k.rda ├── thyroid.rda ├── wdbc.rda ├── wine.rda └── yeast.rda ├── inst ├── doc │ ├── clusteringdatasets.R │ ├── clusteringdatasets.Rmd │ └── clusteringdatasets.html └── extdata │ ├── clusters.Rda │ ├── kdvis.Rda │ └── vises.Rda ├── man ├── asets.Rd ├── birch.Rd ├── chameleon.Rd ├── clusteringdatasets.Rd ├── highdimsets.Rd ├── kddcup04bio.Rd ├── make_blobs.Rd ├── make_moons.Rd ├── mopsi.Rd ├── neuralgas.Rd ├── nonconvex.Rd ├── shapesets.Rd ├── ssets.Rd └── uci.Rd └── vignettes └── clusteringdatasets.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^README-.*\.png$ 5 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clusteringdatasets 2 | Type: Package 3 | Title: Datasets useful for testing clustering algorithms 4 | Version: 0.1.1 5 | Authors@R: person("Amos", "Elberg", email = "amos.elberg@gmail.com", role = c("aut", "cre")) 6 | Description: Nothing fancy - just an R-packaging of some datasets used in well-known papers on clustering algorithms, obtained from http://cs.joensuu.fi/sipu/datasets/. Also imitations of some functions for making toy datasets from Python sklearn. 7 | License: See individual data descriptions 8 | Encoding: UTF-8 9 | LazyData: true 10 | RoxygenNote: 6.0.1 11 | Depends: 12 | R (>= 2.10) 13 | Suggests: knitr, 14 | rmarkdown 15 | VignetteBuilder: knitr 16 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(make_blobs) 4 | export(make_moons) 5 | -------------------------------------------------------------------------------- /R/birch.R: -------------------------------------------------------------------------------- 1 | #' @title BIRCH clustering datasets. 2 | #' 3 | #' @details Synthetic 2-d data with N=100,000 vectors and M=100 clusters 4 | #' See Zhang et al., "BIRCH: A new data clustering algorithm and its applications", Data Mining and Knowledge Discovery, 1 (2), 141-182, 1997. 
5 | #' 6 | #' @format Data frame of x, y coordinates 7 | #' 8 | #' @description Clusters in regular grid structure 9 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 10 | #' @rdname birch 11 | "birch1" 12 | 13 | #' @description Clusters at a sine curve 14 | #' @rdname birch 15 | "birch2" 16 | 17 | #'@description Random sized clusters in random locations 18 | #'@rdname birch 19 | "birch3" 20 | 21 | 22 | #' @title Shape sets 23 | #' 24 | #' @description Various sets of points that form shapes. Good for testing density-based clustering methods. 25 | #' 26 | #' @format Data frame of x, y coordinates and label 27 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 28 | #' @rdname shapesets 29 | #' @references A. Gionis, H. Mannila, and P. Tsaparas, Clustering aggregation. ACM Transactions on Knowledge Discovery from Data (TKDD), 2007. 1(1): p. 1-30. 30 | "Aggregation" 31 | 32 | #' @rdname shapesets 33 | #' @references C.T. Zahn, Graph-theoretical methods for detecting and describing gestalt clusters. IEEE Transactions on Computers, 1971. 100(1): p. 68-86. 34 | "Compound" 35 | 36 | #' @rdname shapesets 37 | #' @references H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203. 38 | "pathbased" 39 | 40 | #' @rdname shapesets 41 | #' @references H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203. 42 | "spiral" 43 | 44 | #' @rdname shapesets 45 | #' @references A. Jain and M. Law, Data clustering: A user's dilemma. Lecture Notes in Computer Science, 2005. 3776: p. 1-10. 46 | "jain" 47 | 48 | #' @rdname shapesets 49 | #' @references L. Fu and E. Medico, FLAME, a novel fuzzy clustering method for the analysis of DNA microarray data. BMC bioinformatics, 2007. 8(1): p. 3. 50 | "flame" 51 | 52 | #' @rdname shapesets 53 | #' @references C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. 
Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280. 54 | "D31" 55 | 56 | #' @rdname shapesets 57 | #' @references C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280. 58 | "R15" 59 | 60 | #' @title S-sets 61 | #' @description Synthetic 2-d data with N=5000 vectors and M=15 Gaussian clusters with different degree of cluster overlapping. Centroids are found in the data object \code{centroids}. 62 | #' @format Data frame of x, y coordinates and labels 63 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 64 | #' @rdname ssets 65 | #' @references P. Fränti and O. Virmajoki, "Iterative shrinking method for clustering problems", Pattern Recognition, 39 (5), 761-765, May 2006. 66 | "s1" 67 | 68 | #' @rdname ssets 69 | "s2" 70 | 71 | #' @rdname ssets 72 | "s3" 73 | 74 | #' @rdname ssets 75 | "s4" 76 | 77 | #' @rdname ssets 78 | "centroids" 79 | 80 | #' @title A-sets 81 | #' @description Synthetic 2-d data with varying number of vectors (N) and clusters (M). There are 150 vectors per cluster. 82 | #' @format Data frame of x, y coordinates 83 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 84 | #' @rdname asets 85 | #' @references I. Kärkkäinen and P. Fränti, "Dynamic local search algorithm for the clustering problem", Research Report A-2002-6 86 | "a1" 87 | 88 | #' @rdname asets 89 | "a2" 90 | 91 | #' @rdname asets 92 | "a3" 93 | 94 | #' @title High-Dim Sets 95 | #' @description Six sets of high-dimensional data, each with 1024 vectors and 16 gaussian clusters. 96 | #' @format Data frames 97 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 98 | #' @references P. Fränti, O. Virmajoki and V. Hautamäki, "Fast agglomerative clustering using a k-nearest neighbor graph", IEEE Trans. on Pattern Analysis and Machine Intelligence, 28 (11), 1875-1881, November 2006. 
99 | #' @rdname highdimsets 100 | "dim032" 101 | 102 | #' @rdname highdimsets 103 | "dim064" 104 | 105 | #' @rdname highdimsets 106 | "dim128" 107 | 108 | #' @rdname highdimsets 109 | "dim256" 110 | 111 | #' @rdname highdimsets 112 | "dim512" 113 | 114 | #' @rdname highdimsets 115 | "dim1024" 116 | 117 | #' @title UCI High Dimensional Datasets 118 | #' @description Various high-dimensional datasets with identifying data removed 119 | #' \itemize{ 120 | #' \item{"thyroid"}{N=215,M=2,D=5} 121 | #' \item{"wine"}{N=178,M=3,D=13} 122 | #' \item{"glass"}{N=214,M=7,D=9} 123 | #' \item{"yeast"}{N=1484,M=10,D=8} 124 | #' \item{"breast"}{N=699,M=2,D=9} 125 | #' \item{"wdbc"}{N=569,M=2,D=32} 126 | #' } 127 | #' @source \url{http://archive.ics.uci.edu/ml/} 128 | #' @format Data frames 129 | #' @rdname uci 130 | "thyroid" 131 | 132 | #' @rdname uci 133 | "wine" 134 | 135 | #' @rdname uci 136 | "glass" 137 | 138 | #' @rdname uci 139 | "yeast" 140 | 141 | #' @rdname uci 142 | "breast" 143 | 144 | #' @rdname uci 145 | "wdbc" 146 | 147 | #' @title Chameleon Datasets 148 | #' @description 4 sets of 2-vectors. 149 | #' \itemize{ 150 | #' \item{"t48k"}{N=8000,M=4,D=2} 151 | #' \item{"t88k"}{N=8000,M=8,D=2} 152 | #' \item{"t710k"}{N=10000,M=7,D=2} 153 | #' \item{"t58k"}{N=8000,M=5,D=2} 154 | #' } 155 | #' @format Data frame 156 | #' @rdname chameleon 157 | #' @references G. Karypis, E.H. Han, V. Kumar, CHAMELEON: A hierarchical clustering algorithm using dynamic modeling, IEEE Trans. on Computers, 32 (8), 68-75, 1999. 
158 | "t48k" 159 | 160 | #' @rdname chameleon 161 | "t88k" 162 | 163 | #' @rdname chameleon 164 | "t710k" 165 | 166 | #' @rdname chameleon 167 | "t58k" 168 | 169 | #' @title KDDCUP04Bio 170 | #' @description 145751 vectors, 74-D 171 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/} 172 | #' @format Data frame 173 | #' @rdname kddcup04bio 174 | "kddcup04bio" 175 | 176 | #' @title Mopsi Data 177 | #' @description User locations, N = 13467 for Finland, N = 6014 for Joensuu 178 | #' @source \url{http://cs.uef.fi/mopsi/data/} 179 | #' @format Data frame 180 | #' @rdname mopsi 181 | "mopsifinland" 182 | 183 | #' @rdname mopsi 184 | "mopsijoensu" 185 | 186 | #' @title Neural Gas 187 | #' @description 13 sets of 2-vectors. 188 | #' \itemize{ 189 | #' \item{"Circle"}{N=5000,D=2} 190 | #' \item{"Complex1"}{N=5000,D=2} 191 | #' \item{"Complex2"}{N=5000,D=2} 192 | #' \item{"Complex3"}{N=5000,D=2} 193 | #' \item{"Complex4"}{N=5000,D=2} 194 | #' \item{"Discrete"}{N=5000,D=2} 195 | #' \item{"HiLoDensity"}{N=5000,D=2} 196 | #' \item{"JumpingRectangle"}{N=5000,D=2} 197 | #' \item{"MovingJumpingRectangle"}{N=5000,D=2} 198 | #' \item{"MovingRectangle"}{N=5000,D=2} 199 | #' \item{"Rectangle"}{N=5000,D=2} 200 | #' \item{"Ring"}{N=5000,D=2} 201 | #' \item{"RMouseRectangle"}{N=5000,D=2} 202 | #' } 203 | #' @format Data frame 204 | #' @rdname neuralgas 205 | "Circle" 206 | 207 | #' @rdname neuralgas 208 | "Complex1" 209 | 210 | #' @rdname neuralgas 211 | "Complex2" 212 | 213 | #' @rdname neuralgas 214 | "Complex3" 215 | 216 | #' @rdname neuralgas 217 | "Complex4" 218 | 219 | #' @rdname neuralgas 220 | "Discrete" 221 | 222 | #' @rdname neuralgas 223 | "HiLoDensity" 224 | 225 | #' @rdname neuralgas 226 | "JumpingRectangle" 227 | 228 | #' @rdname neuralgas 229 | "MovingJumpingRectangle" 230 | 231 | #' @rdname neuralgas 232 | "MovingRectangle" 233 | 234 | #' @rdname neuralgas 235 | "Rectangle" 236 | 237 | #' @rdname neuralgas 238 | "Ring" 239 | 240 | #' @rdname neuralgas 241 | 
"RMouseRectangle" 242 | 243 | #' @title Non-Convex 244 | #' @description 245 | #' \itemize{ 246 | #' \item{"cross"}{N=2000,D=2} 247 | #' \item{"d4"}{N=200,D=2} 248 | #' \item{"face"}{N=500,D=2} 249 | #' \item{"pie"}{N=2322,D=2} 250 | #' \item{"ring2"}{N=60,D=2} 251 | #' \item{"sincos"}{N=300,D=2} 252 | #' } 253 | #' @format Data frame 254 | #' @rdname nonconvex 255 | "cross" 256 | 257 | #' @rdname nonconvex 258 | "d4" 259 | 260 | #' @rdname nonconvex 261 | "face" 262 | 263 | #' @rdname nonconvex 264 | "pie" 265 | 266 | #' @rdname nonconvex 267 | "ring2" 268 | 269 | #' @rdname nonconvex 270 | "sincos" -------------------------------------------------------------------------------- /R/clusteringdatasets-package.r: -------------------------------------------------------------------------------- 1 | #' clusteringdatasets. 2 | #' 3 | #' A set of datasets useful for testing clustering algorithms. 4 | #' 5 | #' @name clusteringdatasets 6 | #' @docType package 7 | NULL 8 | -------------------------------------------------------------------------------- /R/sklearn.R: -------------------------------------------------------------------------------- 1 | #' Make gaussian blobs 2 | #' 3 | #' @param n_samples Number of points 4 | #' @param n_features Dimensionality of dataset 5 | #' @param centers Either the number of centers, or a matrix of the chosen centers 6 | #' @param cluster_std Standard deviation of Gaussian noise. Either one number, or a vector of length equal to the number of centers 7 | #' @param center_box If the centers are being generated, the bounding box within which they will be created. 8 | #' @param shuffle Ignored; included for compatibility with the Python function signature. 9 | #' 10 | #' @description Imitation of the Python \code{sklearn.datasets.make_blobs} function. 11 | #' 12 | #' @return a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the cluster from which each point came. 
#' @export
make_blobs <- function(n_samples = 100, n_features = 2, centers = 3,
                       cluster_std = 1.0, center_box = c(-10, 10),
                       shuffle = TRUE) {
  # `shuffle` is accepted only so the signature mirrors sklearn's; it is never
  # read. Labels are drawn at random below, so rows are not grouped by cluster.

  if (is.matrix(centers)) {
    # Caller supplied explicit centers: one row per cluster.
    if (ncol(centers) != n_features) stop("Dimensionality of centers must equal number of features.")
  } else {
    # Otherwise `centers` is a count; anything else would silently produce a
    # wrongly shaped center matrix, so reject it up front.
    if (!is.numeric(centers) || length(centers) != 1L ||
        is.na(centers) || centers < 1) {
      stop("centers must be either a matrix or a single positive number.")
    }
    if (length(center_box) < 2L) {
      stop("center_box must contain a lower and an upper bound.")
    }
    # Draw the centers uniformly inside the bounding box.
    centers <- runif(
      n = n_features * centers,
      min = center_box[1],
      max = center_box[2]
    )
    centers <- matrix(centers, ncol = n_features)
  }

  # Scalar condition, so use the short-circuiting `&&`.
  if (length(cluster_std) != 1 && length(cluster_std) != nrow(centers)) stop("Cluster_std must be 1 or the same length as the number of clusters")

  # Cluster membership of every sample, drawn uniformly with replacement.
  categories <- sample(nrow(centers), size = n_samples, replace = TRUE)

  # Standard-normal noise, one row per sample.
  starting_points <- matrix(
    rnorm(n = n_samples * n_features),
    ncol = n_features
  )

  # A per-cluster standard deviation is expanded to one value per sample; the
  # vector recycles down the columns, so row i is scaled by its own value.
  if (length(cluster_std) == 1) {
    row_scale <- cluster_std
  } else {
    row_scale <- cluster_std[categories]
  }

  # drop = FALSE keeps the looked-up centers a matrix even for a single sample
  # or a single feature.
  points <- starting_points * row_scale + centers[categories, , drop = FALSE]

  list(
    samples = points,
    labels = categories
  )
}

#' Make two interleaving half-circles
#'
#' @param n_samples Number of points (will be divided equally among the circles)
#' @param shuffle Whether to randomize the sequence
#' @param noise Standard deviation of Gaussian noise applied to point positions
#'
#' @description Imitation of the Python \code{sklearn.datasets.make_moons} function.
#' @return a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the circle from which each point came.
#' @export
make_moons <- function(n_samples = 100, shuffle = TRUE, noise = NA) {
  # Split the samples between the two half-circles; with an odd count the
  # extra point goes to the inner one.
  n_samples_out <- trunc(n_samples / 2)
  n_samples_in <- n_samples - n_samples_out

  # Evenly spaced angles over [0, pi] for each half-circle.
  outer_angles <- seq(from = 0, to = pi, length.out = n_samples_out)
  inner_angles <- seq(from = 0, to = pi, length.out = n_samples_in)

  # Column 1 is x, column 2 is y; outer-circle rows come first.
  points <- matrix(
    c(
      cos(outer_angles),            # Outer circle x
      1 - cos(inner_angles),        # Inner circle x
      sin(outer_angles),            # Outer circle y
      1 - sin(inner_angles) - 0.5   # Inner circle y
    ),
    ncol = 2
  )

  # NA (the default) or NULL both mean "no noise".
  if (!is.null(noise) && !is.na(noise)) {
    points <- points + rnorm(length(points), sd = noise)
  }

  labels <- c(rep(1, n_samples_out), rep(2, n_samples_in))

  if (shuffle) {
    # Apply one permutation to rows and labels so they stay paired.
    # drop = FALSE keeps `samples` a matrix even when n_samples is 1.
    row_order <- sample(x = n_samples, size = n_samples, replace = FALSE)
    points <- points[row_order, , drop = FALSE]
    labels <- labels[row_order]
  }

  list(
    samples = points,
    labels = labels
  )
}

# Repository-listing separators that followed this function in the flattened
# source, retained verbatim as comments:
# --------------------------------------------------------------------------------
# /README-asets-1.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-asets-1.png
# --------------------------------------------------------------------------------
# /README-birch-1.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-birch-1.png
# --------------------------------------------------------------------------------
# /README-highd1-1.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-highd1-1.png
# --------------------------------------------------------------------------------
# /README-mopsi-1.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-mopsi-1.png -------------------------------------------------------------------------------- /README-neuralgas-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-neuralgas-1.png -------------------------------------------------------------------------------- /README-nonconvex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-nonconvex-1.png -------------------------------------------------------------------------------- /README-shapesets-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-shapesets-1.png -------------------------------------------------------------------------------- /README-showkdcupbio-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-showkdcupbio-1.png -------------------------------------------------------------------------------- /README-ssets-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-ssets-1.png -------------------------------------------------------------------------------- /README-t48k-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-t48k-1.png -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | ## Clustering Datasets 16 | 17 | An R-repackaging of datasets useful for evaluating clustering methods. The source for most is http://cs.joensuu.fi/sipu/datasets 18 | 19 | I would love to include additional clustering datasets, if folks would like to provide them or make a PR. 20 | 21 | ```{r child = 'vignettes/clusteringdatasets.Rmd'} 22 | ``` 23 | 24 | ## Sklearn Toy Datasets 25 | 26 | The Python `sklearn.datasets` package includes functions for creating toy datasets. I've ported a few of them. 27 | 28 | ### Make Blobs 29 | 30 | ```{r makeblobs,echo=T} 31 | library(clusteringdatasets) 32 | blobs <- make_blobs(centers=matrix(c(-7, -5, 6, -7, 3, 6), ncol=2)) 33 | plot(blobs$samples, col=rainbow(3)[blobs$labels], xlim=c(-10, 10), ylim=c(-10, 10)) 34 | ``` 35 | 36 | ### Make Moons 37 | 38 | ```{r makemoons,echo=T} 39 | moons <- make_moons(noise=0.04) 40 | plot(moons$samples, col=rainbow(2)[moons$labels]) 41 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## Clustering Datasets 5 | 6 | An R-repackaging of datasets useful for evaluating clustering methods. 7 | The source for most is <http://cs.joensuu.fi/sipu/datasets> 8 | 9 | I would love to include additional clustering datasets, if folks would 10 | like to provide them or make a PR. 11 | 12 | ## Clustering Datasets 13 | 14 | This vignette provides a simple overview of the datasets included in the 15 | package. 
16 | 17 | ### Birch 18 | 19 | ![](README-birch-1.png) 20 | 21 | ### S Sets 22 | 23 | The S-sets are useful for testing how an algorithm handles cluster 24 | overlap. 25 | 26 | ![](README-ssets-1.png) 27 | 28 | ### A Sets 29 | 30 | ![](README-asets-1.png) 31 | 32 | ### Shapesets 33 | 34 | ![](README-shapesets-1.png) 35 | 36 | ### Chameleon 37 | 38 | ![](README-t48k-1.png) 39 | 40 | ### Neural Gas 41 | 42 | ![](README-neuralgas-1.png) 43 | 44 | ### Non-Convex 45 | 46 | ![](README-nonconvex-1.png) 47 | 48 | ## Locations 49 | 50 | ![](README-mopsi-1.png) 51 | 52 | ## High Dimensional Datasets 53 | 54 | The package contains three sets of high-dimensional data. The 55 | visualizations below were made using my `largeVis` package to reduce 56 | each dataset to two dimensions, and the colors are the result of 57 | applying the `hdbscan` function within the package. 58 | 59 | ### UCI Datasets 60 | 61 | ![](README-highd1-1.png) 62 | 63 | ### KDDCUP04Bio 64 | 65 | ![](README-showkdcupbio-1.png) 66 | 67 | ## Sklearn Toy Datasets 68 | 69 | The Python `sklearn.datasets` package includes functions for creating 70 | toy datasets. I’ve ported a few of them. 
71 | 72 | ### Make Blobs 73 | 74 | ``` r 75 | library(clusteringdatasets) 76 | blobs <- make_blobs() 77 | plot(blobs$samples, col=rainbow(3)[blobs$labels]) 78 | ``` 79 | 80 | ![](README-makeblobs-1.png) 81 | 82 | ### Make Moons 83 | 84 | ``` r 85 | moons <- make_moons(noise=0.04) 86 | plot(moons$samples, col=rainbow(2)[moons$labels]) 87 | ``` 88 | 89 | ![](README-makemoons-1.png) 90 | -------------------------------------------------------------------------------- /data/Aggregation.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Aggregation.rda -------------------------------------------------------------------------------- /data/Circle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Circle.rda -------------------------------------------------------------------------------- /data/Complex1.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex1.rda -------------------------------------------------------------------------------- /data/Complex2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex2.rda -------------------------------------------------------------------------------- /data/Complex3.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex3.rda -------------------------------------------------------------------------------- 
/data/Complex4.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex4.rda -------------------------------------------------------------------------------- /data/Compound.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Compound.rda -------------------------------------------------------------------------------- /data/D31.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/D31.rda -------------------------------------------------------------------------------- /data/Discrete.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Discrete.rda -------------------------------------------------------------------------------- /data/HiLoDensity.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/HiLoDensity.rda -------------------------------------------------------------------------------- /data/JumpingRectangle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/JumpingRectangle.rda -------------------------------------------------------------------------------- /data/MovingJumpingRectangle.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/MovingJumpingRectangle.rda -------------------------------------------------------------------------------- /data/MovingRectangle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/MovingRectangle.rda -------------------------------------------------------------------------------- /data/R15.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/R15.rda -------------------------------------------------------------------------------- /data/RMouseRectangle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/RMouseRectangle.rda -------------------------------------------------------------------------------- /data/Rectangle.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Rectangle.rda -------------------------------------------------------------------------------- /data/Ring.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Ring.rda -------------------------------------------------------------------------------- /data/a1.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a1.rda 
-------------------------------------------------------------------------------- /data/a2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a2.rda -------------------------------------------------------------------------------- /data/a3.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a3.rda -------------------------------------------------------------------------------- /data/birch1.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch1.rda -------------------------------------------------------------------------------- /data/birch2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch2.rda -------------------------------------------------------------------------------- /data/birch3.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch3.rda -------------------------------------------------------------------------------- /data/breast.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/breast.rda -------------------------------------------------------------------------------- /data/centroids.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/centroids.rda -------------------------------------------------------------------------------- /data/cross.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/cross.rda -------------------------------------------------------------------------------- /data/d4.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/d4.rda -------------------------------------------------------------------------------- /data/dim032.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim032.rda -------------------------------------------------------------------------------- /data/dim064.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim064.rda -------------------------------------------------------------------------------- /data/dim1024.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim1024.rda -------------------------------------------------------------------------------- /data/dim128.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim128.rda -------------------------------------------------------------------------------- /data/dim256.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim256.rda -------------------------------------------------------------------------------- /data/dim512.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim512.rda -------------------------------------------------------------------------------- /data/face.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/face.rda -------------------------------------------------------------------------------- /data/flame.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/flame.rda -------------------------------------------------------------------------------- /data/glass.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/glass.rda -------------------------------------------------------------------------------- /data/jain.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/jain.rda -------------------------------------------------------------------------------- /data/kddcup04bio.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/kddcup04bio.rda 
-------------------------------------------------------------------------------- /data/mopsifinland.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/mopsifinland.rda -------------------------------------------------------------------------------- /data/mopsijoensu.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/mopsijoensu.rda -------------------------------------------------------------------------------- /data/nm.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/nm.rda -------------------------------------------------------------------------------- /data/pathbased.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/pathbased.rda -------------------------------------------------------------------------------- /data/pie.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/pie.rda -------------------------------------------------------------------------------- /data/ring2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/ring2.rda -------------------------------------------------------------------------------- /data/s1.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s1.rda -------------------------------------------------------------------------------- /data/s2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s2.rda -------------------------------------------------------------------------------- /data/s3.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s3.rda -------------------------------------------------------------------------------- /data/s4.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s4.rda -------------------------------------------------------------------------------- /data/sincos.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/sincos.rda -------------------------------------------------------------------------------- /data/spiral.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/spiral.rda -------------------------------------------------------------------------------- /data/t48k.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t48k.rda -------------------------------------------------------------------------------- /data/t58k.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t58k.rda -------------------------------------------------------------------------------- /data/t710k.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t710k.rda -------------------------------------------------------------------------------- /data/t88k.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t88k.rda -------------------------------------------------------------------------------- /data/thyroid.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/thyroid.rda -------------------------------------------------------------------------------- /data/wdbc.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/wdbc.rda -------------------------------------------------------------------------------- /data/wine.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/wine.rda -------------------------------------------------------------------------------- /data/yeast.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/yeast.rda 
-------------------------------------------------------------------------------- /inst/doc/clusteringdatasets.R: -------------------------------------------------------------------------------- 1 | ## ----setup,echo=F-------------------------------------------------------- 2 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6) 3 | 4 | ## ----birch,fig.height=2,fig.width=6-------------------------------------- 5 | library(clusteringdatasets) 6 | data(birch1) 7 | data(birch2) 8 | data(birch3) 9 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) 10 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n') 11 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 12 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 13 | 14 | ## ----ssets,fig.height=4,fig.width=4-------------------------------------- 15 | data(s1) 16 | data(s2) 17 | data(s3) 18 | data(s4) 19 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) 20 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 21 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 22 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 23 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 24 | 25 | ## ----asets,fig.height=2,fig.width=6-------------------------------------- 26 | data(a1) 27 | data(a2) 28 | data(a3) 29 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) 30 | plot(a1[, 1:2], cex = 0.0001, col = s1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 31 | plot(a2[, 1:2], cex = 0.0001, col = s2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 32 | plot(a3[, 1:2], cex = 0.0001, col = s3$labels, main = "a3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 
33 | 34 | ## ----shapesets,fig.height=6,fig.width=6---------------------------------- 35 | data("Aggregation") 36 | data("spiral") 37 | data("D31") 38 | data(Compound) 39 | data(pathbased) 40 | data(jain) 41 | data(flame) 42 | data(R15) 43 | par(mfrow = c(3, 3), mar = c(0,0,1,0)) 44 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 45 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 46 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 47 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 48 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 49 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 50 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 51 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 52 | 53 | ## ----t48k,fig.height=3,fig.width=3--------------------------------------- 54 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) 55 | data("t48k") 56 | data("t58k") 57 | data("t710k") 58 | data("t88k") 59 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 60 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 61 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 62 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 63 | 64 | ## ----neuralgas----------------------------------------------------------- 65 | names <- c("Circle" , "Complex1" , "Complex2" , "Complex3" , "Complex4" , "Discrete" , 
"HiLoDensity" , "JumpingRectangle" , 66 | "MovingJumpingRectangle", "MovingRectangle", "Rectangle" , "RMouseRectangle" ) # "Ring" 67 | data(list = names) 68 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0)) 69 | for (nm in names) { 70 | plot(eval(parse(text = nm)), cex = 0.01, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 71 | } 72 | 73 | ## ----nonconvex----------------------------------------------------------- 74 | names <- c("cross" , "d4" , "face" , "pie" , "ring2" , "sincos") 75 | data(list = names) 76 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0)) 77 | for (nm in names) { 78 | plot(eval(parse(text = nm)), cex = 0.1, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 79 | } 80 | 81 | ## ----mopsi--------------------------------------------------------------- 82 | data("mopsifinland") 83 | data("mopsijoensu") 84 | par(mfrow = c(1, 2), mar = c(0,0,1,0)) 85 | plot(mopsifinland[, 1:2], cex = 0.01, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 86 | plot(mopsijoensu[, 1:2], cex = 0.05, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 87 | 88 | ## ----toproc,eval=F------------------------------------------------------- 89 | # library(largeVis) 90 | # library(clusteringdatasets) 91 | # library(ggplot2) 92 | # data(glass) 93 | # data(wdbc) 94 | # data(breast) 95 | # data(yeast) 96 | # data(wine) 97 | # data(thyroid) 98 | # toproc <- list(glass, wdbc, breast, yeast, wine, thyroid) 99 | # vises <- list() 100 | # clusters <- list() 101 | # for (i in 1:length(toproc)) { 102 | # dat <- t(scale(as.matrix(toproc[[i]]))) 103 | # if (ncol(dat) < 50000) vis <- largeVis(dat, K = 50, verbose = TRUE) 104 | # else vis <- largeVis(dat, K = 100, verbose = TRUE) 105 | # neighbors <- randomProjectionTreeSearch(dat, K = 50) 106 | # edges <- buildEdgeMatrix(data = dat, neighbors = neighbors) 107 | # print(str(edges)) 108 | # cluster <- hdbscan(edges = edges,neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE) 109 | # vises[[i]] 
<- vis 110 | # clusters[[i]] <- cluster 111 | # } 112 | 113 | ## ----highd1-------------------------------------------------------------- 114 | library(ggplot2) 115 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets")) 116 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets")) 117 | names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid") 118 | par(mfrow = c(2, 3), mar = c(0,0,1,0)) 119 | for (i in 1:length(names)) { 120 | df <- data.frame(t(vises[[i]]$coords)) 121 | colnames(df) <- c("x", "y") 122 | df$label <- clusters[[i]]$clusters 123 | if (length(unique(df$label)) > 1) { 124 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 125 | } else { 126 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 127 | } 128 | } 129 | 130 | ## ----kdcupbio,eval=F----------------------------------------------------- 131 | # data("kddcup04bio") 132 | # library(largeVis) 133 | # load("./kddvis.Rda") 134 | # dat <- t(scale(as.matrix(kddcup04bio))) 135 | # vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE) 136 | 137 | ## ----showkdcupbio-------------------------------------------------------- 138 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets")) 139 | par(mfrow = c(1, 1), mar = c(0,0,1,0)) 140 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 141 | xlim = c(-20, 18), ylim = c(-20, 30)) 142 | 143 | -------------------------------------------------------------------------------- /inst/doc/clusteringdatasets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering Datasets" 3 | author: "Amos Elberg" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{clusteringdatasets} 8 | 
%\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ## Clustering Datasets 13 | 14 | This vignette provides a simple overview of the datasets included in the package. 15 | 16 | ```{r setup,echo=F} 17 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6) 18 | ``` 19 | 20 | ### Birch 21 | 22 | ```{r birch,fig.height=2,fig.width=6} 23 | library(clusteringdatasets) 24 | data(birch1) 25 | data(birch2) 26 | data(birch3) 27 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) 28 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n') 29 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 30 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 31 | ``` 32 | 33 | ### S Sets 34 | 35 | The S-sets are useful for testing how an algorithm handles cluster overlap. 36 | 37 | ```{r ssets,fig.height=4,fig.width=4} 38 | data(s1) 39 | data(s2) 40 | data(s3) 41 | data(s4) 42 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) 43 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 44 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 45 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 46 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 47 | ``` 48 | 49 | 50 | ### A Sets 51 | 52 | ```{r asets,fig.height=2,fig.width=6} 53 | data(a1) 54 | data(a2) 55 | data(a3) 56 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) 57 | plot(a1[, 1:2], cex = 0.0001, col = s1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 58 | plot(a2[, 1:2], cex = 0.0001, col = s2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 59 | plot(a3[, 1:2], cex = 0.0001, col = s3$labels, main = "a3", xlab = NULL, ylab 
= NULL, xaxt='n',yaxt = 'n') 60 | ``` 61 | 62 | ### Shapesets 63 | 64 | ```{r shapesets,fig.height=6,fig.width=6} 65 | data("Aggregation") 66 | data("spiral") 67 | data("D31") 68 | data(Compound) 69 | data(pathbased) 70 | data(jain) 71 | data(flame) 72 | data(R15) 73 | par(mfrow = c(3, 3), mar = c(0,0,1,0)) 74 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 75 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 76 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 77 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 78 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 79 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 80 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 81 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 82 | ``` 83 | 84 | ### Chameleon 85 | 86 | ```{r t48k,fig.height=3,fig.width=3} 87 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) 88 | data("t48k") 89 | data("t58k") 90 | data("t710k") 91 | data("t88k") 92 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 93 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 94 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 95 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 96 | ``` 97 | 98 | ### Neural Gas 99 | 100 | ```{r neuralgas} 101 | names <- c("Circle" , "Complex1" , "Complex2" , "Complex3" , "Complex4" , "Discrete" , 
"HiLoDensity" , "JumpingRectangle" , 102 | "MovingJumpingRectangle", "MovingRectangle", "Rectangle" , "RMouseRectangle" ) # "Ring" 103 | data(list = names) 104 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0)) 105 | for (nm in names) { 106 | plot(eval(parse(text = nm)), cex = 0.01, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 107 | } 108 | ``` 109 | 110 | 111 | ### Non-Convex 112 | 113 | ```{r nonconvex} 114 | names <- c("cross" , "d4" , "face" , "pie" , "ring2" , "sincos") 115 | data(list = names) 116 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0)) 117 | for (nm in names) { 118 | plot(eval(parse(text = nm)), cex = 0.1, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 119 | } 120 | ``` 121 | 122 | ## Locations 123 | 124 | ```{r mopsi} 125 | data("mopsifinland") 126 | data("mopsijoensu") 127 | par(mfrow = c(1, 2), mar = c(0,0,1,0)) 128 | plot(mopsifinland[, 1:2], cex = 0.01, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 129 | plot(mopsijoensu[, 1:2], cex = 0.05, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 130 | ``` 131 | 132 | ## High Dimensional Datasets 133 | 134 | The package contains three sets of high-dimensional data. The visualizations below were made using my `largeVis` package to reduce each dataset to two dimensions, and the colors are the result of applying the `hdbscan` function within the package. 
135 | 136 | ### UCI Datasets 137 | 138 | ```{r toproc,eval=F} 139 | library(largeVis) 140 | library(clusteringdatasets) 141 | library(ggplot2) 142 | data(glass) 143 | data(wdbc) 144 | data(breast) 145 | data(yeast) 146 | data(wine) 147 | data(thyroid) 148 | toproc <- list(glass, wdbc, breast, yeast, wine, thyroid) 149 | vises <- list() 150 | clusters <- list() 151 | for (i in 1:length(toproc)) { 152 | dat <- t(scale(as.matrix(toproc[[i]]))) 153 | if (ncol(dat) < 50000) vis <- largeVis(dat, K = 50, verbose = TRUE) 154 | else vis <- largeVis(dat, K = 100, verbose = TRUE) 155 | neighbors <- randomProjectionTreeSearch(dat, K = 50) 156 | edges <- buildEdgeMatrix(data = dat, neighbors = neighbors) 157 | print(str(edges)) 158 | cluster <- hdbscan(edges = edges,neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE) 159 | vises[[i]] <- vis 160 | clusters[[i]] <- cluster 161 | } 162 | ``` 163 | 164 | ```{r highd1} 165 | library(ggplot2) 166 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets")) 167 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets")) 168 | names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid") 169 | par(mfrow = c(2, 3), mar = c(0,0,1,0)) 170 | for (i in 1:length(names)) { 171 | df <- data.frame(t(vises[[i]]$coords)) 172 | colnames(df) <- c("x", "y") 173 | df$label <- clusters[[i]]$clusters 174 | if (length(unique(df$label)) > 1) { 175 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 176 | } else { 177 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 178 | } 179 | } 180 | ``` 181 | 182 | ### KDDCUP04Bio 183 | 184 | ```{r kdcupbio,eval=F} 185 | data("kddcup04bio") 186 | library(largeVis) 187 | load("./kddvis.Rda") 188 | dat <- t(scale(as.matrix(kddcup04bio))) 189 | vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE) 190 | ``` 191 | 192 | 
```{r showkdcupbio} 193 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets")) 194 | par(mfrow = c(1, 1), mar = c(0,0,1,0)) 195 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 196 | xlim = c(-20, 18), ylim = c(-20, 30)) 197 | ``` 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /inst/extdata/clusters.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/clusters.Rda -------------------------------------------------------------------------------- /inst/extdata/kdvis.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/kdvis.Rda -------------------------------------------------------------------------------- /inst/extdata/vises.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/vises.Rda -------------------------------------------------------------------------------- /man/asets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{a1} 5 | \alias{a1} 6 | \alias{a2} 7 | \alias{a3} 8 | \title{A-sets} 9 | \format{Data frame of x, y coordinates} 10 | \source{ 11 | \url{http://cs.joensuu.fi/sipu/datasets/} 12 | } 13 | \usage{ 14 | a1 15 | 16 | a2 17 | 18 | a3 19 | } 20 | \description{ 21 | Synthetic 2-d data with varying number of vectors (N) and clusters (M). There are 150 vectors per cluster. 22 | } 23 | \references{ 24 | I. 
Kärkkäinen and P. Fränti, "Dynamic local search algorithm for the clustering problem", Research Report A-2002-6 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/birch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{birch1} 5 | \alias{birch1} 6 | \alias{birch2} 7 | \alias{birch3} 8 | \title{BIRCH clustering datasets.} 9 | \format{Data frame of x, y coordinates} 10 | \source{ 11 | \url{http://cs.joensuu.fi/sipu/datasets/} 12 | } 13 | \usage{ 14 | birch1 15 | 16 | birch2 17 | 18 | birch3 19 | } 20 | \description{ 21 | Clusters in regular grid structure 22 | 23 | Clusters at a sine curve 24 | 25 | Random sized clusters in random locations 26 | } 27 | \details{ 28 | Synthetic 2-d data with N=100,000 vectors and M=100 clusters 29 | See Zhang et al., "BIRCH: A new data clustering algorithm and its applications", Data Mining and Knowledge Discovery, 1 (2), 141-182, 1997. 30 | } 31 | \keyword{datasets} 32 | -------------------------------------------------------------------------------- /man/chameleon.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{t48k} 5 | \alias{t48k} 6 | \alias{t88k} 7 | \alias{t710k} 8 | \alias{t58k} 9 | \title{Chameleon Datasets} 10 | \format{Data frame} 11 | \usage{ 12 | t48k 13 | 14 | t88k 15 | 16 | t710k 17 | 18 | t58k 19 | } 20 | \description{ 21 | 4 sets of 2-vectors. 22 | \itemize{ 23 | \item{"t48k"}{N=8000,M=4,D=2} 24 | \item{"t88k"}{N=8000,M=8,D=2} 25 | \item{"t710k"}{N=10000,M=7,D=2} 26 | \item{"t58k"}{N=8000,M=5,D=2} 27 | } 28 | } 29 | \references{ 30 | G. Karypis, E.H. Han, V. 
Kumar, CHAMELEON: A hierarchical clustering algorithm using dynamic modeling, IEEE Trans. on Computers, 32 (8), 68-75, 1999. 31 | } 32 | \keyword{datasets} 33 | -------------------------------------------------------------------------------- /man/clusteringdatasets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clusteringdatasets-package.r 3 | \docType{package} 4 | \name{clusteringdatasets} 5 | \alias{clusteringdatasets} 6 | \alias{clusteringdatasets-package} 7 | \title{clusteringdatasets.} 8 | \description{ 9 | A set of datasets useful for testing clustering algorithms. 10 | } 11 | -------------------------------------------------------------------------------- /man/highdimsets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{dim032} 5 | \alias{dim032} 6 | \alias{dim064} 7 | \alias{dim128} 8 | \alias{dim256} 9 | \alias{dim512} 10 | \alias{dim1024} 11 | \title{High-Dim Sets} 12 | \format{Data frames} 13 | \source{ 14 | \url{http://cs.joensuu.fi/sipu/datasets/} 15 | } 16 | \usage{ 17 | dim032 18 | 19 | dim064 20 | 21 | dim128 22 | 23 | dim256 24 | 25 | dim512 26 | 27 | dim1024 28 | } 29 | \description{ 30 | Six sets of high-dimensional data, each with 1024 vectors and 16 gaussian clusters. 31 | } 32 | \references{ 33 | P. Fränti, O. Virmajoki and V. Hautamäki, "Fast agglomerative clustering using a k-nearest neighbor graph", IEEE Trans. on Pattern Analysis and Machine Intelligence, 28 (11), 1875-1881, November 2006. 
34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/kddcup04bio.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{kddcup04bio} 5 | \alias{kddcup04bio} 6 | \title{KDDCUP04Bio} 7 | \format{Data frame} 8 | \source{ 9 | \url{http://cs.joensuu.fi/sipu/datasets/} 10 | } 11 | \usage{ 12 | kddcup04bio 13 | } 14 | \description{ 15 | 145751 vectors, 74-D 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /man/make_blobs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sklearn.R 3 | \name{make_blobs} 4 | \alias{make_blobs} 5 | \title{Make gaussian blobs} 6 | \usage{ 7 | make_blobs(n_samples = 100, n_features = 2, centers = 3, 8 | cluster_std = 1, center_box = c(-10, 10), shuffle = TRUE) 9 | } 10 | \arguments{ 11 | \item{n_samples}{Number of points} 12 | 13 | \item{n_features}{Dimensionality of dataset} 14 | 15 | \item{centers}{Either the number of centers, or a matrix of the chosen centers} 16 | 17 | \item{cluster_std}{Standard deviation of Gaussian noise. Either one number, or a vector of length equal to the number of centers} 18 | 19 | \item{center_box}{If the centers are being generated, the bounding box within which they will be created.} 20 | 21 | \item{shuffle}{Ignored; included for compatibility with the Python function.} 22 | } 23 | \value{ 24 | a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the cluster from which each point came. 25 | } 26 | \description{ 27 | Imitation of the Python \code{sklearn.datasets.make_blobs} function.
28 | } 29 | -------------------------------------------------------------------------------- /man/make_moons.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sklearn.R 3 | \name{make_moons} 4 | \alias{make_moons} 5 | \title{Make two interleaving half-circles} 6 | \usage{ 7 | make_moons(n_samples = 100, shuffle = TRUE, noise = NA) 8 | } 9 | \arguments{ 10 | \item{n_samples}{Number of points (will be divided equally among the circles)} 11 | 12 | \item{shuffle}{Whether to randomize the sequence} 13 | 14 | \item{noise}{Standard deviation of Gaussian noise applied to point positions} 15 | } 16 | \value{ 17 | a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the circle from which each point came. 18 | } 19 | \description{ 20 | Imitation of the Python \code{sklearn.datasets.make_moons} function. 21 | } 22 | -------------------------------------------------------------------------------- /man/mopsi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{mopsifinland} 5 | \alias{mopsifinland} 6 | \alias{mopsijoensu} 7 | \title{Mopsi Data} 8 | \format{Data frame} 9 | \source{ 10 | \url{http://cs.uef.fi/mopsi/data/} 11 | } 12 | \usage{ 13 | mopsifinland 14 | 15 | mopsijoensu 16 | } 17 | \description{ 18 | User locations, N = 13467 for Finland, N = 6014 for Joensuu 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/neuralgas.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{Circle} 5 | \alias{Circle} 6 | \alias{Complex1} 7 |
\alias{Complex2} 8 | \alias{Complex3} 9 | \alias{Complex4} 10 | \alias{Discrete} 11 | \alias{HiLoDensity} 12 | \alias{JumpingRectangle} 13 | \alias{MovingJumpingRectangle} 14 | \alias{MovingRectangle} 15 | \alias{Rectangle} 16 | \alias{Ring} 17 | \alias{RMouseRectangle} 18 | \title{Neural Gas} 19 | \format{Data frame} 20 | \usage{ 21 | Circle 22 | 23 | Complex1 24 | 25 | Complex2 26 | 27 | Complex3 28 | 29 | Complex4 30 | 31 | Discrete 32 | 33 | HiLoDensity 34 | 35 | JumpingRectangle 36 | 37 | MovingJumpingRectangle 38 | 39 | MovingRectangle 40 | 41 | Rectangle 42 | 43 | Ring 44 | 45 | RMouseRectangle 46 | } 47 | \description{ 48 | 13 sets of 2-vectors. 49 | \itemize{ 50 | \item{"Circle"}{N=5000,D=2} 51 | \item{"Complex1"}{N=5000,D=2} 52 | \item{"Complex2"}{N=5000,D=2} 53 | \item{"Complex3"}{N=5000,D=2} 54 | \item{"Complex4"}{N=5000,D=2} 55 | \item{"Discrete"}{N=5000,D=2} 56 | \item{"HiLoDensity"}{N=5000,D=2} 57 | \item{"JumpingRectangle"}{N=5000,D=2} 58 | \item{"MovingJumpingRectangle"}{N=5000,D=2} 59 | \item{"MovingRectangle"}{N=5000,D=2} 60 | \item{"Rectangle"}{N=5000,D=2} 61 | \item{"Ring"}{N=5000,D=2} 62 | \item{"RMouseRectangle"}{N=5000,D=2} 63 | } 64 | } 65 | \keyword{datasets} 66 | -------------------------------------------------------------------------------- /man/nonconvex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{cross} 5 | \alias{cross} 6 | \alias{d4} 7 | \alias{face} 8 | \alias{pie} 9 | \alias{ring2} 10 | \alias{sincos} 11 | \title{Non-Convex} 12 | \format{Data frame} 13 | \usage{ 14 | cross 15 | 16 | d4 17 | 18 | face 19 | 20 | pie 21 | 22 | ring2 23 | 24 | sincos 25 | } 26 | \description{ 27 | \itemize{ 28 | \item{"cross"}{N=2000,D=2} 29 | \item{"d4"}{N=200,D=2} 30 | \item{"face"}{N=500,D=2} 31 | \item{"pie"}{N=2322,D=2} 32 | \item{"ring2"}{N=60,D=2} 33 | \item{"sincos"}{N=300,D=2} 34 |
} 35 | } 36 | \keyword{datasets} 37 | -------------------------------------------------------------------------------- /man/shapesets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{Aggregation} 5 | \alias{Aggregation} 6 | \alias{Compound} 7 | \alias{pathbased} 8 | \alias{spiral} 9 | \alias{jain} 10 | \alias{flame} 11 | \alias{D31} 12 | \alias{R15} 13 | \title{Shape sets} 14 | \format{Data frame of x, y coordinates and label} 15 | \source{ 16 | \url{http://cs.joensuu.fi/sipu/datasets/} 17 | } 18 | \usage{ 19 | Aggregation 20 | 21 | Compound 22 | 23 | pathbased 24 | 25 | spiral 26 | 27 | jain 28 | 29 | flame 30 | 31 | D31 32 | 33 | R15 34 | } 35 | \description{ 36 | Various sets of points that form shapes. Good for testing density-based clustering methods. 37 | } 38 | \references{ 39 | A. Gionis, H. Mannila, and P. Tsaparas, Clustering aggregation. ACM Transactions on Knowledge Discovery from Data (TKDD), 2007. 1(1): p. 1-30. 40 | 41 | C.T. Zahn, Graph-theoretical methods for detecting and describing gestalt clusters. IEEE Transactions on Computers, 1971. 100(1): p. 68-86. 42 | 43 | H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203. 44 | 45 | H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203. 46 | 47 | A. Jain and M. Law, Data clustering: A user's dilemma. Lecture Notes in Computer Science, 2005. 3776: p. 1-10. 48 | 49 | L. Fu and E. Medico, FLAME, a novel fuzzy clustering method for the analysis of DNA microarray data. BMC bioinformatics, 2007. 8(1): p. 3. 50 | 51 | C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280. 52 | 53 | C.J. Veenman, M.J.T. Reinders, and E. 
Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280. 54 | } 55 | \keyword{datasets} 56 | -------------------------------------------------------------------------------- /man/ssets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{s1} 5 | \alias{s1} 6 | \alias{s2} 7 | \alias{s3} 8 | \alias{s4} 9 | \alias{centroids} 10 | \title{S-sets} 11 | \format{Data frame of x, y coordinates and labels} 12 | \source{ 13 | \url{http://cs.joensuu.fi/sipu/datasets/} 14 | } 15 | \usage{ 16 | s1 17 | 18 | s2 19 | 20 | s3 21 | 22 | s4 23 | 24 | centroids 25 | } 26 | \description{ 27 | Synthetic 2-d data with N=5000 vectors and M=15 Gaussian clusters with different degrees of cluster overlap. The centroids are available in the data object \code{centroids}. 28 | } 29 | \references{ 30 | P. Fränti and O. Virmajoki, "Iterative shrinking method for clustering problems", Pattern Recognition, 39 (5), 761-765, May 2006. 
31 | } 32 | \keyword{datasets} 33 | -------------------------------------------------------------------------------- /man/uci.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/birch.R 3 | \docType{data} 4 | \name{thyroid} 5 | \alias{thyroid} 6 | \alias{wine} 7 | \alias{glass} 8 | \alias{yeast} 9 | \alias{breast} 10 | \alias{wdbc} 11 | \title{UCI High Dimensional Datasets} 12 | \format{Data frames} 13 | \source{ 14 | \url{http://archive.ics.uci.edu/ml/} 15 | } 16 | \usage{ 17 | thyroid 18 | 19 | wine 20 | 21 | glass 22 | 23 | yeast 24 | 25 | breast 26 | 27 | wdbc 28 | } 29 | \description{ 30 | Various high-dimensional datasets with identifying data removed 31 | \itemize{ 32 | \item{"thyroid"}{N=215,M=2,D=5} 33 | \item{"wine"}{N=178,M=3,D=13} 34 | \item{"glass"}{N=214,M=7,D=9} 35 | \item{"yeast"}{N=1484,M=10,D=8} 36 | \item{"breast"}{N=699,M=2,D=9} 37 | \item{"wdbc"}{N=569,M=2,D=32} 38 | } 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /vignettes/clusteringdatasets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering Datasets" 3 | author: "Amos Elberg" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{clusteringdatasets} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ## Clustering Datasets 13 | 14 | This vignette provides a simple overview of the datasets included in the package. 
15 | 16 | ```{r setup,echo=FALSE} 17 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6) # vignette-wide chunk defaults: no output prefix, code hidden, 6x6 figures 18 | ``` 19 | 20 | ### Birch 21 | 22 | ```{r birch,fig.height=2,fig.width=6} 23 | library(clusteringdatasets) 24 | data(birch1) 25 | data(birch2) 26 | data(birch3) 27 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) # one row of three panels, margin kept only for the title 28 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n') 29 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 30 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 31 | ``` 32 | 33 | ### S Sets 34 | 35 | The S-sets are useful for testing how an algorithm handles cluster overlap. 36 | 37 | ```{r ssets,fig.height=4,fig.width=4} 38 | data(s1) 39 | data(s2) 40 | data(s3) 41 | data(s4) 42 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) # 2x2 panel grid, one panel per S-set 43 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 44 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 45 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 46 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 47 | ``` 48 | 49 | 50 | ### A Sets 51 | 52 | ```{r asets,fig.height=2,fig.width=6} 53 | data(a1) 54 | data(a2) 55 | data(a3) 56 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) 57 | plot(a1[, 1:2], cex = 0.0001, col = s1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # NOTE(review): the three A-set panels are coloured with s1/s2/s3 labels, not labels of the A-sets themselves -- confirm this is intended 58 | plot(a2[, 1:2], cex = 0.0001, col = s2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 59 | plot(a3[, 1:2], cex = 0.0001, col = s3$labels, main = "a3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 60 | ``` 61 | 62 | ### Shapesets 63 | 64 | ```{r shapesets,fig.height=6,fig.width=6} 65 | data("Aggregation") 66 | data("spiral") 67 | data("D31") 68 | data(Compound) 69 
| data(pathbased) 70 | data(jain) 71 | data(flame) 72 | data(R15) 73 | par(mfrow = c(3, 3), mar = c(0,0,1,0)) # 3x3 panel grid for the eight shape sets plotted below 74 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 75 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 76 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 77 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 78 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 79 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 80 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 81 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 82 | ``` 83 | 84 | ### Chameleon 85 | 86 | ```{r t48k,fig.height=3,fig.width=3} 87 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) # 2x2 panel grid, one panel per dataset 88 | data("t48k") 89 | data("t58k") 90 | data("t710k") 91 | data("t88k") 92 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 93 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 94 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 95 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 96 | ``` 97 | 98 | ### Neural Gas 99 | 100 | ```{r neuralgas} 101 | names <- c("Circle" , "Complex1" , "Complex2" , "Complex3" , "Complex4" , "Discrete" , "HiLoDensity" , "JumpingRectangle" , 102 | "MovingJumpingRectangle", "MovingRectangle", "Rectangle" , "RMouseRectangle" ) # "Ring" 103 | data(list = names) # load every dataset listed in names 104 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0)) # 3x5 panel grid; 12 datasets are listed, so three panels stay empty 105 
| for (nm in names) { # one panel per dataset name 106 | plot(get(nm), cex = 0.01, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # get() looks the dataset up by name, no need to parse and evaluate text 107 | } 108 | ``` 109 | 110 | 111 | ### Non-Convex 112 | 113 | ```{r nonconvex} 114 | names <- c("cross" , "d4" , "face" , "pie" , "ring2" , "sincos") 115 | data(list = names) 116 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0)) # 2x3 panel grid, one panel per dataset 117 | for (nm in names) { 118 | plot(get(nm), cex = 0.1, main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 119 | } 120 | ``` 121 | 122 | ## Locations 123 | 124 | ```{r mopsi} 125 | data("mopsifinland") 126 | data("mopsijoensu") 127 | par(mfrow = c(1, 2), mar = c(0,0,1,0)) 128 | plot(mopsifinland[, 1:2], cex = 0.01, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 129 | plot(mopsijoensu[, 1:2], cex = 0.05, main = "mopsijoensu", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # title now names the dataset actually plotted 130 | ``` 131 | 132 | ## High Dimensional Datasets 133 | 134 | The package contains three sets of high-dimensional data. The visualizations below were made using my `largeVis` package to reduce each dataset to two dimensions, and the colors are the result of applying the `hdbscan` function within the package. 
135 | 136 | ### UCI Datasets 137 | 138 | ```{r toproc,eval=FALSE} 139 | library(largeVis) 140 | library(clusteringdatasets) 141 | library(ggplot2) 142 | data(glass) 143 | data(wdbc) 144 | data(breast) 145 | data(yeast) 146 | data(wine) 147 | data(thyroid) 148 | toproc <- list(glass, wdbc, breast, yeast, wine, thyroid) 149 | vises <- vector("list", length(toproc)) # preallocate one slot per dataset 150 | clusters <- vector("list", length(toproc)) 151 | for (i in seq_along(toproc)) { 152 | dat <- t(scale(as.matrix(toproc[[i]]))) # scale the columns, then transpose before handing the matrix to largeVis 153 | if (ncol(dat) < 50000) vis <- largeVis(dat, K = 50, verbose = TRUE) 154 | else vis <- largeVis(dat, K = 100, verbose = TRUE) 155 | neighbors <- randomProjectionTreeSearch(dat, K = 50) 156 | edges <- buildEdgeMatrix(data = dat, neighbors = neighbors) 157 | str(edges) # str() prints by itself; wrapping it in print() only added a stray NULL 158 | cluster <- hdbscan(edges = edges,neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE) 159 | vises[[i]] <- vis 160 | clusters[[i]] <- cluster 161 | } 162 | ``` 163 | 164 | ```{r highd1} 165 | library(ggplot2) 166 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets")) 167 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets")) 168 | names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid") 169 | par(mfrow = c(2, 3), mar = c(0,0,1,0)) # 2x3 panel grid, one panel per dataset 170 | for (i in seq_along(names)) { 171 | df <- data.frame(t(vises[[i]]$coords)) 172 | colnames(df) <- c("x", "y") 173 | df$label <- clusters[[i]]$clusters 174 | if (length(unique(df$label)) > 1) { 175 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 176 | } else { # NOTE(review): this branch is identical to the one above -- confirm whether single-cluster results were meant to be drawn differently 177 | plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i], xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') 178 | } 179 | } 180 | ``` 181 | 182 | ### KDDCUP04Bio 183 | 184 | ```{r kdcupbio,eval=FALSE} 185 | data("kddcup04bio") 186 | library(largeVis) 187 | load("./kddvis.Rda") 188 | dat <- t(scale(as.matrix(kddcup04bio))) 189 | vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE) 190 | ``` 191 | 192 | 
```{r showkdcupbio} 193 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets")) # presumably restores the kdvis object plotted below -- verify against the file contents 194 | par(mfrow = c(1, 1), mar = c(0,0,1,0)) # back to a single panel 195 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 196 | xlim = c(-20, 18), ylim = c(-20, 30)) # NOTE(review): df is not created in this chunk, so col = df$label reuses whatever an earlier chunk left behind -- confirm the intended label source 197 | ``` 198 | 199 | 200 | 201 | --------------------------------------------------------------------------------