├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── birch.R
    ├── clusteringdatasets-package.r
    └── sklearn.R
├── README-asets-1.png
├── README-birch-1.png
├── README-highd1-1.png
├── README-mopsi-1.png
├── README-neuralgas-1.png
├── README-nonconvex-1.png
├── README-shapesets-1.png
├── README-showkdcupbio-1.png
├── README-ssets-1.png
├── README-t48k-1.png
├── README.Rmd
├── README.md
├── data
    ├── Aggregation.rda
    ├── Circle.rda
    ├── Complex1.rda
    ├── Complex2.rda
    ├── Complex3.rda
    ├── Complex4.rda
    ├── Compound.rda
    ├── D31.rda
    ├── Discrete.rda
    ├── HiLoDensity.rda
    ├── JumpingRectangle.rda
    ├── MovingJumpingRectangle.rda
    ├── MovingRectangle.rda
    ├── R15.rda
    ├── RMouseRectangle.rda
    ├── Rectangle.rda
    ├── Ring.rda
    ├── a1.rda
    ├── a2.rda
    ├── a3.rda
    ├── birch1.rda
    ├── birch2.rda
    ├── birch3.rda
    ├── breast.rda
    ├── centroids.rda
    ├── cross.rda
    ├── d4.rda
    ├── dim032.rda
    ├── dim064.rda
    ├── dim1024.rda
    ├── dim128.rda
    ├── dim256.rda
    ├── dim512.rda
    ├── face.rda
    ├── flame.rda
    ├── glass.rda
    ├── jain.rda
    ├── kddcup04bio.rda
    ├── mopsifinland.rda
    ├── mopsijoensu.rda
    ├── nm.rda
    ├── pathbased.rda
    ├── pie.rda
    ├── ring2.rda
    ├── s1.rda
    ├── s2.rda
    ├── s3.rda
    ├── s4.rda
    ├── sincos.rda
    ├── spiral.rda
    ├── t48k.rda
    ├── t58k.rda
    ├── t710k.rda
    ├── t88k.rda
    ├── thyroid.rda
    ├── wdbc.rda
    ├── wine.rda
    └── yeast.rda
├── inst
    ├── doc
    │   ├── clusteringdatasets.R
    │   ├── clusteringdatasets.Rmd
    │   └── clusteringdatasets.html
    └── extdata
    │   ├── clusters.Rda
    │   ├── kdvis.Rda
    │   └── vises.Rda
├── man
    ├── asets.Rd
    ├── birch.Rd
    ├── chameleon.Rd
    ├── clusteringdatasets.Rd
    ├── highdimsets.Rd
    ├── kddcup04bio.Rd
    ├── make_blobs.Rd
    ├── make_moons.Rd
    ├── mopsi.Rd
    ├── neuralgas.Rd
    ├── nonconvex.Rd
    ├── shapesets.Rd
    ├── ssets.Rd
    └── uci.Rd
└── vignettes
    └── clusteringdatasets.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^README-.*\.png$
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
6 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: clusteringdatasets
 2 | Type: Package
 3 | Title: Datasets useful for testing clustering algorithms
 4 | Version: 0.1.1
 5 | Authors@R: person("Amos", "Elberg", email = "amos.elberg@gmail.com", role = c("aut", "cre"))
 6 | Description: Nothing fancy - just an R-packaging of some datasets used in well-known papers on clustering algorithms, obtained from http://cs.joensuu.fi/sipu/datasets/. Also includes imitations of some functions for making toy datasets from Python sklearn.
 7 | License: See individual data descriptions
 8 | Encoding: UTF-8
 9 | LazyData: true
10 | RoxygenNote: 6.0.1
11 | Depends:
12 | 	R (>= 2.10)
13 | Suggests: knitr,
14 |     rmarkdown
15 | VignetteBuilder: knitr
16 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | export(make_blobs)
4 | export(make_moons)
5 | 


--------------------------------------------------------------------------------
/R/birch.R:
--------------------------------------------------------------------------------
  1 | #' @title BIRCH clustering datasets.
  2 | #'
  3 | #' @details Synthetic 2-d data with N=100,000 vectors and M=100 clusters.
  4 | #' See Zhang et al., "BIRCH: A new data clustering algorithm and its applications", Data Mining and Knowledge Discovery, 1 (2), 141-182, 1997.
  5 | #'
  6 | #' @format Data frame of x, y coordinates
  7 | #'
  8 | #' @description Clusters in regular grid structure
  9 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
 10 | #' @rdname birch
 11 | "birch1"
 12 | 
 13 | #' @description Clusters at a sine curve
 14 | #' @rdname birch
 15 | "birch2"
 16 | 
 17 | #' @description Random sized clusters in random locations
 18 | #' @rdname birch
 19 | "birch3"
 20 | 
 21 | 
 22 | #' @title Shape sets
 23 | #'
 24 | #' @description Various sets of points that form shapes.  Good for testing density-based clustering methods.
 25 | #'
 26 | #' @format Data frame of x, y coordinates and label
 27 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
 28 | #' @rdname shapesets
 29 | #' @references A. Gionis, H. Mannila, and P. Tsaparas, Clustering aggregation. ACM Transactions on Knowledge Discovery from Data (TKDD), 2007. 1(1): p. 1-30.
 30 | "Aggregation"
 31 | 
 32 | #' @rdname shapesets
 33 | #' @references C.T. Zahn, Graph-theoretical methods for detecting and describing gestalt clusters. IEEE Transactions on Computers, 1971. 100(1): p. 68-86.
 34 | "Compound"
 35 | 
 36 | #' @rdname shapesets
 37 | #' @references H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203.
 38 | "pathbased"
 39 | 
 40 | #' @rdname shapesets
 41 | #' @references H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203.
 42 | "spiral"
 43 | 
 44 | #' @rdname shapesets
 45 | #' @references A. Jain and M. Law, Data clustering: A user's dilemma. Lecture Notes in Computer Science, 2005. 3776: p. 1-10.
 46 | "jain"
 47 | 
 48 | #' @rdname shapesets
 49 | #' @references L. Fu and E. Medico, FLAME, a novel fuzzy clustering method for the analysis of DNA microarray data. BMC bioinformatics, 2007. 8(1): p. 3.
 50 | "flame"
 51 | 
 52 | #' @rdname shapesets
 53 | #' @references C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280.
 54 | "D31"
 55 | 
 56 | #' @rdname shapesets
 57 | #' @references C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280.
 58 | "R15"
 59 | 
 60 | #' @title S-sets
 61 | #' @description Synthetic 2-d data with N=5000 vectors and M=15 Gaussian clusters with different degree of cluster overlapping. Centroids are found in the data object \code{centroids}.
 62 | #' @format Data frame of x, y coordinates and labels
 63 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
 64 | #' @rdname ssets
 65 | #' @references P. Fränti and O. Virmajoki, "Iterative shrinking method for clustering problems", Pattern Recognition, 39 (5), 761-765, May 2006.
 66 | "s1"
 67 | 
 68 | #' @rdname ssets
 69 | "s2"
 70 | 
 71 | #' @rdname ssets
 72 | "s3"
 73 | 
 74 | #' @rdname ssets
 75 | "s4"
 76 | 
 77 | #' @rdname ssets
 78 | "centroids"
 79 | 
 80 | #' @title A-sets
 81 | #' @description Synthetic 2-d data with varying number of vectors (N) and clusters (M). There are 150 vectors per cluster.
 82 | #' @format Data frame of x, y coordinates
 83 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
 84 | #' @rdname asets
 85 | #' @references I. Kärkkäinen and P. Fränti, "Dynamic local search algorithm for the clustering problem", Research Report A-2002-6
 86 | "a1"
 87 | 
 88 | #' @rdname asets
 89 | "a2"
 90 | 
 91 | #' @rdname asets
 92 | "a3"
 93 | 
 94 | #' @title High-Dim Sets
 95 | #' @description Six sets of high-dimensional data, each with 1024 vectors and 16 Gaussian clusters.
 96 | #' @format Data frames
 97 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
 98 | #' @references P. Fränti, O. Virmajoki and V. Hautamäki, "Fast agglomerative clustering using a k-nearest neighbor graph", IEEE Trans. on Pattern Analysis and Machine Intelligence, 28 (11), 1875-1881, November 2006.
 99 | #' @rdname highdimsets
100 | "dim032"
101 | 
102 | #' @rdname highdimsets
103 | "dim064"
104 | 
105 | #' @rdname highdimsets
106 | "dim128"
107 | 
108 | #' @rdname highdimsets
109 | "dim256"
110 | 
111 | #' @rdname highdimsets
112 | "dim512"
113 | 
114 | #' @rdname highdimsets
115 | "dim1024"
116 | 
117 | #' @title UCI High Dimensional Datasets
118 | #' @description Various high-dimensional datasets with identifying data removed
119 | #' \itemize{
120 | #' \item{"thyroid"}{N=215,M=2,D=5}
121 | #' \item{"wine"}{N=178,M=3,D=13}
122 | #' \item{"glass"}{N=214,M=7,D=9}
123 | #' \item{"yeast"}{N=1484,M=10,D=8}
124 | #' \item{"breast"}{N=699,M=2,D=9}
125 | #' \item{"wdbc"}{N=569,M=2,D=32}
126 | #' }
127 | #' @source \url{http://archive.ics.uci.edu/ml/}
128 | #' @format Data frames
129 | #' @rdname uci
130 | "thyroid"
131 | 
132 | #' @rdname uci
133 | "wine"
134 | 
135 | #' @rdname uci
136 | "glass"
137 | 
138 | #' @rdname uci
139 | "yeast"
140 | 
141 | #' @rdname uci
142 | "breast"
143 | 
144 | #' @rdname uci
145 | "wdbc"
146 | 
147 | #' @title Chameleon Datasets
148 | #' @description 4 sets of 2-vectors.
149 | #' \itemize{
150 | #' \item{"t48k"}{N=8000,M=4,D=2}
151 | #' \item{"t88k"}{N=8000,M=8,D=2}
152 | #' \item{"t710k"}{N=10000,M=7,D=2}
153 | #' \item{"t58k"}{N=8000,M=5,D=2}
154 | #' }
155 | #' @format Data frame
156 | #' @rdname chameleon
157 | #' @references G. Karypis, E.H. Han, V. Kumar, CHAMELEON: A hierarchical clustering algorithm using dynamic modeling, IEEE Trans. on Computers, 32 (8), 68-75, 1999.
158 | "t48k"
159 | 
160 | #' @rdname chameleon
161 | "t88k"
162 | 
163 | #' @rdname chameleon
164 | "t710k"
165 | 
166 | #' @rdname chameleon
167 | "t58k"
168 | 
169 | #' @title KDDCUP04Bio
170 | #' @description 145751 vectors, 74-D
171 | #' @source \url{http://cs.joensuu.fi/sipu/datasets/}
172 | #' @format Data frame
173 | #' @rdname kddcup04bio
174 | "kddcup04bio"
175 | 
176 | #' @title Mopsi Data
177 | #' @description User locations, N = 13467 for Finland, N = 6014 for Joensuu
178 | #' @source \url{http://cs.uef.fi/mopsi/data/}
179 | #' @format Data frame
180 | #' @rdname mopsi
181 | "mopsifinland"
182 | 
183 | #' @rdname mopsi
184 | "mopsijoensu"
185 | 
186 | #' @title Neural Gas
187 | #' @description 13 sets of 2-vectors.
188 | #' \itemize{
189 | #' \item{"Circle"}{N=5000,D=2}
190 | #' \item{"Complex1"}{N=5000,D=2}
191 | #' \item{"Complex2"}{N=5000,D=2}
192 | #' \item{"Complex3"}{N=5000,D=2}
193 | #' \item{"Complex4"}{N=5000,D=2}
194 | #' \item{"Discrete"}{N=5000,D=2}
195 | #' \item{"HiLoDensity"}{N=5000,D=2}
196 | #' \item{"JumpingRectangle"}{N=5000,D=2}
197 | #' \item{"MovingJumpingRectangle"}{N=5000,D=2}
198 | #' \item{"MovingRectangle"}{N=5000,D=2}
199 | #' \item{"Rectangle"}{N=5000,D=2}
200 | #' \item{"Ring"}{N=5000,D=2}
201 | #' \item{"RMouseRectangle"}{N=5000,D=2}
202 | #' }
203 | #' @format Data frame
204 | #' @rdname neuralgas
205 | "Circle"
206 | 
207 | #' @rdname neuralgas
208 | "Complex1"
209 | 
210 | #' @rdname neuralgas
211 | "Complex2"
212 | 
213 | #' @rdname neuralgas
214 | "Complex3"
215 | 
216 | #' @rdname neuralgas
217 | "Complex4"
218 | 
219 | #' @rdname neuralgas
220 | "Discrete"
221 | 
222 | #' @rdname neuralgas
223 | "HiLoDensity"
224 | 
225 | #' @rdname neuralgas
226 | "JumpingRectangle"
227 | 
228 | #' @rdname neuralgas
229 | "MovingJumpingRectangle"
230 | 
231 | #' @rdname neuralgas
232 | "MovingRectangle"
233 | 
234 | #' @rdname neuralgas
235 | "Rectangle"
236 | 
237 | #' @rdname neuralgas
238 | "Ring"
239 | 
240 | #' @rdname neuralgas
241 | "RMouseRectangle"
242 | 
243 | #' @title Non-Convex
244 | #' @description
245 | #' \itemize{
246 | #' \item{"cross"}{N=2000,D=2}
247 | #' \item{"d4"}{N=200,D=2}
248 | #' \item{"face"}{N=500,D=2}
249 | #' \item{"pie"}{N=2322,D=2}
250 | #' \item{"ring2"}{N=60,D=2}
251 | #' \item{"sincos"}{N=300,D=2}
252 | #' }
253 | #' @format Data frame
254 | #' @rdname nonconvex
255 | "cross"
256 | 
257 | #' @rdname nonconvex
258 | "d4"
259 | 
260 | #' @rdname nonconvex
261 | "face"
262 | 
263 | #' @rdname nonconvex
264 | "pie"
265 | 
266 | #' @rdname nonconvex
267 | "ring2"
268 | 
269 | #' @rdname nonconvex
270 | "sincos"


--------------------------------------------------------------------------------
/R/clusteringdatasets-package.r:
--------------------------------------------------------------------------------
1 | #' clusteringdatasets.
2 | #'
3 | #'	A set of datasets useful for testing clustering algorithms.
4 | #'
5 | #' @name clusteringdatasets
6 | #' @docType package
7 | NULL
8 | 


--------------------------------------------------------------------------------
/R/sklearn.R:
--------------------------------------------------------------------------------
 1 | #' Make gaussian blobs
 2 | #'
 3 | #' @param n_samples Number of points
 4 | #' @param n_features Dimensionality of dataset. If omitted while \code{centers} is a matrix, it is taken from \code{ncol(centers)}.
 5 | #' @param centers Either the number of centers, or a matrix of the chosen centers (one row per center)
 6 | #' @param cluster_std Standard deviation of Gaussian noise. Either one number, or a vector of length equal to the number of centers
 7 | #' @param center_box If the centers are being generated, the bounding box within which they will be created.
 8 | #' @param shuffle Ignored; included for compatibility with the Python function signature.
 9 | #'
10 | #' @description Imitation of the Python \code{sklearn.datasets.make_blobs} function.
11 | #'
12 | #' @return a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the cluster from which each point came.
13 | #' @export
14 | make_blobs <- function(n_samples = 100, n_features = 2, centers = 3,
15 |                        cluster_std = 1.0, center_box = c(-10, 10),
16 |                        shuffle = TRUE) {
17 |   if (is.matrix(centers)) {
18 |     # Explicit centers fix the dimensionality unless the caller also set it.
19 |     if (missing(n_features)) n_features <- ncol(centers)
20 |     if (ncol(centers) != n_features) stop("Dimensionality of centers must equal number of features.")
21 |   } else {
22 |     # Draw the requested number of centers uniformly inside the bounding box.
23 |     centers <- runif(n = n_features * centers, min = center_box[1], max = center_box[2])
24 |     centers <- matrix(centers, ncol = n_features)
25 |   }
26 | 
27 |   # The condition is scalar, so use the short-circuiting && rather than &.
28 |   if (length(cluster_std) != 1 && length(cluster_std) != nrow(centers)) stop("Cluster_std must be 1 or the same length as the number of clusters")
29 | 
30 |   # Assign every sample to a center, then draw unit Gaussian noise for it.
31 |   categories <- sample(nrow(centers), size = n_samples, replace = TRUE)
32 |   starting_points <- matrix(rnorm(n = n_samples * n_features), ncol = n_features)
33 | 
34 |   # A per-cluster std becomes one value per sample; it recycles down the
35 |   # columns, so row i is scaled by the std of the cluster it belongs to.
36 |   if (length(cluster_std) != 1) cluster_std <- cluster_std[categories]
37 |   points <- starting_points * cluster_std
38 | 
39 |   # drop = FALSE keeps the selected centers a matrix in every case.
40 |   points <- points + centers[categories, , drop = FALSE]
41 |   list(samples = points, labels = categories)
42 | }
43 | 
44 | #' Make two interleaving half-circles
45 | #'
46 | #' @param n_samples Number of points (split as evenly as possible between the two half-circles)
47 | #' @param shuffle Whether to randomize the sequence
48 | #' @param noise Standard deviation of Gaussian noise applied to point positions; \code{NA} or \code{NULL} adds none
49 | #'
50 | #' @description Imitation of the Python \code{sklearn.datasets.make_moons} function.
51 | #' @return a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the half-circle from which each point came.
52 | #' @export
53 | make_moons <- function(n_samples = 100, shuffle = TRUE, noise = NA) {
54 |   # The inner half-circle receives the extra point when n_samples is odd.
55 |   n_samples_out <- trunc(n_samples / 2)
56 |   n_samples_in <- n_samples - n_samples_out
57 | 
58 |   angles_out <- seq(from = 0, to = pi, length.out = n_samples_out)
59 |   angles_in <- seq(from = 0, to = pi, length.out = n_samples_in)
60 | 
61 |   # Column 1 holds x, column 2 holds y; outer rows come before inner rows.
62 |   points <- matrix(c(
63 |     cos(angles_out),          # Outer circle x
64 |     1 - cos(angles_in),       # Inner circle x
65 |     sin(angles_out),          # Outer circle y
66 |     1 - sin(angles_in) - 0.5  # Inner circle y
67 |   ), ncol = 2)
68 | 
69 |   # is.null() is tested first so that noise = NULL also means "no noise".
70 |   if (!is.null(noise) && !is.na(noise)) points <- points + rnorm(length(points), sd = noise)
71 | 
72 |   labels <- c(rep(1, n_samples_out), rep(2, n_samples_in))
73 |   if (shuffle) {
74 |     # drop = FALSE keeps a one-row result a matrix instead of a bare vector.
75 |     order <- sample(x = n_samples, size = n_samples, replace = FALSE)
76 |     points <- points[order, , drop = FALSE]
77 |     labels <- labels[order]
78 |   }
79 |   list(samples = points, labels = labels)
80 | }
81 | 


--------------------------------------------------------------------------------
/README-asets-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-asets-1.png


--------------------------------------------------------------------------------
/README-birch-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-birch-1.png


--------------------------------------------------------------------------------
/README-highd1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-highd1-1.png


--------------------------------------------------------------------------------
/README-mopsi-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-mopsi-1.png


--------------------------------------------------------------------------------
/README-neuralgas-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-neuralgas-1.png


--------------------------------------------------------------------------------
/README-nonconvex-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-nonconvex-1.png


--------------------------------------------------------------------------------
/README-shapesets-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-shapesets-1.png


--------------------------------------------------------------------------------
/README-showkdcupbio-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-showkdcupbio-1.png


--------------------------------------------------------------------------------
/README-ssets-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-ssets-1.png


--------------------------------------------------------------------------------
/README-t48k-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/README-t48k-1.png


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 6 | 
 7 | ```{r, echo = FALSE}
 8 | knitr::opts_chunk$set(
 9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "README-"
12 | )
13 | ```
14 | 
15 | ## Clustering Datasets
16 | 
17 | An R-repackaging of datasets useful for evaluating clustering methods.  The source for most is http://cs.joensuu.fi/sipu/datasets 
18 | 
19 | I would love to include additional clustering datasets, if folks would like to provide them or make a PR.
20 | 
21 | ```{r child = 'vignettes/clusteringdatasets.Rmd'}
22 | ```
23 | 
24 | ## Sklearn Toy Datasets
25 | 
26 | The Python `sklearn.datasets` package includes functions for creating toy datasets.  I've ported a few of them. 
27 | 
28 | ### Make Blobs
29 | 
30 | ```{r makeblobs,echo=T}
31 | library(clusteringdatasets)
32 | blobs <- make_blobs(centers=matrix(c(-7, -5, 6, -7, 3, 6), ncol=2))
33 | plot(blobs$samples, col=rainbow(3)[blobs$labels], xlim=c(-10, 10), ylim=c(-10, 10))
34 | ```
35 | 
36 | ### Make Moons
37 | 
38 | ```{r makemoons,echo=T}
39 | moons <- make_moons(noise=0.04)
40 | plot(moons$samples, col=rainbow(2)[moons$labels])
41 | ```


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 3 | 
 4 | ## Clustering Datasets
 5 | 
 6 | An R-repackaging of datasets useful for evaluating clustering methods.
 7 | The source for most is <http://cs.joensuu.fi/sipu/datasets>
 8 | 
 9 | I would love to include additional clustering datasets, if folks would
10 | like to provide them or make a PR.
11 | 
12 | ## Clustering Datasets
13 | 
14 | This vignette provides a simple overview of the datasets included in the
15 | package.
16 | 
17 | ### Birch
18 | 
19 | ![](README-birch-1.png)<!-- -->
20 | 
21 | ### S Sets
22 | 
23 | The S-sets are useful for testing how an algorithm handles cluster
24 | overlap.
25 | 
26 | ![](README-ssets-1.png)<!-- -->
27 | 
28 | ### A Sets
29 | 
30 | ![](README-asets-1.png)<!-- -->
31 | 
32 | ### Shapesets
33 | 
34 | ![](README-shapesets-1.png)<!-- -->
35 | 
36 | ### Chameleon
37 | 
38 | ![](README-t48k-1.png)<!-- -->
39 | 
40 | ### Neural Gas
41 | 
42 | ![](README-neuralgas-1.png)<!-- -->
43 | 
44 | ### Non-Convex
45 | 
46 | ![](README-nonconvex-1.png)<!-- -->
47 | 
48 | ## Locations
49 | 
50 | ![](README-mopsi-1.png)<!-- -->
51 | 
52 | ## High Dimensional Datasets
53 | 
54 | The package contains three sets of high-dimensional data. The
55 | visualizations below were made using my `largeVis` package to reduce
56 | each dataset to two dimensions, and the colors are the result of
57 | applying the `hdbscan` function within the package.
58 | 
59 | ### UCI Datasets
60 | 
61 | ![](README-highd1-1.png)<!-- -->
62 | 
63 | ### KDDCUP04Bio
64 | 
65 | ![](README-showkdcupbio-1.png)<!-- -->
66 | 
67 | ## Sklearn Toy Datasets
68 | 
69 | The Python `sklearn.datasets` package includes functions for creating
70 | toy datasets. I’ve ported a few of them.
71 | 
72 | ### Make Blobs
73 | 
74 | ``` r
75 | library(clusteringdatasets)
76 | blobs <- make_blobs()
77 | plot(blobs$samples, col=rainbow(3)[blobs$labels])
78 | ```
79 | 
80 | ![](README-makeblobs-1.png)<!-- -->
81 | 
82 | ### Make Moons
83 | 
84 | ``` r
85 | moons <- make_moons(noise=0.04)
86 | plot(moons$samples, col=rainbow(2)[moons$labels])
87 | ```
88 | 
89 | ![](README-makemoons-1.png)<!-- -->
90 | 


--------------------------------------------------------------------------------
/data/Aggregation.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Aggregation.rda


--------------------------------------------------------------------------------
/data/Circle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Circle.rda


--------------------------------------------------------------------------------
/data/Complex1.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex1.rda


--------------------------------------------------------------------------------
/data/Complex2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex2.rda


--------------------------------------------------------------------------------
/data/Complex3.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex3.rda


--------------------------------------------------------------------------------
/data/Complex4.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Complex4.rda


--------------------------------------------------------------------------------
/data/Compound.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Compound.rda


--------------------------------------------------------------------------------
/data/D31.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/D31.rda


--------------------------------------------------------------------------------
/data/Discrete.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Discrete.rda


--------------------------------------------------------------------------------
/data/HiLoDensity.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/HiLoDensity.rda


--------------------------------------------------------------------------------
/data/JumpingRectangle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/JumpingRectangle.rda


--------------------------------------------------------------------------------
/data/MovingJumpingRectangle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/MovingJumpingRectangle.rda


--------------------------------------------------------------------------------
/data/MovingRectangle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/MovingRectangle.rda


--------------------------------------------------------------------------------
/data/R15.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/R15.rda


--------------------------------------------------------------------------------
/data/RMouseRectangle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/RMouseRectangle.rda


--------------------------------------------------------------------------------
/data/Rectangle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Rectangle.rda


--------------------------------------------------------------------------------
/data/Ring.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/Ring.rda


--------------------------------------------------------------------------------
/data/a1.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a1.rda


--------------------------------------------------------------------------------
/data/a2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a2.rda


--------------------------------------------------------------------------------
/data/a3.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/a3.rda


--------------------------------------------------------------------------------
/data/birch1.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch1.rda


--------------------------------------------------------------------------------
/data/birch2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch2.rda


--------------------------------------------------------------------------------
/data/birch3.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/birch3.rda


--------------------------------------------------------------------------------
/data/breast.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/breast.rda


--------------------------------------------------------------------------------
/data/centroids.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/centroids.rda


--------------------------------------------------------------------------------
/data/cross.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/cross.rda


--------------------------------------------------------------------------------
/data/d4.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/d4.rda


--------------------------------------------------------------------------------
/data/dim032.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim032.rda


--------------------------------------------------------------------------------
/data/dim064.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim064.rda


--------------------------------------------------------------------------------
/data/dim1024.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim1024.rda


--------------------------------------------------------------------------------
/data/dim128.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim128.rda


--------------------------------------------------------------------------------
/data/dim256.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim256.rda


--------------------------------------------------------------------------------
/data/dim512.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/dim512.rda


--------------------------------------------------------------------------------
/data/face.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/face.rda


--------------------------------------------------------------------------------
/data/flame.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/flame.rda


--------------------------------------------------------------------------------
/data/glass.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/glass.rda


--------------------------------------------------------------------------------
/data/jain.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/jain.rda


--------------------------------------------------------------------------------
/data/kddcup04bio.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/kddcup04bio.rda


--------------------------------------------------------------------------------
/data/mopsifinland.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/mopsifinland.rda


--------------------------------------------------------------------------------
/data/mopsijoensu.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/mopsijoensu.rda


--------------------------------------------------------------------------------
/data/nm.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/nm.rda


--------------------------------------------------------------------------------
/data/pathbased.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/pathbased.rda


--------------------------------------------------------------------------------
/data/pie.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/pie.rda


--------------------------------------------------------------------------------
/data/ring2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/ring2.rda


--------------------------------------------------------------------------------
/data/s1.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s1.rda


--------------------------------------------------------------------------------
/data/s2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s2.rda


--------------------------------------------------------------------------------
/data/s3.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s3.rda


--------------------------------------------------------------------------------
/data/s4.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/s4.rda


--------------------------------------------------------------------------------
/data/sincos.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/sincos.rda


--------------------------------------------------------------------------------
/data/spiral.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/spiral.rda


--------------------------------------------------------------------------------
/data/t48k.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t48k.rda


--------------------------------------------------------------------------------
/data/t58k.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t58k.rda


--------------------------------------------------------------------------------
/data/t710k.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t710k.rda


--------------------------------------------------------------------------------
/data/t88k.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/t88k.rda


--------------------------------------------------------------------------------
/data/thyroid.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/thyroid.rda


--------------------------------------------------------------------------------
/data/wdbc.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/wdbc.rda


--------------------------------------------------------------------------------
/data/wine.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/wine.rda


--------------------------------------------------------------------------------
/data/yeast.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/data/yeast.rda


--------------------------------------------------------------------------------
/inst/doc/clusteringdatasets.R:
--------------------------------------------------------------------------------
  1 | ## ----setup,echo=F--------------------------------------------------------
  2 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6)
  3 | 
  4 | ## ----birch,fig.height=2,fig.width=6--------------------------------------
  5 | library(clusteringdatasets)
  6 | data(birch1)
  7 | data(birch2)
  8 | data(birch3)
  9 | par(mfrow = c(1, 3), mar = c(0,0,1,0))
 10 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n')
 11 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 12 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 13 | 
 14 | ## ----ssets,fig.height=4,fig.width=4--------------------------------------
 15 | data(s1)
 16 | data(s2)
 17 | data(s3)
 18 | data(s4)
 19 | par(mfrow = c(2, 2), mar = c(0,0,1,0))
 20 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 21 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 22 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 23 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 24 | 
 25 | ## ----asets,fig.height=2,fig.width=6--------------------------------------
 26 | # Colour each A-set by its own labels, not by the unrelated S-set labels.
 27 | # NOTE(review): assumes a1-a3 have a `labels` column like s1-s4 - confirm.
 28 | data(a1, a2, a3)
 29 | par(mfrow = c(1, 3), mar = c(0,0,1,0))
 30 | plot(a1[, 1:2], cex = 0.0001, col = a1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 31 | plot(a2[, 1:2], cex = 0.0001, col = a2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 32 | plot(a3[, 1:2], cex = 0.0001, col = a3$labels, main = "a3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 33 | 
 34 | ## ----shapesets,fig.height=6,fig.width=6----------------------------------
 35 | data("Aggregation")
 36 | data("spiral")
 37 | data("D31")
 38 | data(Compound)
 39 | data(pathbased)
 40 | data(jain)
 41 | data(flame)
 42 | data(R15)
 43 | par(mfrow = c(3, 3), mar = c(0,0,1,0))
 44 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 45 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 46 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 47 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 48 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 49 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 50 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 51 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 52 | 
 53 | ## ----t48k,fig.height=3,fig.width=3---------------------------------------
 54 | par(mfrow = c(2, 2), mar = c(0,0,1,0))
 55 | data("t48k")
 56 | data("t58k")
 57 | data("t710k")
 58 | data("t88k")
 59 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 60 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 61 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 62 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 63 | 
 64 | ## ----neuralgas-----------------------------------------------------------
 65 | names <- c("Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete", "HiLoDensity", "JumpingRectangle",
 66 |            "MovingJumpingRectangle", "MovingRectangle", "Rectangle", "RMouseRectangle") # "Ring"
 67 | data(list = names)  # load every dataset named above
 68 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0))
 69 | for (nm in names) {  # look each loaded dataset up by name instead of parsing code
 70 | 	plot(get(nm), cex = 0.01,  main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 71 | }
 72 | 
 73 | ## ----nonconvex-----------------------------------------------------------
 74 | names <- c("cross", "d4", "face", "pie", "ring2", "sincos")
 75 | data(list = names)  # load every dataset named above
 76 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
 77 | for (nm in names) {  # look each loaded dataset up by name instead of parsing code
 78 | 	plot(get(nm), cex = 0.1,  main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 79 | }
 80 | 
 81 | ## ----mopsi---------------------------------------------------------------
 82 | # Two side-by-side panels, each titled with the dataset it actually shows.
 83 | data("mopsifinland", "mopsijoensu")
 84 | par(mfrow = c(1, 2), mar = c(0,0,1,0))
 85 | plot(mopsifinland[, 1:2], cex = 0.01,  main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 86 | plot(mopsijoensu[, 1:2], cex = 0.05,  main = "mopsijoensu", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 87 | 
 88 | ## ----toproc,eval=F-------------------------------------------------------
 89 | #  library(largeVis)
 90 | #  library(clusteringdatasets)
 91 | #  library(ggplot2)
 92 | #  data(glass)
 93 | #  data(wdbc)
 94 | #  data(breast)
 95 | #  data(yeast)
 96 | #  data(wine)
 97 | #  data(thyroid)
 98 | #  toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)
 99 | #  vises <- list()
100 | #  clusters <- list()
101 | #  for (i in 1:length(toproc)) {
102 | #  	dat <- t(scale(as.matrix(toproc[[i]])))
103 | #  	if (ncol(dat) < 50000) vis <- largeVis(dat, K = 50, verbose = TRUE)
104 | #  	else vis <- largeVis(dat, K = 100, verbose = TRUE)
105 | #  	neighbors <- randomProjectionTreeSearch(dat, K = 50)
106 | #  	edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
107 | #  	print(str(edges))
108 | #  	cluster <- hdbscan(edges = edges,neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE)
109 | #  	vises[[i]] <- vis
110 | #  	clusters[[i]] <- cluster
111 | #  }
112 | 
113 | ## ----highd1--------------------------------------------------------------
114 | library(ggplot2)
115 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))  # expected to provide `vises`
116 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))  # expected to provide `clusters`
117 | names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")
118 | par(mfrow = c(2, 3), mar = c(0,0,1,0))
119 | for (i in seq_along(names)) {
120 | 	df <- data.frame(t(vises[[i]]$coords))
121 | 	colnames(df) <- c("x", "y")
122 | 	df$label <- clusters[[i]]$clusters
123 | 	# The former single-/multi-cluster test ran the same plot call in both
124 | 	# branches, so the conditional is dropped.
125 | 	plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i],
126 | 	     xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
127 | 	# `df` stays defined after the loop; the later kddcup04bio plot reads df$label.
128 | }
129 | 
130 | ## ----kdcupbio,eval=F-----------------------------------------------------
131 | #  data("kddcup04bio")
132 | #  library(largeVis)
133 | #  load("./kddvis.Rda")
134 | #  dat <- t(scale(as.matrix(kddcup04bio)))
135 | #  vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE)
136 | 
137 | ## ----showkdcupbio--------------------------------------------------------
138 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets"))
139 | par(mfrow = c(1, 1), mar = c(0,0,1,0))
140 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 
141 | 		 xlim = c(-20, 18), ylim = c(-20, 30))
142 | 
143 | 


--------------------------------------------------------------------------------
/inst/doc/clusteringdatasets.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Clustering Datasets"
  3 | author: "Amos Elberg"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{clusteringdatasets}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ## Clustering Datasets
 13 | 
 14 | This vignette provides a simple overview of the datasets included in the package. 
 15 | 
 16 | ```{r setup,echo=F}
 17 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6)
 18 | ```
 19 | 
 20 | ### Birch
 21 | 
 22 | ```{r birch,fig.height=2,fig.width=6}
 23 | library(clusteringdatasets)
 24 | data(birch1)
 25 | data(birch2)
 26 | data(birch3)
 27 | par(mfrow = c(1, 3), mar = c(0,0,1,0))
 28 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n')
 29 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 30 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 31 | ```
 32 | 
 33 | ### S Sets
 34 | 
 35 | The S-sets are useful for testing how an algorithm handles cluster overlap.
 36 | 
 37 | ```{r ssets,fig.height=4,fig.width=4}
 38 | data(s1)
 39 | data(s2)
 40 | data(s3)
 41 | data(s4)
 42 | par(mfrow = c(2, 2), mar = c(0,0,1,0))
 43 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 44 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 45 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 46 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 47 | ```
 48 | 
 49 | 
 50 | ### A Sets
 51 | 
 52 | ```{r asets,fig.height=2,fig.width=6}
 53 | # Colour each A-set by its own labels, not by the unrelated S-set labels.
 54 | # NOTE(review): assumes a1-a3 have a `labels` column like s1-s4 - confirm.
 55 | data(a1, a2, a3)
 56 | par(mfrow = c(1, 3), mar = c(0,0,1,0))
 57 | plot(a1[, 1:2], cex = 0.0001, col = a1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 58 | plot(a2[, 1:2], cex = 0.0001, col = a2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 59 | plot(a3[, 1:2], cex = 0.0001, col = a3$labels, main = "a3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 60 | ```
 61 | 
 62 | ### Shapesets
 63 | 
 64 | ```{r shapesets,fig.height=6,fig.width=6}
 65 | data("Aggregation")
 66 | data("spiral")
 67 | data("D31")
 68 | data(Compound)
 69 | data(pathbased)
 70 | data(jain)
 71 | data(flame)
 72 | data(R15)
 73 | par(mfrow = c(3, 3), mar = c(0,0,1,0))
 74 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 75 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 76 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 77 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 78 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 79 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 80 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 81 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 82 | ```
 83 | 
 84 | ### Chameleon
 85 | 
 86 | ```{r t48k,fig.height=3,fig.width=3}
 87 | par(mfrow = c(2, 2), mar = c(0,0,1,0))
 88 | data("t48k")
 89 | data("t58k")
 90 | data("t710k")
 91 | data("t88k")
 92 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 93 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 94 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 95 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 96 | ```
 97 | 
 98 | ### Neural Gas
 99 | 
100 | ```{r neuralgas}
101 | names <- c("Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete", "HiLoDensity", "JumpingRectangle",
102 |            "MovingJumpingRectangle", "MovingRectangle", "Rectangle", "RMouseRectangle") # "Ring"
103 | data(list = names)  # load every dataset named above
104 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0))
105 | for (nm in names) {  # look each loaded dataset up by name instead of parsing code
106 | 	plot(get(nm), cex = 0.01,  main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
107 | }
108 | ```
109 | 
110 | 
111 | ### Non-Convex
112 | 
113 | ```{r nonconvex}
114 | names <- c("cross", "d4", "face", "pie", "ring2", "sincos")
115 | data(list = names)  # load every dataset named above
116 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
117 | for (nm in names) {  # look each loaded dataset up by name instead of parsing code
118 | 	plot(get(nm), cex = 0.1,  main = nm, xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
119 | }
120 | ```
121 | 
122 | ## Locations
123 | 
124 | ```{r mopsi}
125 | # Two side-by-side panels, each titled with the dataset it actually shows.
126 | data("mopsifinland", "mopsijoensu")
127 | par(mfrow = c(1, 2), mar = c(0,0,1,0))
128 | plot(mopsifinland[, 1:2], cex = 0.01,  main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
129 | plot(mopsijoensu[, 1:2], cex = 0.05,  main = "mopsijoensu", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
130 | ```
131 | 
132 | ## High Dimensional Datasets
133 | 
134 | The package contains three sets of high-dimensional data. The visualizations below were made using my `largeVis` package to reduce each dataset to two dimensions, and the colors are the result of applying the `hdbscan` function within the package. 
135 | 
136 | ### UCI Datasets
137 | 
138 | ```{r toproc,eval=F}
139 | library(largeVis)
140 | library(clusteringdatasets)
141 | library(ggplot2)
142 | data(glass)
143 | data(wdbc)
144 | data(breast)
145 | data(yeast)
146 | data(wine)
147 | data(thyroid)
148 | toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)
149 | vises <- list()
150 | clusters <- list()
151 | for (i in 1:length(toproc)) {
152 | 	dat <- t(scale(as.matrix(toproc[[i]])))
153 | 	if (ncol(dat) < 50000) vis <- largeVis(dat, K = 50, verbose = TRUE)
154 | 	else vis <- largeVis(dat, K = 100, verbose = TRUE)
155 | 	neighbors <- randomProjectionTreeSearch(dat, K = 50)
156 | 	edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
157 | 	print(str(edges))
158 | 	cluster <- hdbscan(edges = edges,neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE)
159 | 	vises[[i]] <- vis
160 | 	clusters[[i]] <- cluster
161 | }
162 | ```
163 | 
164 | ```{r highd1}
165 | library(ggplot2)
166 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))  # expected to provide `vises`
167 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))  # expected to provide `clusters`
168 | names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")
169 | par(mfrow = c(2, 3), mar = c(0,0,1,0))
170 | for (i in seq_along(names)) {
171 | 	df <- data.frame(t(vises[[i]]$coords))
172 | 	colnames(df) <- c("x", "y")
173 | 	df$label <- clusters[[i]]$clusters
174 | 	# The former single-/multi-cluster test ran the same plot call in both
175 | 	# branches, so the conditional is dropped.
176 | 	plot(df[, 1:2], cex = 0.005, col = df$label, main = names[i],
177 | 	     xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
178 | 	# `df` stays defined after the loop; the later kddcup04bio plot reads df$label.
179 | }
180 | ```
181 | 
182 | ### KDDCUP04Bio
183 | 
184 | ```{r kdcupbio,eval=F}
185 | data("kddcup04bio")
186 | library(largeVis)
187 | load("./kddvis.Rda")
188 | dat <- t(scale(as.matrix(kddcup04bio)))
189 | vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE)
190 | ```
191 | 
192 | ```{r showkdcupbio}
193 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets"))
194 | par(mfrow = c(1, 1), mar = c(0,0,1,0))
195 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 
196 | 		 xlim = c(-20, 18), ylim = c(-20, 30))
197 | ```
198 | 
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------
/inst/extdata/clusters.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/clusters.Rda


--------------------------------------------------------------------------------
/inst/extdata/kdvis.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/kdvis.Rda


--------------------------------------------------------------------------------
/inst/extdata/vises.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbamos/clusteringdatasets/995d303d1bc70e3b139a13aab04b2cf4e890aa9c/inst/extdata/vises.Rda


--------------------------------------------------------------------------------
/man/asets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{a1}
 5 | \alias{a1}
 6 | \alias{a2}
 7 | \alias{a3}
 8 | \title{A-sets}
 9 | \format{Data frame of x, y coordinates}
10 | \source{
11 | \url{http://cs.joensuu.fi/sipu/datasets/}
12 | }
13 | \usage{
14 | a1
15 | 
16 | a2
17 | 
18 | a3
19 | }
20 | \description{
21 | Synthetic 2-d data with varying number of vectors (N) and clusters (M). There are 150 vectors per cluster.
22 | }
23 | \references{
24 | I. Kärkkäinen and P. Fränti, "Dynamic local search algorithm for the clustering problem", Research Report A-2002-6
25 | }
26 | \keyword{datasets}
27 | 


--------------------------------------------------------------------------------
/man/birch.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{birch1}
 5 | \alias{birch1}
 6 | \alias{birch2}
 7 | \alias{birch3}
 8 | \title{BIRCH clustering datasets.}
 9 | \format{Data frame of x, y coordinates}
10 | \source{
11 | \url{http://cs.joensuu.fi/sipu/datasets/}
12 | }
13 | \usage{
14 | birch1
15 | 
16 | birch2
17 | 
18 | birch3
19 | }
20 | \description{
21 | birch1: Clusters in regular grid structure
22 | 
23 | birch2: Clusters at a sine curve
24 | 
25 | birch3: Random-sized clusters in random locations
26 | }
27 | \details{
28 | Synthetic 2-d data with N=100,000 vectors and M=100 clusters
29 | See Zhang et al., "BIRCH: A new data clustering algorithm and its applications", Data Mining and Knowledge Discovery, 1 (2), 141-182, 1997.
30 | }
31 | \keyword{datasets}
32 | 


--------------------------------------------------------------------------------
/man/chameleon.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{t48k}
 5 | \alias{t48k}
 6 | \alias{t88k}
 7 | \alias{t710k}
 8 | \alias{t58k}
 9 | \title{Chameleon Datasets}
10 | \format{Data frame}
11 | \usage{
12 | t48k
13 | 
14 | t88k
15 | 
16 | t710k
17 | 
18 | t58k
19 | }
20 | \description{
21 | 4 sets of 2-vectors.
22 | \itemize{
23 | \item{"t48k"}{N=8000,M=4,D=2}
24 | \item{"t88k"}{N=8000,M=8,D=2}
25 | \item{"t710k"}{N=10000,M=7,D=2}
26 | \item{"t58k"}{N=8000,M=5,D=2}
27 | }
28 | }
29 | \references{
30 | G. Karypis, E.H. Han, V. Kumar, CHAMELEON: A hierarchical clustering algorithm using dynamic modeling, IEEE Trans. on Computers, 32 (8), 68-75, 1999.
31 | }
32 | \keyword{datasets}
33 | 


--------------------------------------------------------------------------------
/man/clusteringdatasets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clusteringdatasets-package.r
 3 | \docType{package}
 4 | \name{clusteringdatasets}
 5 | \alias{clusteringdatasets}
 6 | \alias{clusteringdatasets-package}
 7 | \title{clusteringdatasets.}
 8 | \description{
 9 | A set of datasets useful for testing clustering algorithms.
10 | }
11 | 


--------------------------------------------------------------------------------
/man/highdimsets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{dim032}
 5 | \alias{dim032}
 6 | \alias{dim064}
 7 | \alias{dim128}
 8 | \alias{dim256}
 9 | \alias{dim512}
10 | \alias{dim1024}
11 | \title{High-Dim Sets}
12 | \format{Data frames}
13 | \source{
14 | \url{http://cs.joensuu.fi/sipu/datasets/}
15 | }
16 | \usage{
17 | dim032
18 | 
19 | dim064
20 | 
21 | dim128
22 | 
23 | dim256
24 | 
25 | dim512
26 | 
27 | dim1024
28 | }
29 | \description{
30 | Six sets of high-dimensional data, each with 1024 vectors and 16 gaussian clusters.
31 | }
32 | \references{
33 | P. Fränti, O. Virmajoki and V. Hautamäki, "Fast agglomerative clustering using a k-nearest neighbor graph", IEEE Trans. on Pattern Analysis and Machine Intelligence, 28 (11), 1875-1881, November 2006.
34 | }
35 | \keyword{datasets}
36 | 


--------------------------------------------------------------------------------
/man/kddcup04bio.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{kddcup04bio}
 5 | \alias{kddcup04bio}
 6 | \title{KDDCUP04Bio}
 7 | \format{Data frame}
 8 | \source{
 9 | \url{http://cs.joensuu.fi/sipu/datasets/}
10 | }
11 | \usage{
12 | kddcup04bio
13 | }
14 | \description{
15 | 145751 vectors, 74-D
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/man/make_blobs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sklearn.R
 3 | \name{make_blobs}
 4 | \alias{make_blobs}
 5 | \title{Make gaussian blobs}
 6 | \usage{
 7 | make_blobs(n_samples = 100, n_features = 2, centers = 3,
 8 |   cluster_std = 1, center_box = c(-10, 10), shuffle = TRUE)
 9 | }
10 | \arguments{
11 | \item{n_samples}{Number of points}
12 | 
13 | \item{n_features}{Dimensionality of dataset}
14 | 
15 | \item{centers}{Either the number of centers, or a matrix of the chosen centers}
16 | 
17 | \item{cluster_std}{Standard deviation of Gaussian noise. Either one number, or a vector of length equal to the number of centers}
18 | 
19 | \item{center_box}{If the centers are being generated, the bounding box within which they will be created.}
20 | 
21 | \item{shuffle}{Ignored; included for compatibility with the Python function}
22 | }
23 | \value{
24 | a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the cluster from which each point came.
25 | }
26 | \description{
27 | Imitation of the Python \code{sklearn.datasets.make_blobs} function.
28 | }
29 | 


--------------------------------------------------------------------------------
/man/make_moons.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sklearn.R
 3 | \name{make_moons}
 4 | \alias{make_moons}
 5 | \title{Make two interleaving half-circles}
 6 | \usage{
 7 | make_moons(n_samples = 100, shuffle = TRUE, noise = NA)
 8 | }
 9 | \arguments{
10 | \item{n_samples}{Number of points (will be divided equally among the circles)}
11 | 
12 | \item{shuffle}{Whether to randomize the sequence}
13 | 
14 | \item{noise}{Standard deviation of Gaussian noise applied to point positions}
15 | }
16 | \value{
17 | a \code{list} containing \code{samples}, a matrix of points, and \code{labels}, which identifies the circle from which each point came.
18 | }
19 | \description{
20 | Imitation of the Python \code{sklearn.datasets.make_moons} function.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/mopsi.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{mopsifinland}
 5 | \alias{mopsifinland}
 6 | \alias{mopsijoensu}
 7 | \title{Mopsi Data}
 8 | \format{Data frame}
 9 | \source{
10 | \url{http://cs.uef.fi/mopsi/data/}
11 | }
12 | \usage{
13 | mopsifinland
14 | 
15 | mopsijoensu
16 | }
17 | \description{
18 | User locations, N = 13467 for Finland, N = 6014 for Joensuu
19 | }
20 | \keyword{datasets}
21 | 


--------------------------------------------------------------------------------
/man/neuralgas.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{Circle}
 5 | \alias{Circle}
 6 | \alias{Complex1}
 7 | \alias{Complex2}
 8 | \alias{Complex3}
 9 | \alias{Complex4}
10 | \alias{Discrete}
11 | \alias{HiLoDensity}
12 | \alias{JumpingRectangle}
13 | \alias{MovingJumpingRectangle}
14 | \alias{MovingRectangle}
15 | \alias{Rectangle}
16 | \alias{Ring}
17 | \alias{RMouseRectangle}
18 | \title{Neural Gas}
19 | \format{Data frame}
20 | \usage{
21 | Circle
22 | 
23 | Complex1
24 | 
25 | Complex2
26 | 
27 | Complex3
28 | 
29 | Complex4
30 | 
31 | Discrete
32 | 
33 | HiLoDensity
34 | 
35 | JumpingRectangle
36 | 
37 | MovingJumpingRectangle
38 | 
39 | MovingRectangle
40 | 
41 | Rectangle
42 | 
43 | Ring
44 | 
45 | RMouseRectangle
46 | }
47 | \description{
48 | 13 sets of 2-D vectors.
49 | \itemize{
50 | \item{"Circle"}{N=5000,D=2}
51 | \item{"Complex1"}{N=5000,D=2}
52 | \item{"Complex2"}{N=5000,D=2}
53 | \item{"Complex3"}{N=5000,D=2}
54 | \item{"Complex4"}{N=5000,D=2}
55 | \item{"Discrete"}{N=5000,D=2}
56 | \item{"HiLoDensity"}{N=5000,D=2}
57 | \item{"JumpingRectangle"}{N=5000,D=2}
58 | \item{"MovingJumpingRectangle"}{N=5000,D=2}
59 | \item{"MovingRectangle"}{N=5000,D=2}
60 | \item{"Rectangle"}{N=5000,D=2}
61 | \item{"Ring"}{N=5000,D=2}
62 | \item{"RMouseRectangle"}{N=5000,D=2}
63 | }
64 | }
65 | \keyword{datasets}
66 | 


--------------------------------------------------------------------------------
/man/nonconvex.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{cross}
 5 | \alias{cross}
 6 | \alias{d4}
 7 | \alias{face}
 8 | \alias{pie}
 9 | \alias{ring2}
10 | \alias{sincos}
11 | \title{Non-Convex}
12 | \format{Data frame}
13 | \usage{
14 | cross
15 | 
16 | d4
17 | 
18 | face
19 | 
20 | pie
21 | 
22 | ring2
23 | 
24 | sincos
25 | }
26 | \description{
27 | \itemize{
28 | \item{"cross"}{N=2000,D=2}
29 | \item{"d4"}{N=200,D=2}
30 | \item{"face"}{N=500,D=2}
31 | \item{"pie"}{N=2322,D=2}
32 | \item{"ring2"}{N=60,D=2}
33 | \item{"sincos"}{N=300,D=2}
34 | }
35 | }
36 | \keyword{datasets}
37 | 


--------------------------------------------------------------------------------
/man/shapesets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{Aggregation}
 5 | \alias{Aggregation}
 6 | \alias{Compound}
 7 | \alias{pathbased}
 8 | \alias{spiral}
 9 | \alias{jain}
10 | \alias{flame}
11 | \alias{D31}
12 | \alias{R15}
13 | \title{Shape sets}
14 | \format{Data frame of x, y coordinates and label}
15 | \source{
16 | \url{http://cs.joensuu.fi/sipu/datasets/}
17 | }
18 | \usage{
19 | Aggregation
20 | 
21 | Compound
22 | 
23 | pathbased
24 | 
25 | spiral
26 | 
27 | jain
28 | 
29 | flame
30 | 
31 | D31
32 | 
33 | R15
34 | }
35 | \description{
36 | Various sets of points that form shapes.  Good for testing density-based clustering methods.
37 | }
38 | \references{
39 | A. Gionis, H. Mannila, and P. Tsaparas, Clustering aggregation. ACM Transactions on Knowledge Discovery from Data (TKDD), 2007. 1(1): p. 1-30.
40 | 
41 | C.T. Zahn, Graph-theoretical methods for detecting and describing gestalt clusters. IEEE Transactions on Computers, 1971. 100(1): p. 68-86.
42 | 
43 | H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203.
44 | 
45 | H. Chang and D.Y. Yeung, Robust path-based spectral clustering. Pattern Recognition, 2008. 41(1): p. 191-203.
46 | 
47 | A. Jain and M. Law, Data clustering: A user's dilemma. Lecture Notes in Computer Science, 2005. 3776: p. 1-10.
48 | 
49 | L. Fu and E. Medico, FLAME, a novel fuzzy clustering method for the analysis of DNA microarray data. BMC bioinformatics, 2007. 8(1): p. 3.
50 | 
51 | C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280.
52 | 
53 | C.J. Veenman, M.J.T. Reinders, and E. Backer, A maximum variance cluster algorithm. IEEE Trans. Pattern Analysis and Machine Intelligence 2002. 24(9): p. 1273-1280.
54 | }
55 | \keyword{datasets}
56 | 


--------------------------------------------------------------------------------
/man/ssets.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{s1}
 5 | \alias{s1}
 6 | \alias{s2}
 7 | \alias{s3}
 8 | \alias{s4}
 9 | \alias{centroids}
10 | \title{S-sets}
11 | \format{Data frame of x, y coordinates and labels}
12 | \source{
13 | \url{http://cs.joensuu.fi/sipu/datasets/}
14 | }
15 | \usage{
16 | s1
17 | 
18 | s2
19 | 
20 | s3
21 | 
22 | s4
23 | 
24 | centroids
25 | }
26 | \description{
27 | Synthetic 2-d data with N=5000 vectors and M=15 Gaussian clusters with different degree of cluster overlapping. Centroids are found in the data object \code{centroids}.
28 | }
29 | \references{
30 | P. Fränti and O. Virmajoki, "Iterative shrinking method for clustering problems", Pattern Recognition, 39 (5), 761-765, May 2006.
31 | }
32 | \keyword{datasets}
33 | 


--------------------------------------------------------------------------------
/man/uci.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/birch.R
 3 | \docType{data}
 4 | \name{thyroid}
 5 | \alias{thyroid}
 6 | \alias{wine}
 7 | \alias{glass}
 8 | \alias{yeast}
 9 | \alias{breast}
10 | \alias{wdbc}
11 | \title{UCI High Dimensional Datasets}
12 | \format{Data frames}
13 | \source{
14 | \url{http://archive.ics.uci.edu/ml/}
15 | }
16 | \usage{
17 | thyroid
18 | 
19 | wine
20 | 
21 | glass
22 | 
23 | yeast
24 | 
25 | breast
26 | 
27 | wdbc
28 | }
29 | \description{
30 | Various high-dimensional datasets with identifying data removed
31 | \itemize{
32 | \item{"thyroid"}{N=215,M=2,D=5}
33 | \item{"wine"}{N=178,M=3,D=13}
34 | \item{"glass"}{N=214,M=7,D=9}
35 | \item{"yeast"}{N=1484,M=10,D=8}
36 | \item{"breast"}{N=699,M=2,D=9}
37 | \item{"wdbc"}{N=569,M=2,D=32}
38 | }
39 | }
40 | \keyword{datasets}
41 | 


--------------------------------------------------------------------------------
/vignettes/clusteringdatasets.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Clustering Datasets"
  3 | author: "Amos Elberg"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{clusteringdatasets}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ## Clustering Datasets
 13 | 
 14 | This vignette provides a simple overview of the datasets included in the package. 
 15 | 
 16 | ```{r setup,echo=FALSE}
 17 | knitr::opts_chunk$set(comment=NA, echo=FALSE, fig.width=6, fig.height=6) # vignette-wide defaults: hide chunk source, no output prefix, 6 x 6 figures
 18 | ```
 19 | 
 20 | ### Birch
 21 | 
 22 | ```{r birch,fig.height=2,fig.width=6}
 23 | library(clusteringdatasets)
 24 | data(birch1)
 25 | data(birch2)
 26 | data(birch3)
 27 | par(mfrow = c(1, 3), mar = c(0,0,1,0)) # one row of three panels; only the top margin (used by the title) is kept
 28 | plot(birch1, cex = 0.0000005, main = "birch1", xlab = "", ylab = NULL, xaxt='n', yaxt = 'n')
 29 | plot(birch2, cex = 0.0000005, main = "birch2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 30 | plot(birch3, cex = 0.0000005, main = "birch3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 31 | ```
 32 | 
 33 | ### S Sets
 34 | 
 35 | The S-sets are useful for testing how an algorithm handles cluster overlap.
 36 | 
 37 | ```{r ssets,fig.height=4,fig.width=4}
 38 | data(s1)
 39 | data(s2)
 40 | data(s3)
 41 | data(s4)
 42 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) # 2 x 2 grid; only the top margin (used by the title) is kept
 43 | plot(s1[, 1:2], cex = 0.0001, col = s1$labels, main = "s1", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # columns 1:2 are plotted; colour comes from the labels column
 44 | plot(s2[, 1:2], cex = 0.0001, col = s2$labels, main = "s2", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 45 | plot(s3[, 1:2], cex = 0.0001, col = s3$labels, main = "s3", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 46 | plot(s4[, 1:2], cex = 0.0001, col = s4$labels, main = "s4", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 47 | ```
 48 | 
 49 | 
 50 | ### A Sets
 51 | 
 52 | ```{r asets,fig.height=2,fig.width=6}
 53 | data(a1)
 54 | data(a2)
 55 | data(a3)
 56 | par(mfrow = c(1, 3), mar = c(0, 0, 1, 0)) # one row of three panels; only the top margin (used by the title) is kept
 57 | plot(a1[, 1:2], cex = 0.0001, col = a1$labels, main = "a1", xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n") # colour by the set's own labels, not the S-set labels; assumes a `labels` column as in the S-sets - TODO confirm
 58 | plot(a2[, 1:2], cex = 0.0001, col = a2$labels, main = "a2", xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
 59 | plot(a3[, 1:2], cex = 0.0001, col = a3$labels, main = "a3", xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
 60 | ```
 61 | 
 62 | ### Shapesets
 63 | 
 64 | ```{r shapesets,fig.height=6,fig.width=6}
 65 | data("Aggregation")
 66 | data("spiral")
 67 | data("D31")
 68 | data(Compound)
 69 | data(pathbased)
 70 | data(jain)
 71 | data(flame)
 72 | data(R15)
 73 | par(mfrow = c(3, 3), mar = c(0,0,1,0)) # 3 x 3 grid for the eight sets; only the top margin (used by the title) is kept
 74 | plot(Aggregation[, 1:2], cex = 0.1, col = Aggregation$label, main = "Aggregation", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # columns 1:2 are plotted; colour comes from the label column
 75 | plot(spiral[, 1:2], cex = 0.1, col = spiral$label, main = "spiral", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 76 | plot(D31[, 1:2], cex = 0.1, col = D31$label, main = "D31", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 77 | plot(Compound[, 1:2], cex = 0.1, col = Compound$label, main = "Compound", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 78 | plot(pathbased[, 1:2], cex = 0.1, col = pathbased$label, main = "pathbased", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 79 | plot(jain[, 1:2], cex = 0.1, col = jain$label, main = "jain", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 80 | plot(flame[, 1:2], cex = 0.1, col = flame$label, main = "flame", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 81 | plot(R15[, 1:2], cex = 0.1, col = R15$label, main = "R15", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 82 | ```
 83 | 
 84 | ### Chameleon
 85 | 
 86 | ```{r t48k,fig.height=3,fig.width=3}
 87 | par(mfrow = c(2, 2), mar = c(0,0,1,0)) # 2 x 2 grid; only the top margin (used by the title) is kept
 88 | data("t48k")
 89 | data("t58k")
 90 | data("t710k")
 91 | data("t88k")
 92 | plot(t48k, cex = 0.001, main = "t48k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n') # whole data frame is plotted, without colouring
 93 | plot(t58k, cex = 0.001, main = "t58k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 94 | plot(t710k, cex = 0.001, main = "t710k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 95 | plot(t88k, cex = 0.001, main = "t88k", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n')
 96 | ```
 97 | 
 98 | ### Neural Gas
 99 | 
100 | ```{r neuralgas}
101 | gas_sets <- c("Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete", "HiLoDensity", "JumpingRectangle",
102 |               "MovingJumpingRectangle", "MovingRectangle", "Rectangle", "RMouseRectangle") # "Ring" is not plotted
103 | data(list = gas_sets)
104 | par(mfrow = c(3, 5), mar = c(0, 0, 1, 0)) # 3 x 5 grid; only the top margin (used by the title) is kept
105 | for (set_name in gas_sets) {
106 |   plot(get(set_name), cex = 0.01, main = set_name, xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n") # get() fetches the loaded data set by name
107 | }
108 | ```
109 | 
110 | 
111 | ### Non-Convex
112 | 
113 | ```{r nonconvex}
114 | shape_sets <- c("cross", "d4", "face", "pie", "ring2", "sincos")
115 | data(list = shape_sets)
116 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0)) # 2 x 3 grid; only the top margin (used by the title) is kept
117 | for (set_name in shape_sets) {
118 |   plot(get(set_name), cex = 0.1, main = set_name, xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n") # get() fetches the loaded data set by name
119 | }
120 | ```
121 | 
122 | ## Locations
123 | 
124 | ```{r mopsi}
125 | data("mopsifinland")
126 | data("mopsijoensu")
127 | par(mfrow = c(1, 2), mar = c(0, 0, 1, 0)) # two panels side by side; only the top margin (used by the title) is kept
128 | plot(mopsifinland[, 1:2], cex = 0.01, main = "mopsifinland", xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
129 | plot(mopsijoensu[, 1:2], cex = 0.05, main = "mopsijoensu", xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n") # title names the Joensuu set that is actually drawn
130 | ```
131 | 
132 | ## High Dimensional Datasets
133 | 
134 | The package contains three sets of high-dimensional data. The visualizations below were made using my `largeVis` package to reduce each dataset to two dimensions, and the colors are the result of applying the `hdbscan` function within the package. 
135 | 
136 | ### UCI Datasets
137 | 
138 | ```{r toproc,eval=FALSE}
139 | library(largeVis)
140 | library(clusteringdatasets)
141 | library(ggplot2)
142 | data(glass)
143 | data(wdbc)
144 | data(breast)
145 | data(yeast)
146 | data(wine)
147 | data(thyroid)
148 | toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)
149 | vises <- vector("list", length(toproc)) # one largeVis result per data set
150 | clusters <- vector("list", length(toproc)) # one hdbscan result per data set
151 | for (i in seq_along(toproc)) {
152 |   dat <- t(scale(as.matrix(toproc[[i]]))) # scale each column, then transpose so rows become columns
153 |   k_vis <- if (ncol(dat) < 50000) 50 else 100 # larger K once there are 50000 or more columns
154 |   vis <- largeVis(dat, K = k_vis, verbose = TRUE)
155 |   neighbors <- randomProjectionTreeSearch(dat, K = 50)
156 |   edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
157 |   str(edges) # str() prints on its own; wrapping it in print() also emitted a stray NULL
158 |   cluster <- hdbscan(edges = edges, neighbors = neighbors, K = 5, minPts = 10, verbose = TRUE)
159 |   vises[[i]] <- vis
160 |   clusters[[i]] <- cluster
161 | }
162 | ```
163 | 
164 | ```{r highd1}
165 | library(ggplot2)
166 | # The loop below reads `vises` and `clusters`, indexed by data set; presumably these two files provide them.
167 | load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))
168 | load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))
169 | set_names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")
170 | par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
171 | for (i in seq_along(set_names)) {
172 |   # coords is transposed so that each row of df is one point with an x and a y value.
173 |   df <- data.frame(t(vises[[i]]$coords))
174 |   colnames(df) <- c("x", "y")
175 |   df$label <- clusters[[i]]$clusters
176 |   # The former if/else on the number of distinct labels ran the same plot() call in
177 |   # both branches, so a single unconditional call draws exactly the same figure.
178 |   plot(df[, 1:2], cex = 0.005, col = df$label, main = set_names[i], xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
179 | }
180 | ```
181 | 
182 | ### KDDCUP04Bio
183 | 
184 | ```{r kdcupbio,eval=FALSE}
185 | data("kddcup04bio") # this chunk is not evaluated when the vignette is knit
186 | library(largeVis)
187 | load("./kddvis.Rda") # NOTE(review): the display chunk loads extdata/kdvis.Rda; this file name is spelled differently - confirm which is intended
188 | dat <- t(scale(as.matrix(kddcup04bio))) # scale each column, then transpose so rows become columns
189 | vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE)
190 | ```
191 | 
192 | ```{r showkdcupbio}
193 | load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets")) # presumably provides the kdvis object plotted below - TODO confirm
194 | par(mfrow = c(1, 1), mar = c(0,0,1,0)) # single panel; only the top margin (used by the title) is kept
195 | plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', # NOTE(review): df is whatever the UCI loop in the highd1 chunk left behind (its last data set), not labels for kdvis - confirm the intended colouring
196 | 		 xlim = c(-20, 18), ylim = c(-20, 30))
197 | ```
198 | 
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------