├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── NAMESPACE
├── NEWS
├── NEWS.md
├── R
├── approx_k.R
├── as_topic.R
├── assign_cluster.R
├── categorize.R
├── clustext-package.R
├── compare.R
├── cosine_distance.R
├── data_store.R
├── get_documents.R
├── get_dtm.R
├── get_removed.R
├── get_terms.R
├── get_text.R
├── hierarchical_cluster.R
├── jaccard_distance.R
├── kmeans_cluster.R
├── nmf_cluster.R
├── skmeans_cluster.R
├── utils.R
└── write_cluster_text.R
├── README.Rmd
├── README.md
├── data
├── assignments.rda
└── presidential_debates_2012.rda
├── inst
├── CITATION
├── additional
│ └── foo_turk.txt
├── build.R
├── extra_scripts
│ └── build_data.R
├── extra_statdoc
│ └── readme.R
├── maintenance.R
└── staticdocs
│ └── index.R
├── man
├── approx_k.Rd
├── as_topic.Rd
├── assign_cluster.Rd
├── assignments.Rd
├── categorize.Rd
├── clustext.Rd
├── compare.Rd
├── cosine_distance.Rd
├── data_store.Rd
├── get_documents.Rd
├── get_dtm.Rd
├── get_removed.Rd
├── get_terms.Rd
├── get_text.Rd
├── hierarchical_cluster.Rd
├── jaccard_distance.Rd
├── kmeans_cluster.Rd
├── nmf_cluster.Rd
├── plot.hierarchical_cluster.Rd
├── presidential_debates_2012.Rd
├── print.as_topic.Rd
├── print.assign_cluster.Rd
├── print.compare.Rd
├── print.data_store.Rd
├── print.get_documents.Rd
├── print.get_terms.Rd
├── skmeans_cluster.Rd
├── summary.assign_cluster.Rd
└── write_cluster_text.Rd
├── tests
├── testthat.R
└── testthat
│ └── test-assign_cluster.R
└── tools
├── clustext_logo
├── r_clustext.png
├── r_clustext.pptx
├── r_clustexta.png
└── resize_icon.txt
└── figure
├── unnamed-chunk-12-1.png
├── unnamed-chunk-14-1.png
├── unnamed-chunk-15-1.png
├── unnamed-chunk-16-1.png
├── unnamed-chunk-17-1.png
├── unnamed-chunk-18-1.png
├── unnamed-chunk-19-1.png
├── unnamed-chunk-20-1.png
├── unnamed-chunk-22-1.png
├── unnamed-chunk-6-1.png
├── unnamed-chunk-6-2.png
├── unnamed-chunk-6-3.png
└── unnamed-chunk-8-1.png
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.gitignore
4 | NEWS.md
5 | FAQ.md
6 | NEWS.html
7 | FAQ.html
8 | ^\.travis\.yml$
9 | travis-tool.sh
10 | inst/web
11 | contributors.geojson
12 | inst/build.R
13 | ^.*\.Rprofile$
14 | README.Rmd
15 | README.R
16 | travis.yml
17 | inst/maintenance.R
18 | Thumb.db
19 | tools/clustext_logo/r_clustexta.png
20 | tools/clustext_logo/r_clustext.pptx
21 | tools/clustext_logo/resize_icon.txt
22 | inst/staticdocs
23 | inst/extra_statdoc
24 | inst/extra_scripts
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 |
4 | # Example code in package build process
5 | *-Ex.R
6 |
7 | .Rprofile
8 | .Rproj.user
9 | clustext.Rproj
10 | Thumbs.db
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 |
3 | sudo: false
4 | before_install:
5 | - sh -e /etc/init.d/xvfb start
6 |
7 | r_github_packages:
8 | - jimhester/covr
9 | - trinker/gofastr
10 | - trinker/textshape
11 | - trinker/termco
12 | notifications:
13 | email:
14 | on_success: change
15 | on_failure: change
16 |
17 | after_success:
18 | - Rscript -e 'covr::coveralls()'
19 |
20 | r_build_args: "--resave-data=best"
21 | r_check_args: "--as-cran"
22 |
23 | env:
24 | global:
25 | - DISPLAY=:99.0
26 | - BOOTSTRAP_LATEX=1
27 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: clustext
2 | Title: Consistent Clustering for Text Data
3 | Version: 0.1.1
4 | Authors@R: c(person("Tyler", "Rinker", email =
5 | "tyler.rinker@gmail.com", role = c("aut", "cre")))
6 | Maintainer: Tyler Rinker
7 | Description: Optimized, consistent tools for clustering text data.
8 | Depends: R (>= 3.2.3)
9 | Imports: dplyr, dynamicTreeCut, fastcluster, gofastr, graphics, Matrix,
10 | mclust, methods, rNMF, skmeans, slam, stats, termco, textshape,
11 | tm
12 | Suggests: testthat
13 | Date: 2017-04-14
14 | License: GPL-2
15 | LazyData: TRUE
16 | Roxygen: list(wrap = FALSE)
17 | RoxygenNote: 5.0.1
18 | BugReports: https://github.com/trinker/clustext/issues?state=open
19 | URL: https://github.com/trinker/clustext
20 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(approx_k,DocumentTermMatrix)
4 | S3method(approx_k,TermDocumentMatrix)
5 | S3method(as_topic,get_terms)
6 | S3method(assign_cluster,hierarchical_cluster)
7 | S3method(assign_cluster,kmeans_cluster)
8 | S3method(assign_cluster,nmf_cluster)
9 | S3method(assign_cluster,skmeans_cluster)
10 | S3method(cosine_distance,DocumentTermMatrix)
11 | S3method(cosine_distance,TermDocumentMatrix)
12 | S3method(get_documents,assign_cluster)
13 | S3method(get_dtm,data_store)
14 | S3method(get_dtm,hierarchical_cluster)
15 | S3method(get_dtm,kmeans_cluster)
16 | S3method(get_dtm,nmf_cluster)
17 | S3method(get_dtm,skmeans_cluster)
18 | S3method(get_removed,data_store)
19 | S3method(get_removed,hierarchical_cluster)
20 | S3method(get_removed,kmeans_cluster)
21 | S3method(get_removed,nmf_cluster)
22 | S3method(get_removed,skmeans_cluster)
23 | S3method(get_terms,assign_cluster_hierarchical)
24 | S3method(get_terms,assign_cluster_kmeans)
25 | S3method(get_terms,assign_cluster_nmf)
26 | S3method(get_terms,assign_cluster_skmeans)
27 | S3method(get_text,assign_cluster)
28 | S3method(get_text,data_store)
29 | S3method(get_text,default)
30 | S3method(get_text,hierarchical_cluster)
31 | S3method(get_text,kmeans_cluster)
32 | S3method(get_text,nmf_cluster)
33 | S3method(get_text,skmeans_cluster)
34 | S3method(hierarchical_cluster,data_store)
35 | S3method(jaccard_distance,DocumentTermMatrix)
36 | S3method(jaccard_distance,TermDocumentMatrix)
37 | S3method(kmeans_cluster,data_store)
38 | S3method(nmf_cluster,data_store)
39 | S3method(plot,hierarchical_cluster)
40 | S3method(print,as_topic)
41 | S3method(print,assign_cluster)
42 | S3method(print,compare)
43 | S3method(print,data_store)
44 | S3method(print,get_documents)
45 | S3method(print,get_terms)
46 | S3method(skmeans_cluster,data_store)
47 | S3method(summary,assign_cluster)
48 | export(approx_k)
49 | export(as_topic)
50 | export(assign_cluster)
51 | export(categorize)
52 | export(compare)
53 | export(cosine_distance)
54 | export(data_store)
55 | export(get_documents)
56 | export(get_dtm)
57 | export(get_removed)
58 | export(get_terms)
59 | export(get_text)
60 | export(hierarchical_cluster)
61 | export(jaccard_distance)
62 | export(kmeans_cluster)
63 | export(nmf_cluster)
64 | export(read_cluster_text)
65 | export(skmeans_cluster)
66 | export(write_cluster_text)
67 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | NEWS
2 | ====
3 |
4 | Versioning
5 | ----------
6 |
7 | Releases will be numbered with the following semantic versioning format:
8 |
9 | ..
10 |
11 | And constructed with the following guidelines:
12 |
13 | * Breaking backward compatibility bumps the major (and resets the minor
14 | and patch)
15 | * New additions without breaking backward compatibility bumps the minor
16 | (and resets the patch)
17 | * Bug fixes and misc changes bumps the patch
18 |
19 |
20 |
21 | clustext 0.1.0-
22 | ----------------------------------------------------------------
23 |
24 | BUG FIXES
25 |
26 | * `join` from an `assign_cluster`'s `attributes`, now explicitly joins on (`by`)
27 | `'id_temporary'`.
28 |
29 | NEW FEATURES
30 |
31 | * `as_topic` added to convert `get_terms` to more print friendly cluster-topic-terms
32 | display.
33 |
34 | * `write_cluster_text` & `read_cluster_text` added to write/read cluster text for
35 | human categorization.
36 |
37 | * `categorize` added to join original data, clusters, and human categories.
38 |
39 | MINOR FEATURES
40 |
41 | IMPROVEMENTS
42 |
43 | CHANGES
44 |
45 | clustext 0.0.1
46 | ----------------------------------------------------------------
47 |
48 | This package is a collection of tools for optimized, consistent clustering of text data.
49 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | NEWS
2 | ====
3 |
4 | Versioning
5 | ----------
6 |
7 | Releases will be numbered with the following semantic versioning format:
8 |
9 | <major>.<minor>.<patch>
10 |
11 | And constructed with the following guidelines:
12 |
13 | * Breaking backward compatibility bumps the major (and resets the minor
14 | and patch)
15 | * New additions without breaking backward compatibility bumps the minor
16 | (and resets the patch)
17 | * Bug fixes and misc changes bumps the patch
18 |
19 |
20 |
21 | clustext 0.1.0-
22 | ----------------------------------------------------------------
23 |
24 | **BUG FIXES**
25 |
26 | * `join` from an `assign_cluster`'s `attributes`, now explicitly joins on (`by`)
27 | `'id_temporary'`.
28 |
29 | **NEW FEATURES**
30 |
31 | * `as_topic` added to convert `get_terms` to more print friendly cluster-topic-terms
32 | display.
33 |
34 | * `write_cluster_text` & `read_cluster_text` added to write/read cluster text for
35 | human categorization.
36 |
37 | * `categorize` added to join original data, clusters, and human categories.
38 |
39 | **MINOR FEATURES**
40 |
41 | **IMPROVEMENTS**
42 |
43 | **CHANGES**
44 |
45 | clustext 0.0.1
46 | ----------------------------------------------------------------
47 |
48 | This package is a collection of tools for optimized, consistent clustering of text data.
--------------------------------------------------------------------------------
/R/approx_k.R:
--------------------------------------------------------------------------------
1 | #' Approximate Number of Clusters for a Text Matrix
2 | #'
3 | #' Can & Ozkarahan (1990) formula for approximating the number of clusters for
4 | #' a text matrix: \eqn{(m * n)/t} where \eqn{m} and \eqn{n} are the dimensions
5 | #' of the matrix and \eqn{t} is the length of the non-zero elements in matrix
6 | #' \eqn{A}.
7 | #'
8 | #' @param x A matrix.
9 | #' @param verbose logical. If \code{TRUE} the k determination is printed.
10 | #' @return Returns an integer.
11 | #' @references Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of
12 | #' the cover-coefficient-based clustering methodology for text databases.
13 | #' ACM Transactions on Database Systems 15 (4): 483. doi:10.1145/99935.99938. \cr
14 | #' @rdname approx_k
15 | #' @export
16 | #' @examples
17 | #' library(gofastr)
18 | #' library(dplyr)
19 | #'
20 | #' presidential_debates_2012 %>%
21 | #' with(q_dtm(dialogue)) %>%
22 | #' approx_k()
23 | approx_k <- function(x, verbose = TRUE){
24 |
25 | UseMethod("approx_k")
26 |
27 | }
28 |
29 | #' @export
30 | #' @rdname approx_k
31 | #' @method approx_k TermDocumentMatrix
32 | approx_k.TermDocumentMatrix <- function(x, verbose = TRUE) {
33 | m <- round(do.call("*", as.list(dim(x)))/length(x[["v"]]))
34 | if (verbose) cat(sprintf("\nk approximated to: %s\n", m))
35 | m
36 | }
37 |
38 | #' @export
39 | #' @rdname approx_k
40 | #' @method approx_k DocumentTermMatrix
41 | approx_k.DocumentTermMatrix <- function(x, verbose = TRUE) {
42 | m <- round(do.call("*", as.list(dim(x)))/length(x[["v"]]))
43 | if (verbose) cat(sprintf("\nk approximated to: %s\n", m))
44 | m
45 | }
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/R/as_topic.R:
--------------------------------------------------------------------------------
1 | #' Convert \code{get_terms} to Topics
2 | #'
3 | #' View important terms as a comma separated string (a topic).
4 | #'
5 | #' @param x A \code{get_terms} object.
6 | #' @param max.n The max number of words to show before truncation.
7 | #' @param sort logical. If \code{TRUE} the cluster topics are sorted by size
8 | #' (number of documents) otherwise the topics are sorted by cluster number.
9 | #' @param \ldots ignored.
10 | #' @return Returns a \code{\link[base]{data.frame}} of \code{"cluster"},
11 | #' \code{"count"}, and \code{"terms"}. Pretty prints as clusters, number of
12 | #' documents, and associated important terms.
13 | #' @export
14 | #' @examples
15 | #' library(dplyr)
16 | #'
17 | #' myfit5 <- presidential_debates_2012 %>%
18 | #' mutate(tot = gsub("\\..+$", "", tot)) %>%
19 | #' textshape::combine() %>%
20 | #' filter(person %in% c("ROMNEY", "OBAMA")) %>%
21 | #' with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>%
22 | #' hierarchical_cluster()
23 | #'
24 | #' ca5 <- assign_cluster(myfit5, k = 50)
25 | #'
26 | #' get_terms(ca5, .4) %>%
27 | #' as_topic()
28 | #'
29 | #' get_terms(ca5, .4) %>%
30 | #' as_topic(sort=FALSE)
31 | #'
32 | #' get_terms(ca5, .95) %>%
33 | #' as_topic()
34 | as_topic <- function(x, max.n = 8, sort = TRUE, ...){
35 | UseMethod("as_topic")
36 | }
37 |
38 |
39 | #' @export
40 | #' @rdname as_topic
41 | #' @method as_topic get_terms
42 | as_topic.get_terms <- function(x, max.n = 8, sort = TRUE, ...){
43 |
44 | cluster <- NULL
45 |
46 | terms <- lapply(x, function(x) {
47 | if (is.null(x)) return(NA)
48 | trms <- x[['term']]
49 | if (length(trms) > max.n) {
50 | paste0(paste(trms[1:max.n], collapse= ", "), "...")
51 | } else {
52 | paste(trms, collapse= ", ")
53 | }
54 | })
55 |
56 | dat <- dplyr::left_join(
57 | summary(attributes(x)[["assignment"]], plot=FALSE),
58 | textshape::tidy_list(terms, "cluster", "terms"),
59 | by = 'cluster'
60 | )
61 | if (!isTRUE(sort)){
62 | dat <- dplyr::arrange(dat, as.numeric(cluster))
63 | }
64 | class(dat) <- c('as_topic', class(dat))
65 | dat
66 | }
67 |
68 |
69 | #' Prints an as_topic Object
70 | #'
71 | #' Prints an as_topic object
72 | #'
73 | #' @param x An as_topic object.
74 | #' @param \ldots ignored.
75 | #' @method print as_topic
76 | #' @export
77 | print.as_topic <- function(x, ...){
78 | cat(paste(sprintf("Cluster %s (n=%s): %s", x[['cluster']], x[['count']],
79 | x[['terms']]), collapse="\n"), "\n")
80 | }
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/R/assign_cluster.R:
--------------------------------------------------------------------------------
1 | #' Assign Clusters to Documents/Text Elements
2 | #'
3 | #' Assign clusters to documents/text elements.
4 | #'
5 | #' @param x a \code{xxx_cluster} object.
6 | #' @param k The number of clusters (can supply \code{h} instead). Defaults to
7 | #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced
8 | #' by \code{data_storage}.
9 | #' @param h The height at which to cut the dendrograms (determines number of
10 | #' clusters). If this argument is supplied \code{k} is ignored.
11 | #' @param cut The type of cut method to use for \code{hierarchical_cluster}; one
12 | #' of \code{'static'}, \code{'dynamic'} or \code{'iterative'}.
13 | #' @param deepSplit logical. See \code{\link[dynamicTreeCut]{cutreeDynamic}}.
14 | #' @param minClusterSize The minimum cluster size. See
15 | #' \code{\link[dynamicTreeCut]{cutreeDynamic}}.
16 | #' @param \ldots ignored.
17 | #' @return Returns an \code{assign_cluster} object; a named vector of cluster
18 | #' assignments with documents as names. The object also contains the original
19 | #' \code{data_storage} object and a \code{join} function. \code{join} is a
20 | #' function (a closure) that captures information about the \code{assign_cluster}
21 | #' that makes rejoining to the original data set simple. The user simply
22 | #' supplies the original data set as an argument to \code{join}
23 | #' (\code{attributes(FROM_ASSIGN_CLUSTER)$join(ORIGINAL_DATA)}).
24 | #' @rdname assign_cluster
25 | #' @export
26 | #' @examples
27 | #' \dontrun{
28 | #' library(dplyr)
29 | #'
30 | #' x <- with(
31 | #' presidential_debates_2012,
32 | #' data_store(dialogue, paste(person, time, sep = "_"))
33 | #' )
34 | #'
35 | #' hierarchical_cluster(x) %>%
36 | #' plot(h=.7, lwd=2)
37 | #'
38 | #' hierarchical_cluster(x) %>%
39 | #' assign_cluster(h=.7)
40 | #'
41 | #' hierarchical_cluster(x, method="complete") %>%
42 | #' plot(k=6)
43 | #'
44 | #' hierarchical_cluster(x) %>%
45 | #' assign_cluster(k=6)
46 | #'
47 | #'
48 | #' x2 <- presidential_debates_2012 %>%
49 | #' with(data_store(dialogue)) %>%
50 | #' hierarchical_cluster()
51 | #'
52 | #' ca2 <- assign_cluster(x2, k = 55)
53 | #' summary(ca2)
54 | #'
55 | #' ## Dynamic cut
56 | #' ca3 <- assign_cluster(x2, cut = 'dynamic', minClusterSize = 5)
57 | #' get_text(ca3)
58 | #'
59 | #' ## add to original data
60 | #' attributes(ca2)$join(presidential_debates_2012)
61 | #'
62 | #' ## split text into clusters
63 | #' get_text(ca2)
64 | #'
65 | #' ## Kmeans Algorithm
66 | #' kmeans_cluster(x, k=6) %>%
67 | #' assign_cluster()
68 | #'
69 | #' x3 <- presidential_debates_2012 %>%
70 | #' with(data_store(dialogue)) %>%
71 | #' kmeans_cluster(55)
72 | #'
73 | #' ca3 <- assign_cluster(x3)
74 | #' summary(ca3)
75 | #'
76 | #' ## split text into clusters
77 | #' get_text(ca3)
78 | #' }
79 | assign_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL, ...){
80 | UseMethod("assign_cluster")
81 | }
82 |
83 |
84 | #' @export
85 | #' @rdname assign_cluster
86 | #' @method assign_cluster hierarchical_cluster
87 | assign_cluster.hierarchical_cluster <- function(x, k = approx_k(get_dtm(x)),
88 | h = NULL, cut = 'static', deepSplit = TRUE, minClusterSize = 1, ...){
89 |
90 | id_temporary <- n <- NULL
91 |
92 |
93 | switch(cut,
94 |
95 | static = {
96 | if (!is.null(h)){
97 | out <- stats::cutree(x, h=h)
98 | } else {
99 | out <- stats::cutree(x, k=k)
100 | }
101 | },
102 | dynamic = {
103 | y <- x
104 | attributes(y)[['text_data_store']] <- NULL
105 | class(y) <- 'hclust'
106 | out <- dynamicTreeCut::cutreeDynamic(
107 | dendro = y,
108 | cutHeight = NULL,
109 | minClusterSize = minClusterSize,
110 | method = "tree",
111 | deepSplit = deepSplit,
112 | ...
113 | )
114 | names(out) <- y[['labels']]
115 | },
116 | iterative = {
117 | stop("'iterative', method not implemented yet;\n Use \'static\' or \'dynamic\'")
118 | },
119 | stop('`cut` must be one of: \'static\', \'dynamic\' or \'iterative\'')
120 | )
121 |
122 | orig <- attributes(x)[['text_data_store']][['data']]
123 | lens <- length(orig[['text']]) + length(orig[['removed']])
124 |
125 | class(out) <- c("assign_cluster_hierarchical", "assign_cluster", class(out))
126 |
127 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]]
128 | attributes(out)[["model"]] <- x
129 | attributes(out)[["algorithm"]] <- 'hierarchical'
130 | vect <- c(out)
131 | attributes(out)[["join"]] <- function(x) {
132 |
133 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x)))
134 |
135 | dplyr::select(
136 | dplyr::left_join(
137 | dplyr::mutate(x, id_temporary = as.character(1:n())),
138 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') ),
139 | by = 'id_temporary'
140 | ),
141 | -id_temporary
142 | )
143 | }
144 | out
145 |
146 | }
147 |
148 |
149 | #' @export
150 | #' @rdname assign_cluster
151 | #' @method assign_cluster kmeans_cluster
152 | assign_cluster.kmeans_cluster <- function(x, ...){
153 |
154 | out <- x[['cluster']]
155 | n <- id_temporary <- NULL
156 | orig <- attributes(x)[['text_data_store']][['data']]
157 | lens <- length(orig[['text']]) + length(orig[['removed']])
158 |
159 | class(out) <- c("assign_cluster_kmeans","assign_cluster", class(out))
160 |
161 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]]
162 | attributes(out)[["model"]] <- x
163 | attributes(out)[["algorithm"]] <- 'kmeans'
164 | vect <- c(out)
165 | attributes(out)[["join"]] <- function(x) {
166 |
167 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x)))
168 |
169 | dplyr::select(
170 | dplyr::left_join(
171 | dplyr::mutate(x, id_temporary = as.character(1:n())),
172 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') )
173 | ),
174 | -id_temporary
175 | )
176 | }
177 | out
178 |
179 | }
180 |
181 |
182 |
183 | #' @export
184 | #' @rdname assign_cluster
185 | #' @method assign_cluster skmeans_cluster
186 | assign_cluster.skmeans_cluster <- function(x, ...){
187 |
188 | out <- x[['cluster']]
189 | n <- id_temporary <- NULL
190 | orig <- attributes(x)[['text_data_store']][['data']]
191 | lens <- length(orig[['text']]) + length(orig[['removed']])
192 |
193 | class(out) <- c("assign_cluster_skmeans","assign_cluster", class(out))
194 |
195 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]]
196 | attributes(out)[["model"]] <- x
197 | attributes(out)[["algorithm"]] <- 'skmeans'
198 | vect <- c(out)
199 | attributes(out)[["join"]] <- function(x) {
200 |
201 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x)))
202 |
203 | dplyr::select(
204 | dplyr::left_join(
205 | dplyr::mutate(x, id_temporary = as.character(1:n())),
206 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') )
207 | ),
208 | -id_temporary
209 | )
210 | }
211 | out
212 |
213 | }
214 |
215 |
216 | #' @export
217 | #' @rdname assign_cluster
218 | #' @method assign_cluster nmf_cluster
219 | assign_cluster.nmf_cluster <- function(x, ...){
220 |
221 | out <- unlist(apply(x[['W']], 1, which.max))
222 |
223 | n <- id_temporary <- NULL
224 | orig <- attributes(x)[['text_data_store']][['data']]
225 | lens <- length(orig[['text']]) + length(orig[['removed']])
226 |
227 | class(out) <- c("assign_cluster_nmf","assign_cluster", class(out))
228 |
229 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]]
230 | attributes(out)[["model"]] <- x
231 | attributes(out)[["algorithm"]] <- 'nmf'
232 | vect <- c(out)
233 | attributes(out)[["join"]] <- function(x) {
234 |
235 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x)))
236 |
237 | dplyr::select(
238 | dplyr::left_join(
239 | dplyr::mutate(x, id_temporary = as.character(1:n())),
240 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') )
241 | ),
242 | -id_temporary
243 | )
244 | }
245 | out
246 |
247 | }
248 |
249 |
250 | #' Prints an assign_cluster Object
251 | #'
252 | #' Prints an assign_cluster object
253 | #'
254 | #' @param x An assign_cluster object.
255 | #' @param \ldots ignored.
256 | #' @method print assign_cluster
257 | #' @export
258 | print.assign_cluster <- function(x, ...){
259 | print(stats::setNames(as.integer(x), names(x)))
260 | }
261 |
262 |
263 | #' Summary of an assign_cluster Object
264 | #'
265 | #' Summary of an assign_cluster object
266 | #'
267 | #' @param object An assign_cluster object.
268 | #' @param plot logical. If \code{TRUE} an accompanying bar plot is produced a
269 | #' well.
270 | #' @param print logical. If \code{TRUE} data.frame counts are printed.
271 | #' @param \ldots ignored.
272 | #' @method summary assign_cluster
273 | #' @export
274 | summary.assign_cluster <- function(object, plot = TRUE, print = TRUE, ...){
275 | count <- NULL
276 | out <- textshape::tidy_table(table(as.integer(object)), "cluster", "count")
277 | if (isTRUE(plot)) print(termco::plot_counts(as.integer(object), item.name = "Cluster"))
278 | if (isTRUE(print)) dplyr::arrange(as.data.frame(out), dplyr::desc(count))
279 | }
280 |
281 |
282 |
283 |
284 |
285 |
--------------------------------------------------------------------------------
/R/categorize.R:
--------------------------------------------------------------------------------
1 | #' Merge Clusters & Cluster Categories Back to Original Data
2 | #'
3 | #' Merge clusters, categories, and the original data back together.
4 | #'
5 | #' @param data A data set that was fit with a cluster model.
6 | #' @param assign.cluster An \code{\link[clustext]{assign_cluster}} object.
7 | #' @param cluster.key An \code{\link[clustext]{assign_cluster}} object.
8 | #' @return Returns a \code{\link[base]{data.frame}} key of clusters and categories.
9 | #' @export
10 | #' @seealso \code{\link[clustext]{write_cluster_text}},
11 | #' \code{\link[clustext]{read_cluster_text}}
12 | #' @examples
13 | #' library(dplyr)
14 | #'
15 | #' ## Assign Clusters
16 | #' ca <- presidential_debates_2012 %>%
17 | #' with(data_store(dialogue)) %>%
18 | #' hierarchical_cluster() %>%
19 | #' assign_cluster(k = 7)
20 | #'
21 | #' ## Write Cluster Text for Human Categorization
22 | #' write_cluster_text(ca)
23 | #' write_cluster_text(ca, n.sample=10)
24 | #' write_cluster_text(ca, lead=" -", n.sample=10)
25 | #'
26 | #' ## Read Human Coded Categories Back In
27 | #' categories_file <- system.file("additional/foo_turk.txt", package = "clustext")
28 | #' readLines(categories_file)
29 | #' (categories_key <- read_cluster_text(categories_file))
30 | #'
31 | #' ## Add Categories Back to Original Data Set
32 | #' categorize(
33 | #' data = presidential_debates_2012,
34 | #' assign.cluster = ca,
35 | #' cluster.key = categories_key
36 | #' )
37 | categorize <- function(data, assign.cluster, cluster.key) {
38 |
39 | stopifnot(methods::is(assign.cluster, 'assign_cluster'))
40 | stopifnot(methods::is(cluster.key, 'cluster_key'))
41 |
42 | data <- attributes(assign.cluster)[["join"]](data)
43 | dplyr::left_join(data, cluster.key, by = 'cluster')
44 | }
--------------------------------------------------------------------------------
/R/clustext-package.R:
--------------------------------------------------------------------------------
1 | #' Consistent Clustering for Text Data
2 | #'
3 | #' Optimized, consistent tools for clustering text data.
4 | #' @docType package
5 | #' @name clustext
6 | #' @aliases clustext package-clustext
7 | NULL
8 |
9 |
10 | #' 2012 U.S. Presidential Debates
11 | #'
12 | #' A dataset containing a cleaned version of all three presidential debates for
13 | #' the 2012 election.
14 | #'
15 | #' @details
16 | #' \itemize{
17 | #' \item person. The speaker
18 | #' \item tot. Turn of talk
19 | #' \item dialogue. The words spoken
20 | #' \item time. Variable indicating which of the three debates the dialogue is from
21 | #' }
22 | #'
23 | #' @docType data
24 | #' @keywords datasets
25 | #' @name presidential_debates_2012
26 | #' @usage data(presidential_debates_2012)
27 | #' @format A data frame with 2912 rows and 4 variables
28 | NULL
29 |
30 |
31 | #' Topic Assignments
32 | #'
33 | #' A dataset containing a list of topic assignments by various clustering
34 | #' algorithms. Assignments correspond to the rows (minus empty rows) of the
35 | #' \code{presidential_debates_2012} data set.
36 | #'
37 | #' @docType data
38 | #' @keywords datasets
39 | #' @name assignments
40 | #' @usage data(assignments)
41 | #' @format A list with 3 elements
42 | NULL
43 |
--------------------------------------------------------------------------------
/R/compare.R:
--------------------------------------------------------------------------------
1 | #' Adjusted Rand Index Comaprison Between Algorithms
2 | #'
3 | #' An Adjusted Rand Index comparison of the assignments between different
4 | #' clustering algorithms.
5 | #'
6 | #' @param \ldots A series of outputs from \code{assign_cluster} for various
7 | #' cluster algorithmns.
8 | #' @return Returns a pair-wise comparison matrix of Adjusted Rand Indices for
9 | #' algorithm. Higher Adjusted Rand Index scores indicate higher cluster
10 | #' assignment agreement.
11 | #' @references \url{http://faculty.washington.edu/kayee/pca/supp.pdf}
12 | #' @export
13 | #' @examples
14 | #' compare(
15 | #' assignments$hierarchical_assignment,
16 | #' assignments$kmeans_assignment,
17 | #' assignments$skmeans_assignment,
18 | #' assignments$nmf_assignment
19 | #' )
20 | #'
21 | #' ## Understanding the ARI
22 | #' set.seed(10)
23 | #' w <- sample(1:10, 40, TRUE)
24 | #' x <- 11-w
25 | #' set.seed(20)
26 | #' y <- sample(1:10, 40, TRUE)
27 | #' set.seed(50)
28 | #' z <- sample(1:10, 40, TRUE)
29 | #'
30 | #' data.frame(w, x, y, z)
31 | #'
32 | #' library(mclust)
33 | #'
34 | #' mclust::adjustedRandIndex(w, x)
35 | #' mclust::adjustedRandIndex(x, y)
36 | #' mclust::adjustedRandIndex(x, z)
37 | compare <- function(...) {
38 | nms <- unlist(lapply(list(...), function(x) attributes(x)[['algorithm']]))
39 | vouter(stats::setNames(as.data.frame(list(...)), nms), mclust::adjustedRandIndex)
40 |
41 | }
42 |
43 |
44 | #comare(assignments[[1]], assignments[[2]], assignments[[3]])
45 |
46 | vouter <- function(x, FUN, ...){
47 |
48 | nc <- ncol(x)
49 | mat <- matrix(rep(NA, nc^2), nc)
50 | for (i in 1:nc) {
51 | for (j in 1:nc) {
52 | mat[i, j] <- FUN(.subset2(x, i), .subset2(x, j))
53 | }
54 | }
55 | dimnames(mat) <- list(colnames(x), colnames(x))
56 | class(mat) <- c("compare", class(mat))
57 | mat
58 | }
59 |
60 |
61 | #' Prints a compare Object.
62 | #'
63 | #' Prints a compare object.
64 | #'
65 | #' @param x The compare object
66 | #' @param digits Number of decimal places to print.
67 | #' @param \ldots ignored
68 | #' @method print compare
69 | #' @export
70 | print.compare <- function(x, digits = 3, ...) {
71 | WD <- options()[["width"]]
72 | options(width=3000)
73 | y <- unclass(x)
74 | if (is.numeric(y) & !is.null(digits)) {
75 | y <- round(y, digits = digits)
76 | }
77 | print(y)
78 | options(width=WD)
79 | }
80 |
81 |
--------------------------------------------------------------------------------
/R/cosine_distance.R:
--------------------------------------------------------------------------------
1 | #' Optimized Computation of Cosine Distance
2 | #'
3 | #' Utilizes the \pkg{slam} package to efficiently calculate cosine distance
4 | #' on large sparse matrices.
5 | #'
6 | #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
7 | #' \code{\link[tm]{TermDocumentMatrix}}).
8 | #' @param \ldots ignored.
9 | #' @return Returns a cosine distance object of class \code{"dist"}.
10 | #' @references \url{http://stackoverflow.com/a/29755756/1000343}
11 | #' @keywords cosine dissimilarity
12 | #' @rdname cosine_distance
13 | #' @export
14 | #' @author Michael Andrec and Tyler Rinker .
15 | #' @examples
16 | #' library(gofastr)
17 | #' library(dplyr)
18 | #'
19 | #' out <- presidential_debates_2012 %>%
20 | #' with(q_dtm(dialogue)) %>%
21 | #' cosine_distance()
22 | cosine_distance <- function(x, ...){
23 | UseMethod("cosine_distance")
24 | }
25 |
26 |
27 | #' @export
28 | #' @rdname cosine_distance
29 | #' @method cosine_distance DocumentTermMatrix
30 | cosine_distance.DocumentTermMatrix <- function(x, ...){
31 | x <- t(slam::as.simple_triplet_matrix(x))
32 | stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2)))))
33 | }
34 |
35 |
36 | #' @export
37 | #' @rdname cosine_distance
38 | #' @method cosine_distance TermDocumentMatrix
39 | cosine_distance.TermDocumentMatrix <- function(x, ...){
40 | x <- slam::as.simple_triplet_matrix(x)
41 | stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2)))))
42 | }
43 |
44 |
45 |
--------------------------------------------------------------------------------
/R/data_store.R:
--------------------------------------------------------------------------------
1 | #' Data Structure for \pkg{hclusttext}
2 | #'
3 | #' A data structure which stores the text, DocumentTermMatrix, and information
4 | #' regarding removed text elements which can not be handled by the
5 | #' \code{hierarchical_cluster} function. This structure is required because it
6 | #' documents important meta information, including removed elements, required by
7 | #' other \pkg{clustext} functions. If the user wishes to combine documents
8 | #' (say by a common grouping variable) it is recomended this be handled by
9 | #' \code{\link[textshape]{combine}} prior to using \code{data_store}.
10 | #'
11 | #' @param text A character vector.
12 | #' @param doc.names An optional vector of document names corresponding to the
13 | #' length of \code{text}.
14 | #' @param min.term.freq The minimum times a term must appear to be included in
15 | #' the \code{\link[tm]{DocumentTermMatrix}}.
16 | #' @param min.doc.len The minimum words a document must contain to be included
17 | #' in the data structure (other wise it is stored as a \code{removed} element).
18 | #' @param stopwords A vector of stopwords to remove.
19 | #' @param min.char The minial length character for retained words.
20 | #' @param max.char The maximum length character for retained words.
21 | #' @param stem Logical. If \code{TRUE} the \code{stopwords} will be stemmed.
22 | #' @param denumber Logical. If \code{TRUE} numbers will be excluded.
23 | #' @return Returns a list containing:
24 | #' \describe{
25 | #' \item{dtm}{A tf-idf weighted \code{\link[tm]{DocumentTermMatrix}}}
26 | #' \item{text}{The text vector with unanalyzable elements removed}
27 | #' \item{removed}{The indices of the removed text elements, i.e., documents not meeting \code{min.doc.len}}
28 | #' \item{n.nonsparse}{The length of the non-zero elements}
29 | #' }
30 | #' @keywords data structure
31 | #' @export
32 | #' @examples
33 | #' data_store(presidential_debates_2012[["dialogue"]])
34 | #'
35 | #' ## Use `combine` to merge text prior to `data_stare`
36 | #' library(textshape)
37 | #' library(dplyr)
38 | #'
39 | #' dat <- presidential_debates_2012 %>%
40 | #' dplyr::select(person, time, dialogue) %>%
41 | #' textshape::combine()
42 | #'
43 | #' ## Elements in `ds` correspond to `dat` grouping vars
44 | #' (ds <- with(dat, data_store(dialogue)))
45 | #' dplyr::select(dat, -3)
46 | #'
47 | #' ## Add row names
48 | #' (ds2 <- with(dat, data_store(dialogue, paste(person, time, sep = "_"))))
49 | #' rownames(ds2[["dtm"]])
50 | #'
51 | #' ## Get a DocumentTermMatrix
52 | #' get_dtm(ds2)
53 | data_store <- function(text, doc.names, min.term.freq = 1, min.doc.len = 1,
54 | stopwords = tm::stopwords("english"), min.char = 3, max.char = NULL,
55 | stem = FALSE, denumber = TRUE){
56 |
57 | stopifnot(is.atomic(text))
58 | if (missing(doc.names)) doc.names <- seq_len(length(text))
59 | stopifnot(length(text) == length(doc.names))
60 |
61 | if (isTRUE(stem)){
62 | dtm <- gofastr::q_dtm_stem(text, docs = doc.names)
63 | } else {
64 | dtm <- gofastr::q_dtm(text, docs = doc.names)
65 | }
66 |
67 | dtm <- gofastr::remove_stopwords(dtm, stopwords = stopwords, min.char = min.char,
68 | max.char = max.char, stem = stem, denumber = denumber)
69 |
70 | if (nrow(dtm) != length(text)){
71 | text <- dplyr::group_by_(dplyr::data_frame(text=text, doc.names=doc.names), "doc.names")
72 | text <- dplyr::summarise(text, text = paste(text, collapse = " "))
73 | text <- text[match(text[["doc.names"]], rownames(dtm)),][["text"]]
74 | }
75 |
76 | names(text) <- text_seq <- seq_len(length(text))
77 |
78 | # remove terms
79 | dtm <- dtm[, slam::col_sums(dtm) >= min.term.freq]
80 |
81 | # remove short docs
82 | long_docs <- slam::row_sums(dtm) >= min.doc.len
83 | text <- text[long_docs]
84 | dtm <- dtm[long_docs,]
85 |
86 | # remove terms/docs again (ensure no zero lengths)
87 | # Eventually determine which elements were kept removed
88 | dtm <- dtm[, slam::col_sums(dtm) > 0]
89 |
90 | long_docs <- slam::row_sums(dtm) > 0
91 | text <- text[long_docs]
92 | dtm <- dtm[long_docs,]
93 |
94 | ## Add tf-idf
95 | dtm <- tm::weightTfIdf(dtm)
96 |
97 | out <- list(dtm = dtm, text = unname(text),
98 | removed = setdiff(text_seq, names(text)), n.nonsparse = length(dtm[["v"]]))
99 |
100 | class(out) <- "data_store"
101 | attributes(out)[["guid"]] <-rguid()
102 | out
103 | }
104 |
105 |
106 | #' Prints a data_store Object
107 | #'
108 | #' Prints a data_store object
109 | #'
110 | #' @param x A data_store object.
111 | #' @param \ldots ignored.
112 | #' @method print data_store
113 | #' @export
114 | print.data_store <- function(x, ...){
115 |
116 | cat(sprintf("<>\n", pn2(nrow(x[["dtm"]])), pn2(ncol(x[["dtm"]])) ))
117 | cat(sprintf("Text Elements : %s\n", pn2(length(x[["text"]])) ))
118 | cat(sprintf("Elements Removed : %s\n", pn2(length(x[["removed"]])) ))
119 | #cat(sprintf("Documents : %s\n", pn2(nrow(x[["dtm"]])) ))
120 | #cat(sprintf("Terms : %s\n", pn2(ncol(x[["dtm"]])) ))
121 | cat(sprintf("Non-/sparse entries: %d/%.0f\n", x[["n.nonsparse"]],
122 | prod(dim(x[["dtm"]])) - x[["n.nonsparse"]]))
123 | if (!prod(dim(x))) {
124 | sparsity <- 100
125 | } else {
126 | sparsity <- round((1 - x[["n.nonsparse"]]/prod(dim(x[["dtm"]]))) * 100)
127 | }
128 | cat(sprintf("Sparsity : %s%%\n", sparsity))
129 | cat(sprintf("Maximal term length: %s\n", max(nchar(colnames(x[["dtm"]])))))
130 | cat(sprintf("Minimum term length: %s\n", min(nchar(colnames(x[["dtm"]])))))
131 | }
132 |
133 |
134 |
135 | #
136 | # data_store_map <- function(stopwords = tm::stopwords("english"), min.char = 3,
137 | # max.char = NULL, stem = FALSE, denumber = TRUE) {
138 | #
139 | # }
140 |
--------------------------------------------------------------------------------
/R/get_documents.R:
--------------------------------------------------------------------------------
1 | #' Get Documents Based on Cluster Assignment in \code{assign_cluster}
2 | #'
3 | #' Get the documents associated with each of the k clusters .
4 | #'
5 | #' @param x A \code{\link[clustext]{assign_cluster}} object.
6 | #' @param \ldots ignored.
7 | #' @return Returns a list of \code{\link[base]{vector}}s of document names.
8 | #' @export
9 | #' @rdname get_documents
10 | #' @examples
11 | #' library(dplyr)
12 | #'
13 | #' mydocuments1 <- presidential_debates_2012 %>%
14 | #' with(data_store(dialogue, paste(person, time, sep="-"))) %>%
15 | #' hierarchical_cluster() %>%
16 | #' assign_cluster(k = 6) %>%
17 | #' get_documents()
18 | #'
19 | #' mydocuments1
20 | #'
21 | #' mydocuments2 <- presidential_debates_2012 %>%
22 | #' with(data_store(dialogue)) %>%
23 | #' hierarchical_cluster() %>%
24 | #' assign_cluster(k = 55) %>%
25 | #' get_documents()
26 | #'
27 | #' mydocuments2
28 | get_documents <- function(x, ...){
29 | UseMethod("get_documents")
30 | }
31 |
32 | #' @export
33 | #' @rdname get_documents
34 | #' @method get_documents assign_cluster
35 | get_documents.assign_cluster <- function(x, ...){
36 |
37 | desc <- topic <- n <- NULL
38 | out <- split(attributes(x)[["names"]], x)
39 |
40 | class(out) <- c("get_documents", class(out))
41 | out
42 | }
43 |
44 | #' Prints a get_documents Object
45 | #'
46 | #' Prints a get_documents object
47 | #'
48 | #' @param x A get_documents object.
49 | #' @param \ldots ignored.
50 | #' @method print get_documents
51 | #' @export
52 | print.get_documents <- function(x, ...){
53 | class(x) <- "list"
54 | print(x)
55 | }
56 |
--------------------------------------------------------------------------------
/R/get_dtm.R:
--------------------------------------------------------------------------------
1 | #' Get a \code{\link[tm]{DocumentTermMatrix}} Stored in a \code{hierarchical_cluster} Object
2 | #'
3 | #' Extract the \code{\link[tm]{DocumentTermMatrix}} supplied to/produced by a
4 | #' \code{\link[clustext]{hierarchical_cluster}} object.
5 | #'
6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object.
7 | #' @param \ldots ignored.
8 | #' @return Returns a \code{\link[tm]{DocumentTermMatrix}}.
9 | #' @export
10 | #' @rdname get_dtm
11 | #' @examples
12 | #' library(dplyr)
13 | #'
14 | #' presidential_debates_2012 %>%
15 | #' with(data_store(dialogue)) %>%
16 | #' hierarchical_cluster() %>%
17 | #' get_dtm()
18 | get_dtm <- function(x, ...){
19 | UseMethod("get_dtm")
20 | }
21 |
22 | #' @export
23 | #' @rdname get_dtm
24 | #' @method get_dtm data_store
25 | get_dtm.data_store <- function(x, ...){
26 | x[["dtm"]]
27 | }
28 |
29 |
30 | #' @export
31 | #' @rdname get_dtm
32 | #' @method get_dtm hierarchical_cluster
33 | get_dtm.hierarchical_cluster <- function(x, ...){
34 | get_dtm(attributes(x)[["text_data_store"]][["data"]])
35 | }
36 |
37 |
38 | #' @export
39 | #' @rdname get_dtm
40 | #' @method get_dtm kmeans_cluster
41 | get_dtm.kmeans_cluster <- function(x, ...){
42 | get_dtm(attributes(x)[["text_data_store"]][["data"]])
43 | }
44 |
45 | #' @export
46 | #' @rdname get_dtm
47 | #' @method get_dtm skmeans_cluster
48 | get_dtm.skmeans_cluster <- function(x, ...){
49 | get_dtm(attributes(x)[["text_data_store"]][["data"]])
50 | }
51 |
52 |
53 |
54 | #' @export
55 | #' @rdname get_dtm
56 | #' @method get_dtm nmf_cluster
57 | get_dtm.nmf_cluster <- function(x, ...){
58 | get_dtm(attributes(x)[["text_data_store"]][["data"]])
59 | }
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/R/get_removed.R:
--------------------------------------------------------------------------------
1 | #' Get a Text Stored in a \code{hierarchical_cluster} Object
2 | #'
3 | #' Extract the text supplied to the
4 | #' \code{\link[clustext]{hierarchical_cluster}} object.
5 | #'
6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object.
7 | #' @param \ldots ignored.
8 | #' @return Returns a vector of text strings.
9 | #' @export
10 | #' @rdname get_removed
11 | #' @examples
12 | #' library(dplyr)
13 | #'
14 | #' presidential_debates_2012 %>%
15 | #' with(data_store(dialogue)) %>%
16 | #' hierarchical_cluster() %>%
17 | #' get_removed()
18 | get_removed <- function(x, ...){
19 | UseMethod("get_removed")
20 | }
21 |
22 | #' @export
23 | #' @rdname get_removed
24 | #' @method get_removed hierarchical_cluster
25 | get_removed.hierarchical_cluster <- function(x, ...){
26 | get_removed(attributes(x)[["text_data_store"]][["data"]])
27 | }
28 |
29 | #' @export
30 | #' @rdname get_removed
31 | #' @method get_removed kmeans_cluster
32 | get_removed.kmeans_cluster <- function(x, ...){
33 | get_removed(attributes(x)[["text_data_store"]][["data"]])
34 | }
35 |
36 | #' @export
37 | #' @rdname get_removed
38 | #' @method get_removed skmeans_cluster
39 | get_removed.skmeans_cluster <- function(x, ...){
40 | get_removed(attributes(x)[["text_data_store"]][["data"]])
41 | }
42 |
43 |
44 | #' @export
45 | #' @rdname get_removed
46 | #' @method get_removed nmf_cluster
47 | get_removed.nmf_cluster <- function(x, ...){
48 | get_removed(attributes(x)[["text_data_store"]][["data"]])
49 | }
50 |
51 |
52 | #' @export
53 | #' @rdname get_removed
54 | #' @method get_removed data_store
55 | get_removed.data_store <- function(x, ...){
56 | x[["removed"]]
57 | }
58 |
59 |
60 |
--------------------------------------------------------------------------------
/R/get_terms.R:
--------------------------------------------------------------------------------
1 | #' Get Terms Based on Cluster Assignment in \code{assign_cluster}
2 | #'
3 | #' Get the terms weighted (either by tf-idf or returned from the model) and
4 | #' min/max scaling associated with each of the k clusters .
5 | #'
6 | #' @param x A \code{\link[clustext]{assign_cluster}} object.
7 | #' @param min.weight The lowest min/max scaled tf-idf weighting to consider
8 | #' as a document's salient term.
9 | #' @param nrow The max number of rows to display in the returned
10 | #' \code{\link[base]{data.frame}}s.
11 | #' @param \ldots ignored.
12 | #' @return Returns a list of \code{\link[base]{data.frame}}s of top weighted terms.
13 | #' @export
14 | #' @rdname get_terms
15 | #' @examples
16 | #' library(dplyr)
17 | #' library(textshape)
18 | #'
19 | #' myterms <- presidential_debates_2012 %>%
20 | #' with(data_store(dialogue)) %>%
21 | #' hierarchical_cluster() %>%
22 | #' assign_cluster(k = 55) %>%
23 | #' get_terms()
24 | #'
25 | #' myterms
26 | #' textshape::tidy_list(myterms[!sapply(myterms, is.null)], "Topic")
27 | #' \dontrun{
28 | #' library(ggplot2)
29 | #' library(gridExtra)
30 | #' library(dplyr)
31 | #' library(textshape)
32 | #' library(wordcloud)
33 | #'
34 | #' max.n <- max(textshape::tidy_list(myterms)[["n"]])
35 | #'
36 | #' myplots <- Map(function(x, y){
37 | #' x %>%
38 | #' mutate(term = factor(term, levels = rev(term))) %>%
39 | #' ggplot(aes(term, weight=n)) +
40 | #' geom_bar() +
41 | #' scale_y_continuous(expand = c(0, 0),limits=c(0, max.n)) +
42 | #' ggtitle(sprintf("Topic: %s", y)) +
43 | #' coord_flip()
44 | #' }, myterms, names(myterms))
45 | #'
46 | #' myplots[["ncol"]] <- 10
47 | #'
48 | #' do.call(gridExtra::grid.arrange, myplots[!sapply(myplots, is.null)])
49 | #'
50 | #' ##wordclouds
51 | #' par(mfrow=c(5, 11), mar=c(0, 4, 0, 0))
52 | #' Map(function(x, y){
53 | #' wordcloud::wordcloud(x[[1]], x[[2]], scale=c(1,.25),min.freq=1)
54 | #' mtext(sprintf("Topic: %s", y), col = "blue", cex=.55, padj = 1.5)
55 | #' }, myterms, names(myterms))
56 | #' }
57 | get_terms <- function(x, min.weight = .6, nrow = NULL, ...){
58 | UseMethod("get_terms")
59 | }
60 |
61 |
62 |
63 | #' @export
64 | #' @rdname get_terms
65 | #' @method get_terms assign_cluster_hierarchical
66 | get_terms.assign_cluster_hierarchical <- function(x, min.weight = .6, nrow = NULL, ...){
67 |
68 | assignment <- x
69 | desc <- topic <- n <- term <- weight <- NULL
70 | dat <- attributes(x)[["data_store"]][["data"]][['dtm']]
71 | clusters <-split(names(x), x)
72 |
73 | out <- stats::setNames(lapply(clusters, function(y){
74 | vals <- min_max(sort(slam::col_sums(dat[y,]), decreasing=TRUE))
75 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE)
76 | }), names(clusters))
77 |
78 |
79 | out2 <- lapply(out, function(x) {
80 | rownames(x) <- NULL
81 | x <- dplyr::filter(x, weight >= min.weight)
82 | x <- dplyr::filter(x, !is.na(term))
83 |
84 | if (nrow(x) == 0) return(NULL)
85 | x
86 | })
87 |
88 | if (!is.null(nrow)) {
89 | out2 <- lapply(out2, function(x){
90 | if (is.null(x) || nrow(x) <= nrow) return(x)
91 | x[1:nrow]
92 | })
93 | }
94 | class(out2) <- c("get_terms", class(out2))
95 | attributes(out2)[['assignment']] <- assignment
96 | out2
97 |
98 | }
99 |
100 |
101 |
102 |
103 |
104 |
105 | #' @export
106 | #' @rdname get_terms
107 | #' @method get_terms assign_cluster_kmeans
108 | get_terms.assign_cluster_kmeans <- function(x, min.weight = .6, nrow = NULL, ...){
109 |
110 |
111 | weight <- term <- NULL
112 |
113 | assignment <- x
114 | x <- attributes(x)[['model']]
115 | nms <- seq_along(x[['size']])
116 | x <- x[['centers']]
117 |
118 | out <- stats::setNames(lapply(1:nrow(x), function(i){
119 | vals <- min_max(sort(x[i, ], decreasing=TRUE))
120 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE)
121 | }), nms)
122 |
123 | out2 <- lapply(out, function(x) {
124 | rownames(x) <- NULL
125 | x <- dplyr::filter(x, weight >= min.weight)
126 |
127 | x <- dplyr::filter(x, !is.na(term))
128 |
129 | if (nrow(x) == 0) return(NULL)
130 | x
131 | })
132 |
133 | if (!is.null(nrow)) {
134 | out2 <- lapply(out2, function(x){
135 | if (is.null(x) || nrow(x) <= nrow) return(x)
136 | x[1:nrow]
137 | })
138 | }
139 | class(out2) <- c("get_terms", class(out2))
140 | attributes(out2)[['assignment']] <- assignment
141 | out2
142 |
143 | }
144 |
145 |
146 | #' @export
147 | #' @rdname get_terms
148 | #' @method get_terms assign_cluster_skmeans
149 | get_terms.assign_cluster_skmeans <- function(x, min.weight = .6, nrow = NULL, ...){
150 |
151 | assignment <- x
152 | desc <- topic <- n <- term <- weight <- NULL
153 | dat <- attributes(x)[["data_store"]][["data"]][['dtm']]
154 | clusters <-split(names(x), x)
155 |
156 | out <- stats::setNames(lapply(clusters, function(y){
157 | vals <- min_max(sort(slam::col_sums(dat[y,]), decreasing=TRUE))
158 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE)
159 | }), names(clusters))
160 |
161 |
162 | out2 <- lapply(out, function(x) {
163 | rownames(x) <- NULL
164 | x <- dplyr::filter(x, weight >= min.weight)
165 | x <- dplyr::filter(x, !is.na(term))
166 |
167 | if (nrow(x) == 0) return(NULL)
168 | x
169 | })
170 |
171 | if (!is.null(nrow)) {
172 | out2 <- lapply(out2, function(x){
173 | if (is.null(x) || nrow(x) <= nrow) return(x)
174 | x[1:nrow]
175 | })
176 | }
177 | class(out2) <- c("get_terms", class(out2))
178 | attributes(out2)[['assignment']] <- assignment
179 | out2
180 | }
181 |
182 |
183 |
184 | #' @export
185 | #' @rdname get_terms
186 | #' @method get_terms assign_cluster_nmf
187 | get_terms.assign_cluster_nmf <- function(x, min.weight = .6, nrow = NULL, ...){
188 |
189 | weight <- term <- NULL
190 |
191 | assignment <- x
192 | x <- attributes(x)[['model']]
193 | nms <- seq_len(ncol(x[["W"]]))
194 | x <- x[['H']]
195 |
196 | out <- stats::setNames(lapply(1:nrow(x), function(i){
197 | vals <- min_max(sort(x[i, ], decreasing=TRUE))
198 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE)
199 | }), nms)
200 |
201 | out2 <- lapply(out, function(x) {
202 | rownames(x) <- NULL
203 | x <- dplyr::filter(x, weight >= min.weight)
204 | x <- dplyr::filter(x, !is.na(term))
205 |
206 | if (nrow(x) == 0) return(NULL)
207 | x
208 | })
209 |
210 | if (!is.null(nrow)) {
211 | out2 <- lapply(out2, function(x){
212 | if (is.null(x) || nrow(x) <= nrow) return(x)
213 | x[1:nrow]
214 | })
215 | }
216 | class(out2) <- c("get_terms", class(out2))
217 | attributes(out2)[['assignment']] <- assignment
218 | out2
219 |
220 | }
221 |
222 |
223 | #' Prints a get_terms Object
224 | #'
225 | #' Prints a get_terms object
226 | #'
227 | #' @param x A get_terms object.
228 | #' @param \ldots ignored.
229 | #' @method print get_terms
230 | #' @export
231 | print.get_terms <- function(x, ...){
232 |
233 | lens <- unlist(lapply(split(names(attributes(x)[['assignment']]), attributes(x)[['assignment']]), length))
234 | attributes(x)[['assignment']] <- NULL
235 |
236 | class(x) <- "list"
237 | names(x) <- sprintf("%s (n=%s)", names(x), lens)
238 | print(x)
239 | }
240 |
241 |
242 |
243 |
244 |
245 |
--------------------------------------------------------------------------------
/R/get_text.R:
--------------------------------------------------------------------------------
1 | #' Get a Text Stored in Various Objects
2 | #'
3 | #' Extract the text supplied to the
4 | #' \code{\link[clustext]{hierarchical_cluster}} object.
5 | #'
6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object.
7 | #' @param \ldots ignored.
8 | #' @return Returns a vector or list of text strings.
9 | #' @export
10 | #' @rdname get_text
11 | #' @examples
12 | #' library(dplyr)
13 | #'
14 | #' presidential_debates_2012 %>%
15 | #' with(data_store(dialogue)) %>%
16 | #' hierarchical_cluster() %>%
17 | #' get_text() %>%
18 | #' head()
19 | get_text <- function(x, ...){
20 | UseMethod("get_text")
21 | }
22 |
23 |
24 | #' @export
25 | #' @rdname get_text
26 | #' @method get_text default
27 | get_text.default <- function(x, ...){
28 | termco::get_text(x, ...)
29 | }
30 |
31 |
32 | #' @export
33 | #' @rdname get_text
34 | #' @method get_text hierarchical_cluster
35 | get_text.hierarchical_cluster <- function(x, ...){
36 | get_text(attributes(x)[["text_data_store"]][["data"]])
37 | }
38 |
39 |
40 | #' @export
41 | #' @rdname get_text
42 | #' @method get_text kmeans_cluster
43 | get_text.kmeans_cluster <- function(x, ...){
44 | get_text(attributes(x)[["text_data_store"]][["data"]])
45 | }
46 |
47 |
48 | #' @export
49 | #' @rdname get_text
50 | #' @method get_text nmf_cluster
51 | get_text.nmf_cluster <- function(x, ...){
52 | get_text(attributes(x)[["text_data_store"]][["data"]])
53 | }
54 |
55 |
56 | #' @export
57 | #' @rdname get_text
58 | #' @method get_text skmeans_cluster
59 | get_text.skmeans_cluster <- function(x, ...){
60 | get_text(attributes(x)[["text_data_store"]][["data"]])
61 | }
62 |
63 |
64 |
65 |
66 | #' @export
67 | #' @rdname get_text
68 | #' @method get_text data_store
69 | get_text.data_store <- function(x, ...){
70 | x[["text"]]
71 | }
72 |
73 |
74 | #' @export
75 | #' @rdname get_text
76 | #' @method get_text assign_cluster
77 | get_text.assign_cluster <- function(x, ...){
78 | split(get_text(attributes(x)[["data_store"]][["data"]]), as.integer(x))
79 | }
80 |
81 |
--------------------------------------------------------------------------------
/R/hierarchical_cluster.R:
--------------------------------------------------------------------------------
1 | #' Fit a Hierarchical Cluster
2 | #'
3 | #' Fit a hierarchical cluster to text data. Prior to distance measures being
4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
5 | #' \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate
6 | #' the distance matrix supplied to \code{\link[fastcluster]{hclust}}. \code{method}
7 | #' defaults to \code{"ward.D2"}. A faster cosine dissimilarity calculation is used
8 | #' under the hood (see \code{\link[clustext]{cosine_distance}}). Additionally,
9 | #' \code{\link[fastcluster]{hclust}} is used to quickly calculate the fit.
10 | #' Essentially, this is a wrapper function optimized for clustering text data.
11 | #'
12 | #' @param x A data store object (see \code{\link[clustext]{data_store}}).
13 | #' @param distance A distance measure ("cosine" or "jaccard").
14 | #' @param method The agglomeration method to be used. This must be (an
15 | #' unambiguous abbreviation of) one of \code{"single"}, \code{"complete"},
16 | #' \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"},
17 | #' \code{"centroid"}, or \code{"median"}.
18 | #' @param \ldots ignored.
19 | #' @return Returns an object of class \code{"hclust"}.
20 | #' @export
21 | #' @rdname hierarchical_cluster
22 | #' @examples
23 | #' library(dplyr)
24 | #'
25 | #' x <- with(
26 | #' presidential_debates_2012,
27 | #' data_store(dialogue, paste(person, time, sep = "_"))
28 | #' )
29 | #'
30 | #' hierarchical_cluster(x) %>%
31 | #' plot(k=4)
32 | #'
33 | #' hierarchical_cluster(x) %>%
34 | #' plot(h=.7, lwd=2)
35 | #'
36 | #' hierarchical_cluster(x) %>%
37 | #' assign_cluster(h=.7)
38 | #'
39 | #' \dontrun{
40 | #' ## interactive cutting
41 | #' hierarchical_cluster(x) %>%
42 | #' plot(h=TRUE)
43 | #' }
44 | #'
45 | #' hierarchical_cluster(x, method="complete") %>%
46 | #' plot(k=6)
47 | #'
48 | #' hierarchical_cluster(x) %>%
49 | #' assign_cluster(k=6)
50 | #'
51 | #' x2 <- presidential_debates_2012 %>%
52 | #' with(data_store(dialogue))
53 | #'
54 | #' myfit2 <- hierarchical_cluster(x2)
55 | #'
56 | #' plot(myfit2)
57 | #' plot(myfit2, 55)
58 | #'
59 | #' assign_cluster(myfit2, k = 55)
60 | #'
61 | #' ## Example from StackOverflow Question Response
62 | #' ## Asking fo grouping similar texts together
63 | #' ## http://stackoverflow.com/q/22936951/1000343
64 | #' dat <- data.frame(
65 | #' person = LETTERS[1:3],
66 | #' text = c("Best way to waste money",
67 | #' "Amazing stuff. lets you stay connected all the time",
68 | #' "Instrument to waste money and time"),
69 | #' stringsAsFactors = FALSE
70 | #' )
71 | #'
72 | #'
73 | #' x <- with(
74 | #' dat,
75 | #' data_store(text, person)
76 | #' )
77 | #'
78 | #'
79 | #' hierarchical_cluster(x) %>%
80 | #' plot(h=.9, lwd=2)
81 | #'
82 | #' hierarchical_cluster(x) %>%
83 | #' assign_cluster(h=.9)
84 | #'
85 | #'
86 | #' hierarchical_cluster(x) %>%
87 | #' assign_cluster(h=.9) %>%
88 | #' get_terms()
89 | #'
90 | #' hierarchical_cluster(x) %>%
91 | #' assign_cluster(h=.9) %>%
92 | #' get_terms() %>%
93 | #' as_topic()
94 | #'
95 | #' hierarchical_cluster(x) %>%
96 | #' assign_cluster(h=.9) %>%
97 | #' get_documents()
98 | hierarchical_cluster <- function(x, distance = 'cosine', method = "ward.D2", ...){
99 |
100 | UseMethod("hierarchical_cluster")
101 |
102 | }
103 |
104 |
105 | #' @export
106 | #' @rdname hierarchical_cluster
107 | #' @method hierarchical_cluster data_store
108 | hierarchical_cluster.data_store <- function(x, distance = 'cosine', method = "ward.D", ...){
109 |
110 | distmes <- switch(distance,
111 | 'cosine' = cosine_distance,
112 | 'jaccard' = jaccard_distance,
113 | stop('provide a valid `distance` type')
114 | )
115 | fit <- fastcluster::hclust(distmes(x[["dtm"]]), method = method)
116 |
117 | text_data_store <- new.env(FALSE)
118 | text_data_store[["data"]] <- x
119 |
120 | class(fit) <- c("hierarchical_cluster", class(fit))
121 | attributes(fit)[["text_data_store"]] <- text_data_store
122 | fit
123 | }
124 |
125 |
126 |
127 | #' Plots a hierarchical_cluster Object
128 | #'
129 | #' Plots a hierarchical_cluster object
130 | #'
131 | #' @param x A hierarchical_cluster object.
132 | #' @param k The number of clusters (can supply \code{h} instead). Defaults to
133 | #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced
134 | #' by \code{data_storage}. Boxes are drawn around the clusters.
135 | #' @param h The height at which to cut the dendrograms (determines number of
136 | #' clusters). If this argument is supplied \code{k} is ignored. A line is drawn
137 | #' showing the cut point on the dendrogram. If \code{h} is set to \code{TRUE}
138 | #' or \code{"locator"} then the cutting becomes interactive and the height is
139 | #' returned invisibly.
140 | #' @param color The color to make the cluster boxes (\code{k}) or line (\code{h}).
141 | #' @param digits The number o digits to display if h\code{h} is set to
142 | #' interactive.
143 | #' @param \ldots Other arguments passed to \code{\link[stats]{rect.hclust}} or
144 | #' \code{\link[graphics]{abline}}.
145 | #' @method plot hierarchical_cluster
146 | #' @export
147 | plot.hierarchical_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL,
148 | color = "red", digits = 3, ...){
149 |
150 | if (is.null(h)) y <- k
151 | class(x) <- "hclust"
152 | graphics::plot(x)
153 | if (is.null(h) && !is.null(k)) stats::rect.hclust(x, k = y, border = color, ...)
154 | if (!is.null(h)) {
155 | if (isTRUE(h) | h == 'locator') {
156 | cat("Click a location in the plot...\n")
157 | h <- graphics::locator(1)
158 | cat(paste("You cut at h =", round(h[['y']], digits), "\n"))
159 | graphics::abline(h = h[['y']], col = color, ...)
160 | return(invisible(h[['y']]))
161 | } else {
162 | graphics::abline(h = h, col = color, ...)
163 | }
164 | }
165 |
166 | }
167 |
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/R/jaccard_distance.R:
--------------------------------------------------------------------------------
1 | #' Optimized Computation of Jaccard Distance
2 | #'
3 | #' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance
4 | #' on large sparse matrices.
5 | #'
6 | #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
7 | #' \code{\link[tm]{TermDocumentMatrix}}).
8 | #' @param \ldots ignored.
9 | #' @return Returns a jaccard distance object of class \code{"dist"}.
10 | #' @references \url{http://stackoverflow.com/a/36373333/1000343}
11 | #' \url{http://stats.stackexchange.com/a/89947/7482}
12 | #' @keywords jaccard dissimilarity
13 | #' @rdname jaccard_distance
14 | #' @export
15 | #' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker .
16 | #' @examples
17 | #' library(gofastr)
18 | #' library(dplyr)
19 | #'
20 | #' out <- presidential_debates_2012 %>%
21 | #' with(q_dtm(dialogue)) %>%
22 | #' jaccard_distance()
23 | jaccard_distance <- function(x, ...){
24 | UseMethod("jaccard_distance")
25 | }
26 |
27 |
28 | #' @export
29 | #' @rdname jaccard_distance
30 | #' @method jaccard_distance DocumentTermMatrix
31 | jaccard_distance.DocumentTermMatrix <- function(x, ...){
32 | mat <-sign(x)
33 | A <- slam::tcrossprod_simple_triplet_matrix(mat)
34 | im <- which(A > 0, arr.ind=TRUE)
35 | b <- slam::row_sums(mat)
36 | Aim <- A[im]
37 |
38 | stats::as.dist(1 - Matrix::sparseMatrix(
39 | i = im[,1],
40 | j = im[,2],
41 | x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
42 | dims = dim(A)
43 | ))
44 | }
45 |
46 |
47 | #' @export
48 | #' @rdname jaccard_distance
49 | #' @method jaccard_distance TermDocumentMatrix
50 | jaccard_distance.TermDocumentMatrix <- function(x, ...){
51 | mat <-sign(x)
52 | A <- slam::crossprod_simple_triplet_matrix(mat)
53 | im <- which(A > 0, arr.ind=TRUE)
54 | b <- slam::col_sums(mat)
55 | Aim <- A[im]
56 |
57 | stats::as.dist(1 - Matrix::sparseMatrix(
58 | i = im[,1],
59 | j = im[,2],
60 | x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
61 | dims = dim(A)
62 | ))
63 | }
64 |
65 |
--------------------------------------------------------------------------------
/R/kmeans_cluster.R:
--------------------------------------------------------------------------------
1 | #' Fit a Kmeans Cluster
2 | #'
3 | #' Fit a kmeans cluster to text data. Prior to distance measures being
4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
5 | #' \code{\link[tm]{DocumentTermMatrix}}.
6 | #'
7 | #' @param x A data store object (see \code{\link[clustext]{data_store}}).
8 | #' @param k The number of clusters.
9 | #' @param \ldots Other arguments passed to \code{\link[stats]{kmeans}}.
10 | #' @return Returns an object of class \code{"kmeans"}.
11 | #' @export
12 | #' @rdname kmeans_cluster
13 | #' @examples
14 | #' \dontrun{
15 | #' library(dplyr)
16 | #'
17 | #' x <- with(
18 | #' presidential_debates_2012,
19 | #' data_store(dialogue, paste(person, time, sep = "_"))
20 | #' )
21 | #'
22 | #'
23 | #' ## 6 topic model
24 | #' kmeans_cluster(x, k=6)
25 | #'
26 | #' kmeans_cluster(x, k=6) %>%
27 | #' assign_cluster()
28 | #'
29 | #' kmeans_cluster(x, k=6) %>%
30 | #' assign_cluster() %>%
31 | #' summary()
32 | #'
33 | #' x2 <- presidential_debates_2012 %>%
34 | #' with(data_store(dialogue))
35 | #'
36 | #' myfit2 <- kmeans_cluster(x2, 55)
37 | #'
38 | #' assign_cluster(myfit2)
39 | #'
40 | #' assign_cluster(myfit2) %>%
41 | #' summary()
42 | #' }
43 | kmeans_cluster <- function(x, k, ...){
44 |
45 | UseMethod("kmeans_cluster")
46 |
47 | }
48 |
49 |
50 | #' @export
51 | #' @rdname kmeans_cluster
52 | #' @method kmeans_cluster data_store
53 | kmeans_cluster.data_store <- function(x, k, ...){
54 |
55 |
56 | fit <- stats::kmeans(x[["dtm"]], centers=k, ...)
57 |
58 | text_data_store <- new.env(FALSE)
59 | text_data_store[["data"]] <- x
60 |
61 | class(fit) <- c("kmeans_cluster", class(fit))
62 | attributes(fit)[["text_data_store"]] <- text_data_store
63 | fit
64 | }
65 |
66 |
67 |
68 | # #' Plots a kmeans_cluster Object
69 | # #'
70 | # #' Plots a kmeans_cluster object
71 | # #'
72 | # #' @param x A kmeans_cluster object.
73 | # #' @param k The number of clusters (can supply \code{h} instead). Defaults to
74 | # #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced
75 | # #' by \code{data_storage}. Boxes are drawn around the clusters.
76 | # #' @param h The height at which to cut the dendrograms (determines number of
77 | # #' clusters). If this argument is supplied \code{k} is ignored. A line is drawn
78 | # #' showing the cut point on the dendrogram.
79 | # #' @param color The color to make the cluster boxes (\code{k}) or line (\code{h}).
80 | # #' @param \ldots Other arguments passed to \code{\link[stats]{rect.hclust}} or
81 | # #' \code{\link[graphics]{abline}}.
82 | # #' @method plot kmeans_cluster
83 | # #' @export
84 | # plot.kmeans_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL,
85 | # color = "red", ...){
86 | #
87 | # if (is.null(h)) y <- k
88 | # class(x) <- "hclust"
89 | # graphics::plot(x)
90 | # if (is.null(h) && !is.null(k)) stats::rect.hclust(x, k = y, border = color, ...)
91 | # if (!is.null(h)) graphics::abline(h = h, col = color, ...)
92 | # }
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/R/nmf_cluster.R:
--------------------------------------------------------------------------------
1 | #' Fit a Non-Negative Matrix Factorization Cluster
2 | #'
3 | #' Fit a robust non-negative matrix factorization cluster to text data via
4 | #' \code{\link[rNMF]{rnmf}}. Prior to distance measures being
5 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
6 | #' \code{\link[tm]{DocumentTermMatrix}}.
7 | #'
8 | #' @param x A data store object (see \code{\link[clustext]{data_store}}).
9 | #' @param k The number of clusters.
10 | #' @param \ldots Other arguments passed to \code{\link[rNMF]{rnmf}}.
11 | #' @return Returns an object of class \code{"hclust"}.
12 | #' @export
13 | #' @rdname nmf_cluster
14 | #' @examples
15 | #' library(dplyr)
16 | #'
17 | #' x <- with(
18 | #' presidential_debates_2012,
19 | #' data_store(dialogue, paste(person, time, sep = "_"))
20 | #' )
21 | #'
22 | #'
23 | #' ## 6 topic model
24 | #' model6 <- nmf_cluster(x, k=6)
25 | #'
26 | #' model6 %>%
27 | #' assign_cluster()
28 | #'
29 | #' model6 %>%
30 | #' assign_cluster() %>%
31 | #' summary()
32 | #' \dontrun{
33 | #' x2 <- presidential_debates_2012 %>%
34 | #' with(data_store(dialogue))
35 | #'
36 | #' myfit2 <- nmf_cluster(x2, 55)
37 | #'
38 | #' assign_cluster(myfit2)
39 | #'
40 | #' assign_cluster(myfit2) %>%
41 | #' summary()
42 | #' }
43 | nmf_cluster <- function(x, k = k, ...){
44 |
45 | UseMethod("nmf_cluster")
46 |
47 | }
48 |
49 |
50 | #' @export
51 | #' @rdname nmf_cluster
52 | #' @method nmf_cluster data_store
53 | nmf_cluster.data_store <- function(x, k, ...){
54 |
55 | fit <- rNMF::rnmf(as.matrix(x[["dtm"]]), k = k, ...)
56 |
57 |
58 | text_data_store <- new.env(FALSE)
59 | text_data_store[["data"]] <- x
60 |
61 | class(fit) <- c("nmf_cluster", class(fit))
62 | attributes(fit)[["text_data_store"]] <- text_data_store
63 | fit
64 | }
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/R/skmeans_cluster.R:
--------------------------------------------------------------------------------
1 | #' Fit a skmean Cluster
2 | #'
3 | #' Fit a skmean cluster to text data. Prior to distance measures being
4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
5 | #' \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate
6 | #' the distance matrix supplied to \code{\link[skmeans]{skmeans}}.
7 | #'
8 | #' @param x A data store object (see \code{\link[clustext]{data_store}}).
9 | #' @param k The number of clusters.
10 | #' @param \ldots Other arguments passed to \code{\link[skmeans]{skmeans}}.
11 | #' @return Returns an object of class \code{"skmean"}.
12 | #' @export
13 | #' @rdname skmeans_cluster
14 | #' @examples
15 | #' library(dplyr)
16 | #'
17 | #' x <- with(
18 | #' presidential_debates_2012,
19 | #' data_store(dialogue, paste(person, time, sep = "_"))
20 | #' )
21 | #'
22 | #'
23 | #' ## 6 topic model
24 | #' myfit1 <- skmeans_cluster(x, k=6)
25 | #'
26 | #' myfit1 %>%
27 | #' assign_cluster()
28 | #'
29 | #' myfit1 %>%
30 | #' assign_cluster() %>%
31 | #' summary()
32 | #'
33 | #' \dontrun{
34 | #' x2 <- presidential_debates_2012 %>%
35 | #' with(data_store(dialogue))
36 | #'
37 | #' myfit2 <- skmeans_cluster(x2, 55)
38 | #'
39 | #' assign_cluster(myfit2)
40 | #'
41 | #' assign_cluster(myfit2) %>%
42 | #' summary()
43 | #' }
44 | skmeans_cluster <- function(x, k, ...){
45 |
46 | UseMethod("skmeans_cluster")
47 |
48 | }
49 |
50 |
51 | #' @export
52 | #' @rdname skmeans_cluster
53 | #' @method skmeans_cluster data_store
54 | skmeans_cluster.data_store <- function(x, k, ...){
55 |
56 |
57 | fit <- skmeans::skmeans(x[["dtm"]], k=k, ...)
58 |
59 | text_data_store <- new.env(FALSE)
60 | text_data_store[["data"]] <- x
61 |
62 | class(fit) <- c("skmeans_cluster", class(fit))
63 | attributes(fit)[["text_data_store"]] <- text_data_store
64 | fit
65 | }
66 |
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | pn <- function(x, y) {
2 | m <- prettyNum(x, big.mark = ",", scientific = FALSE)
3 | paste0(paste(rep(" ", y - nchar(m)), collapse = ""), m)
4 | }
5 |
6 | pn2 <- function(x) prettyNum(x, big.mark = ",", scientific = FALSE)
7 |
8 |
9 | # min max scaling function
10 | min_max <- function(x) {
11 | if(max(x) - min(x) == 0) return(stats::setNames(rep(1, length(x)), names(x)))
12 | (x - min(x))/(max(x) - min(x))
13 | }
14 |
15 | above <- function(x, threshhold) which(x >= threshhold)
16 |
17 | rguid <- function(n=20){
18 | paste(sample(c(LETTERS, letters, 0:9), n, TRUE), collapse="")
19 | }
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/R/write_cluster_text.R:
--------------------------------------------------------------------------------
1 | #' Write/Read Cluster Text for Human Categorization
2 | #'
3 | #' Write cluster text from \code{get_text(assign_cluster(myfit))} to an external
4 | #' file for categorization. After file has been written with
5 | #' \code{write_cluster_text} a human coder can assign categories to each cluster.
6 | #' Simple write the category after the \code{Cluster #:}. To set a cluster category
7 | #' equal to another simply write and equal sign follwed by the other cluster to set
8 | #' as the same category (e.g., \code{Cluster 10: =5} to set cluster #10 the same as
9 | #' cluster #5). See \code{readLines(system.file("additional/foo_turk.txt", package = "clustext"))}
10 | #' for an example.
11 | #'
12 | #' @param x An \code{assign_cluster} object.
13 | #' @param path A pather to the file (.txt) is recommended.
14 | #' @param n.sample The length to limit the sample to (default gives all text in the cluster).
15 | #' Setting this to an integer uses this as the number to randomly sample from.
16 | #' @param lead A leading character string prefix to give the cluster text.
17 | #' @param \ldots ignored.
18 | #' @rdname write_cluster_text
19 | #' @export
20 | #' @seealso \code{\link[clustext]{categorize}}
21 | #' @examples
22 | #' library(dplyr)
23 | #'
24 | #' ## Assign Clusters
25 | #' ca <- presidential_debates_2012 %>%
26 | #' with(data_store(dialogue)) %>%
27 | #' hierarchical_cluster() %>%
28 | #' assign_cluster(k = 7)
29 | #'
30 | #' ## Write Cluster Text for Human Categorization
31 | #' write_cluster_text(ca)
32 | #' write_cluster_text(ca, n.sample=10)
33 | #' write_cluster_text(ca, lead=" -", n.sample=10)
34 | #'
35 | #' ## Read Human Coded Categories Back In
36 | #' categories_file <- system.file("additional/foo_turk.txt", package = "clustext")
37 | #' readLines(categories_file)
38 | #' (categories_key <- read_cluster_text(categories_file))
39 | #'
40 | #' ## Add Categories Back to Original Data Set
41 | #' categorize(
42 | #' data = presidential_debates_2012,
43 | #' assign.cluster = ca,
44 | #' cluster.key = categories_key
45 | #' )
46 | write_cluster_text <- function(x, path, n.sample = NULL, lead = " * ", ...){
47 |
48 | stopifnot(methods::is(x, 'assign_cluster'))
49 | if (missing(path)) path <- ""
50 |
51 |
52 | y <- get_text(x)
53 | lens <- paste0("n = ", pn2(unlist(lapply(y, length))))
54 |
55 | # get sample of text in each cluster
56 | if (!is.null(n.sample)){
57 | y <- lapply(y, function(x) sample(x, min(length(x), n.sample)))
58 | }
59 |
60 | cls <- paste0("Cluster ", seq_along(y), ":")
61 | brd <- sapply(nchar(cls), function(x) paste(rep("=", x), collapse=""))
62 |
63 | if (!is.null(n.sample)){
64 | cls <- paste(cls, lens, sep = "\n")
65 | }
66 |
67 | cat(
68 | gsub("^\\s+", "",
69 | paste(
70 | paste(
71 | paste("\n", brd, cls, brd, sep="\n"),
72 | sapply(lapply(y, function(z) paste0(lead, z)), paste, collapse="\n"), sep="\n"
73 | ),
74 | collapse="\n\n\n"
75 | )
76 | ), "\n",
77 | file=path
78 | )
79 | }
80 |
81 |
82 | #' @rdname write_cluster_text
83 | #' @export
84 | read_cluster_text <- function(path, ...){
85 |
86 | x <- suppressWarnings(readLines(path))
87 | y <- sub("\\s*:\\s*", "splitherenow", gsub("Cluster\\s*", "", grep("^Cluster", x, value=TRUE)))
88 | content <- lapply(strsplit(y, "splitherenow"), function(x){
89 | out <- trimws(x)
90 | if (length(out) == 1) out <- c(out, NA)
91 | out
92 | })
93 |
94 | key <- stats::setNames(data.frame(do.call(rbind, content), stringsAsFactors = FALSE), c('cluster', 'category'))
95 |
96 | repl <- sub("^=", "", grep("=\\d+", key[['category']], value=TRUE))
97 | if (length(repl) > 0){
98 | key[['category']][grep("=\\d+", key[['category']])] <- key[match(repl, key[["cluster"]]), "category"]
99 | }
100 | key[["cluster"]] <- as.integer(key[["cluster"]])
101 | class(key) <- c('cluster_key', class(key))
102 | key
103 | }
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "clustext"
3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
4 | output:
5 | md_document:
6 | toc: true
7 | ---
8 |
9 | ```{r, echo=FALSE}
10 | library(knitr)
11 | desc <- suppressWarnings(readLines("DESCRIPTION"))
12 | regex <- "(^Version:\\s+)(\\d+\\.\\d+\\.\\d+)"
13 | loc <- grep(regex, desc)
14 | ver <- gsub(regex, "\\2", desc[loc])
15 | verbadge <- sprintf('
', ver, ver)
16 | ````
17 |
18 |
19 | ```{r, echo=FALSE}
20 | knit_hooks$set(htmlcap = function(before, options, envir) {
21 | if(!before) {
22 | paste('',options$htmlcap,"
",sep="")
23 | }
24 | })
25 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE)
26 | knitr::opts_chunk$set(fig.path = "tools/figure/")
27 | ```
28 |
29 | [](http://www.repostatus.org/#active)
30 | [](https://travis-ci.org/trinker/clustext)
31 | [](https://coveralls.io/r/trinker/clustext?branch=master)
32 | `r verbadge`
33 |
34 | 
35 |
36 | **clustext** is a collection of optimized tools for clustering text data via various text appropriate clustering algorithms. There are many great R [clustering tools](https://cran.r-project.org/web/views/Cluster.html) to locate topics within documents. I have had success with hierarchical clustering for topic extraction. This initial success birthed the [**hclustext**](https://github.com/trinker/hclustext) package. Additional techniques such as kmeans and non-negative matrix factorization also proved useful. These algorithms began to be collected in a consistent manor of use in the **clustext** package. This package wraps many of the great R tools for clustering and working with sparse matrices to aide in the workflow associated with topic extraction.
37 |
38 | The general idea is that we turn the documents into a matrix of words. After this we weight the terms by importance using [tf-idf](http://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html). This helps the more salient words to rise to the top. Some clustering algorithms require a similarity matrix while others require just the tf-idf weighted DocumentTermMatrices. Likewise, some algorithms require `k` terms to be specified before the model fit while others allow `k` topics to be determined after the model has been fit.
39 |
40 |
41 | With algorithms that require a similarity matrix (e.g., hierarchical clustering) we apply cosine distance measures to compare the terms (or features) of each document. I have found cosine distance to work well with sparse matrices to produce distances metrics between the documents. The clustering model is fit to separate the documents into clusters. In the case of some clustering techniques (e.g., hierarchical clustering) the user then may apply k clusters to the fit, clustering documents with similar important text features. Other techniques require that `k` be specified prior to fitting the model. The documents can then be grouped by clusters and their accompanying salient words extracted as well.
42 |
43 | # Functions
44 |
45 | The main functions, task category, & descriptions are summarized in the table below:
46 |
47 | | Function | Category | Description |
48 | |------------------------|----------------|-------------------------------------------------------------------------|
49 | | `data_store` | data structure | **clustext**'s data structure (list of dtm + text) |
50 | | `hierarchical_cluster` | cluster fit | Fits a hierarchical cluster model |
51 | | `kmeans_cluster` | cluster fit | Fits a kmeans cluster model |
52 | | `skmeans_cluster` | cluster fit | Fits an skmeans cluster model |
53 | | `nfm_cluster` | cluster fit | Fits a non-negative matrix factorization cluster model |
54 | | `assign_cluster` | assignment | Assigns cluster to document/text element |
55 | | `get_text` | extraction | Get text from various **clustext** objects |
56 | | `get_dtm` | extraction | Get `tm::DocumentTermMatrix` from various **clustext** objects |
57 | | `get_removed` | extraction | Get removed text elements from various **clustext** objects |
58 | | `get_documents` | extraction | Get clustered documents from an **assign_cluster** object |
59 | | `get_terms` | extraction | Get clustered weighted important terms from an **assign_cluster** object|
60 | | `as_topic` | categorization | View `get_terms` object as topics (pretty printed important words) |
61 | | `write_cluster_text` | categorization | Write `get_text(assign_cluster(myfit))` to file for human coding |
62 | | `read_cluster_text` | categorization | Read in a human coded `write_cluster_text` file |
63 | | `categorize` | categorization | Assign human categories and matching clusters to original data |
64 |
65 | # Installation
66 |
67 | To download the development version of **clustext**:
68 |
69 | Download the [zip ball](https://github.com/trinker/clustext/zipball/master) or [tar ball](https://github.com/trinker/clustext/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version:
70 |
71 | ```r
72 | if (!require("pacman")) install.packages("pacman")
73 | pacman::p_load_gh(
74 | "trinker/textshape",
75 | "trinker/gofastr",
76 | "trinker/termco",
77 | "trinker/clustext"
78 | )
79 | ```
80 |
81 | # Contact
82 |
83 | You are welcome to:
84 | * submit suggestions and bug-reports at:
85 | * send a pull request on:
86 | * compose a friendly e-mail to:
87 |
88 | # Demonstration
89 |
90 | ## Load Packages and Data
91 |
92 | ```{r}
93 | if (!require("pacman")) install.packages("pacman")
94 | pacman::p_load(clustext, dplyr, textshape, ggplot2, tidyr)
95 |
96 | data(presidential_debates_2012)
97 | ```
98 |
99 |
100 | ## Data Structure
101 |
102 | The data structure for **clustext** is very specific. The `data_storage` produces a `DocumentTermMatrix` which maps to the original text. The empty/removed documents are tracked within this data structure, making subsequent calls to cluster the original documents and produce weighted important terms more robust. Making the `data_storage` object is the first step to analysis.
103 |
104 | We can give the `DocumentTermMatrix` rownames via the `doc.names` argument. If these names are not unique they will be combined into a single document as seen below. Also, if you want to do stemming, minimum character length, stopword removal or such this is when/where it's done.
105 |
106 |
107 | ```{r}
108 | ds <- with(
109 | presidential_debates_2012,
110 | data_store(dialogue, doc.names = paste(person, time, sep = "_"))
111 | )
112 |
113 | ds
114 | ```
115 |
116 |
117 | ## Fit the Model: Hierarchical Cluster
118 |
119 | Next we can fit a hierarchical cluster model to the `data_store` object via `hierarchical_cluster`.
120 |
121 | ```{r}
122 | myfit <- hierarchical_cluster(ds)
123 |
124 | myfit
125 | ```
126 |
127 |
128 | This object can be plotted with various `k` or `h` parameters specified to experiment with cutting the dendrogram. This cut will determine the number of clusters or topics that will be generated in the next step. The visual inspection allows for determining how to cluster the data as well as determining if a tf-idf, cosine, hierarchical cluster model is a right fit for the data and task. By default `plot` uses an approximation of `k` based on Can & Ozkarahan's (1990) formula $(m * n)/t$ where $m$ and $n$ are the dimensions of the matrix and $t$ is the length of the non-zero elements in matrix $A$.
129 |
130 | - Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of the cover-coefficient-based clustering methodology for text databases. *ACM Transactions on Database Systems 15* (4): 483. doi:10.1145/99935.99938
131 |
132 | Interestingly, in the plots below where `k = 6` clusters, the model groups each of the candidates together at each of the debate times.
133 |
134 |
135 | ```{r}
136 | plot(myfit)
137 | plot(myfit, k=6)
138 | plot(myfit, h = .75)
139 | ```
140 |
141 | ## Assigning Clusters
142 |
143 | The `assign_cluster` function allows the user to dictate the number of clusters. Because the model has already been fit the cluster assignment is merely selecting the branches from the dendrogram, and is thus very quick. Unlike many clustering techniques the number of clusters is done after the model is fit, this allows for speedy cluster assignment, meaning the user can experiment with the number of clusters.
144 |
145 |
146 | ```{r}
147 | ca <- assign_cluster(myfit, k = 6)
148 |
149 | ca
150 | ```
151 |
152 |
153 | ### Cluster Loading
154 |
155 | To check the number of documents loading on a cluster there is a `summary` method for `assign_cluster` which provides a descending data frame of clusters and counts. Additionally, a horizontal bar plot shows the document loadings on each cluster.
156 |
157 | ```{r}
158 | summary(ca)
159 | ```
160 |
161 |
162 | ### Cluster Text
163 |
164 | The user can grab the texts from the original documents grouped by cluster using the `get_text` function. Here I demo a 40 character substring of the document texts.
165 |
166 | ```{r}
167 | get_text(ca) %>%
168 | lapply(substring, 1, 40)
169 | ```
170 |
171 |
172 | ### Cluster Frequent Terms
173 |
174 | As with many topic clustering techniques, it is useful to get the to salient terms from the model. The `get_terms` function uses the [min-max](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) scaled, [tf-idf weighted](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), `DocumentTermMatrix` to extract the most frequent salient terms. These terms can give a sense of the topic being discussed. Notice the absence of clusters 1 & 6. This is a result of only a single document included in each of the clusters. The `term.cutoff` hyperparmeter sets the lower bound on the min-max scaled tf-idf to accept. If you don't get any terms you may want to lower this or reduce `min.n`. Likewise, these two parameters can be raised to eliminate noise.
175 |
176 | ```{r}
177 | get_terms(ca)
178 | ```
179 |
180 | Or pretty printed...
181 |
182 | ```{r}
183 | get_terms(ca) %>%
184 | as_topic()
185 | ```
186 |
187 | ### Clusters, Terms, and Docs Plot
188 |
189 | Here I plot the clusters, terms, and documents (grouping variables) together as a combined heatmap. This can be useful for viewing & comparing what documents are clustering together in the context of the cluster's salient terms. This example also shows how to use the cluster terms as a lookup key to extract probable salient terms for a given document.
190 |
191 | ```{r, fig.width=11}
192 | key <- data_frame(
193 | cluster = 1:6,
194 | labs = get_terms(ca) %>%
195 | tidy_list("cluster") %>%
196 | select(-weight) %>%
197 | group_by(cluster) %>%
198 | summarize(term=paste(term, collapse=", ")) %>%
199 | apply(1, paste, collapse=": ")
200 | )
201 |
202 | ca %>%
203 | tidy_vector("id", "cluster") %>%
204 | separate(id, c("person", "time"), sep="_") %>%
205 | tbl_df() %>%
206 | left_join(key, by = "cluster") %>%
207 | mutate(n = 1) %>%
208 | mutate(labs = factor(labs, levels=rev(key[["labs"]]))) %>%
209 | unite("time_person", time, person, sep="\n") %>%
210 | select(-cluster) %>%
211 | complete(time_person, labs) %>%
212 | mutate(n = factor(ifelse(is.na(n), FALSE, TRUE))) %>%
213 | ggplot(aes(time_person, labs, fill = n)) +
214 | geom_tile() +
215 | scale_fill_manual(values=c("grey90", "red"), guide=FALSE) +
216 | labs(x=NULL, y=NULL)
217 | ```
218 |
219 |
220 | ### Cluster Documents
221 |
222 | The `get_documents` function grabs the documents associated with a particular cluster. This is most useful in cases where the number of documents is small and they have been given names.
223 |
224 | ```{r}
225 | get_documents(ca)
226 | ```
227 |
228 |
229 | ## Putting it Together
230 |
231 | I like working in a chain. In the setup below we work within a **magrittr** pipeline to fit a model, select clusters, and examine the results. In this example I do not condense the 2012 Presidential Debates data by speaker and time, rather leaving every sentence as a separate document. On my machine the initial `data_store` and model fit take ~5-8 seconds to run. Note that I do restrict the number of clusters (for texts and terms) to a random 5 clusters for the sake of space.
232 |
233 |
234 | ```{r, fig.height = 10}
235 | .tic <- Sys.time()
236 |
237 | myfit2 <- presidential_debates_2012 %>%
238 | with(data_store(dialogue)) %>%
239 | hierarchical_cluster()
240 |
241 | difftime(Sys.time(), .tic)
242 |
243 | ## View Document Loadings
244 | ca2 <- assign_cluster(myfit2, k = 100)
245 | summary(ca2) %>%
246 | head(12)
247 |
248 | ## Split Text into Clusters
249 | set.seed(5); inds <- sort(sample.int(100, 5))
250 |
251 | get_text(ca2)[inds] %>%
252 | lapply(head, 10)
253 |
254 | ## Get Associated Terms
255 | get_terms(ca2, .4)[inds]
256 |
257 | ## Pretty Printed Topics
258 | ## Get Associated Terms
259 | get_terms(ca2, .4) %>%
260 | as_topic()
261 | ```
262 |
263 |
264 | ## An Experiment
265 |
266 | It seems to me that if the hierarchical clustering is function as expected we'd see topics clustering together within a conversation as the natural eb and flow of a conversation is to talk around a topic for a while and then move on to the next related topic. A Gantt style plot of topics across time seems like an excellent way to observe clustering across time. In the experiment I first ran the hierarchical clustering at the sentence level for all participants in the 2012 presidential debates data set. I then decided to use turn of talk as the unit of analysis. Finally, I pulled out the two candidates (President Obama and Romney) and faceted n their topic use over time.
267 |
268 | ```{r, fig.width=10, fig.height = 9}
269 | if (!require("pacman")) install.packages("pacman")
270 | pacman::p_load(dplyr, clustext, textshape, ggplot2, stringi)
271 |
272 | myfit3 <- presidential_debates_2012 %>%
273 | mutate(tot = gsub("\\..+$", "", tot)) %>%
274 | with(data_store(dialogue)) %>%
275 | hierarchical_cluster()
276 |
277 | plot(myfit3, 75)
278 | ```
279 |
280 | Can & Ozkarahan's (1990) formula indicated a `k = 259`. This umber seemed overly large. I used `k = 75` for the number of topics as it seemed unreasonable that there'd be more topics than this but with `k = 75` over half of the sentences loaded on one cluster. Note the use of the `attribute` `join` from `assign_cluster` to make joining back to the original data set easier.
281 |
282 |
283 | ```{r, fig.width=14, fig.height = 16}
284 | k <- 75
285 | ca3 <- assign_cluster(myfit3, k = k)
286 |
287 | presidential_debates_2012 %>%
288 | mutate(tot = gsub("\\..+$", "", tot)) %>%
289 | tbl_df() %>%
290 | attributes(ca3)$join() %>%
291 | group_by(time) %>%
292 | mutate(
293 | word_count = stringi::stri_count_words(dialogue),
294 | start = starts(word_count),
295 | end = ends(word_count)
296 | ) %>%
297 | na.omit() %>%
298 | mutate(cluster = factor(cluster, levels = k:1)) %>%
299 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) +
300 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
301 | ggplot2::theme_bw() +
302 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
303 | panel.grid.minor.x = ggplot2::element_blank(),
304 | panel.grid.major.x = ggplot2::element_blank(),
305 | panel.grid.minor.y = ggplot2::element_blank(),
306 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
307 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
308 | strip.background = ggplot2::element_blank()) +
309 | ggplot2::facet_wrap(~time, scales='free', ncol=1) +
310 | ggplot2::labs(x="Duration (words)", y="Cluster")
311 | ```
312 |
313 | Right away we notice that not all topics are used across all three times. This is encouraging that the clustering is working as expected as we'd expect some overlap in debate topics as well as some unique topics. However, there were so many topics clustering on cluster 3 that I had to make some decisions. I could (a) ignore this mass and essentially throw out half the data that loaded on a single cluster, (b) increase `k` to split up the mass loading on cluster 3, (c) change the unit of analysis. It seemed the first option was wasteful of data and could miss information. The second approach could lead to a model that had so many topics it wouldn't be meaningful. The last approach seemed reasonable, inspecting the cluster text showed that many were capturing functions of language rather than content. For example, people use *"Oh."* to indicate agreement. This isn't a topic but the clustering would group sentences that use this convention together. Combining this sentence with other sentences in the turn of talk are more likely to get the content we're after.
314 |
315 | Next I used the `textshape::combine` function to group turns of talk together.
316 |
317 | ```{r, fig.width=10, fig.height = 9}
318 | myfit4 <- presidential_debates_2012 %>%
319 | mutate(tot = gsub("\\..+$", "", tot)) %>%
320 | textshape::combine() %>%
321 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>%
322 | hierarchical_cluster()
323 |
324 | plot(myfit4, k = 80)
325 | ```
326 |
327 | The distribution of turns of talk looked much more dispersed across clusters. I used `k = 60` for the number of topics.
328 |
329 |
330 | ```{r, fig.width=14, fig.height = 16}
331 | k <- 80
332 | ca4 <- assign_cluster(myfit4, k = k)
333 |
334 | presidential_debates_2012 %>%
335 | mutate(tot = gsub("\\..+$", "", tot)) %>%
336 | textshape::combine() %>%
337 | tbl_df() %>%
338 | attributes(ca4)$join() %>%
339 | group_by(time) %>%
340 | mutate(
341 | word_count = stringi::stri_count_words(dialogue),
342 | start = starts(word_count),
343 | end = ends(word_count)
344 | ) %>%
345 | na.omit() %>%
346 | mutate(cluster = factor(cluster, levels = k:1)) %>%
347 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) +
348 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
349 | ggplot2::theme_bw() +
350 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
351 | panel.grid.minor.x = ggplot2::element_blank(),
352 | panel.grid.major.x = ggplot2::element_blank(),
353 | panel.grid.minor.y = ggplot2::element_blank(),
354 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
355 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
356 | strip.background = ggplot2::element_blank()) +
357 | ggplot2::facet_wrap(~time, scales='free', ncol=1) +
358 | ggplot2::labs(x="Duration (words)", y="Cluster")
359 | ```
360 |
361 |
362 | The plots looked less messy and indeed topics do appear to be clustering around one another. I wanted to see how the primary participants, the candidates, compared to each other in topic use.
363 |
364 | In this last bit of analysis I filter out all participants except Obama and Romeny and facet by participant across time.
365 |
366 | ```{r, fig.width=10, fig.height = 9}
367 | myfit5 <- presidential_debates_2012 %>%
368 | mutate(tot = gsub("\\..+$", "", tot)) %>%
369 | textshape::combine() %>%
370 | filter(person %in% c("ROMNEY", "OBAMA")) %>%
371 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>%
372 | hierarchical_cluster()
373 |
374 |
375 | plot(myfit5, 50)
376 | ```
377 |
378 | Based on the dendrogram, I used `k = 50` for the number of topics.
379 |
380 |
381 | ```{r, fig.width=14, fig.height = 12}
382 | k <- 50
383 | ca5 <- assign_cluster(myfit5, k = k)
384 |
385 | presidential_debates_2012 %>%
386 | mutate(tot = gsub("\\..+$", "", tot)) %>%
387 | textshape::combine() %>%
388 | filter(person %in% c("ROMNEY", "OBAMA")) %>%
389 | tbl_df() %>%
390 | attributes(ca5)$join() %>%
391 | group_by(time) %>%
392 | mutate(
393 | word_count = stringi::stri_count_words(dialogue),
394 | start = starts(word_count),
395 | end = ends(word_count)
396 | ) %>%
397 | na.omit() %>%
398 | mutate(cluster = factor(cluster, levels = k:1)) %>%
399 | ggplot2::ggplot(ggplot2::aes(x = start-10, y = cluster, xend = end+10, yend = cluster)) +
400 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
401 | ggplot2::theme_bw() +
402 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
403 | panel.grid.minor.x = ggplot2::element_blank(),
404 | panel.grid.major.x = ggplot2::element_blank(),
405 | panel.grid.minor.y = ggplot2::element_blank(),
406 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
407 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
408 | strip.background = ggplot2::element_blank()) +
409 | ggplot2::facet_grid(person~time, scales='free', space='free') +
410 | ggplot2::labs(x="Duration (words)", y="Cluster")
411 | ```
412 |
413 |
414 | If you're curious about the heaviest weighted tf-idf terms in each cluster the next code chunk provides the top five weighted terms used in each cluster. Below this I provide a bar plot of the frequencies of clusters to help put the other information into perspective.
415 |
416 |
417 |
418 | ```{r}
419 | invisible(Map(function(x, y){
420 |
421 | if (is.null(x)) {
422 | cat(sprintf("Cluster %s: ...\n", y))
423 | } else {
424 | m <- dplyr::top_n(x, 5, n)
425 | o <- paste(paste0(m[[1]], " (", round(m[[2]], 1), ")"), collapse="; ")
426 | cat(sprintf("Cluster %s: %s\n", y, o))
427 | }
428 |
429 | }, get_terms(ca5, .4), names(get_terms(ca5, .4))))
430 |
431 | ```
432 |
433 | ```{r, fig.height = 9}
434 | invisible(summary(ca5))
435 | ```
436 |
437 | It appears that in fact the topics do cluster within segments of time as we'd expect. This is more apparent when turn of talk is used as the unit of analysis (document level) rather than each sentence.
438 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | clustext [](https://twitter.com/intent/follow?screen_name=tylerrinker)
2 | ============
3 |
4 |
5 | [](http://www.repostatus.org/#active)
8 | [](https://travis-ci.org/trinker/clustext)
10 | [](https://coveralls.io/r/trinker/clustext?branch=master)
12 |
13 |
14 |
15 | 
16 |
17 | **clustext** is a collection of optimized tools for clustering text data
18 | via various text appropriate clustering algorithms. There are many great
19 | R [clustering tools](https://cran.r-project.org/web/views/Cluster.html)
20 | to locate topics within documents. I have had success with hierarchical
21 | clustering for topic extraction. This initial success birthed the
22 | [**hclustext**](https://github.com/trinker/hclustext) package.
23 | Additional techniques such as kmeans and non-negative matrix
24 | factorization also proved useful. These algorithms began to be collected
25 | in a consistent manor of use in the **clustext** package. This package
26 | wraps many of the great R tools for clustering and working with sparse
27 | matrices to aide in the workflow associated with topic extraction.
28 |
29 | The general idea is that we turn the documents into a matrix of words.
30 | After this we weight the terms by importance using
31 | [tf-idf](http://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html).
32 | This helps the more salient words to rise to the top. Some clustering
33 | algorithms require a similarity matrix while others require just the
34 | tf-idf weighted DocumentTermMatrices. Likewise, some algorithms require
35 | `k` terms to be specified before the model fit while others allow `k`
36 | topics to be determined after the model has been fit.
37 |
38 | With algorithms that require a similarity matrix (e.g., hierarchical
39 | clustering) we apply cosine distance measures to compare the terms (or
40 | features) of each document. I have found cosine distance to work well
41 | with sparse matrices to produce distances metrics between the documents.
42 | The clustering model is fit to separate the documents into clusters. In
43 | the case of some clustering techniques (e.g., hierarchical clustering)
44 | the user then may apply k clusters to the fit, clustering documents with
45 | similar important text features. Other techniques require that `k` be
46 | specified prior to fitting the model. The documents can then be grouped
47 | by clusters and their accompanying salient words extracted as well.
48 |
49 |
50 | Table of Contents
51 | ============
52 |
53 | - [Functions](#functions)
54 | - [Installation](#installation)
55 | - [Contact](#contact)
56 | - [Demonstration](#demonstration)
57 | - [Load Packages and Data](#load-packages-and-data)
58 | - [Data Structure](#data-structure)
59 | - [Fit the Model: Hierarchical Cluster](#fit-the-model-hierarchical-cluster)
60 | - [Assigning Clusters](#assigning-clusters)
61 | - [Cluster Loading](#cluster-loading)
62 | - [Cluster Text](#cluster-text)
63 | - [Cluster Frequent Terms](#cluster-frequent-terms)
64 | - [Clusters, Terms, and Docs Plot](#clusters-terms-and-docs-plot)
65 | - [Cluster Documents](#cluster-documents)
66 | - [Putting it Together](#putting-it-together)
67 | - [An Experiment](#an-experiment)
68 |
69 | Functions
70 | ============
71 |
72 |
73 | The main functions, task category, & descriptions are summarized in the
74 | table below:
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
88 |
89 |
90 |
91 | data_store |
92 | data structure |
93 | clustext's data structure (list of dtm + text) |
94 |
95 |
96 | hierarchical_cluster |
97 | cluster fit |
98 | Fits a hierarchical cluster model |
99 |
100 |
101 | kmeans_cluster |
102 | cluster fit |
103 | Fits a kmeans cluster model |
104 |
105 |
106 | skmeans_cluster |
107 | cluster fit |
108 | Fits an skmeans cluster model |
109 |
110 |
111 | nfm_cluster |
112 | cluster fit |
113 | Fits a non-negative matrix factorization cluster model |
114 |
115 |
116 | assign_cluster |
117 | assignment |
118 | Assigns cluster to document/text element |
119 |
120 |
121 | get_text |
122 | extraction |
123 | Get text from various clustext objects |
124 |
125 |
126 | get_dtm |
127 | extraction |
128 | Get tm::DocumentTermMatrix from various clustext objects |
129 |
130 |
131 | get_removed |
132 | extraction |
133 | Get removed text elements from various clustext objects |
134 |
135 |
136 | get_documents |
137 | extraction |
138 | Get clustered documents from an assign_cluster object |
139 |
140 |
141 | get_terms |
142 | extraction |
143 | Get clustered weighted important terms from an assign_cluster object |
144 |
145 |
146 | as_topic |
147 | categorization |
148 | View get_terms object as topics (pretty printed important words) |
149 |
150 |
151 | write_cluster_text |
152 | categorization |
153 | Write get_text(assign_cluster(myfit)) to file for human coding |
154 |
155 |
156 | read_cluster_text |
157 | categorization |
158 | Read in a human coded write_cluster_text file |
159 |
160 |
161 | categorize |
162 | categorization |
163 | Assign human categories and matching clusters to original data |
164 |
165 |
166 |
167 |
168 | Installation
169 | ============
170 |
171 | To download the development version of **clustext**:
172 |
173 | Download the [zip
174 | ball](https://github.com/trinker/clustext/zipball/master) or [tar
175 | ball](https://github.com/trinker/clustext/tarball/master), decompress
176 | and run `R CMD INSTALL` on it, or use the **pacman** package to install
177 | the development version:
178 |
179 | if (!require("pacman")) install.packages("pacman")
180 | pacman::p_load_gh(
181 | "trinker/textshape",
182 | "trinker/gofastr",
183 | "trinker/termco",
184 | "trinker/clustext"
185 | )
186 |
187 | Contact
188 | =======
189 |
190 | You are welcome to:
191 | - submit suggestions and bug-reports at:
192 | - send a pull request on:
193 | - compose a friendly e-mail to:
194 |
195 | Demonstration
196 | =============
197 |
198 | Load Packages and Data
199 | ----------------------
200 |
201 | if (!require("pacman")) install.packages("pacman")
202 | pacman::p_load(clustext, dplyr, textshape, ggplot2, tidyr)
203 |
204 | data(presidential_debates_2012)
205 |
206 | Data Structure
207 | --------------
208 |
209 | The data structure for **clustext** is very specific. The `data_storage`
210 | produces a `DocumentTermMatrix` which maps to the original text. The
211 | empty/removed documents are tracked within this data structure, making
212 | subsequent calls to cluster the original documents and produce weighted
213 | important terms more robust. Making the `data_storage` object is the
214 | first step to analysis.
215 |
216 | We can give the `DocumentTermMatrix` rownames via the `doc.names`
217 | argument. If these names are not unique they will be combined into a
218 | single document as seen below. Also, if you want to do stemming, minimum
219 | character length, stopword removal or such this is when/where it's done.
220 |
221 | ds <- with(
222 | presidential_debates_2012,
223 | data_store(dialogue, doc.names = paste(person, time, sep = "_"))
224 | )
225 |
226 | ds
227 |
228 | ## <>
229 | ## Text Elements : 10
230 | ## Elements Removed : 0
231 | ## Non-/sparse entries: 6916/24884
232 | ## Sparsity : 78%
233 | ## Maximal term length: 16
234 | ## Minimum term length: 3
235 |
236 | Fit the Model: Hierarchical Cluster
237 | -----------------------------------
238 |
239 | Next we can fit a hierarchical cluster model to the `data_store` object
240 | via `hierarchical_cluster`.
241 |
242 | myfit <- hierarchical_cluster(ds)
243 |
244 | myfit
245 |
246 | ##
247 | ## Call:
248 | ## fastcluster::hclust(d = distmes(x[["dtm"]]), method = method)
249 | ##
250 | ## Cluster method : ward.D
251 | ## Number of objects: 10
252 |
253 | This object can be plotted with various `k` or `h` parameters specified
254 | to experiment with cutting the dendrogram. This cut will determine the
255 | number of clusters or topics that will be generated in the next step.
256 | The visual inspection allows for determining how to cluster the data as
257 | well as determining if a tf-idf, cosine, hierarchical cluster model is a
258 | right fit for the data and task. By default `plot` uses an approximation
259 | of `k` based on Can & Ozkarahan's (1990) formula (*m* \* *n*)/*t* where
260 | *m* and *n* are the dimensions of the matrix and *t* is the length of
261 | the non-zero elements in matrix *A*.
262 |
263 | - Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of the
264 | cover-coefficient-based clustering methodology for text databases.
265 | *ACM Transactions on Database Systems 15* (4): 483.
266 |
267 |
268 | Interestingly, in the plots below where `k = 6` clusters, the model
269 | groups each of the candidates together at each of the debate times.
270 |
271 | plot(myfit)
272 |
273 | ##
274 | ## k approximated to: 5
275 |
276 | 
277 |
278 | plot(myfit, k=6)
279 |
280 | 
281 |
282 | plot(myfit, h = .75)
283 |
284 | 
285 |
286 | Assigning Clusters
287 | ------------------
288 |
289 | The `assign_cluster` function allows the user to dictate the number of
290 | clusters. Because the model has already been fit the cluster assignment
291 | is merely selecting the branches from the dendrogram, and is thus very
292 | quick. Unlike many clustering techniques the number of clusters is done
293 | after the model is fit, this allows for speedy cluster assignment,
294 | meaning the user can experiment with the number of clusters.
295 |
296 | ca <- assign_cluster(myfit, k = 6)
297 |
298 | ca
299 |
300 | ## CROWLEY_time 2 LEHRER_time 1 OBAMA_time 1 OBAMA_time 2
301 | ## 1 2 3 4
302 | ## OBAMA_time 3 QUESTION_time 2 ROMNEY_time 1 ROMNEY_time 2
303 | ## 5 6 3 4
304 | ## ROMNEY_time 3 SCHIEFFER_time 3
305 | ## 5 2
306 |
307 | ### Cluster Loading
308 |
309 | To check the number of documents loading on a cluster there is a
310 | `summary` method for `assign_cluster` which provides a descending data
311 | frame of clusters and counts. Additionally, a horizontal bar plot shows
312 | the document loadings on each cluster.
313 |
314 | summary(ca)
315 |
316 | 
317 |
318 | ## cluster count
319 | ## 1 2 2
320 | ## 2 3 2
321 | ## 3 4 2
322 | ## 4 5 2
323 | ## 5 1 1
324 | ## 6 6 1
325 |
326 | ### Cluster Text
327 |
328 | The user can grab the texts from the original documents grouped by
329 | cluster using the `get_text` function. Here I demo a 40 character
330 | substring of the document texts.
331 |
332 | get_text(ca) %>%
333 | lapply(substring, 1, 40)
334 |
335 | ## $`1`
336 | ## [1] "Good evening from Hofstra University in "
337 | ##
338 | ## $`2`
339 | ## [1] "We'll talk about specifically about heal"
340 | ## [2] "Good evening from the campus of Lynn Uni"
341 | ##
342 | ## $`3`
343 | ## [1] "Jim, if I if I can just respond very qui"
344 | ## [2] "What I support is no change for current "
345 | ##
346 | ## $`4`
347 | ## [1] "Jeremy, first of all, your future is bri"
348 | ## [2] "Thank you, Jeremy. I appreciate your you"
349 | ##
350 | ## $`5`
351 | ## [1] "Well, my first job as commander in chief"
352 | ## [2] "Thank you, Bob. And thank you for agreei"
353 | ##
354 | ## $`6`
355 | ## [1] "Mister President, Governor Romney, as a "
356 |
357 | ### Cluster Frequent Terms
358 |
359 | As with many topic clustering techniques, it is useful to get the to
360 | salient terms from the model. The `get_terms` function uses the
361 | [min-max](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling)
362 | scaled, [tf-idf weighted](https://en.wikipedia.org/wiki/Tf%E2%80%93idf),
363 | `DocumentTermMatrix` to extract the most frequent salient terms. These
364 | terms can give a sense of the topic being discussed. Notice the absence
365 | of clusters 1 & 6. This is a result of only a single document included
366 | in each of the clusters. The `term.cutoff` hyperparmeter sets the lower
367 | bound on the min-max scaled tf-idf to accept. If you don't get any terms
368 | you may want to lower this or reduce `min.n`. Likewise, these two
369 | parameters can be raised to eliminate noise.
370 |
371 | get_terms(ca)
372 |
373 | ## $`1 (n=1)`
374 | ## term weight
375 | ## 1 mister 1.0000000
376 | ## 2 along 0.7086841
377 | ## 3 sort 0.6678306
378 | ## 4 unemployed 0.6223915
379 | ##
380 | ## $`2 (n=2)`
381 | ## term weight
382 | ## 1 segment 1.0000000
383 | ## 2 minutes 0.9091730
384 | ## 3 minute 0.6648988
385 | ##
386 | ## $`3 (n=2)`
387 | ## term weight
388 | ## 1 insurance 1.0000000
389 | ## 2 health 0.6200389
390 | ##
391 | ## $`4 (n=2)`
392 | ## term weight
393 | ## 1 coal 1.0000000
394 | ## 2 jobs 0.9439400
395 | ## 3 sure 0.9330092
396 | ## 4 immigration 0.9134630
397 | ## 5 oil 0.9014907
398 | ## 6 issue 0.7352300
399 | ## 7 candy 0.7303597
400 | ## 8 production 0.7291683
401 | ## 9 women 0.7073096
402 | ## 10 million 0.6792076
403 | ## 11 settle 0.6056192
404 | ## 12 illegally 0.6055244
405 | ##
406 | ## $`5 (n=2)`
407 | ## term weight
408 | ## 1 nuclear 1.0000000
409 | ## 2 iran 0.9511527
410 | ## 3 sanctions 0.8585336
411 | ## 4 israel 0.7895173
412 | ## 5 sure 0.7698270
413 | ## 6 region 0.7608304
414 | ## 7 military 0.7272537
415 | ## 8 troops 0.6768143
416 | ## 9 pakistan 0.6766784
417 | ## 10 world 0.6716568
418 | ## 11 threat 0.6520238
419 | ## 12 iraq 0.6467488
420 | ##
421 | ## $`6 (n=1)`
422 | ## term weight
423 | ## 1 department 1.0000000
424 | ## 2 chu 0.6666667
425 | ## 3 stated 0.6666667
426 | ## 4 misperception 0.6666667
427 |
428 | Or pretty printed...
429 |
430 | get_terms(ca) %>%
431 | as_topic()
432 |
433 | ## Cluster 2 (n=2): segment, minutes, minute
434 | ## Cluster 3 (n=2): insurance, health
435 | ## Cluster 4 (n=2): coal, jobs, sure, immigration, oil, issue, candy, production...
436 | ## Cluster 5 (n=2): nuclear, iran, sanctions, israel, sure, region, military, troops...
437 | ## Cluster 1 (n=1): mister, along, sort, unemployed
438 | ## Cluster 6 (n=1): department, chu, stated, misperception
439 |
440 | ### Clusters, Terms, and Docs Plot
441 |
442 | Here I plot the clusters, terms, and documents (grouping variables)
443 | together as a combined heatmap. This can be useful for viewing &
444 | comparing what documents are clustering together in the context of the
445 | cluster's salient terms. This example also shows how to use the cluster
446 | terms as a lookup key to extract probable salient terms for a given
447 | document.
448 |
449 | key <- data_frame(
450 | cluster = 1:6,
451 | labs = get_terms(ca) %>%
452 | tidy_list("cluster") %>%
453 | select(-weight) %>%
454 | group_by(cluster) %>%
455 | summarize(term=paste(term, collapse=", ")) %>%
456 | apply(1, paste, collapse=": ")
457 | )
458 |
459 | ca %>%
460 | tidy_vector("id", "cluster") %>%
461 | separate(id, c("person", "time"), sep="_") %>%
462 | tbl_df() %>%
463 | left_join(key, by = "cluster") %>%
464 | mutate(n = 1) %>%
465 | mutate(labs = factor(labs, levels=rev(key[["labs"]]))) %>%
466 | unite("time_person", time, person, sep="\n") %>%
467 | select(-cluster) %>%
468 | complete(time_person, labs) %>%
469 | mutate(n = factor(ifelse(is.na(n), FALSE, TRUE))) %>%
470 | ggplot(aes(time_person, labs, fill = n)) +
471 | geom_tile() +
472 | scale_fill_manual(values=c("grey90", "red"), guide=FALSE) +
473 | labs(x=NULL, y=NULL)
474 |
475 | 
476 |
477 | ### Cluster Documents
478 |
479 | The `get_documents` function grabs the documents associated with a
480 | particular cluster. This is most useful in cases where the number of
481 | documents is small and they have been given names.
482 |
483 | get_documents(ca)
484 |
485 | ## $`1`
486 | ## [1] "CROWLEY_time 2"
487 | ##
488 | ## $`2`
489 | ## [1] "LEHRER_time 1" "SCHIEFFER_time 3"
490 | ##
491 | ## $`3`
492 | ## [1] "OBAMA_time 1" "ROMNEY_time 1"
493 | ##
494 | ## $`4`
495 | ## [1] "OBAMA_time 2" "ROMNEY_time 2"
496 | ##
497 | ## $`5`
498 | ## [1] "OBAMA_time 3" "ROMNEY_time 3"
499 | ##
500 | ## $`6`
501 | ## [1] "QUESTION_time 2"
502 |
503 | Putting it Together
504 | -------------------
505 |
506 | I like working in a chain. In the setup below we work within a
507 | **magrittr** pipeline to fit a model, select clusters, and examine the
508 | results. In this example I do not condense the 2012 Presidential Debates
509 | data by speaker and time, rather leaving every sentence as a separate
510 | document. On my machine the initial `data_store` and model fit take ~5-8
511 | seconds to run. Note that I do restrict the number of clusters (for
512 | texts and terms) to a random 5 clusters for the sake of space.
513 |
514 | .tic <- Sys.time()
515 |
516 | myfit2 <- presidential_debates_2012 %>%
517 | with(data_store(dialogue)) %>%
518 | hierarchical_cluster()
519 |
520 | difftime(Sys.time(), .tic)
521 |
522 | ## Time difference of 5.450191 secs
523 |
524 | ## View Document Loadings
525 | ca2 <- assign_cluster(myfit2, k = 100)
526 | summary(ca2) %>%
527 | head(12)
528 |
529 | 
530 |
531 | ## cluster count
532 | ## 1 2 1409
533 | ## 2 25 54
534 | ## 3 15 50
535 | ## 4 39 46
536 | ## 5 61 39
537 | ## 6 36 37
538 | ## 7 40 33
539 | ## 8 17 31
540 | ## 9 31 29
541 | ## 10 37 28
542 | ## 11 27 25
543 | ## 12 46 23
544 |
545 | ## Split Text into Clusters
546 | set.seed(5); inds <- sort(sample.int(100, 5))
547 |
548 | get_text(ca2)[inds] %>%
549 | lapply(head, 10)
550 |
551 | ## $`11`
552 | ## [1] "Yeah, we're going to yeah, I want to get to it."
553 | ## [2] "And do you yeah."
554 | ## [3] "Yeah, you bet."
555 | ## [4] "Yeah."
556 | ##
557 | ## $`21`
558 | ## [1] "Regulation is essential."
559 | ## [2] "You can't have a free market work if you don't have regulation."
560 | ## [3] "Every free economy has good regulation."
561 | ## [4] "You have to have regulation."
562 | ## [5] "Now, it wasn't just on Wall Street."
563 | ## [6] "We stepped in and had the toughest reforms on Wall Street since the one thousand nine hundred thirtys."
564 | ## [7] "And so the question is: Does anybody out there think that the big problem we had is that there was too much oversight and regulation of Wall Street?"
565 | ## [8] "Look, we have to have regulation on Wall Street."
566 | ## [9] "That's why I'd have regulation."
567 | ## [10] "I committed that I would rein in the excesses of Wall Street, and we passed the toughest Wall Street reforms since the one thousand nine hundred thirtys."
568 | ##
569 | ## $`28`
570 | ## [1] "We said you've got banks, you've got to raise your capital requirements."
571 | ## [2] "Well, actually it's it's it's a lengthy description."
572 | ## [3] "Well, actually Governor, that isn't what your plan does."
573 | ## [4] "Number two, we've got to make sure that we have the best education system in the world."
574 | ## [5] "We've got to reduce our deficit, but we've got to do it in a balanced way."
575 | ## [6] "I got to I got to move you on."
576 | ## [7] "He actually got|"
577 | ## [8] "He actually got the first question."
578 | ## [9] "Governor Romney, I'm sure you've got a reply there."
579 | ## [10] "I've got to say|"
580 | ##
581 | ## $`68`
582 | ## [1] "That's how we went after Al Qaida and bin Laden."
583 | ## [2] "I said that we'd go after al Qaeda and bin Laden, we have."
584 | ## [3] "I congratulate him on on taking out Osama bin Laden and going after the leadership in al Qaeda."
585 | ## [4] "When it comes to going after Osama bin Laden, you said, well, any president would make that call."
586 | ## [5] "And she said to me, You know, by finally getting bin Laden, that brought some closure to me."
587 | ## [6] "We had to go in there to get Osama bin Laden."
588 | ## [7] "Well, keep in mind our strategy wasn't just going after bin Laden."
589 | ##
590 | ## $`90`
591 | ## [1] "And finally, championing small business."
592 | ## [2] "And then let's take the last one, championing small business."
593 | ## [3] "I came through small business."
594 | ## [4] "I understand how hard it is to start a small business."
595 | ## [5] "I want to keep their taxes down on small business."
596 | ## [6] "So if you're starting a business, where would you rather start it?"
597 | ## [7] "But, of course, if you're a small business or a mom and pop business or a big business starting up here, you've got to pay even the reduced rate that Governor Romney's talking about."
598 | ## [8] "And finally, number five, we've got to champion small business."
599 | ## [9] "Small business is where jobs come from."
600 | ## [10] "That's not the kind of small business promotion we need."
601 |
602 | ## Get Associated Terms
603 | get_terms(ca2, .4)[inds]
604 |
605 | ## $`11`
606 | ## term weight
607 | ## 1 yeah 1
608 | ##
609 | ## $`21`
610 | ## term weight
611 | ## 1 regulation 1
612 | ##
613 | ## $`28`
614 | ## term weight
615 | ## 1 got 1.0000000
616 | ## 2 actually 0.5421049
617 | ##
618 | ## $`68`
619 | ## term weight
620 | ## 1 bin 1.000000
621 | ## 2 laden 1.000000
622 | ## 3 osama 0.449873
623 | ##
624 | ## $`90`
625 | ## term weight
626 | ## 1 business 1.000000
627 | ## 2 small 0.679014
628 |
629 | ## Pretty Printed Topics
630 | ## Get Associated Terms
631 | get_terms(ca2, .4) %>%
632 | as_topic()
633 |
634 | ## Cluster 2 (n=1409): going, people, said, can, make, one, governor, get...
635 | ## Cluster 25 (n=54): need, keep, thought, indict, speak, progress, trying, standard...
636 | ## Cluster 15 (n=50): want, thirty, now, leave, bigger
637 | ## Cluster 39 (n=46): jobs, investing
638 | ## Cluster 61 (n=39): education, doubt, oil, lands, public
639 | ## Cluster 36 (n=37): thousand, hundred, eighty, two
640 | ## Cluster 40 (n=33): middle, class, east, certainly
641 | ## Cluster 17 (n=31): israel, part, experience, united, way, states
642 | ## Cluster 31 (n=29): years, four, last
643 | ## Cluster 37 (n=28): tax, deductions, amount
644 | ## Cluster 27 (n=25): military, maintaining, spending
645 | ## Cluster 46 (n=23): important, gone, think, mistake, tough
646 | ## Cluster 62 (n=23): thank, gentlemen
647 | ## Cluster 10 (n=22): talk, two, minute, minutes, toss
648 | ## Cluster 14 (n=21): back, medicare, come, manufacturing
649 | ## Cluster 16 (n=21): right
650 | ## Cluster 22 (n=21): example, chance, give, state, let
651 | ## Cluster 55 (n=21): mister, president
652 | ## Cluster 30 (n=20): happened, exactly
653 | ## Cluster 65 (n=20): governor
654 | ## Cluster 81 (n=20): china, problem, compete
655 | ## Cluster 19 (n=19): economy, grows
656 | ## Cluster 35 (n=19): deal, trade
657 | ## Cluster 42 (n=19): percent, seven
658 | ## Cluster 44 (n=19): trillion, dollar
659 | ## Cluster 84 (n=19): first, year
660 | ## Cluster 1 (n=18): care, health
661 | ## Cluster 24 (n=18): get, jeremy, rid
662 | ## Cluster 28 (n=18): got, actually
663 | ## Cluster 54 (n=18): role, leadership, government, kind, shown
664 | ## Cluster 70 (n=18): bush, different
665 | ## Cluster 26 (n=17): respond, will, lose
666 | ## Cluster 32 (n=17): future, bright, critical
667 | ## Cluster 76 (n=17): energy
668 | ## Cluster 34 (n=16): sure, make, job, college
669 | ## Cluster 43 (n=16): taxes, cut
670 | ## Cluster 13 (n=15): please, ask, repealed, quickly
671 | ## Cluster 67 (n=15): war, end, iraq
672 | ## Cluster 4 (n=14): one, number
673 | ## Cluster 63 (n=14): grow, planning
674 | ## Cluster 79 (n=14): coal
675 | ## Cluster 23 (n=13): dodd, frank, repeal
676 | ## Cluster 38 (n=13): three, twenty, million
677 | ## Cluster 56 (n=13): question
678 | ## Cluster 78 (n=13): true
679 | ## Cluster 96 (n=13): iran, bomb, closer, nuclear
680 | ## Cluster 21 (n=12): regulation
681 | ## Cluster 47 (n=12): let
682 | ## Cluster 51 (n=12): stand, principles
683 | ## Cluster 69 (n=12): election, course
684 | ## Cluster 72 (n=12): thank
685 | ## Cluster 6 (n=11): happen
686 | ## Cluster 57 (n=11): difference
687 | ## Cluster 59 (n=11): happy, teachers
688 | ## Cluster 74 (n=11): know
689 | ## Cluster 89 (n=11): balanced, budget
690 | ## Cluster 90 (n=11): business, small
691 | ## Cluster 33 (n=10): romney, governor
692 | ## Cluster 45 (n=10): plan
693 | ## Cluster 75 (n=10): detroit, bankrupt
694 | ## Cluster 77 (n=10): policies
695 | ## Cluster 83 (n=10): still, speaking
696 | ## Cluster 95 (n=10): policy, failure, foreign
697 | ## Cluster 98 (n=10): sanctions, tighten, crippling
698 | ## Cluster 7 (n=9): private
699 | ## Cluster 8 (n=9): high
700 | ## Cluster 66 (n=9): done
701 | ## Cluster 82 (n=9): answer, straightforward
702 | ## Cluster 92 (n=9): said
703 | ## Cluster 100 (n=9): strong
704 | ## Cluster 3 (n=8): billion, dollar, ninety, sixteen
705 | ## Cluster 18 (n=8): absolutely
706 | ## Cluster 41 (n=8): tell
707 | ## Cluster 50 (n=8): move, along
708 | ## Cluster 73 (n=8): good, night
709 | ## Cluster 80 (n=8): production
710 | ## Cluster 86 (n=8): create, jobs
711 | ## Cluster 5 (n=7): choice
712 | ## Cluster 9 (n=7): point, last, make
713 | ## Cluster 49 (n=7): wrong
714 | ## Cluster 53 (n=7): see
715 | ## Cluster 60 (n=7): stamps, food, million
716 | ## Cluster 64 (n=7): excuse, sir
717 | ## Cluster 68 (n=7): bin, laden, osama
718 | ## Cluster 71 (n=7): record, check
719 | ## Cluster 97 (n=7): bob
720 | ## Cluster 29 (n=6): believe
721 | ## Cluster 85 (n=6): candy
722 | ## Cluster 91 (n=6): understand
723 | ## Cluster 48 (n=5): well
724 | ## Cluster 58 (n=5): great
725 | ## Cluster 87 (n=5): thanks
726 | ## Cluster 88 (n=5): yes
727 | ## Cluster 93 (n=5): lorraine
728 | ## Cluster 94 (n=5): pension, looked
729 | ## Cluster 11 (n=4): yeah
730 | ## Cluster 12 (n=4): sorry
731 | ## Cluster 20 (n=4): much, cut
732 | ## Cluster 52 (n=4): time
733 | ## Cluster 99 (n=4): work
734 |
735 | An Experiment
736 | -------------
737 |
738 | It seems to me that if the hierarchical clustering is function as
739 | expected we'd see topics clustering together within a conversation as
740 | the natural eb and flow of a conversation is to talk around a topic for
741 | a while and then move on to the next related topic. A Gantt style plot
742 | of topics across time seems like an excellent way to observe clustering
743 | across time. In the experiment I first ran the hierarchical clustering
744 | at the sentence level for all participants in the 2012 presidential
745 | debates data set. I then decided to use turn of talk as the unit of
746 | analysis. Finally, I pulled out the two candidates (President Obama and
747 | Romney) and faceted n their topic use over time.
748 |
749 | if (!require("pacman")) install.packages("pacman")
750 | pacman::p_load(dplyr, clustext, textshape, ggplot2, stringi)
751 |
752 | myfit3 <- presidential_debates_2012 %>%
753 | mutate(tot = gsub("\\..+$", "", tot)) %>%
754 | with(data_store(dialogue)) %>%
755 | hierarchical_cluster()
756 |
757 | plot(myfit3, 75)
758 |
759 | 
760 |
761 | Can & Ozkarahan's (1990) formula indicated a `k = 259`. This umber
762 | seemed overly large. I used `k = 75` for the number of topics as it
763 | seemed unreasonable that there'd be more topics than this but with
764 | `k = 75` over half of the sentences loaded on one cluster. Note the use
765 | of the `attribute` `join` from `assign_cluster` to make joining back to
766 | the original data set easier.
767 |
768 | k <- 75
769 | ca3 <- assign_cluster(myfit3, k = k)
770 |
771 | presidential_debates_2012 %>%
772 | mutate(tot = gsub("\\..+$", "", tot)) %>%
773 | tbl_df() %>%
774 | attributes(ca3)$join() %>%
775 | group_by(time) %>%
776 | mutate(
777 | word_count = stringi::stri_count_words(dialogue),
778 | start = starts(word_count),
779 | end = ends(word_count)
780 | ) %>%
781 | na.omit() %>%
782 | mutate(cluster = factor(cluster, levels = k:1)) %>%
783 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) +
784 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
785 | ggplot2::theme_bw() +
786 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
787 | panel.grid.minor.x = ggplot2::element_blank(),
788 | panel.grid.major.x = ggplot2::element_blank(),
789 | panel.grid.minor.y = ggplot2::element_blank(),
790 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
791 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
792 | strip.background = ggplot2::element_blank()) +
793 | ggplot2::facet_wrap(~time, scales='free', ncol=1) +
794 | ggplot2::labs(x="Duration (words)", y="Cluster")
795 |
796 | ## Warning: Ignoring unknown aesthetics: position
797 |
798 | 
799 |
800 | Right away we notice that not all topics are used across all three
801 | times. This is encouraging that the clustering is working as expected as
802 | we'd expect some overlap in debate topics as well as some unique topics.
803 | However, there were so many topics clustering on cluster 3 that I had to
804 | make some decisions. I could (a) ignore this mass and essentially throw
805 | out half the data that loaded on a single cluster, (b) increase `k` to
806 | split up the mass loading on cluster 3, (c) change the unit of analysis.
807 | It seemed the first option was wasteful of data and could miss
808 | information. The second approach could lead to a model that had so many
809 | topics it wouldn't be meaningful. The last approach seemed reasonable,
810 | inspecting the cluster text showed that many were capturing functions of
811 | language rather than content. For example, people use *"Oh."* to
812 | indicate agreement. This isn't a topic but the clustering would group
813 | sentences that use this convention together. Combining this sentence
814 | with other sentences in the turn of talk are more likely to get the
815 | content we're after.
816 |
817 | Next I used the `textshape::combine` function to group turns of talk
818 | together.
819 |
820 | myfit4 <- presidential_debates_2012 %>%
821 | mutate(tot = gsub("\\..+$", "", tot)) %>%
822 | textshape::combine() %>%
823 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>%
824 | hierarchical_cluster()
825 |
826 | plot(myfit4, k = 80)
827 |
828 | 
829 |
830 | The distribution of turns of talk looked much more dispersed across
831 | clusters. I used `k = 60` for the number of topics.
832 |
833 | k <- 80
834 | ca4 <- assign_cluster(myfit4, k = k)
835 |
836 | presidential_debates_2012 %>%
837 | mutate(tot = gsub("\\..+$", "", tot)) %>%
838 | textshape::combine() %>%
839 | tbl_df() %>%
840 | attributes(ca4)$join() %>%
841 | group_by(time) %>%
842 | mutate(
843 | word_count = stringi::stri_count_words(dialogue),
844 | start = starts(word_count),
845 | end = ends(word_count)
846 | ) %>%
847 | na.omit() %>%
848 | mutate(cluster = factor(cluster, levels = k:1)) %>%
849 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) +
850 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
851 | ggplot2::theme_bw() +
852 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
853 | panel.grid.minor.x = ggplot2::element_blank(),
854 | panel.grid.major.x = ggplot2::element_blank(),
855 | panel.grid.minor.y = ggplot2::element_blank(),
856 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
857 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
858 | strip.background = ggplot2::element_blank()) +
859 | ggplot2::facet_wrap(~time, scales='free', ncol=1) +
860 | ggplot2::labs(x="Duration (words)", y="Cluster")
861 |
862 | ## Warning: Ignoring unknown aesthetics: position
863 |
864 | 
865 |
866 | The plots looked less messy and indeed topics do appear to be clustering
867 | around one another. I wanted to see how the primary participants, the
868 | candidates, compared to each other in topic use.
869 |
870 | In this last bit of analysis I filter out all participants except Obama
871 | and Romeny and facet by participant across time.
872 |
873 | myfit5 <- presidential_debates_2012 %>%
874 | mutate(tot = gsub("\\..+$", "", tot)) %>%
875 | textshape::combine() %>%
876 | filter(person %in% c("ROMNEY", "OBAMA")) %>%
877 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>%
878 | hierarchical_cluster()
879 |
880 |
881 | plot(myfit5, 50)
882 |
883 | 
884 |
885 | Based on the dendrogram, I used `k = 50` for the number of topics.
886 |
887 | k <- 50
888 | ca5 <- assign_cluster(myfit5, k = k)
889 |
890 | presidential_debates_2012 %>%
891 | mutate(tot = gsub("\\..+$", "", tot)) %>%
892 | textshape::combine() %>%
893 | filter(person %in% c("ROMNEY", "OBAMA")) %>%
894 | tbl_df() %>%
895 | attributes(ca5)$join() %>%
896 | group_by(time) %>%
897 | mutate(
898 | word_count = stringi::stri_count_words(dialogue),
899 | start = starts(word_count),
900 | end = ends(word_count)
901 | ) %>%
902 | na.omit() %>%
903 | mutate(cluster = factor(cluster, levels = k:1)) %>%
904 | ggplot2::ggplot(ggplot2::aes(x = start-10, y = cluster, xend = end+10, yend = cluster)) +
905 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) +
906 | ggplot2::theme_bw() +
907 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'),
908 | panel.grid.minor.x = ggplot2::element_blank(),
909 | panel.grid.major.x = ggplot2::element_blank(),
910 | panel.grid.minor.y = ggplot2::element_blank(),
911 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'),
912 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0),
913 | strip.background = ggplot2::element_blank()) +
914 | ggplot2::facet_grid(person~time, scales='free', space='free') +
915 | ggplot2::labs(x="Duration (words)", y="Cluster")
916 |
917 | ## Warning: Ignoring unknown aesthetics: position
918 |
919 | 
920 |
921 | If you're curious about the heaviest weighted tf-idf terms in each
922 | cluster the next code chunk provides the top five weighted terms used in
923 | each cluster. Below this I provide a bar plot of the frequencies of
924 | clusters to help put the other information into perspective.
925 |
926 | invisible(Map(function(x, y){
927 |
928 | if (is.null(x)) {
929 | cat(sprintf("Cluster %s: ...\n", y))
930 | } else {
931 | m <- dplyr::top_n(x, 5, n)
932 | o <- paste(paste0(m[[1]], " (", round(m[[2]], 1), ")"), collapse="; ")
933 | cat(sprintf("Cluster %s: %s\n", y, o))
934 | }
935 |
936 | }, get_terms(ca5, .4), names(get_terms(ca5, .4))))
937 |
938 | ## Cluster 1: going (1); time (0.6); get (0.5); years (0.5); like (0.4)
939 | ## Cluster 2: trillion (1); dollar (0.9); billion (0.5)
940 | ## Cluster 3: one (1); number (0.9)
941 | ## Cluster 4: get (1); private (0.7); medicare (0.5)
942 | ## Cluster 5: election (1); choice (0.9)
943 | ## Cluster 6: two (1); eighty (0.9); thousand (0.6); dollar (0.5); hundred (0.4)
944 | ## Cluster 7: happen (1)
945 | ## Cluster 8: care (1); government (0.9); keep (0.7); federal (0.6); health (0.5)
946 | ## Cluster 9: war (1); high (0.9); end (0.8); iraq (0.5)
947 | ## Cluster 10: good (1); forward (0.6); know (0.6)
948 | ## Cluster 11: make (1); sure (0.9); point (0.7); want (0.6); last (0.5)
949 | ## Cluster 12: sorry (1); china (0.9); record (0.7); talk (0.6); problem (0.5)
950 | ## Cluster 13: absolutely (1)
951 | ## Cluster 14: yes (1); places (0.5)
952 | ## Cluster 15: let (1); respond (0.7); tell (0.7); bob (0.7); example (0.5)
953 | ## Cluster 16: dodd (1); frank (1)
954 | ## Cluster 17: cut (1); taxes (0.8)
955 | ## Cluster 18: believe (1)
956 | ## Cluster 19: three (1)
957 | ## Cluster 20: jobs (1)
958 | ## Cluster 21: middle (1); class (0.6); east (0.5); certainly (0.5)
959 | ## Cluster 22: plan (1)
960 | ## Cluster 23: well (1); first (0.7); year (0.6); posture (0.6)
961 | ## Cluster 24: wrong (1)
962 | ## Cluster 25: great (1); bush (0.7); different (0.6); teachers (0.6)
963 | ## Cluster 26: difference (1)
964 | ## Cluster 27: military (1)
965 | ## Cluster 28: stamps (1); food (1); million (0.4)
966 | ## Cluster 29: job (1); college (0.8); find (0.6)
967 | ## Cluster 30: pension (1); mister (0.9); speaking (0.6); still (0.5); looked (0.5)
968 | ## Cluster 31: business (1); grow (0.8); small (0.8); planning (0.5)
969 | ## Cluster 32: done (1)
970 | ## Cluster 33: thank (1)
971 | ## Cluster 34: four (1); last (1); years (0.9)
972 | ## Cluster 35: question (1); answer (0.8); straightforward (0.5)
973 | ## Cluster 36: detroit (1); bankrupt (0.5)
974 | ## Cluster 37: energy (1)
975 | ## Cluster 38: percent (1); seven (0.5)
976 | ## Cluster 39: right (1); course (0.6)
977 | ## Cluster 40: governor (1)
978 | ## Cluster 41: production (1); coal (0.9); oil (0.5); gas (0.4)
979 | ## Cluster 42: true (1)
980 | ## Cluster 43: candy (1)
981 | ## Cluster 44: economy (1); grows (0.5)
982 | ## Cluster 45: balanced (1); budget (0.9)
983 | ## Cluster 46: leadership (1); kind (0.8); role (0.5); shown (0.5); show (0.5)
984 | ## Cluster 47: said (1)
985 | ## Cluster 48: iran (1); nuclear (0.5)
986 | ## Cluster 49: work (1)
987 | ## Cluster 50: strong (1)
988 |
989 | invisible(summary(ca5))
990 |
991 | 
992 |
993 | It appears that in fact the topics do cluster within segments of time as
994 | we'd expect. This is more apparent when turn of talk is used as the unit
995 | of analysis (document level) rather than each sentence.
--------------------------------------------------------------------------------
/data/assignments.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/data/assignments.rda
--------------------------------------------------------------------------------
/data/presidential_debates_2012.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/data/presidential_debates_2012.rda
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | citHeader("To cite clustext in publications, please use:")
2 |
3 |
4 | citEntry(entry = "manual",
5 | title = "{clustext}: Consistent Clustering for Text Data",
6 | author = "Tyler W. Rinker",
7 | organization = "University at Buffalo/SUNY",
8 | address = "Buffalo, New York",
9 | note = "version 0.1.1",
10 | year = "2016",
11 | url = "http://github.com/trinker/clustext",
12 | textVersion = paste("Rinker, T. W. (2016).",
13 | "clustext: Consistent Clustering for Text Data",
14 | "version 0.1.1. University at Buffalo. Buffalo, New York.",
15 | "http://github.com/trinker/clustext")
16 | )
17 |
--------------------------------------------------------------------------------
/inst/additional/foo_turk.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 | ==========
4 | Cluster 1:
5 | ==========
6 | * But I'm not in favor of rounding up people and and and taking them out of this country.
7 | * What is the deal that you would accept, Mister President?
8 | * We take program after program that we don't absolutely have to have, and we get rid of them.
9 | * I I was I was someone who ran businesses for twenty five years, and balanced the budget.
10 | * We need to be talking about space.
11 | * It's wonderful that Libya seems to be making some progress, despite this terrible tragedy.
12 | * Number two, I asked the president a question I think Hispanics and immigrants all over the nation have asked.
13 | * We need to have strong allies.
14 | * Questions remain.
15 | * Now, let me mention one other thing, and that is self deportation says let people make their own choice.
16 |
17 |
18 |
19 |
20 | ==========
21 | Cluster 2: money & jobs
22 | ==========
23 | * You're going to get a repeat of the last four years.
24 | * That's creating tens of thousands of jobs all across the country.
25 | * But Governor, when it comes to our foreign policy, you seem to want to import the foreign policies of the one thousand nine hundred eightys, just like the social policies of the one thousand nine hundred fiftys and the economic policies of the one thousand nine hundred twentys.
26 | * And we would have lost a million jobs.
27 | * Look, I look at what's happening around the world, and i see Iran four years closer to a bomb.
28 | * That's another trillion dollars that's dollar eight trillion.
29 | * Median income is down dollar four thousand three hundred a family and twenty three million Americans out of work.
30 | * eighty six.
31 | * It is also essential for us to understand what our mission is in Iran, and that is to dissuade Iran from having a nuclear weapon through peaceful and diplomatic means.
32 | * Massachusetts schools are ranked number one in the nation.
33 |
34 |
35 |
36 |
37 | ==========
38 | Cluster 3: correct
39 | ==========
40 | * All right.
41 | * All right.
42 | * All right.
43 | * Right.
44 | * And that's right.
45 | * Did I get that right?
46 | * All right.
47 | * All right.
48 | * All right.
49 | * All right.
50 |
51 |
52 |
53 |
54 | ==========
55 | Cluster 4: =3
56 | ==========
57 | * You're absolutely right.
58 | * Nothing Governor Romney just said is true, starting with this notion of me apologizing.
59 | * Absolutely.
60 | * Absolutely.
61 | * Well, that's probably true.
62 | * Candy, what Governor Romney said just isn't true.
63 | * Absolutely.
64 | * It's just not true.
65 | * Not true.
66 | * Absolutely.
67 |
68 |
69 |
70 |
71 | ==========
72 | Cluster 5: ello govna
73 | ==========
74 | * Hi, Governor.
75 | * Governor Romney, you did not|
76 | * Governor?
77 | * Governor|
78 | * Governor?
79 | * Governor|
80 | * Governor|
81 | * And you go first, Governor Romney.
82 | * I was a governor.
83 | * Governor?
84 |
85 |
86 |
87 |
88 | ==========
89 | Cluster 6: mister
90 | ==========
91 | * This has not been Mister Oil, or Mister Gas, or Mister Coal.
92 | * Mister President?
93 | * Mister President|
94 | * Mister President?
95 | * But what would you do as president?
96 | * Mister President?
97 | * Mister President, Mister President, you're entitled as the president to your own airplane and to your own house, but not to your own facts.
98 | * Mister President?
99 | * This is a president who has not been able to do what he said he'd do.
100 | * Mister President?
101 |
102 |
103 |
104 |
105 | ==========
106 | Cluster 7: politeness
107 | ==========
108 | * And that is the question.
109 | * Governor, this question is for you.
110 | * But the president does get this question.
111 | * For now, from the University of Denver, I'm Jim Lehrer.
112 | * No, no, I had a question and the question was how much did you cut them by?
113 | * And I appreciate that question.
114 | * Thank you.
115 | * Thank you so much.
116 | * Thank you, and that's an opportunity for me, and I appreciate it.
117 | * Well, Jim, I want to thank you, and I want to thank Governor Romney, because I think was a terrific debate, and I very much appreciate it.
118 |
--------------------------------------------------------------------------------
/inst/build.R:
--------------------------------------------------------------------------------
1 | root <- Sys.getenv("USERPROFILE")
2 | pack <- basename(getwd())
3 |
4 | quick <- TRUE
5 | pdf <- TRUE
6 |
7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)
8 | devtools::document()
9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE)
10 |
11 | if(pdf){
12 | path <- find.package(pack)
13 | system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path)))
14 | file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf')))
15 | while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)}
16 | empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE)
17 | unlink(empts, recursive = TRUE, force = TRUE)
18 | }
19 |
20 | message("Done!")
21 |
--------------------------------------------------------------------------------
/inst/extra_scripts/build_data.R:
--------------------------------------------------------------------------------
1 | pacman::p_load(clustext, dplyr)
2 |
3 | x <- presidential_debates_2012 %>%
4 | mutate(tot = gsub("\\..+$", "", tot)) %>%
5 | textshape::combine() %>%
6 | filter(person %in% c("ROMNEY", "OBAMA")) %>%
7 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3))
8 |
9 | set.seed(10)
10 | kmeans_assignment <- kmeans_cluster(x, 50) %>%
11 | assign_cluster(myfit2)
12 |
13 | set.seed(10)
14 | nmf_assignment <- nmf_cluster(x, 50) %>%
15 | assign_cluster(myfit2)
16 |
17 | set.seed(10)
18 | skmeans_assignment <- skmeans_cluster(x, 50) %>%
19 | assign_cluster(myfit2)
20 |
21 | hierarchical_assignment <- hierarchical_cluster(x) %>%
22 | assign_cluster(k=50)
23 |
24 | assignments <- list(
25 | hierarchical_assignment = hierarchical_assignment,
26 | kmeans_assignment = kmeans_assignment,
27 | skmeans_assignment = skmeans_assignment,
28 | nmf_assignment =nmf_assignment
29 | )
30 |
31 |
32 | assignments <- lapply(assignments, function(x) {
33 | attributes(x)[['data_store']] <- NULL
34 | attributes(x)[['model']] <- NULL
35 | attributes(x)[['join']] <- NULL
36 | x
37 | })
38 |
39 | lapply(assignments, function(x) {names(attributes(x))})
40 |
41 | pax::new_data(assignments)
42 |
--------------------------------------------------------------------------------
/inst/extra_statdoc/readme.R:
--------------------------------------------------------------------------------
1 | 
2 |
clustext is a...
3 | Download the development version of clustext here
4 |
--------------------------------------------------------------------------------
/inst/maintenance.R:
--------------------------------------------------------------------------------
1 | #========
2 | # BUILD
3 | #========
4 | source("inst/build.R")
5 |
6 | #==========================
7 | # Run unit tests
8 | #==========================
9 | devtools::test()
10 |
11 | #==========================
12 | # knit README.md
13 | #==========================
14 | rmarkdown::render("README.Rmd", "all"); md_toc()
15 |
16 | #==========================
17 | # UPDATE NEWS
18 | #==========================
19 | update_news()
20 |
21 | #==========================
22 | # UPDATE VERSION
23 | #==========================
24 | update_version()
25 |
26 | #========================
27 | #staticdocs dev version
28 | #========================
29 |
30 | if (!require("pacman")) install.packages("pacman")
31 | pacman::p_load_gh("hadley/staticdocs", "trinker/acc.roxygen2")
32 | p_load(rstudioapi, qdap)
33 |
34 | R_USER <- switch(Sys.info()[["user"]],
35 | Tyler = "C:/Users/Tyler",
36 | trinker = "C:/Users/trinker",
37 | message("Computer name not found")
38 | )
39 | build_site(pkg=file.path(R_USER, "GitHub", basename(getwd())), launch = FALSE)
40 |
41 | #STEP 2: reshape index
42 | path <- "inst/web"
43 | path2 <- file.path(path, "/index.html")
44 | rdme <- file.path(R_USER, "GitHub", basename(getwd()), "inst/extra_statdoc/readme.R")
45 |
46 | extras <- qcv("")
47 | ## drops <- qcv()
48 | expand_statdoc(path2, to.icon = extras, readme = rdme)
49 |
50 | x <- readLines(path2)
51 | x[grepl("
Authors
", x)] <- paste(
52 | c("Author
"
53 | #rep("Contributor
", 1)
54 | ),
55 | c("Tyler W. Rinker")
56 | )
57 |
58 | cat(paste(x, collapse="\n"), file=path2)
59 |
60 | #STEP 3: move to trinker.guthub
61 | library(reports)
62 | file <- file.path(R_USER, "/GitHub/trinker.github.com")
63 | # incoming <- file.path(file, basename(getwd()))
64 | delete(incoming)
65 | file.copy(path, file, TRUE, TRUE)
66 | file.rename(file.path(file, "web"), incoming)
67 | ## delete(path)
68 |
69 | #==========================
70 | #staticdocs current version
71 | #==========================
72 |
73 | #STEP 3: move to trinker.guthub
74 | library(reports)
75 | file <- file.path(R_USER, "/GitHub/trinker.github.com")
76 | incoming <- file.path(file, "discon")
77 | ## delete(incoming); file.copy(path, file, TRUE, TRUE); file.rename(file.path(file, "web"), incoming)
78 |
79 | #==========================
80 | # NEWS new version
81 | #==========================
82 | x <- c("BUG FIXES", "NEW FEATURES", "MINOR FEATURES", "IMPROVEMENTS", "CHANGES")
83 | cat(paste(x, collapse = "\n\n"), file="clipboard")
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/inst/staticdocs/index.R:
--------------------------------------------------------------------------------
1 | library(staticdocs)
2 |
3 | sd_section("",
4 | "Function for...",
5 | c(
6 | "myfun"
7 | )
8 | )
--------------------------------------------------------------------------------
/man/approx_k.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/approx_k.R
3 | \name{approx_k}
4 | \alias{approx_k}
5 | \alias{approx_k.DocumentTermMatrix}
6 | \alias{approx_k.TermDocumentMatrix}
7 | \title{Approximate Number of Clusters for a Text Matrix}
8 | \usage{
9 | approx_k(x, verbose = TRUE)
10 |
11 | \method{approx_k}{TermDocumentMatrix}(x, verbose = TRUE)
12 |
13 | \method{approx_k}{DocumentTermMatrix}(x, verbose = TRUE)
14 | }
15 | \arguments{
16 | \item{x}{A matrix.}
17 |
18 | \item{verbose}{logical. If \code{TRUE} the k determination is printed.}
19 | }
20 | \value{
21 | Returns an integer.
22 | }
23 | \description{
24 | Can & Ozkarahan (1990) formula for approximating the number of clusters for
25 | a text matrix: \eqn{(m * n)/t} where \eqn{m} and \eqn{n} are the dimensions
26 | of the matrix and \eqn{t} is the length of the non-zero elements in matrix
27 | \eqn{A}.
28 | }
29 | \examples{
30 | library(gofastr)
31 | library(dplyr)
32 |
33 | presidential_debates_2012 \%>\%
34 | with(q_dtm(dialogue)) \%>\%
35 | approx_k()
36 | }
37 | \references{
38 | Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of
39 | the cover-coefficient-based clustering methodology for text databases.
40 | ACM Transactions on Database Systems 15 (4): 483. doi:10.1145/99935.99938. \cr
41 | }
42 |
43 |
--------------------------------------------------------------------------------
/man/as_topic.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/as_topic.R
3 | \name{as_topic}
4 | \alias{as_topic}
5 | \alias{as_topic.get_terms}
6 | \title{Convert \code{get_terms} to Topics}
7 | \usage{
8 | as_topic(x, max.n = 8, sort = TRUE, ...)
9 |
10 | \method{as_topic}{get_terms}(x, max.n = 8, sort = TRUE, ...)
11 | }
12 | \arguments{
13 | \item{x}{A \code{get_terms} object.}
14 |
15 | \item{max.n}{The max number of words to show before truncation.}
16 |
17 | \item{sort}{logical. If \code{TRUE} the cluster topics are sorted by size
18 | (number of documents) otherwise the topics are sorted by cluster number.}
19 |
20 | \item{\ldots}{ignored.}
21 | }
22 | \value{
23 | Returns a \code{\link[base]{data.frame}} of \code{"cluster"},
24 | \code{"count"}, and \code{"terms"}. Pretty prints as clusters, number of
25 | documents, and associated important terms.
26 | }
27 | \description{
28 | View important terms as a comma separated string (a topic).
29 | }
30 | \examples{
31 | library(dplyr)
32 |
33 | myfit5 <- presidential_debates_2012 \%>\%
34 | mutate(tot = gsub("\\\\..+$", "", tot)) \%>\%
35 | textshape::combine() \%>\%
36 | filter(person \%in\% c("ROMNEY", "OBAMA")) \%>\%
37 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) \%>\%
38 | hierarchical_cluster()
39 |
40 | ca5 <- assign_cluster(myfit5, k = 50)
41 |
42 | get_terms(ca5, .4) \%>\%
43 | as_topic()
44 |
45 | get_terms(ca5, .4) \%>\%
46 | as_topic(sort=FALSE)
47 |
48 | get_terms(ca5, .95) \%>\%
49 | as_topic()
50 | }
51 |
52 |
--------------------------------------------------------------------------------
/man/assign_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_cluster.R
3 | \name{assign_cluster}
4 | \alias{assign_cluster}
5 | \alias{assign_cluster.hierarchical_cluster}
6 | \alias{assign_cluster.kmeans_cluster}
7 | \alias{assign_cluster.nmf_cluster}
8 | \alias{assign_cluster.skmeans_cluster}
9 | \title{Assign Clusters to Documents/Text Elements}
10 | \usage{
11 | assign_cluster(x, k = approx_k(get_dtm(x)), h = NULL, ...)
12 |
13 | \method{assign_cluster}{hierarchical_cluster}(x, k = approx_k(get_dtm(x)),
14 | h = NULL, cut = "static", deepSplit = TRUE, minClusterSize = 1, ...)
15 |
16 | \method{assign_cluster}{kmeans_cluster}(x, ...)
17 |
18 | \method{assign_cluster}{skmeans_cluster}(x, ...)
19 |
20 | \method{assign_cluster}{nmf_cluster}(x, ...)
21 | }
22 | \arguments{
23 | \item{x}{a \code{xxx_cluster} object.}
24 |
25 | \item{k}{The number of clusters (can supply \code{h} instead). Defaults to
26 | use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced
27 | by \code{data_storage}.}
28 |
29 | \item{h}{The height at which to cut the dendrograms (determines number of
30 | clusters). If this argument is supplied \code{k} is ignored.}
31 |
32 | \item{cut}{The type of cut method to use for \code{hierarchical_cluster}; one
33 | of \code{'static'}, \code{'dynamic'} or \code{'iterative'}.}
34 |
35 | \item{deepSplit}{logical. See \code{\link[dynamicTreeCut]{cutreeDynamic}}.}
36 |
37 | \item{minClusterSize}{The minimum cluster size. See
38 | \code{\link[dynamicTreeCut]{cutreeDynamic}}.}
39 |
40 | \item{\ldots}{ignored.}
41 | }
42 | \value{
43 | Returns an \code{assign_cluster} object; a named vector of cluster
44 | assignments with documents as names. The object also contains the original
45 | \code{data_storage} object and a \code{join} function. \code{join} is a
46 | function (a closure) that captures information about the \code{assign_cluster}
47 | that makes rejoining to the original data set simple. The user simply
48 | supplies the original data set as an argument to \code{join}
49 | (\code{attributes(FROM_ASSIGN_CLUSTER)$join(ORIGINAL_DATA)}).
50 | }
51 | \description{
52 | Assign clusters to documents/text elements.
53 | }
54 | \examples{
55 | \dontrun{
56 | library(dplyr)
57 |
58 | x <- with(
59 | presidential_debates_2012,
60 | data_store(dialogue, paste(person, time, sep = "_"))
61 | )
62 |
63 | hierarchical_cluster(x) \%>\%
64 | plot(h=.7, lwd=2)
65 |
66 | hierarchical_cluster(x) \%>\%
67 | assign_cluster(h=.7)
68 |
69 | hierarchical_cluster(x, method="complete") \%>\%
70 | plot(k=6)
71 |
72 | hierarchical_cluster(x) \%>\%
73 | assign_cluster(k=6)
74 |
75 |
76 | x2 <- presidential_debates_2012 \%>\%
77 | with(data_store(dialogue)) \%>\%
78 | hierarchical_cluster()
79 |
80 | ca2 <- assign_cluster(x2, k = 55)
81 | summary(ca2)
82 |
83 | ## Dynamic cut
84 | ca3 <- assign_cluster(x2, cut = 'dynamic', minClusterSize = 5)
85 | get_text(ca3)
86 |
87 | ## add to original data
88 | attributes(ca2)$join(presidential_debates_2012)
89 |
90 | ## split text into clusters
91 | get_text(ca2)
92 |
93 | ## Kmeans Algorithm
94 | kmeans_cluster(x, k=6) \%>\%
95 | assign_cluster()
96 |
97 | x3 <- presidential_debates_2012 \%>\%
98 | with(data_store(dialogue)) \%>\%
99 | kmeans_cluster(55)
100 |
101 | ca3 <- assign_cluster(x3)
102 | summary(ca3)
103 |
104 | ## split text into clusters
105 | get_text(ca3)
106 | }
107 | }
108 |
109 |
--------------------------------------------------------------------------------
/man/assignments.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/clustext-package.R
3 | \docType{data}
4 | \name{assignments}
5 | \alias{assignments}
6 | \title{Topic Assignments}
7 | \format{A list with 3 elements}
8 | \usage{
9 | data(assignments)
10 | }
11 | \description{
12 | A dataset containing a list of topic assignments by various clustering
13 | algorithms. Assignments correspond to the rows (minus empty rows) of the
14 | \code{presidential_debates_2012} data set.
15 | }
16 | \keyword{datasets}
17 |
18 |
--------------------------------------------------------------------------------
/man/categorize.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/categorize.R
3 | \name{categorize}
4 | \alias{categorize}
5 | \title{Merge Clusters & Cluster Categories Back to Original Data}
6 | \usage{
7 | categorize(data, assign.cluster, cluster.key)
8 | }
9 | \arguments{
10 | \item{data}{A data set that was fit with a cluster model.}
11 |
12 | \item{assign.cluster}{An \code{\link[clustext]{assign_cluster}} object.}
13 |
14 | \item{cluster.key}{An \code{\link[clustext]{assign_cluster}} object.}
15 | }
16 | \value{
17 | Returns a \code{\link[base]{data.frame}} key of clusters and categories.
18 | }
19 | \description{
20 | Merge clusters, categories, and the original data back together.
21 | }
22 | \examples{
23 | library(dplyr)
24 |
25 | ## Assign Clusters
26 | ca <- presidential_debates_2012 \%>\%
27 | with(data_store(dialogue)) \%>\%
28 | hierarchical_cluster() \%>\%
29 | assign_cluster(k = 7)
30 |
31 | ## Write Cluster Text for Human Categorization
32 | write_cluster_text(ca)
33 | write_cluster_text(ca, n.sample=10)
34 | write_cluster_text(ca, lead=" -", n.sample=10)
35 |
36 | ## Read Human Coded Categories Back In
37 | categories_file <- system.file("additional/foo_turk.txt", package = "clustext")
38 | readLines(categories_file)
39 | (categories_key <- read_cluster_text(categories_file))
40 |
41 | ## Add Categories Back to Original Data Set
42 | categorize(
43 | data = presidential_debates_2012,
44 | assign.cluster = ca,
45 | cluster.key = categories_key
46 | )
47 | }
48 | \seealso{
49 | \code{\link[clustext]{write_cluster_text}},
50 | \code{\link[clustext]{read_cluster_text}}
51 | }
52 |
53 |
--------------------------------------------------------------------------------
/man/clustext.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/clustext-package.R
3 | \docType{package}
4 | \name{clustext}
5 | \alias{clustext}
6 | \alias{clustext-package}
7 | \alias{package-clustext}
8 | \title{Consistent Clustering for Text Data}
9 | \description{
10 | Optimized, consistent tools for clustering text data.
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/man/compare.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compare.R
3 | \name{compare}
4 | \alias{compare}
5 | \title{Adjusted Rand Index Comaprison Between Algorithms}
6 | \usage{
7 | compare(...)
8 | }
9 | \arguments{
10 | \item{\ldots}{A series of outputs from \code{assign_cluster} for various
11 | cluster algorithmns.}
12 | }
13 | \value{
14 | Returns a pair-wise comparison matrix of Adjusted Rand Indices for
15 | algorithm. Higher Adjusted Rand Index scores indicate higher cluster
16 | assignment agreement.
17 | }
18 | \description{
19 | An Adjusted Rand Index comparison of the assignments between different
20 | clustering algorithms.
21 | }
22 | \examples{
23 | compare(
24 | assignments$hierarchical_assignment,
25 | assignments$kmeans_assignment,
26 | assignments$skmeans_assignment,
27 | assignments$nmf_assignment
28 | )
29 |
30 | ## Understanding the ARI
31 | set.seed(10)
32 | w <- sample(1:10, 40, TRUE)
33 | x <- 11-w
34 | set.seed(20)
35 | y <- sample(1:10, 40, TRUE)
36 | set.seed(50)
37 | z <- sample(1:10, 40, TRUE)
38 |
39 | data.frame(w, x, y, z)
40 |
41 | library(mclust)
42 |
43 | mclust::adjustedRandIndex(w, x)
44 | mclust::adjustedRandIndex(x, y)
45 | mclust::adjustedRandIndex(x, z)
46 | }
47 | \references{
48 | \url{http://faculty.washington.edu/kayee/pca/supp.pdf}
49 | }
50 |
51 |
--------------------------------------------------------------------------------
/man/cosine_distance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cosine_distance.R
3 | \name{cosine_distance}
4 | \alias{cosine_distance}
5 | \alias{cosine_distance.DocumentTermMatrix}
6 | \alias{cosine_distance.TermDocumentMatrix}
7 | \title{Optimized Computation of Cosine Distance}
8 | \usage{
9 | cosine_distance(x, ...)
10 |
11 | \method{cosine_distance}{DocumentTermMatrix}(x, ...)
12 |
13 | \method{cosine_distance}{TermDocumentMatrix}(x, ...)
14 | }
15 | \arguments{
16 | \item{x}{A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
17 | \code{\link[tm]{TermDocumentMatrix}}).}
18 |
19 | \item{\ldots}{ignored.}
20 | }
21 | \value{
22 | Returns a cosine distance object of class \code{"dist"}.
23 | }
24 | \description{
25 | Utilizes the \pkg{slam} package to efficiently calculate cosine distance
26 | on large sparse matrices.
27 | }
28 | \examples{
29 | library(gofastr)
30 | library(dplyr)
31 |
32 | out <- presidential_debates_2012 \%>\%
33 | with(q_dtm(dialogue)) \%>\%
34 | cosine_distance()
35 | }
36 | \author{
37 | Michael Andrec and Tyler Rinker .
38 | }
39 | \references{
40 | \url{http://stackoverflow.com/a/29755756/1000343}
41 | }
42 | \keyword{cosine}
43 | \keyword{dissimilarity}
44 |
45 |
--------------------------------------------------------------------------------
/man/data_store.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data_store.R
3 | \name{data_store}
4 | \alias{data_store}
5 | \title{Data Structure for \pkg{hclusttext}}
6 | \usage{
7 | data_store(text, doc.names, min.term.freq = 1, min.doc.len = 1,
8 | stopwords = tm::stopwords("english"), min.char = 3, max.char = NULL,
9 | stem = FALSE, denumber = TRUE)
10 | }
11 | \arguments{
12 | \item{text}{A character vector.}
13 |
14 | \item{doc.names}{An optional vector of document names corresponding to the
15 | length of \code{text}.}
16 |
17 | \item{min.term.freq}{The minimum times a term must appear to be included in
18 | the \code{\link[tm]{DocumentTermMatrix}}.}
19 |
20 | \item{min.doc.len}{The minimum words a document must contain to be included
21 | in the data structure (other wise it is stored as a \code{removed} element).}
22 |
23 | \item{stopwords}{A vector of stopwords to remove.}
24 |
25 | \item{min.char}{The minial length character for retained words.}
26 |
27 | \item{max.char}{The maximum length character for retained words.}
28 |
29 | \item{stem}{Logical. If \code{TRUE} the \code{stopwords} will be stemmed.}
30 |
31 | \item{denumber}{Logical. If \code{TRUE} numbers will be excluded.}
32 | }
33 | \value{
34 | Returns a list containing:
35 | \describe{
36 | \item{dtm}{A tf-idf weighted \code{\link[tm]{DocumentTermMatrix}}}
37 | \item{text}{The text vector with unanalyzable elements removed}
38 | \item{removed}{The indices of the removed text elements, i.e., documents not meeting \code{min.doc.len}}
39 | \item{n.nonsparse}{The length of the non-zero elements}
40 | }
41 | }
42 | \description{
43 | A data structure which stores the text, DocumentTermMatrix, and information
44 | regarding removed text elements which can not be handled by the
45 | \code{hierarchical_cluster} function. This structure is required because it
46 | documents important meta information, including removed elements, required by
47 | other \pkg{clustext} functions. If the user wishes to combine documents
48 | (say by a common grouping variable) it is recomended this be handled by
49 | \code{\link[textshape]{combine}} prior to using \code{data_store}.
50 | }
51 | \examples{
52 | data_store(presidential_debates_2012[["dialogue"]])
53 |
54 | ## Use `combine` to merge text prior to `data_stare`
55 | library(textshape)
56 | library(dplyr)
57 |
58 | dat <- presidential_debates_2012 \%>\%
59 | dplyr::select(person, time, dialogue) \%>\%
60 | textshape::combine()
61 |
62 | ## Elements in `ds` correspond to `dat` grouping vars
63 | (ds <- with(dat, data_store(dialogue)))
64 | dplyr::select(dat, -3)
65 |
66 | ## Add row names
67 | (ds2 <- with(dat, data_store(dialogue, paste(person, time, sep = "_"))))
68 | rownames(ds2[["dtm"]])
69 |
70 | ## Get a DocumentTermMatrix
71 | get_dtm(ds2)
72 | }
73 | \keyword{data}
74 | \keyword{structure}
75 |
76 |
--------------------------------------------------------------------------------
/man/get_documents.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_documents.R
3 | \name{get_documents}
4 | \alias{get_documents}
5 | \alias{get_documents.assign_cluster}
6 | \title{Get Documents Based on Cluster Assignment in \code{assign_cluster}}
7 | \usage{
8 | get_documents(x, ...)
9 |
10 | \method{get_documents}{assign_cluster}(x, ...)
11 | }
12 | \arguments{
13 | \item{x}{A \code{\link[clustext]{assign_cluster}} object.}
14 |
15 | \item{\ldots}{ignored.}
16 | }
17 | \value{
18 | Returns a list of \code{\link[base]{vector}}s of document names.
19 | }
20 | \description{
21 | Get the documents associated with each of the k clusters .
22 | }
23 | \examples{
24 | library(dplyr)
25 |
26 | mydocuments1 <- presidential_debates_2012 \%>\%
27 | with(data_store(dialogue, paste(person, time, sep="-"))) \%>\%
28 | hierarchical_cluster() \%>\%
29 | assign_cluster(k = 6) \%>\%
30 | get_documents()
31 |
32 | mydocuments1
33 |
34 | mydocuments2 <- presidential_debates_2012 \%>\%
35 | with(data_store(dialogue)) \%>\%
36 | hierarchical_cluster() \%>\%
37 | assign_cluster(k = 55) \%>\%
38 | get_documents()
39 |
40 | mydocuments2
41 | }
42 |
43 |
--------------------------------------------------------------------------------
/man/get_dtm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_dtm.R
3 | \name{get_dtm}
4 | \alias{get_dtm}
5 | \alias{get_dtm.data_store}
6 | \alias{get_dtm.hierarchical_cluster}
7 | \alias{get_dtm.kmeans_cluster}
8 | \alias{get_dtm.nmf_cluster}
9 | \alias{get_dtm.skmeans_cluster}
10 | \title{Get a \code{\link[tm]{DocumentTermMatrix}} Stored in a \code{hierarchical_cluster} Object}
11 | \usage{
12 | get_dtm(x, ...)
13 |
14 | \method{get_dtm}{data_store}(x, ...)
15 |
16 | \method{get_dtm}{hierarchical_cluster}(x, ...)
17 |
18 | \method{get_dtm}{kmeans_cluster}(x, ...)
19 |
20 | \method{get_dtm}{skmeans_cluster}(x, ...)
21 |
22 | \method{get_dtm}{nmf_cluster}(x, ...)
23 | }
24 | \arguments{
25 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.}
26 |
27 | \item{\ldots}{ignored.}
28 | }
29 | \value{
30 | Returns a \code{\link[tm]{DocumentTermMatrix}}.
31 | }
32 | \description{
33 | Extract the \code{\link[tm]{DocumentTermMatrix}} supplied to/produced by a
34 | \code{\link[clustext]{hierarchical_cluster}} object.
35 | }
36 | \examples{
37 | library(dplyr)
38 |
39 | presidential_debates_2012 \%>\%
40 | with(data_store(dialogue)) \%>\%
41 | hierarchical_cluster() \%>\%
42 | get_dtm()
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/man/get_removed.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_removed.R
3 | \name{get_removed}
4 | \alias{get_removed}
5 | \alias{get_removed.data_store}
6 | \alias{get_removed.hierarchical_cluster}
7 | \alias{get_removed.kmeans_cluster}
8 | \alias{get_removed.nmf_cluster}
9 | \alias{get_removed.skmeans_cluster}
10 | \title{Get a Text Stored in a \code{hierarchical_cluster} Object}
11 | \usage{
12 | get_removed(x, ...)
13 |
14 | \method{get_removed}{hierarchical_cluster}(x, ...)
15 |
16 | \method{get_removed}{kmeans_cluster}(x, ...)
17 |
18 | \method{get_removed}{skmeans_cluster}(x, ...)
19 |
20 | \method{get_removed}{nmf_cluster}(x, ...)
21 |
22 | \method{get_removed}{data_store}(x, ...)
23 | }
24 | \arguments{
25 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.}
26 |
27 | \item{\ldots}{ignored.}
28 | }
29 | \value{
30 | Returns a vector of text strings.
31 | }
32 | \description{
33 | Extract the text supplied to the
34 | \code{\link[clustext]{hierarchical_cluster}} object.
35 | }
36 | \examples{
37 | library(dplyr)
38 |
39 | presidential_debates_2012 \%>\%
40 | with(data_store(dialogue)) \%>\%
41 | hierarchical_cluster() \%>\%
42 | get_removed()
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/man/get_terms.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_terms.R
3 | \name{get_terms}
4 | \alias{get_terms}
5 | \alias{get_terms.assign_cluster_hierarchical}
6 | \alias{get_terms.assign_cluster_kmeans}
7 | \alias{get_terms.assign_cluster_nmf}
8 | \alias{get_terms.assign_cluster_skmeans}
9 | \title{Get Terms Based on Cluster Assignment in \code{assign_cluster}}
10 | \usage{
11 | get_terms(x, min.weight = 0.6, nrow = NULL, ...)
12 |
13 | \method{get_terms}{assign_cluster_hierarchical}(x, min.weight = 0.6,
14 | nrow = NULL, ...)
15 |
16 | \method{get_terms}{assign_cluster_kmeans}(x, min.weight = 0.6, nrow = NULL,
17 | ...)
18 |
19 | \method{get_terms}{assign_cluster_skmeans}(x, min.weight = 0.6, nrow = NULL,
20 | ...)
21 |
22 | \method{get_terms}{assign_cluster_nmf}(x, min.weight = 0.6, nrow = NULL,
23 | ...)
24 | }
25 | \arguments{
26 | \item{x}{A \code{\link[clustext]{assign_cluster}} object.}
27 |
28 | \item{min.weight}{The lowest min/max scaled tf-idf weighting to consider
29 | as a document's salient term.}
30 |
31 | \item{nrow}{The max number of rows to display in the returned
32 | \code{\link[base]{data.frame}}s.}
33 |
34 | \item{\ldots}{ignored.}
35 | }
36 | \value{
37 | Returns a list of \code{\link[base]{data.frame}}s of top weighted terms.
38 | }
39 | \description{
40 | Get the terms weighted (either by tf-idf or returned from the model) and
41 | min/max scaling associated with each of the k clusters .
42 | }
43 | \examples{
44 | library(dplyr)
45 | library(textshape)
46 |
47 | myterms <- presidential_debates_2012 \%>\%
48 | with(data_store(dialogue)) \%>\%
49 | hierarchical_cluster() \%>\%
50 | assign_cluster(k = 55) \%>\%
51 | get_terms()
52 |
53 | myterms
54 | textshape::tidy_list(myterms[!sapply(myterms, is.null)], "Topic")
55 | \dontrun{
56 | library(ggplot2)
57 | library(gridExtra)
58 | library(dplyr)
59 | library(textshape)
60 | library(wordcloud)
61 |
62 | max.n <- max(textshape::tidy_list(myterms)[["n"]])
63 |
64 | myplots <- Map(function(x, y){
65 | x \%>\%
66 | mutate(term = factor(term, levels = rev(term))) \%>\%
67 | ggplot(aes(term, weight=n)) +
68 | geom_bar() +
69 | scale_y_continuous(expand = c(0, 0),limits=c(0, max.n)) +
70 | ggtitle(sprintf("Topic: \%s", y)) +
71 | coord_flip()
72 | }, myterms, names(myterms))
73 |
74 | myplots[["ncol"]] <- 10
75 |
76 | do.call(gridExtra::grid.arrange, myplots[!sapply(myplots, is.null)])
77 |
78 | ##wordclouds
79 | par(mfrow=c(5, 11), mar=c(0, 4, 0, 0))
80 | Map(function(x, y){
81 | wordcloud::wordcloud(x[[1]], x[[2]], scale=c(1,.25),min.freq=1)
82 | mtext(sprintf("Topic: \%s", y), col = "blue", cex=.55, padj = 1.5)
83 | }, myterms, names(myterms))
84 | }
85 | }
86 |
87 |
--------------------------------------------------------------------------------
/man/get_text.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_text.R
3 | \name{get_text}
4 | \alias{get_text}
5 | \alias{get_text.assign_cluster}
6 | \alias{get_text.data_store}
7 | \alias{get_text.default}
8 | \alias{get_text.hierarchical_cluster}
9 | \alias{get_text.kmeans_cluster}
10 | \alias{get_text.nmf_cluster}
11 | \alias{get_text.skmeans_cluster}
12 | \title{Get a Text Stored in Various Objects}
13 | \usage{
14 | get_text(x, ...)
15 |
16 | \method{get_text}{default}(x, ...)
17 |
18 | \method{get_text}{hierarchical_cluster}(x, ...)
19 |
20 | \method{get_text}{kmeans_cluster}(x, ...)
21 |
22 | \method{get_text}{nmf_cluster}(x, ...)
23 |
24 | \method{get_text}{skmeans_cluster}(x, ...)
25 |
26 | \method{get_text}{data_store}(x, ...)
27 |
28 | \method{get_text}{assign_cluster}(x, ...)
29 | }
30 | \arguments{
31 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.}
32 |
33 | \item{\ldots}{ignored.}
34 | }
35 | \value{
36 | Returns a vector or list of text strings.
37 | }
38 | \description{
39 | Extract the text supplied to the
40 | \code{\link[clustext]{hierarchical_cluster}} object.
41 | }
42 | \examples{
43 | library(dplyr)
44 |
45 | presidential_debates_2012 \%>\%
46 | with(data_store(dialogue)) \%>\%
47 | hierarchical_cluster() \%>\%
48 | get_text() \%>\%
49 | head()
50 | }
51 |
52 |
--------------------------------------------------------------------------------
/man/hierarchical_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/hierarchical_cluster.R
3 | \name{hierarchical_cluster}
4 | \alias{hierarchical_cluster}
5 | \alias{hierarchical_cluster.data_store}
6 | \title{Fit a Hierarchical Cluster}
7 | \usage{
8 | hierarchical_cluster(x, distance = "cosine", method = "ward.D2", ...)
9 |
10 | \method{hierarchical_cluster}{data_store}(x, distance = "cosine",
11 | method = "ward.D", ...)
12 | }
13 | \arguments{
14 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).}
15 |
16 | \item{distance}{A distance measure ("cosine" or "jaccard").}
17 |
18 | \item{method}{The agglomeration method to be used. This must be (an
19 | unambiguous abbreviation of) one of \code{"single"}, \code{"complete"},
20 | \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"},
21 | \code{"centroid"}, or \code{"median"}.}
22 |
23 | \item{\ldots}{ignored.}
24 | }
25 | \value{
26 | Returns an object of class \code{"hclust"}.
27 | }
28 | \description{
29 | Fit a hierarchical cluster to text data. Prior to distance measures being
30 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
31 | \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate
32 | the distance matrix supplied to \code{\link[fastcluster]{hclust}}. \code{method}
33 | defaults to \code{"ward.D2"}. A faster cosine dissimilarity calculation is used
34 | under the hood (see \code{\link[clustext]{cosine_distance}}). Additionally,
35 | \code{\link[fastcluster]{hclust}} is used to quickly calculate the fit.
36 | Essentially, this is a wrapper function optimized for clustering text data.
37 | }
38 | \examples{
39 | library(dplyr)
40 |
41 | x <- with(
42 | presidential_debates_2012,
43 | data_store(dialogue, paste(person, time, sep = "_"))
44 | )
45 |
46 | hierarchical_cluster(x) \%>\%
47 | plot(k=4)
48 |
49 | hierarchical_cluster(x) \%>\%
50 | plot(h=.7, lwd=2)
51 |
52 | hierarchical_cluster(x) \%>\%
53 | assign_cluster(h=.7)
54 |
55 | \dontrun{
56 | ## interactive cutting
57 | hierarchical_cluster(x) \%>\%
58 | plot(h=TRUE)
59 | }
60 |
61 | hierarchical_cluster(x, method="complete") \%>\%
62 | plot(k=6)
63 |
64 | hierarchical_cluster(x) \%>\%
65 | assign_cluster(k=6)
66 |
67 | x2 <- presidential_debates_2012 \%>\%
68 | with(data_store(dialogue))
69 |
70 | myfit2 <- hierarchical_cluster(x2)
71 |
72 | plot(myfit2)
73 | plot(myfit2, 55)
74 |
75 | assign_cluster(myfit2, k = 55)
76 |
77 | ## Example from StackOverflow Question Response
78 | ## Asking fo grouping similar texts together
79 | ## http://stackoverflow.com/q/22936951/1000343
80 | dat <- data.frame(
81 | person = LETTERS[1:3],
82 | text = c("Best way to waste money",
83 | "Amazing stuff. lets you stay connected all the time",
84 | "Instrument to waste money and time"),
85 | stringsAsFactors = FALSE
86 | )
87 |
88 |
89 | x <- with(
90 | dat,
91 | data_store(text, person)
92 | )
93 |
94 |
95 | hierarchical_cluster(x) \%>\%
96 | plot(h=.9, lwd=2)
97 |
98 | hierarchical_cluster(x) \%>\%
99 | assign_cluster(h=.9)
100 |
101 |
102 | hierarchical_cluster(x) \%>\%
103 | assign_cluster(h=.9) \%>\%
104 | get_terms()
105 |
106 | hierarchical_cluster(x) \%>\%
107 | assign_cluster(h=.9) \%>\%
108 | get_terms() \%>\%
109 | as_topic()
110 |
111 | hierarchical_cluster(x) \%>\%
112 | assign_cluster(h=.9) \%>\%
113 | get_documents()
114 | }
115 |
116 |
--------------------------------------------------------------------------------
/man/jaccard_distance.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/jaccard_distance.R
3 | \name{jaccard_distance}
4 | \alias{jaccard_distance}
5 | \alias{jaccard_distance.DocumentTermMatrix}
6 | \alias{jaccard_distance.TermDocumentMatrix}
7 | \title{Optimized Computation of Jaccard Distance}
8 | \usage{
9 | jaccard_distance(x, ...)
10 |
11 | \method{jaccard_distance}{DocumentTermMatrix}(x, ...)
12 |
13 | \method{jaccard_distance}{TermDocumentMatrix}(x, ...)
14 | }
15 | \arguments{
16 | \item{x}{A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or
17 | \code{\link[tm]{TermDocumentMatrix}}).}
18 |
19 | \item{\ldots}{ignored.}
20 | }
21 | \value{
22 | Returns a jaccard distance object of class \code{"dist"}.
23 | }
24 | \description{
25 | Utilizes the \pkg{slam} package to efficiently calculate jaccard distance
26 | on large sparse matrices.
27 | }
28 | \examples{
29 | library(gofastr)
30 | library(dplyr)
31 |
32 | out <- presidential_debates_2012 \%>\%
33 | with(q_dtm(dialogue)) \%>\%
34 | jaccard_distance()
35 | }
36 | \author{
37 | user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker .
38 | }
39 | \references{
40 | \url{http://stackoverflow.com/a/36373333/1000343}
41 | \url{http://stats.stackexchange.com/a/89947/7482}
42 | }
43 | \keyword{dissimilarity}
44 | \keyword{jaccard}
45 |
46 |
--------------------------------------------------------------------------------
/man/kmeans_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/kmeans_cluster.R
3 | \name{kmeans_cluster}
4 | \alias{kmeans_cluster}
5 | \alias{kmeans_cluster.data_store}
6 | \title{Fit a Kmeans Cluster}
7 | \usage{
8 | kmeans_cluster(x, k, ...)
9 |
10 | \method{kmeans_cluster}{data_store}(x, k, ...)
11 | }
12 | \arguments{
13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).}
14 |
15 | \item{k}{The number of clusters.}
16 |
17 | \item{\ldots}{Other arguments passed to \code{\link[stats]{kmeans}}.}
18 | }
19 | \value{
20 | Returns an object of class \code{"kmeans"}.
21 | }
22 | \description{
23 | Fit a kmeans cluster to text data. Prior to distance measures being
24 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
25 | \code{\link[tm]{DocumentTermMatrix}}.
26 | }
27 | \examples{
28 | \dontrun{
29 | library(dplyr)
30 |
31 | x <- with(
32 | presidential_debates_2012,
33 | data_store(dialogue, paste(person, time, sep = "_"))
34 | )
35 |
36 |
37 | ## 6 topic model
38 | kmeans_cluster(x, k=6)
39 |
40 | kmeans_cluster(x, k=6) \%>\%
41 | assign_cluster()
42 |
43 | kmeans_cluster(x, k=6) \%>\%
44 | assign_cluster() \%>\%
45 | summary()
46 |
47 | x2 <- presidential_debates_2012 \%>\%
48 | with(data_store(dialogue))
49 |
50 | myfit2 <- kmeans_cluster(x2, 55)
51 |
52 | assign_cluster(myfit2)
53 |
54 | assign_cluster(myfit2) \%>\%
55 | summary()
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/man/nmf_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/nmf_cluster.R
3 | \name{nmf_cluster}
4 | \alias{nmf_cluster}
5 | \alias{nmf_cluster.data_store}
6 | \title{Fit a Non-Negative Matrix Factorization Cluster}
7 | \usage{
8 | nmf_cluster(x, k = k, ...)
9 |
10 | \method{nmf_cluster}{data_store}(x, k, ...)
11 | }
12 | \arguments{
13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).}
14 |
15 | \item{k}{The number of clusters.}
16 |
17 | \item{\ldots}{Other arguments passed to \code{\link[rNMF]{rnmf}}.}
18 | }
19 | \value{
20 | Returns an object of class \code{"hclust"}.
21 | }
22 | \description{
23 | Fit a robust non-negative matrix factorization cluster to text data via
24 | \code{\link[rNMF]{rnmf}}. Prior to distance measures being
25 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
26 | \code{\link[tm]{DocumentTermMatrix}}.
27 | }
28 | \examples{
29 | library(dplyr)
30 |
31 | x <- with(
32 | presidential_debates_2012,
33 | data_store(dialogue, paste(person, time, sep = "_"))
34 | )
35 |
36 |
37 | ## 6 topic model
38 | model6 <- nmf_cluster(x, k=6)
39 |
40 | model6 \%>\%
41 | assign_cluster()
42 |
43 | model6 \%>\%
44 | assign_cluster() \%>\%
45 | summary()
46 | \dontrun{
47 | x2 <- presidential_debates_2012 \%>\%
48 | with(data_store(dialogue))
49 |
50 | myfit2 <- nmf_cluster(x2, 55)
51 |
52 | assign_cluster(myfit2)
53 |
54 | assign_cluster(myfit2) \%>\%
55 | summary()
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/man/plot.hierarchical_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/hierarchical_cluster.R
3 | \name{plot.hierarchical_cluster}
4 | \alias{plot.hierarchical_cluster}
5 | \title{Plots a hierarchical_cluster Object}
6 | \usage{
7 | \method{plot}{hierarchical_cluster}(x, k = approx_k(get_dtm(x)), h = NULL,
8 | color = "red", digits = 3, ...)
9 | }
10 | \arguments{
11 | \item{x}{A hierarchical_cluster object.}
12 |
13 | \item{k}{The number of clusters (can supply \code{h} instead). Defaults to
14 | use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced
15 | by \code{data_storage}. Boxes are drawn around the clusters.}
16 |
17 | \item{h}{The height at which to cut the dendrograms (determines number of
18 | clusters). If this argument is supplied \code{k} is ignored. A line is drawn
19 | showing the cut point on the dendrogram. If \code{h} is set to \code{TRUE}
20 | or \code{"locator"} then the cutting becomes interactive and the height is
21 | returned invisibly.}
22 |
23 | \item{color}{The color to make the cluster boxes (\code{k}) or line (\code{h}).}
24 |
25 | \item{digits}{The number o digits to display if h\code{h} is set to
26 | interactive.}
27 |
28 | \item{\ldots}{Other arguments passed to \code{\link[stats]{rect.hclust}} or
29 | \code{\link[graphics]{abline}}.}
30 | }
31 | \description{
32 | Plots a hierarchical_cluster object
33 | }
34 |
35 |
--------------------------------------------------------------------------------
/man/presidential_debates_2012.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/clustext-package.R
3 | \docType{data}
4 | \name{presidential_debates_2012}
5 | \alias{presidential_debates_2012}
6 | \title{2012 U.S. Presidential Debates}
7 | \format{A data frame with 2912 rows and 4 variables}
8 | \usage{
9 | data(presidential_debates_2012)
10 | }
11 | \description{
12 | A dataset containing a cleaned version of all three presidential debates for
13 | the 2012 election.
14 | }
15 | \details{
16 | \itemize{
17 | \item person. The speaker
18 | \item tot. Turn of talk
19 | \item dialogue. The words spoken
20 | \item time. Variable indicating which of the three debates the dialogue is from
21 | }
22 | }
23 | \keyword{datasets}
24 |
25 |
--------------------------------------------------------------------------------
/man/print.as_topic.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/as_topic.R
3 | \name{print.as_topic}
4 | \alias{print.as_topic}
5 | \title{Prints an as_topic Object}
6 | \usage{
7 | \method{print}{as_topic}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{An as_topic object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints an as_topic object
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/man/print.assign_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_cluster.R
3 | \name{print.assign_cluster}
4 | \alias{print.assign_cluster}
5 | \title{Prints an assign_cluster Object}
6 | \usage{
7 | \method{print}{assign_cluster}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{An assign_cluster object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints an assign_cluster object
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/man/print.compare.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compare.R
3 | \name{print.compare}
4 | \alias{print.compare}
5 | \title{Prints a compare Object.}
6 | \usage{
7 | \method{print}{compare}(x, digits = 3, ...)
8 | }
9 | \arguments{
10 | \item{x}{The compare object}
11 |
12 | \item{digits}{Number of decimal places to print.}
13 |
14 | \item{\ldots}{ignored}
15 | }
16 | \description{
17 | Prints a compare object.
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/man/print.data_store.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data_store.R
3 | \name{print.data_store}
4 | \alias{print.data_store}
5 | \title{Prints a data_store Object}
6 | \usage{
7 | \method{print}{data_store}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{A data_store object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints a data_store object
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/man/print.get_documents.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_documents.R
3 | \name{print.get_documents}
4 | \alias{print.get_documents}
5 | \title{Prints a get_documents Object}
6 | \usage{
7 | \method{print}{get_documents}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{A get_documents object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints a get_documents object
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/man/print.get_terms.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_terms.R
3 | \name{print.get_terms}
4 | \alias{print.get_terms}
5 | \title{Prints a get_terms Object}
6 | \usage{
7 | \method{print}{get_terms}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{A get_terms object.}
11 |
12 | \item{\ldots}{ignored.}
13 | }
14 | \description{
15 | Prints a get_terms object
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/man/skmeans_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/skmeans_cluster.R
3 | \name{skmeans_cluster}
4 | \alias{skmeans_cluster}
5 | \alias{skmeans_cluster.data_store}
6 | \title{Fit a skmean Cluster}
7 | \usage{
8 | skmeans_cluster(x, k, ...)
9 |
10 | \method{skmeans_cluster}{data_store}(x, k, ...)
11 | }
12 | \arguments{
13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).}
14 |
15 | \item{k}{The number of clusters.}
16 |
17 | \item{\ldots}{Other arguments passed to \code{\link[skmeans]{skmeans}}.}
18 | }
19 | \value{
20 | Returns an object of class \code{"skmean"}.
21 | }
22 | \description{
23 | Fit a skmean cluster to text data. Prior to distance measures being
24 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the
25 | \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate
26 | the distance matrix supplied to \code{\link[skmeans]{skmeans}}.
27 | }
28 | \examples{
29 | library(dplyr)
30 |
31 | x <- with(
32 | presidential_debates_2012,
33 | data_store(dialogue, paste(person, time, sep = "_"))
34 | )
35 |
36 |
37 | ## 6 topic model
38 | myfit1 <- skmeans_cluster(x, k=6)
39 |
40 | myfit1 \%>\%
41 | assign_cluster()
42 |
43 | myfit1 \%>\%
44 | assign_cluster() \%>\%
45 | summary()
46 |
47 | \dontrun{
48 | x2 <- presidential_debates_2012 \%>\%
49 | with(data_store(dialogue))
50 |
51 | myfit2 <- skmeans_cluster(x2, 55)
52 |
53 | assign_cluster(myfit2)
54 |
55 | assign_cluster(myfit2) \%>\%
56 | summary()
57 | }
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/man/summary.assign_cluster.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/assign_cluster.R
3 | \name{summary.assign_cluster}
4 | \alias{summary.assign_cluster}
5 | \title{Summary of an assign_cluster Object}
6 | \usage{
7 | \method{summary}{assign_cluster}(object, plot = TRUE, print = TRUE, ...)
8 | }
9 | \arguments{
10 | \item{object}{An assign_cluster object.}
11 |
12 | \item{plot}{logical. If \code{TRUE} an accompanying bar plot is produced a
13 | well.}
14 |
15 | \item{print}{logical. If \code{TRUE} data.frame counts are printed.}
16 |
17 | \item{\ldots}{ignored.}
18 | }
19 | \description{
20 | Summary of an assign_cluster object
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/man/write_cluster_text.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/write_cluster_text.R
3 | \name{write_cluster_text}
4 | \alias{read_cluster_text}
5 | \alias{write_cluster_text}
6 | \title{Write/Read Cluster Text for Human Categorization}
7 | \usage{
8 | write_cluster_text(x, path, n.sample = NULL, lead = " * ", ...)
9 |
10 | read_cluster_text(path, ...)
11 | }
12 | \arguments{
13 | \item{x}{An \code{assign_cluster} object.}
14 |
15 | \item{path}{A pather to the file (.txt) is recommended.}
16 |
17 | \item{n.sample}{The length to limit the sample to (default gives all text in the cluster).
18 | Setting this to an integer uses this as the number to randomly sample from.}
19 |
20 | \item{lead}{A leading character string prefix to give the cluster text.}
21 |
22 | \item{\ldots}{ignored.}
23 | }
24 | \description{
25 | Write cluster text from \code{get_text(assign_cluster(myfit))} to an external
26 | file for categorization. After file has been written with
27 | \code{write_cluster_text} a human coder can assign categories to each cluster.
28 | Simple write the category after the \code{Cluster #:}. To set a cluster category
29 | equal to another simply write and equal sign follwed by the other cluster to set
30 | as the same category (e.g., \code{Cluster 10: =5} to set cluster #10 the same as
31 | cluster #5). See \code{readLines(system.file("additional/foo_turk.txt", package = "clustext"))}
32 | for an example.
33 | }
34 | \examples{
35 | library(dplyr)
36 |
37 | ## Assign Clusters
38 | ca <- presidential_debates_2012 \%>\%
39 | with(data_store(dialogue)) \%>\%
40 | hierarchical_cluster() \%>\%
41 | assign_cluster(k = 7)
42 |
43 | ## Write Cluster Text for Human Categorization
44 | write_cluster_text(ca)
45 | write_cluster_text(ca, n.sample=10)
46 | write_cluster_text(ca, lead=" -", n.sample=10)
47 |
48 | ## Read Human Coded Categories Back In
49 | categories_file <- system.file("additional/foo_turk.txt", package = "clustext")
50 | readLines(categories_file)
51 | (categories_key <- read_cluster_text(categories_file))
52 |
53 | ## Add Categories Back to Original Data Set
54 | categorize(
55 | data = presidential_debates_2012,
56 | assign.cluster = ca,
57 | cluster.key = categories_key
58 | )
59 | }
60 | \seealso{
61 | \code{\link[clustext]{categorize}}
62 | }
63 |
64 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library("testthat")
2 | library("clustext")
3 |
4 | test_check("clustext")
--------------------------------------------------------------------------------
/tests/testthat/test-assign_cluster.R:
--------------------------------------------------------------------------------
1 | context("Checking assign_cluster")
2 |
3 | test_that("assign_cluster ...",{
4 |
5 |
6 | })
7 |
8 |
--------------------------------------------------------------------------------
/tools/clustext_logo/r_clustext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustext.png
--------------------------------------------------------------------------------
/tools/clustext_logo/r_clustext.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustext.pptx
--------------------------------------------------------------------------------
/tools/clustext_logo/r_clustexta.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustexta.png
--------------------------------------------------------------------------------
/tools/clustext_logo/resize_icon.txt:
--------------------------------------------------------------------------------
1 | cd C:\Users\Tyler\GitHub\clustext\tools\clustext_logo
2 | ffmpeg -i r_clustexta.png -vf scale=250:-1 r_clustext.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-12-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-14-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-15-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-16-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-17-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-18-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-19-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-20-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-22-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-6-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-2.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-6-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-3.png
--------------------------------------------------------------------------------
/tools/figure/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------