├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS ├── NEWS.md ├── R ├── approx_k.R ├── as_topic.R ├── assign_cluster.R ├── categorize.R ├── clustext-package.R ├── compare.R ├── cosine_distance.R ├── data_store.R ├── get_documents.R ├── get_dtm.R ├── get_removed.R ├── get_terms.R ├── get_text.R ├── hierarchical_cluster.R ├── jaccard_distance.R ├── kmeans_cluster.R ├── nmf_cluster.R ├── skmeans_cluster.R ├── utils.R └── write_cluster_text.R ├── README.Rmd ├── README.md ├── data ├── assignments.rda └── presidential_debates_2012.rda ├── inst ├── CITATION ├── additional │ └── foo_turk.txt ├── build.R ├── extra_scripts │ └── build_data.R ├── extra_statdoc │ └── readme.R ├── maintenance.R └── staticdocs │ └── index.R ├── man ├── approx_k.Rd ├── as_topic.Rd ├── assign_cluster.Rd ├── assignments.Rd ├── categorize.Rd ├── clustext.Rd ├── compare.Rd ├── cosine_distance.Rd ├── data_store.Rd ├── get_documents.Rd ├── get_dtm.Rd ├── get_removed.Rd ├── get_terms.Rd ├── get_text.Rd ├── hierarchical_cluster.Rd ├── jaccard_distance.Rd ├── kmeans_cluster.Rd ├── nmf_cluster.Rd ├── plot.hierarchical_cluster.Rd ├── presidential_debates_2012.Rd ├── print.as_topic.Rd ├── print.assign_cluster.Rd ├── print.compare.Rd ├── print.data_store.Rd ├── print.get_documents.Rd ├── print.get_terms.Rd ├── skmeans_cluster.Rd ├── summary.assign_cluster.Rd └── write_cluster_text.Rd ├── tests ├── testthat.R └── testthat │ └── test-assign_cluster.R └── tools ├── clustext_logo ├── r_clustext.png ├── r_clustext.pptx ├── r_clustexta.png └── resize_icon.txt └── figure ├── unnamed-chunk-12-1.png ├── unnamed-chunk-14-1.png ├── unnamed-chunk-15-1.png ├── unnamed-chunk-16-1.png ├── unnamed-chunk-17-1.png ├── unnamed-chunk-18-1.png ├── unnamed-chunk-19-1.png ├── unnamed-chunk-20-1.png ├── unnamed-chunk-22-1.png ├── unnamed-chunk-6-1.png ├── unnamed-chunk-6-2.png ├── unnamed-chunk-6-3.png └── unnamed-chunk-8-1.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.gitignore 4 | NEWS.md 5 | FAQ.md 6 | NEWS.html 7 | FAQ.html 8 | ^\.travis\.yml$ 9 | travis-tool.sh 10 | inst/web 11 | contributors.geojson 12 | inst/build.R 13 | ^.*\.Rprofile$ 14 | README.Rmd 15 | README.R 16 | travis.yml 17 | inst/maintenance.R 18 | Thumb.db 19 | tools/clustext_logo/r_clustexta.png 20 | tools/clustext_logo/r_clustext.pptx 21 | tools/clustext_logo/resize_icon.txt 22 | inst/staticdocs 23 | inst/extra_statdoc 24 | inst/extra_scripts 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | 4 | # Example code in package build process 5 | *-Ex.R 6 | 7 | .Rprofile 8 | .Rproj.user 9 | clustext.Rproj 10 | Thumbs.db -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | 3 | sudo: false 4 | before_install: 5 | - sh -e /etc/init.d/xvfb start 6 | 7 | r_github_packages: 8 | - jimhester/covr 9 | - trinker/gofastr 10 | - trinker/textshape 11 | - trinker/termco 12 | notifications: 13 | email: 14 | on_success: change 15 | on_failure: change 16 | 17 | after_success: 18 | - Rscript -e 'covr::coveralls()' 19 | 20 | r_build_args: "--resave-data=best" 21 | r_check_args: "--as-cran" 22 | 23 | env: 24 | global: 25 | - DISPLAY=:99.0 26 | - BOOTSTRAP_LATEX=1 27 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clustext 2 | Title: Consistent Clustering for Text Data 3 | Version: 0.1.1 4 | Authors@R: c(person("Tyler", "Rinker", email = 5 | "tyler.rinker@gmail.com", role = c("aut", "cre"))) 6 | Maintainer: Tyler Rinker 7 | Description: Optimized, consistent tools for clustering text data. 8 | Depends: R (>= 3.2.3) 9 | Imports: dplyr, dynamicTreeCut, fastcluster, gofastr, graphics, Matrix, 10 | mclust, methods, rNMF, skmeans, slam, stats, termco, textshape, 11 | tm 12 | Suggests: testthat 13 | Date: 2017-04-14 14 | License: GPL-2 15 | LazyData: TRUE 16 | Roxygen: list(wrap = FALSE) 17 | RoxygenNote: 5.0.1 18 | BugReports: https://github.com/trinker/clustext/issues?state=open 19 | URL: https://github.com/trinker/clustext 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(approx_k,DocumentTermMatrix) 4 | S3method(approx_k,TermDocumentMatrix) 5 | S3method(as_topic,get_terms) 6 | S3method(assign_cluster,hierarchical_cluster) 7 | S3method(assign_cluster,kmeans_cluster) 8 | S3method(assign_cluster,nmf_cluster) 9 | S3method(assign_cluster,skmeans_cluster) 10 | S3method(cosine_distance,DocumentTermMatrix) 11 | S3method(cosine_distance,TermDocumentMatrix) 12 | S3method(get_documents,assign_cluster) 13 | S3method(get_dtm,data_store) 14 | S3method(get_dtm,hierarchical_cluster) 15 | S3method(get_dtm,kmeans_cluster) 16 | S3method(get_dtm,nmf_cluster) 17 | S3method(get_dtm,skmeans_cluster) 18 | S3method(get_removed,data_store) 19 | S3method(get_removed,hierarchical_cluster) 20 | S3method(get_removed,kmeans_cluster) 21 | S3method(get_removed,nmf_cluster) 22 | S3method(get_removed,skmeans_cluster) 23 | S3method(get_terms,assign_cluster_hierarchical) 24 | S3method(get_terms,assign_cluster_kmeans) 25 | S3method(get_terms,assign_cluster_nmf) 26 | S3method(get_terms,assign_cluster_skmeans) 27 | S3method(get_text,assign_cluster) 28 | S3method(get_text,data_store) 29 | S3method(get_text,default) 30 | S3method(get_text,hierarchical_cluster) 31 | S3method(get_text,kmeans_cluster) 32 | S3method(get_text,nmf_cluster) 33 | S3method(get_text,skmeans_cluster) 34 | S3method(hierarchical_cluster,data_store) 35 | S3method(jaccard_distance,DocumentTermMatrix) 36 | S3method(jaccard_distance,TermDocumentMatrix) 37 | S3method(kmeans_cluster,data_store) 38 | S3method(nmf_cluster,data_store) 39 | S3method(plot,hierarchical_cluster) 40 | S3method(print,as_topic) 41 | S3method(print,assign_cluster) 42 | S3method(print,compare) 43 | S3method(print,data_store) 44 | S3method(print,get_documents) 45 | S3method(print,get_terms) 46 | S3method(skmeans_cluster,data_store) 47 | S3method(summary,assign_cluster) 48 | export(approx_k) 49 | export(as_topic) 50 | export(assign_cluster) 51 | export(categorize) 52 | export(compare) 53 | export(cosine_distance) 54 | export(data_store) 55 | export(get_documents) 56 | export(get_dtm) 57 | export(get_removed) 58 | export(get_terms) 59 | export(get_text) 60 | export(hierarchical_cluster) 61 | export(jaccard_distance) 62 | export(kmeans_cluster) 63 | export(nmf_cluster) 64 | export(read_cluster_text) 65 | export(skmeans_cluster) 66 | export(write_cluster_text) 67 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | NEWS 2 | ==== 3 | 4 | Versioning 5 | ---------- 6 | 7 | Releases will be numbered with the following semantic versioning format: 8 | 9 | .. 10 | 11 | And constructed with the following guidelines: 12 | 13 | * Breaking backward compatibility bumps the major (and resets the minor 14 | and patch) 15 | * New additions without breaking backward compatibility bumps the minor 16 | (and resets the patch) 17 | * Bug fixes and misc changes bumps the patch 18 | 19 | 20 | 21 | clustext 0.1.0- 22 | ---------------------------------------------------------------- 23 | 24 | BUG FIXES 25 | 26 | * `join` from an `assign_cluster`'s `attributes`, now explicitly joins on (`by`) 27 | `'id_temporary'`. 28 | 29 | NEW FEATURES 30 | 31 | * `as_topic` added to convert `get_terms` to more print friendly cluster-topic-terms 32 | display. 33 | 34 | * `write_cluster_text` & `read_cluster_text` added to write/read cluster text for 35 | human categorization. 36 | 37 | * `categorize` added to join original data, clusters, and human categories. 38 | 39 | MINOR FEATURES 40 | 41 | IMPROVEMENTS 42 | 43 | CHANGES 44 | 45 | clustext 0.0.1 46 | ---------------------------------------------------------------- 47 | 48 | This package is a collection of tools for optimized, consistent clustering of text data. 49 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | NEWS 2 | ==== 3 | 4 | Versioning 5 | ---------- 6 | 7 | Releases will be numbered with the following semantic versioning format: 8 | 9 | <major>.<minor>.<patch> 10 | 11 | And constructed with the following guidelines: 12 | 13 | * Breaking backward compatibility bumps the major (and resets the minor 14 | and patch) 15 | * New additions without breaking backward compatibility bumps the minor 16 | (and resets the patch) 17 | * Bug fixes and misc changes bumps the patch 18 | 19 | 20 | 21 | clustext 0.1.0- 22 | ---------------------------------------------------------------- 23 | 24 | **BUG FIXES** 25 | 26 | * `join` from an `assign_cluster`'s `attributes`, now explicitly joins on (`by`) 27 | `'id_temporary'`. 28 | 29 | **NEW FEATURES** 30 | 31 | * `as_topic` added to convert `get_terms` to more print friendly cluster-topic-terms 32 | display. 33 | 34 | * `write_cluster_text` & `read_cluster_text` added to write/read cluster text for 35 | human categorization. 36 | 37 | * `categorize` added to join original data, clusters, and human categories. 38 | 39 | **MINOR FEATURES** 40 | 41 | **IMPROVEMENTS** 42 | 43 | **CHANGES** 44 | 45 | clustext 0.0.1 46 | ---------------------------------------------------------------- 47 | 48 | This package is a collection of tools for optimized, consistent clustering of text data. -------------------------------------------------------------------------------- /R/approx_k.R: -------------------------------------------------------------------------------- 1 | #' Approximate Number of Clusters for a Text Matrix 2 | #' 3 | #' Can & Ozkarahan (1990) formula for approximating the number of clusters for 4 | #' a text matrix: \eqn{(m * n)/t} where \eqn{m} and \eqn{n} are the dimensions 5 | #' of the matrix and \eqn{t} is the length of the non-zero elements in matrix 6 | #' \eqn{A}. 7 | #' 8 | #' @param x A matrix. 9 | #' @param verbose logical. If \code{TRUE} the k determination is printed. 10 | #' @return Returns an integer. 11 | #' @references Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of 12 | #' the cover-coefficient-based clustering methodology for text databases. 13 | #' ACM Transactions on Database Systems 15 (4): 483. doi:10.1145/99935.99938. \cr 14 | #' @rdname approx_k 15 | #' @export 16 | #' @examples 17 | #' library(gofastr) 18 | #' library(dplyr) 19 | #' 20 | #' presidential_debates_2012 %>% 21 | #' with(q_dtm(dialogue)) %>% 22 | #' approx_k() 23 | approx_k <- function(x, verbose = TRUE){ 24 | 25 | UseMethod("approx_k") 26 | 27 | } 28 | 29 | #' @export 30 | #' @rdname approx_k 31 | #' @method approx_k TermDocumentMatrix 32 | approx_k.TermDocumentMatrix <- function(x, verbose = TRUE) { 33 | m <- round(do.call("*", as.list(dim(x)))/length(x[["v"]])) 34 | if (verbose) cat(sprintf("\nk approximated to: %s\n", m)) 35 | m 36 | } 37 | 38 | #' @export 39 | #' @rdname approx_k 40 | #' @method approx_k DocumentTermMatrix 41 | approx_k.DocumentTermMatrix <- function(x, verbose = TRUE) { 42 | m <- round(do.call("*", as.list(dim(x)))/length(x[["v"]])) 43 | if (verbose) cat(sprintf("\nk approximated to: %s\n", m)) 44 | m 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /R/as_topic.R: -------------------------------------------------------------------------------- 1 | #' Convert \code{get_terms} to Topics 2 | #' 3 | #' View important terms as a comma separated string (a topic). 4 | #' 5 | #' @param x A \code{get_terms} object. 6 | #' @param max.n The max number of words to show before truncation. 7 | #' @param sort logical. If \code{TRUE} the cluster topics are sorted by size 8 | #' (number of documents) otherwise the topics are sorted by cluster number. 9 | #' @param \ldots ignored. 10 | #' @return Returns a \code{\link[base]{data.frame}} of \code{"cluster"}, 11 | #' \code{"count"}, and \code{"terms"}. Pretty prints as clusters, number of 12 | #' documents, and associated important terms. 13 | #' @export 14 | #' @examples 15 | #' library(dplyr) 16 | #' 17 | #' myfit5 <- presidential_debates_2012 %>% 18 | #' mutate(tot = gsub("\\..+$", "", tot)) %>% 19 | #' textshape::combine() %>% 20 | #' filter(person %in% c("ROMNEY", "OBAMA")) %>% 21 | #' with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>% 22 | #' hierarchical_cluster() 23 | #' 24 | #' ca5 <- assign_cluster(myfit5, k = 50) 25 | #' 26 | #' get_terms(ca5, .4) %>% 27 | #' as_topic() 28 | #' 29 | #' get_terms(ca5, .4) %>% 30 | #' as_topic(sort=FALSE) 31 | #' 32 | #' get_terms(ca5, .95) %>% 33 | #' as_topic() 34 | as_topic <- function(x, max.n = 8, sort = TRUE, ...){ 35 | UseMethod("as_topic") 36 | } 37 | 38 | 39 | #' @export 40 | #' @rdname as_topic 41 | #' @method as_topic get_terms 42 | as_topic.get_terms <- function(x, max.n = 8, sort = TRUE, ...){ 43 | 44 | cluster <- NULL 45 | 46 | terms <- lapply(x, function(x) { 47 | if (is.null(x)) return(NA) 48 | trms <- x[['term']] 49 | if (length(trms) > max.n) { 50 | paste0(paste(trms[1:max.n], collapse= ", "), "...") 51 | } else { 52 | paste(trms, collapse= ", ") 53 | } 54 | }) 55 | 56 | dat <- dplyr::left_join( 57 | summary(attributes(x)[["assignment"]], plot=FALSE), 58 | textshape::tidy_list(terms, "cluster", "terms"), 59 | by = 'cluster' 60 | ) 61 | if (!isTRUE(sort)){ 62 | dat <- dplyr::arrange(dat, as.numeric(cluster)) 63 | } 64 | class(dat) <- c('as_topic', class(dat)) 65 | dat 66 | } 67 | 68 | 69 | #' Prints an as_topic Object 70 | #' 71 | #' Prints an as_topic object 72 | #' 73 | #' @param x An as_topic object. 74 | #' @param \ldots ignored. 75 | #' @method print as_topic 76 | #' @export 77 | print.as_topic <- function(x, ...){ 78 | cat(paste(sprintf("Cluster %s (n=%s): %s", x[['cluster']], x[['count']], 79 | x[['terms']]), collapse="\n"), "\n") 80 | } 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /R/assign_cluster.R: -------------------------------------------------------------------------------- 1 | #' Assign Clusters to Documents/Text Elements 2 | #' 3 | #' Assign clusters to documents/text elements. 4 | #' 5 | #' @param x a \code{xxx_cluster} object. 6 | #' @param k The number of clusters (can supply \code{h} instead). Defaults to 7 | #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced 8 | #' by \code{data_storage}. 9 | #' @param h The height at which to cut the dendrograms (determines number of 10 | #' clusters). If this argument is supplied \code{k} is ignored. 11 | #' @param cut The type of cut method to use for \code{hierarchical_cluster}; one 12 | #' of \code{'static'}, \code{'dynamic'} or \code{'iterative'}. 13 | #' @param deepSplit logical. See \code{\link[dynamicTreeCut]{cutreeDynamic}}. 14 | #' @param minClusterSize The minimum cluster size. See 15 | #' \code{\link[dynamicTreeCut]{cutreeDynamic}}. 16 | #' @param \ldots ignored. 17 | #' @return Returns an \code{assign_cluster} object; a named vector of cluster 18 | #' assignments with documents as names. The object also contains the original 19 | #' \code{data_storage} object and a \code{join} function. \code{join} is a 20 | #' function (a closure) that captures information about the \code{assign_cluster} 21 | #' that makes rejoining to the original data set simple. The user simply 22 | #' supplies the original data set as an argument to \code{join} 23 | #' (\code{attributes(FROM_ASSIGN_CLUSTER)$join(ORIGINAL_DATA)}). 24 | #' @rdname assign_cluster 25 | #' @export 26 | #' @examples 27 | #' \dontrun{ 28 | #' library(dplyr) 29 | #' 30 | #' x <- with( 31 | #' presidential_debates_2012, 32 | #' data_store(dialogue, paste(person, time, sep = "_")) 33 | #' ) 34 | #' 35 | #' hierarchical_cluster(x) %>% 36 | #' plot(h=.7, lwd=2) 37 | #' 38 | #' hierarchical_cluster(x) %>% 39 | #' assign_cluster(h=.7) 40 | #' 41 | #' hierarchical_cluster(x, method="complete") %>% 42 | #' plot(k=6) 43 | #' 44 | #' hierarchical_cluster(x) %>% 45 | #' assign_cluster(k=6) 46 | #' 47 | #' 48 | #' x2 <- presidential_debates_2012 %>% 49 | #' with(data_store(dialogue)) %>% 50 | #' hierarchical_cluster() 51 | #' 52 | #' ca2 <- assign_cluster(x2, k = 55) 53 | #' summary(ca2) 54 | #' 55 | #' ## Dynamic cut 56 | #' ca3 <- assign_cluster(x2, cut = 'dynamic', minClusterSize = 5) 57 | #' get_text(ca3) 58 | #' 59 | #' ## add to original data 60 | #' attributes(ca2)$join(presidential_debates_2012) 61 | #' 62 | #' ## split text into clusters 63 | #' get_text(ca2) 64 | #' 65 | #' ## Kmeans Algorithm 66 | #' kmeans_cluster(x, k=6) %>% 67 | #' assign_cluster() 68 | #' 69 | #' x3 <- presidential_debates_2012 %>% 70 | #' with(data_store(dialogue)) %>% 71 | #' kmeans_cluster(55) 72 | #' 73 | #' ca3 <- assign_cluster(x3) 74 | #' summary(ca3) 75 | #' 76 | #' ## split text into clusters 77 | #' get_text(ca3) 78 | #' } 79 | assign_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL, ...){ 80 | UseMethod("assign_cluster") 81 | } 82 | 83 | 84 | #' @export 85 | #' @rdname assign_cluster 86 | #' @method assign_cluster hierarchical_cluster 87 | assign_cluster.hierarchical_cluster <- function(x, k = approx_k(get_dtm(x)), 88 | h = NULL, cut = 'static', deepSplit = TRUE, minClusterSize = 1, ...){ 89 | 90 | id_temporary <- n <- NULL 91 | 92 | 93 | switch(cut, 94 | 95 | static = { 96 | if (!is.null(h)){ 97 | out <- stats::cutree(x, h=h) 98 | } else { 99 | out <- stats::cutree(x, k=k) 100 | } 101 | }, 102 | dynamic = { 103 | y <- x 104 | attributes(y)[['text_data_store']] <- NULL 105 | class(y) <- 'hclust' 106 | out <- dynamicTreeCut::cutreeDynamic( 107 | dendro = y, 108 | cutHeight = NULL, 109 | minClusterSize = minClusterSize, 110 | method = "tree", 111 | deepSplit = deepSplit, 112 | ... 113 | ) 114 | names(out) <- y[['labels']] 115 | }, 116 | iterative = { 117 | stop("'iterative', method not implemented yet;\n Use \'static\' or \'dynamic\'") 118 | }, 119 | stop('`cut` must be one of: \'static\', \'dynamic\' or \'iterative\'') 120 | ) 121 | 122 | orig <- attributes(x)[['text_data_store']][['data']] 123 | lens <- length(orig[['text']]) + length(orig[['removed']]) 124 | 125 | class(out) <- c("assign_cluster_hierarchical", "assign_cluster", class(out)) 126 | 127 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]] 128 | attributes(out)[["model"]] <- x 129 | attributes(out)[["algorithm"]] <- 'hierarchical' 130 | vect <- c(out) 131 | attributes(out)[["join"]] <- function(x) { 132 | 133 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x))) 134 | 135 | dplyr::select( 136 | dplyr::left_join( 137 | dplyr::mutate(x, id_temporary = as.character(1:n())), 138 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') ), 139 | by = 'id_temporary' 140 | ), 141 | -id_temporary 142 | ) 143 | } 144 | out 145 | 146 | } 147 | 148 | 149 | #' @export 150 | #' @rdname assign_cluster 151 | #' @method assign_cluster kmeans_cluster 152 | assign_cluster.kmeans_cluster <- function(x, ...){ 153 | 154 | out <- x[['cluster']] 155 | n <- id_temporary <- NULL 156 | orig <- attributes(x)[['text_data_store']][['data']] 157 | lens <- length(orig[['text']]) + length(orig[['removed']]) 158 | 159 | class(out) <- c("assign_cluster_kmeans","assign_cluster", class(out)) 160 | 161 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]] 162 | attributes(out)[["model"]] <- x 163 | attributes(out)[["algorithm"]] <- 'kmeans' 164 | vect <- c(out) 165 | attributes(out)[["join"]] <- function(x) { 166 | 167 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x))) 168 | 169 | dplyr::select( 170 | dplyr::left_join( 171 | dplyr::mutate(x, id_temporary = as.character(1:n())), 172 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') ) 173 | ), 174 | -id_temporary 175 | ) 176 | } 177 | out 178 | 179 | } 180 | 181 | 182 | 183 | #' @export 184 | #' @rdname assign_cluster 185 | #' @method assign_cluster skmeans_cluster 186 | assign_cluster.skmeans_cluster <- function(x, ...){ 187 | 188 | out <- x[['cluster']] 189 | n <- id_temporary <- NULL 190 | orig <- attributes(x)[['text_data_store']][['data']] 191 | lens <- length(orig[['text']]) + length(orig[['removed']]) 192 | 193 | class(out) <- c("assign_cluster_skmeans","assign_cluster", class(out)) 194 | 195 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]] 196 | attributes(out)[["model"]] <- x 197 | attributes(out)[["algorithm"]] <- 'skmeans' 198 | vect <- c(out) 199 | attributes(out)[["join"]] <- function(x) { 200 | 201 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x))) 202 | 203 | dplyr::select( 204 | dplyr::left_join( 205 | dplyr::mutate(x, id_temporary = as.character(1:n())), 206 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') ) 207 | ), 208 | -id_temporary 209 | ) 210 | } 211 | out 212 | 213 | } 214 | 215 | 216 | #' @export 217 | #' @rdname assign_cluster 218 | #' @method assign_cluster nmf_cluster 219 | assign_cluster.nmf_cluster <- function(x, ...){ 220 | 221 | out <- unlist(apply(x[['W']], 1, which.max)) 222 | 223 | n <- id_temporary <- NULL 224 | orig <- attributes(x)[['text_data_store']][['data']] 225 | lens <- length(orig[['text']]) + length(orig[['removed']]) 226 | 227 | class(out) <- c("assign_cluster_nmf","assign_cluster", class(out)) 228 | 229 | attributes(out)[["data_store"]] <- attributes(x)[["text_data_store"]] 230 | attributes(out)[["model"]] <- x 231 | attributes(out)[["algorithm"]] <- 'nmf' 232 | vect <- c(out) 233 | attributes(out)[["join"]] <- function(x) { 234 | 235 | if (nrow(x) != lens) warning(sprintf("original data had %s elements, `x` has %s", lens, nrow(x))) 236 | 237 | dplyr::select( 238 | dplyr::left_join( 239 | dplyr::mutate(x, id_temporary = as.character(1:n())), 240 | dplyr::tbl_df(textshape::tidy_vector(vect, 'id_temporary', 'cluster') ) 241 | ), 242 | -id_temporary 243 | ) 244 | } 245 | out 246 | 247 | } 248 | 249 | 250 | #' Prints an assign_cluster Object 251 | #' 252 | #' Prints an assign_cluster object 253 | #' 254 | #' @param x An assign_cluster object. 255 | #' @param \ldots ignored. 256 | #' @method print assign_cluster 257 | #' @export 258 | print.assign_cluster <- function(x, ...){ 259 | print(stats::setNames(as.integer(x), names(x))) 260 | } 261 | 262 | 263 | #' Summary of an assign_cluster Object 264 | #' 265 | #' Summary of an assign_cluster object 266 | #' 267 | #' @param object An assign_cluster object. 268 | #' @param plot logical. If \code{TRUE} an accompanying bar plot is produced a 269 | #' well. 270 | #' @param print logical. If \code{TRUE} data.frame counts are printed. 271 | #' @param \ldots ignored. 272 | #' @method summary assign_cluster 273 | #' @export 274 | summary.assign_cluster <- function(object, plot = TRUE, print = TRUE, ...){ 275 | count <- NULL 276 | out <- textshape::tidy_table(table(as.integer(object)), "cluster", "count") 277 | if (isTRUE(plot)) print(termco::plot_counts(as.integer(object), item.name = "Cluster")) 278 | if (isTRUE(print)) dplyr::arrange(as.data.frame(out), dplyr::desc(count)) 279 | } 280 | 281 | 282 | 283 | 284 | 285 | -------------------------------------------------------------------------------- /R/categorize.R: -------------------------------------------------------------------------------- 1 | #' Merge Clusters & Cluster Categories Back to Original Data 2 | #' 3 | #' Merge clusters, categories, and the original data back together. 4 | #' 5 | #' @param data A data set that was fit with a cluster model. 6 | #' @param assign.cluster An \code{\link[clustext]{assign_cluster}} object. 7 | #' @param cluster.key An \code{\link[clustext]{assign_cluster}} object. 8 | #' @return Returns a \code{\link[base]{data.frame}} key of clusters and categories. 9 | #' @export 10 | #' @seealso \code{\link[clustext]{write_cluster_text}}, 11 | #' \code{\link[clustext]{read_cluster_text}} 12 | #' @examples 13 | #' library(dplyr) 14 | #' 15 | #' ## Assign Clusters 16 | #' ca <- presidential_debates_2012 %>% 17 | #' with(data_store(dialogue)) %>% 18 | #' hierarchical_cluster() %>% 19 | #' assign_cluster(k = 7) 20 | #' 21 | #' ## Write Cluster Text for Human Categorization 22 | #' write_cluster_text(ca) 23 | #' write_cluster_text(ca, n.sample=10) 24 | #' write_cluster_text(ca, lead=" -", n.sample=10) 25 | #' 26 | #' ## Read Human Coded Categories Back In 27 | #' categories_file <- system.file("additional/foo_turk.txt", package = "clustext") 28 | #' readLines(categories_file) 29 | #' (categories_key <- read_cluster_text(categories_file)) 30 | #' 31 | #' ## Add Categories Back to Original Data Set 32 | #' categorize( 33 | #' data = presidential_debates_2012, 34 | #' assign.cluster = ca, 35 | #' cluster.key = categories_key 36 | #' ) 37 | categorize <- function(data, assign.cluster, cluster.key) { 38 | 39 | stopifnot(methods::is(assign.cluster, 'assign_cluster')) 40 | stopifnot(methods::is(cluster.key, 'cluster_key')) 41 | 42 | data <- attributes(assign.cluster)[["join"]](data) 43 | dplyr::left_join(data, cluster.key, by = 'cluster') 44 | } -------------------------------------------------------------------------------- /R/clustext-package.R: -------------------------------------------------------------------------------- 1 | #' Consistent Clustering for Text Data 2 | #' 3 | #' Optimized, consistent tools for clustering text data. 4 | #' @docType package 5 | #' @name clustext 6 | #' @aliases clustext package-clustext 7 | NULL 8 | 9 | 10 | #' 2012 U.S. Presidential Debates 11 | #' 12 | #' A dataset containing a cleaned version of all three presidential debates for 13 | #' the 2012 election. 14 | #' 15 | #' @details 16 | #' \itemize{ 17 | #' \item person. The speaker 18 | #' \item tot. Turn of talk 19 | #' \item dialogue. The words spoken 20 | #' \item time. Variable indicating which of the three debates the dialogue is from 21 | #' } 22 | #' 23 | #' @docType data 24 | #' @keywords datasets 25 | #' @name presidential_debates_2012 26 | #' @usage data(presidential_debates_2012) 27 | #' @format A data frame with 2912 rows and 4 variables 28 | NULL 29 | 30 | 31 | #' Topic Assignments 32 | #' 33 | #' A dataset containing a list of topic assignments by various clustering 34 | #' algorithms. Assignments correspond to the rows (minus empty rows) of the 35 | #' \code{presidential_debates_2012} data set. 36 | #' 37 | #' @docType data 38 | #' @keywords datasets 39 | #' @name assignments 40 | #' @usage data(assignments) 41 | #' @format A list with 3 elements 42 | NULL 43 | -------------------------------------------------------------------------------- /R/compare.R: -------------------------------------------------------------------------------- 1 | #' Adjusted Rand Index Comaprison Between Algorithms 2 | #' 3 | #' An Adjusted Rand Index comparison of the assignments between different 4 | #' clustering algorithms. 5 | #' 6 | #' @param \ldots A series of outputs from \code{assign_cluster} for various 7 | #' cluster algorithmns. 8 | #' @return Returns a pair-wise comparison matrix of Adjusted Rand Indices for 9 | #' algorithm. Higher Adjusted Rand Index scores indicate higher cluster 10 | #' assignment agreement. 11 | #' @references \url{http://faculty.washington.edu/kayee/pca/supp.pdf} 12 | #' @export 13 | #' @examples 14 | #' compare( 15 | #' assignments$hierarchical_assignment, 16 | #' assignments$kmeans_assignment, 17 | #' assignments$skmeans_assignment, 18 | #' assignments$nmf_assignment 19 | #' ) 20 | #' 21 | #' ## Understanding the ARI 22 | #' set.seed(10) 23 | #' w <- sample(1:10, 40, TRUE) 24 | #' x <- 11-w 25 | #' set.seed(20) 26 | #' y <- sample(1:10, 40, TRUE) 27 | #' set.seed(50) 28 | #' z <- sample(1:10, 40, TRUE) 29 | #' 30 | #' data.frame(w, x, y, z) 31 | #' 32 | #' library(mclust) 33 | #' 34 | #' mclust::adjustedRandIndex(w, x) 35 | #' mclust::adjustedRandIndex(x, y) 36 | #' mclust::adjustedRandIndex(x, z) 37 | compare <- function(...) { 38 | nms <- unlist(lapply(list(...), function(x) attributes(x)[['algorithm']])) 39 | vouter(stats::setNames(as.data.frame(list(...)), nms), mclust::adjustedRandIndex) 40 | 41 | } 42 | 43 | 44 | #comare(assignments[[1]], assignments[[2]], assignments[[3]]) 45 | 46 | vouter <- function(x, FUN, ...){ 47 | 48 | nc <- ncol(x) 49 | mat <- matrix(rep(NA, nc^2), nc) 50 | for (i in 1:nc) { 51 | for (j in 1:nc) { 52 | mat[i, j] <- FUN(.subset2(x, i), .subset2(x, j)) 53 | } 54 | } 55 | dimnames(mat) <- list(colnames(x), colnames(x)) 56 | class(mat) <- c("compare", class(mat)) 57 | mat 58 | } 59 | 60 | 61 | #' Prints a compare Object. 62 | #' 63 | #' Prints a compare object. 64 | #' 65 | #' @param x The compare object 66 | #' @param digits Number of decimal places to print. 67 | #' @param \ldots ignored 68 | #' @method print compare 69 | #' @export 70 | print.compare <- function(x, digits = 3, ...) { 71 | WD <- options()[["width"]] 72 | options(width=3000) 73 | y <- unclass(x) 74 | if (is.numeric(y) & !is.null(digits)) { 75 | y <- round(y, digits = digits) 76 | } 77 | print(y) 78 | options(width=WD) 79 | } 80 | 81 | -------------------------------------------------------------------------------- /R/cosine_distance.R: -------------------------------------------------------------------------------- 1 | #' Optimized Computation of Cosine Distance 2 | #' 3 | #' Utilizes the \pkg{slam} package to efficiently calculate cosine distance 4 | #' on large sparse matrices. 5 | #' 6 | #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or 7 | #' \code{\link[tm]{TermDocumentMatrix}}). 8 | #' @param \ldots ignored. 9 | #' @return Returns a cosine distance object of class \code{"dist"}. 10 | #' @references \url{http://stackoverflow.com/a/29755756/1000343} 11 | #' @keywords cosine dissimilarity 12 | #' @rdname cosine_distance 13 | #' @export 14 | #' @author Michael Andrec and Tyler Rinker . 15 | #' @examples 16 | #' library(gofastr) 17 | #' library(dplyr) 18 | #' 19 | #' out <- presidential_debates_2012 %>% 20 | #' with(q_dtm(dialogue)) %>% 21 | #' cosine_distance() 22 | cosine_distance <- function(x, ...){ 23 | UseMethod("cosine_distance") 24 | } 25 | 26 | 27 | #' @export 28 | #' @rdname cosine_distance 29 | #' @method cosine_distance DocumentTermMatrix 30 | cosine_distance.DocumentTermMatrix <- function(x, ...){ 31 | x <- t(slam::as.simple_triplet_matrix(x)) 32 | stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2))))) 33 | } 34 | 35 | 36 | #' @export 37 | #' @rdname cosine_distance 38 | #' @method cosine_distance TermDocumentMatrix 39 | cosine_distance.TermDocumentMatrix <- function(x, ...){ 40 | x <- slam::as.simple_triplet_matrix(x) 41 | stats::as.dist(1 - slam::crossprod_simple_triplet_matrix(x)/(sqrt(slam::col_sums(x^2) %*% t(slam::col_sums(x^2))))) 42 | } 43 | 44 | 45 | -------------------------------------------------------------------------------- /R/data_store.R: -------------------------------------------------------------------------------- 1 | #' Data Structure for \pkg{hclusttext} 2 | #' 3 | #' A data structure which stores the text, DocumentTermMatrix, and information 4 | #' regarding removed text elements which can not be handled by the 5 | #' \code{hierarchical_cluster} function. This structure is required because it 6 | #' documents important meta information, including removed elements, required by 7 | #' other \pkg{clustext} functions. If the user wishes to combine documents 8 | #' (say by a common grouping variable) it is recomended this be handled by 9 | #' \code{\link[textshape]{combine}} prior to using \code{data_store}. 10 | #' 11 | #' @param text A character vector. 12 | #' @param doc.names An optional vector of document names corresponding to the 13 | #' length of \code{text}. 14 | #' @param min.term.freq The minimum times a term must appear to be included in 15 | #' the \code{\link[tm]{DocumentTermMatrix}}. 16 | #' @param min.doc.len The minimum words a document must contain to be included 17 | #' in the data structure (other wise it is stored as a \code{removed} element). 18 | #' @param stopwords A vector of stopwords to remove. 19 | #' @param min.char The minial length character for retained words. 20 | #' @param max.char The maximum length character for retained words. 21 | #' @param stem Logical. If \code{TRUE} the \code{stopwords} will be stemmed. 22 | #' @param denumber Logical. If \code{TRUE} numbers will be excluded. 23 | #' @return Returns a list containing: 24 | #' \describe{ 25 | #' \item{dtm}{A tf-idf weighted \code{\link[tm]{DocumentTermMatrix}}} 26 | #' \item{text}{The text vector with unanalyzable elements removed} 27 | #' \item{removed}{The indices of the removed text elements, i.e., documents not meeting \code{min.doc.len}} 28 | #' \item{n.nonsparse}{The length of the non-zero elements} 29 | #' } 30 | #' @keywords data structure 31 | #' @export 32 | #' @examples 33 | #' data_store(presidential_debates_2012[["dialogue"]]) 34 | #' 35 | #' ## Use `combine` to merge text prior to `data_stare` 36 | #' library(textshape) 37 | #' library(dplyr) 38 | #' 39 | #' dat <- presidential_debates_2012 %>% 40 | #' dplyr::select(person, time, dialogue) %>% 41 | #' textshape::combine() 42 | #' 43 | #' ## Elements in `ds` correspond to `dat` grouping vars 44 | #' (ds <- with(dat, data_store(dialogue))) 45 | #' dplyr::select(dat, -3) 46 | #' 47 | #' ## Add row names 48 | #' (ds2 <- with(dat, data_store(dialogue, paste(person, time, sep = "_")))) 49 | #' rownames(ds2[["dtm"]]) 50 | #' 51 | #' ## Get a DocumentTermMatrix 52 | #' get_dtm(ds2) 53 | data_store <- function(text, doc.names, min.term.freq = 1, min.doc.len = 1, 54 | stopwords = tm::stopwords("english"), min.char = 3, max.char = NULL, 55 | stem = FALSE, denumber = TRUE){ 56 | 57 | stopifnot(is.atomic(text)) 58 | if (missing(doc.names)) doc.names <- seq_len(length(text)) 59 | stopifnot(length(text) == length(doc.names)) 60 | 61 | if (isTRUE(stem)){ 62 | dtm <- gofastr::q_dtm_stem(text, docs = doc.names) 63 | } else { 64 | dtm <- gofastr::q_dtm(text, docs = doc.names) 65 | } 66 | 67 | dtm <- gofastr::remove_stopwords(dtm, stopwords = stopwords, min.char = min.char, 68 | max.char = max.char, stem = stem, denumber = denumber) 69 | 70 | if (nrow(dtm) != length(text)){ 71 | text <- dplyr::group_by_(dplyr::data_frame(text=text, doc.names=doc.names), "doc.names") 72 | text <- dplyr::summarise(text, text = paste(text, collapse = " ")) 73 | text <- text[match(text[["doc.names"]], rownames(dtm)),][["text"]] 74 | } 75 | 76 | names(text) <- text_seq <- seq_len(length(text)) 77 | 78 | # remove terms 79 | dtm <- dtm[, slam::col_sums(dtm) >= min.term.freq] 80 | 81 | # remove short docs 82 | long_docs <- slam::row_sums(dtm) >= min.doc.len 83 | text <- text[long_docs] 84 | dtm <- dtm[long_docs,] 85 | 86 | # remove terms/docs again (ensure no zero lengths) 87 | # Eventually determine which elements were kept removed 88 | dtm <- dtm[, slam::col_sums(dtm) > 0] 89 | 90 | long_docs <- slam::row_sums(dtm) > 0 91 | text <- text[long_docs] 92 | dtm <- dtm[long_docs,] 93 | 94 | ## Add tf-idf 95 | dtm <- tm::weightTfIdf(dtm) 96 | 97 | out <- list(dtm = dtm, text = unname(text), 98 | removed = setdiff(text_seq, names(text)), n.nonsparse = length(dtm[["v"]])) 99 | 100 | class(out) <- "data_store" 101 | attributes(out)[["guid"]] <-rguid() 102 | out 103 | } 104 | 105 | 106 | #' Prints a data_store Object 107 | #' 108 | #' Prints a data_store object 109 | #' 110 | #' @param x A data_store object. 111 | #' @param \ldots ignored. 112 | #' @method print data_store 113 | #' @export 114 | print.data_store <- function(x, ...){ 115 | 116 | cat(sprintf("<>\n", pn2(nrow(x[["dtm"]])), pn2(ncol(x[["dtm"]])) )) 117 | cat(sprintf("Text Elements : %s\n", pn2(length(x[["text"]])) )) 118 | cat(sprintf("Elements Removed : %s\n", pn2(length(x[["removed"]])) )) 119 | #cat(sprintf("Documents : %s\n", pn2(nrow(x[["dtm"]])) )) 120 | #cat(sprintf("Terms : %s\n", pn2(ncol(x[["dtm"]])) )) 121 | cat(sprintf("Non-/sparse entries: %d/%.0f\n", x[["n.nonsparse"]], 122 | prod(dim(x[["dtm"]])) - x[["n.nonsparse"]])) 123 | if (!prod(dim(x))) { 124 | sparsity <- 100 125 | } else { 126 | sparsity <- round((1 - x[["n.nonsparse"]]/prod(dim(x[["dtm"]]))) * 100) 127 | } 128 | cat(sprintf("Sparsity : %s%%\n", sparsity)) 129 | cat(sprintf("Maximal term length: %s\n", max(nchar(colnames(x[["dtm"]]))))) 130 | cat(sprintf("Minimum term length: %s\n", min(nchar(colnames(x[["dtm"]]))))) 131 | } 132 | 133 | 134 | 135 | # 136 | # data_store_map <- function(stopwords = tm::stopwords("english"), min.char = 3, 137 | # max.char = NULL, stem = FALSE, denumber = TRUE) { 138 | # 139 | # } 140 | -------------------------------------------------------------------------------- /R/get_documents.R: -------------------------------------------------------------------------------- 1 | #' Get Documents Based on Cluster Assignment in \code{assign_cluster} 2 | #' 3 | #' Get the documents associated with each of the k clusters . 4 | #' 5 | #' @param x A \code{\link[clustext]{assign_cluster}} object. 6 | #' @param \ldots ignored. 7 | #' @return Returns a list of \code{\link[base]{vector}}s of document names. 8 | #' @export 9 | #' @rdname get_documents 10 | #' @examples 11 | #' library(dplyr) 12 | #' 13 | #' mydocuments1 <- presidential_debates_2012 %>% 14 | #' with(data_store(dialogue, paste(person, time, sep="-"))) %>% 15 | #' hierarchical_cluster() %>% 16 | #' assign_cluster(k = 6) %>% 17 | #' get_documents() 18 | #' 19 | #' mydocuments1 20 | #' 21 | #' mydocuments2 <- presidential_debates_2012 %>% 22 | #' with(data_store(dialogue)) %>% 23 | #' hierarchical_cluster() %>% 24 | #' assign_cluster(k = 55) %>% 25 | #' get_documents() 26 | #' 27 | #' mydocuments2 28 | get_documents <- function(x, ...){ 29 | UseMethod("get_documents") 30 | } 31 | 32 | #' @export 33 | #' @rdname get_documents 34 | #' @method get_documents assign_cluster 35 | get_documents.assign_cluster <- function(x, ...){ 36 | 37 | desc <- topic <- n <- NULL 38 | out <- split(attributes(x)[["names"]], x) 39 | 40 | class(out) <- c("get_documents", class(out)) 41 | out 42 | } 43 | 44 | #' Prints a get_documents Object 45 | #' 46 | #' Prints a get_documents object 47 | #' 48 | #' @param x A get_documents object. 49 | #' @param \ldots ignored. 50 | #' @method print get_documents 51 | #' @export 52 | print.get_documents <- function(x, ...){ 53 | class(x) <- "list" 54 | print(x) 55 | } 56 | -------------------------------------------------------------------------------- /R/get_dtm.R: -------------------------------------------------------------------------------- 1 | #' Get a \code{\link[tm]{DocumentTermMatrix}} Stored in a \code{hierarchical_cluster} Object 2 | #' 3 | #' Extract the \code{\link[tm]{DocumentTermMatrix}} supplied to/produced by a 4 | #' \code{\link[clustext]{hierarchical_cluster}} object. 5 | #' 6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object. 7 | #' @param \ldots ignored. 8 | #' @return Returns a \code{\link[tm]{DocumentTermMatrix}}. 9 | #' @export 10 | #' @rdname get_dtm 11 | #' @examples 12 | #' library(dplyr) 13 | #' 14 | #' presidential_debates_2012 %>% 15 | #' with(data_store(dialogue)) %>% 16 | #' hierarchical_cluster() %>% 17 | #' get_dtm() 18 | get_dtm <- function(x, ...){ 19 | UseMethod("get_dtm") 20 | } 21 | 22 | #' @export 23 | #' @rdname get_dtm 24 | #' @method get_dtm data_store 25 | get_dtm.data_store <- function(x, ...){ 26 | x[["dtm"]] 27 | } 28 | 29 | 30 | #' @export 31 | #' @rdname get_dtm 32 | #' @method get_dtm hierarchical_cluster 33 | get_dtm.hierarchical_cluster <- function(x, ...){ 34 | get_dtm(attributes(x)[["text_data_store"]][["data"]]) 35 | } 36 | 37 | 38 | #' @export 39 | #' @rdname get_dtm 40 | #' @method get_dtm kmeans_cluster 41 | get_dtm.kmeans_cluster <- function(x, ...){ 42 | get_dtm(attributes(x)[["text_data_store"]][["data"]]) 43 | } 44 | 45 | #' @export 46 | #' @rdname get_dtm 47 | #' @method get_dtm skmeans_cluster 48 | get_dtm.skmeans_cluster <- function(x, ...){ 49 | get_dtm(attributes(x)[["text_data_store"]][["data"]]) 50 | } 51 | 52 | 53 | 54 | #' @export 55 | #' @rdname get_dtm 56 | #' @method get_dtm nmf_cluster 57 | get_dtm.nmf_cluster <- function(x, ...){ 58 | get_dtm(attributes(x)[["text_data_store"]][["data"]]) 59 | } 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /R/get_removed.R: -------------------------------------------------------------------------------- 1 | #' Get a Text Stored in a \code{hierarchical_cluster} Object 2 | #' 3 | #' Extract the text supplied to the 4 | #' \code{\link[clustext]{hierarchical_cluster}} object. 5 | #' 6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object. 7 | #' @param \ldots ignored. 8 | #' @return Returns a vector of text strings. 9 | #' @export 10 | #' @rdname get_removed 11 | #' @examples 12 | #' library(dplyr) 13 | #' 14 | #' presidential_debates_2012 %>% 15 | #' with(data_store(dialogue)) %>% 16 | #' hierarchical_cluster() %>% 17 | #' get_removed() 18 | get_removed <- function(x, ...){ 19 | UseMethod("get_removed") 20 | } 21 | 22 | #' @export 23 | #' @rdname get_removed 24 | #' @method get_removed hierarchical_cluster 25 | get_removed.hierarchical_cluster <- function(x, ...){ 26 | get_removed(attributes(x)[["text_data_store"]][["data"]]) 27 | } 28 | 29 | #' @export 30 | #' @rdname get_removed 31 | #' @method get_removed kmeans_cluster 32 | get_removed.kmeans_cluster <- function(x, ...){ 33 | get_removed(attributes(x)[["text_data_store"]][["data"]]) 34 | } 35 | 36 | #' @export 37 | #' @rdname get_removed 38 | #' @method get_removed skmeans_cluster 39 | get_removed.skmeans_cluster <- function(x, ...){ 40 | get_removed(attributes(x)[["text_data_store"]][["data"]]) 41 | } 42 | 43 | 44 | #' @export 45 | #' @rdname get_removed 46 | #' @method get_removed nmf_cluster 47 | get_removed.nmf_cluster <- function(x, ...){ 48 | get_removed(attributes(x)[["text_data_store"]][["data"]]) 49 | } 50 | 51 | 52 | #' @export 53 | #' @rdname get_removed 54 | #' @method get_removed data_store 55 | get_removed.data_store <- function(x, ...){ 56 | x[["removed"]] 57 | } 58 | 59 | 60 | -------------------------------------------------------------------------------- /R/get_terms.R: -------------------------------------------------------------------------------- 1 | #' Get Terms Based on Cluster Assignment in \code{assign_cluster} 2 | #' 3 | #' Get the terms weighted (either by tf-idf or returned from the model) and 4 | #' min/max scaling associated with each of the k clusters . 5 | #' 6 | #' @param x A \code{\link[clustext]{assign_cluster}} object. 7 | #' @param min.weight The lowest min/max scaled tf-idf weighting to consider 8 | #' as a document's salient term. 9 | #' @param nrow The max number of rows to display in the returned 10 | #' \code{\link[base]{data.frame}}s. 11 | #' @param \ldots ignored. 12 | #' @return Returns a list of \code{\link[base]{data.frame}}s of top weighted terms. 13 | #' @export 14 | #' @rdname get_terms 15 | #' @examples 16 | #' library(dplyr) 17 | #' library(textshape) 18 | #' 19 | #' myterms <- presidential_debates_2012 %>% 20 | #' with(data_store(dialogue)) %>% 21 | #' hierarchical_cluster() %>% 22 | #' assign_cluster(k = 55) %>% 23 | #' get_terms() 24 | #' 25 | #' myterms 26 | #' textshape::tidy_list(myterms[!sapply(myterms, is.null)], "Topic") 27 | #' \dontrun{ 28 | #' library(ggplot2) 29 | #' library(gridExtra) 30 | #' library(dplyr) 31 | #' library(textshape) 32 | #' library(wordcloud) 33 | #' 34 | #' max.n <- max(textshape::tidy_list(myterms)[["n"]]) 35 | #' 36 | #' myplots <- Map(function(x, y){ 37 | #' x %>% 38 | #' mutate(term = factor(term, levels = rev(term))) %>% 39 | #' ggplot(aes(term, weight=n)) + 40 | #' geom_bar() + 41 | #' scale_y_continuous(expand = c(0, 0),limits=c(0, max.n)) + 42 | #' ggtitle(sprintf("Topic: %s", y)) + 43 | #' coord_flip() 44 | #' }, myterms, names(myterms)) 45 | #' 46 | #' myplots[["ncol"]] <- 10 47 | #' 48 | #' do.call(gridExtra::grid.arrange, myplots[!sapply(myplots, is.null)]) 49 | #' 50 | #' ##wordclouds 51 | #' par(mfrow=c(5, 11), mar=c(0, 4, 0, 0)) 52 | #' Map(function(x, y){ 53 | #' wordcloud::wordcloud(x[[1]], x[[2]], scale=c(1,.25),min.freq=1) 54 | #' mtext(sprintf("Topic: %s", y), col = "blue", cex=.55, padj = 1.5) 55 | #' }, myterms, names(myterms)) 56 | #' } 57 | get_terms <- function(x, min.weight = .6, nrow = NULL, ...){ 58 | UseMethod("get_terms") 59 | } 60 | 61 | 62 | 63 | #' @export 64 | #' @rdname get_terms 65 | #' @method get_terms assign_cluster_hierarchical 66 | get_terms.assign_cluster_hierarchical <- function(x, min.weight = .6, nrow = NULL, ...){ 67 | 68 | assignment <- x 69 | desc <- topic <- n <- term <- weight <- NULL 70 | dat <- attributes(x)[["data_store"]][["data"]][['dtm']] 71 | clusters <-split(names(x), x) 72 | 73 | out <- stats::setNames(lapply(clusters, function(y){ 74 | vals <- min_max(sort(slam::col_sums(dat[y,]), decreasing=TRUE)) 75 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE) 76 | }), names(clusters)) 77 | 78 | 79 | out2 <- lapply(out, function(x) { 80 | rownames(x) <- NULL 81 | x <- dplyr::filter(x, weight >= min.weight) 82 | x <- dplyr::filter(x, !is.na(term)) 83 | 84 | if (nrow(x) == 0) return(NULL) 85 | x 86 | }) 87 | 88 | if (!is.null(nrow)) { 89 | out2 <- lapply(out2, function(x){ 90 | if (is.null(x) || nrow(x) <= nrow) return(x) 91 | x[1:nrow] 92 | }) 93 | } 94 | class(out2) <- c("get_terms", class(out2)) 95 | attributes(out2)[['assignment']] <- assignment 96 | out2 97 | 98 | } 99 | 100 | 101 | 102 | 103 | 104 | 105 | #' @export 106 | #' @rdname get_terms 107 | #' @method get_terms assign_cluster_kmeans 108 | get_terms.assign_cluster_kmeans <- function(x, min.weight = .6, nrow = NULL, ...){ 109 | 110 | 111 | weight <- term <- NULL 112 | 113 | assignment <- x 114 | x <- attributes(x)[['model']] 115 | nms <- seq_along(x[['size']]) 116 | x <- x[['centers']] 117 | 118 | out <- stats::setNames(lapply(1:nrow(x), function(i){ 119 | vals <- min_max(sort(x[i, ], decreasing=TRUE)) 120 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE) 121 | }), nms) 122 | 123 | out2 <- lapply(out, function(x) { 124 | rownames(x) <- NULL 125 | x <- dplyr::filter(x, weight >= min.weight) 126 | 127 | x <- dplyr::filter(x, !is.na(term)) 128 | 129 | if (nrow(x) == 0) return(NULL) 130 | x 131 | }) 132 | 133 | if (!is.null(nrow)) { 134 | out2 <- lapply(out2, function(x){ 135 | if (is.null(x) || nrow(x) <= nrow) return(x) 136 | x[1:nrow] 137 | }) 138 | } 139 | class(out2) <- c("get_terms", class(out2)) 140 | attributes(out2)[['assignment']] <- assignment 141 | out2 142 | 143 | } 144 | 145 | 146 | #' @export 147 | #' @rdname get_terms 148 | #' @method get_terms assign_cluster_skmeans 149 | get_terms.assign_cluster_skmeans <- function(x, min.weight = .6, nrow = NULL, ...){ 150 | 151 | assignment <- x 152 | desc <- topic <- n <- term <- weight <- NULL 153 | dat <- attributes(x)[["data_store"]][["data"]][['dtm']] 154 | clusters <-split(names(x), x) 155 | 156 | out <- stats::setNames(lapply(clusters, function(y){ 157 | vals <- min_max(sort(slam::col_sums(dat[y,]), decreasing=TRUE)) 158 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE) 159 | }), names(clusters)) 160 | 161 | 162 | out2 <- lapply(out, function(x) { 163 | rownames(x) <- NULL 164 | x <- dplyr::filter(x, weight >= min.weight) 165 | x <- dplyr::filter(x, !is.na(term)) 166 | 167 | if (nrow(x) == 0) return(NULL) 168 | x 169 | }) 170 | 171 | if (!is.null(nrow)) { 172 | out2 <- lapply(out2, function(x){ 173 | if (is.null(x) || nrow(x) <= nrow) return(x) 174 | x[1:nrow] 175 | }) 176 | } 177 | class(out2) <- c("get_terms", class(out2)) 178 | attributes(out2)[['assignment']] <- assignment 179 | out2 180 | } 181 | 182 | 183 | 184 | #' @export 185 | #' @rdname get_terms 186 | #' @method get_terms assign_cluster_nmf 187 | get_terms.assign_cluster_nmf <- function(x, min.weight = .6, nrow = NULL, ...){ 188 | 189 | weight <- term <- NULL 190 | 191 | assignment <- x 192 | x <- attributes(x)[['model']] 193 | nms <- seq_len(ncol(x[["W"]])) 194 | x <- x[['H']] 195 | 196 | out <- stats::setNames(lapply(1:nrow(x), function(i){ 197 | vals <- min_max(sort(x[i, ], decreasing=TRUE)) 198 | as.data.frame(textshape::tidy_vector(vals, 'term', 'weight'), stringsAsFactors = FALSE) 199 | }), nms) 200 | 201 | out2 <- lapply(out, function(x) { 202 | rownames(x) <- NULL 203 | x <- dplyr::filter(x, weight >= min.weight) 204 | x <- dplyr::filter(x, !is.na(term)) 205 | 206 | if (nrow(x) == 0) return(NULL) 207 | x 208 | }) 209 | 210 | if (!is.null(nrow)) { 211 | out2 <- lapply(out2, function(x){ 212 | if (is.null(x) || nrow(x) <= nrow) return(x) 213 | x[1:nrow] 214 | }) 215 | } 216 | class(out2) <- c("get_terms", class(out2)) 217 | attributes(out2)[['assignment']] <- assignment 218 | out2 219 | 220 | } 221 | 222 | 223 | #' Prints a get_terms Object 224 | #' 225 | #' Prints a get_terms object 226 | #' 227 | #' @param x A get_terms object. 228 | #' @param \ldots ignored. 229 | #' @method print get_terms 230 | #' @export 231 | print.get_terms <- function(x, ...){ 232 | 233 | lens <- unlist(lapply(split(names(attributes(x)[['assignment']]), attributes(x)[['assignment']]), length)) 234 | attributes(x)[['assignment']] <- NULL 235 | 236 | class(x) <- "list" 237 | names(x) <- sprintf("%s (n=%s)", names(x), lens) 238 | print(x) 239 | } 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /R/get_text.R: -------------------------------------------------------------------------------- 1 | #' Get a Text Stored in Various Objects 2 | #' 3 | #' Extract the text supplied to the 4 | #' \code{\link[clustext]{hierarchical_cluster}} object. 5 | #' 6 | #' @param x A \code{\link[clustext]{hierarchical_cluster}} object. 7 | #' @param \ldots ignored. 8 | #' @return Returns a vector or list of text strings. 9 | #' @export 10 | #' @rdname get_text 11 | #' @examples 12 | #' library(dplyr) 13 | #' 14 | #' presidential_debates_2012 %>% 15 | #' with(data_store(dialogue)) %>% 16 | #' hierarchical_cluster() %>% 17 | #' get_text() %>% 18 | #' head() 19 | get_text <- function(x, ...){ 20 | UseMethod("get_text") 21 | } 22 | 23 | 24 | #' @export 25 | #' @rdname get_text 26 | #' @method get_text default 27 | get_text.default <- function(x, ...){ 28 | termco::get_text(x, ...) 29 | } 30 | 31 | 32 | #' @export 33 | #' @rdname get_text 34 | #' @method get_text hierarchical_cluster 35 | get_text.hierarchical_cluster <- function(x, ...){ 36 | get_text(attributes(x)[["text_data_store"]][["data"]]) 37 | } 38 | 39 | 40 | #' @export 41 | #' @rdname get_text 42 | #' @method get_text kmeans_cluster 43 | get_text.kmeans_cluster <- function(x, ...){ 44 | get_text(attributes(x)[["text_data_store"]][["data"]]) 45 | } 46 | 47 | 48 | #' @export 49 | #' @rdname get_text 50 | #' @method get_text nmf_cluster 51 | get_text.nmf_cluster <- function(x, ...){ 52 | get_text(attributes(x)[["text_data_store"]][["data"]]) 53 | } 54 | 55 | 56 | #' @export 57 | #' @rdname get_text 58 | #' @method get_text skmeans_cluster 59 | get_text.skmeans_cluster <- function(x, ...){ 60 | get_text(attributes(x)[["text_data_store"]][["data"]]) 61 | } 62 | 63 | 64 | 65 | 66 | #' @export 67 | #' @rdname get_text 68 | #' @method get_text data_store 69 | get_text.data_store <- function(x, ...){ 70 | x[["text"]] 71 | } 72 | 73 | 74 | #' @export 75 | #' @rdname get_text 76 | #' @method get_text assign_cluster 77 | get_text.assign_cluster <- function(x, ...){ 78 | split(get_text(attributes(x)[["data_store"]][["data"]]), as.integer(x)) 79 | } 80 | 81 | -------------------------------------------------------------------------------- /R/hierarchical_cluster.R: -------------------------------------------------------------------------------- 1 | #' Fit a Hierarchical Cluster 2 | #' 3 | #' Fit a hierarchical cluster to text data. Prior to distance measures being 4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 5 | #' \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate 6 | #' the distance matrix supplied to \code{\link[fastcluster]{hclust}}. \code{method} 7 | #' defaults to \code{"ward.D2"}. A faster cosine dissimilarity calculation is used 8 | #' under the hood (see \code{\link[clustext]{cosine_distance}}). Additionally, 9 | #' \code{\link[fastcluster]{hclust}} is used to quickly calculate the fit. 10 | #' Essentially, this is a wrapper function optimized for clustering text data. 11 | #' 12 | #' @param x A data store object (see \code{\link[clustext]{data_store}}). 13 | #' @param distance A distance measure ("cosine" or "jaccard"). 14 | #' @param method The agglomeration method to be used. This must be (an 15 | #' unambiguous abbreviation of) one of \code{"single"}, \code{"complete"}, 16 | #' \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"}, 17 | #' \code{"centroid"}, or \code{"median"}. 18 | #' @param \ldots ignored. 19 | #' @return Returns an object of class \code{"hclust"}. 20 | #' @export 21 | #' @rdname hierarchical_cluster 22 | #' @examples 23 | #' library(dplyr) 24 | #' 25 | #' x <- with( 26 | #' presidential_debates_2012, 27 | #' data_store(dialogue, paste(person, time, sep = "_")) 28 | #' ) 29 | #' 30 | #' hierarchical_cluster(x) %>% 31 | #' plot(k=4) 32 | #' 33 | #' hierarchical_cluster(x) %>% 34 | #' plot(h=.7, lwd=2) 35 | #' 36 | #' hierarchical_cluster(x) %>% 37 | #' assign_cluster(h=.7) 38 | #' 39 | #' \dontrun{ 40 | #' ## interactive cutting 41 | #' hierarchical_cluster(x) %>% 42 | #' plot(h=TRUE) 43 | #' } 44 | #' 45 | #' hierarchical_cluster(x, method="complete") %>% 46 | #' plot(k=6) 47 | #' 48 | #' hierarchical_cluster(x) %>% 49 | #' assign_cluster(k=6) 50 | #' 51 | #' x2 <- presidential_debates_2012 %>% 52 | #' with(data_store(dialogue)) 53 | #' 54 | #' myfit2 <- hierarchical_cluster(x2) 55 | #' 56 | #' plot(myfit2) 57 | #' plot(myfit2, 55) 58 | #' 59 | #' assign_cluster(myfit2, k = 55) 60 | #' 61 | #' ## Example from StackOverflow Question Response 62 | #' ## Asking fo grouping similar texts together 63 | #' ## http://stackoverflow.com/q/22936951/1000343 64 | #' dat <- data.frame( 65 | #' person = LETTERS[1:3], 66 | #' text = c("Best way to waste money", 67 | #' "Amazing stuff. lets you stay connected all the time", 68 | #' "Instrument to waste money and time"), 69 | #' stringsAsFactors = FALSE 70 | #' ) 71 | #' 72 | #' 73 | #' x <- with( 74 | #' dat, 75 | #' data_store(text, person) 76 | #' ) 77 | #' 78 | #' 79 | #' hierarchical_cluster(x) %>% 80 | #' plot(h=.9, lwd=2) 81 | #' 82 | #' hierarchical_cluster(x) %>% 83 | #' assign_cluster(h=.9) 84 | #' 85 | #' 86 | #' hierarchical_cluster(x) %>% 87 | #' assign_cluster(h=.9) %>% 88 | #' get_terms() 89 | #' 90 | #' hierarchical_cluster(x) %>% 91 | #' assign_cluster(h=.9) %>% 92 | #' get_terms() %>% 93 | #' as_topic() 94 | #' 95 | #' hierarchical_cluster(x) %>% 96 | #' assign_cluster(h=.9) %>% 97 | #' get_documents() 98 | hierarchical_cluster <- function(x, distance = 'cosine', method = "ward.D2", ...){ 99 | 100 | UseMethod("hierarchical_cluster") 101 | 102 | } 103 | 104 | 105 | #' @export 106 | #' @rdname hierarchical_cluster 107 | #' @method hierarchical_cluster data_store 108 | hierarchical_cluster.data_store <- function(x, distance = 'cosine', method = "ward.D", ...){ 109 | 110 | distmes <- switch(distance, 111 | 'cosine' = cosine_distance, 112 | 'jaccard' = jaccard_distance, 113 | stop('provide a valid `distance` type') 114 | ) 115 | fit <- fastcluster::hclust(distmes(x[["dtm"]]), method = method) 116 | 117 | text_data_store <- new.env(FALSE) 118 | text_data_store[["data"]] <- x 119 | 120 | class(fit) <- c("hierarchical_cluster", class(fit)) 121 | attributes(fit)[["text_data_store"]] <- text_data_store 122 | fit 123 | } 124 | 125 | 126 | 127 | #' Plots a hierarchical_cluster Object 128 | #' 129 | #' Plots a hierarchical_cluster object 130 | #' 131 | #' @param x A hierarchical_cluster object. 132 | #' @param k The number of clusters (can supply \code{h} instead). Defaults to 133 | #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced 134 | #' by \code{data_storage}. Boxes are drawn around the clusters. 135 | #' @param h The height at which to cut the dendrograms (determines number of 136 | #' clusters). If this argument is supplied \code{k} is ignored. A line is drawn 137 | #' showing the cut point on the dendrogram. If \code{h} is set to \code{TRUE} 138 | #' or \code{"locator"} then the cutting becomes interactive and the height is 139 | #' returned invisibly. 140 | #' @param color The color to make the cluster boxes (\code{k}) or line (\code{h}). 141 | #' @param digits The number o digits to display if h\code{h} is set to 142 | #' interactive. 143 | #' @param \ldots Other arguments passed to \code{\link[stats]{rect.hclust}} or 144 | #' \code{\link[graphics]{abline}}. 145 | #' @method plot hierarchical_cluster 146 | #' @export 147 | plot.hierarchical_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL, 148 | color = "red", digits = 3, ...){ 149 | 150 | if (is.null(h)) y <- k 151 | class(x) <- "hclust" 152 | graphics::plot(x) 153 | if (is.null(h) && !is.null(k)) stats::rect.hclust(x, k = y, border = color, ...) 154 | if (!is.null(h)) { 155 | if (isTRUE(h) | h == 'locator') { 156 | cat("Click a location in the plot...\n") 157 | h <- graphics::locator(1) 158 | cat(paste("You cut at h =", round(h[['y']], digits), "\n")) 159 | graphics::abline(h = h[['y']], col = color, ...) 160 | return(invisible(h[['y']])) 161 | } else { 162 | graphics::abline(h = h, col = color, ...) 163 | } 164 | } 165 | 166 | } 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /R/jaccard_distance.R: -------------------------------------------------------------------------------- 1 | #' Optimized Computation of Jaccard Distance 2 | #' 3 | #' Utilizes the \pkg{slam} package to efficiently calculate jaccard distance 4 | #' on large sparse matrices. 5 | #' 6 | #' @param x A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or 7 | #' \code{\link[tm]{TermDocumentMatrix}}). 8 | #' @param \ldots ignored. 9 | #' @return Returns a jaccard distance object of class \code{"dist"}. 10 | #' @references \url{http://stackoverflow.com/a/36373333/1000343} 11 | #' \url{http://stats.stackexchange.com/a/89947/7482} 12 | #' @keywords jaccard dissimilarity 13 | #' @rdname jaccard_distance 14 | #' @export 15 | #' @author user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker . 16 | #' @examples 17 | #' library(gofastr) 18 | #' library(dplyr) 19 | #' 20 | #' out <- presidential_debates_2012 %>% 21 | #' with(q_dtm(dialogue)) %>% 22 | #' jaccard_distance() 23 | jaccard_distance <- function(x, ...){ 24 | UseMethod("jaccard_distance") 25 | } 26 | 27 | 28 | #' @export 29 | #' @rdname jaccard_distance 30 | #' @method jaccard_distance DocumentTermMatrix 31 | jaccard_distance.DocumentTermMatrix <- function(x, ...){ 32 | mat <-sign(x) 33 | A <- slam::tcrossprod_simple_triplet_matrix(mat) 34 | im <- which(A > 0, arr.ind=TRUE) 35 | b <- slam::row_sums(mat) 36 | Aim <- A[im] 37 | 38 | stats::as.dist(1 - Matrix::sparseMatrix( 39 | i = im[,1], 40 | j = im[,2], 41 | x = Aim / (b[im[,1]] + b[im[,2]] - Aim), 42 | dims = dim(A) 43 | )) 44 | } 45 | 46 | 47 | #' @export 48 | #' @rdname jaccard_distance 49 | #' @method jaccard_distance TermDocumentMatrix 50 | jaccard_distance.TermDocumentMatrix <- function(x, ...){ 51 | mat <-sign(x) 52 | A <- slam::crossprod_simple_triplet_matrix(mat) 53 | im <- which(A > 0, arr.ind=TRUE) 54 | b <- slam::col_sums(mat) 55 | Aim <- A[im] 56 | 57 | stats::as.dist(1 - Matrix::sparseMatrix( 58 | i = im[,1], 59 | j = im[,2], 60 | x = Aim / (b[im[,1]] + b[im[,2]] - Aim), 61 | dims = dim(A) 62 | )) 63 | } 64 | 65 | -------------------------------------------------------------------------------- /R/kmeans_cluster.R: -------------------------------------------------------------------------------- 1 | #' Fit a Kmeans Cluster 2 | #' 3 | #' Fit a kmeans cluster to text data. Prior to distance measures being 4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 5 | #' \code{\link[tm]{DocumentTermMatrix}}. 6 | #' 7 | #' @param x A data store object (see \code{\link[clustext]{data_store}}). 8 | #' @param k The number of clusters. 9 | #' @param \ldots Other arguments passed to \code{\link[stats]{kmeans}}. 10 | #' @return Returns an object of class \code{"kmeans"}. 11 | #' @export 12 | #' @rdname kmeans_cluster 13 | #' @examples 14 | #' \dontrun{ 15 | #' library(dplyr) 16 | #' 17 | #' x <- with( 18 | #' presidential_debates_2012, 19 | #' data_store(dialogue, paste(person, time, sep = "_")) 20 | #' ) 21 | #' 22 | #' 23 | #' ## 6 topic model 24 | #' kmeans_cluster(x, k=6) 25 | #' 26 | #' kmeans_cluster(x, k=6) %>% 27 | #' assign_cluster() 28 | #' 29 | #' kmeans_cluster(x, k=6) %>% 30 | #' assign_cluster() %>% 31 | #' summary() 32 | #' 33 | #' x2 <- presidential_debates_2012 %>% 34 | #' with(data_store(dialogue)) 35 | #' 36 | #' myfit2 <- kmeans_cluster(x2, 55) 37 | #' 38 | #' assign_cluster(myfit2) 39 | #' 40 | #' assign_cluster(myfit2) %>% 41 | #' summary() 42 | #' } 43 | kmeans_cluster <- function(x, k, ...){ 44 | 45 | UseMethod("kmeans_cluster") 46 | 47 | } 48 | 49 | 50 | #' @export 51 | #' @rdname kmeans_cluster 52 | #' @method kmeans_cluster data_store 53 | kmeans_cluster.data_store <- function(x, k, ...){ 54 | 55 | 56 | fit <- stats::kmeans(x[["dtm"]], centers=k, ...) 57 | 58 | text_data_store <- new.env(FALSE) 59 | text_data_store[["data"]] <- x 60 | 61 | class(fit) <- c("kmeans_cluster", class(fit)) 62 | attributes(fit)[["text_data_store"]] <- text_data_store 63 | fit 64 | } 65 | 66 | 67 | 68 | # #' Plots a kmeans_cluster Object 69 | # #' 70 | # #' Plots a kmeans_cluster object 71 | # #' 72 | # #' @param x A kmeans_cluster object. 73 | # #' @param k The number of clusters (can supply \code{h} instead). Defaults to 74 | # #' use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced 75 | # #' by \code{data_storage}. Boxes are drawn around the clusters. 76 | # #' @param h The height at which to cut the dendrograms (determines number of 77 | # #' clusters). If this argument is supplied \code{k} is ignored. A line is drawn 78 | # #' showing the cut point on the dendrogram. 79 | # #' @param color The color to make the cluster boxes (\code{k}) or line (\code{h}). 80 | # #' @param \ldots Other arguments passed to \code{\link[stats]{rect.hclust}} or 81 | # #' \code{\link[graphics]{abline}}. 82 | # #' @method plot kmeans_cluster 83 | # #' @export 84 | # plot.kmeans_cluster <- function(x, k = approx_k(get_dtm(x)), h = NULL, 85 | # color = "red", ...){ 86 | # 87 | # if (is.null(h)) y <- k 88 | # class(x) <- "hclust" 89 | # graphics::plot(x) 90 | # if (is.null(h) && !is.null(k)) stats::rect.hclust(x, k = y, border = color, ...) 91 | # if (!is.null(h)) graphics::abline(h = h, col = color, ...) 92 | # } 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /R/nmf_cluster.R: -------------------------------------------------------------------------------- 1 | #' Fit a Non-Negative Matrix Factorization Cluster 2 | #' 3 | #' Fit a robust non-negative matrix factorization cluster to text data via 4 | #' \code{\link[rNMF]{rnmf}}. Prior to distance measures being 5 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 6 | #' \code{\link[tm]{DocumentTermMatrix}}. 7 | #' 8 | #' @param x A data store object (see \code{\link[clustext]{data_store}}). 9 | #' @param k The number of clusters. 10 | #' @param \ldots Other arguments passed to \code{\link[rNMF]{rnmf}}. 11 | #' @return Returns an object of class \code{"hclust"}. 12 | #' @export 13 | #' @rdname nmf_cluster 14 | #' @examples 15 | #' library(dplyr) 16 | #' 17 | #' x <- with( 18 | #' presidential_debates_2012, 19 | #' data_store(dialogue, paste(person, time, sep = "_")) 20 | #' ) 21 | #' 22 | #' 23 | #' ## 6 topic model 24 | #' model6 <- nmf_cluster(x, k=6) 25 | #' 26 | #' model6 %>% 27 | #' assign_cluster() 28 | #' 29 | #' model6 %>% 30 | #' assign_cluster() %>% 31 | #' summary() 32 | #' \dontrun{ 33 | #' x2 <- presidential_debates_2012 %>% 34 | #' with(data_store(dialogue)) 35 | #' 36 | #' myfit2 <- nmf_cluster(x2, 55) 37 | #' 38 | #' assign_cluster(myfit2) 39 | #' 40 | #' assign_cluster(myfit2) %>% 41 | #' summary() 42 | #' } 43 | nmf_cluster <- function(x, k = k, ...){ 44 | 45 | UseMethod("nmf_cluster") 46 | 47 | } 48 | 49 | 50 | #' @export 51 | #' @rdname nmf_cluster 52 | #' @method nmf_cluster data_store 53 | nmf_cluster.data_store <- function(x, k, ...){ 54 | 55 | fit <- rNMF::rnmf(as.matrix(x[["dtm"]]), k = k, ...) 56 | 57 | 58 | text_data_store <- new.env(FALSE) 59 | text_data_store[["data"]] <- x 60 | 61 | class(fit) <- c("nmf_cluster", class(fit)) 62 | attributes(fit)[["text_data_store"]] <- text_data_store 63 | fit 64 | } 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /R/skmeans_cluster.R: -------------------------------------------------------------------------------- 1 | #' Fit a skmean Cluster 2 | #' 3 | #' Fit a skmean cluster to text data. Prior to distance measures being 4 | #' calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 5 | #' \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate 6 | #' the distance matrix supplied to \code{\link[skmeans]{skmeans}}. 7 | #' 8 | #' @param x A data store object (see \code{\link[clustext]{data_store}}). 9 | #' @param k The number of clusters. 10 | #' @param \ldots Other arguments passed to \code{\link[skmeans]{skmeans}}. 11 | #' @return Returns an object of class \code{"skmean"}. 12 | #' @export 13 | #' @rdname skmeans_cluster 14 | #' @examples 15 | #' library(dplyr) 16 | #' 17 | #' x <- with( 18 | #' presidential_debates_2012, 19 | #' data_store(dialogue, paste(person, time, sep = "_")) 20 | #' ) 21 | #' 22 | #' 23 | #' ## 6 topic model 24 | #' myfit1 <- skmeans_cluster(x, k=6) 25 | #' 26 | #' myfit1 %>% 27 | #' assign_cluster() 28 | #' 29 | #' myfit1 %>% 30 | #' assign_cluster() %>% 31 | #' summary() 32 | #' 33 | #' \dontrun{ 34 | #' x2 <- presidential_debates_2012 %>% 35 | #' with(data_store(dialogue)) 36 | #' 37 | #' myfit2 <- skmeans_cluster(x2, 55) 38 | #' 39 | #' assign_cluster(myfit2) 40 | #' 41 | #' assign_cluster(myfit2) %>% 42 | #' summary() 43 | #' } 44 | skmeans_cluster <- function(x, k, ...){ 45 | 46 | UseMethod("skmeans_cluster") 47 | 48 | } 49 | 50 | 51 | #' @export 52 | #' @rdname skmeans_cluster 53 | #' @method skmeans_cluster data_store 54 | skmeans_cluster.data_store <- function(x, k, ...){ 55 | 56 | 57 | fit <- skmeans::skmeans(x[["dtm"]], k=k, ...) 58 | 59 | text_data_store <- new.env(FALSE) 60 | text_data_store[["data"]] <- x 61 | 62 | class(fit) <- c("skmeans_cluster", class(fit)) 63 | attributes(fit)[["text_data_store"]] <- text_data_store 64 | fit 65 | } 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | pn <- function(x, y) { 2 | m <- prettyNum(x, big.mark = ",", scientific = FALSE) 3 | paste0(paste(rep(" ", y - nchar(m)), collapse = ""), m) 4 | } 5 | 6 | pn2 <- function(x) prettyNum(x, big.mark = ",", scientific = FALSE) 7 | 8 | 9 | # min max scaling function 10 | min_max <- function(x) { 11 | if(max(x) - min(x) == 0) return(stats::setNames(rep(1, length(x)), names(x))) 12 | (x - min(x))/(max(x) - min(x)) 13 | } 14 | 15 | above <- function(x, threshhold) which(x >= threshhold) 16 | 17 | rguid <- function(n=20){ 18 | paste(sample(c(LETTERS, letters, 0:9), n, TRUE), collapse="") 19 | } 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /R/write_cluster_text.R: -------------------------------------------------------------------------------- 1 | #' Write/Read Cluster Text for Human Categorization 2 | #' 3 | #' Write cluster text from \code{get_text(assign_cluster(myfit))} to an external 4 | #' file for categorization. After file has been written with 5 | #' \code{write_cluster_text} a human coder can assign categories to each cluster. 6 | #' Simple write the category after the \code{Cluster #:}. To set a cluster category 7 | #' equal to another simply write and equal sign follwed by the other cluster to set 8 | #' as the same category (e.g., \code{Cluster 10: =5} to set cluster #10 the same as 9 | #' cluster #5). See \code{readLines(system.file("additional/foo_turk.txt", package = "clustext"))} 10 | #' for an example. 11 | #' 12 | #' @param x An \code{assign_cluster} object. 13 | #' @param path A pather to the file (.txt) is recommended. 14 | #' @param n.sample The length to limit the sample to (default gives all text in the cluster). 15 | #' Setting this to an integer uses this as the number to randomly sample from. 16 | #' @param lead A leading character string prefix to give the cluster text. 17 | #' @param \ldots ignored. 18 | #' @rdname write_cluster_text 19 | #' @export 20 | #' @seealso \code{\link[clustext]{categorize}} 21 | #' @examples 22 | #' library(dplyr) 23 | #' 24 | #' ## Assign Clusters 25 | #' ca <- presidential_debates_2012 %>% 26 | #' with(data_store(dialogue)) %>% 27 | #' hierarchical_cluster() %>% 28 | #' assign_cluster(k = 7) 29 | #' 30 | #' ## Write Cluster Text for Human Categorization 31 | #' write_cluster_text(ca) 32 | #' write_cluster_text(ca, n.sample=10) 33 | #' write_cluster_text(ca, lead=" -", n.sample=10) 34 | #' 35 | #' ## Read Human Coded Categories Back In 36 | #' categories_file <- system.file("additional/foo_turk.txt", package = "clustext") 37 | #' readLines(categories_file) 38 | #' (categories_key <- read_cluster_text(categories_file)) 39 | #' 40 | #' ## Add Categories Back to Original Data Set 41 | #' categorize( 42 | #' data = presidential_debates_2012, 43 | #' assign.cluster = ca, 44 | #' cluster.key = categories_key 45 | #' ) 46 | write_cluster_text <- function(x, path, n.sample = NULL, lead = " * ", ...){ 47 | 48 | stopifnot(methods::is(x, 'assign_cluster')) 49 | if (missing(path)) path <- "" 50 | 51 | 52 | y <- get_text(x) 53 | lens <- paste0("n = ", pn2(unlist(lapply(y, length)))) 54 | 55 | # get sample of text in each cluster 56 | if (!is.null(n.sample)){ 57 | y <- lapply(y, function(x) sample(x, min(length(x), n.sample))) 58 | } 59 | 60 | cls <- paste0("Cluster ", seq_along(y), ":") 61 | brd <- sapply(nchar(cls), function(x) paste(rep("=", x), collapse="")) 62 | 63 | if (!is.null(n.sample)){ 64 | cls <- paste(cls, lens, sep = "\n") 65 | } 66 | 67 | cat( 68 | gsub("^\\s+", "", 69 | paste( 70 | paste( 71 | paste("\n", brd, cls, brd, sep="\n"), 72 | sapply(lapply(y, function(z) paste0(lead, z)), paste, collapse="\n"), sep="\n" 73 | ), 74 | collapse="\n\n\n" 75 | ) 76 | ), "\n", 77 | file=path 78 | ) 79 | } 80 | 81 | 82 | #' @rdname write_cluster_text 83 | #' @export 84 | read_cluster_text <- function(path, ...){ 85 | 86 | x <- suppressWarnings(readLines(path)) 87 | y <- sub("\\s*:\\s*", "splitherenow", gsub("Cluster\\s*", "", grep("^Cluster", x, value=TRUE))) 88 | content <- lapply(strsplit(y, "splitherenow"), function(x){ 89 | out <- trimws(x) 90 | if (length(out) == 1) out <- c(out, NA) 91 | out 92 | }) 93 | 94 | key <- stats::setNames(data.frame(do.call(rbind, content), stringsAsFactors = FALSE), c('cluster', 'category')) 95 | 96 | repl <- sub("^=", "", grep("=\\d+", key[['category']], value=TRUE)) 97 | if (length(repl) > 0){ 98 | key[['category']][grep("=\\d+", key[['category']])] <- key[match(repl, key[["cluster"]]), "category"] 99 | } 100 | key[["cluster"]] <- as.integer(key[["cluster"]]) 101 | class(key) <- c('cluster_key', class(key)) 102 | key 103 | } 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "clustext" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | md_document: 6 | toc: true 7 | --- 8 | 9 | ```{r, echo=FALSE} 10 | library(knitr) 11 | desc <- suppressWarnings(readLines("DESCRIPTION")) 12 | regex <- "(^Version:\\s+)(\\d+\\.\\d+\\.\\d+)" 13 | loc <- grep(regex, desc) 14 | ver <- gsub(regex, "\\2", desc[loc]) 15 | verbadge <- sprintf('Version

', ver, ver) 16 | ```` 17 | 18 | 19 | ```{r, echo=FALSE} 20 | knit_hooks$set(htmlcap = function(before, options, envir) { 21 | if(!before) { 22 | paste('

',options$htmlcap,"

",sep="") 23 | } 24 | }) 25 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE) 26 | knitr::opts_chunk$set(fig.path = "tools/figure/") 27 | ``` 28 | 29 | [![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) 30 | [![Build Status](https://travis-ci.org/trinker/clustext.svg?branch=master)](https://travis-ci.org/trinker/clustext) 31 | [![Coverage Status](https://coveralls.io/repos/trinker/clustext/badge.svg?branch=master)](https://coveralls.io/r/trinker/clustext?branch=master) 32 | `r verbadge` 33 | 34 | ![](tools/clustext_logo/r_clustext.png) 35 | 36 | **clustext** is a collection of optimized tools for clustering text data via various text appropriate clustering algorithms. There are many great R [clustering tools](https://cran.r-project.org/web/views/Cluster.html) to locate topics within documents. I have had success with hierarchical clustering for topic extraction. This initial success birthed the [**hclustext**](https://github.com/trinker/hclustext) package. Additional techniques such as kmeans and non-negative matrix factorization also proved useful. These algorithms began to be collected in a consistent manor of use in the **clustext** package. This package wraps many of the great R tools for clustering and working with sparse matrices to aide in the workflow associated with topic extraction. 37 | 38 | The general idea is that we turn the documents into a matrix of words. After this we weight the terms by importance using [tf-idf](http://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html). This helps the more salient words to rise to the top. Some clustering algorithms require a similarity matrix while others require just the tf-idf weighted DocumentTermMatrices. Likewise, some algorithms require `k` terms to be specified before the model fit while others allow `k` topics to be determined after the model has been fit. 39 | 40 | 41 | With algorithms that require a similarity matrix (e.g., hierarchical clustering) we apply cosine distance measures to compare the terms (or features) of each document. I have found cosine distance to work well with sparse matrices to produce distances metrics between the documents. The clustering model is fit to separate the documents into clusters. In the case of some clustering techniques (e.g., hierarchical clustering) the user then may apply k clusters to the fit, clustering documents with similar important text features. Other techniques require that `k` be specified prior to fitting the model. The documents can then be grouped by clusters and their accompanying salient words extracted as well. 42 | 43 | # Functions 44 | 45 | The main functions, task category, & descriptions are summarized in the table below: 46 | 47 | | Function | Category | Description | 48 | |------------------------|----------------|-------------------------------------------------------------------------| 49 | | `data_store` | data structure | **clustext**'s data structure (list of dtm + text) | 50 | | `hierarchical_cluster` | cluster fit | Fits a hierarchical cluster model | 51 | | `kmeans_cluster` | cluster fit | Fits a kmeans cluster model | 52 | | `skmeans_cluster` | cluster fit | Fits an skmeans cluster model | 53 | | `nfm_cluster` | cluster fit | Fits a non-negative matrix factorization cluster model | 54 | | `assign_cluster` | assignment | Assigns cluster to document/text element | 55 | | `get_text` | extraction | Get text from various **clustext** objects | 56 | | `get_dtm` | extraction | Get `tm::DocumentTermMatrix` from various **clustext** objects | 57 | | `get_removed` | extraction | Get removed text elements from various **clustext** objects | 58 | | `get_documents` | extraction | Get clustered documents from an **assign_cluster** object | 59 | | `get_terms` | extraction | Get clustered weighted important terms from an **assign_cluster** object| 60 | | `as_topic` | categorization | View `get_terms` object as topics (pretty printed important words) | 61 | | `write_cluster_text` | categorization | Write `get_text(assign_cluster(myfit))` to file for human coding | 62 | | `read_cluster_text` | categorization | Read in a human coded `write_cluster_text` file | 63 | | `categorize` | categorization | Assign human categories and matching clusters to original data | 64 | 65 | # Installation 66 | 67 | To download the development version of **clustext**: 68 | 69 | Download the [zip ball](https://github.com/trinker/clustext/zipball/master) or [tar ball](https://github.com/trinker/clustext/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version: 70 | 71 | ```r 72 | if (!require("pacman")) install.packages("pacman") 73 | pacman::p_load_gh( 74 | "trinker/textshape", 75 | "trinker/gofastr", 76 | "trinker/termco", 77 | "trinker/clustext" 78 | ) 79 | ``` 80 | 81 | # Contact 82 | 83 | You are welcome to: 84 | * submit suggestions and bug-reports at: 85 | * send a pull request on: 86 | * compose a friendly e-mail to: 87 | 88 | # Demonstration 89 | 90 | ## Load Packages and Data 91 | 92 | ```{r} 93 | if (!require("pacman")) install.packages("pacman") 94 | pacman::p_load(clustext, dplyr, textshape, ggplot2, tidyr) 95 | 96 | data(presidential_debates_2012) 97 | ``` 98 | 99 | 100 | ## Data Structure 101 | 102 | The data structure for **clustext** is very specific. The `data_storage` produces a `DocumentTermMatrix` which maps to the original text. The empty/removed documents are tracked within this data structure, making subsequent calls to cluster the original documents and produce weighted important terms more robust. Making the `data_storage` object is the first step to analysis. 103 | 104 | We can give the `DocumentTermMatrix` rownames via the `doc.names` argument. If these names are not unique they will be combined into a single document as seen below. Also, if you want to do stemming, minimum character length, stopword removal or such this is when/where it's done. 105 | 106 | 107 | ```{r} 108 | ds <- with( 109 | presidential_debates_2012, 110 | data_store(dialogue, doc.names = paste(person, time, sep = "_")) 111 | ) 112 | 113 | ds 114 | ``` 115 | 116 | 117 | ## Fit the Model: Hierarchical Cluster 118 | 119 | Next we can fit a hierarchical cluster model to the `data_store` object via `hierarchical_cluster`. 120 | 121 | ```{r} 122 | myfit <- hierarchical_cluster(ds) 123 | 124 | myfit 125 | ``` 126 | 127 | 128 | This object can be plotted with various `k` or `h` parameters specified to experiment with cutting the dendrogram. This cut will determine the number of clusters or topics that will be generated in the next step. The visual inspection allows for determining how to cluster the data as well as determining if a tf-idf, cosine, hierarchical cluster model is a right fit for the data and task. By default `plot` uses an approximation of `k` based on Can & Ozkarahan's (1990) formula $(m * n)/t$ where $m$ and $n$ are the dimensions of the matrix and $t$ is the length of the non-zero elements in matrix $A$. 129 | 130 | - Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of the cover-coefficient-based clustering methodology for text databases. *ACM Transactions on Database Systems 15* (4): 483. doi:10.1145/99935.99938 131 | 132 | Interestingly, in the plots below where `k = 6` clusters, the model groups each of the candidates together at each of the debate times. 133 | 134 | 135 | ```{r} 136 | plot(myfit) 137 | plot(myfit, k=6) 138 | plot(myfit, h = .75) 139 | ``` 140 | 141 | ## Assigning Clusters 142 | 143 | The `assign_cluster` function allows the user to dictate the number of clusters. Because the model has already been fit the cluster assignment is merely selecting the branches from the dendrogram, and is thus very quick. Unlike many clustering techniques the number of clusters is done after the model is fit, this allows for speedy cluster assignment, meaning the user can experiment with the number of clusters. 144 | 145 | 146 | ```{r} 147 | ca <- assign_cluster(myfit, k = 6) 148 | 149 | ca 150 | ``` 151 | 152 | 153 | ### Cluster Loading 154 | 155 | To check the number of documents loading on a cluster there is a `summary` method for `assign_cluster` which provides a descending data frame of clusters and counts. Additionally, a horizontal bar plot shows the document loadings on each cluster. 156 | 157 | ```{r} 158 | summary(ca) 159 | ``` 160 | 161 | 162 | ### Cluster Text 163 | 164 | The user can grab the texts from the original documents grouped by cluster using the `get_text` function. Here I demo a 40 character substring of the document texts. 165 | 166 | ```{r} 167 | get_text(ca) %>% 168 | lapply(substring, 1, 40) 169 | ``` 170 | 171 | 172 | ### Cluster Frequent Terms 173 | 174 | As with many topic clustering techniques, it is useful to get the to salient terms from the model. The `get_terms` function uses the [min-max](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) scaled, [tf-idf weighted](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), `DocumentTermMatrix` to extract the most frequent salient terms. These terms can give a sense of the topic being discussed. Notice the absence of clusters 1 & 6. This is a result of only a single document included in each of the clusters. The `term.cutoff` hyperparmeter sets the lower bound on the min-max scaled tf-idf to accept. If you don't get any terms you may want to lower this or reduce `min.n`. Likewise, these two parameters can be raised to eliminate noise. 175 | 176 | ```{r} 177 | get_terms(ca) 178 | ``` 179 | 180 | Or pretty printed... 181 | 182 | ```{r} 183 | get_terms(ca) %>% 184 | as_topic() 185 | ``` 186 | 187 | ### Clusters, Terms, and Docs Plot 188 | 189 | Here I plot the clusters, terms, and documents (grouping variables) together as a combined heatmap. This can be useful for viewing & comparing what documents are clustering together in the context of the cluster's salient terms. This example also shows how to use the cluster terms as a lookup key to extract probable salient terms for a given document. 190 | 191 | ```{r, fig.width=11} 192 | key <- data_frame( 193 | cluster = 1:6, 194 | labs = get_terms(ca) %>% 195 | tidy_list("cluster") %>% 196 | select(-weight) %>% 197 | group_by(cluster) %>% 198 | summarize(term=paste(term, collapse=", ")) %>% 199 | apply(1, paste, collapse=": ") 200 | ) 201 | 202 | ca %>% 203 | tidy_vector("id", "cluster") %>% 204 | separate(id, c("person", "time"), sep="_") %>% 205 | tbl_df() %>% 206 | left_join(key, by = "cluster") %>% 207 | mutate(n = 1) %>% 208 | mutate(labs = factor(labs, levels=rev(key[["labs"]]))) %>% 209 | unite("time_person", time, person, sep="\n") %>% 210 | select(-cluster) %>% 211 | complete(time_person, labs) %>% 212 | mutate(n = factor(ifelse(is.na(n), FALSE, TRUE))) %>% 213 | ggplot(aes(time_person, labs, fill = n)) + 214 | geom_tile() + 215 | scale_fill_manual(values=c("grey90", "red"), guide=FALSE) + 216 | labs(x=NULL, y=NULL) 217 | ``` 218 | 219 | 220 | ### Cluster Documents 221 | 222 | The `get_documents` function grabs the documents associated with a particular cluster. This is most useful in cases where the number of documents is small and they have been given names. 223 | 224 | ```{r} 225 | get_documents(ca) 226 | ``` 227 | 228 | 229 | ## Putting it Together 230 | 231 | I like working in a chain. In the setup below we work within a **magrittr** pipeline to fit a model, select clusters, and examine the results. In this example I do not condense the 2012 Presidential Debates data by speaker and time, rather leaving every sentence as a separate document. On my machine the initial `data_store` and model fit take ~5-8 seconds to run. Note that I do restrict the number of clusters (for texts and terms) to a random 5 clusters for the sake of space. 232 | 233 | 234 | ```{r, fig.height = 10} 235 | .tic <- Sys.time() 236 | 237 | myfit2 <- presidential_debates_2012 %>% 238 | with(data_store(dialogue)) %>% 239 | hierarchical_cluster() 240 | 241 | difftime(Sys.time(), .tic) 242 | 243 | ## View Document Loadings 244 | ca2 <- assign_cluster(myfit2, k = 100) 245 | summary(ca2) %>% 246 | head(12) 247 | 248 | ## Split Text into Clusters 249 | set.seed(5); inds <- sort(sample.int(100, 5)) 250 | 251 | get_text(ca2)[inds] %>% 252 | lapply(head, 10) 253 | 254 | ## Get Associated Terms 255 | get_terms(ca2, .4)[inds] 256 | 257 | ## Pretty Printed Topics 258 | ## Get Associated Terms 259 | get_terms(ca2, .4) %>% 260 | as_topic() 261 | ``` 262 | 263 | 264 | ## An Experiment 265 | 266 | It seems to me that if the hierarchical clustering is function as expected we'd see topics clustering together within a conversation as the natural eb and flow of a conversation is to talk around a topic for a while and then move on to the next related topic. A Gantt style plot of topics across time seems like an excellent way to observe clustering across time. In the experiment I first ran the hierarchical clustering at the sentence level for all participants in the 2012 presidential debates data set. I then decided to use turn of talk as the unit of analysis. Finally, I pulled out the two candidates (President Obama and Romney) and faceted n their topic use over time. 267 | 268 | ```{r, fig.width=10, fig.height = 9} 269 | if (!require("pacman")) install.packages("pacman") 270 | pacman::p_load(dplyr, clustext, textshape, ggplot2, stringi) 271 | 272 | myfit3 <- presidential_debates_2012 %>% 273 | mutate(tot = gsub("\\..+$", "", tot)) %>% 274 | with(data_store(dialogue)) %>% 275 | hierarchical_cluster() 276 | 277 | plot(myfit3, 75) 278 | ``` 279 | 280 | Can & Ozkarahan's (1990) formula indicated a `k = 259`. This umber seemed overly large. I used `k = 75` for the number of topics as it seemed unreasonable that there'd be more topics than this but with `k = 75` over half of the sentences loaded on one cluster. Note the use of the `attribute` `join` from `assign_cluster` to make joining back to the original data set easier. 281 | 282 | 283 | ```{r, fig.width=14, fig.height = 16} 284 | k <- 75 285 | ca3 <- assign_cluster(myfit3, k = k) 286 | 287 | presidential_debates_2012 %>% 288 | mutate(tot = gsub("\\..+$", "", tot)) %>% 289 | tbl_df() %>% 290 | attributes(ca3)$join() %>% 291 | group_by(time) %>% 292 | mutate( 293 | word_count = stringi::stri_count_words(dialogue), 294 | start = starts(word_count), 295 | end = ends(word_count) 296 | ) %>% 297 | na.omit() %>% 298 | mutate(cluster = factor(cluster, levels = k:1)) %>% 299 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) + 300 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 301 | ggplot2::theme_bw() + 302 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 303 | panel.grid.minor.x = ggplot2::element_blank(), 304 | panel.grid.major.x = ggplot2::element_blank(), 305 | panel.grid.minor.y = ggplot2::element_blank(), 306 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 307 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 308 | strip.background = ggplot2::element_blank()) + 309 | ggplot2::facet_wrap(~time, scales='free', ncol=1) + 310 | ggplot2::labs(x="Duration (words)", y="Cluster") 311 | ``` 312 | 313 | Right away we notice that not all topics are used across all three times. This is encouraging that the clustering is working as expected as we'd expect some overlap in debate topics as well as some unique topics. However, there were so many topics clustering on cluster 3 that I had to make some decisions. I could (a) ignore this mass and essentially throw out half the data that loaded on a single cluster, (b) increase `k` to split up the mass loading on cluster 3, (c) change the unit of analysis. It seemed the first option was wasteful of data and could miss information. The second approach could lead to a model that had so many topics it wouldn't be meaningful. The last approach seemed reasonable, inspecting the cluster text showed that many were capturing functions of language rather than content. For example, people use *"Oh."* to indicate agreement. This isn't a topic but the clustering would group sentences that use this convention together. Combining this sentence with other sentences in the turn of talk are more likely to get the content we're after. 314 | 315 | Next I used the `textshape::combine` function to group turns of talk together. 316 | 317 | ```{r, fig.width=10, fig.height = 9} 318 | myfit4 <- presidential_debates_2012 %>% 319 | mutate(tot = gsub("\\..+$", "", tot)) %>% 320 | textshape::combine() %>% 321 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>% 322 | hierarchical_cluster() 323 | 324 | plot(myfit4, k = 80) 325 | ``` 326 | 327 | The distribution of turns of talk looked much more dispersed across clusters. I used `k = 60` for the number of topics. 328 | 329 | 330 | ```{r, fig.width=14, fig.height = 16} 331 | k <- 80 332 | ca4 <- assign_cluster(myfit4, k = k) 333 | 334 | presidential_debates_2012 %>% 335 | mutate(tot = gsub("\\..+$", "", tot)) %>% 336 | textshape::combine() %>% 337 | tbl_df() %>% 338 | attributes(ca4)$join() %>% 339 | group_by(time) %>% 340 | mutate( 341 | word_count = stringi::stri_count_words(dialogue), 342 | start = starts(word_count), 343 | end = ends(word_count) 344 | ) %>% 345 | na.omit() %>% 346 | mutate(cluster = factor(cluster, levels = k:1)) %>% 347 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) + 348 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 349 | ggplot2::theme_bw() + 350 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 351 | panel.grid.minor.x = ggplot2::element_blank(), 352 | panel.grid.major.x = ggplot2::element_blank(), 353 | panel.grid.minor.y = ggplot2::element_blank(), 354 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 355 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 356 | strip.background = ggplot2::element_blank()) + 357 | ggplot2::facet_wrap(~time, scales='free', ncol=1) + 358 | ggplot2::labs(x="Duration (words)", y="Cluster") 359 | ``` 360 | 361 | 362 | The plots looked less messy and indeed topics do appear to be clustering around one another. I wanted to see how the primary participants, the candidates, compared to each other in topic use. 363 | 364 | In this last bit of analysis I filter out all participants except Obama and Romeny and facet by participant across time. 365 | 366 | ```{r, fig.width=10, fig.height = 9} 367 | myfit5 <- presidential_debates_2012 %>% 368 | mutate(tot = gsub("\\..+$", "", tot)) %>% 369 | textshape::combine() %>% 370 | filter(person %in% c("ROMNEY", "OBAMA")) %>% 371 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>% 372 | hierarchical_cluster() 373 | 374 | 375 | plot(myfit5, 50) 376 | ``` 377 | 378 | Based on the dendrogram, I used `k = 50` for the number of topics. 379 | 380 | 381 | ```{r, fig.width=14, fig.height = 12} 382 | k <- 50 383 | ca5 <- assign_cluster(myfit5, k = k) 384 | 385 | presidential_debates_2012 %>% 386 | mutate(tot = gsub("\\..+$", "", tot)) %>% 387 | textshape::combine() %>% 388 | filter(person %in% c("ROMNEY", "OBAMA")) %>% 389 | tbl_df() %>% 390 | attributes(ca5)$join() %>% 391 | group_by(time) %>% 392 | mutate( 393 | word_count = stringi::stri_count_words(dialogue), 394 | start = starts(word_count), 395 | end = ends(word_count) 396 | ) %>% 397 | na.omit() %>% 398 | mutate(cluster = factor(cluster, levels = k:1)) %>% 399 | ggplot2::ggplot(ggplot2::aes(x = start-10, y = cluster, xend = end+10, yend = cluster)) + 400 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 401 | ggplot2::theme_bw() + 402 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 403 | panel.grid.minor.x = ggplot2::element_blank(), 404 | panel.grid.major.x = ggplot2::element_blank(), 405 | panel.grid.minor.y = ggplot2::element_blank(), 406 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 407 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 408 | strip.background = ggplot2::element_blank()) + 409 | ggplot2::facet_grid(person~time, scales='free', space='free') + 410 | ggplot2::labs(x="Duration (words)", y="Cluster") 411 | ``` 412 | 413 | 414 | If you're curious about the heaviest weighted tf-idf terms in each cluster the next code chunk provides the top five weighted terms used in each cluster. Below this I provide a bar plot of the frequencies of clusters to help put the other information into perspective. 415 | 416 | 417 | 418 | ```{r} 419 | invisible(Map(function(x, y){ 420 | 421 | if (is.null(x)) { 422 | cat(sprintf("Cluster %s: ...\n", y)) 423 | } else { 424 | m <- dplyr::top_n(x, 5, n) 425 | o <- paste(paste0(m[[1]], " (", round(m[[2]], 1), ")"), collapse="; ") 426 | cat(sprintf("Cluster %s: %s\n", y, o)) 427 | } 428 | 429 | }, get_terms(ca5, .4), names(get_terms(ca5, .4)))) 430 | 431 | ``` 432 | 433 | ```{r, fig.height = 9} 434 | invisible(summary(ca5)) 435 | ``` 436 | 437 | It appears that in fact the topics do cluster within segments of time as we'd expect. This is more apparent when turn of talk is used as the unit of analysis (document level) rather than each sentence. 438 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | clustext [![Follow](https://img.shields.io/twitter/follow/tylerrinker.svg?style=social)](https://twitter.com/intent/follow?screen_name=tylerrinker) 2 | ============ 3 | 4 | 5 | [![Project Status: Active - The project has reached a stable, usable 6 | state and is being actively 7 | developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) 8 | [![Build 9 | Status](https://travis-ci.org/trinker/clustext.svg?branch=master)](https://travis-ci.org/trinker/clustext) 10 | [![Coverage 11 | Status](https://coveralls.io/repos/trinker/clustext/badge.svg?branch=master)](https://coveralls.io/r/trinker/clustext?branch=master) 12 | Version 13 |

14 | 15 | ![](tools/clustext_logo/r_clustext.png) 16 | 17 | **clustext** is a collection of optimized tools for clustering text data 18 | via various text appropriate clustering algorithms. There are many great 19 | R [clustering tools](https://cran.r-project.org/web/views/Cluster.html) 20 | to locate topics within documents. I have had success with hierarchical 21 | clustering for topic extraction. This initial success birthed the 22 | [**hclustext**](https://github.com/trinker/hclustext) package. 23 | Additional techniques such as kmeans and non-negative matrix 24 | factorization also proved useful. These algorithms began to be collected 25 | in a consistent manor of use in the **clustext** package. This package 26 | wraps many of the great R tools for clustering and working with sparse 27 | matrices to aide in the workflow associated with topic extraction. 28 | 29 | The general idea is that we turn the documents into a matrix of words. 30 | After this we weight the terms by importance using 31 | [tf-idf](http://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html). 32 | This helps the more salient words to rise to the top. Some clustering 33 | algorithms require a similarity matrix while others require just the 34 | tf-idf weighted DocumentTermMatrices. Likewise, some algorithms require 35 | `k` terms to be specified before the model fit while others allow `k` 36 | topics to be determined after the model has been fit. 37 | 38 | With algorithms that require a similarity matrix (e.g., hierarchical 39 | clustering) we apply cosine distance measures to compare the terms (or 40 | features) of each document. I have found cosine distance to work well 41 | with sparse matrices to produce distances metrics between the documents. 42 | The clustering model is fit to separate the documents into clusters. In 43 | the case of some clustering techniques (e.g., hierarchical clustering) 44 | the user then may apply k clusters to the fit, clustering documents with 45 | similar important text features. Other techniques require that `k` be 46 | specified prior to fitting the model. The documents can then be grouped 47 | by clusters and their accompanying salient words extracted as well. 48 | 49 | 50 | Table of Contents 51 | ============ 52 | 53 | - [Functions](#functions) 54 | - [Installation](#installation) 55 | - [Contact](#contact) 56 | - [Demonstration](#demonstration) 57 | - [Load Packages and Data](#load-packages-and-data) 58 | - [Data Structure](#data-structure) 59 | - [Fit the Model: Hierarchical Cluster](#fit-the-model-hierarchical-cluster) 60 | - [Assigning Clusters](#assigning-clusters) 61 | - [Cluster Loading](#cluster-loading) 62 | - [Cluster Text](#cluster-text) 63 | - [Cluster Frequent Terms](#cluster-frequent-terms) 64 | - [Clusters, Terms, and Docs Plot](#clusters-terms-and-docs-plot) 65 | - [Cluster Documents](#cluster-documents) 66 | - [Putting it Together](#putting-it-together) 67 | - [An Experiment](#an-experiment) 68 | 69 | Functions 70 | ============ 71 | 72 | 73 | The main functions, task category, & descriptions are summarized in the 74 | table below: 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 |
FunctionCategoryDescription
data_storedata structureclustext's data structure (list of dtm + text)
hierarchical_clustercluster fitFits a hierarchical cluster model
kmeans_clustercluster fitFits a kmeans cluster model
skmeans_clustercluster fitFits an skmeans cluster model
nfm_clustercluster fitFits a non-negative matrix factorization cluster model
assign_clusterassignmentAssigns cluster to document/text element
get_textextractionGet text from various clustext objects
get_dtmextractionGet tm::DocumentTermMatrix from various clustext objects
get_removedextractionGet removed text elements from various clustext objects
get_documentsextractionGet clustered documents from an assign_cluster object
get_termsextractionGet clustered weighted important terms from an assign_cluster object
as_topiccategorizationView get_terms object as topics (pretty printed important words)
write_cluster_textcategorizationWrite get_text(assign_cluster(myfit)) to file for human coding
read_cluster_textcategorizationRead in a human coded write_cluster_text file
categorizecategorizationAssign human categories and matching clusters to original data
167 | 168 | Installation 169 | ============ 170 | 171 | To download the development version of **clustext**: 172 | 173 | Download the [zip 174 | ball](https://github.com/trinker/clustext/zipball/master) or [tar 175 | ball](https://github.com/trinker/clustext/tarball/master), decompress 176 | and run `R CMD INSTALL` on it, or use the **pacman** package to install 177 | the development version: 178 | 179 | if (!require("pacman")) install.packages("pacman") 180 | pacman::p_load_gh( 181 | "trinker/textshape", 182 | "trinker/gofastr", 183 | "trinker/termco", 184 | "trinker/clustext" 185 | ) 186 | 187 | Contact 188 | ======= 189 | 190 | You are welcome to: 191 | - submit suggestions and bug-reports at: 192 | - send a pull request on: 193 | - compose a friendly e-mail to: 194 | 195 | Demonstration 196 | ============= 197 | 198 | Load Packages and Data 199 | ---------------------- 200 | 201 | if (!require("pacman")) install.packages("pacman") 202 | pacman::p_load(clustext, dplyr, textshape, ggplot2, tidyr) 203 | 204 | data(presidential_debates_2012) 205 | 206 | Data Structure 207 | -------------- 208 | 209 | The data structure for **clustext** is very specific. The `data_storage` 210 | produces a `DocumentTermMatrix` which maps to the original text. The 211 | empty/removed documents are tracked within this data structure, making 212 | subsequent calls to cluster the original documents and produce weighted 213 | important terms more robust. Making the `data_storage` object is the 214 | first step to analysis. 215 | 216 | We can give the `DocumentTermMatrix` rownames via the `doc.names` 217 | argument. If these names are not unique they will be combined into a 218 | single document as seen below. Also, if you want to do stemming, minimum 219 | character length, stopword removal or such this is when/where it's done. 220 | 221 | ds <- with( 222 | presidential_debates_2012, 223 | data_store(dialogue, doc.names = paste(person, time, sep = "_")) 224 | ) 225 | 226 | ds 227 | 228 | ## <> 229 | ## Text Elements : 10 230 | ## Elements Removed : 0 231 | ## Non-/sparse entries: 6916/24884 232 | ## Sparsity : 78% 233 | ## Maximal term length: 16 234 | ## Minimum term length: 3 235 | 236 | Fit the Model: Hierarchical Cluster 237 | ----------------------------------- 238 | 239 | Next we can fit a hierarchical cluster model to the `data_store` object 240 | via `hierarchical_cluster`. 241 | 242 | myfit <- hierarchical_cluster(ds) 243 | 244 | myfit 245 | 246 | ## 247 | ## Call: 248 | ## fastcluster::hclust(d = distmes(x[["dtm"]]), method = method) 249 | ## 250 | ## Cluster method : ward.D 251 | ## Number of objects: 10 252 | 253 | This object can be plotted with various `k` or `h` parameters specified 254 | to experiment with cutting the dendrogram. This cut will determine the 255 | number of clusters or topics that will be generated in the next step. 256 | The visual inspection allows for determining how to cluster the data as 257 | well as determining if a tf-idf, cosine, hierarchical cluster model is a 258 | right fit for the data and task. By default `plot` uses an approximation 259 | of `k` based on Can & Ozkarahan's (1990) formula (*m* \* *n*)/*t* where 260 | *m* and *n* are the dimensions of the matrix and *t* is the length of 261 | the non-zero elements in matrix *A*. 262 | 263 | - Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of the 264 | cover-coefficient-based clustering methodology for text databases. 265 | *ACM Transactions on Database Systems 15* (4): 483. 266 | 267 | 268 | Interestingly, in the plots below where `k = 6` clusters, the model 269 | groups each of the candidates together at each of the debate times. 270 | 271 | plot(myfit) 272 | 273 | ## 274 | ## k approximated to: 5 275 | 276 | ![](tools/figure/unnamed-chunk-6-1.png) 277 | 278 | plot(myfit, k=6) 279 | 280 | ![](tools/figure/unnamed-chunk-6-2.png) 281 | 282 | plot(myfit, h = .75) 283 | 284 | ![](tools/figure/unnamed-chunk-6-3.png) 285 | 286 | Assigning Clusters 287 | ------------------ 288 | 289 | The `assign_cluster` function allows the user to dictate the number of 290 | clusters. Because the model has already been fit the cluster assignment 291 | is merely selecting the branches from the dendrogram, and is thus very 292 | quick. Unlike many clustering techniques the number of clusters is done 293 | after the model is fit, this allows for speedy cluster assignment, 294 | meaning the user can experiment with the number of clusters. 295 | 296 | ca <- assign_cluster(myfit, k = 6) 297 | 298 | ca 299 | 300 | ## CROWLEY_time 2 LEHRER_time 1 OBAMA_time 1 OBAMA_time 2 301 | ## 1 2 3 4 302 | ## OBAMA_time 3 QUESTION_time 2 ROMNEY_time 1 ROMNEY_time 2 303 | ## 5 6 3 4 304 | ## ROMNEY_time 3 SCHIEFFER_time 3 305 | ## 5 2 306 | 307 | ### Cluster Loading 308 | 309 | To check the number of documents loading on a cluster there is a 310 | `summary` method for `assign_cluster` which provides a descending data 311 | frame of clusters and counts. Additionally, a horizontal bar plot shows 312 | the document loadings on each cluster. 313 | 314 | summary(ca) 315 | 316 | ![](tools/figure/unnamed-chunk-8-1.png) 317 | 318 | ## cluster count 319 | ## 1 2 2 320 | ## 2 3 2 321 | ## 3 4 2 322 | ## 4 5 2 323 | ## 5 1 1 324 | ## 6 6 1 325 | 326 | ### Cluster Text 327 | 328 | The user can grab the texts from the original documents grouped by 329 | cluster using the `get_text` function. Here I demo a 40 character 330 | substring of the document texts. 331 | 332 | get_text(ca) %>% 333 | lapply(substring, 1, 40) 334 | 335 | ## $`1` 336 | ## [1] "Good evening from Hofstra University in " 337 | ## 338 | ## $`2` 339 | ## [1] "We'll talk about specifically about heal" 340 | ## [2] "Good evening from the campus of Lynn Uni" 341 | ## 342 | ## $`3` 343 | ## [1] "Jim, if I if I can just respond very qui" 344 | ## [2] "What I support is no change for current " 345 | ## 346 | ## $`4` 347 | ## [1] "Jeremy, first of all, your future is bri" 348 | ## [2] "Thank you, Jeremy. I appreciate your you" 349 | ## 350 | ## $`5` 351 | ## [1] "Well, my first job as commander in chief" 352 | ## [2] "Thank you, Bob. And thank you for agreei" 353 | ## 354 | ## $`6` 355 | ## [1] "Mister President, Governor Romney, as a " 356 | 357 | ### Cluster Frequent Terms 358 | 359 | As with many topic clustering techniques, it is useful to get the to 360 | salient terms from the model. The `get_terms` function uses the 361 | [min-max](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) 362 | scaled, [tf-idf weighted](https://en.wikipedia.org/wiki/Tf%E2%80%93idf), 363 | `DocumentTermMatrix` to extract the most frequent salient terms. These 364 | terms can give a sense of the topic being discussed. Notice the absence 365 | of clusters 1 & 6. This is a result of only a single document included 366 | in each of the clusters. The `term.cutoff` hyperparmeter sets the lower 367 | bound on the min-max scaled tf-idf to accept. If you don't get any terms 368 | you may want to lower this or reduce `min.n`. Likewise, these two 369 | parameters can be raised to eliminate noise. 370 | 371 | get_terms(ca) 372 | 373 | ## $`1 (n=1)` 374 | ## term weight 375 | ## 1 mister 1.0000000 376 | ## 2 along 0.7086841 377 | ## 3 sort 0.6678306 378 | ## 4 unemployed 0.6223915 379 | ## 380 | ## $`2 (n=2)` 381 | ## term weight 382 | ## 1 segment 1.0000000 383 | ## 2 minutes 0.9091730 384 | ## 3 minute 0.6648988 385 | ## 386 | ## $`3 (n=2)` 387 | ## term weight 388 | ## 1 insurance 1.0000000 389 | ## 2 health 0.6200389 390 | ## 391 | ## $`4 (n=2)` 392 | ## term weight 393 | ## 1 coal 1.0000000 394 | ## 2 jobs 0.9439400 395 | ## 3 sure 0.9330092 396 | ## 4 immigration 0.9134630 397 | ## 5 oil 0.9014907 398 | ## 6 issue 0.7352300 399 | ## 7 candy 0.7303597 400 | ## 8 production 0.7291683 401 | ## 9 women 0.7073096 402 | ## 10 million 0.6792076 403 | ## 11 settle 0.6056192 404 | ## 12 illegally 0.6055244 405 | ## 406 | ## $`5 (n=2)` 407 | ## term weight 408 | ## 1 nuclear 1.0000000 409 | ## 2 iran 0.9511527 410 | ## 3 sanctions 0.8585336 411 | ## 4 israel 0.7895173 412 | ## 5 sure 0.7698270 413 | ## 6 region 0.7608304 414 | ## 7 military 0.7272537 415 | ## 8 troops 0.6768143 416 | ## 9 pakistan 0.6766784 417 | ## 10 world 0.6716568 418 | ## 11 threat 0.6520238 419 | ## 12 iraq 0.6467488 420 | ## 421 | ## $`6 (n=1)` 422 | ## term weight 423 | ## 1 department 1.0000000 424 | ## 2 chu 0.6666667 425 | ## 3 stated 0.6666667 426 | ## 4 misperception 0.6666667 427 | 428 | Or pretty printed... 429 | 430 | get_terms(ca) %>% 431 | as_topic() 432 | 433 | ## Cluster 2 (n=2): segment, minutes, minute 434 | ## Cluster 3 (n=2): insurance, health 435 | ## Cluster 4 (n=2): coal, jobs, sure, immigration, oil, issue, candy, production... 436 | ## Cluster 5 (n=2): nuclear, iran, sanctions, israel, sure, region, military, troops... 437 | ## Cluster 1 (n=1): mister, along, sort, unemployed 438 | ## Cluster 6 (n=1): department, chu, stated, misperception 439 | 440 | ### Clusters, Terms, and Docs Plot 441 | 442 | Here I plot the clusters, terms, and documents (grouping variables) 443 | together as a combined heatmap. This can be useful for viewing & 444 | comparing what documents are clustering together in the context of the 445 | cluster's salient terms. This example also shows how to use the cluster 446 | terms as a lookup key to extract probable salient terms for a given 447 | document. 448 | 449 | key <- data_frame( 450 | cluster = 1:6, 451 | labs = get_terms(ca) %>% 452 | tidy_list("cluster") %>% 453 | select(-weight) %>% 454 | group_by(cluster) %>% 455 | summarize(term=paste(term, collapse=", ")) %>% 456 | apply(1, paste, collapse=": ") 457 | ) 458 | 459 | ca %>% 460 | tidy_vector("id", "cluster") %>% 461 | separate(id, c("person", "time"), sep="_") %>% 462 | tbl_df() %>% 463 | left_join(key, by = "cluster") %>% 464 | mutate(n = 1) %>% 465 | mutate(labs = factor(labs, levels=rev(key[["labs"]]))) %>% 466 | unite("time_person", time, person, sep="\n") %>% 467 | select(-cluster) %>% 468 | complete(time_person, labs) %>% 469 | mutate(n = factor(ifelse(is.na(n), FALSE, TRUE))) %>% 470 | ggplot(aes(time_person, labs, fill = n)) + 471 | geom_tile() + 472 | scale_fill_manual(values=c("grey90", "red"), guide=FALSE) + 473 | labs(x=NULL, y=NULL) 474 | 475 | ![](tools/figure/unnamed-chunk-12-1.png) 476 | 477 | ### Cluster Documents 478 | 479 | The `get_documents` function grabs the documents associated with a 480 | particular cluster. This is most useful in cases where the number of 481 | documents is small and they have been given names. 482 | 483 | get_documents(ca) 484 | 485 | ## $`1` 486 | ## [1] "CROWLEY_time 2" 487 | ## 488 | ## $`2` 489 | ## [1] "LEHRER_time 1" "SCHIEFFER_time 3" 490 | ## 491 | ## $`3` 492 | ## [1] "OBAMA_time 1" "ROMNEY_time 1" 493 | ## 494 | ## $`4` 495 | ## [1] "OBAMA_time 2" "ROMNEY_time 2" 496 | ## 497 | ## $`5` 498 | ## [1] "OBAMA_time 3" "ROMNEY_time 3" 499 | ## 500 | ## $`6` 501 | ## [1] "QUESTION_time 2" 502 | 503 | Putting it Together 504 | ------------------- 505 | 506 | I like working in a chain. In the setup below we work within a 507 | **magrittr** pipeline to fit a model, select clusters, and examine the 508 | results. In this example I do not condense the 2012 Presidential Debates 509 | data by speaker and time, rather leaving every sentence as a separate 510 | document. On my machine the initial `data_store` and model fit take ~5-8 511 | seconds to run. Note that I do restrict the number of clusters (for 512 | texts and terms) to a random 5 clusters for the sake of space. 513 | 514 | .tic <- Sys.time() 515 | 516 | myfit2 <- presidential_debates_2012 %>% 517 | with(data_store(dialogue)) %>% 518 | hierarchical_cluster() 519 | 520 | difftime(Sys.time(), .tic) 521 | 522 | ## Time difference of 5.450191 secs 523 | 524 | ## View Document Loadings 525 | ca2 <- assign_cluster(myfit2, k = 100) 526 | summary(ca2) %>% 527 | head(12) 528 | 529 | ![](tools/figure/unnamed-chunk-14-1.png) 530 | 531 | ## cluster count 532 | ## 1 2 1409 533 | ## 2 25 54 534 | ## 3 15 50 535 | ## 4 39 46 536 | ## 5 61 39 537 | ## 6 36 37 538 | ## 7 40 33 539 | ## 8 17 31 540 | ## 9 31 29 541 | ## 10 37 28 542 | ## 11 27 25 543 | ## 12 46 23 544 | 545 | ## Split Text into Clusters 546 | set.seed(5); inds <- sort(sample.int(100, 5)) 547 | 548 | get_text(ca2)[inds] %>% 549 | lapply(head, 10) 550 | 551 | ## $`11` 552 | ## [1] "Yeah, we're going to yeah, I want to get to it." 553 | ## [2] "And do you yeah." 554 | ## [3] "Yeah, you bet." 555 | ## [4] "Yeah." 556 | ## 557 | ## $`21` 558 | ## [1] "Regulation is essential." 559 | ## [2] "You can't have a free market work if you don't have regulation." 560 | ## [3] "Every free economy has good regulation." 561 | ## [4] "You have to have regulation." 562 | ## [5] "Now, it wasn't just on Wall Street." 563 | ## [6] "We stepped in and had the toughest reforms on Wall Street since the one thousand nine hundred thirtys." 564 | ## [7] "And so the question is: Does anybody out there think that the big problem we had is that there was too much oversight and regulation of Wall Street?" 565 | ## [8] "Look, we have to have regulation on Wall Street." 566 | ## [9] "That's why I'd have regulation." 567 | ## [10] "I committed that I would rein in the excesses of Wall Street, and we passed the toughest Wall Street reforms since the one thousand nine hundred thirtys." 568 | ## 569 | ## $`28` 570 | ## [1] "We said you've got banks, you've got to raise your capital requirements." 571 | ## [2] "Well, actually it's it's it's a lengthy description." 572 | ## [3] "Well, actually Governor, that isn't what your plan does." 573 | ## [4] "Number two, we've got to make sure that we have the best education system in the world." 574 | ## [5] "We've got to reduce our deficit, but we've got to do it in a balanced way." 575 | ## [6] "I got to I got to move you on." 576 | ## [7] "He actually got|" 577 | ## [8] "He actually got the first question." 578 | ## [9] "Governor Romney, I'm sure you've got a reply there." 579 | ## [10] "I've got to say|" 580 | ## 581 | ## $`68` 582 | ## [1] "That's how we went after Al Qaida and bin Laden." 583 | ## [2] "I said that we'd go after al Qaeda and bin Laden, we have." 584 | ## [3] "I congratulate him on on taking out Osama bin Laden and going after the leadership in al Qaeda." 585 | ## [4] "When it comes to going after Osama bin Laden, you said, well, any president would make that call." 586 | ## [5] "And she said to me, You know, by finally getting bin Laden, that brought some closure to me." 587 | ## [6] "We had to go in there to get Osama bin Laden." 588 | ## [7] "Well, keep in mind our strategy wasn't just going after bin Laden." 589 | ## 590 | ## $`90` 591 | ## [1] "And finally, championing small business." 592 | ## [2] "And then let's take the last one, championing small business." 593 | ## [3] "I came through small business." 594 | ## [4] "I understand how hard it is to start a small business." 595 | ## [5] "I want to keep their taxes down on small business." 596 | ## [6] "So if you're starting a business, where would you rather start it?" 597 | ## [7] "But, of course, if you're a small business or a mom and pop business or a big business starting up here, you've got to pay even the reduced rate that Governor Romney's talking about." 598 | ## [8] "And finally, number five, we've got to champion small business." 599 | ## [9] "Small business is where jobs come from." 600 | ## [10] "That's not the kind of small business promotion we need." 601 | 602 | ## Get Associated Terms 603 | get_terms(ca2, .4)[inds] 604 | 605 | ## $`11` 606 | ## term weight 607 | ## 1 yeah 1 608 | ## 609 | ## $`21` 610 | ## term weight 611 | ## 1 regulation 1 612 | ## 613 | ## $`28` 614 | ## term weight 615 | ## 1 got 1.0000000 616 | ## 2 actually 0.5421049 617 | ## 618 | ## $`68` 619 | ## term weight 620 | ## 1 bin 1.000000 621 | ## 2 laden 1.000000 622 | ## 3 osama 0.449873 623 | ## 624 | ## $`90` 625 | ## term weight 626 | ## 1 business 1.000000 627 | ## 2 small 0.679014 628 | 629 | ## Pretty Printed Topics 630 | ## Get Associated Terms 631 | get_terms(ca2, .4) %>% 632 | as_topic() 633 | 634 | ## Cluster 2 (n=1409): going, people, said, can, make, one, governor, get... 635 | ## Cluster 25 (n=54): need, keep, thought, indict, speak, progress, trying, standard... 636 | ## Cluster 15 (n=50): want, thirty, now, leave, bigger 637 | ## Cluster 39 (n=46): jobs, investing 638 | ## Cluster 61 (n=39): education, doubt, oil, lands, public 639 | ## Cluster 36 (n=37): thousand, hundred, eighty, two 640 | ## Cluster 40 (n=33): middle, class, east, certainly 641 | ## Cluster 17 (n=31): israel, part, experience, united, way, states 642 | ## Cluster 31 (n=29): years, four, last 643 | ## Cluster 37 (n=28): tax, deductions, amount 644 | ## Cluster 27 (n=25): military, maintaining, spending 645 | ## Cluster 46 (n=23): important, gone, think, mistake, tough 646 | ## Cluster 62 (n=23): thank, gentlemen 647 | ## Cluster 10 (n=22): talk, two, minute, minutes, toss 648 | ## Cluster 14 (n=21): back, medicare, come, manufacturing 649 | ## Cluster 16 (n=21): right 650 | ## Cluster 22 (n=21): example, chance, give, state, let 651 | ## Cluster 55 (n=21): mister, president 652 | ## Cluster 30 (n=20): happened, exactly 653 | ## Cluster 65 (n=20): governor 654 | ## Cluster 81 (n=20): china, problem, compete 655 | ## Cluster 19 (n=19): economy, grows 656 | ## Cluster 35 (n=19): deal, trade 657 | ## Cluster 42 (n=19): percent, seven 658 | ## Cluster 44 (n=19): trillion, dollar 659 | ## Cluster 84 (n=19): first, year 660 | ## Cluster 1 (n=18): care, health 661 | ## Cluster 24 (n=18): get, jeremy, rid 662 | ## Cluster 28 (n=18): got, actually 663 | ## Cluster 54 (n=18): role, leadership, government, kind, shown 664 | ## Cluster 70 (n=18): bush, different 665 | ## Cluster 26 (n=17): respond, will, lose 666 | ## Cluster 32 (n=17): future, bright, critical 667 | ## Cluster 76 (n=17): energy 668 | ## Cluster 34 (n=16): sure, make, job, college 669 | ## Cluster 43 (n=16): taxes, cut 670 | ## Cluster 13 (n=15): please, ask, repealed, quickly 671 | ## Cluster 67 (n=15): war, end, iraq 672 | ## Cluster 4 (n=14): one, number 673 | ## Cluster 63 (n=14): grow, planning 674 | ## Cluster 79 (n=14): coal 675 | ## Cluster 23 (n=13): dodd, frank, repeal 676 | ## Cluster 38 (n=13): three, twenty, million 677 | ## Cluster 56 (n=13): question 678 | ## Cluster 78 (n=13): true 679 | ## Cluster 96 (n=13): iran, bomb, closer, nuclear 680 | ## Cluster 21 (n=12): regulation 681 | ## Cluster 47 (n=12): let 682 | ## Cluster 51 (n=12): stand, principles 683 | ## Cluster 69 (n=12): election, course 684 | ## Cluster 72 (n=12): thank 685 | ## Cluster 6 (n=11): happen 686 | ## Cluster 57 (n=11): difference 687 | ## Cluster 59 (n=11): happy, teachers 688 | ## Cluster 74 (n=11): know 689 | ## Cluster 89 (n=11): balanced, budget 690 | ## Cluster 90 (n=11): business, small 691 | ## Cluster 33 (n=10): romney, governor 692 | ## Cluster 45 (n=10): plan 693 | ## Cluster 75 (n=10): detroit, bankrupt 694 | ## Cluster 77 (n=10): policies 695 | ## Cluster 83 (n=10): still, speaking 696 | ## Cluster 95 (n=10): policy, failure, foreign 697 | ## Cluster 98 (n=10): sanctions, tighten, crippling 698 | ## Cluster 7 (n=9): private 699 | ## Cluster 8 (n=9): high 700 | ## Cluster 66 (n=9): done 701 | ## Cluster 82 (n=9): answer, straightforward 702 | ## Cluster 92 (n=9): said 703 | ## Cluster 100 (n=9): strong 704 | ## Cluster 3 (n=8): billion, dollar, ninety, sixteen 705 | ## Cluster 18 (n=8): absolutely 706 | ## Cluster 41 (n=8): tell 707 | ## Cluster 50 (n=8): move, along 708 | ## Cluster 73 (n=8): good, night 709 | ## Cluster 80 (n=8): production 710 | ## Cluster 86 (n=8): create, jobs 711 | ## Cluster 5 (n=7): choice 712 | ## Cluster 9 (n=7): point, last, make 713 | ## Cluster 49 (n=7): wrong 714 | ## Cluster 53 (n=7): see 715 | ## Cluster 60 (n=7): stamps, food, million 716 | ## Cluster 64 (n=7): excuse, sir 717 | ## Cluster 68 (n=7): bin, laden, osama 718 | ## Cluster 71 (n=7): record, check 719 | ## Cluster 97 (n=7): bob 720 | ## Cluster 29 (n=6): believe 721 | ## Cluster 85 (n=6): candy 722 | ## Cluster 91 (n=6): understand 723 | ## Cluster 48 (n=5): well 724 | ## Cluster 58 (n=5): great 725 | ## Cluster 87 (n=5): thanks 726 | ## Cluster 88 (n=5): yes 727 | ## Cluster 93 (n=5): lorraine 728 | ## Cluster 94 (n=5): pension, looked 729 | ## Cluster 11 (n=4): yeah 730 | ## Cluster 12 (n=4): sorry 731 | ## Cluster 20 (n=4): much, cut 732 | ## Cluster 52 (n=4): time 733 | ## Cluster 99 (n=4): work 734 | 735 | An Experiment 736 | ------------- 737 | 738 | It seems to me that if the hierarchical clustering is function as 739 | expected we'd see topics clustering together within a conversation as 740 | the natural eb and flow of a conversation is to talk around a topic for 741 | a while and then move on to the next related topic. A Gantt style plot 742 | of topics across time seems like an excellent way to observe clustering 743 | across time. In the experiment I first ran the hierarchical clustering 744 | at the sentence level for all participants in the 2012 presidential 745 | debates data set. I then decided to use turn of talk as the unit of 746 | analysis. Finally, I pulled out the two candidates (President Obama and 747 | Romney) and faceted n their topic use over time. 748 | 749 | if (!require("pacman")) install.packages("pacman") 750 | pacman::p_load(dplyr, clustext, textshape, ggplot2, stringi) 751 | 752 | myfit3 <- presidential_debates_2012 %>% 753 | mutate(tot = gsub("\\..+$", "", tot)) %>% 754 | with(data_store(dialogue)) %>% 755 | hierarchical_cluster() 756 | 757 | plot(myfit3, 75) 758 | 759 | ![](tools/figure/unnamed-chunk-15-1.png) 760 | 761 | Can & Ozkarahan's (1990) formula indicated a `k = 259`. This umber 762 | seemed overly large. I used `k = 75` for the number of topics as it 763 | seemed unreasonable that there'd be more topics than this but with 764 | `k = 75` over half of the sentences loaded on one cluster. Note the use 765 | of the `attribute` `join` from `assign_cluster` to make joining back to 766 | the original data set easier. 767 | 768 | k <- 75 769 | ca3 <- assign_cluster(myfit3, k = k) 770 | 771 | presidential_debates_2012 %>% 772 | mutate(tot = gsub("\\..+$", "", tot)) %>% 773 | tbl_df() %>% 774 | attributes(ca3)$join() %>% 775 | group_by(time) %>% 776 | mutate( 777 | word_count = stringi::stri_count_words(dialogue), 778 | start = starts(word_count), 779 | end = ends(word_count) 780 | ) %>% 781 | na.omit() %>% 782 | mutate(cluster = factor(cluster, levels = k:1)) %>% 783 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) + 784 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 785 | ggplot2::theme_bw() + 786 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 787 | panel.grid.minor.x = ggplot2::element_blank(), 788 | panel.grid.major.x = ggplot2::element_blank(), 789 | panel.grid.minor.y = ggplot2::element_blank(), 790 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 791 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 792 | strip.background = ggplot2::element_blank()) + 793 | ggplot2::facet_wrap(~time, scales='free', ncol=1) + 794 | ggplot2::labs(x="Duration (words)", y="Cluster") 795 | 796 | ## Warning: Ignoring unknown aesthetics: position 797 | 798 | ![](tools/figure/unnamed-chunk-16-1.png) 799 | 800 | Right away we notice that not all topics are used across all three 801 | times. This is encouraging that the clustering is working as expected as 802 | we'd expect some overlap in debate topics as well as some unique topics. 803 | However, there were so many topics clustering on cluster 3 that I had to 804 | make some decisions. I could (a) ignore this mass and essentially throw 805 | out half the data that loaded on a single cluster, (b) increase `k` to 806 | split up the mass loading on cluster 3, (c) change the unit of analysis. 807 | It seemed the first option was wasteful of data and could miss 808 | information. The second approach could lead to a model that had so many 809 | topics it wouldn't be meaningful. The last approach seemed reasonable, 810 | inspecting the cluster text showed that many were capturing functions of 811 | language rather than content. For example, people use *"Oh."* to 812 | indicate agreement. This isn't a topic but the clustering would group 813 | sentences that use this convention together. Combining this sentence 814 | with other sentences in the turn of talk are more likely to get the 815 | content we're after. 816 | 817 | Next I used the `textshape::combine` function to group turns of talk 818 | together. 819 | 820 | myfit4 <- presidential_debates_2012 %>% 821 | mutate(tot = gsub("\\..+$", "", tot)) %>% 822 | textshape::combine() %>% 823 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>% 824 | hierarchical_cluster() 825 | 826 | plot(myfit4, k = 80) 827 | 828 | ![](tools/figure/unnamed-chunk-17-1.png) 829 | 830 | The distribution of turns of talk looked much more dispersed across 831 | clusters. I used `k = 60` for the number of topics. 832 | 833 | k <- 80 834 | ca4 <- assign_cluster(myfit4, k = k) 835 | 836 | presidential_debates_2012 %>% 837 | mutate(tot = gsub("\\..+$", "", tot)) %>% 838 | textshape::combine() %>% 839 | tbl_df() %>% 840 | attributes(ca4)$join() %>% 841 | group_by(time) %>% 842 | mutate( 843 | word_count = stringi::stri_count_words(dialogue), 844 | start = starts(word_count), 845 | end = ends(word_count) 846 | ) %>% 847 | na.omit() %>% 848 | mutate(cluster = factor(cluster, levels = k:1)) %>% 849 | ggplot2::ggplot(ggplot2::aes(x = start-2, y = cluster, xend = end+2, yend = cluster)) + 850 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 851 | ggplot2::theme_bw() + 852 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 853 | panel.grid.minor.x = ggplot2::element_blank(), 854 | panel.grid.major.x = ggplot2::element_blank(), 855 | panel.grid.minor.y = ggplot2::element_blank(), 856 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 857 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 858 | strip.background = ggplot2::element_blank()) + 859 | ggplot2::facet_wrap(~time, scales='free', ncol=1) + 860 | ggplot2::labs(x="Duration (words)", y="Cluster") 861 | 862 | ## Warning: Ignoring unknown aesthetics: position 863 | 864 | ![](tools/figure/unnamed-chunk-18-1.png) 865 | 866 | The plots looked less messy and indeed topics do appear to be clustering 867 | around one another. I wanted to see how the primary participants, the 868 | candidates, compared to each other in topic use. 869 | 870 | In this last bit of analysis I filter out all participants except Obama 871 | and Romeny and facet by participant across time. 872 | 873 | myfit5 <- presidential_debates_2012 %>% 874 | mutate(tot = gsub("\\..+$", "", tot)) %>% 875 | textshape::combine() %>% 876 | filter(person %in% c("ROMNEY", "OBAMA")) %>% 877 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) %>% 878 | hierarchical_cluster() 879 | 880 | 881 | plot(myfit5, 50) 882 | 883 | ![](tools/figure/unnamed-chunk-19-1.png) 884 | 885 | Based on the dendrogram, I used `k = 50` for the number of topics. 886 | 887 | k <- 50 888 | ca5 <- assign_cluster(myfit5, k = k) 889 | 890 | presidential_debates_2012 %>% 891 | mutate(tot = gsub("\\..+$", "", tot)) %>% 892 | textshape::combine() %>% 893 | filter(person %in% c("ROMNEY", "OBAMA")) %>% 894 | tbl_df() %>% 895 | attributes(ca5)$join() %>% 896 | group_by(time) %>% 897 | mutate( 898 | word_count = stringi::stri_count_words(dialogue), 899 | start = starts(word_count), 900 | end = ends(word_count) 901 | ) %>% 902 | na.omit() %>% 903 | mutate(cluster = factor(cluster, levels = k:1)) %>% 904 | ggplot2::ggplot(ggplot2::aes(x = start-10, y = cluster, xend = end+10, yend = cluster)) + 905 | ggplot2::geom_segment(ggplot2::aes(position="dodge"), color = 'white', size = 3) + 906 | ggplot2::theme_bw() + 907 | ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'grey20'), 908 | panel.grid.minor.x = ggplot2::element_blank(), 909 | panel.grid.major.x = ggplot2::element_blank(), 910 | panel.grid.minor.y = ggplot2::element_blank(), 911 | panel.grid.major.y = ggplot2::element_line(color = 'grey35'), 912 | strip.text.y = ggplot2::element_text(angle=0, hjust = 0), 913 | strip.background = ggplot2::element_blank()) + 914 | ggplot2::facet_grid(person~time, scales='free', space='free') + 915 | ggplot2::labs(x="Duration (words)", y="Cluster") 916 | 917 | ## Warning: Ignoring unknown aesthetics: position 918 | 919 | ![](tools/figure/unnamed-chunk-20-1.png) 920 | 921 | If you're curious about the heaviest weighted tf-idf terms in each 922 | cluster the next code chunk provides the top five weighted terms used in 923 | each cluster. Below this I provide a bar plot of the frequencies of 924 | clusters to help put the other information into perspective. 925 | 926 | invisible(Map(function(x, y){ 927 | 928 | if (is.null(x)) { 929 | cat(sprintf("Cluster %s: ...\n", y)) 930 | } else { 931 | m <- dplyr::top_n(x, 5, n) 932 | o <- paste(paste0(m[[1]], " (", round(m[[2]], 1), ")"), collapse="; ") 933 | cat(sprintf("Cluster %s: %s\n", y, o)) 934 | } 935 | 936 | }, get_terms(ca5, .4), names(get_terms(ca5, .4)))) 937 | 938 | ## Cluster 1: going (1); time (0.6); get (0.5); years (0.5); like (0.4) 939 | ## Cluster 2: trillion (1); dollar (0.9); billion (0.5) 940 | ## Cluster 3: one (1); number (0.9) 941 | ## Cluster 4: get (1); private (0.7); medicare (0.5) 942 | ## Cluster 5: election (1); choice (0.9) 943 | ## Cluster 6: two (1); eighty (0.9); thousand (0.6); dollar (0.5); hundred (0.4) 944 | ## Cluster 7: happen (1) 945 | ## Cluster 8: care (1); government (0.9); keep (0.7); federal (0.6); health (0.5) 946 | ## Cluster 9: war (1); high (0.9); end (0.8); iraq (0.5) 947 | ## Cluster 10: good (1); forward (0.6); know (0.6) 948 | ## Cluster 11: make (1); sure (0.9); point (0.7); want (0.6); last (0.5) 949 | ## Cluster 12: sorry (1); china (0.9); record (0.7); talk (0.6); problem (0.5) 950 | ## Cluster 13: absolutely (1) 951 | ## Cluster 14: yes (1); places (0.5) 952 | ## Cluster 15: let (1); respond (0.7); tell (0.7); bob (0.7); example (0.5) 953 | ## Cluster 16: dodd (1); frank (1) 954 | ## Cluster 17: cut (1); taxes (0.8) 955 | ## Cluster 18: believe (1) 956 | ## Cluster 19: three (1) 957 | ## Cluster 20: jobs (1) 958 | ## Cluster 21: middle (1); class (0.6); east (0.5); certainly (0.5) 959 | ## Cluster 22: plan (1) 960 | ## Cluster 23: well (1); first (0.7); year (0.6); posture (0.6) 961 | ## Cluster 24: wrong (1) 962 | ## Cluster 25: great (1); bush (0.7); different (0.6); teachers (0.6) 963 | ## Cluster 26: difference (1) 964 | ## Cluster 27: military (1) 965 | ## Cluster 28: stamps (1); food (1); million (0.4) 966 | ## Cluster 29: job (1); college (0.8); find (0.6) 967 | ## Cluster 30: pension (1); mister (0.9); speaking (0.6); still (0.5); looked (0.5) 968 | ## Cluster 31: business (1); grow (0.8); small (0.8); planning (0.5) 969 | ## Cluster 32: done (1) 970 | ## Cluster 33: thank (1) 971 | ## Cluster 34: four (1); last (1); years (0.9) 972 | ## Cluster 35: question (1); answer (0.8); straightforward (0.5) 973 | ## Cluster 36: detroit (1); bankrupt (0.5) 974 | ## Cluster 37: energy (1) 975 | ## Cluster 38: percent (1); seven (0.5) 976 | ## Cluster 39: right (1); course (0.6) 977 | ## Cluster 40: governor (1) 978 | ## Cluster 41: production (1); coal (0.9); oil (0.5); gas (0.4) 979 | ## Cluster 42: true (1) 980 | ## Cluster 43: candy (1) 981 | ## Cluster 44: economy (1); grows (0.5) 982 | ## Cluster 45: balanced (1); budget (0.9) 983 | ## Cluster 46: leadership (1); kind (0.8); role (0.5); shown (0.5); show (0.5) 984 | ## Cluster 47: said (1) 985 | ## Cluster 48: iran (1); nuclear (0.5) 986 | ## Cluster 49: work (1) 987 | ## Cluster 50: strong (1) 988 | 989 | invisible(summary(ca5)) 990 | 991 | ![](tools/figure/unnamed-chunk-22-1.png) 992 | 993 | It appears that in fact the topics do cluster within segments of time as 994 | we'd expect. This is more apparent when turn of talk is used as the unit 995 | of analysis (document level) rather than each sentence. -------------------------------------------------------------------------------- /data/assignments.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/data/assignments.rda -------------------------------------------------------------------------------- /data/presidential_debates_2012.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/data/presidential_debates_2012.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite clustext in publications, please use:") 2 | 3 | 4 | citEntry(entry = "manual", 5 | title = "{clustext}: Consistent Clustering for Text Data", 6 | author = "Tyler W. Rinker", 7 | organization = "University at Buffalo/SUNY", 8 | address = "Buffalo, New York", 9 | note = "version 0.1.1", 10 | year = "2016", 11 | url = "http://github.com/trinker/clustext", 12 | textVersion = paste("Rinker, T. W. (2016).", 13 | "clustext: Consistent Clustering for Text Data", 14 | "version 0.1.1. University at Buffalo. Buffalo, New York.", 15 | "http://github.com/trinker/clustext") 16 | ) 17 | -------------------------------------------------------------------------------- /inst/additional/foo_turk.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | ========== 4 | Cluster 1: 5 | ========== 6 | * But I'm not in favor of rounding up people and and and taking them out of this country. 7 | * What is the deal that you would accept, Mister President? 8 | * We take program after program that we don't absolutely have to have, and we get rid of them. 9 | * I I was I was someone who ran businesses for twenty five years, and balanced the budget. 10 | * We need to be talking about space. 11 | * It's wonderful that Libya seems to be making some progress, despite this terrible tragedy. 12 | * Number two, I asked the president a question I think Hispanics and immigrants all over the nation have asked. 13 | * We need to have strong allies. 14 | * Questions remain. 15 | * Now, let me mention one other thing, and that is self deportation says let people make their own choice. 16 | 17 | 18 | 19 | 20 | ========== 21 | Cluster 2: money & jobs 22 | ========== 23 | * You're going to get a repeat of the last four years. 24 | * That's creating tens of thousands of jobs all across the country. 25 | * But Governor, when it comes to our foreign policy, you seem to want to import the foreign policies of the one thousand nine hundred eightys, just like the social policies of the one thousand nine hundred fiftys and the economic policies of the one thousand nine hundred twentys. 26 | * And we would have lost a million jobs. 27 | * Look, I look at what's happening around the world, and i see Iran four years closer to a bomb. 28 | * That's another trillion dollars that's dollar eight trillion. 29 | * Median income is down dollar four thousand three hundred a family and twenty three million Americans out of work. 30 | * eighty six. 31 | * It is also essential for us to understand what our mission is in Iran, and that is to dissuade Iran from having a nuclear weapon through peaceful and diplomatic means. 32 | * Massachusetts schools are ranked number one in the nation. 33 | 34 | 35 | 36 | 37 | ========== 38 | Cluster 3: correct 39 | ========== 40 | * All right. 41 | * All right. 42 | * All right. 43 | * Right. 44 | * And that's right. 45 | * Did I get that right? 46 | * All right. 47 | * All right. 48 | * All right. 49 | * All right. 50 | 51 | 52 | 53 | 54 | ========== 55 | Cluster 4: =3 56 | ========== 57 | * You're absolutely right. 58 | * Nothing Governor Romney just said is true, starting with this notion of me apologizing. 59 | * Absolutely. 60 | * Absolutely. 61 | * Well, that's probably true. 62 | * Candy, what Governor Romney said just isn't true. 63 | * Absolutely. 64 | * It's just not true. 65 | * Not true. 66 | * Absolutely. 67 | 68 | 69 | 70 | 71 | ========== 72 | Cluster 5: ello govna 73 | ========== 74 | * Hi, Governor. 75 | * Governor Romney, you did not| 76 | * Governor? 77 | * Governor| 78 | * Governor? 79 | * Governor| 80 | * Governor| 81 | * And you go first, Governor Romney. 82 | * I was a governor. 83 | * Governor? 84 | 85 | 86 | 87 | 88 | ========== 89 | Cluster 6: mister 90 | ========== 91 | * This has not been Mister Oil, or Mister Gas, or Mister Coal. 92 | * Mister President? 93 | * Mister President| 94 | * Mister President? 95 | * But what would you do as president? 96 | * Mister President? 97 | * Mister President, Mister President, you're entitled as the president to your own airplane and to your own house, but not to your own facts. 98 | * Mister President? 99 | * This is a president who has not been able to do what he said he'd do. 100 | * Mister President? 101 | 102 | 103 | 104 | 105 | ========== 106 | Cluster 7: politeness 107 | ========== 108 | * And that is the question. 109 | * Governor, this question is for you. 110 | * But the president does get this question. 111 | * For now, from the University of Denver, I'm Jim Lehrer. 112 | * No, no, I had a question and the question was how much did you cut them by? 113 | * And I appreciate that question. 114 | * Thank you. 115 | * Thank you so much. 116 | * Thank you, and that's an opportunity for me, and I appreciate it. 117 | * Well, Jim, I want to thank you, and I want to thank Governor Romney, because I think was a terrific debate, and I very much appreciate it. 118 | -------------------------------------------------------------------------------- /inst/build.R: -------------------------------------------------------------------------------- 1 | root <- Sys.getenv("USERPROFILE") 2 | pack <- basename(getwd()) 3 | 4 | quick <- TRUE 5 | pdf <- TRUE 6 | 7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE) 8 | devtools::document() 9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE) 10 | 11 | if(pdf){ 12 | path <- find.package(pack) 13 | system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path))) 14 | file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf'))) 15 | while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)} 16 | empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE) 17 | unlink(empts, recursive = TRUE, force = TRUE) 18 | } 19 | 20 | message("Done!") 21 | -------------------------------------------------------------------------------- /inst/extra_scripts/build_data.R: -------------------------------------------------------------------------------- 1 | pacman::p_load(clustext, dplyr) 2 | 3 | x <- presidential_debates_2012 %>% 4 | mutate(tot = gsub("\\..+$", "", tot)) %>% 5 | textshape::combine() %>% 6 | filter(person %in% c("ROMNEY", "OBAMA")) %>% 7 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) 8 | 9 | set.seed(10) 10 | kmeans_assignment <- kmeans_cluster(x, 50) %>% 11 | assign_cluster(myfit2) 12 | 13 | set.seed(10) 14 | nmf_assignment <- nmf_cluster(x, 50) %>% 15 | assign_cluster(myfit2) 16 | 17 | set.seed(10) 18 | skmeans_assignment <- skmeans_cluster(x, 50) %>% 19 | assign_cluster(myfit2) 20 | 21 | hierarchical_assignment <- hierarchical_cluster(x) %>% 22 | assign_cluster(k=50) 23 | 24 | assignments <- list( 25 | hierarchical_assignment = hierarchical_assignment, 26 | kmeans_assignment = kmeans_assignment, 27 | skmeans_assignment = skmeans_assignment, 28 | nmf_assignment =nmf_assignment 29 | ) 30 | 31 | 32 | assignments <- lapply(assignments, function(x) { 33 | attributes(x)[['data_store']] <- NULL 34 | attributes(x)[['model']] <- NULL 35 | attributes(x)[['join']] <- NULL 36 | x 37 | }) 38 | 39 | lapply(assignments, function(x) {names(attributes(x))}) 40 | 41 | pax::new_data(assignments) 42 | -------------------------------------------------------------------------------- /inst/extra_statdoc/readme.R: -------------------------------------------------------------------------------- 1 |


2 |

clustext is a...

3 |

Download the development version of clustext here 4 | -------------------------------------------------------------------------------- /inst/maintenance.R: -------------------------------------------------------------------------------- 1 | #======== 2 | # BUILD 3 | #======== 4 | source("inst/build.R") 5 | 6 | #========================== 7 | # Run unit tests 8 | #========================== 9 | devtools::test() 10 | 11 | #========================== 12 | # knit README.md 13 | #========================== 14 | rmarkdown::render("README.Rmd", "all"); md_toc() 15 | 16 | #========================== 17 | # UPDATE NEWS 18 | #========================== 19 | update_news() 20 | 21 | #========================== 22 | # UPDATE VERSION 23 | #========================== 24 | update_version() 25 | 26 | #======================== 27 | #staticdocs dev version 28 | #======================== 29 | 30 | if (!require("pacman")) install.packages("pacman") 31 | pacman::p_load_gh("hadley/staticdocs", "trinker/acc.roxygen2") 32 | p_load(rstudioapi, qdap) 33 | 34 | R_USER <- switch(Sys.info()[["user"]], 35 | Tyler = "C:/Users/Tyler", 36 | trinker = "C:/Users/trinker", 37 | message("Computer name not found") 38 | ) 39 | build_site(pkg=file.path(R_USER, "GitHub", basename(getwd())), launch = FALSE) 40 | 41 | #STEP 2: reshape index 42 | path <- "inst/web" 43 | path2 <- file.path(path, "/index.html") 44 | rdme <- file.path(R_USER, "GitHub", basename(getwd()), "inst/extra_statdoc/readme.R") 45 | 46 | extras <- qcv("") 47 | ## drops <- qcv() 48 | expand_statdoc(path2, to.icon = extras, readme = rdme) 49 | 50 | x <- readLines(path2) 51 | x[grepl("

Authors

", x)] <- paste( 52 | c("

Author

" 53 | #rep("

Contributor

", 1) 54 | ), 55 | c("Tyler W. Rinker") 56 | ) 57 | 58 | cat(paste(x, collapse="\n"), file=path2) 59 | 60 | #STEP 3: move to trinker.guthub 61 | library(reports) 62 | file <- file.path(R_USER, "/GitHub/trinker.github.com") 63 | # incoming <- file.path(file, basename(getwd())) 64 | delete(incoming) 65 | file.copy(path, file, TRUE, TRUE) 66 | file.rename(file.path(file, "web"), incoming) 67 | ## delete(path) 68 | 69 | #========================== 70 | #staticdocs current version 71 | #========================== 72 | 73 | #STEP 3: move to trinker.guthub 74 | library(reports) 75 | file <- file.path(R_USER, "/GitHub/trinker.github.com") 76 | incoming <- file.path(file, "discon") 77 | ## delete(incoming); file.copy(path, file, TRUE, TRUE); file.rename(file.path(file, "web"), incoming) 78 | 79 | #========================== 80 | # NEWS new version 81 | #========================== 82 | x <- c("BUG FIXES", "NEW FEATURES", "MINOR FEATURES", "IMPROVEMENTS", "CHANGES") 83 | cat(paste(x, collapse = "\n\n"), file="clipboard") 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /inst/staticdocs/index.R: -------------------------------------------------------------------------------- 1 | library(staticdocs) 2 | 3 | sd_section("", 4 | "Function for...", 5 | c( 6 | "myfun" 7 | ) 8 | ) -------------------------------------------------------------------------------- /man/approx_k.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/approx_k.R 3 | \name{approx_k} 4 | \alias{approx_k} 5 | \alias{approx_k.DocumentTermMatrix} 6 | \alias{approx_k.TermDocumentMatrix} 7 | \title{Approximate Number of Clusters for a Text Matrix} 8 | \usage{ 9 | approx_k(x, verbose = TRUE) 10 | 11 | \method{approx_k}{TermDocumentMatrix}(x, verbose = TRUE) 12 | 13 | \method{approx_k}{DocumentTermMatrix}(x, verbose = TRUE) 14 | } 15 | \arguments{ 16 | \item{x}{A matrix.} 17 | 18 | \item{verbose}{logical. If \code{TRUE} the k determination is printed.} 19 | } 20 | \value{ 21 | Returns an integer. 22 | } 23 | \description{ 24 | Can & Ozkarahan (1990) formula for approximating the number of clusters for 25 | a text matrix: \eqn{(m * n)/t} where \eqn{m} and \eqn{n} are the dimensions 26 | of the matrix and \eqn{t} is the length of the non-zero elements in matrix 27 | \eqn{A}. 28 | } 29 | \examples{ 30 | library(gofastr) 31 | library(dplyr) 32 | 33 | presidential_debates_2012 \%>\% 34 | with(q_dtm(dialogue)) \%>\% 35 | approx_k() 36 | } 37 | \references{ 38 | Can, F., Ozkarahan, E. A. (1990). Concepts and effectiveness of 39 | the cover-coefficient-based clustering methodology for text databases. 40 | ACM Transactions on Database Systems 15 (4): 483. doi:10.1145/99935.99938. \cr 41 | } 42 | 43 | -------------------------------------------------------------------------------- /man/as_topic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as_topic.R 3 | \name{as_topic} 4 | \alias{as_topic} 5 | \alias{as_topic.get_terms} 6 | \title{Convert \code{get_terms} to Topics} 7 | \usage{ 8 | as_topic(x, max.n = 8, sort = TRUE, ...) 9 | 10 | \method{as_topic}{get_terms}(x, max.n = 8, sort = TRUE, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A \code{get_terms} object.} 14 | 15 | \item{max.n}{The max number of words to show before truncation.} 16 | 17 | \item{sort}{logical. If \code{TRUE} the cluster topics are sorted by size 18 | (number of documents) otherwise the topics are sorted by cluster number.} 19 | 20 | \item{\ldots}{ignored.} 21 | } 22 | \value{ 23 | Returns a \code{\link[base]{data.frame}} of \code{"cluster"}, 24 | \code{"count"}, and \code{"terms"}. Pretty prints as clusters, number of 25 | documents, and associated important terms. 26 | } 27 | \description{ 28 | View important terms as a comma separated string (a topic). 29 | } 30 | \examples{ 31 | library(dplyr) 32 | 33 | myfit5 <- presidential_debates_2012 \%>\% 34 | mutate(tot = gsub("\\\\..+$", "", tot)) \%>\% 35 | textshape::combine() \%>\% 36 | filter(person \%in\% c("ROMNEY", "OBAMA")) \%>\% 37 | with(data_store(dialogue, stopwords = tm::stopwords("english"), min.char = 3)) \%>\% 38 | hierarchical_cluster() 39 | 40 | ca5 <- assign_cluster(myfit5, k = 50) 41 | 42 | get_terms(ca5, .4) \%>\% 43 | as_topic() 44 | 45 | get_terms(ca5, .4) \%>\% 46 | as_topic(sort=FALSE) 47 | 48 | get_terms(ca5, .95) \%>\% 49 | as_topic() 50 | } 51 | 52 | -------------------------------------------------------------------------------- /man/assign_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assign_cluster.R 3 | \name{assign_cluster} 4 | \alias{assign_cluster} 5 | \alias{assign_cluster.hierarchical_cluster} 6 | \alias{assign_cluster.kmeans_cluster} 7 | \alias{assign_cluster.nmf_cluster} 8 | \alias{assign_cluster.skmeans_cluster} 9 | \title{Assign Clusters to Documents/Text Elements} 10 | \usage{ 11 | assign_cluster(x, k = approx_k(get_dtm(x)), h = NULL, ...) 12 | 13 | \method{assign_cluster}{hierarchical_cluster}(x, k = approx_k(get_dtm(x)), 14 | h = NULL, cut = "static", deepSplit = TRUE, minClusterSize = 1, ...) 15 | 16 | \method{assign_cluster}{kmeans_cluster}(x, ...) 17 | 18 | \method{assign_cluster}{skmeans_cluster}(x, ...) 19 | 20 | \method{assign_cluster}{nmf_cluster}(x, ...) 21 | } 22 | \arguments{ 23 | \item{x}{a \code{xxx_cluster} object.} 24 | 25 | \item{k}{The number of clusters (can supply \code{h} instead). Defaults to 26 | use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced 27 | by \code{data_storage}.} 28 | 29 | \item{h}{The height at which to cut the dendrograms (determines number of 30 | clusters). If this argument is supplied \code{k} is ignored.} 31 | 32 | \item{cut}{The type of cut method to use for \code{hierarchical_cluster}; one 33 | of \code{'static'}, \code{'dynamic'} or \code{'iterative'}.} 34 | 35 | \item{deepSplit}{logical. See \code{\link[dynamicTreeCut]{cutreeDynamic}}.} 36 | 37 | \item{minClusterSize}{The minimum cluster size. See 38 | \code{\link[dynamicTreeCut]{cutreeDynamic}}.} 39 | 40 | \item{\ldots}{ignored.} 41 | } 42 | \value{ 43 | Returns an \code{assign_cluster} object; a named vector of cluster 44 | assignments with documents as names. The object also contains the original 45 | \code{data_storage} object and a \code{join} function. \code{join} is a 46 | function (a closure) that captures information about the \code{assign_cluster} 47 | that makes rejoining to the original data set simple. The user simply 48 | supplies the original data set as an argument to \code{join} 49 | (\code{attributes(FROM_ASSIGN_CLUSTER)$join(ORIGINAL_DATA)}). 50 | } 51 | \description{ 52 | Assign clusters to documents/text elements. 53 | } 54 | \examples{ 55 | \dontrun{ 56 | library(dplyr) 57 | 58 | x <- with( 59 | presidential_debates_2012, 60 | data_store(dialogue, paste(person, time, sep = "_")) 61 | ) 62 | 63 | hierarchical_cluster(x) \%>\% 64 | plot(h=.7, lwd=2) 65 | 66 | hierarchical_cluster(x) \%>\% 67 | assign_cluster(h=.7) 68 | 69 | hierarchical_cluster(x, method="complete") \%>\% 70 | plot(k=6) 71 | 72 | hierarchical_cluster(x) \%>\% 73 | assign_cluster(k=6) 74 | 75 | 76 | x2 <- presidential_debates_2012 \%>\% 77 | with(data_store(dialogue)) \%>\% 78 | hierarchical_cluster() 79 | 80 | ca2 <- assign_cluster(x2, k = 55) 81 | summary(ca2) 82 | 83 | ## Dynamic cut 84 | ca3 <- assign_cluster(x2, cut = 'dynamic', minClusterSize = 5) 85 | get_text(ca3) 86 | 87 | ## add to original data 88 | attributes(ca2)$join(presidential_debates_2012) 89 | 90 | ## split text into clusters 91 | get_text(ca2) 92 | 93 | ## Kmeans Algorithm 94 | kmeans_cluster(x, k=6) \%>\% 95 | assign_cluster() 96 | 97 | x3 <- presidential_debates_2012 \%>\% 98 | with(data_store(dialogue)) \%>\% 99 | kmeans_cluster(55) 100 | 101 | ca3 <- assign_cluster(x3) 102 | summary(ca3) 103 | 104 | ## split text into clusters 105 | get_text(ca3) 106 | } 107 | } 108 | 109 | -------------------------------------------------------------------------------- /man/assignments.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustext-package.R 3 | \docType{data} 4 | \name{assignments} 5 | \alias{assignments} 6 | \title{Topic Assignments} 7 | \format{A list with 3 elements} 8 | \usage{ 9 | data(assignments) 10 | } 11 | \description{ 12 | A dataset containing a list of topic assignments by various clustering 13 | algorithms. Assignments correspond to the rows (minus empty rows) of the 14 | \code{presidential_debates_2012} data set. 15 | } 16 | \keyword{datasets} 17 | 18 | -------------------------------------------------------------------------------- /man/categorize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/categorize.R 3 | \name{categorize} 4 | \alias{categorize} 5 | \title{Merge Clusters & Cluster Categories Back to Original Data} 6 | \usage{ 7 | categorize(data, assign.cluster, cluster.key) 8 | } 9 | \arguments{ 10 | \item{data}{A data set that was fit with a cluster model.} 11 | 12 | \item{assign.cluster}{An \code{\link[clustext]{assign_cluster}} object.} 13 | 14 | \item{cluster.key}{An \code{\link[clustext]{assign_cluster}} object.} 15 | } 16 | \value{ 17 | Returns a \code{\link[base]{data.frame}} key of clusters and categories. 18 | } 19 | \description{ 20 | Merge clusters, categories, and the original data back together. 21 | } 22 | \examples{ 23 | library(dplyr) 24 | 25 | ## Assign Clusters 26 | ca <- presidential_debates_2012 \%>\% 27 | with(data_store(dialogue)) \%>\% 28 | hierarchical_cluster() \%>\% 29 | assign_cluster(k = 7) 30 | 31 | ## Write Cluster Text for Human Categorization 32 | write_cluster_text(ca) 33 | write_cluster_text(ca, n.sample=10) 34 | write_cluster_text(ca, lead=" -", n.sample=10) 35 | 36 | ## Read Human Coded Categories Back In 37 | categories_file <- system.file("additional/foo_turk.txt", package = "clustext") 38 | readLines(categories_file) 39 | (categories_key <- read_cluster_text(categories_file)) 40 | 41 | ## Add Categories Back to Original Data Set 42 | categorize( 43 | data = presidential_debates_2012, 44 | assign.cluster = ca, 45 | cluster.key = categories_key 46 | ) 47 | } 48 | \seealso{ 49 | \code{\link[clustext]{write_cluster_text}}, 50 | \code{\link[clustext]{read_cluster_text}} 51 | } 52 | 53 | -------------------------------------------------------------------------------- /man/clustext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustext-package.R 3 | \docType{package} 4 | \name{clustext} 5 | \alias{clustext} 6 | \alias{clustext-package} 7 | \alias{package-clustext} 8 | \title{Consistent Clustering for Text Data} 9 | \description{ 10 | Optimized, consistent tools for clustering text data. 11 | } 12 | 13 | -------------------------------------------------------------------------------- /man/compare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compare.R 3 | \name{compare} 4 | \alias{compare} 5 | \title{Adjusted Rand Index Comaprison Between Algorithms} 6 | \usage{ 7 | compare(...) 8 | } 9 | \arguments{ 10 | \item{\ldots}{A series of outputs from \code{assign_cluster} for various 11 | cluster algorithmns.} 12 | } 13 | \value{ 14 | Returns a pair-wise comparison matrix of Adjusted Rand Indices for 15 | algorithm. Higher Adjusted Rand Index scores indicate higher cluster 16 | assignment agreement. 17 | } 18 | \description{ 19 | An Adjusted Rand Index comparison of the assignments between different 20 | clustering algorithms. 21 | } 22 | \examples{ 23 | compare( 24 | assignments$hierarchical_assignment, 25 | assignments$kmeans_assignment, 26 | assignments$skmeans_assignment, 27 | assignments$nmf_assignment 28 | ) 29 | 30 | ## Understanding the ARI 31 | set.seed(10) 32 | w <- sample(1:10, 40, TRUE) 33 | x <- 11-w 34 | set.seed(20) 35 | y <- sample(1:10, 40, TRUE) 36 | set.seed(50) 37 | z <- sample(1:10, 40, TRUE) 38 | 39 | data.frame(w, x, y, z) 40 | 41 | library(mclust) 42 | 43 | mclust::adjustedRandIndex(w, x) 44 | mclust::adjustedRandIndex(x, y) 45 | mclust::adjustedRandIndex(x, z) 46 | } 47 | \references{ 48 | \url{http://faculty.washington.edu/kayee/pca/supp.pdf} 49 | } 50 | 51 | -------------------------------------------------------------------------------- /man/cosine_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cosine_distance.R 3 | \name{cosine_distance} 4 | \alias{cosine_distance} 5 | \alias{cosine_distance.DocumentTermMatrix} 6 | \alias{cosine_distance.TermDocumentMatrix} 7 | \title{Optimized Computation of Cosine Distance} 8 | \usage{ 9 | cosine_distance(x, ...) 10 | 11 | \method{cosine_distance}{DocumentTermMatrix}(x, ...) 12 | 13 | \method{cosine_distance}{TermDocumentMatrix}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or 17 | \code{\link[tm]{TermDocumentMatrix}}).} 18 | 19 | \item{\ldots}{ignored.} 20 | } 21 | \value{ 22 | Returns a cosine distance object of class \code{"dist"}. 23 | } 24 | \description{ 25 | Utilizes the \pkg{slam} package to efficiently calculate cosine distance 26 | on large sparse matrices. 27 | } 28 | \examples{ 29 | library(gofastr) 30 | library(dplyr) 31 | 32 | out <- presidential_debates_2012 \%>\% 33 | with(q_dtm(dialogue)) \%>\% 34 | cosine_distance() 35 | } 36 | \author{ 37 | Michael Andrec and Tyler Rinker . 38 | } 39 | \references{ 40 | \url{http://stackoverflow.com/a/29755756/1000343} 41 | } 42 | \keyword{cosine} 43 | \keyword{dissimilarity} 44 | 45 | -------------------------------------------------------------------------------- /man/data_store.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_store.R 3 | \name{data_store} 4 | \alias{data_store} 5 | \title{Data Structure for \pkg{hclusttext}} 6 | \usage{ 7 | data_store(text, doc.names, min.term.freq = 1, min.doc.len = 1, 8 | stopwords = tm::stopwords("english"), min.char = 3, max.char = NULL, 9 | stem = FALSE, denumber = TRUE) 10 | } 11 | \arguments{ 12 | \item{text}{A character vector.} 13 | 14 | \item{doc.names}{An optional vector of document names corresponding to the 15 | length of \code{text}.} 16 | 17 | \item{min.term.freq}{The minimum times a term must appear to be included in 18 | the \code{\link[tm]{DocumentTermMatrix}}.} 19 | 20 | \item{min.doc.len}{The minimum words a document must contain to be included 21 | in the data structure (other wise it is stored as a \code{removed} element).} 22 | 23 | \item{stopwords}{A vector of stopwords to remove.} 24 | 25 | \item{min.char}{The minial length character for retained words.} 26 | 27 | \item{max.char}{The maximum length character for retained words.} 28 | 29 | \item{stem}{Logical. If \code{TRUE} the \code{stopwords} will be stemmed.} 30 | 31 | \item{denumber}{Logical. If \code{TRUE} numbers will be excluded.} 32 | } 33 | \value{ 34 | Returns a list containing: 35 | \describe{ 36 | \item{dtm}{A tf-idf weighted \code{\link[tm]{DocumentTermMatrix}}} 37 | \item{text}{The text vector with unanalyzable elements removed} 38 | \item{removed}{The indices of the removed text elements, i.e., documents not meeting \code{min.doc.len}} 39 | \item{n.nonsparse}{The length of the non-zero elements} 40 | } 41 | } 42 | \description{ 43 | A data structure which stores the text, DocumentTermMatrix, and information 44 | regarding removed text elements which can not be handled by the 45 | \code{hierarchical_cluster} function. This structure is required because it 46 | documents important meta information, including removed elements, required by 47 | other \pkg{clustext} functions. If the user wishes to combine documents 48 | (say by a common grouping variable) it is recomended this be handled by 49 | \code{\link[textshape]{combine}} prior to using \code{data_store}. 50 | } 51 | \examples{ 52 | data_store(presidential_debates_2012[["dialogue"]]) 53 | 54 | ## Use `combine` to merge text prior to `data_stare` 55 | library(textshape) 56 | library(dplyr) 57 | 58 | dat <- presidential_debates_2012 \%>\% 59 | dplyr::select(person, time, dialogue) \%>\% 60 | textshape::combine() 61 | 62 | ## Elements in `ds` correspond to `dat` grouping vars 63 | (ds <- with(dat, data_store(dialogue))) 64 | dplyr::select(dat, -3) 65 | 66 | ## Add row names 67 | (ds2 <- with(dat, data_store(dialogue, paste(person, time, sep = "_")))) 68 | rownames(ds2[["dtm"]]) 69 | 70 | ## Get a DocumentTermMatrix 71 | get_dtm(ds2) 72 | } 73 | \keyword{data} 74 | \keyword{structure} 75 | 76 | -------------------------------------------------------------------------------- /man/get_documents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_documents.R 3 | \name{get_documents} 4 | \alias{get_documents} 5 | \alias{get_documents.assign_cluster} 6 | \title{Get Documents Based on Cluster Assignment in \code{assign_cluster}} 7 | \usage{ 8 | get_documents(x, ...) 9 | 10 | \method{get_documents}{assign_cluster}(x, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A \code{\link[clustext]{assign_cluster}} object.} 14 | 15 | \item{\ldots}{ignored.} 16 | } 17 | \value{ 18 | Returns a list of \code{\link[base]{vector}}s of document names. 19 | } 20 | \description{ 21 | Get the documents associated with each of the k clusters . 22 | } 23 | \examples{ 24 | library(dplyr) 25 | 26 | mydocuments1 <- presidential_debates_2012 \%>\% 27 | with(data_store(dialogue, paste(person, time, sep="-"))) \%>\% 28 | hierarchical_cluster() \%>\% 29 | assign_cluster(k = 6) \%>\% 30 | get_documents() 31 | 32 | mydocuments1 33 | 34 | mydocuments2 <- presidential_debates_2012 \%>\% 35 | with(data_store(dialogue)) \%>\% 36 | hierarchical_cluster() \%>\% 37 | assign_cluster(k = 55) \%>\% 38 | get_documents() 39 | 40 | mydocuments2 41 | } 42 | 43 | -------------------------------------------------------------------------------- /man/get_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_dtm.R 3 | \name{get_dtm} 4 | \alias{get_dtm} 5 | \alias{get_dtm.data_store} 6 | \alias{get_dtm.hierarchical_cluster} 7 | \alias{get_dtm.kmeans_cluster} 8 | \alias{get_dtm.nmf_cluster} 9 | \alias{get_dtm.skmeans_cluster} 10 | \title{Get a \code{\link[tm]{DocumentTermMatrix}} Stored in a \code{hierarchical_cluster} Object} 11 | \usage{ 12 | get_dtm(x, ...) 13 | 14 | \method{get_dtm}{data_store}(x, ...) 15 | 16 | \method{get_dtm}{hierarchical_cluster}(x, ...) 17 | 18 | \method{get_dtm}{kmeans_cluster}(x, ...) 19 | 20 | \method{get_dtm}{skmeans_cluster}(x, ...) 21 | 22 | \method{get_dtm}{nmf_cluster}(x, ...) 23 | } 24 | \arguments{ 25 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.} 26 | 27 | \item{\ldots}{ignored.} 28 | } 29 | \value{ 30 | Returns a \code{\link[tm]{DocumentTermMatrix}}. 31 | } 32 | \description{ 33 | Extract the \code{\link[tm]{DocumentTermMatrix}} supplied to/produced by a 34 | \code{\link[clustext]{hierarchical_cluster}} object. 35 | } 36 | \examples{ 37 | library(dplyr) 38 | 39 | presidential_debates_2012 \%>\% 40 | with(data_store(dialogue)) \%>\% 41 | hierarchical_cluster() \%>\% 42 | get_dtm() 43 | } 44 | 45 | -------------------------------------------------------------------------------- /man/get_removed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_removed.R 3 | \name{get_removed} 4 | \alias{get_removed} 5 | \alias{get_removed.data_store} 6 | \alias{get_removed.hierarchical_cluster} 7 | \alias{get_removed.kmeans_cluster} 8 | \alias{get_removed.nmf_cluster} 9 | \alias{get_removed.skmeans_cluster} 10 | \title{Get a Text Stored in a \code{hierarchical_cluster} Object} 11 | \usage{ 12 | get_removed(x, ...) 13 | 14 | \method{get_removed}{hierarchical_cluster}(x, ...) 15 | 16 | \method{get_removed}{kmeans_cluster}(x, ...) 17 | 18 | \method{get_removed}{skmeans_cluster}(x, ...) 19 | 20 | \method{get_removed}{nmf_cluster}(x, ...) 21 | 22 | \method{get_removed}{data_store}(x, ...) 23 | } 24 | \arguments{ 25 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.} 26 | 27 | \item{\ldots}{ignored.} 28 | } 29 | \value{ 30 | Returns a vector of text strings. 31 | } 32 | \description{ 33 | Extract the text supplied to the 34 | \code{\link[clustext]{hierarchical_cluster}} object. 35 | } 36 | \examples{ 37 | library(dplyr) 38 | 39 | presidential_debates_2012 \%>\% 40 | with(data_store(dialogue)) \%>\% 41 | hierarchical_cluster() \%>\% 42 | get_removed() 43 | } 44 | 45 | -------------------------------------------------------------------------------- /man/get_terms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_terms.R 3 | \name{get_terms} 4 | \alias{get_terms} 5 | \alias{get_terms.assign_cluster_hierarchical} 6 | \alias{get_terms.assign_cluster_kmeans} 7 | \alias{get_terms.assign_cluster_nmf} 8 | \alias{get_terms.assign_cluster_skmeans} 9 | \title{Get Terms Based on Cluster Assignment in \code{assign_cluster}} 10 | \usage{ 11 | get_terms(x, min.weight = 0.6, nrow = NULL, ...) 12 | 13 | \method{get_terms}{assign_cluster_hierarchical}(x, min.weight = 0.6, 14 | nrow = NULL, ...) 15 | 16 | \method{get_terms}{assign_cluster_kmeans}(x, min.weight = 0.6, nrow = NULL, 17 | ...) 18 | 19 | \method{get_terms}{assign_cluster_skmeans}(x, min.weight = 0.6, nrow = NULL, 20 | ...) 21 | 22 | \method{get_terms}{assign_cluster_nmf}(x, min.weight = 0.6, nrow = NULL, 23 | ...) 24 | } 25 | \arguments{ 26 | \item{x}{A \code{\link[clustext]{assign_cluster}} object.} 27 | 28 | \item{min.weight}{The lowest min/max scaled tf-idf weighting to consider 29 | as a document's salient term.} 30 | 31 | \item{nrow}{The max number of rows to display in the returned 32 | \code{\link[base]{data.frame}}s.} 33 | 34 | \item{\ldots}{ignored.} 35 | } 36 | \value{ 37 | Returns a list of \code{\link[base]{data.frame}}s of top weighted terms. 38 | } 39 | \description{ 40 | Get the terms weighted (either by tf-idf or returned from the model) and 41 | min/max scaling associated with each of the k clusters . 42 | } 43 | \examples{ 44 | library(dplyr) 45 | library(textshape) 46 | 47 | myterms <- presidential_debates_2012 \%>\% 48 | with(data_store(dialogue)) \%>\% 49 | hierarchical_cluster() \%>\% 50 | assign_cluster(k = 55) \%>\% 51 | get_terms() 52 | 53 | myterms 54 | textshape::tidy_list(myterms[!sapply(myterms, is.null)], "Topic") 55 | \dontrun{ 56 | library(ggplot2) 57 | library(gridExtra) 58 | library(dplyr) 59 | library(textshape) 60 | library(wordcloud) 61 | 62 | max.n <- max(textshape::tidy_list(myterms)[["n"]]) 63 | 64 | myplots <- Map(function(x, y){ 65 | x \%>\% 66 | mutate(term = factor(term, levels = rev(term))) \%>\% 67 | ggplot(aes(term, weight=n)) + 68 | geom_bar() + 69 | scale_y_continuous(expand = c(0, 0),limits=c(0, max.n)) + 70 | ggtitle(sprintf("Topic: \%s", y)) + 71 | coord_flip() 72 | }, myterms, names(myterms)) 73 | 74 | myplots[["ncol"]] <- 10 75 | 76 | do.call(gridExtra::grid.arrange, myplots[!sapply(myplots, is.null)]) 77 | 78 | ##wordclouds 79 | par(mfrow=c(5, 11), mar=c(0, 4, 0, 0)) 80 | Map(function(x, y){ 81 | wordcloud::wordcloud(x[[1]], x[[2]], scale=c(1,.25),min.freq=1) 82 | mtext(sprintf("Topic: \%s", y), col = "blue", cex=.55, padj = 1.5) 83 | }, myterms, names(myterms)) 84 | } 85 | } 86 | 87 | -------------------------------------------------------------------------------- /man/get_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_text.R 3 | \name{get_text} 4 | \alias{get_text} 5 | \alias{get_text.assign_cluster} 6 | \alias{get_text.data_store} 7 | \alias{get_text.default} 8 | \alias{get_text.hierarchical_cluster} 9 | \alias{get_text.kmeans_cluster} 10 | \alias{get_text.nmf_cluster} 11 | \alias{get_text.skmeans_cluster} 12 | \title{Get a Text Stored in Various Objects} 13 | \usage{ 14 | get_text(x, ...) 15 | 16 | \method{get_text}{default}(x, ...) 17 | 18 | \method{get_text}{hierarchical_cluster}(x, ...) 19 | 20 | \method{get_text}{kmeans_cluster}(x, ...) 21 | 22 | \method{get_text}{nmf_cluster}(x, ...) 23 | 24 | \method{get_text}{skmeans_cluster}(x, ...) 25 | 26 | \method{get_text}{data_store}(x, ...) 27 | 28 | \method{get_text}{assign_cluster}(x, ...) 29 | } 30 | \arguments{ 31 | \item{x}{A \code{\link[clustext]{hierarchical_cluster}} object.} 32 | 33 | \item{\ldots}{ignored.} 34 | } 35 | \value{ 36 | Returns a vector or list of text strings. 37 | } 38 | \description{ 39 | Extract the text supplied to the 40 | \code{\link[clustext]{hierarchical_cluster}} object. 41 | } 42 | \examples{ 43 | library(dplyr) 44 | 45 | presidential_debates_2012 \%>\% 46 | with(data_store(dialogue)) \%>\% 47 | hierarchical_cluster() \%>\% 48 | get_text() \%>\% 49 | head() 50 | } 51 | 52 | -------------------------------------------------------------------------------- /man/hierarchical_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hierarchical_cluster.R 3 | \name{hierarchical_cluster} 4 | \alias{hierarchical_cluster} 5 | \alias{hierarchical_cluster.data_store} 6 | \title{Fit a Hierarchical Cluster} 7 | \usage{ 8 | hierarchical_cluster(x, distance = "cosine", method = "ward.D2", ...) 9 | 10 | \method{hierarchical_cluster}{data_store}(x, distance = "cosine", 11 | method = "ward.D", ...) 12 | } 13 | \arguments{ 14 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).} 15 | 16 | \item{distance}{A distance measure ("cosine" or "jaccard").} 17 | 18 | \item{method}{The agglomeration method to be used. This must be (an 19 | unambiguous abbreviation of) one of \code{"single"}, \code{"complete"}, 20 | \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"}, 21 | \code{"centroid"}, or \code{"median"}.} 22 | 23 | \item{\ldots}{ignored.} 24 | } 25 | \value{ 26 | Returns an object of class \code{"hclust"}. 27 | } 28 | \description{ 29 | Fit a hierarchical cluster to text data. Prior to distance measures being 30 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 31 | \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate 32 | the distance matrix supplied to \code{\link[fastcluster]{hclust}}. \code{method} 33 | defaults to \code{"ward.D2"}. A faster cosine dissimilarity calculation is used 34 | under the hood (see \code{\link[clustext]{cosine_distance}}). Additionally, 35 | \code{\link[fastcluster]{hclust}} is used to quickly calculate the fit. 36 | Essentially, this is a wrapper function optimized for clustering text data. 37 | } 38 | \examples{ 39 | library(dplyr) 40 | 41 | x <- with( 42 | presidential_debates_2012, 43 | data_store(dialogue, paste(person, time, sep = "_")) 44 | ) 45 | 46 | hierarchical_cluster(x) \%>\% 47 | plot(k=4) 48 | 49 | hierarchical_cluster(x) \%>\% 50 | plot(h=.7, lwd=2) 51 | 52 | hierarchical_cluster(x) \%>\% 53 | assign_cluster(h=.7) 54 | 55 | \dontrun{ 56 | ## interactive cutting 57 | hierarchical_cluster(x) \%>\% 58 | plot(h=TRUE) 59 | } 60 | 61 | hierarchical_cluster(x, method="complete") \%>\% 62 | plot(k=6) 63 | 64 | hierarchical_cluster(x) \%>\% 65 | assign_cluster(k=6) 66 | 67 | x2 <- presidential_debates_2012 \%>\% 68 | with(data_store(dialogue)) 69 | 70 | myfit2 <- hierarchical_cluster(x2) 71 | 72 | plot(myfit2) 73 | plot(myfit2, 55) 74 | 75 | assign_cluster(myfit2, k = 55) 76 | 77 | ## Example from StackOverflow Question Response 78 | ## Asking fo grouping similar texts together 79 | ## http://stackoverflow.com/q/22936951/1000343 80 | dat <- data.frame( 81 | person = LETTERS[1:3], 82 | text = c("Best way to waste money", 83 | "Amazing stuff. lets you stay connected all the time", 84 | "Instrument to waste money and time"), 85 | stringsAsFactors = FALSE 86 | ) 87 | 88 | 89 | x <- with( 90 | dat, 91 | data_store(text, person) 92 | ) 93 | 94 | 95 | hierarchical_cluster(x) \%>\% 96 | plot(h=.9, lwd=2) 97 | 98 | hierarchical_cluster(x) \%>\% 99 | assign_cluster(h=.9) 100 | 101 | 102 | hierarchical_cluster(x) \%>\% 103 | assign_cluster(h=.9) \%>\% 104 | get_terms() 105 | 106 | hierarchical_cluster(x) \%>\% 107 | assign_cluster(h=.9) \%>\% 108 | get_terms() \%>\% 109 | as_topic() 110 | 111 | hierarchical_cluster(x) \%>\% 112 | assign_cluster(h=.9) \%>\% 113 | get_documents() 114 | } 115 | 116 | -------------------------------------------------------------------------------- /man/jaccard_distance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/jaccard_distance.R 3 | \name{jaccard_distance} 4 | \alias{jaccard_distance} 5 | \alias{jaccard_distance.DocumentTermMatrix} 6 | \alias{jaccard_distance.TermDocumentMatrix} 7 | \title{Optimized Computation of Jaccard Distance} 8 | \usage{ 9 | jaccard_distance(x, ...) 10 | 11 | \method{jaccard_distance}{DocumentTermMatrix}(x, ...) 12 | 13 | \method{jaccard_distance}{TermDocumentMatrix}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A data type (e.g., \code{\link[tm]{DocumentTermMatrix}} or 17 | \code{\link[tm]{TermDocumentMatrix}}).} 18 | 19 | \item{\ldots}{ignored.} 20 | } 21 | \value{ 22 | Returns a jaccard distance object of class \code{"dist"}. 23 | } 24 | \description{ 25 | Utilizes the \pkg{slam} package to efficiently calculate jaccard distance 26 | on large sparse matrices. 27 | } 28 | \examples{ 29 | library(gofastr) 30 | library(dplyr) 31 | 32 | out <- presidential_debates_2012 \%>\% 33 | with(q_dtm(dialogue)) \%>\% 34 | jaccard_distance() 35 | } 36 | \author{ 37 | user41844 of StackOverflow, Dmitriy Selivanov, and Tyler Rinker . 38 | } 39 | \references{ 40 | \url{http://stackoverflow.com/a/36373333/1000343} 41 | \url{http://stats.stackexchange.com/a/89947/7482} 42 | } 43 | \keyword{dissimilarity} 44 | \keyword{jaccard} 45 | 46 | -------------------------------------------------------------------------------- /man/kmeans_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kmeans_cluster.R 3 | \name{kmeans_cluster} 4 | \alias{kmeans_cluster} 5 | \alias{kmeans_cluster.data_store} 6 | \title{Fit a Kmeans Cluster} 7 | \usage{ 8 | kmeans_cluster(x, k, ...) 9 | 10 | \method{kmeans_cluster}{data_store}(x, k, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).} 14 | 15 | \item{k}{The number of clusters.} 16 | 17 | \item{\ldots}{Other arguments passed to \code{\link[stats]{kmeans}}.} 18 | } 19 | \value{ 20 | Returns an object of class \code{"kmeans"}. 21 | } 22 | \description{ 23 | Fit a kmeans cluster to text data. Prior to distance measures being 24 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 25 | \code{\link[tm]{DocumentTermMatrix}}. 26 | } 27 | \examples{ 28 | \dontrun{ 29 | library(dplyr) 30 | 31 | x <- with( 32 | presidential_debates_2012, 33 | data_store(dialogue, paste(person, time, sep = "_")) 34 | ) 35 | 36 | 37 | ## 6 topic model 38 | kmeans_cluster(x, k=6) 39 | 40 | kmeans_cluster(x, k=6) \%>\% 41 | assign_cluster() 42 | 43 | kmeans_cluster(x, k=6) \%>\% 44 | assign_cluster() \%>\% 45 | summary() 46 | 47 | x2 <- presidential_debates_2012 \%>\% 48 | with(data_store(dialogue)) 49 | 50 | myfit2 <- kmeans_cluster(x2, 55) 51 | 52 | assign_cluster(myfit2) 53 | 54 | assign_cluster(myfit2) \%>\% 55 | summary() 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/nmf_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nmf_cluster.R 3 | \name{nmf_cluster} 4 | \alias{nmf_cluster} 5 | \alias{nmf_cluster.data_store} 6 | \title{Fit a Non-Negative Matrix Factorization Cluster} 7 | \usage{ 8 | nmf_cluster(x, k = k, ...) 9 | 10 | \method{nmf_cluster}{data_store}(x, k, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).} 14 | 15 | \item{k}{The number of clusters.} 16 | 17 | \item{\ldots}{Other arguments passed to \code{\link[rNMF]{rnmf}}.} 18 | } 19 | \value{ 20 | Returns an object of class \code{"hclust"}. 21 | } 22 | \description{ 23 | Fit a robust non-negative matrix factorization cluster to text data via 24 | \code{\link[rNMF]{rnmf}}. Prior to distance measures being 25 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 26 | \code{\link[tm]{DocumentTermMatrix}}. 27 | } 28 | \examples{ 29 | library(dplyr) 30 | 31 | x <- with( 32 | presidential_debates_2012, 33 | data_store(dialogue, paste(person, time, sep = "_")) 34 | ) 35 | 36 | 37 | ## 6 topic model 38 | model6 <- nmf_cluster(x, k=6) 39 | 40 | model6 \%>\% 41 | assign_cluster() 42 | 43 | model6 \%>\% 44 | assign_cluster() \%>\% 45 | summary() 46 | \dontrun{ 47 | x2 <- presidential_debates_2012 \%>\% 48 | with(data_store(dialogue)) 49 | 50 | myfit2 <- nmf_cluster(x2, 55) 51 | 52 | assign_cluster(myfit2) 53 | 54 | assign_cluster(myfit2) \%>\% 55 | summary() 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/plot.hierarchical_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hierarchical_cluster.R 3 | \name{plot.hierarchical_cluster} 4 | \alias{plot.hierarchical_cluster} 5 | \title{Plots a hierarchical_cluster Object} 6 | \usage{ 7 | \method{plot}{hierarchical_cluster}(x, k = approx_k(get_dtm(x)), h = NULL, 8 | color = "red", digits = 3, ...) 9 | } 10 | \arguments{ 11 | \item{x}{A hierarchical_cluster object.} 12 | 13 | \item{k}{The number of clusters (can supply \code{h} instead). Defaults to 14 | use \code{approx_k} of the \code{\link[tm]{DocumentTermMatrix}} produced 15 | by \code{data_storage}. Boxes are drawn around the clusters.} 16 | 17 | \item{h}{The height at which to cut the dendrograms (determines number of 18 | clusters). If this argument is supplied \code{k} is ignored. A line is drawn 19 | showing the cut point on the dendrogram. If \code{h} is set to \code{TRUE} 20 | or \code{"locator"} then the cutting becomes interactive and the height is 21 | returned invisibly.} 22 | 23 | \item{color}{The color to make the cluster boxes (\code{k}) or line (\code{h}).} 24 | 25 | \item{digits}{The number o digits to display if h\code{h} is set to 26 | interactive.} 27 | 28 | \item{\ldots}{Other arguments passed to \code{\link[stats]{rect.hclust}} or 29 | \code{\link[graphics]{abline}}.} 30 | } 31 | \description{ 32 | Plots a hierarchical_cluster object 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/presidential_debates_2012.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustext-package.R 3 | \docType{data} 4 | \name{presidential_debates_2012} 5 | \alias{presidential_debates_2012} 6 | \title{2012 U.S. Presidential Debates} 7 | \format{A data frame with 2912 rows and 4 variables} 8 | \usage{ 9 | data(presidential_debates_2012) 10 | } 11 | \description{ 12 | A dataset containing a cleaned version of all three presidential debates for 13 | the 2012 election. 14 | } 15 | \details{ 16 | \itemize{ 17 | \item person. The speaker 18 | \item tot. Turn of talk 19 | \item dialogue. The words spoken 20 | \item time. Variable indicating which of the three debates the dialogue is from 21 | } 22 | } 23 | \keyword{datasets} 24 | 25 | -------------------------------------------------------------------------------- /man/print.as_topic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as_topic.R 3 | \name{print.as_topic} 4 | \alias{print.as_topic} 5 | \title{Prints an as_topic Object} 6 | \usage{ 7 | \method{print}{as_topic}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An as_topic object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints an as_topic object 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/print.assign_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assign_cluster.R 3 | \name{print.assign_cluster} 4 | \alias{print.assign_cluster} 5 | \title{Prints an assign_cluster Object} 6 | \usage{ 7 | \method{print}{assign_cluster}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An assign_cluster object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints an assign_cluster object 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/print.compare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compare.R 3 | \name{print.compare} 4 | \alias{print.compare} 5 | \title{Prints a compare Object.} 6 | \usage{ 7 | \method{print}{compare}(x, digits = 3, ...) 8 | } 9 | \arguments{ 10 | \item{x}{The compare object} 11 | 12 | \item{digits}{Number of decimal places to print.} 13 | 14 | \item{\ldots}{ignored} 15 | } 16 | \description{ 17 | Prints a compare object. 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/print.data_store.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data_store.R 3 | \name{print.data_store} 4 | \alias{print.data_store} 5 | \title{Prints a data_store Object} 6 | \usage{ 7 | \method{print}{data_store}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A data_store object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints a data_store object 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/print.get_documents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_documents.R 3 | \name{print.get_documents} 4 | \alias{print.get_documents} 5 | \title{Prints a get_documents Object} 6 | \usage{ 7 | \method{print}{get_documents}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A get_documents object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints a get_documents object 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/print.get_terms.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_terms.R 3 | \name{print.get_terms} 4 | \alias{print.get_terms} 5 | \title{Prints a get_terms Object} 6 | \usage{ 7 | \method{print}{get_terms}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A get_terms object.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \description{ 15 | Prints a get_terms object 16 | } 17 | 18 | -------------------------------------------------------------------------------- /man/skmeans_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/skmeans_cluster.R 3 | \name{skmeans_cluster} 4 | \alias{skmeans_cluster} 5 | \alias{skmeans_cluster.data_store} 6 | \title{Fit a skmean Cluster} 7 | \usage{ 8 | skmeans_cluster(x, k, ...) 9 | 10 | \method{skmeans_cluster}{data_store}(x, k, ...) 11 | } 12 | \arguments{ 13 | \item{x}{A data store object (see \code{\link[clustext]{data_store}}).} 14 | 15 | \item{k}{The number of clusters.} 16 | 17 | \item{\ldots}{Other arguments passed to \code{\link[skmeans]{skmeans}}.} 18 | } 19 | \value{ 20 | Returns an object of class \code{"skmean"}. 21 | } 22 | \description{ 23 | Fit a skmean cluster to text data. Prior to distance measures being 24 | calculated the tf-idf (see \code{\link[tm]{weightTfIdf}}) is applied to the 25 | \code{\link[tm]{DocumentTermMatrix}}. Cosine dissimilarity is used to generate 26 | the distance matrix supplied to \code{\link[skmeans]{skmeans}}. 27 | } 28 | \examples{ 29 | library(dplyr) 30 | 31 | x <- with( 32 | presidential_debates_2012, 33 | data_store(dialogue, paste(person, time, sep = "_")) 34 | ) 35 | 36 | 37 | ## 6 topic model 38 | myfit1 <- skmeans_cluster(x, k=6) 39 | 40 | myfit1 \%>\% 41 | assign_cluster() 42 | 43 | myfit1 \%>\% 44 | assign_cluster() \%>\% 45 | summary() 46 | 47 | \dontrun{ 48 | x2 <- presidential_debates_2012 \%>\% 49 | with(data_store(dialogue)) 50 | 51 | myfit2 <- skmeans_cluster(x2, 55) 52 | 53 | assign_cluster(myfit2) 54 | 55 | assign_cluster(myfit2) \%>\% 56 | summary() 57 | } 58 | } 59 | 60 | -------------------------------------------------------------------------------- /man/summary.assign_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assign_cluster.R 3 | \name{summary.assign_cluster} 4 | \alias{summary.assign_cluster} 5 | \title{Summary of an assign_cluster Object} 6 | \usage{ 7 | \method{summary}{assign_cluster}(object, plot = TRUE, print = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{An assign_cluster object.} 11 | 12 | \item{plot}{logical. If \code{TRUE} an accompanying bar plot is produced a 13 | well.} 14 | 15 | \item{print}{logical. If \code{TRUE} data.frame counts are printed.} 16 | 17 | \item{\ldots}{ignored.} 18 | } 19 | \description{ 20 | Summary of an assign_cluster object 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/write_cluster_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/write_cluster_text.R 3 | \name{write_cluster_text} 4 | \alias{read_cluster_text} 5 | \alias{write_cluster_text} 6 | \title{Write/Read Cluster Text for Human Categorization} 7 | \usage{ 8 | write_cluster_text(x, path, n.sample = NULL, lead = " * ", ...) 9 | 10 | read_cluster_text(path, ...) 11 | } 12 | \arguments{ 13 | \item{x}{An \code{assign_cluster} object.} 14 | 15 | \item{path}{A pather to the file (.txt) is recommended.} 16 | 17 | \item{n.sample}{The length to limit the sample to (default gives all text in the cluster). 18 | Setting this to an integer uses this as the number to randomly sample from.} 19 | 20 | \item{lead}{A leading character string prefix to give the cluster text.} 21 | 22 | \item{\ldots}{ignored.} 23 | } 24 | \description{ 25 | Write cluster text from \code{get_text(assign_cluster(myfit))} to an external 26 | file for categorization. After file has been written with 27 | \code{write_cluster_text} a human coder can assign categories to each cluster. 28 | Simple write the category after the \code{Cluster #:}. To set a cluster category 29 | equal to another simply write and equal sign follwed by the other cluster to set 30 | as the same category (e.g., \code{Cluster 10: =5} to set cluster #10 the same as 31 | cluster #5). See \code{readLines(system.file("additional/foo_turk.txt", package = "clustext"))} 32 | for an example. 33 | } 34 | \examples{ 35 | library(dplyr) 36 | 37 | ## Assign Clusters 38 | ca <- presidential_debates_2012 \%>\% 39 | with(data_store(dialogue)) \%>\% 40 | hierarchical_cluster() \%>\% 41 | assign_cluster(k = 7) 42 | 43 | ## Write Cluster Text for Human Categorization 44 | write_cluster_text(ca) 45 | write_cluster_text(ca, n.sample=10) 46 | write_cluster_text(ca, lead=" -", n.sample=10) 47 | 48 | ## Read Human Coded Categories Back In 49 | categories_file <- system.file("additional/foo_turk.txt", package = "clustext") 50 | readLines(categories_file) 51 | (categories_key <- read_cluster_text(categories_file)) 52 | 53 | ## Add Categories Back to Original Data Set 54 | categorize( 55 | data = presidential_debates_2012, 56 | assign.cluster = ca, 57 | cluster.key = categories_key 58 | ) 59 | } 60 | \seealso{ 61 | \code{\link[clustext]{categorize}} 62 | } 63 | 64 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("testthat") 2 | library("clustext") 3 | 4 | test_check("clustext") -------------------------------------------------------------------------------- /tests/testthat/test-assign_cluster.R: -------------------------------------------------------------------------------- 1 | context("Checking assign_cluster") 2 | 3 | test_that("assign_cluster ...",{ 4 | 5 | 6 | }) 7 | 8 | -------------------------------------------------------------------------------- /tools/clustext_logo/r_clustext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustext.png -------------------------------------------------------------------------------- /tools/clustext_logo/r_clustext.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustext.pptx -------------------------------------------------------------------------------- /tools/clustext_logo/r_clustexta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/clustext_logo/r_clustexta.png -------------------------------------------------------------------------------- /tools/clustext_logo/resize_icon.txt: -------------------------------------------------------------------------------- 1 | cd C:\Users\Tyler\GitHub\clustext\tools\clustext_logo 2 | ffmpeg -i r_clustexta.png -vf scale=250:-1 r_clustext.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-6-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-2.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-6-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-6-3.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/clustext/19963bc5a63148aa40f29f32bee3430128206215/tools/figure/unnamed-chunk-8-1.png --------------------------------------------------------------------------------