├── README.md ├── NMF.r ├── LICENSE ├── kwictopics.R ├── NMF.rmd ├── LDA.Rmd └── optimal_K.r /README.md: -------------------------------------------------------------------------------- 1 | # Github repo to accompany the useR2018 presentation entitled "Topic Modeling with LDA and NMF from a Qualitative Content Analysis Perspective" 2 | 3 | Slides: https://goo.gl/XoZusf 4 | 5 | LDA.Rmd contains code to perform Latent Dirichlet Allocation using the Topicmodels package. 6 | 7 | NMF.Rmd contains code to use Non Negative Matrix Factorization using the NMF package. 8 | 9 | KwicRTopics.r contains code to load a dev version of the experimental topic model browser that incorporates keywords in context functionality as an R HTMLWidget. The KwicRTopics github repo: https://github.com/aneesha/kwicRtopics 10 | 11 | Also check out the textmineR package as it incorporates some recent research on topic coherence, model selection and labeling: https://cran.r-project.org/web/packages/textmineR/vignettes/c_topic_modeling.html 12 | 13 | Additional Resources: 14 | https://github.com/trinker/topicmodels_learning 15 | -------------------------------------------------------------------------------- /NMF.r: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 7 | 8 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 9 | 10 | ```{r} 11 | plot(cars) 12 | ``` 13 | 14 | Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*. 15 | 16 | When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). 
17 | 18 | The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed. 19 | 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 aneesha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
--------------------------------------------------------------------------------
/kwictopics.R:
--------------------------------------------------------------------------------
# Code to test the KwicRTopics widget: fits an LDA model on survey responses
# and loads the result into the kwictopics HTMLWidget topic browser.

library("htmlwidgets")
library("tm")           # Corpus / tm_map / DocumentTermMatrix (used below)
library("topicmodels")  # LDA (used below)

# navigate to package dir
setwd("kwictopics")

survey_responses_raw <- read.csv("/Users/aneesha/user2018/surveyresponses.csv", stringsAsFactors = FALSE)
# Perform character encoding.
# Fix: write the cleaned text back into the raw data frame --
# `survey_responses` did not exist yet at this point in the original script.
survey_responses_raw$documents <- iconv(survey_responses_raw$documents, "latin1", "ASCII", sub = "")
# Vectorize the documents
survey_responses <- Corpus(VectorSource(as.vector(survey_responses_raw$documents)))
# Text preprocessing
# Convert to lowercase
survey_responses <- tm_map(survey_responses, tolower)
# remove punctuation
survey_responses <- tm_map(survey_responses, removePunctuation)
# remove numbers
survey_responses <- tm_map(survey_responses, removeNumbers)
# remove generic and custom stopwords, then stem
stopword <- c(stopwords("english"), "best")
survey_responses <- tm_map(survey_responses, removeWords, stopword)
survey_responses <- tm_map(survey_responses, stemDocument)

survey_Dtm <- DocumentTermMatrix(survey_responses, control = list(minWordLength = 2))
survey_Dtm2 <- removeSparseTerms(survey_Dtm, sparse = 0.98)

k <- 12      # number of topics
SEED <- 1234

no_top_words <- 6
no_top_documents <- 6

survey.lda <- LDA(survey_Dtm2, k, method = "Gibbs", control = list(seed = SEED))

devtools::install()
library(kwictopics)

kwictopicsWidget(survey.lda, k, survey_responses_raw$documents)

--------------------------------------------------------------------------------
/NMF.rmd:
--------------------------------------------------------------------------------
---
title: "NMF Topic Modeling in R"
output: html_notebook
---

Uses TM for text data pre-processing and topicmodels package.


```{r}
library(NMF)
library(ggplot2)
library(tm)
library(SnowballC)
```

```{r}
survey_responses <- read.csv("surveyresponses.csv", stringsAsFactors = FALSE)
# Perform character encoding
survey_responses$documents <- iconv(survey_responses$documents, "latin1", "ASCII", sub = "")
# Vectorize the documents
survey_responses <- Corpus(VectorSource(as.vector(survey_responses$documents)))
```

```{r}
# Text preprocessing
# convert to lowercase
survey_responses <- tm_map(survey_responses, tolower)
# remove punctuation
survey_responses <- tm_map(survey_responses, removePunctuation)
# remove numbers
survey_responses <- tm_map(survey_responses, removeNumbers)
# remove generic and custom stopwords
stopword <- c(stopwords("english"), "best")
survey_responses <- tm_map(survey_responses, removeWords, stopword)
survey_responses <- tm_map(survey_responses, stemDocument)
```

```{r}
survey_Dtm <- DocumentTermMatrix(survey_responses, control = list(minWordLength = 2))
survey_Dtm2 <- removeSparseTerms(survey_Dtm, sparse = 0.98)

```

```{r}
new_matrix <- as.matrix(survey_Dtm2)
# NOTE(fix): the original assigned `rownames(new_matrix) <- new_matrix[,1]`,
# which overwrote the document ids with the counts of the first term (and can
# produce duplicate rownames). The DTM matrix already carries the document
# names, so no reassignment is needed.

k = 12 # number of topics
# Run NMF
nmf_tm <- nmf(new_matrix, k, "lee", seed = "nndsvd") # nndsvd greatly helps with convergence

w <- basis(nmf_tm) # W matrix (document x topic)
dim(w) # show dimensions
h <- coef(nmf_tm) # H matrix (topic x term)
dim(h) # show dimensions

# print both top words and top documents in a topic
no_top_words = 6
no_top_documents = 6

for (topic_no in 1:k) {
  print(paste("Topic", topic_no))
  words_in_topic <- h[topic_no, ]
  ordered_words <- order(words_in_topic, decreasing = TRUE)
  top_ordered_words <- ordered_words[1:no_top_words]
  print(top_ordered_words)
  for (word in top_ordered_words) {
    print(names(words_in_topic)[word])
  }
}
```
```{r}
# Display graphs to help estimate number of topics
estim.r <- nmf(new_matrix, 2:18, nrun = 4, seed = "nndsvd")
plot(estim.r)
```

--------------------------------------------------------------------------------
/LDA.Rmd:
--------------------------------------------------------------------------------
---
title: "LDA Topic Modeling in R"
output: html_notebook
---

Uses TM for text data pre-processing and topicmodels package.


```{r}
setwd("/Users/aneesha/user2018/")
```

```{r}
library(tm)
library(SnowballC)
library(topicmodels)
```

```{r}
survey_responses_raw <- read.csv("surveyresponses.csv", stringsAsFactors = FALSE)
```

```{r}
# show column names
colnames(survey_responses_raw)
```
```{r}
# display a document
survey_responses_raw$documents[3]
```

```{r}
# Perform character encoding
survey_responses_raw$documents <- iconv(survey_responses_raw$documents, "latin1", "ASCII", sub = "")
# Vectorize the documents
survey_responses <- Corpus(VectorSource(as.vector(survey_responses_raw$documents)))

```

```{r}
# Text pre-processing
# convert to lowercase
survey_responses <- tm_map(survey_responses, tolower)
# remove punctuation
survey_responses <- tm_map(survey_responses, removePunctuation)
# remove numbers
survey_responses <- tm_map(survey_responses, removeNumbers)
# remove stopwords
stopword <- c(stopwords("english"), "best")
survey_responses <- tm_map(survey_responses, removeWords, stopword)
survey_responses <- tm_map(survey_responses, stemDocument)
```

```{r}
# Create document term matrix
survey_Dtm <- DocumentTermMatrix(survey_responses, control = list(minWordLength = 2))
survey_Dtm2 <-
removeSparseTerms(survey_Dtm, sparse = 0.98)

# Find best starting value for the number of topics (k)

source("optimal_K.r")
control <- list(burnin = 500, iter = 1000, keep = 100)

## Determine Optimal Number of Topics
(k <- optimal_k(survey_Dtm2, 20, control = control))
```

```{r}
k = 12 # number of topics
SEED = 1234

no_top_words = 6 # no of top words to show
no_top_documents = 6 # no of top documents to show

# Run LDA
survey.lda <- LDA(survey_Dtm2, k, method = "Gibbs", control = list(seed = SEED))

lda.topics <- as.matrix(topics(survey.lda, 6))
lda.terms <- terms(survey.lda, no_top_words)
lda.terms

# print both top words and top documents in a topic
for (topic_no in 1:k) {
  print(paste("Topic", topic_no))
  print(lda.terms[, topic_no])
  docs_in_topic <- survey.lda@gamma[, topic_no]
  ordered_docs <- order(docs_in_topic, decreasing = TRUE)
  top_ordered_docs <- ordered_docs[1:no_top_documents]
  print(top_ordered_docs)
  for (doc in top_ordered_docs) {
    print(survey_responses_raw$documents[doc])
  }
}

```

--------------------------------------------------------------------------------
/optimal_K.r:
--------------------------------------------------------------------------------
#' Find Optimal Number of Topics
#' Original code from: https://github.com/trinker/topicmodels_learning
#' Iteratively produces models and then compares the harmonic mean of the log
#' likelihoods in a graphical output.
#'
#' @param x A \code{\link[tm]{DocumentTermMatrix}}.
#' @param max.k Maximum number of topics to fit (start small [i.e., default of
#' 30] and add as necessary).
#' @param harmonic.mean Logical. If \code{TRUE} the harmonic means of the
#' log likelihoods are used to determine k (see
#' \url{http://stackoverflow.com/a/21394092/1000343}). Otherwise just the log
#' likelihoods are graphed against k (see
#' \url{http://stats.stackexchange.com/a/25128/7482}).
#' @param control A named list of control parameters passed on to
#' \code{topicmodels::LDA}; when \code{harmonic.mean = TRUE} it must contain
#' \code{burnin} and \code{keep}.
#' @param burnin Object of class \code{"integer"}; number of omitted Gibbs
#' iterations at beginning, by default equals 0.
#' @param iter Object of class \code{"integer"}; number of Gibbs iterations, by
#' default equals 2000.
#' @param keep Object of class \code{"integer"}; if a positive integer, the
#' log likelihood is saved every keep iterations.
#' @param method The method to be used for fitting; currently
#' \code{method = "VEM"} or \code{method= "Gibbs"} are supported.
#' @param verbose Logical. If \code{TRUE} progress messages and the chosen
#' number of topics are printed to the console.
#' @param drop.seed Logical. If \code{TRUE} \code{seed} argument is dropped from
#' \code{control}.
#' @param \ldots Other arguments passed to \code{??LDAcontrol}.
#' @return Returns the \code{\link[base]{data.frame}} of k (number of topics) and
#' the associated log likelihood.
#' @references \url{http://stackoverflow.com/a/21394092/1000343} \cr
#' \url{http://stats.stackexchange.com/a/25128/7482} \cr
#' Ponweiser, M. (2012). Latent Dirichlet Allocation in R (Diploma Thesis).
#' Vienna University of Economics and Business, Vienna.
#' http://epub.wu.ac.at/3558/1/main.pdf \cr\cr
#' Griffiths, T.L., and Steyvers, M. (2004). Finding scientific topics.
#' Proceedings of the National Academy of Sciences of the United States of America
#' 101(Suppl 1), 5228 - 5235. \url{http://www.pnas.org/content/101/suppl_1/5228.full.pdf}
#' @keywords k topicmodel
#' @export
#' @author Ben Marwick and Tyler Rinker.
#' @examples
#' ## Install/Load Tools & Data
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_gh("trinker/gofastr")
#' pacman::p_load(tm, topicmodels, dplyr, tidyr, devtools, LDAvis, ggplot2)
#'
#'
#' ## Source topicmodels2LDAvis function
#' devtools::source_url("https://gist.githubusercontent.com/trinker/477d7ae65ff6ca73cace/raw/79dbc9d64b17c3c8befde2436fdeb8ec2124b07b/topicmodels2LDAvis")
#'
#' data(presidential_debates_2012)
#'
#'
#' ## Generate Stopwords
#' stops <- c(
#'     tm::stopwords("english"),
#'     "governor", "president", "mister", "obama","romney"
#' ) %>%
#'     gofastr::prep_stopwords()
#'
#'
#' ## Create the DocumentTermMatrix
#' doc_term_mat <- presidential_debates_2012 %>%
#'     with(gofastr::q_dtm_stem(dialogue, paste(person, time, sep = "_"))) %>%
#'     gofastr::remove_stopwords(stops) %>%
#'     gofastr::filter_tf_idf() %>%
#'     gofastr::filter_documents()
#'
#'
#' opti_k1 <- optimal_k(doc_term_mat)
#' opti_k1
#'
#' opti_k2 <- optimal_k(doc_term_mat, harmonic.mean = FALSE)
#' opti_k2
optimal_k <- function(x, max.k = 30, harmonic.mean = TRUE,
    control = if (harmonic.mean) list(burnin = 500, iter = 1000, keep = 100) else NULL,
    method = if (harmonic.mean) "Gibbs" else "VEM", verbose = TRUE, drop.seed = TRUE, ...){

    # Dropping the seed lets each LDA fit choose its own; otherwise every k
    # would be fit with the identical seed from `control`.
    if (isTRUE(drop.seed)){
        control[["seed"]] <- NULL
    }

    # Dispatch: harmonic-mean-of-log-likelihood search vs. plain log-likelihood curve.
    if (isTRUE(harmonic.mean)) {
        optimal_k1(x, max.k = max.k, control = control, method = method, verbose = verbose, ...)
    } else {
        optimal_k2(x, max.k = max.k, control = control, method = method, ...)
    }
}

#' Plots an optimal_k1 Object
#'
#' Plots an optimal_k1 object (harmonic mean of the log likelihood against k,
#' with the optimal k highlighted in red).
#'
#' @param x A \code{optimal_k1} object.
#' @param \ldots Ignored.
#' @method plot optimal_k1
#' @export
plot.optimal_k1 <- function(x, ...){

    # Row of the k data.frame corresponding to the chosen (optimal) k.
    y <- attributes(x)[["k_dataframe"]]
    y <- y[y[["k"]] == as.numeric(x), ]

    ggplot2::ggplot(attributes(x)[["k_dataframe"]], ggplot2::aes_string(x="k", y="harmonic_mean")) +
        ggplot2::xlab(sprintf("Number of Topics (Optimal Number: %s)", as.numeric(x))) +
        ggplot2::ylab("Harmonic Mean of Log Likelihood") +
        ggplot2::geom_smooth(method = "loess", fill=NA) +
        ggplot2::geom_point(data=y, color="red", fill=NA, size = 6, shape = 21) +
        ggplot2::geom_line(size=1) +
        ggplot2::theme_bw() +
        ggplot2::theme(
            axis.title.x = ggplot2::element_text(vjust = -0.25, size = 14),
            axis.title.y = ggplot2::element_text(size = 14, angle=90)
        )
}

#' Prints a optimal_k Object
#'
#' Prints a optimal_k object by plotting it (dispatches to the matching
#' \code{plot} method).
#'
#' @param x A \code{optimal_k} object.
#' @param \ldots Ignored.
#' @method print optimal_k
#' @export
print.optimal_k <- function(x, ...){

    print(graphics::plot(x))

}




# Workhorse for harmonic.mean = TRUE: fits an LDA for each k in 2:max.k and
# returns the k with the largest harmonic mean of the post-burn-in log
# likelihoods (classed "optimal_k"/"optimal_k1", with the k data.frame attached).
optimal_k1 <- function(x, max.k = 30,
    control = list(burnin = 500, iter = 1000, keep = 100), method = "Gibbs",
    verbose = TRUE, ...){


    if (max.k > 20) {
        message("\nGrab a cup of coffee this could take a while...\n")
        flush.console()
    }

    tic <- Sys.time()
    # Progress bookkeeping: one row per 10 fits; time is modeled as ~ k^2 so the
    # remaining-time estimate is a linear fit of elapsed minutes against k^2.
    v <- rep(NA, floor(max.k/10))
    dat <- data.frame(k = v, time = v)
    end <- data.frame(k = max.k^2)

    hm_many <- sapply(2:max.k, function(k){
        if (k %% 10 == 0){
            time <- as.numeric(difftime(Sys.time(), tic, units = "mins"))
            dat[k/10, 1:2] <<- c(k^2, time)
            if (k/10 > 1) {
                fit <- with(dat, lm(time~k))
                pred <- predict(fit, end) - time
                if (pred < 0) pred <- 0  # never report negative remaining time
                est <- paste0("; Remaining: ~", time2char(pred), " mins")
            } else {
                est <- ""
            }
            cur <- format(Sys.time(), format="%I:%M:%S")
            elapsed <- time2char(time)
            cat(sprintf("%s of %s iterations (Current: %s; Elapsed: %s mins%s)\n", k, max.k, cur, elapsed, est)); flush.console()
        }
        burnin <- control[["burnin"]]
        keep <- control[["keep"]]
        # fix: scalar condition -- use short-circuit || rather than elementwise |
        if (is.null(burnin) || is.null(keep)) stop("Supply burnin & keep to control")
        fitted <- topicmodels::LDA(x, k = k, method = method, control = control)
        # Drop the burn-in portion of the saved log likelihoods before averaging.
        logLiks <- fitted@logLiks[-c(1:(burnin/keep))]
        harmonicMean(logLiks)
    })

    out <- c(2:max.k)[which.max(hm_many)]
    # fix: hm_many indexes k = 2:max.k, so which.max() can never equal max.k;
    # compare the selected k itself against the upper bound instead.
    if (out == max.k) warning("Optimal K is last value; suggest increasing `max.k`")
    class(out) <- c("optimal_k", "optimal_k1", class(out))
    attributes(out)[["k_dataframe"]] <- data.frame(
        k = 2:max.k,
        harmonic_mean = hm_many
    )
    if (isTRUE(verbose)) cat(sprintf("Optimal number of topics = %s\n",as.numeric(out)))
    out
}

# Numerically stable harmonic mean of log likelihoods, computed in extended
# precision via Rmpfr (see http://stackoverflow.com/a/21394092/1000343).
harmonicMean <- function(logLikelihoods, precision=2000L) {
    llMed <- Rmpfr::median(logLikelihoods)
    as.double(llMed - log(Rmpfr::mean(exp(-Rmpfr::mpfr(logLikelihoods, prec = precision) + llMed))))
}

# Workhorse for harmonic.mean = FALSE: fits an LDA for each k in 2:max.k and
# returns a data.frame of k vs. log likelihood (classed "optimal_k"/"optimal_k2").
optimal_k2 <- function(x, max.k = 30, control = NULL, method = "VEM", ...){

    if (max.k > 20) {
        message("\nGrab a cup of coffee this could take a while...\n")
        flush.console()
    }

    tic <- Sys.time()
    # Same progress/remaining-time bookkeeping as optimal_k1.
    v <- rep(NA, floor(max.k/10))
    dat <- data.frame(k = v, time = v)
    end <- data.frame(k = max.k^2)

    best_model <- lapply(seq(2, max.k, by=1), function(k){
        if (k %% 10 == 0){
            time <- as.numeric(difftime(Sys.time(), tic, units = "mins"))
            dat[k/10, 1:2] <<- c(k^2, time)
            if (k/10 > 1) {
                fit <- with(dat, lm(time~k))
                pred <- predict(fit, end) - time
                if (pred < 0) pred <- 0  # consistency with optimal_k1: clamp at zero
                est <- paste0("; Remaining: ~", time2char(pred), " mins")
            } else {
                est <- ""
            }
            cur <- format(Sys.time(), format="%I:%M:%S")
            elapsed <- time2char(time)
            cat(sprintf("%s of %s iterations (Current: %s; Elapsed: %s mins%s)\n", k, max.k, cur, elapsed, est)); flush.console()
        }
        topicmodels::LDA(x, k = k, method = method, control = control, ...)
    })

    out <- data.frame(
        k = c(2:max.k),
        logLik = sapply(best_model, logLik)
    )

    class(out) <- c("optimal_k", "optimal_k2", "data.frame")
    out
}

# Format minutes for display: round to 1 decimal and strip a leading zero
# ("0.6" -> ".6"), but keep a bare "0" intact.
time2char <- function(x){
    x <- as.character(round(x, 1))
    if (identical("0", x)) return(x)
    gsub("^0+", "", x)
}

#' Plots an optimal_k2 Object
#'
#' Plots an optimal_k2 object (log likelihood against number of topics).
#'
#' @param x A \code{optimal_k2} object.
#' @param \ldots Ignored.
#' @method plot optimal_k2
#' @export
plot.optimal_k2 <- function(x, ...){

    ggplot2::ggplot(x, ggplot2::aes_string(x="k", y="logLik")) +
        ggplot2::xlab("Number of Topics") +
        ggplot2::ylab("Log Likelihood") +
        ggplot2::geom_smooth(size=.8, se=FALSE, method="loess") +
        ggplot2::geom_line(size=1) +
        ggplot2::theme_bw() +
        ggplot2::theme(
            axis.title.x = ggplot2::element_text(vjust = -0.25, size = 14),
            axis.title.y = ggplot2::element_text(size = 14, angle=90)
        )

}


#pacman::p_load(ggplot2, topicmodels, Rmpfr)
--------------------------------------------------------------------------------