├── LICENSE ├── data └── TwentyNewsgroups.rda ├── docs ├── newsgroup │ ├── cache │ │ ├── __packages │ │ ├── json_936adc7103a3e761125b22efedb2144e.rdb │ │ ├── json_936adc7103a3e761125b22efedb2144e.rdx │ │ └── json_936adc7103a3e761125b22efedb2144e.RData │ ├── vis │ │ ├── lda.css │ │ └── index.html │ ├── newsgroup.Rmd │ ├── newsgroup.md │ └── newsgroup.html ├── AP │ ├── AP.Rmd │ ├── vis │ │ ├── lda.css │ │ └── index.html │ ├── AP.md │ └── AP.html ├── Jeopardy │ ├── Jeopardy.Rmd │ ├── Jeopardy.md │ ├── vis │ │ ├── lda.css │ │ └── index.html │ └── Jeopardy.html ├── reviews │ ├── vis │ │ ├── lda.css │ │ └── index.html │ ├── reviews.md │ ├── reviews.Rmd │ └── reviews.html └── render.R ├── .gitignore ├── Makefile ├── cran-comments.md ├── .Rbuildignore ├── NAMESPACE ├── R ├── imports.R ├── data.R ├── runShiny.R ├── shiny.R ├── serVis.R └── createJSON.R ├── inst ├── examples │ ├── shiny │ │ ├── ui.R │ │ └── server.R │ └── rmarkdown.Rmd ├── htmljs │ ├── lda.css │ └── index.html ├── shiny │ └── shinyLDAvis.js ├── CITATION └── languages │ └── dictionary.txt ├── LDAvis.Rproj ├── man ├── visOutput.Rd ├── jsPCA.Rd ├── renderVis.Rd ├── TwentyNewsgroups.Rd ├── runShiny.Rd ├── serVis.Rd └── createJSON.Rd ├── .travis.yml ├── DESCRIPTION ├── NEWS ├── README.md └── vignettes └── details.Rnw /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2013 2 | COPYRIGHT HOLDER: AT&T Intellectual Property -------------------------------------------------------------------------------- /data/TwentyNewsgroups.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpsievert/LDAvis/HEAD/data/TwentyNewsgroups.rda -------------------------------------------------------------------------------- /docs/newsgroup/cache/__packages: -------------------------------------------------------------------------------- 1 | base 2 | datasets 3 | utils 4 | grDevices 5 | graphics 6 | stats 7 | LDAvis 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rhistory 2 | .Rproj.user 3 | .DS_Store 4 | 5 | docs/*/*_cache 6 | 7 | inst/examples/*/* 8 | !inst/examples/*/*.R 9 | !inst/examples/*/*.Rmd 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: site 2 | 3 | clean: 4 | rm -r docs/*/vis 5 | rm docs/*/*.html 6 | rm docs/*/*.md 7 | 8 | site: 9 | cd docs && Rscript render.R && cd .. 
10 | -------------------------------------------------------------------------------- /docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpsievert/LDAvis/HEAD/docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.rdb -------------------------------------------------------------------------------- /docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpsievert/LDAvis/HEAD/docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.rdx -------------------------------------------------------------------------------- /docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpsievert/LDAvis/HEAD/docs/newsgroup/cache/json_936adc7103a3e761125b22efedb2144e.RData -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local OS X install, R 3.1.2 3 | * win-builder (devel and release) 4 | 5 | ## R CMD check results 6 | There were no ERRORs, WARNINGs or NOTEs. 7 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .Rhistory 4 | .travis.yml 5 | .push_gh_pages.sh 6 | cran-comments.md 7 | inst/examples/*/* 8 | !inst/examples/*/*.R 9 | !inst/examples/*/*.Rmd 10 | 11 | docs 12 | Makefile -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(createJSON) 4 | export(jsPCA) 5 | export(renderVis) 6 | export(runShiny) 7 | export(serVis) 8 | export(visOutput) 9 | importFrom(utils,read.csv) 10 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | #' @importFrom digest digest 2 | #' @importFrom parallel parLapply 3 | #' @importFrom RJSONIO toJSON 4 | #' @importFrom proxy dist 5 | #' @importFrom utils packageVersion browseURL 6 | #' @importFrom stats cmdscale 7 | #' @importFrom utils read.csv 8 | -------------------------------------------------------------------------------- /inst/examples/shiny/ui.R: -------------------------------------------------------------------------------- 1 | library(LDAvis) 2 | shinyUI( 3 | fluidPage( 4 | sliderInput("nTerms", "Number of terms to display", min = 20, max = 40, value = 30), 5 | textOutput("termClicked"), 6 | textOutput("topicClicked"), 7 | visOutput('myChart') 8 | ) 9 | ) -------------------------------------------------------------------------------- /docs/AP/AP.Rmd: -------------------------------------------------------------------------------- 1 | Associated Press 2 | ================= 3 | 4 | [Click here](http://cpsievert.github.io/LDAvis/AP/vis) to see the result of the code below: 5 | 6 | ```{r} 7 | library("LDAvis") 8 | data(AP, package = "LDAvisData") 9 | json <- with(AP, createJSON(phi, theta, doc.length, vocab, term.frequency)) 10 | serVis(json, out.dir = 'vis', 
open.browser = FALSE) 11 | ``` 12 | -------------------------------------------------------------------------------- /LDAvis.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /docs/Jeopardy/Jeopardy.Rmd: -------------------------------------------------------------------------------- 1 | Jeopardy Data 2 | ============== 3 | 4 | [Click here](http://cpsievert.github.io/LDAvis/Jeopardy/vis) to see the result of the code below: 5 | 6 | ```{r} 7 | library("LDAvis") 8 | data(Jeopardy, package = "LDAvisData") 9 | json <- with(Jeopardy, createJSON(phi, theta, doc.length, vocab, term.frequency)) 10 | serVis(json, out.dir = 'vis', open.browser = FALSE) 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/Jeopardy/Jeopardy.md: -------------------------------------------------------------------------------- 1 | Jeopardy Data 2 | ============== 3 | 4 | [Click here](http://cpsievert.github.io/LDAvis/Jeopardy/vis) to see the result of the code below: 5 | 6 | 7 | ```r 8 | library("LDAvis") 9 | data(Jeopardy, package = "LDAvisData") 10 | json <- with(Jeopardy, createJSON(phi, theta, doc.length, vocab, term.frequency)) 11 | serVis(json, out.dir = 'vis', open.browser = FALSE) 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/AP/vis/lda.css: -------------------------------------------------------------------------------- 1 | path { 2 | fill: none; 3 | stroke: none; 4 | } 5 | 6 | .xaxis .tick.major { 7 | fill: black; 8 | stroke: black; 9 | stroke-width: 0.1; 10 | opacity: 0.7; 11 | } 12 | 13 | .slideraxis { 14 | fill: black; 15 | stroke: black; 16 | stroke-width: 0.4; 17 | opacity: 1; 18 | } 19 | 20 | text { 21 | font-family: sans-serif; 22 | font-size: 11px; 23 | } -------------------------------------------------------------------------------- /inst/htmljs/lda.css: -------------------------------------------------------------------------------- 1 | path { 2 | fill: none; 3 | stroke: none; 4 | } 5 | 6 | .xaxis .tick.major { 7 | fill: black; 8 | stroke: black; 9 | stroke-width: 0.1; 10 | opacity: 0.7; 11 | } 12 | 13 | .slideraxis { 14 | fill: black; 15 | stroke: black; 16 | stroke-width: 0.4; 17 | opacity: 1; 18 | } 19 | 20 | text { 21 | font-family: sans-serif; 22 | font-size: 11px; 23 | } -------------------------------------------------------------------------------- /docs/Jeopardy/vis/lda.css: -------------------------------------------------------------------------------- 1 | path { 2 | fill: none; 3 | stroke: none; 4 | } 5 | 6 | .xaxis .tick.major { 7 | fill: black; 8 | stroke: black; 9 | stroke-width: 0.1; 10 | opacity: 0.7; 11 | } 12 | 13 | .slideraxis { 14 | fill: black; 15 | stroke: black; 16 | stroke-width: 0.4; 17 | opacity: 1; 18 | } 19 | 20 | text { 21 | font-family: sans-serif; 22 | font-size: 11px; 23 | } -------------------------------------------------------------------------------- /docs/newsgroup/vis/lda.css: -------------------------------------------------------------------------------- 1 | path { 2 | 
fill: none; 3 | stroke: none; 4 | } 5 | 6 | .xaxis .tick.major { 7 | fill: black; 8 | stroke: black; 9 | stroke-width: 0.1; 10 | opacity: 0.7; 11 | } 12 | 13 | .slideraxis { 14 | fill: black; 15 | stroke: black; 16 | stroke-width: 0.4; 17 | opacity: 1; 18 | } 19 | 20 | text { 21 | font-family: sans-serif; 22 | font-size: 11px; 23 | } -------------------------------------------------------------------------------- /docs/reviews/vis/lda.css: -------------------------------------------------------------------------------- 1 | path { 2 | fill: none; 3 | stroke: none; 4 | } 5 | 6 | .xaxis .tick.major { 7 | fill: black; 8 | stroke: black; 9 | stroke-width: 0.1; 10 | opacity: 0.7; 11 | } 12 | 13 | .slideraxis { 14 | fill: black; 15 | stroke: black; 16 | stroke-width: 0.4; 17 | opacity: 1; 18 | } 19 | 20 | text { 21 | font-family: sans-serif; 22 | font-size: 11px; 23 | } -------------------------------------------------------------------------------- /docs/render.R: -------------------------------------------------------------------------------- 1 | knit_examples <- function() { 2 | old <- getwd() 3 | on.exit(setwd(old)) 4 | dirs <- dir() 5 | # keep only directories 6 | dirs <- dirs[file_test("-d", dirs)] 7 | # navigate into each example and knit individually 8 | for (i in dirs) { 9 | setwd(i) 10 | knitr::knit2html(input = paste0(i, ".Rmd"), envir = new.env()) 11 | setwd(old) 12 | } 13 | } 14 | 15 | knit_examples() 16 | -------------------------------------------------------------------------------- /docs/AP/AP.md: -------------------------------------------------------------------------------- 1 | Associated Press 2 | ================= 3 | 4 | [Click here](http://cpsievert.github.io/LDAvis/AP/vis) to see the result of the code below: 5 | 6 | 7 | ```r 8 | library("LDAvis") 9 | data(AP, package = "LDAvisData") 10 | json <- with(AP, createJSON(phi, theta, doc.length, vocab, term.frequency)) 11 | serVis(json, out.dir = 'vis', open.browser = FALSE) 12 | ``` 13 | 14 | ``` 15 | ## Loading required namespace: servr 16 | ``` 17 | -------------------------------------------------------------------------------- /man/visOutput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shiny.R 3 | \name{visOutput} 4 | \alias{visOutput} 5 | \title{Shiny ui output function} 6 | \usage{ 7 | visOutput(outputId) 8 | } 9 | \arguments{ 10 | \item{outputId}{output variable to read the plot from} 11 | } 12 | \description{ 13 | Shiny ui output function 14 | } 15 | \seealso{ 16 | http://shiny.rstudio.com/articles/building-outputs.html 17 | } 18 | -------------------------------------------------------------------------------- /docs/AP/vis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /inst/htmljs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/reviews/vis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/Jeopardy/vis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /docs/newsgroup/vis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | LDAvis 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: r 4 | sudo: true 5 | dist: trusty 6 | 7 | r: 8 | # - oldrel 9 | - release 10 | - devel 11 | 12 | # system requirements for rJava and topicmodels packages 13 | # http://stackoverflow.com/questions/16438073/unable-to-install-rjava-in-r-3-0-in-ubuntu-13-04?rq=1 14 | # http://stackoverflow.com/questions/25759007/error-installing-topicmodels-package-non-zero-exit-status-ubuntu 15 | before_install: 16 | - sudo apt-get install libgsl0-dev openjdk-7-* 17 | 18 | r_packages: devtools 19 | -------------------------------------------------------------------------------- /man/jsPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createJSON.R 3 | \name{jsPCA} 4 | \alias{jsPCA} 5 | \title{Dimension reduction via Jensen-Shannon Divergence & Principal Components} 6 | \usage{ 7 | jsPCA(phi) 8 | } 9 | \arguments{ 10 | \item{phi}{matrix, with each row containing the distribution over terms 11 | for a topic, with as many rows as there are topics in the model, and as 12 | many columns as there are terms in the vocabulary.} 13 | } 14 | \description{ 15 | Dimension reduction via Jensen-Shannon Divergence & Principal Components 16 | } 17 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Twenty Newsgroups Data 2 | #' 3 | #' @format A list elements extracted from a topic model fit to this data 4 | #' \describe{ 5 | #' \item{phi}{phi, a matrix with the topic-term distributions} 6 | #' \item{theta}{theta, a matrix with the document-topic distributions} 7 | #' \item{doc.length}{doc.length, a numeric vector with token counts for each document} 8 | #' \item{vocab}{vocab, a character vector containing the terms} 9 | #' \item{term.frequency}{term.frequency, a numeric vector of observed term frequencies} 10 | #' } 11 | #' @source \url{http://qwone.com/~jason/20Newsgroups/} 12 | "TwentyNewsgroups" 13 | -------------------------------------------------------------------------------- /inst/examples/shiny/server.R: -------------------------------------------------------------------------------- 1 | library(LDAvis) 2 | library(shiny) 3 | shinyServer(function(input, output, session) { 4 | output$myChart <- renderVis({ 5 | with(TwentyNewsgroups, 6 | createJSON(phi, theta, doc.length, vocab, term.frequency, 7 | R = input$nTerms)) 8 | }) 9 | 10 | output$termClicked <- renderPrint({ 11 | if (is.null(input$myChart_term_click)) return() 12 | paste("You clicked on term:", input$myChart_term_click) 13 | }) 14 | 15 | output$topicClicked <- renderPrint({ 16 | if (is.null(input$myChart_topic_click)) return() 17 | paste("You clicked on topic:", input$myChart_topic_click) 18 | }) 19 | }) 20 | -------------------------------------------------------------------------------- /inst/shiny/shinyLDAvis.js: -------------------------------------------------------------------------------- 1 | var ldavisBinding = new Shiny.OutputBinding(); 2 | 3 | ldavisBinding.find = function(scope) { 4 | return $(scope).find(".shinyLDAvis"); 5 | }; 6 | 7 | ldavisBinding.renderValue = function(el, data) { 8 | // remove 
the old graph 9 | // http://stackoverflow.com/questions/14422198/how-do-i-remove-all-children-elements-from-a-node-and-them-apply-them-again-with 10 | var old_plot = d3.select(el).selectAll("*").remove(); 11 | // add the new plot 12 | var json_file = "ldavisAssets/" + data.jsonFile; 13 | var to_select = "#" + el.id; 14 | var vis = new LDAvis(to_select, json_file); 15 | 16 | }; 17 | 18 | Shiny.outputBindings.register(ldavisBinding, "cpsievert.ldavisBinding"); 19 | -------------------------------------------------------------------------------- /man/renderVis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/shiny.R 3 | \name{renderVis} 4 | \alias{renderVis} 5 | \title{Create an LDAvis output element} 6 | \usage{ 7 | renderVis(expr, env = parent.frame(), quoted = FALSE) 8 | } 9 | \arguments{ 10 | \item{expr}{An expression that generates a plot.} 11 | 12 | \item{env}{The environment in which to evaluate \code{expr}.} 13 | 14 | \item{quoted}{Is expr a quoted expression (with \code{quote()})? This is useful if you want to save an expression in a variable.} 15 | } 16 | \description{ 17 | Shiny server output function customized for animint plots 18 | (similar to \code{shiny::plotOutput} and friends). 19 | } 20 | \seealso{ 21 | http://shiny.rstudio.com/articles/building-outputs.html 22 | } 23 | -------------------------------------------------------------------------------- /man/TwentyNewsgroups.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{TwentyNewsgroups} 5 | \alias{TwentyNewsgroups} 6 | \title{Twenty Newsgroups Data} 7 | \format{A list elements extracted from a topic model fit to this data 8 | \describe{ 9 | \item{phi}{phi, a matrix with the topic-term distributions} 10 | \item{theta}{theta, a matrix with the document-topic distributions} 11 | \item{doc.length}{doc.length, a numeric vector with token counts for each document} 12 | \item{vocab}{vocab, a character vector containing the terms} 13 | \item{term.frequency}{term.frequency, a numeric vector of observed term frequencies} 14 | }} 15 | \source{ 16 | \url{http://qwone.com/~jason/20Newsgroups/} 17 | } 18 | \usage{ 19 | TwentyNewsgroups 20 | } 21 | \description{ 22 | Twenty Newsgroups Data 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /R/runShiny.R: -------------------------------------------------------------------------------- 1 | #' Run shiny/D3 visualization 2 | #' 3 | #' This function is deprecated as of version 0.2 4 | #' 5 | #' @param phi a matrix with W rows, one for each term in the vocabulary, and K 6 | #' columns, one for each topic, where each column sums to one. Each column is the 7 | #' multinomial distribution over terms for a given topic in an LDA topic model. 8 | #' @param term.frequency an integer vector of length W containing the frequency 9 | #' of each term in the vocabulary. 10 | #' @param vocab a character vector of length W containing the unique terms in 11 | #' the corpus. 12 | #' @param topic.proportion a numeric vector of length K containing the proportion 13 | #' of each topic in the corpus. 
14 | #' @export 15 | 16 | runShiny <- function(phi, term.frequency, vocab, topic.proportion) { 17 | message("`runShiny` is deprecated as of version 0.2, please use `createJSON`") 18 | return(NULL) 19 | } 20 | -------------------------------------------------------------------------------- /man/runShiny.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/runShiny.R 3 | \name{runShiny} 4 | \alias{runShiny} 5 | \title{Run shiny/D3 visualization} 6 | \usage{ 7 | runShiny(phi, term.frequency, vocab, topic.proportion) 8 | } 9 | \arguments{ 10 | \item{phi}{a matrix with W rows, one for each term in the vocabulary, and K 11 | columns, one for each topic, where each column sums to one. Each column is the 12 | multinomial distribution over terms for a given topic in an LDA topic model.} 13 | 14 | \item{term.frequency}{an integer vector of length W containing the frequency 15 | of each term in the vocabulary.} 16 | 17 | \item{vocab}{a character vector of length W containing the unique terms in 18 | the corpus.} 19 | 20 | \item{topic.proportion}{a numeric vector of length K containing the proportion 21 | of each topic in the corpus.} 22 | } 23 | \description{ 24 | This function is deprecated as of version 0.2 25 | } 26 | -------------------------------------------------------------------------------- /inst/examples/rmarkdown.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using LDAvis with shiny & rmarkdown" 3 | author: "Carson Sievert" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | runtime: shiny 7 | --- 8 | 9 | This document details how to make **LDAvis** react to [**shiny**](http://cran.r-project.org/web/packages/shiny/index.html) inputs. In theory, any `createJSON()` argument *could* be dynamically altered in a [shiny app](http://shiny.rstudio.com/) or [interactive document](http://rmarkdown.rstudio.com/authoring_shiny.html). In practice, this might not be such a great idea since `createJSON()` can take a few seconds to run (note in this example it's about 5 seconds). 10 | 11 | Anyway, here is one way to alter the number of terms shown using a `shiny::sliderInput()`. 12 | 13 | ```{r} 14 | library(shiny) 15 | sliderInput("nTerms", "Number of terms to display", min = 20, max = 40, value = 30) 16 | ``` 17 | 18 | By wrapping a call to `createJSON()` with `renderVis()`, the output will update whenever the value of `input$nTerms` changes. 
19 | 20 | ```{r} 21 | library(LDAvis) 22 | data(TwentyNewsgroups, package = "LDAvis") 23 | renderVis({ 24 | with(TwentyNewsgroups, 25 | createJSON(phi, theta, doc.length, vocab, term.frequency, 26 | R = input$nTerms)) 27 | }) 28 | ``` -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite LDAvis in publications use:") 2 | 3 | year <- sub('.*(2[[:digit:]]{3})-.*', '\\1', meta$Date, perl = TRUE) 4 | vers <- paste('R package version', meta$Version) 5 | 6 | bibentry( 7 | bibtype = "Article", 8 | title = "LDAvis: A method for visualizing and interpreting topics.", 9 | author = personList(as.person("Carson Sievert"), as.person("Kenny Shirley")), 10 | journal = "Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.", 11 | year = "2014", 12 | url = "http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf", 13 | 14 | textVersion = 15 | paste("Carson Sievert and Kenny Shirley (2014).", 16 | "LDAvis: A method for visualizing and interpreting topics.", 17 | "Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.", 18 | "URL http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf.") 19 | ) 20 | 21 | citEntry( 22 | entry = 'manual', 23 | title = paste('LDAvis:', meta$Title), 24 | author = as.person(meta$Author), 25 | year = year, 26 | note = vers, 27 | url = meta$URL, 28 | textVersion = paste('Carson Sievert and Kenny Shirley (', year, '). LDAvis: ', meta$Title, '. ', vers, '.', sep = '') 29 | ) 30 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: LDAvis 2 | Title: Interactive Visualization of Topic Models 3 | Version: 0.3.5 4 | Authors@R: c( 5 | person("Carson", "Sievert", role = c("aut", "cre"), email = "cpsievert1@gmail.com", comment = c(ORCID = "0000-0002-4958-2844")), 6 | person("Kenny", "Shirley", role = "aut", email = "kshirley@research.att.com"), 7 | person("Christopher", "Baker", role = "ctb", email = "chriscrewbaker@gmail.com") 8 | ) 9 | Description: Tools to create an interactive web-based visualization of a 10 | topic model that has been fit to a corpus of text data using 11 | Latent Dirichlet Allocation (LDA). Given the estimated parameters of 12 | the topic model, it computes various summary statistics as input to 13 | an interactive visualization built with 'D3.js' that is accessed via 14 | a browser. The goal is to help users interpret the topics in their 15 | 'LDA' topic model. 
16 | Depends: 17 | R (>= 3.4.0) 18 | Imports: 19 | proxy, 20 | RJSONIO, 21 | parallel 22 | License: MIT + file LICENSE 23 | Suggests: 24 | mallet, 25 | lda, 26 | topicmodels, 27 | gistr (>= 0.0.8.99), 28 | servr, 29 | shiny, 30 | knitr, 31 | rmarkdown, 32 | digest, 33 | htmltools 34 | LazyData: true 35 | VignetteBuilder: knitr 36 | URL: https://github.com/cpsievert/LDAvis 37 | BugReports: https://github.com/cpsievert/LDAvis/issues 38 | RoxygenNote: 6.0.1 39 | -------------------------------------------------------------------------------- /inst/languages/dictionary.txt: -------------------------------------------------------------------------------- 1 | base, polish 2 | "Marginal topic distribution", "Rozkład brzegowy tematów" 3 | "Intertopic Distance Map (via multidimensional scaling)", "Mapa odległości między tematami (na bazie skalowania wielowymiarowego)" 4 | "Overall term frequency", "Całkowita częstość terminu" 5 | "Estimated term frequency within the selected topic", "Przewidywana częstość terminu w wybranym temacie" 6 | "1. saliency(term w) = frequency(w) * [sum_t p(t | w) * log(p(t | w)/p(t))] for topics t; see Chuang et. al (2012)", "1. widoczność(terminu w) = częstość(w) * [sum_t p(t | w) * log(p(t | w)/p(t))] dla tematu t; Chuang et. al (2012)", 7 | ""2. relevance(term w | topic t) = \u03BB * p(w | t) + (1 - \u03BB) * p(w | t)/p(w); see Sievert & Shirley (2014)"", "2. związek(terminu w | temat t) = \u03BB * p(w | t) + (1 - \u03BB) * p(w | t)/p(w); Sievert & Shirley (2014)", 8 | " Most Salient Terms", " Najbardziej Widocznych Terminów " 9 | "Slide to adjust relevance metric:", "Dopasuj miarę związku terminów z tematami" 10 | " Most Relevant Terms for Topic ", " Najbardziej Związanych Terminów z Tematem " 11 | "% of tokens)", "% wyrazów)" 12 | "Conditional topic distribution given term = '", "Warunkowy rozkład tematów przy zadanym terminie = '" 13 | "Previous Topic", "Poprzedni Temat" 14 | "Next Topic", "Następny Temat" 15 | "Clear Topic", "Wyczyść" 16 | "Selected Topic: 55 | 56 | ### Video demos 57 | 58 | * [Visualizing & Exploring the Twenty Newsgroup Data](http://stat-graphics.org/movies/ldavis.html) 59 | * [Visualizing Topic Models demo with Hacker News Corpus](https://www.youtube.com/watch?v=tGxW2BzC_DU) 60 | * [Notebook w/Visualization](http://nbviewer.ipython.org/github/bmabey/hacker_news_topic_modelling/blob/master/HN%20Topic%20Model%20Talk.ipynb) 61 | * [Slide deck](https://speakerdeck.com/bmabey/visualizing-topic-models) 62 | 63 | ### More documentation 64 | 65 | To read about the methodology behind LDAvis, see [our paper](http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf), which we presented at the [2014 ACL Workshop on Interactive Language Learning, Visualization, and Interfaces](http://nlp.stanford.edu/events/illvi2014/) in Baltimore on June 27, 2014. 66 | 67 | ### Additional data 68 | 69 | We included one data set in LDAvis, 'TwentyNewsgroups', which consists of a list with 5 elements: 70 | - phi, a matrix with the topic-term distributions 71 | - theta, a matrix with the document-topic distributions 72 | - doc.length, a numeric vector with token counts for each document 73 | - vocab, a character vector containing the terms 74 | - term.frequency, a numeric vector of observed term frequencies 75 | 76 | We also created a second data-only package called [LDAvisData](https://github.com/cpsievert/LDAvisData) to hold additional example data sets. 
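For a quick start, the bundled data set can be loaded directly; the extra examples described just below live in that companion package, which (as noted in the examples) can be installed from GitHub with devtools. A minimal sketch:

```r
library(LDAvis)

# the data set shipped with LDAvis: a list with phi, theta, doc.length,
# vocab, and term.frequency
data(TwentyNewsgroups, package = "LDAvis")
str(TwentyNewsgroups)

# the additional examples live in the data-only LDAvisData package
# devtools::install_github("cpsievert/LDAvisData")
data(AP, package = "LDAvisData")
json <- with(AP, createJSON(phi, theta, doc.length, vocab, term.frequency))
serVis(json)
```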
Currently there are three more examples available there: 77 | - Movie Reviews (a 20-topic model fit to 2,000 movie reviews) 78 | - AP (a 40-topic model fit to approximately 2,246 news articles) 79 | - Jeopardy (a 100-topic model fit to approximately 20,000 Jeopardy questions) 80 | -------------------------------------------------------------------------------- /R/serVis.R: -------------------------------------------------------------------------------- 1 | #' View and/or share LDAvis in a browser 2 | #' 3 | #' @description View and/or share LDAvis in a browser. 4 | #' 5 | #' @details This function will place the necessary html/js/css files (located in 6 | #' \code{system.file("htmljs", package = "LDAvis")}) in a directory specified 7 | #' by \code{out.dir}, start a local file server in that directory (if necessary), 8 | #' and (optionally) open the default browser in this directory. 9 | #' If \code{as.gist=TRUE}, it will attempt to upload these files as a gist (in this 10 | #' case, please make sure you have the gistr package installed as well as your 11 | #' 'github.username' and 'github.password' set in \link{options}.) 12 | #' 13 | #' @param json character string output from \link{createJSON}. 14 | #' @param out.dir directory to store html/js/json files. 15 | #' @param open.browser Should R open a browser? If yes, this function will 16 | #' attempt to create a local file server via the servr package. 17 | #' This is necessary since the javascript needs to access local files and most 18 | #' browsers will not allow this. 19 | #' @param as.gist should the vis be uploaded as a gist? Will prompt for an 20 | #' interactive login if the GITHUB_PAT environment variable is not set. For more 21 | #' details, see \url{https://github.com/ropensci/gistr#authentication}. 22 | #' @param ... arguments passed onto \code{gistr::gist_create} 23 | #' @param language Which language to use in visualization? So far: \code{english} or \code{polish}. 24 | #' @param encoding Sets the encoding to be used when writing the JSON file. 25 | #' 26 | #' @return An invisible object. 27 | #' @seealso \link{createJSON} 28 | #' @export 29 | #' @author Carson Sievert 30 | #' @importFrom utils read.csv 31 | #' @examples 32 | #' 33 | #' \dontrun{ 34 | #' # Use of serVis is documented here: 35 | #' help(createJSON, package = "LDAvis") 36 | #' } 37 | 38 | serVis <- function(json, out.dir = tempfile(), open.browser = interactive(), 39 | as.gist = FALSE, language = "english", encoding = getOption("encoding"), ...) 
{ 40 | 41 | stopifnot(is.character(language), length(language) == 1, language %in% c('english', 'polish')) 42 | 43 | ## Copy html/js/css files to out.dir 44 | dir.create(out.dir) 45 | src.dir <- system.file("htmljs", package = "LDAvis") 46 | to.copy <- Sys.glob(file.path(src.dir, "*")) 47 | file.copy(to.copy, out.dir, overwrite = TRUE, recursive = TRUE) 48 | 49 | ## Substitute words to different language if required 50 | if (language != 'english') { 51 | ldavis.js <- readLines(file.path(out.dir, "ldavis.js")) # changes are made only in this file 52 | lang.dict <- read.csv(system.file("languages/dictionary.txt", 53 | package = "LDAvis")) # read the dictionary 54 | for (i in 1:nrow(lang.dict)){ # substitute sentences row by row 55 | ldavis.js <- gsub(x = ldavis.js, pattern = lang.dict[i, 1], 56 | replacement = lang.dict[i, language], fixed = TRUE) 57 | } 58 | # lambda coordinate to display its value 59 | if (language == 'polish') { 60 | ldavis.js[674] <- gsub(ldavis.js[674], pattern = "80", replacement ="175", fixed = TRUE) 61 | } 62 | # save new language version 63 | write(ldavis.js, file = file.path(out.dir, "ldavis.js")) 64 | } 65 | 66 | ## Write json to out.dir 67 | con <- file(file.path(out.dir, "lda.json"), encoding = encoding) 68 | on.exit(close.connection(con)) 69 | cat(json, file = con) 70 | 71 | ## Try to upload gist 72 | if (as.gist) { 73 | gistd <- requireNamespace('gistr') 74 | if (!gistd) { 75 | warning("Please run `devtools::install_github('rOpenSci/gistr')` 76 | to upload files to https://gist.github.com") 77 | } else { 78 | gist <- gistr::gist_create(file.path(out.dir, list.files(out.dir)), ...) 79 | if (interactive()) gist 80 | url_name <- paste("http://bl.ocks.org", gist$id, sep = "/") 81 | if (open.browser) utils::browseURL(url_name) 82 | } 83 | return(invisible()) 84 | } 85 | 86 | servd <- requireNamespace('servr') 87 | if (open.browser) { 88 | if (!servd) { 89 | message("If the visualization doesn't render, install the servr package\n", 90 | "and re-run serVis: \n install.packages('servr') \n", 91 | "Alternatively, you could configure your default browser to allow\n", 92 | "access to local files as some browsers block this by default") 93 | utils::browseURL(sprintf("%s/index.html", out.dir)) 94 | } else { 95 | servr::httd(dir = out.dir) 96 | } 97 | } 98 | return(invisible()) 99 | } 100 | -------------------------------------------------------------------------------- /docs/newsgroup/newsgroup.Rmd: -------------------------------------------------------------------------------- 1 | A topic model for the Twenty Newsgroups data 2 | ============================================ 3 | [LDAvis](https://github.com/cpsievert/LDAvis/) comes prepackaged with some data sets to help quickly demonstrate how to use it. This document visualizes a topic model fit to the 'Twenty Newsgroups' data created with **LDAvis** and **knitr** ([see here](https://github.com/cpsievert/LDAvis/blob/master/inst/examples/newsgroup/newsgroup.Rmd) for source code). 4 | 5 | First, we downloaded the data from the [home page for the Twenty Newsgroups data](http://qwone.com/~jason/20Newsgroups/). Specifically, we used the '20news-bydate' version of the data and fit our topic model to the 'training' portion of the data. The raw training data consists of $D = 11,269$ documents, a vocabulary of $W = 53,975$ terms, and $N = 2,765,300$ total tokens in the corpus. Each document is a message posted to one of twenty selected Usenet newsgroups during a time span roughly between June, 1992 and May, 1993. 
It appears that the documents were tokenized by splitting on punctuation and whitespace. We remove all occurrences of the 174 stop words contained in the "English" stop words list in the R package **tm**. We also removed all occurrences of terms that occurred less than a total of ten times. This left $W = 15,954$ terms in the vocabulary and a total of $N = 1,511,137$ tokens in the corpus. One document was removed because it contained only stop words and rare words. 6 | 7 | We fit a $K=50$-topic model to the corpus (allowing for topics other than the 20 standard newsgroups topics to be discovered) by running the collapsed Gibbs sampler for 10,000 iterations using symmetric priors for the document-topic distributions ($\alpha = 0.02$) and the topic-term distributions ($\beta = 0.02$). We used MALLET to fit the model. We computed estimates of the document-topic distributions (stored in a $D \times K$ matrix denoted $\theta$) and the topic-term distributions (stored in a $K \times W$ matrix denoted $\phi$) by cross-tabulating the latent topic assignments from the last iteration of the Gibbs sampler with the document IDs and the term IDs, and then adding pseudocounts to account for the priors. A better estimate might average over multiple MCMC iterations of latent topic assignments (assuming the MCMC has settled into a local mode of the posterior and there is no label-switching going on), but we don't worry about that for now. 8 | 9 | To visualize the fitted model using `LDAvis`, we load the data object `TwentyNewsgroups`, which is a list containing five elements. 10 | 11 | ```{r data, message = FALSE} 12 | library(LDAvis) 13 | data("TwentyNewsgroups", package = "LDAvis") 14 | str(TwentyNewsgroups) 15 | ``` 16 | 17 | The first two elements are $\phi$ and $\theta$. Both of these are matrices whose rows must sum to one, since their rows contain probability distributions over terms and topics, respectively. 18 | 19 | The third element of the list is `doc.length`, which is an integer vector of length $D = 11,268$ containing the number of tokens in each document. For this data the median document length is 81 tokens, with a range of 1 to 6409. 20 | 21 | The fourth element of the list is `vocab`, which is a character vector containing the terms in the vocabulary, in the same order as the columns of $\phi$. 22 | 23 | The fifth element of the list is `term.frequency`, which is an integer vector containing the frequencies of the terms in the vocabulary. The median term frequency is 27 with a range of 10 to 12,289 ('edu' is the most frequent term, because the data contain email addresses and tokenization was performed by splitting on punctuation). 24 | 25 | At this point, we call the R function `createJSON()` to create a JSON object that will feed the web-based visualization. The `createJSON()` function performs several operations: 26 | 27 | - It computes topic frequencies, inter-topic distances, and a projection of the topics onto a two-dimensional space using multidimensional scaling. 28 | 29 | - It computes the $R$ most relevant terms for each topic for a grid of values of $\lambda$ (determined by the argument `lambda.step`, set to 0.01 by default), where the relevance of a term to a topic is defined as $\lambda \times p(term \mid topic) + (1 - \lambda) \times p(term \mid topic)/p(term)$, for $0 \leq \lambda \leq 1$. 
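As a concrete illustration of that ranking, the chunk below (not evaluated) applies the formula above directly to a single topic and lists its top terms; `createJSON()` performs this computation for you across all topics and a grid of $\lambda$ values.

```{r relevance_example, eval = FALSE}
# illustrative only: rank terms for one topic by the relevance formula above
lambda <- 0.6
topic <- 1
p.w.given.t <- TwentyNewsgroups$phi[topic, ]        # p(term | topic)
p.w <- TwentyNewsgroups$term.frequency /
  sum(TwentyNewsgroups$term.frequency)              # p(term)
relevance <- lambda * p.w.given.t + (1 - lambda) * p.w.given.t / p.w
head(TwentyNewsgroups$vocab[order(relevance, decreasing = TRUE)], 10)
```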
30 | 31 | The idea is to help users interpret topics by allowing them to interactively re-rank the most relevant terms for each topic by changing the value of $\lambda$ via a slider, where large values of $\lambda$ highly rank frequent words within a topic, and low values of $\lambda$ highly rank exclusive words within a topic. The topic plot on the left side of **LDAvis** allows users to browse groups of similar topics (positioned near each other in the 2-d plot) or simply progress through the topics in order (they are, by default, ordered in decreasing order of frequency). For more on relevance, see our paper about **LDAvis** [here](http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf). 32 | 33 | ```{r json, results = 'hide', cache = TRUE} 34 | json <- with(TwentyNewsgroups, 35 | createJSON(phi = phi, theta = theta, vocab = vocab, 36 | doc.length = doc.length, term.frequency = term.frequency)) 37 | ``` 38 | 39 | Now, the `serVis` function can take `json` and serve the result in a variety of ways. Here we write `json` to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result [here](http://cpsievert.github.io/LDAvis/newsgroup/vis). 40 | 41 | ```{r serVis, message = FALSE} 42 | serVis(json, out.dir = "vis", open.browser = FALSE) 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /docs/newsgroup/newsgroup.md: -------------------------------------------------------------------------------- 1 | A topic model for the Twenty Newsgroups data 2 | ============================================ 3 | [LDAvis](https://github.com/cpsievert/LDAvis/) comes prepackaged with some data sets to help quickly demonstrate how to use it. This document visualizes a topic model fit to the 'Twenty Newsgroups' data created with **LDAvis** and **knitr** ([see here](https://github.com/cpsievert/LDAvis/blob/master/inst/examples/newsgroup/newsgroup.Rmd) for source code). 4 | 5 | First, we downloaded the data from the [home page for the Twenty Newsgroups data](http://qwone.com/~jason/20Newsgroups/). Specifically, we used the '20news-bydate' version of the data and fit our topic model to the 'training' portion of the data. The raw training data consists of $D = 11,269$ documents, a vocabulary of $W = 53,975$ terms, and $N = 2,765,300$ total tokens in the corpus. Each document is a message posted to one of twenty selected Usenet newsgroups during a time span roughly between June, 1992 and May, 1993. It appears that the documents were tokenized by splitting on punctuation and whitespace. We remove all occurrences of the 174 stop words contained in the "English" stop words list in the R package **tm**. We also removed all occurrences of terms that occurred less than a total of ten times. This left $W = 15,954$ terms in the vocabulary and a total of $N = 1,511,137$ tokens in the corpus. One document was removed because it contained only stop words and rare words. 6 | 7 | We fit a $K=50$-topic model to the corpus (allowing for topics other than the 20 standard newsgroups topics to be discovered) by running the collapsed Gibbs sampler for 10,000 iterations using symmetric priors for the document-topic distributions ($\alpha = 0.02$) and the topic-term distributions ($\beta = 0.02$). We used MALLET to fit the model. 
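(For reference, a rough sketch of this step using the **mallet** R package is shown below; the exact preprocessing and invocation behind the fit reported here are not recorded in this document, so the object names, stopword file, and iteration count should be treated as placeholders.)

```r
# sketch only: fit a 50-topic model via the mallet package
# doc.ids, doc.texts, and "stopwords.txt" are placeholders
library(mallet)
instances <- mallet.import(doc.ids, doc.texts, "stopwords.txt")
topic.model <- MalletLDA(num.topics = 50, alpha.sum = 1, beta = 0.02)  # alpha.sum/50 = 0.02 per topic
topic.model$loadDocuments(instances)
topic.model$train(10000)
# smoothed, normalized estimates of theta (D x K) and phi (K x W)
theta <- mallet.doc.topics(topic.model, smoothed = TRUE, normalized = TRUE)
phi <- mallet.topic.words(topic.model, smoothed = TRUE, normalized = TRUE)
```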
We computed estimates of the document-topic distributions (stored in a $D \times K$ matrix denoted $\theta$) and the topic-term distributions (stored in a $K \times W$ matrix denoted $\phi$) by cross-tabulating the latent topic assignments from the last iteration of the Gibbs sampler with the document IDs and the term IDs, and then adding pseudocounts to account for the priors. A better estimate might average over multiple MCMC iterations of latent topic assignments (assuming the MCMC has settled into a local mode of the posterior and there is no label-switching going on), but we don't worry about that for now. 8 | 9 | To visualize the fitted model using `LDAvis`, we load the data object `TwentyNewsgroups`, which is a list containing five elements. 10 | 11 | 12 | ```r 13 | library(LDAvis) 14 | data("TwentyNewsgroups", package = "LDAvis") 15 | str(TwentyNewsgroups) 16 | ``` 17 | 18 | ``` 19 | ## List of 5 20 | ## $ phi : num [1:50, 1:15954] 5.78e-07 1.26e-04 2.77e-06 2.98e-04 4.55e-07 ... 21 | ## $ theta : num [1:11268, 1:50] 2.28e-05 2.08e-04 6.06e-04 5.88e-04 1.02e-04 ... 22 | ## $ doc.length : int [1:11268] 878 95 32 33 196 23 44 83 38 179 ... 23 | ## $ vocab : chr [1:15954] "archive" "name" "atheism" "resources" ... 24 | ## $ term.frequency: int [1:15954] 317 1364 300 226 327 1832 125 108 1002 208 ... 25 | ``` 26 | 27 | The first two elements are $\phi$ and $\theta$. Both of these are matrices whose rows must sum to one, since their rows contain probability distributions over terms and topics, respectively. 28 | 29 | The third element of the list is `doc.length`, which is an integer vector of length $D = 11,268$ containing the number of tokens in each document. For this data the median document length is 81 tokens, with a range of 1 to 6409. 30 | 31 | The fourth element of the list is `vocab`, which is a character vector containing the terms in the vocabulary, in the same order as the columns of $\phi$. 32 | 33 | The fifth element of the list is `term.frequency`, which is an integer vector containing the frequencies of the terms in the vocabulary. The median term frequency is 27 with a range of 10 to 12,289 ('edu' is the most frequent term, because the data contain email addresses and tokenization was performed by splitting on punctuation). 34 | 35 | At this point, we call the R function `createJSON()` to create a JSON object that will feed the web-based visualization. The `createJSON()` function performs several operations: 36 | 37 | - It computes topic frequencies, inter-topic distances, and a projection of the topics onto a two-dimensional space using multidimensional scaling. 38 | 39 | - It computes the $R$ most relevant terms for each topic for a grid of values of $\lambda$ (determined by the argument `lambda.step`, set to 0.01 by default), where the relevance of a term to a topic is defined as $\lambda \times p(term \mid topic) + (1 - \lambda) \times p(term \mid topic)/p(term)$, for $0 \leq \lambda \leq 1$. 40 | 41 | The idea is to help users interpret topics by allowing them to interactively re-rank the most relevant terms for each topic by changing the value of $\lambda$ via a slider, where large values of $\lambda$ highly rank frequent words within a topic, and low values of $\lambda$ highly rank exclusive words within a topic. The topic plot on the left side of **LDAvis** allows users to browse groups of similar topics (positioned near each other in the 2-d plot) or simply progress through the topics in order (they are, by default, ordered in decreasing order of frequency). 
For more on relevance, see our paper about **LDAvis** [here](http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf). 42 | 43 | 44 | ```r 45 | json <- with(TwentyNewsgroups, 46 | createJSON(phi = phi, theta = theta, vocab = vocab, 47 | doc.length = doc.length, term.frequency = term.frequency)) 48 | ``` 49 | 50 | Now, the `serVis` function can take `json` and serve the result in a variety of ways. Here we write `json` to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result [here](http://cpsievert.github.io/LDAvis/newsgroup/vis). 51 | 52 | 53 | ```r 54 | serVis(json, out.dir = "vis", open.browser = FALSE) 55 | ``` 56 | 57 | -------------------------------------------------------------------------------- /man/createJSON.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/createJSON.R 3 | \name{createJSON} 4 | \alias{createJSON} 5 | \title{Create the JSON object to read into the javascript visualization} 6 | \usage{ 7 | createJSON(phi = matrix(), theta = matrix(), doc.length = integer(), 8 | vocab = character(), term.frequency = integer(), R = 30, 9 | lambda.step = 0.01, mds.method = jsPCA, cluster, plot.opts = list(xlab = 10 | "PC1", ylab = "PC2"), reorder.topics = TRUE, ...) 11 | } 12 | \arguments{ 13 | \item{phi}{matrix, with each row containing the distribution over terms 14 | for a topic, with as many rows as there are topics in the model, and as 15 | many columns as there are terms in the vocabulary.} 16 | 17 | \item{theta}{matrix, with each row containing the probability distribution 18 | over topics for a document, with as many rows as there are documents in the 19 | corpus, and as many columns as there are topics in the model.} 20 | 21 | \item{doc.length}{integer vector containing the number of tokens in each 22 | document of the corpus.} 23 | 24 | \item{vocab}{character vector of the terms in the vocabulary (in the same 25 | order as the columns of \code{phi}). Each term must have at least one 26 | character.} 27 | 28 | \item{term.frequency}{integer vector containing the frequency of each term 29 | in the vocabulary.} 30 | 31 | \item{R}{integer, the number of terms to display in the barcharts 32 | of the interactive viz. Default is 30. Recommended to be roughly 33 | between 10 and 50.} 34 | 35 | \item{lambda.step}{a value between 0 and 1. 36 | Determines the interstep distance in the grid of lambda 37 | values over which to iterate when computing relevance. 38 | Default is 0.01. Recommended to be between 0.01 and 0.1.} 39 | 40 | \item{mds.method}{a function that takes \code{phi} as an input and outputs 41 | a K by 2 data.frame (or matrix). The output approximates the distance 42 | between topics. See \link{jsPCA} for details on the default method.} 43 | 44 | \item{cluster}{a cluster object created from the \link{parallel} package. 45 | If supplied, computations are performed using \link{parLapply} instead 46 | of \link{lapply}.} 47 | 48 | \item{plot.opts}{a named list used to customize various plot elements. 
49 | By default, the x and y axes are labeled "PC1" and "PC2" 50 | (principal components 1 and 2), since \link{jsPCA} is the default 51 | scaling method.} 52 | 53 | \item{reorder.topics}{whether to re-order the K topics in order 54 | of decreasing proportion.} 55 | 56 | \item{...}{not currently used.} 57 | } 58 | \value{ 59 | A string containing JSON content which can be written to a file 60 | or feed into \link{serVis} for easy viewing/sharing. One element of this 61 | string is the new ordering of the topics. 62 | } 63 | \description{ 64 | This function creates the JSON object that feeds the visualization template. 65 | For a more detailed overview, 66 | see \code{vignette("details", package = "LDAvis")} 67 | } 68 | \details{ 69 | The function first computes the topic frequencies (across the whole 70 | corpus), and then it reorders the topics in decreasing order of 71 | frequency. The main computation is to loop through the topics and through the 72 | grid of lambda values (determined by \code{lambda.step}) 73 | to compute the \code{R} most 74 | \emph{relevant} terms for each topic and value of lambda. 75 | } 76 | \examples{ 77 | 78 | \dontrun{ 79 | data(TwentyNewsgroups, package="LDAvis") 80 | # create the json object, start a local file server, open in default browser 81 | json <- with(TwentyNewsgroups, 82 | createJSON(phi, theta, doc.length, vocab, term.frequency)) 83 | serVis(json) # press ESC or Ctrl-C to kill 84 | 85 | # createJSON() reorders topics in decreasing order of term frequency 86 | RJSONIO::fromJSON(json)$topic.order 87 | 88 | # You may want to just write the JSON and other dependency files 89 | # to a folder named TwentyNewsgroups under the working directory 90 | serVis(json, out.dir = 'TwentyNewsgroups', open.browser = FALSE) 91 | # then you could use a server of your choice; for example, 92 | # open your terminal, type `cd TwentyNewsgroups && python -m SimpleHTTPServer` 93 | # then open http://localhost:8000 in your web browser 94 | 95 | # A different data set: the Jeopardy Questions+Answers data: 96 | # Install LDAvisData (the associated data package) if not already installed: 97 | # devtools::install_github("cpsievert/LDAvisData") 98 | library(LDAvisData) 99 | data(Jeopardy, package="LDAvisData") 100 | json <- with(Jeopardy, 101 | createJSON(phi, theta, doc.length, vocab, term.frequency)) 102 | serVis(json) # Check out Topic 22 (bodies of water!) 103 | 104 | # If you have a GitHub account, you can even publish as a gist 105 | # which allows you to easily share with others! 
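# (requires the gistr package; authentication uses the GITHUB_PAT environment variable or prompts for a login, see ?serVis)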
106 | serVis(json, as.gist = TRUE) 107 | 108 | # Run createJSON on a cluster of machines to speed it up 109 | system.time( 110 | json <- with(TwentyNewsgroups, 111 | createJSON(phi, theta, doc.length, vocab, term.frequency)) 112 | ) 113 | # user system elapsed 114 | # 14.415 0.800 15.066 115 | library("parallel") 116 | cl <- makeCluster(detectCores() - 1) 117 | cl # socket cluster with 3 nodes on host 'localhost' 118 | system.time( 119 | json <- with(TwentyNewsgroups, 120 | createJSON(phi, theta, doc.length, vocab, term.frequency, 121 | cluster = cl)) 122 | ) 123 | # user system elapsed 124 | # 2.006 0.361 8.822 125 | 126 | # another scaling method (svd + tsne) 127 | library("tsne") 128 | svd_tsne <- function(x) tsne(svd(x)$u) 129 | json <- with(TwentyNewsgroups, 130 | createJSON(phi, theta, doc.length, vocab, term.frequency, 131 | mds.method = svd_tsne, 132 | plot.opts = list(xlab="", ylab="") 133 | ) 134 | ) 135 | serVis(json) # Results in a different topic layout in the left panel 136 | 137 | } 138 | } 139 | \references{ 140 | Sievert, C. and Shirley, K. (2014) \emph{LDAvis: A Method for 141 | Visualizing and Interpreting Topics}, ACL Workshop on Interactive 142 | Language Learning, Visualization, and Interfaces. 143 | \url{http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf} 144 | } 145 | \seealso{ 146 | \link{serVis} 147 | } 148 | -------------------------------------------------------------------------------- /docs/reviews/reviews.md: -------------------------------------------------------------------------------- 1 | A topic model for movie reviews 2 | ======================================================== 3 | 4 | In this document, we fit an [LDA topic model](http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) to the [Cornell Movie Review Data](http://www.cs.cornell.edu/people/pabo/movie-review-data/) introduced by Pang, Lee, and Vaidyanathan in their 2002 EMNLP paper, where we use 'polarity dataset version 2.0' (introduced in a subsequent Pang and Lee 2004 ACL paper). To fit the model, we used the R package [lda](http://cran.r-project.org/web/packages/lda/) and we visualize the output using [LDAvis](https://github.com/cpsievert/LDAvis). 5 | 6 | 7 | 8 | ### The data 9 | 10 | For convenience, the R package 'LDAvisData' provides data used to supplement LDAvis examples. The package provides an object named `reviews` which is a character vector of length 2000. Each element of that vector contains a single movie review. Note that `reviews` was created using [this script](https://github.com/cpsievert/moviereviews/blob/master/data-raw/reviews.R). 11 | 12 | 13 | ```r 14 | # LDAvisData can be installed from GitHub via 'devtools::install_github("cpsievert/LDAvisData")' 15 | data(reviews, package = "LDAvisData") 16 | ``` 17 | 18 | ### Pre-processing 19 | 20 | Before fitting a topic model, we need to tokenize the text. This dataset is already fairly clean, so we only remove punctuation and some common [stop words](http://en.wikipedia.org/wiki/Stop_words). In particular, we use the english stop words from the [SMART information retrieval system](http://en.wikipedia.org/wiki/SMART_Information_Retrieval_System), available in the R package **tm**. 
21 | 22 | 23 | ```r 24 | # read in some stopwords: 25 | library(tm) 26 | stop_words <- stopwords("SMART") 27 | 28 | # pre-processing: 29 | reviews <- gsub("'", "", reviews) # remove apostrophes 30 | reviews <- gsub("[[:punct:]]", " ", reviews) # replace punctuation with space 31 | reviews <- gsub("[[:cntrl:]]", " ", reviews) # replace control characters with space 32 | reviews <- gsub("^[[:space:]]+", "", reviews) # remove whitespace at beginning of documents 33 | reviews <- gsub("[[:space:]]+$", "", reviews) # remove whitespace at end of documents 34 | reviews <- tolower(reviews) # force to lowercase 35 | 36 | # tokenize on space and output as a list: 37 | doc.list <- strsplit(reviews, "[[:space:]]+") 38 | 39 | # compute the table of terms: 40 | term.table <- table(unlist(doc.list)) 41 | term.table <- sort(term.table, decreasing = TRUE) 42 | 43 | # remove terms that are stop words or occur fewer than 5 times: 44 | del <- names(term.table) %in% stop_words | term.table < 5 45 | term.table <- term.table[!del] 46 | vocab <- names(term.table) 47 | 48 | # now put the documents into the format required by the lda package: 49 | get.terms <- function(x) { 50 | index <- match(x, vocab) 51 | index <- index[!is.na(index)] 52 | rbind(as.integer(index - 1), as.integer(rep(1, length(index)))) 53 | } 54 | documents <- lapply(doc.list, get.terms) 55 | ``` 56 | 57 | ### Using the R package 'lda' for model fitting 58 | 59 | The object `documents` is a length-2000 list where each element represents one document, according to the specifications of the **lda** package. After creating this list, we compute a few statistics about the corpus: 60 | 61 | 62 | ```r 63 | # Compute some statistics related to the data set: 64 | D <- length(documents) # number of documents (2,000) 65 | W <- length(vocab) # number of terms in the vocab (14,568) 66 | doc.length <- sapply(documents, function(x) sum(x[2, ])) # number of tokens per document [312, 288, 170, 436, 291, ...] 67 | N <- sum(doc.length) # total number of tokens in the data (546,827) 68 | term.frequency <- as.integer(term.table) # frequencies of terms in the corpus [8939, 5544, 2411, 2410, 2143, ...] 69 | ``` 70 | 71 | Next, we set up a topic model with 20 topics, relatively diffuse priors for the topic-term distributions ($\eta$ = 0.02) and document-topic distributions ($\alpha$ = 0.02), and we set the collapsed Gibbs sampler to run for 5,000 iterations (slightly conservative to ensure convergence). A visual inspection of `fit$log.likelihood` shows that the MCMC algorithm has converged after 5,000 iterations. This block of code takes about 24 minutes to run on a laptop using a single core 1.7Ghz processor (and 8GB RAM). 72 | 73 | 74 | ```r 75 | # MCMC and model tuning parameters: 76 | K <- 20 77 | G <- 5000 78 | alpha <- 0.02 79 | eta <- 0.02 80 | 81 | # Fit the model: 82 | library(lda) 83 | set.seed(357) 84 | t1 <- Sys.time() 85 | fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab, 86 | num.iterations = G, alpha = alpha, 87 | eta = eta, initial = NULL, burnin = 0, 88 | compute.log.likelihood = TRUE) 89 | t2 <- Sys.time() 90 | t2 - t1 # about 24 minutes on laptop 91 | ``` 92 | 93 | ### Visualizing the fitted model with LDAvis 94 | 95 | To visualize the result using [LDAvis](https://github.com/cpsievert/LDAvis/), we'll need estimates of the document-topic distributions, which we denote by the $D \times K$ matrix $\theta$, and the set of topic-term distributions, which we denote by the $K \times W$ matrix $\phi$. 
We estimate the "smoothed" versions of these distributions ("smoothed" means that we've incorporated the effects of the priors into the estimates) by cross-tabulating the latent topic assignments from the last iteration of the collapsed Gibbs sampler with the documents and the terms, respectively, and then adding pseudocounts according to the priors. A better estimator might average over multiple iterations of the Gibbs sampler (after convergence, assuming that the MCMC is sampling within a local mode and there is no label switching occurring), but we won't worry about that for now. 96 | 97 | 98 | ```r 99 | theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x))) 100 | phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x))) 101 | ``` 102 | 103 | We've already computed the number of tokens per document and the frequency of the terms across the entire corpus. We save these, along with $\phi$, $\theta$, and `vocab`, in a list as the data object `MovieReviews`, which is included in the **LDAvis** package. 104 | 105 | 106 | ```r 107 | MovieReviews <- list(phi = phi, 108 | theta = theta, 109 | doc.length = doc.length, 110 | vocab = vocab, 111 | term.frequency = term.frequency) 112 | ``` 113 | 114 | Now we're ready to call the `createJSON()` function in **LDAvis**. This function will return a character string representing a JSON object used to populate the visualization. The `createJSON()` function computes topic frequencies, inter-topic distances, and projects topics onto a two-dimensional plane to represent their similarity to each other. It also loops through a grid of values of a tuning parameter, $0 \leq \lambda \leq 1$, that controls how the terms are ranked for each topic, where terms are listed in decreasing of *relevance*, where the relevance of term $w$ to topic $t$ is defined as $\lambda \times p(w \mid t) + (1 - \lambda) \times p(w \mid t)/p(w)$. Values of $\lambda$ near 1 give high relevance rankings to *frequent* terms within a given topic, whereas values of $\lambda$ near zero give high relevance rankings to *exclusive* terms within a topic. The set of all terms which are ranked among the top-`R` most relevant terms for each topic are pre-computed by the `createJSON()` function and sent to the browser to be interactively visualized using D3 as part of the JSON object. 115 | 116 | 117 | 118 | 119 | ```r 120 | library(LDAvis) 121 | 122 | # create the JSON object to feed the visualization: 123 | json <- createJSON(phi = MovieReviews$phi, 124 | theta = MovieReviews$theta, 125 | doc.length = MovieReviews$doc.length, 126 | vocab = MovieReviews$vocab, 127 | term.frequency = MovieReviews$term.frequency) 128 | ``` 129 | 130 | The `serVis()` function can take `json` and serve the result in a variety of ways. Here we'll write `json` to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result [here](http://cpsievert.github.io/LDAvis/reviews/vis). 131 | 132 | 133 | ```r 134 | serVis(json, out.dir = 'vis', open.browser = FALSE) 135 | ``` 136 | 137 | If you discover something interesting in your data using **LDAvis**, you can share the result via a URL since the state of the visualization is stored in the URL at all times. 
For example, in the movie review data, you can quickly see that Topic 7 is broadly about comedies by linking directly to the state of LDAvis where the selected Topic is "7" and the value of $\lambda$ is 0.6 with the following URL: 138 | 139 | [http://cpsievert.github.io/LDAvis/reviews/vis/#topic=7&lambda=0.6&term=](http://cpsievert.github.io/LDAvis/reviews/vis/#topic=7&lambda=0.6&term=) 140 | 141 | You can also link to the term that is hovered. For example, when you look at the 30 most relevant terms for Topic 5 using a relevance setting of $\lambda = 0.5$, the term "action" is the 6th bar from the top (i.e. the 6th most relevant term for this topic). The widths of the red and blue bars indicate that there is at least one other topic in which the term "action" appears frequently. By hovering over "action", we see from the following state of **LDAvis** that term "action" also appears frequently in Topic 14 (as the 9th most relevant term): 142 | 143 | http://cpsievert.github.io/LDAvis/reviews/vis/#topic=14&lambda=0.5&term=action 144 | 145 | Comparing these two topics, we can see that Topic 5 discusses action in the context of movies about crime and police, whereas in Topic 14, the term "action"" is also used frequently, but the topic is specifically about kung fu movies with Chinese actors (Jackie Chan and Jet Li, for example). These two topics both make heavy use of the word "action" but in slightly different contexts (i.e. slightly different styles of movies). 146 | 147 | To encode a state of the visualization in the URL, you must include a string after the "/" of the form "#topic=k&labmda=l&term=s", where "k", "l", and "s" are strings representing the topic to be selected, the value of $\lambda$ to be used in the relevance calculation, and the term to be hovered, respectively. If no term hovering is desired, omit "s" from the URL. The topic, "k", will be forced to an integer in $\{0, 1, .., K\}$, and the value of $\lambda$ will be forced to the interval $[0, 1]$, with non-numeric values returning the default state of the visualization (topic = 0, $\lambda$ = 1, term = ""). 148 | -------------------------------------------------------------------------------- /docs/reviews/reviews.Rmd: -------------------------------------------------------------------------------- 1 | A topic model for movie reviews 2 | ======================================================== 3 | 4 | In this document, we fit an [LDA topic model](http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) to the [Cornell Movie Review Data](http://www.cs.cornell.edu/people/pabo/movie-review-data/) introduced by Pang, Lee, and Vaidyanathan in their 2002 EMNLP paper, where we use 'polarity dataset version 2.0' (introduced in a subsequent Pang and Lee 2004 ACL paper). To fit the model, we used the R package [lda](http://cran.r-project.org/web/packages/lda/) and we visualize the output using [LDAvis](https://github.com/cpsievert/LDAvis). 5 | 6 | ```{r setup, echo = FALSE, message = FALSE} 7 | knitr::opts_chunk$set(message = FALSE, warning = FALSE, cache = FALSE, eval = FALSE) 8 | ``` 9 | 10 | ### The data 11 | 12 | For convenience, the R package 'LDAvisData' provides data used to supplement LDAvis examples. The package provides an object named `reviews` which is a character vector of length 2000. Each element of that vector contains a single movie review. Note that `reviews` was created using [this script](https://github.com/cpsievert/moviereviews/blob/master/data-raw/reviews.R). 
13 | 14 | ```{r read_new} 15 | # LDAvisData can be installed from GitHub via 'devtools::install_github("cpsievert/LDAvisData")' 16 | data(reviews, package = "LDAvisData") 17 | ``` 18 | 19 | ### Pre-processing 20 | 21 | Before fitting a topic model, we need to tokenize the text. This dataset is already fairly clean, so we only remove punctuation and some common [stop words](http://en.wikipedia.org/wiki/Stop_words). In particular, we use the english stop words from the [SMART information retrieval system](http://en.wikipedia.org/wiki/SMART_Information_Retrieval_System), available in the R package **tm**. 22 | 23 | ```{r collect_stops} 24 | # read in some stopwords: 25 | library(tm) 26 | stop_words <- stopwords("SMART") 27 | 28 | # pre-processing: 29 | reviews <- gsub("'", "", reviews) # remove apostrophes 30 | reviews <- gsub("[[:punct:]]", " ", reviews) # replace punctuation with space 31 | reviews <- gsub("[[:cntrl:]]", " ", reviews) # replace control characters with space 32 | reviews <- gsub("^[[:space:]]+", "", reviews) # remove whitespace at beginning of documents 33 | reviews <- gsub("[[:space:]]+$", "", reviews) # remove whitespace at end of documents 34 | reviews <- tolower(reviews) # force to lowercase 35 | 36 | # tokenize on space and output as a list: 37 | doc.list <- strsplit(reviews, "[[:space:]]+") 38 | 39 | # compute the table of terms: 40 | term.table <- table(unlist(doc.list)) 41 | term.table <- sort(term.table, decreasing = TRUE) 42 | 43 | # remove terms that are stop words or occur fewer than 5 times: 44 | del <- names(term.table) %in% stop_words | term.table < 5 45 | term.table <- term.table[!del] 46 | vocab <- names(term.table) 47 | 48 | # now put the documents into the format required by the lda package: 49 | get.terms <- function(x) { 50 | index <- match(x, vocab) 51 | index <- index[!is.na(index)] 52 | rbind(as.integer(index - 1), as.integer(rep(1, length(index)))) 53 | } 54 | documents <- lapply(doc.list, get.terms) 55 | ``` 56 | 57 | ### Using the R package 'lda' for model fitting 58 | 59 | The object `documents` is a length-2000 list where each element represents one document, according to the specifications of the **lda** package. After creating this list, we compute a few statistics about the corpus: 60 | 61 | ```{r corpus} 62 | # Compute some statistics related to the data set: 63 | D <- length(documents) # number of documents (2,000) 64 | W <- length(vocab) # number of terms in the vocab (14,568) 65 | doc.length <- sapply(documents, function(x) sum(x[2, ])) # number of tokens per document [312, 288, 170, 436, 291, ...] 66 | N <- sum(doc.length) # total number of tokens in the data (546,827) 67 | term.frequency <- as.integer(term.table) # frequencies of terms in the corpus [8939, 5544, 2411, 2410, 2143, ...] 68 | ``` 69 | 70 | Next, we set up a topic model with 20 topics, relatively diffuse priors for the topic-term distributions ($\eta$ = 0.02) and document-topic distributions ($\alpha$ = 0.02), and we set the collapsed Gibbs sampler to run for 5,000 iterations (slightly conservative to ensure convergence). A visual inspection of `fit$log.likelihood` shows that the MCMC algorithm has converged after 5,000 iterations. This block of code takes about 24 minutes to run on a laptop using a single core 1.7Ghz processor (and 8GB RAM). 
71 | 72 | ```{r MCMC} 73 | # MCMC and model tuning parameters: 74 | K <- 20 75 | G <- 5000 76 | alpha <- 0.02 77 | eta <- 0.02 78 | 79 | # Fit the model: 80 | library(lda) 81 | set.seed(357) 82 | t1 <- Sys.time() 83 | fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab, 84 | num.iterations = G, alpha = alpha, 85 | eta = eta, initial = NULL, burnin = 0, 86 | compute.log.likelihood = TRUE) 87 | t2 <- Sys.time() 88 | t2 - t1 # about 24 minutes on laptop 89 | ``` 90 | 91 | ### Visualizing the fitted model with LDAvis 92 | 93 | To visualize the result using [LDAvis](https://github.com/cpsievert/LDAvis/), we'll need estimates of the document-topic distributions, which we denote by the $D \times K$ matrix $\theta$, and the set of topic-term distributions, which we denote by the $K \times W$ matrix $\phi$. We estimate the "smoothed" versions of these distributions ("smoothed" means that we've incorporated the effects of the priors into the estimates) by cross-tabulating the latent topic assignments from the last iteration of the collapsed Gibbs sampler with the documents and the terms, respectively, and then adding pseudocounts according to the priors. A better estimator might average over multiple iterations of the Gibbs sampler (after convergence, assuming that the MCMC is sampling within a local mode and there is no label switching occurring), but we won't worry about that for now. 94 | 95 | ```{r get_dists} 96 | theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x))) 97 | phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x))) 98 | ``` 99 | 100 | We've already computed the number of tokens per document and the frequency of the terms across the entire corpus. We save these, along with $\phi$, $\theta$, and `vocab`, in a list as the data object `MovieReviews`, which is included in the **LDAvis** package. 101 | 102 | ```{r save_list} 103 | MovieReviews <- list(phi = phi, 104 | theta = theta, 105 | doc.length = doc.length, 106 | vocab = vocab, 107 | term.frequency = term.frequency) 108 | ``` 109 | 110 | Now we're ready to call the `createJSON()` function in **LDAvis**. This function will return a character string representing a JSON object used to populate the visualization. The `createJSON()` function computes topic frequencies, inter-topic distances, and projects topics onto a two-dimensional plane to represent their similarity to each other. It also loops through a grid of values of a tuning parameter, $0 \leq \lambda \leq 1$, that controls how the terms are ranked for each topic, where terms are listed in decreasing of *relevance*, where the relevance of term $w$ to topic $t$ is defined as $\lambda \times p(w \mid t) + (1 - \lambda) \times p(w \mid t)/p(w)$. Values of $\lambda$ near 1 give high relevance rankings to *frequent* terms within a given topic, whereas values of $\lambda$ near zero give high relevance rankings to *exclusive* terms within a topic. The set of all terms which are ranked among the top-`R` most relevant terms for each topic are pre-computed by the `createJSON()` function and sent to the browser to be interactively visualized using D3 as part of the JSON object. 
111 | 112 | ```{r echo=FALSE, eval=TRUE} 113 | data(MovieReviews, package = "LDAvisData") 114 | ``` 115 | 116 | ```{r vis, eval=TRUE} 117 | library(LDAvis) 118 | 119 | # create the JSON object to feed the visualization: 120 | json <- createJSON(phi = MovieReviews$phi, 121 | theta = MovieReviews$theta, 122 | doc.length = MovieReviews$doc.length, 123 | vocab = MovieReviews$vocab, 124 | term.frequency = MovieReviews$term.frequency) 125 | ``` 126 | 127 | The `serVis()` function can take `json` and serve the result in a variety of ways. Here we'll write `json` to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result [here](http://cpsievert.github.io/LDAvis/reviews/vis). 128 | 129 | ```{r serVis, eval=TRUE} 130 | serVis(json, out.dir = 'vis', open.browser = FALSE) 131 | ``` 132 | 133 | If you discover something interesting in your data using **LDAvis**, you can share the result via a URL since the state of the visualization is stored in the URL at all times. For example, in the movie review data, you can quickly see that Topic 7 is broadly about comedies by linking directly to the state of LDAvis where the selected Topic is "7" and the value of $\lambda$ is 0.6 with the following URL: 134 | 135 | [http://cpsievert.github.io/LDAvis/reviews/vis/#topic=7&lambda=0.6&term=](http://cpsievert.github.io/LDAvis/reviews/vis/#topic=7&lambda=0.6&term=) 136 | 137 | You can also link to the term that is hovered. For example, when you look at the 30 most relevant terms for Topic 5 using a relevance setting of $\lambda = 0.5$, the term "action" is the 6th bar from the top (i.e. the 6th most relevant term for this topic). The widths of the red and blue bars indicate that there is at least one other topic in which the term "action" appears frequently. By hovering over "action", we see from the following state of **LDAvis** that term "action" also appears frequently in Topic 14 (as the 9th most relevant term): 138 | 139 | http://cpsievert.github.io/LDAvis/reviews/vis/#topic=14&lambda=0.5&term=action 140 | 141 | Comparing these two topics, we can see that Topic 5 discusses action in the context of movies about crime and police, whereas in Topic 14, the term "action"" is also used frequently, but the topic is specifically about kung fu movies with Chinese actors (Jackie Chan and Jet Li, for example). These two topics both make heavy use of the word "action" but in slightly different contexts (i.e. slightly different styles of movies). 142 | 143 | To encode a state of the visualization in the URL, you must include a string after the "/" of the form "#topic=k&labmda=l&term=s", where "k", "l", and "s" are strings representing the topic to be selected, the value of $\lambda$ to be used in the relevance calculation, and the term to be hovered, respectively. If no term hovering is desired, omit "s" from the URL. The topic, "k", will be forced to an integer in $\{0, 1, .., K\}$, and the value of $\lambda$ will be forced to the interval $[0, 1]$, with non-numeric values returning the default state of the visualization (topic = 0, $\lambda$ = 1, term = ""). -------------------------------------------------------------------------------- /docs/Jeopardy/Jeopardy.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Jeopardy Data 7 | 8 | 19 | 20 | 21 | 53 | 54 | 55 | 59 | 60 | 61 | 62 | 198 | 199 | 200 | 201 | 202 | 203 | 204 |

Jeopardy Data

205 | 206 |

Click here to see the result of the code below:

207 | 208 |
library("LDAvis")
209 | data(Jeopardy, package = "LDAvisData")
210 | json <- with(Jeopardy, createJSON(phi, theta, doc.length, vocab, term.frequency))
211 | serVis(json, out.dir = 'vis', open.browser = FALSE)
212 | 
213 | 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /docs/AP/AP.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Associated Press 7 | 8 | 19 | 20 | 21 | 53 | 54 | 55 | 59 | 60 | 61 | 62 | 198 | 199 | 200 | 201 | 202 | 203 | 204 |

Associated Press

205 | 206 |

Click here to see the result of the code below:

207 | 208 |
library("LDAvis")
209 | data(AP, package = "LDAvisData")
210 | json <- with(AP, createJSON(phi, theta, doc.length, vocab, term.frequency))
211 | serVis(json, out.dir = 'vis', open.browser = FALSE)
212 | 
213 | 214 |
## Loading required namespace: servr
215 | 
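The message above simply indicates that serVis() loads the servr package. If you write the visualization to a directory (as above) and want to preview it later, servr can also be used directly; a minimal sketch, assuming servr is installed:

```r
# serve the previously written 'vis' directory over a local web server;
# open the printed URL in a browser (press Esc or Ctrl-C to stop)
servr::httd("vis")
```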
216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /vignettes/details.Rnw: -------------------------------------------------------------------------------- 1 | %\VignetteEngine{knitr::knitr} 2 | %\VignetteIndexEntry{LDAvis details} 3 | 4 | \documentclass[12pt]{article} 5 | \usepackage{graphicx,amsmath} 6 | \usepackage{epsfig} 7 | \usepackage{rotating} 8 | \usepackage[normalem]{ulem} 9 | \usepackage{multirow} 10 | 11 | \topmargin=-.5in \oddsidemargin=0in \textwidth=6.5in 12 | \textheight=9in 13 | 14 | \parindent 0.0in 15 | \parskip .20in 16 | 17 | 18 | \begin{document} 19 | 20 | \section{Introduction} 21 | In $\texttt{LDAvis}$, we visualize the fit of an LDA topic model to a corpus of documents. The data and model are described as follows: 22 | 23 | $\textbf{Data:}$ 24 | \begin{itemize} 25 | \item $D$ documents in the corpus 26 | \item $n_d$ tokens in document $d$, for $d = 1...D$ (denoted \texttt{doc.length} in our package's R code) 27 | \item $N = \sum_d n_d$ total tokens in the corpus 28 | \item $W$ terms in the vocabulary 29 | \item $M_w$ is defined as the frequency of term $w$ across the corpus, where $\sum_w M_w = N$ (denoted \texttt{term.frequency} in our package's R code) 30 | \end{itemize} 31 | 32 | $\textbf{Model:}$ 33 | \begin{itemize} 34 | \item $K$ topics in the model 35 | \item For document $d = 1...D$, the length-$K$ topic probability vector, $\boldsymbol{\theta}_d$, is drawn from a Dirichlet($\boldsymbol{\alpha}$) prior, where $\alpha_k > 0$ for topics $k = 1...K$. 36 | \item For topic $k = 1...K$, the length-$W$ term probability vector, $\boldsymbol{\phi}_k$, is drawn from a Dirichlet($\boldsymbol{\beta}$) prior, where $\beta_w > 0$ for terms $w = 1...W$. 37 | \item The probability model states that for the $j^{th}$ token from document $d$, a latent topic, $z_{dj}$, is drawn, where P($z_{dj} = k) = \theta_{dk}$ for document $d = 1...D$, token $j = 1...n_d$, and topic $k = 1...K$. 38 | \item Then, the $j^{th}$ token from the $d^{th}$ document, $Y_{dj}$, is drawn from the vocabulary of terms according to P($Y_{dj} = w \mid z_{dj}) = \phi_{(z_{dj},w)}$, for document $d = 1...D$, token $j = 1...n_d$, and term $w = 1...W$. 39 | \end{itemize} 40 | A number of algorithms can be used to fit an LDA model to a data set. Two of the most common are the collapsed Gibbs sampler (Griffiths and Steyvers, 2004) and variational Bayes (Blei et al 2003). 41 | %A key feature of $\texttt{LDAvis}$ is that we incorporate the prior distributions (which are a part of every every LDA model) into the visualization, so that we can visualize aspects of the posterior distributions of the various parameters in the LDA model. Specifically, we don't take as input to $\texttt{LDAvis}$ the sampled topic assignments from the last iteration of a collapsed Gibbs sampler, for instance, but rather we take as inputs general variables that are estimated as a result of any algorithm used to fit an LDA model, including the collapsed Gibbs sampler (Griffiths and Steyvers, 2004), variational Bayes (Blei et al 2003), or others. 42 | 43 | Our interactive visualization tool, $\texttt{LDAvis}$, requires five input arguments: 44 | \begin{enumerate} 45 | \item $\boldsymbol\phi$, the $K \times W$ matrix containing the estimated probability mass function over the $W$ terms in the vocabulary for each of the $K$ topics in the model. Note that $\phi_{kw} > 0$ for all $k \in 1...K$ and all $w \in 1...W$, because of the priors. 
(Although our software allows values of zero due to rounding). Each of the $K$ rows of $\boldsymbol\phi$ must sum to one. 46 | \item $\boldsymbol\theta$, the $D \times K$ matrix containing the estimated probability mass function over the $K$ topics in the model for each of the $D$ documents in the corpus. Note that $\theta_{dk} > 0$ for all $d \in 1...D$ and all $k \in 1...K$, because of the priors (although, as above, our software accepts zeroes due to rounding). Each of the $D$ rows of $\boldsymbol\theta$ must sum to one. 47 | %\item $\boldsymbol\alpha$, the length-$K$ vector of Dirichlet hyperparameters that specify the prior distribution over topics for each document (we assume that the same vector, $\boldsymbol\alpha$ is used for each of the $D$ documents, but note that this prior doesn't have to be symmetric). We require $\alpha_k > 0$ for each topic $k \in 1...K$. 48 | %\item $\boldsymbol\beta$, the length-$W$ vector of Dirichlet hyperparameters that specify the prior distribution over the $W$ terms in the vocabulary for each of the $K$ topics. We require $\beta_w > 0$ for each term $w \in 1...W$. 49 | \item $n_d$, the number of tokens observed in document $d$, where $n_d$ is required to be an integer greater than zero, for documents $d = 1...D$. Denoted \texttt{doc.length} in our code. 50 | \item \texttt{vocab}, the length-$W$ character vector containing the terms in the vocabulary (listed in the same order as the columns of $\phi$). 51 | \item $M_w$, the frequency of term $w$ across the entire corpus, where $M_w$ is required to be an integer greater than zero for each term $w=1...W$. Denoted \texttt{term.frequency} in our code. 52 | \end{enumerate} 53 | In general, the prior parameters $\boldsymbol\alpha$ and $\boldsymbol\beta$ are specified by the modeler (although in some cases they are estimated from the data), $n_d$ and $M_w$ are computed from the data, and the algorithm used to fit the model produces point estimates of $\boldsymbol\phi$ and $\boldsymbol\theta$. When using the collapsed Gibbs sampler, we recommend using equations 6 and 7 from Griffiths and Steyvers (2004) to estimate $\boldsymbol\phi$ and $\boldsymbol\theta$. These are the ``smoothed" estimates of the parameters that incorporate the priors, rather than, for example, the matrices containing the counts of topic assignments to each document and term, which are a common output of Gibbs Sampler implementations that don't necessarily incorporate the priors. Two popular packages for fitting an LDA model to data are the R package \texttt{lda} (Chang, 2012) and the JAVA-based standalone software package \texttt{MALLET} (McCallum, 2002). Our package contains an example of using the \texttt{lda} package to fit a topic model to a corpus of movie reviews, available in the \texttt{inst/examples/reviews} directory of \texttt{LDAvis}. 54 | 55 | %We denote the total number of observed tokens in the corpus $N = \sum_d n_d$. Next we introduce the idea of a ``pseudo-token". 56 | 57 | %When fitting the LDA model, the Dirichlet priors on the $D$ document-topic distributions and the $K$ topic-term distributions have the effect of adding ``pseudo-tokens" to the data. The Dirichlet distribution is the conjugate prior for the multinomial likelihood, and, as with the Beta-Binomial conjugate pair, the vectors specifying the Dirichlet priors can be thought of as adding additional (possibly partial) data points to the observed data to arrive at the posterior distribution. 
For example, suppose the first document in the corpus contains $n_1 = 50$ tokens, and we use the document-topic prior $\alpha_k = 0.01$ for each topic $k \in 1...K$, where we choose to fit a $K=20$-topic model. This is equivalent to adding $K \times \bar{\alpha}$, or $\sum_k \alpha_k = 20*0.01 = 0.2$ ``pseudo-tokens" to this document, spread out evenly across the $K$ topics. Compared to the 50 observed tokens, adding 0.2 ``pseudo-tokens" will not influence the posterior heavily, meaning this is a ``weak", ``flat", or relatively ``non-informative" prior. If the prior $\boldsymbol\alpha$ is not symmetric, then the numbers of ``pseudo-tokens" added to each topic, for each document, are not equal (which might be desired by the modeler, if he/she has some prior notion that some topics will be more prevalent than others -- see Wallach et al ``Why Priors Matter"). Likewise, we also add $W \times \bar{\beta}$ ``pseudo-tokens" to each topic, spread across the the $W$ terms in the vocabulary according to the individuals $\beta_w$'s. 58 | 59 | %These priors have the effect of ``smoothing" the estimates of $\boldsymbol\phi$ and $\boldsymbol\theta$, which guarantees that each topic has a non-zero probability under each document, and each term has a non-zero probability under each topic. Our visualization, $\texttt{LDAvis}$, reflects this fact (modulo rounding error). We denote the total number of ``smoothed" tokens in the data as the sum of the observed tokens and the ``pseudo-tokens" added by the priors: 60 | %$$ 61 | %N_\text{smoothed} = N + DK\bar{\alpha} + KW\bar{\beta}. 62 | %$$ 63 | 64 | \section{Definitions of visual elements in $\texttt{LDAvis}$} 65 | Here we define the dimensions of the visual elements in $\texttt{LDAvis}$. There are essentially four sets of visual elements that can be displayed, depending on the state of the visualization. They are: 66 | %-- two sets of circles that get displayed, $K$-at-a-time, in the left panel of the visualization, and two sets of horizontal bars that get displayed in a barchart, $r$-at-a-time ($r$ to be defined below), in the right panel of the visualization. At any one time, only $K$ circles are displayed on the left, but it's possible that either $r$ or $2r$ horizontal bars will be displayed at one time. 67 | \begin{enumerate} 68 | \item \textbf{Default Topic Circles:} $K$ circles, one to represent each topic, whose areas are set to be proportional to the proportions of the topics across the $N$ total tokens in the corpus. The default topic circles are displayed when no term is highlighted. 69 | \item \textbf{Red Bars:} $K \times W$ red horizontal bars, each of which represents the estimated number of times a given term was generated by a given topic. When a topic is selected, we show the red bars for the $R$ most \emph{relevant} terms for the selected topic, where $R = 30$ by default (see Sievert and Shirley (2014) for the definition of \emph{relevance}). 70 | \item \textbf{Blue Bars:} $W$ blue horizontal bars, one to represent the overall frequency of each term in the corpus. When no topic is selected, we display the blue bars for the $R$ most salient terms in the corpus, and when a topic is selected, we display the blue bars for the $R$ most relevant terms. See Chuang et al. (2012) for the definition of the \emph{saliency} of a term in a topic model. 71 | \item \textbf{Topic-Term Circles:} $K \times W$ circles whose areas are set to be proportional to the frequencies with which a given term is estimated to have been generated by the topics. 
When a given term, $w$, is highlighted, the $K$ default circles transition (i.e. their areas change) to the $K$ topic-term circles for term $w$. 72 | \end{enumerate} 73 | 74 | Let's define the dimensions of these visual elements: 75 | \begin{enumerate} 76 | \item The area of the \textbf{Default Circle} for topic $k$, $A^\text{default}_k$, is set to be proportional to $N_k/\sum_k N_k$, where $N_k$ is the estimated number of tokens that were generated by topic $k$ across the entire corpus. The formula for $N_k$ is: 77 | $$ 78 | N_k = \sum_{d=1}^D \theta_{dk}n_d. 79 | $$ 80 | It is straightforward to verify that $\sum_k N_k = N$. 81 | \item The width of the \textbf{Red Bar} for topic $k$ and term $w$, denoted $P_{kw}$, is set to $\phi_{kw} \times N_k$ for all topics $k = 1...K$ and terms $w = 1...W$. 82 | \item The width of the \textbf{Blue Bar} for term $w$ is set to $\sum_k P_{kw}$, the total number of occurrences of term $w$ in the corpus (note that prior to version 0.3.2 of LDAvis, this width was set to $M_w$, the user-supplied frequency of term $w$ across the entire corpus). 83 | \item The area of the \textbf{Topic-Term Circle} for term $w$ and topic $k$, denoted $A^\text{topic-term}_{kw}$, is set to be proportional to $P_{kw}/\sum_k P_{kw}$. 84 | \end{enumerate} 85 | 86 | \section{Discussion} 87 | Here we point out a few things about \texttt{LDAvis}: 88 | \begin{enumerate} 89 | \item Note that the all the visual elements represent frequencies (of various things in the training data), rather than conditional probabilities. For example, the area of topic-term circle $A^\text{topic-term}_{kw}$ could have been set to be proportional to $\phi_{kw}/\sum_k \phi_{kw}$, but instead we set it to be proportional to $P_{kw}/\sum_k P_{kw}$. So, suppose the term ``foo" had a 0.003 probability under, say, topic 20 and topic 45, and negligible probability under all other topics. One might expect that upon highlighting ``foo", the topic-term circles would all disappear except for two equal-area topic-term circles representing topics 20 and 45. Instead, if, for example, topic 20 occurred twice as frequently as topic 45, then the topic-term circle for topic 20 would be twice as large as that for topic 45 upon ``foo" being highlighted. This reflects the fact that 2/3 of the occurrences of ``foo" in the training data were estimated to have come from topic 20. In other words, we reflect the underlying (and potentially variable) frequencies of the topics themselves when we compute the areas of the topic-term circles. 90 | 91 | The same principle holds for the red bars and blue bars -- they visualize frequencies, rather than proportions, so that wider bars signify more frequent terms in the training data. We felt this was an important feature of the data to visualize, rather than building a visualization that simply displayed aspects of $\boldsymbol{\phi}$ and $\boldsymbol{\theta}$, which are normalized, and don't reflect the frequencies of the terms and topics in the data. 92 | 93 | \item By default, we set the dimensions of the left panel to be 530 x 530 pixels, and we set the sum of the areas of the default topic circles and the topic-term circles to be $530^2/4$, so that these circles cover at most 1/4 of the panel (in practice, because of overlapping circles, they cover less than 1/4 of the area of the panel). Likewise, the sum of the areas of the topic-term circles is set to be 1/4 of the area of the left panel of the display. 
This way the visualization looks OK for a range of numbers of topics, from roughly $10 \leq K \leq 100$. 94 | 95 | \item The centers of the default topic circles are laid out in two dimensions according to a multidimensional scaling (MDS) algorithm that is run on the inter-topic distance matrix. We use Jensen-Shannon divergence to compute distances between topics, and then we use the \texttt{cmdscale()} function in \texttt{R} to implement classical multidimensional scaling. The range of the first coordinate (along the x-axis) is not necessarily equal to that of the second coordinate (along the y-axis); thus we force the aspect ratio to be 1 to preserve the MDS distances. In practice (across the examples we've seen), the ranges of the x and y coordinates are within about 10\% of each other. 96 | 97 | \end{enumerate} 98 | 99 | 100 | 101 | 102 | 103 | \section{References} 104 | 105 | 1. Blei, David M., Ng, Andrew Y., and Jordan, Michael I. (2003). Latent Dirichlet Allocation, \emph{Journal of Machine Learning Research}, Volume 3, pages 993-1022. 106 | 107 | 2. Griffiths, Thomas L., and Steyvers, Mark (2004). Finding Scientific Topics, \emph{Proceedings of the National Academy of Science}, Volume 101, pages 5228-5235. 108 | 109 | 3. Jonathan Chang (2012). lda: Collapsed Gibbs sampling methods for topic models. R package version 1.3.2. \texttt{http://CRAN.R-project.org/package=lda}. 110 | 111 | 4. McCallum, Andrew Kachites (2002). MALLET: A Machine Learning for Language Toolkit. \texttt{http://mallet.cs.umass.edu}. 112 | 113 | 5. Chuang, Jason, Manning, Christopher D., and Heer, Jeffrey (2012). Termite: Visualization Techniques for Assessing Textual Topic Models, \emph{Advanced Visual Interfaces}. 114 | 115 | \end{document} 116 | -------------------------------------------------------------------------------- /R/createJSON.R: -------------------------------------------------------------------------------- 1 | #' Create the JSON object to read into the javascript visualization 2 | #' 3 | #' This function creates the JSON object that feeds the visualization template. 4 | #' For a more detailed overview, 5 | #' see \code{vignette("details", package = "LDAvis")} 6 | #' 7 | #' @param phi matrix, with each row containing the distribution over terms 8 | #' for a topic, with as many rows as there are topics in the model, and as 9 | #' many columns as there are terms in the vocabulary. 10 | #' @param theta matrix, with each row containing the probability distribution 11 | #' over topics for a document, with as many rows as there are documents in the 12 | #' corpus, and as many columns as there are topics in the model. 13 | #' @param doc.length integer vector containing the number of tokens in each 14 | #' document of the corpus. 15 | #' @param vocab character vector of the terms in the vocabulary (in the same 16 | #' order as the columns of \code{phi}). Each term must have at least one 17 | #' character. 18 | #' @param term.frequency integer vector containing the frequency of each term 19 | #' in the vocabulary. 20 | #' @param R integer, the number of terms to display in the barcharts 21 | #' of the interactive viz. Default is 30. Recommended to be roughly 22 | #' between 10 and 50. 23 | #' @param lambda.step a value between 0 and 1. 24 | #' Determines the interstep distance in the grid of lambda 25 | #' values over which to iterate when computing relevance. 26 | #' Default is 0.01. Recommended to be between 0.01 and 0.1. 
27 | #' @param mds.method a function that takes \code{phi} as an input and outputs 28 | #' a K by 2 data.frame (or matrix). The output approximates the distance 29 | #' between topics. See \link{jsPCA} for details on the default method. 30 | #' @param cluster a cluster object created from the \link{parallel} package. 31 | #' If supplied, computations are performed using \link{parLapply} instead 32 | #' of \link{lapply}. 33 | #' @param plot.opts a named list used to customize various plot elements. 34 | #' By default, the x and y axes are labeled "PC1" and "PC2" 35 | #' (principal components 1 and 2), since \link{jsPCA} is the default 36 | #' scaling method. 37 | #' @param reorder.topics whether to re-order the K topics in order 38 | #' of decreasing proportion. 39 | #' @param ... not currently used. 40 | #' 41 | #' @details The function first computes the topic frequencies (across the whole 42 | #' corpus), and then it reorders the topics in decreasing order of 43 | #' frequency. The main computation is to loop through the topics and through the 44 | #' grid of lambda values (determined by \code{lambda.step}) 45 | #' to compute the \code{R} most 46 | #' \emph{relevant} terms for each topic and value of lambda. 47 | #' 48 | #' @return A string containing JSON content which can be written to a file 49 | #' or feed into \link{serVis} for easy viewing/sharing. One element of this 50 | #' string is the new ordering of the topics. 51 | #' 52 | #' @seealso \link{serVis} 53 | #' @references Sievert, C. and Shirley, K. (2014) \emph{LDAvis: A Method for 54 | #' Visualizing and Interpreting Topics}, ACL Workshop on Interactive 55 | #' Language Learning, Visualization, and Interfaces. 56 | #' \url{http://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf} 57 | #' 58 | #' @export 59 | #' @examples 60 | #' 61 | #' \dontrun{ 62 | #' data(TwentyNewsgroups, package="LDAvis") 63 | #' # create the json object, start a local file server, open in default browser 64 | #' json <- with(TwentyNewsgroups, 65 | #' createJSON(phi, theta, doc.length, vocab, term.frequency)) 66 | #' serVis(json) # press ESC or Ctrl-C to kill 67 | #' 68 | #' # createJSON() reorders topics in decreasing order of term frequency 69 | #' RJSONIO::fromJSON(json)$topic.order 70 | #' 71 | #' # You may want to just write the JSON and other dependency files 72 | #' # to a folder named TwentyNewsgroups under the working directory 73 | #' serVis(json, out.dir = 'TwentyNewsgroups', open.browser = FALSE) 74 | #' # then you could use a server of your choice; for example, 75 | #' # open your terminal, type `cd TwentyNewsgroups && python -m SimpleHTTPServer` 76 | #' # then open http://localhost:8000 in your web browser 77 | #' 78 | #' # A different data set: the Jeopardy Questions+Answers data: 79 | #' # Install LDAvisData (the associated data package) if not already installed: 80 | #' # devtools::install_github("cpsievert/LDAvisData") 81 | #' library(LDAvisData) 82 | #' data(Jeopardy, package="LDAvisData") 83 | #' json <- with(Jeopardy, 84 | #' createJSON(phi, theta, doc.length, vocab, term.frequency)) 85 | #' serVis(json) # Check out Topic 22 (bodies of water!) 86 | #' 87 | #' # If you have a GitHub account, you can even publish as a gist 88 | #' # which allows you to easily share with others! 
89 | #' serVis(json, as.gist = TRUE) 90 | #' 91 | #' # Run createJSON on a cluster of machines to speed it up 92 | #' system.time( 93 | #' json <- with(TwentyNewsgroups, 94 | #' createJSON(phi, theta, doc.length, vocab, term.frequency)) 95 | #' ) 96 | #' # user system elapsed 97 | #' # 14.415 0.800 15.066 98 | #' library("parallel") 99 | #' cl <- makeCluster(detectCores() - 1) 100 | #' cl # socket cluster with 3 nodes on host 'localhost' 101 | #' system.time( 102 | #' json <- with(TwentyNewsgroups, 103 | #' createJSON(phi, theta, doc.length, vocab, term.frequency, 104 | #' cluster = cl)) 105 | #' ) 106 | #' # user system elapsed 107 | #' # 2.006 0.361 8.822 108 | #' 109 | #' # another scaling method (svd + tsne) 110 | #' library("tsne") 111 | #' svd_tsne <- function(x) tsne(svd(x)$u) 112 | #' json <- with(TwentyNewsgroups, 113 | #' createJSON(phi, theta, doc.length, vocab, term.frequency, 114 | #' mds.method = svd_tsne, 115 | #' plot.opts = list(xlab="", ylab="") 116 | #' ) 117 | #' ) 118 | #' serVis(json) # Results in a different topic layout in the left panel 119 | #' 120 | #'} 121 | 122 | createJSON <- function(phi = matrix(), theta = matrix(), doc.length = integer(), 123 | vocab = character(), term.frequency = integer(), R = 30, 124 | lambda.step = 0.01, mds.method = jsPCA, cluster, 125 | plot.opts = list(xlab = "PC1", ylab = "PC2"), 126 | reorder.topics = TRUE, 127 | ...) { 128 | # Set the values of a few summary statistics of the corpus and model: 129 | dp <- dim(phi) # should be K x W 130 | dt <- dim(theta) # should be D x K 131 | 132 | N <- sum(doc.length) # number of tokens in the data 133 | W <- length(vocab) # number of terms in the vocab 134 | D <- length(doc.length) # number of documents in the data 135 | K <- dt[2] # number of topics in the model 136 | 137 | # check that certain input dimensions match 138 | if (dp[1] != K) stop("Number of rows of phi does not match 139 | number of columns of theta; both should be equal to the number of topics 140 | in the model.") 141 | if (D != dt[1]) stop("Length of doc.length not equal 142 | to the number of rows in theta; both should be equal to the number of 143 | documents in the data.") 144 | if (dp[2] != W) stop("Number of terms in vocabulary does 145 | not match the number of columns of phi (where each row of phi is a 146 | probability distribution of terms for a given topic).") 147 | if (length(term.frequency) != W) stop("Length of term.frequency 148 | not equal to the number of terms in the vocabulary.") 149 | if (any(nchar(vocab) == 0)) stop("One or more terms in the vocabulary 150 | has zero characters -- all terms must have at least one character.") 151 | 152 | # check that conditional distributions are normalized: 153 | phi.test <- all.equal(rowSums(phi), rep(1, K), check.attributes = FALSE) 154 | theta.test <- all.equal(rowSums(theta), rep(1, dt[1]), 155 | check.attributes = FALSE) 156 | if (!isTRUE(phi.test)) warning("Rows of phi don't all sum to 1.") 157 | if (!isTRUE(theta.test)) warning("Rows of theta don't all sum to 1.") 158 | 159 | # compute counts of tokens across K topics (length-K vector): 160 | # (this determines the areas of the default topic circles when no term is 161 | # highlighted) 162 | topic.frequency <- colSums(theta * doc.length) 163 | topic.proportion <- topic.frequency/sum(topic.frequency) 164 | 165 | # re-order the K topics in order of decreasing proportion: 166 | if(reorder.topics) 167 | o <- order(topic.proportion, decreasing = TRUE) 168 | else 169 | o <- seq_along(topic.proportion) 170 | 171 | phi <- 
phi[o, ] 172 | theta <- theta[, o] 173 | topic.frequency <- topic.frequency[o] 174 | topic.proportion <- topic.proportion[o] 175 | 176 | # compute intertopic distances using the specified multidimensional 177 | # scaling method: 178 | mds.res <- mds.method(phi) 179 | if (is.matrix(mds.res)) { 180 | colnames(mds.res) <- c("x", "y") 181 | } else if (is.data.frame(mds.res)) { 182 | names(mds.res) <- c("x", "y") 183 | } else { 184 | warning("Result of mds.method should be a matrix or data.frame.") 185 | } 186 | mds.df <- data.frame(mds.res, topics = seq_len(K), Freq = topic.proportion*100, 187 | cluster = 1, stringsAsFactors = FALSE) 188 | # note: cluster (should?) be deprecated soon. 189 | 190 | # token counts for each term-topic combination (widths of red bars) 191 | term.topic.frequency <- phi * topic.frequency 192 | 193 | # compute term frequencies as column sums of term.topic.frequency 194 | # we actually won't use the user-supplied term.frequency vector. 195 | # the term frequencies won't match the user-supplied frequencies exactly 196 | # this is a work-around to solve the bug described in Issue #32 on github: 197 | # https://github.com/cpsievert/LDAvis/issues/32 198 | term.frequency <- colSums(term.topic.frequency) 199 | stopifnot(all(term.frequency > 0)) 200 | 201 | # marginal distribution over terms (width of blue bars) 202 | term.proportion <- term.frequency/sum(term.frequency) 203 | 204 | # Old code to adjust term frequencies. Deprecated for now 205 | # adjust to match term frequencies exactly (get rid of rounding error) 206 | #err <- as.numeric(term.frequency/colSums(term.topic.frequency)) 207 | # http://stackoverflow.com/questions/3643555/multiply-rows-of-matrix-by-vector 208 | #term.topic.frequency <- sweep(term.topic.frequency, MARGIN=2, err, `*`) 209 | 210 | # Most operations on phi after this point are across topics 211 | # R has better facilities for column-wise operations 212 | phi <- t(phi) 213 | 214 | # compute the distinctiveness and saliency of the terms: 215 | # this determines the R terms that are displayed when no topic is selected 216 | topic.given.term <- phi/rowSums(phi) # (W x K) 217 | kernel <- topic.given.term * log(sweep(topic.given.term, MARGIN=2, 218 | topic.proportion, `/`)) 219 | distinctiveness <- rowSums(kernel) 220 | saliency <- term.proportion * distinctiveness 221 | 222 | # Order the terms for the "default" view by decreasing saliency: 223 | default.terms <- vocab[order(saliency, decreasing = TRUE)][1:R] 224 | counts <- as.integer(term.frequency[match(default.terms, vocab)]) 225 | Rs <- rev(seq_len(R)) 226 | default <- data.frame(Term = default.terms, logprob = Rs, loglift = Rs, 227 | Freq = counts, Total = counts, Category = "Default", 228 | stringsAsFactors = FALSE) 229 | topic_seq <- rep(seq_len(K), each = R) 230 | category <- paste0("Topic", topic_seq) 231 | lift <- phi/term.proportion 232 | 233 | # Collect R most relevant terms for each topic/lambda combination 234 | # Note that relevance is re-computed in the browser, so we only need 235 | # to send each possible term/topic combination to the browser 236 | find_relevance <- function(i) { 237 | relevance <- i*log(phi) + (1 - i)*log(lift) 238 | idx <- apply(relevance, 2, 239 | function(x) order(x, decreasing = TRUE)[seq_len(R)]) 240 | # for matrices, we pick out elements by their row/column index 241 | indices <- cbind(c(idx), topic_seq) 242 | data.frame(Term = vocab[idx], Category = category, 243 | logprob = round(log(phi[indices]), 4), 244 | loglift = round(log(lift[indices]), 4), 245 | 
stringsAsFactors = FALSE) 246 | } 247 | lambda.seq <- seq(0, 1, by=lambda.step) 248 | if (missing(cluster)) { 249 | tinfo <- lapply(as.list(lambda.seq), find_relevance) 250 | } else { 251 | tinfo <- parallel::parLapply(cluster, as.list(lambda.seq), find_relevance) 252 | } 253 | tinfo <- unique(do.call("rbind", tinfo)) 254 | tinfo$Total <- term.frequency[match(tinfo$Term, vocab)] 255 | rownames(term.topic.frequency) <- paste0("Topic", seq_len(K)) 256 | colnames(term.topic.frequency) <- vocab 257 | tinfo$Freq <- term.topic.frequency[as.matrix(tinfo[c("Category", "Term")])] 258 | tinfo <- rbind(default, tinfo) 259 | 260 | # last, to compute the areas of the circles when a term is highlighted 261 | # we must gather all unique terms that could show up (for every combination 262 | # of topic and value of lambda) and compute its distribution over topics. 263 | 264 | # unique terms across all topics and all values of lambda 265 | ut <- sort(unique(tinfo$Term)) 266 | # indices of unique terms in the vocab 267 | m <- sort(match(ut, vocab)) 268 | # term-topic frequency table 269 | tmp <- term.topic.frequency[, m] 270 | 271 | # round down infrequent term occurrences so that we can send sparse 272 | # data to the browser: 273 | r <- row(tmp)[tmp >= 0.5] 274 | c <- col(tmp)[tmp >= 0.5] 275 | dd <- data.frame(Term = vocab[m][c], Topic = r, Freq = round(tmp[cbind(r, c)]), 276 | stringsAsFactors = FALSE) 277 | 278 | # Normalize token frequencies: 279 | dd[, "Freq"] <- dd[, "Freq"]/term.frequency[match(dd[, "Term"], vocab)] 280 | token.table <- dd[order(dd[, 1], dd[, 2]), ] 281 | 282 | RJSONIO::toJSON(list(mdsDat = mds.df, tinfo = tinfo, 283 | token.table = token.table, R = R, 284 | lambda.step = lambda.step, 285 | plot.opts = plot.opts, 286 | topic.order = o)) 287 | } 288 | 289 | 290 | #' Dimension reduction via Jensen-Shannon Divergence & Principal Components 291 | #' 292 | #' @param phi matrix, with each row containing the distribution over terms 293 | #' for a topic, with as many rows as there are topics in the model, and as 294 | #' many columns as there are terms in the vocabulary. 295 | #' 296 | #' @export 297 | jsPCA <- function(phi) { 298 | # first, we compute a pairwise distance between topic distributions 299 | # using a symmetric version of KL-divergence 300 | # http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence 301 | jensenShannon <- function(x, y) { 302 | m <- 0.5 * (x + y) 303 | lhs <- ifelse(x == 0, 0, x * (log(x) - log(m))) 304 | rhs <- ifelse(y == 0, 0, y * (log(y) - log(m))) 305 | 0.5 * sum(lhs) + 0.5 * sum(rhs) 306 | } 307 | dist.mat <- proxy::dist(x = phi, method = jensenShannon) 308 | # then, we reduce the K by K proximity matrix down to K by 2 using PCA 309 | pca.fit <- stats::cmdscale(dist.mat, k = 2) 310 | data.frame(x = pca.fit[,1], y = pca.fit[,2]) 311 | } 312 | -------------------------------------------------------------------------------- /docs/newsgroup/newsgroup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | A topic model for the Twenty Newsgroups data 7 | 8 | 19 | 20 | 21 | 53 | 54 | 55 | 59 | 60 | 61 | 63 | 64 | 65 | 201 | 202 | 203 | 204 | 205 | 206 | 207 |

A topic model for the Twenty Newsgroups data

208 | 209 |

LDAvis comes prepackaged with some data sets to help quickly demonstrate how to use it. This document, created with LDAvis and knitr (see here for source code), visualizes a topic model fit to the 'Twenty Newsgroups' data.

210 | 211 |

First, we downloaded the data from the home page for the Twenty Newsgroups data. Specifically, we used the '20news-bydate' version of the data and fit our topic model to the 'training' portion of the data. The raw training data consists of \(D = 11,269\) documents, a vocabulary of \(W = 53,975\) terms, and \(N = 2,765,300\) total tokens in the corpus. Each document is a message posted to one of twenty selected Usenet newsgroups during a time span roughly between June 1992 and May 1993. It appears that the documents were tokenized by splitting on punctuation and whitespace. We removed all occurrences of the 174 stop words contained in the “English” stop words list in the R package tm, as well as all occurrences of terms that occurred fewer than ten times in total. This left \(W = 15,954\) terms in the vocabulary and a total of \(N = 1,511,137\) tokens in the corpus. One document was removed entirely because it contained only stop words and rare words.

212 | 213 |
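The pre-processing itself was done outside of this document, but a minimal sketch of the vocabulary filtering step described above might look like the following (the object `doc.list`, a list of tokenized documents, is a hypothetical placeholder; the stop word list is tm's English list mentioned in the text):

```r
library(tm)
stop_words <- stopwords("en")  # tm's English stop word list (174 words)

# doc.list: hypothetical list of character vectors, one tokenized document each
term.table <- sort(table(unlist(doc.list)), decreasing = TRUE)

# drop stop words and terms occurring fewer than ten times in total:
del <- names(term.table) %in% stop_words | term.table < 10
term.table <- term.table[!del]
vocab <- names(term.table)
```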

We fit a \(K=50\)-topic model to the corpus (allowing for topics other than the 20 standard newsgroups topics to be discovered) by running the collapsed Gibbs sampler for 10,000 iterations using symmetric priors for the document-topic distributions (\(\alpha = 0.02\)) and the topic-term distributions (\(\beta = 0.02\)). We used MALLET to fit the model. We computed estimates of the document-topic distributions (stored in a \(D \times K\) matrix denoted \(\theta\)) and the topic-term distributions (stored in a \(K \times W\) matrix denoted \(\phi\)) by cross-tabulating the latent topic assignments from the last iteration of the Gibbs sampler with the document IDs and the term IDs, and then adding pseudocounts to account for the priors. A better estimate might average over multiple MCMC iterations of latent topic assignments (assuming the MCMC has settled into a local mode of the posterior and there is no label-switching going on), but we don't worry about that for now.

214 | 215 |
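The model itself was fit with MALLET rather than in R, but the "cross-tabulate and add pseudocounts" step described above can be sketched. This is only an illustration under assumed inputs: `doc.topic.counts` (a D x K matrix of topic-assignment counts per document) and `topic.term.counts` (a K x W matrix of topic-assignment counts per term) are hypothetical names for the cross-tabulations from the final Gibbs iteration:

```r
alpha <- 0.02  # document-topic prior
beta  <- 0.02  # topic-term prior

# smoothed estimates: add the prior as pseudocounts, then normalize each row
# (dividing a matrix by rowSums() recycles the row totals correctly in R)
theta <- (doc.topic.counts + alpha) / rowSums(doc.topic.counts + alpha)
phi   <- (topic.term.counts + beta) / rowSums(topic.term.counts + beta)
```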

To visualize the fitted model using LDAvis, we load the data object TwentyNewsgroups, which is a list containing five elements.

216 | 217 |
library(LDAvis)
218 | data("TwentyNewsgroups", package = "LDAvis")
219 | str(TwentyNewsgroups)
220 | 
221 | 222 |
## List of 5
223 | ##  $ phi           : num [1:50, 1:15954] 5.78e-07 1.26e-04 2.77e-06 2.98e-04 4.55e-07 ...
224 | ##  $ theta         : num [1:11268, 1:50] 2.28e-05 2.08e-04 6.06e-04 5.88e-04 1.02e-04 ...
225 | ##  $ doc.length    : int [1:11268] 878 95 32 33 196 23 44 83 38 179 ...
226 | ##  $ vocab         : chr [1:15954] "archive" "name" "atheism" "resources" ...
227 | ##  $ term.frequency: int [1:15954] 317 1364 300 226 327 1832 125 108 1002 208 ...
228 | 
229 | 230 |

The first two elements are \(\phi\) and \(\theta\). Both of these are matrices whose rows must sum to one, since their rows contain probability distributions over terms and topics, respectively.

231 | 232 |

The third element of the list is doc.length, which is an integer vector of length \(D = 11,268\) containing the number of tokens in each document. For this data the median document length is 81 tokens, with a range of 1 to 6409.

233 | 234 |

The fourth element of the list is vocab, which is a character vector containing the terms in the vocabulary, in the same order as the columns of \(\phi\).

235 | 236 |

The fifth element of the list is term.frequency, which is an integer vector containing the frequencies of the terms in the vocabulary. The median term frequency is 27 with a range of 10 to 12,289 ('edu' is the most frequent term, because the data contain email addresses and tokenization was performed by splitting on punctuation).

237 | 238 |
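The summary statistics quoted above can be reproduced directly from the list; the comments show the values reported in the text:

```r
median(TwentyNewsgroups$doc.length)      # 81
range(TwentyNewsgroups$doc.length)       # 1 6409
median(TwentyNewsgroups$term.frequency)  # 27
# the most frequent term ('edu'):
TwentyNewsgroups$vocab[which.max(TwentyNewsgroups$term.frequency)]
```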

At this point, we call the R function createJSON() to create a JSON object that will feed the web-based visualization. The createJSON() function performs several operations:

239 | 240 | It computes the frequency of each topic over the whole corpus and re-orders the topics in decreasing order of frequency; it computes inter-topic distances and projects the topics onto a two-dimensional plane via multidimensional scaling; and it loops through a grid of values of \(\lambda\) to pre-compute the \(R\) most relevant terms for each topic and each value of \(\lambda\). 244 | 245 | 

The idea is to help users interpret topics by allowing them to interactively re-rank the most relevant terms for each topic by changing the value of \(\lambda\) via a slider, where large values of \(\lambda\) highly rank frequent words within a topic, and low values of \(\lambda\) highly rank exclusive words within a topic. The topic plot on the left side of LDAvis allows users to browse groups of similar topics (positioned near each other in the 2-d plot) or simply progress through the topics in order (they are, by default, ordered in decreasing order of frequency). For more on relevance, see our paper about LDAvis here.

246 | 247 |
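To make this concrete, relevance can be computed by hand for a single topic and value of \(\lambda\). The sketch below mirrors the log-scale definition used inside createJSON(), relevance = \(\lambda \log p(w \mid t) + (1 - \lambda) \log \frac{p(w \mid t)}{p(w)}\), except that it approximates the marginal term probabilities with the supplied term.frequency vector (createJSON() derives them from the fitted model instead):

```r
lambda <- 0.6
k <- 1  # inspect topic 1

phi_k <- TwentyNewsgroups$phi[k, ]
p_w   <- with(TwentyNewsgroups, term.frequency / sum(term.frequency))

relevance <- lambda * log(phi_k) + (1 - lambda) * log(phi_k / p_w)

# the ten highest-relevance terms for this topic at lambda = 0.6:
head(TwentyNewsgroups$vocab[order(relevance, decreasing = TRUE)], 10)
```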
json <- with(TwentyNewsgroups, 
248 |              createJSON(phi = phi, theta = theta, vocab = vocab,
249 |                 doc.length = doc.length, term.frequency = term.frequency))
250 | 
251 | 252 |
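Because createJSON() re-orders the topics in decreasing order of frequency, the topic numbers shown in the visualization generally differ from the original topic indices. As in the package examples, the new ordering is stored in the JSON and can be recovered with RJSONIO (which LDAvis already uses internally):

```r
# permutation mapping the visualization's topic numbers
# back to the original topic indices
RJSONIO::fromJSON(json)$topic.order
```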

Now, the serVis function can take json and serve the result in a variety of ways. Here we write json to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result here.

253 | 254 |
serVis(json, out.dir = "vis", open.browser = FALSE)
255 | 
256 | 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- /docs/reviews/reviews.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | A topic model for movie reviews 7 | 8 | 19 | 20 | 21 | 53 | 54 | 55 | 59 | 60 | 61 | 63 | 64 | 65 | 201 | 202 | 203 | 204 | 205 | 206 | 207 |

A topic model for movie reviews

208 | 209 |

In this document, we fit an LDA topic model to the Cornell Movie Review Data introduced by Pang, Lee, and Vaidyanathan in their 2002 EMNLP paper, where we use 'polarity dataset version 2.0' (introduced in a subsequent Pang and Lee 2004 ACL paper). To fit the model, we used the R package lda and we visualize the output using LDAvis.

210 | 211 |

The data

212 | 213 |

For convenience, the R package 'LDAvisData' provides data used to supplement LDAvis examples. The package provides an object named reviews which is a character vector of length 2000. Each element of that vector contains a single movie review. Note that reviews was created using this script.

214 | 215 |
# LDAvisData can be installed from GitHub via 'devtools::install_github("cpsievert/LDAvisData")'
216 | data(reviews, package = "LDAvisData")
217 | 
218 | 219 |
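A quick sanity check that the loaded object matches the description above (the exact text shown will depend on the installed version of LDAvisData):

```r
length(reviews)            # 2000 reviews
class(reviews)             # "character"
substr(reviews[1], 1, 60)  # peek at the start of the first review
```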

Pre-processing

220 | 221 |

Before fitting a topic model, we need to tokenize the text. This dataset is already fairly clean, so we only remove punctuation and some common stop words. In particular, we use the English stop words from the SMART information retrieval system, available in the R package tm.

# read in some stopwords:
library(tm)
stop_words <- stopwords("SMART")

# pre-processing:
reviews <- gsub("'", "", reviews)  # remove apostrophes
reviews <- gsub("[[:punct:]]", " ", reviews)  # replace punctuation with space
reviews <- gsub("[[:cntrl:]]", " ", reviews)  # replace control characters with space
reviews <- gsub("^[[:space:]]+", "", reviews) # remove whitespace at beginning of documents
reviews <- gsub("[[:space:]]+$", "", reviews) # remove whitespace at end of documents
reviews <- tolower(reviews)  # force to lowercase

# tokenize on space and output as a list:
doc.list <- strsplit(reviews, "[[:space:]]+")

# compute the table of terms:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)

# remove terms that are stop words or occur fewer than 5 times:
del <- names(term.table) %in% stop_words | term.table < 5
term.table <- term.table[!del]
vocab <- names(term.table)

# now put the documents into the format required by the lda package:
get.terms <- function(x) {
  index <- match(x, vocab)
  index <- index[!is.na(index)]
  rbind(as.integer(index - 1), as.integer(rep(1, length(index))))
}
documents <- lapply(doc.list, get.terms)

Using the R package 'lda' for model fitting


The object documents is a length-2000 list where each element represents one document, according to the specifications of the lda package. After creating this list, we compute a few statistics about the corpus:

# Compute some statistics related to the data set:
D <- length(documents)  # number of documents (2,000)
W <- length(vocab)  # number of terms in the vocab (14,568)
doc.length <- sapply(documents, function(x) sum(x[2, ]))  # number of tokens per document [312, 288, 170, 436, 291, ...]
N <- sum(doc.length)  # total number of tokens in the data (546,827)
term.frequency <- as.integer(term.table)  # frequencies of terms in the corpus [8939, 5544, 2411, 2410, 2143, ...]
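As an aside, each element of documents is a two-row integer matrix: the first row holds zero-based indices into vocab and the second row holds the corresponding counts (always 1 here, since get.terms() emits one column per token). A quick, illustrative way to inspect this structure:

# inspect the lda-style representation of the first document (illustrative)
str(documents[[1]])                   # integer matrix with two rows
head(vocab[documents[[1]][1, ] + 1])  # map zero-based indices back to terms
sum(documents[[1]][2, ])              # token count for the first document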

Next, we set up a topic model with 20 topics, relatively diffuse priors for the topic-term distributions (\(\eta = 0.02\)) and document-topic distributions (\(\alpha = 0.02\)), and we run the collapsed Gibbs sampler for 5,000 iterations (slightly conservative, to ensure convergence). A visual inspection of fit$log.likelihood (sketched after the fitting code below) suggests that the MCMC algorithm has converged by 5,000 iterations. This block of code takes about 24 minutes to run on a laptop with a single-core 1.7GHz processor and 8GB of RAM.

# MCMC and model tuning parameters:
K <- 20
G <- 5000
alpha <- 0.02
eta <- 0.02

# Fit the model:
library(lda)
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab, 
                                   num.iterations = G, alpha = alpha, 
                                   eta = eta, initial = NULL, burnin = 0,
                                   compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1  # about 24 minutes on laptop
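As a rough check of the convergence claim above, you can plot the log-likelihood trace stored by the sampler. This is only a sketch; depending on your version of the lda package, the component may be named log.likelihoods rather than log.likelihood:

# visual convergence check: the trace should level off well before iteration G
ll <- fit$log.likelihood  # a 2 x G matrix when compute.log.likelihood = TRUE
plot(ll[1, ], type = "l", xlab = "iteration", ylab = "log-likelihood")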

Visualizing the fitted model with LDAvis


To visualize the result using LDAvis, we'll need estimates of the document-topic distributions, which we denote by the \(D \times K\) matrix \(\theta\), and the set of topic-term distributions, which we denote by the \(K \times W\) matrix \(\phi\). We estimate the “smoothed” versions of these distributions (“smoothed” means that we've incorporated the effects of the priors into the estimates) by cross-tabulating the latent topic assignments from the last iteration of the collapsed Gibbs sampler with the documents and the terms, respectively, and then adding pseudocounts according to the priors. A better estimator might average over multiple iterations of the Gibbs sampler (after convergence, assuming that the MCMC is sampling within a local mode and there is no label switching occurring), but we won't worry about that for now.

# estimate the "smoothed" document-topic distributions (a D x K matrix)
theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x)))
# estimate the "smoothed" topic-term distributions (a K x W matrix)
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x)))
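As a quick sanity check (illustrative; the expected dimensions follow from the statistics computed earlier), each row of theta and phi should be a probability distribution:

dim(theta)             # expect 2000 x 20   (D x K)
dim(phi)               # expect 20 x 14568  (K x W)
range(rowSums(theta))  # both values should be approximately 1
range(rowSums(phi))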

We've already computed the number of tokens per document and the frequency of the terms across the entire corpus. We save these, along with \(\phi\), \(\theta\), and vocab, in a list as the data object MovieReviews, which is also available in the LDAvisData package.

MovieReviews <- list(phi = phi,
                     theta = theta,
                     doc.length = doc.length,
                     vocab = vocab,
                     term.frequency = term.frequency)

Now we're ready to call the createJSON() function in LDAvis. This function returns a character string representing a JSON object used to populate the visualization. The createJSON() function computes topic frequencies and inter-topic distances, and projects the topics onto a two-dimensional plane to represent their similarity to each other. It also loops through a grid of values of a tuning parameter, \(0 \leq \lambda \leq 1\), that controls how the terms are ranked for each topic: terms are listed in decreasing order of relevance, where the relevance of term \(w\) to topic \(t\) is defined as \(\lambda \times \log(p(w \mid t)) + (1 - \lambda) \times \log(p(w \mid t)/p(w))\). Values of \(\lambda\) near 1 give high relevance rankings to frequent terms within a given topic, whereas values of \(\lambda\) near zero give high relevance rankings to terms that are exclusive to a topic. The set of all terms that rank among the top-R most relevant terms for each topic is pre-computed by createJSON() and included in the JSON object sent to the browser, where it is interactively visualized using D3.

library(LDAvis)

# create the JSON object to feed the visualization:
json <- createJSON(phi = MovieReviews$phi, 
                   theta = MovieReviews$theta, 
                   doc.length = MovieReviews$doc.length, 
                   vocab = MovieReviews$vocab, 
                   term.frequency = MovieReviews$term.frequency)

The serVis() function can take json and serve the result in a variety of ways. Here we'll write json to a file within the 'vis' directory (along with other HTML and JavaScript required to render the page). You can see the result here.

serVis(json, out.dir = 'vis', open.browser = FALSE)

If you discover something interesting in your data using LDAvis, you can share the result via a URL since the state of the visualization is stored in the URL at all times. For example, in the movie review data, you can quickly see that Topic 7 is broadly about comedies by linking directly to the state of LDAvis where the selected Topic is “7” and the value of \(\lambda\) is 0.6 with the following URL:


http://cpsievert.github.io/LDAvis/reviews/vis/#topic=7&lambda=0.6&term=


You can also link to a specific term to be hovered. For example, when you look at the 30 most relevant terms for Topic 5 using a relevance setting of \(\lambda = 0.5\), the term “action” is the 6th bar from the top (i.e., the 6th most relevant term for this topic). The widths of the red and blue bars indicate that there is at least one other topic in which the term “action” appears frequently. By hovering over “action”, we see from the following state of LDAvis that the term also appears frequently in Topic 14 (as its 9th most relevant term):


http://cpsievert.github.io/LDAvis/reviews/vis/#topic=14&lambda=0.5&term=action


Comparing these two topics, we can see that Topic 5 discusses action in the context of movies about crime and police, whereas in Topic 14, the term “action” is also used frequently, but the topic is specifically about kung fu movies with Chinese actors (Jackie Chan and Jet Li, for example). These two topics both make heavy use of the word “action”, but in slightly different contexts (i.e. slightly different styles of movies).


To encode a state of the visualization in the URL, you must include a string after the “/” of the form “#topic=k&lambda=l&term=s”, where “k”, “l”, and “s” are strings representing the topic to be selected, the value of \(\lambda\) to be used in the relevance calculation, and the term to be hovered, respectively. If no term hovering is desired, omit “s” from the URL. The topic, “k”, will be forced to an integer in \(\{0, 1, \ldots, K\}\), and the value of \(\lambda\) will be forced to the interval \([0, 1]\); non-numeric values return the default state of the visualization (topic = 0, \(\lambda = 1\), term = “”).
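For example, state URLs like the ones above can be assembled in R with paste0() (a sketch using this visualization's base URL):

# build shareable LDAvis state URLs (illustrative values)
base <- "http://cpsievert.github.io/LDAvis/reviews/vis/"
paste0(base, "#topic=", 7, "&lambda=", 0.6, "&term=")             # topic 7, no hovered term
paste0(base, "#topic=", 14, "&lambda=", 0.5, "&term=", "action")  # hover the term "action"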

--------------------------------------------------------------------------------