├── .gitignore
├── README.md
├── css
    ├── footer_plus.css
    └── xaringan-themer.css
├── figs
    ├── austen-1.png
    ├── blue_jane.png
    ├── cover.png
    ├── lizzieskipping.gif
    ├── model_diagnostic-1.png
    ├── p_and_p_cover.png
    ├── plot_tf_idf-1.png
    ├── purple_emily.png
    ├── slider.gif
    ├── stm_video.png
    ├── stop.gif
    ├── tidytext_repo.png
    ├── tilecounts-1.png
    ├── tilerate-1.png
    ├── tmwr_0601.png
    ├── top_tags-1.png
    └── vexing.gif
├── header.html
├── intro.Rmd
├── intro.html
├── intro_files
    └── figure-html
    │   ├── unnamed-chunk-13-1.png
    │   ├── unnamed-chunk-21-1.png
    │   └── unnamed-chunk-27-1.png
├── libs
    └── remark-css
    │   └── default.css
├── modeling.Rmd
├── modeling.html
├── modeling_files
    └── figure-html
    │   ├── unnamed-chunk-10-1.png
    │   ├── unnamed-chunk-14-1.png
    │   ├── unnamed-chunk-23-1.png
    │   └── unnamed-chunk-26-1.png
├── pdfs
    ├── juliasilge-textmining-sdss-2.pdf
    └── juliasilge-textminnig-sdss-1.pdf
└── sdss2019.Rproj


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *cache*
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Tidy Text Mining at SDSS 2019
 2 | 
 3 | Slides for my short course on text mining at the Symposium on Data Science and Statistics
 4 |  in May 2019.
 5 | 
 6 | Check out the slides at [bit.ly/silge-sdss-1](https://bit.ly/silge-sdss-1) and [bit.ly/silge-sdss-2](https://bit.ly/silge-sdss-2)!
 7 | 
 8 | 
 9 | Slides created with [remark.js](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
10 | 
11 | My xaringan theme (from [xaringanthemer](https://pkg.garrickadenbuie.com/xaringanthemer/)):
12 | 
13 | ```r
14 | mono_accent(
15 |     base_color           = "#09017F",
16 |     header_font_google   = google_font("Roboto", "700"),
17 |     text_font_google     = google_font("Roboto Condensed"),
18 |     code_font_google     = google_font("Droid Mono"),
19 |     code_highlight_color = "#D2B6E8"
20 |     )
21 | ```
22 | 


--------------------------------------------------------------------------------
/css/footer_plus.css:
--------------------------------------------------------------------------------
 1 | .large { font-size: 160% }
 2 | 
 3 | .title-slide {
 4 |   background-image: url(../figs/p_and_p_cover.png);
 5 |   background-size: cover;
 6 | }
 7 | 
 8 | .title-slide .remark-slide-number {
 9 |   display: none;
10 | }
11 | 
12 | .remark-slide-number {
13 |   display: none;
14 | }
15 | 
16 | div.my-footer {  /* footer bar; markup is the <div class="my-footer"> in the slides' layout */
17 |     background-color: #050045;
18 |     position: absolute;  /* pin the bar to the bottom-left corner of its positioned container */
19 |     bottom: 0px;
20 |     left: 0px;
21 |     height: 20px;
22 |     width: 100%;  /* span the full width */
23 | }
24 | div.my-footer span {  /* the text inside the footer bar */
25 |     font-size: 10pt;
26 |     color: #F7F8FA;
27 |     position: absolute;  /* offset the text from the bar's left and bottom edges */
28 |     left: 15px;
29 |     bottom: 2px;
30 | }
31 | 


--------------------------------------------------------------------------------
/css/xaringan-themer.css:
--------------------------------------------------------------------------------
  1 | /* -------------------------------------------------------
  2 |  *
  3 |  *     !! This file was generated by xaringanthemer !!
  4 |  *
  5 |  *  Changes made to this file directly will be overwritten
  6 |  *  if you used xaringanthemer in your xaringan slides Rmd
  7 |  *
  8 |  *  Issues or likes?
  9 |  *    - https://github.com/gadenbuie/xaringanthemer
 10 |  *    - https://www.garrickadenbuie.com
 11 |  *
 12 |  *  Need help? Try:
 13 |  *    - vignette(package = "xaringanthemer")
 14 |  *    - ?xaringanthemer::write_xaringan_theme
 15 |  *    - xaringan wiki: https://github.com/yihui/xaringan/wiki
 16 |  *    - remarkjs wiki: https://github.com/gnab/remark/wiki
 17 |  *
 18 |  * ------------------------------------------------------- */
 19 | @import url(https://fonts.googleapis.com/css?family=Roboto+Condensed);
 20 | @import url(https://fonts.googleapis.com/css?family=Roboto:700);
 21 | @import url(https://fonts.googleapis.com/css?family=Droid+Mono);
 22 | 
 23 | 
 24 | body {
 25 |   font-family: 'Roboto Condensed', 'Palatino Linotype', 'Book Antiqua', Palatino, 'Microsoft YaHei', 'Songti SC', serif;
 26 |   font-weight: normal; /* explicit value: an empty `font-weight: ;` is an invalid declaration and is ignored */
 27 |   color: #272822;
 28 | }
 29 | h1, h2, h3 {
 30 |   font-family: Roboto;
 31 |   font-weight: normal;
 32 |   color: #09017F;
 33 | }
 34 | .remark-slide-content {
 35 |   background-color: #FFFFFF;
 36 |   font-size: 20px;
 37 |   
 38 |   
 39 |   
 40 |   padding: 1em 4em 1em 4em;
 41 | }
 42 | .remark-slide-content h1 {
 43 |     font-size: 55px;
 44 | }
 45 | .remark-slide-content h2 {
 46 |     font-size: 45px;
 47 | }
 48 | .remark-slide-content h3 {
 49 |     font-size: 35px;
 50 | }
 51 | .remark-code, .remark-inline-code {
 52 |   font-family: Droid Mono, 'Lucida Console', Monaco, monospace;
 53 | }
 54 | .remark-code {
 55 |   font-size: 0.9em;
 56 | }
 57 | .remark-inline-code {
 58 |   font-size: 1em;
 59 |   color: #09017F;
 60 |   
 61 |   
 62 | }
 63 | .remark-slide-number {
 64 |   color: #09017F;
 65 |   opacity: 1;
 66 |   font-size: 0.9em;
 67 | }
 68 | strong{color:#09017F;}
 69 | a, a > code {
 70 |   color: #09017F;
 71 |   text-decoration: none;
 72 | }
 73 | .footnote {
 74 |   
 75 |   position: absolute;
 76 |   bottom: 3em;
 77 |   padding-right: 4em;
 78 |   font-size: 0.9em;
 79 | }
 80 | .remark-code-line-highlighted {
 81 |   background-color: #D2B6E8;
 82 | }
 83 | .inverse {
 84 |   background-color: #09017F;
 85 |   color: #FFFFFF;
 86 |   
 87 | }
 88 | .inverse h1, .inverse h2, .inverse h3 {
 89 |   color: #FFFFFF;
 90 | }
 91 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 {
 92 |   color: #FFFFFF;
 93 | }
 94 | .title-slide {
 95 |   background-color: #09017F;
 96 |   
 97 |   
 98 |   
 99 | }
100 | .title-slide .remark-slide-number {
101 |   display: none;
102 | }
103 | /* Two-column layout */
104 | .left-column {
105 |   width: 20%;
106 |   height: 92%;
107 |   float: left;
108 | }
109 | .left-column h2, .left-column h3 {
110 |   color: #09017F99;
111 | }
112 | .left-column h2:last-of-type, .left-column h3:last-child {
113 |   color: #09017F;
114 | }
115 | .right-column {
116 |   width: 75%;
117 |   float: right;
118 |   padding-top: 1em;
119 | }
120 | .pull-left {
121 |   float: left;
122 |   width: 47%;
123 | }
124 | .pull-right {
125 |   float: right;
126 |   width: 47%;
127 | }
128 | .pull-right ~ * {
129 |   clear: both;
130 | }
131 | img, video, iframe {
132 |   max-width: 100%;
133 | }
134 | blockquote {
135 |   border-left: solid 5px #09017F80;
136 |   padding-left: 1em;
137 | }
138 | .remark-slide table {
139 |   margin: auto;
140 |   border-top: 1px solid #666;
141 |   border-bottom: 1px solid #666;
142 | }
143 | .remark-slide table thead th { border-bottom: 1px solid #ddd; }
144 | th, td { padding: 5px; }
145 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #B5B2D8 }
146 | table.dataTable tbody {
147 |   background-color: #FFFFFF;
148 |   color: #272822;
149 | }
150 | table.dataTable.display tbody tr.odd {
151 |   background-color: #FFFFFF;
152 | }
153 | table.dataTable.display tbody tr.even {
154 |   background-color: #B5B2D8;
155 | }
156 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover {
157 |   background-color: rgba(255, 255, 255, 0.5);
158 | }
159 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate {
160 |   color: #272822;
161 | }
162 | .dataTables_wrapper .dataTables_paginate .paginate_button {
163 |   color: #272822 !important;
164 | }
165 | 
166 | @page { margin: 0; }
167 | @media print {
168 |   .remark-slide-scaler {
169 |     width: 100% !important;
170 |     height: 100% !important;
171 |     transform: scale(1) !important;
172 |     top: 0 !important;
173 |     left: 0 !important;
174 |   }
175 | }
176 | 


--------------------------------------------------------------------------------
/figs/austen-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/austen-1.png


--------------------------------------------------------------------------------
/figs/blue_jane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/blue_jane.png


--------------------------------------------------------------------------------
/figs/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/cover.png


--------------------------------------------------------------------------------
/figs/lizzieskipping.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/lizzieskipping.gif


--------------------------------------------------------------------------------
/figs/model_diagnostic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/model_diagnostic-1.png


--------------------------------------------------------------------------------
/figs/p_and_p_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/p_and_p_cover.png


--------------------------------------------------------------------------------
/figs/plot_tf_idf-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/plot_tf_idf-1.png


--------------------------------------------------------------------------------
/figs/purple_emily.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/purple_emily.png


--------------------------------------------------------------------------------
/figs/slider.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/slider.gif


--------------------------------------------------------------------------------
/figs/stm_video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stm_video.png


--------------------------------------------------------------------------------
/figs/stop.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stop.gif


--------------------------------------------------------------------------------
/figs/tidytext_repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tidytext_repo.png


--------------------------------------------------------------------------------
/figs/tilecounts-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilecounts-1.png


--------------------------------------------------------------------------------
/figs/tilerate-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilerate-1.png


--------------------------------------------------------------------------------
/figs/tmwr_0601.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tmwr_0601.png


--------------------------------------------------------------------------------
/figs/top_tags-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/top_tags-1.png


--------------------------------------------------------------------------------
/figs/vexing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/vexing.gif


--------------------------------------------------------------------------------
/header.html:
--------------------------------------------------------------------------------
1 | <script src="https://use.fontawesome.com/5235085b15.js"></script>
2 | 


--------------------------------------------------------------------------------
/intro.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Text Mining"
  3 | subtitle: "<br><br>USING TIDY DATA PRINCIPLES"
  4 | author: "Julia Silge | SDSS | 29 May 2019"
  5 | output:
  6 |   xaringan::moon_reader:
  7 |     css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"]
  8 |     lib_dir: libs
  9 |     nature:
 10 |       highlightStyle: github
 11 |       highlightLines: true
 12 |       countIncrementalSlides: false
 13 |       ratio: "16:9"
 14 |     seal: false  
 15 |     includes:
 16 |       in_header: header.html
 17 | ---
 18 | 
 19 | ```{r setup, include=FALSE}
 20 | options(htmltools.dir.version = FALSE)
 21 | library(knitr)
 22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180)
 23 | library(ggplot2)
 24 | library(silgelib)
 25 | theme_set(theme_roboto())
 26 | ```
 27 | 
 28 | layout: true
 29 | 
 30 | <div class="my-footer"><span>bit.ly/silge-sdss-1</span></div> 
 31 | 
 32 | ---
 33 | 
 34 | class: inverse, center, middle
 35 | 
 36 | background-image: url(figs/p_and_p_cover.png)
 37 | background-size: cover
 38 | 
 39 | 
 40 | # Text Mining
 41 | 
 42 | <img src="figs/blue_jane.png" width="150px"/>
 43 | 
 44 | ### USING TIDY PRINCIPLES
 45 | 
 46 | .large[Julia Silge | SDSS | 29 May 2019]
 47 | 
 48 | ---
 49 | 
 50 | ## Let's install some packages
 51 | 
 52 | ```{r, eval=FALSE}
 53 | install.packages(c("tidyverse", 
 54 |                    "tidytext", 
 55 |                    "gutenbergr"))
 56 | ```
 57 | 
 58 | ---
 59 | 
 60 | class: right, middle
 61 | 
 62 | <img src="figs/blue_jane.png" width="150px"/>
 63 | 
 64 | # Find us at...
 65 | 
 66 | <a href="http://twitter.com/juliasilge"><i class="fa fa-twitter fa-fw"></i>&nbsp; @juliasilge</a><br>
 67 | <a href="http://github.com/juliasilge"><i class="fa fa-github fa-fw"></i>&nbsp; @juliasilge</a><br>
 68 | <a href="https://juliasilge.com"><i class="fa fa-link fa-fw"></i>&nbsp; juliasilge.com</a><br>
 69 | 
 70 | ---
 71 | 
 72 | class: right, middle
 73 | 
 74 | <img src="figs/blue_jane.png" width="150px"/>
 75 | 
 76 | # Find us at...
 77 | 
 78 | <a href="http://twitter.com/dataandme"><i class="fa fa-twitter fa-fw"></i>&nbsp; @dataandme</a><br>
 79 | <a href="http://github.com/batpigandme"><i class="fa fa-github fa-fw"></i>&nbsp; @batpigandme</a><br>
 80 | <a href="https://maraaverick.rbind.io"><i class="fa fa-link fa-fw"></i>&nbsp; maraaverick.rbind.io</a><br>
 81 | 
 82 | ---
 83 | 
 84 | class: inverse
 85 | 
 86 | ## Text in the real world
 87 | 
 88 | --
 89 | 
 90 | - .large[Text data is increasingly important `r emo::ji("books")`]
 91 | 
 92 | --
 93 | 
 94 | - .large[NLP training is scarce on the ground `r emo::ji("scream")`]
 95 | 
 96 | ---
 97 | 
 98 | background-image: url(figs/vexing.gif)
 99 | background-position: 50% 50%
100 | background-size: 650px
101 | 
102 | ---
103 | 
104 | background-image: url(figs/p_and_p_cover.png)
105 | background-size: cover
106 | 
107 | class: inverse, center, middle
108 | 
109 | # TIDY DATA PRINCIPLES + TEXT MINING = `r emo::ji("tada")`
110 | 
111 | ---
112 | 
113 | background-image: url(figs/tidytext_repo.png)
114 | background-size: 800px
115 | background-position: 50% 20%
116 | 
117 | class: bottom, right
118 | 
119 | .large[[https://github.com/juliasilge/tidytext](https://github.com/juliasilge/tidytext)]
120 | 
121 | .large[[http://tidytextmining.com/](http://tidytextmining.com/)]
122 | 
123 | ---
124 | 
125 | background-image: url(figs/cover.png)
126 | background-size: 450px
127 | background-position: 50% 50%
128 | 
129 | ---
130 | 
131 | <img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/>
132 | 
133 | ## What do we mean by tidy text?
134 | 
135 | 
136 | ```{r}
137 | text <- c("Because I could not stop for Death -",
138 |           "He kindly stopped for me -",
139 |           "The Carriage held but just Ourselves -",
140 |           "and Immortality")
141 | 
142 | text
143 | ```
144 | 
145 | ---
146 | 
147 | <img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/>
148 | 
149 | ## What do we mean by tidy text?
150 | 
151 | ```{r}
152 | library(tidyverse)
153 | text_df <- tibble(line = 1:4, text = text)
154 | 
155 | text_df
156 | ```
157 | 
158 | ---
159 | 
160 | <img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/>
161 | 
162 | ## What do we mean by tidy text?
163 | 
164 | ```{r}
165 | library(tidytext)
166 | 
167 | text_df %>%
168 |   unnest_tokens(word, text)        #<<
169 | ```
170 | 
171 | ---
172 | 
173 | ## Gathering more data
174 | 
175 | .large[You can access the full text of many public domain works from [Project Gutenberg](https://www.gutenberg.org/) using the [gutenbergr](https://ropensci.org/tutorials/gutenbergr_tutorial.html) package.]
176 | 
177 | 
178 | ```{r}
179 | library(gutenbergr)
180 | 
181 | full_text <- gutenberg_download(1342)
182 | ```
183 | 
184 | .large[What book do *you* want to analyze today? `r emo::ji_glue(":book: :partying: :book:")`]
185 | 
186 | ---
187 | 
188 | ## Time to tidy your text!
189 | 
190 | ```{r}
191 | tidy_book <- full_text %>%
192 |   mutate(line = row_number()) %>%
193 |   unnest_tokens(word, text)                #<<
194 | 
195 | tidy_book
196 | ```
197 | 
198 | ---
199 | 
200 | ## What are the most common words?
201 | 
202 | ```{r}
203 | tidy_book %>%
204 |   count(word, sort = TRUE)
205 | ```
206 | 
207 | ---
208 | 
209 | background-image: url(figs/stop.gif)
210 | background-size: 500px
211 | background-position: 50% 50%
212 | 
213 | ## Stop words
214 | 
215 | ---
216 | 
217 | ## Stop words
218 | 
219 | ```{r}
220 | get_stopwords()
221 | ```
222 | 
223 | ---
224 | 
225 | ## Stop words
226 | 
227 | ```{r}
228 | get_stopwords(language = "es")
229 | ```
230 | 
231 | ---
232 | 
233 | ## Stop words
234 | 
235 | ```{r}
236 | get_stopwords(language = "pt")
237 | ```
238 | 
239 | ---
240 | 
241 | ## Stop words
242 | 
243 | ```{r}
244 | get_stopwords(source = "smart")
245 | ```
246 | 
247 | ---
248 | 
249 | ## What are the most common words?
250 | 
251 | ```{r, eval = FALSE}
252 | tidy_book %>%
253 |   anti_join(get_stopwords(source = "smart")) %>%
254 |   count(word, sort = TRUE) %>%
255 |   top_n(20) %>%
256 |   ggplot(aes(fct_reorder(word, n), n)) +            #<<
257 |   geom_col() +
258 |   coord_flip()
259 | ```
260 | 
261 | ---
262 | 
263 | ```{r, echo=FALSE, fig.height=4}
264 | tidy_book %>%
265 |   anti_join(get_stopwords(source = "smart")) %>%
266 |   count(word, sort = TRUE) %>%
267 |   top_n(20) %>%
268 |   ggplot(aes(fct_reorder(word, n), n)) +
269 |   geom_col(fill = "midnightblue", alpha = 0.9) +
270 |   coord_flip() +
271 |   scale_y_continuous(expand = c(0,0)) +
272 |   labs(x = NULL, y = "Number of occurrences")
273 | ```
274 | 
275 | ---
276 | 
277 | background-image: url(figs/tilecounts-1.png)
278 | background-size: 700px
279 | 
280 | ---
281 | 
282 | background-image: url(figs/tilerate-1.png)
283 | background-size: 700px
284 | 
285 | ---
286 | 
287 | background-image: url(figs/p_and_p_cover.png)
288 | background-size: cover
289 | 
290 | class: inverse, center, middle
291 | 
292 | ## SENTIMENT ANALYSIS `r emo::ji_glue(":smile: :cry: :angry:")`
293 | 
294 | ---
295 | 
296 | ## Sentiment lexicons
297 | 
298 | ```{r}
299 | get_sentiments("afinn")
300 | ```
301 | 
302 | ---
303 | 
304 | ## Sentiment lexicons
305 | 
306 | ```{r}
307 | get_sentiments("bing")
308 | ```
309 | 
310 | ---
311 | 
312 | ## Sentiment lexicons
313 | 
314 | 
315 | ```{r}
316 | get_sentiments("nrc")
317 | ```
318 | 
319 | ---
320 | 
321 | ## Sentiment lexicons
322 | 
323 | ```{r}
324 | get_sentiments("loughran")
325 | ```
326 | 
327 | ---
328 | 
329 | ## Implementing sentiment analysis
330 | 
331 | ```{r}
332 | tidy_book %>%
333 |   inner_join(get_sentiments("bing")) %>%            #<<
334 |   count(sentiment, sort = TRUE)
335 | ```
336 | 
337 | ---
338 | 
339 | ## Implementing sentiment analysis
340 | 
341 | ```{r}
342 | tidy_book %>%
343 |   inner_join(get_sentiments("bing")) %>%            
344 |   count(sentiment, word, sort = TRUE)             #<<
345 | ```
346 | 
347 | ---
348 | 
349 | ## Implementing sentiment analysis
350 | 
351 | ```{r, eval = FALSE}
352 | tidy_book %>%
353 |   inner_join(get_sentiments("bing")) %>%
354 |   count(sentiment, word, sort = TRUE) %>%
355 |   group_by(sentiment) %>%
356 |   top_n(10) %>%
357 |   ungroup %>%
358 |   ggplot(aes(fct_reorder(word, n),               #<<
359 |              n, 
360 |              fill = sentiment)) +
361 |   geom_col() +
362 |   coord_flip() +
363 |   facet_wrap(~ sentiment, scales = "free") 
364 | ```
365 | 
366 | ---
367 | 
368 | class: middle
369 | 
370 | ```{r, echo=FALSE, fig.height=4}
371 | tidy_book %>%
372 |   inner_join(get_sentiments("bing")) %>%
373 |   count(sentiment, word, sort = TRUE) %>%
374 |   group_by(sentiment) %>%
375 |   top_n(10) %>%
376 |   ungroup %>%
377 |   ggplot(aes(fct_reorder(word, n), n, fill = sentiment)) +
378 |   geom_col(alpha = 0.9, show.legend = FALSE) +
379 |   coord_flip() +
380 |   facet_wrap(~ sentiment, scales = "free") +
381 |   scale_y_continuous(expand = c(0,0)) +
382 |   labs(x = NULL, y = "Number of occurrences")
383 | ```
384 | 
385 | ---
386 | 
387 | background-image: url(figs/p_and_p_cover.png)
388 | background-size: cover
389 | 
390 | class: inverse, center, middle
391 | 
392 | ## WHAT IS A DOCUMENT ABOUT? `r emo::ji("thinking")`
393 | 
394 | ---
395 | 
396 | ## What is a document about?
397 | 
398 | - .large[Term frequency]
399 | - .large[Inverse document frequency]
400 | 
401 | $$idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}$$
402 | 
403 | ### tf-idf is about comparing **documents** within a **collection**.
404 | 
405 | ---
406 | 
407 | ## Understanding tf-idf
408 | 
409 | .large[Make a collection (*corpus*) for yourself! `r emo::ji("nail")`]
410 | 
411 | ```{r}
412 | full_collection <- gutenberg_download(c(1342, 158, 161, 141),
413 |                                       meta_fields = "title")
414 | 
415 | full_collection
416 | ```
417 | 
418 | ---
419 | 
420 | ## Counting word frequencies in your collection
421 | 
422 | ```{r}
423 | book_words <- full_collection %>%
424 |   unnest_tokens(word, text) %>%                #<<
425 |   count(title, word, sort = TRUE)
426 | 
427 | book_words
428 | ```
429 | 
430 | ---
431 | 
432 | ## Calculating tf-idf
433 | 
434 | .large[That's... super exciting???]
435 | 
436 | ```{r}
437 | book_tfidf <- book_words %>%
438 |   bind_tf_idf(word, title, n)            #<<
439 | 
440 | book_tfidf
441 | ```
442 | 
443 | ---
444 | 
445 | ## Calculating tf-idf
446 | 
447 | ```{r}
448 | book_tfidf %>%
449 |   arrange(-tf_idf)
450 | ```
451 | 
452 | ---
453 | 
454 | ## Calculating tf-idf
455 | 
456 | ```{r, eval = FALSE}
457 | book_tfidf %>%
458 |   group_by(title) %>%
459 |   top_n(10, tf_idf) %>%
460 |   ungroup() %>%
461 |   ggplot(aes(fct_reorder(word, tf_idf),               #<<
462 |              tf_idf, 
463 |              fill = title)) +
464 |   geom_col(show.legend = FALSE) +
465 |   coord_flip() +
466 |   facet_wrap(~title, scales = "free")
467 | ```
468 | 
469 | ---
470 | 
471 | ```{r, echo=FALSE, fig.height=4}
472 | book_tfidf %>%
473 |   group_by(title) %>%
474 |   top_n(10, tf_idf) %>%   # rank by tf-idf explicitly rather than by whichever column is last
475 |   ungroup() %>%
476 |   ggplot(aes(fct_reorder(word, tf_idf), 
477 |              tf_idf, 
478 |              fill = title)) +
479 |   geom_col(alpha = 0.9, show.legend = FALSE) +
480 |   coord_flip() +
481 |   facet_wrap(~title, scales = "free") +
482 |   scale_y_continuous(expand = c(0,0)) +
483 |   labs(x = NULL, y = "tf-idf")
484 | ```
485 | 
486 | ---
487 | 
488 | background-image: url(figs/plot_tf_idf-1.png)
489 | background-size: 800px
490 | 
491 | ---
492 | 
493 | ## N-grams... and beyond! `r emo::ji("rocket")`
494 | 
495 | ```{r}
496 | tidy_ngram <- full_text %>%
497 |   unnest_tokens(bigram, text, token = "ngrams", n = 2)        #<<
498 | 
499 | tidy_ngram
500 | ```
501 | 
502 | ---
503 | 
504 | ## N-grams... and beyond! `r emo::ji("rocket")`
505 | 
506 | ```{r}
507 | tidy_ngram %>%
508 |   count(bigram, sort = TRUE)
509 | ```
510 | 
511 | ---
512 | 
513 | ## N-grams... and beyond! `r emo::ji("rocket")`
514 | 
515 | ```{r}
516 | tidy_ngram %>%
517 |   separate(bigram, c("word1", "word2"), sep = " ") %>%         #<<
518 |   filter(!word1 %in% stop_words$word,
519 |          !word2 %in% stop_words$word) %>%
520 |   count(word1, word2, sort = TRUE)
521 | ```
522 | 
523 | ---
524 | 
525 | background-image: url(figs/p_and_p_cover.png)
526 | background-size: cover
527 | 
528 | class: inverse
529 | 
530 | ## What can you do with n-grams?
531 | 
532 | - .large[tf-idf of n-grams]
533 | 
534 | --
535 | 
536 | - .large[network analysis]
537 | 
538 | --
539 | 
540 | - .large[negation]
541 | 
542 | ---
543 | 
544 | background-image: url(figs/austen-1.png)
545 | background-size: 750px
546 | 
547 | ---
548 | 
549 | background-image: url(figs/slider.gif)
550 | background-position: 50% 70%
551 | 
552 | ## What can you do with n-grams?
553 | 
554 | ### [She Giggles, He Gallops](https://pudding.cool/2017/08/screen-direction/)
555 | 
556 | ---
557 | 
558 | class: left, middle
559 | 
560 | <img src="figs/blue_jane.png" width="150px"/>
561 | 
562 | # Thanks!
563 | 
564 | <a href="https://tidytextmining.com"><i class="fa fa-book fa-fw"></i>&nbsp; tidytextmining.com</a><br>
565 | <a href="http://twitter.com/juliasilge"><i class="fa fa-twitter fa-fw"></i>&nbsp; @juliasilge</a><br>
566 | <a href="http://github.com/juliasilge"><i class="fa fa-github fa-fw"></i>&nbsp; @juliasilge</a><br>
567 | <a href="https://juliasilge.com"><i class="fa fa-link fa-fw"></i>&nbsp; juliasilge.com</a><br>
568 | <a href="http://twitter.com/dataandme"><i class="fa fa-twitter fa-fw"></i>&nbsp; @dataandme</a><br>
569 | <a href="http://github.com/batpigandme"><i class="fa fa-github fa-fw"></i>&nbsp; @batpigandme</a><br>
570 | <a href="https://maraaverick.rbind.io"><i class="fa fa-link fa-fw"></i>&nbsp; maraaverick.rbind.io</a><br>
571 | 
572 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
573 | 


--------------------------------------------------------------------------------
/intro.html:
--------------------------------------------------------------------------------
   1 | <!DOCTYPE html>
   2 | <html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
   3 |   <head>
   4 |     <title>Text Mining</title>
   5 |     <meta charset="utf-8" />
   6 |     <meta name="author" content="Julia Silge | SDSS | 29 May 2019" />
   7 |     <link href="libs/remark-css/default.css" rel="stylesheet" />
   8 |     <script src="https://use.fontawesome.com/5235085b15.js"></script>
   9 |     <link rel="stylesheet" href="css/xaringan-themer.css" type="text/css" />
  10 |     <link rel="stylesheet" href="css/footer_plus.css" type="text/css" />
  11 |   </head>
  12 |   <body>
  13 |     <textarea id="source">
  14 | 
  15 | 
  16 | 
  17 | 
  18 | layout: true
  19 | 
  20 | &lt;div class="my-footer"&gt;&lt;span&gt;bit.ly/silge-sdss-1&lt;/span&gt;&lt;/div&gt; 
  21 | 
  22 | ---
  23 | 
  24 | class: inverse, center, middle
  25 | 
  26 | background-image: url(figs/p_and_p_cover.png)
  27 | background-size: cover
  28 | 
  29 | 
  30 | # Text Mining
  31 | 
  32 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
  33 | 
  34 | ### USING TIDY PRINCIPLES
  35 | 
  36 | .large[Julia Silge | SDSS | 29 May 2019]
  37 | 
  38 | ---
  39 | 
  40 | ## Let's install some packages
  41 | 
  42 | 
  43 | ```r
  44 | install.packages(c("tidyverse", 
  45 |                    "tidytext", 
  46 |                    "gutenbergr"))
  47 | ```
  48 | 
  49 | ---
  50 | 
  51 | class: right, middle
  52 | 
  53 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
  54 | 
  55 | # Find us at...
  56 | 
  57 | &lt;a href="http://twitter.com/juliasilge"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
  58 | &lt;a href="http://github.com/juliasilge"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
  59 | &lt;a href="https://juliasilge.com"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; juliasilge.com&lt;/a&gt;&lt;br&gt;
  60 | 
  61 | ---
  62 | 
  63 | class: right, middle
  64 | 
  65 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
  66 | 
  67 | # Find us at...
  68 | 
  69 | &lt;a href="http://twitter.com/dataandme"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @dataandme&lt;/a&gt;&lt;br&gt;
  70 | &lt;a href="http://github.com/batpigandme"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @batpigandme&lt;/a&gt;&lt;br&gt;
  71 | &lt;a href="https://maraaverick.rbind.io"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; maraaverick.rbind.io&lt;/a&gt;&lt;br&gt;
  72 | 
  73 | ---
  74 | 
  75 | class: inverse
  76 | 
  77 | ## Text in the real world
  78 | 
  79 | --
  80 | 
  81 | - .large[Text data is increasingly important 📚]
  82 | 
  83 | --
  84 | 
  85 | - .large[NLP training is scarce on the ground 😱]
  86 | 
  87 | ---
  88 | 
  89 | background-image: url(figs/vexing.gif)
  90 | background-position: 50% 50%
  91 | background-size: 650px
  92 | 
  93 | ---
  94 | 
  95 | background-image: url(figs/p_and_p_cover.png)
  96 | background-size: cover
  97 | 
  98 | class: inverse, center, middle
  99 | 
 100 | # TIDY DATA PRINCIPLES + TEXT MINING = 🎉
 101 | 
 102 | ---
 103 | 
 104 | background-image: url(figs/tidytext_repo.png)
 105 | background-size: 800px
 106 | background-position: 50% 20%
 107 | 
 108 | class: bottom, right
 109 | 
 110 | .large[[https://github.com/juliasilge/tidytext](https://github.com/juliasilge/tidytext)]
 111 | 
 112 | .large[[http://tidytextmining.com/](http://tidytextmining.com/)]
 113 | 
 114 | ---
 115 | 
 116 | background-image: url(figs/cover.png)
 117 | background-size: 450px
 118 | background-position: 50% 50%
 119 | 
 120 | ---
 121 | 
 122 | &lt;img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/&gt;
 123 | 
 124 | ## What do we mean by tidy text?
 125 | 
 126 | 
 127 | 
 128 | ```r
 129 | text &lt;- c("Because I could not stop for Death -",
 130 |           "He kindly stopped for me -",
 131 |           "The Carriage held but just Ourselves -",
 132 |           "and Immortality")
 133 | 
 134 | text
 135 | ```
 136 | 
 137 | ```
 138 | ## [1] "Because I could not stop for Death -"  
 139 | ## [2] "He kindly stopped for me -"            
 140 | ## [3] "The Carriage held but just Ourselves -"
 141 | ## [4] "and Immortality"
 142 | ```
 143 | 
 144 | ---
 145 | 
 146 | &lt;img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/&gt;
 147 | 
 148 | ## What do we mean by tidy text?
 149 | 
 150 | 
 151 | ```r
 152 | library(tidyverse)
 153 | text_df &lt;- tibble(line = 1:4, text = text)
 154 | 
 155 | text_df
 156 | ```
 157 | 
 158 | ```
 159 | ## # A tibble: 4 x 2
 160 | ##    line text                                  
 161 | ##   &lt;int&gt; &lt;chr&gt;                                 
 162 | ## 1     1 Because I could not stop for Death -  
 163 | ## 2     2 He kindly stopped for me -            
 164 | ## 3     3 The Carriage held but just Ourselves -
 165 | ## 4     4 and Immortality
 166 | ```
 167 | 
 168 | ---
 169 | 
 170 | &lt;img src="figs/purple_emily.png" style="position:absolute;top:20px;right:20px;" width="100px"/&gt;
 171 | 
 172 | ## What do we mean by tidy text?
 173 | 
 174 | 
 175 | ```r
 176 | library(tidytext)
 177 | 
 178 | text_df %&gt;%
 179 | * unnest_tokens(word, text)
 180 | ```
 181 | 
 182 | ```
 183 | ## # A tibble: 20 x 2
 184 | ##     line word       
 185 | ##    &lt;int&gt; &lt;chr&gt;      
 186 | ##  1     1 because    
 187 | ##  2     1 i          
 188 | ##  3     1 could      
 189 | ##  4     1 not        
 190 | ##  5     1 stop       
 191 | ##  6     1 for        
 192 | ##  7     1 death      
 193 | ##  8     2 he         
 194 | ##  9     2 kindly     
 195 | ## 10     2 stopped    
 196 | ## 11     2 for        
 197 | ## 12     2 me         
 198 | ## 13     3 the        
 199 | ## 14     3 carriage   
 200 | ## 15     3 held       
 201 | ## 16     3 but        
 202 | ## 17     3 just       
 203 | ## 18     3 ourselves  
 204 | ## 19     4 and        
 205 | ## 20     4 immortality
 206 | ```
 207 | 
 208 | ---
 209 | 
 210 | ## Gathering more data
 211 | 
 212 | .large[You can access the full text of many public domain works from [Project Gutenberg](https://www.gutenberg.org/) using the [gutenbergr](https://ropensci.org/tutorials/gutenbergr_tutorial.html) package.]
 213 | 
 214 | 
 215 | 
 216 | ```r
 217 | library(gutenbergr)
 218 | 
 219 | full_text &lt;- gutenberg_download(1342)
 220 | ```
 221 | 
 222 | .large[What book do *you* want to analyze today? 📖 👯‍♂️ 📖]
 223 | 
 224 | ---
 225 | 
 226 | ## Time to tidy your text!
 227 | 
 228 | 
 229 | ```r
 230 | tidy_book &lt;- full_text %&gt;%
 231 |   mutate(line = row_number()) %&gt;%
 232 | * unnest_tokens(word, text)
 233 | 
 234 | tidy_book
 235 | ```
 236 | 
 237 | ```
 238 | ## # A tibble: 122,204 x 3
 239 | ##    gutenberg_id  line word     
 240 | ##           &lt;int&gt; &lt;int&gt; &lt;chr&gt;    
 241 | ##  1         1342     1 pride    
 242 | ##  2         1342     1 and      
 243 | ##  3         1342     1 prejudice
 244 | ##  4         1342     3 by       
 245 | ##  5         1342     3 jane     
 246 | ##  6         1342     3 austen   
 247 | ##  7         1342     7 chapter  
 248 | ##  8         1342     7 1        
 249 | ##  9         1342    10 it       
 250 | ## 10         1342    10 is       
 251 | ## # … with 122,194 more rows
 252 | ```
 253 | 
 254 | ---
 255 | 
 256 | ## What are the most common words?
 257 | 
 258 | 
 259 | ```r
 260 | tidy_book %&gt;%
 261 |   count(word, sort = TRUE)
 262 | ```
 263 | 
 264 | ```
 265 | ## # A tibble: 6,538 x 2
 266 | ##    word      n
 267 | ##    &lt;chr&gt; &lt;int&gt;
 268 | ##  1 the    4331
 269 | ##  2 to     4162
 270 | ##  3 of     3610
 271 | ##  4 and    3585
 272 | ##  5 her    2203
 273 | ##  6 i      2065
 274 | ##  7 a      1954
 275 | ##  8 in     1880
 276 | ##  9 was    1843
 277 | ## 10 she    1695
 278 | ## # … with 6,528 more rows
 279 | ```
 280 | 
 281 | ---
 282 | 
 283 | background-image: url(figs/stop.gif)
 284 | background-size: 500px
 285 | background-position: 50% 50%
 286 | 
 287 | ## Stop words
 288 | 
 289 | ---
 290 | 
 291 | ## Stop words
 292 | 
 293 | 
 294 | ```r
 295 | get_stopwords()
 296 | ```
 297 | 
 298 | ```
 299 | ## # A tibble: 175 x 2
 300 | ##    word      lexicon 
 301 | ##    &lt;chr&gt;     &lt;chr&gt;   
 302 | ##  1 i         snowball
 303 | ##  2 me        snowball
 304 | ##  3 my        snowball
 305 | ##  4 myself    snowball
 306 | ##  5 we        snowball
 307 | ##  6 our       snowball
 308 | ##  7 ours      snowball
 309 | ##  8 ourselves snowball
 310 | ##  9 you       snowball
 311 | ## 10 your      snowball
 312 | ## # … with 165 more rows
 313 | ```
 314 | 
 315 | ---
 316 | 
 317 | ## Stop words
 318 | 
 319 | 
 320 | ```r
 321 | get_stopwords(language = "es")
 322 | ```
 323 | 
 324 | ```
 325 | ## # A tibble: 308 x 2
 326 | ##    word  lexicon 
 327 | ##    &lt;chr&gt; &lt;chr&gt;   
 328 | ##  1 de    snowball
 329 | ##  2 la    snowball
 330 | ##  3 que   snowball
 331 | ##  4 el    snowball
 332 | ##  5 en    snowball
 333 | ##  6 y     snowball
 334 | ##  7 a     snowball
 335 | ##  8 los   snowball
 336 | ##  9 del   snowball
 337 | ## 10 se    snowball
 338 | ## # … with 298 more rows
 339 | ```
 340 | 
 341 | ---
 342 | 
 343 | ## Stop words
 344 | 
 345 | 
 346 | ```r
 347 | get_stopwords(language = "pt")
 348 | ```
 349 | 
 350 | ```
 351 | ## # A tibble: 203 x 2
 352 | ##    word  lexicon 
 353 | ##    &lt;chr&gt; &lt;chr&gt;   
 354 | ##  1 de    snowball
 355 | ##  2 a     snowball
 356 | ##  3 o     snowball
 357 | ##  4 que   snowball
 358 | ##  5 e     snowball
 359 | ##  6 do    snowball
 360 | ##  7 da    snowball
 361 | ##  8 em    snowball
 362 | ##  9 um    snowball
 363 | ## 10 para  snowball
 364 | ## # … with 193 more rows
 365 | ```
 366 | 
 367 | ---
 368 | 
 369 | ## Stop words
 370 | 
 371 | 
 372 | ```r
 373 | get_stopwords(source = "smart")
 374 | ```
 375 | 
 376 | ```
 377 | ## # A tibble: 571 x 2
 378 | ##    word        lexicon
 379 | ##    &lt;chr&gt;       &lt;chr&gt;  
 380 | ##  1 a           smart  
 381 | ##  2 a's         smart  
 382 | ##  3 able        smart  
 383 | ##  4 about       smart  
 384 | ##  5 above       smart  
 385 | ##  6 according   smart  
 386 | ##  7 accordingly smart  
 387 | ##  8 across      smart  
 388 | ##  9 actually    smart  
 389 | ## 10 after       smart  
 390 | ## # … with 561 more rows
 391 | ```
 392 | 
 393 | ---
 394 | 
 395 | ## What are the most common words?
 396 | 
 397 | 
 398 | ```r
 399 | tidy_book %&gt;%
 400 |   anti_join(get_stopwords(source = "smart")) %&gt;%
 401 |   count(word, sort = TRUE) %&gt;%
 402 |   top_n(20) %&gt;%
 403 | * ggplot(aes(fct_reorder(word, n), n)) +
 404 |   geom_col() +
 405 |   coord_flip()
 406 | ```
 407 | 
 408 | ---
 409 | 
 410 | ![](intro_files/figure-html/unnamed-chunk-13-1.png)&lt;!-- --&gt;
 411 | 
 412 | ---
 413 | 
 414 | background-image: url(figs/tilecounts-1.png)
 415 | background-size: 700px
 416 | 
 417 | ---
 418 | 
 419 | background-image: url(figs/tilerate-1.png)
 420 | background-size: 700px
 421 | 
 422 | ---
 423 | 
 424 | background-image: url(figs/p_and_p_cover.png)
 425 | background-size: cover
 426 | 
 427 | class: inverse, center, middle
 428 | 
 429 | ## SENTIMENT ANALYSIS 😄 😢 😠
 430 | 
 431 | ---
 432 | 
 433 | ## Sentiment lexicons
 434 | 
 435 | 
 436 | ```r
 437 | get_sentiments("afinn")
 438 | ```
 439 | 
 440 | ```
 441 | ## # A tibble: 2,476 x 2
 442 | ##    word       score
 443 | ##    &lt;chr&gt;      &lt;int&gt;
 444 | ##  1 abandon       -2
 445 | ##  2 abandoned     -2
 446 | ##  3 abandons      -2
 447 | ##  4 abducted      -2
 448 | ##  5 abduction     -2
 449 | ##  6 abductions    -2
 450 | ##  7 abhor         -3
 451 | ##  8 abhorred      -3
 452 | ##  9 abhorrent     -3
 453 | ## 10 abhors        -3
 454 | ## # … with 2,466 more rows
 455 | ```
 456 | 
 457 | ---
 458 | 
 459 | ## Sentiment lexicons
 460 | 
 461 | 
 462 | ```r
 463 | get_sentiments("bing")
 464 | ```
 465 | 
 466 | ```
 467 | ## # A tibble: 6,788 x 2
 468 | ##    word        sentiment
 469 | ##    &lt;chr&gt;       &lt;chr&gt;    
 470 | ##  1 2-faced     negative 
 471 | ##  2 2-faces     negative 
 472 | ##  3 a+          positive 
 473 | ##  4 abnormal    negative 
 474 | ##  5 abolish     negative 
 475 | ##  6 abominable  negative 
 476 | ##  7 abominably  negative 
 477 | ##  8 abominate   negative 
 478 | ##  9 abomination negative 
 479 | ## 10 abort       negative 
 480 | ## # … with 6,778 more rows
 481 | ```
 482 | 
 483 | ---
 484 | 
 485 | ## Sentiment lexicons
 486 | 
 487 | 
 488 | 
 489 | ```r
 490 | get_sentiments("nrc")
 491 | ```
 492 | 
 493 | ```
 494 | ## # A tibble: 13,901 x 2
 495 | ##    word        sentiment
 496 | ##    &lt;chr&gt;       &lt;chr&gt;    
 497 | ##  1 abacus      trust    
 498 | ##  2 abandon     fear     
 499 | ##  3 abandon     negative 
 500 | ##  4 abandon     sadness  
 501 | ##  5 abandoned   anger    
 502 | ##  6 abandoned   fear     
 503 | ##  7 abandoned   negative 
 504 | ##  8 abandoned   sadness  
 505 | ##  9 abandonment anger    
 506 | ## 10 abandonment fear     
 507 | ## # … with 13,891 more rows
 508 | ```
 509 | 
 510 | ---
 511 | 
 512 | ## Sentiment lexicons
 513 | 
 514 | 
 515 | ```r
 516 | get_sentiments("loughran")
 517 | ```
 518 | 
 519 | ```
 520 | ## # A tibble: 4,149 x 2
 521 | ##    word         sentiment
 522 | ##    &lt;chr&gt;        &lt;chr&gt;    
 523 | ##  1 abandon      negative 
 524 | ##  2 abandoned    negative 
 525 | ##  3 abandoning   negative 
 526 | ##  4 abandonment  negative 
 527 | ##  5 abandonments negative 
 528 | ##  6 abandons     negative 
 529 | ##  7 abdicated    negative 
 530 | ##  8 abdicates    negative 
 531 | ##  9 abdicating   negative 
 532 | ## 10 abdication   negative 
 533 | ## # … with 4,139 more rows
 534 | ```
 535 | 
 536 | ---
 537 | 
 538 | ## Implementing sentiment analysis
 539 | 
 540 | 
 541 | ```r
 542 | tidy_book %&gt;%
 543 | * inner_join(get_sentiments("bing")) %&gt;%
 544 |   count(sentiment, sort = TRUE)
 545 | ```
 546 | 
 547 | ```
 548 | ## # A tibble: 2 x 2
 549 | ##   sentiment     n
 550 | ##   &lt;chr&gt;     &lt;int&gt;
 551 | ## 1 positive   5052
 552 | ## 2 negative   3652
 553 | ```
 554 | 
 555 | ---
 556 | 
 557 | ## Implementing sentiment analysis
 558 | 
 559 | 
 560 | ```r
 561 | tidy_book %&gt;%
 562 |   inner_join(get_sentiments("bing")) %&gt;%            
 563 | * count(sentiment, word, sort = TRUE)
 564 | ```
 565 | 
 566 | ```
 567 | ## # A tibble: 1,430 x 3
 568 | ##    sentiment word         n
 569 | ##    &lt;chr&gt;     &lt;chr&gt;    &lt;int&gt;
 570 | ##  1 negative  miss       283
 571 | ##  2 positive  well       224
 572 | ##  3 positive  good       200
 573 | ##  4 positive  great      142
 574 | ##  5 positive  enough     106
 575 | ##  6 positive  better      92
 576 | ##  7 positive  love        92
 577 | ##  8 positive  pleasure    92
 578 | ##  9 positive  happy       83
 579 | ## 10 positive  like        77
 580 | ## # … with 1,420 more rows
 581 | ```
 582 | 
 583 | ---
 584 | 
 585 | ## Implementing sentiment analysis
 586 | 
 587 | 
 588 | ```r
 589 | tidy_book %&gt;%
 590 |   inner_join(get_sentiments("bing")) %&gt;%
 591 |   count(sentiment, word, sort = TRUE) %&gt;%
 592 |   group_by(sentiment) %&gt;%
 593 |   top_n(10) %&gt;%
 594 |   ungroup %&gt;%
 595 | * ggplot(aes(fct_reorder(word, n),
 596 |              n, 
 597 |              fill = sentiment)) +
 598 |   geom_col() +
 599 |   coord_flip() +
 600 |   facet_wrap(~ sentiment, scales = "free") 
 601 | ```
 602 | 
 603 | ---
 604 | 
 605 | class: middle
 606 | 
 607 | ![](intro_files/figure-html/unnamed-chunk-21-1.png)&lt;!-- --&gt;
 608 | 
 609 | ---
 610 | 
 611 | background-image: url(figs/p_and_p_cover.png)
 612 | background-size: cover
 613 | 
 614 | class: inverse, center, middle
 615 | 
 616 | ## WHAT IS A DOCUMENT ABOUT? 🤔
 617 | 
 618 | ---
 619 | 
 620 | ## What is a document about?
 621 | 
 622 | - .large[Term frequency]
 623 | - .large[Inverse document frequency]
 624 | 
 625 | `$$idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}$$`
 626 | 
 627 | ### tf-idf is about comparing **documents** within a **collection**.
 628 | 
 629 | ---
 630 | 
 631 | ## Understanding tf-idf
 632 | 
 633 | .large[Make a collection (*corpus*) for yourself! 💅]
 634 | 
 635 | 
 636 | ```r
 637 | full_collection &lt;- gutenberg_download(c(1342, 158, 161, 141),
 638 |                                       meta_fields = "title")
 639 | 
 640 | full_collection
 641 | ```
 642 | 
 643 | ```
 644 | ## # A tibble: 57,251 x 3
 645 | ##    gutenberg_id text           title         
 646 | ##           &lt;int&gt; &lt;chr&gt;          &lt;chr&gt;         
 647 | ##  1          141 MANSFIELD PARK Mansfield Park
 648 | ##  2          141 ""             Mansfield Park
 649 | ##  3          141 (1814)         Mansfield Park
 650 | ##  4          141 ""             Mansfield Park
 651 | ##  5          141 ""             Mansfield Park
 652 | ##  6          141 By Jane Austen Mansfield Park
 653 | ##  7          141 ""             Mansfield Park
 654 | ##  8          141 ""             Mansfield Park
 655 | ##  9          141 ""             Mansfield Park
 656 | ## 10          141 ""             Mansfield Park
 657 | ## # … with 57,241 more rows
 658 | ```
 659 | 
 660 | ---
 661 | 
 662 | ## Counting word frequencies in your collection
 663 | 
 664 | 
 665 | ```r
 666 | book_words &lt;- full_collection %&gt;%
 667 | * unnest_tokens(word, text) %&gt;%
 668 |   count(title, word, sort = TRUE)
 669 | 
 670 | book_words
 671 | ```
 672 | 
 673 | ```
 674 | ## # A tibble: 28,395 x 3
 675 | ##    title               word      n
 676 | ##    &lt;chr&gt;               &lt;chr&gt; &lt;int&gt;
 677 | ##  1 Mansfield Park      the    6206
 678 | ##  2 Mansfield Park      to     5475
 679 | ##  3 Mansfield Park      and    5438
 680 | ##  4 Emma                to     5239
 681 | ##  5 Emma                the    5201
 682 | ##  6 Emma                and    4896
 683 | ##  7 Mansfield Park      of     4778
 684 | ##  8 Pride and Prejudice the    4331
 685 | ##  9 Emma                of     4291
 686 | ## 10 Pride and Prejudice to     4162
 687 | ## # … with 28,385 more rows
 688 | ```
 689 | 
 690 | ---
 691 | 
 692 | ## Calculating tf-idf
 693 | 
 694 | .large[That's... super exciting???]
 695 | 
 696 | 
 697 | ```r
 698 | book_tfidf &lt;- book_words %&gt;%
 699 | * bind_tf_idf(word, title, n)
 700 | 
 701 | book_tfidf
 702 | ```
 703 | 
 704 | ```
 705 | ## # A tibble: 28,395 x 6
 706 | ##    title               word      n     tf   idf tf_idf
 707 | ##    &lt;chr&gt;               &lt;chr&gt; &lt;int&gt;  &lt;dbl&gt; &lt;dbl&gt;  &lt;dbl&gt;
 708 | ##  1 Mansfield Park      the    6206 0.0387     0      0
 709 | ##  2 Mansfield Park      to     5475 0.0341     0      0
 710 | ##  3 Mansfield Park      and    5438 0.0339     0      0
 711 | ##  4 Emma                to     5239 0.0325     0      0
 712 | ##  5 Emma                the    5201 0.0323     0      0
 713 | ##  6 Emma                and    4896 0.0304     0      0
 714 | ##  7 Mansfield Park      of     4778 0.0298     0      0
 715 | ##  8 Pride and Prejudice the    4331 0.0354     0      0
 716 | ##  9 Emma                of     4291 0.0267     0      0
 717 | ## 10 Pride and Prejudice to     4162 0.0341     0      0
 718 | ## # … with 28,385 more rows
 719 | ```
 720 | 
 721 | ---
 722 | 
 723 | ## Calculating tf-idf
 724 | 
 725 | 
 726 | ```r
 727 | book_tfidf %&gt;%
 728 |   arrange(-tf_idf)
 729 | ```
 730 | 
 731 | ```
 732 | ## # A tibble: 28,395 x 6
 733 | ##    title                 word          n      tf   idf  tf_idf
 734 | ##    &lt;chr&gt;                 &lt;chr&gt;     &lt;int&gt;   &lt;dbl&gt; &lt;dbl&gt;   &lt;dbl&gt;
 735 | ##  1 Sense and Sensibility elinor      623 0.00519 1.39  0.00720
 736 | ##  2 Emma                  emma        786 0.00488 1.39  0.00677
 737 | ##  3 Sense and Sensibility marianne    492 0.00410 1.39  0.00569
 738 | ##  4 Mansfield Park        crawford    493 0.00307 1.39  0.00426
 739 | ##  5 Pride and Prejudice   darcy       373 0.00305 1.39  0.00423
 740 | ##  6 Mansfield Park        fanny       816 0.00509 0.693 0.00352
 741 | ##  7 Pride and Prejudice   elizabeth   597 0.00489 0.693 0.00339
 742 | ##  8 Emma                  weston      389 0.00242 1.39  0.00335
 743 | ##  9 Pride and Prejudice   bennet      294 0.00241 1.39  0.00334
 744 | ## 10 Mansfield Park        edmund      364 0.00227 1.39  0.00314
 745 | ## # … with 28,385 more rows
 746 | ```
 747 | 
 748 | ---
 749 | 
 750 | ## Calculating tf-idf
 751 | 
 752 | 
 753 | ```r
 754 | book_tfidf %&gt;%
 755 |   group_by(title) %&gt;%
 756 |   top_n(10) %&gt;%
 757 |   ungroup %&gt;%
 758 | * ggplot(aes(fct_reorder(word, tf_idf),
 759 |              tf_idf, 
 760 |              fill = title)) +
 761 |   geom_col(show.legend = FALSE) +
 762 |   coord_flip() +
 763 |   facet_wrap(~title, scales = "free")
 764 | ```
 765 | 
 766 | ---
 767 | 
 768 | ![](intro_files/figure-html/unnamed-chunk-27-1.png)&lt;!-- --&gt;
 769 | 
 770 | ---
 771 | 
 772 | background-image: url(figs/plot_tf_idf-1.png)
 773 | background-size: 800px
 774 | 
 775 | ---
 776 | 
 777 | ## N-grams... and beyond! 🚀
 778 | 
 779 | 
 780 | ```r
 781 | tidy_ngram &lt;- full_text %&gt;%
 782 | * unnest_tokens(bigram, text, token = "ngrams", n = 2)
 783 | 
 784 | tidy_ngram
 785 | ```
 786 | 
 787 | ```
 788 | ## # A tibble: 122,203 x 2
 789 | ##    gutenberg_id bigram        
 790 | ##           &lt;int&gt; &lt;chr&gt;         
 791 | ##  1         1342 pride and     
 792 | ##  2         1342 and prejudice 
 793 | ##  3         1342 prejudice by  
 794 | ##  4         1342 by jane       
 795 | ##  5         1342 jane austen   
 796 | ##  6         1342 austen chapter
 797 | ##  7         1342 chapter 1     
 798 | ##  8         1342 1 it          
 799 | ##  9         1342 it is         
 800 | ## 10         1342 is a          
 801 | ## # … with 122,193 more rows
 802 | ```
 803 | 
 804 | ---
 805 | 
 806 | ## N-grams... and beyond! 🚀
 807 | 
 808 | 
 809 | ```r
 810 | tidy_ngram %&gt;%
 811 |   count(bigram, sort = TRUE)
 812 | ```
 813 | 
 814 | ```
 815 | ## # A tibble: 54,998 x 2
 816 | ##    bigram       n
 817 | ##    &lt;chr&gt;    &lt;int&gt;
 818 | ##  1 of the     464
 819 | ##  2 to be      443
 820 | ##  3 in the     382
 821 | ##  4 i am       302
 822 | ##  5 of her     260
 823 | ##  6 to the     252
 824 | ##  7 it was     251
 825 | ##  8 mr darcy   243
 826 | ##  9 of his     234
 827 | ## 10 she was    209
 828 | ## # … with 54,988 more rows
 829 | ```
 830 | 
 831 | ---
 832 | 
 833 | ## N-grams... and beyond! 🚀
 834 | 
 835 | 
 836 | ```r
 837 | tidy_ngram %&gt;%
 838 | * separate(bigram, c("word1", "word2"), sep = " ") %&gt;%
 839 |   filter(!word1 %in% stop_words$word,
 840 |          !word2 %in% stop_words$word) %&gt;%
 841 |   count(word1, word2, sort = TRUE)
 842 | ```
 843 | 
 844 | ```
 845 | ## # A tibble: 5,922 x 3
 846 | ##    word1   word2           n
 847 | ##    &lt;chr&gt;   &lt;chr&gt;       &lt;int&gt;
 848 | ##  1 lady    catherine     100
 849 | ##  2 miss    bingley        72
 850 | ##  3 miss    bennet         60
 851 | ##  4 sir     william        38
 852 | ##  5 de      bourgh         35
 853 | ##  6 miss    darcy          34
 854 | ##  7 colonel forster        26
 855 | ##  8 colonel fitzwilliam    25
 856 | ##  9 cried   elizabeth      24
 857 | ## 10 miss    lucas          23
 858 | ## # … with 5,912 more rows
 859 | ```
 860 | 
 861 | ---
 862 | 
 863 | background-image: url(figs/p_and_p_cover.png)
 864 | background-size: cover
 865 | 
 866 | class: inverse
 867 | 
 868 | ## What can you do with n-grams?
 869 | 
 870 | - .large[tf-idf of n-grams]
 871 | 
 872 | --
 873 | 
 874 | - .large[network analysis]
 875 | 
 876 | --
 877 | 
 878 | - .large[negation]
 879 | 
 880 | ---
 881 | 
 882 | background-image: url(figs/austen-1.png)
 883 | background-size: 750px
 884 | 
 885 | ---
 886 | 
 887 | background-image: url(figs/slider.gif)
 888 | background-position: 50% 70%
 889 | 
 890 | ## What can you do with n-grams?
 891 | 
 892 | ### [She Giggles, He Gallops](https://pudding.cool/2017/08/screen-direction/)
 893 | 
 894 | ---
 895 | 
 896 | class: left, middle
 897 | 
 898 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
 899 | 
 900 | # Thanks!
 901 | 
 902 | &lt;a href="https://tidytextmining.com"&gt;&lt;i class="fa fa-book fa-fw"&gt;&lt;/i&gt;&amp;nbsp; tidytextmining.com&lt;/a&gt;&lt;br&gt;
 903 | &lt;a href="http://twitter.com/juliasilge"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
 904 | &lt;a href="http://github.com/juliasilge"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
 905 | &lt;a href="https://juliasilge.com"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; juliasilge.com&lt;/a&gt;&lt;br&gt;
 906 | &lt;a href="http://twitter.com/dataandme"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @dataandme&lt;/a&gt;&lt;br&gt;
 907 | &lt;a href="http://github.com/batpigandme"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @batpigandme&lt;/a&gt;&lt;br&gt;
 908 | &lt;a href="https://maraaverick.rbind.io"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; maraaverick.rbind.io&lt;/a&gt;&lt;br&gt;
 909 | 
 910 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
 911 |     </textarea>
 912 | <style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
 913 | <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
 914 | <script>var slideshow = remark.create({  // remark.js slideshow instance used by the scripts below
 915 | "highlightStyle": "github",
 916 | "highlightLines": true,
 917 | "countIncrementalSlides": false,
 918 | "ratio": "16:9"
 919 | });
 920 | if (window.HTMLWidgets) slideshow.on('afterShowSlide', function (slide) {  // hook registered only when HTMLWidgets is present
 921 |   window.dispatchEvent(new Event('resize'));  // fire a window resize event after each slide is shown
 922 | });
 923 | (function(d) {  // inject an @page rule sized from the first .remark-slide-scaler element
 924 |   var s = d.createElement("style"), r = d.querySelector(".remark-slide-scaler");
 925 |   if (!r) return;  // no scaler element: nothing to size the page against
 926 |   s.type = "text/css"; s.innerHTML = "@page {size: " + r.style.width + " " + r.style.height +"; }";
 927 |   d.head.appendChild(s);
 928 | })(document);
 929 | 
 930 | (function(d) {
 931 |   var el = d.getElementsByClassName("remark-slides-area");
 932 |   if (!el) return;
 933 |   var slide, slides = slideshow.getSlides(), els = el[0].children;
 934 |   for (var i = 1; i < slides.length; i++) {
 935 |     slide = slides[i];
 936 |     if (slide.properties.continued === "true" || slide.properties.count === "false") {
 937 |       els[i - 1].className += ' has-continuation';
 938 |     }
 939 |   }
 940 |   var s = d.createElement("style");
 941 |   s.type = "text/css"; s.innerHTML = "@media print { .has-continuation { display: none; } }";
 942 |   d.head.appendChild(s);
 943 | })(document);
 944 | // delete the temporary CSS (for displaying all slides initially) when the user
 945 | // starts to view slides
 946 | (function() {
 947 |   var deleted = false;
 948 |   slideshow.on('beforeShowSlide', function(slide) {
 949 |     if (deleted) return;
 950 |     var sheets = document.styleSheets, node;
 951 |     for (var i = 0; i < sheets.length; i++) {
 952 |       node = sheets[i].ownerNode;
 953 |       if (node.dataset["target"] !== "print-only") continue;
 954 |       node.parentNode.removeChild(node);
 955 |     }
 956 |     deleted = true;
 957 |   });
 958 | })();</script>
 959 | 
 960 | <script>
 961 | (function() {  // make absolute and protocol-relative links open in a new browsing context
 962 |   var links = document.getElementsByTagName('a');
 963 |   for (var i = 0; i < links.length; i++) {
 964 |     if (/^(https?:)?\/\//.test(links[i].getAttribute('href'))) {  // href starts with http://, https:// or //
 965 |       links[i].target = '_blank';
 966 |     }
 967 |   }
 968 | })();
 969 | </script>
 970 | 
 971 | <script>
 972 | slideshow._releaseMath = function(el) {  // unwrap <code> elements under el whose entire text is a math expression
 973 |   var i, text, code, codes = el.getElementsByTagName('code');
 974 |   for (i = 0; i < codes.length;) {  // i advances only when the current element is left in place
 975 |     code = codes[i];
 976 |     if (code.parentNode.tagName !== 'PRE' && code.childElementCount === 0) {  // skip <pre> code blocks and code with child elements
 977 |       text = code.textContent;
 978 |       if (/^\\\((.|\s)+\\\)$/.test(text) || /^\\\[(.|\s)+\\\]$/.test(text) ||  // \( ... \) or \[ ... \]
 979 |           /^\$\$(.|\s)+\$\$$/.test(text) ||  // $$ ... $$
 980 |           /^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/.test(text)) {  // \begin{...} ... \end{...}
 981 |         code.outerHTML = code.innerHTML;  // remove <code></code>
 982 |         continue;
 983 |       }
 984 |     }
 985 |     i++;
 986 |   }
 987 | };
 988 | slideshow._releaseMath(document);
 989 | </script>
 990 | <!-- dynamically load mathjax for compatibility with self-contained -->
 991 | <script>
 992 | (function () {  // append the MathJax loader script to <head>
 993 |   var script = document.createElement('script');
 994 |   script.type = 'text/javascript';
 995 |   script.src  = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
 996 |   if (location.protocol !== 'file:' && /^https?:/.test(script.src))  // page is not being served from a local file
 997 |     script.src  = script.src.replace(/^https?:/, '');  // strip the scheme so the URL becomes protocol-relative
 998 |   document.getElementsByTagName('head')[0].appendChild(script);
 999 | })();
1000 | </script>
1001 |   </body>
1002 | </html>
1003 | 


--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-21-1.png


--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-27-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-27-1.png


--------------------------------------------------------------------------------
/libs/remark-css/default.css:
--------------------------------------------------------------------------------
 1 | a, a > code {
 2 |   color: rgb(249, 38, 114);
 3 |   text-decoration: none;
 4 | }
 5 | .footnote {
 6 |   position: absolute;
 7 |   bottom: 3em;
 8 |   padding-right: 4em;
 9 |   font-size: 90%;
10 | }
11 | .remark-code-line-highlighted     { background-color: #ffff88; }
12 | 
13 | .inverse {
14 |   background-color: #272822;
15 |   color: #d6d6d6;
16 |   text-shadow: 0 0 20px #333;
17 | }
18 | .inverse h1, .inverse h2, .inverse h3 {
19 |   color: #f3f3f3;
20 | }
21 | /* Two-column layout */
22 | .left-column {
23 |   color: #777;
24 |   width: 20%;
25 |   height: 92%;
26 |   float: left;
27 | }
28 | .left-column h2:last-of-type, .left-column h3:last-child {
29 |   color: #000;
30 | }
31 | .right-column {
32 |   width: 75%;
33 |   float: right;
34 |   padding-top: 1em;
35 | }
36 | .pull-left {
37 |   float: left;
38 |   width: 47%;
39 | }
40 | .pull-right {
41 |   float: right;
42 |   width: 47%;
43 | }
44 | .pull-right ~ * {
45 |   clear: both;
46 | }
47 | img, video, iframe {
48 |   max-width: 100%;
49 | }
50 | blockquote {
51 |   border-left: solid 5px lightgray;
52 |   padding-left: 1em;
53 | }
54 | .remark-slide table {
55 |   margin: auto;
56 |   border-top: 1px solid #666;
57 |   border-bottom: 1px solid #666;
58 | }
59 | .remark-slide table thead th { border-bottom: 1px solid #ddd; }
60 | th, td { padding: 5px; }
61 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #eee }
62 | 
63 | @page { margin: 0; }
64 | @media print {
65 |   .remark-slide-scaler {
66 |     width: 100% !important;
67 |     height: 100% !important;
68 |     transform: scale(1) !important;
69 |     top: 0 !important;
70 |     left: 0 !important;
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/modeling.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Text Modeling"
  3 | subtitle: "<br><br>USING TIDY DATA PRINCIPLES"
  4 | author: "Julia Silge | SDSS | 29 May 2019"
  5 | output:
  6 |   xaringan::moon_reader:
  7 |     css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"]
  8 |     lib_dir: libs
  9 |     nature:
 10 |       highlightStyle: github
 11 |       highlightLines: true
 12 |       countIncrementalSlides: false
 13 |       ratio: "16:9"
 14 |     seal: false  
 15 |     includes:
 16 |       in_header: header.html
 17 | ---
 18 | 
 19 | ```{r setup, include=FALSE}
 20 | options(htmltools.dir.version = FALSE)
 21 | library(knitr)
 22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180)  # defaults applied to every later chunk
 23 | library(ggplot2)
 24 | library(silgelib)
 25 | theme_set(theme_roboto())  # default ggplot2 theme; theme_roboto() presumably comes from silgelib
 26 | ```
 27 | 
 28 | layout: true
 29 | 
 30 | <div class="my-footer"><span>bit.ly/silge-sdss-2</span></div> 
 31 | 
 32 | ---
 33 | 
 34 | class: inverse, center, middle
 35 | 
 36 | background-image: url(figs/p_and_p_cover.png)
 37 | background-size: cover
 38 | 
 39 | 
 40 | # Text Modeling
 41 | 
 42 | <img src="figs/blue_jane.png" width="150px"/>
 43 | 
 44 | ### USING TIDY DATA PRINCIPLES
 45 | 
 46 | .large[Julia Silge | SDSS | 29 May 2019]
 47 | 
 48 | ---
 49 | 
 50 | ## Let's install some packages
 51 | 
 52 | ```{r, eval=FALSE}
 53 | install.packages(c("tidyverse", 
 54 |                    "tidytext", 
 55 |                    "gutenbergr",
 56 |                    "stm",
 57 |                    "glmnet",
 58 |                    "yardstick"))
 59 | ```
 60 | 
 61 | ---
 62 | 
 63 | class: right, middle
 64 | 
 65 | <img src="figs/blue_jane.png" width="150px"/>
 66 | 
 67 | # Find us at...
 68 | 
 69 | <a href="http://twitter.com/juliasilge"><i class="fa fa-twitter fa-fw"></i>&nbsp; @juliasilge</a><br>
 70 | <a href="http://github.com/juliasilge"><i class="fa fa-github fa-fw"></i>&nbsp; @juliasilge</a><br>
 71 | <a href="https://juliasilge.com"><i class="fa fa-link fa-fw"></i>&nbsp; juliasilge.com</a><br>
 72 | 
 73 | ---
 74 | 
 75 | class: right, middle
 76 | 
 77 | <img src="figs/blue_jane.png" width="150px"/>
 78 | 
 79 | # Find us at...
 80 | 
 81 | <a href="http://twitter.com/dataandme"><i class="fa fa-twitter fa-fw"></i>&nbsp; @dataandme</a><br>
 82 | <a href="http://github.com/batpigandme"><i class="fa fa-github fa-fw"></i>&nbsp; @batpigandme</a><br>
 83 | <a href="https://maraaverick.rbind.io"><i class="fa fa-link fa-fw"></i>&nbsp; maraaverick.rbind.io</a><br>
 84 | 
 85 | ---
 86 | 
 87 | class: right, inverse, middle
 88 | 
 89 | background-image: url(figs/p_and_p_cover.png)
 90 | background-size: cover
 91 | 
 92 | # TIDYING AND CASTING 
 93 | 
 94 | <h1 class="fa fa-check-circle fa-fw"></h1>
 95 | 
 96 | ---
 97 | 
 98 | background-image: url(figs/tmwr_0601.png)
 99 | background-size: 900px
100 | 
101 | ---
102 | 
103 | class: inverse
104 | 
105 | background-image: url(figs/p_and_p_cover.png)
106 | background-size: cover
107 | 
108 | # Two powerful NLP techniques
109 | 
110 | --
111 | 
112 | - .large[Topic modeling]
113 | 
114 | --
115 | 
116 | - .large[Text classification]
117 | 
118 | ---
119 | 
120 | class: inverse
121 | 
122 | background-image: url(figs/p_and_p_cover.png)
123 | background-size: cover
124 | 
125 | # Topic modeling
126 | 
127 | - .large[Each DOCUMENT = mixture of topics]
128 | 
129 | --
130 | 
131 | - .large[Each TOPIC = mixture of words]
132 | 
133 | ---
134 | 
135 | class: top
136 | 
137 | background-image: url(figs/top_tags-1.png)
138 | background-size: 800px
139 | 
140 | ---
141 | 
142 | class: center, middle, inverse
143 | 
144 | background-image: url(figs/p_and_p_cover.png)
145 | background-size: cover
146 | 
147 | # GREAT LIBRARY HEIST `r emo::ji("sleuth")`
148 | 
149 | ---
150 | 
151 | ## **Downloading your text data**
152 | 
153 | ```{r}
154 | library(tidyverse)
155 | library(gutenbergr)
156 | 
157 | # Four novels, matched against Gutenberg titles
158 | titles <- c(
159 |   "Twenty Thousand Leagues under the Sea",
160 |   "The War of the Worlds", "Pride and Prejudice",
161 |   "Great Expectations"
162 | )
163 | books <- gutenberg_works(title %in% titles) %>%
164 |   gutenberg_download(meta_fields = "title")
165 | books
166 | ```
167 | 
168 | ---
169 | 
170 | ## **Someone has torn your books apart!** `r emo::ji("sob")`
171 | 
172 | 
173 | ```{r}
174 | by_chapter <- books %>%
175 |   group_by(title) %>%
176 |   mutate(chapter = text %>%     # running count of chapter headings
177 |            str_detect(regex("^chapter ", ignore_case = TRUE)) %>%
178 |            cumsum()) %>%
179 |   ungroup() %>%
180 |   filter(chapter > 0) %>%       # drop lines before the first heading
181 |   unite(document, title, chapter)
182 | 
183 | by_chapter
184 | ```
185 | 
186 | ---
187 | 
188 | ## **Can we put them back together?**
189 | 
190 | ```{r}
191 | library(tidytext)
192 | # Count words per chapter after dropping SMART stop words
193 | word_counts <- by_chapter %>%
194 |   unnest_tokens(word, text) %>%               #<<
195 |   anti_join(get_stopwords(source = "smart"), by = "word") %>%
196 |   count(document, word, sort = TRUE)
197 | 
198 | word_counts
199 | 
200 | ```
201 | 
202 | ---
203 | 
204 | ## **Can we put them back together?**
205 | 
206 | ```{r}
207 | # rows = documents, columns = words, values = counts
208 | words_sparse <- cast_sparse(word_counts, document, word, n)  #<<
209 | 
210 | class(words_sparse)
211 | ```
212 | 
213 | ---
214 | 
215 | ## **Train a topic model**
216 | 
217 | Use a sparse matrix or a `quanteda::dfm` object as input
218 | 
219 | ```{r}
220 | library(stm)
221 | 
222 | # K = 4: one topic per book we hope to recover
223 | topic_model <- stm(words_sparse, K = 4,
224 |                    init.type = "Spectral", verbose = FALSE)
225 | summary(topic_model)
226 | ```
227 | 
228 | ---
229 | 
230 | ## **Exploring the output of topic modeling**
231 | 
232 | .large[Time for tidying!]
233 | 
234 | ```{r}
235 | # beta: one row per topic-term pair
236 | chapter_topics <- topic_model %>% tidy(matrix = "beta")
237 | chapter_topics
238 | ```
239 | 
240 | ---
241 | 
242 | ## **Exploring the output of topic modeling**
243 | 
244 | ```{r}
245 | # Ten highest-beta terms per topic, best first
246 | top_terms <- chapter_topics %>%
247 |   group_by(topic) %>%
248 |   top_n(n = 10, wt = beta) %>%
249 |   ungroup() %>%
250 |   arrange(topic, desc(beta))
251 | top_terms
252 | ```
253 | 
254 | ---
255 | ## **Exploring the output of topic modeling**
256 | 
257 | ```{r, eval=FALSE}
258 | top_terms %>%
259 |   # order terms within each topic; shared terms would break fct_reorder()
260 |   mutate(term = reorder_within(term, beta, topic)) %>%
261 |   ggplot(aes(term, beta, fill = factor(topic))) +
262 |   geom_col(show.legend = FALSE) + coord_flip() +
263 |   facet_wrap(~ topic, scales = "free") + scale_x_reordered()
264 | ```
265 | 
266 | ---
267 | 
268 | ```{r, echo=FALSE, fig.height=4}
269 | # Rendered figure: top terms per topic, ordered within each facet
270 | top_terms %>%
271 |   mutate(term = reorder_within(term, beta, topic)) %>%
272 |   ggplot(aes(x = term, y = beta, fill = factor(topic))) +
273 |   geom_col(show.legend = FALSE) + coord_flip() +
274 |   facet_wrap(~ topic, scales = "free") +
275 |   scale_x_reordered() + scale_y_continuous(expand = c(0, 0)) +
276 |   labs(x = NULL, y = expression(beta))
277 | ```
278 | 
279 | ---
280 | 
281 | ## **How are documents classified?**
282 | 
283 | ```{r}
284 | # gamma: one row per document-topic pair
285 | chapters_gamma <- topic_model %>%
286 |   tidy(matrix = "gamma", document_names = rownames(words_sparse))
287 | chapters_gamma
288 | ```
289 | 
290 | ---
291 | 
292 | ## **How are documents classified?**
293 | 
294 | ```{r}
295 | # Undo unite(): split document into title and chapter
296 | chapters_parsed <- separate(
297 |   chapters_gamma, document, into = c("title", "chapter"),
298 |   sep = "_", convert = TRUE)
299 | chapters_parsed
300 | ```
301 | 
302 | ---
303 | 
304 | ## **How are documents classified?**
305 | 
306 | ```{r, eval=FALSE}
307 | # One facet per book: gamma by topic
308 | chapters_parsed %>%
309 |   mutate(title = fct_reorder(title, gamma * topic)) %>%
310 |   ggplot(aes(x = factor(topic), y = gamma)) +
311 |   geom_boxplot() + facet_wrap(~ title)
312 | ```
313 | 
314 | ---
315 | 
316 | ```{r, echo=FALSE, fig.height=4}
317 | # Rendered figure: per-book boxplots of gamma, colored by topic
318 | chapters_parsed %>%
319 |   mutate(title = fct_reorder(title, gamma * topic)) %>%
320 |   ggplot(aes(x = factor(topic), y = gamma, color = factor(topic))) +
321 |   geom_boxplot(show.legend = FALSE) + facet_wrap(~ title) +
322 |   labs(x = "Topic", y = expression(gamma))
323 | ```
324 | 
325 | ---
326 | 
327 | class: center, middle, inverse
328 | 
329 | background-image: url(figs/p_and_p_cover.png)
330 | background-size: cover
331 | 
332 | # GOING FARTHER `r emo::ji("rocket")`
333 | 
334 | ---
335 | 
336 | ## Tidying model output
337 | 
338 | ### Which words in each document are assigned to which topics?
339 | 
340 | - .large[`augment()`]
341 | - .large[Add information to each observation in the original data]
342 | 
343 | ---
344 | 
345 | background-image: url(figs/stm_video.png)
346 | background-size: 850px
347 | 
348 | ---
349 | 
350 | ## **Using stm**
351 | 
352 | - .large[Document-level covariates]
353 | 
354 | ```{r, eval=FALSE}
355 | # K = 0 + "Spectral": stm picks the number of topics itself
356 | topic_model <- stm(words_sparse, K = 0, init.type = "Spectral",
357 |                    prevalence = ~s(Year), data = covariates,
358 |                    verbose = FALSE)
359 | ```
360 | 
361 | - .large[Use functions for `semanticCoherence()`, `checkResiduals()`, `exclusivity()`, and more!]
362 | 
363 | - .large[Check out http://www.structuraltopicmodel.com/]
364 | 
365 | - .large[See [my blog post](https://juliasilge.com/blog/evaluating-stm/) for how to choose `K`, the number of topics]
366 | 
367 | ---
368 | 
369 | 
370 | background-image: url(figs/model_diagnostic-1.png)
371 | background-position: 50% 50%
372 | background-size: 950px
373 | 
374 | ---
375 | 
376 | # Stemming?
377 | 
378 | .large[Advice from [Schofield & Mimno](https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf)]
379 | 
380 | .large["Comparing Apples to Apple: The Effects of Stemmers on Topic Models"]
381 | 
382 | ---
383 | 
384 | class: right, middle
385 | 
386 | <h1 class="fa fa-quote-left fa-fw"></h1>
387 | 
388 | <h2> Despite their frequent use in topic modeling, we find that stemmers produce no meaningful improvement in likelihood and coherence and in fact can degrade topic stability. </h2>
389 | 
390 | <h1 class="fa fa-quote-right fa-fw"></h1>
391 | 
392 | ---
393 | 
394 | class: right, middle, inverse
395 | 
396 | background-image: url(figs/p_and_p_cover.png)
397 | background-size: cover
398 | 
399 | 
400 | # TEXT CLASSIFICATION
401 | <h1 class="fa fa-balance-scale fa-fw"></h1>
402 | 
403 | ---
404 | 
405 | ## **Downloading your text data**
406 | 
407 | ```{r}
408 | library(tidyverse)
409 | library(gutenbergr)
410 | 
411 | titles <- c("The War of the Worlds", "Pride and Prejudice")
412 | 
413 | # row_number() gives every line of text its own document id
414 | books <- gutenberg_works(title %in% titles) %>%
415 |   gutenberg_download(meta_fields = "title") %>%
416 |   mutate(document = row_number())
417 | 
418 | books
419 | ```
420 | 
421 | ---
422 | 
423 | ## **Making a tidy dataset**
424 | 
425 | .large[Use this kind of data structure for EDA! `r emo::ji("nail")`]
426 | 
427 | ```{r}
428 | library(tidytext)
429 | 
430 | # Tokenize, then keep only words seen more than 10 times overall
431 | tidy_books <- books %>%
432 |   unnest_tokens(word, text) %>%           #<<
433 |   group_by(word) %>%
434 |   filter(n() > 10) %>%
435 |   ungroup()
436 | tidy_books
437 | ```
438 | 
439 | ---
440 | 
441 | ## **Cast to a sparse matrix**
442 | 
443 | .large[And build a dataframe with a response variable]
444 | 
445 | ```{r}
446 | sparse_words <- tidy_books %>%
447 |   count(document, word, sort = TRUE) %>%
448 |   cast_sparse(document, word, n)               #<<
449 | 
450 | # Response data in the same row order as sparse_words
451 | books_joined <- tibble(document = as.integer(rownames(sparse_words))) %>%
452 |   left_join(select(books, document, title), by = "document")
453 | ```
454 | 
455 | ---
456 | 
457 | ## **Train a glmnet model**
458 | 
459 | ```{r}
460 | library(glmnet)
461 | library(doMC)
462 | registerDoMC(cores = 8)
463 | 
464 | is_jane <- books_joined$title == "Pride and Prejudice"
465 | set.seed(2019)  # CV folds are drawn at random; fix them for reproducibility
466 | model <- cv.glmnet(sparse_words, is_jane, family = "binomial", 
467 |                    parallel = TRUE, keep = TRUE)
468 | 
469 | ```
470 | 
471 | ---
472 | 
473 | ## **Tidying our model**
474 | 
475 | .large[Tidy, then filter to choose some lambda from glmnet output]
476 | 
477 | ```{r}
478 | library(broom)
479 | 
480 | # Keep only the coefficients fitted at lambda.1se
481 | coefs <- tidy(model$glmnet.fit) %>%
482 |   filter(lambda == model$lambda.1se)
483 | 
484 | # Extract the intercept estimate on its own
485 | Intercept <- coefs %>%
486 |   filter(term == "(Intercept)") %>% pull(estimate)
487 | ```
488 | 
489 | ---
490 | 
491 | ## **Tidying our model**
492 | 
493 | ```{r}
494 | # Per line: add up word coefficients, then convert to a probability
495 | classifications <- tidy_books %>%
496 |   inner_join(coefs, by = c("word" = "term")) %>%
497 |   group_by(document) %>%
498 |   summarize(score = sum(estimate),
499 |             probability = plogis(Intercept + score))
500 | classifications
501 | ```
502 | 
503 | ---
504 | 
505 | ## **Understanding our model**
506 | 
507 | ```{r, eval=FALSE}
508 | # Ten largest-magnitude coefficients on each side of zero
509 | coefs %>%
510 |   group_by(estimate > 0) %>%
511 |   top_n(n = 10, wt = abs(estimate)) %>%
512 |   ungroup() %>%
513 |   ggplot(aes(x = fct_reorder(term, estimate),
514 |              y = estimate, fill = estimate > 0)) +
515 |   geom_col(show.legend = FALSE) +
516 |   coord_flip()
517 | ```
518 | 
519 | ---
520 | 
521 | ```{r, echo = FALSE, fig.height=4}
522 | # Rendered figure: strongest positive and negative coefficients
523 | coefs %>%
524 |   group_by(estimate > 0) %>%
525 |   top_n(n = 10, wt = abs(estimate)) %>%
526 |   ungroup() %>%
527 |   ggplot(aes(x = fct_reorder(term, estimate), y = estimate, fill = estimate > 0)) +
528 |   geom_col(show.legend = FALSE) + coord_flip() +
529 |   labs(title = "Coefficients that increase/decrease probability",
530 |        subtitle = "A document mentioning Martians is unlikely to be written by Jane Austen",
531 |        x = NULL)
532 | ```
533 | 
534 | ---
535 | 
536 | ## **ROC**
537 | 
538 | ```{r}
539 | library(yardstick)
540 | 
541 | # Attach the true title to each scored line, as a factor
542 | comment_classes <- classifications %>%
543 |   left_join(select(books, title, document), by = "document") %>%
544 |   mutate(title = as.factor(title))
545 | ```
546 | 
547 | ---
548 | 
549 | ## **ROC**
550 | 
551 | ```{r eval=FALSE}
552 | # ROC curve, with the dashed diagonal as the
553 | # no-skill reference
554 | comment_classes %>%
555 |   roc_curve(title, probability) %>%
556 |   ggplot(aes(x = 1 - specificity, y = sensitivity)) +
557 |   geom_line(size = 1.5, color = "midnightblue") +
558 |   geom_abline(
559 |     color = "gray50",
560 |     size = 1.2,
561 |     alpha = 0.5,
562 |     lty = 2
563 |   )
564 | ```
565 | 
566 | ---
567 | 
568 | ```{r, echo = FALSE, fig.height=4}
569 | # Rendered figure: ROC curve with a dashed diagonal and a title
570 | comment_classes %>%
571 |   roc_curve(title, probability) %>%
572 |   ggplot(aes(x = 1 - specificity, y = sensitivity)) +
573 |   geom_line(
574 |     size = 1.5,
575 |     color = "midnightblue"
576 |   ) +
577 |   geom_abline(
578 |     color = "gray50",
579 |     size = 1.2,
580 |     alpha = 0.5,
581 |     lty = 2
582 |   ) +
583 |   labs(title = "ROC curve for text classification")
584 | ```
585 | 
586 | ---
587 | 
588 | ## **AUC for model**
589 | 
590 | ```{r}
591 | # Area under the ROC curve for the same predictions
592 | roc_auc(comment_classes, title, probability)
593 | ```
594 | 
595 | ---
596 | 
597 | ## **Confusion matrix**
598 | 
599 | ```{r}
600 | comment_classes %>%
601 |   mutate(
602 |     prediction = case_when(
603 |       probability > 0.5 ~ "Pride and Prejudice",
604 |       TRUE ~ "The War of the Worlds"
605 |     ),  # reuse title's levels so truth and estimate always match
606 |     prediction = factor(prediction, levels = levels(title))
607 |   ) %>%
608 |   conf_mat(title, prediction)
609 | ```
610 | 
611 | ---
612 | 
613 | ## **Misclassifications**
614 | 
615 | Let's talk about misclassifications. Which documents here were incorrectly predicted to be written by Jane Austen?
616 | 
617 | ```{r}
618 | comment_classes %>%
619 |   filter(
620 |     probability > .8,                       #<<
621 |     title == "The War of the Worlds"        #<<
622 |   ) %>%
623 |   sample_n(10) %>%                  # ten random confident mistakes
624 |   inner_join(select(books, document, text),
625 |              by = "document") %>%   # look up the original line of text
626 |   select(probability, text)
627 | ```
628 | 
629 | ---
630 | 
631 | ## **Misclassifications**
632 | 
633 | Let's talk about misclassifications. Which documents here were incorrectly predicted to *not* be written by Jane Austen?
634 | 
635 | ```{r}
636 | comment_classes %>%
637 |   filter(
638 |     probability < .3,                    #<<
639 |     title == "Pride and Prejudice"       #<<
640 |   ) %>%
641 |   sample_n(10) %>%                  # ten random confident mistakes
642 |   inner_join(select(books, document, text),
643 |              by = "document") %>%   # look up the original line of text
644 |   select(probability, text)
645 | ```
646 | 
647 | ---
648 | 
649 | background-image: url(figs/tmwr_0601.png)
650 | background-position: 50% 70%
651 | background-size: 750px
652 | 
653 | ## **Workflow for text mining/modeling**
654 | 
655 | ---
656 | 
657 | background-image: url(figs/lizzieskipping.gif)
658 | background-position: 50% 55%
659 | background-size: 750px
660 | 
661 | # **Go explore real-world text!**
662 | 
663 | ---
664 | 
665 | class: left, middle
666 | 
667 | <img src="figs/blue_jane.png" width="150px"/>
668 | 
669 | # Thanks!
670 | 
671 | <a href="https://tidytextmining.com"><i class="fa fa-book fa-fw"></i>&nbsp; tidytextmining.com</a><br>
672 | <a href="http://twitter.com/juliasilge"><i class="fa fa-twitter fa-fw"></i>&nbsp; @juliasilge</a><br>
673 | <a href="http://github.com/juliasilge"><i class="fa fa-github fa-fw"></i>&nbsp; @juliasilge</a><br>
674 | <a href="https://juliasilge.com"><i class="fa fa-link fa-fw"></i>&nbsp; juliasilge.com</a><br>
675 | <a href="http://twitter.com/dataandme"><i class="fa fa-twitter fa-fw"></i>&nbsp; @dataandme</a><br>
676 | <a href="http://github.com/batpigandme"><i class="fa fa-github fa-fw"></i>&nbsp; @batpigandme</a><br>
677 | <a href="https://maraaverick.rbind.io"><i class="fa fa-link fa-fw"></i>&nbsp; maraaverick.rbind.io</a><br>
678 | 
679 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
680 | 


--------------------------------------------------------------------------------
/modeling.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
  3 |   <head>
  4 |     <title>Text Modeling</title>
  5 |     <meta charset="utf-8" />
  6 |     <meta name="author" content="Julia Silge | SDSS | 29 May 2019" />
  7 |     <link href="libs/remark-css/default.css" rel="stylesheet" />
  8 |     <script src="https://use.fontawesome.com/5235085b15.js"></script>
  9 |     <link rel="stylesheet" href="css/xaringan-themer.css" type="text/css" />
 10 |     <link rel="stylesheet" href="css/footer_plus.css" type="text/css" />
 11 |   </head>
 12 |   <body>
 13 |     <textarea id="source">
 14 | 
 15 | 
 16 | 
 17 | 
 18 | layout: true
 19 | 
 20 | &lt;div class="my-footer"&gt;&lt;span&gt;bit.ly/silge-sdss-2&lt;/span&gt;&lt;/div&gt; 
 21 | 
 22 | ---
 23 | 
 24 | class: inverse, center, middle
 25 | 
 26 | background-image: url(figs/p_and_p_cover.png)
 27 | background-size: cover
 28 | 
 29 | 
 30 | # Text Modeling
 31 | 
 32 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
 33 | 
 34 | ### USING TIDY PRINCIPLES
 35 | 
 36 | .large[Julia Silge | SDSS | 29 May 2019]
 37 | 
 38 | ---
 39 | 
 40 | ## Let's install some packages
 41 | 
 42 | 
 43 | ```r
 44 | install.packages(c("tidyverse", 
 45 |                    "tidytext", 
 46 |                    "gutenbergr",
 47 |                    "stm",
 48 |                    "glmnet",
 49 |                    "yardstick"))
 50 | ```
 51 | 
 52 | ---
 53 | 
 54 | class: right, middle
 55 | 
 56 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
 57 | 
 58 | # Find us at...
 59 | 
 60 | &lt;a href="http://twitter.com/juliasilge"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
 61 | &lt;a href="http://github.com/juliasilge"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
 62 | &lt;a href="https://juliasilge.com"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; juliasilge.com&lt;/a&gt;&lt;br&gt;
 63 | 
 64 | ---
 65 | 
 66 | class: right, middle
 67 | 
 68 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
 69 | 
 70 | # Find us at...
 71 | 
 72 | &lt;a href="http://twitter.com/dataandme"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @dataandme&lt;/a&gt;&lt;br&gt;
 73 | &lt;a href="http://github.com/batpigandme"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @batpigandme&lt;/a&gt;&lt;br&gt;
 74 | &lt;a href="https://maraaverick.rbind.io"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; maraaverick.rbind.io&lt;/a&gt;&lt;br&gt;
 75 | 
 76 | ---
 77 | 
 78 | class: right, inverse, middle
 79 | 
 80 | background-image: url(figs/p_and_p_cover.png)
 81 | background-size: cover
 82 | 
 83 | # TIDYING AND CASTING 
 84 | 
 85 | &lt;h1 class="fa fa-check-circle fa-fw"&gt;&lt;/h1&gt;
 86 | 
 87 | ---
 88 | 
 89 | background-image: url(figs/tmwr_0601.png)
 90 | background-size: 900px
 91 | 
 92 | ---
 93 | 
 94 | class: inverse
 95 | 
 96 | background-image: url(figs/p_and_p_cover.png)
 97 | background-size: cover
 98 | 
 99 | # Two powerful NLP techniques
100 | 
101 | --
102 | 
103 | - .large[Topic modeling]
104 | 
105 | --
106 | 
107 | - .large[Text classification]
108 | 
109 | ---
110 | 
111 | class: inverse
112 | 
113 | background-image: url(figs/p_and_p_cover.png)
114 | background-size: cover
115 | 
116 | # Topic modeling
117 | 
118 | - .large[Each DOCUMENT = mixture of topics]
119 | 
120 | --
121 | 
122 | - .large[Each TOPIC = mixture of words]
123 | 
124 | ---
125 | 
126 | class: top
127 | 
128 | background-image: url(figs/top_tags-1.png)
129 | background-size: 800px
130 | 
131 | ---
132 | 
133 | class: center, middle, inverse
134 | 
135 | background-image: url(figs/p_and_p_cover.png)
136 | background-size: cover
137 | 
138 | # GREAT LIBRARY HEIST 🕵️‍♀️
139 | 
140 | ---
141 | 
142 | ## **Downloading your text data**
143 | 
144 | 
145 | ```r
146 | library(tidyverse)
147 | library(gutenbergr)
148 | 
149 | titles &lt;- c("Twenty Thousand Leagues under the Sea", 
150 |             "The War of the Worlds",
151 |             "Pride and Prejudice", 
152 |             "Great Expectations")
153 | 
154 | books &lt;- gutenberg_works(title %in% titles) %&gt;%
155 |   gutenberg_download(meta_fields = "title")
156 | 
157 | books
158 | ```
159 | 
160 | ```
161 | ## # A tibble: 51,663 x 3
162 | ##    gutenberg_id text                                        title          
163 | ##           &lt;int&gt; &lt;chr&gt;                                       &lt;chr&gt;          
164 | ##  1           36 The War of the Worlds                       The War of the…
165 | ##  2           36 ""                                          The War of the…
166 | ##  3           36 by H. G. Wells [1898]                       The War of the…
167 | ##  4           36 ""                                          The War of the…
168 | ##  5           36 ""                                          The War of the…
169 | ##  6           36 "     But who shall dwell in these worlds … The War of the…
170 | ##  7           36 "     inhabited? .  .  .  Are we or they L… The War of the…
171 | ##  8           36 "     World? .  .  .  And how are all thin… The War of the…
172 | ##  9           36 "          KEPLER (quoted in The Anatomy o… The War of the…
173 | ## 10           36 ""                                          The War of the…
174 | ## # … with 51,653 more rows
175 | ```
176 | 
177 | ---
178 | 
179 | ## **Someone has torn your books apart!** 😭
180 | 
181 | 
182 | 
183 | ```r
184 | by_chapter &lt;- books %&gt;%
185 |   group_by(title) %&gt;%
186 |   mutate(chapter = cumsum(str_detect(text, 
187 |                                      regex("^chapter ", 
188 |                                            ignore_case = TRUE)))) %&gt;%
189 |   ungroup() %&gt;%
190 |   filter(chapter &gt; 0) %&gt;%
191 |   unite(document, title, chapter)
192 | 
193 | by_chapter
194 | ```
195 | 
196 | ```
197 | ## # A tibble: 51,602 x 3
198 | ##    gutenberg_id text                                       document        
199 | ##           &lt;int&gt; &lt;chr&gt;                                      &lt;chr&gt;           
200 | ##  1           36 CHAPTER ONE                                The War of the …
201 | ##  2           36 ""                                         The War of the …
202 | ##  3           36 THE EVE OF THE WAR                         The War of the …
203 | ##  4           36 ""                                         The War of the …
204 | ##  5           36 ""                                         The War of the …
205 | ##  6           36 No one would have believed in the last ye… The War of the …
206 | ##  7           36 century that this world was being watched… The War of the …
207 | ##  8           36 intelligences greater than man's and yet … The War of the …
208 | ##  9           36 men busied themselves about their various… The War of the …
209 | ## 10           36 scrutinised and studied, perhaps almost a… The War of the …
210 | ## # … with 51,592 more rows
211 | ```
212 | 
213 | ---
214 | 
215 | ## **Can we put them back together?**
216 | 
217 | 
218 | ```r
219 | library(tidytext)
220 | 
221 | word_counts &lt;- by_chapter %&gt;%
222 | * unnest_tokens(word, text) %&gt;%
223 |   anti_join(get_stopwords(source = "smart")) %&gt;%
224 |   count(document, word, sort = TRUE)
225 | 
226 | word_counts
227 | ```
228 | 
229 | ```
230 | ## # A tibble: 111,650 x 3
231 | ##    document               word        n
232 | ##    &lt;chr&gt;                  &lt;chr&gt;   &lt;int&gt;
233 | ##  1 Great Expectations_57  joe        88
234 | ##  2 Great Expectations_7   joe        70
235 | ##  3 Pride and Prejudice_18 mr         66
236 | ##  4 Great Expectations_17  biddy      63
237 | ##  5 Great Expectations_27  joe        58
238 | ##  6 Great Expectations_38  estella    58
239 | ##  7 Great Expectations_2   joe        56
240 | ##  8 Great Expectations_23  pocket     53
241 | ##  9 Great Expectations_15  joe        50
242 | ## 10 Great Expectations_18  joe        50
243 | ## # … with 111,640 more rows
244 | ```
245 | 
246 | ---
247 | 
248 | ## **Can we put them back together?**
249 | 
250 | 
251 | ```r
252 | words_sparse &lt;- word_counts %&gt;%
253 | * cast_sparse(document, word, n)
254 | 
255 | class(words_sparse)
256 | ```
257 | 
258 | ```
259 | ## [1] "dgCMatrix"
260 | ## attr(,"package")
261 | ## [1] "Matrix"
262 | ```
263 | 
264 | ---
265 | 
266 | ## **Train a topic model**
267 | 
268 | Use a sparse matrix or a `quanteda::dfm` object as input
269 | 
270 | 
271 | ```r
272 | library(stm)
273 | 
274 | topic_model &lt;- stm(words_sparse, K = 4, 
275 |                    verbose = FALSE, init.type = "Spectral")
276 | 
277 | summary(topic_model)
278 | ```
279 | 
280 | ```
281 | ## A topic model with 4 topics, 193 documents and a 18360 word dictionary.
282 | ```
283 | 
284 | ```
285 | ## Topic 1 Top Words:
286 | ##  	 Highest Prob: mr, elizabeth, mrs, darcy, bennet, miss, jane 
287 | ##  	 FREX: elizabeth, darcy, bennet, bingley, wickham, collins, lydia 
288 | ##  	 Lift: wickham, nephew, phillips, brighton, meryton, bourgh, mend 
289 | ##  	 Score: elizabeth, darcy, bennet, bingley, wickham, jane, lydia 
290 | ## Topic 2 Top Words:
291 | ##  	 Highest Prob: captain, nautilus, sea, nemo, ned, conseil, land 
292 | ##  	 FREX: nautilus, nemo, ned, conseil, canadian, ocean, seas 
293 | ##  	 Lift: vanikoro, indian, d'urville, reefs, scotia, shark's, solidification 
294 | ##  	 Score: nautilus, nemo, ned, conseil, canadian, ocean, captain 
295 | ## Topic 3 Top Words:
296 | ##  	 Highest Prob: mr, joe, miss, time, pip, looked, herbert 
297 | ##  	 FREX: joe, pip, herbert, wemmick, havisham, estella, biddy 
298 | ##  	 Lift: towel, giv, whimple, meantersay, jew, rot, barnard's 
299 | ##  	 Score: joe, wemmick, pip, jaggers, havisham, estella, herbert 
300 | ## Topic 4 Top Words:
301 | ##  	 Highest Prob: people, martians, man, time, black, men, night 
302 | ##  	 FREX: martians, martian, woking, mars, curate, pine, ulla 
303 | ##  	 Lift: martians, mars, curate, shepperton, henderson, hood, ripley 
304 | ##  	 Score: martians, martian, woking, cylinder, curate, ulla, pine
305 | ```
306 | 
307 | ---
308 | 
309 | ## **Exploring the output of topic modeling**
310 | 
311 | .large[Time for tidying!]
312 | 
313 | 
314 | ```r
315 | chapter_topics &lt;- tidy(topic_model, matrix = "beta")
316 | 
317 | chapter_topics
318 | ```
319 | 
320 | ```
321 | ## # A tibble: 73,440 x 3
322 | ##    topic term       beta
323 | ##    &lt;int&gt; &lt;chr&gt;     &lt;dbl&gt;
324 | ##  1     1 joe   8.69e-104
325 | ##  2     2 joe   3.03e-139
326 | ##  3     3 joe   1.21e-  2
327 | ##  4     4 joe   3.28e- 19
328 | ##  5     1 mr    1.90e-  2
329 | ##  6     2 mr    1.91e-  4
330 | ##  7     3 mr    1.22e-  2
331 | ##  8     4 mr    1.15e- 45
332 | ##  9     1 biddy 3.21e- 80
333 | ## 10     2 biddy 3.84e-149
334 | ## # … with 73,430 more rows
335 | ```
336 | 
337 | ---
338 | 
339 | ## **Exploring the output of topic modeling**
340 | 
341 | 
342 | ```r
343 | top_terms &lt;- chapter_topics %&gt;%
344 |   group_by(topic) %&gt;%
345 |   top_n(10, beta) %&gt;%
346 |   ungroup() %&gt;%
347 |   arrange(topic, -beta)
348 | 
349 | top_terms
350 | ```
351 | 
352 | ```
353 | ## # A tibble: 40 x 3
354 | ##    topic term         beta
355 | ##    &lt;int&gt; &lt;chr&gt;       &lt;dbl&gt;
356 | ##  1     1 mr        0.0190 
357 | ##  2     1 elizabeth 0.0141 
358 | ##  3     1 mrs       0.00886
359 | ##  4     1 darcy     0.00881
360 | ##  5     1 bennet    0.00694
361 | ##  6     1 miss      0.00674
362 | ##  7     1 jane      0.00652
363 | ##  8     1 bingley   0.00607
364 | ##  9     1 time      0.00493
365 | ## 10     1 good      0.00480
366 | ## # … with 30 more rows
367 | ```
368 | 
369 | ---
370 | ## **Exploring the output of topic modeling**
371 | 
372 | 
373 | ```r
374 | top_terms %&gt;%
375 |   mutate(term = fct_reorder(term, beta)) %&gt;%
376 |   ggplot(aes(term, beta, fill = factor(topic))) +
377 |   geom_col(show.legend = FALSE) +
378 |   facet_wrap(~ topic, scales = "free") +
379 |   coord_flip()
380 | ```
381 | 
382 | ---
383 | 
384 | ![](modeling_files/figure-html/unnamed-chunk-10-1.png)&lt;!-- --&gt;
385 | 
386 | ---
387 | 
388 | ## **How are documents classified?**
389 | 
390 | 
391 | ```r
392 | chapters_gamma &lt;- tidy(topic_model, matrix = "gamma",
393 |                        document_names = rownames(words_sparse))
394 | 
395 | chapters_gamma
396 | ```
397 | 
398 | ```
399 | ## # A tibble: 772 x 3
400 | ##    document               topic    gamma
401 | ##    &lt;chr&gt;                  &lt;int&gt;    &lt;dbl&gt;
402 | ##  1 Great Expectations_57      1 0.000792
403 | ##  2 Great Expectations_7       1 0.00340 
404 | ##  3 Pride and Prejudice_18     1 1.000   
405 | ##  4 Great Expectations_17      1 0.0480  
406 | ##  5 Great Expectations_27      1 0.000367
407 | ##  6 Great Expectations_38      1 0.00110 
408 | ##  7 Great Expectations_2       1 0.000531
409 | ##  8 Great Expectations_23      1 0.432   
410 | ##  9 Great Expectations_15      1 0.000565
411 | ## 10 Great Expectations_18      1 0.000277
412 | ## # … with 762 more rows
413 | ```
414 | 
415 | ---
416 | 
417 | ## **How are documents classified?**
418 | 
419 | 
420 | ```r
421 | chapters_parsed &lt;- chapters_gamma %&gt;%
422 |   separate(document, c("title", "chapter"), 
423 |            sep = "_", convert = TRUE)
424 | 
425 | chapters_parsed
426 | ```
427 | 
428 | ```
429 | ## # A tibble: 772 x 4
430 | ##    title               chapter topic    gamma
431 | ##    &lt;chr&gt;                 &lt;int&gt; &lt;int&gt;    &lt;dbl&gt;
432 | ##  1 Great Expectations       57     1 0.000792
433 | ##  2 Great Expectations        7     1 0.00340 
434 | ##  3 Pride and Prejudice      18     1 1.000   
435 | ##  4 Great Expectations       17     1 0.0480  
436 | ##  5 Great Expectations       27     1 0.000367
437 | ##  6 Great Expectations       38     1 0.00110 
438 | ##  7 Great Expectations        2     1 0.000531
439 | ##  8 Great Expectations       23     1 0.432   
440 | ##  9 Great Expectations       15     1 0.000565
441 | ## 10 Great Expectations       18     1 0.000277
442 | ## # … with 762 more rows
443 | ```
444 | 
445 | ---
446 | 
447 | ## **How are documents classified?**
448 | 
449 | 
450 | ```r
451 | chapters_parsed %&gt;%
452 |   mutate(title = fct_reorder(title, gamma * topic)) %&gt;%
453 |   ggplot(aes(factor(topic), gamma)) +
454 |   geom_boxplot() +
455 |   facet_wrap(~ title)
456 | ```
457 | 
458 | ---
459 | 
460 | ![](modeling_files/figure-html/unnamed-chunk-14-1.png)&lt;!-- --&gt;
461 | 
462 | ---
463 | 
464 | class: center, middle, inverse
465 | 
466 | background-image: url(figs/p_and_p_cover.png)
467 | background-size: cover
468 | 
469 | # GOING FARTHER 🚀
470 | 
471 | ---
472 | 
473 | ## Tidying model output
474 | 
475 | ### Which words in each document are assigned to which topics?
476 | 
477 | - .large[`augment()`]
478 | - .large[Add information to each observation in the original data]
479 | 
480 | ---
481 | 
482 | background-image: url(figs/stm_video.png)
483 | background-size: 850px
484 | 
485 | ---
486 | 
487 | ## **Using stm**
488 | 
489 | - .large[Document-level covariates]
490 | 
491 | 
492 | ```r
493 | topic_model &lt;- stm(words_sparse, K = 0, init.type = "Spectral",
494 |                    prevalence = ~s(Year),
495 |                    data = covariates,
496 |                    verbose = FALSE)
497 | ```
498 | 
499 | - .large[Use functions for `semanticCoherence()`, `checkResiduals()`, `exclusivity()`, and more!]
500 | 
501 | - .large[Check out http://www.structuraltopicmodel.com/]
502 | 
503 | - .large[See [my blog post](https://juliasilge.com/blog/evaluating-stm/) for how to choose `K`, the number of topics]
504 | 
505 | ---
506 | 
507 | 
508 | background-image: url(figs/model_diagnostic-1.png)
509 | background-position: 50% 50%
510 | background-size: 950px
511 | 
512 | ---
513 | 
514 | # Stemming?
515 | 
516 | .large[Advice from [Schofield &amp; Mimno](https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf)]
517 | 
518 | .large["Comparing Apples to Apple: The Effects of Stemmers on Topic Models"]
519 | 
520 | ---
521 | 
522 | class: right, middle
523 | 
524 | &lt;h1 class="fa fa-quote-left fa-fw"&gt;&lt;/h1&gt;
525 | 
526 | &lt;h2&gt; Despite their frequent use in topic modeling, we find that stemmers produce no meaningful improvement in likelihood and coherence and in fact can degrade topic stability. &lt;/h2&gt;
527 | 
528 | &lt;h1 class="fa fa-quote-right fa-fw"&gt;&lt;/h1&gt;
529 | 
530 | ---
531 | 
532 | class: right, middle, inverse
533 | 
534 | background-image: url(figs/p_and_p_cover.png)
535 | background-size: cover
536 | 
537 | 
538 | # TEXT CLASSIFICATION
539 | &lt;h1 class="fa fa-balance-scale fa-fw"&gt;&lt;/h1&gt;
540 | 
541 | ---
542 | 
543 | ## **Downloading your text data**
544 | 
545 | 
546 | ```r
547 | library(tidyverse)
548 | library(gutenbergr)
549 | 
550 | titles &lt;- c("The War of the Worlds",
551 |             "Pride and Prejudice")
552 | 
553 | books &lt;- gutenberg_works(title %in% titles) %&gt;%
554 |   gutenberg_download(meta_fields = "title") %&gt;%
555 |   mutate(document = row_number())
556 | 
557 | books
558 | ```
559 | 
560 | ```
561 | ## # A tibble: 19,504 x 4
562 | ##    gutenberg_id text                                 title         document
563 | ##           &lt;int&gt; &lt;chr&gt;                                &lt;chr&gt;            &lt;int&gt;
564 | ##  1           36 The War of the Worlds                The War of t…        1
565 | ##  2           36 ""                                   The War of t…        2
566 | ##  3           36 by H. G. Wells [1898]                The War of t…        3
567 | ##  4           36 ""                                   The War of t…        4
568 | ##  5           36 ""                                   The War of t…        5
569 | ##  6           36 "     But who shall dwell in these … The War of t…        6
570 | ##  7           36 "     inhabited? .  .  .  Are we or… The War of t…        7
571 | ##  8           36 "     World? .  .  .  And how are a… The War of t…        8
572 | ##  9           36 "          KEPLER (quoted in The An… The War of t…        9
573 | ## 10           36 ""                                   The War of t…       10
574 | ## # … with 19,494 more rows
575 | ```
576 | 
577 | ---
578 | 
579 | ## **Making a tidy dataset**
580 | 
581 | .large[Use this kind of data structure for EDA! 💅]
582 | 
583 | 
584 | ```r
585 | library(tidytext)
586 | 
587 | tidy_books &lt;- books %&gt;%
588 | * unnest_tokens(word, text) %&gt;%
589 |   group_by(word) %&gt;%
590 |   filter(n() &gt; 10) %&gt;%
591 |   ungroup
592 | 
593 | tidy_books
594 | ```
595 | 
596 | ```
597 | ## # A tibble: 159,707 x 4
598 | ##    gutenberg_id title                 document word 
599 | ##           &lt;int&gt; &lt;chr&gt;                    &lt;int&gt; &lt;chr&gt;
600 | ##  1           36 The War of the Worlds        1 the  
601 | ##  2           36 The War of the Worlds        1 war  
602 | ##  3           36 The War of the Worlds        1 of   
603 | ##  4           36 The War of the Worlds        1 the  
604 | ##  5           36 The War of the Worlds        3 by   
605 | ##  6           36 The War of the Worlds        6 but  
606 | ##  7           36 The War of the Worlds        6 who  
607 | ##  8           36 The War of the Worlds        6 shall
608 | ##  9           36 The War of the Worlds        6 in   
609 | ## 10           36 The War of the Worlds        6 these
610 | ## # … with 159,697 more rows
611 | ```
612 | 
613 | ---
614 | 
615 | ## **Cast to a sparse matrix**
616 | 
617 | .large[And build a dataframe with a response variable]
618 | 
619 | 
620 | ```r
621 | sparse_words &lt;- tidy_books %&gt;%
622 |   count(document, word, sort = TRUE) %&gt;%
623 | * cast_sparse(document, word, n)
624 | 
625 | books_joined &lt;- tibble(document = as.integer(rownames(sparse_words))) %&gt;%
626 |   left_join(books %&gt;%
627 |               select(document, title), by = "document")  # explicit join key
628 | ```
629 | 
630 | ---
631 | 
632 | ## **Train a glmnet model**
633 | 
634 | 
635 | ```r
636 | library(glmnet)
637 | library(doMC)
638 | registerDoMC(cores = 8)
639 | 
640 | is_jane &lt;- books_joined$title == "Pride and Prejudice"
641 | 
642 | model &lt;- cv.glmnet(sparse_words, is_jane, family = "binomial", 
643 |                    parallel = TRUE, keep = TRUE)
644 | ```
645 | 
646 | ---
647 | 
648 | ## **Tidying our model**
649 | 
650 | .large[Tidy, then filter to choose some lambda from glmnet output]
651 | 
652 | 
653 | ```r
654 | library(broom)
655 | 
656 | coefs &lt;- model$glmnet.fit %&gt;%
657 |   tidy() %&gt;%
658 |   filter(lambda == model$lambda.1se)
659 | 
660 | Intercept &lt;- coefs %&gt;%
661 |   filter(term == "(Intercept)") %&gt;%
662 |   pull(estimate)
663 | ```
664 | 
665 | ---
666 | 
667 | ## **Tidying our model**
668 | 
669 | 
670 | ```r
671 | classifications &lt;- tidy_books %&gt;%
672 |   inner_join(coefs, by = c("word" = "term")) %&gt;%
673 |   group_by(document) %&gt;%
674 |   summarize(score = sum(estimate)) %&gt;%
675 |   mutate(probability = plogis(Intercept + score))
676 | 
677 | classifications
678 | ```
679 | 
680 | ```
681 | ## # A tibble: 16,001 x 3
682 | ##    document  score probability
683 | ##       &lt;int&gt;  &lt;dbl&gt;       &lt;dbl&gt;
684 | ##  1        1 -2.34      0.110  
685 | ##  2        3  0.205     0.611  
686 | ##  3        6  1.85      0.890  
687 | ##  4        7 -1.02      0.315  
688 | ##  5        8 -1.25      0.268  
689 | ##  6        9 -0.526     0.430  
690 | ##  7       13 -0.238     0.502  
691 | ##  8       15 -5.47      0.00533
692 | ##  9       19  0.373     0.650  
693 | ## 10       21 -2.34      0.110  
694 | ## # … with 15,991 more rows
695 | ```
696 | 
697 | ---
698 | 
699 | ## **Understanding our model**
700 | 
701 | 
702 | ```r
703 | coefs %&gt;%
704 |   group_by(estimate &gt; 0) %&gt;%
705 |   top_n(10, abs(estimate)) %&gt;%
706 |   ungroup %&gt;%
707 |   ggplot(aes(fct_reorder(term, estimate), 
708 |              estimate, 
709 |              fill = estimate &gt; 0)) +
710 |   geom_col(show.legend = FALSE) +
711 |   coord_flip()
712 | ```
713 | 
714 | ---
715 | 
716 | ![](modeling_files/figure-html/unnamed-chunk-23-1.png)&lt;!-- --&gt;
717 | 
718 | ---
719 | 
720 | ## **ROC**
721 | 
722 | 
723 | ```r
724 | library(yardstick)
725 | 
726 | comment_classes &lt;- classifications %&gt;%
727 |   left_join(books %&gt;%
728 |     select(title, document), by = "document") %&gt;%
729 |   mutate(title = as.factor(title))
730 | ```
731 | 
732 | ---
733 | 
734 | ## **ROC**
735 | 
736 | 
737 | ```r
738 | comment_classes %&gt;%
739 |   roc_curve(title, probability) %&gt;%
740 |   ggplot(aes(x = 1 - specificity, y = sensitivity)) +
741 |   geom_line(
742 |     color = "midnightblue",
743 |     size = 1.5
744 |   ) +
745 |   geom_abline(
746 |     lty = 2, alpha = 0.5,
747 |     color = "gray50",
748 |     size = 1.2
749 |   )
750 | ```
751 | 
752 | ---
753 | 
754 | ![](modeling_files/figure-html/unnamed-chunk-26-1.png)&lt;!-- --&gt;
755 | 
756 | ---
757 | 
758 | ## **AUC for model**
759 | 
760 | 
761 | ```r
762 | comment_classes %&gt;%
763 |   roc_auc(title, probability)
764 | ```
765 | 
766 | ```
767 | ## # A tibble: 1 x 3
768 | ##   .metric .estimator .estimate
769 | ##   &lt;chr&gt;   &lt;chr&gt;          &lt;dbl&gt;
770 | ## 1 roc_auc binary         0.990
771 | ```
772 | 
773 | ---
774 | 
775 | ## **Confusion matrix**
776 | 
777 | 
778 | ```r
779 | comment_classes %&gt;%
780 |   mutate(
781 |     prediction = case_when(
782 |       probability &gt; 0.5 ~ "Pride and Prejudice",
783 |       TRUE ~ "The War of the Worlds"
784 |     ),
785 |     prediction = as.factor(prediction)
786 |   ) %&gt;%
787 |   conf_mat(title, prediction)
788 | ```
789 | 
790 | ```
791 | ##                        Truth
792 | ## Prediction              Pride and Prejudice The War of the Worlds
793 | ##   Pride and Prejudice                 10351                   484
794 | ##   The War of the Worlds                 264                  4902
795 | ```
796 | 
797 | ---
798 | 
799 | ## **Misclassifications**
800 | 
801 | Let's talk about misclassifications. Which documents here were incorrectly predicted to be written by Jane Austen?
802 | 
803 | 
804 | ```r
805 | comment_classes %&gt;%
806 |   filter(
807 | *   probability &gt; .8,
808 | *   title == "The War of the Worlds"
809 |   ) %&gt;%
810 |   sample_n(10) %&gt;%
811 |   inner_join(books %&gt;%  # bring back the original line text
812 |     select(document, text), by = "document") %&gt;%
813 |   select(probability, text)
814 | ```
815 | 
816 | ```
817 | ## # A tibble: 10 x 2
818 | ##    probability text                                                        
819 | ##          &lt;dbl&gt; &lt;chr&gt;                                                       
820 | ##  1       0.858 ladies there being by no means the least active.            
821 | ##  2       0.851 is wrong as well as any, but not what is possible to tortur…
822 | ##  3       0.972 She put her hand to her throat--swayed.  I made a step forw…
823 | ##  4       0.962 "\"Be a man!\" said I.  \"You are scared out of your wits! …
824 | ##  5       0.832 "\"Take this!\" said the slender lady, and she gave my brot…
825 | ##  6       0.827 decorum were necessarily different from ours; and not only …
826 | ##  7       0.906 "\"Half a mile, you say?\" said he."                        
827 | ##  8       0.910 breed.  I tell you, I'm grim set on living.  And if I'm not…
828 | ##  9       0.854 would be advisable to kill him, lest his actions attracted …
829 | ## 10       0.919 winter.  Its air is much more attenuated than ours, its oce…
830 | ```
831 | 
832 | ---
833 | 
834 | ## **Misclassifications**
835 | 
836 | Let's talk about misclassifications. Which documents here were incorrectly predicted to *not* be written by Jane Austen?
837 | 
838 | 
839 | ```r
840 | comment_classes %&gt;%
841 |   filter(
842 | *   probability &lt; .3,
843 | *   title == "Pride and Prejudice"
844 |   ) %&gt;%
845 |   sample_n(10) %&gt;%
846 |   inner_join(books %&gt;%  # bring back the original line text
847 |     select(document, text), by = "document") %&gt;%
848 |   select(probability, text)
849 | ```
850 | 
851 | ```
852 | ## # A tibble: 10 x 2
853 | ##    probability text                                                        
854 | ##          &lt;dbl&gt; &lt;chr&gt;                                                       
855 | ##  1       0.188 is so violent, that it would be the death of half the good …
856 | ##  2       0.208 blush. He absolutely started, and for a moment seemed immov…
857 | ##  3       0.269 of contradictions and varieties, sighed at the perverseness…
858 | ##  4       0.220 suddenly came forward from the road, which led behind it to…
859 | ##  5       0.199 "\"A little sea-bathing would set me up forever.\""         
860 | ##  6       0.286 it had just transpired that he had left gaming debts behind…
861 | ##  7       0.266 of the gates into the ground.                               
862 | ##  8       0.120 the happiest of men.                                        
863 | ##  9       0.218 They travelled as expeditiously as possible, and, sleeping …
864 | ## 10       0.279 the improvements it was receiving, he was happily employed …
865 | ```
866 | 
867 | ---
868 | 
869 | background-image: url(figs/tmwr_0601.png)
870 | background-position: 50% 70%
871 | background-size: 750px
872 | 
873 | ## **Workflow for text mining/modeling**
874 | 
875 | ---
876 | 
877 | background-image: url(figs/lizzieskipping.gif)
878 | background-position: 50% 55%
879 | background-size: 750px
880 | 
881 | # **Go explore real-world text!**
882 | 
883 | ---
884 | 
885 | class: left, middle
886 | 
887 | &lt;img src="figs/blue_jane.png" width="150px"/&gt;
888 | 
889 | # Thanks!
890 | 
891 | &lt;a href="https://tidytextmining.com"&gt;&lt;i class="fa fa-book fa-fw"&gt;&lt;/i&gt;&amp;nbsp; tidytextmining.com&lt;/a&gt;&lt;br&gt;
892 | &lt;a href="http://twitter.com/juliasilge"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
893 | &lt;a href="http://github.com/juliasilge"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @juliasilge&lt;/a&gt;&lt;br&gt;
894 | &lt;a href="https://juliasilge.com"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; juliasilge.com&lt;/a&gt;&lt;br&gt;
895 | &lt;a href="http://twitter.com/dataandme"&gt;&lt;i class="fa fa-twitter fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @dataandme&lt;/a&gt;&lt;br&gt;
896 | &lt;a href="http://github.com/batpigandme"&gt;&lt;i class="fa fa-github fa-fw"&gt;&lt;/i&gt;&amp;nbsp; @batpigandme&lt;/a&gt;&lt;br&gt;
897 | &lt;a href="https://maraaverick.rbind.io"&gt;&lt;i class="fa fa-link fa-fw"&gt;&lt;/i&gt;&amp;nbsp; maraaverick.rbind.io&lt;/a&gt;&lt;br&gt;
898 | 
899 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
900 |     </textarea>
901 | <style data-target="print-only">@media screen {.remark-slide-container{display:block;}.remark-slide-scaler{box-shadow:none;}}</style>
902 | <script src="https://remarkjs.com/downloads/remark-latest.min.js"></script>
903 | <script>var slideshow = remark.create({  // create the slideshow object used by the scripts below
904 | "highlightStyle": "github",
905 | "highlightLines": true,
906 | "countIncrementalSlides": false,
907 | "ratio": "16:9"
908 | });
909 | if (window.HTMLWidgets) slideshow.on('afterShowSlide', function (slide) {  // registered only when window.HTMLWidgets exists
910 |   window.dispatchEvent(new Event('resize'));  // fire a window 'resize' event after each slide is shown
911 | });
912 | (function(d) {  // add an @page size rule built from the slide scaler's inline width/height
913 |   var s = d.createElement("style"), r = d.querySelector(".remark-slide-scaler");
914 |   if (!r) return;  // no .remark-slide-scaler element: add nothing
915 |   s.type = "text/css"; s.innerHTML = "@page {size: " + r.style.width + " " + r.style.height +"; }";
916 |   d.head.appendChild(s);
917 | })(document);
918 | 
919 | (function(d) {
920 |   var el = d.getElementsByClassName("remark-slides-area");
921 |   if (!el) return;
922 |   var slide, slides = slideshow.getSlides(), els = el[0].children;
923 |   for (var i = 1; i < slides.length; i++) {
924 |     slide = slides[i];
925 |     if (slide.properties.continued === "true" || slide.properties.count === "false") {
926 |       els[i - 1].className += ' has-continuation';
927 |     }
928 |   }
929 |   var s = d.createElement("style");
930 |   s.type = "text/css"; s.innerHTML = "@media print { .has-continuation { display: none; } }";
931 |   d.head.appendChild(s);
932 | })(document);
933 | // delete the temporary CSS (for displaying all slides initially) when the user
934 | // starts to view slides
935 | (function() {
936 |   var deleted = false;  // set once the cleanup has run so later slide changes skip it
937 |   slideshow.on('beforeShowSlide', function(slide) {
938 |     if (deleted) return;
939 |     var sheets = document.styleSheets, node;
940 |     for (var i = 0; i < sheets.length; i++) {
941 |       node = sheets[i].ownerNode;
942 |       if (node.dataset["target"] !== "print-only") continue;  // keep every sheet except the one tagged data-target="print-only"
943 |       node.parentNode.removeChild(node);
944 |     }
945 |     deleted = true;
946 |   });
947 | })();</script>
948 | 
949 | <script>
950 | (function() {  // set target="_blank" on every link whose href is an absolute URL
951 |   var links = document.getElementsByTagName('a');
952 |   for (var i = 0; i < links.length; i++) {
953 |     if (/^(https?:)?\/\//.test(links[i].getAttribute('href'))) {  // href begins with http://, https://, or //
954 |       links[i].target = '_blank';
955 |     }
956 |   }
957 | })();
958 | </script>
959 | 
960 | <script>
961 | slideshow._releaseMath = function(el) {  // unwrap <code> elements under el whose whole text is a math expression
962 |   var i, text, code, codes = el.getElementsByTagName('code');
963 |   for (i = 0; i < codes.length;) {  // i is advanced only at the bottom; a replacement skips the increment
964 |     code = codes[i];
965 |     if (code.parentNode.tagName !== 'PRE' && code.childElementCount === 0) {  // ignore code directly inside <pre> or with child elements
966 |       text = code.textContent;
967 |       if (/^\\\((.|\s)+\\\)$/.test(text) || /^\\\[(.|\s)+\\\]$/.test(text) ||  // \( ... \) or \[ ... \]
968 |           /^\$\$(.|\s)+\$\$$/.test(text) ||  // $$ ... $$
969 |           /^\\begin\{([^}]+)\}(.|\s)+\\end\{[^}]+\}$/.test(text)) {  // \begin{...} ... \end{...}
970 |         code.outerHTML = code.innerHTML;  // remove <code></code>
971 |         continue;
972 |       }
973 |     }
974 |     i++;
975 |   }
976 | };
977 | slideshow._releaseMath(document);
978 | </script>
979 | <!-- dynamically load mathjax for compatibility with self-contained -->
980 | <script>
981 | (function () {  // append a <script> element for MathJax to the document head
982 |   var script = document.createElement('script');
983 |   script.type = 'text/javascript';
984 |   script.src  = 'https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-MML-AM_CHTML';
985 |   if (location.protocol !== 'file:' && /^https?:/.test(script.src))  // page not opened via file: and src has an http(s) scheme
986 |     script.src  = script.src.replace(/^https?:/, '');  // strip the scheme, leaving a protocol-relative URL
987 |   document.getElementsByTagName('head')[0].appendChild(script);
988 | })();
989 | </script>
990 |   </body>
991 | </html>
992 | 


--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-23-1.png


--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-26-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-26-1.png


--------------------------------------------------------------------------------
/pdfs/juliasilge-textmining-sdss-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textmining-sdss-2.pdf


--------------------------------------------------------------------------------
/pdfs/juliasilge-textminnig-sdss-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textminnig-sdss-1.pdf


--------------------------------------------------------------------------------
/sdss2019.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------