├── .gitignore
├── README.md
├── css
├── footer_plus.css
└── xaringan-themer.css
├── figs
├── austen-1.png
├── blue_jane.png
├── cover.png
├── lizzieskipping.gif
├── model_diagnostic-1.png
├── p_and_p_cover.png
├── plot_tf_idf-1.png
├── purple_emily.png
├── slider.gif
├── stm_video.png
├── stop.gif
├── tidytext_repo.png
├── tilecounts-1.png
├── tilerate-1.png
├── tmwr_0601.png
├── top_tags-1.png
└── vexing.gif
├── header.html
├── intro.Rmd
├── intro.html
├── intro_files
└── figure-html
│ ├── unnamed-chunk-13-1.png
│ ├── unnamed-chunk-21-1.png
│ └── unnamed-chunk-27-1.png
├── libs
└── remark-css
│ └── default.css
├── modeling.Rmd
├── modeling.html
├── modeling_files
└── figure-html
│ ├── unnamed-chunk-10-1.png
│ ├── unnamed-chunk-14-1.png
│ ├── unnamed-chunk-23-1.png
│ └── unnamed-chunk-26-1.png
├── pdfs
├── juliasilge-textmining-sdss-2.pdf
└── juliasilge-textminnig-sdss-1.pdf
└── sdss2019.Rproj
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *cache*
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tidy Text Mining at SDSS 2019
2 |
3 | Slides for my short course on text mining at the Symposium on Data Science and Statistics
4 | in May 2019.
5 |
6 | Check out the slides at [bit.ly/silge-sdss-1](https://bit.ly/silge-sdss-1) and [bit.ly/silge-sdss-2](https://bit.ly/silge-sdss-2)!
7 |
8 |
9 | Slides created with [remark.js](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
10 |
11 | My xaringan theme (from [xaringanthemer](https://pkg.garrickadenbuie.com/xaringanthemer/)):
12 |
13 | ```
14 | mono_accent(
15 | base_color = "#09017F",
16 | header_font_google = google_font("Roboto", "700"),
17 | text_font_google = google_font("Roboto Condensed"),
18 | code_font_google = google_font("Droid Mono"),
19 | code_highlight_color = "#D2B6E8"
20 | )
21 | ```
22 |
--------------------------------------------------------------------------------
/css/footer_plus.css:
--------------------------------------------------------------------------------
1 | .large { font-size: 160% }
2 |
3 | .title-slide {
4 | background-image: url(../figs/p_and_p_cover.png);
5 | background-size: cover;
6 | }
7 |
8 | .title-slide .remark-slide-number { /* redundant: the unscoped .remark-slide-number rule below already applies display: none everywhere */
9 | display: none;
10 | }
11 |
12 | .remark-slide-number { /* hides every element with class remark-slide-number, on all slides */
13 | display: none;
14 | }
15 |
16 | div.my-footer {
17 | background-color: #050045;
18 | position: absolute;
19 | bottom: 0px;
20 | left: 0px;
21 | height: 20px;
22 | width: 100%;
23 | }
24 | div.my-footer span {
25 | font-size: 10pt;
26 | color: #F7F8FA;
27 | position: absolute;
28 | left: 15px;
29 | bottom: 2px;
30 | }
31 |
--------------------------------------------------------------------------------
/css/xaringan-themer.css:
--------------------------------------------------------------------------------
1 | /* -------------------------------------------------------
2 | *
3 | * !! This file was generated by xaringanthemer !!
4 | *
5 | * Changes made to this file directly will be overwritten
6 | * if you used xaringanthemer in your xaringan slides Rmd
7 | *
8 | * Issues or likes?
9 | * - https://github.com/gadenbuie/xaringanthemer
10 | * - https://www.garrickadenbuie.com
11 | *
12 | * Need help? Try:
13 | * - vignette(package = "xaringanthemer")
14 | * - ?xaringanthemer::write_xaringan_theme
15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki
16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki
17 | *
18 | * ------------------------------------------------------- */
19 | @import url(https://fonts.googleapis.com/css?family=Roboto+Condensed);
20 | @import url(https://fonts.googleapis.com/css?family=Roboto:700);
21 | @import url(https://fonts.googleapis.com/css?family=Droid+Mono);
22 |
23 |
24 | body {
25 | font-family: Roboto Condensed, 'Palatino Linotype', 'Book Antiqua', Palatino, 'Microsoft YaHei', 'Songti SC', serif;
26 | font-weight: normal; /* was an empty (invalid) declaration that browsers discard; state the effective value explicitly */
27 | color: #272822;
28 | }
29 | h1, h2, h3 {
30 | font-family: Roboto;
31 | font-weight: normal;
32 | color: #09017F;
33 | }
34 | .remark-slide-content {
35 | background-color: #FFFFFF;
36 | font-size: 20px;
37 |
38 |
39 |
40 | padding: 1em 4em 1em 4em;
41 | }
42 | .remark-slide-content h1 {
43 | font-size: 55px;
44 | }
45 | .remark-slide-content h2 {
46 | font-size: 45px;
47 | }
48 | .remark-slide-content h3 {
49 | font-size: 35px;
50 | }
51 | .remark-code, .remark-inline-code {
52 | font-family: Droid Mono, 'Lucida Console', Monaco, monospace;
53 | }
54 | .remark-code {
55 | font-size: 0.9em;
56 | }
57 | .remark-inline-code {
58 | font-size: 1em;
59 | color: #09017F;
60 |
61 |
62 | }
63 | .remark-slide-number {
64 | color: #09017F;
65 | opacity: 1;
66 | font-size: 0.9em;
67 | }
68 | strong{color:#09017F;}
69 | a, a > code {
70 | color: #09017F;
71 | text-decoration: none;
72 | }
73 | .footnote {
74 |
75 | position: absolute;
76 | bottom: 3em;
77 | padding-right: 4em;
78 | font-size: 0.9em;
79 | }
80 | .remark-code-line-highlighted {
81 | background-color: #D2B6E8;
82 | }
83 | .inverse {
84 | background-color: #09017F;
85 | color: #FFFFFF;
86 |
87 | }
88 | .inverse h1, .inverse h2, .inverse h3 {
89 | color: #FFFFFF;
90 | }
91 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 {
92 | color: #FFFFFF;
93 | }
94 | .title-slide {
95 | background-color: #09017F;
96 |
97 |
98 |
99 | }
100 | .title-slide .remark-slide-number {
101 | display: none;
102 | }
103 | /* Two-column layout */
104 | .left-column {
105 | width: 20%;
106 | height: 92%;
107 | float: left;
108 | }
109 | .left-column h2, .left-column h3 {
110 | color: #09017F99;
111 | }
112 | .left-column h2:last-of-type, .left-column h3:last-child {
113 | color: #09017F;
114 | }
115 | .right-column {
116 | width: 75%;
117 | float: right;
118 | padding-top: 1em;
119 | }
120 | .pull-left {
121 | float: left;
122 | width: 47%;
123 | }
124 | .pull-right {
125 | float: right;
126 | width: 47%;
127 | }
128 | .pull-right ~ * {
129 | clear: both;
130 | }
131 | img, video, iframe {
132 | max-width: 100%;
133 | }
134 | blockquote {
135 | border-left: solid 5px #09017F80;
136 | padding-left: 1em;
137 | }
138 | .remark-slide table {
139 | margin: auto;
140 | border-top: 1px solid #666;
141 | border-bottom: 1px solid #666;
142 | }
143 | .remark-slide table thead th { border-bottom: 1px solid #ddd; }
144 | th, td { padding: 5px; }
145 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #B5B2D8 }
146 | table.dataTable tbody {
147 | background-color: #FFFFFF;
148 | color: #272822;
149 | }
150 | table.dataTable.display tbody tr.odd {
151 | background-color: #FFFFFF;
152 | }
153 | table.dataTable.display tbody tr.even {
154 | background-color: #B5B2D8;
155 | }
156 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover {
157 | background-color: rgba(255, 255, 255, 0.5);
158 | }
159 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate {
160 | color: #272822;
161 | }
162 | .dataTables_wrapper .dataTables_paginate .paginate_button {
163 | color: #272822 !important;
164 | }
165 |
166 | @page { margin: 0; }
167 | @media print {
168 | .remark-slide-scaler {
169 | width: 100% !important;
170 | height: 100% !important;
171 | transform: scale(1) !important;
172 | top: 0 !important;
173 | left: 0 !important;
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/figs/austen-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/austen-1.png
--------------------------------------------------------------------------------
/figs/blue_jane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/blue_jane.png
--------------------------------------------------------------------------------
/figs/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/cover.png
--------------------------------------------------------------------------------
/figs/lizzieskipping.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/lizzieskipping.gif
--------------------------------------------------------------------------------
/figs/model_diagnostic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/model_diagnostic-1.png
--------------------------------------------------------------------------------
/figs/p_and_p_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/p_and_p_cover.png
--------------------------------------------------------------------------------
/figs/plot_tf_idf-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/plot_tf_idf-1.png
--------------------------------------------------------------------------------
/figs/purple_emily.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/purple_emily.png
--------------------------------------------------------------------------------
/figs/slider.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/slider.gif
--------------------------------------------------------------------------------
/figs/stm_video.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stm_video.png
--------------------------------------------------------------------------------
/figs/stop.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stop.gif
--------------------------------------------------------------------------------
/figs/tidytext_repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tidytext_repo.png
--------------------------------------------------------------------------------
/figs/tilecounts-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilecounts-1.png
--------------------------------------------------------------------------------
/figs/tilerate-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilerate-1.png
--------------------------------------------------------------------------------
/figs/tmwr_0601.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tmwr_0601.png
--------------------------------------------------------------------------------
/figs/top_tags-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/top_tags-1.png
--------------------------------------------------------------------------------
/figs/vexing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/vexing.gif
--------------------------------------------------------------------------------
/header.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/intro.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Text Mining"
3 | subtitle: " USING TIDY DATA PRINCIPLES"
4 | author: "Julia Silge | SDSS | 29 May 2019"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: github
11 | highlightLines: true
12 | countIncrementalSlides: false
13 | ratio: "16:9"
14 | seal: false
15 | includes:
16 | in_header: header.html
17 | ---
18 |
19 | ```{r setup, include=FALSE}
20 | options(htmltools.dir.version = FALSE)
21 | library(knitr)
22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180)
23 | library(ggplot2)
24 | library(silgelib)
25 | theme_set(theme_roboto())
26 | ```
27 |
28 | layout: true
29 |
30 |
31 |
32 | ---
33 |
34 | class: inverse, center, middle
35 |
36 | background-image: url(figs/p_and_p_cover.png)
37 | background-size: cover
38 |
39 |
40 | # Text Mining
41 |
42 |
43 |
44 | ### USING TIDY PRINCIPLES
45 |
46 | .large[Julia Silge | SDSS | 29 May 2019]
47 |
48 | ---
49 |
50 | ## Let's install some packages
51 |
52 | ```{r, eval=FALSE}
53 | install.packages(c("tidyverse",
54 | "tidytext",
55 | "gutenbergr"))
56 | ```
57 |
58 | ---
59 |
60 | class: right, middle
61 |
62 |
63 |
64 | # Find us at...
65 |
66 | @juliasilge
67 | @juliasilge
68 | juliasilge.com
69 |
70 | ---
71 |
72 | class: right, middle
73 |
74 |
75 |
76 | # Find us at...
77 |
78 | @dataandme
79 | @batpigandme
80 | maraaverick.rbind.io
81 |
82 | ---
83 |
84 | class: inverse
85 |
86 | ## Text in the real world
87 |
88 | --
89 |
90 | - .large[Text data is increasingly important `r emo::ji("books")`]
91 |
92 | --
93 |
94 | - .large[NLP training is scarce on the ground `r emo::ji("scream")`]
95 |
96 | ---
97 |
98 | background-image: url(figs/vexing.gif)
99 | background-position: 50% 50%
100 | background-size: 650px
101 |
102 | ---
103 |
104 | background-image: url(figs/p_and_p_cover.png)
105 | background-size: cover
106 |
107 | class: inverse, center, middle
108 |
109 | # TIDY DATA PRINCIPLES + TEXT MINING = `r emo::ji("tada")`
110 |
111 | ---
112 |
113 | background-image: url(figs/tidytext_repo.png)
114 | background-size: 800px
115 | background-position: 50% 20%
116 |
117 | class: bottom, right
118 |
119 | .large[[https://github.com/juliasilge/tidytext](https://github.com/juliasilge/tidytext)]
120 |
121 | .large[[http://tidytextmining.com/](http://tidytextmining.com/)]
122 |
123 | ---
124 |
125 | background-image: url(figs/cover.png)
126 | background-size: 450px
127 | background-position: 50% 50%
128 |
129 | ---
130 |
131 |
132 |
133 | ## What do we mean by tidy text?
134 |
135 |
136 | ```{r}
137 | text <- c("Because I could not stop for Death -",
138 | "He kindly stopped for me -",
139 | "The Carriage held but just Ourselves -",
140 | "and Immortality")
141 |
142 | text
143 | ```
144 |
145 | ---
146 |
147 |
148 |
149 | ## What do we mean by tidy text?
150 |
151 | ```{r}
152 | library(tidyverse)
153 | text_df <- tibble(line = 1:4, text = text)  # tibble() replaces the deprecated data_frame()
154 |
155 | text_df
156 | ```
157 |
158 | ---
159 |
160 |
161 |
162 | ## What do we mean by tidy text?
163 |
164 | ```{r}
165 | library(tidytext)
166 |
167 | text_df %>%
168 | unnest_tokens(word, text) #<<
169 | ```
170 |
171 | ---
172 |
173 | ## Gathering more data
174 |
175 | .large[You can access the full text of many public domain works from [Project Gutenberg](https://www.gutenberg.org/) using the [gutenbergr](https://ropensci.org/tutorials/gutenbergr_tutorial.html) package.]
176 |
177 |
178 | ```{r}
179 | library(gutenbergr)
180 |
181 | full_text <- gutenberg_download(1342)
182 | ```
183 |
184 | .large[What book do *you* want to analyze today? `r emo::ji_glue(":book: :partying: :book:")`]
185 |
186 | ---
187 |
188 | ## Time to tidy your text!
189 |
190 | ```{r}
191 | tidy_book <- full_text %>%
192 | mutate(line = row_number()) %>%
193 | unnest_tokens(word, text) #<<
194 |
195 | tidy_book
196 | ```
197 |
198 | ---
199 |
200 | ## What are the most common words?
201 |
202 | ```{r}
203 | tidy_book %>%
204 | count(word, sort = TRUE)
205 | ```
206 |
207 | ---
208 |
209 | background-image: url(figs/stop.gif)
210 | background-size: 500px
211 | background-position: 50% 50%
212 |
213 | ## Stop words
214 |
215 | ---
216 |
217 | ## Stop words
218 |
219 | ```{r}
220 | get_stopwords()
221 | ```
222 |
223 | ---
224 |
225 | ## Stop words
226 |
227 | ```{r}
228 | get_stopwords(language = "es")
229 | ```
230 |
231 | ---
232 |
233 | ## Stop words
234 |
235 | ```{r}
236 | get_stopwords(language = "pt")
237 | ```
238 |
239 | ---
240 |
241 | ## Stop words
242 |
243 | ```{r}
244 | get_stopwords(source = "smart")
245 | ```
246 |
247 | ---
248 |
249 | ## What are the most common words?
250 |
251 | ```{r, eval = FALSE}
252 | tidy_book %>%
253 | anti_join(get_stopwords(source = "smart")) %>%
254 | count(word, sort = TRUE) %>%
255 | top_n(20) %>%
256 | ggplot(aes(fct_reorder(word, n), n)) + #<<
257 | geom_col() +
258 | coord_flip()
259 | ```
260 |
261 | ---
262 |
263 | ```{r, echo=FALSE, fig.height=4}
264 | tidy_book %>%
265 | anti_join(get_stopwords(source = "smart")) %>%
266 | count(word, sort = TRUE) %>%
267 | top_n(20) %>%
268 | ggplot(aes(fct_reorder(word, n), n)) +
269 | geom_col(fill = "midnightblue", alpha = 0.9) +
270 | coord_flip() +
271 | scale_y_continuous(expand = c(0,0)) +
272 | labs(x = NULL, y = "Number of occurrences")
273 | ```
274 |
275 | ---
276 |
277 | background-image: url(figs/tilecounts-1.png)
278 | background-size: 700px
279 |
280 | ---
281 |
282 | background-image: url(figs/tilerate-1.png)
283 | background-size: 700px
284 |
285 | ---
286 |
287 | background-image: url(figs/p_and_p_cover.png)
288 | background-size: cover
289 |
290 | class: inverse, center, middle
291 |
292 | ## SENTIMENT ANALYSIS `r emo::ji_glue(":smile: :cry: :angry:")`
293 |
294 | ---
295 |
296 | ## Sentiment lexicons
297 |
298 | ```{r}
299 | get_sentiments("afinn")
300 | ```
301 |
302 | ---
303 |
304 | ## Sentiment lexicons
305 |
306 | ```{r}
307 | get_sentiments("bing")
308 | ```
309 |
310 | ---
311 |
312 | ## Sentiment lexicons
313 |
314 |
315 | ```{r}
316 | get_sentiments("nrc")
317 | ```
318 |
319 | ---
320 |
321 | ## Sentiment lexicons
322 |
323 | ```{r}
324 | get_sentiments("loughran")
325 | ```
326 |
327 | ---
328 |
329 | ## Implementing sentiment analysis
330 |
331 | ```{r}
332 | tidy_book %>%
333 | inner_join(get_sentiments("bing")) %>% #<<
334 | count(sentiment, sort = TRUE)
335 | ```
336 |
337 | ---
338 |
339 | ## Implementing sentiment analysis
340 |
341 | ```{r}
342 | tidy_book %>%
343 | inner_join(get_sentiments("bing")) %>%
344 | count(sentiment, word, sort = TRUE) #<<
345 | ```
346 |
347 | ---
348 |
349 | ## Implementing sentiment analysis
350 |
351 | ```{r, eval = FALSE}
352 | tidy_book %>%
353 | inner_join(get_sentiments("bing")) %>%
354 | count(sentiment, word, sort = TRUE) %>%
355 | group_by(sentiment) %>%
356 | top_n(10) %>%
357 | ungroup %>%
358 | ggplot(aes(fct_reorder(word, n), #<<
359 | n,
360 | fill = sentiment)) +
361 | geom_col() +
362 | coord_flip() +
363 | facet_wrap(~ sentiment, scales = "free")
364 | ```
365 |
366 | ---
367 |
368 | class: middle
369 |
370 | ```{r, echo=FALSE, fig.height=4}
371 | tidy_book %>%
372 | inner_join(get_sentiments("bing")) %>%
373 | count(sentiment, word, sort = TRUE) %>%
374 | group_by(sentiment) %>%
375 | top_n(10) %>%
376 | ungroup %>%
377 | ggplot(aes(fct_reorder(word, n), n, fill = sentiment)) +
378 | geom_col(alpha = 0.9, show.legend = FALSE) +
379 | coord_flip() +
380 | facet_wrap(~ sentiment, scales = "free") +
381 | scale_y_continuous(expand = c(0,0)) +
382 | labs(x = NULL, y = "Number of occurrences")
383 | ```
384 |
385 | ---
386 |
387 | background-image: url(figs/p_and_p_cover.png)
388 | background-size: cover
389 |
390 | class: inverse, center, middle
391 |
392 | ## WHAT IS A DOCUMENT ABOUT? `r emo::ji("thinking")`
393 |
394 | ---
395 |
396 | ## What is a document about?
397 |
398 | - .large[Term frequency]
399 | - .large[Inverse document frequency]
400 |
401 | $$idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}$$
402 |
403 | ### tf-idf is about comparing **documents** within a **collection**.
404 |
405 | ---
406 |
407 | ## Understanding tf-idf
408 |
409 | .large[Make a collection (*corpus*) for yourself! `r emo::ji("nail")`]
410 |
411 | ```{r}
412 | full_collection <- gutenberg_download(c(1342, 158, 161, 141),
413 | meta_fields = "title")
414 |
415 | full_collection
416 | ```
417 |
418 | ---
419 |
420 | ## Counting word frequencies in your collection
421 |
422 | ```{r}
423 | book_words <- full_collection %>%
424 | unnest_tokens(word, text) %>% #<<
425 | count(title, word, sort = TRUE)
426 |
427 | book_words
428 | ```
429 |
430 | ---
431 |
432 | ## Calculating tf-idf
433 |
434 | .large[That's... super exciting???]
435 |
436 | ```{r}
437 | book_tfidf <- book_words %>%
438 | bind_tf_idf(word, title, n) #<<
439 |
440 | book_tfidf
441 | ```
442 |
443 | ---
444 |
445 | ## Calculating tf-idf
446 |
447 | ```{r}
448 | book_tfidf %>%
449 | arrange(-tf_idf)
450 | ```
451 |
452 | ---
453 |
454 | ## Calculating tf-idf
455 |
456 | ```{r, eval = FALSE}
457 | book_tfidf %>%
458 | group_by(title) %>%
459 | top_n(10, tf_idf) %>%   # rank by tf_idf explicitly, not by column order
460 | ungroup() %>%
461 | ggplot(aes(fct_reorder(word, tf_idf), #<<
462 | tf_idf,
463 | fill = title)) +
464 | geom_col(show.legend = FALSE) +
465 | coord_flip() +
466 | facet_wrap(~title, scales = "free")
467 | ```
468 |
469 | ---
470 |
471 | ```{r, echo=FALSE, fig.height=4}
472 | book_tfidf %>%
473 | group_by(title) %>%
474 | top_n(10, tf_idf) %>%   # rank by tf_idf explicitly, not by column order
475 | ungroup() %>%
476 | ggplot(aes(fct_reorder(word, tf_idf),
477 | tf_idf,
478 | fill = title)) +
479 | geom_col(alpha = 0.9, show.legend = FALSE) +
480 | coord_flip() +
481 | facet_wrap(~title, scales = "free") +
482 | scale_y_continuous(expand = c(0,0)) +
483 | labs(x = NULL, y = "tf-idf")
484 | ```
485 |
486 | ---
487 |
488 | background-image: url(figs/plot_tf_idf-1.png)
489 | background-size: 800px
490 |
491 | ---
492 |
493 | ## N-grams... and beyond! `r emo::ji("rocket")`
494 |
495 | ```{r}
496 | tidy_ngram <- full_text %>%
497 | unnest_tokens(bigram, text, token = "ngrams", n = 2) #<<
498 |
499 | tidy_ngram
500 | ```
501 |
502 | ---
503 |
504 | ## N-grams... and beyond! `r emo::ji("rocket")`
505 |
506 | ```{r}
507 | tidy_ngram %>%
508 | count(bigram, sort = TRUE)
509 | ```
510 |
511 | ---
512 |
513 | ## N-grams... and beyond! `r emo::ji("rocket")`
514 |
515 | ```{r}
516 | tidy_ngram %>%
517 | separate(bigram, c("word1", "word2"), sep = " ") %>% #<<
518 | filter(!word1 %in% stop_words$word,
519 | !word2 %in% stop_words$word) %>%
520 | count(word1, word2, sort = TRUE)
521 | ```
522 |
523 | ---
524 |
525 | background-image: url(figs/p_and_p_cover.png)
526 | background-size: cover
527 |
528 | class: inverse
529 |
530 | ## What can you do with n-grams?
531 |
532 | - .large[tf-idf of n-grams]
533 |
534 | --
535 |
536 | - .large[network analysis]
537 |
538 | --
539 |
540 | - .large[negation]
541 |
542 | ---
543 |
544 | background-image: url(figs/austen-1.png)
545 | background-size: 750px
546 |
547 | ---
548 |
549 | background-image: url(figs/slider.gif)
550 | background-position: 50% 70%
551 |
552 | ## What can you do with n-grams?
553 |
554 | ### [She Giggles, He Gallops](https://pudding.cool/2017/08/screen-direction/)
555 |
556 | ---
557 |
558 | class: left, middle
559 |
560 |
561 |
562 | # Thanks!
563 |
564 | tidytextmining.com
565 | @juliasilge
566 | @juliasilge
567 | juliasilge.com
568 | @dataandme
569 | @batpigandme
570 | maraaverick.rbind.io
571 |
572 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
573 |
--------------------------------------------------------------------------------
/intro.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Text Mining
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
912 |
913 |
914 |
959 |
960 |
970 |
971 |
990 |
991 |
1001 |
1002 |
1003 |
--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-21-1.png
--------------------------------------------------------------------------------
/intro_files/figure-html/unnamed-chunk-27-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-27-1.png
--------------------------------------------------------------------------------
/libs/remark-css/default.css:
--------------------------------------------------------------------------------
1 | a, a > code {
2 | color: rgb(249, 38, 114);
3 | text-decoration: none;
4 | }
5 | .footnote {
6 | position: absolute;
7 | bottom: 3em;
8 | padding-right: 4em;
9 | font-size: 90%;
10 | }
11 | .remark-code-line-highlighted { background-color: #ffff88; }
12 |
13 | .inverse {
14 | background-color: #272822;
15 | color: #d6d6d6;
16 | text-shadow: 0 0 20px #333;
17 | }
18 | .inverse h1, .inverse h2, .inverse h3 {
19 | color: #f3f3f3;
20 | }
21 | /* Two-column layout */
22 | .left-column {
23 | color: #777;
24 | width: 20%;
25 | height: 92%;
26 | float: left;
27 | }
28 | .left-column h2:last-of-type, .left-column h3:last-child {
29 | color: #000;
30 | }
31 | .right-column {
32 | width: 75%;
33 | float: right;
34 | padding-top: 1em;
35 | }
36 | .pull-left {
37 | float: left;
38 | width: 47%;
39 | }
40 | .pull-right {
41 | float: right;
42 | width: 47%;
43 | }
44 | .pull-right ~ * {
45 | clear: both;
46 | }
47 | img, video, iframe {
48 | max-width: 100%;
49 | }
50 | blockquote {
51 | border-left: solid 5px lightgray;
52 | padding-left: 1em;
53 | }
54 | .remark-slide table {
55 | margin: auto;
56 | border-top: 1px solid #666;
57 | border-bottom: 1px solid #666;
58 | }
59 | .remark-slide table thead th { border-bottom: 1px solid #ddd; }
60 | th, td { padding: 5px; }
61 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #eee }
62 |
63 | @page { margin: 0; }
64 | @media print {
65 | .remark-slide-scaler {
66 | width: 100% !important;
67 | height: 100% !important;
68 | transform: scale(1) !important;
69 | top: 0 !important;
70 | left: 0 !important;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/modeling.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Text Modeling"
3 | subtitle: " USING TIDY DATA PRINCIPLES"
4 | author: "Julia Silge | SDSS | 29 May 2019"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: github
11 | highlightLines: true
12 | countIncrementalSlides: false
13 | ratio: "16:9"
14 | seal: false
15 | includes:
16 | in_header: header.html
17 | ---
18 |
19 | ```{r setup, include=FALSE}
20 | options(htmltools.dir.version = FALSE)
21 | library(knitr)
22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180)
23 | library(ggplot2)
24 | library(silgelib)
25 | theme_set(theme_roboto())
26 | ```
27 |
28 | layout: true
29 |
30 |
31 |
32 | ---
33 |
34 | class: inverse, center, middle
35 |
36 | background-image: url(figs/p_and_p_cover.png)
37 | background-size: cover
38 |
39 |
40 | # Text Modeling
41 |
42 |
43 |
44 | ### USING TIDY PRINCIPLES
45 |
46 | .large[Julia Silge | SDSS | 29 May 2019]
47 |
48 | ---
49 |
50 | ## Let's install some packages
51 |
52 | ```{r, eval=FALSE}
53 | install.packages(c("tidyverse",
54 | "tidytext",
55 | "gutenbergr",
56 | "stm",
57 | "glmnet",
58 | "yardstick"))
59 | ```
60 |
61 | ---
62 |
63 | class: right, middle
64 |
65 |
66 |
67 | # Find us at...
68 |
69 | @juliasilge
70 | @juliasilge
71 | juliasilge.com
72 |
73 | ---
74 |
75 | class: right, middle
76 |
77 |
78 |
79 | # Find us at...
80 |
81 | @dataandme
82 | @batpigandme
83 | maraaverick.rbind.io
84 |
85 | ---
86 |
87 | class: right, inverse, middle
88 |
89 | background-image: url(figs/p_and_p_cover.png)
90 | background-size: cover
91 |
92 | # TIDYING AND CASTING
93 |
94 |
95 |
96 | ---
97 |
98 | background-image: url(figs/tmwr_0601.png)
99 | background-size: 900px
100 |
101 | ---
102 |
103 | class: inverse
104 |
105 | background-image: url(figs/p_and_p_cover.png)
106 | background-size: cover
107 |
108 | # Two powerful NLP techniques
109 |
110 | --
111 |
112 | - .large[Topic modeling]
113 |
114 | --
115 |
116 | - .large[Text classification]
117 |
118 | ---
119 |
120 | class: inverse
121 |
122 | background-image: url(figs/p_and_p_cover.png)
123 | background-size: cover
124 |
125 | # Topic modeling
126 |
127 | - .large[Each DOCUMENT = mixture of topics]
128 |
129 | --
130 |
131 | - .large[Each TOPIC = mixture of words]
132 |
133 | ---
134 |
135 | class: top
136 |
137 | background-image: url(figs/top_tags-1.png)
138 | background-size: 800px
139 |
140 | ---
141 |
142 | class: center, middle, inverse
143 |
144 | background-image: url(figs/p_and_p_cover.png)
145 | background-size: cover
146 |
147 | # GREAT LIBRARY HEIST `r emo::ji("sleuth")`
148 |
149 | ---
150 |
151 | ## **Downloading your text data**
152 |
153 | ```{r}
154 | library(tidyverse)
155 | library(gutenbergr)
156 |
157 | titles <- c("Twenty Thousand Leagues under the Sea",
158 | "The War of the Worlds",
159 | "Pride and Prejudice",
160 | "Great Expectations")
161 |
162 | books <- gutenberg_works(title %in% titles) %>%
163 | gutenberg_download(meta_fields = "title")
164 |
165 | books
166 | ```
167 |
168 | ---
169 |
170 | ## **Someone has torn your books apart!** `r emo::ji("sob")`
171 |
172 |
173 | ```{r}
174 | by_chapter <- books %>%
175 | group_by(title) %>%
176 | mutate(chapter = cumsum(str_detect(text,
177 | regex("^chapter ",
178 | ignore_case = TRUE)))) %>%
179 | ungroup() %>%
180 | filter(chapter > 0) %>%
181 | unite(document, title, chapter)
182 |
183 | by_chapter
184 | ```
185 |
186 | ---
187 |
188 | ## **Can we put them back together?**
189 |
190 | ```{r}
191 | library(tidytext)
192 |
193 | word_counts <- by_chapter %>%
194 | unnest_tokens(word, text) %>% #<<
195 | anti_join(get_stopwords(source = "smart")) %>%
196 | count(document, word, sort = TRUE)
197 |
198 | word_counts
199 |
200 | ```
201 |
202 | ---
203 |
204 | ## **Can we put them back together?**
205 |
206 | ```{r}
207 | words_sparse <- word_counts %>%
208 | cast_sparse(document, word, n) #<<
209 |
210 | class(words_sparse)
211 | ```
212 |
213 | ---
214 |
215 | ## **Train a topic model**
216 |
217 | Use a sparse matrix or a `quanteda::dfm` object as input
218 |
219 | ```{r}
220 | library(stm)
221 |
222 | topic_model <- stm(words_sparse, K = 4,
223 | verbose = FALSE, init.type = "Spectral")
224 |
225 | summary(topic_model)
226 | ```
227 |
228 | ---
229 |
230 | ## **Exploring the output of topic modeling**
231 |
232 | .large[Time for tidying!]
233 |
234 | ```{r}
235 | chapter_topics <- tidy(topic_model, matrix = "beta")
236 |
237 | chapter_topics
238 | ```
239 |
240 | ---
241 |
242 | ## **Exploring the output of topic modeling**
243 |
244 | ```{r}
245 | top_terms <- chapter_topics %>%
246 | group_by(topic) %>%
247 | top_n(10, beta) %>%
248 | ungroup() %>%
249 | arrange(topic, -beta)
250 |
251 | top_terms
252 | ```
253 |
254 | ---
255 | ## **Exploring the output of topic modeling**
256 |
257 | ```{r, eval=FALSE}
258 | top_terms %>%
259 | mutate(term = fct_reorder(term, beta)) %>%
260 | ggplot(aes(term, beta, fill = factor(topic))) +
261 | geom_col(show.legend = FALSE) +
262 | facet_wrap(~ topic, scales = "free") +
263 | coord_flip()
264 | ```
265 |
266 | ---
267 |
268 | ```{r, echo=FALSE, fig.height=4}
269 | top_terms %>%
270 | ggplot(aes(reorder_within(term, beta, topic), beta, fill = factor(topic))) +
271 | geom_col(show.legend = FALSE) +
272 | facet_wrap(~ topic, scales = "free") +
273 | coord_flip() +
274 | scale_x_reordered() +
275 | scale_y_continuous(expand = c(0,0)) +
276 | labs(y = expression(beta), x = NULL)
277 | ```
278 |
279 | ---
280 |
281 | ## **How are documents classified?**
282 |
283 | ```{r}
284 | chapters_gamma <- tidy(topic_model, matrix = "gamma",
285 | document_names = rownames(words_sparse))
286 |
287 | chapters_gamma
288 | ```
289 |
290 | ---
291 |
292 | ## **How are documents classified?**
293 |
294 | ```{r}
295 | chapters_parsed <- chapters_gamma %>%
296 | separate(document, c("title", "chapter"),
297 | sep = "_", convert = TRUE)
298 |
299 | chapters_parsed
300 | ```
301 |
302 | ---
303 |
304 | ## **How are documents classified?**
305 |
306 | ```{r, eval=FALSE}
307 | chapters_parsed %>%
308 | mutate(title = fct_reorder(title, gamma * topic)) %>%
309 | ggplot(aes(factor(topic), gamma)) +
310 | geom_boxplot() +
311 | facet_wrap(~ title)
312 | ```
313 |
314 | ---
315 |
316 | ```{r, echo=FALSE, fig.height=4}
317 | chapters_parsed %>%
318 | mutate(title = fct_reorder(title, gamma * topic)) %>%
319 | ggplot(aes(factor(topic), gamma, color = factor(topic))) +
320 | geom_boxplot(show.legend = FALSE) +
321 | facet_wrap(~ title) +
322 | labs(x = "Topic", y = expression(gamma))
323 | ```
324 |
325 | ---
326 |
327 | class: center, middle, inverse
328 |
329 | background-image: url(figs/p_and_p_cover.png)
330 | background-size: cover
331 |
332 | # GOING FARTHER `r emo::ji("rocket")`
333 |
334 | ---
335 |
336 | ## Tidying model output
337 |
338 | ### Which words in each document are assigned to which topics?
339 |
340 | - .large[`augment()`]
341 | - .large[Add information to each observation in the original data]
342 |
343 | ---
344 |
345 | background-image: url(figs/stm_video.png)
346 | background-size: 850px
347 |
348 | ---
349 |
350 | ## **Using stm**
351 |
352 | - .large[Document-level covariates]
353 |
354 | ```{r, eval=FALSE}
355 | topic_model <- stm(words_sparse, K = 0, init.type = "Spectral",
356 | prevalence = ~s(Year),
357 | data = covariates,
358 | verbose = FALSE)
359 | ```
360 |
361 | - .large[Use functions for `semanticCoherence()`, `checkResiduals()`, `exclusivity()`, and more!]
362 |
363 | - .large[Check out http://www.structuraltopicmodel.com/]
364 |
365 | - .large[See [my blog post](https://juliasilge.com/blog/evaluating-stm/) for how to choose `K`, the number of topics]
366 |
367 | ---
368 |
369 |
370 | background-image: url(figs/model_diagnostic-1.png)
371 | background-position: 50% 50%
372 | background-size: 950px
373 |
374 | ---
375 |
376 | # Stemming?
377 |
378 | .large[Advice from [Schofield & Mimno](https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf)]
379 |
380 | .large["Comparing Apples to Apple: The Effects of Stemmers on Topic Models"]
381 |
382 | ---
383 |
384 | class: right, middle
385 |
386 |
387 |
388 | Despite their frequent use in topic modeling, we find that stemmers produce no meaningful improvement in likelihood and coherence and in fact can degrade topic stability.
389 |
390 |
391 |
392 | ---
393 |
394 | class: right, middle, inverse
395 |
396 | background-image: url(figs/p_and_p_cover.png)
397 | background-size: cover
398 |
399 |
400 | # TEXT CLASSIFICATION
401 |
402 |
403 | ---
404 |
405 | ## **Downloading your text data**
406 |
407 | ```{r}
408 | library(tidyverse)
409 | library(gutenbergr)
410 |
411 | titles <- c("The War of the Worlds",
412 | "Pride and Prejudice")
413 |
414 | books <- gutenberg_works(title %in% titles) %>%
415 | gutenberg_download(meta_fields = "title") %>%
416 | mutate(document = row_number())
417 |
418 | books
419 | ```
420 |
421 | ---
422 |
423 | ## **Making a tidy dataset**
424 |
425 | .large[Use this kind of data structure for EDA! `r emo::ji("nail")`]
426 |
427 | ```{r}
428 | library(tidytext)
429 |
430 | tidy_books <- books %>% # one row per token
431 | unnest_tokens(word, text) %>% #<<
432 | group_by(word) %>%
433 | filter(n() > 10) %>% # keep only words that occur more than 10 times overall
434 | ungroup()
435 |
436 | tidy_books
437 | ```
438 |
439 | ---
440 |
441 | ## **Cast to a sparse matrix**
442 |
443 | .large[And build a dataframe with a response variable]
444 |
445 | ```{r}
446 | sparse_words <- tidy_books %>%
447 | count(document, word, sort = TRUE) %>%
448 | cast_sparse(document, word, n) #<<
449 |
450 | books_joined <- tibble(document = as.integer(rownames(sparse_words))) %>%
451 | left_join(books %>% # attach each document's title, in the row order of sparse_words
452 | select(document, title), by = "document")
453 | ```
454 |
455 | ---
456 |
457 | ## **Train a glmnet model**
458 |
459 | ```{r}
460 | library(glmnet)
461 | library(doMC)
462 | registerDoMC(cores = 8)
463 |
464 | is_jane <- books_joined$title == "Pride and Prejudice"
465 |
466 | model <- cv.glmnet(sparse_words, is_jane, family = "binomial",
467 | parallel = TRUE, keep = TRUE)
468 |
469 | ```
470 |
471 | ---
472 |
473 | ## **Tidying our model**
474 |
475 | .large[Tidy, then filter to choose some lambda from glmnet output]
476 |
477 | ```{r}
478 | library(broom)
479 |
480 | coefs <- model$glmnet.fit %>%
481 | tidy() %>%
482 | filter(lambda == model$lambda.1se)
483 |
484 | Intercept <- coefs %>%
485 | filter(term == "(Intercept)") %>%
486 | pull(estimate)
487 | ```
488 |
489 | ---
490 |
491 | ## **Tidying our model**
492 |
493 | ```{r}
494 | classifications <- tidy_books %>%
495 | inner_join(coefs, by = c("word" = "term")) %>%
496 | group_by(document) %>%
497 | summarize(score = sum(estimate)) %>%
498 | mutate(probability = plogis(Intercept + score))
499 |
500 | classifications
501 | ```
502 |
503 | ---
504 |
505 | ## **Understanding our model**
506 |
507 | ```{r, eval=FALSE}
508 | coefs %>%
509 | group_by(estimate > 0) %>%
510 | top_n(10, abs(estimate)) %>% # 10 largest-magnitude coefficients of each sign
511 | ungroup() %>%
512 | ggplot(aes(fct_reorder(term, estimate),
513 | estimate,
514 | fill = estimate > 0)) +
515 | geom_col(show.legend = FALSE) +
516 | coord_flip()
517 | ```
518 |
519 | ---
520 |
521 | ```{r, echo = FALSE, fig.height=4}
522 | coefs %>%
523 | group_by(estimate > 0) %>%
524 | top_n(10, abs(estimate)) %>% # 10 largest-magnitude coefficients of each sign
525 | ungroup() %>%
526 | ggplot(aes(fct_reorder(term, estimate), estimate, fill = estimate > 0)) +
527 | geom_col(show.legend = FALSE) +
528 | coord_flip() +
529 | labs(x = NULL,
530 | title = "Coefficients that increase/decrease probability",
531 | subtitle = "A document mentioning Martians is unlikely to be written by Jane Austen")
532 | ```
533 |
534 | ---
535 |
536 | ## **ROC**
537 |
538 | ```{r}
539 | library(yardstick)
540 |
541 | comment_classes <- classifications %>%
542 | left_join(books %>%
543 | select(title, document), by = "document") %>%
544 | mutate(title = as.factor(title))
545 | ```
546 |
547 | ---
548 |
549 | ## **ROC**
550 |
551 | ```{r eval=FALSE}
552 | comment_classes %>%
553 | roc_curve(title, probability) %>%
554 | ggplot(aes(x = 1 - specificity, y = sensitivity)) +
555 | geom_line(
556 | color = "midnightblue",
557 | size = 1.5
558 | ) +
559 | geom_abline(
560 | lty = 2, alpha = 0.5,
561 | color = "gray50",
562 | size = 1.2
563 | )
564 | ```
565 |
566 | ---
567 |
568 | ```{r, echo = FALSE, fig.height=4}
569 | comment_classes %>%
570 | roc_curve(title, probability) %>%
571 | ggplot(aes(x = 1 - specificity, y = sensitivity)) +
572 | geom_line(
573 | color = "midnightblue",
574 | size = 1.5
575 | ) +
576 | geom_abline(
577 | lty = 2, alpha = 0.5,
578 | color = "gray50",
579 | size = 1.2
580 | ) +
581 | labs(
582 | title = "ROC curve for text classification"
583 | )
584 | ```
585 |
586 | ---
587 |
588 | ## **AUC for model**
589 |
590 | ```{r}
591 | comment_classes %>%
592 | roc_auc(title, probability)
593 | ```
594 |
595 | ---
596 |
597 | ## **Confusion matrix**
598 |
599 | ```{r}
600 | comment_classes %>%
601 | mutate(
602 | prediction = case_when(
603 | probability > 0.5 ~ "Pride and Prejudice",
604 | TRUE ~ "The War of the Worlds"
605 | ),
606 | prediction = as.factor(prediction)
607 | ) %>%
608 | conf_mat(title, prediction)
609 | ```
610 |
611 | ---
612 |
613 | ## **Misclassifications**
614 |
615 | Let's talk about misclassifications. Which documents here were incorrectly predicted to be written by Jane Austen?
616 |
617 | ```{r}
618 | comment_classes %>%
619 | filter(
620 | probability > .8, #<<
621 | title == "The War of the Worlds" #<<
622 | ) %>%
623 | sample_n(10) %>% # random draw of 10 of the matching rows
624 | inner_join(books %>%
625 | select(document, text), by = "document") %>%
626 | select(probability, text)
627 | ```
628 |
629 | ---
630 |
631 | ## **Misclassifications**
632 |
633 | Let's talk about misclassifications. Which documents here were incorrectly predicted to *not* be written by Jane Austen?
634 |
635 | ```{r}
636 | comment_classes %>%
637 | filter(
638 | probability < .3, #<<
639 | title == "Pride and Prejudice" #<<
640 | ) %>%
641 | sample_n(10) %>% # random draw of 10 of the matching rows
642 | inner_join(books %>%
643 | select(document, text), by = "document") %>%
644 | select(probability, text)
645 | ```
646 |
647 | ---
648 |
649 | background-image: url(figs/tmwr_0601.png)
650 | background-position: 50% 70%
651 | background-size: 750px
652 |
653 | ## **Workflow for text mining/modeling**
654 |
655 | ---
656 |
657 | background-image: url(figs/lizzieskipping.gif)
658 | background-position: 50% 55%
659 | background-size: 750px
660 |
661 | # **Go explore real-world text!**
662 |
663 | ---
664 |
665 | class: left, middle
666 |
667 |
668 |
669 | # Thanks!
670 |
671 | tidytextmining.com
672 | @juliasilge
673 | @juliasilge
674 | juliasilge.com
675 | @dataandme
676 | @batpigandme
677 | maraaverick.rbind.io
678 |
679 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan)
680 |
--------------------------------------------------------------------------------
/modeling.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Text Modeling
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
901 |
902 |
903 |
948 |
949 |
959 |
960 |
979 |
980 |
990 |
991 |
992 |
--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-14-1.png
--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-23-1.png
--------------------------------------------------------------------------------
/modeling_files/figure-html/unnamed-chunk-26-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-26-1.png
--------------------------------------------------------------------------------
/pdfs/juliasilge-textmining-sdss-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textmining-sdss-2.pdf
--------------------------------------------------------------------------------
/pdfs/juliasilge-textminnig-sdss-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textminnig-sdss-1.pdf
--------------------------------------------------------------------------------
/sdss2019.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------