├── .gitignore ├── README.md ├── css ├── footer_plus.css └── xaringan-themer.css ├── figs ├── austen-1.png ├── blue_jane.png ├── cover.png ├── lizzieskipping.gif ├── model_diagnostic-1.png ├── p_and_p_cover.png ├── plot_tf_idf-1.png ├── purple_emily.png ├── slider.gif ├── stm_video.png ├── stop.gif ├── tidytext_repo.png ├── tilecounts-1.png ├── tilerate-1.png ├── tmwr_0601.png ├── top_tags-1.png └── vexing.gif ├── header.html ├── intro.Rmd ├── intro.html ├── intro_files └── figure-html │ ├── unnamed-chunk-13-1.png │ ├── unnamed-chunk-21-1.png │ └── unnamed-chunk-27-1.png ├── libs └── remark-css │ └── default.css ├── modeling.Rmd ├── modeling.html ├── modeling_files └── figure-html │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-14-1.png │ ├── unnamed-chunk-23-1.png │ └── unnamed-chunk-26-1.png ├── pdfs ├── juliasilge-textmining-sdss-2.pdf └── juliasilge-textminnig-sdss-1.pdf └── sdss2019.Rproj /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *cache* 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tidy Text Mining at SDSS 2019 2 | 3 | Slides for my short course on text mining at the Symposium on Data Science and Statistics 4 | in May 2019 5 | 6 | Check out the slides at [bit.ly/silge-sdss-1](https://bit.ly/silge-sdss-1) and [bit.ly/silge-sdss-2](https://bit.ly/silge-sdss-2)! 
7 | 8 | 9 | Slides created with [remark.js](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan) 10 | 11 | My xaringan theme (from [xaringanthemer](https://pkg.garrickadenbuie.com/xaringanthemer/)): 12 | 13 | ``` 14 | mono_accent( 15 | base_color = "#09017F", 16 | header_font_google = google_font("Roboto", "700"), 17 | text_font_google = google_font("Roboto Condensed"), 18 | code_font_google = google_font("Droid Mono"), 19 | code_highlight_color = "#D2B6E8" 20 | ) 21 | ``` 22 | -------------------------------------------------------------------------------- /css/footer_plus.css: -------------------------------------------------------------------------------- 1 | .large { font-size: 160% } 2 | 3 | .title-slide { 4 | background-image: url(../figs/p_and_p_cover.png); 5 | background-size: cover; 6 | } 7 | 8 | .title-slide .remark-slide-number { 9 | display: none; 10 | } 11 | 12 | .remark-slide-number { 13 | display: none; 14 | } 15 | 16 | div.my-footer { 17 | background-color: #050045; 18 | position: absolute; 19 | bottom: 0px; 20 | left: 0px; 21 | height: 20px; 22 | width: 100%; 23 | } 24 | div.my-footer span { 25 | font-size: 10pt; 26 | color: #F7F8FA; 27 | position: absolute; 28 | left: 15px; 29 | bottom: 2px; 30 | } 31 | -------------------------------------------------------------------------------- /css/xaringan-themer.css: -------------------------------------------------------------------------------- 1 | /* ------------------------------------------------------- 2 | * 3 | * !! This file was generated by xaringanthemer !! 4 | * 5 | * Changes made to this file directly will be overwritten 6 | * if you used xaringanthemer in your xaringan slides Rmd 7 | * 8 | * Issues or likes? 9 | * - https://github.com/gadenbuie/xaringanthemer 10 | * - https://www.garrickadenbuie.com 11 | * 12 | * Need help? 
Try: 13 | * - vignette(package = "xaringanthemer") 14 | * - ?xaringanthemer::write_xaringan_theme 15 | * - xaringan wiki: https://github.com/yihui/xaringan/wiki 16 | * - remarkjs wiki: https://github.com/gnab/remark/wiki 17 | * 18 | * ------------------------------------------------------- */ 19 | @import url(https://fonts.googleapis.com/css?family=Roboto+Condensed); 20 | @import url(https://fonts.googleapis.com/css?family=Roboto:700); 21 | @import url(https://fonts.googleapis.com/css?family=Droid+Mono); 22 | 23 | 24 | body { 25 | font-family: Roboto Condensed, 'Palatino Linotype', 'Book Antiqua', Palatino, 'Microsoft YaHei', 'Songti SC', serif; 26 | font-weight: ; 27 | color: #272822; 28 | } 29 | h1, h2, h3 { 30 | font-family: Roboto; 31 | font-weight: normal; 32 | color: #09017F; 33 | } 34 | .remark-slide-content { 35 | background-color: #FFFFFF; 36 | font-size: 20px; 37 | 38 | 39 | 40 | padding: 1em 4em 1em 4em; 41 | } 42 | .remark-slide-content h1 { 43 | font-size: 55px; 44 | } 45 | .remark-slide-content h2 { 46 | font-size: 45px; 47 | } 48 | .remark-slide-content h3 { 49 | font-size: 35px; 50 | } 51 | .remark-code, .remark-inline-code { 52 | font-family: Droid Mono, 'Lucida Console', Monaco, monospace; 53 | } 54 | .remark-code { 55 | font-size: 0.9em; 56 | } 57 | .remark-inline-code { 58 | font-size: 1em; 59 | color: #09017F; 60 | 61 | 62 | } 63 | .remark-slide-number { 64 | color: #09017F; 65 | opacity: 1; 66 | font-size: 0.9em; 67 | } 68 | strong{color:#09017F;} 69 | a, a > code { 70 | color: #09017F; 71 | text-decoration: none; 72 | } 73 | .footnote { 74 | 75 | position: absolute; 76 | bottom: 3em; 77 | padding-right: 4em; 78 | font-size: 0.9em; 79 | } 80 | .remark-code-line-highlighted { 81 | background-color: #D2B6E8; 82 | } 83 | .inverse { 84 | background-color: #09017F; 85 | color: #FFFFFF; 86 | 87 | } 88 | .inverse h1, .inverse h2, .inverse h3 { 89 | color: #FFFFFF; 90 | } 91 | .title-slide, .title-slide h1, .title-slide h2, .title-slide h3 { 92 | 
color: #FFFFFF; 93 | } 94 | .title-slide { 95 | background-color: #09017F; 96 | 97 | 98 | 99 | } 100 | .title-slide .remark-slide-number { 101 | display: none; 102 | } 103 | /* Two-column layout */ 104 | .left-column { 105 | width: 20%; 106 | height: 92%; 107 | float: left; 108 | } 109 | .left-column h2, .left-column h3 { 110 | color: #09017F99; 111 | } 112 | .left-column h2:last-of-type, .left-column h3:last-child { 113 | color: #09017F; 114 | } 115 | .right-column { 116 | width: 75%; 117 | float: right; 118 | padding-top: 1em; 119 | } 120 | .pull-left { 121 | float: left; 122 | width: 47%; 123 | } 124 | .pull-right { 125 | float: right; 126 | width: 47%; 127 | } 128 | .pull-right ~ * { 129 | clear: both; 130 | } 131 | img, video, iframe { 132 | max-width: 100%; 133 | } 134 | blockquote { 135 | border-left: solid 5px #09017F80; 136 | padding-left: 1em; 137 | } 138 | .remark-slide table { 139 | margin: auto; 140 | border-top: 1px solid #666; 141 | border-bottom: 1px solid #666; 142 | } 143 | .remark-slide table thead th { border-bottom: 1px solid #ddd; } 144 | th, td { padding: 5px; } 145 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #B5B2D8 } 146 | table.dataTable tbody { 147 | background-color: #FFFFFF; 148 | color: #272822; 149 | } 150 | table.dataTable.display tbody tr.odd { 151 | background-color: #FFFFFF; 152 | } 153 | table.dataTable.display tbody tr.even { 154 | background-color: #B5B2D8; 155 | } 156 | table.dataTable.hover tbody tr:hover, table.dataTable.display tbody tr:hover { 157 | background-color: rgba(255, 255, 255, 0.5); 158 | } 159 | .dataTables_wrapper .dataTables_length, .dataTables_wrapper .dataTables_filter, .dataTables_wrapper .dataTables_info, .dataTables_wrapper .dataTables_processing, .dataTables_wrapper .dataTables_paginate { 160 | color: #272822; 161 | } 162 | .dataTables_wrapper .dataTables_paginate .paginate_button { 163 | color: #272822 !important; 164 | } 165 | 166 | @page { margin: 0; } 
167 | @media print { 168 | .remark-slide-scaler { 169 | width: 100% !important; 170 | height: 100% !important; 171 | transform: scale(1) !important; 172 | top: 0 !important; 173 | left: 0 !important; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /figs/austen-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/austen-1.png -------------------------------------------------------------------------------- /figs/blue_jane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/blue_jane.png -------------------------------------------------------------------------------- /figs/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/cover.png -------------------------------------------------------------------------------- /figs/lizzieskipping.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/lizzieskipping.gif -------------------------------------------------------------------------------- /figs/model_diagnostic-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/model_diagnostic-1.png -------------------------------------------------------------------------------- /figs/p_and_p_cover.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/p_and_p_cover.png -------------------------------------------------------------------------------- /figs/plot_tf_idf-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/plot_tf_idf-1.png -------------------------------------------------------------------------------- /figs/purple_emily.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/purple_emily.png -------------------------------------------------------------------------------- /figs/slider.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/slider.gif -------------------------------------------------------------------------------- /figs/stm_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stm_video.png -------------------------------------------------------------------------------- /figs/stop.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/stop.gif -------------------------------------------------------------------------------- /figs/tidytext_repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tidytext_repo.png -------------------------------------------------------------------------------- 
/figs/tilecounts-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilecounts-1.png -------------------------------------------------------------------------------- /figs/tilerate-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tilerate-1.png -------------------------------------------------------------------------------- /figs/tmwr_0601.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/tmwr_0601.png -------------------------------------------------------------------------------- /figs/top_tags-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/top_tags-1.png -------------------------------------------------------------------------------- /figs/vexing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/figs/vexing.gif -------------------------------------------------------------------------------- /header.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Text Mining" 3 | subtitle: "

USING TIDY DATA PRINCIPLES" 4 | author: "Julia Silge | SDSS | 29 May 2019" 5 | output: 6 | xaringan::moon_reader: 7 | css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"] 8 | lib_dir: libs 9 | nature: 10 | highlightStyle: github 11 | highlightLines: true 12 | countIncrementalSlides: false 13 | ratio: "16:9" 14 | seal: false 15 | includes: 16 | in_header: header.html 17 | --- 18 | 19 | ```{r setup, include=FALSE} 20 | options(htmltools.dir.version = FALSE) 21 | library(knitr) 22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180) 23 | library(ggplot2) 24 | library(silgelib) 25 | theme_set(theme_roboto()) 26 | ``` 27 | 28 | layout: true 29 | 30 | 31 | 32 | --- 33 | 34 | class: inverse, center, middle 35 | 36 | background-image: url(figs/p_and_p_cover.png) 37 | background-size: cover 38 | 39 | 40 | # Text Mining 41 | 42 | 43 | 44 | ### USING TIDY PRINCIPLES 45 | 46 | .large[Julia Silge | SDSS | 29 May 2019] 47 | 48 | --- 49 | 50 | ## Let's install some packages 51 | 52 | ```{r, eval=FALSE} 53 | install.packages(c("tidyverse", 54 | "tidytext", 55 | "gutenbergr")) 56 | ``` 57 | 58 | --- 59 | 60 | class: right, middle 61 | 62 | 63 | 64 | # Find us at... 65 | 66 |   @juliasilge
67 |   @juliasilge
68 |   juliasilge.com
69 | 70 | --- 71 | 72 | class: right, middle 73 | 74 | 75 | 76 | # Find us at... 77 | 78 |   @dataandme
79 |   @batpigandme
80 |   maraaverick.rbind.io
81 | 82 | --- 83 | 84 | class: inverse 85 | 86 | ## Text in the real world 87 | 88 | -- 89 | 90 | - .large[Text data is increasingly important `r emo::ji("books")`] 91 | 92 | -- 93 | 94 | - .large[NLP training is scarce on the ground `r emo::ji("scream")`] 95 | 96 | --- 97 | 98 | background-image: url(figs/vexing.gif) 99 | background-position: 50% 50% 100 | background-size: 650px 101 | 102 | --- 103 | 104 | background-image: url(figs/p_and_p_cover.png) 105 | background-size: cover 106 | 107 | class: inverse, center, middle 108 | 109 | # TIDY DATA PRINCIPLES + TEXT MINING = `r emo::ji("tada")` 110 | 111 | --- 112 | 113 | background-image: url(figs/tidytext_repo.png) 114 | background-size: 800px 115 | background-position: 50% 20% 116 | 117 | class: bottom, right 118 | 119 | .large[[https://github.com/juliasilge/tidytext](https://github.com/juliasilge/tidytext)] 120 | 121 | .large[[http://tidytextmining.com/](http://tidytextmining.com/)] 122 | 123 | --- 124 | 125 | background-image: url(figs/cover.png) 126 | background-size: 450px 127 | background-position: 50% 50% 128 | 129 | --- 130 | 131 | 132 | 133 | ## What do we mean by tidy text? 134 | 135 | 136 | ```{r} 137 | text <- c("Because I could not stop for Death -", 138 | "He kindly stopped for me -", 139 | "The Carriage held but just Ourselves -", 140 | "and Immortality") 141 | 142 | text 143 | ``` 144 | 145 | --- 146 | 147 | 148 | 149 | ## What do we mean by tidy text? 150 | 151 | ```{r} 152 | library(tidyverse) 153 | text_df <- tibble(line = 1:4, text = text) 154 | 155 | text_df 156 | ``` 157 | 158 | --- 159 | 160 | 161 | 162 | ## What do we mean by tidy text? 
163 | 164 | ```{r} 165 | library(tidytext) 166 | 167 | text_df %>% 168 | unnest_tokens(word, text) #<< 169 | ``` 170 | 171 | --- 172 | 173 | ## Gathering more data 174 | 175 | .large[You can access the full text of many public domain works from [Project Gutenberg](https://www.gutenberg.org/) using the [gutenbergr](https://ropensci.org/tutorials/gutenbergr_tutorial.html) package.] 176 | 177 | 178 | ```{r} 179 | library(gutenbergr) 180 | 181 | full_text <- gutenberg_download(1342) 182 | ``` 183 | 184 | .large[What book do *you* want to analyze today? `r emo::ji_glue(":book: :partying: :book:")`] 185 | 186 | --- 187 | 188 | ## Time to tidy your text! 189 | 190 | ```{r} 191 | tidy_book <- full_text %>% 192 | mutate(line = row_number()) %>% 193 | unnest_tokens(word, text) #<< 194 | 195 | tidy_book 196 | ``` 197 | 198 | --- 199 | 200 | ## What are the most common words? 201 | 202 | ```{r} 203 | tidy_book %>% 204 | count(word, sort = TRUE) 205 | ``` 206 | 207 | --- 208 | 209 | background-image: url(figs/stop.gif) 210 | background-size: 500px 211 | background-position: 50% 50% 212 | 213 | ## Stop words 214 | 215 | --- 216 | 217 | ## Stop words 218 | 219 | ```{r} 220 | get_stopwords() 221 | ``` 222 | 223 | --- 224 | 225 | ## Stop words 226 | 227 | ```{r} 228 | get_stopwords(language = "es") 229 | ``` 230 | 231 | --- 232 | 233 | ## Stop words 234 | 235 | ```{r} 236 | get_stopwords(language = "pt") 237 | ``` 238 | 239 | --- 240 | 241 | ## Stop words 242 | 243 | ```{r} 244 | get_stopwords(source = "smart") 245 | ``` 246 | 247 | --- 248 | 249 | ## What are the most common words? 
250 | 251 | ```{r, eval = FALSE} 252 | tidy_book %>% 253 | anti_join(get_stopwords(source = "smart")) %>% 254 | count(word, sort = TRUE) %>% 255 | top_n(20) %>% 256 | ggplot(aes(fct_reorder(word, n), n)) + #<< 257 | geom_col() + 258 | coord_flip() 259 | ``` 260 | 261 | --- 262 | 263 | ```{r, echo=FALSE, fig.height=4} 264 | tidy_book %>% 265 | anti_join(get_stopwords(source = "smart")) %>% 266 | count(word, sort = TRUE) %>% 267 | top_n(20) %>% 268 | ggplot(aes(fct_reorder(word, n), n)) + 269 | geom_col(fill = "midnightblue", alpha = 0.9) + 270 | coord_flip() + 271 | scale_y_continuous(expand = c(0,0)) + 272 | labs(x = NULL, y = "Number of occurrences") 273 | ``` 274 | 275 | --- 276 | 277 | background-image: url(figs/tilecounts-1.png) 278 | background-size: 700px 279 | 280 | --- 281 | 282 | background-image: url(figs/tilerate-1.png) 283 | background-size: 700px 284 | 285 | --- 286 | 287 | background-image: url(figs/p_and_p_cover.png) 288 | background-size: cover 289 | 290 | class: inverse, center, middle 291 | 292 | ## SENTIMENT ANALYSIS `r emo::ji_glue(":smile: :cry: :angry:")` 293 | 294 | --- 295 | 296 | ## Sentiment lexicons 297 | 298 | ```{r} 299 | get_sentiments("afinn") 300 | ``` 301 | 302 | --- 303 | 304 | ## Sentiment lexicons 305 | 306 | ```{r} 307 | get_sentiments("bing") 308 | ``` 309 | 310 | --- 311 | 312 | ## Sentiment lexicons 313 | 314 | 315 | ```{r} 316 | get_sentiments("nrc") 317 | ``` 318 | 319 | --- 320 | 321 | ## Sentiment lexicons 322 | 323 | ```{r} 324 | get_sentiments("loughran") 325 | ``` 326 | 327 | --- 328 | 329 | ## Implementing sentiment analysis 330 | 331 | ```{r} 332 | tidy_book %>% 333 | inner_join(get_sentiments("bing")) %>% #<< 334 | count(sentiment, sort = TRUE) 335 | ``` 336 | 337 | --- 338 | 339 | ## Implementing sentiment analysis 340 | 341 | ```{r} 342 | tidy_book %>% 343 | inner_join(get_sentiments("bing")) %>% 344 | count(sentiment, word, sort = TRUE) #<< 345 | ``` 346 | 347 | --- 348 | 349 | ## Implementing sentiment analysis 
350 | 351 | ```{r, eval = FALSE} 352 | tidy_book %>% 353 | inner_join(get_sentiments("bing")) %>% 354 | count(sentiment, word, sort = TRUE) %>% 355 | group_by(sentiment) %>% 356 | top_n(10) %>% 357 | ungroup %>% 358 | ggplot(aes(fct_reorder(word, n), #<< 359 | n, 360 | fill = sentiment)) + 361 | geom_col() + 362 | coord_flip() + 363 | facet_wrap(~ sentiment, scales = "free") 364 | ``` 365 | 366 | --- 367 | 368 | class: middle 369 | 370 | ```{r, echo=FALSE, fig.height=4} 371 | tidy_book %>% 372 | inner_join(get_sentiments("bing")) %>% 373 | count(sentiment, word, sort = TRUE) %>% 374 | group_by(sentiment) %>% 375 | top_n(10) %>% 376 | ungroup %>% 377 | ggplot(aes(fct_reorder(word, n), n, fill = sentiment)) + 378 | geom_col(alpha = 0.9, show.legend = FALSE) + 379 | coord_flip() + 380 | facet_wrap(~ sentiment, scales = "free") + 381 | scale_y_continuous(expand = c(0,0)) + 382 | labs(x = NULL, y = "Number of occurrences") 383 | ``` 384 | 385 | --- 386 | 387 | background-image: url(figs/p_and_p_cover.png) 388 | background-size: cover 389 | 390 | class: inverse, center, middle 391 | 392 | ## WHAT IS A DOCUMENT ABOUT? `r emo::ji("thinking")` 393 | 394 | --- 395 | 396 | ## What is a document about? 397 | 398 | - .large[Term frequency] 399 | - .large[Inverse document frequency] 400 | 401 | $$idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}$$ 402 | 403 | ### tf-idf is about comparing **documents** within a **collection**. 404 | 405 | --- 406 | 407 | ## Understanding tf-idf 408 | 409 | .large[Make a collection (*corpus*) for yourself! 
`r emo::ji("nail")`] 410 | 411 | ```{r} 412 | full_collection <- gutenberg_download(c(1342, 158, 161, 141), 413 | meta_fields = "title") 414 | 415 | full_collection 416 | ``` 417 | 418 | --- 419 | 420 | ## Counting word frequencies in your collection 421 | 422 | ```{r} 423 | book_words <- full_collection %>% 424 | unnest_tokens(word, text) %>% #<< 425 | count(title, word, sort = TRUE) 426 | 427 | book_words 428 | ``` 429 | 430 | --- 431 | 432 | ## Calculating tf-idf 433 | 434 | .large[That's... super exciting???] 435 | 436 | ```{r} 437 | book_tfidf <- book_words %>% 438 | bind_tf_idf(word, title, n) #<< 439 | 440 | book_tfidf 441 | ``` 442 | 443 | --- 444 | 445 | ## Calculating tf-idf 446 | 447 | ```{r} 448 | book_tfidf %>% 449 | arrange(-tf_idf) 450 | ``` 451 | 452 | --- 453 | 454 | ## Calculating tf-idf 455 | 456 | ```{r, eval = FALSE} 457 | book_tfidf %>% 458 | group_by(title) %>% 459 | top_n(10, tf_idf) %>% 460 | ungroup %>% 461 | ggplot(aes(fct_reorder(word, tf_idf), #<< 462 | tf_idf, 463 | fill = title)) + 464 | geom_col(show.legend = FALSE) + 465 | coord_flip() + 466 | facet_wrap(~title, scales = "free") 467 | ``` 468 | 469 | --- 470 | 471 | ```{r, echo=FALSE, fig.height=4} 472 | book_tfidf %>% 473 | group_by(title) %>% 474 | top_n(10, tf_idf) %>% 475 | ungroup %>% 476 | ggplot(aes(fct_reorder(word, tf_idf), 477 | tf_idf, 478 | fill = title)) + 479 | geom_col(alpha = 0.9, show.legend = FALSE) + 480 | coord_flip() + 481 | facet_wrap(~title, scales = "free") + 482 | scale_y_continuous(expand = c(0,0)) + 483 | labs(x = NULL, y = "tf-idf") 484 | ``` 485 | 486 | --- 487 | 488 | background-image: url(figs/plot_tf_idf-1.png) 489 | background-size: 800px 490 | 491 | --- 492 | 493 | ## N-grams... and beyond! `r emo::ji("rocket")` 494 | 495 | ```{r} 496 | tidy_ngram <- full_text %>% 497 | unnest_tokens(bigram, text, token = "ngrams", n = 2) #<< 498 | 499 | tidy_ngram 500 | ``` 501 | 502 | --- 503 | 504 | ## N-grams... and beyond! 
`r emo::ji("rocket")` 505 | 506 | ```{r} 507 | tidy_ngram %>% 508 | count(bigram, sort = TRUE) 509 | ``` 510 | 511 | --- 512 | 513 | ## N-grams... and beyond! `r emo::ji("rocket")` 514 | 515 | ```{r} 516 | tidy_ngram %>% 517 | separate(bigram, c("word1", "word2"), sep = " ") %>% #<< 518 | filter(!word1 %in% stop_words$word, 519 | !word2 %in% stop_words$word) %>% 520 | count(word1, word2, sort = TRUE) 521 | ``` 522 | 523 | --- 524 | 525 | background-image: url(figs/p_and_p_cover.png) 526 | background-size: cover 527 | 528 | class: inverse 529 | 530 | ## What can you do with n-grams? 531 | 532 | - .large[tf-idf of n-grams] 533 | 534 | -- 535 | 536 | - .large[network analysis] 537 | 538 | -- 539 | 540 | - .large[negation] 541 | 542 | --- 543 | 544 | background-image: url(figs/austen-1.png) 545 | background-size: 750px 546 | 547 | --- 548 | 549 | background-image: url(figs/slider.gif) 550 | background-position: 50% 70% 551 | 552 | ## What can you do with n-grams? 553 | 554 | ### [She Giggles, He Gallops](https://pudding.cool/2017/08/screen-direction/) 555 | 556 | --- 557 | 558 | class: left, middle 559 | 560 | 561 | 562 | # Thanks! 563 | 564 |   tidytextmining.com
565 |   @juliasilge
566 |   @juliasilge
567 |   juliasilge.com
568 |   @dataandme
569 |   @batpigandme
570 |   maraaverick.rbind.io
571 | 572 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan) 573 | -------------------------------------------------------------------------------- /intro.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Text Mining 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 912 | 913 | 914 | 959 | 960 | 970 | 971 | 990 | 991 | 1001 | 1002 | 1003 | -------------------------------------------------------------------------------- /intro_files/figure-html/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /intro_files/figure-html/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /intro_files/figure-html/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/intro_files/figure-html/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /libs/remark-css/default.css: -------------------------------------------------------------------------------- 1 | a, a > code { 2 | color: rgb(249, 38, 114); 3 | text-decoration: none; 4 | } 5 | .footnote { 6 | position: absolute; 7 | bottom: 3em; 8 | padding-right: 4em; 9 | font-size: 90%; 10 | } 11 | .remark-code-line-highlighted { background-color: #ffff88; } 12 | 13 | .inverse { 14 | 
background-color: #272822; 15 | color: #d6d6d6; 16 | text-shadow: 0 0 20px #333; 17 | } 18 | .inverse h1, .inverse h2, .inverse h3 { 19 | color: #f3f3f3; 20 | } 21 | /* Two-column layout */ 22 | .left-column { 23 | color: #777; 24 | width: 20%; 25 | height: 92%; 26 | float: left; 27 | } 28 | .left-column h2:last-of-type, .left-column h3:last-child { 29 | color: #000; 30 | } 31 | .right-column { 32 | width: 75%; 33 | float: right; 34 | padding-top: 1em; 35 | } 36 | .pull-left { 37 | float: left; 38 | width: 47%; 39 | } 40 | .pull-right { 41 | float: right; 42 | width: 47%; 43 | } 44 | .pull-right ~ * { 45 | clear: both; 46 | } 47 | img, video, iframe { 48 | max-width: 100%; 49 | } 50 | blockquote { 51 | border-left: solid 5px lightgray; 52 | padding-left: 1em; 53 | } 54 | .remark-slide table { 55 | margin: auto; 56 | border-top: 1px solid #666; 57 | border-bottom: 1px solid #666; 58 | } 59 | .remark-slide table thead th { border-bottom: 1px solid #ddd; } 60 | th, td { padding: 5px; } 61 | .remark-slide thead, .remark-slide tfoot, .remark-slide tr:nth-child(even) { background: #eee } 62 | 63 | @page { margin: 0; } 64 | @media print { 65 | .remark-slide-scaler { 66 | width: 100% !important; 67 | height: 100% !important; 68 | transform: scale(1) !important; 69 | top: 0 !important; 70 | left: 0 !important; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /modeling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Text Modeling" 3 | subtitle: "

USING TIDY DATA PRINCIPLES" 4 | author: "Julia Silge | SDSS | 29 May 2019" 5 | output: 6 | xaringan::moon_reader: 7 | css: ["default", "css/xaringan-themer.css", "css/footer_plus.css"] 8 | lib_dir: libs 9 | nature: 10 | highlightStyle: github 11 | highlightLines: true 12 | countIncrementalSlides: false 13 | ratio: "16:9" 14 | seal: false 15 | includes: 16 | in_header: header.html 17 | --- 18 | 19 | ```{r setup, include=FALSE} 20 | options(htmltools.dir.version = FALSE) 21 | library(knitr) 22 | knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, dpi = 180) 23 | library(ggplot2) 24 | library(silgelib) 25 | theme_set(theme_roboto()) 26 | ``` 27 | 28 | layout: true 29 | 30 | 31 | 32 | --- 33 | 34 | class: inverse, center, middle 35 | 36 | background-image: url(figs/p_and_p_cover.png) 37 | background-size: cover 38 | 39 | 40 | # Text Modeling 41 | 42 | 43 | 44 | ### USING TIDY PRINCIPLES 45 | 46 | .large[Julia Silge | SDSS | 29 May 2019] 47 | 48 | --- 49 | 50 | ## Let's install some packages 51 | 52 | ```{r, eval=FALSE} 53 | install.packages(c("tidyverse", 54 | "tidytext", 55 | "gutenbergr", 56 | "stm", 57 | "glmnet", 58 | "yardstick")) 59 | ``` 60 | 61 | --- 62 | 63 | class: right, middle 64 | 65 | 66 | 67 | # Find us at... 68 | 69 |   @juliasilge
70 |   @juliasilge
71 |   juliasilge.com
72 | 73 | --- 74 | 75 | class: right, middle 76 | 77 | 78 | 79 | # Find us at... 80 | 81 |   @dataandme
82 |   @batpigandme
83 |   maraaverick.rbind.io
84 | 85 | --- 86 | 87 | class: right, inverse, middle 88 | 89 | background-image: url(figs/p_and_p_cover.png) 90 | background-size: cover 91 | 92 | # TIDYING AND CASTING 93 | 94 |

95 | 96 | --- 97 | 98 | background-image: url(figs/tmwr_0601.png) 99 | background-size: 900px 100 | 101 | --- 102 | 103 | class: inverse 104 | 105 | background-image: url(figs/p_and_p_cover.png) 106 | background-size: cover 107 | 108 | # Two powerful NLP techniques 109 | 110 | -- 111 | 112 | - .large[Topic modeling] 113 | 114 | -- 115 | 116 | - .large[Text classification] 117 | 118 | --- 119 | 120 | class: inverse 121 | 122 | background-image: url(figs/p_and_p_cover.png) 123 | background-size: cover 124 | 125 | # Topic modeling 126 | 127 | - .large[Each DOCUMENT = mixture of topics] 128 | 129 | -- 130 | 131 | - .large[Each TOPIC = mixture of words] 132 | 133 | --- 134 | 135 | class: top 136 | 137 | background-image: url(figs/top_tags-1.png) 138 | background-size: 800px 139 | 140 | --- 141 | 142 | class: center, middle, inverse 143 | 144 | background-image: url(figs/p_and_p_cover.png) 145 | background-size: cover 146 | 147 | # GREAT LIBRARY HEIST `r emo::ji("sleuth")` 148 | 149 | --- 150 | 151 | ## **Downloading your text data** 152 | 153 | ```{r} 154 | library(tidyverse) 155 | library(gutenbergr) 156 | 157 | titles <- c("Twenty Thousand Leagues under the Sea", 158 | "The War of the Worlds", 159 | "Pride and Prejudice", 160 | "Great Expectations") 161 | 162 | books <- gutenberg_works(title %in% titles) %>% 163 | gutenberg_download(meta_fields = "title") 164 | 165 | books 166 | ``` 167 | 168 | --- 169 | 170 | ## **Someone has torn your books apart!** `r emo::ji("sob")` 171 | 172 | 173 | ```{r} 174 | by_chapter <- books %>% 175 | group_by(title) %>% 176 | mutate(chapter = cumsum(str_detect(text, 177 | regex("^chapter ", 178 | ignore_case = TRUE)))) %>% 179 | ungroup() %>% 180 | filter(chapter > 0) %>% 181 | unite(document, title, chapter) 182 | 183 | by_chapter 184 | ``` 185 | 186 | --- 187 | 188 | ## **Can we put them back together?** 189 | 190 | ```{r} 191 | library(tidytext) 192 | 193 | word_counts <- by_chapter %>% 194 | unnest_tokens(word, text) %>% #<< 195 | 
anti_join(get_stopwords(source = "smart")) %>% 196 | count(document, word, sort = TRUE) 197 | 198 | word_counts 199 | 200 | ``` 201 | 202 | --- 203 | 204 | ## **Can we put them back together?** 205 | 206 | ```{r} 207 | words_sparse <- word_counts %>% 208 | cast_sparse(document, word, n) #<< 209 | 210 | class(words_sparse) 211 | ``` 212 | 213 | --- 214 | 215 | ## **Train a topic model** 216 | 217 | Use a sparse matrix or a `quanteda::dfm` object as input 218 | 219 | ```{r} 220 | library(stm) 221 | 222 | topic_model <- stm(words_sparse, K = 4, 223 | verbose = FALSE, init.type = "Spectral") 224 | 225 | summary(topic_model) 226 | ``` 227 | 228 | --- 229 | 230 | ## **Exploring the output of topic modeling** 231 | 232 | .large[Time for tidying!] 233 | 234 | ```{r} 235 | chapter_topics <- tidy(topic_model, matrix = "beta") 236 | 237 | chapter_topics 238 | ``` 239 | 240 | --- 241 | 242 | ## **Exploring the output of topic modeling** 243 | 244 | ```{r} 245 | top_terms <- chapter_topics %>% 246 | group_by(topic) %>% 247 | top_n(10, beta) %>% 248 | ungroup() %>% 249 | arrange(topic, -beta) 250 | 251 | top_terms 252 | ``` 253 | 254 | --- 255 | ## **Exploring the output of topic modeling** 256 | 257 | ```{r, eval=FALSE} 258 | top_terms %>% 259 | mutate(term = fct_reorder(term, beta)) %>% 260 | ggplot(aes(term, beta, fill = factor(topic))) + 261 | geom_col(show.legend = FALSE) + 262 | facet_wrap(~ topic, scales = "free") + 263 | coord_flip() 264 | ``` 265 | 266 | --- 267 | 268 | ```{r, echo=FALSE, fig.height=4} 269 | top_terms %>% 270 | ggplot(aes(reorder_within(term, beta, topic), beta, fill = factor(topic))) + 271 | geom_col(show.legend = FALSE) + 272 | facet_wrap(~ topic, scales = "free") + 273 | coord_flip() + 274 | scale_x_reordered() + 275 | scale_y_continuous(expand = c(0,0)) + 276 | labs(y = expression(beta), x = NULL) 277 | ``` 278 | 279 | --- 280 | 281 | ## **How are documents classified?** 282 | 283 | ```{r} 284 | chapters_gamma <- tidy(topic_model, matrix = "gamma", 
285 | document_names = rownames(words_sparse)) 286 | 287 | chapters_gamma 288 | ``` 289 | 290 | --- 291 | 292 | ## **How are documents classified?** 293 | 294 | ```{r} 295 | chapters_parsed <- chapters_gamma %>% 296 | separate(document, c("title", "chapter"), 297 | sep = "_", convert = TRUE) 298 | 299 | chapters_parsed 300 | ``` 301 | 302 | --- 303 | 304 | ## **How are documents classified?** 305 | 306 | ```{r, eval=FALSE} 307 | chapters_parsed %>% 308 | mutate(title = fct_reorder(title, gamma * topic)) %>% 309 | ggplot(aes(factor(topic), gamma)) + 310 | geom_boxplot() + 311 | facet_wrap(~ title) 312 | ``` 313 | 314 | --- 315 | 316 | ```{r, echo=FALSE, fig.height=4} 317 | chapters_parsed %>% 318 | mutate(title = fct_reorder(title, gamma * topic)) %>% 319 | ggplot(aes(factor(topic), gamma, color = factor(topic))) + 320 | geom_boxplot(show.legend = FALSE) + 321 | facet_wrap(~ title) + 322 | labs(x = "Topic", y = expression(gamma)) 323 | ``` 324 | 325 | --- 326 | 327 | class: center, middle, inverse 328 | 329 | background-image: url(figs/p_and_p_cover.png) 330 | background-size: cover 331 | 332 | # GOING FARTHER `r emo::ji("rocket")` 333 | 334 | --- 335 | 336 | ## Tidying model output 337 | 338 | ### Which words in each document are assigned to which topics? 339 | 340 | - .large[`augment()`] 341 | - .large[Add information to each observation in the original data] 342 | 343 | --- 344 | 345 | background-image: url(figs/stm_video.png) 346 | background-size: 850px 347 | 348 | --- 349 | 350 | ## **Using stm** 351 | 352 | - .large[Document-level covariates] 353 | 354 | ```{r, eval=FALSE} 355 | topic_model <- stm(words_sparse, K = 0, init.type = "Spectral", 356 | prevalence = ~s(Year), 357 | data = covariates, 358 | verbose = FALSE) 359 | ``` 360 | 361 | - .large[Use functions for `semanticCoherence()`, `checkResiduals()`, `exclusivity()`, and more!] 
362 | 363 | - .large[Check out http://www.structuraltopicmodel.com/] 364 | 365 | - .large[See [my blog post](https://juliasilge.com/blog/evaluating-stm/) for how to choose `K`, the number of topics] 366 | 367 | --- 368 | 369 | 370 | background-image: url(figs/model_diagnostic-1.png) 371 | background-position: 50% 50% 372 | background-size: 950px 373 | 374 | --- 375 | 376 | # Stemming? 377 | 378 | .large[Advice from [Schofield & Mimno](https://mimno.infosci.cornell.edu/papers/schofield_tacl_2016.pdf)] 379 | 380 | .large["Comparing Apples to Apple: The Effects of Stemmers on Topic Models"] 381 | 382 | --- 383 | 384 | class: right, middle 385 | 386 |

387 | 388 |

Despite their frequent use in topic modeling, we find that stemmers produce no meaningful improvement in likelihood and coherence and in fact can degrade topic stability.

389 | 390 |

391 | 392 | --- 393 | 394 | class: right, middle, inverse 395 | 396 | background-image: url(figs/p_and_p_cover.png) 397 | background-size: cover 398 | 399 | 400 | # TEXT CLASSIFICATION 401 |

---

## **Downloading your text data**

```{r}
library(tidyverse)
library(gutenbergr)

titles <- c("The War of the Worlds",
            "Pride and Prejudice")

# `document` is the row number of the downloaded table; every row is
# treated as one document to classify
books <- gutenberg_works(title %in% titles) %>%
  gutenberg_download(meta_fields = "title") %>%
  mutate(document = row_number())

books
```

---

## **Making a tidy dataset**

.large[Use this kind of data structure for EDA! `r emo::ji("nail")`]

```{r}
library(tidytext)

# one row per word; keep only words that occur more than 10 times overall
tidy_books <- books %>%
  unnest_tokens(word, text) %>% #<<
  group_by(word) %>%
  filter(n() > 10) %>%
  ungroup()

tidy_books
```

---

## **Cast to a sparse matrix**

.large[And build a dataframe with a response variable]

```{r}
# rows are documents, columns are words, values are counts
sparse_words <- tidy_books %>%
  count(document, word, sort = TRUE) %>%
  cast_sparse(document, word, n) #<<

# look up the title for each matrix row, in matrix row order
books_joined <- tibble(document = as.integer(rownames(sparse_words))) %>%
  left_join(books %>%
              select(document, title),
            by = "document")
```

---

## **Train a glmnet model**

```{r}
library(glmnet)
library(doMC)
registerDoMC(cores = 8)

# response: TRUE when the row comes from Pride and Prejudice
is_jane <- books_joined$title == "Pride and Prejudice"

model <- cv.glmnet(sparse_words, is_jane, family = "binomial",
                   parallel = TRUE, keep = TRUE)

```

---

## **Tidying our model**

.large[Tidy, then filter to choose some lambda from glmnet output]

```{r}
library(broom)

# keep the coefficients at the cross-validated lambda.1se
coefs <- model$glmnet.fit %>%
  tidy() %>%
  filter(lambda == model$lambda.1se)

Intercept <- coefs %>%
  filter(term == "(Intercept)") %>%
  pull(estimate)
```

---

## **Tidying our model**

```{r}
# sum the coefficients of the words in each document, then convert the
# linear score to a probability with the logistic function
classifications <- tidy_books %>%
  inner_join(coefs, by = c("word" = "term")) %>%
  group_by(document) %>%
  summarize(score = sum(estimate)) %>%
  mutate(probability = plogis(Intercept + score))

classifications
```

---

## **Understanding our model**

```{r, eval=FALSE}
coefs %>%
  group_by(estimate > 0) %>%
  top_n(10, abs(estimate)) %>%
  ungroup() %>%
  ggplot(aes(fct_reorder(term, estimate),
             estimate,
             fill = estimate > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
```

---

```{r, echo = FALSE, fig.height=4}
coefs %>%
  group_by(estimate > 0) %>%
  top_n(10, abs(estimate)) %>%
  ungroup() %>%
  ggplot(aes(fct_reorder(term, estimate), estimate, fill = estimate > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = NULL,
       title = "Coefficients that increase/decrease probability",
       subtitle = "A document mentioning Martians is unlikely to be written by Jane Austen")
```

---

## **ROC**

```{r}
library(yardstick)

# attach the true title to every scored document
comment_classes <- classifications %>%
  left_join(books %>%
              select(title, document), by = "document") %>%
  mutate(title = as.factor(title))
```

---

## **ROC**

```{r eval=FALSE}
comment_classes %>%
  roc_curve(title, probability) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(
    color = "midnightblue",
    size = 1.5
  ) +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  )
```

---

```{r, echo = FALSE, fig.height=4}
comment_classes %>%
  roc_curve(title, probability) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(
    color = "midnightblue",
    size = 1.5
  ) +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  ) +
  labs(
    title = "ROC curve for text classification"
  )
```

---

## **AUC for model**

```{r}
comment_classes %>%
  roc_auc(title, probability)
```

---

## **Confusion matrix**

```{r}
comment_classes %>%
  mutate(
    prediction = case_when(
      probability > 0.5 ~ "Pride and Prejudice",
      TRUE ~ "The War of the Worlds"
    ),
    # reuse the levels of `title` so truth and prediction always match,
    # even if only one class ends up being predicted
    prediction = factor(prediction, levels = levels(title))
  ) %>%
  conf_mat(title, prediction)
```

---

## **Misclassifications**

Let's talk about misclassifications. Which documents here were incorrectly predicted to be written by Jane Austen?

```{r}
comment_classes %>%
  filter(
    probability > .8, #<<
    title == "The War of the Worlds" #<<
  ) %>%
  sample_n(10) %>%
  inner_join(books %>%
               select(document, text),
             by = "document") %>%
  select(probability, text)
```

---

## **Misclassifications**

Let's talk about misclassifications. Which documents here were incorrectly predicted to *not* be written by Jane Austen?

```{r}
comment_classes %>%
  filter(
    probability < .3, #<<
    title == "Pride and Prejudice" #<<
  ) %>%
  sample_n(10) %>%
  inner_join(books %>%
               select(document, text),
             by = "document") %>%
  select(probability, text)
```

---

background-image: url(figs/tmwr_0601.png)
background-position: 50% 70%
background-size: 750px

## **Workflow for text mining/modeling**

---

background-image: url(figs/lizzieskipping.gif)
background-position: 50% 55%
background-size: 750px

# **Go explore real-world text!**

---

class: left, middle



# Thanks!

  tidytextmining.com
672 |   @juliasilge
673 |   @juliasilge
674 |   juliasilge.com
675 |   @dataandme
676 |   @batpigandme
677 |   maraaverick.rbind.io
678 | 679 | Slides created with [**remark.js**](http://remarkjs.com/) and the R package [**xaringan**](https://github.com/yihui/xaringan) 680 | -------------------------------------------------------------------------------- /modeling.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Text Modeling 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 901 | 902 | 903 | 948 | 949 | 959 | 960 | 979 | 980 | 990 | 991 | 992 | -------------------------------------------------------------------------------- /modeling_files/figure-html/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /modeling_files/figure-html/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /modeling_files/figure-html/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /modeling_files/figure-html/unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/modeling_files/figure-html/unnamed-chunk-26-1.png -------------------------------------------------------------------------------- 
/pdfs/juliasilge-textmining-sdss-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textmining-sdss-2.pdf -------------------------------------------------------------------------------- /pdfs/juliasilge-textminnig-sdss-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliasilge/sdss2019/498a37dabdd616a431b198d7966424fcc3d3e39d/pdfs/juliasilge-textminnig-sdss-1.pdf -------------------------------------------------------------------------------- /sdss2019.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | --------------------------------------------------------------------------------