├── .gitignore
├── files
│   ├── Text_Analysis_in_R_files
│   │   ├── figure-gfm
│   │   │   └── unnamed-chunk-15-1.png
│   │   ├── figure-markdown_github
│   │   │   └── unnamed-chunk-13-1.png
│   │   └── figure-markdown_github-ascii_identifiers
│   │       └── unnamed-chunk-13-1.png
│   ├── Text_Analysis_in_R.Rmd
│   └── Text_Analysis_in_R.md
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.RData
.Ruserdata
*.Rproj
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-gfm/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-gfm/unnamed-chunk-15-1.png
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-markdown_github/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-markdown_github/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Text Analysis in R: online appendix
============

This page contains the [online appendix](files/Text_Analysis_in_R.md) for [Welbers, van Atteveldt and Benoit (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238), which contains the example code presented in the article. The code in this appendix will be kept up to date with changes in the packages used, and can therefore differ slightly from the code presented in the article.

In addition, this appendix contains references to other tutorials that provide instructions for alternative, more in-depth, or newly developed text analysis operations.
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R.Rmd:
--------------------------------------------------------------------------------
---
title: 'Text Analysis in R: online appendix'
author: "Kasper Welbers, Wouter van Atteveldt & Kenneth Benoit"
date: "2023"
output: github_document
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, collapse = TRUE)
```

## About this document

This is the online appendix for [Welbers, van Atteveldt & Benoit (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238), which contains the example code presented in the article. The code in this appendix will be kept up to date with changes in the packages used, and can therefore differ slightly from the code presented in the article.

### Required packages

The following packages have to be installed to run all the code examples. Note that the installation lines only have to be run once.

```{r, eval=F}
################# PACKAGE              # SECTION IN ARTICLE
install.packages("readtext")           # data preparation
install.packages("stringi")            # data preparation

install.packages("quanteda")           # data preparation and analysis
install.packages('quanteda.textmodels')
install.packages('quanteda.textstats')
install.packages('quanteda.textplots')

install.packages("topicmodels")        # analysis

install.packages("spacyr")             # advanced topics
install.packages("corpustools")        # advanced topics
```
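If you would rather not reinstall packages you already have, you can install only the missing ones. This shortcut is an addition to the appendix, not code from the article; a minimal sketch using only base R:

```{r, eval=F}
# optional: install only the packages that are not yet installed
pkgs <- c("readtext", "stringi", "quanteda", "quanteda.textmodels",
          "quanteda.textstats", "quanteda.textplots",
          "topicmodels", "spacyr", "corpustools")
install.packages(setdiff(pkgs, rownames(installed.packages())))
```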
## Data Preparation

### Importing text

```{r}
library(readtext)
# URL of the inaugural address demo data that is provided by the readtext package
filepath <- "https://raw.githubusercontent.com/kbenoit/readtext/master/inst/extdata/csv/inaugCorpus.csv"

rt <- readtext(filepath, text_field = "texts")
rt
```

### String Operations

```{r}
library(stringi)
x <- c("<b>The first string</b>", " The <i>second</i> string")

x <- stri_replace_all(x, "", regex = "<.*?>") # remove html tags
x <- stri_trim(x)                             # strip surrounding whitespace
x <- stri_trans_tolower(x)                    # transform to lower case
x
```

### Preprocessing

#### Tokenization

```{r}
library(quanteda)

text <- "An example of preprocessing techniques"
toks <- tokens(text) # tokenize into unigrams
toks
```

#### Normalization: lowercasing and stemming

```{r}
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks)
toks
```

#### Removing stopwords

```{r}
sw <- stopwords("english") # get character vector of stopwords
head(sw)                   # show the first 6 stopwords
tokens_remove(toks, sw)
```

### Document-Term Matrix

Since the publication of the Text Analysis in R paper, the quanteda package has gone through several updates. One important change is that many operations are now split into separate steps. This works nicely with the now common pipe notation (`|>`, or `%>%` in the tidyverse).

Before, we created a dfm with one single do-it-all function. Now, we run our data through a pipeline of functions that each perform a single step.

```{r}
text <- c(d1 = "An example of preprocessing techniques",
          d2 = "An additional example",
          d3 = "A third example")

dtm <- text |>
  corpus() |>                       ## create quanteda corpus
  tokens() |>                       ## tokenize the corpus
  dfm() |>                          ## structure tokens as Document Term Matrix
  dfm_tolower() |>                  ## preprocessing: lowercase
  dfm_wordstem() |>                 ## preprocessing: stemming
  dfm_remove(stopwords('english')) ## preprocessing: remove English stopwords

dtm
```
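To clarify what the pipe does, the same pipeline can also be written as nested function calls that are read from the inside out. This comparison is an addition to the appendix, not code from the article:

```{r, eval=F}
## the same pipeline as nested calls: each function wraps the previous result
dtm <- dfm_remove(
  dfm_wordstem(dfm_tolower(dfm(tokens(corpus(text))))),
  stopwords('english'))
```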
Create the DTM using the inaugural speeches (`rt`) that we read into R above.

```{r}
dtm <- rt |>
  corpus() |>
  tokens() |>
  dfm() |>
  dfm_tolower() |>
  dfm_wordstem() |>
  dfm_remove(stopwords('english'))

dtm
```

### Filtering and weighting

```{r}
doc_freq <- docfreq(dtm)    # document frequency per term (column)
dtm <- dtm[, doc_freq >= 2] # select terms with doc_freq >= 2
dtm <- dfm_tfidf(dtm)       # weight the features using tf-idf
head(dtm)
```

## Analysis

Prepare the DTM for the analysis examples.

```{r}
dtm <- data_corpus_inaugural |>
  corpus() |>
  tokens(remove_punct = TRUE) |>
  dfm() |>
  dfm_tolower() |>
  dfm_wordstem() |>
  dfm_remove(stopwords('english'))

dtm
```

### Counting and Dictionary

```{r}
myDict <- dictionary(list(terror = c("terror*"),
                          economy = c("job*", "business*", "econom*")))
dict_dtm <- dfm_lookup(dtm, myDict, nomatch = "_unmatched")
tail(dict_dtm)
```

### Supervised Machine Learning

```{r}
library(quanteda)
library(quanteda.textmodels)
```

```{r}
set.seed(2)
# create a document variable indicating pre- or post-war
docvars(dtm, "is_prewar") <- docvars(dtm, "Year") < 1945

# sample 40 documents for the training set and use the remaining (19) for testing
train_dtm <- dfm_sample(dtm, size = 40)
test_dtm <- dtm[setdiff(docnames(dtm), docnames(train_dtm)), ]

# fit a Naive Bayes multinomial model and use it to predict the test data
nb_model <- textmodel_nb(train_dtm, y = docvars(train_dtm, "is_prewar"))
pred_nb <- predict(nb_model, newdata = test_dtm)

# compare prediction (rows) and actual is_prewar value (columns) in a table
table(prediction = pred_nb, is_prewar = docvars(test_dtm, "is_prewar"))
```

### Unsupervised Machine Learning

```{r}
library(topicmodels)

texts <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")

par_dtm <- texts |> corpus() |> tokens(remove_punct = TRUE) |>
  dfm() |> dfm_tolower() |> dfm_wordstem() |>
  dfm_remove(stopwords('english')) |> dfm_trim(min_termfreq = 5) |>
  convert(to = 'topicmodels')

set.seed(1)
lda_model <- topicmodels::LDA(par_dtm, method = "Gibbs", k = 5)
terms(lda_model, 5)
```
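`terms()` shows the top terms per topic. The estimated topic proportions per paragraph can also be inspected; a minimal sketch using `posterior()` from the topicmodels package (an addition to the appendix):

```{r, eval=F}
## per-paragraph topic proportions (each row sums to 1)
topic_props <- posterior(lda_model)$topics
head(round(topic_props, 2))
```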
221 | dtm_pres |> 222 | dfm_group(President) |> 223 | textstat_keyness(target = "Trump") |> 224 | textplot_keyness() 225 | ``` 226 | 227 | 228 | ## Advanced Topics 229 | 230 | ### Advanced NLP 231 | 232 | ```{r, eval=F} 233 | library(spacyr) 234 | spacy_install() 235 | spacy_initialize() 236 | d <- spacy_parse("Bob Smith gave Alice his login information.", dependency = TRUE) 237 | d[, -c(1,2)] 238 | ``` 239 | 240 | ### Word Positions and Syntax 241 | 242 | ```{r} 243 | text <- "an example of preprocessing techniques" 244 | 245 | text |> 246 | tokens() |> 247 | tokens_ngrams(n=3, skip=0:1) 248 | ``` 249 | 250 | ```{r} 251 | library(corpustools) 252 | 253 | tc <- create_tcorpus(sotu_texts, doc_column = "id") 254 | hits <- search_features(tc, '"freedom americ*"~5') 255 | kwic <- get_kwic(tc, hits, ntokens = 3) 256 | head(kwic$kwic, 3) 257 | ``` 258 | -------------------------------------------------------------------------------- /files/Text_Analysis_in_R.md: -------------------------------------------------------------------------------- 1 | Text Analysis in R: online appendix 2 | ================ 3 | Kasper Welbers, Wouter van Atteveldt & Kenneth Benoit 4 | 2023 5 | 6 | ## About this document 7 | 8 | This is the online appendix for [Welbers, van Atteveldt & Benoit 9 | (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238), 10 | that contains the example code presented in the article. The code in 11 | this appendix will be kept up-to-date with changes in the used packages, 12 | and as such can differ slightly from the code presented in the article. 13 | 14 | 15 | 16 | ### required packages 17 | 18 | The following packages have to be installed to run all the code 19 | examples. Note that the lines to install the packages only have to be 20 | run once. 21 | 22 | ``` r 23 | ################# PACKAGE # SECTION IN ARTICLE 24 | install.packages("readtext") # data preparation 25 | install.packages("stringi") # data preparation 26 | 27 | install.packages("quanteda") # data preparation and analysis 28 | install.packages('quanteda.textmodels') 29 | install.packages('quanteda.textstats') 30 | install.packages('quanteda.textplots') 31 | 32 | install.packages("topicmodels") # analysis 33 | 34 | install.packages("spacyr") # advanced topics 35 | install.packages("corpustools") # advanced topics 36 | ``` 37 | 38 | ## Data Preparation 39 | 40 | ### String Operations 41 | 42 | ``` r 43 | library(readtext) 44 | # url to Inaugural Address demo data that is provided by the readtext package 45 | filepath <- "https://raw.githubusercontent.com/kbenoit/readtext/master/inst/extdata/csv/inaugCorpus.csv" 46 | 47 | rt <- readtext(filepath, text_field = "texts") 48 | rt 49 | ## readtext object consisting of 5 documents and 3 docvars. 
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R.md:
--------------------------------------------------------------------------------
Text Analysis in R: online appendix
================
Kasper Welbers, Wouter van Atteveldt & Kenneth Benoit
2023

## About this document

This is the online appendix for [Welbers, van Atteveldt & Benoit
(2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238),
which contains the example code presented in the article. The code in
this appendix will be kept up to date with changes in the packages used,
and can therefore differ slightly from the code presented in the article.

### Required packages

The following packages have to be installed to run all the code
examples. Note that the installation lines only have to be run once.

``` r
################# PACKAGE              # SECTION IN ARTICLE
install.packages("readtext")           # data preparation
install.packages("stringi")            # data preparation

install.packages("quanteda")           # data preparation and analysis
install.packages('quanteda.textmodels')
install.packages('quanteda.textstats')
install.packages('quanteda.textplots')

install.packages("topicmodels")        # analysis

install.packages("spacyr")             # advanced topics
install.packages("corpustools")        # advanced topics
```

## Data Preparation

### Importing text

``` r
library(readtext)
# URL of the inaugural address demo data that is provided by the readtext package
filepath <- "https://raw.githubusercontent.com/kbenoit/readtext/master/inst/extdata/csv/inaugCorpus.csv"

rt <- readtext(filepath, text_field = "texts")
rt
## readtext object consisting of 5 documents and 3 docvars.
## $text
## [1] "# A data frame: 5 × 5"
## [2] "  doc_id            text                 Year President  FirstName"
## [3] "  <chr>             <chr>               <int> <chr>      <chr>    "
## [4] "1 inaugCorpus.csv.1 \"\\\"Fellow-Cit\\\"...\"  1789 Washington George   "
## [5] "2 inaugCorpus.csv.2 \"\\\"Fellow cit\\\"...\"  1793 Washington George   "
## [6] "3 inaugCorpus.csv.3 \"\\\"When it wa\\\"...\"  1797 Adams      John     "
## [7] "4 inaugCorpus.csv.4 \"\\\"Friends an\\\"...\"  1801 Jefferson  Thomas   "
## [8] "5 inaugCorpus.csv.5 \"\\\"Proceeding\\\"...\"  1805 Jefferson  Thomas   "
##
## $summary
## $summary[[1]]
## NULL
##
##
## attr(,"class")
## [1] "trunc_mat"
```

### String Operations

``` r
library(stringi)
x <- c("<b>The first string</b>", " The <i>second</i> string")

x <- stri_replace_all(x, "", regex = "<.*?>") # remove html tags
x <- stri_trim(x)                             # strip surrounding whitespace
x <- stri_trans_tolower(x)                    # transform to lower case
x
## [1] "the first string"  "the second string"
```

### Preprocessing

#### Tokenization

``` r
library(quanteda)
## Package version: 3.3.0
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:readtext':
##
##     docnames, docvars, texts

text <- "An example of preprocessing techniques"
toks <- tokens(text) # tokenize into unigrams
toks
## Tokens consisting of 1 document.
## text1 :
## [1] "An"            "example"       "of"            "preprocessing"
## [5] "techniques"
```

#### Normalization: lowercasing and stemming

``` r
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks)
toks
## Tokens consisting of 1 document.
## text1 :
## [1] "an"         "exampl"     "of"         "preprocess" "techniqu"
```

#### Removing stopwords

``` r
sw <- stopwords("english") # get character vector of stopwords
head(sw)                   # show the first 6 stopwords
## [1] "i"      "me"     "my"     "myself" "we"     "our"
tokens_remove(toks, sw)
## Tokens consisting of 1 document.
## text1 :
## [1] "exampl"     "preprocess" "techniqu"
```

### Document-Term Matrix

Since the publication of the Text Analysis in R paper, the quanteda
package has gone through several updates. One important change is that
many operations are now split into separate steps. This works nicely
with the now common pipe notation (`|>`, or `%>%` in the tidyverse).

Before, we created a dfm with one single do-it-all function. Now, we run
our data through a pipeline of functions that each perform a single
step.

``` r
text <- c(d1 = "An example of preprocessing techniques",
          d2 = "An additional example",
          d3 = "A third example")

dtm <- text |>
  corpus() |>                       ## create quanteda corpus
  tokens() |>                       ## tokenize the corpus
  dfm() |>                          ## structure tokens as Document Term Matrix
  dfm_tolower() |>                  ## preprocessing: lowercase
  dfm_wordstem() |>                 ## preprocessing: stemming
  dfm_remove(stopwords('english')) ## preprocessing: remove English stopwords

dtm
## Document-feature matrix of: 3 documents, 5 features (53.33% sparse) and 0 docvars.
##     features
## docs exampl preprocess techniqu addit third
##   d1      1          1        1     0     0
##   d2      1          0        0     1     0
##   d3      1          0        0     0     1
```
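To clarify what the pipe does, the same pipeline can also be written as
nested function calls that are read from the inside out. This comparison
is an addition to the appendix, not code from the article:

``` r
## the same pipeline as nested calls: each function wraps the previous result
dtm <- dfm_remove(
  dfm_wordstem(dfm_tolower(dfm(tokens(corpus(text))))),
  stopwords('english'))
```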
Create the DTM using the inaugural speeches (`rt`) that we read into R
above.

``` r
dtm <- rt |>
  corpus() |>
  tokens() |>
  dfm() |>
  dfm_tolower() |>
  dfm_wordstem() |>
  dfm_remove(stopwords('english'))

dtm
## Document-feature matrix of: 5 documents, 1,422 features (67.45% sparse) and 3 docvars.
##                    features
## docs                fellow-citizen senat hous repres : among vicissitud incid
##   inaugCorpus.csv.1              1     1    2      2 1     1          1     1
##   inaugCorpus.csv.2              0     0    0      0 1     0          0     0
##   inaugCorpus.csv.3              3     1    3      3 0     4          0     0
##   inaugCorpus.csv.4              2     0    0      1 1     1          0     0
##   inaugCorpus.csv.5              0     0    0      0 0     7          0     0
##                    features
## docs                life event
##   inaugCorpus.csv.1    1     2
##   inaugCorpus.csv.2    0     0
##   inaugCorpus.csv.3    2     0
##   inaugCorpus.csv.4    1     0
##   inaugCorpus.csv.5    2     1
## [ reached max_nfeat ... 1,412 more features ]
```

### Filtering and weighting

``` r
doc_freq <- docfreq(dtm)    # document frequency per term (column)
dtm <- dtm[, doc_freq >= 2] # select terms with doc_freq >= 2
dtm <- dfm_tfidf(dtm)       # weight the features using tf-idf
head(dtm)
## Document-feature matrix of: 5 documents, 530 features (46.34% sparse) and 3 docvars.
##                    features
## docs                fellow-citizen   senat    hous    repres         :
##   inaugCorpus.csv.1      0.2218487 0.39794 0.79588 0.4436975 0.2218487
##   inaugCorpus.csv.2      0         0       0       0         0.2218487
##   inaugCorpus.csv.3      0.6655462 0.39794 1.19382 0.6655462 0
##   inaugCorpus.csv.4      0.4436975 0       0       0.2218487 0.2218487
##   inaugCorpus.csv.5      0         0       0       0         0
##                    features
## docs                     among       life   event greater anxieti
##   inaugCorpus.csv.1 0.09691001 0.09691001 0.79588 0.39794 0.39794
##   inaugCorpus.csv.2 0          0          0       0       0
##   inaugCorpus.csv.3 0.38764005 0.19382003 0       0       0.39794
##   inaugCorpus.csv.4 0.09691001 0.09691001 0       0.39794 0
##   inaugCorpus.csv.5 0.67837009 0.19382003 0.39794 0       0
## [ reached max_nfeat ... 520 more features ]
```

## Analysis

Prepare the DTM for the analysis examples.

``` r
dtm <- data_corpus_inaugural |>
  corpus() |>
  tokens(remove_punct = TRUE) |>
  dfm() |>
  dfm_tolower() |>
  dfm_wordstem() |>
  dfm_remove(stopwords('english'))

dtm
## Document-feature matrix of: 59 documents, 5,468 features (89.25% sparse) and 4 docvars.
##                  features
## docs              fellow-citizen senat hous repres among vicissitud incid life
##   1789-Washington              1     1    2      2     1          1     1    1
##   1793-Washington              0     0    0      0     0          0     0    0
##   1797-Adams                   3     1    3      3     4          0     0    2
##   1801-Jefferson               2     0    0      1     1          0     0    1
##   1805-Jefferson               0     0    0      0     7          0     0    2
##   1809-Madison                 1     0    0      1     0          1     0    1
##                  features
## docs              event fill
##   1789-Washington     2    1
##   1793-Washington     0    0
##   1797-Adams          0    0
##   1801-Jefferson      0    0
##   1805-Jefferson      1    0
##   1809-Madison        0    1
## [ reached max_ndoc ... 53 more documents, reached max_nfeat ... 5,458 more features ]
```
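Before turning to the analyses, it can help to glance at the most
frequent terms in this DTM. This quick check is an addition to the
appendix; a minimal sketch using quanteda's `topfeatures()`:

``` r
## show the 10 most frequent (stemmed) terms
topfeatures(dtm, 10)
```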
### Counting and Dictionary

``` r
myDict <- dictionary(list(terror = c("terror*"),
                          economy = c("job*", "business*", "econom*")))
dict_dtm <- dfm_lookup(dtm, myDict, nomatch = "_unmatched")
tail(dict_dtm)
## Document-feature matrix of: 6 documents, 3 features (16.67% sparse) and 4 docvars.
##             features
## docs         terror economy _unmatched
##   2001-Bush       0       2        796
##   2005-Bush       0       1       1056
##   2009-Obama      1       7       1192
##   2013-Obama      0       6       1052
##   2017-Trump      1       5        723
##   2021-Biden      1       4       1146
```

### Supervised Machine Learning

``` r
library(quanteda)
library(quanteda.textmodels)
```

``` r
set.seed(2)
# create a document variable indicating pre- or post-war
docvars(dtm, "is_prewar") <- docvars(dtm, "Year") < 1945

# sample 40 documents for the training set and use the remaining (19) for testing
train_dtm <- dfm_sample(dtm, size = 40)
test_dtm <- dtm[setdiff(docnames(dtm), docnames(train_dtm)), ]

# fit a Naive Bayes multinomial model and use it to predict the test data
nb_model <- textmodel_nb(train_dtm, y = docvars(train_dtm, "is_prewar"))
pred_nb <- predict(nb_model, newdata = test_dtm)

# compare prediction (rows) and actual is_prewar value (columns) in a table
table(prediction = pred_nb, is_prewar = docvars(test_dtm, "is_prewar"))
##           is_prewar
## prediction FALSE TRUE
##      FALSE     6    0
##      TRUE      0   13
```

### Unsupervised Machine Learning

``` r
library(topicmodels)

texts <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")

par_dtm <- texts |> corpus() |> tokens(remove_punct = TRUE) |>
  dfm() |> dfm_tolower() |> dfm_wordstem() |>
  dfm_remove(stopwords('english')) |> dfm_trim(min_termfreq = 5) |>
  convert(to = 'topicmodels')

set.seed(1)
lda_model <- topicmodels::LDA(par_dtm, method = "Gibbs", k = 5)
terms(lda_model, 5)
##      Topic 1  Topic 2     Topic 3   Topic 4    Topic 5
## [1,] "nation" "state"     "great"   "us"       "shall"
## [2,] "peopl"  "govern"    "govern"  "american" "peopl"
## [3,] "can"    "power"     "war"     "new"      "duti"
## [4,] "must"   "constitut" "countri" "america"  "countri"
## [5,] "everi"  "ani"       "secur"   "world"    "citizen"
```

### Statistics

``` r
library(quanteda.textstats)
library(quanteda.textplots)

# create a DTM that contains the Trump and Obama speeches
dtm_pres <- data_corpus_inaugural |>
  corpus_subset(President %in% c('Obama', 'Trump')) |>
  tokens(remove_punct = TRUE) |>
  dfm() |>
  dfm_remove(stopwords('english'))

# compare the target (in this case Trump) to the rest of the DTM (in this case only Obama)
dtm_pres |>
  dfm_group(President) |>
  textstat_keyness(target = "Trump") |>
  textplot_keyness()
```

![](Text_Analysis_in_R_files/figure-gfm/unnamed-chunk-15-1.png)
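The plot shows the most distinctive terms per president. The underlying
keyness statistics can also be inspected as a data frame; a minimal
sketch (an addition to the appendix):

``` r
## inspect the keyness scores directly instead of plotting them
dtm_pres |>
  dfm_group(President) |>
  textstat_keyness(target = "Trump") |>
  head(10)
```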
## Advanced Topics

### Advanced NLP

``` r
library(spacyr)
spacy_install()
spacy_initialize()
d <- spacy_parse("Bob Smith gave Alice his login information.", dependency = TRUE)
d[, -c(1, 2)]
```

### Word Positions and Syntax

``` r
text <- "an example of preprocessing techniques"

text |>
  tokens() |>
  tokens_ngrams(n = 3, skip = 0:1)
## Tokens consisting of 1 document.
## text1 :
## [1] "an_example_of"                    "an_example_preprocessing"
## [3] "an_of_preprocessing"              "an_of_techniques"
## [5] "example_of_preprocessing"         "example_of_techniques"
## [7] "example_preprocessing_techniques" "of_preprocessing_techniques"
```

``` r
library(corpustools)

tc <- create_tcorpus(sotu_texts, doc_column = "id")
hits <- search_features(tc, '"freedom americ*"~5')
kwic <- get_kwic(tc, hits, ntokens = 3)
head(kwic$kwic, 3)
## [1] "...making progress toward <freedom> will find <America> is their friend..."
## [2] "...friends, and <freedom> in Iraq will make <America> safer for generations..."
## [3] "...men who despise <freedom>, despise <America>, and aim..."
```
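quanteda offers a comparable keyword-in-context search. For comparison
(an addition to the appendix, not code from the article), a minimal
sketch using quanteda's `kwic()` on the inaugural corpus:

``` r
## keyword-in-context with quanteda: 3 tokens of context around "freedom"
data_corpus_inaugural |>
  tokens() |>
  kwic(pattern = "freedom", window = 3) |>
  head(3)
```
--------------------------------------------------------------------------------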