├── .gitignore
├── files
│   ├── Text_Analysis_in_R_files
│   │   ├── figure-gfm
│   │   │   └── unnamed-chunk-15-1.png
│   │   ├── figure-markdown_github
│   │   │   └── unnamed-chunk-13-1.png
│   │   └── figure-markdown_github-ascii_identifiers
│   │       └── unnamed-chunk-13-1.png
│   ├── Text_Analysis_in_R.Rmd
│   └── Text_Analysis_in_R.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *.Rproj
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-gfm/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-gfm/unnamed-chunk-15-1.png
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-markdown_github/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-markdown_github/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kasperwelbers/text_analysis_in_R/HEAD/files/Text_Analysis_in_R_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Text Analysis in R: online appendix
2 | ============
3 |
4 | This page contains the [online appendix](files/Text_Analysis_in_R.md) for [Welbers, van Atteveldt and Benoit (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238), which contains the example code presented in the article. The code in this appendix will be kept up to date with changes in the packages used, and may therefore differ slightly from the code presented in the article.
5 |
6 | In addition, this appendix contains references to other tutorials that provide instructions for alternative, more in-depth, or newly developed text analysis operations.
7 |
8 |
9 |
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Text Analysis in R: online appendix'
3 | author: "Kasper Welbers, Wouter van Atteveldt & Kenneth Benoit"
4 | date: "2023"
5 | output: github_document
6 | editor_options:
7 | chunk_output_type: console
8 | ---
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE, collapse = TRUE)
12 | ```
13 |
14 | ## About this document
15 |
16 | This is the online appendix for [Welbers, van Atteveldt & Benoit (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238), which contains the example code presented in the article. The code in this appendix will be kept up to date with changes in the packages used, and may therefore differ slightly from the code presented in the article.
17 |
18 |
19 |
20 | ### Required packages
21 |
22 | The following packages need to be installed to run all the code examples. Note that each package only needs to be installed once.
23 |
24 | ```{r, eval=F}
25 | ################# PACKAGE # SECTION IN ARTICLE
26 | install.packages("readtext") # data preparation
27 | install.packages("stringi") # data preparation
28 |
29 | install.packages("quanteda") # data preparation and analysis
30 | install.packages('quanteda.textmodels')
31 | install.packages('quanteda.textstats')
32 | install.packages('quanteda.textplots')
33 |
34 | install.packages("topicmodels") # analysis
35 |
36 | install.packages("spacyr") # advanced topics
37 | install.packages("corpustools") # advanced topics
38 | ```
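
A convenience sketch (not from the article): install only the packages that are still missing, so re-running the installation chunk is harmless.

```{r, eval=F}
# Sketch: install only the packages that are not yet installed
pkgs <- c("readtext", "stringi", "quanteda", "quanteda.textmodels",
          "quanteda.textstats", "quanteda.textplots",
          "topicmodels", "spacyr", "corpustools")
install.packages(setdiff(pkgs, rownames(installed.packages())))
```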
39 |
40 | ## Data Preparation
41 |
42 | ### Importing text
43 |
44 | ```{r}
45 | library(readtext)
46 | # url to Inaugural Address demo data that is provided by the readtext package
47 | filepath <- "https://raw.githubusercontent.com/kbenoit/readtext/master/inst/extdata/csv/inaugCorpus.csv"
48 |
49 | rt <- readtext(filepath, text_field = "texts")
50 | rt
51 | ```
52 |
53 | ### String Operations
54 |
55 | ```{r}
56 | library(stringi)
57 | x <- c("The first string", ' The second string')
58 |
59 | x <- stri_replace_all(x, "", regex = "<.*?>") # remove html tags
60 | x <- stri_trim(x) # strip surrounding whitespace
61 | x <- stri_trans_tolower(x) # transform to lower case
62 | x
63 | ```
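
The stringi package offers many more vectorized string operations. A brief illustration (a sketch, not part of the article code):

```{r, eval=F}
# Sketch: some other common stringi operations
stri_split_fixed("one,two,three", ",")   # split strings on a fixed pattern
stri_detect(x, regex = "second")         # test for a regex pattern per element
stri_paste("text", 1:3, sep = "_")       # vectorized string concatenation
```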
64 |
65 | ### Preprocessing
66 |
67 | #### Tokenization
68 |
69 | ```{r}
70 | library(quanteda)
71 |
72 | text <- "An example of preprocessing techniques"
73 | toks <- tokens(text) # tokenize into unigrams
74 | toks
75 | ```
76 |
77 | #### Normalization: lowercasing and stemming
78 |
79 | ```{r}
80 | toks <- tokens_tolower(toks)
81 | toks <- tokens_wordstem(toks)
82 | toks
83 | ```
84 |
85 | #### Removing stopwords
86 |
87 | ```{r}
88 | sw <- stopwords("english") # get character vector of stopwords
89 | head(sw) # show the first 6 stopwords
90 | tokens_remove(toks, sw)
91 | ```
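
The stopword list is just a character vector, so it can be extended with custom terms; a minimal sketch (note that the tokens are already stemmed):

```{r, eval=F}
# Sketch: extend the standard stopword list with a custom (stemmed) term
tokens_remove(toks, c(sw, "exampl"))
```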
92 |
93 | ### Document-Term Matrix
94 |
95 | Since the publication of the Text Analysis in R paper, the quanteda package has gone through several updates.
96 | One important change is that many operations have been split into separate steps.
97 | This works nicely with the now-common pipe notation (`|>`, or `%>%` in the tidyverse).
98 | 
99 | Previously, we created a DFM with a single do-it-all function.
100 | Now, we run our data through a pipeline of functions that each perform a single step.
101 |
102 | ```{r}
103 | text <- c(d1 = "An example of preprocessing techniques",
104 | d2 = "An additional example",
105 | d3 = "A third example")
106 |
107 | dtm <- text |>
108 | corpus() |> ## create quanteda corpus
109 | tokens() |> ## tokenize the corpus
110 | dfm() |> ## structure tokens as Document Term Matrix
111 | dfm_tolower() |> ## preprocessing: lowercase
112 | dfm_wordstem() |> ## preprocessing: stemming
113 | dfm_remove(stopwords('english')) ## preprocessing: remove English stopwords
114 |
115 | dtm
116 | ```
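
Note that the pipe is only syntactic sugar: each step passes its result as the first argument of the next function. The pipeline above is equivalent to the following nested calls (a sketch for illustration):

```{r, eval=F}
# Sketch: the same pipeline written as nested function calls
dtm <- dfm_remove(dfm_wordstem(dfm_tolower(dfm(tokens(corpus(text))))),
                  stopwords('english'))
```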
117 |
118 | Create the DTM using the inaugural speeches (rt) that we read into R above.
119 |
120 | ```{r}
121 | dtm <- rt |>
122 | corpus() |>
123 | tokens() |>
124 | dfm() |>
125 | dfm_tolower() |>
126 | dfm_wordstem() |>
127 | dfm_remove(stopwords('english'))
128 |
129 | dtm
130 | ```
131 |
132 | ### Filtering and weighting
133 |
134 | ```{r}
135 | doc_freq <- docfreq(dtm) # document frequency per term (column)
136 | dtm <- dtm[, doc_freq >= 2] # select terms with doc_freq >= 2
137 | dtm <- dfm_tfidf(dtm) # weight the features using tf-idf
138 | head(dtm)
139 | ```
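
The same document-frequency filter can also be written with quanteda's `dfm_trim()`, which avoids indexing the DTM manually (a sketch, equivalent to the selection above):

```{r, eval=F}
# Sketch: keep only terms that occur in at least 2 documents
dtm <- dfm_trim(dtm, min_docfreq = 2)
```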
140 |
141 | ## Analysis
142 |
143 | Prepare DTM for analysis examples.
144 |
145 | ```{r}
146 | dtm <- data_corpus_inaugural |>
147 | corpus() |>
148 | tokens(remove_punct = T) |>
149 | dfm() |>
150 | dfm_tolower() |>
151 | dfm_wordstem() |>
152 | dfm_remove(stopwords('english'))
153 |
154 | dtm
155 | ```
156 |
157 | ### Counting and Dictionary
158 |
159 | ```{r}
160 | myDict <- dictionary(list(terror = c("terror*"),
161 | economy = c("job*", "business*", "econom*")))
162 | dict_dtm <- dfm_lookup(dtm, myDict, nomatch = "_unmatched")
163 | tail(dict_dtm)
164 | ```
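
To compare documents of different lengths, the dictionary counts can be normalized to proportions; a brief sketch using `dfm_weight()`:

```{r, eval=F}
# Sketch: express dictionary counts as a proportion of all terms per document
dfm_weight(dict_dtm, scheme = "prop")
```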
165 |
166 | ### Supervised Machine Learning
167 |
168 | ```{r}
169 | library(quanteda)
170 | library(quanteda.textmodels)
171 | ```
172 |
173 | ```{r}
174 | set.seed(2)
175 | # create a document variable indicating pre or post war
176 | docvars(dtm, "is_prewar") <- docvars(dtm, "Year") < 1945
177 |
178 | # sample 40 documents for the training set and use remaining (19) for testing
179 | train_dtm <- dfm_sample(dtm, size = 40)
180 | test_dtm <- dtm[setdiff(docnames(dtm), docnames(train_dtm)), ]
181 |
182 | # fit a Naive Bayes multinomial model and use it to predict the test data
183 | nb_model <- textmodel_nb(train_dtm, y = docvars(train_dtm, "is_prewar"))
184 | pred_nb <- predict(nb_model, newdata = test_dtm)
185 |
186 | # compare prediction (rows) and actual is_prewar value (columns) in a table
187 | table(prediction = pred_nb, is_prewar = docvars(test_dtm, "is_prewar"))
188 | ```
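
From the confusion table we can also compute the overall accuracy directly (a sketch):

```{r, eval=F}
# Sketch: proportion of test documents that are classified correctly
mean(pred_nb == docvars(test_dtm, "is_prewar"))
```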
189 |
190 | ### Unsupervised Machine Learning
191 |
192 | ```{r}
193 | library(topicmodels)
194 |
195 | texts <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
196 |
197 | par_dtm <- texts |> corpus() |> tokens(remove_punct = T) |>
198 | dfm() |> dfm_tolower() |> dfm_wordstem() |>
199 | dfm_remove(stopwords('english')) |> dfm_trim(min_termfreq = 5) |>
200 | convert(to = 'topicmodels')
201 |
202 | set.seed(1)
203 | lda_model <- topicmodels::LDA(par_dtm, method = "Gibbs", k = 5)
204 | terms(lda_model, 5)
205 | ```
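
Besides the top terms per topic, we can inspect which topic is most prevalent in each paragraph; a sketch using functions from the topicmodels package:

```{r, eval=F}
# Sketch: most likely topic per paragraph, and the posterior topic distribution
head(topics(lda_model))
head(posterior(lda_model)$topics, 3)
```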
206 |
207 | ### Statistics
208 |
209 | ```{r}
210 | library(quanteda.textstats)
211 | library(quanteda.textplots)
212 |
213 | # create DTM that contains Trump and Obama speeches
214 | dtm_pres <- data_corpus_inaugural |>
215 | corpus_subset(President %in% c('Obama','Trump')) |>
216 | tokens(remove_punct = T) |>
217 | dfm() |>
218 | dfm_remove(stopwords('english'))
219 |
220 | # compare target (in this case Trump) to rest of DTM (in this case only Obama).
221 | dtm_pres |>
222 | dfm_group(President) |>
223 | textstat_keyness(target = "Trump") |>
224 | textplot_keyness()
225 | ```
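
The underlying keyness statistics can also be inspected as a table rather than a plot (a sketch):

```{r, eval=F}
# Sketch: top keyness scores for Trump relative to Obama, as a data frame
dtm_pres |>
  dfm_group(President) |>
  textstat_keyness(target = "Trump") |>
  head()
```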
226 |
227 |
228 | ## Advanced Topics
229 |
230 | ### Advanced NLP
231 |
232 | ```{r, eval=F}
233 | library(spacyr)
234 | spacy_install()
235 | spacy_initialize()
236 | d <- spacy_parse("Bob Smith gave Alice his login information.", dependency = TRUE)
237 | d[, -c(1,2)]
238 | ```
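
spacyr can also return the named entities found in the parsed result; a brief sketch (assuming the spaCy model is installed and `d` was parsed as above):

```{r, eval=F}
# Sketch: extract named entities from the spacyr parse
entity_extract(d)
```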
239 |
240 | ### Word Positions and Syntax
241 |
242 | ```{r}
243 | text <- "an example of preprocessing techniques"
244 |
245 | text |>
246 | tokens() |>
247 | tokens_ngrams(n=3, skip=0:1)
248 | ```
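
Word positions are also what keyword-in-context (KWIC) listings are built on. quanteda provides `kwic()` for this; a sketch using the inaugural corpus:

```{r, eval=F}
# Sketch: keyword-in-context for "freedom" with a window of 3 tokens
data_corpus_inaugural |>
  tokens() |>
  kwic("freedom", window = 3) |>
  head()
```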
249 |
250 | ```{r}
251 | library(corpustools)
252 |
253 | tc <- create_tcorpus(sotu_texts, doc_column = "id")
254 | hits <- search_features(tc, '"freedom americ*"~5')
255 | kwic <- get_kwic(tc, hits, ntokens = 3)
256 | head(kwic$kwic, 3)
257 | ```
258 |
--------------------------------------------------------------------------------
/files/Text_Analysis_in_R.md:
--------------------------------------------------------------------------------
1 | Text Analysis in R: online appendix
2 | ================
3 | Kasper Welbers, Wouter van Atteveldt & Kenneth Benoit
4 | 2023
5 |
6 | ## About this document
7 |
8 | This is the online appendix for [Welbers, van Atteveldt & Benoit
9 | (2017)](http://www.tandfonline.com/doi/full/10.1080/19312458.2017.1387238),
10 | which contains the example code presented in the article. The code in
11 | this appendix will be kept up to date with changes in the packages
12 | used, and may therefore differ slightly from the code in the article.
13 |
14 |
15 |
16 | ### Required packages
17 |
18 | The following packages need to be installed to run all the code
19 | examples. Note that each package only needs to be installed
20 | once.
21 |
22 | ``` r
23 | ################# PACKAGE # SECTION IN ARTICLE
24 | install.packages("readtext") # data preparation
25 | install.packages("stringi") # data preparation
26 |
27 | install.packages("quanteda") # data preparation and analysis
28 | install.packages('quanteda.textmodels')
29 | install.packages('quanteda.textstats')
30 | install.packages('quanteda.textplots')
31 |
32 | install.packages("topicmodels") # analysis
33 |
34 | install.packages("spacyr") # advanced topics
35 | install.packages("corpustools") # advanced topics
36 | ```
37 |
38 | ## Data Preparation
39 |
40 | ### Importing text
41 |
42 | ``` r
43 | library(readtext)
44 | # url to Inaugural Address demo data that is provided by the readtext package
45 | filepath <- "https://raw.githubusercontent.com/kbenoit/readtext/master/inst/extdata/csv/inaugCorpus.csv"
46 |
47 | rt <- readtext(filepath, text_field = "texts")
48 | rt
49 | ## readtext object consisting of 5 documents and 3 docvars.
50 | ## $text
51 | ## [1] "# A data frame: 5 × 5"
52 | ## [2] " doc_id text Year President FirstName"
53 | ## [3] " "
54 | ## [4] "1 inaugCorpus.csv.1 \"\\\"Fellow-Cit\\\"...\" 1789 Washington George "
55 | ## [5] "2 inaugCorpus.csv.2 \"\\\"Fellow cit\\\"...\" 1793 Washington George "
56 | ## [6] "3 inaugCorpus.csv.3 \"\\\"When it wa\\\"...\" 1797 Adams John "
57 | ## [7] "4 inaugCorpus.csv.4 \"\\\"Friends an\\\"...\" 1801 Jefferson Thomas "
58 | ## [8] "5 inaugCorpus.csv.5 \"\\\"Proceeding\\\"...\" 1805 Jefferson Thomas "
59 | ##
60 | ## $summary
61 | ## $summary[[1]]
62 | ## NULL
63 | ##
64 | ##
65 | ## attr(,"class")
66 | ## [1] "trunc_mat"
67 | ```
68 |
69 | ### String Operations
70 |
71 | ``` r
72 | library(stringi)
73 | x <- c("The first string", ' The second string')
74 |
75 | x <- stri_replace_all(x, "", regex = "<.*?>") # remove html tags
76 | x <- stri_trim(x) # strip surrounding whitespace
77 | x <- stri_trans_tolower(x) # transform to lower case
78 | x
79 | ## [1] "the first string" "the second string"
80 | ```
81 |
82 | ### Preprocessing
83 |
84 | #### Tokenization
85 |
86 | ``` r
87 | library(quanteda)
88 | ## Package version: 3.3.0
89 | ## Unicode version: 14.0
90 | ## ICU version: 70.1
91 | ## Parallel computing: 8 of 8 threads used.
92 | ## See https://quanteda.io for tutorials and examples.
93 | ##
94 | ## Attaching package: 'quanteda'
95 | ## The following objects are masked from 'package:readtext':
96 | ##
97 | ## docnames, docvars, texts
98 |
99 | text <- "An example of preprocessing techniques"
100 | toks <- tokens(text) # tokenize into unigrams
101 | toks
102 | ## Tokens consisting of 1 document.
103 | ## text1 :
104 | ## [1] "An" "example" "of" "preprocessing"
105 | ## [5] "techniques"
106 | ```
107 |
108 | #### Normalization: lowercasing and stemming
109 |
110 | ``` r
111 | toks <- tokens_tolower(toks)
112 | toks <- tokens_wordstem(toks)
113 | toks
114 | ## Tokens consisting of 1 document.
115 | ## text1 :
116 | ## [1] "an" "exampl" "of" "preprocess" "techniqu"
117 | ```
118 |
119 | #### Removing stopwords
120 |
121 | ``` r
122 | sw <- stopwords("english") # get character vector of stopwords
123 | head(sw) # show the first 6 stopwords
124 | ## [1] "i" "me" "my" "myself" "we" "our"
125 | tokens_remove(toks, sw)
126 | ## Tokens consisting of 1 document.
127 | ## text1 :
128 | ## [1] "exampl" "preprocess" "techniqu"
129 | ```
130 |
131 | ### Document-Term Matrix
132 |
133 | Since the publication of the Text Analysis in R paper, the quanteda
134 | package has gone through several updates. One important change is that
135 | many operations have been split into separate steps. This works nicely
136 | with the now-common pipe notation (`|>`, or `%>%` in the
137 | tidyverse).
138 |
139 | Previously, we created a DFM with a single do-it-all function. Now, we
140 | run our data through a pipeline of functions that each perform a single
141 | step.
142 |
143 | ``` r
144 | text <- c(d1 = "An example of preprocessing techniques",
145 | d2 = "An additional example",
146 | d3 = "A third example")
147 |
148 | dtm <- text |>
149 | corpus() |> ## create quanteda corpus
150 | tokens() |> ## tokenize the corpus
151 | dfm() |> ## structure tokens as Document Term Matrix
152 | dfm_tolower() |> ## preprocessing: lowercase
153 | dfm_wordstem() |> ## preprocessing: stemming
154 | dfm_remove(stopwords('english')) ## preprocessing: remove English stopwords
155 |
156 | dtm
157 | ## Document-feature matrix of: 3 documents, 5 features (53.33% sparse) and 0 docvars.
158 | ## features
159 | ## docs exampl preprocess techniqu addit third
160 | ## d1 1 1 1 0 0
161 | ## d2 1 0 0 1 0
162 | ## d3 1 0 0 0 1
163 | ```
164 |
165 | Create the DTM using the inaugural speeches (rt) that we read into R
166 | above.
167 |
168 | ``` r
169 | dtm <- rt |>
170 | corpus() |>
171 | tokens() |>
172 | dfm() |>
173 | dfm_tolower() |>
174 | dfm_wordstem() |>
175 | dfm_remove(stopwords('english'))
176 |
177 | dtm
178 | ## Document-feature matrix of: 5 documents, 1,422 features (67.45% sparse) and 3 docvars.
179 | ## features
180 | ## docs fellow-citizen senat hous repres : among vicissitud incid
181 | ## inaugCorpus.csv.1 1 1 2 2 1 1 1 1
182 | ## inaugCorpus.csv.2 0 0 0 0 1 0 0 0
183 | ## inaugCorpus.csv.3 3 1 3 3 0 4 0 0
184 | ## inaugCorpus.csv.4 2 0 0 1 1 1 0 0
185 | ## inaugCorpus.csv.5 0 0 0 0 0 7 0 0
186 | ## features
187 | ## docs life event
188 | ## inaugCorpus.csv.1 1 2
189 | ## inaugCorpus.csv.2 0 0
190 | ## inaugCorpus.csv.3 2 0
191 | ## inaugCorpus.csv.4 1 0
192 | ## inaugCorpus.csv.5 2 1
193 | ## [ reached max_nfeat ... 1,412 more features ]
194 | ```
195 |
196 | ### Filtering and weighting
197 |
198 | ``` r
199 | doc_freq <- docfreq(dtm) # document frequency per term (column)
200 | dtm <- dtm[, doc_freq >= 2] # select terms with doc_freq >= 2
201 | dtm <- dfm_tfidf(dtm) # weight the features using tf-idf
202 | head(dtm)
203 | ## Document-feature matrix of: 5 documents, 530 features (46.34% sparse) and 3 docvars.
204 | ## features
205 | ## docs fellow-citizen senat hous repres :
206 | ## inaugCorpus.csv.1 0.2218487 0.39794 0.79588 0.4436975 0.2218487
207 | ## inaugCorpus.csv.2 0 0 0 0 0.2218487
208 | ## inaugCorpus.csv.3 0.6655462 0.39794 1.19382 0.6655462 0
209 | ## inaugCorpus.csv.4 0.4436975 0 0 0.2218487 0.2218487
210 | ## inaugCorpus.csv.5 0 0 0 0 0
211 | ## features
212 | ## docs among life event greater anxieti
213 | ## inaugCorpus.csv.1 0.09691001 0.09691001 0.79588 0.39794 0.39794
214 | ## inaugCorpus.csv.2 0 0 0 0 0
215 | ## inaugCorpus.csv.3 0.38764005 0.19382003 0 0 0.39794
216 | ## inaugCorpus.csv.4 0.09691001 0.09691001 0 0.39794 0
217 | ## inaugCorpus.csv.5 0.67837009 0.19382003 0.39794 0 0
218 | ## [ reached max_nfeat ... 520 more features ]
219 | ```
220 |
221 | ## Analysis
222 |
223 | Prepare DTM for analysis examples.
224 |
225 | ``` r
226 | dtm <- data_corpus_inaugural |>
227 | corpus() |>
228 | tokens(remove_punct = T) |>
229 | dfm() |>
230 | dfm_tolower() |>
231 | dfm_wordstem() |>
232 | dfm_remove(stopwords('english'))
233 |
234 | dtm
235 | ## Document-feature matrix of: 59 documents, 5,468 features (89.25% sparse) and 4 docvars.
236 | ## features
237 | ## docs fellow-citizen senat hous repres among vicissitud incid life
238 | ## 1789-Washington 1 1 2 2 1 1 1 1
239 | ## 1793-Washington 0 0 0 0 0 0 0 0
240 | ## 1797-Adams 3 1 3 3 4 0 0 2
241 | ## 1801-Jefferson 2 0 0 1 1 0 0 1
242 | ## 1805-Jefferson 0 0 0 0 7 0 0 2
243 | ## 1809-Madison 1 0 0 1 0 1 0 1
244 | ## features
245 | ## docs event fill
246 | ## 1789-Washington 2 1
247 | ## 1793-Washington 0 0
248 | ## 1797-Adams 0 0
249 | ## 1801-Jefferson 0 0
250 | ## 1805-Jefferson 1 0
251 | ## 1809-Madison 0 1
252 | ## [ reached max_ndoc ... 53 more documents, reached max_nfeat ... 5,458 more features ]
253 | ```
254 |
255 | ### Counting and Dictionary
256 |
257 | ``` r
258 | myDict <- dictionary(list(terror = c("terror*"),
259 | economy = c("job*", "business*", "econom*")))
260 | dict_dtm <- dfm_lookup(dtm, myDict, nomatch = "_unmatched")
261 | tail(dict_dtm)
262 | ## Document-feature matrix of: 6 documents, 3 features (16.67% sparse) and 4 docvars.
263 | ## features
264 | ## docs terror economy _unmatched
265 | ## 2001-Bush 0 2 796
266 | ## 2005-Bush 0 1 1056
267 | ## 2009-Obama 1 7 1192
268 | ## 2013-Obama 0 6 1052
269 | ## 2017-Trump 1 5 723
270 | ## 2021-Biden 1 4 1146
271 | ```
272 |
273 | ### Supervised Machine Learning
274 |
275 | ``` r
276 | library(quanteda)
277 | library(quanteda.textmodels)
278 | ```
279 |
280 | ``` r
281 | set.seed(2)
282 | # create a document variable indicating pre or post war
283 | docvars(dtm, "is_prewar") <- docvars(dtm, "Year") < 1945
284 |
285 | # sample 40 documents for the training set and use remaining (19) for testing
286 | train_dtm <- dfm_sample(dtm, size = 40)
287 | test_dtm <- dtm[setdiff(docnames(dtm), docnames(train_dtm)), ]
288 |
289 | # fit a Naive Bayes multinomial model and use it to predict the test data
290 | nb_model <- textmodel_nb(train_dtm, y = docvars(train_dtm, "is_prewar"))
291 | pred_nb <- predict(nb_model, newdata = test_dtm)
292 |
293 | # compare prediction (rows) and actual is_prewar value (columns) in a table
294 | table(prediction = pred_nb, is_prewar = docvars(test_dtm, "is_prewar"))
295 | ## is_prewar
296 | ## prediction FALSE TRUE
297 | ## FALSE 6 0
298 | ## TRUE 0 13
299 | ```
300 |
301 | ### Unsupervised Machine Learning
302 |
303 | ``` r
304 | library(topicmodels)
305 |
306 | texts <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
307 |
308 | par_dtm <- texts |> corpus() |> tokens(remove_punct = T) |>
309 | dfm() |> dfm_tolower() |> dfm_wordstem() |>
310 | dfm_remove(stopwords('english')) |> dfm_trim(min_termfreq = 5) |>
311 | convert(to = 'topicmodels')
312 |
313 | set.seed(1)
314 | lda_model <- topicmodels::LDA(par_dtm, method = "Gibbs", k = 5)
315 | terms(lda_model, 5)
316 | ## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
317 | ## [1,] "nation" "state" "great" "us" "shall"
318 | ## [2,] "peopl" "govern" "govern" "american" "peopl"
319 | ## [3,] "can" "power" "war" "new" "duti"
320 | ## [4,] "must" "constitut" "countri" "america" "countri"
321 | ## [5,] "everi" "ani" "secur" "world" "citizen"
322 | ```
323 |
324 | ### Statistics
325 |
326 | ``` r
327 | library(quanteda.textstats)
328 | library(quanteda.textplots)
329 |
330 | # create DTM that contains Trump and Obama speeches
331 | dtm_pres <- data_corpus_inaugural |>
332 | corpus_subset(President %in% c('Obama','Trump')) |>
333 | tokens(remove_punct = T) |>
334 | dfm() |>
335 | dfm_remove(stopwords('english'))
336 |
337 | # compare target (in this case Trump) to rest of DTM (in this case only Obama).
338 | dtm_pres |>
339 | dfm_group(President) |>
340 | textstat_keyness(target = "Trump") |>
341 | textplot_keyness()
342 | ```
343 |
344 | 
345 |
346 | ## Advanced Topics
347 |
348 | ### Advanced NLP
349 |
350 | ``` r
351 | library(spacyr)
352 | spacy_install()
353 | spacy_initialize()
354 | d <- spacy_parse("Bob Smith gave Alice his login information.", dependency = TRUE)
355 | d[, -c(1,2)]
356 | ```
357 |
358 | ### Word Positions and Syntax
359 |
360 | ``` r
361 | text <- "an example of preprocessing techniques"
362 |
363 | text |>
364 | tokens() |>
365 | tokens_ngrams(n=3, skip=0:1)
366 | ## Tokens consisting of 1 document.
367 | ## text1 :
368 | ## [1] "an_example_of" "an_example_preprocessing"
369 | ## [3] "an_of_preprocessing" "an_of_techniques"
370 | ## [5] "example_of_preprocessing" "example_of_techniques"
371 | ## [7] "example_preprocessing_techniques" "of_preprocessing_techniques"
372 | ```
373 |
374 | ``` r
375 | library(corpustools)
376 |
377 | tc <- create_tcorpus(sotu_texts, doc_column = "id")
378 | hits <- search_features(tc, '"freedom americ*"~5')
379 | kwic <- get_kwic(tc, hits, ntokens = 3)
380 | head(kwic$kwic, 3)
381 | ## [1] "...making progress toward <freedom> will find <America> is their friend..."
382 | ## [2] "...friends, and <freedom> in Iraq will make <America> safer for generations..."
383 | ## [3] "...men who despise <freedom>, despise <America>, and aim..."
384 | ```
385 |
--------------------------------------------------------------------------------