├── .DS_Store ├── .gitignore ├── Homeworks ├── .DS_Store ├── HW1 │ ├── .DS_Store │ ├── HW1.pdf │ └── data │ │ └── mystery_excerpt.rds ├── HW2 │ ├── .DS_Store │ ├── HW2.pdf │ └── data │ │ ├── negative-words.txt │ │ ├── positive-words.txt │ │ └── yelp.csv └── HW3 │ └── HW3.pdf ├── Lectures-AS ├── 1-lecture0_intro.pdf ├── 10-unsupervised_II.pdf ├── 11-topic_models.pdf ├── 12-beyond_LDA.pdf ├── 13-text_12_special_topics_I.pdf ├── 2-Text_1_RepresentingText.pdf ├── 3-descriptive_inference_I.pdf ├── 4-descriptive_inference_2.pdf ├── 5-supervised_I.pdf ├── 6-supervised_II.pdf ├── 7-supervised_III.pdf ├── 8-supervised_IV.pdf └── 9-unsupervised_I.pdf ├── README.md ├── TAD_syllabus_Jan_2019.pdf ├── W10_04_18_19 ├── Session 10 - Unsupervised Learning IIa.R ├── Session-10.R └── blm_samp.csv ├── W11_04_25_19 └── Session 11.R ├── W12_04_02_19 ├── MTurk Fees.pdf ├── Session 12 - Special Topics.R └── WE-Validation.pdf ├── W1_01_31_19 ├── Session 1 - Introducing R.R ├── national_clinton_trump_6_20_2016.csv ├── plot1.pdf └── table1.csv ├── W2_02_07_19 └── Session 2 - Introducing Quanteda.R ├── W3_02_14_19 └── Session 3 - Descriptive Inference.R ├── W4_02_21_19 └── Session 4 - Descriptive Inference II.R ├── W5_02_28_19 ├── LaverGarry.cat ├── RID.CAT ├── Session 5 - Supervised Learning I.R └── conservative_manifestos │ ├── Con1918.txt │ ├── Con1922.txt │ ├── Con1923.txt │ ├── Con1924.txt │ ├── Con1929.txt │ ├── Con1931.txt │ ├── Con1935.txt │ ├── Con1945.txt │ ├── Con1950.txt │ ├── Con1951.txt │ ├── Con1955.txt │ ├── Con1959.txt │ ├── Con1964.txt │ ├── Con1966.txt │ ├── Con1970.txt │ ├── Con1974a.txt │ ├── Con1974b.txt │ ├── Con1979.txt │ ├── Con1983.txt │ ├── Con1987.txt │ ├── Con1992.txt │ ├── Con1997.txt │ └── Con2001.txt ├── W6_03_07_19 ├── Session 6 - Supervised Learning II.R ├── cons_labour_manifestos │ ├── Con1918.txt │ ├── Con1922.txt │ ├── Con1923.txt │ ├── Con1924.txt │ ├── Con1929.txt │ ├── Con1931.txt │ ├── Con1935.txt │ ├── Con1945.txt │ ├── Con1950.txt │ ├── Con1951.txt │ 
├── Con1955.txt │ ├── Con1959.txt │ ├── Con1964.txt │ ├── Con1966.txt │ ├── Con1970.txt │ ├── Con1974a.txt │ ├── Con1974b.txt │ ├── Con1979.txt │ ├── Con1983.txt │ ├── Con1987.txt │ ├── Con1992.txt │ ├── Con1997.txt │ ├── Con2001.txt │ ├── Lab1918.txt │ ├── Lab1922.txt │ ├── Lab1923.txt │ ├── Lab1924.txt │ ├── Lab1929.txt │ ├── Lab1931.txt │ ├── Lab1935.txt │ ├── Lab1945.txt │ ├── Lab1950.txt │ ├── Lab1951.txt │ ├── Lab1955.txt │ ├── Lab1959.txt │ ├── Lab1964.txt │ ├── Lab1966.txt │ ├── Lab1970.txt │ ├── Lab1974a.txt │ ├── Lab1974b.txt │ ├── Lab1979.txt │ ├── Lab1983.txt │ ├── Lab1987.txt │ ├── Lab1992.txt │ ├── Lab1997.txt │ └── Lab2001.txt └── news_data.rds ├── W7_03_14_19 ├── Session 7 - Supervised Learning III.R ├── bullying.csv └── nyt-fb.csv ├── W8_03_28_19 ├── Session 8 - Supervised Learning IV.R ├── r8-test-all-terms.txt └── r8-train-all-terms.txt └── W9_04_11_19 └── Session 9 - Unsupervised Learning I.R /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see 
https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | -------------------------------------------------------------------------------- /Homeworks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/.DS_Store -------------------------------------------------------------------------------- /Homeworks/HW1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW1/.DS_Store -------------------------------------------------------------------------------- /Homeworks/HW1/HW1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW1/HW1.pdf -------------------------------------------------------------------------------- /Homeworks/HW1/data/mystery_excerpt.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW1/data/mystery_excerpt.rds -------------------------------------------------------------------------------- /Homeworks/HW2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW2/.DS_Store -------------------------------------------------------------------------------- /Homeworks/HW2/HW2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW2/HW2.pdf -------------------------------------------------------------------------------- /Homeworks/HW3/HW3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Homeworks/HW3/HW3.pdf -------------------------------------------------------------------------------- /Lectures-AS/1-lecture0_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/1-lecture0_intro.pdf -------------------------------------------------------------------------------- /Lectures-AS/10-unsupervised_II.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/10-unsupervised_II.pdf -------------------------------------------------------------------------------- /Lectures-AS/11-topic_models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/11-topic_models.pdf -------------------------------------------------------------------------------- /Lectures-AS/12-beyond_LDA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/12-beyond_LDA.pdf -------------------------------------------------------------------------------- /Lectures-AS/13-text_12_special_topics_I.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/13-text_12_special_topics_I.pdf -------------------------------------------------------------------------------- /Lectures-AS/2-Text_1_RepresentingText.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/2-Text_1_RepresentingText.pdf -------------------------------------------------------------------------------- /Lectures-AS/3-descriptive_inference_I.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/3-descriptive_inference_I.pdf -------------------------------------------------------------------------------- /Lectures-AS/4-descriptive_inference_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/4-descriptive_inference_2.pdf -------------------------------------------------------------------------------- /Lectures-AS/5-supervised_I.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/5-supervised_I.pdf -------------------------------------------------------------------------------- /Lectures-AS/6-supervised_II.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/6-supervised_II.pdf -------------------------------------------------------------------------------- /Lectures-AS/7-supervised_III.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/7-supervised_III.pdf -------------------------------------------------------------------------------- /Lectures-AS/8-supervised_IV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/8-supervised_IV.pdf -------------------------------------------------------------------------------- /Lectures-AS/9-unsupervised_I.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/Lectures-AS/9-unsupervised_I.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### DS-GA 1015 Text-as-Data Spring 2019 2 | 3 | Materials for the lab component of DS-GA 1015 Text-as-Data (Spring 2019) can be found here. 4 | 5 | Lab is on Thursdays from 2-2:50 pm in room 110, 60 5th Ave (Center for Data Science). 6 | 7 | Some files for future labs may be online, but please check back at the beginning of each lab for the most updated version of that day's materials! 8 | 9 | #### How to get files on your machine 10 | 11 | GitHub users: [Fork](https://help.github.com/articles/fork-a-repo/) this repo. 
12 | 13 | Non-GitHub users: Go to "Clone or download" > "Download ZIP" to download the files to your machine. 14 | 15 | #### Syncing to update files (pull changes) 16 | 17 | [Using terminal](https://help.github.com/articles/syncing-a-fork/) 18 | 19 | [Using browser](https://github.com/KirstieJane/STEMMRoleModels/wiki/Syncing-your-fork-to-the-original-repository-via-the-browser) 20 | 21 | #### Software 22 | 23 | Please make sure you have: 24 | 25 | - Latest version of R 26 | 27 | - Latest version of RStudio 28 | -------------------------------------------------------------------------------- /TAD_syllabus_Jan_2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/TAD_syllabus_Jan_2019.pdf -------------------------------------------------------------------------------- /W10_04_18_19/Session 10 - Unsupervised Learning IIa.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodríguez 2 | # Course: Text as Data 3 | # Date: 04/18/2019 4 | # Lab adapted from: Leslie Huang. 5 | # additional resources: 6 | # original paper introducing LDA: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf 7 | # https://www.tidytextmining.com/topicmodeling.html 8 | # https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05 9 | # most recent addition to topic modeling methods: https://multithreaded.stitchfix.com/blog/2016/05/27/lda2vec/#topic=38&lambda=1&term= 10 | # human validation of topic models: https://dl.acm.org/citation.cfm?id=2984126 11 | 12 | # basic intuition: 13 | # a. documents are represented as random mixtures over latent topics. 14 | # b. a topic is characterized by a distribution over words. 
15 | # we now propose a GENERATIVE MODEL OF THE DATA 16 | # want to maximize the probability of a corpus as a function of our parameters (of the dirichlets) and latent variables (doc topic mixtures and topic word distributions). 17 | 18 | rm(list = ls()) 19 | 20 | setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W10_04_18_19/") 21 | 22 | set.seed(1234) 23 | 24 | # Check for these packages, install them if you don't have them 25 | # install.packages("tidytext") 26 | # install.packages("topicmodels") 27 | # install.packages("ldatuning") 28 | # install.packages("stringi") 29 | # install.packages("rjson") 30 | 31 | libraries <- c("ldatuning", "topicmodels", "ggplot2", "dplyr", "rjson", "quanteda", "lubridate", "parallel", "doParallel", "tidytext", "stringi", "tidyr") 32 | lapply(libraries, require, character.only = TRUE) 33 | 34 | ## 1 Preprocessing 35 | 36 | # Load data 37 | blm_tweets <- read.csv("blm_samp.csv", stringsAsFactors = FALSE) 38 | 39 | # Create date vectors 40 | blm_tweets$datetime <- as.POSIXct(strptime(blm_tweets$created_at, "%a %b %d %T %z %Y",tz = "GMT")) # full date/timestamp 41 | blm_tweets$date <- mdy(paste(month(blm_tweets$datetime), day(blm_tweets$datetime), year(blm_tweets$datetime), sep = "-")) # date only 42 | 43 | # Collapse tweets so we are looking at the total tweets at the day level 44 | blm_tweets_sum <- blm_tweets %>% group_by(date) %>% summarise(text = paste(text, collapse = " ")) 45 | 46 | # Transliterate non-ASCII characters to ASCII (Latin-ASCII transform) 47 | blm_tweets_sum$text <- stringi::stri_trans_general(blm_tweets_sum$text, "latin-ascii") 48 | 49 | # Removes solitary letters (explicit A-Za-z: the range A-z would also match the punctuation between Z and a) 50 | blm_tweets_sum$text <- gsub(" [A-Za-z] ", " ", blm_tweets_sum$text) 51 | 52 | # As always we begin with a DFM. 
53 | # Create DFM 54 | blm_dfm <- dfm(blm_tweets_sum$text, stem = FALSE, remove_punct = TRUE, tolower = TRUE, remove_twitter = TRUE, remove_numbers = TRUE, remove = c(stopwords("english"), "http","https","rt", "t.co")) 55 | 56 | # Topic models 57 | # note: pLSA (a probabilistic take on LSA) is a precursor to LDA 58 | ## 2 Selecting K 59 | 60 | # Identify an appropriate number of topics (FYI, this function takes a while) 61 | k_optimize_blm <- FindTopicsNumber( 62 | blm_dfm, 63 | topics = seq(from = 2, to = 30, by = 1), 64 | metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"), 65 | method = "Gibbs", 66 | control = list(seed = 2017), 67 | mc.cores = detectCores(), # to use all cores available 68 | verbose = TRUE 69 | ) 70 | 71 | FindTopicsNumber_plot(k_optimize_blm) 72 | 73 | # Where do these metrics come from? 74 | 75 | # Go here for the citations (and another tutorial) 76 | # https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html 77 | 78 | # What should you consider when choosing the number of topics you use in a topic model? 79 | 80 | # What does robustness mean here? 81 | 82 | ## 3 Visualizing Word weights 83 | 84 | # Set number of topics 85 | k <- 19 86 | 87 | # Fit the topic model with the chosen k 88 | system.time( 89 | blm_tm <- LDA(blm_dfm, k = k, method = "Gibbs", control = list(seed = 1234))) 90 | 91 | # Other parameters that we do not use here (because they increase the time the model takes) can be passed to the control parameter 92 | ?`LDAcontrol-class` 93 | # iter : num iterations 94 | # thin : every thin iteration is returned for iter iterations 95 | # burnin : number of initial iterations discarded 96 | 97 | ## Letter soup 98 | 99 | # gamma = posterior document distribution over topics 100 | # what are the dimensions of gamma? 101 | dim(blm_tm@gamma) 102 | blm_tm@gamma[1:5,1:5] 103 | rowSums(blm_tm@gamma) # each row sums to? 104 | 105 | # beta = topic distribution over words 106 | dim(blm_dfm) # how many features do we have? 
107 | dim(blm_tm@beta) 108 | blm_tm@beta[1:5,1:5] 109 | sum(blm_tm@beta[1,]) # each row sums to? 110 | sum(exp(blm_tm@beta[10,])) # each row sums to? 111 | 112 | # Per topic per word probabilities matrix (beta) 113 | blm_topics <- tidy(blm_tm, matrix = "beta") 114 | head(blm_topics) 115 | 116 | # Side note: You can pass objects between tidytext() and topicmodels() functions because tidytext() implements topic models from topicmodels() 117 | 118 | # Generates a df of top terms 119 | blm_top_terms <- blm_topics %>% 120 | group_by(topic) %>% 121 | top_n(10, beta) %>% 122 | ungroup() %>% 123 | arrange(topic, -beta) 124 | 125 | head(blm_top_terms) 126 | 127 | # Creates a plot of the weights and terms by topic 128 | blm_top_terms %>% 129 | mutate(term = reorder(term, beta)) %>% 130 | ggplot(aes(term, beta, fill = factor(topic))) + 131 | geom_col(show.legend = FALSE) + 132 | facet_wrap(~ topic, scales = "free") + 133 | coord_flip() 134 | 135 | # Creates a plot of features with greatest difference in word probabilities between two topics 136 | blm_topics %>% 137 | mutate(topic = paste0("topic", topic)) %>% 138 | filter(topic %in% c("topic1", "topic2")) %>% 139 | spread(topic, beta) %>% 140 | filter(topic1 > .001 | topic2 > .001) %>% 141 | mutate(log_ratio = log2(topic2 / topic1)) %>% 142 | arrange(-abs(log_ratio)) %>% 143 | slice(c(1:10,(nrow(.)-9):nrow(.))) %>% 144 | arrange(-log_ratio) %>% 145 | mutate(term = factor(term, levels = unique(term))) %>% 146 | ggplot(aes(as.factor(term), log_ratio)) + 147 | geom_col(show.legend = FALSE) + 148 | xlab("Terms") + ylab("Log-Ratio") + 149 | coord_flip() 150 | 151 | ## 4 Visualizing topic trends over time 152 | 153 | # Store the results of the mixture of documents over topics 154 | doc_topics <- blm_tm@gamma 155 | 156 | # Store the results of words over topics 157 | #words_topics <- blm_tm@beta 158 | 159 | # Transpose the data so that the days are columns 160 | doc_topics <- t(doc_topics) 161 | dim(doc_topics) 162 | 
doc_topics[1:5,1:5] 163 | 164 | # Arrange topics 165 | # Find the top topic per column (day) 166 | max <- apply(doc_topics, 2, which.max) 167 | 168 | # Write a function that finds the second max (returns the index/indices of the second-largest value of x) 169 | which.max2 <- function(x){ 170 | which(x == sort(x, decreasing = TRUE)[2]) # second-largest value of x itself; no dependence on the global k 171 | } 172 | 173 | max2 <- apply(doc_topics, 2, which.max2) 174 | max2 <- sapply(max2, max) # on ties keep the highest topic index 175 | 176 | # Coding police shooting events 177 | victim <- c("Freddie Gray", "Sandra Bland") 178 | shootings <- mdy(c("04/12/2015","7/13/2015")) 179 | 180 | # Combine data 181 | top2 <- data.frame(top_topic = max, second_topic = max2, date = ymd(blm_tweets_sum$date)) 182 | 183 | # Plot 184 | blm_plot <- ggplot(top2, aes(x=date, y=top_topic, pch="First")) 185 | 186 | blm_plot + geom_point(aes(x=date, y=second_topic, pch="Second") ) +theme_bw() + 187 | ylab("Topic Number") + ggtitle("BLM-Related Tweets from 2014 to 2016 over Topics") + geom_point() + xlab(NULL) + 188 | geom_vline(xintercept=as.numeric(shootings[1]), color = "blue", linetype=4) + # Freddie Gray (Topic) 189 | geom_vline(xintercept=as.numeric(shootings[2]), color = "black", linetype=4) + # Sandra Bland 190 | scale_shape_manual(values=c(18, 1), name = "Topic Rank") 191 | 192 | #---------------------------------- 193 | # Extra: LSA as topic modeling 194 | #---------------------------------- 195 | # LSA decomposes a DFM into the product of three matrices: 196 | # a. truncated term matrix from term vector matrix T (left singular vectors from the SVD of the original matrix) -> can be thought of as a topic-term-matrix 197 | # b. truncated document matrix from document vector matrix D (right singular vectors from the SVD of the original matrix) -> can be thought of as a document-topic-matrix 198 | # c. 
singular values: Matrix of scaling values to ensure that multiplying these matrices reconstructs TDM 199 | 200 | # set working directory 201 | setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/") 202 | library(lsa) 203 | #---------------------------------------- 204 | # 1. Load, clean and inspect data --- 205 | #---------------------------------------- 206 | news_data <- readRDS("news_data.rds") 207 | table(news_data$category) 208 | 209 | # let's work with a single category (POLITICS) 210 | set.seed(1984) 211 | news_samp <- news_data %>% 212 | filter(category %in% c("POLITICS")) %>% 213 | group_by(category) %>% 214 | sample_n(1000) %>% # sample 1000 headlines to reduce computation time (for lab purposes) 215 | ungroup() %>% 216 | select(headline, category) %>% 217 | setNames(c("text", "class")) 218 | 219 | # get a sense of how the text looks 220 | dim(news_samp) 221 | head(news_samp$text[news_samp$class == "POLITICS"]) 222 | 223 | # some pre-processing (the rest we'll let dfm do) 224 | news_samp$text <- gsub(pattern = "'", "", news_samp$text) # remove apostrophes 225 | 226 | #---------------------------------------- 227 | # 2. Prepare Data --- 228 | #---------------------------------------- 229 | 230 | # create document feature matrix 231 | news_dfm <- dfm(news_samp$text, stem = TRUE, remove_punct = TRUE, remove = stopwords("english")) 232 | news_fdm <- convert(news_dfm, to = "lsa") 233 | 234 | #---------------------------------------- 235 | # 3. Estimate LSA --- 236 | #---------------------------------------- 237 | news_lsa <- lsa(news_fdm, dims = 20) 238 | 239 | # document-topic matrix 240 | dim(news_lsa$dk) 241 | news_lsa$dk[1:5,1:5] 242 | 243 | # topic-term matrix 244 | dim(news_fdm) 245 | dim(news_lsa$tk) 246 | news_lsa$tk[1:5,1:5] 247 | 248 | # Per topic per word weights 249 | library(reshape2) 250 | lsa_topics <- news_lsa$tk %>% reshape2:::melt.matrix(.) 
%>% setNames(c("term", "topic", "weight")) 251 | head(lsa_topics) 252 | 253 | # Generates a df of top terms 254 | lsa_top_terms <- lsa_topics %>% 255 | group_by(topic) %>% 256 | top_n(10, weight) %>% 257 | ungroup() %>% 258 | arrange(topic, -weight) 259 | 260 | # Creates a plot of the weights and terms by topic 261 | lsa_top_terms %>% 262 | mutate(term = reorder(term, weight)) %>% 263 | ggplot(aes(term, weight, fill = factor(topic))) + 264 | geom_col(show.legend = FALSE) + 265 | facet_wrap(~ topic, scales = "free") + 266 | coord_flip() 267 | 268 | 269 | 270 | 271 | -------------------------------------------------------------------------------- /W10_04_18_19/Session-10.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodriguez 2 | # Course: Text as Data 3 | # Date: 4/25/2019 4 | 5 | # Supervised vs. Unsupervised 6 | # topic-models: excellent for exploration 7 | # supervised exploration of topics: classification on text snippets with keywords (e.g. 
Venezuela project) 8 | # semi-supervised approaches: https://github.com/gregversteeg/corex_topic 9 | # see: https://medium.com/pew-research-center-decoded/overcoming-the-limitations-of-topic-models-with-a-semi-supervised-approach-b947374e0455 10 | 11 | # ----------------------------------------------- 12 | # Structural Topic Models --- 13 | # ----------------------------------------------- 14 | rm(list = ls()) 15 | libraries <- c("topicmodels", "dplyr", "stm", "quanteda") 16 | lapply(libraries, require, character.only = T) 17 | setwd("/Users/pedrorodriguez/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W10_04_18_19/") # set working directory 18 | 19 | # Loading data: Political blogs from the 2008 election on a conservative-liberal dimension 20 | data(poliblog5k) 21 | head(poliblog5k.meta) 22 | head(poliblog5k.voc) 23 | 24 | # Fits an STM model with 3 topics 25 | system.time( 26 | blog_stm <- stm(poliblog5k.docs, poliblog5k.voc, 3, prevalence = ~rating + s(day), data = poliblog5k.meta)) 27 | 28 | # A plot that summarizes the topics by what words occur most commonly in them 29 | plot(blog_stm, type = "labels") 30 | 31 | # A summary plot of the topics that ranks them by their average proportion in the corpus 32 | plot(blog_stm, type = "summary") 33 | 34 | # A visualization of what words are shared and distinctive to two topics 35 | plot(blog_stm, type="perspectives", topics = c(1,2)) 36 | 37 | # Estimates a regression with topics as the dependent variable and metadata as the independent variables 38 | # s() is a wrapper for bs() from the splines package 39 | # A spline of degree D is a function formed by connecting polynomial segments of degree D 40 | prep <- estimateEffect(1:3 ~ rating + s(day) , blog_stm, meta = poliblog5k.meta) 41 | 42 | # Plots the distribution of topics over time 43 | plot(prep, "day", blog_stm, topics = c(1,2), 44 | method = "continuous", xaxt = "n", xlab = "Date") 45 | 46 | # Plots the Difference in coverage of the topics according to liberal or 
conservative ideology 47 | plot(prep, "rating", model = blog_stm, 48 | method = "difference", cov.value1 = "Conservative", cov.value2 = "Liberal") 49 | 50 | # ----------------------------------------------- 51 | # Word Embeddings --- 52 | # ----------------------------------------------- 53 | # Are word embeddings supervised or unsupervised? 54 | # KEY DIFFERENCE between embeddings and other distributional semantic models we've seen: how we define context. 55 | # Context in the case of word embeddings is defined by a window (usually symmetric) around the target word. 56 | # GloVe vs. Word2Vec 57 | # cool/intuitive intro to W2V: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/ 58 | 59 | library(text2vec) 60 | 61 | # choice parameters 62 | WINDOW_SIZE <- 6 63 | DIM <- 300 64 | ITERS <- 10 65 | MIN_COUNT <- 10 66 | 67 | # load data 68 | corpus <- readRDS("~/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/data/corpora.rds") 69 | text <- corpus[["R"]] 70 | rm(corpus) 71 | 72 | # shuffle text 73 | set.seed(42L) 74 | text <- sample(text) 75 | 76 | # ================================ 77 | # create vocab 78 | # ================================ 79 | tokens <- space_tokenizer(text) 80 | rm(text) 81 | it <- itoken(tokens, progressbar = FALSE) 82 | vocab <- create_vocabulary(it) 83 | vocab <- prune_vocabulary(vocab, term_count_min = MIN_COUNT) # keep only words that meet count threshold 84 | 85 | # ================================ 86 | # create term co-occurrence matrix 87 | # ================================ 88 | vectorizer <- vocab_vectorizer(vocab) 89 | tcm <- create_tcm(it, vectorizer, skip_grams_window = WINDOW_SIZE, skip_grams_window_context = "symmetric") 90 | 91 | # ================================ 92 | # set model parameters 93 | # ================================ 94 | glove <- GlobalVectors$new(word_vectors_size = DIM, 95 | vocabulary = vocab, 96 | x_max = 100, 97 | lambda = 1e-5) 98 | 99 | # ================================ 100 | # fit 
model 101 | # ================================ 102 | word_vectors_main <- glove$fit_transform(tcm, 103 | n_iter = ITERS, 104 | convergence_tol = 1e-3, 105 | n_check_convergence = 1L, 106 | n_threads = RcppParallel::defaultNumThreads()) 107 | 108 | # ================================ 109 | # get output 110 | # ================================ 111 | word_vectors_context <- glove$components 112 | word_vectors <- word_vectors_main + t(word_vectors_context) # word vectors 113 | 114 | # pre-estimated embeddings 115 | word_vectors_6_300_D <- readRDS("/Users/pedrorodriguez/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/CR-5yrs/word_vectors_R_6_300_D_3.rds") # local 116 | word_vectors_6_300_R <- readRDS("/Users/pedrorodriguez/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/CR-5yrs/word_vectors_R_6_300_R_3.rds") # local 117 | pretrained <- readRDS("/Users/pedrorodriguez/Dropbox/NYU/Teaching/Text as Data/homeworks/HW3/data/pretrained.rds") # GloVe pretrained (https://nlp.stanford.edu/projects/glove/) 118 | 119 | # function to compute nearest neighbors: returns the (up to) N row names of embeds most cosine-similar to cue, excluding cue itself 120 | nearest_neighbors <- function(cue, embeds, N = 5, norm = "l2"){ 121 | if (!(cue %in% rownames(embeds))) stop("cue '", cue, "' not found in rownames(embeds)", call. = FALSE) # clear error instead of 'subscript out of bounds' 122 | cos_sim <- sim2(x = embeds, y = embeds[cue, , drop = FALSE], method = "cosine", norm = norm) 123 | nn <- names(cos_sim[order(-cos_sim),]) # vocabulary ranked from most to least similar 124 | return(head(nn[nn != cue], N)) # drop the cue by name (not by position) and never pad with NA when N exceeds the vocabulary 125 | } 126 | # e.g. 
127 | nearest_neighbors("welfare", word_vectors_6_300_D, N = 10, norm = "l2") 128 | nearest_neighbors("welfare", word_vectors_6_300_R, N = 10, norm = "l2") 129 | nearest_neighbors("welfare", pretrained, N = 10, norm = "l2") 130 | 131 | nearest_neighbors("abortion", word_vectors_6_300_D, N = 10, norm = "l2") 132 | nearest_neighbors("abortion", word_vectors_6_300_R, N = 10, norm = "l2") 133 | nearest_neighbors("abortion", pretrained, N = 10, norm = "l2") 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /W11_04_25_19/Session 11.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodriguez 2 | # Course: Text as Data 3 | # Date: 4/25/2019 4 | 5 | # Supervised vs. Unsupervised 6 | # topic-models: excellent for exploration 7 | # supervised exploration of topics: classification on text snippets with keywords (e.g. Venezuela project) 8 | # semi-supervised approaches: https://github.com/gregversteeg/corex_topic 9 | # see: https://medium.com/pew-research-center-decoded/overcoming-the-limitations-of-topic-models-with-a-semi-supervised-approach-b947374e0455 10 | 11 | # ----------------------------------------------- 12 | # Structural Topic Models --- 13 | # ----------------------------------------------- 14 | rm(list = ls()) 15 | libraries <- c("topicmodels", "dplyr", "stm", "quanteda") 16 | lapply(libraries, require, character.only = T) 17 | setwd("/Users/pedrorodriguez/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W10_04_18_19/") # set working directory 18 | 19 | # Loading data: Political blogs from the 2008 election on a conservative-liberal dimension 20 | data(poliblog5k) 21 | head(poliblog5k.meta) 22 | head(poliblog5k.voc) 23 | 24 | # Fits an STM model with 3 topics 25 | system.time( 26 | blog_stm <- stm(poliblog5k.docs, poliblog5k.voc, 3, prevalence = ~rating + s(day), data = poliblog5k.meta)) 27 | 28 | # A plot that summarizes the topics by what words occur most 
commonly in them 29 | plot(blog_stm, type = "labels") 30 | 31 | # A summary plot of the topics that ranks them by their average proportion in the corpus 32 | plot(blog_stm, type = "summary") 33 | 34 | # A visualization of what words are shared and distinctive to two topics 35 | plot(blog_stm, type="perspectives", topics = c(1,2)) 36 | 37 | # Estimates a regression with topics as the dependent variable and metadata as the independent variables 38 | # s() is a wrapper for bs() from the splines package 39 | # A spline of degree D is a function formed by connecting polynomial segments of degree D 40 | prep <- estimateEffect(1:3 ~ rating + s(day) , blog_stm, meta = poliblog5k.meta) 41 | 42 | # Plots the distribution of topics over time 43 | plot(prep, "day", blog_stm, topics = c(1,2), 44 | method = "continuous", xaxt = "n", xlab = "Date") 45 | 46 | # Plots the Difference in coverage of the topics according to liberal or conservative ideology 47 | plot(prep, "rating", model = blog_stm, 48 | method = "difference", cov.value1 = "Conservative", cov.value2 = "Liberal") 49 | 50 | # ----------------------------------------------- 51 | # Word Embeddings --- 52 | # ----------------------------------------------- 53 | # Are word embeddings supervised or unsupervised? 54 | # KEY DIFFERENCE between embeddings and other distributional semantic models we've seen: how we define context. 55 | # Context in the case of word embeddings is defined by a window (usually symmetric) around the target word. 56 | # GloVe vs. 
Word2Vec 57 | # cool/intuitive intro to W2V: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/ 58 | 59 | library(text2vec) 60 | 61 | # choice parameters 62 | WINDOW_SIZE <- 6 63 | DIM <- 300 64 | ITERS <- 10 65 | MIN_COUNT <- 10 66 | 67 | # load data 68 | corpus <- readRDS("~/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/data/corpora.rds") 69 | text <- corpus[["R"]] 70 | rm(corpus) 71 | 72 | # shuffle text 73 | set.seed(42L) 74 | text <- sample(text) 75 | 76 | # ================================ 77 | # create vocab 78 | # ================================ 79 | tokens <- space_tokenizer(text) 80 | rm(text) 81 | it <- itoken(tokens, progressbar = FALSE) 82 | vocab <- create_vocabulary(it) 83 | vocab <- prune_vocabulary(vocab, term_count_min = MIN_COUNT) # keep only words that meet count threshold 84 | 85 | # ================================ 86 | # create term co-occurrence matrix 87 | # ================================ 88 | vectorizer <- vocab_vectorizer(vocab) 89 | tcm <- create_tcm(it, vectorizer, skip_grams_window = WINDOW_SIZE, skip_grams_window_context = "symmetric") 90 | 91 | # ================================ 92 | # set model parameters 93 | # ================================ 94 | glove <- GlobalVectors$new(word_vectors_size = DIM, 95 | vocabulary = vocab, 96 | x_max = 100, 97 | lambda = 1e-5) 98 | 99 | # ================================ 100 | # fit model 101 | # ================================ 102 | word_vectors_main <- glove$fit_transform(tcm, 103 | n_iter = ITERS, 104 | convergence_tol = 1e-3, 105 | n_check_convergence = 1L, 106 | n_threads = RcppParallel::defaultNumThreads()) 107 | 108 | # ================================ 109 | # get output 110 | # ================================ 111 | word_vectors_context <- glove$components 112 | word_vectors <- word_vectors_main + t(word_vectors_context) # word vectors 113 | 114 | # pre-estimated embeddings 115 | word_vectors_6_300_D <- 
readRDS("/Users/pedrorodriguez/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/CR-5yrs/word_vectors_R_6_300_D_3.rds") # local 116 | word_vectors_6_300_R <- readRDS("/Users/pedrorodriguez/Dropbox/Research/Neuropolitics/WordSelection/R/GloVe/CR-5yrs/word_vectors_R_6_300_R_3.rds") # local 117 | pretrained <- readRDS("/Users/pedrorodriguez/Dropbox/NYU/Teaching/Text as Data/homeworks/HW3/data/pretrained.rds") # GloVe pretrained (https://nlp.stanford.edu/projects/glove/) 118 | 119 | # function to compute nearest neighbors: returns the row names of the (up to) N rows of `embeds` with the highest cosine similarity to the row named `cue`, most similar first; `norm` is forwarded to sim2() 120 | nearest_neighbors <- function(cue, embeds, N = 5, norm = "l2"){ 121 | if (!cue %in% rownames(embeds)) stop("'", cue, "' is not a row name of the embeddings matrix", call. = FALSE) # readable error instead of "subscript out of bounds" 122 | cos_sim <- sim2(x = embeds, y = embeds[cue, , drop = FALSE], method = "cosine", norm = norm)[, 1] # similarity of every row of embeds to the cue row, as a named vector 123 | return(head(names(cos_sim)[order(-cos_sim)][-1], N)) # cue is always the nearest neighbor hence dropped; head() avoids NA padding when N exceeds the number of remaining rows 124 | } 125 | 126 | # e.g. 127 | nearest_neighbors("welfare", word_vectors_6_300_D, N = 10, norm = "l2") 128 | nearest_neighbors("welfare", word_vectors_6_300_R, N = 10, norm = "l2") 129 | nearest_neighbors("welfare", pretrained, N = 10, norm = "l2") 130 | 131 | nearest_neighbors("abortion", word_vectors_6_300_D, N = 10, norm = "l2") 132 | nearest_neighbors("abortion", word_vectors_6_300_R, N = 10, norm = "l2") 133 | nearest_neighbors("abortion", pretrained, N = 10, norm = "l2") 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /W12_04_02_19/MTurk Fees.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/W12_04_02_19/MTurk Fees.pdf -------------------------------------------------------------------------------- /W12_04_02_19/Session 12 - Special Topics.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro Rodríguez 2 | # Course: Text as Data 3 | # Code by: Leslie Huang (+ minor modifications by 
PR) 4 | # Date: 05/02/2019 5 | # Recitation 13: Special Topics I 6 | 7 | # First... Course Assessments! 8 | # Link: https://nyu.qualtrics.com/jfe/form/SV_3mGvDResIGQDtGZ 9 | 10 | #install.packages("bursts") 11 | library(bursts) 12 | library(quanteda) 13 | library(readtext) 14 | 15 | # 1 Loading bursty function: a repurposing of some guts of kleinberg() 16 | # bursty(word, DTM, date): takes the column of DTM named `word`, runs kleinberg() on the row positions where its count is > 0, plots one segment (or point) per detected burst against `date`, and prints the burst table 17 | bursty <- function(word = "sioux", DTM, date) { 18 | word.vec <- DTM[, which(colnames(DTM) == word)] 19 | if(length(word.vec) == 0) { 20 | message(word, " does not exist in this corpus.") # message() concatenates its arguments; print() would take the second one as `digits` 21 | } 22 | else { 23 | word.times <- c(0,which(as.vector(word.vec)>0)) # row positions where the word occurs, with a leading 0 24 | 25 | kl <- kleinberg(word.times, gamma = 0.5) 26 | kl$start <- date[kl$start+1] # replace kleinberg()'s start/end positions by the matching entries of `date` 27 | kl$end <- date[kl$end] 28 | max_level <- max(kl$level) 29 | # empty canvas (type = "n") spanning the full date range; the bursts are drawn in the loop below 30 | plot(c(kl$start[1], kl$end[1]), c(1,max_level), 31 | type = "n", xlab = "Time", ylab = "Level", bty = "n", 32 | xlim = c(min(date), max(date)), ylim = c(1, max_level), 33 | yaxt = "n") 34 | axis(2, at = 1:max_level) 35 | 36 | for (i in seq_len(nrow(kl))) { # seq_len() skips the loop when kl has no rows 37 | if (kl$start[i] != kl$end[i]) { 38 | arrows(kl$start[i], kl$level[i], kl$end[i], kl$level[i], code = 3, angle = 90, 39 | length = 0.05) 40 | } 41 | else { 42 | points(kl$start[i], kl$level[i]) 43 | } 44 | } 45 | 46 | print(kl) 47 | } 48 | # note: deviation from standard defaults (presumably the gamma = 0.5 passed to kleinberg() above) because we don't have that much data 49 | } 50 | 51 | 52 | # 2 Let's use this on the Conservative and Labour manifestos 53 | setwd("/Users/pedrorodriguez/Dropbox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/cons_labour_manifestos/") 54 | list.files() 55 | 56 | # Loading data 57 | 58 | manifesto <- readtext("*.txt", docvarsfrom=c("filenames")) 59 | 60 | manifesto_corpus <- corpus(manifesto) 61 | 62 | docvars(manifesto_corpus)$date <- as.numeric(gsub("[[:alpha:]]","",docvars(manifesto_corpus)$docvar1)) 63 | 64 | manifesto_dfm <- dfm(manifesto_corpus) 65 | 66 | # 3.1 Evaluating the burstiness of several key words 67 | 68 | bursty("thatcher", manifesto_dfm, docvars(manifesto_corpus)$date) 
69 | 70 | bursty("churchill", manifesto_dfm, docvars(manifesto_corpus)$date) 71 | 72 | bursty("argentina", manifesto_dfm, docvars(manifesto_corpus)$date) 73 | 74 | -------------------------------------------------------------------------------- /W12_04_02_19/WE-Validation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/W12_04_02_19/WE-Validation.pdf -------------------------------------------------------------------------------- /W1_01_31_19/plot1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/W1_01_31_19/plot1.pdf -------------------------------------------------------------------------------- /W1_01_31_19/table1.csv: -------------------------------------------------------------------------------- 1 | "","Pollster","Population","mean_net_clinton" 2 | "1","ABC/Post","Adults",9 3 | "2","ABC/Post","Registered Voters",2.33333333333333 4 | "3","ARG","Registered Voters",0 5 | "4","Bloomberg/Selzer","Likely Voters",15 6 | "5","CBS","Registered Voters",8 7 | -------------------------------------------------------------------------------- /W2_02_07_19/Session 2 - Introducing Quanteda.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodriguez 2 | # Course: Text as Data 3 | # Date: 2/07/2019 4 | # Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang. 
5 | 6 | # other "similar" interesting packages: tm, tidytext 7 | 8 | #----------------------------- 9 | # 1 SETTING UP 10 | #----------------------------- 11 | 12 | # 1.1 Workspace ----------------------- 13 | 14 | # Clear Global Environment 15 | rm(list = ls()) 16 | 17 | # Set working directory 18 | #setwd("/Users/pedrorodriguez/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W2_02_07_19/") 19 | 20 | # 1.2 Installing quanteda 21 | 22 | # Install the latest stable version of quanteda from CRAN 23 | #install.packages("quanteda") # run this if you don't have quanteda already installed 24 | 25 | library(quanteda) 26 | library(ggplot2) 27 | library(dplyr) 28 | 29 | # What version of quanteda do you have loaded? 30 | # How many threads (cores) are you using? See the printout in the console 31 | 32 | # 1.3 Devtools and the quanteda corpus 33 | 34 | # Install the package "devtools" which is used to install packages directly from Github 35 | # install.packages("devtools") 36 | #library("devtools") 37 | 38 | # Use devtools to install some sample data 39 | #devtools::install_github("quanteda/quanteda.corpora") 40 | 41 | # Load it into our environment 42 | library(quanteda.corpora) 43 | 44 | # Read about the data available: https://github.com/quanteda/quanteda.corpora 45 | 46 | ### Note: Quanteda is still under development so it is changing! New features are being added but sometimes functions or function parameters are deprecated or renamed. This includes the very basic functions in this code demo! 47 | 48 | # This means that you may encounter many code examples, StackOverflow questions, and websites with outdated documentation, etc. that include functions or options that have been deprecated or renamed. 49 | 50 | # 1.4 Managing dependencies 51 | 52 | # If you want to ensure that your code for a project will not break when you update quanteda, I recommend using a dependency manager for R called packrat so that you can specify a dependency on a specific version of quanteda. 
53 | # Find out about setting up packrat here: https://rstudio.github.io/packrat/walkthrough.html 54 | 55 | # 1.5 Versions of quanteda 56 | 57 | # to check version 58 | packageVersion("quanteda") 59 | 60 | # How would you get an older version of quanteda? (For example, if you accidentally installed the dev version from GitHub but you want to go back to the last stable release, or you want a legacy version to support old code.) 61 | 62 | # - Check the CRAN archive 63 | # use the install_version function, e.g.: 64 | # devtools::install_version("quanteda", version = "0.99.12", repos = "http://cran.us.r-project.org") 65 | 66 | # If you want the latest dev version of quanteda, it's on GitHub, but we will use the latest version from CRAN for stability/sanity reasons 67 | # devtools::install_github("quanteda/quanteda") 68 | 69 | # Concept review: Which of these are the same? 70 | # token 71 | # type 72 | # feature 73 | # word 74 | # term 75 | 76 | #----------------------------- 77 | # 1 THE CORPUS OBJECT 78 | #----------------------------- 79 | 80 | # quanteda's main input object is called a "corpus" (a way of organizing text data: generally includes text + metadata) 81 | 82 | # THERE ARE OTHER WAYS to organize text data 83 | # TAKE A LOOK AT: https://www.tidytextmining.com/tidytext.html 84 | 85 | # other popular text package with similar features: tm 86 | 87 | # 1.1 load the State of the Union (SOTU) corpus and look at a summary --------------------- 88 | #data("sotu", package = "quanteda.corpora") 89 | sotu <- data_corpus_sotu 90 | 91 | # a corpus consists of: (1) documents: text + doc level data (2) corpus metadata (3) extras (settings) 92 | head(docvars(sotu)) # document-level variables 93 | metacorpus(sotu) # corpus-level variables 94 | 95 | # ndoc identifies the number of documents in a corpus 96 | ndocs <- ndoc(sotu) 97 | 98 | # summary of the corpus (provides some summary statistics on the text combined with the metadata) 99 | corpusinfo <- summary(sotu, n = 
ndocs) # note n default is 100 100 | head(corpusinfo) 101 | # does tokens >= types always hold? 102 | 103 | # quick visualization 104 | token_plot <- ggplot(data = corpusinfo, aes(x = Date, y = Tokens, group = 1)) + geom_line() + geom_point() + theme_bw() 105 | token_plot 106 | 107 | # 1.2 subset corpus --------------------- 108 | summary(corpus_subset(sotu, President == "Trump")) 109 | trump_sotu <- corpus_subset(sotu, President == "Trump") 110 | 111 | # key words in context (KWIC) 112 | kwic_america <- kwic(trump_sotu, pattern = "america", valuetype = "regex", window = 6) 113 | kwic_america <- kwic(trump_sotu, pattern = "america") # overwrites the regex-based result from the previous line 114 | 115 | # keep only the text of the 2018 SOTU 116 | trump_2018_text <- texts(trump_sotu)[2] 117 | 118 | # same as 119 | trump_2018_text <- trump_sotu[2] 120 | 121 | #----------------------------- 122 | # 2 TOKENIZING & STEMMING 123 | #----------------------------- 124 | 125 | ## 2.1 Tokenizing text --------------------- 126 | ?tokens 127 | tokenized_speech <- tokens(trump_2018_text) 128 | head(unname(unlist(tokenized_speech)), 20) 129 | 130 | # alternative using only base R 131 | tokenized_speech <- strsplit(trump_2018_text, " ") 132 | 133 | # remove punctuation when tokenizing 134 | tokenized_speech <- tokens(trump_2018_text, remove_punct = TRUE) 135 | head(unname(unlist(tokenized_speech)), 20) 136 | 137 | ## 2.2 Stemming --------------------- 138 | # SnowballC stemmer is based on the Porter stemmer (varies by language, english is default) 139 | ?tokens_wordstem 140 | stemmed_speech <- tokens_wordstem(tokenized_speech) # language is an argument 141 | head(unname(unlist(stemmed_speech)), 20) 142 | 143 | ## 2.3 Ngrams --------------------- 144 | tokenized_speech_ngrams <- tokens(trump_2018_text, remove_punct = TRUE, ngrams = c(1L, 2L)) 145 | head(unname(unlist(tokenized_speech_ngrams)), 20) 146 | tail(unname(unlist(tokenized_speech_ngrams)), 20) 147 | 148 | ## Types vs. 
Tokens 149 | ntoken(trump_2018_text) 150 | ntype(trump_2018_text) 151 | tokens(trump_2018_text) %>% unlist() %>% unique() %>% length() 152 | 153 | #----------------------------- 154 | # 3 DOCUMENT FEATURE MATRIX (~ DTM) 155 | #----------------------------- 156 | 157 | # WHAT'S THE POINT? 42 158 | # DOCUMENTS AS DISTRIBUTIONS 159 | 160 | ## 3.1 Creating a DFM --------------------- 161 | # input can be a document, corpus, etc 162 | trump_2018_dfm <- dfm(trump_2018_text) 163 | 164 | # inspect the first few features 165 | trump_2018_dfm[, 1:10] # why 0% sparse? 166 | 167 | # how many rows does this dfm have? 168 | dim(trump_2018_dfm) 169 | 170 | # top features in dfm 171 | topfeatures(trump_2018_dfm) 172 | 173 | # Are all of these features relevant? 174 | # Words? 175 | # Punctuation (maybe!!! --> think what the goal is. Can theory help?) 176 | 177 | #----------------------------- 178 | # 4 PREPROCESSING (~FEATURE ENGINEERING) 179 | #----------------------------- 180 | # pre-processing can be done prior to dfm OR use the pre-processing arguments of dfm 181 | ?dfm # see all options 182 | # NOTE: lowercase argument is by default TRUE 183 | # punctuation 184 | trump_2018_dfm <- dfm(trump_2018_text, remove_punct = TRUE) 185 | trump_2018_dfm[, 1:10] 186 | 187 | # stemming 188 | trump_2018_dfm <- dfm(trump_2018_text, stem = TRUE, remove_punct = TRUE) 189 | trump_2018_dfm[, 1:10] 190 | # can also apply stemmer afterwards using dfm_wordstem() on a dfm object 191 | 192 | ## 3.2 Stopwords --------------------- 193 | # Stopwords are commonly words that (presumably) add little understanding to the content of the document by themselves 194 | # The stopwords function takes a language as an input and produces a vector of stopwords compiled from that language 195 | 196 | stopwords("english") 197 | 198 | # Fun fact: Quanteda also supports stopwords for english, SMART, danish, french, greek, hungarian, 199 | # norwegian, russian, swedish, catalan, dutch, finnish, german, italian, 
portuguese, spanish, and arabic 200 | 201 | trump_2018_dfm_1 <- dfm(trump_2018_text, remove_punct = TRUE) 202 | trump_2018_dfm_2 <- dfm(trump_2018_text, remove = stopwords("english"), remove_punct = TRUE) 203 | 204 | topfeatures(trump_2018_dfm_1) 205 | topfeatures(trump_2018_dfm_2) 206 | 207 | # wordclouds 208 | textplot_wordcloud(trump_2018_dfm_1, max_words = 100) 209 | textplot_wordcloud(trump_2018_dfm_2, max_words = 100) 210 | 211 | #----------------------------- 212 | # 4 WEIGHTED DOCUMENT FEATURE MATRIX 213 | #----------------------------- 214 | # WHAT ARE WE WEIGHTING? 215 | 216 | # Now we will create a DFM of all the SOTU speeches 217 | full_dfm <- dfm(sotu, remove = stopwords("english"), remove_punct = TRUE) 218 | full_dfm[, 1:10] # notice sparsity 219 | topfeatures(full_dfm) 220 | topfeatures(full_dfm[nrow(full_dfm),]) # top features of the last document (last row of the dfm) 221 | 222 | # 4.1 tfidf - Frequency weighting 223 | weighted_dfm <- dfm_tfidf(full_dfm) # uses the absolute frequency of terms in each document 224 | topfeatures(weighted_dfm) 225 | topfeatures(weighted_dfm[nrow(weighted_dfm),]) # same, for the last document only 226 | 227 | # 4.2 tfidf - Relative frequency weighting 228 | ?dfm_tfidf 229 | normalized <- dfm_tfidf(full_dfm, scheme_tf = "prop") # Uses feature proportions within documents: divides each term count by the total count of features in the document 230 | topfeatures(normalized) 231 | topfeatures(normalized[nrow(normalized),]) # same, for the last document only 232 | 233 | #----------------------------- 234 | # 5 COLLOCATIONS 235 | #----------------------------- 236 | # bigrams 237 | head(textstat_collocations(trump_2018_text)) 238 | textstat_collocations(trump_2018_text) %>% arrange(-lambda) %>% slice(1:5) 239 | 240 | # trigrams 241 | ?textstat_collocations 242 | head(textstat_collocations(trump_2018_text, size = 3)) 243 | 244 | # Are there any other terms you all think are interesting? 
245 | 246 | #----------------------------- 247 | # 6 REGULAR EXPRESSIONS 248 | #----------------------------- 249 | # regular expressions are a very powerful tool in wrangling text 250 | # not a focus of this class, but something to be aware of 251 | # cheatsheet for regex: https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf 252 | 253 | # grep: indices of the documents that contain the pattern 254 | s_index <- grep(" s ", texts(sotu)) 255 | head(s_index) 256 | 257 | # grepl: one TRUE/FALSE per document 258 | s_index <- grepl(" s ", texts(sotu)) 259 | table(s_index) 260 | 261 | # grepl with an anchor: documents that start with "Thank" 262 | thank_index <- grepl("^Thank", texts(sotu)) 263 | table(thank_index) 264 | 265 | # this returns every speech that contains " s " -- JUST THE LETTER S BY ITSELF 266 | texts_with_s <- grep(" s ", texts(sotu), value = TRUE) 267 | 268 | # Here we create a vector of documents with " s " removed 269 | texts_without_s <- gsub(" s ", " ", texts(sotu)) # operate on texts(sotu) like the calls above; a single space as replacement keeps the neighbouring words apart 270 | 271 | # ALWAYS TEST FIRST 272 | gsub(" s ", " ", "hello how s are you") 273 | grepl("^so", c("so today we", "never so today", "today never so")) 274 | 275 | # SUGGESTED PACKAGE to deal with regular expressions: stringr 276 | 277 | #----------------------------- 278 | # 7 PRE-PROCESSING CHOICES 279 | #----------------------------- 280 | # install.packages("preText") 281 | 282 | library("preText") 283 | 284 | # Run at home (takes a few minutes to run) 285 | # Example below taken from preText vignette: https://cran.r-project.org/web/packages/preText/vignettes/getting_started_with_preText.html 286 | 287 | preprocessed_documents <- factorial_preprocessing( 288 | sotu[1:50], 289 | use_ngrams = FALSE, 290 | infrequent_term_threshold = 0.2, 291 | verbose = FALSE) 292 | 293 | preText_results <- preText(preprocessed_documents, 294 | dataset_name = "SOTU Speeches", 295 | distance_method = "cosine", 296 | num_comparisons = 20, 297 | verbose = FALSE) 298 | 299 | preText_score_plot(preText_results) 300 | 301 | # Questions? 
302 | 303 | # I recommend you check out: https://quanteda.io/articles/quickstart.html 304 | 305 | -------------------------------------------------------------------------------- /W3_02_14_19/Session 3 - Descriptive Inference.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodriguez 2 | # Course: Text as Data 3 | # Date: 2/14/2019 4 | # Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang. 5 | 6 | ## Set up Quanteda 7 | 8 | # Clear Global Environment 9 | rm(list = ls()) 10 | 11 | # Libraries 12 | library(dplyr) 13 | library(quanteda) 14 | library(quanteda.corpora) 15 | 16 | # sophistication: https://github.com/kbenoit/sophistication 17 | # gutenbergr: http://www.gutenberg.org/wiki/Main_Page 18 | # stylest: https://github.com/leslie-huang/stylest/blob/master/vignettes/stylest-vignette.md 19 | 20 | #----------------------------- 21 | # 1 NON-ENGLISH TEXTS 22 | #----------------------------- 23 | 24 | # 1.1 Non-English stopwords 25 | 26 | stopwords(language = "spanish") 27 | 28 | stopwords(language = "german") 29 | 30 | stopwords(language = "zh", source = "misc") 31 | 32 | # 1.2 Text encoding 33 | 34 | # What is text encoding? 35 | # How do you figure out what kind you have (e.g. scraped text from the Internet)? 36 | # What kind of encoding can R and/or quanteda handle? 37 | 38 | # 1.3 Some types of text encoding 39 | # character encoding is a set of mappings between the bytes in the computer and the characters in the character set. 40 | # UTF-8 41 | # ASCII (subset of UTF-8) 42 | # Latin-1 43 | 44 | # UTF-8 represents characters from European languages (English, Spanish, German, French, etc) and some characters from Chinese/Japanese/Korean, plus emojis. 45 | 46 | # Note: Text obtained from Internet sources can be messy. Issues can especially arise when you are working with texts from multiple sources and you end up with a mixture of encodings. 
This can cause the encoding to be detected incorrectly when you read in the text. 47 | 48 | # 1.4 What encoding do you have? 49 | 50 | # You can check with this function in base R 51 | validUTF8("This is a sentence") 52 | 53 | # You can use the package utf8(), written by Patrick Perry from NYU 54 | # Read about it here: https://github.com/patperry/r-utf8 55 | # install.packages("utf8") 56 | library("utf8") 57 | 58 | as_utf8("\xF0\x9F\x98\x8D") 59 | print("\xF0\x9F\x98\x8D") # There are issues with base R's print() function for Unicode 60 | # any guesses what this is? 61 | utf8_print("\xF0\x9F\x98\x8D") 62 | # emojis unicodes: https://apps.timwhitlock.info/emoji/tables/unicode 63 | 64 | # 1.5 What if you get a weird character and you're not sure? 65 | 66 | # install.packages("stringi") 67 | library("stringi") 68 | 69 | # Use the encoding guesser to guess what this character is 70 | stri_enc_detect("0x00E3") 71 | 72 | # It's only a guess! 73 | 74 | # What's ISO-8859-1? 75 | # This is another name for the Latin-1 encoding. 76 | 77 | # 1.6 How do you convert encodings? 78 | test_str <- "São Paulo" 79 | validUTF8(test_str) 80 | converted_str <- iconv("São Paulo", from = "UTF-8", to = "latin1") 81 | 82 | converted_str 83 | validUTF8(converted_str) 84 | 85 | # Looks the same right? 86 | 87 | charToRaw(converted_str) # Latin-1 encoding 88 | 89 | charToRaw(test_str) # UTF-8 encoding 90 | 91 | # But what about here? 92 | iconv("ã", from = "UTF-8", to = "ASCII") 93 | 94 | # In most cases, your text will probably already be in UTF-8. 95 | # In most cases, you want to convert your text to UTF-8 (with the possible exception of languages that do not use the Latin alphabet) 96 | 97 | # The authors of quanteda have also written a package called readtext() that can also deal with encodings in text corpora! 
98 | 99 | #----------------------------- 100 | # 2 HEAP'S LAW 101 | #----------------------------- 102 | # Token-type relationship in corpus 103 | # How might pre-processing affect this relationship? 104 | # Think about reducing the dimensionality of the problem. 105 | 106 | # M = kT^b 107 | 108 | # M = vocab size (num of types) 109 | # T = number of tokens 110 | 111 | # k, b are constants 112 | # 30 <= k <= 100 113 | # 0.4 <= b <= 0.6 114 | 115 | # 2.1 Example using data from the corpus of inaugural speeches 116 | tokens <- tokens(data_corpus_inaugural, remove_punct = TRUE) 117 | Tee <- sum(lengths(tokens)) 118 | 119 | inaug_dfm <- dfm(data_corpus_inaugural) 120 | 121 | M <- nfeat(inaug_dfm) # number of features = number of types 122 | 123 | # Let's check using parameter values from MRS Ch. 5 for a corpus with more than 100,000 tokens 124 | 125 | k <- 44 126 | b <- .49 127 | 128 | k * (Tee)^b 129 | 130 | M 131 | 132 | # Let's think about why (what types of texts are these?) 133 | 134 | # New parameters 135 | 136 | k <- 41 137 | b <- 0.46 138 | 139 | k * (Tee)^b 140 | 141 | #----------------------------- 142 | # 3 ZIPF'S LAW 143 | #----------------------------- 144 | # Term frequency in corpus and rank 145 | 146 | # x-axis: log of ranks 1 through 100 147 | # y-axis log of frequency of top 100 terms from the DFM 148 | 149 | plot(log10(1:100), log10(topfeatures(inaug_dfm, 100)), 150 | xlab = "log10(rank)", ylab = "log10(frequency)", main = "Top 100 Words in U.S. 
Presidential Inaugural Speech Corpus") 151 | 152 | # Fits a linear regression to check if slope is approx -1.0 153 | regression <- lm(log10(topfeatures(inaug_dfm, 100)) ~ log10(1:100)) 154 | 155 | # Adds the fitted line from regression to the plot 156 | abline(regression, col = "red") 157 | 158 | # Returns the 95% confidence intervals for the regression coefficients 159 | confint(regression) 160 | 161 | # Provides R-squared, F-test, and coefficient estimates from regression 162 | summary(regression) 163 | 164 | ## Stopwords: do they affect Zipf's law? 165 | 166 | mydfm <- dfm(data_corpus_inaugural, remove=stopwords("english")) 167 | 168 | plot(log10(1:100), log10(topfeatures(mydfm, 100)), 169 | xlab = "log10(rank)", ylab = "log10(frequency)", main = "Top 100 Words in U.S. Presidential Inaugural Speech Corpus (w/o stopwords)") 170 | 171 | # Regression to check if slope is approx -1.0 172 | regression <- lm(log10(topfeatures(mydfm, 100)) ~ log10(1:100)) 173 | abline(regression, col = "red") 174 | confint(regression) 175 | summary(regression) 176 | 177 | # Zipf's law as a feature selection tool (e.g. http://www.jmlr.org/papers/volume3/forman03a/forman03a_full.pdf) 178 | # the two plots below show rank vs. raw frequency (no log transform), with and without stopwords 179 | plot(1:100, topfeatures(inaug_dfm, 100), 180 | xlab = "rank", ylab = "frequency", main = "Top 100 Words in U.S. Presidential Inaugural Speech Corpus") 181 | 182 | plot(1:100, topfeatures(mydfm, 100), 183 | xlab = "rank", ylab = "frequency", main = "Top 100 Words in U.S. Presidential Inaugural Speech Corpus (w/o stopwords)") 184 | 185 | #----------------------------- 186 | # 4 KEY WORDS IN CONTEXT 187 | #----------------------------- 188 | ## good way to summarize info about a topic 189 | 190 | kwic(data_corpus_inaugural, "America", 3, case_insensitive = FALSE) 191 | 192 | help(kwic) 193 | 194 | # Suggested terms? 
195 | 196 | #----------------------------- 197 | # 6 MEASURING SIMILARITY 198 | #----------------------------- 199 | # This helps illustrate the value of the vector representation 200 | 201 | # 6.1 Cosine similarity--take the dot product of two vectors 202 | # cos = x*y/|x||y| ; the result is returned as a 1 x 1 matrix because %*% is used throughout 203 | calculate_cosine_similarity <- function(vec1, vec2) { 204 | dot_xy <- vec1 %*% vec2 # %*% is the dot product, not entry-by-entry multiplication (sum(vec1 * vec2) gives the same number) 205 | len_x <- sqrt(vec1 %*% vec1); len_y <- sqrt(vec2 %*% vec2) # Euclidean length of each vector 206 | dot_xy / (len_x * len_y) 207 | } 208 | 209 | # example 1 210 | x <- c(1, 2, 3) 211 | y <- c(1, 2, 3) 212 | 213 | # what should we get? 214 | calculate_cosine_similarity(x, y) 215 | 216 | # example 2 217 | a <- c(1, 2, 3) 218 | b <- c(-1, -2, -3) 219 | 220 | # what should we get? 221 | calculate_cosine_similarity(a, b) 222 | 223 | # Let's do it with texts 224 | obama_text <- texts(corpus_subset(data_corpus_inaugural, President == "Obama")) 225 | lincoln_text <- texts(corpus_subset(data_corpus_inaugural, President == "Lincoln")) 226 | 227 | # Make a dfm of these two 228 | obama_lincoln_dfm <- dfm(c(obama_text, lincoln_text), remove = stopwords("english"), stem = TRUE) 229 | 230 | # Calculate similarity 231 | similarity_obama_lincoln_with_preprocessing <- textstat_simil(obama_lincoln_dfm, margin = "documents", method = "cosine") 232 | as.matrix(similarity_obama_lincoln_with_preprocessing) 233 | 234 | # 6.2 Let's see how stopwords/stemming affect similarity 235 | 236 | obama_lincoln_no_preprocessing <- dfm(c(obama_text, lincoln_text)) 237 | 238 | # Calculate similarity 239 | 240 | similarity_obama_lincoln_with_no_preprocessing <- textstat_simil(obama_lincoln_no_preprocessing, margin = "documents", method = "cosine") 241 | 242 | as.matrix(similarity_obama_lincoln_with_no_preprocessing) 243 | 244 | # Make a dfm of several documents 245 | 246 | several_inaug_dfm <- dfm(corpus_subset(data_corpus_inaugural , Year > 1980), remove = 
stopwords("english"), stem = TRUE) 247 | 248 | # Specific comparisons with Obama's first inauguration speech 249 | 250 | textstat_simil(several_inaug_dfm, "2009-Obama", margin = "documents", method = "correlation") 251 | 252 | # Other options available: Manhattan distance, cosine, etc. 253 | ?textstat_simil 254 | 255 | #----------------------------- 256 | # 7 STYLE 257 | #----------------------------- 258 | 259 | # 7.1 data collection (to be used in HW1) 260 | rm(list = ls()) 261 | # 7.1 Project Gutenberg: http://www.gutenberg.org/wiki/Main_Page 262 | # collection of (machine readable) novels and other texts + they have an R package! 263 | #install.packages("gutenbergr") 264 | # for more info refer to: https://cran.r-project.org/web/packages/gutenbergr/vignettes/intro.html 265 | library(gutenbergr) 266 | library(dplyr) 267 | gutenberg_works() 268 | 269 | # what do they have by Jane Austen? 270 | gutenberg_works() %>% filter(author == "Austen, Jane") 271 | 272 | # download "Emma" 273 | emma <- gutenberg_download(gutenberg_id = 158) 274 | #emma <- gutenberg_download(jane_austen$gutenberg_id[jane_austen$title == "Emma"], meta_fields = "title") # add other meta information 275 | 276 | # 7.2 stylest package: estimate speaker (author) style distinctiveness (vis-a-vis other authors) 277 | # see vignette: https://github.com/leslie-huang/stylest/blob/master/vignettes/stylest-vignette.md 278 | # early draft version of paper using this package: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3235506 279 | #install.packages("stylest") 280 | # source for this code: package vignette 281 | library(stylest) 282 | 283 | # data included in package 284 | data(novels_excerpts) 285 | 286 | # author list 287 | unique(novels_excerpts$author) 288 | 289 | # note how the data is organized 290 | str(novels_excerpts) 291 | 292 | # (1) select most informative (discriminative) features (subsets vocab by frequency percentile) 293 | filter <- corpus::text_filter(drop_punct = TRUE, drop_number 
= TRUE) # pre-processing choices 294 | set.seed(1984L) # why set seed? 295 | vocab_custom <- stylest_select_vocab(novels_excerpts$text, novels_excerpts$author, # fits n-fold cross-validation 296 | filter = filter, smooth = 1, nfold = 10, 297 | cutoff_pcts = c(25, 50, 75, 99)) 298 | 299 | vocab_custom$cutoff_pct_best # percentile with best prediction rate 300 | vocab_custom$miss_pct # rate of incorrectly predicted speakers of held-out texts 301 | 302 | # (2) subset features 303 | vocab_subset <- stylest_terms(novels_excerpts$text, novels_excerpts$author, vocab_custom$cutoff_pct_best , filter = filter) # USE SAME FILTER 304 | 305 | # (3) fit model with "optimal" percentile threshold (i.e. feature subset) 306 | style_model <- stylest_fit(novels_excerpts$text, novels_excerpts$author, terms = vocab_subset, filter = filter) 307 | 308 | # explore output 309 | head(stylest_term_influence(style_model, novels_excerpts$text, novels_excerpts$author)) # influential terms 310 | 311 | str(style_model) 312 | authors <- unique(novels_excerpts$author) 313 | term_usage <- style_model$rate 314 | lapply(authors, function(x) head(term_usage[x,][order(-term_usage[x,])])) %>% setNames(authors) 315 | 316 | # (4) predict speaker of a new text 317 | new_text <- emma$text[30:75] %>% paste(., collapse = "") 318 | pred <- stylest_predict(style_model, new_text) 319 | pred$predicted 320 | pred$log_probs 321 | 322 | #----------------------------- 323 | # 8 SOPHISTICATION 324 | #----------------------------- 325 | # motivation: flexibly measure the "sophistication" (ease of understanding) of political communication 326 | # see paper: https://www.nyu.edu/projects/spirling/documents/BMS_complex.pdf 327 | #install.packages("sophistication") 328 | # see vignette: https://github.com/kbenoit/sophistication 329 | # key insight: use crowdsourcing 330 | -------------------------------------------------------------------------------- /W4_02_21_19/Session 4 - Descriptive Inference II.R: 
-------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodríguez 2 | # Course: Text as Data 3 | # Date: 2/21/2019 4 | # Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang. 5 | 6 | # Setup environment 7 | rm(list = ls()) 8 | 9 | # 1 Loading packages --------------------------------------- 10 | library(quanteda) 11 | library(quanteda.corpora) 12 | library(dplyr) 13 | library(ggplot2) 14 | 15 | # 2 Load in data: Irish budget proposals from 2008-2012 ---- 16 | # "speeches and document-level variables from the debate over the Irish budget". 17 | 18 | data("data_corpus_irishbudgets") 19 | irish_budget_texts <- texts(data_corpus_irishbudgets) 20 | 21 | #------------------------------ 22 | # 3 LEXICAL DIVERSITY MEASURES 23 | #------------------------------ 24 | 25 | # 3.1 TTR 26 | budget_tokens <- tokens(irish_budget_texts, remove_punct = TRUE) 27 | 28 | # Num tokens per document 29 | num_tokens <- lengths(budget_tokens) 30 | 31 | num_types <- ntype(budget_tokens) 32 | 33 | irish_budget_TTR <- num_types / num_tokens 34 | 35 | head(irish_budget_TTR) 36 | 37 | # Would you expect the budgets to become more or less diverse over time? 38 | 39 | # 3.2 Mean per-document TTR scores by year, party 40 | 41 | TTR_by_year <- aggregate(irish_budget_TTR, by = list(data_corpus_irishbudgets[["year"]]$year), FUN = mean, na.rm = TRUE) %>% setNames(c("year", "TTR")) 42 | 43 | plot(TTR_by_year) 44 | 45 | aggregate(irish_budget_TTR, by = list(data_corpus_irishbudgets[["party"]]$party), FUN = mean, na.rm = TRUE) %>% setNames(c("party", "TTR")) 46 | 47 | # 3.3 Calculate TTR score by year, party 48 | 49 | # by year 50 | # textstat_lexdiv: "calculates the lexical diversity or complexity of text(s)" using any number of measures. 
51 | TTR <- textstat_lexdiv(budget_tokens, measure = "TTR") 52 | aggregate(TTR$TTR, by = list(data_corpus_irishbudgets[["year"]]$year), FUN = mean, na.rm = TRUE) %>% setNames(c("year", "TTR")) 53 | 54 | # Sidebar: using the "groups" parameter is how to group documents by a covariate -- note how this changes the ndocs of your corpus 55 | aggregate(TTR$TTR, by = list(data_corpus_irishbudgets[["party"]]$party), FUN = mean, na.rm = TRUE) %>% setNames(c("party", "TTR")) 56 | 57 | # Thoughts on TTR 58 | 59 | #------------------------------ 60 | # 4 COMPLEXITY (READABILITY) MEASURES 61 | #------------------------------ 62 | 63 | # 4.1 FRE (https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests) 64 | textstat_readability(data_corpus_irishbudgets, "Flesch") %>% head() 65 | 66 | textstat_readability(texts(data_corpus_irishbudgets, groups = "year"), "Flesch") 67 | 68 | textstat_readability(texts(data_corpus_irishbudgets, groups = "party"), "Flesch") 69 | 70 | # 4.2 Dale-Chall measure (https://en.wikipedia.org/wiki/Dale–Chall_readability_formula) 71 | 72 | textstat_readability(data_corpus_irishbudgets, "Dale.Chall.old") %>% head() 73 | 74 | textstat_readability(texts(data_corpus_irishbudgets, groups = "year"), "Dale.Chall.old") 75 | 76 | textstat_readability(texts(data_corpus_irishbudgets, groups = "party"), measure = "Dale.Chall.old") 77 | 78 | # 4.3 let's compare the measures: compute all five per document, then look at their pairwise correlations 79 | 80 | all_readability_measures <- textstat_readability(data_corpus_irishbudgets, c("Flesch", "Dale.Chall", "SMOG", "Coleman.Liau", "Fucks")) 81 | 82 | readability_matrix <- cbind(all_readability_measures$Flesch, all_readability_measures$Dale.Chall, all_readability_measures$SMOG, all_readability_measures$Coleman.Liau, all_readability_measures$Fucks) 83 | 84 | readability_cor <- cor(readability_matrix) 85 | rownames(readability_cor) <- c("Flesch", "Dale-Chall", "SMOG", "Coleman Liau", "Fucks") 86 | colnames(readability_cor) <- c("Flesch", "Dale-Chall", "SMOG", "Coleman Liau", "Fucks") 87 | 
readability_cor 88 | 89 | #------------------------------ 90 | # 5 BOOTSTRAPPING 91 | #------------------------------ 92 | # there are packages in R that help with bootstrapping: e.g. https://cran.r-project.org/web/packages/boot/boot.pdf 93 | 94 | # data prep: remove smaller parties (parties with only 1 document) 95 | large_parties <- data_corpus_irishbudgets$documents %>% group_by(party) %>% tally() %>% arrange(-n) %>% filter(n > 1) %>% select(party) %>% unlist() %>% unname() 96 | irbudgetsCorpSub <- corpus_subset(data_corpus_irishbudgets, (party %in% large_parties)) 97 | 98 | # convert corpus to df 99 | irbudgets_df <- irbudgetsCorpSub$documents %>% select(texts, party, year) %>% mutate(year = as.integer(year)) 100 | 101 | # Let's filter out any NAs 102 | irbudgets_df <- na.omit(irbudgets_df) 103 | 104 | # mean Flesch statistic per party 105 | flesch_point <- irbudgets_df$texts %>% textstat_readability(measure = "Flesch") %>% group_by(irbudgets_df$party) %>% summarise(mean_flesch = mean(Flesch)) %>% setNames(c("party", "mean")) %>% arrange(party) 106 | 107 | # ggplot point estimate 108 | ggplot(flesch_point, aes(x = party, y = mean, colour = party)) + 109 | geom_point() + 110 | coord_flip() + theme_bw() + scale_y_continuous(breaks=seq(floor(min(flesch_point$mean)), ceiling(max(flesch_point$mean)), by = 2)) + 111 | xlab("") + ylab("Mean Flesch Score by Party") + theme(legend.position = "none") 112 | 113 | # We will use a loop to bootstrap a sample of texts and subsequently calculate standard errors 114 | iters <- 10 # number of bootstrap replicates drawn per party 115 | 116 | library(pbapply) 117 | # build function to be used in bootstrapping: resamples the rows of party_data with replacement (same size as the original) and returns the mean Flesch score of the resample 118 | boot_flesch <- function(party_data){ 119 | N <- nrow(party_data) 120 | bootstrap_sample <- sample_n(party_data, N, replace = TRUE) 121 | readability_results <- textstat_readability(bootstrap_sample$texts, measure = "Flesch") 122 | return(mean(readability_results$Flesch)) 123 | } 124 | 125 | # apply function to each party 126 | boot_flesch_by_party <- 
pblapply(large_parties, function(x){ 127 | sub_data <- irbudgets_df %>% filter(party == x) 128 | output_flesch <- lapply(seq_len(iters), function(i) boot_flesch(sub_data)) 129 | return(unlist(output_flesch)) 130 | }) 131 | names(boot_flesch_by_party) <- large_parties 132 | 133 | # compute mean and std.errors 134 | party_means <- lapply(boot_flesch_by_party, mean) %>% unname() %>% unlist() 135 | party_ses <- lapply(boot_flesch_by_party, sd) %>% unname() %>% unlist() # bootstrap standard error = sample standard deviation of the bootstrap distribution 136 | 137 | # Plot results--party 138 | plot_dt <- tibble(party = large_parties, mean = party_means, ses = party_ses) 139 | 140 | # confidence intervals 141 | interval1 <- -qnorm((1-0.9)/2) # 90% multiplier 142 | interval2 <- -qnorm((1-0.95)/2) # 95% multiplier 143 | 144 | # ggplot point estimate + variance 145 | ggplot(plot_dt, aes(colour = party)) + 146 | geom_linerange(aes(x = party, ymin = mean - ses*interval1, ymax = mean + ses*interval1), lwd = 1, position = position_dodge(width = 1/2)) + 147 | geom_pointrange(aes(x = party, y = mean, ymin = mean - ses*interval2, ymax = mean + ses*interval2), lwd = 1/2, position = position_dodge(width = 1/2), shape = 21, fill = "WHITE") + 148 | coord_flip() + theme_bw() + scale_y_continuous(breaks=seq(floor(min(plot_dt$mean)), ceiling(max(plot_dt$mean)), by = 2)) + 149 | xlab("") + ylab("Mean Flesch Score by Party") + theme(legend.position = "none") 150 | 151 | #------------------------------ 152 | # 6 SOPHISTICATION 153 | #------------------------------ 154 | rm(list = ls()) 155 | #devtools::install_github("kbenoit/sophistication") 156 | library("sophistication") 157 | 158 | # We'll run through the example from https://github.com/kbenoit/sophistication 159 | 160 | # Load data 161 | data(data_corpus_sotu, package = "quanteda.corpora") 162 | 163 | # Make snippets of 1 sentence each, then clean them 164 | snippetData <- snippets_make(data_corpus_sotu, nsentence = 1, minchar = 150, maxchar = 250) 
165 | snippetData <- snippets_clean(snippetData) 166 | head(snippetData) 167 | 168 | # Sample the snippets 169 | testData <- sample_n(snippetData, 5) 170 | 171 | # generate n-1 pairs from n test snippets for a minimum spanning tree 172 | snippetPairsMST <- pairs_regular_make(testData) 173 | 174 | # generate more pairs from a larger sample of data 175 | snippetPairsAll <- pairs_regular_make(snippetData[sample(1:nrow(snippetData), 1000), ]) 176 | 177 | # Make some "Gold" questions -- for use with CrowdFlower workers 178 | # default reading level is Flesch and the default difference in readability of the two snippets in the pair is the 0.1 and 0.9 quantiles 179 | gold_questions <- pairs_gold_make(snippetPairsAll, n.pairs = 10) 180 | -------------------------------------------------------------------------------- /W5_02_28_19/LaverGarry.cat: -------------------------------------------------------------------------------- 1 | CULTURE 2 | CULTURE-HIGH 3 | ART (1) 4 | ARTISTIC (1) 5 | DANCE (1) 6 | GALLER* (1) 7 | MUSEUM* (1) 8 | MUSIC* (1) 9 | OPERA* (1) 10 | THEATRE* (1) 11 | CULTURE-POPULAR 12 | MEDIA (1) 13 | SPORT 14 | ANGLER* (1) 15 | PEOPLE (1) 16 | WAR_IN_IRAQ (1) 17 | CIVIL_WAR (1) 18 | ECONOMY 19 | +STATE+ 20 | ACCOMMODATION (1) 21 | AGE (1) 22 | AMBULANCE (1) 23 | ASSIST (1) 24 | BENEFIT (1) 25 | CARE (1) 26 | CARER* (1) 27 | CHILD* (1) 28 | CLASS (1) 29 | CLASSES (1) 30 | CLINICS (1) 31 | COLLECTIVE* (1) 32 | CONTRIBUTION* (1) 33 | COOPERATIVE* (1) 34 | CO-OPERATIVE* (1) 35 | DEPRIVATION (1) 36 | DISABILITIES (1) 37 | DISADVANTAGED (1) 38 | EDUCAT* (1) 39 | ELDERLY (1) 40 | EQUAL* (1) 41 | ESTABLISH (1) 42 | FAIR* (1) 43 | GUARANTEE* (1) 44 | HARDSHIP (1) 45 | HEALTH* (1) 46 | HOMELESS* (1) 47 | HOSPITAL* (1) 48 | HUNGER (1) 49 | INEQUAL* (1) 50 | INVEST (1) 51 | INVESTING (1) 52 | INVESTMENT (1) 53 | MEANS-TEST* (1) 54 | NURSE* (1) 55 | PATIENTS (1) 56 | PENSION (1) 57 | POOR (1) 58 | POORER (1) 59 | POOREST (1) 60 | POVERTY (1) 61 | REHOUSE* (1) 62 | 
RE-HOUSE* (1) 63 | SCHOOL (1) 64 | TEACH* (1) 65 | TRANSPORT (1) 66 | UNDERFUND* (1) 67 | UNEMPLOY* (1) 68 | VULNERABLE (1) 69 | WIDOW* (1) 70 | =STATE= 71 | ACCOUNTANT (1) 72 | ACCOUNTING (1) 73 | ACCOUNTS (1) 74 | ADVERT* (1) 75 | AIRLINE* (1) 76 | AIRPORT* (1) 77 | AUDIT* (1) 78 | BANK* (1) 79 | BARGAINING (1) 80 | BREADWINNER* (1) 81 | BUDGET* (1) 82 | BUY* (1) 83 | CARTEL* (1) 84 | CASH* (1) 85 | CHARGE* (1) 86 | COMMERCE* (1) 87 | COMPENSAT* (1) 88 | CONSUM* (1) 89 | COST* (1) 90 | CREDIT* (1) 91 | CUSTOMER* (1) 92 | DEBT* (1) 93 | DEFICIT* (1) 94 | DWELLING* (1) 95 | EARN* (1) 96 | ECON* (1) 97 | ELECTRICITY (1) 98 | ESTATE* (1) 99 | EXPORT* (1) 100 | FEE (1) 101 | FEES (1) 102 | FINANC* (1) 103 | HOUS* (1) 104 | IMPORT (1) 105 | IMPORTS (1) 106 | INDUSTR* (1) 107 | JOBS (1) 108 | LEASE* (1) 109 | LOAN* (1) 110 | MANUFACTUR* (1) 111 | MORTGAGE* (1) 112 | NEGOTIAT* (1) 113 | OPPORTUNITY (1) 114 | PARTNERSHIP* (1) 115 | PASSENGER* (1) 116 | PAY* (1) 117 | PERFORMANCE (1) 118 | PORT* (1) 119 | PRODUCTIVITY (1) 120 | PROFESSION* (1) 121 | PURCHAS* (1) 122 | RAILWAY* (1) 123 | REBATE* (1) 124 | RECESSION* (1) 125 | RESEARCH* (1) 126 | REVENUE* (1) 127 | SALAR* (1) 128 | SELL* (1) 129 | SETTLEMENT (1) 130 | SOFTWARE (1) 131 | SUPPLIER* (1) 132 | SUPPLY (1) 133 | TELECOM* (1) 134 | TELEPHON* (1) 135 | TENAN* (1) 136 | TOURIS* (1) 137 | TRADE (1) 138 | TRAIN* (1) 139 | WAGE* (1) 140 | WELFARE (1) 141 | WORK* (1) 142 | -STATE- 143 | ASSETS (1) 144 | AUTONOMY (1) 145 | BARRIER* (1) 146 | BID (1) 147 | BIDDERS (1) 148 | BIDDING (1) 149 | BURDEN* (1) 150 | CHARIT* (1) 151 | CHOICE* (1) 152 | COMPET* (1) 153 | CONFIDENCE (1) 154 | CONFISCATORY (1) 155 | CONSTRAIN* (1) 156 | CONTRACTING* (1) 157 | CONTRACTOR* (1) 158 | CONTROLLED (1) 159 | CONTROLLING (1) 160 | CONTROLS (1) 161 | CORPORATE (1) 162 | CORPORATION* (1) 163 | DEREGULATING (1) 164 | DISMANTL* (1) 165 | ENTREPRENEUR* (1) 166 | EXPENSIVE (1) 167 | FLEXIB* (1) 168 | FRANCHISE* (1) 169 | FUNDHOLD* (1) 170 | 
FUND-HOLDING (1) 171 | HOMESTEAD* (1) 172 | INITIATIVE (1) 173 | INTRUSIVE (1) 174 | INVESTOR* (1) 175 | LIBERALI* (1) 176 | MARKET* (1) 177 | MONETARY (1) 178 | MONEY (1) 179 | OWN* (1) 180 | PRIVATE (1) 181 | PRIVATELY (1) 182 | PRIVATISATIONS (1) 183 | PRIVATISED (1) 184 | PRIVATISING (1) 185 | PRODUCE* (1) 186 | PROFITABLE (1) 187 | REGULAT* (1) 188 | RETAIL* (1) 189 | RISK (1) 190 | RISKS (1) 191 | SAVINGS (1) 192 | SELL* (1) 193 | SHARES (1) 194 | SIMPLIF* (1) 195 | SPEND* (1) 196 | SPONSORSHIP (1) 197 | TAXABLE (1) 198 | TAXES (1) 199 | TAX-FREE (1) 200 | THRIFT* (1) 201 | TRADING (1) 202 | VALUE (1) 203 | VOLUNT* (1) 204 | VOUCHER* (1) 205 | ENVIRONMENT 206 | CON ENVIRONMENT 207 | PRODUC* (1) 208 | PRO ENVIRONMENT 209 | CAR (1) 210 | CATALYTIC (1) 211 | CHEMICAL* (1) 212 | CHIMNEY* (1) 213 | CLEAN* (1) 214 | CONGESTION (1) 215 | CYCLIST* (1) 216 | DEPLET* (1) 217 | ECOLOG* (1) 218 | EMISSION* (1) 219 | ENERGY-SAVING (1) 220 | ENVIRONMENT* (1) 221 | FUR (1) 222 | GREEN (1) 223 | HABITAT* (1) 224 | HEDGEROW* (1) 225 | HUSBANDED (1) 226 | LITTER* (1) 227 | OPENCAST (1) 228 | OPEN-CAST* (1) 229 | OZONE (1) 230 | PLANET (1) 231 | POPULATION (1) 232 | RECYCL* (1) 233 | RE-CYCL* (1) 234 | RE-USE (1) 235 | TOXIC (1) 236 | WARMING (1) 237 | GROUPS 238 | ETHNIC 239 | ASIAN* (1) 240 | BUDDHIST* (1) 241 | ETHNIC* (1) 242 | RACE (1) 243 | RACI* (1) 244 | WOMEN 245 | GIRLS (1) 246 | WOMAN (1) 247 | WOMEN (1) 248 | INSTITUTIONS 249 | CONSERVATIVE 250 | AUTHORITY (1) 251 | CONTINU* (1) 252 | DISRUPT* (1) 253 | INSPECT* (1) 254 | JURISDICTION* (1) 255 | LEGITIMATE (1) 256 | MANAG* (1) 257 | MORATORIUM (1) 258 | RUL* (1) 259 | STRIKE* (1) 260 | WHITEHALL (1) 261 | NEUTRAL 262 | ADMINISTR* (1) 263 | ADVIS* (1) 264 | AGENC* (1) 265 | AMALGAMAT* (1) 266 | APPOINT* (1) 267 | ASSEMBLY (1) 268 | CHAIR* (1) 269 | COMMISSION* (1) 270 | COMMITTEE* (1) 271 | CONSTITUEN* (1) 272 | COUNCIL* (1) 273 | DEPARTMENT* (1) 274 | DIRECTORATE* (1) 275 | EXECUTIVE* (1) 276 | HEADQUARTERS (1) 277 
| LEGISLAT* (1) 278 | MECHANISM* (1) 279 | MINISTER* (1) 280 | OFFICE (1) 281 | OFFICES (1) 282 | OFFICIAL (1) 283 | OPERAT* (1) 284 | OPPOSITION (1) 285 | ORGANISATION* (1) 286 | PARLIAMENT* (1) 287 | PRESIDEN* (1) 288 | PROCEDUR* (1) 289 | PROCESS* (1) 290 | QUEEN (1) 291 | REGIST* (1) 292 | SCHEME* (1) 293 | SECRETARIAT* (1) 294 | SOVEREIGN* (1) 295 | SUBCOMMITTEE* (1) 296 | TRIBUNAL* (1) 297 | VOTE* (1) 298 | VOTING (1) 299 | WESTMINSTER (1) 300 | RADICAL 301 | ABOLITION (1) 302 | ACCOUNTABLE (1) 303 | ANSWERABLE (1) 304 | CONSULT* (1) 305 | CORRUPT* (1) 306 | DEMOCRATIC* (1) 307 | ELECT* (1) 308 | IMPLEMENT* (1) 309 | MODERN* (1) 310 | MONITOR* (1) 311 | REBUILD* (1) 312 | REEXAMINE* (1) 313 | REFORM* (1) 314 | RE-ORGANI* (1) 315 | REPEAL* (1) 316 | REPLACE* (1) 317 | REPRESENTAT* (1) 318 | SCANDAL* (1) 319 | SCRAP (1) 320 | SCRAP* (1) 321 | SCRUTIN* (1) 322 | TRANSFORM* (1) 323 | VOICE* (1) 324 | LAW_AND_ORDER 325 | LAW-CONSERVATIVE 326 | ASSAULTS (1) 327 | BAIL (1) 328 | BURGLAR* (1) 329 | CONSTAB* (1) 330 | CONVICT* (1) 331 | COURT (1) 332 | COURTS (1) 333 | CUSTOD* (1) 334 | DEALING (1) 335 | DELINQUEN* (1) 336 | DETER (1) 337 | DETER* (1) 338 | DISORDER (1) 339 | DRUG* (1) 340 | FINE (1) 341 | FINES (1) 342 | FIRMNESS (1) 343 | FORCE* (1) 344 | FRAUD* (1) 345 | GUARD* (1) 346 | HOOLIGAN* (1) 347 | ILLEGAL* (1) 348 | INTIMIDAT* (1) 349 | JOY-RIDE* (1) 350 | LAWLESS* (1) 351 | MAGISTRAT* (1) 352 | OFFENCE* (1) 353 | OFFICER* (1) 354 | PENAL* (1) 355 | POLICE (1) 356 | POLICEMEN (1) 357 | POLICING (1) 358 | PRISON* (1) 359 | PROBATION (1) 360 | PROSECUTION (1) 361 | PUNISH* (1) 362 | RE-OFFEND (1) 363 | RUC (1) 364 | SEIZ* (1) 365 | SENTENCE* (1) 366 | SHOP-LIFTING (1) 367 | SQUATTING (1) 368 | TERROR* (1) 369 | THEFT* (1) 370 | THUG* (1) 371 | TOUGH* (1) 372 | TRAFFICKER* (1) 373 | UNIFORMED (1) 374 | UNLAWFUL (1) 375 | VANDAL* (1) 376 | VICTIM* (1) 377 | VIGILAN* (1) 378 | LAW-LIBERAL 379 | HARASSMENT (1) 380 | NON-CUSTODIAL (1) 381 | RURAL 382 | 
AGRICULTUR* (1) 383 | BADGERS (1) 384 | BIRD* (1) 385 | COUNTRYSIDE (1) 386 | FARM* (1) 387 | FEED (1) 388 | FISH* (1) 389 | FOREST* (1) 390 | HENS (1) 391 | HORSE* (1) 392 | LANDSCAPE* (1) 393 | LANE* (1) 394 | LIVESTOCK (1) 395 | MEADOWS (1) 396 | VILLAGE* (1) 397 | WILDLIFE (1) 398 | URBAN 399 | TOWN* (1) 400 | VALUES 401 | CONSERVATIVE 402 | DEFEND (1) 403 | DEFENDED (1) 404 | DEFENDING (1) 405 | DISCIPLINE (1) 406 | GLORIES (1) 407 | GLORIOUS (1) 408 | GRAMMAR (1) 409 | HERITAGE (1) 410 | HISTOR* (1) 411 | HONOUR* (1) 412 | IMMIGRA* (1) 413 | INHERIT* (1) 414 | INTEGRITY (1) 415 | JUBILEE* (1) 416 | LEADER* (1) 417 | MAINTAIN (1) 418 | MAJESTY (1) 419 | MARRIAGE (1) 420 | OBSCEN* (1) 421 | PAST (1) 422 | PORNOGRAPH* (1) 423 | PRESERV* (1) 424 | PRIDE (1) 425 | PRINCIPL* (1) 426 | PROBITY (1) 427 | PROFESSIONALISM (1) 428 | PROUD (1) 429 | PUNCTUAL* (1) 430 | RECAPTURE* (1) 431 | RELIAB* (1) 432 | THREAT* (1) 433 | TRADITION* (1) 434 | LIBERAL 435 | CRUEL* (1) 436 | DISCRIMINAT* (1) 437 | HUMAN* (1) 438 | INJUSTICE* (1) 439 | INNOCENT (1) 440 | INTER_RACIAL (1) 441 | MINORIT* (1) 442 | REPRESSI* (1) 443 | RIGHTS (1) 444 | SEX* (1) 445 | -------------------------------------------------------------------------------- /W5_02_28_19/Session 5 - Supervised Learning I.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodríguez 2 | # Course: Text as Data 3 | # Date: 2/28/2019 4 | # Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang. 5 | 6 | #QUESTIONS FROM LAST LAB: 7 | # BOOTSTRAPPING EXAMPLE: why replace? 
8 | # TTR: why the difference 9 | # KNITR 10 | 11 | #---------------------------------------- 12 | # 1 Set up environment --- 13 | #---------------------------------------- 14 | # clear global environment 15 | rm(list = ls()) 16 | 17 | # set path where our data is stored 18 | setwd("~/Dropbox/GitHub/Text-as-Data-Lab-Spring-2019/W5_02_28_19/") 19 | 20 | # load required libraries 21 | library(quanteda) 22 | library(quanteda.corpora) 23 | library(dplyr) 24 | 25 | #---------------------------------------- 26 | # ASIDE - Bootstrapping (question from last class) --- 27 | #---------------------------------------- 28 | # try setting replace = TRUE & replace = FALSE 29 | # for bootstrapping we need to set replace = TRUE 30 | sample_pop <- 1:5 31 | sample_pop_size <- length(sample_pop) 32 | lapply(1:5, function(x) sample(sample_pop, sample_pop_size, replace = TRUE)) 33 | 34 | #---------------------------------------- 35 | # 2 Load data: conservative manifestos --- 36 | #---------------------------------------- 37 | # read in the files 38 | filenames <- list.files(path = "conservative_manifestos", full.names=TRUE) 39 | cons_manifestos <- lapply(filenames, readLines) 40 | cons_manifestos <- unlist(lapply(cons_manifestos, function(x) paste(x, collapse = " "))) # because readLines returns a character vector with one element per line 41 | 42 | # get the date docvar from the filename 43 | dates <- unlist(regmatches(unlist(filenames), gregexpr("[[:digit:]]+", unlist(filenames)))) 44 | 45 | # construct tibble (a tibble is an "enhanced" data.frame) 46 | #?tibble 47 | manifestos_df <- tibble(year = dates, text = cons_manifestos) 48 | 49 | #---------------------------------------- 50 | # 3 Regular expressions --- 51 | #---------------------------------------- 52 | 53 | # Examples 54 | words <- c("Washington Post", "NYT", "Wall Street Journal", "Peer-2-Peer", "Red State", "Cheese", "222", ",") 55 | 56 | # Exploring by character type 57 | #?grep 58 | grep("\\w", words, value = TRUE) # Elements that 
have alphanumeric characters 59 | grep("\\w{7}", words, value = TRUE) # Elements that have words that are at least 7 characters long 60 | grep("\\d", words, value = TRUE) # Elements that contain numbers 61 | grep("\\W", words, value = TRUE) # Elements that contain nonword characters (Including white space) 62 | 63 | # note that grep returns the full element that matched the pattern 64 | 65 | words2 <- c("voting", "votes", "devoted", "vote") 66 | 67 | grep("^vot", words2) # Returns the index of matching items in the vector 68 | grep("^vot", words2, value = TRUE) # Returns the elements of the vector that matched the pattern 69 | grepl("^vot", words2) # Returns a logical vector indicating whether or not the component contains the expression 70 | 71 | # you can use the indices (or the logical vector from grepl) to select elements from the original vector that you want 72 | words2[grepl("^vot", words2)] 73 | 74 | presidents <- c("Roosevelt-33", "Roosevelt-37", "Obama-2003") 75 | 76 | # Use gsub to replace patterns with a string 77 | gsub("(\\w+)-(\\d{2})", "\\1-19\\2", presidents) # Parentheses can identify components that can later be referenced by \\1 - \\2 78 | gsub("(\\w+)-(\\d{2})$", "\\1-19\\2", presidents) # We want to use the $ to indicate that the pattern should come at the end of the word, to avoid the mismatch in Obama-192003 79 | 80 | # Note that regex expressions in R are similar to those in other languages but there are some key differences 81 | 82 | # Resources: 83 | # other packages to work with regular expressions: stringr, stringi 84 | # cheatsheet for regex: https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf 85 | # https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html 86 | # http://r4ds.had.co.nz/strings.html#matching-patterns-with-regular-expressions 87 | 88 | #---------------------------------------- 89 | # 4 Selecting Features from DFM using Regular Expressions --- 90 | #---------------------------------------- 91 | 92 | # Using simple 
texts 93 | 94 | testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with the newspaper from a a boy named Seamus, in his mouth." 95 | 96 | print(dfm(testText, select = "s$", valuetype = "regex")) # keep only words ending in "s" 97 | 98 | testTweets <- c("2 + 2 = 4 #1984", 99 | "I thought you said the park? Why are we at the vet? #QuestionsFromPets", 100 | "Holy freeway #flooding Batman! #californiastorms taking their toll.") 101 | 102 | print(dfm(testTweets, select="^#", valuetype = "regex")) # keep only hashtags i.e. expressions starting with a pound sign 103 | 104 | # Selecting features from a corpus 105 | 106 | data("data_corpus_irishbudget2010") 107 | 108 | irishbudgets_dfm <- dfm(data_corpus_irishbudget2010, select=c("tax|budg|^auster"), 109 | valuetype = "regex") # valuetype = "regex" ensures that the select input will be interpreted as a regular expression 110 | 111 | # You can pass a list of words to the "select" parameter in dfm, but using regular expressions can enable you to get all variants of a word 112 | View(irishbudgets_dfm) 113 | 114 | #---------------------------------------- 115 | # 5 Dictionaries --- 116 | #---------------------------------------- 117 | # Here, dictionary = list of words, not the data structure. 
118 | # Python users: there is no dictionary object in R :( :( :( (Note: you can create dictionary-like objects using lists) 119 | 120 | mytexts <- c("The new law included a capital gains tax, and an inheritance tax.", 121 | "New York City has raised a taxes: an income tax and a sales tax.") 122 | 123 | mydict <- c("tax", "income", "capital", "gains", "inheritance") 124 | 125 | print(dfm(mytexts, select = mydict)) 126 | 127 | # Example: Laver Garry dictionary 128 | # https://rdrr.io/github/kbenoit/quanteda.dictionaries/man/data_dictionary_LaverGarry.html 129 | # https://provalisresearch.com/products/content-analysis-software/wordstat-dictionary/laver-garry-dictionary-of-policy-position/ 130 | lgdict <- dictionary(file = "LaverGarry.cat", format = "wordstat") 131 | 132 | # What's in this thing? 133 | lgdict 134 | 135 | # Run the conservative manifestos through this dictionary 136 | manifestos_lg <- dfm(manifestos_df$text, dictionary = lgdict) 137 | 138 | # how does this look 139 | as.matrix(manifestos_lg)[1:5, 1:5] 140 | featnames(manifestos_lg) 141 | 142 | # plot it 143 | plot(manifestos_df$year, 144 | manifestos_lg[,"CULTURE.SPORT"], 145 | xlab="Year", ylab="SPORTS", type="b", pch=19) 146 | 147 | plot(manifestos_df$year, 148 | manifestos_lg[,"VALUES.CONSERVATIVE"], 149 | xlab="Year", ylab="Conservative values", type="b", pch=19) 150 | 151 | plot(manifestos_df$year, 152 | manifestos_lg[,"INSTITUTIONS.CONSERVATIVE"] - manifestos_lg[,"INSTITUTIONS.RADICAL"], 153 | xlab="Year", ylab="Net Conservative Institutions", type="b", pch=19) 154 | 155 | # RID Dictionary--Regressive Imagery Dictionary 156 | # https://www.kovcomp.co.uk/wordstat/RID.html 157 | rid_dict <- dictionary(file = "RID.CAT", format = "wordstat") # the file in this folder is named RID.CAT (upper-case extension); the name must match exactly on case-sensitive file systems 158 | 159 | data("data_corpus_sotu") 160 | 161 | sotus_texts <- texts(data_corpus_sotu) 162 | 163 | # Get the docvars from the corpus object 164 | year <- (data_corpus_sotu$documents$Date) 165 | pres <- (data_corpus_sotu$documents$President) 166 | 167 | 
sotu_rid_dfm <- dfm(data_corpus_sotu, dictionary = rid_dict) 168 | 169 | # Look at the categories 170 | featnames(sotu_rid_dfm) 171 | 172 | # Inspect the results graphically 173 | plot(year, 174 | sotu_rid_dfm[,"PRIMARY.REGR_KNOL.NARCISSISM"], 175 | xlab="Year", ylab="Narcissism", type="b", pch=19) 176 | 177 | plot(year, 178 | sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.FIRE"] + sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.ASCEND"] +sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.DESCENT"] + 179 | sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.DEPTH"] + sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.HEIGHT"] + sotu_rid_dfm[,"PRIMARY.ICARIAN_IM.WATER"], 180 | xlab="Year", ylab="Icarian-ness", type="b", pch=19) 181 | 182 | -------------------------------------------------------------------------------- /W5_02_28_19/conservative_manifestos/Con1918.txt: -------------------------------------------------------------------------------- 1 | 1918 Conservative Party General Election Manifesto 2 | The Manifesto of Lloyd George and Bonar Law 3 | 4 | The Coalition Government, supported by the strenuous and united labours of the whole nation, has now accomplished the gravest portion of its task. Our enemies have been defeated in the field, their armies are broke, and their Governments are over-turned. Thanks to the patient valour of the hosts of freedom, the knell of military autocracy has sounded forever in the Continent of Europe. Other tasks directly arising out of the war now await our nation, and can only be surmounted by the good sense, the patriotism, and the forbearance of our people. The unity of the nation which has the patriotism, and the forbearance of our people. The unity of the nation which has been the great secret of our strength in war must not be relaxed if the many anxious problems which the war has bequeathed to us are to be handled with the insight, courage, and prompritude which the times demand. 
5 | 6 | As a preliminary to the soltuion of these problems it is essential that a fresh Parliament should be summoned, possessed of the authority with a General Election alone can give it, to make the peace of Europe and to deal with the difficult transitional period which will follow the cessation of hostilities. Indeed, the present Parliament has long outstayed its appointed term, and meanwhile millions of new voters, including for the first time representatives of the womanhood of the country, have been added to the electorate. It is right that the Government, upon whom it devolves in conjunction with our Dominions and our allies to settle the political future of Europe, should be supported by the confidence of the vast body of newly enfranchised citizens. 7 | 8 | We appeal, then, to every section of the electorate, without distinction of party, to support the Coalition Government in the execution of a policy devised in the interests of no particular class or section, but, so far as our light serves us, for the furtherance of the general good. Our first task must be to conclude a just and lasting peace, and so to establish the foundations of a new Europe that occasion for further wars may be for ever averted. The brilliant and conclusive triumph of the Allied Armies will, we hope, render it possible to reduce the burden of our armaments and to release by successive and progressive stages the labour and capital of the Empire for the arts of peace. To avert a repetition of the horrors of war, which are aggravated by the onward march of science, it will be the earnest endeavour of the Coalition Government to promote the formatino of a League of Nations, which may serve not only to ensure society against the calamitous results of militarism but to further a fruitful mutual understanding between the associated peoples. 
Never have the men and women of our race played so great and commanding a part in the affairs of the whole world as during the tempests and trials of this great war, and ever has the British name been so widely honoured. 9 | 10 | The care of the soldiers and sailors, officers and men, whose heroism has won for us this great deliverance, and who return to civil life, is a primary obligation of patriotism, and the Government will endeavour to assist such members of the armed forces of the Crown as may desire to avail themselves of facilities for special industrial training and to return to civil life under conditions worthy of their services to the country. Plans have been prepared, and will be put into execution as soon as the new Parliament assembles, whereby it will be the duty of public authorities and, if necessary, of the State itself to acquire land on simple and economical basis for men who have served in the war, either for cottages with gardens, allotments, or small holdings as the applicants may desire and be suited for, with frants provided to assist in training and initial equipment. In addition to this, we intend to secure and to promote the further development and cultivation of allotments and small holdings generally so far as may be required in the public interest. 11 | 12 | Increased production must necessarily be the basis of all schemes for the improvement the conditions of the people. The war has revealed the extent to which the resources of the country have been dissipated and depressed by lack of organisation or by wasteful organisation. It has been demonstrated that the land of the country, if properly cultivated and used, could have yielded food and other products of the soil to a much larger extent. It must be among the first tasks of the new Government to repair this error, which added so much to our difficulties in our struggles against the submarines of the enemy. 13 | 14 | The war has given fresh impetus to agriculture. 
This must not be allowed to expire. Scientific farming must be promoted, and the Government regard the maintenance of a satisfactory agricultural wage, the improvement of village life, and the development of rural industries as essential parts of an agricultural policy. Arrangements have been made whereby extensive afforestation and reclamation schemes may be entered upon without delay. A systematic improvement in the transport facilities of the resources of the soil, and the Government are preparing plans with a view to increasing these facilities on a large scale. 15 | 16 | The principal concern of every Government is the must be the condition of the great mass of the people who live by manual toil. The steadfast spirit of our workers, displayed on all the wide field of action opened out by the war - in the trenches, on the ocean, in the air, in field, mine, and factory - has left an imperishable mark on the heart and conscience of the nation. One of the first tasks of the Government will be to deal on broad and comprehensive lines with the housing of the people, which during the war has fallen so sadly into arrears, and upon which the well-being of the nation so largely depends. Larger opportunities for education, improved material conditions, and the prevention of degrading stndards of employment; a proper adaption to peace conditions of the experience which during the war we have gained in regard to the traffic in drink - these are among the conditions of social harmony which we shall earnestly endeavour to promote. It will be the fundamental object of the Coalition to promote the unity and development of our Empire and of the nations of which it is composed, to preserve for them the position and influence and authority which they have gained by their sacrifices and efforts in the cause of human liberty and progress, and to bring into being such conditions of living for the inhabitants of the British Isles as will secure plenty and opportunity to all. 
17 | 18 | Until the country has returned to normal industrial conditions it would be premature to prescribe a fiscal policy intended for permanence. We must endeavour to reduce the war debt in such a manner as may inflict the least injury to industry and credit. The country will need all the food, all the raw material, and all the credit which it can obtain, and fresh taxes ought not to be imposed on food or upon the raw materials of our industry. At the same time a preference will be given to our Colonies upon existing duties and upon any duties which, for our own purpose, may be subsequently imposed. One of the lessons which has been most clearly taught us by the war is the danger to the nation of being dependent upon other countries for vital supplies on which the life of the nation may depend. It is the intention therefore of the Government to preserve and maintain where necessary these key industries in the way which experience and examination may prove to be best adapted for the purpose. If production is to be maintained at the highest limit at home, security must be given against the unfair competition to which our industries may be subjected by the dumping of goods produced abroad and sold on our market below the actual cost of production. The military institutions of the country must necessarily be dependent upon the needs of the Empire and the prospective requirements of any League for the preservation of peace to which this country may hereafter be a party. Meanwhile it will be the aim of the Government to carry through the inevitable reductions in our military and naval establishments with the least possible suffering to individuals and to the best advantage of industry and trade. 19 | 20 | Active measures will be needed to secure employment for the workers of the country. Industry will rightly claim to be liberated at the earliest possible moment from Government control. 
By the development and control in the best interest of the State of the economical production of power and light, of the railways and the means of communication, by the improvement of the Consular Service, and by the establishment of regular machinery for consultation with representative trade and industrial organisations on matters affecting their interest and prosperity, output will be increased, new markets opened out, and great economies effected in industrial production. 21 | 22 | It will be the duty of the new Government to remove all existing inequalities of the law as between men and women. 23 | 24 | It has been recognised by all parties that reform is urgently required in the constitution of the House of Lords, and it will be one of the objects of the Government to create a Second Chamber which will be based upon direct contact with the people, and will therefore be representative enough adequately to perform its functions. 25 | 26 | The people of this country are not unmindful of the conspicuous services rendered by the Princes and people of India to the common cause of civilisation during the war. The Cabinet has already defined in unmistakeable language the goal of British policy in India to be the development of responsible government by gradual stages. To the general terms of that declaration we adhere and propose to give effect. 27 | 28 | Ireland is unhappily rent by contending forces, and the main body of Irish opinion has seldom been more inflamed or less disposed to compromise than it is at the present moment. So long as the Irish question remains unsettled there can be no political peace either in the United Kingdom or in the Empire, and we regard it as one of the first obligations of British statesmanship to explore all practical paths towards the settlement of this grave and difficult question on the basis of self-government. 
But there are two paths which are closed - the one leading to a complete severance of Ireland from the British Empire, and the other to the forcible submission of the six counties of Ulster to a Home Rule Parliament against their will. In imposing these two limitations we are only acting in accordance with the declared views of all English political leaders. 29 | 30 | It is a source of pride to be of this age, and to be members of this nation. In the whole course of the world's history no generation has been compelled to face sacrifices such as we have steadfastly endured, or perils such as we have victoriously confronted. Well and truly have rich and poor, castle and cottage, stood the ordeal of fire. Right earnestly do we trust that the united temper, the quiet fortitude, the high and resolute patriotism of our nation may be long preserved into the golden times of peace. 31 | -------------------------------------------------------------------------------- /W5_02_28_19/conservative_manifestos/Con1922.txt: -------------------------------------------------------------------------------- 1 | 1922 Conservative Party General Election Manifesto 2 | Andrew Bonar Law's Election Address 3 | 4 | His Majesty has been graciously pleased to appoint me First Minister of the Crown. I appeal to you to renew your confidence in myself as your representative, and to give your support to the new Government of which I am the head. The crisis which has arisen so suddenly has made it absolutely necessary that an immediate appeal should be made to the people, and in consequence it has been impossible to have an examination with my colleagues into the many questions with which we have to deal. Of necessity, therefore, the outlines of policy which I now submit to you cannot be as definite and precise as in other circumstances would have been possible. 
5 | 6 | The crying need of the nation at this moment - a need which in my judgement far exceeds any other - is that we should have tranquility and stability both at home and abroad so that free scope should be given to the initiative and enterprise of our citizens, for it is in that way far more than by any action of the Government that we can hope to recover from the economic and social results of the war. 7 | 8 | With this in view I think it is of the utmost importance that we should return as quickly as possible, to the normal procedure which existed before the war. In pursuance of this aim I am satisfied that the time has now come when a change should be made in the machinery of the central Government. Some of the work which has hitherto been done by the Cabinet Secretariat is essential and must be continued, but we intend to bring that body in its present form to an end, and I am certain that the necessary work can be continued, and the invaluable services of the present Secretary retained, in connection with the Treasury, which in the past has always been the central department of Government. As an illustration of the changes which we contemplate, instructions have been already given to transfer to the Foreign Office the machinery of the League of Nations, and in the same way to arrange, as regards any future International Conferences, that even where it is necessary that I as Prime Minister should be present, the machinery of the Conferences and the preliminary work in connection with them will be performed not by the Cabinet Secretariat but by the Foreign Office itself. 9 | 10 | At the present moment the first foreign interest not alone of Great Britain and of the British Empire, but of the world, is the re-establishment of peace. In all our foreign relations we intend to pursue an even course, loyally fulfilling the obligations we have undertaken, but resolutely determined not to extend our commitments, and should reasonable occasion arise to curtail them. 
It was by wholehearted co-operation, often under great difficulty, and with great differences of opinion, that we won the war. It is only by the same frank and full co-operation, conducted in the same spirit, with France and our other great Allies, that we can hope to solve the difficult problems with which we are now confronted. It is my confident hope that under the well-tried guidance of the Secretary of State for Foreign Affairs the negotiations for the settlement of the Near Eastern crisis will result in a true and lasting peace, conducing both to the political tranquillity of the Near and Middle East, with which so many of our Imperial interests are bound up, and to the personal security and happiness of the inhabitants of all races and creeds in the regions which have been the scene of so much disturbance and suffering. 11 | 12 | During the war the feeling supreme in the minds of men and women throughout the world was that a similar calamity should never again be allowed to fall upon mankind. It was to meet this feeling that the League of Nations was instituted, and it will be our earnest aim to give it wholehearted and practical support. The maintenance of our friendship and good understanding with the United States, based not on any formal alliance but on community of inherited ideals as well as on recent comradeship in arms, must always be a principal aim of British policy. Above all, we mean, in all matters affecting the external policy or security of the Empire, to act in close and continuous consultation with the Governments of the Dominions and of India in order to ensure that our policy shall keep fully in view both the interests and sentiments of our fellow subjects overseas, and at all times have behind it the moral support of the whole British Commonwealth. 
13 | 14 | Our first task, if returned to power, will be the ratification of the Irish Treaty. We are prepared to take our part in making good that Treaty, both in the letter and in the spirit, and to co-operate with the Irish Government in the new relationship within the Empire which the Treaty will have created. We are equally pledged to safeguard the freedom of choice and the security of the Parliament and Government of Northern Ireland. We earnestly hope that further progress will be made in dealing with the anarchy in the South, and that both in the North and in the South it will be realised that the prosperity of Ireland as a whole can only be achieved by good will between the Governments and peoples of the two portions of that country. The position of the innocent victims of recent disturbances is a matter of the gravest concern to the people of this country, and it will be the duty of the Government to keep in the closest touch with the Government of the Irish Free State on this matter, so that just claims for compensation may have sympathetic consideration. 15 | 16 | We desire to promote the quiet and orderly development of India under the constitution which was conferred on her by the Act of 1919. The co-operation of all classes and sections is essential to the progress and prosperity of India, and, if that be secured, we can look forward with confidence to an industrial development which will add to her resources and give increased stability to her economic structure. 17 | 18 | At home our chief preoccupation at this time is the state of trade and employment. The immediate problem of unemployment this winter will call for emergency measures. Plans for dealing with the situation have already been considered by the late Government. They will be examined afresh by us with a view of seeing whether any improvements are possible, and the necessary steps will then be taken with the least avoidable delay. 
Such remedies, however, can only be palliatives, and the real recovery will not come except from the revival of trade and industry. To secure this result, the first essential is to reduce expenditure to the lowest attainable level in the hope that the taxpayer may find some relief from the burden of taxation which not only presses so heavily upon individuals, but is the greatest clog upon the wheels of national industry. 19 | 20 | Every Candidate, in every constituency, will, as I do, make retrenchment an essential part of his programme. All that I can possibly say, knowing how great are the difficulties, is that we should do our best to secure it. It will also be our endeavour in any way in our power to help trade, and the method of doing this, which seems to me most helpful, is the development of trade within the Empire itself. The markets, which for the time at least, as a consequence of the war, we have lost in Europe, can best be replaced by further development of trade with overseas countries, and especially of trade within the British Empire. We propose, therefore, immediately to consult the Governments of the self-governing Dominions and, if they approve, to summon, as early as possible, an Economic Conference with the view of finding in what way by mutual co-operation we can best develop the vast trade of which, in my opinion, the resources of the Empire admit. 21 | 22 | There is one branch of industry to which I must specially refer. As a consequence of the war, agriculture, the greatest of our national industries, is in a most serious condition, and demands the practical sympathy of the Government. It is not easy to specify the exact method by which that sympathy can be shown, but we shall immediately examine the whole problem afresh in the hope of making proposals which will assist the agricultural community to overcome the difficulties that now confront them. 
23 | 24 | There are many measures of legislative and administrative importance which, in themselves, would be desirable, and which, in other electorate. But I do not feel that they can, at this moment, claim precedence over the nation's first need, which is, in every walk of life, to get on with its work with the minimum of interference at home and of disturbance abroad. 25 | Conservative Party Manifestos 26 | 27 | -------------------------------------------------------------------------------- /W5_02_28_19/conservative_manifestos/Con1923.txt: -------------------------------------------------------------------------------- 1 | 1923 Conservative Party General Election Manifesto 2 | Stanley Baldwin's Election Address 3 | 4 | In submitting myself to you for re-election, I propose frankly to put before you the present situation as I see it, and the measures which in the opinion of myself and my colleagues are necessary adequately to deal with it. 5 | 6 | 1. The unemployment and under-employment which our working people and our great national industries are now facing for the fourth winter in succession, on a scale unparalleled in our history, having created a problem which calls urgently for a solution. Their indefinite continuance threatens to impair permanently the trained skill and the independent spirit of our workers, to disorganise the whole fabric of industry and credit, and, by eating away the sources of revenue, to undermine the very foundations of our national and municipal life. 7 | 2. In large measure this state of affairs is due to the political and economic disorganisation of Europe consequent on the Great War. In accordance with the policy affirmed by the Imperial Conference we shall continue to devote every effort through the League of Nations and by every other practical means, to the restoration of a true peace in Europe. But that at the best must take time. 
A year ago Mr Bonar Law could still hope that a more settled condition of affairs was in prospect, absence of any modification of fiscal policy, of the ultimate necessity of which he himself was always convinced. Since the occupation of the Ruhr it has become evident that we are confronted by a situation which, even if it does not become worse, is not likely to be normal for years to come. 8 | 3. The disorganisation and poverty of Europe, accompanied by broken exchanges and by higher tariffs all the world over, have directly and indirectly narrowed the whole field of our foreign trade. In our own home market the bounty given to the importation of foreign goods by depreciated currencies, and by the reduced standard of living in many European countries, has exposed us to a competition which is essentially unfair and is paralysing enterprise and initiative. It is under such conditions that we have to find work for a population which, largely owing to the cessation during the war period of the normal flow of migration to the Dominions, has in the last census period increased by over a million and three quarter souls. 9 | 4. No Government with any sense of responsibility could continue to sit with tied hands watching the unequal struggle of our industries or content itself with palliatives which, valuable as they are to mitigate the hardship to individuals, must inevitably add to the burden of rates and taxes and thereby still further weaken our whole economic structure. Drastic measures have become necessary for dealing with present conditions as long as they continue. 10 | 5. The present Government hold themselves pledged by Mr Bonar Law not to make any fundamental change in the fiscal system of the country without consulting the electorate. 
Convinced, as I am, that only by such a change can a remedy be found, and that no partial measures such as the extension of the Safeguarding of Industries Act, can meet the situation, I am in honour bound to ask the people to release us from this pledge without further prejudicing the situation by any delay. That is the reason, and the only reason, which has made this election necessary. 11 | 6. What we propose to do for the assistance of employment in industry, if the nation approves, is to impose duties on imported manufactured goods, with the following objects:- 12 | * to raise revenue by methods less unfair to our own home production which at present bears the whole burden of local and national taxation, including the cost of relieving unemployment. 13 | * to give special assistance to industries which are suffering under unfair foreign competition; 14 | * to utilise these duties in order to negotiate for a reduction of foreign tariffs in those directions which would most benefit our export trade; 15 | * to give substantial preference to the Empire on the whole range of our duties with a view to promoting the continued extension of the principle of mutual preference which has already done so much for the expansion of our trade, and the development, in co-operation with the other Governments of the Empire, of the boundless resources of our common heritage. 16 | 7. Such a policy will defend our industries during the present emergency and will enable us, as more normal conditions return, to work effectively to secure a greater measure of real Free Trade both within the Empire and with foreign countries. Trade which is subject to the arbitrary interference of every foreign tariff, and at the mercy of every disturbance arising from the distractions of Europe, is in no sense free, and is certainly not fair to our own people. 17 | 8. 
It is not our intention, in any circumstances, to impose any duties on wheat, flour, oats, meat (including bacon and ham), cheese, butter or eggs. 18 | 9. While assisting the manufacturing industries of the country we propose also to give a direct measure of support to agriculture. Agriculture is not only, in itself, the greatest and most important of our national industries, but is of especial value as supplying the most stable and essentially complementary home market for our manufacturers. 19 | 10. We propose to afford this assistance by a bounty of £1 an acre on all holdings of arable land exceeding one acre. The main object of that bounty is to maintain employment on the land and so keep up the wages of agricultural labour. In order to make sure of this we shall decline to pay the bounty to any employer who pays less than 30/- a week to an able-bodied labourer. 20 | 11. The exclusion from any import duties of the essential foodstuffs which I have mentioned, as well as of raw materials, undoubtedly imposes a certain limitation upon the fullest extension of Imperial Preference. But even the preferences agreed to at the recent Economic Conference within our existing fiscal system, have been acknowledged as of the greatest value by the Dominion representatives, and our present proposals will offer a much wider field, the value of which will be progressively enhanced by the increasing range and variety of Empire production. 21 | 12. Moreover in the field of Empire development, as well as in that of home agriculture, we are not confined to the assistance furnished by duties. We have already given an earnest of our desire to promote a better distribution of the population of the Empire through the Empire Settlement Act, and at the Economic Conference we have undertaken to co-operate effectively with the Government of any part of the Empire in schemes of economic development. 
More especially do we intend to devote our attention to the development of cotton growing within the Empire, in order to keep down the cost of a raw material essential to our greatest exporting industry. 22 | 13. These measures constitute a single comprehensive and inter-dependent policy. Without additional revenue we cannot assist agriculture at home, but the income derived from the tariff will provide for this and leave us with means which can be devoted to cotton growing and other development in the Empire, and to the reduction of the duties on tea and sugar which fall so directly upon the working class household. 23 | 14. For the present emergency, and pending the introduction of our more extended proposals, we are making, and shall continue to make, every effort to increase the volume of work for our people. The Government are spending very large sums on every measure of emergency relief that can help in this direction. Further, the local Authorities of all kinds throughout the country, and great individual enterprises, such as the railways, with the assistance of the Government, or on its invitation, are co-operating wholeheartedly in the national endeavour to increase the volume of employment. This great combined effort of the Government, of the Local Authorities, and of individual enterprises, represents an expenditure of no less than £100 millions sterling. 24 | 15. The position of shipbuilding, one of the hardest hit of all our industries, is peculiar. It can only recover as shipping revives with the development of Empire and foreign trade which we believe will follow from our measures. We propose in the meantime to give it special assistance by accelerating the programme of light cruiser construction which will in any case become necessary in the near future. 
We are informed by our Naval advisers that some light cruisers will be required during the next few years in replacement of the County class, as well as a variety of smaller and auxiliary craft, and we intend that a substantial proportion of these shall be laid down as soon as the designs are ready and Parliamentary sanction secured. 25 | 16. The solution of the unemployment problem is the key to every necessary social reform. But I should like to repeat my conviction that we should aim at the reorganisation of our various schemes of insurance against old age, ill-health and unemployment. More particularly should we devote our attention to investigating the possibilities of getting rid of the inconsistencies and the discouragement of thrift at present associated with the working of the Old Age Pensions Act. The encouragement of thrift and independence must be the underlying principle of all our social reforms. 26 | 27 | -------------------------------------------------------------------------------- /W5_02_28_19/conservative_manifestos/Con1931.txt: -------------------------------------------------------------------------------- 1 | 1931 Conservative Party General Election Manifesto 2 | The nation's duty: Stanley Baldwin's Election Message 3 | 4 | It is barely two months since my decision to join the National Government was unanimously endorsed at a meeting of Members of Parliament and Candidates held at the Kingsway Hall in London. At that time we expected that the co-operation then secured would last only a few weeks, but recent events have rendered it necessary, in my view, that the period of this co-operation should be extended. The Budget has been balanced. Borrowing has been stopped at the cost of sacrifices from every class of the community, sacrifices which are heavy but which, I hope and believe, as the result of a continuance of our policy may be temporary. 
But we have not yet balanced the Trade Account of the Nation: in other words, we are not yet earning enough to pay for what we have to buy from Overseas. Unless this position can be altered nothing can save us from ultimate bankruptcy. 5 | Our Country's Safety 6 | 7 | We must shrink from no steps to prove the stability of our country and to save our people from the disaster attaching to a currency fluctuating and falling through lack of confidence at home and abroad. 8 | A National Mandate 9 | 10 | To complete this work it is imperative that the Government should have a national mandate giving it freedom to use whatever means may be found necessary after careful examination to effect the end in view. It is necessary that in place of a small Parliamentary majority we should have a stable Government with a large majority backed by the resolution of a great majority of the electors. The country must show in no uncertain manner that it will have nothing to do with a party whose programme could only convert a situation grave already into one of chaos and catastrophe. Some of the problems that lie before us are wide as the world itself. Some are peculiar to ourselves. 11 | 12 | In the international field we have to consider war debts and reparations, disarmament, the unequal distribution of the world supply of gold and the mutual financial dependence of the countries of the world. Those questions may well tax the statesmanship of all nations. 13 | 14 | At home the paramount question is that of the adverse Balance of Trade, the redress of which is essential to secure our financial stability. This can be accomplished only by reducing imports, by increasing exports, or by a combination of both. 15 | Tariffs Essential 16 | 17 | I am prepared to examine any method which can effect what is required. 
I recognised that the situation is altered by the devaluation of the pound, but in my view the effect of that devaluation can be no valid substitute for a tariff, carefully designed and adjusted to meet the present situation. I shall, therefore, continue to press upon the electors that in my view the tariff is the quickest and most effective weapon not only to reduce excessive imports but to enable us to induce other countries to lower their tariff walls. 18 | 19 | The position of Agriculture is one which in my judgement is so desperate as to call for immediate and far-reaching measures of relief. To this end the first step should be assistance to cereal farmers, and we have in no way changed our view that the best form of assistance is by means of a quota and guaranteed price for wheat. 20 | 'Farmers Must be Secured Against Dumping' 21 | 22 | Farmers must be secured against dumping, which has brought so many branches of their industry to ruin. The production of food at home should be increased and the drain of men from the land stopped, and to this end and to make Imperial treaties which may be of enormous value to us as a nation we shall require such a free hand as will allow us to use prohibitions, quotas or duties as may seem most effective in the circumstances. 23 | Empire Economic Unity 24 | 25 | The Problem of the Empire is to secure that economic unity for which we have so long striven. I hope that the reasons which led to a suspension of the Ottawa Conference have been overcome, and that it will be possible for the Canadian Government to renew its invitation. We shall then have a unique opportunity before us in the fact that it will fall to the National Government to accept that invitation. 26 | 27 | The ideal of Imperial Economic Unity is widespread today, and I am confident that the foundation of such unity will be well and truly laid with such general assent of our people as would have seemed impossible but a few short years ago. 
28 | All must Help 29 | 30 | The National Government has with your help accomplished the first part of its work. We are passing through stern and difficult times; our task will be impossible without the support of the nation. 31 | 32 | For that support we appeal with confidence, and in the winning of that support I believe a great part will be played by those I am proud to lead. 33 | -------------------------------------------------------------------------------- /W6_03_07_19/Session 6 - Supervised Learning II.R: --------------------------------------------------------------------------------
# TA: Pedro L. Rodríguez
# Course: Text as Data
# Date: 03/07/2019
# Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang.

#----------------------------------------
# Set up environment ---
#----------------------------------------
# clear global environment
rm(list = ls())

# All relative paths below (news_data.rds, cons_labour_manifestos/) are
# resolved against this directory; adjust it to your local clone.
setwd("~/Dropbox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/")

# load required libraries
library(quanteda)
library(quanteda.corpora)
library(readtext)
library(dplyr)

#----------------------------------------
# 1 Supervised Learning: Naive Bayes ---
#----------------------------------------
#source of data: https://www.kaggle.com/rmisra/news-category-dataset#News_Category_Dataset_v2.json
#library(rjson)
#json_file <- "/Users/pedrorodriguez/Downloads/News_Category_Dataset_v2.json"
#con = file(json_file, "r")
#input <- readLines(con, -1L)
#news_data <- lapply(X=input,fromJSON)
#news_data <- lapply(news_data, function(x) as_tibble(t(unlist(x))))
#news_data <- do.call(rbind, news_data)
#saveRDS(news_data, "~/Dropbox/NYU/Teaching/Text as Data/TaD-2018/W6_02_27_18/news_data.rds")

# load data
news_data <- readRDS("news_data.rds")

# subset data and keep relevant variables:
# only CRIME and SPORTS headlines, renamed to the generic (text, class) pair
news_samp <- news_data %>%
  filter(category %in% c("CRIME", "SPORTS")) %>%
  select(headline, category) %>%
  setNames(c("text", "class"))

# get a sense of how the text looks
dim(news_samp)
head(news_samp$text[news_samp$class == "CRIME"])
head(news_samp$text[news_samp$class == "SPORTS"])

# some pre-processing (the rest will let dfm do)
news_samp$text <- gsub(pattern = "'", replacement = "", news_samp$text) # remove apostrophes
head(news_samp$text[news_samp$class == "SPORTS"])

# what's the distribution of classes?
prop.table(table(news_samp$class))

# split sample into training & test sets
set.seed(1984L)
prop_train <- 0.8
ids <- seq_len(nrow(news_samp)) # seq_len() is safe when there are zero rows
ids_train <- sample(ids, ceiling(prop_train * length(ids)), replace = FALSE)
ids_test <- setdiff(ids, ids_train) # every row not drawn for training
train_set <- news_samp[ids_train, ]
test_set <- news_samp[ids_test, ]

# get dfm for each set (same pre-processing options for both)
train_dfm <- dfm(train_set$text, stem = TRUE, remove_punct = TRUE, remove = stopwords("english"))
test_dfm <- dfm(test_set$text, stem = TRUE, remove_punct = TRUE, remove = stopwords("english"))

# how does this look?
as.matrix(train_dfm[1:5, 1:5]) # subset first so only a 5x5 corner is densified

# match test set dfm to train set dfm features
test_dfm <- dfm_match(test_dfm, features = featnames(train_dfm))

# Summarise a 2x2 confusion matrix built as table(actual, predicted).
# Row/column 2 is treated as the "positive" class.
# Returns a named list with accuracy, recall, precision and f1.
nb_metrics <- function(cmat) {
  accuracy <- sum(diag(cmat)) / sum(cmat) # accuracy = (TP + TN) / (TP + FP + TN + FN)
  recall <- cmat[2, 2] / sum(cmat[2, ]) # recall = TP / (TP + FN)
  precision <- cmat[2, 2] / sum(cmat[, 2]) # precision = TP / (TP + FP)
  f1 <- 2 * (recall * precision) / (recall + precision)
  list(accuracy = accuracy, recall = recall, precision = precision, f1 = f1)
}

# w/o smoothing ----------------

# train model on the training set
nb_model <- textmodel_nb(train_dfm, train_set$class, smooth = 0, prior = "uniform")

# evaluate on test set
predicted_class <- predict(nb_model, newdata = test_dfm)

# baseline: share of the most frequent class in the test set
baseline_acc <- max(prop.table(table(test_set$class)))

# get confusion matrix (rows = actual, columns = predicted)
cmat <- table(test_set$class, predicted_class)
nb_scores <- nb_metrics(cmat)
nb_acc <- nb_scores$accuracy
nb_recall <- nb_scores$recall
nb_precision <- nb_scores$precision
nb_f1 <- nb_scores$f1

# print
cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "Accuracy:", nb_acc, "\n",
  "Recall:", nb_recall, "\n",
  "Precision:", nb_precision, "\n",
  "F1-score:", nb_f1
)

# w smoothing ----------------

# train model on the training set using Laplace smoothing
nb_model_sm <- textmodel_nb(train_dfm, train_set$class, smooth = 1, prior = "uniform")

# evaluate on test set
predicted_class_sm <- predict(nb_model_sm, newdata = test_dfm)

# get confusion matrix (rows = actual, columns = predicted)
cmat_sm <- table(test_set$class, predicted_class_sm)
nb_scores_sm <- nb_metrics(cmat_sm)
nb_acc_sm <- nb_scores_sm$accuracy
nb_recall_sm <- nb_scores_sm$recall
nb_precision_sm <- nb_scores_sm$precision
nb_f1_sm <- nb_scores_sm$f1

# print
cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "Accuracy:", nb_acc_sm, "\n",
  "Recall:", nb_recall_sm, "\n",
  "Precision:", nb_precision_sm, "\n",
  "F1-score:", nb_f1_sm
)

# take a look at the most discriminant features (get some face validity)
post_by_feature <- t(nb_model_sm$PcGw) # transpose once: one row per feature
posterior <- tibble(feature = rownames(post_by_feature),
                    post_CRIME = post_by_feature[, 1],
                    post_SPORTS = post_by_feature[, 2])

posterior %>% arrange(-post_SPORTS) %>% head(10)
posterior %>% arrange(-post_CRIME) %>% head(10)

# what does smoothing do? Reduces the "weight" placed on new information (the likelihood) vis-a-vis the prior.
# Base graphics are not composed with `+`: draw the scatter, then add the 45-degree line.
plot(nb_model$PwGc[1, ], nb_model_sm$PwGc[1, ],
     xlim = c(0, 0.02), ylim = c(0, 0.02),
     xlab = "No Smooth", ylab = "Smooth")
abline(a = 0, b = 1, col = "red")

#----------------------------------------
# 2 Classification using Word Scores ---
#----------------------------------------
# Read in conservative and labour manifestos
# (restricted to .txt so the names line up with the readtext() glob below)
filenames <- list.files(path = "cons_labour_manifestos", pattern = "\\.txt$")

# Party name and year are in the filename -- we can use regex to extract these to use as our docvars
party <- regmatches(filenames, regexpr("^[[:alpha:]]{3}", filenames)) # first three letters
year <- regmatches(filenames, regexpr("[[:digit:]]+", filenames)) # first run of digits

# This is how you would make a corpus with docvars from this data
cons_labour_manifestos <- corpus(readtext("cons_labour_manifestos/*.txt"))
docvars(cons_labour_manifestos, field = c("party", "year")) <- data.frame(cbind(party, year))

# But we're going to use a dataframe
cons_labour_df <- tibble(text = texts(cons_labour_manifestos),
                         party = party,
                         year = as.integer(year))
colnames(cons_labour_df)

# keep vars of interest
cons_labour_df <- cons_labour_df %>%
  select(text, party) %>%
  setNames(c("text", "class"))

# what's the class distribution?
prop.table(table(cons_labour_df$class))

# randomly sample a test speech
# (one held-out manifesto; every other document is used for training)
set.seed(1984L)
ids <- seq_len(nrow(cons_labour_df))
ids_test <- sample(ids, 1, replace = FALSE)
ids_train <- setdiff(ids, ids_test)
train_set <- cons_labour_df[ids_train, ]
test_set <- cons_labour_df[ids_test, ]

# create DFMs
train_dfm <- dfm(train_set$text, remove_punct = TRUE, remove = stopwords("english"))
test_dfm <- dfm(test_set$text, remove_punct = TRUE, remove = stopwords("english"))

# Y variable must be coded on a binary x in {-1,1} scale, so -1 = Conservative and 1 = Labour
# (computed once and shared by the unsmoothed and smoothed models)
train_scores <- ifelse(train_set$class == "Lab", 1, -1)

# Word Score model w/o smoothing ----------------
ws_base <- textmodel_wordscores(train_dfm, y = train_scores)

# Look at strongest features
lab_features <- sort(ws_base$wordscores, decreasing = TRUE) # for labor
lab_features[1:10]

con_features <- sort(ws_base$wordscores, decreasing = FALSE) # for conservative
con_features[1:10]

# Can also check the score for specific features
ws_base$wordscores[c("drugs", "minorities", "unemployment")]

# predict that last speech
test_set$class
predict(ws_base, newdata = test_dfm,
        rescaling = "none", level = 0.95)

# Word Score model w smoothing ----------------
?textmodel_wordscores
ws_sm <- textmodel_wordscores(train_dfm, y = train_scores, smooth = 1)

# Look at strongest features
lab_features_sm <- sort(ws_sm$wordscores, decreasing = TRUE) # for labor
lab_features_sm[1:10]

con_features_sm <- sort(ws_sm$wordscores, decreasing = FALSE) # for conservative
con_features_sm[1:10]

# predict that last speech
# (uses the smoothed model; this call previously re-used ws_base and so just
# repeated the unsmoothed prediction)
test_set$class
predict(ws_sm, newdata = test_dfm,
        rescaling = "none", level = 0.95)

# Smoothing
# compare each feature's score under the two models
plot(ws_base$wordscores, ws_sm$wordscores, xlim = c(-1, 1), ylim = c(-1, 1),
     xlab = "No Smooth", ylab = "Smooth")

#----------------------------------------
# 3 Applying Naive Bayes and Word Scores to Amicus texts from Evans et al ---
#----------------------------------------
# Loading data
data("data_corpus_amicus")

# create dfm
amicus_dfm <- dfm(data_corpus_amicus)

# naive bayes model ----

# train NB model
amNBmodel <- textmodel_nb(amicus_dfm, docvars(data_corpus_amicus, "trainclass"))

# predict class label of test set
amNBpredict <- predict(amNBmodel)

# "confusion matrix": Naive Bayes (rows = testclass docvar, columns = prediction)
# NOTE(review): the scores below treat the second row/column as the positive
# class; confirm which label that is for this corpus.
nb_cmat <- table(docvars(data_corpus_amicus, "testclass"), amNBpredict)

# baseline accuracy
baseline_acc <- max(prop.table(table(docvars(data_corpus_amicus, "testclass"))))
baseline_acc

# get scores
nb_acc <- sum(diag(nb_cmat)) / sum(nb_cmat) # accuracy = (TP + TN) / (TP + FP + TN + FN)
nb_recall <- nb_cmat[2, 2] / sum(nb_cmat[2, ]) # recall = TP / (TP + FN)
nb_precision <- nb_cmat[2, 2] / sum(nb_cmat[, 2]) # precision = TP / (TP + FP)
nb_f1 <- 2 * (nb_recall * nb_precision) / (nb_recall + nb_precision)

# print
cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "Accuracy:", nb_acc, "\n",
  "Recall:", nb_recall, "\n",
  "Precision:", nb_precision, "\n",
  "F1-score:", nb_f1
)

# wordscore model ----

# create reference texts
# The first four documents carry reference scores (1, 1, -1, -1); all remaining
# documents are unscored (NA). The NA count is derived from the corpus size
# rather than hard-coded.
reference <- c(1, 1, -1, -1, rep(NA, ndoc(amicus_dfm) - 4)) # class labels

# train ws model
amWSmodel <- textmodel_wordscores(amicus_dfm, reference, smooth = 1)

# plot nb and ws scores
plot(amWSmodel$wordscores, c(1, -1) %*% amNBmodel$PcGw, xlab="Wordscore", ylab = "Linear Posterior Class Pr. Diff")

# let's look at predictions from our wordscores model
# Positive predicted scores are labelled "P", all others "R". The explicit
# factor levels keep the confusion matrix 2 x 2 even if every document falls on
# the same side of zero.
amWSpredict <- predict(amWSmodel)
amWSresults <- factor(ifelse(amWSpredict > 0, "P", "R"), levels = c("P", "R"))

# "confusion matrix": wordscores
ws_cmat <- table(docvars(data_corpus_amicus, "testclass"), amWSresults)

# get scores
ws_acc <- sum(diag(ws_cmat)) / sum(ws_cmat) # accuracy = (TP + TN) / (TP + FP + TN + FN)
ws_recall <- ws_cmat[2, 2] / sum(ws_cmat[2, ]) # recall = TP / (TP + FN)
ws_precision <- ws_cmat[2, 2] / sum(ws_cmat[, 2]) # precision = TP / (TP + FP)
ws_f1 <- 2 * (ws_recall * ws_precision) / (ws_recall + ws_precision)

# print
cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "Accuracy:", ws_acc, "\n",
  "Recall:", ws_recall, "\n",
  "Precision:", ws_precision, "\n",
  "F1-score:", ws_f1
)
-------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Con1918.txt: -------------------------------------------------------------------------------- 1 | 1918 Conservative Party General Election Manifesto 2 | The Manifesto of Lloyd George and Bonar Law 3 | 4 | The Coalition Government, supported by the strenuous and united labours of the whole nation, has now accomplished the gravest portion of its task. Our enemies have been defeated in the field, their armies are broke, and their Governments are over-turned. Thanks to the patient valour of the hosts of freedom, the knell of military autocracy has sounded forever in the Continent of Europe. Other tasks directly arising out of the war now await our nation, and can only be surmounted by the good sense, the patriotism, and the forbearance of our people. The unity of the nation which has the patriotism, and the forbearance of our people. 
The unity of the nation which has been the great secret of our strength in war must not be relaxed if the many anxious problems which the war has bequeathed to us are to be handled with the insight, courage, and promptitude which the times demand. 5 | 6 | As a preliminary to the solution of these problems it is essential that a fresh Parliament should be summoned, possessed of the authority which a General Election alone can give it, to make the peace of Europe and to deal with the difficult transitional period which will follow the cessation of hostilities. Indeed, the present Parliament has long outstayed its appointed term, and meanwhile millions of new voters, including for the first time representatives of the womanhood of the country, have been added to the electorate. It is right that the Government, upon whom it devolves in conjunction with our Dominions and our allies to settle the political future of Europe, should be supported by the confidence of the vast body of newly enfranchised citizens. 7 | 8 | We appeal, then, to every section of the electorate, without distinction of party, to support the Coalition Government in the execution of a policy devised in the interests of no particular class or section, but, so far as our light serves us, for the furtherance of the general good. Our first task must be to conclude a just and lasting peace, and so to establish the foundations of a new Europe that occasion for further wars may be for ever averted. The brilliant and conclusive triumph of the Allied Armies will, we hope, render it possible to reduce the burden of our armaments and to release by successive and progressive stages the labour and capital of the Empire for the arts of peace. 
To avert a repetition of the horrors of war, which are aggravated by the onward march of science, it will be the earnest endeavour of the Coalition Government to promote the formation of a League of Nations, which may serve not only to ensure society against the calamitous results of militarism but to further a fruitful mutual understanding between the associated peoples. Never have the men and women of our race played so great and commanding a part in the affairs of the whole world as during the tempests and trials of this great war, and never has the British name been so widely honoured. 9 | 10 | The care of the soldiers and sailors, officers and men, whose heroism has won for us this great deliverance, and who return to civil life, is a primary obligation of patriotism, and the Government will endeavour to assist such members of the armed forces of the Crown as may desire to avail themselves of facilities for special industrial training and to return to civil life under conditions worthy of their services to the country. Plans have been prepared, and will be put into execution as soon as the new Parliament assembles, whereby it will be the duty of public authorities and, if necessary, of the State itself to acquire land on simple and economical basis for men who have served in the war, either for cottages with gardens, allotments, or small holdings as the applicants may desire and be suited for, with grants provided to assist in training and initial equipment. In addition to this, we intend to secure and to promote the further development and cultivation of allotments and small holdings generally so far as may be required in the public interest. 11 | 12 | Increased production must necessarily be the basis of all schemes for the improvement of the conditions of the people. The war has revealed the extent to which the resources of the country have been dissipated and depressed by lack of organisation or by wasteful organisation. 
It has been demonstrated that the land of the country, if properly cultivated and used, could have yielded food and other products of the soil to a much larger extent. It must be among the first tasks of the new Government to repair this error, which added so much to our difficulties in our struggles against the submarines of the enemy. 13 | 14 | The war has given fresh impetus to agriculture. This must not be allowed to expire. Scientific farming must be promoted, and the Government regard the maintenance of a satisfactory agricultural wage, the improvement of village life, and the development of rural industries as essential parts of an agricultural policy. Arrangements have been made whereby extensive afforestation and reclamation schemes may be entered upon without delay. A systematic improvement in the transport facilities of the resources of the soil, and the Government are preparing plans with a view to increasing these facilities on a large scale. 15 | 16 | The principal concern of every Government is and must be the condition of the great mass of the people who live by manual toil. The steadfast spirit of our workers, displayed on all the wide field of action opened out by the war - in the trenches, on the ocean, in the air, in field, mine, and factory - has left an imperishable mark on the heart and conscience of the nation. One of the first tasks of the Government will be to deal on broad and comprehensive lines with the housing of the people, which during the war has fallen so sadly into arrears, and upon which the well-being of the nation so largely depends. Larger opportunities for education, improved material conditions, and the prevention of degrading standards of employment; a proper adaption to peace conditions of the experience which during the war we have gained in regard to the traffic in drink - these are among the conditions of social harmony which we shall earnestly endeavour to promote. 
It will be the fundamental object of the Coalition to promote the unity and development of our Empire and of the nations of which it is composed, to preserve for them the position and influence and authority which they have gained by their sacrifices and efforts in the cause of human liberty and progress, and to bring into being such conditions of living for the inhabitants of the British Isles as will secure plenty and opportunity to all. 17 | 18 | Until the country has returned to normal industrial conditions it would be premature to prescribe a fiscal policy intended for permanence. We must endeavour to reduce the war debt in such a manner as may inflict the least injury to industry and credit. The country will need all the food, all the raw material, and all the credit which it can obtain, and fresh taxes ought not to be imposed on food or upon the raw materials of our industry. At the same time a preference will be given to our Colonies upon existing duties and upon any duties which, for our own purpose, may be subsequently imposed. One of the lessons which has been most clearly taught us by the war is the danger to the nation of being dependent upon other countries for vital supplies on which the life of the nation may depend. It is the intention therefore of the Government to preserve and maintain where necessary these key industries in the way which experience and examination may prove to be best adapted for the purpose. If production is to be maintained at the highest limit at home, security must be given against the unfair competition to which our industries may be subjected by the dumping of goods produced abroad and sold on our market below the actual cost of production. The military institutions of the country must necessarily be dependent upon the needs of the Empire and the prospective requirements of any League for the preservation of peace to which this country may hereafter be a party. 
Meanwhile it will be the aim of the Government to carry through the inevitable reductions in our military and naval establishments with the least possible suffering to individuals and to the best advantage of industry and trade. 19 | 20 | Active measures will be needed to secure employment for the workers of the country. Industry will rightly claim to be liberated at the earliest possible moment from Government control. By the development and control in the best interest of the State of the economical production of power and light, of the railways and the means of communication, by the improvement of the Consular Service, and by the establishment of regular machinery for consultation with representative trade and industrial organisations on matters affecting their interest and prosperity, output will be increased, new markets opened out, and great economies effected in industrial production. 21 | 22 | It will be the duty of the new Government to remove all existing inequalities of the law as between men and women. 23 | 24 | It has been recognised by all parties that reform is urgently required in the constitution of the House of Lords, and it will be one of the objects of the Government to create a Second Chamber which will be based upon direct contract with the people, and will therefore be representative enough adequately to perform its functions. 25 | 26 | The people of this country are not unmindful of the conspicuous services rendered by the Princes and people of India to the common cause of civilisation during the war. The Cabinet has already defined in unmistakeable language the goal of British policy in India to the development of responsible government by gradual stages. To the general terms of that declaration we adhere and propose to give effect. 27 | 28 | Ireland is unhappily rent by contending forces, and the main body of Irish opinion has seldom been more inflamed or less disposed to compromise than it is at the present moment. 
So long as the Irish question remains unsettled there can be no political peace either in the United Kingdom or in the Empire, and we regard it as one of the first obligations of British statesmanship to explore all practical paths towards the settlement of this grave and difficult question on the basis of self-government. But there are two paths which are closed - the one leading to a complete severance of Ireland from the British Empire, and the other to the forcible submission of the six counties of Ulster to a Home Rule Parliament against their will. In imposing these two limitations we are only acting in accordance with the declared views of all English political leaders. 29 | 30 | It is a source of pride to be of this age, and to be members of this nation. In the whole course of the world's history no generation has been compelled to face sacrifices such as we have steadfastly endured, or perils such as we have victoriously confronted. Well and truly have rich and poor, castle and cottage, stood the ordeal of fire. Right earnestly do we trust that the united temper, the quiet fortitude, the high and resolute patriotism of our nation may be long preserved into the golden times of peace. 31 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Con1922.txt: -------------------------------------------------------------------------------- 1 | 1922 Conservative Party General Election Manifesto 2 | Andrew Bonar Law's Election Address 3 | 4 | His Majesty has been graciously pleased to appoint me First Minister of the Crown. I appeal to you to renew your confidence in myself as your representative, and to give your support to the new Government of which I am the head. 
The crisis which has arisen so suddenly has made it absolutely necessary that an immediate appeal should be made to the people, and in consequence it has been impossible to have an examination with my colleagues into the many questions with which we have to deal. Of necessity, therefore, the outlines of policy which I now submit to you cannot be as definite and precise as in other circumstances would have been possible. 5 | 6 | The crying need of the nation at this moment - a need which in my judgement far exceeds any other - is that we should have tranquility and stability both at home and abroad so that free scope should be given to the initiative and enterprise of our citizens, for it is in that way far more than by any action of the Government that we can hope to recover from the economic and social results of the war. 7 | 8 | With this in view I think it is of the utmost importance that we should return as quickly as possible, to the normal procedure which existed before the war. In pursuance of this aim I am satisfied that the time has now come when a change should be made in the machinery of the central Government. Some of the work which has hitherto been done by the Cabinet Secretariat is essential and must be continued, but we intend to bring that body in its present form to an end, and I am certain that the necessary work can be continued, and the invaluable services of the present Secretary retained, in connection with the Treasury, which in the past has always been the central department of Government. 
As an illustration of the changes which we contemplate, instructions have been already given to transfer to the Foreign Office the machinery of the League of Nations, and in the same way to arrange, as regards any future International Conferences, that even where it is necessary that I as Prime Minister should be present, the machinery of the Conferences and the preliminary work in connection with them will be performed not by the Cabinet Secretariat but by the Foreign Office itself. 9 | 10 | At the present moment the first foreign interest not alone of Great Britain and of the British Empire, but of the world, is the re-establishment of peace. In all our foreign relations we intend to pursue an even course, loyally fulfilling the obligations we have undertaken, but resolutely determined not to extend our commitments, and should reasonable occasion arise to curtail them. It was by wholehearted co-operation, often under great difficulty, and with great differences of opinion, that we won the war. It is only by the same frank and full co-operation, conducted in the same spirit, with France and our other great Allies, that we can hope to solve the difficult problems with which we are not confronted. It is my confident hope that under the well-tried guidance of the Secretary of State for Foreign Affairs the negotiations for the settlement of the Near Eastern crisis will result in a true and lasting peace, conducing both to the political tranquillity of the Near and Middle East, with which so many of our Imperial interests are bound up, and to the personal security and happiness of the inhabitants of all races and creeds in the regions which have been the scene of so much disturbance and suffering. 11 | 12 | During the war the feeling supreme in the minds of men and women throughout the world was that a similar calamity should never again be allowed to fall upon mankind. 
It was to meet this feeling that the League of Nations was instituted, and it will be our earnest aim to give it wholehearted and practical support. The maintenance of our friendship and good understanding with the United States, based not on any formal alliance but on community of inherited ideals as well as on recent comradeship in arms, must always be a principal aim of British policy. Above all, we mean, in all matters affecting the external policy or security of the Empire, to act in close and continuous consultation with the Governments of the Dominions and of India in order to ensure that our policy shall keep fully in view both the interests and sentiments of our fellow subjects overseas, and at all times have behind it the moral support of the whole British Commonwealth. 13 | 14 | Our first task, if returned to power, will be the ratification of the Irish Treaty. We are prepared to take our part in making good that Treaty, both in the letter and in the spirit, and to co-operate with the Irish Government in the new relationship within the Empire which the Treaty will have created. We are equally pledged to safeguard the freedom of choice and the security of the Parliament and Government of Northern Ireland. We earnestly hope that further progress will be made in dealing with the anarchy in the South, and that both in the North and in the South it will be realised that the prosperity of Ireland as a whole can only be achieved by good will between the Governments and peoples of the two portions of that country. 
The position of the innocent victims of recent disturbances is a matter of the gravest concern to the people of this country, and it will be the duty of the Government to keep in the closest touch with the Government of the Irish Free State on this matter, so that just claims for compensation may have sympathetic consideration. 15 | 16 | We desire to promote the quiet and orderly development of India under the constitution which was conferred on her by the Act of 1919. The co-operation of all classes and sections is essential to the progress and prosperity of India, and, if that be secured, we can look forward with confidence to an industrial development which will add to her resources and give increased stability to her economic structure. 17 | 18 | At home our chief preoccupation at this time is the state of trade and employment. The immediate problem of unemployment this winter will call for emergency measures. Plans for dealing with the situation have already been considered by the late Government. They will be examined afresh by us with a view of seeing whether any improvements are possible, and the necessary steps will then be taken with the least avoidable delay. Such remedies, however, can only be palliatives, and the real recovery will not come except from the revival of trade and industry. To secure this result, the first essential is to reduce expenditure to the lowest attainable level in the hope that the taxpayer may find some relief from the burden of taxation which not only presses so heavily upon individuals, but is the greatest clog upon the wheels of national industry. 19 | 20 | Every Candidate, in every constituency, will, as I do, make retrenchment an essential part of his programme. All that I can possibly say, knowing how great are the difficulties, is that we should do our best to secure it. 
It will also be our endeavour in any way in our power to help trade, and the method of doing this, which seems to me most helpful, is the development of trade within the Empire itself. The markets, which for the time at least, as a consequence of the war, we have lost in Europe, can best be replaced by further development of trade with overseas countries, and especially of trade within the British Empire. We propose, therefore, immediately to consult the Governments of the self-governing Dominions and, if they approve, to summon, as early as possible, an Economic Conference with the view of finding in what way by mutual co-operation we can best develop the vast trade of which, in my opinion, the resources of the Empire admit. 21 | 22 | There is one branch of industry to which I must specially refer. As a consequence of the war, agriculture, the greatest of our national industries, is in a most serious condition, and demands the practical sympathy of the Government. It is not easy to specify the exact method by which that sympathy can be shown, but we shall immediately examine the whole problem afresh in the hope of making proposals which will assist the agricultural community to overcome the difficulties that now confront them. 23 | 24 | There are many measures of legislative and administrative importance which, in themselves, would be desirable, and which, in other electorate. But I do not feel that they can, at this moment, claim precedence over the nation's first need, which is, in every walk of life, to get on with its work with the minimum of interference at home and of disturbance abroad. 
25 | Conservative Party Manifestos 26 | 27 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Con1923.txt: -------------------------------------------------------------------------------- 1 | 1923 Conservative Party General Election Manifesto 2 | Stanley Baldwin's Election Address 3 | 4 | In submitting myself to you for re-election, I propose frankly to put before you the present situation as I see it, and the measures which in the opinion of myself and my colleagues are necessary adequately to deal with it. 5 | 6 | 1. The unemployment and under-employment which our working people and our great national industries are now facing for the fourth winter in succession, on a scale unparalleled in our history, having created a problem which calls urgently for a solution. Their indefinite continuance threatens to impair permanently the trained skill and the independent spirit of our workers, to disorganise the whole fabric of industry and credit, and, by eating away the sources of revenue, to undermine the very foundations of our national and municipal life. 7 | 2. In large measure this state of affairs is due to the political and economic disorganisation of Europe consequent on the Great War. In accordance with the policy affirmed by the Imperial Conference we shall continue to devote every effort through the League of Nations and by every other practical means, to the restoration of a true peace in Europe. But that at the best must take time. A year ago Mr Bonar Law could still hope that a more settled condition of affairs was in prospect, absence of any modification of fiscal policy, of the ultimate necessity of which he himself was always convinced. Since the occupation of the Ruhr it has become evident that we are confronted by a situation which, even if it does not become worse, is not likely to be normal for years to come. 8 | 3. 
The disorganisation and poverty of Europe, accompanied by broken exchanges and by higher tariffs all the world over, have directly and indirectly narrowed the whole field of our foreign trade. In our own home market the bounty given to the importation of foreign goods by depreciated currencies, and by the reduced standard of living in many European countries, has exposed us to a competition which is essentially unfair and is paralysing enterprise and initiative. It is under such conditions that we have to find work for a population which, largely owing to the cessation during the war period of the normal flow of migration to the Dominions, has in the last census period increased by over a million and three quarter souls. 9 | 4. No Government with any sense of responsibility could continue to sit with tied hands watching the unequal struggle of our industries or content itself with palliatives which, valuable as they are to mitigate the hardship to individuals, must inevitably add to the burden of rates and taxes and thereby still further weaken our whole economic structure. Drastic measures have become necessary for dealing with present conditions as long as they continue. 10 | 5. The present Government hold themselves pledged by Mr Bonar Law not to make any fundamental change in the fiscal system of the country without consulting the electorate. Convinced, as I am, that only by such a change can a remedy be found, and that no partial measures such as the extension of the Safeguarding of Industries Act, can meet the situation, I am in honour bound to ask the people to release us from this pledge without further prejudicing the situation by any delay. That is the reason, and the only reason, which has made this election necessary. 11 | 6. 
What we propose to do for the assistance of employment in industry, if the nation approves, is to impose duties on imported manufactured goods, with the following objects:- 12 | * to raise revenue by methods less unfair to our own home production which at present bears the whole burden of local and national taxation, including the cost of relieving unemployment. 13 | * to give special assistance to industries which are suffering under unfair foreign competition; 14 | * to utilise these duties in order to negotiate for a reduction of foreign tariffs in those directions which would most benefit our export trade; 15 | * to give substantial preference to the Empire on the whole range of our duties with a view to promoting the continued extension of the principle of mutual preference which has already done so much for the expansion of our trade, and the development, in co-operation with the other Governments of the Empire, of the boundless resources of our common heritage. 16 | 7. Such a policy will defend our industries during the present emergency and will enable us, as more normal conditions return, to work effectively to secure a greater measure of real Free Trade both within the Empire and with foreign countries. Trade which is subject to the arbitrary interference of every foreign tariff, and at the mercy of every disturbance arising from the distractions of Europe, is in no sense free, and is certainly not fair to our own people. 17 | 8. It is not our intention, in any circumstances, to impose any duties on wheat, flour, oats, meat (including bacon and ham), cheese, butter or eggs. 18 | 9. While assisting the manufacturing industries of the country we propose also to give a direct measure of support to agriculture. Agriculture is not only, in itself, the greatest and most important of our national industries, but is of especial value as supplying the most stable and essentially complementary home market for our manufacturers. 19 | 10. 
We propose to afford this assistance by a bounty of £1 an acre on all holdings of arable land exceeding one acre. The main object of that bounty is to maintain employment on the land and so keep up the wages of agricultural labour. In order to make sure of this we shall decline to pay the bounty to any employer who pays less than 30/- a week to an able-bodied labourer. 20 | 11. The exclusion from any import duties of the essential foodstuffs which I have mentioned, as well as of raw materials, undoubtedly imposes a certain limitation upon the fullest extension of Imperial Preference. But even the preferences agreed to at the recent Economic Conference within our existing fiscal system, have been acknowledged as of the greatest value by the Dominion representatives, and our present proposals will offer a much wider field, the value of which will be progressively enhanced by the increasing range and variety of Empire production. 21 | 12. Moreover in the field of Empire development, as well as in that of home agriculture, we are not confined to the assistance furnished by duties. We have already given an earnest of our desire to promote a better distribution of the population of the Empire through the Empire Settlement Act, and at the Economic Conference we have undertaken to co-operate effectively with the Government of any part of the Empire in schemes of economic development. More especially do we intend to devote our attention to the development of cotton growing within the Empire, in order to keep down the cost of a raw material essential to our greatest exporting industry. 22 | 13. These measures constitute a single comprehensive and inter-dependent policy. 
Without additional revenue we cannot assist agriculture at home, but the income derived from the tariff will provide for this and leave us with means which can be devoted to cotton growing and other development in the Empire, and to the reduction of the duties on tea and sugar which fall so directly upon the working class household. 23 | 14. For the present emergency, and pending the introduction of our more extended proposals, we are making, and shall continue to make, every effort to increase the volume of work for our people. The Government are spending very large sums on every measure of emergency relief that can help in this direction. Further, the local Authorities of all kinds throughout the country, and great individual enterprises, such as the railways, with the assistance of the Government, or on its invitation, are co-operating wholeheartedly in the national endeavour to increase the volume of employment. This great combined effort of the Government, of the Local Authorities, and of individual enterprises, represents an expenditure of no less than £100 millions sterling. 24 | 15. The position of shipbuilding, one of the hardest hit of all our industries, is peculiar. It can only recover as shipping revives with the development of Empire and foreign trade which we believe will follow from our measures. We propose in the meantime to give it special assistance by accelerating the programme of light cruiser construction which will in any case become necessary in the near future. We are informed by our Naval advisers that some light cruisers will be required during the next few years in replacement of the County class, as well as a variety of smaller and auxiliary craft, and we intend that a substantial proportion of these shall be laid down as soon as the designs are ready and Parliamentary sanction secured. 25 | 16. The solution of the unemployment problem is the key to every necessary social reform. 
But I should like to repeat my conviction that we should aim at the reorganisation of our various schemes of insurance against old age, ill-health and unemployment. More particularly should we devote our attention to investigating the possibilities of getting rid of the inconsistencies and the discouragement of thrift at present associated with the working of the Old Age Pensions Act. The encouragement of thrift and independence must be the underlying principle of all our social reforms. 26 | 27 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Con1931.txt: -------------------------------------------------------------------------------- 1 | 1931 Conservative Party General Election Manifesto 2 | The nation's duty: Stanley Baldwin's Election Message 3 | 4 | It is barely two months since my decision to join the National Government was unanimously endorsed at a meeting of Members of Parliament and Candidates held at the Kingsway Hall in London. At that time we expected that the co-operation then secured would last only a few weeks, but recent events have rendered it necessary, in my view, that the period of this co-operation should be extended. The Budget has been balanced. Borrowing has been stopped at the cost of sacrifices from every class of the community, sacrifices which are heavy but which, I hope and believe, as the result of a continuance of our policy may be temporary. But we have not yet balanced the Trade Account of the Nation: in other words, we are not yet earning enough to pay for what we have to buy from Overseas. Unless this position can be altered nothing can save us from ultimate bankruptcy. 5 | Our Country's Safety 6 | 7 | We must shrink from no steps to prove the stability of our country and to save our people from the disaster attaching to a currency fluctuating and falling through lack of confidence at home and abroad. 
8 | A National Mandate 9 | 10 | To complete this work it is imperative that the Government should have a national mandate giving it freedom to use whatever means may be found necessary after careful examination to effect the end in view. It is necessary that in place of a small Parliamentary majority we should have a stable Government with a large majority backed by the resolution of a great majority of the electors. The country must show in no uncertain manner that it will have nothing to do with a party whose programme could only convert a situation grave already into one of chaos and catastrophe. Some of the problems that lie before us are wide as the world itself. Some are peculiar to ourselves. 11 | 12 | In the international field we have to consider war debts and reparations, disarmament, the unequal distribution of the world supply of gold and the mutual financial dependence of the countries of the world. Those questions may well tax the statesmanship of all nations. 13 | 14 | At home the paramount question is that of the adverse Balance of Trade, the redress of which is essential to secure our financial stability. This can be accomplished only by reducing imports, by increasing exports, or by a combination of both. 15 | Tariffs Essential 16 | 17 | I am prepared to examine any method which can effect what is required. I recognised that the situation is altered by the devaluation of the pound, but in my view the effect of that devaluation can be no valid substitute for a tariff, carefully designed and adjusted to meet the present situation. I shall, therefore, continue to press upon the electors that in my view the tariff is the quickest and most effective weapon not only to reduce excessive imports but to enable us to induce other countries to lower their tariff walls. 18 | 19 | The position of Agriculture is one which in my judgement is so desperate as to call for immediate and far-reaching measures of relief. 
To this end the first step should be assistance to cereal farmers, and we have in no way changed our view that the best form of assistance is by means of a quota and guaranteed price for wheat. 20 | 'Farmers Must be Secured Against Dumping' 21 | 22 | Farmers must be secured against dumping, which has brought so many branches of their industry to ruin. The production of food at home should be increased and the drain of men from the land stopped, and to this end and to make Imperial treaties which may be of enormous value to us as a nation we shall require such a free hand as will allow us to use prohibitions, quotas or duties as may seem most effective in the circumstances. 23 | Empire Economic Unity 24 | 25 | The Problem of the Empire is to secure that economic unity for which we have so long striven. I hope that the reasons which led to a suspension of the Ottawa Conference have been overcome, and that it will be possible for the Canadian Government to renew its invitation. We shall then have a unique opportunity before us in the fact that it will fall to the National Government to accept that invitation. 26 | 27 | The ideal of Imperial Economic Unity is widespread today, and I am confident that the foundation of such unity will be well and truly laid with such general assent of our people as would have seemed impossible but a few short years ago. 28 | All must Help 29 | 30 | The National Government has with your help accomplished the first part of its work. We are passing through stern and difficult times; our task will be impossible without the support of the nation. 31 | 32 | For that support we appeal with confidence, and in the winning of that support I believe a great part will be played by those I am proud to lead. 
33 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Lab1918.txt: -------------------------------------------------------------------------------- 1 | 1918 Labour Party General Election Manifesto 2 | Labour's call to the people 3 | 4 | The Labour Party has left the Coalition, and is appealing to the men and women of the country with a programme that is a challenge to reaction. 5 | A peace of reconciliation 6 | 7 | Victory has been achieved, and Labour claims no mean share in its achievement. Not only have the workers supplied the vast majority of our soldiers and sailors, and sustained the burden of war at home; the democratic diplomacy which found expression in the War Aims of Labour has been one of the most powerful factors in winning the war, and must be the most powerful factor in the rebuilding of the world. The Peace which Labour demands is a Peace of International Co-operation. It declares absolutely against secret diplomacy and any form of economic war, and demands, as an essential part of the Peace Treaty, an International Labour Charter incorporated in the very structure of the League of Free Peoples. 8 | Hands off democracy! 9 | 10 | Labour welcomes the extension of liberty and democracy in Europe. It has warned the Coalition that opposition towards the young democracies of the Continent, and especially that intervention on the side of European reaction, will be disastrous. Labour demands the immediate withdrawal of the Allied forces from Russia. In the interest of world-democracy it stands for the immediate restoration of the Workers International. 11 | Freedom for Ireland 12 | 13 | The principles which Labour acclaims as Allied war aims it will apply to our own subject peoples. Freedom for Ireland and India it claims as democratic rights, and it will extend to all subject peoples the right of self-determination within the British Commonwealth of Free Nations. 
Labour's appeal to the people is not a sectional appeal, unless an appeal which excludes only militarists, profiteers, and place-hunters be regarded as sectional. It includes all who are determined that the fruits of victory shall not be wasted in the interests of riches or reaction. Especially does Labour appeal to two sections of the community - to the soldiers and sailors who have fought the nation's battles abroad, and to the men and women workers at home. 14 | No conscription 15 | 16 | The returning soldier or sailor will find himself once more a worker. His cause is one with that of the workers at home. Civil and industrial liberties have been largely suspended during the war; and soldier and worker want their liberties back now. The Labour Party stands for the destruction of all war-time measures in restraint of civil or industrial liberty, the repeal of the Defence of the Realm Act, the complete abolition of Conscription, and the release of all political prisoners. It stands for free citizenship, a Free Parliament, for Free Speech, and against the domination of the Press by sinister political influences. 17 | The land for the workers 18 | 19 | The Labour Party means to introduce large schemes of land reorganisation, and it is fully aware that this can only be done in the teeth of the most powerful vested interests. Land nationalisation is a vital necessity; the land is the people's and must be developed so as to afford a high standard of life to a growing rural population not by subsidies or tariffs, but by scientific methods, and the freeing of the soil from landlordism and reaction. 20 | A million good houses 21 | 22 | Labour demands a substantial and permanent improvement in the housing of the whole people. At least a million new houses must be built at once at the State's expense, and let at fair rents, and these houses must be fit for men and women to live in. 
Labour will press for a really comprehensive Public Health Act co-ordinating all health authorities, based on prevention rather than cure, and free from servile or inquisitorial features. It will also press for real public education, free and open to all, with maintenance scholarships without distinction of class, and for justice to the teachers, upon whom education finally depends. 23 | A levy on capital 24 | 25 | Labour will resist every attempt to place burdens upon the poor by indirect taxation. Labour is firm against tariffs and for Free Trade. The way to deal with unfair competition of imports made under sweated conditions is not by tariffs, but by international labour legislation, which will make sweating impossible. In paying the War Debt, Labour will place the burden on the broadest backs by a special tax on capital. Those who have made fortunes out of the war must pay for the war; and Labour will insist upon heavily graduated direct taxation with a raising of the exemption limit. That is what Labour means by the Conscription of Wealth. 26 | Industrial democracy 27 | 28 | In industry, Labour demands the immediate nationalisation and democratic control of vital public services, such as mines, railways, shipping, armaments, and electric power; the fullest recognition and utmost extension of trade unionism, both in private employment and in the public services. It works for an altogether higher status for labour, which will mean also better pay and conditions. The national minimum is a first step, and with this must go the abolition of the menace of unemployment, the recognition of the universal right to work or maintenance, the legal limitation of hours of labour, and the drastic amendment of the Acts dealing with factory conditions, safety, and workmen's compensation. 29 | The real Women's Party 30 | 31 | Labour has always stood for equal rights for both sexes, when other parties were ignoring or persecuting women. 
In politics, the Labour Party stands for complete adult suffrage, in industry for equal pay and the organisation of men and women workers in one trade union movement. To the woman worker and to the wife of the working man or the soldier, Labour can make a confident appeal. Better pay and pensions for the workman or soldier mean better conditions for his wife and family. There must be no sex party: the Labour Party is the Women's Party. Woman is the Chancellor of the Exchequer of the home. Labour stands with the Co-operative Movement in its insistence on reasonable food prices and fair distribution, and in its resistance to unfair taxation. The Labour Party will do all it can to aid co-operators in their struggle for a democratic food organisation and against unfair discrimination. Labour and Co-operation are a single movement, and in the coming battle with reaction they must fight side by side. 32 | 33 | Labour's programme is comprehensive and constructive. It is designed to build a new world, and to build it by constitutional means. It is a programme of national and international justice, founded on permanent democratic principles. Even in an election as sinister as this, in which a large part of the nation's youth is arbitrarily disfranchised by the Government, Labour confidently appeals to the country to support its programme of social justice and economic freedom. 34 | 35 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Lab1922.txt: -------------------------------------------------------------------------------- 1 | The Coalition has been destroyed and a Conservative Government has been formed to carry out a policy of naked reaction. Labour is appealing to the men and women of the country on a policy of International Peace and National Reconstruction. 
2 | 3 | Peace and the League of Nations 4 | 5 | Revision of the Peace Treaties, which have caused greater international wrongs than they removed, is the first step to Peace. German reparations must be brought within Germany’s capacity to pay. Turkey’s relations with Europe and the Freedom of the Straits can only be dealt with in an International Conference attended by representatives of all countries concerned. 6 | 7 | Labour is working for an all-inclusive League of Nations with power to deal with international disputes by methods of judicial arbitration and conciliation. Through the League of Nations an agreement can be reached for a limitation of armaments, with general disarmament as the goal. 8 | 9 | Freedom in the Empire 10 | 11 | Labour advocates the recognition of the real independence of Egypt and self-government for India. 12 | 13 | Labour demands the prompt and cordial acceptance of the new constitution of the Irish Free State, and supports every effort to make Ireland united, prosperous, and contented. 14 | 15 | How to find the money 16 | 17 | Labour recognises the urgent need of lifting from the trade and industry of the country the deadweight burden of the National Debt. It therefore proposes the creation of a War Debt Redemption Fund, by a special graduated levy on fortunes exceeding £5000. Labour will not penalise thrift, but will require some restitution from the profiteers out of the huge fortunes made in the war. 18 | 19 | To secure the necessary annual revenue, Labour advocates a system of taxation which will distribute the burden fairly according to 'ability to pay'. 20 | 21 | It proposes an increase of the Death Duties on large estates and of the Super-Tax on large incomes; incomes below £250 a year would be exempt from taxation and there would be a reduction in the tax on all incomes under £500 a year, with a steeper graduation of the scale above that limit. 
22 | 23 | Taxation of Land Values will secure to the community socially-created wealth now diverted to private hands. 24 | 25 | Labour is in principle opposed to indirect taxation. It stands for an untaxed breakfast table, and wishes to free trade and industry from all burdensome imposts, whether customs, excise, or stamp duties. 26 | 27 | No 'penny-wise' economy 28 | 29 | Labour attaches the utmost importance to economy in the public administration. But we do not believe in starving the public services, least of all do we countenance the notion of economics at the expense of the poor for the benefit of the rich. 30 | 31 | Reduced expenditure on the children's education and health, the safety of the workers, and the well-being of mothers and babies is the costliest kind of waste. 32 | 33 | By a revision of the National Grants in Aid to local Authorities, we believe an equitable reduction of rates in all the severely pressed districts can be secured. 34 | 35 | Policy for unemployment 36 | 37 | Unemployment and low wages, caused largely by the policy of the Liberal and Unionist Government, have brought distress to the bulk of the working people. 38 | 39 | Labour's policy is to provide work, or maintenance, for the unemployed by reopening trade with foreign countries, by the national organisation of production, and by a large programme of necessary and useful public works. 40 | 41 | Agriculture 42 | 43 | The plight of Agriculture can only be dealt with by a bold policy of reorganisation. Those who produce the nation's food must not go hungry. Labour proposes to require the landlords to sacrifice rents rather than to ask the farm-workers to accept starvation wages. 44 | 45 | We advocate the restoration of the Agricultural Wages Board to enforce an adequate national wages standard. 
46 | 47 | We propose also the establishment of representative Councils of Agriculture to promote all-round improvements in the use of the land, the reduction of transport charges, the development of co-operative methods, and the fostering of rural industries. 48 | 49 | Revision of the Game Laws, improved school facilities, more and better cottages, increased hospital accommodation and fuller opportunities of recreation are included in Labour's rural policy. 50 | 51 | Industrial reorganisation 52 | 53 | Labour is resolved to change as speedily as possible by constructive measures the social and economic system which confers unfair privileges on the few, and unobserved hardship on the many. The working of this system has brought unemployment and reduced wages to the workers, suffering and starvation to their families, loss of opportunities for full method and physical development to their children, anxiety and worry to the salaried and professional workers and small traders. Labour means to bring about a more equitable distribution of the wealth produced by the common effort of the workers by hand and brain. 54 | 55 | Our industrial policy involves the prompt Nationalisation of Mines, as recommended by the Sankey Commission, and the Nationalisation of Railways, with an increased share of control for the workers, an improved Workmen's Compensation Act, and other measures for the protection of the workpeople. 56 | 57 | We oppose all attempts to interfere with the Trade Boards. 58 | 59 | Houses and Health 60 | 61 | Our social programme includes a national scheme of housing which will end the scandal of a homeless population and replace the slums by decent homes. 62 | 63 | Most generous provision for the Old Age Pensioners has been one of Labour's constant demands, and is one we shall continue to press. 
We shall also urge the removal of the present unjust deductions from Old Age Pensions where Friendly Society or Trade Union benefits or small savings exist, whereby thrift is at present penalised. We stand for the complete supersession of the Poor Law and the institution of a system of Pensions for Widowed Mothers. Labour will strive to stop the continued attempts now being made to cut off or cut down the ex-Service men's pensions; it demands the conversion of conditional into permanent pensions, and it will resist all attempts to transfer the mentally or physically infirm to the Poor Law. 64 | 65 | In our view, the rule to be applied by the Pensions Ministry throughout its administration should be 'Fit for service, fit for pension'. 66 | 67 | More power to democracy 68 | 69 | The Labour Party has always declared its opposition to measures which increase the power of the wealthier classes to frustrate the people's will. The Parliament Act must stand, and there must be no restoration of the Lord's veto. 70 | 71 | Our policy is to remove all existing disabilities affecting women as citizens, voters, and workers. Adult suffrage, reform of Parliamentary procedure, and control of Ministers by the House of Commons are included in our programme to make the people's will effective by constitutional means. In accordance with these democratic principles Labour must stand for the control of the Liquor Traffic according to the people's will. 72 | 73 | Higher standard of Life 74 | 75 | The task of government is to raise the standard of life and labour for those whose work of hand and brain increases the nation's wealth. 76 | 77 | Parliaments have in the past been too much concerned to protect the privileges and extend the power of the rich. Labour wants to increase the happiness and prosperity of the poor, by better housing, better schooling, better living, better health, more leisure, more freedom, more opportunities for enjoying the good things of life. 
78 | 79 | We shall defend the school as we defend the home, and we aim at providing the rising generation with full protection from harmful and degrading moral and social conditions. 80 | 81 | Against revolution 82 | 83 | Labour's programme is the best bulwark against violent upheaval and class wars. Democratic government can be made effective in this country without bloodshed or violence. Labour's policy is to bring about a more equitable distribution of the nation's wealth by constitutional means. This is neither Bolshevism nor Communism, but common sense and justice. 84 | 85 | This is Labour's alternative to Reaction and Revolution. 86 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Lab1923.txt: -------------------------------------------------------------------------------- 1 |  2 | 1923 Labour Party General Election Manifesto 3 | Labour's appeal to the nation 4 | 5 | After a year of barren effort, the Conservative Government has admitted its inability to cope with the problem of Unemployment, and is seeking to cover up its failure by putting the nation to the trouble and expense of an election on the Tariff issue. 6 | Tariffs no remedy 7 | 8 | The Labour Party challenges the Tariff policy and the whole conception of economic relations underlying it. Tariffs are not a remedy for Unemployment. They are an impediment to the free interchange of goods and services upon which civilised society rests. They foster a spirit of profiteering, materialism and selfishness, poison the life of nations, lead to corruption in politics, promote trusts and monopolies, and impoverish the people. They perpetuate inequalities in the distrubution of the world's wealth won by the labour of hands and brain. These inequalities the Labour Party means to remove. 
9 | 'Work or maintenance' 10 | 11 | Unemployment is a recurrent feature of the existing economic system, common to every industrialised country, irrespective of whether it has Protection or Free Trade. The Labour Party alone has a positive remedy for it. We denounce as wholly inadequate and belated the programme of winter work produced by the Government, which offers the prospect of employment for only a fraction of the unemployed in a few industries; and in particular provides no relief for women and young persons. 12 | Labour's unemployment programme 13 | 14 | The Labour Party has urged the immediate adoption of national schemes of productive work, with adequate maintenance for those who cannot obtain employment to earn a livelihood for themselves and their families. The flow of young workers from the schools must be regulated to relieve the pressure on the labour market, and full educational training, with maintenance, must be provided for the young people who are now exposed to the perils and temptations of the streets. 15 | 16 | The Labour Programme of National Work includes the establishment of a National System of Electrical Power Supply, the development of Transport by road, rail and canal, and the improvement of national resources by Land Drainage, Reclamation, Afforestation, Town Planning and Housing Schemes. These not only provide a remedy for the present distress, but are also investments for the future. 17 | Help for agriculture 18 | 19 | Agriculture, as the largest and most essential of the nation's industries, calls for special measures to restore its prosperity and to give the land workers a living wage. 
The Labour Policy is one that will develop Agriculture and raise the standard of rural life by establishing machinery for regulating wages with an assured minimum, providing Credit and State Insurance facilities for Farmers and Small-holders, promoting and assisting Co-operative Methods in Production and Distribution, so as to help stabilise prices, and make the fullest use of the results of research. 20 | The Land 21 | 22 | The Labour Party proposes to restore to the people their lost rights in the Land, including Minerals, and to that end will work for re-equipping the Land Valuation Department, securing to the community the economic rent of land, and facilitating the acquisition of land for public use. 23 | Peace among the nations 24 | 25 | Labour's vision of an ordered world embraces the nations now torn with enmity and strife. It stands, therefore, for a policy of International Co-operation through a strengthened and enlarged League of Nations; the settlement of disputes by conciliation and judicial arbitration; the immediate calling by the British Government of an International Conference (including Germany on terms of equality) to deal with the Revision of the Versailles Treaty, especially Reparations and Debts; and the resumption of free economic and diplomatic relations with Russia. This will pave the way for Disarmament, the only security for the nations. 26 | Relief for the taxpayer 27 | 28 | Labour condemns the failure of the Government to take steps to reduce the dead-weight War Debt. No effective reform of the National Finances can be attempted until the steady drain of a million pounds a day in interest is stopped. Treasury experts, in evidence before a Select Committee of the House of Commons, expressed their view that a Tax on War Fortunes could be levied, and have therefore admitted both the principle and its practicability. 
A Labour Chancellor of the Exchequer, in consultation with Treasury experts, would at once work out a scheme to impose a non-recurring, graduated War Debt Redemption levy on all individual fortunes in excess of £5,000, to be devoted solely to the reduction of the Debt. 29 | 30 | The saving thus effected, with reduction of expenditure on armaments, other sane economies, and the increased revenue derived from Taxation of Land Values, would make it possible to reduce the burden of Income Tax, abolish not only the Food Duties, but also the Entertainments Tax and the Corporation Profits Tax, as well as provide money for necessary Social Services. 31 | The Commonwealth of Co-operative Service 32 | 33 | The Labour Party is working for the creation of a Commonwealth Co-operative Service. It believes that so far only a beginning has been made in the scientific organisation of industry. It will apply in a practical spirit the principle of Public Ownership and Control to the Mines, the Railway Service and the Electrical Power Stations, and the development of Municipal Services. It will make work safe for the worker by stricter Inspection of Workplaces, and more effective measures against Accidents and Industrial Diseases. It will provide fuller Compensation for the Workers and improve the Standard of Hours. 34 | The aged, the widows, the children 35 | 36 | Labour Policy is directed to the creation of a humane and civilised society. When Labour rules it will take care that little children shall not needlessly die; it will give to every child equality of opportunity in Education; it will make generous provision for the aged people, the widowed mothers, the sick and disabled citizens. 37 | 38 | It will abolish the slums, promptly build an adequate supply of decent homes and resist decontrol till the shortage is satisfied. It will place the Drink Traffic under popular control. 
39 | Ex-service men's pensions 40 | 41 | In accordance with its past actions inside and outside Parliament, the Labour Party will do its utmost to see that the Ex-Service men and their dependants have fair play. 42 | Equal rights 43 | 44 | Labour stands for equality between men and women: equal political and legal rights, equal rights and privileges in parenthood, equal pay for equal work. 45 | Labour's practical idealism 46 | 47 | The Labour Party submits to the men and women of the country its full programme. It urges them to refuse to make this General Election a wretched partisan squabble about mean and huckstering policies. It appeals to all citizens to take a generous and courageous stand for right and justice, to believe in the possibility of building up a sane and ordered society, to oppose the squalid materialism that dominates the world today, and to hold out their hands in friendship and good-will to the struggling people everywhere who want only freedom, security and a happier life. 48 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Lab1935.txt: -------------------------------------------------------------------------------- 1 | 1935 Labour Party General Election Manifesto 2 | The Labour Party's call to power 3 | 4 | Four years have passed since the 'National' Government obtained a swollen majority in the House of Commons on a campaign of fraud, misrepresentation and panic. The Government has now decided to plunge the nation into an electoral struggle in the midst of an international crisis. 5 | 6 | The Labour Party deplores this attempt to exploit for partisan ends a situation of grave international anxiety. 7 | 8 | It accepts the challenge and enters the Election confident of victory. 
9 | Four barren years 10 | 11 | At the end of four years the country faces the grim spectacle of two million workless with an army of well over a million and a half people on the Poor Law, and with the deepening tragedy of the distressed areas. Whilst doles of varying kinds have been dispensed on a lavish scale to industry after industry, not a single constructive step has been taken to improve the lot of the people. 12 | 13 | * The Government has robbed the unemployed of benefit and subjected them to a harsh and cruel household means test. 14 | * It withdrew, under a storm of public indignation, its new Unemployment Regulations, and after nine months of reconsideration of this burning question it has ignominiously failed to produce any policy for the proper care of the unemployed. 15 | * It has retarded the building of houses to let, curtailed schemes on food and other necessaries of life and by deliberately organising restriction of supplies. 16 | 17 | The international situation 18 | 19 | The Government has a terrible responsibility for the present international situation. It did nothing to check the aggression of Japan in the Far East, and thus seriously discredited the League of Nations and undermined the Collective Peace System. 20 | 21 | It has wrecked the Disarmament Conference by resisting all the constructive proposals made by other States. As regards air armaments, in particular, Lord Londonderry has boasted that he succeeded, though with great difficulty, in preventing an agreement for the complete abolition of all national air forces. 22 | 23 | The Government has helped to restart the arms race, and it failed to make Signor Mussolini understand that, if he broke the peace in Africa, Britain would join with other nations in upholding the authority of the League. 24 | 25 | Too late to stop the war, the Government ranged itself at the eleventh hour behind the Covenant at Geneva. Even so, its action has been slow and half-hearted. 
Whilst paying lip-service to the League it is planning a vast and expensive rearmament programme, which will only stimulate similar programmes elsewhere. This Government is a danger to the peace of the world and to the security of the country. 26 | Labour's peace policy 27 | 28 | The Labour Party calls for a reversal of this suicidal foreign policy. It seeks wholehearted co-operation with the League of Nations and with all States outside the League which desire peace. It stands firmly for the Collective Peace System. It demands speedy action, through the League, to bring the war in Africa to an end, to be followed by an immediate resumption of negotiations for all-round disarmament. 29 | 30 | Labour will efficiently maintain such defence forces as are necessary and consistent with our membership of the League; the best defence is not huge competitive national armaments, but the organisation of collective security against any aggressor and the agreed reduction of national armaments everywhere. 31 | 32 | Labour will propose to other nations the complete abolition of all national air forces, the effective international control of civil aviation and the creation of an international air police force; large reductions by international agreement in naval and military forces; and the abolition of the private manufacture of, and trade in, arms. 33 | 34 | A Labour Government would also seek full international co-operation in economic and industrial questions, with a view to increasing trade and raising standards of living throughout the world, and removing the economic causes of war, through equitable arrangements for access to markets, for the international control of sources of supply of raw materials, and for the extension of the mandate system for colonial territories. 35 | A bold policy of Socialist Reconstruction 36 | 37 | At home, the Labour Party will pursue its policy of Socialist Reconstruction. 
Labour has already put before the country, boldly and clearly, schemes of public ownership for the efficient conduct, in the national interest, of banking, coal and its products, transport, electricity, iron and steel, and cotton. 38 | 39 | It has also declared for the public ownership of land, in order that the community should profit by its value and proper use, the reorganisation of agriculture, the introduction of unemployment insurance for farm workers, the abolition of the 'tied' cottage, and the provision of cheap cottages in the countryside. 40 | 41 | Labour is pledged to a comprehensive programme of industrial legislation, so as to secure reasonable hours and conditions of employment for all workers and adequate compensation for the accidents of working life. It would restore the freedom of Trade Unions lost through the Trade Disputes and the Trade Unions Act. It would repeal the unjust and penal tax which the Government has imposed upon Co-operative Societies. 42 | 43 | Labour in power will attack the problem of the distressed areas by special steps designed to deal with the root causes of their troubles, as part of a vigorous policy of national planning. Labour will sweep away the humiliating means test imposed by the 'National' Government and will provide adequately for the unemployed, but will seek above all to reabsorb idle workers into productive employment by far-reaching schemes of national development. The Labour Party stands for a big move forward in education, including the raising of the school-leaving age with adequate maintenance allowances. It will vigorously develop the health services, and, in particular, will treat as one of its immediate concerns the terrible and neglected problem of maternal mortality. It favours an increase in the amount of old age pensions and a lowering of the qualifying age. It will go ahead with the provision of healthy homes for the people at reasonable rents, until the needs of the nation are fully met. 
44 | 45 | Labour seeks a mandate to carry out this programme by constitutional and democratic means, and with this end in view, it seeks power to abolish the House of Lords and improve the procedure of the House of Commons. 46 | 47 | Labour asks the Nation for a Parliamentary Majority to promote Socialism at home and Peace abroad. 48 | -------------------------------------------------------------------------------- /W6_03_07_19/cons_labour_manifestos/Lab1951.txt: -------------------------------------------------------------------------------- 1 | Labour Party Election Manifesto 2 | 3 | Labour - proud of its record, sure in its policies - confidently asks the electors to renew its mandate. 4 | 5 | Four major tasks face our nation: to secure peace; to maintain full employment and increase production; to bring down the cost of living; to build a just society. Only with a Labour Government can the British people achieve these aims. 6 | Peace 7 | 8 | Our first aim is to save the peace of the world. Labour has striven hard since 1945 to bring all the nations together in world-wide co-operation through the United Nations. We have had grievous disappointments, particularly with the Soviet Union, but we shall persevere. We do not for one moment accept the view that a third world war is inevitable. We arm to save the peace. 9 | 10 | The Labour Government decided without hesitation that Britain must play her full part in the strengthening of collective defence. Britain must be strong: so must the Commonwealth. 11 | 12 | But peace cannot be preserved by arms alone. Peace depends equally on bringing freedom from poverty to lands where hunger and disease are the lot of the masses. Britain's Labour Government has given a lead in economic assistance to these lands. As our armed strength grows, more attention must be given to the under-developed regions of the world. Only a Labour Government would do this. 
13 | 14 | The Tory still thinks in terms of Victorian imperialism and colonial exploitation. His reaction in a crisis is to threaten force. His narrow outlook is an obstacle to that world-wide co-operation which alone makes peace secure. He would have denied freedom to India, Pakistan, Ceylon and Burma. 15 | 16 | It is this that makes the election so critical, not only for the people of Britain but for the whole world. Anxious eyes will be watching what we do. If the election were to result in a Tory victory there would be no major power in the councils of the Western nations represented by Labour. 17 | 18 | Surely now, even more than ever before, it is vital to the fate of civilisation that the voice of Labour should be heard wherever and whenever the issues of war and peace are discussed between the spokesmen of the Great Powers. 19 | Full Employment and Production 20 | 21 | Full employment through six years of peace is the greatest of all Labour's achievements. It has never happened before. It has meant a revolution in the lives of our people. To-day, there are half a million unfilled vacancies at the employment exchanges. Under Labour - more jobs than workers. Under the Tories - more workers than jobs. 22 | 23 | Largely due to full employment, with everyone contributing to the national product, production in Britain since 1945 has risen twice as fast each year as under the Tories. Our industrial and agricultural output is now 50 per cent above pre-war, but we must do better still to improve our living standards, to fulfil our obligations in collective defence and to play our part in assisting under-developed regions. Almost 20 per cent of the national income is now devoted to new capital equipment for the nation. This is higher than ever in British history. 24 | 25 | World shortage of raw materials has steeply raised the prices of our imports and re-opened the dollar gap. The difficulties are great. But we can conquer them. 
26 | 27 | We shall do everything possible to stimulate production at home and to expand our exports. We shall press on with the development of new sources of raw materials, particularly within the Commonwealth. 28 | 29 | We shall attack monopolies and combines which restrict production and keep prices and profits too high. We shall prohibit by law the withholding of supplies to traders who bring prices down. 30 | 31 | We shall take over concerns which fail the nation and start new public enterprises wherever this will serve the national interest. We shall help industry with scientific and technical aid. 32 | 33 | We shall establish Development Councils, by compulsion if necessary, wherever this will help industrial efficiency. 34 | 35 | We shall associate the workers more closely with the administration of public industries and services. 36 | 37 | The British countryside which was being ruined and depopulated before the war is more prosperous under Labour than ever before. Our farmers and farmworkers have beaten all records in the production of home-grown food. We shall continue the policy of guaranteed prices for the farmer and good conditions of labour for the farmworker. 38 | 39 | We shall further extend electricity and water supplies and sewerage in rural areas. We shall encourage agricultural co-operatives. We shall stop the creation of new tied cottages under the cottage certificate system as the first step towards a just and comprehensive solution of the tied cottage problem. 40 | 41 | Under the Tories there was never full employment. Year after year millions were without work. The Tories gave us the distressed areas. They betrayed agriculture; they encouraged monopolies and cartels. They are condemned by their record. 42 | Cost of Living 43 | 44 | Rising world prices have increased the cost of living, but much less in Britain than in most other countries. 
Our people have been sheltered against rising prices by Labour's policy of price control; by rent control; by food subsidies worth 12s. a week to the average family; by utility production and by bulk purchase which has kept down the cost of imports. 45 | 46 | Though long overdue improvements in the miners' wages and working conditions have been made, the price of coal under nationalisation is less than in any other country in Europe, or in the United States. 47 | 48 | Our Government has started international discussions for a fairer distribution of raw materials and for lower and more stable prices. Largely through this initiative the prices of textiles and clothing, including children's clothing, have recently been much reduced. This brings great benefit to the housewife and is a welcome change from the previous upward movement of the cost of living. 49 | 50 | We hope to see this fall extend to other prices soon. With this object we shall extend and strengthen price controls. We shall set up new auction markets in provincial towns to reduce the price of fruit and vegetables to the housewife. She will have fresher supplies, and when unnecessary middlemen are cut out the grower will get better and more stable prices. We shall overhaul marketing in other trades with the same object. 51 | 52 | Tory policy would cause a catastrophic rise in the cost of living. They are for high profits and against controls. They demand the abandonment of bulk purchase. They want to end the utility scheme. They would allow landlords to raise rents. 53 | Social Justice 54 | 55 | Contrast Britain in the inter-war years with Britain to-day. Then we had mass unemployment; mass fear; mass misery. Now we have full employment. 56 | 57 | Then millions suffered from insecurity and want. Now we have social security for every man, woman and child. 58 | 59 | Then dread of doctors bills was a nightmare in countless homes so that good health cost more than most people could afford to pay. 
Now we have a national health scheme which is the admiration of the post-war world. 60 | 61 | Then we had the workhouse and the Poor Law for the old people. Now we have a national insurance system covering the whole population with greatly improved pensions and a humane National Assistance scheme. 62 | 63 | Then only 39 per cent of the nation's personal incomes after taxation went to the wage earner, and 34 per cent to rent, interest and profit. Now, following Labour's great reforms in taxation, 48 per cent goes in wages and only 25 per cent in rent, interest and profit. 64 | 65 | There has, indeed, been progress, but much more remains to be done in the redistribution of income and of property to ensure that those who create the nation's wealth receive their just reward. Half of Britain's wealth is still owned by 1 per cent of the population. 66 | 67 | Labour will press forward towards greater social equality and the establishment of equal opportunities for all. We shall extend our policy of giving all young people equal opportunities in education. We shall encourage a spirit of hope and adventure in the young. 68 | 69 | As soon as tax reductions become possible we shall still further reduce taxation of wages, salaries, moderate incomes and moderate inheritances. We shall also take steps to abolish the differences between the payment of men and women in the public services. On the other hand, we shall limit dividends by law, increase taxation on the small minority who own great fortunes and large unearned incomes, and take measures to prevent large capital gains. 70 | 71 | The Tories are against a more equal society. They stand, as they have always stood, for privilege. In Parliament they proposed cuts in taxation on large incomes and fought the profits tax. They opposed the dividend freeze. 72 | 73 | In order to reduce the taxes of the well-to-do they would cut down the social services and penalise the great mass of people. 
They now suggest 'some sort of an excess profits tax'. In the interests of the nation Labour would stop all excess profits. 74 | 75 | They have voted in Parliament against the National Health Service, and they condemned the Labour Government for being 'too hasty' in introducing family allowances and raising old age pensions. 76 | 77 | Under Labour more than 1,300,000 new dwellings have been built since the war. We shall maintain the present rate of 200,000 new houses a year and increase it as soon as raw materials and manpower can be spared. Most of these houses will as now be built for rent and not for sale, and for the benefit of those whose housing need is greatest. 78 | 79 | We shall give security to householders and shopkeepers by leasehold enfranchisement and by other changes in the law. 80 | Forward with Labour or Backward with the Tories 81 | 82 | We ask the electors to renew their vote of confidence in the Labour Party. It is a simple choice - Labour or Tory. 83 | 84 | Look first at the past records, for we have both made history. But what kind of history? To-day, after six years of Labour rule and in spite of post-war difficulties, the standard of living of the vast majority of our people is higher than ever it was in the days of Tory rule. Never have the old folk been better cared for. Never had we so happy and healthy a young generation as we see in Britain to-day. 85 | 86 | Scotland and Wales have a new vitality. The great areas of depression have gone. There has been much devolution of administration from Whitehall and this will be carried further. 87 | 88 | Welfare at home, peace abroad, with a constant striving for international co-operation - this is Labour's aim. The Tories with their dark past, full of bitter memories for so many of our people, promise no light for the future. They would take us backward into poverty and insecurity at home and grave perils abroad. 
89 | 90 | -------------------------------------------------------------------------------- /W6_03_07_19/news_data.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prodriguezsosa/Text-as-Data-Lab-Spring-2019/e5533d2fa9e873fa71ca26b45ec1e82248b43959/W6_03_07_19/news_data.rds -------------------------------------------------------------------------------- /W7_03_14_19/Session 7 - Supervised Learning III.R: --------------------------------------------------------------------------------
# TA: Pedro L. Rodríguez
# Course: Text as Data
# Date: 03/14/2019
# Lab adapted from: Kevin Munger, Patrick Chester and Leslie Huang.
#
# Purpose: classify news headlines ("weird" vs "good") with support vector
# machines, first through RTextTools and then through caret, and compare each
# model's held-out accuracy against a majority-class baseline.

#----------------------------------------
# Set up environment                  ---
#----------------------------------------
# clear global environment
rm(list = ls())

# load required libraries
library(dplyr)
library(RTextTools)

# set working directory (news_data.rds is stored in the week-6 folder)
setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/")

# SUGGESTION: we looked at both RTextTools & caret, but use caret if possible
# rather than RTextTools given the latter has been archived

#----------------------------------------
# 1. Load, clean and inspect data     ---
#----------------------------------------
news_data <- readRDS("news_data.rds")
table(news_data$category)

# let's work with 2 categories
news_samp <- news_data %>%
  filter(category %in% c("WEIRD NEWS", "GOOD NEWS")) %>%
  select(headline, category) %>%
  setNames(c("text", "class"))

# get a sense of how the text looks
dim(news_samp)
head(news_samp$text[news_samp$class == "WEIRD NEWS"])
head(news_samp$text[news_samp$class == "GOOD NEWS"])

# some pre-processing (the rest we'll let dfm do)
news_samp$text <- gsub(pattern = "'", "", news_samp$text) # remove apostrophes
news_samp$class <- recode(news_samp$class, "WEIRD NEWS" = "weird", "GOOD NEWS" = "good")

# what's the distribution of classes?
prop.table(table(news_samp$class))

# randomize order (the RTextTools split below is positional: the first 90% of
# rows train, the remaining rows test)
set.seed(1984)
news_samp <- news_samp %>% sample_n(nrow(news_samp))
rownames(news_samp) <- NULL

#----------------------------------------
# 2. Support Vector Machine (SVM) using RTextTools ---
#----------------------------------------

# A. create document feature matrix (RTextTools has its own function)
news_dfm <- create_matrix(news_samp$text,
                          language = "english",
                          removePunctuation = TRUE,
                          removeNumbers = TRUE,
                          stemWords = TRUE)

# B. create a "container" (an RTextTools object) with training and test sets
training_break <- as.integer(0.9 * nrow(news_samp)) # define training set break
test_rows <- (training_break + 1):nrow(news_dfm)    # rows held out for testing
container <- create_container(matrix = news_dfm,
                              labels = news_samp$class,
                              trainSize = 1:training_break,
                              testSize = test_rows,
                              virgin = FALSE)

# C. now we can train any number of models
print_algorithms()

# SVM - linear
svm.linear <- train_model(container, "SVM", kernel = "linear")

# SVM - radial
svm.radial <- train_model(container, "SVM", kernel = "radial")

# D. out of sample evaluation

# predict test set classes
svm.linear.classify <- classify_model(container, svm.linear)
svm.radial.classify <- classify_model(container, svm.radial)

# true labels of the held-out rows
test_labels <- as.character(news_samp$class[test_rows])

# baseline = share of the most frequent class in the test set.
# It has to be computed here: the global environment was cleared at the top of
# the script, so nothing defined later (section 3) is available at this point.
baseline_acc <- max(prop.table(table(test_labels)))

# confusion matrices (rows = truth, columns = prediction), for inspection
cmat_linear <- table(test_labels, svm.linear.classify$SVM_LABEL)
cmat_radial <- table(test_labels, svm.radial.classify$SVM_LABEL)
cmat_linear
cmat_radial

# accuracy = (TP + TN) / (TP + FP + TN + FN)
# computed as the share of matching labels rather than sum(diag(cmat)) / sum(cmat):
# the diagonal is only meaningful when the table is square, which fails if a
# model never predicts one of the classes.
linear_acc <- mean(as.character(svm.linear.classify$SVM_LABEL) == test_labels)
radial_acc <- mean(as.character(svm.radial.classify$SVM_LABEL) == test_labels)

# print
cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "SVM-Linear Accuracy:", linear_acc, "\n",
  "SVM-Radial Accuracy:", radial_acc
)

# each model has its own set of tuning parameters
?train_model

## Comments:
# linear vs radial kernels
# radial more flexible BUT/HENCE can overfit
# linear kernel is faster
# nfold is the number of times you have a different test set

#----------------------------------------
# 3. Support Vector Machine (SVM) using Caret ---
#----------------------------------------
library(caret)
library(quanteda)

# create document feature matrix
# NOTE(review): newer quanteda releases deprecate calling dfm() on raw text with
# stem/remove arguments -- confirm against the installed version.
news_dfm <- dfm(news_samp$text, stem = TRUE, remove_punct = TRUE, remove = stopwords("english")) %>%
  convert("matrix")

# A. the caret package has its own partitioning function
# Pass the outcome vector (not row indices) so the 80/20 split is stratified on
# the class labels.
set.seed(1984)
ids_train <- createDataPartition(news_samp$class, p = 0.8, list = FALSE, times = 1)
train_x <- news_dfm[ids_train, ] %>% as.data.frame()    # train set data
train_y <- news_samp$class[ids_train] %>% as.factor()   # train set labels
test_x <- news_dfm[-ids_train, ] %>% as.data.frame()    # test set data
test_y <- news_samp$class[-ids_train] %>% as.factor()   # test set labels

# baseline (majority-class share in the caret test set)
baseline_acc <- max(prop.table(table(test_y)))

# B. define training options (we've done this manually above)
trctrl <- trainControl(method = "none")
#trctrl <- trainControl(method = "LOOCV", p = 0.8)

# C. train model (caret gives us access to even more options)
# see: https://topepo.github.io/caret/available-models.html

# svm - linear
svm_mod_linear <- train(x = train_x,
                        y = train_y,
                        method = "svmLinear",
                        trControl = trctrl)

svm_linear_pred <- predict(svm_mod_linear, newdata = test_x)
svm_linear_cmat <- confusionMatrix(svm_linear_pred, test_y)

# svm - radial
svm_mod_radial <- train(x = train_x,
                        y = train_y,
                        method = "svmRadial",
                        trControl = trctrl)

svm_radial_pred <- predict(svm_mod_radial, newdata = test_x)
svm_radial_cmat <- confusionMatrix(svm_radial_pred, test_y)

cat(
  "Baseline Accuracy: ", baseline_acc, "\n",
  "SVM-Linear Accuracy:", svm_linear_cmat$overall[["Accuracy"]], "\n",
  "SVM-Radial Accuracy:", svm_radial_cmat$overall[["Accuracy"]]
)

# why may result differ from above?

-------------------------------------------------------------------------------- /W8_03_28_19/Session 8 - Supervised Learning IV.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L.
Rodríguez 2 | # Course: Text as Data 3 | # Date: 03/28/2019 4 | 5 | # discuss HW2 typos 6 |
#
# Purpose: classify news headlines ("money" vs "latino") with random forests,
# first with the randomForest package directly and then through caret (fixed
# settings, grid search over mtry, and a manual comparison over ntree).

#----------------------------------------
# go over concepts                    ---
#----------------------------------------
# Ensemble methods
# Ensemble = a group of predictors
# intuition: wisdom of the crowds (best if predictions are independent)
# 1. Bagging: use different random subsets of the training set w. replacement
# 2. Pasting: use different random subsets of the training set w/o replacement
# 3. Boosting: train predictors sequentially, each trying to correct its predecessor
# 3a. AdaBoost: increase weight of instances that predecessor underfitted/misclassified
# 3b. Gradient Boosting: fit new predictor to the residual errors made by the previous predictor
# 4. Stacking: train a model to aggregate predictions of all predictors in an ensemble (can have several layers)
# Random Forest is an ensemble of decision trees (generally use bagging)
# Random Forest introduces additional source of randomness when growing trees:
# non RF-decision trees search for the best feature among ALL features when splitting a node
# decision-trees in a RF algorithm search for the best feature among a random subset of features

#----------------------------------------
# Set up environment                  ---
#----------------------------------------
# clear global environment
rm(list = ls())

# load required libraries
library(dplyr)
library(randomForest)
library(mlbench)
library(caret)

# set working directory (news_data.rds is stored in the week-6 folder)
setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/")

#----------------------------------------
# 1. Load, clean and inspect data     ---
#----------------------------------------
news_data <- readRDS("news_data.rds")
table(news_data$category)

# let's work with 2 categories
set.seed(1984)
news_samp <- news_data %>%
  filter(category %in% c("MONEY", "LATINO VOICES")) %>%
  group_by(category) %>%
  sample_n(500) %>% # sample 500 of each to reduce computation time (for lab purposes)
  ungroup() %>%
  select(headline, category) %>%
  setNames(c("text", "class"))

# get a sense of how the text looks
dim(news_samp)
head(news_samp$text[news_samp$class == "MONEY"])
head(news_samp$text[news_samp$class == "LATINO VOICES"])

# some pre-processing (the rest we'll let dfm do)
news_samp$text <- gsub(pattern = "'", "", news_samp$text) # remove apostrophes
news_samp$class <- recode(news_samp$class, "MONEY" = "money", "LATINO VOICES" = "latino")

# what's the distribution of classes?
prop.table(table(news_samp$class))

# randomize order
set.seed(1984)
news_samp <- news_samp %>% sample_n(nrow(news_samp))
rownames(news_samp) <- NULL

#----------------------------------------
# 2. Prepare Data                     ---
#----------------------------------------
library(caret)
library(quanteda)

# create document feature matrix
# NOTE(review): newer quanteda releases deprecate calling dfm() on raw text with
# stem/remove arguments -- confirm against the installed version.
news_dfm <- dfm(news_samp$text, stem = TRUE, remove_punct = TRUE, remove = stopwords("english")) %>%
  convert("matrix")

# keep tokens that appear in at least `min_docfreq` headlines
# (the comparison is >= so that "at least 5" really includes tokens found in
# exactly 5 headlines)
min_docfreq <- 5
feature_count <- colSums(news_dfm > 0) # number of headlines containing each token
features <- names(which(feature_count >= min_docfreq))
news_dfm <- news_dfm[, features, drop = FALSE]

# caret package has its own partitioning function
# Pass the outcome vector (not row indices) so the 80/20 split is stratified on
# the class labels.
set.seed(1984)
ids_train <- createDataPartition(news_samp$class, p = 0.8, list = FALSE, times = 1)
train_x <- news_dfm[ids_train, ] %>% as.data.frame()    # train set data
train_y <- news_samp$class[ids_train] %>% as.factor()   # train set labels
test_x <- news_dfm[-ids_train, ] %>% as.data.frame()    # test set data
test_y <- news_samp$class[-ids_train] %>% as.factor()   # test set labels

#----------------------------------------
# 3. Using RandomForest               ---
#----------------------------------------
library(randomForest)
# NOTE(review): sqrt() generally yields a non-integer here; confirm how
# randomForest/caret treat a fractional mtry.
mtry <- sqrt(ncol(train_x)) # number of features to sample at each split
ntree <- 51                 # number of trees to grow
# more trees generally improve accuracy but at the cost of computation time
# odd numbers avoid ties (recall default aggregation is "majority voting")
set.seed(1984)
system.time(rf.base <- randomForest(x = train_x, y = train_y, ntree = ntree, mtry = mtry))

# type = 2 requests the node-impurity based importance measure
token_importance <- round(importance(rf.base, type = 2), 2)
head(rownames(token_importance)[order(-token_importance)])

# print results
print(rf.base)

# plot importance
# gini impurity = how "pure" is given node ~ class distribution
# = 0 if all instances the node applies to are of the same class
# upper bound depends on number of instances
varImpPlot(rf.base, n.var = 10, main = "Variable Importance")

#----------------------------------------
# 4. RandomForest Using Caret         ---
#----------------------------------------
# 5-fold cross-validation; stored as `train_ctrl` so the object does not shadow
# caret's trainControl() function
train_ctrl <- trainControl(method = "cv", number = 5)
metric <- "Accuracy"
mtry <- sqrt(ncol(train_x))
ntree <- 51
tunegrid <- expand.grid(.mtry = mtry)
set.seed(1984)
system.time(rf.caret <- train(x = train_x, y = train_y, method = "rf", metric = metric,
                              tuneGrid = tunegrid, trControl = train_ctrl,
                              ntree = ntree))

# print results
print(rf.caret)

# plot importance
varImpPlot(rf.caret$finalModel, n.var = 10, main = "Variable Importance")

#----------------------------------------
# 5. RandomForest Using Caret + tuning ---
#----------------------------------------
# note: the package RandomForest also has its own tuning function: tuneRF
train_ctrl <- trainControl(method = "cv", number = 5)
metric <- "Accuracy"
# at the moment caret only allows tuning of mtry (partly b/c ntree is just a
# matter of computational constraints)
tunegrid <- expand.grid(.mtry = c(0.5 * mtry, mtry, 1.5 * mtry))
set.seed(1984)
system.time(rf.grid <- train(x = train_x, y = train_y, method = "rf", metric = metric,
                             tuneGrid = tunegrid, trControl = train_ctrl,
                             ntree = ntree))
# print grid search results
print(rf.grid)

# plot grid search results
plot(rf.grid)

#----------------------------------------
# 6. RandomForest Using Caret + manual tuning ---
#----------------------------------------
mtry <- sqrt(ncol(train_x))
tunegrid <- expand.grid(.mtry = mtry)
# ntree = 1
set.seed(1984)
system.time(rf.man1 <- train(x = train_x, y = train_y, method = "rf", metric = metric,
                             tuneGrid = tunegrid, trControl = train_ctrl, ntree = 1))

# ntree = 5
set.seed(1984)
system.time(rf.man2 <- train(x = train_x, y = train_y, method = "rf", metric = metric,
                             tuneGrid = tunegrid, trControl = train_ctrl, ntree = 5))

# ntree = 51
set.seed(1984)
system.time(rf.man3 <- train(x = train_x, y = train_y, method = "rf", metric = metric,
                             tuneGrid = tunegrid, trControl = train_ctrl, ntree = 51))

# collect results & summarize
results <- resamples(list(rf1 = rf.man1, rf5 = rf.man2, rf51 = rf.man3))
summary(results)

# test set accuracy
confusionMatrix(predict(rf.man1, newdata = test_x), test_y)
confusionMatrix(predict(rf.man2, newdata = test_x), test_y)
confusionMatrix(predict(rf.man3, newdata = test_x), test_y)

# box and whisker plots to compare models
plot_scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = plot_scales)
-------------------------------------------------------------------------------- /W9_04_11_19/Session 9 - Unsupervised Learning I.R: -------------------------------------------------------------------------------- 1 | # TA: Pedro L. Rodríguez 2 | # Course: Text as Data 3 | # Date: 04/11/2019 4 | # Lab adapted from: Leslie Huang. 5 | 6 | # Set up workspace 7 | rm(list = ls()) 8 | 9 | setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W9_04_11_19/") 10 | 11 | # Loading packages 12 | #install.packages("lsa") 13 | #install.packages("factoextra") 14 | 15 | library(quanteda) 16 | library(quanteda.corpora) 17 | library(dplyr) 18 | 19 | ## 1 PCA 20 | 21 | # 1.1 Two functions in base R for PCA: 22 | # see: http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/118-principal-component-analysis-in-r-prcomp-vs-princomp/ 23 | ?prcomp # uses the singular value decomposition approach: examines the covariances/correlations between individuals 24 | ?princomp # uses the spectral decomposition approach: examines the covariances/correlations between variables (need more individuals than variables) 25 | 26 | # Remember to center your data! (default = TRUE) -- use scale() on your matrix beforehand, or the option in prcomp() 27 | # And don't have any missing values!

library(factoextra) # makes it easy to work with PCA (great for visualization)
library(ggplot2)    # ggplot() is called directly below, so attach it explicitly

# 1.2 Example
data("data_corpus_sotu")
SOTU <- corpus_subset(data_corpus_sotu, Date > "1900-01-01")

SOTU_dfm <- dfm(SOTU,
                stem = TRUE,
                remove_punct = TRUE,
                remove = stopwords("english")
)


SOTU_mat <- convert(SOTU_dfm, to = "matrix") # convert to matrix

# run pca
# (`scale.` is the full argument name of prcomp(); spelling it out avoids
#  relying on partial argument matching)
SOTU_pca <- prcomp(SOTU_mat, center = TRUE, scale. = TRUE)

# visualize eigenvalues (scree plot: shows percentage of variance explained by each dimension)
fviz_eig(SOTU_pca, addlabels = TRUE)

# Loadings for each variable: columns contain the eigenvectors
SOTU_pca$rotation[1:10, 1:5]
dim(SOTU_pca$rotation)

# Q: can we interpret the dimensions?
# top loadings on PC1
pc_loadings <- SOTU_pca$rotation

# what do we expect this correlation to be?
cor(pc_loadings[, 1], pc_loadings[, 2]) # these should be orthogonal

# token loadings
N <- 10
pc1_loading <- tibble(token = rownames(pc_loadings), loading = as.vector(pc_loadings[, 1])) %>%
  arrange(-loading)
# scale() returns a one-column matrix; as.numeric() keeps `loading` a plain
# numeric column so the ranking and plotting below operate on a vector
pc1_loading$loading <- as.numeric(scale(pc1_loading$loading, center = TRUE))
# keep the N largest and the N smallest (most negative) standardized loadings
pc1_loading <- rbind(top_n(pc1_loading, N, loading), top_n(pc1_loading, -N, loading))
# freeze the current row order as factor levels so the bars are drawn in that order
pc1_loading <- transform(pc1_loading, token = factor(token, levels = unique(token)))

# plot top tokens according to absolute loading values
ggplot(pc1_loading, aes(token, loading)) +
  geom_bar(stat = "identity", fill = ifelse(pc1_loading$loading <= 0, "grey20", "grey70")) +
  coord_flip() +
  xlab("Tokens") + ylab("Tokens with Top Loadings on PC1") +
  scale_colour_grey(start = .3, end = .7) +
  theme(panel.background = element_blank(),
        axis.text.x = element_text(size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.y = element_text(size = 18, margin = margin(t = 0, r = 15, b = 0, l = 15)),
        axis.title.x = element_text(size = 18, margin = margin(t = 15, r = 0, b = 15, l = 0)),
        legend.text = element_text(size = 16),
        legend.title = element_blank(),
        legend.key = element_blank(),
        legend.position = "top",
        legend.spacing.x = unit(0.25, 'cm'),
        plot.margin = unit(c(1, 1, 0, 0), "cm"))

# Value of the rotated data: your "new", dimensionality reduced data
View(SOTU_pca$x) # each observation

# retrieve most similar documents
library(text2vec)

# Return the N rows of `low_dim_space` most similar (by cosine) to the query row.
#   query         : one row name (or row index) of `low_dim_space`
#   low_dim_space : numeric matrix with one named row per document
#   N             : number of neighbors to return
#   norm          : normalization option passed on to sim2()
# Returns a character vector of at most N row names, most similar first.
# The query is removed by name rather than by assuming it ranks first, so ties
# or duplicated rows cannot push a real neighbor out of the result, and asking
# for more neighbors than exist no longer pads the result with NA.
nearest_neighbors <- function(query, low_dim_space, N = 5, norm = "l2") {
  stopifnot(!is.null(rownames(low_dim_space)))
  query_vec <- low_dim_space[query, , drop = FALSE]
  stopifnot(nrow(query_vec) == 1)
  cos_sim <- sim2(x = low_dim_space, y = query_vec, method = "cosine", norm = norm)
  cos_sim <- cos_sim[, 1] # single query column -> vector named by row
  cos_sim <- cos_sim[names(cos_sim) != rownames(query_vec)]
  head(names(cos_sim)[order(-cos_sim)], N)
}

# apply to document retrieval
nearest_neighbors(query = "Obama-2009", low_dim_space = SOTU_pca$x, N = 5, norm = "l2")

# Visualization resources:

# Tutorial from factoextra author about how to use his package to explore and visualize PCA results: http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/

# See here for visualizing PCA with the ggbiplot library: https://www.r-bloggers.com/computing-and-visualizing-pca-in-r/


## 2 Latent Semantic Analysis (LSA) aka Latent Semantic Indexing (LSI)

library(lsa)
# foundational theory: distributional hypothesis
# what is the context in LSA?
# -> document (in LSA the context is the document)

# Let's keep using the SOTU data from before
# to = "lsa" yields the transposed matrix: terms are rows, documents are columns (a TDM)
SOTU_mat_lsa <- convert(SOTU_dfm, to = "lsa")

# local x global weighting (akin to TF-IDF)
local_weights <- lw_logtf(SOTU_mat_lsa)
global_weights <- gw_idf(SOTU_mat_lsa)
SOTU_mat_lsa <- local_weights * global_weights

# 2.1 Create LSA weights using TDM
SOTU_lsa <- lsa(SOTU_mat_lsa)
#lsa(myMatrix, dims = dimcalc_share(share = 0.8)) # share = fraction of the sum of the selected singular values over the sum of all singular values, default is 0.5

# what do we expect this correlation to be?
term_vectors <- SOTU_lsa$tk
cor(term_vectors[, 1], term_vectors[, 2]) # these should be orthogonal

# 2.2 Check to see what a good number of dimensions is
help("dimcalc_share")

# lsa_obj$tk = truncated term matrix from term vector matrix T (constituting left singular vectors from the SVD of the original matrix)
# lsa_obj$dk = truncated document matrix from document vector matrix D (constituting right singular vectors from the SVD of the original matrix)
# lsa_obj$sk = singular values: Matrix of scaling values to ensure that multiplying these matrices reconstructs TDM
# see: https://cran.r-project.org/web/packages/lsa/lsa.pdf

# Lecture example uses dims = 5
SOTU_lsa_5 <- lsa(SOTU_mat_lsa, dims = 5)

# display generated LSA space
help("as.textmatrix")
# as.textmatrix() rebuilds a term x document matrix from the LSA space;
# transpose it so each row is a document
SOTU_lsa_5_tdm <- as.textmatrix(SOTU_lsa_5)
SOTU_lsa_5_mat <- t(SOTU_lsa_5_tdm)

# 2.3 Q: What are these documents about?
# Compare features for a few speeches
# Rebuild the term x document matrix of the automatically-dimensioned LSA
# space once and reuse it below (terms are rows, documents are columns).
SOTU_lsa_mat <- as.textmatrix(SOTU_lsa)

docnames(SOTU_dfm)[9] # public accessor instead of reaching into the S4 slot
topfeatures(SOTU_dfm[9, ])

# With 5 dims:
head(sort(SOTU_lsa_5_mat[9, ], decreasing = TRUE), 10)

# With auto (21) dims:
# column 9 of the term x document matrix == row 9 of its transpose
head(sort(SOTU_lsa_mat[, 9], decreasing = TRUE), 10)

# Another example:
docnames(SOTU_dfm)[55]
topfeatures(SOTU_dfm[55, ])

head(sort(SOTU_lsa_5_mat[55, ], decreasing = TRUE), 10)
head(sort(SOTU_lsa_mat[, 55], decreasing = TRUE), 10)

# Q: How are words related?
# associate(): a method to identify words that are most similar to other words using a LSA
?associate
# uses cosine similarity between input term and other terms

# head() returns only the terms that cleared the threshold, whereas [1:10]
# would pad the result with NA when fewer than 10 terms qualify
oil <- associate(SOTU_lsa_mat, "oil", measure = "cosine", threshold = .7)
head(oil, 10)

health <- associate(SOTU_lsa_mat, "health", measure = "cosine", threshold = .7)
head(health, 10)

# Keep this in mind when we do topic models!

## 2 WORDFISH
# one-dimensional text scaling method.
# unlike wordscores, it does not require reference texts

# How is it different from other approaches we've used for scaling?

# 2.1 Read in conservative and labour manifestos (from Recitation 6)
setwd("~/Drobox/GitHub/Text-as-Data-Lab-Spring-2019/W6_03_07_19/cons_labour_manifestos")

# only the manifesto text files; one string per file with its lines joined by spaces
files <- list.files(pattern = "\\.txt$", full.names = TRUE)
text <- vapply(files,
               function(path) paste(readLines(path), collapse = " "),
               character(1),
               USE.NAMES = FALSE)

# Name data: strip directory and extension, e.g. "./Con1974a.txt" -> "Con1974a"
# (the previous gsub("./", ...) / gsub(".txt", ...) treated "." as a regex wildcard)
files <- tools::file_path_sans_ext(basename(files))

# Create metadata
# Each value is derived per file name, so `year` and `party` always stay
# aligned with `text` (the old strsplit()/unlist() approach could drift, and
# its class [A-z] also matched the punctuation between "Z" and "a").
year <- as.numeric(gsub("[^0-9]", "", files)) # digits only, e.g. 1974
party <- sub("[^A-Za-z].*$", "", files)       # leading letters, e.g. "Con"

#create data frame
man_df <- data.frame(year = factor(year),
                     party = factor(party),
                     text = text,
                     stringsAsFactors = FALSE)

# add text labels
man_df$text_label <- paste(man_df$party, man_df$year, sep = "_")


lab_con_dfm <- dfm(man_df$text,
                   stem = TRUE,
                   remove = stopwords("english"),
                   remove_punct = TRUE
)

# 2.2 fit wordfish
docnames(lab_con_dfm) <- man_df$text_label

# Setting the index on parties
# `dir` takes the indices of two anchor documents used to fix the direction of
# the scale: the first Conservative and the first non-Conservative manifesto.
# With the files sorted by name this is c(1, 24), the value hard-coded before.
is_con <- man_df$party == "Con"
stopifnot(any(is_con), any(!is_con))
manifestos_fish <- textmodel_wordfish(lab_con_dfm,
                                      dir = c(which(is_con)[1], which(!is_con)[1]))

# visualize one-dimensional scaling
textplot_scale1d(manifestos_fish)
textplot_scale1d(manifestos_fish, groups = man_df$party)

# Plot of document positions
# All manifestos are drawn in one call so the axis limits cover both parties
# (plotting one party first and adding the other with points() could clip it).
# pch 1 = conservative manifestos, pch 8 = Labour manifestos
plot(year, manifestos_fish$theta,
     pch = ifelse(is_con, 1, 8),
     xlab = "year", ylab = "theta")

plot(as.factor(party), manifestos_fish$theta)

# most important features--word fixed effects
words <- manifestos_fish$psi # values
names(words) <- manifestos_fish$features # the words

head(sort(words), 50)
head(sort(words, decreasing = TRUE), 50)

# Guitar plot
weights <- manifestos_fish$beta

plot(weights, words)

# also check out wordshoal!
--------------------------------------------------------------------------------