├── .gitignore
├── README.Rmd
├── README.md
├── articles
├── Blei2003.pdf
├── Blei2007.pdf
├── Blei2008.pdf
├── Blei2009.pdf
├── Blei2012.pdf
├── Chang2009.pdf
├── Griffiths2004.pdf
├── Griffiths2007.pdf
├── Gruen2011.pdf
├── Mimno2007.pdf
├── Mimno2013.Rmd
├── Newman2006.pdf
├── Papadimitriou1997.pdf
├── Ponweiser2012.pdf
├── Roberts2013.pdf
├── Roberts2014.pdf
├── Robertsnd.pdf
├── Sievert2014a.pdf
├── Sievert2014b.pdf
├── Srivastava2009.pdf
├── Steyvers2007.pdf
├── Taddy2012.pdf
└── Tang2014.pdf
├── data
├── Jockers_data
│ ├── plainText
│ │ ├── austen.txt
│ │ └── melville.txt
│ └── taggedCorpus
│ │ ├── Carleton1.xml.txt
│ │ ├── Carleton10.xml.txt
│ │ ├── Carleton11.xml.txt
│ │ ├── Carleton12.xml.txt
│ │ ├── Carleton13.xml.txt
│ │ ├── Carleton14.xml.txt
│ │ ├── Carleton2.xml.txt
│ │ ├── Carleton3.xml.txt
│ │ ├── Carleton4.xml.txt
│ │ ├── Carleton5.xml.txt
│ │ ├── Carleton6.xml.txt
│ │ ├── Carleton7.xml.txt
│ │ ├── Carleton8.xml.txt
│ │ ├── Carleton9.xml.txt
│ │ ├── Edgeworth1.xml.txt
│ │ ├── LeFanu1.xml.txt
│ │ ├── LeFanu2.xml.txt
│ │ ├── LeFanu3.xml.txt
│ │ ├── LeFanu4.xml.txt
│ │ ├── LeFanu5.xml.txt
│ │ ├── LeFanu6.xml.txt
│ │ ├── LeFanu7.xml.txt
│ │ ├── Lewis.xml.txt
│ │ ├── Norris1.xml.txt
│ │ ├── Norris2.xml.txt
│ │ ├── Norris3.xml.txt
│ │ ├── Norris4.xml.txt
│ │ ├── Polidori1.xml.txt
│ │ ├── Quigley1.xml.txt
│ │ ├── Quigley2.xml.txt
│ │ ├── anonymous.xml.txt
│ │ ├── donovan1.xml.txt
│ │ ├── donovan2.xml.txt
│ │ ├── driscoll1.xml.txt
│ │ ├── driscoll2.xml.txt
│ │ ├── driscoll3.xml.txt
│ │ ├── jessop1.xml.txt
│ │ ├── jessop2.xml.txt
│ │ ├── jessop3.xml.txt
│ │ ├── kyne1.xml.txt
│ │ ├── kyne2.xml.txt
│ │ ├── mcHenry1.xml.txt
│ │ └── mcHenry2.xml.txt
├── LDAvis_reviews.txt
├── mallet_texts
│ ├── README
│ ├── numeric
│ │ ├── boxes.txt
│ │ └── puffins.txt
│ └── web
│ │ ├── de
│ │ ├── apollo8.txt
│ │ ├── fiv.txt
│ │ ├── habichtsadler.txt
│ │ ├── hoechst.txt
│ │ ├── indogermanische.txt
│ │ ├── konrad.txt
│ │ ├── marcellinus.txt
│ │ ├── rostock.txt
│ │ ├── sadat.txt
│ │ ├── t40.txt
│ │ ├── ulrich.txt
│ │ └── wildenstein.txt
│ │ └── en
│ │ ├── elizabeth_needham.txt
│ │ ├── equipartition_theorem.txt
│ │ ├── gunnhild.txt
│ │ ├── hawes.txt
│ │ ├── hill.txt
│ │ ├── shiloh.txt
│ │ ├── sunderland_echo.txt
│ │ ├── thespis.txt
│ │ ├── thylacine.txt
│ │ ├── uranus.txt
│ │ ├── yard.txt
│ │ └── zinta.txt
├── recipes
│ ├── README.md
│ ├── allr_recipes.txt
│ ├── epic_recipes.txt
│ ├── map.txt
│ └── menu_recipes.txt
└── stm_gadarian.csv
├── functions
├── optimal_k.R
└── topicmodels2LDAvis.R
├── inst
└── figure
│ ├── topic-model.jpg
│ ├── unnamed-chunk-10-1.png
│ ├── unnamed-chunk-11-1.png
│ ├── unnamed-chunk-12-1.png
│ ├── unnamed-chunk-5-1.png
│ ├── unnamed-chunk-6-1.png
│ ├── unnamed-chunk-7-1.png
│ ├── unnamed-chunk-8-1.png
│ └── unnamed-chunk-9-1.png
├── presentations
└── Blei2009.pdf
├── scripts
└── Example_topic_model_analysis.R
├── stopword_lists
├── Jockers_stoplist-exp.csv
├── Jockers_stoplist.csv
├── mallet_stoplists
│ ├── README
│ ├── de.txt
│ ├── en.txt
│ ├── fi.txt
│ ├── fr.txt
│ └── jp.txt
├── python_stoplists
│ ├── arabic.txt
│ ├── catalan.txt
│ ├── danish.txt
│ ├── dutch.txt
│ ├── english.txt
│ ├── finnish.txt
│ ├── french.txt
│ ├── german.txt
│ ├── hungarian.txt
│ ├── italian.txt
│ ├── languages.json
│ ├── norwegian.txt
│ ├── portuguese.txt
│ ├── romanian.txt
│ ├── russian.txt
│ ├── spanish.txt
│ ├── swedish.txt
│ ├── turkish.txt
│ └── ukrainian.txt
└── tm_stoplists
│ ├── SMART.txt
│ ├── catalan.txt
│ ├── danish.txt
│ ├── dutch.txt
│ ├── english.txt
│ ├── finnish.txt
│ ├── french.txt
│ ├── german.txt
│ ├── hungarian.txt
│ ├── italian.txt
│ ├── norwegian.txt
│ ├── portuguese.txt
│ ├── romanian.txt
│ ├── russian.txt
│ ├── spanish.txt
│ └── swedish.txt
└── topicmodels_learning.Rproj
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Rprofile
5 | optimal_k2.R
6 | optimal_k1.R
7 | README.html
8 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Topic Models Learning and R Resources"
3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
4 | output:
5 | md_document:
6 | toc: true
7 | toc_depth: 2
8 | ---
9 | ```{r, echo=FALSE, message=FALSE}
10 | # rmarkdown::render("README.Rmd", "all"); md_toc()
11 | library(knitr)
12 | knit_hooks$set(htmlcap = function(before, options, envir) {  # register a chunk hook under the name "htmlcap"
13 | if(!before) {  # the hook returns text only when `before` is FALSE
14 | paste('
',options$htmlcap,"
",sep="")
15 | }
16 | })
17 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE)
18 | knitr::opts_chunk$set(fig.path = "inst/figure/")  # chunk figures are written with the path prefix inst/figure/
19 | ```
20 |
21 | This is a collection documenting the resources I find related to topic models with an R flavored focus. A *topic model* is a type of [*generative*](http://stackoverflow.com/questions/879432/what-is-the-difference-between-a-generative-and-discriminative-algorithm) model used to "discover" latent topics that compose a *corpus* or collection of documents. Topic modeling is typically applied to a collection of text documents, but it can also be applied to other kinds of data, for example to generate captions for images.
22 |
23 | 
24 |
25 | # Just the Essentials
26 |
27 | This is my run down of the minimal readings, websites, videos, & scripts the reader needs to become familiar with topic modeling. The list is in an order I believe will be of greatest use and contains a nice mix of introduction, theory, application, and interpretation. As you want to learn more about topic modeling, the other sections will become more useful.
28 |
29 | 1. Boyd-Graber, J. (2013). [Computational Linguistics I: Topic Modeling](https://www.youtube.com/watch?v=4p9MSJy761Y)
30 | 2. Underwood, T. (2012). [Topic Modeling Made Just Simple Enough](http://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/)
31 | 3. Weingart, S. (2012). [Topic Modeling for Humanists: A Guided Tour](http://www.scottbot.net/HIAL/?p=19113)
32 | 4. Blei, D. M. (2012). [Probabilistic topic models](/articles/Blei2012.pdf). *Communications of the ACM, 55*(4), 77-84. doi:10.1145/2133806.2133826
33 | 5. inkhorn82 (2014). [A Delicious Analysis! (aka topic modelling using recipes)](http://rforwork.info/2014/02/17/a-delicious-analysis/) [(CODE)](https://gist.githubusercontent.com/inkhorn/9044779/raw/c7f0ba30d424aaeb75c5e221d12566f6732c4f29/recipe%20analysis.R)
34 | 6. Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of Statistical Software, 40*(13), 1-30.
35 | 7. Marwick, B. (2014a). [The input parameters for using latent Dirichlet allocation](http://stats.stackexchange.com/a/25128/7482)
36 | 8. Tang, J., Meng, Z., Nguyen, X., Mei, Q., & Zhang, M. (2014). [Understanding the limiting factors of topic modeling via posterior contraction analysis](/articles/Tang2014.pdf). In *31st International Conference on Machine Learning*, 190-198.
37 | 9. Sievert, C. (2014). [LDAvis: A method for visualizing and interpreting topic models](https://www.youtube.com/watch?v=IksL96ls4o0)
38 | 10. Rhody, L. M. (2012). [Some Assembly Required: Understanding and Interpreting Topics in LDA Models of Figurative Language](http://www.lisarhody.com/some-assembly-required)
39 | 11. Rinker, T.W. (2015). [R Script: Example Topic Model Analysis](https://raw.githubusercontent.com/trinker/topicmodels_learning/master/scripts/Example_topic_model_analysis.R)
40 |
41 | # Key Players
42 |
43 | Papadimitriou, Raghavan, Tamaki, & Vempala (1997) first introduced the notion of topic modeling in their ["Latent Semantic Indexing: A probabilistic analysis"](/articles/Papadimitriou1997.pdf). Thomas Hofmann (1999) developed "Probabilistic latent semantic indexing". Blei, Ng, & Jordan (2003) proposed *latent Dirichlet allocation* (LDA) as a means of modeling documents with multiple topics, but LDA assumes the topics are uncorrelated. Blei & Lafferty (2007) proposed the *correlated topic model* (CTM), extending LDA to allow for correlations between topics. Roberts, Stewart, Tingley, & Airoldi (2013) proposed the [*Structural Topic Model*](/articles/Roberts2013.pdf) (STM), allowing the inclusion of meta-data in the modeling process.
44 |
45 | # Videos
46 |
47 | ## Introductory
48 |
49 | - Boyd-Graber, J. (2013). [Computational Linguistics I: Topic Modeling](https://www.youtube.com/watch?v=4p9MSJy761Y)
50 |
51 | ## Theory
52 |
53 | - Blei, D. (2007) [Modeling Science: Dynamic Topic Models of Scholarly Research](https://www.youtube.com/watch?v=7BMsuyBPx90)
54 | - Blei, D. (2009) [Topic Models: Parts I & II](http://videolectures.net/mlss09uk_blei_tm/#) ([Lecture Notes](/presentations/Blei2009.pdf))
55 | - Jordan, M. (2014) [A Short History of Topic Models](https://www.youtube.com/watch?v=fBNsHPtTAGs)
56 |
57 |
58 | ## Visualization
59 |
60 | - Sievert, C. (2014) [LDAvis: A method for visualizing and interpreting topic models](https://www.youtube.com/watch?v=IksL96ls4o0)
61 | - Mabey, B. (2015) [SavvySherpa: Visualizing Topic Models](https://www.youtube.com/watch?v=tGxW2BzC_DU)
62 |
63 | # Articles
64 |
65 | ## Applied
66 |
67 | - Marwick, B. (2013). [Discovery of Emergent Issues and Controversies in Anthropology Using Text Mining, Topic Modeling, and Social Network Analysis of Microblog Content](https://www.academia.edu/5508141/Discovery_of_Emergent_Issues_and_Controversies_in_Anthropology_Using_Text_Mining_Topic_Modeling_and_Social_Network_Analysis_of_Microblog_Content). In Yanchang Zhao, Yonghua Cen (eds.), *Data Mining Applications with R*. Elsevier. 63-93.
68 |
69 | - Newman, D.J. & Block, S. (2006). [Probabilistic topic decomposition of an eighteenth-century American newspaper](/articles/Newman2006.pdf). *Journal of the American Society for Information Science and Technology. 57*(6), 753-767. doi:10.1002/asi.v57:6
70 |
71 |
72 | ## Theoretical
73 |
74 | - Blei, D. M. (2012). [Probabilistic topic models](/articles/Blei2012.pdf). *Communications of the ACM, 55*(4), 77-84. doi:10.1145/2133806.2133826
75 | - Blei, D. M. & Lafferty, J. D. (2007) [A correlated topic model of Science](/articles/Blei2007.pdf). *The Annals of Applied Statistics 1*(1), 17-35. doi:10.1214/07-AOAS114
76 | - Blei, D. M. & Lafferty, J. D. (2009) [Topic models](/articles/Blei2009.pdf). In A Srivastava, M Sahami (eds.), [*Text mining: classification, clustering, and applications*](/articles/Srivastava2009.pdf). Chapman & Hall/CRC Press. 71-93.
77 | - Blei, D. M. & McAuliffe, J. (2008). [Supervised topic models](/articles/Blei2008.pdf). In Advances in Neural Information Processing Systems 20, 1-8.
78 | - Blei, D. M., Ng, A.Y., & Jordan, M.I. (2003). [Latent Dirichlet Allocation](/articles/Blei2003.pdf). *Journal of Machine Learning Research, 3*, 993-1022.
79 | - Chang, J., Boyd-Graber, J. , Wang, C., Gerrish, S., & Blei. D. (2009). [Reading tea leaves: How humans interpret topic models](/articles/Chang2009.pdf). In *Neural Information Processing Systems*.
80 | - Griffiths, T.L. & Steyvers, M. (2004). [Finding Scientific Topics](/articles/Griffiths2004.pdf). *Proceedings of the National
81 | Academy of Sciences of the United States of America, 101*, 5228-5235.
82 | - Griffiths, T.L., Steyvers, M., & Tenenbaum, J.B.T. (2007). [Topics in Semantic Representation](/articles/Griffiths2007.pdf). *Psychological Review, 114*(2), 211-244.
83 | - Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of Statistical Software, 40*(13), 1-30.
84 | - Mimno, D. & McCallum, A. (2007). [Organizing the OCA: learning faceted subjects from a library of digital books](/articles/Mimno2007.pdf). In *Joint Conference on Digital Libraries*. ACM Press, New York, NY, 376–385.
85 | - Ponweiser, M. (2012). [Latent Dirichlet Allocation in R (Diploma Thesis)](/articles/Ponweiser2012.pdf). Vienna University of Economics and Business, Vienna
86 | - Roberts M.E., Stewart B.M., Tingley D., & Airoldi E.M. (2013) [The Structural Topic Model and Applied Social Science](/articles/Roberts2013.pdf). *Advances in Neural Information Processing Systems Workshop on Topic Models: Computation, Application, and Evaluation*, 1-4.
87 | - Roberts, M., Stewart, B., Tingley, D., Lucas, C., Leder-Luis, J., Gadarian, S., Albertson, B., et al. (2014). [Structural topic models for open ended survey responses](/articles/Roberts2014.pdf). *American Journal of Political Science, 58*(4), 1064-1082.
88 | - Roberts, M., Stewart, B., Tingley, D. (n.d.). [stm: R Package for Structural Topic Models](/articles/Robertsnd.pdf), 1-49.
89 | - Sievert, C. & Shirley, K. E. (2014a). [LDAvis: A Method for Visualizing and Interpreting Topics.](/articles/Sievert2014a.pdf) in *Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces* 63-70.
90 | - Steyvers, M. & Griffiths, T. (2007). [Probabilistic topic models](/articles/Steyvers2007.pdf). In T. Landauer, D. McNamara, S. Dennis, and W. Kintsch (eds), *Latent Semantic Analysis: A Road to Meaning*. Lawrence Erlbaum.
91 | - Taddy, M.A. (2012). [On Estimation and Selection for Topic Models](/articles/Taddy2012.pdf) In *Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS 2012)*, 1184-1193.
92 | - Tang, J., Meng, Z., Nguyen, X., Mei, Q., & Zhang, M. (2014). [Understanding the limiting factors of topic modeling via posterior contraction analysis](/articles/Tang2014.pdf). In *31st International Conference on Machine Learning*, 190-198.
93 |
94 | # Websites & Blogs
95 |
96 | - Blei, D. (n.d.). [Topic Modeling](https://www.cs.princeton.edu/~blei/topicmodeling.html)
97 | - Jockers, M.L. (2013). ["Secret" Recipe for Topic Modeling Themes](http://www.matthewjockers.net/2013/04/12/secret-recipe-for-topic-modeling-themes/)
98 | - Jones, T. (n.d.). [Topic Models Reading List](http://www.biasedestimates.com/p/topic-models-reading-list.html)
99 | - Marwick, B. (2014a). [The input parameters for using latent Dirichlet allocation](http://stats.stackexchange.com/a/25128/7482)
100 | - Marwick, B. (2014b). [Topic models: cross validation with loglikelihood or perplexity](http://stackoverflow.com/a/21394092/1000343)
101 | - Rhody, L. M. (2012). [Some Assembly Required: Understanding and Interpreting Topics in LDA Models of Figurative Language](http://www.lisarhody.com/some-assembly-required)
102 | - Schmidt, B.M. (2012). [Words Alone: Dismantling Topic Models in the Humanities](http://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/)
103 | - Underwood, T. (2012a). [Topic Modeling Made Just Simple Enough](http://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/)
104 | - Underwood, T. (2012b). [What kinds of "topics" does topic modeling actually produce?](http://tedunderwood.com/2012/04/01/what-kinds-of-topics-does-topic-modeling-actually-produce/)
105 | - Weingart, S. (2012). [Topic Modeling for Humanists: A Guided Tour](http://www.scottbot.net/HIAL/?p=19113)
106 | - Weingart, S. (2011). [Topic Modeling and Network Analysis](http://www.scottbot.net/HIAL/?p=221)
107 |
108 |
109 | # R Resources
110 |
111 | ## Package Comparisons
112 |
113 | | Package | Functionality | Pluses | Author | R Language Interface |
114 | |-------------- | -------------|---------|----------|---------------------|
115 | | lda* | Collapsed Gibbs for LDA | Graphing utilities | Chang | R |
116 | | topicmodels | LDA and CTM | Follows Blei's implementation; great vignette; takes [DTM](https://en.wikipedia.org/wiki/Document-term_matrix) | Grüen & Hornik | C |
117 | | stm | Model w/ meta-data | Great documentation; nice visualization | Roberts, Stewart, & Tingley | C |
118 | | LDAvis | Interactive visualization | Aids in model interpretation | Sievert & Shirley | R + Shiny |
119 | | mallet** | LDA | [MALLET](http://programminghistorian.org/lessons/topic-modeling-and-mallet) is well known | Mimno | Java |
120 |
121 | \*[*StackExchange discussion of lda vs. topicmodels*](http://stats.stackexchange.com/questions/24441/two-r-packages-for-topic-modeling-lda-and-topicmodels)
122 | \*\*[*Setting Up MALLET*](http://programminghistorian.org/lessons/topic-modeling-and-mallet)
123 |
124 |
125 | ## R Specific References
126 |
127 | - Chang J. (2010). lda: Collapsed Gibbs Sampling Methods for Topic Models. http://CRAN.R-project.org/package=lda.
128 | - Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of Statistical Software, 40*(13), 1-30.
129 | - Mimno, D. (2013). [vignette-mallet: A wrapper around the Java machine learning tool MALLET](/articles/Mimno2013.Rmd). https://CRAN.R-project.org/package=mallet
130 | - Ponweiser, M. (2012). [Latent Dirichlet Allocation in R (Diploma Thesis)](/articles/Ponweiser2012.pdf). Vienna University of Economics and Business, Vienna.
131 | - Roberts, M., Stewart, B., Tingley, D. (n.d.). [stm: R Package for Structural Topic Models](/articles/Robertsnd.pdf), 1-49.
132 | - Sievert, C. & Shirley, K. E. (2014a). [LDAvis: A Method for Visualizing and Interpreting Topics.](/articles/Sievert2014a.pdf) *Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces* 63-70.
133 | - Sievert, C. & Shirley, K. E. (2014b). [Vignette: LDAvis details.](/articles/Sievert2014b.pdf) 1-5.
134 |
135 |
136 | ## Example Modeling
137 |
138 | - Awati, K. (2015). [A gentle introduction to topic modeling using R](https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/)
139 | - Dubins, M. (2013). [Topic Modeling in Python and R: A Rather Nosy Analysis of the Enron Email Corpus](https://dzone.com/articles/topic-modeling-python-and-r)
140 | - Goodrich, B. (2015) [Topic Modeling Twitter Using R](https://www.linkedin.com/pulse/topic-modeling-twitter-using-r-bryan-goodrich) [(CODE)](https://gist.githubusercontent.com/bryangoodrich/7b5ef683ce8db592669e/raw/3402e7390d10a0282dc0d6309ed4df9a4fb1cf5d/TwitterTopics.r)
141 | - inkhorn82 (2014). [A Delicious Analysis! (aka topic modelling using recipes)](http://rforwork.info/2014/02/17/a-delicious-analysis/) [(CODE)](https://gist.githubusercontent.com/inkhorn/9044779/raw/c7f0ba30d424aaeb75c5e221d12566f6732c4f29/recipe%20analysis.R)
142 | - Jockers, M.L. (2014). [Introduction to Text Analysis and Topic Modeling with R](http://www.matthewjockers.net/materials/dh-2014-introduction-to-text-analysis-and-topic-modeling-with-r/)
143 | - Medina, L. (2015). [Conspiracy Theories - Topic Modeling & Keyword Extraction](http://voidpatterns.org/2015/03/conspiracy-theories-topic-modeling-keyword-extraction/)
144 | - Sievert, C. (n.d.). [A topic model for movie reviews](http://cpsievert.github.io/LDAvis/reviews/reviews.html)
145 | - Sievert, C. (2014). [Topic Modeling In R](https://ropensci.org/blog/2014/04/16/topic-modeling-in-R/)
146 |
147 | # Topic Modeling R Demo
148 |
149 | ## topicmodels Package
150 |
151 | The .R script for this demonstration can be downloaded from [scripts/Example_topic_model_analysis.R](https://raw.githubusercontent.com/trinker/topicmodels_learning/master/scripts/Example_topic_model_analysis.R)
152 |
153 | ### Install/Load Tools & Data
154 |
155 | ```{r}
156 | if (!require("pacman")) install.packages("pacman")  # install pacman only when require() fails to load it
157 | pacman::p_load_gh("trinker/gofastr")
158 | pacman::p_load(tm, topicmodels, dplyr, tidyr, igraph, devtools, LDAvis, ggplot2)
159 |
160 | ## Source topicmodels2LDAvis & optimal_k functions
161 | invisible(lapply(  # invisible() keeps lapply()'s return list from being printed
162 | file.path(
163 | "https://raw.githubusercontent.com/trinker/topicmodels_learning/master/functions",
164 | c("topicmodels2LDAvis.R", "optimal_k.R")  # file.path() yields one URL per file name
165 | ),
166 | devtools::source_url
167 | ))
168 |
169 | data(presidential_debates_2012)  # NOTE(review): presumably a data set shipped with gofastr -- confirm
170 | ```
171 |
172 |
173 | ### Generate Stopwords
174 | ```{r}
175 | stops <- c(  # two tm stopword lists plus five debate-specific words
176 | tm::stopwords("english"),
177 | tm::stopwords("SMART"),
178 | "governor", "president", "mister", "obama","romney"
179 | ) %>%
180 | gofastr::prep_stopwords()  # NOTE(review): presumably normalises the combined vector -- confirm in gofastr docs
181 | ```
182 |
183 |
184 | ### Create the DocumentTermMatrix
185 |
186 | ```{r}
187 | doc_term_mat <- presidential_debates_2012 %>%
188 | with(gofastr::q_dtm_stem(dialogue, paste(person, time, sep = "_"))) %>%  # second argument is a "person_time" label; the plotting chunk later splits it on "_"
189 | gofastr::remove_stopwords(stops, stem=TRUE) %>%
190 | gofastr::filter_tf_idf() %>%
191 | gofastr::filter_documents()
192 | ```
193 |
194 | ### Control List
195 |
196 | ```{r}
197 | control <- list(burnin = 500, iter = 1000, keep = 100, seed = 2500)  # passed as `control` to optimal_k() and topicmodels::LDA() in later chunks
198 | ```
199 |
200 |
201 | ### Determine Optimal Number of Topics
202 |
203 | The plot below shows the harmonic mean of the log likelihoods against k (number of topics).
204 |
205 | ```{r, eval=FALSE}
206 | (k <- optimal_k(doc_term_mat, 40, control = control))  # outer parentheses print the assigned value
207 | ```
208 |
209 | ```{r, echo=FALSE}
210 | (k <- optimal_k(doc_term_mat, 40, control = control, drop.seed = FALSE))  # NOTE(review): drop.seed is defined in functions/optimal_k.R, which is not shown here -- confirm its effect
211 | ```
212 |
213 | It appears the optimal number of topics is ~k = `r as.numeric(k)`.
214 |
215 | ### Run the Model
216 |
217 | ```{r}
218 | control[["seed"]] <- 100  # overrides the seed of 2500 stored when the control list was created
219 | lda_model <- topicmodels::LDA(doc_term_mat, k=as.numeric(k), method = "Gibbs",
220 | control = control)
221 | ```
222 |
223 | ### Plot the Topics Per Person & Time
224 |
225 | ```{r, fig.width=10, fig.height=12}
226 | topics <- topicmodels::posterior(lda_model, doc_term_mat)[["topics"]]
227 | topic_dat <- dplyr::add_rownames(as.data.frame(topics), "Person_Time")
228 | colnames(topic_dat)[-1] <- apply(terms(lda_model, 10), 2, paste, collapse = ", ")  # name each topic column by its 10 terms joined with ", "
229 |
230 | tidyr::gather(topic_dat, Topic, Proportion, -c(Person_Time)) %>%
231 | tidyr::separate(Person_Time, c("Person", "Time"), sep = "_") %>%
232 | dplyr::mutate(Person = factor(Person,
233 | levels = c("OBAMA", "ROMNEY", "LEHRER", "SCHIEFFER", "CROWLEY", "QUESTION" ))
234 | ) %>%
235 | ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
236 | ggplot2::geom_bar() +
237 | ggplot2::coord_flip() +
238 | ggplot2::facet_grid(Person~Time) +
239 | ggplot2::guides(fill="none") +  # "none" is the non-deprecated spelling for dropping the fill legend
240 | ggplot2::ylab("Proportion")  # x is mapped to Topic; the weighted bar length (Proportion) is the y scale, so label y, not x
241 | ```
242 |
243 |
244 | ### Plot the Topics Matrix as a Heatmap
245 |
246 | ```{r}
247 | heatmap(topics, scale = "none")  # `topics` is the posterior topics matrix from the previous chunk; scale = "none" plots the values without rescaling
248 | ```
249 |
250 | ### Network of the Word Distributions Over Topics (Topic Relation)
251 |
252 | ```{r}
253 | post <- topicmodels::posterior(lda_model)
254 |
255 | cor_mat <- cor(t(post[["terms"]]))  # transposing first makes cor() correlate the rows of the terms matrix with each other
256 | cor_mat[ cor_mat < .05 ] <- 0  # zero out correlations below .05
257 | diag(cor_mat) <- 0  # remove self-correlations
258 |
259 | graph <- graph.adjacency(cor_mat, weighted=TRUE, mode="lower")
260 | graph <- delete.edges(graph, E(graph)[ weight < 0.05])  # drop edges whose weight is below 0.05
261 |
262 | E(graph)$edge.width <- E(graph)$weight*20  # edge width is 20 times the edge weight
263 | V(graph)$label <- paste("Topic", V(graph))
264 | V(graph)$size <- colSums(post[["topics"]]) * 15  # vertex size is 15 times the column sum of the topics matrix
265 |
266 | par(mar=c(0, 0, 3, 0))
267 | set.seed(110)  # NOTE(review): presumably fixes the randomised plot layout -- confirm
268 | plot.igraph(graph, edge.width = E(graph)$edge.width,
269 | edge.color = "orange", vertex.color = "orange",
270 | vertex.frame.color = NA, vertex.label.color = "grey30")
271 | title("Strength Between Topics Based On Word Probabilities", cex.main=.8)
272 | ```
273 |
274 |
275 | ### Network of the Topics Over Documents (Topic Relation)
276 |
277 | ```{r, fig.width=8, fig.height=8}
278 | minval <- .1  # edges with weight below this value are deleted
279 | topic_mat <- topicmodels::posterior(lda_model)[["topics"]]
280 |
281 | graph <- graph_from_incidence_matrix(topic_mat, weighted=TRUE)
282 | graph <- delete.edges(graph, E(graph)[ weight < minval])
283 |
284 | E(graph)$edge.width <- E(graph)$weight*17  # edge width is 17 times the edge weight
285 | E(graph)$color <- "blue"
286 | V(graph)$color <- ifelse(grepl("^\\d+$", V(graph)$name), "grey75", "orange")  # all-digit vertex names get grey75, all others orange
287 | V(graph)$frame.color <- NA
288 | V(graph)$label <- ifelse(grepl("^\\d+$", V(graph)$name), paste("topic", V(graph)$name), gsub("_", "\n", V(graph)$name))  # all-digit names are prefixed with "topic"; other names have "_" replaced by a newline
289 | V(graph)$size <- c(rep(10, nrow(topic_mat)), colSums(topic_mat) * 20)  # size 10 for the first nrow(topic_mat) vertices, then 20 times each column sum
290 | V(graph)$label.color <- ifelse(grepl("^\\d+$", V(graph)$name), "red", "grey30")
291 |
292 | par(mar=c(0, 0, 3, 0))
293 | set.seed(369)  # NOTE(review): presumably fixes the randomised plot layout -- confirm
294 | plot.igraph(graph, edge.width = E(graph)$edge.width,
295 | vertex.color = adjustcolor(V(graph)$color, alpha.f = .4))
296 | title("Topic & Document Relationships", cex.main=.8)
297 | ```
298 |
299 |
300 | ### LDAvis of Model
301 |
302 | The output from **LDAvis** is not easily embedded within an R markdown document; however, the reader may [see the results here](http://trinker.github.io/LDAvis/example/).
303 |
304 | ```{r, eval=FALSE}
305 | lda_model %>%
306 | topicmodels2LDAvis() %>%  # helper sourced from functions/topicmodels2LDAvis.R in the install/load chunk
307 | LDAvis::serVis()
308 | ```
309 |
310 | ```{r, echo=FALSE, message=FALSE, results="hide"}
311 | targ <- "C:/Users/Tyler/GitHub/trinker.github.com/LDAvis/example/lda.json"  # NOTE(review): machine-specific absolute path; this chunk only works on the author's machine
312 | unlink(targ,,TRUE)  # second argument left empty; the third positional argument is unlink()'s `force`
313 | temp <- tempfile()
314 |
315 | lda_model %>%
316 | topicmodels2LDAvis() %>%
317 | LDAvis::serVis(temp, open.browser = FALSE) %>%
318 | invisible()
319 |
320 | file.copy(file.path(temp, "lda.json"), pathr::parse_path(targ) %>% pathr::front())  # NOTE(review): presumably copies into targ's parent directory -- confirm pathr::front() semantics
321 | pathr::open_path("C:/Users/Tyler/GitHub/trinker.github.com/trinker.github.com.Rproj")
322 | ```
323 |
324 | ### Apply Model to New Data
325 |
326 | ```{r, eval=FALSE}
327 | ## Create the DocumentTermMatrix for New Data
328 | doc_term_mat2 <- partial_republican_debates_2015 %>%
329 | with(gofastr::q_dtm_stem(dialogue, paste(person, location, sep = "_"))) %>%
330 | gofastr::remove_stopwords(stops, stem=TRUE) %>%
331 | gofastr::filter_tf_idf() %>%
332 | gofastr::filter_documents()
333 |
334 |
335 | ## Update Control List
336 | control2 <- control
337 | control2[["estimate.beta"]] <- FALSE  # NOTE(review): control2 is never passed to LDA() below, which builds its own list -- confirm which is intended
338 |
339 |
340 | ## Run the Model for New Data
341 | lda_model2 <- topicmodels::LDA(doc_term_mat2, k = as.numeric(k), model = lda_model,
342 | control = list(seed = 100, estimate.beta = FALSE))
343 |
344 |
345 | ## Plot the Topics Per Person & Location for New Data
346 | topics2 <- topicmodels::posterior(lda_model2, doc_term_mat2)[["topics"]]
347 | topic_dat2 <- dplyr::add_rownames(as.data.frame(topics2), "Person_Location")
348 | colnames(topic_dat2)[-1] <- apply(terms(lda_model2, 10), 2, paste, collapse = ", ")
349 |
350 | tidyr::gather(topic_dat2, Topic, Proportion, -c(Person_Location)) %>%
351 | tidyr::separate(Person_Location, c("Person", "Location"), sep = "_") %>%
352 | ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
353 | ggplot2::geom_bar() +
354 | ggplot2::coord_flip() +
355 | ggplot2::facet_grid(Person~Location) +
356 | ggplot2::guides(fill="none") +
357 | ggplot2::ylab("Proportion")  # x is mapped to Topic; the weighted bar length (Proportion) is the y scale, so label y, not x
358 |
359 |
360 | ## LDAvis of Model for New Data
361 | lda_model2 %>%
362 | topicmodels2LDAvis() %>%
363 | LDAvis::serVis()
364 | ```
365 |
366 | # Contributing
367 |
368 | You are welcome to:
369 | * submit suggestions and bug-reports at:
370 | * send a pull request on:
371 | * compose a friendly e-mail to:
372 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Topic Models Learning and R Resources [](https://twitter.com/intent/follow?screen_name=tylerrinker)
2 | ============
3 |
4 |
5 | This is a collection documenting the resources I find related to topic
6 | models with an R flavored focus. A *topic model* is a type of
7 | [*generative*](http://stackoverflow.com/questions/879432/what-is-the-difference-between-a-generative-and-discriminative-algorithm)
8 | model used to "discover" latent topics that compose a *corpus* or
9 | collection of documents. Typically topic modeling is used on a
10 | collection of text documents but can be used for other modes including
11 | use as caption generation for images.
12 |
13 | 
14 |
15 |
16 | Table of Contents
17 | ============
18 |
19 | - [Just the Essentials](#just-the-essentials)
20 | - [Key Players](#key-players)
21 | - [Videos](#videos)
22 | - [Introductory](#introductory)
23 | - [Theory](#theory)
24 | - [Visualization](#visualization)
25 | - [Articles](#articles)
26 | - [Applied](#applied)
27 | - [Theoretical](#theoretical)
28 | - [Websites & Blogs](#websites--blogs)
29 | - [R Resources](#r-resources)
30 | - [Package Comparisons](#package-comparisons)
31 | - [R Specific References](#r-specific-references)
32 | - [Example Modeling](#example-modeling)
33 | - [Topic Modeling R Demo](#topic-modeling-r-demo)
34 | - [topicmodels Package](#topicmodels-package)
35 | - [Contributing](#contributing)
36 |
37 | Just the Essentials
38 | ============
39 |
40 |
41 | This is my run down of the minimal readings, websites, videos, & scripts
42 | the reader needs to become familiar with topic modeling. The list is in
43 | an order I believe will be of greatest use and contains a nice mix of
44 | introduction, theory, application, and interpretation. As you want to
45 | learn more about topic modeling, the other sections will become more
46 | useful.
47 |
48 | 1. Boyd-Graber, J. (2013). [Computational Linguistics I: Topic
49 | Modeling](https://www.youtube.com/watch?v=4p9MSJy761Y)
50 | 2. Underwood, T. (2012). [Topic Modeling Made Just Simple
51 | Enough](http://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/)
52 | 3. Weingart, S. (2012). [Topic Modeling for Humanists: A Guided
53 | Tour](http://www.scottbot.net/HIAL/?p=19113)
54 | 4. Blei, D. M. (2012). [Probabilistic topic
55 | models](/articles/Blei2012.pdf). *Communications of the ACM, 55*(4),
56 | 77-84.
57 | 5. inkhorn82 (2014). [A Delicious Analysis! (aka topic modelling using
58 | recipes)](http://rforwork.info/2014/02/17/a-delicious-analysis/)
59 | [(CODE)](https://gist.githubusercontent.com/inkhorn/9044779/raw/c7f0ba30d424aaeb75c5e221d12566f6732c4f29/recipe%20analysis.R)
60 | 6. Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for
61 | Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of
62 | Statistical Software, 40*(13), 1-30.
63 | 7. Marwick, B. (2014a). [The input parameters for using latent
64 | Dirichlet allocation](http://stats.stackexchange.com/a/25128/7482)
65 | 8. Tang, J., Meng, Z., Nguyen, X., Mei, Q., & Zhang, M. (2014).
66 | [Understanding the limiting factors of topic modeling via posterior
67 | contraction analysis](/articles/Tang2014.pdf). In *31st
68 | International Conference on Machine Learning*, 190-198.
69 | 9. Sievert, C. (2014). [LDAvis: A method for visualizing and
70 | interpreting topic
71 | models](https://www.youtube.com/watch?v=IksL96ls4o0)
72 | 10. Rhody, L. M. (2012). [Some Assembly Required: Understanding and Interpreting Topics in LDA Models of Figurative Language](http://www.lisarhody.com/some-assembly-required)
73 | 11. Rinker, T.W. (2015). [R Script: Example Topic Model
74 | Analysis](https://raw.githubusercontent.com/trinker/topicmodels_learning/master/scripts/Example_topic_model_analysis.R)
75 |
76 | Key Players
77 | ===========
78 |
79 | Papadimitriou, Raghavan, Tamaki, & Vempala (1997) first
80 | introduced the notion of topic modeling in their ["Latent Semantic
81 | Indexing: A probabilistic analysis"](/articles/Papadimitriou1997.pdf).
82 | Thomas Hofmann (1999) developed "Probabilistic latent semantic
83 | indexing". Blei, Ng, & Jordan (2003) proposed *latent Dirichlet
84 | allocation* (LDA) as a means of modeling documents with multiple topics,
85 | but LDA assumes the topics are uncorrelated. Blei & Lafferty (2007) proposed
86 | the *correlated topic model* (CTM), extending LDA to allow for correlations
87 | between topics. Roberts, Stewart, Tingley, & Airoldi (2013) proposed the
88 | [*Structural Topic Model*](/articles/Roberts2013.pdf) (STM), allowing
89 | the inclusion of meta-data in the modeling process.
90 |
91 | Videos
92 | ======
93 |
94 | Introductory
95 | ------------
96 |
97 | - Boyd-Graber, J. (2013). [Computational Linguistics I: Topic
98 | Modeling](https://www.youtube.com/watch?v=4p9MSJy761Y)
99 |
100 | Theory
101 | ------
102 |
103 | - Blei, D. (2007) [Modeling Science: Dynamic Topic Models of Scholarly
104 | Research](https://www.youtube.com/watch?v=7BMsuyBPx90)
105 | - Blei, D. (2009) [Topic Models: Parts I &
106 | II](http://videolectures.net/mlss09uk_blei_tm/#) ([Lecture
107 | Notes](/presentations/Blei2009.pdf))
108 | - Jordan, M. (2014) [A Short History of Topic
109 | Models](https://www.youtube.com/watch?v=fBNsHPtTAGs)
110 |
111 | Visualization
112 | -------------
113 |
114 | - Sievert, C. (2014) [LDAvis: A method for visualizing and
115 | interpreting topic
116 | models](https://www.youtube.com/watch?v=IksL96ls4o0)
117 | - Mabey, B. (2015) [SavvySharpa: Visualizing Topic
118 | Models](https://www.youtube.com/watch?v=tGxW2BzC_DU)
119 |
120 | Articles
121 | ========
122 |
123 | Applied
124 | -------
125 |
126 | - Marwick, B. (2013). [Discovery of Emergent Issues and Controversies in Anthropology Using Text Mining, Topic Modeling, and Social Network Analysis of Microblog Content](https://www.academia.edu/5508141/Discovery_of_Emergent_Issues_and_Controversies_in_Anthropology_Using_Text_Mining_Topic_Modeling_and_Social_Network_Analysis_of_Microblog_Content). In Yanchang Zhao, Yonghua Cen (eds) Data Mining Applications with R. Elsevier. pp. 63-93.
127 |
128 |
129 | - Newman, D.J. & Block, S. (2006). [Probabilistic topic decomposition of
130 | an eighteenth-century American newspaper](/articles/Newman2006.pdf).
131 | *Journal of the American Society for Information Science and Technology.
132 | 57*(6), 753-767.
133 |
134 | Theoretical
135 | -----------
136 |
137 | - Blei, D. M. (2012). [Probabilistic topic
138 | models](/articles/Blei2012.pdf). *Communications of the ACM, 55*(4),
139 | 77-84.
140 | - Blei, D. M. & Lafferty, J. D. (2007) [A correlated topic model of
141 | Science](/articles/Blei2007.pdf). *The Annals of Applied Statistics
142 | 1*(1), 17-35.
143 | - Blei, D. M. & Lafferty, J. D. (2009) [Topic
144 | models](/articles/Blei2009.pdf). In A Srivastava, M Sahami (eds.),
145 | [*Text mining: classification, clustering, and
146 | applications*](/articles/Srivastava2009.pdf). Chapman & Hall/CRC
147 | Press. 71-93.
148 | - Blei, D. M. & McAuliffe, J. (2008). [Supervised topic
149 | models](/articles/Blei2008.pdf). In Advances in Neural Information
150 | Processing Systems 20, 1-8.
151 | - Blei, D. M., Ng, A.Y., & Jordan, M.I. (2003). [Latent Dirichlet
152 | Allocation](/articles/Blei2003.pdf). *Journal of Machine Learning
153 | Research, 3*, 993-1022.
154 | - Chang, J., Boyd-Graber, J., Wang, C., Gerrish, S., & Blei, D.
155 | (2009). [Reading tea leaves: How humans interpret topic
156 | models](/articles/Chang2009.pdf). In *Neural Information Processing
157 | Systems*.
158 | - Griffiths, T.L. & Steyvers, M. (2004). [Finding Scientific
159 | Topics](/articles/Griffiths2004.pdf). Proceedings of the National
160 | Academy of Sciences of the United States of America, 101, 5228-5235.
161 | - Griffiths, T.L., Steyvers, M., & Tenenbaum, J.B.T. (2007). [Topics
162 | in Semantic Representation](/articles/Griffiths2007.pdf).
163 | *Psychological Review, 114*(2), 211-244.
164 | - Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for
165 | Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of
166 | Statistical Software, 40*(13), 1-30.
167 | - Mimno, D. & McCallum, A. (2007). [Organizing the OCA: learning
168 | faceted subjects from a library of digital
169 | books](/articles/Mimno2007.pdf). In *Joint Conference on Digital
170 | Libraries*. ACM Press, New York, NY, 376–385.
171 | - Ponweiser, M. (2012). [Latent Dirichlet Allocation in R (Diploma
172 | Thesis)](/articles/Ponweiser2012.pdf). Vienna University of
173 | Economics and Business, Vienna
174 | - Roberts M.E., Stewart B.M., Tingley D., & Airoldi E.M. (2013) [The
175 | Structural Topic Model and Applied Social
176 | Science](/articles/Roberts2013.pdf). *Advances in Neural Information
177 | Processing Systems Workshop on Topic Models: Computation,
178 | Application, and Evaluation*, 1-4.
179 | - Roberts, M., Stewart, B., Tingley, D., Lucas, C., Leder-Luis, J.,
180 | Gadarian, S., Albertson, B., et al. (2014). [Structural topic models
181 | for open ended survey responses](/articles/Roberts2014.pdf).
182 | *American Journal of Political Science,
183 | 58*(4), 1064-1082.
184 | - Roberts, M., Stewart, B., Tingley, D. (n.d.). [stm: R Package for
185 | Structural Topic Models](/articles/Robertsnd.pdf), 1-49.
186 | - Sievert, C. & Shirley, K. E. (2014a). [LDAvis: A Method for
187 | Visualizing and Interpreting Topics.](/articles/Sievert2014a.pdf) in
188 | *Proceedings of the Workshop on Interactive Language Learning,
189 | Visualization, and Interfaces* 63-70.
190 | - Steyvers, M. & Griffiths, T. (2007). [Probabilistic topic
191 | models](/articles/Steyvers2007.pdf). In T. Landauer, D McNamara, S.
192 | Dennis, and W. Kintsch (eds), *Latent Semantic Analysis: A Road to
193 | Meaning*. Lawrence Erlbaum.
194 | - Taddy, M.A. (2012). [On Estimation and Selection for Topic
195 | Models](/articles/Taddy2012.pdf) In *Proceedings of the 15th
196 | International Conference on Artificial Intelligence and Statistics
197 | (AISTATS 2012)*, 1184-1193.
198 | - Tang, J., Meng, Z., Nguyen, X., Mei, Q., & Zhang, M. (2014).
199 | [Understanding the limiting factors of topic modeling via posterior
200 | contraction analysis](/articles/Tang2014.pdf). In *31st
201 | International Conference on Machine Learning*, 190-198.
202 |
203 | Websites & Blogs
204 | ================
205 |
206 | - Blei, D. (n.d.). [Topic
207 | Modeling](https://www.cs.princeton.edu/~blei/topicmodeling.html)
208 | - Jockers, M.L. (2013). ["Secret" Recipe for Topic Modeling
209 | Themes](http://www.matthewjockers.net/2013/04/12/secret-recipe-for-topic-modeling-themes/)
210 | - Jones, T. (n.d.). [Topic Models Reading
211 | List](http://www.biasedestimates.com/p/topic-models-reading-list.html)
212 | - Marwick, B. (2014a). [The input parameters for using latent
213 | Dirichlet allocation](http://stats.stackexchange.com/a/25128/7482)
214 | - Marwick, B. (2014b). [Topic models: cross validation with
215 | loglikelihood or
216 | perplexity](http://stackoverflow.com/a/21394092/1000343)
217 | - Rhody, L. M. (2012). [Some Assembly Required: Understanding and Interpreting
218 | Topics in LDA Models of Figurative Language](http://www.lisarhody.com/some-assembly-required)
219 | - Schmidt, B.M. (2012). [Words Alone: Dismantling Topic Models in the
220 | Humanities](http://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/)
221 | - Underwood, T. (2012a). [Topic Modeling Made Just Simple
222 | Enough](http://tedunderwood.com/2012/04/07/topic-modeling-made-just-simple-enough/)
223 | - Underwood, T. (2012b). [What kinds of "topics" does topic modeling
224 | actually
225 | produce?](http://tedunderwood.com/2012/04/01/what-kinds-of-topics-does-topic-modeling-actually-produce/)
226 | - Weingart, S. (2012). [Topic Modeling for Humanists: A Guided
227 | Tour](http://www.scottbot.net/HIAL/?p=19113)
228 | - Weingart, S. (2011). [Topic Modeling and Network
229 | Analysis](http://www.scottbot.net/HIAL/?p=221)
230 |
231 | R Resources
232 | ===========
233 |
234 | Package Comparisons
235 | -------------------
236 |
237 |
238 |
239 |
246 |
247 |
248 |
249 | lda* |
250 | Collapsed Gibbs for LDA |
251 | Graphing utilities |
252 | Chang |
253 | R |
254 |
255 |
256 | topicmodels |
257 | LDA and CTM |
258 | Follows Blei's implementation; great vignette; takes DTM |
259 | Grüen & Hornik |
260 | C |
261 |
262 |
263 | stm |
264 | Model w/ meta-data |
265 | Great documentation; nice visualization |
266 | Roberts, Stewart, & Tingley |
267 | C |
268 |
269 |
270 | LDAvis |
271 | Interactive visualization |
272 | Aids in model interpretation |
273 | Sievert & Shirley |
274 | R + Shiny |
275 |
276 |
277 | mallet** |
278 | LDA |
279 | MALLET is well known |
280 | Mimno |
281 | Java |
282 |
283 |
284 |
285 |
286 | \*[*StackExchange discussion of lda vs.
287 | topicmodels*](http://stats.stackexchange.com/questions/24441/two-r-packages-for-topic-modeling-lda-and-topicmodels)
288 | \*\*[*Setting Up
289 | MALLET*](http://programminghistorian.org/lessons/topic-modeling-and-mallet)
290 |
291 | R Specific References
292 | ---------------------
293 |
294 | - Chang, J. (2010). lda: Collapsed Gibbs Sampling Methods for Topic
295 | Models. <https://CRAN.R-project.org/package=lda>
296 | - Grüen, B. & Hornik, K. (2011). [topicmodels: An R Package for
297 | Fitting Topic Models.](/articles/Gruen2011.pdf). *Journal of
298 | Statistical Software, 40*(13), 1-30.
299 | - Mimno, D. (2013). [vignette-mallet: A wrapper around the Java
300 | machine learning tool MALLET](/articles/Mimno2013.Rmd).
301 |
302 | - Ponweiser, M. (2012). [Latent Dirichlet Allocation in R (Diploma
303 | Thesis)](/articles/Ponweiser2012.pdf). Vienna University of
304 | Economics and Business, Vienna.
305 | - Roberts, M., Stewart, B., Tingley, D. (n.d.). [stm: R Package for
306 | Structural Topic Models](/articles/Robertsnd.pdf), 1-49.
307 | - Sievert, C. & Shirley, K. E. (2014a). [LDAvis: A Method for
308 | Visualizing and Interpreting Topics.](/articles/Sievert2014a.pdf) *Proceedings
309 | of the Workshop on Interactive Language Learning, Visualization, and
310 | Interfaces* 63-70.
311 | - Sievert, C. & Shirley, K. E. (2014b). [Vignette: LDAvis
312 | details.](/articles/Sievert2014b.pdf) 1-5.
313 |
314 | Example Modeling
315 | ----------------
316 |
317 | - Awati, K. (2015). [A gentle introduction to topic modeling using
318 | R](https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/)
319 | - Dubins, M. (2013). [Topic Modeling in Python and R: A Rather Nosy
320 | Analysis of the Enron Email
321 | Corpus](https://dzone.com/articles/topic-modeling-python-and-r)
322 | - Goodrich, B. (2015) [Topic Modeling Twitter Using
323 | R](https://www.linkedin.com/pulse/topic-modeling-twitter-using-r-bryan-goodrich)
324 | [(CODE)](https://gist.githubusercontent.com/bryangoodrich/7b5ef683ce8db592669e/raw/3402e7390d10a0282dc0d6309ed4df9a4fb1cf5d/TwitterTopics.r)
325 | - inkhorn82 (2014). [A Delicious Analysis! (aka topic modelling using
326 | recipes)](http://rforwork.info/2014/02/17/a-delicious-analysis/)
327 | [(CODE)](https://gist.githubusercontent.com/inkhorn/9044779/raw/c7f0ba30d424aaeb75c5e221d12566f6732c4f29/recipe%20analysis.R)
328 | - Jockers, M.L. (2014). [Introduction to Text Analysis and Topic
329 | Modeling with
330 | R](http://www.matthewjockers.net/materials/dh-2014-introduction-to-text-analysis-and-topic-modeling-with-r/)
331 | - Medina, L. (2015). [Conspiracy Theories - Topic Modeling & Keyword
332 | Extraction](http://voidpatterns.org/2015/03/conspiracy-theories-topic-modeling-keyword-extraction/)
333 | - Sievert, C. (n.d.). [A topic model for movie
334 | reviews](http://cpsievert.github.io/LDAvis/reviews/reviews.html)
335 | - Sievert, C. (2014). [Topic Modeling In R](https://ropensci.org/blog/2014/04/16/topic-modeling-in-R/)
336 |
337 | Topic Modeling R Demo
338 | =====================
339 |
340 | topicmodels Package
341 | -------------------
342 |
343 | The .R script for this demonstration can be downloaded from
344 | [scripts/Example\_topic\_model\_analysis.R](https://raw.githubusercontent.com/trinker/topicmodels_learning/master/scripts/Example_topic_model_analysis.R)
345 |
346 | ### Install/Load Tools & Data
347 |
348 | if (!require("pacman")) install.packages("pacman")
349 | pacman::p_load_gh("trinker/gofastr")
350 | pacman::p_load(tm, topicmodels, dplyr, tidyr, igraph, devtools, LDAvis, ggplot2)
351 |
352 | ## Source topicmodels2LDAvis & optimal_k functions
353 | invisible(lapply(
354 | file.path(
355 | "https://raw.githubusercontent.com/trinker/topicmodels_learning/master/functions",
356 | c("topicmodels2LDAvis.R", "optimal_k.R")
357 | ),
358 | devtools::source_url
359 | ))
360 |
361 | ## SHA-1 hash of file is 5ac52af21ce36dfe8f529b4fe77568ced9307cf0
362 | ## SHA-1 hash of file is 7f0ab64a94948c8b60ba29dddf799e3f6c423435
363 |
364 | data(presidential_debates_2012)
365 |
366 | ### Generate Stopwords
367 |
368 | stops <- c(
369 | tm::stopwords("english"),
370 | tm::stopwords("SMART"),
371 | "governor", "president", "mister", "obama","romney"
372 | ) %>%
373 | gofastr::prep_stopwords()
374 |
375 | ### Create the DocumentTermMatrix
376 |
377 | doc_term_mat <- presidential_debates_2012 %>%
378 | with(gofastr::q_dtm_stem(dialogue, paste(person, time, sep = "_"))) %>%
379 | gofastr::remove_stopwords(stops, stem=TRUE) %>%
380 | gofastr::filter_tf_idf() %>%
381 | gofastr::filter_documents()
382 |
383 | ### Control List
384 |
385 | control <- list(burnin = 500, iter = 1000, keep = 100, seed = 2500)
386 |
387 | ### Determine Optimal Number of Topics
388 |
389 | The plot below shows the harmonic mean of the log likelihoods against k
390 | (number of topics).
391 |
392 | (k <- optimal_k(doc_term_mat, 40, control = control))
393 |
394 | ##
395 | ## Grab a cup of coffee this could take a while...
396 |
397 | ## 10 of 40 iterations (Current: 08:54:32; Elapsed: .2 mins)
398 | ## 20 of 40 iterations (Current: 08:55:07; Elapsed: .8 mins; Remaining: ~2.3 mins)
399 | ## 30 of 40 iterations (Current: 08:56:03; Elapsed: 1.7 mins; Remaining: ~1.3 mins)
400 | ## 40 of 40 iterations (Current: 08:57:30; Elapsed: 3.2 mins; Remaining: ~0 mins)
401 | ## Optimal number of topics = 20
402 |
403 | 
404 |
405 | It appears the optimal number of topics is ~k = 20.
406 |
407 | ### Run the Model
408 |
409 | control[["seed"]] <- 100
410 | lda_model <- topicmodels::LDA(doc_term_mat, k=as.numeric(k), method = "Gibbs",
411 | control = control)
412 |
413 | ### Plot the Topics Per Person & Time
414 |
415 | topics <- topicmodels::posterior(lda_model, doc_term_mat)[["topics"]]
416 | topic_dat <- dplyr::add_rownames(as.data.frame(topics), "Person_Time")
417 | colnames(topic_dat)[-1] <- apply(terms(lda_model, 10), 2, paste, collapse = ", ")
418 |
419 | tidyr::gather(topic_dat, Topic, Proportion, -c(Person_Time)) %>%
420 | tidyr::separate(Person_Time, c("Person", "Time"), sep = "_") %>%
421 | dplyr::mutate(Person = factor(Person,
422 | levels = c("OBAMA", "ROMNEY", "LEHRER", "SCHIEFFER", "CROWLEY", "QUESTION" ))
423 | ) %>%
424 | ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
425 | ggplot2::geom_bar() +
426 | ggplot2::coord_flip() +
427 | ggplot2::facet_grid(Person~Time) +
428 | ggplot2::guides(fill=FALSE) +
429 | ggplot2::xlab("Proportion")
430 |
431 | 
432 |
433 | ### Plot the Topics Matrix as a Heatmap
434 |
435 | heatmap(topics, scale = "none")
436 |
437 | 
438 |
439 | ### Network of the Word Distributions Over Topics (Topic Relation)
440 |
441 | post <- topicmodels::posterior(lda_model)
442 |
443 | cor_mat <- cor(t(post[["terms"]]))
444 | cor_mat[ cor_mat < .05 ] <- 0
445 | diag(cor_mat) <- 0
446 |
447 | graph <- graph.adjacency(cor_mat, weighted=TRUE, mode="lower")
448 | graph <- delete.edges(graph, E(graph)[ weight < 0.05])
449 |
450 | E(graph)$edge.width <- E(graph)$weight*20
451 | V(graph)$label <- paste("Topic", V(graph))
452 | V(graph)$size <- colSums(post[["topics"]]) * 15
453 |
454 | par(mar=c(0, 0, 3, 0))
455 | set.seed(110)
456 | plot.igraph(graph, edge.width = E(graph)$edge.width,
457 | edge.color = "orange", vertex.color = "orange",
458 | vertex.frame.color = NA, vertex.label.color = "grey30")
459 | title("Strength Between Topics Based On Word Probabilities", cex.main=.8)
460 |
461 | 
462 |
463 | ### Network of the Topics Over Documents (Topic Relation)
464 |
465 | minval <- .1
466 | topic_mat <- topicmodels::posterior(lda_model)[["topics"]]
467 |
468 | graph <- graph_from_incidence_matrix(topic_mat, weighted=TRUE)
469 | graph <- delete.edges(graph, E(graph)[ weight < minval])
470 |
471 | E(graph)$edge.width <- E(graph)$weight*17
472 | E(graph)$color <- "blue"
473 | V(graph)$color <- ifelse(grepl("^\\d+$", V(graph)$name), "grey75", "orange")
474 | V(graph)$frame.color <- NA
475 | V(graph)$label <- ifelse(grepl("^\\d+$", V(graph)$name), paste("topic", V(graph)$name), gsub("_", "\n", V(graph)$name))
476 | V(graph)$size <- c(rep(10, nrow(topic_mat)), colSums(topic_mat) * 20)
477 | V(graph)$label.color <- ifelse(grepl("^\\d+$", V(graph)$name), "red", "grey30")
478 |
479 | par(mar=c(0, 0, 3, 0))
480 | set.seed(369)
481 | plot.igraph(graph, edge.width = E(graph)$edge.width,
482 | vertex.color = adjustcolor(V(graph)$color, alpha.f = .4))
483 | title("Topic & Document Relationships", cex.main=.8)
484 |
485 | 
486 |
487 | ### LDAvis of Model
488 |
489 | The output from **LDAvis** is not easily embedded within an R markdown
490 | document; however, the reader may [see the results
491 | here](http://trinker.github.io/LDAvis/example/).
492 |
493 | lda_model %>%
494 | topicmodels2LDAvis() %>%
495 | LDAvis::serVis()
496 |
497 | ### Apply Model to New Data
498 |
499 | ## Create the DocumentTermMatrix for New Data
500 | doc_term_mat2 <- partial_republican_debates_2015 %>%
501 | with(gofastr::q_dtm_stem(dialogue, paste(person, location, sep = "_"))) %>%
502 | gofastr::remove_stopwords(stops, stem=TRUE) %>%
503 | gofastr::filter_tf_idf() %>%
504 | gofastr::filter_documents()
505 |
506 |
507 | ## Update Control List
508 | control2 <- control
509 | control2[["estimate.beta"]] <- FALSE
510 |
511 |
512 | ## Run the Model for New Data
513 | lda_model2 <- topicmodels::LDA(doc_term_mat2, k = k, model = lda_model,
514 | control = list(seed = 100, estimate.beta = FALSE))
515 |
516 |
517 | ## Plot the Topics Per Person & Location for New Data
518 | topics2 <- topicmodels::posterior(lda_model2, doc_term_mat2)[["topics"]]
519 | topic_dat2 <- dplyr::add_rownames(as.data.frame(topics2), "Person_Location")
520 | colnames(topic_dat2)[-1] <- apply(terms(lda_model2, 10), 2, paste, collapse = ", ")
521 |
522 | tidyr::gather(topic_dat2, Topic, Proportion, -c(Person_Location)) %>%
523 | tidyr::separate(Person_Location, c("Person", "Location"), sep = "_") %>%
524 | ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
525 | ggplot2::geom_bar() +
526 | ggplot2::coord_flip() +
527 | ggplot2::facet_grid(Person~Location) +
528 | ggplot2::guides(fill=FALSE) +
529 | ggplot2::xlab("Proportion")
530 |
531 |
532 | ## LDAvis of Model for New Data
533 | lda_model2 %>%
534 | topicmodels2LDAvis() %>%
535 | LDAvis::serVis()
536 |
537 | Contributing
538 | ============
539 |
540 | You are welcome to:
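541 | * submit suggestions and bug-reports at: <https://github.com/trinker/topicmodels_learning/issues>
542 | * send a pull request on: <https://github.com/trinker/topicmodels_learning/>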
543 | * compose a friendly e-mail to:
544 |
--------------------------------------------------------------------------------
/articles/Blei2003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Blei2003.pdf
--------------------------------------------------------------------------------
/articles/Blei2007.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Blei2007.pdf
--------------------------------------------------------------------------------
/articles/Blei2008.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Blei2008.pdf
--------------------------------------------------------------------------------
/articles/Blei2009.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Blei2009.pdf
--------------------------------------------------------------------------------
/articles/Blei2012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Blei2012.pdf
--------------------------------------------------------------------------------
/articles/Chang2009.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Chang2009.pdf
--------------------------------------------------------------------------------
/articles/Griffiths2004.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Griffiths2004.pdf
--------------------------------------------------------------------------------
/articles/Griffiths2007.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Griffiths2007.pdf
--------------------------------------------------------------------------------
/articles/Gruen2011.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Gruen2011.pdf
--------------------------------------------------------------------------------
/articles/Mimno2007.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Mimno2007.pdf
--------------------------------------------------------------------------------
/articles/Mimno2013.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to R mallet"
3 | author: "David Mimno"
4 | date: "`r Sys.Date()`"
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 | %\VignetteIndexEntry{mallet}
8 | %\VignetteEngine{knitr::rmarkdown}
9 | %\VignetteEncoding{UTF-8}
10 | ---
11 |
12 | ## Installation
13 |
14 | The ```mallet``` R package is available on CRAN. To install, simply use ```install.packages()```
15 |
16 | ```{r, eval=FALSE}
17 | install.packages("mallet")
18 | ```
19 |
20 | To load the package, simply use ```library()```.
21 |
22 | ```{r}
23 | library(mallet)
24 | ```
25 |
26 |
27 | ## Usage
28 |
29 | We start out by using the example data from the ```tm``` package.
30 |
31 | ```{r}
32 | library(tm)
33 | reut21578 <- system.file("texts", "crude", package = "tm")
34 | reuters <- VCorpus(DirSource(reut21578), readerControl = list(reader = readReut21578XMLasPlain))
35 | reuters_text_vector <- unlist(lapply(reuters, as.character))
36 | ```
37 |
38 | We can also use the stopword file from the ```tm``` package.
39 |
40 | ```{r}
41 | stopwords_en <- system.file("stopwords/english.dat", package = "tm")
42 | ```
43 |
44 | Create a mallet instance list object. Right now I have to specify the stoplist as a file, I can't pass in a list from R.
45 | This function has a few hidden options (whether to lowercase, how we define a token). See ```?mallet.import``` for details.
46 |
47 | ```{r}
48 | mallet.instances <- mallet.import(id.array = as.character(1:length(reuters_text_vector)),
49 | text.array = reuters_text_vector,
50 | stoplist.file = stopwords_en,
51 | token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
52 | ```
53 |
54 | Create a topic trainer object.
55 |
56 | ```{r}
57 | topic.model <- MalletLDA(num.topics=5, alpha.sum = 1, beta = 0.1)
58 | ```
59 |
60 | Load our documents. We could also pass in the filename of a saved instance list file that we build from the command-line tools.
61 |
62 | ```{r}
63 | topic.model$loadDocuments(mallet.instances)
64 | ```
65 |
66 | Get the vocabulary, and some statistics about word frequencies. These may be useful in further curating the stopword list.
67 |
68 | ```{r}
69 | vocabulary <- topic.model$getVocabulary()
70 | head(vocabulary)
71 |
72 | word.freqs <- mallet.word.freqs(topic.model)
73 | head(word.freqs)
74 | ```
75 |
76 | Get the vocabulary, and some statistics about word frequencies. These may be useful in further curating the stopword list.
77 |
78 | ```{r}
79 | vocabulary <- topic.model$getVocabulary()
80 | head(vocabulary)
81 |
82 | word.freqs <- mallet.word.freqs(topic.model)
83 | head(word.freqs)
84 | ```
85 |
86 |
87 | Optimize hyperparameters every 20 iterations, after 50 burn-in iterations.
88 |
89 | ```{r}
90 | topic.model$setAlphaOptimization(20, 50)
91 | ```
92 |
93 | Now train a model. Note that hyperparameter optimization is on, by default. We can specify the number of iterations. Here we'll use a large-ish round number.
94 |
95 | ```{r}
96 | topic.model$train(200)
97 | ```
98 |
99 | **NEW** Run through a few iterations where we pick the best topic for each token, rather than sampling from the posterior distribution.
100 |
101 | ```{r}
102 | topic.model$maximize(10)
103 | ```
104 |
105 | Get the probability of topics in documents and the probability of words in topics. By default, these functions return raw word counts. Here we want probabilities, so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
106 |
107 | ```{r}
108 | doc.topics <- mallet.doc.topics(topic.model, smoothed=TRUE, normalized=TRUE)
109 | topic.words <- mallet.topic.words(topic.model, smoothed=TRUE, normalized=TRUE)
110 | ```
111 |
112 | What are the top words in topic 2? Notice that R indexes from 1 and Java from 0, so this will be the topic that mallet called topic 1.
113 |
114 | ```{r}
115 | mallet.top.words(topic.model, word.weights = topic.words[2,], num.top.words = 5)
116 | ```
117 |
118 | Show the first document with at least 5% tokens belonging to topic 1.
119 |
120 | ```{r}
121 | inspect(reuters[doc.topics[,1] > 0.05][1])
122 | ```
123 |
124 | How do topics differ across different sub-corpora?
125 |
126 | ```{r}
127 | usa_articles <- unlist(meta(reuters, "places")) == "usa"
128 |
129 | usa.topic.words <- mallet.subset.topic.words(topic.model,
130 | subset.docs = usa_articles,
131 | smoothed=TRUE,
132 | normalized=TRUE)
133 | other.topic.words <- mallet.subset.topic.words(topic.model,
134 | subset.docs = !usa_articles,
135 | smoothed=TRUE,
136 | normalized=TRUE)
137 | ```
138 |
139 | How do they compare?
140 |
141 | ```{r}
142 | head(mallet.top.words(topic.model, usa.topic.words[1,]))
143 | head(mallet.top.words(topic.model, other.topic.words[1,]))
144 | ```
145 |
--------------------------------------------------------------------------------
/articles/Newman2006.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Newman2006.pdf
--------------------------------------------------------------------------------
/articles/Papadimitriou1997.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Papadimitriou1997.pdf
--------------------------------------------------------------------------------
/articles/Ponweiser2012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Ponweiser2012.pdf
--------------------------------------------------------------------------------
/articles/Roberts2013.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Roberts2013.pdf
--------------------------------------------------------------------------------
/articles/Roberts2014.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Roberts2014.pdf
--------------------------------------------------------------------------------
/articles/Robertsnd.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Robertsnd.pdf
--------------------------------------------------------------------------------
/articles/Sievert2014a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Sievert2014a.pdf
--------------------------------------------------------------------------------
/articles/Sievert2014b.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Sievert2014b.pdf
--------------------------------------------------------------------------------
/articles/Srivastava2009.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Srivastava2009.pdf
--------------------------------------------------------------------------------
/articles/Steyvers2007.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Steyvers2007.pdf
--------------------------------------------------------------------------------
/articles/Taddy2012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Taddy2012.pdf
--------------------------------------------------------------------------------
/articles/Tang2014.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/articles/Tang2014.pdf
--------------------------------------------------------------------------------
/data/mallet_texts/README:
--------------------------------------------------------------------------------
1 | MALLET Sample Data Sets
2 |
3 | /web
4 |
5 | This sample data includes the text of 24 "featured articles"
6 | from Wikipedia, 12 from the English version, and 12 from the
7 | German version. They were retrieved in December 2008.
8 | The text is in UTF-8 encoding.
9 |
--------------------------------------------------------------------------------
/data/mallet_texts/numeric/boxes.txt:
--------------------------------------------------------------------------------
1 | first big height=3.2 width=1.7 length=12.0 blue heavy
2 | second small height=1.2 width=1.2 length=3 yellow light
3 | third big height=5.1 width=5.1 length=3 red light
4 | fourth small height=0.9 width=3.0 length=1.1 yellow heavy
5 |
--------------------------------------------------------------------------------
/data/mallet_texts/numeric/puffins.txt:
--------------------------------------------------------------------------------
1 | 16 45 39.2 38 3
2 | 15 65 47.0 36 12
3 | 10 40 24.3 14 18
4 | 7 20 30.0 16 21
5 | 11 40 47.6 6 27
6 | 7 80 47.6 9 36
7 | 4 80 45.6 7 39
8 | 0 15 27.8 8 45
9 | 0 0 41.9 8 54
10 | 0 20 36.8 5 60
11 | 15 40 34.9 31 3
12 | 21 60 45.2 37 12
13 | 12 95 32.9 24 18
14 | 8 50 26.6 11 24
15 | 9 80 32.7 10 30
16 | 6 80 38.1 5 36
17 | 0 60 31.4 5 39
18 | 0 70 32.7 2 48
19 | 0 35 38.1 8 51
20 | 16 60 37.1 35 6
21 | 25 60 47.1 35 12
22 | 13 85 34.0 23 18
23 | 13 90 43.6 12 21
24 | 11 20 30.8 9 27
25 | 3 85 34.6 6 33
26 | 0 30 37.7 8 42
27 | 0 75 45.5 5 48
28 | 0 15 51.4 8 54
29 | 18 40 32.1 36 6
30 | 19 40 35.4 37 9
31 | 8 90 30.2 11 18
32 | 12 80 33.9 9 24
33 | 10 80 40.2 11 30
34 | 3 75 33.5 7 36
35 | 0 65 40.3 10 42
36 | 0 80 40.3 12 45
37 | 0 50 43.1 13 51
38 | 0 50 42.0 3 57
39 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/apollo8.txt:
--------------------------------------------------------------------------------
1 | Apollo 8 war der zweite bemannte Raumflug des amerikanischen Apollo-Programms und der erste bemannte Flug zum Mond und damit zu einem anderen Himmelskörper. Die drei Astronauten Frank Borman, William Anders und James Lovell waren die ersten Menschen, die mit eigenen Augen die Rückseite des Mondes sahen. Apollo 8 startete am 21. Dezember 1968 vom Kennedy Space Center in Florida und erreichte drei Tage später, am 24. Dezember 1968, die Mondumlaufbahn. Große Bekanntheit erlangte die Fernsehübertragung aus dem Mondorbit, während der die drei Astronauten die ersten Zeilen der Schöpfungsgeschichte als Weihnachtsbotschaft verlasen. Nach zehn Umkreisungen des Mondes leiteten die Astronauten am frühen Morgen des 25. Dezember die Rückkehr zur Erde ein, wo die Rückkehrkapsel am 27. Dezember im Pazifischen Ozean wasserte.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/fiv.txt:
--------------------------------------------------------------------------------
1 | Das Feline Immundefizienz-Virus (FIV) ist ein Virus aus der Familie der Retroviren. Das Virus löst bei Katzen eine Immunschwächekrankheit aus, die als Felines Immundefizienzsyndrom oder umgangssprachlich als Katzen-AIDS bezeichnet wird, da sie der Erkrankung AIDS beim Menschen stark ähnelt. Menschen können sich jedoch mit FIV nicht infizieren. FIV gehört innerhalb der Retroviren zur Gattung der Lentiviren und wurde 1986, also vier Jahre nach der Entdeckung des Humanen Immundefizienz-Virus (HIV), zum ersten Mal beschrieben. Die Erkrankung ist bisher nicht wirkungsvoll behandelbar, verläuft aber oft über lange Zeit symptomlos. Langfristig wird jedoch das Immunsystem zerstört und Sekundärinfektionen führen zum Tod. Bisher wurden neun verschiedene Stämme des Virus aus elf verschiedenen Katzenarten isoliert, darunter spezifische Stämme aus Löwen und Pumas. Auch in der Tüpfelhyäne, die nicht zur Familie der Katzen gehört, wurde FIV gefunden. Neben dem Felinen Coronavirus, dem Erreger der Felinen Infektiösen Peritonitis (FIP) und dem Felinen Leukosevirus (FeLV), dem Erreger der Katzenleukämie, gehört das Virus zu den Auslösern der klinisch bedeutsamsten viralen Infektionskrankheiten bei Hauskatzen.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/habichtsadler.txt:
--------------------------------------------------------------------------------
1 | Der Habichtsadler (Hieraaetus fasciatus, Synonym Aquila fasciata) ist eine Vogelart aus der Familie der Habichtartigen (Accipitridae). Dieser mittelgroße, kräftige und sehr agile Adler bewohnt trockene, felsige Regionen in Südeuropa, Nordafrika und im Süden Asiens, wo er sich von kleinen bis mittelgroßen Wirbeltieren ernährt. Der Bestand des Habichtsadlers ist in Südeuropa vor allem aufgrund illegaler Verfolgung seit Jahrzehnten rückläufig, daher gilt die Art hier als stark gefährdet.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/hoechst.txt:
--------------------------------------------------------------------------------
1 | Höchst am Main ist ein Stadtteil von Frankfurt am Main, hat etwa 13.500 Einwohner (Stand 31. Dezember 2006[1]) und liegt rund zehn Kilometer westlich der Frankfurter Innenstadt[2] an der Mündung der Nidda in den Main. Höchst war im Gegensatz zu den meisten anderen Stadtteilen eine alte Stadt mit Stadtrecht seit 1355 und ist bis heute das wichtigste städtische Subzentrum im Frankfurter Westen. 1928 wurde Höchst nach Frankfurt eingemeindet. Bis 1987 war Höchst Verwaltungssitz eines eigenen Landkreises, der seit 1928 den Namen Main-Taunus-Kreis trägt. Höchst ist Zentrum des Ortsbezirks Frankfurt-West mit 120.000 Einwohnern.
2 | Der Name Höchst wurde durch die Hoechst AG (1863–1999) weltweit bekannt. Mit einer Unterbrechung von 27 Jahren zwischen 1925 und 1952 war Höchst Sitz des Chemie- und Pharmakonzerns. Dessen ehemaliges Stammwerk ist heute als Industriepark Höchst einer der größten Industriestandorte Deutschlands. Bedeutendstes Baudenkmal Höchsts ist die karolingische Justinuskirche, die in wesentlichen Teilen aus dem 9. Jahrhundert stammt. Die gut erhaltene Höchster Altstadt steht seit 1972 unter Denkmalschutz. Die meisten Fachwerkhäuser auf mittelalterlichem Stadtgrundriss stammen aus der Zeit nach dem großen Stadtbrand von 1586.
3 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/indogermanische.txt:
--------------------------------------------------------------------------------
1 | Die indogermanische Ursprache (Protoindoeuropäisch, PIE), auch Indoeuropäisch oder Urindogermanisch (UIG), ist die gemeinsame Vorläuferin der indogermanischen Sprachen, wie sie vor vielleicht 5000 Jahren vermutlich in der Nähe des Schwarzen Meeres gesprochen wurde. Es ist eine der großen Leistungen der Sprachwissenschaftler seit dem Beginn des 19. Jahrhunderts, aus der Betrachtung der Gemeinsamkeiten und der systematischen Unterschiede der indogermanischen Sprachen untereinander eine plausible Beschreibung der Gestalt dieser Ursprache extrapoliert zu haben.
2 | Der Erwerb der Sprachfähigkeit durch die Menschheit lag zur Zeit der indogermanischen Ursprache etwa 200.000 Jahre zurück; die Benennung der rekonstruierten Sprache als „indogermanische Ursprache“ impliziert daher keinesfalls, dass die Sprache in irgendeiner Hinsicht „archaisch“ oder „primitiv“ gewesen sei. Ebenso wenig handelt es sich bei ihrer Rekonstruktion um den Versuch, die sogenannte „Welt-Ursprache“ zu finden.
3 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/konrad.txt:
--------------------------------------------------------------------------------
1 | Konrad I. (* um 881; † 23. Dezember 918 in Weilburg; beerdigt in Fulda) war seit 906 Herzog von Franken und von 911 bis 918 König des Ostfrankenreichs.
2 | Adelsfehden zwischen den mächtigen Aristokratenfamilien um die Vorherrschaft in den einzelnen Stammesgebieten des ostfränkischen Reiches, die wiederholten Ungarneinfälle und die Schwäche des karolingischen Königtums führten zur Etablierung regionaler Mittelgewalten, den späteren Herzogtümern. In diese Zeit fiel der Aufstieg Konrads, der selbst ein Repräsentant dieser aufsteigenden Regionen und zugleich am Regiment des ostfränkischen Karolingers Ludwig des Kindes beteiligt war. Als König versuchte Konrad, sich der anbahnenden Auflösung des Reichsverbandes entgegen zu stellen und die Herrschaft wieder im ganzen Reich auszuüben. Seine siebenjährige Regierungszeit ist daher hauptsächlich durch die Konflikte mit den ostfränkischen Herzögen (duces) der einzelnen Teilreiche und durch die Ungarneinfälle geprägt. Konrads Herrschaft bildete den Übergang von den Karolingern zu den Ottonen, da es ihm nicht gelang, eine neue Königsdynastie zu begründen. Er führte die Herrschaftspraxis der Karolinger fort.
3 | Seine Zeit gehört zu den quellenärmsten des gesamten Mittelalters. Während die Jahrzehnte später verfassten ottonischen Geschichtswerke Konrad noch positive Eigenschaften zuweisen, gilt er in der Forschung oftmals mit seiner ganzen Regierungszeit als gescheitert. Lange Zeit wurde die Königswahl Konrads als Beginn einer deutschen Geschichte verortet. Erst jüngst setzte sich die Auffassung durch, dass das deutsche Reich nicht in einem Akt, sondern in einem lang dauernden Prozess entstand. Gleichwohl wird Konrad als wichtiger Akteur in dieser Entwicklung angesehen.
4 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/marcellinus.txt:
--------------------------------------------------------------------------------
1 | Ammianus Marcellinus (* um 330 in Antiochia am Orontes, Syrien; † um 395 [spätestens um 400] wahrscheinlich in Rom) war ein römischer Historiker. Er ist neben Prokopios von Caesarea der bedeutendste spätantike Geschichtsschreiber und schrieb in lateinischer Sprache, obwohl seine Muttersprache das Griechische war.
2 | Seine Res gestae sind das letzte bedeutende lateinische Geschichtswerk der Antike. Die erhaltenen Teile umfassen die Jahre von 353 bis 378 und beschreiben die Zeit unmittelbar vor Beginn der großen Völkerwanderung, in der sich die antike Mittelmeerwelt grundlegend verändern sollte. Ammianus hat als Soldat unter den Kaisern Constantius II. und Julian Apostata gedient und viele der von ihm geschilderten Ereignisse selbst miterlebt. Obwohl er mehr als andere antike Geschichtsschreiber um Objektivität bemüht war, wird seine persönliche Sicht bisweilen recht deutlich. So beurteilte er etwa Constantius II. teilweise sehr negativ, während er von Julian ein ausgesprochen positives Bild zeichnete. Der Wert seiner Res gestae für die Erforschung des 4. Jahrhunderts ist dennoch unbestritten.
3 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/rostock.txt:
--------------------------------------------------------------------------------
1 | Der F.C. Hansa Rostock ist ein deutscher Fußballverein aus Rostock in Mecklenburg-Vorpommern mit rund 4450 Mitgliedern.[1] Gegründet wurde der F.C. Hansa als Fußballclub am 28. Dezember 1965 mit der Ausgliederung der Fußballabteilung aus dem am 11. November 1954 gegründeten SC Empor Rostock.
2 | Als einziger Verein aus den neuen Bundesländern spielte Hansa stets mindestens in der jeweils zweithöchsten Spielklasse der nationalen Verbände der Deutschen Demokratischen Republik und der Bundesrepublik Deutschland und verbuchte seine größten Erfolge mit dem Gewinn der 1990/91 unter der Bezeichnung NOFV-Oberliga letztmalig ausgespielten Meisterschaft sowie des 1991 unter der Bezeichnung NOFV-Pokal letztmalig ausgetragenen Pokalfinals der Deutschen Demokratischen Republik.
3 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/sadat.txt:
--------------------------------------------------------------------------------
1 | Muhammad Anwar as-Sadat, arabisch محمد أنور السادات, DMG Muhammad Anwar as-Sādāt, (* 25. Dezember 1918 in Mit Abul-kum, einem Dorf im Nildelta; † 6. Oktober 1981 in Kairo) war ein ägyptischer Staatsmann. Neben Nasser und anderen war er Mitgründer des Geheimbunds der Freien Offiziere, seit dem Staatsstreich 1952 bekleidete er hohe Ämter. Als Nachfolger Nassers wurde er 1970 Staatspräsident. Sadat führte Ägypten in den Jom-Kippur-Krieg 1973, löste das Land aus der engen Bindung an die Sowjetunion und schloss 1979 Frieden mit Israel. Für seine Bemühungen im Friedensprozess mit Israel erhielt er zusammen mit Menachem Begin 1978 den Friedensnobelpreis. Sadat fiel einem Attentat zum Opfer, das Gegner seiner Politik der Aussöhnung mit Israel verübten.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/t40.txt:
--------------------------------------------------------------------------------
1 | Der T-40 war ein sowjetischer leichter Schwimmpanzer zur Zeit des Zweiten Weltkrieges. Die damalige sowjetische Klassifikation ordnete ihn als „kleinen Panzer“ ein.
2 | Das Konstruktionsbüro des Werks Nr. 37 in Moskau entwickelte den T-40 in der ersten Hälfte des Jahres 1939. Als Chefkonstrukteur wirkte Nikolai Alexandrowitsch Astrow, einer der damals führenden Spezialisten in der Entwicklung leichter Panzer. Die Rote Armee nahm den Panzer im Dezember 1939 an und das Werk Nr. 37 produzierte ihn bis Dezember 1941 in Serie.
3 | Während der Serienfertigung modifizierten die Entwickler den T-40 mehrfach, sowohl um die Fertigung zu vereinfachen, als auch um Panzerung und Feuerkraft zu verstärken. Die späteren Ausführungen, in der Literatur als T-40S und T-30 bezeichnet, wiesen einen besseren Panzerschutz auf und trugen als Hauptbewaffnung eine kleinkalibrige automatische Kanone statt eines überschweren Maschinengewehrs. Gleichzeitig verloren die Panzer ihre Schwimmfähigkeit. Diese Varianten stellten Übergangsmodelle zum Nachfolger des T-40, dem leichten Panzer T-60, dar.
4 | Die Rote Armee setzte den T-40 hauptsächlich in den Kämpfen der Anfangsphase der deutschen Invasion in der UdSSR ein. Die meisten T-40-Panzer wurden im Spätherbst des Jahres 1941 in der Schlacht um Moskau eingesetzt. Fast alle Panzer gingen hier und in Folge verloren, so dass der Typ bereits 1942 aus den Panzertruppen verschwand. Einzelne verbliebene Fahrzeuge verwendete die Armee zu Trainingszwecken bis zum Ende des Krieges.
5 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/ulrich.txt:
--------------------------------------------------------------------------------
1 | Ulrich von Wilamowitz-Moellendorff (* 22. Dezember 1848 auf Gut Markowitz, Kujawien, Provinz Posen; † 25. September 1931 in Berlin; vollständiger Name Enno [auch: Emmo] Friedrich Wichard Ulrich von Wilamowitz-Moellendorff) war ein deutscher klassischer Philologe. Er lehrte und forschte als Professor in Greifswald (1876–1883), Göttingen (1883–1897) und Berlin (1897–1921). Mit seinen Editionsprojekten, seiner Erneuerung der Textkritik und Textinterpretation, seiner Einflussnahme auf die preußische Berufungspolitik und seiner Tätigkeit als Wissenschaftsorganisator war er einer der führenden Vertreter seines Faches und prägte die Klassische Philologie des 20. Jahrhunderts im internationalen Raum nachhaltig. Durch seine Monografien zu vielen Bereichen der griechischen Literatur, seine Neudefinition des Faches und nicht zuletzt durch seine zahlreichen Schüler übte er großen Einfluss auf die Klassische Philologie aus. Als Präsident der Preußischen Akademie der Wissenschaften brachte er viele Akademievorhaben auf den Weg, besonders die Inscriptiones Graecae, die bis heute alle in Griechenland entdeckten Inschriften verzeichnen und herausgeben.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/de/wildenstein.txt:
--------------------------------------------------------------------------------
1 | Die Burg Wildenstein liegt über dem Donaudurchbruch durch die Schwäbische Alb. Sie gehört zur Gemeinde Leibertingen im Landkreis Sigmaringen. Ihre heutige Form, insbesondere die der Außenanlage, stellt fast unverändert den Zustand zwischen 1514 und 1554 dar, als sie unter Gottfried Werner von Zimmern zu einer frühneuzeitlichen Festung umgebaut wurde. Sowohl Hauptburg als auch Vorburg stehen auf künstlich abgeschrofften Felsen und sind nur über Brücken zugänglich. Der über die gesamte Breite der Burg reichende Halsgraben mit einer Breite von 20 Metern und ursprünglichen Tiefe von ebenfalls 20 Metern hat, wie der berühmte Stich Matthäus Merians zeigt, bereits in der Vergangenheit Besucher der Burg sehr beeindruckt. Im Innern besitzt die Burg großflächige Renaissance-Wandmalereien von circa 1538 bis 1540 mit Blumenranken und Vogelmotiven, sowie, in einem Bilderzyklus, die komplette Sigenotsage.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/elizabeth_needham.txt:
--------------------------------------------------------------------------------
1 | Elizabeth Needham (died 3 May 1731), also known as Mother Needham, was an English procuress and brothel-keeper of 18th-century London, who has been identified as the bawd greeting Moll Hackabout in the first plate of William Hogarth's series of satirical etchings, A Harlot's Progress. Although Needham was notorious in London at the time, little is recorded of her life, and no genuine portraits of her survive. Her house was the most exclusive in London and her customers came from the highest strata of fashionable society, but she eventually crossed the moral reformers of the day and died as a result of the severe treatment she received after being sentenced to stand in the pillory.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/equipartition_theorem.txt:
--------------------------------------------------------------------------------
1 | The equipartition theorem is a formula from statistical mechanics that relates the temperature of a system with its average energies. The original idea of equipartition was that, in thermal equilibrium, energy is shared equally among its various forms; for example, the average kinetic energy in the translational motion of a molecule should equal the average kinetic energy in its rotational motion. Like the virial theorem, the equipartition theorem gives the total average kinetic and potential energies for a system at a given temperature, from which the system's heat capacity can be computed. However, equipartition also gives the average values of individual components of the energy. It can be applied to any classical system in thermal equilibrium, no matter how complicated. The equipartition theorem can be used to derive the classical ideal gas law, and the Dulong–Petit law for the specific heat capacities of solids. It can also be used to predict the properties of stars, even white dwarfs and neutron stars, since it holds even when relativistic effects are considered. Although the equipartition theorem makes very accurate predictions in certain conditions, it becomes inaccurate when quantum effects are significant, namely at low enough temperatures.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/gunnhild.txt:
--------------------------------------------------------------------------------
1 | Gunnhild konungamóðir (mother of kings) or Gunnhild Gormsdóttir[1] (c. 910 – c. 980) was the wife of Erik Bloodaxe (king of Norway 930–34, "king" of Orkney c. 937–54, and king of Jórvík 948–49 and 952–54). Gunnhild is a prominent figure in many Norse sagas, including Fagrskinna, Egil's Saga, Njal's Saga, and Heimskringla. Many of the details of her life are disputed, including her parentage. Gunnhild lived during a time of great change in Norway. Her father-in-law Harald Fairhair had recently united much of Norway under his rule. Shortly after his death, Gunnhild and her husband were overthrown and exiled. She spent much of the rest of her life in exile in Orkney, Jorvik and Denmark. A number of her many children with Erik became co-rulers of Norway in the late tenth century. What details of her life are known come largely from Icelandic sources; because the Icelanders were generally hostile to her and her husband, scholars regard some of the episodes reported in them as suspect.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/hawes.txt:
--------------------------------------------------------------------------------
1 | Richard Hawes (1797–1877) was a United States Representative from Kentucky and the second Confederate Governor of Kentucky. Originally a Whig, Hawes became a Democrat following the dissolution of the Whig party in the 1850s. At the outbreak of the American Civil War, Hawes was a supporter of Kentucky's doctrine of armed neutrality. When the Commonwealth's neutrality was breached in September 1861, Hawes fled to Virginia and enlisted as a brigade commissary under Confederate general Humphrey Marshall. He was elected Confederate governor of the Commonwealth following the late George W. Johnson's death at the Battle of Shiloh. Hawes and the Confederate government traveled with Braxton Bragg's Army of Tennessee, and when Bragg invaded Kentucky in October 1862, he captured Frankfort and held an inauguration ceremony for Hawes. The ceremony was interrupted, however, by forces under Union general Don Carlos Buell, and the Confederates were driven from the Commonwealth following the Battle of Perryville. Hawes relocated to Virginia, where he continued to lobby President Jefferson Davis to attempt another invasion of Kentucky. Following the war, he returned to his home in Paris, Kentucky, swore an oath of allegiance to the Union, and was allowed to return to his law practice.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/hill.txt:
--------------------------------------------------------------------------------
1 | Clem Hill (1877–1945) was an Australian cricketer who played 49 Test matches as a specialist batsman between 1896 and 1912. He captained the Australian team in ten Tests, winning five and losing five. A prolific run scorer, Hill scored 3,412 runs in Test cricket—a world record at the time of his retirement—at an average of 39.21 per innings, including seven centuries. In 1902, Hill was the first batsman to make 1,000 Test runs in a calendar year, a feat that would not be repeated for 45 years. His innings of 365 scored against New South Wales for South Australia in 1900–01 was a Sheffield Shield record for 27 years. His Test cricket career ended in controversy after he was involved in a brawl with cricket administrator and fellow Test selector Peter McAlister in 1912. He was one of the "Big Six", a group of leading Australian cricketers who boycotted the 1912 Triangular Tournament in England when the players were stripped of the right to appoint the tour manager. The boycott effectively ended his Test career. After retiring from cricket, Hill worked in the horse racing industry as a stipendiary steward and later as a handicapper for races including the Caulfield Cup.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/shiloh.txt:
--------------------------------------------------------------------------------
1 | The Battle of Shiloh, also known as the Battle of Pittsburg Landing, was a major battle in the Western Theater of the American Civil War, fought on April 6 and April 7, 1862, in southwestern Tennessee. Confederate forces under Generals Albert Sidney Johnston and P.G.T. Beauregard launched a surprise attack against the Union Army of Maj. Gen. Ulysses S. Grant and came very close to defeating his army.
2 | On the first day of battle, the Confederates struck with the intention of driving the Union defenders away from the Tennessee River and into the swamps of Owl Creek to the west, hoping to defeat Grant's Army of the Tennessee before it could link up with Maj. Gen. Don Carlos Buell's Army of the Ohio. The Confederate battle lines became confused during the fierce fighting, and Grant's men instead fell back in the direction of Pittsburg Landing to the northeast. A position on a slightly sunken road, nicknamed the "Hornet's Nest", defended by the men of Brig. Gens. Benjamin M. Prentiss's and W.H.L. Wallace's divisions, provided critical time for the rest of the Union line to stabilize under the protection of numerous artillery batteries. Gen. Johnston was killed during the first day's fighting, and Beauregard, his second in command, decided against assaulting the final Union position that night.
3 | Reinforcements from Gen. Buell arrived in the evening and turned the tide the next morning, when he and Grant launched a counterattack along the entire line. The Confederates were forced to retreat from the bloodiest battle in United States history up to that time, ending their hopes that they could block the Union advance into northern Mississippi.
4 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/sunderland_echo.txt:
--------------------------------------------------------------------------------
1 | The Sunderland Echo is an evening provincial newspaper serving the Sunderland, South Tyneside and East Durham areas of North East England. The newspaper was founded by Samuel Storey, Edward Backhouse, Edward Temperley Gourley, Charles Palmer, Richard Ruddock, Thomas Glaholm and Thomas Scott Turnbull in 1873, as the Sunderland Daily Echo and Shipping Gazette. Designed to provide a platform for the Radical views held by Storey and his partners, it was also Sunderland's first local daily paper. The inaugural edition of the Echo was printed in Press Lane, Sunderland on 22 December 1873; 1,000 copies were produced and sold for a halfpenny each. The Echo survived intense competition in its early years, as well as the depression of the 1930s and two World Wars. Sunderland was heavily bombed in the Second World War and, although the Echo building was undamaged, it was forced to print its competitor's paper under wartime rules. It was during this time that the paper's format changed, from a broadsheet to its current tabloid layout, because of national newsprint shortages.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/thespis.txt:
--------------------------------------------------------------------------------
1 | Thespis is an operatic extravaganza that was the first collaboration between dramatist W. S. Gilbert and composer Arthur Sullivan. It was never published, and most of the music is now lost. However, Gilbert and Sullivan went on to become one of the most famous and successful partnerships in Victorian England, creating a string of comic opera hits, including H.M.S. Pinafore, The Pirates of Penzance and The Mikado, that continue to be popular. Thespis premièred in London at the Gaiety Theatre on 26 December 1871. Like many productions at that theatre, it was written in a broad, burlesque style, considerably different from Gilbert and Sullivan's later works. It was a modest success—for a Christmas entertainment of the time—and closed on 8 March 1872, after a run of 63 performances. It was advertised as "An entirely original Grotesque Opera in Two Acts". The story follows an acting troupe headed by Thespis, the legendary Greek father of the drama, who temporarily trade places with the gods on Mount Olympus, who have grown elderly and ignored. The actors turn out to be comically inept rulers. Having seen the ensuing mayhem down below, the angry gods return, sending the actors back to Earth as "eminent tragedians, whom no one ever goes to see."
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/thylacine.txt:
--------------------------------------------------------------------------------
1 | The Thylacine (pronounced /ˈθaɪləsaɪn/) (Thylacinus cynocephalus, Greek: dog-headed pouched one) was the largest known carnivorous marsupial of modern times. Native to continental Australia, Tasmania and New Guinea, it is thought to have become extinct in the 20th century. It is commonly known as the Tasmanian Tiger (because of its striped back), the Tasmanian Wolf, and colloquially the Tassie (or Tazzy) Tiger or simply the Tiger.[3] It was the last extant member of its genus, Thylacinus, although several related species have been found in the fossil record dating back to the early Miocene.
2 | The Thylacine became extinct on the Australian mainland thousands of years before European settlement of the continent, but it survived on the island of Tasmania along with several endemic species, including the Tasmanian Devil. Intensive hunting encouraged by bounties is generally blamed for its extinction, but other contributory factors may have been disease, the introduction of dogs, and human encroachment into its habitat. Despite it being officially classified as extinct, sightings are still reported.
3 | Like the tigers and wolves of the Northern Hemisphere, from which it obtained two of its common names, the Thylacine was an apex predator. As a marsupial, it was not related to these placental mammals, but because of convergent evolution it displayed the same general form and adaptations. Its closest living relative is the Tasmanian Devil.
4 | The Thylacine was one of only two marsupials to have a pouch in both sexes (the other is the Water Opossum). The male Thylacine had a pouch that acted as a protective sheath, protecting the male's external reproductive organs while running through thick brush.
5 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/uranus.txt:
--------------------------------------------------------------------------------
1 | The rings of Uranus were discovered on March 10, 1977, by James L. Elliot, Edward W. Dunham, and Douglas J. Mink. Two additional rings were discovered in 1986 by the Voyager 2 spacecraft, and two outer rings were found in 2003–2005 by the Hubble Space Telescope. A number of faint dust bands and incomplete arcs may exist between the main rings. The rings are extremely dark—the Bond albedo of the rings' particles does not exceed 2%. They are likely composed of water ice with the addition of some dark radiation-processed organics. The majority of Uranus's rings are opaque and only a few kilometres wide. The ring system contains little dust overall; it consists mostly of large bodies 0.2–20 m in diameter. The relative lack of dust in the ring system is due to aerodynamic drag from the extended Uranian exosphere—corona. The rings of Uranus are thought to be relatively young, at not more than 600 million years. The mechanism that confines the narrow rings is not well understood. The Uranian ring system probably originated from the collisional fragmentation of a number of moons that once existed around the planet. After colliding, the moons broke up into numerous particles, which survived as narrow and optically dense rings only in strictly confined zones of maximum stability.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/yard.txt:
--------------------------------------------------------------------------------
1 | Robert Sterling Yard (1861–1945) was an American writer, journalist and wilderness activist. Yard graduated from Princeton University and spent the first twenty years of his career as a journalist, editor and publisher. In 1915 he was recruited by his friend Stephen Mather to help publicize the need for an independent national park agency. Their numerous publications were part of a movement that resulted in legislative support for a National Park Service in 1916. Yard served as head of the National Parks Educational Committee for several years after its conception, but tension within the NPS led him to concentrate on non-government initiatives. He became executive secretary of the National Parks Association in 1919. Yard worked to promote the national parks as well as educate Americans about their use. Creating high standards based on aesthetic ideals for park selection, he also opposed commercialism and industrialization of what he called "America's masterpieces". These standards caused discord with his peers. After helping to establish a relationship between the NPA and the United States Forest Service, Yard later became involved in the protection of wilderness areas. In 1935 he became one of the eight founding members of The Wilderness Society and acted as its first president from 1937 until his death eight years later. Yard is now considered an important figure in the modern wilderness movement.
2 |
--------------------------------------------------------------------------------
/data/mallet_texts/web/en/zinta.txt:
--------------------------------------------------------------------------------
1 | Preity Zinta (born 1975) is an Indian film actress. She has appeared in Hindi films of Bollywood, as well as Telugu and English-language movies. After graduating with a degree in criminal psychology, Zinta made her acting debut in Dil Se in 1998 followed by a role in Soldier the same year. These performances earned her a Filmfare Best Female Debut Award, and she was later recognised for her role as a teenage single mother in Kya Kehna (2000). She subsequently played a variety of character types, and in doing so has been credited with changing the image of a Hindi film heroine. Zinta received her first Filmfare Best Actress Award in 2003 for her performance in the drama Kal Ho Naa Ho. She went on to play the lead female role in two consecutive annual top-grossing films in India: the science fiction film Koi... Mil Gaya, her biggest commercial success, and the star-crossed romance Veer-Zaara, which earned her critical acclaim. She was later noted for her portrayal of independent, modern Indian women in Salaam Namaste and Kabhi Alvida Naa Kehna, top-grossing productions in overseas markets. These accomplishments have established her as a leading actress of Hindi cinema. In addition to movie acting, Zinta has written a series of columns for BBC News Online South Asia, is a regular stage performer, and along with boyfriend Ness Wadia she is a co-owner of the Indian Premier League cricket team Kings XI Punjab.
2 |
--------------------------------------------------------------------------------
/data/recipes/README.md:
--------------------------------------------------------------------------------
1 | * `map.txt`: the mapping for collapsing regions.
2 | * `epic_recipes.txt`: recipes from Epicurious. The first column is the cuisine.
3 | * `allr_recipes.txt`: recipes from allrecipes.com
4 | * `menu_recipes.txt`: recipes from menupan.com
5 |
6 |
--------------------------------------------------------------------------------
/data/recipes/map.txt:
--------------------------------------------------------------------------------
1 | Canada NorthAmerican
2 | Turkey MiddleEastern
3 | east_asian EastAsian
4 | Caribbean LatinAmerican
5 | Bangladesh SouthAsian
6 | chinese EastAsian
7 | mexico LatinAmerican
8 | Lebanon MiddleEastern
9 | japanese EastAsian
10 | North-African African
11 | MiddleEastern MiddleEastern
12 | Indian SouthAsian
13 | asian
14 | Italy SouthernEuropean
15 | EasternEuropean_Russian EasternEuropean
16 | Israel MiddleEastern
17 | Korea EastAsian
18 | Iran MiddleEastern
19 | Eastern-Europe EasternEuropean
20 | Jewish MiddleEastern
21 | South-African African
22 | Vietnamese SoutheastAsian
23 | UK-and-Ireland WesternEuropean
24 | French WesternEuropean
25 | Mediterranean SouthernEuropean
26 | Central_SouthAmerican LatinAmerican
27 | Cajun_Creole NorthAmerican
28 | Belgium WesternEuropean
29 | China EastAsian
30 | korean EastAsian
31 | Germany WesternEuropean
32 | South-America LatinAmerican
33 | Spain SouthernEuropean
34 | Netherlands WesternEuropean
35 | Scandinavia NorthernEuropean
36 | Philippines SoutheastAsian
37 | Indonesia SoutheastAsian
38 | East-African African
39 | Scandinavian NorthernEuropean
40 | Greek SouthernEuropean
41 | American NorthAmerican
42 | Vietnam SoutheastAsian
43 | western WesternEuropean
44 | African African
45 | Switzerland WesternEuropean
46 | West-African African
47 | France WesternEuropean
48 | Thai SoutheastAsian
49 | Thailand SoutheastAsian
50 | Italian SouthernEuropean
51 | Pakistan SouthAsian
52 | Irish WesternEuropean
53 | Mexican LatinAmerican
54 | Portugal SouthernEuropean
55 | Chinese EastAsian
56 | Mexico LatinAmerican
57 | German WesternEuropean
58 | Spanish_Portuguese SouthernEuropean
59 | India SouthAsian
60 | Japanese EastAsian
61 | Moroccan African
62 | Southern_SoulFood NorthAmerican
63 | Malaysia SoutheastAsian
64 | Austria WesternEuropean
65 | English_Scottish WesternEuropean
66 | Asian
67 | Southwestern NorthAmerican
68 | Japan EastAsian
69 | italian SouthernEuropean
70 |
--------------------------------------------------------------------------------
/functions/optimal_k.R:
--------------------------------------------------------------------------------
1 | #' Find Optimal Number of Topics
2 | #'
3 | #' Iteratively produces models and then compares the harmonic mean of the log
4 | #' likelihoods in a graphical output.
5 | #'
6 | #' @param x A \code{\link[tm]{DocumentTermMatrix}}.
7 | #' @param max.k Maximum number of topics to fit (start small [i.e., default of
8 | #' 30] and add as necessary).
9 | #' @param harmonic.mean Logical. If \code{TRUE} the harmonic means of the
10 | #' log likelihoods are used to determine k (see
11 | #' \url{http://stackoverflow.com/a/21394092/1000343}). Otherwise just the log
12 | #' likelihoods are graphed against k (see
13 | #' \url{http://stats.stackexchange.com/a/25128/7482}).
14 | #' @param control A named list of control parameters handed to
15 | #' \code{\link[topicmodels]{LDA}}. The harmonic mean approach requires the
16 | #' elements \code{burnin} (number of omitted Gibbs iterations at beginning) and
17 | #' \code{keep} (the log likelihood is saved every keep iterations); \code{iter}
18 | #' is the number of Gibbs iterations. Defaults to \code{NULL} otherwise.
19 | #' @param verbose Logical. If \code{TRUE} the optimal k is printed (harmonic mean approach only).
20 | #' @param method The method to be used for fitting; currently
21 | #' \code{method = "VEM"} or \code{method= "Gibbs"} are supported.
22 | #' @param drop.seed Logical. If \code{TRUE} \code{seed} argument is dropped from
23 | #' \code{control}.
24 | #' @param \ldots Other arguments; forwarded to \code{\link[topicmodels]{LDA}} when \code{harmonic.mean = FALSE}.
25 | #' @return If \code{harmonic.mean = TRUE}, the optimal k with a \code{k_dataframe} attribute (k and harmonic mean);
26 | #' otherwise a \code{\link[base]{data.frame}} of k (number of topics) and the associated log likelihood.
27 | #' @references \url{http://stackoverflow.com/a/21394092/1000343} \cr
28 | #' \url{http://stats.stackexchange.com/a/25128/7482} \cr
29 | #' Ponweiser, M. (2012). Latent Dirichlet Allocation in R (Diploma Thesis).
30 | #' Vienna University of Economics and Business, Vienna.
31 | #' http://epub.wu.ac.at/3558/1/main.pdf \cr\cr
32 | #' Griffiths, T.L., and Steyvers, M. (2004). Finding scientific topics.
33 | #' Proceedings of the National Academy of Sciences of the United States of America
34 | #' 101(Suppl 1), 5228 - 5235. \url{http://www.pnas.org/content/101/suppl_1/5228.full.pdf}
35 | #' @keywords k topicmodel
36 | #' @export
37 | #' @author Ben Marwick and Tyler Rinker .
38 | #' @examples
39 | #' ## Install/Load Tools & Data
40 | #' if (!require("pacman")) install.packages("pacman")
41 | #' pacman::p_load_gh("trinker/gofastr")
42 | #' pacman::p_load(tm, topicmodels, dplyr, tidyr, devtools, LDAvis, ggplot2)
43 | #'
44 | #'
45 | #' ## Source topicmodels2LDAvis function
46 | #' devtools::source_url("https://gist.githubusercontent.com/trinker/477d7ae65ff6ca73cace/raw/79dbc9d64b17c3c8befde2436fdeb8ec2124b07b/topicmodels2LDAvis")
47 | #'
48 | #' data(presidential_debates_2012)
49 | #'
50 | #'
51 | #' ## Generate Stopwords
52 | #' stops <- c(
53 | #' tm::stopwords("english"),
54 | #' "governor", "president", "mister", "obama","romney"
55 | #' ) %>%
56 | #' gofastr::prep_stopwords()
57 | #'
58 | #'
59 | #' ## Create the DocumentTermMatrix
60 | #' doc_term_mat <- presidential_debates_2012 %>%
61 | #' with(gofastr::q_dtm_stem(dialogue, paste(person, time, sep = "_"))) %>%
62 | #' gofastr::remove_stopwords(stops) %>%
63 | #' gofastr::filter_tf_idf() %>%
64 | #' gofastr::filter_documents()
65 | #'
66 | #'
67 | #' opti_k1 <- optimal_k(doc_term_mat)
68 | #' opti_k1
69 | #'
70 | #' opti_k2 <- optimal_k(doc_term_mat, harmonic.mean = FALSE)
71 | #' opti_k2
72 | optimal_k <- function(x, max.k = 30, harmonic.mean = TRUE,
73 |     control = if (harmonic.mean) list(burnin = 500, iter = 1000, keep = 100) else NULL,
74 |     method = if (harmonic.mean) "Gibbs" else "VEM", verbose = TRUE, drop.seed = TRUE, ...){
75 |     # Only strip `seed` from a supplied list; a NULL control is handed on untouched
76 |     if (isTRUE(drop.seed) && !is.null(control)){
77 |         control[["seed"]] <- NULL
78 |     }
79 |     # Dispatch to the harmonic mean helper or the plain log likelihood helper
80 |     if (isTRUE(harmonic.mean)) {
81 |         optimal_k1(x, max.k = max.k, control = control, method = method, verbose = verbose, ...)
82 |     } else {
83 |         optimal_k2(x, max.k = max.k, control = control, method = method, ...)
84 |     }
85 | }
86 |
87 | #' Plots an optimal_k1 Object
88 | #'
89 | #' Plots the harmonic mean of the log likelihoods against k for an optimal_k1 object
90 | #'
91 | #' @param x A \code{optimal_k1} object.
92 | #' @param \ldots Ignored.
93 | #' @method plot optimal_k1
94 | #' @export
95 | plot.optimal_k1 <- function(x, ...){
96 |     # Row of the stored k/harmonic-mean table at the optimal k (circled in red below)
97 |     y <- attributes(x)[["k_dataframe"]]
98 |     y <- y[y[["k"]] == as.numeric(x), ]
99 |
100 |     ggplot2::ggplot(attributes(x)[["k_dataframe"]], ggplot2::aes_string(x="k", y="harmonic_mean")) +
101 |         ggplot2::xlab(sprintf("Number of Topics (Optimal Number: %s)", as.numeric(x))) +
102 |         ggplot2::ylab("Harmonic Mean of Log Likelihood") +
103 |         ggplot2::geom_smooth(method = "loess", fill=NA) +
104 |         ggplot2::geom_point(data=y, color="red", fill=NA, size = 6, shape = 21) +
105 |         ggplot2::geom_line(size=1) +
106 |         ggplot2::theme_bw() +
107 |         ggplot2::theme(
108 |             axis.title.x = ggplot2::element_text(vjust = -0.25, size = 14),
109 |             axis.title.y = ggplot2::element_text(size = 14, angle=90)
110 |         )
111 | }
112 |
113 | #' Prints an optimal_k Object
114 | #'
115 | #' Prints an optimal_k object by drawing its plot
116 | #'
117 | #' @param x A \code{optimal_k} object.
118 | #' @param \ldots Ignored.
119 | #' @method print optimal_k
120 | #' @export
121 | print.optimal_k <- function(x, ...){
122 |     # Printing shows the plot built by the plot method for x
123 |     print(graphics::plot(x))
124 |
125 | }
126 |
127 |
128 |
129 |
130 | optimal_k1 <- function(x, max.k = 30,
131 |     control = list(burnin = 500, iter = 1000, keep = 100), method = "Gibbs",
132 |     verbose = TRUE, ...){
133 |     # Internal helper: fits one LDA per k in 2:max.k and returns the k whose retained
134 |     # log likelihoods have the largest harmonic mean; control must hold burnin & keep.
135 |     if (max.k > 20) {
136 |         message("\nGrab a cup of coffee this could take a while...\n")
137 |         flush.console()
138 |     }
139 |
140 |     tic <- Sys.time()
141 |     v <- rep(NA, floor(max.k/10))
142 |     dat <- data.frame(k = v, time = v)  # one row per 10th k: k^2 and elapsed minutes
143 |     end <- data.frame(k = max.k^2)
144 |
145 |     hm_many <- sapply(2:max.k, function(k){
146 |         if (k %% 10 == 0){
147 |             time <- as.numeric(difftime(Sys.time(), tic, units = "mins"))
148 |             dat[k/10, 1:2] <<- c(k^2, time)
149 |             if (k/10 > 1) {
150 |                 fit <- with(dat, lm(time~k))  # elapsed minutes regressed on the stored k^2
151 |                 pred <- predict(fit, end) - time
152 |                 if (pred < 0) pred <- 0
153 |                 est <- paste0("; Remaining: ~", time2char(pred), " mins")
154 |             } else {
155 |                 est <- ""
156 |             }
157 |             cur <- format(Sys.time(), format="%I:%M:%S")
158 |             elapsed <- time2char(time)
159 |             #gsub("^0+", "", as.character(round(as.numeric(difftime(Sys.time(), tic, units = "mins")), 1)))
160 |             cat(sprintf("%s of %s iterations (Current: %s; Elapsed: %s mins%s)\n", k, max.k, cur, elapsed, est)); flush.console()
161 |         }
162 |         burnin <- control[["burnin"]]
163 |         keep <- control[["keep"]]
164 |         if (is.null(burnin) || is.null(keep)) stop("Supply burnin & keep to control")
165 |         fitted <- topicmodels::LDA(x, k = k, method = method, control = control, ...)
166 |         n_drop <- burnin %/% keep  # leading log likelihood entries discarded as burn-in (may be 0)
167 |         harmonicMean(if (n_drop > 0) fitted@logLiks[-seq_len(n_drop)] else fitted@logLiks)
168 |     })
169 |     # hm_many[i] belongs to k = i + 1, so test the chosen k itself (not its index) against max.k
170 |     out <- c(2:max.k)[which.max(hm_many)]
171 |     if (isTRUE(out == max.k)) warning("Optimal K is last value; suggest increasing `max.k`")
172 |     class(out) <- c("optimal_k", "optimal_k1", class(out))
173 |     attributes(out)[["k_dataframe"]] <- data.frame(
174 |         k = 2:max.k,
175 |         harmonic_mean = hm_many
176 |     )
177 |     if (isTRUE(verbose)) cat(sprintf("Optimal number of topics = %s\n",as.numeric(out)))
178 |     out
179 | }
180 | # Log of the harmonic mean of the likelihoods, computed from their logs (median-centred, mpfr precision `precision`)
181 | harmonicMean <- function(logLikelihoods, precision=2000L) {
182 |     llMed <- Rmpfr::median(logLikelihoods)  # centring constant; subtracted inside exp() and added back outside
183 |     as.double(llMed - log(Rmpfr::mean(exp(-Rmpfr::mpfr(logLikelihoods, prec = precision) + llMed))))
184 | }
185 |
186 | optimal_k2 <- function(x, max.k = 30, control = NULL, method = "VEM", ...){
187 |     # Internal helper: fits one LDA per k in 2:max.k and returns a data.frame of k and each model's logLik
188 |     if (max.k > 20) {
189 |         message("\nGrab a cup of coffee this could take a while...\n")
190 |         flush.console()
191 |     }
192 |
193 |     tic <- Sys.time()
194 |     v <- rep(NA, floor(max.k/10))
195 |     dat <- data.frame(k = v, time = v)  # one row per 10th k: k^2 and elapsed minutes
196 |     end <- data.frame(k = max.k^2)
197 |
198 |     best_model <- lapply(seq(2, max.k, by=1), function(k){
199 |         if (k %% 10 == 0){
200 |             time <- as.numeric(difftime(Sys.time(), tic, units = "mins"))
201 |             dat[k/10, 1:2] <<- c(k^2, time)
202 |             if (k/10 > 1) {
203 |                 fit <- with(dat, lm(time~k))  # elapsed minutes regressed on the stored k^2
204 |                 est <- paste0("; Remaining: ~", time2char(max(predict(fit, end) - time, 0)), " mins")  # never report a negative remainder
205 |             } else {
206 |                 est <- ""
207 |             }
208 |             cur <- format(Sys.time(), format="%I:%M:%S")
209 |             elapsed <- time2char(time)
210 |             #gsub("^0+", "", as.character(round(as.numeric(difftime(Sys.time(), tic, units = "mins")), 1)))
211 |             cat(sprintf("%s of %s iterations (Current: %s; Elapsed: %s mins%s)\n", k, max.k, cur, elapsed, est)); flush.console()
212 |         }
213 |         topicmodels::LDA(x, k = k, method = method, control = control, ...)
214 |     })
215 |
216 |     out <- data.frame(
217 |         k = c(2:max.k),
218 |         logLik = sapply(best_model, logLik)
219 |     )
220 |
221 |     class(out) <- c("optimal_k", "optimal_k2", "data.frame")
222 |     out
223 | }
224 | # Formats a number (minutes in the progress messages): rounded to one decimal, leading zeros removed
225 | time2char <- function(x){
226 |     minutes <- as.character(round(x, 1))
227 |     is_zero <- identical(minutes, "0")
228 |     if (is_zero) minutes else sub("^0+", "", minutes)
229 | }
230 |
231 | #' Plots an optimal_k2 Object
232 | #'
233 | #' Plots the log likelihood against k for an optimal_k2 object
234 | #'
235 | #' @param x A \code{optimal_k2} object.
236 | #' @param \ldots Ignored.
237 | #' @method plot optimal_k2
238 | #' @export
239 | plot.optimal_k2 <- function(x, ...){
240 |     # x is used directly as the plot data; its columns k and logLik are mapped to the axes
241 |     ggplot2::ggplot(x, ggplot2::aes_string(x="k", y="logLik")) +
242 |         ggplot2::xlab("Number of Topics") +
243 |         ggplot2::ylab("Log Likelihood") +
244 |         ggplot2::geom_smooth(size=.8, se=FALSE, method="loess") +
245 |         ggplot2::geom_line(size=1) +
246 |         ggplot2::theme_bw() +
247 |         ggplot2::theme(
248 |             axis.title.x = ggplot2::element_text(vjust = -0.25, size = 14),
249 |             axis.title.y = ggplot2::element_text(size = 14, angle=90)
250 |         )
251 |
252 | }
253 |
254 | if (!require("pacman")) install.packages("pacman"); library(pacman)
255 | pacman::p_load(ggplot2, topicmodels, Rmpfr)
--------------------------------------------------------------------------------
/functions/topicmodels2LDAvis.R:
--------------------------------------------------------------------------------
1 | #' Transform Model Output for Use with the LDAvis Package
2 | #'
3 | #' Convert a \pkg{topicmodels} output into the JSON form required by the \pkg{LDAvis} package.
4 | #'
5 | #' @param x A fitted \pkg{topicmodels} model, e.g. the output of \code{\link[topicmodels]{LDA}}.
6 | #' @param \ldots Currently ignored.
7 | #' @param dtm Optional; the document-term matrix \code{x} was fitted on. If supplied, document
8 | #' lengths and term frequencies are its row and column sums instead of sums over the model's word assignments.
9 | #' @seealso \code{\link[LDAvis]{createJSON}}
10 | #' @export
11 | #' @examples
12 | #' \dontrun{
13 | #' data("AssociatedPress", package = "topicmodels")
14 | #' model <- LDA(AssociatedPress[1:20,], control = list(alpha = 0.1), k = 3)
15 | #' LDAvis::serVis(topicmodels2LDAvis(model))
16 | #' }
17 | topicmodels2LDAvis <- function(x, ..., dtm = NULL){
18 |     post <- topicmodels::posterior(x)
19 |     if (ncol(post[["topics"]]) < 3) stop("The model must contain > 2 topics")
20 |     # NOTE(review): topicmodels documents the wordassignments slot as the most probable topic
21 |     # per document/term cell, so its sums are not word counts; pass `dtm` for true counts.
22 |     # Without `dtm` the original behaviour (sums over wordassignments) is kept.
23 |     mat <- if (is.null(dtm)) x@wordassignments else dtm
24 |     LDAvis::createJSON(
25 |         phi = post[["terms"]],
26 |         theta = post[["topics"]],
27 |         vocab = colnames(post[["terms"]]),
28 |         doc.length = slam::row_sums(mat, na.rm = TRUE),
29 |         term.frequency = slam::col_sums(mat, na.rm = TRUE)
30 |     )}
--------------------------------------------------------------------------------
/inst/figure/topic-model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/topic-model.jpg
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-11-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-12-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/inst/figure/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/inst/figure/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/presentations/Blei2009.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/presentations/Blei2009.pdf
--------------------------------------------------------------------------------
/scripts/Example_topic_model_analysis.R:
--------------------------------------------------------------------------------
1 | ## Install/Load Tools & Data
2 | if (!require("pacman")) install.packages("pacman")
3 | pacman::p_load_gh("trinker/gofastr")
4 | pacman::p_load(tm, topicmodels, dplyr, tidyr, igraph, devtools, LDAvis, ggplot2)
5 |
6 |
7 | ## Source topicmodels2LDAvis & optimal_k functions (read from the repository's master branch)
8 | invisible(lapply(
9 |     file.path(
10 |         "https://raw.githubusercontent.com/trinker/topicmodels_learning/master/functions",
11 |         c("topicmodels2LDAvis.R", "optimal_k.R")
12 |     ),
13 |     devtools::source_url
14 | ))
15 | ## Load the presidential_debates_2012 data set
16 | data(presidential_debates_2012)
17 |
18 |
19 | ## Generate Stopwords (two tm stopword lists plus debate-specific words)
20 | stops <- c(
21 |     tm::stopwords("english"),
22 |     tm::stopwords("SMART"),
23 |     "governor", "president", "mister", "obama","romney"
24 | ) %>%
25 |     gofastr::prep_stopwords()
26 |
27 |
28 | ## Create the DocumentTermMatrix (dialogue grouped by the person_time key)
29 | doc_term_mat <- presidential_debates_2012 %>%
30 |     with(gofastr::q_dtm_stem(dialogue, paste(person, time, sep = "_"))) %>%
31 |     gofastr::remove_stopwords(stops, stem=TRUE) %>%
32 |     gofastr::filter_tf_idf() %>%
33 |     gofastr::filter_documents()
34 |
35 |
36 | ## Control List (shared by optimal_k and the final model fit)
37 | control <- list(burnin = 500, iter = 1000, keep = 100)
38 |
39 | ## Determine Optimal Number of Topics (max.k = 40; the wrapping parentheses also print the result)
40 | (k <- optimal_k(doc_term_mat, 40, control = control))
41 |
42 |
43 | ## Run the Model (a seed is added to the control list before the final fit)
44 | control[["seed"]] <- 100
45 | lda_model <- topicmodels::LDA(doc_term_mat, k=as.numeric(k), method = "Gibbs",
46 |     control = control)
47 |
48 |
49 | ## Plot the Topics Per Person & Time
50 | topics <- topicmodels::posterior(lda_model, doc_term_mat)[["topics"]]
51 | topic_dat <- dplyr::add_rownames(as.data.frame(topics), "Person_Time")
52 | colnames(topic_dat)[-1] <- apply(terms(lda_model, 10), 2, paste, collapse = ", ")
53 | ## Proportion is the bar height (weight), so it is labelled with ylab(); coord_flip() only swaps the drawn axes
54 | tidyr::gather(topic_dat, Topic, Proportion, -c(Person_Time)) %>%
55 |     tidyr::separate(Person_Time, c("Person", "Time"), sep = "_") %>%
56 |     dplyr::mutate(Person = factor(Person,
57 |         levels = c("OBAMA", "ROMNEY", "LEHRER", "SCHIEFFER", "CROWLEY", "QUESTION" ))
58 |     ) %>%
59 |     ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
60 |         ggplot2::geom_bar() +
61 |         ggplot2::coord_flip() +
62 |         ggplot2::facet_grid(Person~Time) +
63 |         ggplot2::guides(fill=FALSE) +
64 |         ggplot2::ylab("Proportion")
65 |
66 |
67 | ## Plot the Topics Matrix as a Heatmap
68 | heatmap(topics, scale = "none")
69 |
70 |
71 | ## Network of the Word Distributions Over Topics
72 | post <- topicmodels::posterior(lda_model)
73 | ## Correlate the topics' term distributions; correlations below .05 and the diagonal are set to 0
74 | cor_mat <- cor(t(post[["terms"]]))
75 | cor_mat[ cor_mat < .05 ] <- 0
76 | diag(cor_mat) <- 0
77 | ## Weighted graph built from the lower triangle of cor_mat; edges with weight below 0.05 are removed
78 | graph <- graph.adjacency(cor_mat, weighted=TRUE, mode="lower")
79 | graph <- delete.edges(graph, E(graph)[ weight < 0.05])
80 | ## Edge width scales with the edge weight; vertex size scales with the column sums of the topics matrix
81 | E(graph)$edge.width <- E(graph)$weight*20
82 | V(graph)$label <- paste("Topic", V(graph))
83 | V(graph)$size <- colSums(post[["topics"]]) * 15
84 | ## Fixed seed before plotting -- presumably to make the graph layout reproducible
85 | par(mar=c(0, 0, 3, 0))
86 | set.seed(110)
87 | plot.igraph(graph, edge.width = E(graph)$edge.width,
88 |     edge.color = "orange", vertex.color = "orange",
89 |     vertex.frame.color = NA, vertex.label.color = "grey30")
90 | title("Strength Between Topics Based On Word Probabilities", cex.main=.8)
91 |
92 |
93 | ## Network of the Topics Over Documents
94 | minval <- .1
95 | topic_mat <- topicmodels::posterior(lda_model)[["topics"]]
96 | ## Graph from the document-by-topic matrix; edges with weight below minval are removed
97 | graph <- graph_from_incidence_matrix(topic_mat, weighted=TRUE)
98 | graph <- delete.edges(graph, E(graph)[ weight < minval])
99 | ## Vertices with all-digit names are styled and labelled as topics; the others get their "_" replaced by a line break
100 | E(graph)$edge.width <- E(graph)$weight*17
101 | E(graph)$color <- "blue"
102 | V(graph)$color <- ifelse(grepl("^\\d+$", V(graph)$name), "grey75", "orange")
103 | V(graph)$frame.color <- NA
104 | V(graph)$label <- ifelse(grepl("^\\d+$", V(graph)$name), paste("topic", V(graph)$name), gsub("_", "\n", V(graph)$name))
105 | V(graph)$size <- c(rep(10, nrow(topic_mat)), colSums(topic_mat) * 20)
106 | V(graph)$label.color <- ifelse(grepl("^\\d+$", V(graph)$name), "red", "grey30")
107 | ## NOTE(review): the size vector assumes row (document) vertices come before column (topic) vertices -- confirm in igraph docs
108 | par(mar=c(0, 0, 3, 0))
109 | set.seed(365)
110 | plot.igraph(graph, edge.width = E(graph)$edge.width,
111 |     vertex.color = adjustcolor(V(graph)$color, alpha.f = .4))
112 | title("Topic & Document Relationships", cex.main=.8)
113 |
114 |
115 | ## LDAvis of Model
116 | lda_model %>%
117 |     topicmodels2LDAvis() %>%
118 |     LDAvis::serVis()
119 |
120 |
121 | ##==================##
122 | ## Fitting New Data ##
123 | ##==================##
124 |
125 |
126 | ## Create the DocumentTermMatrix for New Data (same stopwords and filters as above; grouped by person_location)
127 | doc_term_mat2 <- partial_republican_debates_2015 %>%
128 |     with(gofastr::q_dtm_stem(dialogue, paste(person, location, sep = "_"))) %>%
129 |     gofastr::remove_stopwords(stops, stem=TRUE) %>%
130 |     gofastr::filter_tf_idf() %>%
131 |     gofastr::filter_documents()
132 |
133 |
134 | ## Run the Model for New Data (reuses the earlier control list and passes the fitted lda_model as `model`)
135 | control2 <- control
136 | control2[["estimate.beta"]] <- FALSE
137 | ## NOTE(review): estimate.beta = FALSE presumably keeps the fitted term distributions fixed -- confirm in the topicmodels docs
138 | lda_model2 <- topicmodels::LDA(doc_term_mat2, k = as.numeric(k), model = lda_model,
139 |     control = control2)
140 |
141 |
142 | ## Plot the Topics Per Person & Location for New Data
143 | topics2 <- topicmodels::posterior(lda_model2, doc_term_mat2)[["topics"]]
144 | topic_dat2 <- dplyr::add_rownames(as.data.frame(topics2), "Person_Location")
145 | colnames(topic_dat2)[-1] <- apply(terms(lda_model2, 10), 2, paste, collapse = ", ")
146 | ## Proportion is the bar height (weight), so it is labelled with ylab(); coord_flip() only swaps the drawn axes
147 | tidyr::gather(topic_dat2, Topic, Proportion, -c(Person_Location)) %>%
148 |     tidyr::separate(Person_Location, c("Person", "Location"), sep = "_") %>%
149 |     ggplot2::ggplot(ggplot2::aes(weight=Proportion, x=Topic, fill=Topic)) +
150 |         ggplot2::geom_bar() +
151 |         ggplot2::coord_flip() +
152 |         ggplot2::facet_grid(Person~Location) +
153 |         ggplot2::guides(fill=FALSE) +
154 |         ggplot2::ylab("Proportion")
155 |
156 |
157 | ## LDAvis of Model for New Data
158 | lda_model2 %>%
159 |     topicmodels2LDAvis() %>%
160 |     LDAvis::serVis()
--------------------------------------------------------------------------------
/stopword_lists/Jockers_stoplist.csv:
--------------------------------------------------------------------------------
1 | a
able
about
above
according
accordingly
across
actually
after
afterwards
again
against
ain't
all
allow
allows
almost
alone
along
already
also
although
always
am
among
amongst
an
an'
and
another
any
anybody
anyhow
anyone
anything
anyway
anyways
anywhere
apart
appear
appreciate
appropriate
are
aren't
around
as
aside
ask
asking
associated
at
available
away
awfully
b
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
believe
below
beside
besides
best
better
between
beyond
both
brief
but
by
c
c'mon
came
can
can't
cannot
cant
cause
causes
certain
certainly
changes
clearly
co
com
come
comes
concerning
consequently
consider
considering
contain
containing
contains
corresponding
could
couldn't
course
currently
d
definitely
described
despite
did
didn't
different
do
does
doesn't
doing
don't
done
down
downwards
during
e
each
edu
eg
eight
either
else
elsewhere
enough
entirely
especially
et
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
exactly
example
except
f
far
few
fifth
first
five
followed
following
follows
for
former
formerly
forth
four
from
further
furthermore
g
get
gets
getting
given
gives
go
goes
going
gone
got
gotten
greetings
h
had
hadn't
happens
hardly
has
hasn't
have
haven't
having
he
he's
hello
help
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
hi
him
himself
his
hither
hopefully
how
howbeit
however
i
i'd
i'll
i'm
i've
ie
if
ignored
immediate
in
inasmuch
inc
indeed
indicate
indicated
indicates
inner
insofar
instead
into
inward
is
isn't
it
it'd
it'll
it's
its
itself
j
just
k
keep
keeps
kept
know
knows
known
l
last
lately
later
latter
latterly
least
less
lest
let
let's
like
liked
likely
little
look
looking
looks
ltd
m
mainly
many
may
maybe
me
mean
meanwhile
merely
might
more
moreover
most
mostly
much
must
my
myself
n
name
namely
nd
near
nearly
necessary
need
needs
neither
never
nevertheless
new
next
nine
no
nobody
non
none
noone
nor
normally
not
nothing
novel
now
nowhere
o
o'
obviously
of
off
often
oh
ok
okay
old
on
once
one
ones
only
onto
or
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
own
p
particular
particularly
per
perhaps
placed
please
plus
possible
presumably
probably
provides
q
que
quite
qv
r
rather
rd
re
really
reasonably
regarding
regardless
regards
relatively
respectively
right
s
said
same
saw
say
saying
says
second
secondly
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sensible
sent
serious
seriously
seven
several
shall
she
should
shouldn't
since
six
so
some
somebody
somehow
someone
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specified
specify
specifying
still
sub
such
sup
sure
t
'tain't
t's
take
taken
tell
tends
th
than
thank
thanks
thanx
that
that's
thats
the
their
theirs
them
themselves
then
thence
there
there's
thereafter
thereby
therefore
therein
theres
thereupon
these
they
they'd
they'll
they're
they've
thing
think
third
this
thorough
thoroughly
those
though
three
through
throughout
thru
thus
to
together
too
took
toward
towards
tried
tries
truly
try
trying
twice
two
u
un
under
unfortunately
unless
unlikely
until
unto
up
upon
us
use
used
useful
uses
using
usually
v
value
various
very
via
viz
vs
w
want
wants
was
wasn't
way
we
we'd
we'll
we're
we've
welcome
well
went
were
weren't
what
what's
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who's
whoever
whole
whom
whose
why
will
willing
wish
with
within
without
won't
wonder
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
zero
'tis
'twas
a
could've
dear
he'd
he'll
how'd
how'll
how's
might've
mightn't
must've
mustn't
shan't
she'd
she'll
she's
should've
that'll
tis
twas
what'd
when'd
when'll
when's
where'd
where'll
who'd
who'll
why'd
why'll
why's
would've
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/README:
--------------------------------------------------------------------------------
1 | The English stoplist is the standard Mallet stoplist.
2 |
3 | The German, French, and Finnish stoplists are borrowed from http://www.ranks.nl.
4 |
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/de.txt:
--------------------------------------------------------------------------------
1 | aber
2 | als
3 | am
4 | an
5 | auch
6 | auf
7 | aus
8 | bei
9 | bin
10 | bis
11 | bist
12 | da
13 | dadurch
14 | daher
15 | darum
16 | das
17 | daß
18 | dass
19 | dein
20 | deine
21 | dem
22 | den
23 | der
24 | des
25 | dessen
26 | deshalb
27 | die
28 | dies
29 | dieser
30 | dieses
31 | doch
32 | dort
33 | du
34 | durch
35 | ein
36 | eine
37 | einem
38 | einen
39 | einer
40 | eines
41 | er
42 | es
43 | euer
44 | eure
45 | für
46 | hatte
47 | hatten
48 | hattest
49 | hattet
50 | hier
51 | hinter
52 | ich
53 | ihr
54 | ihre
55 | im
56 | in
57 | ist
58 | ja
59 | jede
60 | jedem
61 | jeden
62 | jeder
63 | jedes
64 | jener
65 | jenes
66 | jetzt
67 | kann
68 | kannst
69 | können
70 | könnt
71 | machen
72 | mein
73 | meine
74 | mit
75 | muß
76 | mußt
77 | musst
78 | müssen
79 | müßt
80 | nach
81 | nachdem
82 | nein
83 | nicht
84 | nun
85 | oder
86 | seid
87 | sein
88 | seine
89 | sich
90 | sie
91 | sind
92 | soll
93 | sollen
94 | sollst
95 | sollt
96 | sonst
97 | soweit
98 | sowie
99 | und
100 | unser
101 | unsere
102 | unter
103 | vom
104 | von
105 | vor
106 | wann
107 | warum
108 | was
109 | weiter
110 | weitere
111 | wenn
112 | wer
113 | werde
114 | werden
115 | werdet
116 | weshalb
117 | wie
118 | wieder
119 | wieso
120 | wir
121 | wird
122 | wirst
123 | wo
124 | woher
125 | wohin
126 | zu
127 | zum
128 | zur
129 | über
130 |
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/en.txt:
--------------------------------------------------------------------------------
1 | a
2 | able
3 | about
4 | above
5 | according
6 | accordingly
7 | across
8 | actually
9 | after
10 | afterwards
11 | again
12 | against
13 | all
14 | allow
15 | allows
16 | almost
17 | alone
18 | along
19 | already
20 | also
21 | although
22 | always
23 | am
24 | among
25 | amongst
26 | an
27 | and
28 | another
29 | any
30 | anybody
31 | anyhow
32 | anyone
33 | anything
34 | anyway
35 | anyways
36 | anywhere
37 | apart
38 | appear
39 | appreciate
40 | appropriate
41 | are
42 | around
43 | as
44 | aside
45 | ask
46 | asking
47 | associated
48 | at
49 | available
50 | away
51 | awfully
52 | b
53 | be
54 | became
55 | because
56 | become
57 | becomes
58 | becoming
59 | been
60 | before
61 | beforehand
62 | behind
63 | being
64 | believe
65 | below
66 | beside
67 | besides
68 | best
69 | better
70 | between
71 | beyond
72 | both
73 | brief
74 | but
75 | by
76 | c
77 | came
78 | can
79 | cannot
80 | cant
81 | cause
82 | causes
83 | certain
84 | certainly
85 | changes
86 | clearly
87 | co
88 | com
89 | come
90 | comes
91 | concerning
92 | consequently
93 | consider
94 | considering
95 | contain
96 | containing
97 | contains
98 | corresponding
99 | could
100 | course
101 | currently
102 | d
103 | definitely
104 | described
105 | despite
106 | did
107 | different
108 | do
109 | does
110 | doing
111 | done
112 | down
113 | downwards
114 | during
115 | e
116 | each
117 | edu
118 | eg
119 | eight
120 | either
121 | else
122 | elsewhere
123 | enough
124 | entirely
125 | especially
126 | et
127 | etc
128 | even
129 | ever
130 | every
131 | everybody
132 | everyone
133 | everything
134 | everywhere
135 | ex
136 | exactly
137 | example
138 | except
139 | f
140 | far
141 | few
142 | fifth
143 | first
144 | five
145 | followed
146 | following
147 | follows
148 | for
149 | former
150 | formerly
151 | forth
152 | four
153 | from
154 | further
155 | furthermore
156 | g
157 | get
158 | gets
159 | getting
160 | given
161 | gives
162 | go
163 | goes
164 | going
165 | gone
166 | got
167 | gotten
168 | greetings
169 | h
170 | had
171 | happens
172 | hardly
173 | has
174 | have
175 | having
176 | he
177 | hello
178 | help
179 | hence
180 | her
181 | here
182 | hereafter
183 | hereby
184 | herein
185 | hereupon
186 | hers
187 | herself
188 | hi
189 | him
190 | himself
191 | his
192 | hither
193 | hopefully
194 | how
195 | howbeit
196 | however
197 | i
198 | ie
199 | if
200 | ignored
201 | immediate
202 | in
203 | inasmuch
204 | inc
205 | indeed
206 | indicate
207 | indicated
208 | indicates
209 | inner
210 | insofar
211 | instead
212 | into
213 | inward
214 | is
215 | it
216 | its
217 | itself
218 | j
219 | just
220 | k
221 | keep
222 | keeps
223 | kept
224 | know
225 | knows
226 | known
227 | l
228 | last
229 | lately
230 | later
231 | latter
232 | latterly
233 | least
234 | less
235 | lest
236 | let
237 | like
238 | liked
239 | likely
240 | little
241 | look
242 | looking
243 | looks
244 | ltd
245 | m
246 | mainly
247 | many
248 | may
249 | maybe
250 | me
251 | mean
252 | meanwhile
253 | merely
254 | might
255 | more
256 | moreover
257 | most
258 | mostly
259 | much
260 | must
261 | my
262 | myself
263 | n
264 | name
265 | namely
266 | nd
267 | near
268 | nearly
269 | necessary
270 | need
271 | needs
272 | neither
273 | never
274 | nevertheless
275 | new
276 | next
277 | nine
278 | no
279 | nobody
280 | non
281 | none
282 | noone
283 | nor
284 | normally
285 | not
286 | nothing
287 | novel
288 | now
289 | nowhere
290 | o
291 | obviously
292 | of
293 | off
294 | often
295 | oh
296 | ok
297 | okay
298 | old
299 | on
300 | once
301 | one
302 | ones
303 | only
304 | onto
305 | or
306 | other
307 | others
308 | otherwise
309 | ought
310 | our
311 | ours
312 | ourselves
313 | out
314 | outside
315 | over
316 | overall
317 | own
318 | p
319 | particular
320 | particularly
321 | per
322 | perhaps
323 | placed
324 | please
325 | plus
326 | possible
327 | presumably
328 | probably
329 | provides
330 | q
331 | que
332 | quite
333 | qv
334 | r
335 | rather
336 | rd
337 | re
338 | really
339 | reasonably
340 | regarding
341 | regardless
342 | regards
343 | relatively
344 | respectively
345 | right
346 | s
347 | said
348 | same
349 | saw
350 | say
351 | saying
352 | says
353 | second
354 | secondly
355 | see
356 | seeing
357 | seem
358 | seemed
359 | seeming
360 | seems
361 | seen
362 | self
363 | selves
364 | sensible
365 | sent
366 | serious
367 | seriously
368 | seven
369 | several
370 | shall
371 | she
372 | should
373 | since
374 | six
375 | so
376 | some
377 | somebody
378 | somehow
379 | someone
380 | something
381 | sometime
382 | sometimes
383 | somewhat
384 | somewhere
385 | soon
386 | sorry
387 | specified
388 | specify
389 | specifying
390 | still
391 | sub
392 | such
393 | sup
394 | sure
395 | t
396 | take
397 | taken
398 | tell
399 | tends
400 | th
401 | than
402 | thank
403 | thanks
404 | thanx
405 | that
406 | thats
407 | the
408 | their
409 | theirs
410 | them
411 | themselves
412 | then
413 | thence
414 | there
415 | thereafter
416 | thereby
417 | therefore
418 | therein
419 | theres
420 | thereupon
421 | these
422 | they
423 | think
424 | third
425 | this
426 | thorough
427 | thoroughly
428 | those
429 | though
430 | three
431 | through
432 | throughout
433 | thru
434 | thus
435 | to
436 | together
437 | too
438 | took
439 | toward
440 | towards
441 | tried
442 | tries
443 | truly
444 | try
445 | trying
446 | twice
447 | two
448 | u
449 | un
450 | under
451 | unfortunately
452 | unless
453 | unlikely
454 | until
455 | unto
456 | up
457 | upon
458 | us
459 | use
460 | used
461 | useful
462 | uses
463 | using
464 | usually
465 | uucp
466 | v
467 | value
468 | various
469 | very
470 | via
471 | viz
472 | vs
473 | w
474 | want
475 | wants
476 | was
477 | way
478 | we
479 | welcome
480 | well
481 | went
482 | were
483 | what
484 | whatever
485 | when
486 | whence
487 | whenever
488 | where
489 | whereafter
490 | whereas
491 | whereby
492 | wherein
493 | whereupon
494 | wherever
495 | whether
496 | which
497 | while
498 | whither
499 | who
500 | whoever
501 | whole
502 | whom
503 | whose
504 | why
505 | will
506 | willing
507 | wish
508 | with
509 | within
510 | without
511 | wonder
512 | would
513 | would
514 | x
515 | y
516 | yes
517 | yet
518 | you
519 | your
520 | yours
521 | yourself
522 | yourselves
523 | z
524 | zero
525 |
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/fi.txt:
--------------------------------------------------------------------------------
1 | aiemmin
2 | aika
3 | aikaa
4 | aikaan
5 | aikaisemmin
6 | aikaisin
7 | aikajen
8 | aikana
9 | aikoina
10 | aikoo
11 | aikovat
12 | aina
13 | ainakaan
14 | ainakin
15 | ainoa
16 | ainoat
17 | aiomme
18 | aion
19 | aiotte
20 | aist
21 | aivan
22 | ajan
23 | älä
24 | alas
25 | alemmas
26 | älköön
27 | alkuisin
28 | alkuun
29 | alla
30 | alle
31 | aloitamme
32 | aloitan
33 | aloitat
34 | aloitatte
35 | aloitattivat
36 | aloitettava
37 | aloitettevaksi
38 | aloitettu
39 | aloitimme
40 | aloitin
41 | aloitit
42 | aloititte
43 | aloittaa
44 | aloittamatta
45 | aloitti
46 | aloittivat
47 | alta
48 | aluksi
49 | alussa
50 | alusta
51 | annettavaksi
52 | annetteva
53 | annettu
54 | antaa
55 | antamatta
56 | antoi
57 | aoua
58 | apu
59 | asia
60 | asiaa
61 | asian
62 | asiasta
63 | asiat
64 | asioiden
65 | asioihin
66 | asioita
67 | asti
68 | avuksi
69 | avulla
70 | avun
71 | avutta
72 | edellä
73 | edelle
74 | edelleen
75 | edeltä
76 | edemmäs
77 | edes
78 | edessä
79 | edestä
80 | ehkä
81 | ei
82 | eikä
83 | eilen
84 | eivät
85 | eli
86 | ellei
87 | elleivät
88 | ellemme
89 | ellen
90 | ellet
91 | ellette
92 | emme
93 | en
94 | enää
95 | enemmän
96 | eniten
97 | ennen
98 | ensi
99 | ensimmäinen
100 | ensimmäiseksi
101 | ensimmäisen
102 | ensimmäisenä
103 | ensimmäiset
104 | ensimmäisiä
105 | ensimmäisiksi
106 | ensimmäisinä
107 | ensimmäistä
108 | ensin
109 | entinen
110 | entisen
111 | entisiä
112 | entistä
113 | entisten
114 | eräät
115 | eräiden
116 | eräs
117 | eri
118 | erittäin
119 | erityisesti
120 | esi
121 | esiin
122 | esillä
123 | esimerkiksi
124 | et
125 | eteen
126 | etenkin
127 | että
128 | ette
129 | ettei
130 | halua
131 | haluaa
132 | haluamatta
133 | haluamme
134 | haluan
135 | haluat
136 | haluatte
137 | haluavat
138 | halunnut
139 | halusi
140 | halusimme
141 | halusin
142 | halusit
143 | halusitte
144 | halusivat
145 | halutessa
146 | haluton
147 | hän
148 | häneen
149 | hänellä
150 | hänelle
151 | häneltä
152 | hänen
153 | hänessä
154 | hänestä
155 | hänet
156 | he
157 | hei
158 | heidän
159 | heihin
160 | heille
161 | heiltä
162 | heissä
163 | heistä
164 | heitä
165 | helposti
166 | heti
167 | hetkellä
168 | hieman
169 | huolimatta
170 | huomenna
171 | hyvä
172 | hyvää
173 | hyvät
174 | hyviä
175 | hyvien
176 | hyviin
177 | hyviksi
178 | hyville
179 | hyviltä
180 | hyvin
181 | hyvinä
182 | hyvissä
183 | hyvistä
184 | ihan
185 | ilman
186 | ilmeisesti
187 | itse
188 | itseään
189 | itsensä
190 | ja
191 | jää
192 | jälkeen
193 | jälleen
194 | jo
195 | johon
196 | joiden
197 | joihin
198 | joiksi
199 | joilla
200 | joille
201 | joilta
202 | joissa
203 | joista
204 | joita
205 | joka
206 | jokainen
207 | jokin
208 | joko
209 | joku
210 | jolla
211 | jolle
212 | jolloin
213 | jolta
214 | jompikumpi
215 | jonka
216 | jonkin
217 | jonne
218 | joo
219 | jopa
220 | jos
221 | joskus
222 | jossa
223 | josta
224 | jota
225 | jotain
226 | joten
227 | jotenkin
228 | jotenkuten
229 | jotka
230 | jotta
231 | jouduimme
232 | jouduin
233 | jouduit
234 | jouduitte
235 | joudumme
236 | joudun
237 | joudutte
238 | joukkoon
239 | joukossa
240 | joukosta
241 | joutua
242 | joutui
243 | joutuivat
244 | joutumaan
245 | joutuu
246 | joutuvat
247 | juuri
248 | kahdeksan
249 | kahdeksannen
250 | kahdella
251 | kahdelle
252 | kahdelta
253 | kahden
254 | kahdessa
255 | kahdesta
256 | kahta
257 | kahteen
258 | kai
259 | kaiken
260 | kaikille
261 | kaikilta
262 | kaikkea
263 | kaikki
264 | kaikkia
265 | kaikkiaan
266 | kaikkialla
267 | kaikkialle
268 | kaikkialta
269 | kaikkien
270 | kaikkin
271 | kaksi
272 | kannalta
273 | kannattaa
274 | kanssa
275 | kanssaan
276 | kanssamme
277 | kanssani
278 | kanssanne
279 | kanssasi
280 | kauan
281 | kauemmas
282 | kautta
283 | kehen
284 | keiden
285 | keihin
286 | keiksi
287 | keillä
288 | keille
289 | keiltä
290 | keinä
291 | keissä
292 | keistä
293 | keitä
294 | keittä
295 | keitten
296 | keneen
297 | keneksi
298 | kenellä
299 | kenelle
300 | keneltä
301 | kenen
302 | kenenä
303 | kenessä
304 | kenestä
305 | kenet
306 | kenettä
307 | kennessästä
308 | kerran
309 | kerta
310 | kertaa
311 | kesken
312 | keskimäärin
313 | ketä
314 | ketkä
315 | kiitos
316 | kohti
317 | koko
318 | kokonaan
319 | kolmas
320 | kolme
321 | kolmen
322 | kolmesti
323 | koska
324 | koskaan
325 | kovin
326 | kuin
327 | kuinka
328 | kuitenkaan
329 | kuitenkin
330 | kuka
331 | kukaan
332 | kukin
333 | kumpainen
334 | kumpainenkaan
335 | kumpi
336 | kumpikaan
337 | kumpikin
338 | kun
339 | kuten
340 | kuuden
341 | kuusi
342 | kuutta
343 | kyllä
344 | kymmenen
345 | kyse
346 | lähekkäin
347 | lähellä
348 | lähelle
349 | läheltä
350 | lähemmäs
351 | lähes
352 | lähinnä
353 | lähtien
354 | läpi
355 | liian
356 | liki
357 | lisää
358 | lisäksi
359 | luo
360 | mahdollisimman
361 | mahdollista
362 | me
363 | meidän
364 | meillä
365 | meille
366 | melkein
367 | melko
368 | menee
369 | meneet
370 | menemme
371 | menen
372 | menet
373 | menette
374 | menevät
375 | meni
376 | menimme
377 | menin
378 | menit
379 | menivät
380 | mennessä
381 | mennyt
382 | menossa
383 | mihin
384 | mikä
385 | mikään
386 | mikäli
387 | mikin
388 | miksi
389 | milloin
390 | minä
391 | minne
392 | minun
393 | minut
394 | missä
395 | mistä
396 | mitä
397 | mitään
398 | miten
399 | moi
400 | molemmat
401 | mones
402 | monesti
403 | monet
404 | moni
405 | moniaalla
406 | moniaalle
407 | moniaalta
408 | monta
409 | muassa
410 | muiden
411 | muita
412 | muka
413 | mukaan
414 | mukaansa
415 | mukana
416 | mutta
417 | muu
418 | muualla
419 | muualle
420 | muualta
421 | muuanne
422 | muulloin
423 | muun
424 | muut
425 | muuta
426 | muutama
427 | muutaman
428 | muuten
429 | myöhemmin
430 | myös
431 | myöskään
432 | myöskin
433 | myötä
434 | näiden
435 | näin
436 | näissä
437 | näissähin
438 | näissälle
439 | näissältä
440 | näissästä
441 | näitä
442 | nämä
443 | ne
444 | neljä
445 | neljää
446 | neljän
447 | niiden
448 | niin
449 | niistä
450 | niitä
451 | noin
452 | nopeammin
453 | nopeasti
454 | nopeiten
455 | nro
456 | nuo
457 | nyt
458 | ohi
459 | oikein
460 | ole
461 | olemme
462 | olen
463 | olet
464 | olette
465 | oleva
466 | olevan
467 | olevat
468 | oli
469 | olimme
470 | olin
471 | olisi
472 | olisimme
473 | olisin
474 | olisit
475 | olisitte
476 | olisivat
477 | olit
478 | olitte
479 | olivat
480 | olla
481 | olleet
482 | olli
483 | ollut
484 | oma
485 | omaa
486 | omaan
487 | omaksi
488 | omalle
489 | omalta
490 | oman
491 | omassa
492 | omat
493 | omia
494 | omien
495 | omiin
496 | omiksi
497 | omille
498 | omilta
499 | omissa
500 | omista
501 | on
502 | onkin
503 | onko
504 | ovat
505 | päälle
506 | paikoittain
507 | paitsi
508 | pakosti
509 | paljon
510 | paremmin
511 | parempi
512 | parhaillaan
513 | parhaiten
514 | peräti
515 | perusteella
516 | pian
517 | pieneen
518 | pieneksi
519 | pienellä
520 | pienelle
521 | pieneltä
522 | pienempi
523 | pienestä
524 | pieni
525 | pienin
526 | puolesta
527 | puolestaan
528 | runsaasti
529 | saakka
530 | sadam
531 | sama
532 | samaa
533 | samaan
534 | samalla
535 | samallalta
536 | samallassa
537 | samallasta
538 | saman
539 | samat
540 | samoin
541 | sata
542 | sataa
543 | satojen
544 | se
545 | seitsemän
546 | sekä
547 | sen
548 | seuraavat
549 | siellä
550 | sieltä
551 | siihen
552 | siinä
553 | siis
554 | siitä
555 | sijaan
556 | siksi
557 | sillä
558 | silloin
559 | silti
560 | sinä
561 | sinne
562 | sinua
563 | sinulle
564 | sinulta
565 | sinun
566 | sinussa
567 | sinusta
568 | sinut
569 | sisäkkäin
570 | sisällä
571 | sitä
572 | siten
573 | sitten
574 | suoraan
575 | suuntaan
576 | suuren
577 | suuret
578 | suuri
579 | suuria
580 | suurin
581 | suurten
582 | taa
583 | täällä
584 | täältä
585 | taas
586 | taemmas
587 | tähän
588 | tahansa
589 | tai
590 | takaa
591 | takaisin
592 | takana
593 | takia
594 | tällä
595 | tällöin
596 | tämä
597 | tämän
598 | tänä
599 | tänään
600 | tänne
601 | tapauksessa
602 | tässä
603 | tästä
604 | tätä
605 | täten
606 | tavalla
607 | tavoitteena
608 | täysin
609 | täytyvät
610 | täytyy
611 | te
612 | tietysti
613 | todella
614 | toinen
615 | toisaalla
616 | toisaalle
617 | toisaalta
618 | toiseen
619 | toiseksi
620 | toisella
621 | toiselle
622 | toiselta
623 | toisemme
624 | toisen
625 | toisensa
626 | toisessa
627 | toisesta
628 | toista
629 | toistaiseksi
630 | toki
631 | tosin
632 | tuhannen
633 | tuhat
634 | tule
635 | tulee
636 | tulemme
637 | tulen
638 | tulet
639 | tulette
640 | tulevat
641 | tulimme
642 | tulin
643 | tulisi
644 | tulisimme
645 | tulisin
646 | tulisit
647 | tulisitte
648 | tulisivat
649 | tulit
650 | tulitte
651 | tulivat
652 | tulla
653 | tulleet
654 | tullut
655 | tuntuu
656 | tuo
657 | tuolla
658 | tuolloin
659 | tuolta
660 | tuonne
661 | tuskin
662 | tykö
663 | usea
664 | useasti
665 | useimmiten
666 | usein
667 | useita
668 | uudeksi
669 | uudelleen
670 | uuden
671 | uudet
672 | uusi
673 | uusia
674 | uusien
675 | uusinta
676 | uuteen
677 | uutta
678 | vaan
679 | vähän
680 | vähemmän
681 | vähintään
682 | vähiten
683 | vai
684 | vaiheessa
685 | vaikea
686 | vaikean
687 | vaikeat
688 | vaikeilla
689 | vaikeille
690 | vaikeilta
691 | vaikeissa
692 | vaikeista
693 | vaikka
694 | vain
695 | välillä
696 | varmasti
697 | varsin
698 | varsinkin
699 | varten
700 | vasta
701 | vastaan
702 | vastakkain
703 | verran
704 | vielä
705 | vierekkäin
706 | vieri
707 | viiden
708 | viime
709 | viimeinen
710 | viimeisen
711 | viimeksi
712 | viisi
713 | voi
714 | voidaan
715 | voimme
716 | voin
717 | voisi
718 | voit
719 | voitte
720 | voivat
721 | vuoden
722 | vuoksi
723 | vuosi
724 | vuosien
725 | vuosina
726 | vuotta
727 | yhä
728 | yhdeksän
729 | yhden
730 | yhdessä
731 | yhtä
732 | yhtäällä
733 | yhtäälle
734 | yhtäältä
735 | yhtään
736 | yhteen
737 | yhteensä
738 | yhteydessä
739 | yhteyteen
740 | yksi
741 | yksin
742 | yksittäin
743 | yleensä
744 | ylemmäs
745 | yli
746 | ylös
747 | ympäri
748 |
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/fr.txt:
--------------------------------------------------------------------------------
1 | alors
2 | au
3 | aucuns
4 | aussi
5 | autre
6 | avant
7 | avec
8 | avoir
9 | bon
10 | car
11 | ce
12 | cela
13 | ces
14 | ceux
15 | chaque
16 | ci
17 | comme
18 | comment
19 | dans
20 | des
21 | du
22 | dedans
23 | dehors
24 | depuis
25 | deux
26 | devrait
27 | doit
28 | donc
29 | dos
30 | droite
31 | début
32 | elle
33 | elles
34 | en
35 | encore
36 | essai
37 | est
38 | et
39 | eu
40 | fait
41 | faites
42 | fois
43 | font
44 | force
45 | haut
46 | hors
47 | ici
48 | il
49 | ils
50 | je
51 | juste
52 | la
53 | le
54 | les
55 | leur
56 | là
57 | ma
58 | maintenant
59 | mais
60 | mes
61 | mine
62 | moins
63 | mon
64 | mot
65 | même
66 | ni
67 | nommés
68 | notre
69 | nous
70 | nouveaux
71 | ou
72 | où
73 | par
74 | parce
75 | parole
76 | pas
77 | personnes
78 | peut
79 | peu
80 | pièce
81 | plupart
82 | pour
83 | pourquoi
84 | quand
85 | que
86 | quel
87 | quelle
88 | quelles
89 | quels
90 | qui
91 | sa
92 | sans
93 | ses
94 | seulement
95 | si
96 | sien
97 | son
98 | sont
99 | sous
100 | soyez
101 | sujet
102 | sur
103 | ta
104 | tandis
105 | tellement
106 | tels
107 | tes
108 | ton
109 | tous
110 | tout
111 | trop
112 | très
113 | tu
114 | valeur
115 | voie
116 | voient
117 | vont
118 | votre
119 | vous
120 | vu
121 | ça
122 | étaient
123 | état
124 | étions
125 | été
126 | être
--------------------------------------------------------------------------------
/stopword_lists/mallet_stoplists/jp.txt:
--------------------------------------------------------------------------------
1 | これ
2 | それ
3 | あれ
4 | この
5 | その
6 | あの
7 | ここ
8 | そこ
9 | あそこ
10 | こちら
11 | どこ
12 | だれ
13 | なに
14 | なん
15 | 何
16 | 私
17 | 貴方
18 | 貴方方
19 | 我々
20 | 私達
21 | あの人
22 | あのかた
23 | 彼女
24 | 彼
25 | です
26 | あります
27 | おります
28 | います
29 | は
30 | が
31 | の
32 | に
33 | を
34 | で
35 | え
36 | から
37 | まで
38 | より
39 | も
40 | どの
41 | と
42 | し
43 | それで
44 | しかし
45 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/arabic.txt:
--------------------------------------------------------------------------------
1 | فى
2 | في
3 | كل
4 | لم
5 | لن
6 | له
7 | من
8 | هو
9 | هي
10 | قوة
11 | كما
12 | لها
13 | منذ
14 | وقد
15 | ولا
16 | نفسه
17 | لقاء
18 | مقابل
19 | هناك
20 | وقال
21 | وكان
22 | نهاية
23 | وقالت
24 | وكانت
25 | للامم
26 | فيه
27 | كلم
28 | لكن
29 | وفي
30 | وقف
31 | ولم
32 | ومن
33 | وهو
34 | وهي
35 | يوم
36 | فيها
37 | منها
38 | مليار
39 | لوكالة
40 | يكون
41 | يمكن
42 | مليون
43 | حيث
44 | اكد
45 | الا
46 | اما
47 | امس
48 | السابق
49 | التى
50 | التي
51 | اكثر
52 | ايار
53 | ايضا
54 | ثلاثة
55 | الذاتي
56 | الاخيرة
57 | الثاني
58 | الثانية
59 | الذى
60 | الذي
61 | الان
62 | امام
63 | ايام
64 | خلال
65 | حوالى
66 | الذين
67 | الاول
68 | الاولى
69 | بين
70 | ذلك
71 | دون
72 | حول
73 | حين
74 | الف
75 | الى
76 | انه
77 | اول
78 | ضمن
79 | انها
80 | جميع
81 | الماضي
82 | الوقت
83 | المقبل
84 | اليوم
85 | ـ
86 | ف
87 | و
88 | و6
89 | قد
90 | لا
91 | ما
92 | مع
93 | مساء
94 | هذا
95 | واحد
96 | واضاف
97 | واضافت
98 | فان
99 | قبل
100 | قال
101 | كان
102 | لدى
103 | نحو
104 | هذه
105 | وان
106 | واكد
107 | كانت
108 | واوضح
109 | مايو
110 | ب
111 | ا
112 | أ
113 | ،
114 | عشر
115 | عدد
116 | عدة
117 | عشرة
118 | عدم
119 | عام
120 | عاما
121 | عن
122 | عند
123 | عندما
124 | على
125 | عليه
126 | عليها
127 | زيارة
128 | سنة
129 | سنوات
130 | تم
131 | ضد
132 | بعد
133 | بعض
134 | اعادة
135 | اعلنت
136 | بسبب
137 | حتى
138 | اذا
139 | احد
140 | اثر
141 | برس
142 | باسم
143 | غدا
144 | شخصا
145 | صباح
146 | اطار
147 | اربعة
148 | اخرى
149 | بان
150 | اجل
151 | غير
152 | بشكل
153 | حاليا
154 | بن
155 | به
156 | ثم
157 | اف
158 | ان
159 | او
160 | اي
161 | بها
162 | صفر
163 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/catalan.txt:
--------------------------------------------------------------------------------
1 | a
2 | abans
3 | algun
4 | alguna
5 | algunes
6 | alguns
7 | altre
8 | amb
9 | ambdós
10 | anar
11 | ans
12 | aquell
13 | aquelles
14 | aquells
15 | aquí
16 | bastant
17 | bé
18 | cada
19 | com
20 | consegueixo
21 | conseguim
22 | conseguir
23 | consigueix
24 | consigueixen
25 | consigueixes
26 | dalt
27 | de
28 | des de
29 | dins
30 | el
31 | elles
32 | ells
33 | els
34 | en
35 | ens
36 | entre
37 | era
38 | erem
39 | eren
40 | eres
41 | es
42 | és
43 | éssent
44 | està
45 | estan
46 | estat
47 | estava
48 | estem
49 | esteu
50 | estic
51 | ets
52 | fa
53 | faig
54 | fan
55 | fas
56 | fem
57 | fer
58 | feu
59 | fi
60 | haver
61 | i
62 | inclòs
63 | jo
64 | la
65 | les
66 | llarg
67 | llavors
68 | mentre
69 | meu
70 | mode
71 | molt
72 | molts
73 | nosaltres
74 | o
75 | on
76 | per
77 | per
78 | per que
79 | però
80 | perquè
81 | podem
82 | poden
83 | poder
84 | podeu
85 | potser
86 | primer
87 | puc
88 | quan
89 | quant
90 | qui
91 | sabem
92 | saben
93 | saber
94 | sabeu
95 | sap
96 | saps
97 | sense
98 | ser
99 | seu
100 | seus
101 | si
102 | soc
103 | solament
104 | sols
105 | som
106 | sota
107 | també
108 | te
109 | tene
110 | tenim
111 | tenir
112 | teniu
113 | teu
114 | tinc
115 | tot
116 | últim
117 | un
118 | un
119 | una
120 | unes
121 | uns
122 | ús
123 | va
124 | vaig
125 | van
126 | vosaltres
127 |
128 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/danish.txt:
--------------------------------------------------------------------------------
1 | ad
2 | af
3 | alle
4 | alt
5 | anden
6 | at
7 | blev
8 | blive
9 | bliver
10 | da
11 | de
12 | dem
13 | den
14 | denne
15 | der
16 | deres
17 | det
18 | dette
19 | dig
20 | din
21 | disse
22 | dog
23 | du
24 | efter
25 | eller
26 | en
27 | end
28 | er
29 | et
30 | for
31 | fra
32 | ham
33 | han
34 | hans
35 | har
36 | havde
37 | have
38 | hende
39 | hendes
40 | her
41 | hos
42 | hun
43 | hvad
44 | hvis
45 | hvor
46 | i
47 | ikke
48 | ind
49 | jeg
50 | jer
51 | jo
52 | kunne
53 | man
54 | mange
55 | med
56 | meget
57 | men
58 | mig
59 | min
60 | mine
61 | mit
62 | mod
63 | ned
64 | noget
65 | nogle
66 | nu
67 | når
68 | og
69 | også
70 | om
71 | op
72 | os
73 | over
74 | på
75 | selv
76 | sig
77 | sin
78 | sine
79 | sit
80 | skal
81 | skulle
82 | som
83 | sådan
84 | thi
85 | til
86 | ud
87 | under
88 | var
89 | vi
90 | vil
91 | ville
92 | vor
93 | være
94 | været
95 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/dutch.txt:
--------------------------------------------------------------------------------
1 | aan
2 | al
3 | alles
4 | als
5 | altijd
6 | andere
7 | ben
8 | bij
9 | daar
10 | dan
11 | dat
12 | de
13 | der
14 | deze
15 | die
16 | dit
17 | doch
18 | doen
19 | door
20 | dus
21 | een
22 | eens
23 | en
24 | er
25 | ge
26 | geen
27 | geweest
28 | haar
29 | had
30 | heb
31 | hebben
32 | heeft
33 | hem
34 | het
35 | hier
36 | hij
37 | hoe
38 | hun
39 | iemand
40 | iets
41 | ik
42 | in
43 | is
44 | ja
45 | je
46 | kan
47 | kon
48 | kunnen
49 | maar
50 | me
51 | meer
52 | men
53 | met
54 | mij
55 | mijn
56 | moet
57 | na
58 | naar
59 | niet
60 | niets
61 | nog
62 | nu
63 | of
64 | om
65 | omdat
66 | onder
67 | ons
68 | ook
69 | op
70 | over
71 | reeds
72 | te
73 | tegen
74 | toch
75 | toen
76 | tot
77 | u
78 | uit
79 | uw
80 | van
81 | veel
82 | voor
83 | want
84 | waren
85 | was
86 | wat
87 | werd
88 | wezen
89 | wie
90 | wil
91 | worden
92 | wordt
93 | zal
94 | ze
95 | zelf
96 | zich
97 | zij
98 | zijn
99 | zo
100 | zonder
101 | zou
102 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/english.txt:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | after
5 | again
6 | against
7 | all
8 | am
9 | an
10 | and
11 | any
12 | are
13 | aren't
14 | as
15 | at
16 | be
17 | because
18 | been
19 | before
20 | being
21 | below
22 | between
23 | both
24 | but
25 | by
26 | can't
27 | cannot
28 | could
29 | couldn't
30 | did
31 | didn't
32 | do
33 | does
34 | doesn't
35 | doing
36 | don't
37 | down
38 | during
39 | each
40 | few
41 | for
42 | from
43 | further
44 | had
45 | hadn't
46 | has
47 | hasn't
48 | have
49 | haven't
50 | having
51 | he
52 | he'd
53 | he'll
54 | he's
55 | her
56 | here
57 | here's
58 | hers
59 | herself
60 | him
61 | himself
62 | his
63 | how
64 | how's
65 | i
66 | i'd
67 | i'll
68 | i'm
69 | i've
70 | if
71 | in
72 | into
73 | is
74 | isn't
75 | it
76 | it's
77 | its
78 | itself
79 | let's
80 | me
81 | more
82 | most
83 | mustn't
84 | my
85 | myself
86 | no
87 | nor
88 | not
89 | of
90 | off
91 | on
92 | once
93 | only
94 | or
95 | other
96 | ought
97 | our
98 | ours
99 | ourselves
100 | out
101 | over
102 | own
103 | same
104 | shan't
105 | she
106 | she'd
107 | she'll
108 | she's
109 | should
110 | shouldn't
111 | so
112 | some
113 | such
114 | than
115 | that
116 | that's
117 | the
118 | their
119 | theirs
120 | them
121 | themselves
122 | then
123 | there
124 | there's
125 | these
126 | they
127 | they'd
128 | they'll
129 | they're
130 | they've
131 | this
132 | those
133 | through
134 | to
135 | too
136 | under
137 | until
138 | up
139 | very
140 | was
141 | wasn't
142 | we
143 | we'd
144 | we'll
145 | we're
146 | we've
147 | were
148 | weren't
149 | what
150 | what's
151 | when
152 | when's
153 | where
154 | where's
155 | which
156 | while
157 | who
158 | who's
159 | whom
160 | why
161 | why's
162 | with
163 | won't
164 | would
165 | wouldn't
166 | you
167 | you'd
168 | you'll
169 | you're
170 | you've
171 | your
172 | yours
173 | yourself
174 | yourselves
175 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/finnish.txt:
--------------------------------------------------------------------------------
1 | ei
2 | eivät
3 | emme
4 | en
5 | et
6 | ette
7 | että
8 | he
9 | heidän
10 | heidät
11 | heihin
12 | heille
13 | heillä
14 | heiltä
15 | heissä
16 | heistä
17 | heitä
18 | hän
19 | häneen
20 | hänelle
21 | hänellä
22 | häneltä
23 | hänen
24 | hänessä
25 | hänestä
26 | hänet
27 | häntä
28 | itse
29 | ja
30 | johon
31 | joiden
32 | joihin
33 | joiksi
34 | joilla
35 | joille
36 | joilta
37 | joina
38 | joissa
39 | joista
40 | joita
41 | joka
42 | joksi
43 | jolla
44 | jolle
45 | jolta
46 | jona
47 | jonka
48 | jos
49 | jossa
50 | josta
51 | jota
52 | jotka
53 | kanssa
54 | keiden
55 | keihin
56 | keiksi
57 | keille
58 | keillä
59 | keiltä
60 | keinä
61 | keissä
62 | keistä
63 | keitä
64 | keneen
65 | keneksi
66 | kenelle
67 | kenellä
68 | keneltä
69 | kenen
70 | kenenä
71 | kenessä
72 | kenestä
73 | kenet
74 | ketkä
75 | ketkä
76 | ketä
77 | koska
78 | kuin
79 | kuka
80 | kun
81 | me
82 | meidän
83 | meidät
84 | meihin
85 | meille
86 | meillä
87 | meiltä
88 | meissä
89 | meistä
90 | meitä
91 | mihin
92 | miksi
93 | mikä
94 | mille
95 | millä
96 | miltä
97 | minkä
98 | minkä
99 | minua
100 | minulla
101 | minulle
102 | minulta
103 | minun
104 | minussa
105 | minusta
106 | minut
107 | minuun
108 | minä
109 | minä
110 | missä
111 | mistä
112 | mitkä
113 | mitä
114 | mukaan
115 | mutta
116 | ne
117 | niiden
118 | niihin
119 | niiksi
120 | niille
121 | niillä
122 | niiltä
123 | niin
124 | niin
125 | niinä
126 | niissä
127 | niistä
128 | niitä
129 | noiden
130 | noihin
131 | noiksi
132 | noilla
133 | noille
134 | noilta
135 | noin
136 | noina
137 | noissa
138 | noista
139 | noita
140 | nuo
141 | nyt
142 | näiden
143 | näihin
144 | näiksi
145 | näille
146 | näillä
147 | näiltä
148 | näinä
149 | näissä
150 | näistä
151 | näitä
152 | nämä
153 | ole
154 | olemme
155 | olen
156 | olet
157 | olette
158 | oli
159 | olimme
160 | olin
161 | olisi
162 | olisimme
163 | olisin
164 | olisit
165 | olisitte
166 | olisivat
167 | olit
168 | olitte
169 | olivat
170 | olla
171 | olleet
172 | ollut
173 | on
174 | ovat
175 | poikki
176 | se
177 | sekä
178 | sen
179 | siihen
180 | siinä
181 | siitä
182 | siksi
183 | sille
184 | sillä
185 | sillä
186 | siltä
187 | sinua
188 | sinulla
189 | sinulle
190 | sinulta
191 | sinun
192 | sinussa
193 | sinusta
194 | sinut
195 | sinuun
196 | sinä
197 | sinä
198 | sitä
199 | tai
200 | te
201 | teidän
202 | teidät
203 | teihin
204 | teille
205 | teillä
206 | teiltä
207 | teissä
208 | teistä
209 | teitä
210 | tuo
211 | tuohon
212 | tuoksi
213 | tuolla
214 | tuolle
215 | tuolta
216 | tuon
217 | tuona
218 | tuossa
219 | tuosta
220 | tuota
221 | tähän
222 | täksi
223 | tälle
224 | tällä
225 | tältä
226 | tämä
227 | tämän
228 | tänä
229 | tässä
230 | tästä
231 | tätä
232 | vaan
233 | vai
234 | vaikka
235 | yli
236 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/french.txt:
--------------------------------------------------------------------------------
1 | a
2 | ai
3 | aie
4 | aient
5 | aies
6 | ait
7 | alors
8 | as
9 | au
10 | aucun
11 | aura
12 | aurai
13 | auraient
14 | aurais
15 | aurait
16 | auras
17 | aurez
18 | auriez
19 | aurions
20 | aurons
21 | auront
22 | aussi
23 | autre
24 | aux
25 | avaient
26 | avais
27 | avait
28 | avant
29 | avec
30 | avez
31 | aviez
32 | avions
33 | avoir
34 | avons
35 | ayant
36 | ayez
37 | ayons
38 | bon
39 | car
40 | ce
41 | ceci
42 | cela
43 | ces
44 | cet
45 | cette
46 | ceux
47 | chaque
48 | ci
49 | comme
50 | comment
51 | d
52 | dans
53 | de
54 | dedans
55 | dehors
56 | depuis
57 | des
58 | deux
59 | devoir
60 | devrait
61 | devrez
62 | devriez
63 | devrions
64 | devrons
65 | devront
66 | dois
67 | doit
68 | donc
69 | dos
70 | droite
71 | du
72 | dès
73 | début
74 | dû
75 | elle
76 | elles
77 | en
78 | encore
79 | es
80 | est
81 | et
82 | eu
83 | eue
84 | eues
85 | eurent
86 | eus
87 | eusse
88 | eussent
89 | eusses
90 | eussiez
91 | eussions
92 | eut
93 | eux
94 | eûmes
95 | eût
96 | eûtes
97 | faire
98 | fais
99 | faisez
100 | fait
101 | faites
102 | fois
103 | font
104 | force
105 | furent
106 | fus
107 | fusse
108 | fussent
109 | fusses
110 | fussiez
111 | fussions
112 | fut
113 | fûmes
114 | fût
115 | fûtes
116 | haut
117 | hors
118 | ici
119 | il
120 | ils
121 | j
122 | je
123 | juste
124 | l
125 | la
126 | le
127 | les
128 | leur
129 | leurs
130 | lui
131 | là
132 | m
133 | ma
134 | maintenant
135 | mais
136 | me
137 | mes
138 | moi
139 | moins
140 | mon
141 | mot
142 | même
143 | n
144 | ne
145 | ni
146 | nom
147 | nommé
148 | nommée
149 | nommés
150 | nos
151 | notre
152 | nous
153 | nouveau
154 | nouveaux
155 | on
156 | ont
157 | ou
158 | où
159 | par
160 | parce
161 | parole
162 | pas
163 | personne
164 | personnes
165 | peu
166 | peut
167 | plupart
168 | pour
169 | pourquoi
170 | qu
171 | quand
172 | que
173 | quel
174 | quelle
175 | quelles
176 | quels
177 | qui
178 | sa
179 | sans
180 | se
181 | sera
182 | serai
183 | seraient
184 | serais
185 | serait
186 | seras
187 | serez
188 | seriez
189 | serions
190 | serons
191 | seront
192 | ses
193 | seulement
194 | si
195 | sien
196 | soi
197 | soient
198 | sois
199 | soit
200 | sommes
201 | son
202 | sont
203 | sous
204 | soyez
205 | soyons
206 | suis
207 | sujet
208 | sur
209 | t
210 | ta
211 | tandis
212 | te
213 | tellement
214 | tels
215 | tes
216 | toi
217 | ton
218 | tous
219 | tout
220 | trop
221 | très
222 | tu
223 | un
224 | une
225 | valeur
226 | voient
227 | vois
228 | voit
229 | vont
230 | vos
231 | votre
232 | vous
233 | vu
234 | y
235 | à
236 | ça
237 | étaient
238 | étais
239 | était
240 | étant
241 | état
242 | étiez
243 | étions
244 | été
245 | étés
246 | êtes
247 | être
248 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/german.txt:
--------------------------------------------------------------------------------
1 | aber
2 | alle
3 | allem
4 | allen
5 | aller
6 | alles
7 | als
8 | also
9 | am
10 | an
11 | ander
12 | andere
13 | anderem
14 | anderen
15 | anderer
16 | anderes
17 | anderm
18 | andern
19 | anders
20 | auch
21 | auf
22 | aus
23 | bei
24 | bin
25 | bis
26 | bist
27 | da
28 | damit
29 | dann
30 | das
31 | dass
32 | dasselbe
33 | dazu
34 | daß
35 | dein
36 | deine
37 | deinem
38 | deinen
39 | deiner
40 | deines
41 | dem
42 | demselben
43 | den
44 | denn
45 | denselben
46 | der
47 | derer
48 | derselbe
49 | derselben
50 | des
51 | desselben
52 | dessen
53 | dich
54 | die
55 | dies
56 | diese
57 | dieselbe
58 | dieselben
59 | diesem
60 | diesen
61 | dieser
62 | dieses
63 | dir
64 | doch
65 | dort
66 | du
67 | durch
68 | ein
69 | eine
70 | einem
71 | einen
72 | einer
73 | eines
74 | einig
75 | einige
76 | einigem
77 | einigen
78 | einiger
79 | einiges
80 | einmal
81 | er
82 | es
83 | etwas
84 | euch
85 | euer
86 | eure
87 | eurem
88 | euren
89 | eurer
90 | eures
91 | für
92 | gegen
93 | gewesen
94 | hab
95 | habe
96 | haben
97 | hat
98 | hatte
99 | hatten
100 | hier
101 | hin
102 | hinter
103 | ich
104 | ihm
105 | ihn
106 | ihnen
107 | ihr
108 | ihre
109 | ihrem
110 | ihren
111 | ihrer
112 | ihres
113 | im
114 | in
115 | indem
116 | ins
117 | ist
118 | jede
119 | jedem
120 | jeden
121 | jeder
122 | jedes
123 | jene
124 | jenem
125 | jenen
126 | jener
127 | jenes
128 | jetzt
129 | kann
130 | kein
131 | keine
132 | keinem
133 | keinen
134 | keiner
135 | keines
136 | können
137 | könnte
138 | machen
139 | man
140 | manche
141 | manchem
142 | manchen
143 | mancher
144 | manches
145 | mein
146 | meine
147 | meinem
148 | meinen
149 | meiner
150 | meines
151 | mich
152 | mir
153 | mit
154 | muss
155 | musste
156 | nach
157 | nicht
158 | nichts
159 | noch
160 | nun
161 | nur
162 | ob
163 | oder
164 | ohne
165 | sehr
166 | sein
167 | seine
168 | seinem
169 | seinen
170 | seiner
171 | seines
172 | selbst
173 | sich
174 | sie
175 | sind
176 | so
177 | solche
178 | solchem
179 | solchen
180 | solcher
181 | solches
182 | soll
183 | sollte
184 | sondern
185 | sonst
186 | um
187 | und
188 | uns
189 | unse
190 | unsem
191 | unsen
192 | unser
193 | unses
194 | unter
195 | viel
196 | vom
197 | von
198 | vor
199 | war
200 | waren
201 | warst
202 | was
203 | weg
204 | weil
205 | weiter
206 | welche
207 | welchem
208 | welchen
209 | welcher
210 | welches
211 | wenn
212 | werde
213 | werden
214 | wie
215 | wieder
216 | will
217 | wir
218 | wird
219 | wirst
220 | wo
221 | wollen
222 | wollte
223 | während
224 | würde
225 | würden
226 | zu
227 | zum
228 | zur
229 | zwar
230 | zwischen
231 | über
232 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/hungarian.txt:
--------------------------------------------------------------------------------
1 | a
2 | abban
3 | ahhoz
4 | ahogy
5 | ahol
6 | aki
7 | akik
8 | akkor
9 | alatt
10 | amely
11 | amelyek
12 | amelyekben
13 | amelyeket
14 | amelyet
15 | amelynek
16 | ami
17 | amikor
18 | amit
19 | amolyan
20 | amíg
21 | annak
22 | arra
23 | arról
24 | az
25 | azok
26 | azon
27 | azonban
28 | azt
29 | aztán
30 | azután
31 | azzal
32 | azért
33 | be
34 | belül
35 | benne
36 | bár
37 | cikk
38 | cikkek
39 | cikkeket
40 | csak
41 | de
42 | e
43 | ebben
44 | eddig
45 | egy
46 | egyes
47 | egyetlen
48 | egyik
49 | egyre
50 | egyéb
51 | egész
52 | ehhez
53 | ekkor
54 | el
55 | ellen
56 | első
57 | elég
58 | elő
59 | először
60 | előtt
61 | emilyen
62 | ennek
63 | erre
64 | ez
65 | ezek
66 | ezen
67 | ezt
68 | ezzel
69 | ezért
70 | fel
71 | felé
72 | hanem
73 | hiszen
74 | hogy
75 | hogyan
76 | igen
77 | ill
78 | ill.
79 | illetve
80 | ilyen
81 | ilyenkor
82 | ismét
83 | ison
84 | itt
85 | jobban
86 | jó
87 | jól
88 | kell
89 | kellett
90 | keressünk
91 | keresztül
92 | ki
93 | kívül
94 | között
95 | közül
96 | legalább
97 | legyen
98 | lehet
99 | lehetett
100 | lenne
101 | lenni
102 | lesz
103 | lett
104 | maga
105 | magát
106 | majd
107 | majd
108 | meg
109 | mellett
110 | mely
111 | melyek
112 | mert
113 | mi
114 | mikor
115 | milyen
116 | minden
117 | mindenki
118 | mindent
119 | mindig
120 | mint
121 | mintha
122 | mit
123 | mivel
124 | miért
125 | most
126 | már
127 | más
128 | másik
129 | még
130 | míg
131 | nagy
132 | nagyobb
133 | nagyon
134 | ne
135 | nekem
136 | neki
137 | nem
138 | nincs
139 | néha
140 | néhány
141 | nélkül
142 | olyan
143 | ott
144 | pedig
145 | persze
146 | rá
147 | s
148 | saját
149 | sem
150 | semmi
151 | sok
152 | sokat
153 | sokkal
154 | szemben
155 | szerint
156 | szinte
157 | számára
158 | talán
159 | tehát
160 | teljes
161 | tovább
162 | továbbá
163 | több
164 | ugyanis
165 | utolsó
166 | után
167 | utána
168 | vagy
169 | vagyis
170 | vagyok
171 | valaki
172 | valami
173 | valamint
174 | való
175 | van
176 | vannak
177 | vele
178 | vissza
179 | viszont
180 | volna
181 | volt
182 | voltak
183 | voltam
184 | voltunk
185 | által
186 | általában
187 | át
188 | én
189 | éppen
190 | és
191 | így
192 | ő
193 | ők
194 | őket
195 | össze
196 | úgy
197 | új
198 | újabb
199 | újra
200 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/italian.txt:
--------------------------------------------------------------------------------
1 | a
2 | abbia
3 | abbiamo
4 | abbiano
5 | abbiate
6 | ad
7 | adesso
8 | agl
9 | agli
10 | ai
11 | al
12 | all
13 | alla
14 | alle
15 | allo
16 | allora
17 | altre
18 | altri
19 | altro
20 | anche
21 | ancora
22 | avemmo
23 | avendo
24 | avere
25 | avesse
26 | avessero
27 | avessi
28 | avessimo
29 | aveste
30 | avesti
31 | avete
32 | aveva
33 | avevamo
34 | avevano
35 | avevate
36 | avevi
37 | avevo
38 | avrai
39 | avranno
40 | avrebbe
41 | avrebbero
42 | avrei
43 | avremmo
44 | avremo
45 | avreste
46 | avresti
47 | avrete
48 | avrà
49 | avrò
50 | avuta
51 | avute
52 | avuti
53 | avuto
54 | c
55 | che
56 | chi
57 | ci
58 | coi
59 | col
60 | come
61 | con
62 | contro
63 | cui
64 | da
65 | dagl
66 | dagli
67 | dai
68 | dal
69 | dall
70 | dalla
71 | dalle
72 | dallo
73 | degl
74 | degli
75 | dei
76 | del
77 | dell
78 | della
79 | delle
80 | dello
81 | dentro
82 | di
83 | dov
84 | dove
85 | e
86 | ebbe
87 | ebbero
88 | ebbi
89 | ecco
90 | ed
91 | era
92 | erano
93 | eravamo
94 | eravate
95 | eri
96 | ero
97 | essendo
98 | faccia
99 | facciamo
100 | facciano
101 | facciate
102 | faccio
103 | facemmo
104 | facendo
105 | facesse
106 | facessero
107 | facessi
108 | facessimo
109 | faceste
110 | facesti
111 | faceva
112 | facevamo
113 | facevano
114 | facevate
115 | facevi
116 | facevo
117 | fai
118 | fanno
119 | farai
120 | faranno
121 | fare
122 | farebbe
123 | farebbero
124 | farei
125 | faremmo
126 | faremo
127 | fareste
128 | faresti
129 | farete
130 | farà
131 | farò
132 | fece
133 | fecero
134 | feci
135 | fino
136 | fosse
137 | fossero
138 | fossi
139 | fossimo
140 | foste
141 | fosti
142 | fra
143 | fu
144 | fui
145 | fummo
146 | furono
147 | giù
148 | gli
149 | ha
150 | hai
151 | hanno
152 | ho
153 | i
154 | il
155 | in
156 | io
157 | l
158 | la
159 | le
160 | lei
161 | li
162 | lo
163 | loro
164 | lui
165 | ma
166 | me
167 | mi
168 | mia
169 | mie
170 | miei
171 | mio
172 | ne
173 | negl
174 | negli
175 | nei
176 | nel
177 | nell
178 | nella
179 | nelle
180 | nello
181 | no
182 | noi
183 | non
184 | nostra
185 | nostre
186 | nostri
187 | nostro
188 | o
189 | per
190 | perché
191 | però
192 | più
193 | pochi
194 | poco
195 | qua
196 | quale
197 | quanta
198 | quante
199 | quanti
200 | quanto
201 | quasi
202 | quella
203 | quelle
204 | quelli
205 | quello
206 | questa
207 | queste
208 | questi
209 | questo
210 | qui
211 | quindi
212 | sarai
213 | saranno
214 | sarebbe
215 | sarebbero
216 | sarei
217 | saremmo
218 | saremo
219 | sareste
220 | saresti
221 | sarete
222 | sarà
223 | sarò
224 | se
225 | sei
226 | senza
227 | si
228 | sia
229 | siamo
230 | siano
231 | siate
232 | siete
233 | sono
234 | sopra
235 | sotto
236 | sta
237 | stai
238 | stando
239 | stanno
240 | starai
241 | staranno
242 | stare
243 | starebbe
244 | starebbero
245 | starei
246 | staremmo
247 | staremo
248 | stareste
249 | staresti
250 | starete
251 | starà
252 | starò
253 | stava
254 | stavamo
255 | stavano
256 | stavate
257 | stavi
258 | stavo
259 | stemmo
260 | stesse
261 | stessero
262 | stessi
263 | stessimo
264 | stesso
265 | steste
266 | stesti
267 | stette
268 | stettero
269 | stetti
270 | stia
271 | stiamo
272 | stiano
273 | stiate
274 | sto
275 | su
276 | sua
277 | sue
278 | sugl
279 | sugli
280 | sui
281 | sul
282 | sull
283 | sulla
284 | sulle
285 | sullo
286 | suo
287 | suoi
288 | te
289 | ti
290 | tra
291 | tu
292 | tua
293 | tue
294 | tuo
295 | tuoi
296 | tutti
297 | tutto
298 | un
299 | una
300 | uno
301 | vai
302 | vi
303 | voi
304 | vostra
305 | vostre
306 | vostri
307 | vostro
308 | è
309 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/languages.json:
--------------------------------------------------------------------------------
1 | {
2 | "ar": "arabic",
3 | "ca": "catalan",
4 | "da": "danish",
5 | "nl": "dutch",
6 | "en": "english",
7 | "fi": "finnish",
8 | "fr": "french",
9 | "de": "german",
10 | "hu": "hungarian",
11 | "it": "italian",
12 | "nb": "norwegian",
13 | "pt": "portuguese",
14 | "ro": "romanian",
15 | "ru": "russian",
16 | "es": "spanish",
17 | "sv": "swedish",
18 | "tr": "turkish",
19 | "uk": "ukrainian"
20 | }
21 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/norwegian.txt:
--------------------------------------------------------------------------------
1 | alle
2 | at
3 | av
4 | bare
5 | begge
6 | ble
7 | blei
8 | bli
9 | blir
10 | blitt
11 | både
12 | båe
13 | da
14 | de
15 | deg
16 | dei
17 | deim
18 | deira
19 | deires
20 | dem
21 | den
22 | denne
23 | der
24 | dere
25 | deres
26 | det
27 | dette
28 | di
29 | din
30 | disse
31 | ditt
32 | du
33 | dykk
34 | dykkar
35 | då
36 | eg
37 | ein
38 | eit
39 | eitt
40 | eller
41 | elles
42 | en
43 | enn
44 | er
45 | et
46 | ett
47 | etter
48 | for
49 | fordi
50 | fra
51 | før
52 | ha
53 | hadde
54 | han
55 | hans
56 | har
57 | hennar
58 | henne
59 | hennes
60 | her
61 | hjå
62 | ho
63 | hoe
64 | honom
65 | hoss
66 | hossen
67 | hun
68 | hva
69 | hvem
70 | hver
71 | hvilke
72 | hvilken
73 | hvis
74 | hvor
75 | hvordan
76 | hvorfor
77 | i
78 | ikke
79 | ikkje
80 | ingen
81 | ingi
82 | inkje
83 | inn
84 | inni
85 | ja
86 | jeg
87 | kan
88 | kom
89 | korleis
90 | korso
91 | kun
92 | kunne
93 | kva
94 | kvar
95 | kvarhelst
96 | kven
97 | kvi
98 | kvifor
99 | man
100 | mange
101 | me
102 | med
103 | medan
104 | meg
105 | meget
106 | mellom
107 | men
108 | mi
109 | min
110 | mine
111 | mitt
112 | mot
113 | mykje
114 | ned
115 | no
116 | noe
117 | noen
118 | noka
119 | noko
120 | nokon
121 | nokor
122 | nokre
123 | nå
124 | når
125 | og
126 | også
127 | om
128 | opp
129 | oss
130 | over
131 | på
132 | samme
133 | seg
134 | selv
135 | si
136 | sia
137 | sidan
138 | siden
139 | sin
140 | sine
141 | sitt
142 | sjøl
143 | skal
144 | skulle
145 | slik
146 | so
147 | som
148 | somme
149 | somt
150 | så
151 | sånn
152 | til
153 | um
154 | upp
155 | ut
156 | uten
157 | var
158 | vart
159 | varte
160 | ved
161 | vere
162 | verte
163 | vi
164 | vil
165 | ville
166 | vore
167 | vors
168 | vort
169 | være
170 | vært
171 | vår
172 | å
173 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/portuguese.txt:
--------------------------------------------------------------------------------
1 | a
2 | ao
3 | aos
4 | aquela
5 | aquelas
6 | aquele
7 | aqueles
8 | aquilo
9 | as
10 | até
11 | com
12 | como
13 | da
14 | das
15 | de
16 | dela
17 | delas
18 | dele
19 | deles
20 | depois
21 | do
22 | dos
23 | e
24 | ela
25 | elas
26 | ele
27 | eles
28 | em
29 | entre
30 | era
31 | eram
32 | essa
33 | essas
34 | esse
35 | esses
36 | esta
37 | estamos
38 | estas
39 | estava
40 | estavam
41 | este
42 | esteja
43 | estejam
44 | estejamos
45 | estes
46 | esteve
47 | estive
48 | estivemos
49 | estiver
50 | estivera
51 | estiveram
52 | estiverem
53 | estivermos
54 | estivesse
55 | estivessem
56 | estivéramos
57 | estivéssemos
58 | estou
59 | está
60 | estávamos
61 | estão
62 | eu
63 | foi
64 | fomos
65 | for
66 | fora
67 | foram
68 | forem
69 | formos
70 | fosse
71 | fossem
72 | fui
73 | fôramos
74 | fôssemos
75 | haja
76 | hajam
77 | hajamos
78 | havemos
79 | hei
80 | houve
81 | houvemos
82 | houver
83 | houvera
84 | houveram
85 | houverei
86 | houverem
87 | houveremos
88 | houveria
89 | houveriam
90 | houvermos
91 | houverá
92 | houverão
93 | houveríamos
94 | houvesse
95 | houvessem
96 | houvéramos
97 | houvéssemos
98 | há
99 | hão
100 | isso
101 | isto
102 | já
103 | lhe
104 | lhes
105 | mais
106 | mas
107 | me
108 | mesmo
109 | meu
110 | meus
111 | minha
112 | minhas
113 | muito
114 | na
115 | nas
116 | nem
117 | no
118 | nos
119 | nossa
120 | nossas
121 | nosso
122 | nossos
123 | num
124 | numa
125 | não
126 | nós
127 | o
128 | os
129 | ou
130 | para
131 | pela
132 | pelas
133 | pelo
134 | pelos
135 | por
136 | qual
137 | quando
138 | que
139 | quem
140 | se
141 | seja
142 | sejam
143 | sejamos
144 | sem
145 | serei
146 | seremos
147 | seria
148 | seriam
149 | será
150 | serão
151 | seríamos
152 | seu
153 | seus
154 | somos
155 | sou
156 | sua
157 | suas
158 | são
159 | só
160 | também
161 | te
162 | tem
163 | temos
164 | tenha
165 | tenham
166 | tenhamos
167 | tenho
168 | terei
169 | teremos
170 | teria
171 | teriam
172 | terá
173 | terão
174 | teríamos
175 | teu
176 | teus
177 | teve
178 | tinha
179 | tinham
180 | tive
181 | tivemos
182 | tiver
183 | tivera
184 | tiveram
185 | tiverem
186 | tivermos
187 | tivesse
188 | tivessem
189 | tivéramos
190 | tivéssemos
191 | tu
192 | tua
193 | tuas
194 | tém
195 | tínhamos
196 | um
197 | uma
198 | você
199 | vocês
200 | vos
201 | à
202 | às
203 | éramos
204 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/romanian.txt:
--------------------------------------------------------------------------------
1 | vreo
2 | acelea
3 | cita
4 | degraba
5 | lor
6 | alta
7 | tot
8 | ai
9 | dat
10 | x
11 | despre
12 | peste
13 | bine
14 | dar
15 | foarte
16 | z
17 | avea
18 | multi
19 | cit
20 | alt
21 | mai
22 | sa
23 | fie
24 | tu
25 | multe
26 | e
27 | orice
28 | dintr
29 | se
30 | g
31 | intr
32 | niste
33 | multa
34 | insa
35 | il
36 | fost
37 | a
38 | abia
39 | nimic
40 | sub
41 | acel
42 | in
43 | altceva
44 | si
45 | avem
46 | altfel
47 | c
48 | ea
49 | acest
50 | li
51 | parca
52 | fi
53 | dintre
54 | unele
55 | m
56 | acestei
57 | mare
58 | cel
59 | este
60 | pe
61 | atitia
62 | uneori
63 | acela
64 | iti
65 | astazi
66 | acestui
67 | o
68 | imi
69 | ele
70 | ceilalti
71 | pai
72 | fata
73 | noua
74 | sa-ti
75 | altul
76 | au
77 | i
78 | prin
79 | conform
80 | aceste
81 | anume
82 | azi
83 | k
84 | unul
85 | ala
86 | unei
87 | fara
88 | ei
89 | la
90 | aceeasi
91 | u
92 | inapoi
93 | acestea
94 | acesta
95 | catre
96 | sale
97 | asupra
98 | as
99 | aceea
100 | ba
101 | ale
102 | da
103 | le
104 | apoi
105 | aia
106 | suntem
107 | cum
108 | isi
109 | inainte
110 | s
111 | de
112 | cind
113 | cumva
114 | chiar
115 | acestia
116 | daca
117 | sunt
118 | care
119 | al
120 | numai
121 | cui
122 | sus
123 | tocmai
124 | prea
125 | cu
126 | mi
127 | eu
128 | doar
129 | niciodata
130 | exact
131 | putini
132 | aiurea
133 | tuturor
134 | celor
135 | astfel
136 | atunci
137 | citeva
138 | cat
139 | sau
140 | fel
141 | intre
142 | acolo
143 | nostri
144 | ma
145 | mult
146 | una
147 | ceea
148 | iar
149 | sintem
150 | ati
151 | din
152 | geaba
153 | sai
154 | caruia
155 | adica
156 | inca
157 | are
158 | aici
159 | ca
160 | ia
161 | nici
162 | d
163 | oricum
164 | asta
165 | carora
166 | face
167 | citiva
168 | voi
169 | unor
170 | f
171 | atat
172 | toata
173 | alaturi
174 | cea
175 | nu
176 | totusi
177 | ce
178 | altii
179 | acum
180 | sint
181 | capat
182 | mod
183 | deasupra
184 | cam
185 | vom
186 | b
187 | toate
188 | careia
189 | aceasta
190 | atit
191 | nimeni
192 | ii
193 | ci
194 | unde
195 | ul
196 | plus
197 | era
198 | sa-mi
199 | l
200 | spre
201 | dupa
202 | nou
203 | cele
204 | acea
205 | un
206 | incit
207 | n
208 | cei
209 | or
210 | va
211 | deci
212 | acelasi
213 | atatea
214 | h
215 | vor
216 | decit
217 | noi
218 | cineva
219 | desi
220 | ceva
221 | j
222 | ului
223 | atitea
224 | avut
225 | ar
226 | pina
227 | t
228 | atata
229 | unui
230 | el
231 | citi
232 | asa
233 | totul
234 | pentru
235 | atita
236 | v
237 | alti
238 | asemenea
239 | atatia
240 | te
241 | ne
242 | deja
243 | unii
244 | p
245 | atare
246 | cite
247 | cine
248 | cand
249 | toti
250 | vreun
251 | ori
252 | r
253 | alte
254 | lui
255 | ti
256 | ni
257 | aceia
258 | am
259 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/russian.txt:
--------------------------------------------------------------------------------
1 | а
2 | в
3 | г
4 | е
5 | ж
6 | и
7 | к
8 | м
9 | о
10 | с
11 | т
12 | у
13 | я
14 | бы
15 | во
16 | вы
17 | да
18 | до
19 | ее
20 | ей
21 | ею
22 | её
23 | же
24 | за
25 | из
26 | им
27 | их
28 | ли
29 | мы
30 | на
31 | не
32 | ни
33 | но
34 | ну
35 | нх
36 | об
37 | он
38 | от
39 | по
40 | со
41 | та
42 | те
43 | то
44 | ту
45 | ты
46 | уж
47 | без
48 | был
49 | вам
50 | вас
51 | ваш
52 | вон
53 | вот
54 | все
55 | всю
56 | вся
57 | всё
58 | где
59 | год
60 | два
61 | две
62 | дел
63 | для
64 | его
65 | ему
66 | еще
67 | ещё
68 | или
69 | ими
70 | имя
71 | как
72 | кем
73 | ком
74 | кто
75 | лет
76 | мне
77 | мог
78 | мож
79 | мои
80 | мой
81 | мор
82 | моя
83 | моё
84 | над
85 | нам
86 | нас
87 | наш
88 | нее
89 | ней
90 | нем
91 | нет
92 | нею
93 | неё
94 | них
95 | оба
96 | она
97 | они
98 | оно
99 | под
100 | пор
101 | при
102 | про
103 | раз
104 | сам
105 | сих
106 | так
107 | там
108 | тем
109 | тех
110 | том
111 | тот
112 | тою
113 | три
114 | тут
115 | уже
116 | чем
117 | что
118 | эта
119 | эти
120 | это
121 | эту
122 | алло
123 | буду
124 | будь
125 | бывь
126 | была
127 | были
128 | было
129 | быть
130 | вами
131 | ваша
132 | ваше
133 | ваши
134 | ведь
135 | весь
136 | вниз
137 | всем
138 | всех
139 | всею
140 | года
141 | году
142 | даже
143 | двух
144 | день
145 | если
146 | есть
147 | зато
148 | кого
149 | кому
150 | куда
151 | лишь
152 | люди
153 | мало
154 | меля
155 | меня
156 | мимо
157 | мира
158 | мной
159 | мною
160 | мочь
161 | надо
162 | нами
163 | наша
164 | наше
165 | наши
166 | него
167 | нему
168 | ниже
169 | ними
170 | один
171 | пока
172 | пора
173 | пять
174 | рано
175 | сама
176 | сами
177 | само
178 | саму
179 | свое
180 | свои
181 | свою
182 | себе
183 | себя
184 | семь
185 | стал
186 | суть
187 | твой
188 | твоя
189 | твоё
190 | тебе
191 | тебя
192 | теми
193 | того
194 | тоже
195 | тому
196 | туда
197 | хоть
198 | хотя
199 | чаще
200 | чего
201 | чему
202 | чтоб
203 | чуть
204 | этим
205 | этих
206 | этой
207 | этом
208 | этот
209 | более
210 | будем
211 | будет
212 | будто
213 | будут
214 | вверх
215 | вдали
216 | вдруг
217 | везде
218 | внизу
219 | время
220 | всего
221 | всеми
222 | всему
223 | всюду
224 | давно
225 | даром
226 | долго
227 | друго
228 | жизнь
229 | занят
230 | затем
231 | зачем
232 | здесь
233 | иметь
234 | какая
235 | какой
236 | когда
237 | кроме
238 | лучше
239 | между
240 | менее
241 | много
242 | могут
243 | может
244 | можно
245 | можхо
246 | назад
247 | низко
248 | нужно
249 | одной
250 | около
251 | опять
252 | очень
253 | перед
254 | позже
255 | после
256 | потом
257 | почти
258 | пятый
259 | разве
260 | рядом
261 | самим
262 | самих
263 | самой
264 | самом
265 | своей
266 | своих
267 | сеаой
268 | снова
269 | собой
270 | собою
271 | такая
272 | также
273 | такие
274 | такое
275 | такой
276 | тобой
277 | тобою
278 | тогда
279 | тысяч
280 | уметь
281 | часто
282 | через
283 | чтобы
284 | шесть
285 | этими
286 | этого
287 | этому
288 | близко
289 | больше
290 | будете
291 | будешь
292 | бывает
293 | важная
294 | важное
295 | важные
296 | важный
297 | вокруг
298 | восемь
299 | всегда
300 | второй
301 | далеко
302 | дальше
303 | девять
304 | десять
305 | должно
306 | другая
307 | другие
308 | других
309 | другое
310 | другой
311 | занята
312 | занято
313 | заняты
314 | значит
315 | именно
316 | иногда
317 | каждая
318 | каждое
319 | каждые
320 | каждый
321 | кругом
322 | меньше
323 | начала
324 | нельзя
325 | нибудь
326 | никуда
327 | ничего
328 | обычно
329 | однако
330 | одного
331 | отсюда
332 | первый
333 | потому
334 | почему
335 | просто
336 | против
337 | раньше
338 | самими
339 | самого
340 | самому
341 | своего
342 | сейчас
343 | сказал
344 | совсем
345 | теперь
346 | только
347 | третий
348 | хорошо
349 | хотеть
350 | хочешь
351 | четыре
352 | шестой
353 | восьмой
354 | впрочем
355 | времени
356 | говорил
357 | говорит
358 | девятый
359 | десятый
360 | кажется
361 | конечно
362 | которая
363 | которой
364 | которые
365 | который
366 | которых
367 | наверху
368 | наконец
369 | недавно
370 | немного
371 | нередко
372 | никогда
373 | однажды
374 | посреди
375 | сегодня
376 | седьмой
377 | сказала
378 | сказать
379 | сколько
380 | слишком
381 | сначала
382 | спасибо
383 | человек
384 | двадцать
385 | довольно
386 | которого
387 | наиболее
388 | недалеко
389 | особенно
390 | отовсюду
391 | двадцатый
392 | миллионов
393 | несколько
394 | прекрасно
395 | процентов
396 | четвертый
397 | двенадцать
398 | непрерывно
399 | пожалуйста
400 | пятнадцать
401 | семнадцать
402 | тринадцать
403 | двенадцатый
404 | одиннадцать
405 | пятнадцатый
406 | семнадцатый
407 | тринадцатый
408 | шестнадцать
409 | восемнадцать
410 | девятнадцать
411 | одиннадцатый
412 | четырнадцать
413 | шестнадцатый
414 | восемнадцатый
415 | девятнадцатый
416 | действительно
417 | четырнадцатый
418 | многочисленная
419 | многочисленное
420 | многочисленные
421 | многочисленный
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/spanish.txt:
--------------------------------------------------------------------------------
1 | a
2 | al
3 | algo
4 | algunas
5 | algunos
6 | ante
7 | antes
8 | como
9 | con
10 | contra
11 | cual
12 | cuando
13 | de
14 | del
15 | desde
16 | donde
17 | durante
18 | e
19 | el
20 | ella
21 | ellas
22 | ellos
23 | en
24 | entre
25 | era
26 | erais
27 | eran
28 | eras
29 | eres
30 | es
31 | esa
32 | esas
33 | ese
34 | eso
35 | esos
36 | esta
37 | estaba
38 | estabais
39 | estaban
40 | estabas
41 | estad
42 | estada
43 | estadas
44 | estado
45 | estados
46 | estamos
47 | estando
48 | estar
49 | estaremos
50 | estará
51 | estarán
52 | estarás
53 | estaré
54 | estaréis
55 | estaría
56 | estaríais
57 | estaríamos
58 | estarían
59 | estarías
60 | estas
61 | este
62 | estemos
63 | esto
64 | estos
65 | estoy
66 | estuve
67 | estuviera
68 | estuvierais
69 | estuvieran
70 | estuvieras
71 | estuvieron
72 | estuviese
73 | estuvieseis
74 | estuviesen
75 | estuvieses
76 | estuvimos
77 | estuviste
78 | estuvisteis
79 | estuviéramos
80 | estuviésemos
81 | estuvo
82 | está
83 | estábamos
84 | estáis
85 | están
86 | estás
87 | esté
88 | estéis
89 | estén
90 | estés
91 | fue
92 | fuera
93 | fuerais
94 | fueran
95 | fueras
96 | fueron
97 | fuese
98 | fueseis
99 | fuesen
100 | fueses
101 | fui
102 | fuimos
103 | fuiste
104 | fuisteis
105 | fuéramos
106 | fuésemos
107 | ha
108 | habida
109 | habidas
110 | habido
111 | habidos
112 | habiendo
113 | habremos
114 | habrá
115 | habrán
116 | habrás
117 | habré
118 | habréis
119 | habría
120 | habríais
121 | habríamos
122 | habrían
123 | habrías
124 | habéis
125 | había
126 | habíais
127 | habíamos
128 | habían
129 | habías
130 | han
131 | has
132 | hasta
133 | hay
134 | haya
135 | hayamos
136 | hayan
137 | hayas
138 | hayáis
139 | he
140 | hemos
141 | hube
142 | hubiera
143 | hubierais
144 | hubieran
145 | hubieras
146 | hubieron
147 | hubiese
148 | hubieseis
149 | hubiesen
150 | hubieses
151 | hubimos
152 | hubiste
153 | hubisteis
154 | hubiéramos
155 | hubiésemos
156 | hubo
157 | la
158 | las
159 | le
160 | les
161 | lo
162 | los
163 | me
164 | mi
165 | mis
166 | mucho
167 | muchos
168 | muy
169 | más
170 | mí
171 | mía
172 | mías
173 | mío
174 | míos
175 | nada
176 | ni
177 | no
178 | nos
179 | nosotras
180 | nosotros
181 | nuestra
182 | nuestras
183 | nuestro
184 | nuestros
185 | o
186 | os
187 | otra
188 | otras
189 | otro
190 | otros
191 | para
192 | pero
193 | poco
194 | por
195 | porque
196 | que
197 | quien
198 | quienes
199 | qué
200 | se
201 | sea
202 | seamos
203 | sean
204 | seas
205 | seremos
206 | será
207 | serán
208 | serás
209 | seré
210 | seréis
211 | sería
212 | seríais
213 | seríamos
214 | serían
215 | serías
216 | seáis
217 | sido
218 | siendo
219 | sin
220 | sobre
221 | sois
222 | somos
223 | son
224 | soy
225 | su
226 | sus
227 | suya
228 | suyas
229 | suyo
230 | suyos
231 | sí
232 | también
233 | tanto
234 | te
235 | tendremos
236 | tendrá
237 | tendrán
238 | tendrás
239 | tendré
240 | tendréis
241 | tendría
242 | tendríais
243 | tendríamos
244 | tendrían
245 | tendrías
246 | tened
247 | tenemos
248 | tenga
249 | tengamos
250 | tengan
251 | tengas
252 | tengo
253 | tengáis
254 | tenida
255 | tenidas
256 | tenido
257 | tenidos
258 | teniendo
259 | tenéis
260 | tenía
261 | teníais
262 | teníamos
263 | tenían
264 | tenías
265 | ti
266 | tiene
267 | tienen
268 | tienes
269 | todo
270 | todos
271 | tu
272 | tus
273 | tuve
274 | tuviera
275 | tuvierais
276 | tuvieran
277 | tuvieras
278 | tuvieron
279 | tuviese
280 | tuvieseis
281 | tuviesen
282 | tuvieses
283 | tuvimos
284 | tuviste
285 | tuvisteis
286 | tuviéramos
287 | tuviésemos
288 | tuvo
289 | tuya
290 | tuyas
291 | tuyo
292 | tuyos
293 | tú
294 | un
295 | una
296 | uno
297 | unos
298 | vosotras
299 | vosotros
300 | vuestra
301 | vuestras
302 | vuestro
303 | vuestros
304 | y
305 | ya
306 | yo
307 | él
308 | éramos
309 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/swedish.txt:
--------------------------------------------------------------------------------
1 | alla
2 | allt
3 | att
4 | av
5 | blev
6 | bli
7 | blir
8 | blivit
9 | de
10 | dem
11 | den
12 | denna
13 | deras
14 | dess
15 | dessa
16 | det
17 | detta
18 | dig
19 | din
20 | dina
21 | ditt
22 | du
23 | där
24 | då
25 | efter
26 | ej
27 | eller
28 | en
29 | er
30 | era
31 | ert
32 | ett
33 | från
34 | för
35 | ha
36 | hade
37 | han
38 | hans
39 | har
40 | henne
41 | hennes
42 | hon
43 | honom
44 | hur
45 | här
46 | i
47 | icke
48 | ingen
49 | inom
50 | inte
51 | jag
52 | ju
53 | kan
54 | kunde
55 | man
56 | med
57 | mellan
58 | men
59 | mig
60 | min
61 | mina
62 | mitt
63 | mot
64 | mycket
65 | ni
66 | nu
67 | när
68 | någon
69 | något
70 | några
71 | och
72 | om
73 | oss
74 | på
75 | samma
76 | sedan
77 | sig
78 | sin
79 | sina
80 | sitta
81 | själv
82 | skulle
83 | som
84 | så
85 | sådan
86 | sådana
87 | sådant
88 | till
89 | under
90 | upp
91 | ut
92 | utan
93 | vad
94 | var
95 | vara
96 | varför
97 | varit
98 | varje
99 | vars
100 | vart
101 | vem
102 | vi
103 | vid
104 | vilka
105 | vilkas
106 | vilken
107 | vilket
108 | vår
109 | våra
110 | vårt
111 | än
112 | är
113 | åt
114 | över
115 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/turkish.txt:
--------------------------------------------------------------------------------
1 | mu
2 | onlar
3 | seksen
4 | ama
5 | trilyon
6 | buna
7 | bizim
8 | şeyden
9 | yirmi
10 | altı
11 | iki
12 | seni
13 | doksan
14 | dört
15 | bunun
16 | ki
17 | nereye
18 | altmış
19 | hem
20 | milyon
21 | kez
22 | otuz
23 | beş
24 | elli
25 | bizi
26 | da
27 | sekiz
28 | ve
29 | çok
30 | bu
31 | veya
32 | ya
33 | kırk
34 | onların
35 | ona
36 | bana
37 | yetmiş
38 | milyar
39 | şunu
40 | senden
41 | birşeyi
42 | dokuz
43 | yani
44 | kimi
45 | şeyler
46 | kim
47 | neden
48 | senin
49 | yedi
50 | niye
51 | üç
52 | şey
53 | mı
54 | tüm
55 | onlari
56 | bunda
57 | ise
58 | şundan
59 | hep
60 | şuna
61 | bin
62 | ben
63 | ondan
64 | kimden
65 | bazı
66 | belki
67 | ne
68 | bundan
69 | gibi
70 | de
71 | onlardan
72 | sizi
73 | sizin
74 | daha
75 | niçin
76 | şunda
77 | mi
78 | bunu
79 | beni
80 | ile
81 | şu
82 | şeyi
83 | sizden
84 | defa
85 | biz
86 | için
87 | dahi
88 | siz
89 | nerde
90 | kime
91 | birşey
92 | birkez
93 | her
94 | biri
95 | on
96 | mü
97 | diye
98 | acaba
99 | sen
100 | en
101 | hepsi
102 | bir
103 | bizden
104 | sanki
105 | benim
106 | nerede
107 | onu
108 | benden
109 | yüz
110 | birkaç
111 | çünkü
112 | nasıl
113 | hiç
114 | katrilyon
115 |
--------------------------------------------------------------------------------
/stopword_lists/python_stoplists/ukrainian.txt:
--------------------------------------------------------------------------------
1 | а
2 | б
3 | в
4 | г
5 | е
6 | ж
7 | з
8 | м
9 | т
10 | у
11 | я
12 | є
13 | і
14 | аж
15 | ви
16 | де
17 | до
18 | за
19 | зі
20 | ми
21 | на
22 | не
23 | ну
24 | нх
25 | ні
26 | по
27 | та
28 | ти
29 | то
30 | ту
31 | ті
32 | це
33 | цю
34 | ця
35 | ці
36 | чи
37 | ще
38 | що
39 | як
40 | їй
41 | їм
42 | їх
43 | її
44 | або
45 | але
46 | ало
47 | без
48 | був
49 | вам
50 | вас
51 | ваш
52 | вже
53 | все
54 | всю
55 | вся
56 | від
57 | він
58 | два
59 | дві
60 | для
61 | ким
62 | мож
63 | моя
64 | моє
65 | мої
66 | міг
67 | між
68 | мій
69 | над
70 | нам
71 | нас
72 | наш
73 | нею
74 | неї
75 | них
76 | ніж
77 | ній
78 | ось
79 | при
80 | про
81 | під
82 | пір
83 | раз
84 | рік
85 | сам
86 | сих
87 | сім
88 | так
89 | там
90 | теж
91 | тим
92 | тих
93 | той
94 | тою
95 | три
96 | тут
97 | хоч
98 | хто
99 | цей
100 | цим
101 | цих
102 | час
103 | щоб
104 | яка
105 | які
106 | адже
107 | буде
108 | буду
109 | будь
110 | була
111 | були
112 | було
113 | бути
114 | вами
115 | ваша
116 | ваше
117 | ваші
118 | весь
119 | вниз
120 | вона
121 | вони
122 | воно
123 | всею
124 | всім
125 | всіх
126 | втім
127 | геть
128 | далі
129 | двох
130 | день
131 | дуже
132 | зате
133 | його
134 | йому
135 | каже
136 | кого
137 | коли
138 | кому
139 | крім
140 | куди
141 | лише
142 | люди
143 | мало
144 | мати
145 | мене
146 | мені
147 | миру
148 | мною
149 | може
150 | нами
151 | наша
152 | наше
153 | наші
154 | ними
155 | ніби
156 | один
157 | поки
158 | пора
159 | рано
160 | року
161 | році
162 | сама
163 | саме
164 | саму
165 | самі
166 | свою
167 | своє
168 | свої
169 | себе
170 | собі
171 | став
172 | суть
173 | така
174 | таке
175 | такі
176 | твоя
177 | твоє
178 | твій
179 | тебе
180 | тими
181 | тобі
182 | того
183 | тоді
184 | тому
185 | туди
186 | хоча
187 | хіба
188 | цими
189 | цієї
190 | часу
191 | чого
192 | чому
193 | який
194 | яких
195 | якої
196 | якщо
197 | ім'я
198 | інша
199 | інше
200 | інші
201 | буває
202 | будеш
203 | більш
204 | вгору
205 | вміти
206 | внизу
207 | вісім
208 | давно
209 | даром
210 | добре
211 | довго
212 | друго
213 | дякую
214 | життя
215 | зараз
216 | знову
217 | какая
218 | кожен
219 | кожна
220 | кожне
221 | кожні
222 | краще
223 | ледве
224 | майже
225 | менше
226 | могти
227 | можна
228 | назад
229 | немає
230 | нижче
231 | нього
232 | однак
233 | п'ять
234 | перед
235 | поруч
236 | потім
237 | проти
238 | після
239 | років
240 | самим
241 | самих
242 | самій
243 | свого
244 | своєї
245 | своїх
246 | собою
247 | справ
248 | такий
249 | також
250 | тепер
251 | тисяч
252 | тобою
253 | треба
254 | трохи
255 | усюди
256 | усіма
257 | хочеш
258 | цього
259 | цьому
260 | часто
261 | через
262 | шість
263 | якого
264 | іноді
265 | інший
266 | інших
267 | багато
268 | будемо
269 | будете
270 | будуть
271 | більше
272 | всього
273 | всьому
274 | далеко
275 | десять
276 | досить
277 | другий
278 | дійсно
279 | завжди
280 | звідси
281 | зовсім
282 | кругом
283 | кілька
284 | людина
285 | можуть
286 | навіть
287 | навіщо
288 | нагорі
289 | небудь
290 | низько
291 | ніколи
292 | нікуди
293 | нічого
294 | обидва
295 | одного
296 | однієї
297 | п'ятий
298 | перший
299 | просто
300 | раніше
301 | раптом
302 | самими
303 | самого
304 | самому
305 | сказав
306 | скрізь
307 | сьомий
308 | третій
309 | тільки
310 | хотіти
311 | чотири
312 | чудово
313 | шостий
314 | близько
315 | важлива
316 | важливе
317 | важливі
318 | вдалині
319 | восьмий
320 | говорив
321 | дев'ять
322 | десятий
323 | зайнята
324 | зайнято
325 | зайняті
326 | занадто
327 | значить
328 | навколо
329 | нарешті
330 | нерідко
331 | повинно
332 | посеред
333 | початку
334 | пізніше
335 | сказала
336 | сказати
337 | скільки
338 | спасибі
339 | частіше
340 | важливий
341 | двадцять
342 | дев'ятий
343 | зазвичай
344 | зайнятий
345 | звичайно
346 | здається
347 | найбільш
348 | не можна
349 | недалеко
350 | особливо
351 | потрібно
352 | спочатку
353 | сьогодні
354 | численна
355 | численне
356 | численні
357 | відсотків
358 | двадцятий
359 | звідусіль
360 | мільйонів
361 | нещодавно
362 | прекрасно
363 | четвертий
364 | численний
365 | будь ласка
366 | дванадцять
367 | одинадцять
368 | сімнадцять
369 | тринадцять
370 | безперервно
371 | дванадцятий
372 | одинадцятий
373 | одного разу
374 | п'ятнадцять
375 | сімнадцятий
376 | тринадцятий
377 | шістнадцять
378 | вісімнадцять
379 | п'ятнадцятий
380 | чотирнадцять
381 | шістнадцятий
382 | вісімнадцятий
383 | дев'ятнадцять
384 | чотирнадцятий
385 | дев'ятнадцятий
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/SMART.txt:
--------------------------------------------------------------------------------
1 | a
2 | a's
3 | able
4 | about
5 | above
6 | according
7 | accordingly
8 | across
9 | actually
10 | after
11 | afterwards
12 | again
13 | against
14 | ain't
15 | all
16 | allow
17 | allows
18 | almost
19 | alone
20 | along
21 | already
22 | also
23 | although
24 | always
25 | am
26 | among
27 | amongst
28 | an
29 | and
30 | another
31 | any
32 | anybody
33 | anyhow
34 | anyone
35 | anything
36 | anyway
37 | anyways
38 | anywhere
39 | apart
40 | appear
41 | appreciate
42 | appropriate
43 | are
44 | aren't
45 | around
46 | as
47 | aside
48 | ask
49 | asking
50 | associated
51 | at
52 | available
53 | away
54 | awfully
55 | b
56 | be
57 | became
58 | because
59 | become
60 | becomes
61 | becoming
62 | been
63 | before
64 | beforehand
65 | behind
66 | being
67 | believe
68 | below
69 | beside
70 | besides
71 | best
72 | better
73 | between
74 | beyond
75 | both
76 | brief
77 | but
78 | by
79 | c
80 | c'mon
81 | c's
82 | came
83 | can
84 | can't
85 | cannot
86 | cant
87 | cause
88 | causes
89 | certain
90 | certainly
91 | changes
92 | clearly
93 | co
94 | com
95 | come
96 | comes
97 | concerning
98 | consequently
99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero
572 |
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/catalan.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/catalan.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/danish.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/danish.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/dutch.txt:
--------------------------------------------------------------------------------
1 | de
2 | en
3 | van
4 | ik
5 | te
6 | dat
7 | die
8 | in
9 | een
10 | hij
11 | het
12 | niet
13 | zijn
14 | is
15 | was
16 | op
17 | aan
18 | met
19 | als
20 | voor
21 | had
22 | er
23 | maar
24 | om
25 | hem
26 | dan
27 | zou
28 | of
29 | wat
30 | mijn
31 | men
32 | dit
33 | zo
34 | door
35 | over
36 | ze
37 | zich
38 | bij
39 | ook
40 | tot
41 | je
42 | mij
43 | uit
44 | der
45 | daar
46 | haar
47 | naar
48 | heb
49 | hoe
50 | heeft
51 | hebben
52 | deze
53 | u
54 | want
55 | nog
56 | zal
57 | me
58 | zij
59 | nu
60 | ge
61 | geen
62 | omdat
63 | iets
64 | worden
65 | toch
66 | al
67 | waren
68 | veel
69 | meer
70 | doen
71 | toen
72 | moet
73 | ben
74 | zonder
75 | kan
76 | hun
77 | dus
78 | alles
79 | onder
80 | ja
81 | eens
82 | hier
83 | wie
84 | werd
85 | altijd
86 | doch
87 | wordt
88 | wezen
89 | kunnen
90 | ons
91 | zelf
92 | tegen
93 | na
94 | reeds
95 | wil
96 | kon
97 | niets
98 | uw
99 | iemand
100 | geweest
101 | andere
102 |
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/english.txt:
--------------------------------------------------------------------------------
1 | i
2 | me
3 | my
4 | myself
5 | we
6 | our
7 | ours
8 | ourselves
9 | you
10 | your
11 | yours
12 | yourself
13 | yourselves
14 | he
15 | him
16 | his
17 | himself
18 | she
19 | her
20 | hers
21 | herself
22 | it
23 | its
24 | itself
25 | they
26 | them
27 | their
28 | theirs
29 | themselves
30 | what
31 | which
32 | who
33 | whom
34 | this
35 | that
36 | these
37 | those
38 | am
39 | is
40 | are
41 | was
42 | were
43 | be
44 | been
45 | being
46 | have
47 | has
48 | had
49 | having
50 | do
51 | does
52 | did
53 | doing
54 | would
55 | should
56 | could
57 | ought
58 | i'm
59 | you're
60 | he's
61 | she's
62 | it's
63 | we're
64 | they're
65 | i've
66 | you've
67 | we've
68 | they've
69 | i'd
70 | you'd
71 | he'd
72 | she'd
73 | we'd
74 | they'd
75 | i'll
76 | you'll
77 | he'll
78 | she'll
79 | we'll
80 | they'll
81 | isn't
82 | aren't
83 | wasn't
84 | weren't
85 | hasn't
86 | haven't
87 | hadn't
88 | doesn't
89 | don't
90 | didn't
91 | won't
92 | wouldn't
93 | shan't
94 | shouldn't
95 | can't
96 | cannot
97 | couldn't
98 | mustn't
99 | let's
100 | that's
101 | who's
102 | what's
103 | here's
104 | there's
105 | when's
106 | where's
107 | why's
108 | how's
109 | a
110 | an
111 | the
112 | and
113 | but
114 | if
115 | or
116 | because
117 | as
118 | until
119 | while
120 | of
121 | at
122 | by
123 | for
124 | with
125 | about
126 | against
127 | between
128 | into
129 | through
130 | during
131 | before
132 | after
133 | above
134 | below
135 | to
136 | from
137 | up
138 | down
139 | in
140 | out
141 | on
142 | off
143 | over
144 | under
145 | again
146 | further
147 | then
148 | once
149 | here
150 | there
151 | when
152 | where
153 | why
154 | how
155 | all
156 | any
157 | both
158 | each
159 | few
160 | more
161 | most
162 | other
163 | some
164 | such
165 | no
166 | nor
167 | not
168 | only
169 | own
170 | same
171 | so
172 | than
173 | too
174 | very
175 |
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/finnish.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/finnish.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/french.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/french.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/german.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/german.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/hungarian.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/hungarian.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/italian.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/italian.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/norwegian.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/norwegian.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/portuguese.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/portuguese.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/romanian.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/topicmodels_learning/365a49be7af915638e8741ca3d1b9586eb5c6af6/stopword_lists/tm_stoplists/romanian.txt
--------------------------------------------------------------------------------
/stopword_lists/tm_stoplists/russian.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |