├── .Rbuildignore ├── .Rprofile ├── .gitignore ├── .gitmodules ├── .travis.yml ├── CONDUCT.md ├── DESCRIPTION ├── LICENSE ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── coerce.R ├── corpus_frame.R ├── deprecated.R ├── frame-stats.R ├── frame.R ├── gutenberg.R ├── json.R ├── logging.R ├── sentence.R ├── stem.R ├── style.R ├── term.R ├── text-base.R ├── text-primitive.R ├── text-stats.R ├── text-utils.R ├── text.R ├── text_filter.R ├── text_locate.R ├── text_split.R ├── text_stats.R ├── text_types.R ├── token.R ├── util.R └── wordlist.R ├── README.md ├── TODO.md ├── _pkgdown.yml ├── appveyor.yml ├── bench ├── bench-term_matrix.R ├── bench-term_matrix.Rout └── bench.R ├── data ├── abbreviations.rda ├── abbreviations │ └── 01_make_abbreviations.R ├── affect_wordnet.rda ├── affect_wordnet │ ├── 01_make_tsv.py │ ├── 02_make_rda.R │ ├── README │ ├── wn-affect-1.1 │ │ ├── a-hierarchy.xml │ │ ├── a-synsets.xml │ │ └── readme-wn-affect-1.1.txt │ ├── wnaffect.tsv │ └── wordnet-1.6 │ │ ├── LICENSE │ │ ├── dict │ │ ├── Makefile │ │ ├── adj.exc │ │ ├── adv.exc │ │ ├── cntlist │ │ ├── cousin.exc │ │ ├── cousin.tops │ │ ├── data.adj │ │ ├── data.adv │ │ ├── data.noun │ │ ├── data.verb │ │ ├── index.adj │ │ ├── index.adv │ │ ├── index.gloss │ │ ├── index.noun │ │ ├── index.sense │ │ ├── index.verb │ │ ├── lexnames │ │ ├── noun.exc │ │ ├── sentidx.vrb │ │ ├── sents.vrb │ │ ├── stoplist.pl │ │ └── verb.exc │ │ └── man │ │ └── html │ │ ├── binsrch.htm │ │ ├── cntlist.htm │ │ ├── glossidx.htm │ │ ├── lexnames.htm │ │ ├── morph.htm │ │ ├── morphy.htm │ │ ├── senseidx.htm │ │ ├── taglist.htm │ │ ├── uniqbeg.htm │ │ ├── wn.htm │ │ ├── wnb.htm │ │ ├── wndb.htm │ │ ├── wngloss.htm │ │ ├── wngroups.htm │ │ ├── wninput.htm │ │ ├── wnintro1.htm │ │ ├── wnintro3.htm │ │ ├── wnintro5.htm │ │ ├── wnintro7.htm │ │ ├── wnlicens.htm │ │ ├── wnpkgs.htm │ │ ├── wnsearch.htm │ │ ├── wnstats.htm │ │ └── wnutil.htm ├── federalist.rda ├── federalist │ ├── .gitignore │ ├── 01_download_raw.sh │ ├── 02_make_json.py │ └── 03_make_rda.R ├── sentiment_afinn.rda ├── sentiment_afinn │ ├── 01_download_raw.sh │ ├── 02_make_rda.R │ └── AFINN │ │ ├── AFINN-111.txt │ │ ├── AFINN-96.txt │ │ └── AFINN-README.txt ├── stopwords.rda └── stopwords │ └── 01_make_stopwords.R ├── docs ├── CNAME ├── LICENSE.html ├── articles │ ├── chinese-wordcloud-1.png │ ├── chinese.html │ ├── corpus-emotion-1.png │ ├── corpus-heapslaw-1.png │ ├── corpus-witch-occurrences-1.png │ ├── corpus.html │ ├── gender-estimates-1.png │ ├── gender-estimates_se-1.png │ ├── gender-signif-1.png │ ├── gender.html │ ├── index.html │ ├── stemmer.html │ ├── textdata.html │ └── unicode.html ├── authors.html ├── favicon.ico ├── index.html ├── jquery.sticky-kit.min.js ├── link.svg ├── logo.png ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js └── reference │ ├── abbreviations.html │ ├── affect_wordnet.html │ ├── corpus-deprecated.html │ ├── corpus-package.html │ ├── corpus_frame.html │ ├── corpus_text.html │ ├── federalist.html │ ├── figures │ ├── banner.png │ ├── logo.png │ └── logo │ │ ├── 01_make_logo.R │ │ ├── README │ │ ├── c-07.jpg │ │ ├── logo-slide.tiff │ │ └── logo.key │ ├── gutenberg_corpus.html │ ├── index.html │ ├── new_stemmer.html │ ├── print.corpus_frame.html │ ├── read_ndjson.html │ ├── sentiment_afinn.html │ ├── stem_snowball.html │ ├── stopwords.html │ ├── term_matrix.html │ ├── term_stats.html │ ├── text_filter.html │ ├── text_locate.html │ ├── text_split.html │ ├── text_stats.html │ ├── text_sub.html │ ├── text_tokens.html │ ├── text_types.html │ └── 
utf8.html ├── inst └── WORDLIST ├── man ├── abbreviations.Rd ├── affect_wordnet.Rd ├── corpus-deprecated.Rd ├── corpus-package.Rd ├── corpus_frame.Rd ├── corpus_text.Rd ├── federalist.Rd ├── figures │ ├── banner.png │ ├── logo.png │ └── logo │ │ ├── 01_make_logo.R │ │ ├── README │ │ ├── c-07.jpg │ │ ├── logo-slide.tiff │ │ └── logo.key ├── gutenberg_corpus.Rd ├── new_stemmer.Rd ├── print.corpus_frame.Rd ├── read_ndjson.Rd ├── sentiment_afinn.Rd ├── stem_snowball.Rd ├── stopwords.Rd ├── term_matrix.Rd ├── term_stats.Rd ├── text_filter.Rd ├── text_locate.Rd ├── text_split.Rd ├── text_stats.Rd ├── text_sub.Rd ├── text_tokens.Rd └── text_types.Rd ├── src ├── Makevars ├── context.c ├── decode.c ├── filebuf.c ├── init.c ├── json.c ├── logging.c ├── mkchar.c ├── ndjson.c ├── rcorpus.h ├── search.c ├── stemmer.c ├── term_matrix.c ├── term_stats.c ├── termset.c ├── text.c ├── text_c.c ├── text_filter.c ├── text_locate.c ├── text_methods.c ├── text_nunit.c ├── text_split.c ├── text_sub.c ├── text_tokens.c ├── text_trunc.c ├── text_types.c ├── util.c └── wordlist.c ├── tests ├── testthat.R └── testthat │ ├── helper-capture_output.R │ ├── helper-locale.R │ ├── helper-options.R │ ├── test-foreign.R │ ├── test-frame-stats.R │ ├── test-frame.R │ ├── test-gutenberg_corpus.R │ ├── test-json_record.R │ ├── test-json_scalar.R │ ├── test-json_serialize.R │ ├── test-read_ndjson.R │ ├── test-stemmer.R │ ├── test-term_counts.R │ ├── test-term_matrix.R │ ├── test-term_stats.R │ ├── test-text-stats.R │ ├── test-text.R │ ├── test-text_base.R │ ├── test-text_c.R │ ├── test-text_filter.R │ ├── test-text_format.R │ ├── test-text_index.R │ ├── test-text_locate.R │ ├── test-text_names.R │ ├── test-text_nunit.R │ ├── test-text_primitive.R │ ├── test-text_print.R │ ├── test-text_split_sentences.R │ ├── test-text_split_tokens.R │ ├── test-text_stats.R │ ├── test-text_sub.R │ ├── test-text_tokens.R │ ├── test-text_types.R │ └── test-wordlist.R └── vignettes ├── chinese-wordcloud-1.png ├── chinese.Rmd ├── chinese.Rmd.in ├── corpus-emotion-1.png ├── corpus-heapslaw-1.png ├── corpus-witch-occurrences-1.png ├── corpus.Rmd ├── corpus.Rmd.in ├── gender-estimates-1.png ├── gender-estimates_se-1.png ├── gender-signif-1.png ├── gender.Rmd ├── gender.Rmd.in ├── stemmer.Rmd ├── stemmer.Rmd.in ├── textdata.Rmd └── textdata.Rmd.in /.Rbuildignore: -------------------------------------------------------------------------------- 1 | [.]a$ 2 | [.]o$ 3 | [.]so$ 4 | ^[.]Rprofile$ 5 | ^[.]git 6 | ^[.]travis[.]yml$ 7 | ^_pkgdown[.]yml$ 8 | ^CONDUCT[.]md$ 9 | ^Makefile$ 10 | ^NEWS[.]md$ 11 | ^README[.]md$ 12 | ^TODO[.]md$ 13 | ^appveyor[.]yml$ 14 | ^bench$ 15 | ^data/abbreviations$ 16 | ^data/abbreviations/ 17 | ^data/affect_wordnet$ 18 | ^data/affect_wordnet/ 19 | ^data/federalist$ 20 | ^data/federalist/ 21 | ^data/sentiment_afinn$ 22 | ^data/sentiment_afinn/ 23 | ^data/stopwords$ 24 | ^data/stopwords/ 25 | ^dist$ 26 | ^docs$ 27 | ^man/figures/logo$ 28 | ^man/figures/logo/ 29 | ^src/corpus/[.]git$ 30 | ^src/corpus/[.]travis[.]yml$ 31 | ^src/corpus/Makefile$ 32 | ^src/corpus/data$ 33 | ^src/corpus/lib/utf8lite/[.]git$ 34 | ^src/corpus/lib/utf8lite/[.]travis[.]yml$ 35 | ^src/corpus/lib/utf8lite/Makefile$ 36 | ^src/corpus/lib/utf8lite/data$ 37 | ^vignettes/gender[.]Rmd 38 | ^vignettes/gender-.*png 39 | ^vignettes/.*[.]in$ 40 | ^_pkgdown\.yml$ 41 | -------------------------------------------------------------------------------- /.Rprofile: -------------------------------------------------------------------------------- 1 | if (interactive()) { 2 | 
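# development convenience: in interactive sessions, load the in-progress
# package source with devtools::load_all() when devtools is available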
if (requireNamespace("devtools", quietly = TRUE)) { 3 | devtools::load_all(".") 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *.so 4 | .Rhistory 5 | /NEWS 6 | /README 7 | /dist/ 8 | /docs/articles/*.Rmd.in 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/corpus"] 2 | path = src/corpus 3 | url = https://github.com/patperry/corpus.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: R 2 | cache: packages 3 | 4 | # valgrind (disabled; takes too long, and errors don't affect the build) 5 | # 6 | # r_check_args: '--use-valgrind' 7 | # 8 | # addons: 9 | # apt: 10 | # packages: 11 | # - valgrind 12 | 13 | r_github_packages: 14 | - jimhester/covr 15 | 16 | before_install: 17 | - make NEWS 18 | - make README 19 | 20 | # for devel version of 'utf8', put the following in `before_install`: 21 | # - pushd $(mktemp -d) 22 | # - git clone --recursive https://github.com/patperry/r-utf8.git 23 | # - Rscript -e 'devtools::install("r-utf8")' 24 | # - popd 25 | 26 | matrix: 27 | include: 28 | - os: linux 29 | r: oldrel 30 | - os: linux 31 | dist: trusty 32 | r: release 33 | env: R_CODECOV=true 34 | - os: linux 35 | r: devel 36 | 37 | warnings_are_errors: true 38 | 39 | after_success: 40 | - export LC_ALL="C" 41 | - export TEST_WEB_RESOURCES="true" 42 | - if [[ "${R_CODECOV}" ]]; then Rscript -e 'covr::codecov(line_exclusions = c("R/deprecated.R", "R/wordlist.R", "src/wordlist.c", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))'; fi 43 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, gender identity and expression, level of 9 | experience, nationality, personal appearance, race, religion, or sexual 10 | identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an 52 | appointed representative at an online or offline event. Representation of a 53 | project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by opening an issue or contacting one or more of the project 59 | maintainers. All complaints will be reviewed and investigated and will result 60 | in a response that is deemed necessary and appropriate to the circumstances. 61 | The project team is obligated to maintain confidentiality with regard to the 62 | reporter of an incident. Further details of specific enforcement policies may 63 | be posted separately. 64 | 65 | Project maintainers who do not follow or enforce the Code of Conduct in good 66 | faith may face temporary or permanent repercussions as determined by other 67 | members of the project's leadership. 
68 | 69 | ## Attribution 70 | 71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 72 | version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 73 | 74 | [homepage]: http://contributor-covenant.org 75 | [version]: http://contributor-covenant.org/version/1/4/ 76 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: corpus 2 | Version: 0.10.0.9000 3 | Title: Text Corpus Analysis 4 | Authors@R: c( 5 | person(c("Patrick", "O."), "Perry", 6 | role = c("aut", "cph", "cre"), 7 | email = "pperry@stern.nyu.edu"), 8 | person(c("Finn", "\u00c5rup"), "Nielsen", 9 | role = c("cph", "dtc"), 10 | comment = "AFINN Sentiment Lexicon"), 11 | person("Martin Porter and Richard Boulton", 12 | role = c("ctb", "cph", "dtc"), 13 | comment = "Snowball Stemmer and Stopword Lists"), 14 | person("The Regents of the University of California", 15 | role = c("ctb", "cph"), 16 | comment = "Strtod Library Procedure"), 17 | person("Carlo Strapparava and Alessandro Valitutti", 18 | role = c("cph", "dtc"), 19 | comment = "WordNet-Affect Lexicon"), 20 | person("Unicode, Inc.", 21 | role = c("cph", "dtc"), 22 | comment = "Unicode Character Database")) 23 | Depends: 24 | R (>= 3.3), 25 | Imports: 26 | stats, 27 | utf8 (>= 1.1.0) 28 | Suggests: 29 | knitr, 30 | Matrix, 31 | testthat 32 | Enhances: 33 | quanteda, 34 | tm 35 | Description: Text corpus data analysis, with full support for international text (Unicode). Functions for reading data from newline-delimited 'JSON' files, for normalizing and tokenizing text, for searching for term occurrences, and for computing term occurrence frequencies, including n-grams. 36 | License: Apache License (== 2.0) | file LICENSE 37 | URL: http://corpustext.com, 38 | https://github.com/patperry/r-corpus 39 | BugReports: https://github.com/patperry/r-corpus/issues 40 | LazyData: Yes 41 | Encoding: UTF-8 42 | VignetteBuilder: knitr 43 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RSCRIPT= Rscript --vanilla 2 | CORPUS_LIB= src/corpus.so 3 | BUILT_VIGNETTES= \ 4 | vignettes/chinese.Rmd vignettes/corpus.Rmd vignettes/gender.Rmd \ 5 | vignettes/stemmer.Rmd vignettes/textdata.Rmd 6 | 7 | all: $(CORPUS_LIB) $(BUILT_VIGNETTES) 8 | 9 | $(CORPUS_LIB): 10 | $(RSCRIPT) -e 'devtools::compile_dll(".")' 11 | 12 | NEWS: NEWS.md 13 | sed -e 's/^### //g; s/`//g' $< > $@ 14 | 15 | README: README.md 16 | sed -e '/\*Corpus\*/,$$!d' \ 17 | -e 's/…../.../' \ 18 | -e 's/..…/.../' \ 19 | -e 's/⋮/./' $< > $@ 20 | 21 | vignettes/%.Rmd: vignettes/%.Rmd.in 22 | $(RSCRIPT) -e 'devtools::load_all("."); setwd("vignettes"); knitr::knit(basename("$<"), basename("$@"))' 23 | 24 | bench: 25 | $(RSCRIPT) -e 'devtools::load_all("."); source("bench/bench.R")' 26 | 27 | check: $(CORPUS_LIB) 28 | $(RSCRIPT) -e 'devtools::test(".")' 29 | 30 | clean: 31 | $(RSCRIPT) -e 'devtools::clean_dll(".")' 32 | 33 | cov: 34 | $(RSCRIPT) -e 'covr::package_coverage(line_exclusions = c("R/deprecated.R", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))' 35 | 36 | dist: $(BUILT_VIGNETTES) NEWS README 37 | mkdir -p dist && cd dist && R CMD build .. 
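# 'dist' builds a source tarball under ./dist; 'distclean' below also removes
# the vignettes generated from the .Rmd.in templates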
38 | 39 | distclean: clean 40 | rm -rf $(BUILT_VIGNETTES) 41 | 42 | doc: $(BUILT_VIGNETTES) NEWS README 43 | 44 | install: $(CORPUS_LIB) 45 | $(RSCRIPT) -e 'devtools::install(".")' 46 | 47 | site: $(BUILT_VIGNETTES) 48 | $(RSCRIPT) -e 'pkgdown::build_site(".")' 49 | 50 | .PHONY: all bench check clean con dist distclean doc install site 51 | -------------------------------------------------------------------------------- /R/deprecated.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /R/frame-stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | na.fail.corpus_frame <- function(object, ...) 16 | { 17 | if (!any(vapply(object, anyNA, FALSE))) 18 | object 19 | else stop("missing values in object") 20 | } 21 | 22 | 23 | na.omit.corpus_frame <- function(object, ...) 24 | { 25 | if (!any(vapply(object, anyNA, FALSE))) { 26 | return(object) 27 | } 28 | 29 | # find the missing entries; cast to a matrix 30 | na <- matrix(c(lapply(object, is.na), recursive = TRUE), 31 | ncol = length(object)) 32 | 33 | # find rows containing missing entries 34 | omit <- which(apply(na, 1, any)) 35 | names(omit) <- rownames(object)[omit] 36 | 37 | # drop the rows that miss observations 38 | object <- object[-omit,,drop = FALSE] 39 | attr(omit, "class") <- "omit" 40 | attr(object, "na.action") <- omit 41 | object 42 | } 43 | 44 | 45 | na.exclude.corpus_frame <- function(object, ...) 46 | { 47 | object <- na.omit.corpus_frame(object, ...) 48 | exclude <- attr(object, "na.action") 49 | if (is.null(exclude)) { 50 | return(object) 51 | } 52 | 53 | attr(exclude, "class") <- "exclude" 54 | attr(object, "na.action") <- exclude 55 | object 56 | } 57 | -------------------------------------------------------------------------------- /R/logging.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | logging_off <- function() 17 | { 18 | .Call(C_logging_off) 19 | } 20 | 21 | 22 | logging_on <- function() 23 | { 24 | .Call(C_logging_on) 25 | } 26 | -------------------------------------------------------------------------------- /R/sentence.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_nsentence <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | .Call(C_text_nsentence, x) 22 | } 23 | -------------------------------------------------------------------------------- /R/stem.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
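# Usage sketches for the stemmers defined below (illustrative only):
#
#   stem_snowball(c("running", "jumps"))     # "run" "jump" (default "en" algorithm)
#
#   stem <- new_stemmer(c("geese", "mice"), c("goose", "mouse"))
#   stem(c("geese", "cats"))                 # "goose" "cats"; terms without a
#                                            # match pass through unchanged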
14 | 15 | 16 | stem_snowball <- function(x, algorithm = "en") 17 | { 18 | with_rethrow({ 19 | x <- as_character_vector("x", x) 20 | algorithm <- as_snowball_algorithm("algorithm", algorithm) 21 | }) 22 | 23 | .Call(C_stem_snowball, x, algorithm) 24 | } 25 | 26 | 27 | new_stemmer<- function(term, stem, default = NULL, duplicates = "first", 28 | vectorize = TRUE) 29 | { 30 | call <- sys.call() 31 | with_rethrow({ 32 | term <- as_character_vector("term", term) 33 | stem <- as_character_vector("stem", stem) 34 | default <- as_character_scalar("default", default) 35 | duplicates <- as_enum("duplicates", duplicates, 36 | c("first", "last", "omit", "fail")) 37 | }) 38 | 39 | if (is.null(term)) { 40 | term <- character() 41 | } 42 | 43 | if (is.null(stem)) { 44 | stem <- character() 45 | } 46 | 47 | if (length(term) != length(stem)) { 48 | stop("'term' argument length must equal 'stem' argument length") 49 | } 50 | 51 | if (duplicates == "last") { 52 | term <- rev(term) 53 | stem <- rev(stem) 54 | duplicates <- "first" 55 | } 56 | 57 | if (duplicates != "first") { 58 | dup <- duplicated(term) 59 | if (duplicates == "omit") { 60 | dups <- term[dup] 61 | rm <- term %in% dups 62 | term <- term[!rm] 63 | stem <- stem[!rm] 64 | } else if (any(dup)) { # duplicates == "fail" 65 | stop("'term' argument entries must be unique") 66 | } 67 | } 68 | 69 | # parse dynamically so that we can add a comment with the function call 70 | comment <- paste(" #", deparse(call), collapse = "\n") 71 | if (is.null(default)) { 72 | src <- paste('function(x) {', 73 | comment, 74 | ' i <- match(x, term, 0L)', 75 | ' if (i > 0L)', 76 | ' stem[[i]]', 77 | ' else x', 78 | '}', 79 | sep = '\n') 80 | } else { 81 | src <- paste('function(x) {', 82 | comment, 83 | ' i <- match(x, term, 0L)', 84 | ' if (i > 0L)', 85 | ' stem[[i]]', 86 | ' else default', 87 | '}', 88 | sep = '\n') 89 | } 90 | 91 | env <- new.env() 92 | assign("term", term, env) 93 | assign("stem", stem, env) 94 | assign("default", default, env) 95 | stem_term <- eval(parse(text = src), env) 96 | 97 | if (vectorize) { 98 | vsrc <- paste('function(x) {', 99 | comment, 100 | ' vapply(x, stem_term, "", USE.NAMES = !is.null(names(x)))', 101 | '}', 102 | sep = '\n') 103 | assign("stem_term", stem_term, env) 104 | stem_term <- eval(parse(text = vsrc, keep.source = TRUE), env) 105 | } 106 | 107 | stem_term 108 | } 109 | -------------------------------------------------------------------------------- /R/style.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | # RStudio doesn't support ANSI faint, use gray instead 17 | style_faint <- "38;5;246" #666666 18 | 19 | # RStudio ANSI bold is broken, use color instead 20 | # https://github.com/rstudio/rstudio/issues/1721 21 | style_bold <- "38;5;203" #FF3333 22 | #style_bold <- "36" # cyan 23 | #style_bold <- "38;5;63" #3333FF 24 | -------------------------------------------------------------------------------- /R/text-stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | na.fail.corpus_text <- function(object, ...) 16 | { 17 | if (!anyNA(object)) 18 | object 19 | else stop("missing values in object") 20 | } 21 | 22 | 23 | na.omit.corpus_text <- function(object, ...) 24 | { 25 | if (!anyNA(object)) { 26 | return(object) 27 | } 28 | 29 | omit <- which(is.na(object)) 30 | names(omit) <- names(object)[omit] 31 | object <- object[-omit] 32 | attr(omit, "class") <- "omit" 33 | attr(object, "na.action") <- omit 34 | object 35 | } 36 | 37 | 38 | na.exclude.corpus_text <- function(object, ...) 39 | { 40 | object <- na.omit.corpus_text(object, ...) 41 | exclude <- attr(object, "na.action") 42 | if (is.null(exclude)) { 43 | return(object) 44 | } 45 | 46 | attr(exclude, "class") <- "exclude" 47 | attr(object, "na.action") <- exclude 48 | object 49 | } 50 | -------------------------------------------------------------------------------- /R/text-utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | str.corpus_text <- function(object, ...) 16 | { 17 | n <- length(object) 18 | if (n == 0) { 19 | "text(0)" 20 | } else { 21 | paste0("text [1:", n, "]") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /R/text.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | as_corpus_text <- function(x, filter = NULL, ..., names = NULL) 16 | { 17 | UseMethod("as_corpus_text") 18 | } 19 | 20 | 21 | as_corpus_text.default <- function(x, filter = NULL, ..., names = NULL) 22 | { 23 | if (length(dim(x)) > 1) { 24 | stop("cannot convert multi-dimensional array to text") 25 | } 26 | 27 | x <- structure(as.character(x), names = names(x)) 28 | as_corpus_text(x, filter = filter, ..., names = names) 29 | } 30 | 31 | 32 | as_corpus_text.character <- function(x, filter = NULL, ..., names = NULL) 33 | { 34 | if (length(dim(x)) > 1) { 35 | stop("cannot convert multi-dimensional array to text") 36 | } 37 | 38 | with_rethrow({ 39 | x <- as_utf8(x) 40 | }) 41 | 42 | if (is.null(names)) { 43 | names <- names(x) 44 | if (anyDuplicated(names)) { 45 | warning("renaming entries with duplicate names") 46 | names <- make.unique(names) 47 | } 48 | } 49 | 50 | x <- .Call(C_as_text_character, x, NULL) 51 | as_corpus_text(x, filter = filter, ..., names = names) 52 | } 53 | 54 | 55 | as_corpus_text.corpus_json <- function(x, filter = NULL, ..., names = NULL) 56 | { 57 | if (length(dim(x)) == 2) { 58 | if (!"text" %in% names(x)) { 59 | stop("no column named \"text\" in JSON object") 60 | } 61 | x <- x[["text"]] 62 | } else { 63 | x <- .Call(C_as_text_json, x, NULL) 64 | } 65 | as_corpus_text(x, filter = filter, ..., names = names) 66 | } 67 | 68 | 69 | as_corpus_text.corpus_text <- function(x, filter = NULL, ..., names = NULL) 70 | { 71 | if (!is_corpus_text(x)) { 72 | stop("argument is not a valid text object") 73 | } 74 | 75 | with_rethrow({ 76 | filter <- as_filter("filter", filter) 77 | names <- as_names("names", names, length(x)) 78 | }) 79 | 80 | attrs <- attributes(x) 81 | for (a in names(attrs)) { 82 | if (!a %in% c("class", "names")) { 83 | attr(x, a) <- NULL 84 | } 85 | } 86 | attr(x, "class") <- "corpus_text" 87 | 88 | if (!is.null(names)) { 89 | names(x) <- names 90 | } 91 | if (!is.null(filter)) { 92 | text_filter(x) <- filter 93 | } 94 | 95 | props <- list(...) 
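# any remaining named arguments in ... are treated as text filter
# properties and applied to the result below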
96 | if (length(props) > 0) { 97 | pnames <- names(props) 98 | if (is.null(pnames) || any(pnames == "")) { 99 | stop("unnamed arguments are not allowed") 100 | } 101 | f <- text_filter(x) 102 | for (name in names(props)) { 103 | f[[name]] <- props[[name]] 104 | } 105 | text_filter(x) <- f 106 | } 107 | 108 | x 109 | } 110 | 111 | 112 | as_corpus_text.data.frame <- function(x, filter = NULL, ..., names = NULL) 113 | { 114 | if (!is.data.frame(x)) { 115 | stop("argument is not a valid data frame") 116 | } 117 | if (!"text" %in% names(x)) { 118 | stop("no column named \"text\" in data frame") 119 | } 120 | 121 | text <- x[["text"]] 122 | if (.row_names_info(x) > 0) { 123 | names(text) <- row.names(x) 124 | } 125 | 126 | as_corpus_text(text, filter = filter, ..., names = names) 127 | } 128 | 129 | 130 | # tm::Corpus 131 | as_corpus_text.Corpus <- function(x, filter = NULL, ..., names = NULL) 132 | { 133 | with_package("tm", { 134 | x <- vapply(x, as.character, "") 135 | }) 136 | as_corpus_text(x, filter = filter, ..., names = names) 137 | } 138 | 139 | # quanteda::corpus 140 | as_corpus_text.corpus <- function(x, filter = NULL, ..., names = NULL) 141 | { 142 | with_package("quanteda", { 143 | text <- quanteda::texts(x) 144 | }) 145 | as_corpus_text(text, filter = filter, ..., names = names) 146 | } 147 | 148 | 149 | is_corpus_text <- function(x) 150 | { 151 | if (!inherits(x, "corpus_text")) { 152 | return(FALSE) 153 | } 154 | .Call(C_text_valid, x) 155 | } 156 | -------------------------------------------------------------------------------- /R/text_split.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_split <- function(x, units = "sentences", size = 1, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | units <- as_enum("units", units, choices = c("sentences", "tokens")) 21 | size <- as_size(size) 22 | }) 23 | 24 | if (units == "sentences") { 25 | ans <- .Call(C_text_split_sentences, x, size) 26 | } else { 27 | stopifnot(units == "tokens") 28 | ans <- .Call(C_text_split_tokens, x, size) 29 | } 30 | 31 | ans$parent <- structure(as.integer(ans$parent), class = "factor", 32 | levels = labels(x)) 33 | ans 34 | } 35 | -------------------------------------------------------------------------------- /R/text_stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_stats <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | 22 | ans <- data.frame(tokens = text_ntoken(x), 23 | types = text_ntype(x), 24 | sentences = text_nsentence(x), 25 | row.names = names(x)) 26 | class(ans) <- c("corpus_frame", "data.frame") 27 | ans 28 | } 29 | -------------------------------------------------------------------------------- /R/text_types.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_ntype <- function(x, filter = NULL, collapse = FALSE, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | collapse <- as_option("collapse", collapse) 21 | }) 22 | .Call(C_text_ntype, x, collapse) 23 | } 24 | 25 | 26 | text_types <- function(x, filter = NULL, collapse = FALSE, ...) 27 | { 28 | with_rethrow({ 29 | x <- as_corpus_text(x, filter, ...) 30 | collapse <- as_option("collapse", collapse) 31 | }) 32 | typs <- .Call(C_text_types, x, collapse) 33 | if (collapse) { 34 | typs <- sort(typs, method = "radix") 35 | } else { 36 | typs <- lapply(typs, sort, method = "radix") 37 | } 38 | typs 39 | } 40 | -------------------------------------------------------------------------------- /R/token.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_tokens <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | .Call(C_text_tokens, x) 22 | } 23 | 24 | 25 | text_ntoken <- function(x, filter = NULL, ...) 26 | { 27 | with_rethrow({ 28 | x <- as_corpus_text(x, filter, ...) 29 | }) 30 | .Call(C_text_ntoken, x) 31 | } 32 | 33 | 34 | text_sub <- function(x, start = 1L, end = -1L, filter = NULL, ...) 
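# extracts token subsequences; 'start' is recycled along x and may be a
# two-column [start, end] matrix, in which case 'end' is ignored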
35 | { 36 | with_rethrow({ 37 | x <- as_corpus_text(x, filter, ...) 38 | }) 39 | n <- length(x) 40 | 41 | if (!(is.numeric(start) 42 | && (length(dim(start)) <= 1 43 | || is.matrix(start) && ncol(start) == 2))) { 44 | stop("'start' must be an integer vector or two-column matrix") 45 | } 46 | 47 | nstart <- if (is.matrix(start)) nrow(start) else length(start) 48 | if ((nstart == 0 && n > 0) || (nstart > 0 && n %% nstart != 0)) { 49 | stop("'start' length does not evenly divide argument length") 50 | } 51 | 52 | if (is.matrix(start)) { 53 | if (!missing(end)) { 54 | warning("'end' argument is ignored when 'start' is a two-column matrix") 55 | } 56 | end <- as.integer(start[,2]) 57 | start <- as.integer(start[,1]) 58 | } else { 59 | start <- as.integer(start) 60 | 61 | if (!(is.numeric(end) && length(dim(end)) <= 1)) { 62 | stop("'end' must be an integer vector") 63 | } 64 | 65 | nend <- length(end) 66 | if ((nend == 0 && n > 0) || (nend > 0 && n %% nend != 0)) { 67 | stop("'end' length does not evenly divide argument length") 68 | } 69 | end <- as.integer(end) 70 | } 71 | 72 | .Call(C_text_sub, x, start, end) 73 | } 74 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | with_rethrow <- function(expr) 17 | { 18 | parentcall <- sys.call(-1) 19 | eval(envir = parent.frame(), 20 | withCallingHandlers(expr, 21 | error = function(e, call = parentcall) { 22 | e$call <- call 23 | stop(e) 24 | }, 25 | warning = function(w, call = parentcall) { 26 | w$call <- call 27 | warning(w) 28 | invokeRestart("muffleWarning") 29 | }, 30 | message = function(m, call = parentcall) { 31 | m$call <- call 32 | } 33 | ) 34 | ) 35 | } 36 | 37 | 38 | with_package <- function(package, expr) 39 | { 40 | if (!isNamespaceLoaded(package)) { 41 | if (!requireNamespace(package, quietly = TRUE)) { 42 | stop(sprintf("Failed attaching name space for package '%s'", 43 | package)) 44 | } 45 | } 46 | 47 | force(expr) 48 | expr 49 | } 50 | -------------------------------------------------------------------------------- /R/wordlist.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
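# deprecated word-list accessors: the functions below signal .Deprecated()
# in favor of the packaged data sets (abbreviations_*, stopwords_*)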
14 | 15 | 16 | abbreviations <- function(kind = "english") 17 | { 18 | .Deprecated("abbreviations_en") 19 | with_rethrow({ 20 | wordlist(kind, function(k) .Call(C_abbreviations, k)) 21 | }) 22 | } 23 | 24 | 25 | stopwords <- function(kind = "english") 26 | { 27 | .Deprecated("stopwords_en") 28 | with_rethrow({ 29 | wordlist(kind, function(k) .Call(C_stopwords, k)) 30 | }) 31 | } 32 | 33 | 34 | wordlist <- function(kind, call) 35 | { 36 | kind <- as_kind(kind) 37 | 38 | words <- character() 39 | for (k in kind) { 40 | wk <- call(k) 41 | words <- c(words, wk) 42 | } 43 | 44 | if (length(words) == 0) { 45 | return(NULL) 46 | } 47 | 48 | words <- unique(sort(words, method = "radix")) 49 | words 50 | } 51 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | To Do 2 | ===== 3 | 4 | Bugs 5 | ---- 6 | 7 | (no known bugs) 8 | 9 | 10 | Features 11 | -------- 12 | 13 | * Add a `text_untoken()` function to turn token sequence into text: 14 | insert word-joiner (U+2060) to keep multi-word phrases together; 15 | put specified space character (ZWSP or SP) between tokens 16 | 17 | * wrap.pad, width arguments to `utf8_print` 18 | 19 | * `token_kind` and `token_map` functions (?) 20 | 21 | * Add demonstration of dictionary scaling with `text_match`: 22 | 23 | m <- text_match(x, dict$term) 24 | score <- tapply(dict$score[m$term], m$text, mean, default = 0) 25 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | params: 3 | ganalytics: UA-4636081-3 4 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | - ps: Bootstrap 12 | - git submodule update --init --recursive 13 | 14 | cache: 15 | - C:\RLibrary 16 | 17 | # Adapt as necessary starting from here 18 | 19 | build_script: 20 | - travis-tool.sh install_deps 21 | 22 | test_script: 23 | - travis-tool.sh run_tests 24 | 25 | on_failure: 26 | - 7z a failure.zip *.Rcheck\* 27 | - appveyor PushArtifact failure.zip 28 | 29 | artifacts: 30 | - path: '*.Rcheck\**\*.log' 31 | name: Logs 32 | 33 | - path: '*.Rcheck\**\*.out' 34 | name: Logs 35 | 36 | - path: '*.Rcheck\**\*.fail' 37 | name: Logs 38 | 39 | - path: '*.Rcheck\**\*.Rout' 40 | name: Logs 41 | 42 | - path: '\*_*.tar.gz' 43 | name: Bits 44 | 45 | - path: '\*_*.zip' 46 | name: Bits 47 | -------------------------------------------------------------------------------- /bench/bench-term_matrix.R: -------------------------------------------------------------------------------- 1 | library("dplyr", warn.conflicts = FALSE) 2 | library("janeaustenr") 3 | library("magrittr") 4 | library("stringr") 5 | 6 | lines <- (austen_books() 7 | %>% group_by(book) 8 | %>% mutate( 9 | linenumber = row_number(), 10 | chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 11 | ignore_case = TRUE)))) 12 | %>% ungroup()) 13 | 14 | text <- c(tapply(lines$text, paste(lines$book, lines$chapter), 15 | 
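# collapse each book chapter's lines into a single text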
paste, collapse = "\n")) 16 | if (packageVersion("janeaustenr") < '0.1.5') { 17 | text <- iconv(text, "latin1", "UTF-8") 18 | } 19 | 20 | stop_words <- stopwords("english") 21 | 22 | make_matrix <- function(text, ngrams = 1) { 23 | f <- corpus::token_filter(stemmer = "english", drop_punct = TRUE, 24 | drop_number = TRUE, drop = stop_words) 25 | stats <- corpus::term_counts(text, f, ngrams = ngrams, min = 5) 26 | x <- corpus::term_matrix(text, f, select = stats$term) 27 | x 28 | } 29 | 30 | results <- microbenchmark::microbenchmark( 31 | unigrams = make_matrix(text, 1), 32 | bigrams = make_matrix(text, 1:2), 33 | trigrams = make_matrix(text, 1:3), 34 | "4-grams" = make_matrix(text, 1:4), 35 | "5-grams" = make_matrix(text, 1:5), 36 | times = 5 37 | ) 38 | 39 | print(results) 40 | -------------------------------------------------------------------------------- /bench/bench-term_matrix.Rout: -------------------------------------------------------------------------------- 1 | Unit: milliseconds 2 | expr min lq mean median uq max neval 3 | unigrams 236.2898 236.8613 270.4002 247.5848 257.6114 373.6539 5 4 | bigrams 297.9806 300.2708 309.5385 301.1933 321.1791 327.0687 5 5 | trigrams 302.7908 314.1854 322.2455 315.6736 318.7499 359.8280 5 6 | 4-grams 307.0178 318.1708 318.1261 318.5333 323.0232 323.8856 5 7 | 5-grams 305.0779 314.1598 316.7992 314.9060 321.3384 328.5137 5 8 | -------------------------------------------------------------------------------- /bench/bench.R: -------------------------------------------------------------------------------- 1 | 2 | Sys.setlocale(locale = "C") 3 | files <- dir("bench", "^bench-.*\\.[rR]$", full.names = TRUE) 4 | for (file in files) { 5 | name <- substr(file, 1, nchar(file) - 2) 6 | message("Running ", name, "...", appendLF = FALSE) 7 | time <- proc.time() 8 | sink(paste0(file, "out")) 9 | set.seed(0) 10 | NS <- new.env() 11 | source(file, local = NS) 12 | sink() 13 | new_time <- proc.time() 14 | diff <- summary(structure(new_time - time, class = "proc_time")) 15 | elapsed <- diff[["user"]] + diff[["system"]] 16 | message("done. 
(", elapsed, "s)") 17 | } 18 | -------------------------------------------------------------------------------- /data/abbreviations.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/abbreviations.rda -------------------------------------------------------------------------------- /data/abbreviations/01_make_abbreviations.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | kinds <- c(de = "german", 4 | en = "english", 5 | es = "spanish", 6 | fr = "french", 7 | it = "italian", 8 | pt = "portuguese", 9 | ru = "russian") 10 | 11 | for (lang in names(kinds)) { 12 | words <- suppressWarnings(corpus:::abbreviations(kinds[[lang]])) 13 | words <- stringr::str_sort(words, locale = lang) 14 | assign(paste0("abbreviations_", lang), words) 15 | } 16 | 17 | filename <- file.path("..", paste0("abbreviations.rda")) 18 | save(list = paste0("abbreviations_", names(kinds)), file = filename) 19 | tools::resaveRdaFiles(filename) 20 | 21 | -------------------------------------------------------------------------------- /data/affect_wordnet.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/affect_wordnet.rda -------------------------------------------------------------------------------- /data/affect_wordnet/02_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | raw <- read.table("wnaffect.tsv", header = TRUE, stringsAsFactors = FALSE) 4 | raw$pos <- factor(raw$pos, levels = c("NOUN", "ADJ", "VERB", "ADV")) 5 | raw$category <- factor(raw$category, levels = unique(raw$category)) 6 | raw$emotion <- factor(raw$emotion, levels = unique(raw$emotion)) 7 | 8 | affect_wordnet <- raw 9 | class(affect_wordnet) <- c("corpus_frame", "data.frame") 10 | 11 | save(affect_wordnet, file = "../affect_wordnet.rda") 12 | tools::resaveRdaFiles("../affect_wordnet.rda") 13 | -------------------------------------------------------------------------------- /data/affect_wordnet/README: -------------------------------------------------------------------------------- 1 | WordNet 1.6 2 | ----------- 3 | 4 | URL: https://wordnet.princeton.edu 5 | License: custom (MIT-like) 6 | Path: wordnet-1.6 7 | 8 | Dowloaded from https://wordnet.princeton.edu/wordnet/download/old-versions 9 | 10 | 11 | WordNet-Affect Lexicon 12 | ---------------------- 13 | 14 | URL: http://wndomains.fbk.eu/wnaffect.html 15 | License: Creative Commons Attribution 3.0 Unported License 16 | Path: wn-affect-1.1 17 | 18 | Downloaded from https://github.com/larsmans/wordnet-domains-sentiwords 19 | 20 | 21 | Notes 22 | ----- 23 | 24 | + wn-affect-1.1/a-hierarchy.xml organizes emotion aspects into a hierarchy 25 | 26 | + wn-affect-1.1/a-synsets.xml associates named aspects with wordnet-1.6 synsets 27 | 28 | + some named aspects do not have associated synsets, e.g. "merriment" 29 | 30 | + three categories in `a-synsets` do not appear in `a-hierarchy`. 
We remap 31 | them as follows: 32 | 33 | joy-pride -> self-pride 34 | levity-gaiety -> playfulness 35 | general-gaiety -> merriment 36 | 37 | + we merge neutral and ambiguous emotion categories into "neutral" 38 | -------------------------------------------------------------------------------- /data/affect_wordnet/wn-affect-1.1/readme-wn-affect-1.1.txt: -------------------------------------------------------------------------------- 1 | Characteristics of WordNet-Affect 1.1 2 | 3 | This version includes a smaller number of synsets but the semantic 4 | organization is more well-structured. 5 | 6 | a) Affective Hierarchy 7 | 8 | The affective label "emotion" is expanded in order to include a subset of 9 | new labels, identifying emotional states. These labels, named "affective 10 | categories", are hierarchically organized. 11 | 12 | 13 | b) Valence 14 | 15 | The hierarchy was initially obtained from the hyponym subtree of the 16 | synset "n#feeling#1", but some modifications were performed in order to 17 | classify affective synsets according to emotional valence. In particular, 18 | affective categories are partitioned in 4 classes: "positive" (e.g. joy), 19 | "negative" (e.g. sadness), "ambiguous" (e.g. surprise), and "neutral" 20 | (e.g. apathy). 21 | 22 | c) Causative/Stative Attribute 23 | 24 | Synsets of part of speech (pos) "adjective", "verb", and "adverb" 25 | present an addictional label representing their "causative" or "stative" 26 | semantic function. For example, an emotional adjective is "causative" 27 | if it refers to some emotion that is caused by the entity represented by 28 | the modified noun (e.g. "amusing movie"). On the other hand, an emotional 29 | adjective is "stative" if it refers to the emotion owned or felt by the 30 | subject denoted by the modified noun (e.g. "cheerful/happy boy"). 31 | 32 | --------------------------------------------------------------------- 33 | Differences with respect to WordNet-Affect 1.0 34 | 35 | - Source files are formatted in XML standard. 36 | 37 | - Previous affective labels were renamed and expressed without 38 | abbreviations. The mapping between previous and present labels is the 39 | following: 40 | 41 | phy -> physical-state 42 | beh -> behaviour 43 | sit -> (emotion eliciting) situation 44 | tra -> trait 45 | sen -> sensation 46 | cog -> cognitive-state 47 | moo -> mood 48 | emo -> emotion 49 | eds -> edonic-signal 50 | 51 | - We removed the label "core" (referring to manually annotated synsets) 52 | and other labels automatically added to synsets (by application of WordNet 53 | relations, such as "similar-to"). In fact, all synsets in WordNet-Affect 54 | 1.1 were manually reviewed and it is no more useful to trace how they 55 | were collected. 56 | 57 | - Synsets that are not tagged with the label "emo(tion)" in the previous 58 | version are not present in current release. In order to retrieve these 59 | synsets, you have to refer to source files of WordNet-Affect 1.0 60 | 61 | --------------------------------------------------------------------- 62 | File description 63 | 64 | a-hierarchy.xml: 65 | 66 | Includes the affective hierarchy. Each item has 2 attributes: 67 | 68 | name = affective category label 69 | isa = category parent in the hierarchy 70 | 71 | 72 | a-synsets.xml: 73 | 74 | Includes synsets associated with the affective hierarchy. Synsets are 75 | classified according to their pos. 
76 | 77 | Synsets of pos "noun" have the following attributes: 78 | 79 | id = label identifying current synset 80 | categ = affective category label 81 | 82 | Synsets of other pos ("adjective", "verb", and "adverb") have the 83 | following attributes: 84 | 85 | id = label identifying current synset 86 | noun-id = id of the noun synset from which the current one was derived 87 | causat-stat = causative/stative label 88 | 89 | The reason why not-noun synsets are connected to the affective categories 90 | via noun synsets is because this relation allows us to study to what 91 | extent the causative/stative character of adjectives, verbs and adverbs 92 | depends on the morphological variation of nouns. In the next release of 93 | WordNet-Affect, it is reasonable to characterize this semantic function 94 | of morphology. 95 | 96 | --------------------------------------------------------------------- 97 | Plans for the future 98 | 99 | The next version of WordNet-Affect will include all synsets that 100 | in WordNet-Affect 1.0 are annotated with labels different from "emo" 101 | (emotion) and that are not included in the current release. In particular, 102 | we want to distinguish labels representing mental states (e.g. cognitive 103 | states, attitudes), attributes of mental states (e.g. valence, intensity 104 | or level of arousal), and other semantic characteristics (e.c. behaviours, 105 | emotion-eliciting situations, emotional responces). Finally, we want to 106 | select only one label for each synset, taking into account its hypernyms. 107 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/LICENSE: -------------------------------------------------------------------------------- 1 | WordNet Release 1.6 2 | 3 | This software and database is being provided to you, the LICENSEE, by 4 | Princeton University under the following license. By obtaining, using 5 | and/or copying this software and database, you agree that you have 6 | read, understood, and will comply with these terms and conditions.: 7 | 8 | Permission to use, copy, modify and distribute this software and 9 | database and its documentation for any purpose and without fee or 10 | royalty is hereby granted, provided that you agree to comply with 11 | the following copyright notice and statements, including the disclaimer, 12 | and that the same appear on ALL copies of the software, database and 13 | documentation, including modifications that you make for internal 14 | use or for distribution. 15 | 16 | WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved. 17 | 18 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON 19 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 20 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON 21 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- 22 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE 23 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT 24 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 25 | OTHER RIGHTS. 26 | 27 | The name of Princeton University or Princeton may not be used in 28 | advertising or publicity pertaining to distribution of the software 29 | and/or database. Title to copyright in this software, database and 30 | any associated documentation shall at all times remain with 31 | Princeton University and LICENSEE agrees to preserve same. 
32 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | 3 | # Makefile for WordNet 1.6 database directory "dict" 4 | 5 | WN_ROOT = /usr/local/wordnet1.6 6 | WN_INSTALLDIR = $(WN_ROOT)/dict 7 | 8 | INSTALLCMD = cp 9 | INSTALLFLAGS = -p 10 | 11 | WN_FILES = data.noun data.verb data.adj data.adv index.noun index.verb index.adj index.adv noun.exc verb.exc adj.exc adv.exc cousin.exc cousin.tops index.sense cntlist lexnames index.gloss sentidx.vrb sents.vrb stoplist.pl 12 | 13 | all: $(WN_FILES) 14 | 15 | install: $(WN_FILES) 16 | @if [ ! -d $(WN_INSTALLDIR) ] ; then \ 17 | echo "Making directory $(WN_INSTALLDIR)" ; \ 18 | mkdir -p $(WN_INSTALLDIR) ; \ 19 | chmod 755 $(WN_INSTALLDIR) ; \ 20 | fi ; 21 | @echo "Installing database files in $(WN_INSTALLDIR)" 22 | @for file in $(WN_FILES) ; \ 23 | do \ 24 | filename=$(WN_INSTALLDIR)/$$file ; \ 25 | if [ -f $$filename ] ; then \ 26 | echo "Cannot install $$filename: file exists" ; \ 27 | else \ 28 | echo "Installing $$filename" ; \ 29 | $(INSTALLCMD) $(INSTALLFLAGS) $$file $$filename ; \ 30 | fi ; \ 31 | done ; 32 | @echo "Done installing database files in $(WN_INSTALLDIR)" 33 | 34 | uninstall: 35 | @echo "Cannot uninstall database files automatically" ; \ 36 | echo "You must delete them from $(WN_INSTALLDIR) manually" ; \ 37 | echo "This is dangerous if you set INSTALLCMD to 'mv'" ; \ 38 | echo "Since this is your only copy of WordNet" ; 39 | 40 | reallyuninstall: 41 | @echo "Uninstalling database files from $(WN_INSTALLDIR)" 42 | @for file in $(WN_FILES) ; \ 43 | do \ 44 | filename=$(WN_INSTALLDIR)/$$file ; \ 45 | if [ ! -f $$filename ] ; then \ 46 | echo "Cannot uninstall $$filename: not present" ; \ 47 | else \ 48 | echo "Uninstalling $$filename" ; \ 49 | rm -f $$filename ; \ 50 | fi ; \ 51 | done ; 52 | @echo "Done uninstalling database files from $(WN_INSTALLDIR)" 53 | 54 | clean: 55 | @rm -f *~ "#"* 56 | 57 | cleandbfiles: 58 | @echo "Removing WordNet 1.6 database files from `pwd`" 59 | @for file in $(WN_FILES) ; \ 60 | do \ 61 | if [ ! 
-f $$file ] ; then \ 62 | echo "Cannot remove $$file" ; \ 63 | else \ 64 | echo "Removing $$file" ; \ 65 | rm -f $$file ; \ 66 | fi ; \ 67 | done ; 68 | @echo "Done removing WordNet 1.6 database files" 69 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/adv.exc: -------------------------------------------------------------------------------- 1 | best well 2 | better well 3 | deeper deeply 4 | farther far 5 | further far 6 | harder hard 7 | hardest hard 8 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/lexnames: -------------------------------------------------------------------------------- 1 | 00 adj.all 3 2 | 01 adj.pert 3 3 | 02 adv.all 4 4 | 03 noun.Tops 1 5 | 04 noun.act 1 6 | 05 noun.animal 1 7 | 06 noun.artifact 1 8 | 07 noun.attribute 1 9 | 08 noun.body 1 10 | 09 noun.cognition 1 11 | 10 noun.communication 1 12 | 11 noun.event 1 13 | 12 noun.feeling 1 14 | 13 noun.food 1 15 | 14 noun.group 1 16 | 15 noun.location 1 17 | 16 noun.motive 1 18 | 17 noun.object 1 19 | 18 noun.person 1 20 | 19 noun.phenomenon 1 21 | 20 noun.plant 1 22 | 21 noun.possession 1 23 | 22 noun.process 1 24 | 23 noun.quantity 1 25 | 24 noun.relation 1 26 | 25 noun.shape 1 27 | 26 noun.state 1 28 | 27 noun.substance 1 29 | 28 noun.time 1 30 | 29 verb.body 2 31 | 30 verb.change 2 32 | 31 verb.cognition 2 33 | 32 verb.communication 2 34 | 33 verb.competition 2 35 | 34 verb.consumption 2 36 | 35 verb.contact 2 37 | 36 verb.creation 2 38 | 37 verb.emotion 2 39 | 38 verb.motion 2 40 | 39 verb.perception 2 41 | 40 verb.possession 2 42 | 41 verb.social 2 43 | 42 verb.stative 2 44 | 43 verb.weather 2 45 | 44 adj.ppl 3 46 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/binsrch.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | BINSRCH(3WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | bin_search, copyfile, replace_line, insert_line 13 |

SYNOPSIS

14 |

15 | char *bin_search(char 16 | *key, FILE *fp);

17 | void copyfile(FILE *fromfp, FILE *tofp);

18 | char *replace_line(char 19 | *new_line, char *key, FILE *fp);

20 | 21 |

DESCRIPTION

22 |

23 | The WordNet library contains 24 | several general purpose functions for performing a binary search and modifying 25 | sorted files.

26 | bin_search() is the primary binary search algorithm to search 27 | for key as the first item on a line in the file pointed to by fp . The 28 | delimiter between the key and the rest of the fields on the line, if any, 29 | must be a space. A pointer to a static variable containing the entire 30 | line is returned. NULL 31 | is returned if a match is not found.

32 | The remaining 33 | functions are not used by WordNet, and are only briefly described.

34 | copyfile() 35 | copies the contents of one file to another.

36 | replace_line() replaces a line 37 | in a file having searchkey key with the contents of new_line . It returns 38 | the original line or NULL 39 | in case of error.

40 | insert_line() finds the proper 41 | place to insert the contents of new_line , having searchkey key in the 42 | sorted file pointed to by fp . It returns NULL 43 | if a line with this searchkey 44 | is already in the file. 45 |

NOTES

46 | The maximum length of key is 1024.

47 | The 48 | maximum line length in a file is 25K for Unix platforms, and 8K for the 49 | PC and Macintosh platforms.

50 | If there are no additional fields after the 51 | search key, the key must be followed by at least one space before the 52 | newline character. 53 |

SEE ALSO

54 | morph(3WN) 55 | , wnsearch(3WN) 56 | , wnutil(3WN) 57 | , wnintro(5WN) 58 | . 59 | 60 |

WARNINGS

61 | bin_search() returns a pointer to a static character buffer. 62 | The returned string should be copied by the caller if the results need 63 | to be saved, as a subsequent call will replace the contents of the static 64 | buffer.

65 |

66 | 67 |


68 | Table of Contents

69 |

77 | 78 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/morph.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | MORPH(3WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | morphinit, re_morphinit, morphstr, morphword 13 |

SYNOPSIS

14 |

15 | #include 16 | "wn.h"

17 | int morphinit(void);

18 | int re_morphinit(void);

19 | char *morphstr(char 20 | *origstr, int pos);

21 | char *morphword(char *word, int pos); 22 |

DESCRIPTION 23 |

24 |

25 | The WordNet morphological processor, Morphy, is accessed through these 26 | functions:

27 | morphinit() is used to open the exception list files. It returns 28 | 0 if successful, -1 otherwise. The exception list files must be opened 29 | before morphstr() or morphword() are called.

30 | re_morphinit() is used to 31 | close the exception list files and reopen them, and is used exclusively 32 | for WordNet development. Return codes are as described above.

33 | morphstr() 34 | is the basic user interface to Morphy. It tries to find the base form 35 | (lemma) of the word or collocation origstr in the specified pos . The 36 | first call (with origstr specified) returns a pointer to the first base 37 | form found. Subsequent calls requesting base forms of the same string 38 | must be made with the first argument of NULL. 39 | When no more base forms 40 | for origstr can be found, NULL 41 | is returned.

42 | morphword() tries to find 43 | the base form of word in the specified pos . This function is called by 44 | morphstr() for each individual word in a collocation. 45 |

NOTES

46 | morphinit() 47 | is called by wninit() and is not intended to be called directly by an 48 | application. Applications wishing to use WordNet and/or the morphological 49 | functions must call wninit() at the start of the program. See wnutil(3WN) 50 | 51 | for more information.

52 | origstr may be either a word or a collocation formed 53 | by joining individual words with underscore characters (_ ).

54 | Usually only 55 | morphstr() is called from applications, as it works on both words and 56 | collocations.

57 | pos must be one of the following:

58 |

1    NOUN
59 | 2    VERB
60 | 3    ADJECTIVE 61 |
62 | 4    ADVERB
63 | 5    ADJECTIVE SATELLITE
64 |
65 |

66 | If ADJECTIVE SATELLITE 67 | is passed, it is treated 68 | by morphstr() as ADJECTIVE. 69 | 70 |

SEE ALSO

71 | wnsearch(3WN) 72 | , wndb(5WN) 73 | , morphy(7WN) 74 | . 75 |

76 | 77 |

WARNINGS

78 | Passing an invalid part of speech will result in a core dump. 79 |

80 | The WordNet database files must be open to use morphstr() or morphword(). 81 |

82 | 83 |


84 | Table of Contents

85 |

93 | 94 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/uniqbeg.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | UNIQBEG(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | uniqbeg - unique beginners for noun hierarchies 13 |

DESCRIPTION

14 | All 15 | of the WordNet noun synsets are organized into hierarchies. Each synset 16 | is part of at least one hierarchy, headed by a synset called a unique 17 | beginner. All of these synsets originate in the lexicographer file noun.Tops 18 | . From any noun synset, except a unique beginner, the hypernym pointers 19 | can be traced up to one of the following synsets:

20 |

{ entity, something, 21 | (anything having existence (living or nonliving)) }
22 |

23 | { psychological_feature, 24 | (a feature of the mental life of a living organism) }
25 |

26 | { abstraction, 27 | (a general concept formed by extracting common features from specific 28 | examples) }
29 |

30 | { state, (the way something is with respect to its main 31 | attributes; "the current state of knowledge";
32 |    "his state of health"; 33 | "in a weak financial state") }
34 |

35 | { event, (something that happens at a 36 | given place and time) }
37 |

38 | { act, human_action, human_activity, (something 39 | that people do or cause to happen) }
40 |

41 | { group, grouping, (any number 42 | of entities (members) considered as a unit) }
43 |

44 | { possession, (anything 45 | owned or possessed) }
46 |

47 | { phenomenon, (any state or process known through 48 | the senses rather than by intuition or reasoning) }
49 |

50 | 51 |

NOTES

52 | The lexicographer 53 | files are not included in the WordNet package. 54 |

FILES

55 | 56 |
57 | 58 |
noun.Tops
59 |
unique 60 | beginners for nouns
61 |
62 | 63 |

SEE ALSO

64 | wndb(5WN) 65 | , wninput(5WN) 66 | , wngloss(7WN) 67 | .

68 | 69 |


70 | Table of Contents

71 |

78 | 79 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro1.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(1WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - WordNet user commands 13 |

SYNOPSIS

14 |

15 | escort - search semantic 16 | concordances for sentences containing semantic tags

17 | wn - command line 18 | interface to WordNet database

19 | wnb - window based WordNet browser 20 |

DESCRIPTION 21 |

22 | This section of the WordNet Reference Manual contains manual pages that 23 | describe commands available with the various WordNet system packages.

24 | 25 | The WordNet interfaces wn(1WN) 26 | and wnb(1WN) 27 | allow the user to search 28 | the WordNet database and display the information textually. escort(1WN) 29 | 30 | is a window based browser for searching the semantic concordances. 31 |

ENVIRONMENT 32 | VARIABLES

33 | 34 |
35 | 36 |
WNHOME
37 |
Base directory for WordNet. Unix default is /usr/local/wordnet1.6 38 | , PC default is C:\wn16 , Macintosh default is : .
39 | 40 |
WNSEARCHDIR
41 |
Directory 42 | in which the WordNet database has been installed. Unix default is WNHOME/dict 43 | , PC default is WNHOME\dict , Macintosh default is :Database .
44 | 45 |
WNDBVERSION 46 |
47 |
Indicates which format the WordNet database files in WNSEARCHDIR are 48 | in. The default is 1.6 . Setting WNDBVERSION to 1.5 allows the 1.6 commands 49 | to work with the 1.5 database files.
50 |
51 | 52 |

SEE ALSO

53 | wnintro(3WN) 54 | , wnintro(5WN) 55 | , 56 | wnintro(7WN) 57 | .

58 | Miller, G. A. (1990), ed. "WordNet: An On-Line Lexical Database" 59 | . International Journal of Lexicography, Volume 3, Number 4.

60 | Miller, G. 61 | A., et al. (1990, 1993). "Five Papers on WordNet" . Cognitive Science Laboratory 62 | Report 43. (Available from ftp://ftp.cogsci.princeton.edu/wordnet/ .)

63 | Fellbaum, 64 | C. (1998), ed. "WordNet: An Electronic Lexical Database" . MIT Press, Cambridge, 65 | MA. 66 |

AVAILABILITY

67 | WordNet has a World Wide Web site at http://www.cogsci.princeton.edu/~wn/ 68 | 69 | . From this web site users can learn about the WordNet project, run several 70 | different interfaces to the WordNet database, and download various WordNet 71 | system packages and "Five Papers on WordNet" .

72 | 73 |


74 | Table of Contents

75 |

83 | 84 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro5.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(5WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - introduction to descriptions of WordNet file formats 13 | 14 |

SYNOPSIS

15 |

16 | cntlist - format of cntlist file

17 | cxtfile - format of semantically 18 | tagged file

19 | glossidx - index of words found in synset glosses

20 | lexnames 21 | - list of lexicographer file names and numbers

22 | prologdb - description 23 | of Prolog database files

24 | senseidx - format of sense index file

25 | sensemap 26 | - mapping from senses in WordNet 1.5 to corresponding 1.6 senses

27 | taglist 28 | - format of taglist file

29 | wndb - format of WordNet database files

30 | wninput 31 | - format of WordNet lexicographer files 32 |

DESCRIPTION

33 | This section of the 34 | WordNet Reference Manual contains manual pages that describe the formats 35 | of the various files included in different WordNet 1.6 packages. 36 |

NOMENCLATURE 37 |

38 | All files are in ASCII. Fields are generally separated by one space, unless 39 | otherwise noted, and each line is terminated with a newline character. 40 | In the file format descriptions, terms in italics refer to field names. 41 | Characters or strings in boldface represent an actual character or string 42 | as it appears in the file. Items enclosed in italicized square brackets 43 | ([ ] ) may not be present. Since several files contain fields that have 44 | the identical meaning, field names are consistently defined. For example, 45 | several WordNet files contain one or more synset_offset fields. In each 46 | case, the definition of synset_offset is identical. 47 |

SEE ALSO

48 | wnintro(1WN) 49 | , 50 | wnintro(3WN) 51 | , wnintro(7WN) 52 | , wngloss(7WN) 53 | .

54 | Miller, G. A. (1990), ed. "WordNet: 55 | An On-Line Lexical Database" . International Journal of Lexicography, Volume 56 | 3, Number 4.

57 | Miller, G. A., et al. (1990, 1993). "Five Papers on WordNet" 58 | . Cognitive Science Laboratory Report 43. (Available from ftp://ftp.cogsci.princeton.edu/wordnet/ 59 | .)

60 | Fellbaum, C. (1998), ed. "WordNet: An Electronic Lexical Database" . MIT 61 | Press, Cambridge, MA.

62 |

63 | 64 |


65 | Table of Contents

66 |

73 | 74 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro7.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - introduction to miscellaneous WordNet information 13 |

SYNOPSIS 14 |

15 |

16 | morphy - discussion of WordNet's morphological processing

17 | semcor - discussion 18 | of semantic concordances

19 | uniqbeg - unique beginners for noun hierarchies 20 |

21 | wngloss - glossary of terms used in WordNet

22 | wngroups - discussion of WordNet 23 | search code to group similar senses

24 | wnlicens - text of WordNet license 25 | agreement

26 | wnpkgs - information about WordNet packages and distribution 27 |

28 | wnstats - database statistics 29 |

DESCRIPTION

30 | This section of the WordNet 31 | Reference Manual contains manual pages that describe various topics related 32 | to WordNet and the semantic concordances, and a glossary of terms. 33 |

SEE 34 | ALSO

35 | wnintro(1WN) 36 | , wnintro(3WN) 37 | , wnintro(5WN) 38 | , wngloss(7WN) 39 | .

40 | Miller, G. 41 | A. (1990), ed. "WordNet: An On-Line Lexical Database" . International Journal 42 | of Lexicography, Volume 3, Number 4.

43 | Miller, G. A., et al. (1990, 1993). 44 | "Five Papers on WordNet" . Cognitive Science Laboratory Report 43. (Available 45 | from ftp://ftp.cogsci.princeton.edu/wordnet/ .)

46 | Fellbaum, C. (1998), ed. "WordNet: 47 | An Electronic Lexical Database" . MIT Press, Cambridge, MA.

48 | 49 |


50 | Table of Contents

51 |

57 | 58 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnlicens.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNLICENS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnlicens - text of WordNet license 13 |

DESCRIPTION

14 | WordNet Release 1.6 15 |

16 | This software and database is being provided to you, the LICENSEE, by 17 | Princeton University under the following license. By obtaining, using 18 | and/or copying this software and database, you agree that you have 19 | read, understood, and will comply with these terms and conditions.: 20 | Permission to use, copy, modify and distribute this software and 21 | database and its documentation for any purpose and without fee or royalty 22 | is hereby granted, provided that you agree to comply with the following 23 | copyright notice and statements, including the disclaimer, and that 24 | the same appear on ALL copies of the software, database and documentation, 25 | including modifications that you make for internal use or for distribution. 26 | WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved. 27 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY 28 | MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF 29 | EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS 30 | OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE 31 | OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION 32 | WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 33 | OTHER RIGHTS. The name of Princeton University or Princeton may 34 | not be used in advertising or publicity pertaining to distribution of 35 | the software and/or database. Title to copyright in this software, database 36 | and any associated documentation shall at all times remain with Princeton 37 | University and LICENSEE agrees to preserve same.

38 | 39 |


40 | Table of Contents

41 |

45 | 46 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnstats.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNSTATS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnstats - WordNet 1.6 database statistics 13 |

DESCRIPTION

14 | 15 |

Number of 16 | words, synsets, and senses

17 |
18 | 19 | 20 | 21 | 23 | 24 | 25 | 26 |
POS Unique Strings Synsets Total Senses
Noun 94474 66025 116317
Verb 10319 12127 22066
Adjective 22 | 20170 17915 29881
Adverb 4546 3575 5677
Totals 121962 99642 173941
27 |

28 | 29 |

Polysemy information

30 |

31 |

32 | 33 | 35 | 36 | 37 | 38 | 39 | 40 |
POS Monosemous Words
and Senses
Polysemous Words Polysemous Senses
Noun 81909 12564 34 | 34408
Verb 5751 4567 16315
Adjective 14795 5374 15086
Adverb 3795 750 1882
Totals 106250 23255 67691
41 |

42 |

43 | 46 | 47 | 48 | 49 | 50 | 51 |
POS Average 44 | Polysemy
Including Monosemous Words
Average 45 | Polysemy
Excluding Monosemous Words
Noun 1.23 2.73
Verb 2.13 3.57
Adjective 1.48 2.80
Adverb 1.24 2.50
52 | 53 |

NOTES

54 | Statistics for all types of adjectives 55 | and adjective satellites are combined.

56 | The total of all unique noun, 57 | verb, adjective, and adverb strings is greater than 121962. However, many 58 | strings are unique within a syntactic category, but are in more than one 59 | syntactic category. The figure in the table represents the unique strings 60 | when all syntactic categories are combined.

61 |

62 | 63 |


64 | Table of Contents

65 |

74 | 75 | -------------------------------------------------------------------------------- /data/federalist.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/federalist.rda -------------------------------------------------------------------------------- /data/federalist/.gitignore: -------------------------------------------------------------------------------- 1 | /federalist.json 2 | /pg18.txt 3 | -------------------------------------------------------------------------------- /data/federalist/01_download_raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FILE="pg18.txt" 6 | FILE_MD5="126eca27b879c078a189ba0783186330" 7 | 8 | if [ ! -f $FILE ]; 9 | then 10 | echo "Downloading raw data file '$FILE' from gutenberg.org" 11 | curl 'http://www.gutenberg.org/cache/epub/18/pg18.txt' -o ${FILE}.download 12 | mv ${FILE}.download ${FILE} 13 | fi 14 | 15 | echo "Checking raw data file '${FILE}'" 16 | md5sum -c - <<< "${FILE_MD5} ${FILE}" 17 | -------------------------------------------------------------------------------- /data/federalist/03_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("corpus") 4 | 5 | raw <- read_ndjson("federalist.json", text = NULL, stringsAsFactors = FALSE) 6 | 7 | name <- paste("Federalist No.", raw$paper_id) 8 | 9 | venue <- raw$venue 10 | venue[venue == "For the Independent Fournal"] <- "For the Independent Journal" 11 | venue[grep("^From M[cC]", venue)] <- "From McLean's Edition, New York" 12 | 13 | author <- raw$author 14 | author[author == "HAMILTON"] <- "Hamilton" 15 | author[author == "HAMILTON AND MADISON"] <- NA 16 | author[author == "HAMILTON OR MADISON"] <- NA 17 | author[author == "JAY"] <- "Jay" 18 | author[author == "MADISON"] <- "Madison" 19 | author[raw$paper_id == 58] <- NA # follow Mosteller and Wallace 20 | 21 | date <- raw$date 22 | date <- sub("^(Tuesday|Thursday|Friday),? ", "", date) 23 | 24 | invisible(Sys.setlocale("LC_TIME", "C")) 25 | date <- as.Date(date, "%B %d, %Y") 26 | 27 | federalist <- data.frame(name, 28 | title = raw$title, 29 | venue, 30 | date, 31 | author, 32 | text = raw$text, 33 | stringsAsFactors = FALSE) 34 | class(federalist) <- c("corpus_frame", "data.frame") 35 | 36 | save(federalist, file = "../federalist.rda") 37 | tools::resaveRdaFiles("../federalist.rda") 38 | -------------------------------------------------------------------------------- /data/sentiment_afinn.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/sentiment_afinn.rda -------------------------------------------------------------------------------- /data/sentiment_afinn/01_download_raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FILE="imm6010.zip" 6 | FILE_MD5="ea6216f43d27188ea2b5bfadf068ff37" 7 | 8 | if [ ! -f $FILE ]; 9 | then 10 | echo "Downloading raw data file '$FILE' from www2.imm.dtu.dk" 11 | curl 'http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip' -o ${FILE}.download 12 | mv ${FILE}.download ${FILE} 13 | fi 14 | 15 | if [ ! 
-d AFINN ]; 16 | then 17 | unzip ${FILE} 18 | fi 19 | -------------------------------------------------------------------------------- /data/sentiment_afinn/02_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | raw <- read.delim(file.path("AFINN", "AFINN-111.txt"), 4 | encoding = "UTF-8", header = FALSE, stringsAsFactors = FALSE) 5 | names(raw) <- c("term", "score") 6 | raw$term <- trimws(raw$term) 7 | 8 | # exclude multi-type terms 9 | multi <- grepl("[[:space:]]", raw$term) 10 | raw <- raw[!multi,] 11 | 12 | # discard row names 13 | rownames(raw) <- NULL 14 | 15 | sentiment_afinn <- raw 16 | class(sentiment_afinn) <- c("corpus_frame", "data.frame") 17 | 18 | save(sentiment_afinn, file = "../sentiment_afinn.rda") 19 | tools::resaveRdaFiles("../sentiment_afinn.rda") 20 | -------------------------------------------------------------------------------- /data/sentiment_afinn/AFINN/AFINN-README.txt: -------------------------------------------------------------------------------- 1 | AFINN is a list of English words rated for valence with an integer 2 | between minus five (negative) and plus five (positive). The words have 3 | been manually labeled by Finn Årup Nielsen in 2009-2011. The file 4 | is tab-separated. There are two versions: 5 | 6 | AFINN-111: Newest version with 2477 words and phrases. 7 | 8 | AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there 9 | are 1480 lines, as some words are listed twice. The word list is not 10 | entirely in alphabetical order. 11 | 12 | An evaluation of the word list is available in: 13 | 14 | Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for 15 | sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903 16 | 17 | The list was used in: 18 | 19 | Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni, 20 | Michael Etter, "Good Friends, Bad News - Affect and Virality in 21 | Twitter", The 2011 International Workshop on Social Computing, 22 | Network, and Services (SocialComNet 2011). 23 | 24 | 25 | This database of words is copyright protected and distributed under 26 | "Open Database License (ODbL) v1.0" 27 | http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar 28 | copyleft license.
29 | 30 | See comments on the word list here: 31 | http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis 32 | 33 | 34 | In Python the file may be read into a dictionary with: 35 | 36 | >>> afinn = dict(map(lambda (k,v): (k,int(v)), 37 | [ line.split('\t') for line in open("AFINN-111.txt") ])) 38 | >>> afinn["Good".lower()] 39 | 3 40 | >>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split())) 41 | 2 42 | 43 | 44 | -------------------------------------------------------------------------------- /data/stopwords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/stopwords.rda -------------------------------------------------------------------------------- /data/stopwords/01_make_stopwords.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | kinds <- c(da = "danish", 4 | de = "german", 5 | en = "english", 6 | es = "spanish", 7 | fi = "finnish", 8 | fr = "french", 9 | hu = "hungarian", 10 | it = "italian", 11 | nl = "dutch", 12 | no = "norwegian", 13 | pt = "portuguese", 14 | ru = "russian", 15 | sv = "swedish") 16 | 17 | for (lang in names(kinds)) { 18 | words <- suppressWarnings(corpus:::stopwords(kinds[[lang]])) 19 | words <- stringr::str_sort(words, locale = lang) 20 | assign(paste0("stopwords_", lang), words) 21 | } 22 | 23 | filename <- file.path("..", paste0("stopwords.rda")) 24 | save(list = paste0("stopwords_", names(kinds)), file = filename) 25 | tools::resaveRdaFiles(filename) 26 | 27 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | corpustext.com -------------------------------------------------------------------------------- /docs/articles/chinese-wordcloud-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/chinese-wordcloud-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-emotion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-emotion-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-heapslaw-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-heapslaw-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-witch-occurrences-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-witch-occurrences-1.png -------------------------------------------------------------------------------- /docs/articles/gender-estimates-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates-1.png 
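The AFINN-README.txt above scores a sentence against the word list in Python 2. The same computation can be sketched in R using this package's own sentiment_afinn data frame (built by 02_make_rda.R above) and the text_tokens() tokenizer; this is a minimal sketch, not part of the package sources, and assumes the corpus package is installed:

    library("corpus")
    data("sentiment_afinn")

    # named numeric vector mapping each term to its valence score
    afinn <- setNames(sentiment_afinn$score, sentiment_afinn$term)

    # tokenize the sentence (text_tokens lowercases by default) and
    # sum the scores of the tokens that appear in the lexicon
    toks <- text_tokens("Rainy day but still in a good mood")[[1]]
    sum(afinn[toks], na.rm = TRUE)

Tokens absent from the lexicon index as NA, so na.rm = TRUE treats them as zero, mirroring afinn.get(word, 0) in the Python snippet.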
-------------------------------------------------------------------------------- /docs/articles/gender-estimates_se-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates_se-1.png -------------------------------------------------------------------------------- /docs/articles/gender-signif-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-signif-1.png -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/favicon.ico -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/logo.png -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticker footer */ 2 | body > .container { 3 | display: flex; 4 | padding-top: 60px; 5 | min-height: calc(100vh); 6 | flex-direction: column; 7 | } 8 | 9 | body > .container .row { 10 | flex: 1; 11 | } 12 | 13 | footer { 14 | margin-top: 45px; 15 | padding: 35px 0 36px; 16 | border-top: 1px solid #e5e5e5; 17 | color: #666; 18 | display: flex; 19 | } 20 | footer p { 21 | margin-bottom: 0; 22 | } 23 | footer div { 24 | flex: 1; 25 | } 26 | footer .pkgdown { 27 | text-align: right; 28 | } 29 | footer p { 30 | margin-bottom: 0; 31 | } 32 | 33 | img.icon { 34 | float: right; 35 | } 36 | 37 | img { 38 | max-width: 100%; 39 | } 40 | 41 | /* Section anchors ---------------------------------*/ 42 | 43 | a.anchor { 44 | margin-left: -30px; 45 | display:inline-block; 46 | width: 30px; 47 | height: 30px; 48 | visibility: hidden; 49 | 50 | background-image: url(./link.svg); 51 | background-repeat: no-repeat; 52 | background-size: 20px 20px; 53 | background-position: center center; 54 | } 55 | 56 | .hasAnchor:hover a.anchor { 57 | visibility: visible; 58 | } 59 | 60 | @media (max-width: 767px) { 61 | .hasAnchor:hover a.anchor { 62 | visibility: 
hidden; 63 | } 64 | } 65 | 66 | 67 | /* Fixes for fixed navbar --------------------------*/ 68 | 69 | .contents h1, .contents h2, .contents h3, .contents h4 { 70 | padding-top: 60px; 71 | margin-top: -60px; 72 | } 73 | 74 | /* Static header placement on mobile devices */ 75 | @media (max-width: 767px) { 76 | .navbar-fixed-top { 77 | position: absolute; 78 | } 79 | .navbar { 80 | padding: 0; 81 | } 82 | } 83 | 84 | 85 | /* Sidebar --------------------------*/ 86 | 87 | #sidebar { 88 | margin-top: 30px; 89 | } 90 | #sidebar h2 { 91 | font-size: 1.5em; 92 | margin-top: 1em; 93 | } 94 | 95 | #sidebar h2:first-child { 96 | margin-top: 0; 97 | } 98 | 99 | #sidebar .list-unstyled li { 100 | margin-bottom: 0.5em; 101 | } 102 | 103 | /* Reference index & topics ----------------------------------------------- */ 104 | 105 | .ref-index th {font-weight: normal;} 106 | .ref-index h2 {font-size: 20px;} 107 | 108 | .ref-index td {vertical-align: top;} 109 | .ref-index .alias {width: 40%;} 110 | .ref-index .title {width: 60%;} 111 | 112 | .ref-index .alias {width: 40%;} 113 | .ref-index .title {width: 60%;} 114 | 115 | .ref-arguments th {text-align: right; padding-right: 10px;} 116 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 117 | .ref-arguments .name {width: 20%;} 118 | .ref-arguments .desc {width: 80%;} 119 | 120 | /* Nice scrolling for wide elements --------------------------------------- */ 121 | 122 | table { 123 | display: block; 124 | overflow: auto; 125 | } 126 | 127 | /* Syntax highlighting ---------------------------------------------------- */ 128 | 129 | pre { 130 | word-wrap: normal; 131 | word-break: normal; 132 | border: 1px solid #eee; 133 | } 134 | 135 | pre, code { 136 | background-color: #f8f8f8; 137 | color: #333; 138 | } 139 | 140 | pre .img { 141 | margin: 5px 0; 142 | } 143 | 144 | pre .img img { 145 | background-color: #fff; 146 | display: block; 147 | height: auto; 148 | } 149 | 150 | code a, pre a { 151 | color: #375f84; 152 | } 153 | 154 | .fl {color: #1514b5;} 155 | .fu {color: #000000;} /* function */ 156 | .ch,.st {color: #036a07;} /* string */ 157 | .kw {color: #264D66;} /* keyword */ 158 | .co {color: #888888;} /* comment */ 159 | 160 | .message { color: black; font-weight: bolder;} 161 | .error { color: orange; font-weight: bolder;} 162 | .warning { color: #6A0366; font-weight: bolder;} 163 | 164 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | $("#sidebar").stick_in_parent({offset_top: 40}); 3 | $('body').scrollspy({ 4 | target: '#sidebar', 5 | offset: 60 6 | }); 7 | 8 | var cur_path = paths(location.pathname); 9 | $("#navbar ul li a").each(function(index, value) { 10 | if (value.text == "Home") 11 | return; 12 | if (value.getAttribute("href") === "#") 13 | return; 14 | 15 | var path = paths(value.pathname); 16 | if (is_prefix(cur_path, path)) { 17 | // Add class to parent
<li>, and enclosing <li> if in dropdown 18 | var menu_anchor = $(value); 19 | menu_anchor.parent().addClass("active"); 20 | menu_anchor.closest("li.dropdown").addClass("active"); 21 | } 22 | }); 23 | }); 24 | 25 | function paths(pathname) { 26 | var pieces = pathname.split("/"); 27 | pieces.shift(); // always starts with / 28 | 29 | var end = pieces[pieces.length - 1]; 30 | if (end === "index.html" || end === "") 31 | pieces.pop(); 32 | return(pieces); 33 | } 34 | 35 | function is_prefix(needle, haystack) { 36 | if (needle.length > haystack.length) 37 | return(false); 38 | 39 | for (var i = 0; i < needle.length; i++) { 40 | if (needle[i] != haystack[i]) 41 | return(false); 42 | } 43 | 44 | return(true); 45 | } 46 | -------------------------------------------------------------------------------- /docs/reference/figures/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/banner.png -------------------------------------------------------------------------------- /docs/reference/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo.png -------------------------------------------------------------------------------- /docs/reference/figures/logo/01_make_logo.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("magick") 4 | 5 | slide <- image_read("logo-slide.tiff") 6 | banner <- image_crop(slide, "370x80+2+2") 7 | image_write(banner, "../banner.png", format = "png") 8 | 9 | ccap <- image_read("c-07.jpg") 10 | logo <- image_scale(ccap, "80x80") 11 | image_write(logo, "../logo.png", format = "png") 12 | -------------------------------------------------------------------------------- /docs/reference/figures/logo/README: -------------------------------------------------------------------------------- 1 | 2 | The logo was created in Mac OS Pages, then exported to TIFF. The script 3 | `01_make_logo.R` crops the TIFF and converts to PNG. 4 | 5 | -- 6 | 7 | C drop capital is public domain, from http://www.reusableart.com/c-07.html 8 | 9 | "This and numerous other stock print foundry images were included in the book 10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop & 11 | Company." 12 | 13 | 14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt). 
15 | -------------------------------------------------------------------------------- /docs/reference/figures/logo/c-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/c-07.jpg -------------------------------------------------------------------------------- /docs/reference/figures/logo/logo-slide.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo-slide.tiff -------------------------------------------------------------------------------- /docs/reference/figures/logo/logo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo.key -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Aarup 2 | AFINN 3 | Årup 4 | Baidu 5 | Baum 6 | bigram 7 | bigrams 8 | blog 9 | CEUR 10 | deserialization 11 | deserialize 12 | deserialized 13 | dorothy 14 | doesn 15 | emoji 16 | Emoji 17 | ESWC 18 | Haiyan 19 | ignorables 20 | indices 21 | JSON 22 | kana 23 | knitr 24 | LF 25 | microblogs 26 | Microposts 27 | Mosteller 28 | NDJSON 29 | NFKC 30 | NFKD 31 | ODbL 32 | PDFs 33 | Publius 34 | Quanteda 35 | rmarkdown 36 | RTF 37 | Silge 38 | Spolsky 39 | stopt 40 | stopwords 41 | STSong 42 | tokenization 43 | Tokenization 44 | tokenize 45 | Tokenize 46 | tokenizer 47 | tokenizes 48 | tokenizing 49 | toto 50 | UAX 51 | unigrams 52 | Unported 53 | URLs 54 | Valitutti 55 | VignetteEncoding 56 | VignetteEngine 57 | VignetteIndexEntry 58 | wordcloud 59 | WordNet 60 | xa 61 | xff 62 | xffff 63 | york 64 | -------------------------------------------------------------------------------- /man/abbreviations.Rd: -------------------------------------------------------------------------------- 1 | \name{abbreviations} 2 | \docType{data} 3 | \alias{abbreviations} 4 | \alias{abbreviations_de} 5 | \alias{abbreviations_en} 6 | \alias{abbreviations_es} 7 | \alias{abbreviations_fr} 8 | \alias{abbreviations_it} 9 | \alias{abbreviations_pt} 10 | \alias{abbreviations_ru} 11 | \title{Abbreviations} 12 | \description{ 13 | Lists of common abbreviations. 14 | } 15 | \details{ 16 | The \code{abbreviations_} objects are character vectors of abbreviations. 17 | These are words or phrases containing full stops (periods, ambiguous sentence 18 | terminators) that require special handling for sentence detection and 19 | tokenization. 20 | 21 | The original lists were compiled by the 22 | \href{http://cldr.unicode.org/}{Unicode Common Locale Data Repository}. We 23 | have tailored the English list by adding single-letter abbreviations and 24 | making a few other additions. 25 | 26 | The built-in abbreviation lists are reasonable defaults, but they may require 27 | further tailoring to suit your particular task. 28 | } 29 | \usage{ 30 | abbreviations_de 31 | abbreviations_en 32 | abbreviations_es 33 | abbreviations_fr 34 | abbreviations_it 35 | abbreviations_pt 36 | abbreviations_ru 37 | } 38 | \format{A character vector of unique abbreviations.} 39 | \seealso{ 40 | \code{\link{text_filter}}. 
41 | } 42 | \keyword{datasets} 43 | -------------------------------------------------------------------------------- /man/affect_wordnet.Rd: -------------------------------------------------------------------------------- 1 | \name{affect_wordnet} 2 | \docType{data} 3 | \alias{affect_wordnet} 4 | \title{WordNet-Affect Lexicon} 5 | \description{ 6 | The WordNet-Affect Lexicon is a hand-curated collection of 7 | emotion-related words (nouns, verbs, adjectives, and adverbs), 8 | classified as \dQuote{Positive}, \dQuote{Negative}, 9 | \dQuote{Neutral}, or \dQuote{Ambiguous} and categorized into 10 | 28 subcategories (\dQuote{Joy}, \dQuote{Love}, \dQuote{Fear}, 11 | etc.). 12 | 13 | Terms can and do appear in multiple categories. 14 | 15 | The original lexicon contains multi-word phrases, but they 16 | are excluded here. Also, we removed the term \sQuote{thing} 17 | from the lexicon. 18 | 19 | The original WordNet-Affect lexicon is distributed as part 20 | of the WordNet Domains project, which is licensed under a 21 | \href{https://creativecommons.org/licenses/by/3.0/}{Creative Commons Attribution 3.0 Unported License}. 22 | You are free to share and adapt the lexicon, as long as you 23 | give attribution to the original authors. 24 | } 25 | \usage{affect_wordnet} 26 | \format{A data frame with one row for each term classification.} 27 | \source{\url{http://wndomains.fbk.eu/wnaffect.html}} 28 | \references{ 29 | Strapparava, C. and Valitutti, A. (2004). 30 | WordNet-Affect: an affective extension of WordNet. 31 | \emph{Proceedings of the 4th International Conference on Language 32 | Resources and Evaluation} 33 | 1083--1086. 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/corpus-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus-deprecated} 2 | \alias{corpus-deprecated} 3 | \title{Deprecated Functions in Package \pkg{corpus}} 4 | \description{ 5 | These functions are provided for compatibility with older versions of 6 | \pkg{corpus} only, and may be defunct as soon as the next release. 7 | } 8 | %\usage{ 9 | %} 10 | %\arguments{ 11 | %} 12 | %\details{ 13 | %} 14 | \seealso{ 15 | \code{\link{Deprecated}} 16 | } 17 | \keyword{internal} 18 | \keyword{misc} 19 | -------------------------------------------------------------------------------- /man/corpus-package.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus-package} 2 | \alias{corpus-package} 3 | \alias{corpus} 4 | \docType{package} 5 | \title{ 6 | The Corpus Package 7 | } 8 | \description{ 9 | Text corpus analysis functions 10 | } 11 | \details{ 12 | This package contains functions for text corpus analysis. To create a text 13 | object, use the \code{\link{read_ndjson}} or \code{\link{as_corpus_text}} 14 | function. 15 | To split text into sentences or token blocks, use \code{\link{text_split}}. 16 | To specify preprocessing behavior for transforming a text into a 17 | token sequence, use \code{\link{text_filter}}. To tokenize text 18 | or compute term frequencies, use \code{\link{text_tokens}}, 19 | \code{\link{term_stats}} or \code{\link{term_matrix}}. 20 | To search for or count specific terms, 21 | use \code{\link{text_locate}}, \code{\link{text_count}}, or 22 | \code{\link{text_detect}}. 23 | 24 | For a complete list of functions, use \code{library(help = "corpus")}. 25 | } 26 | \author{ 27 | Patrick O. 
Perry 28 | } 29 | \keyword{ package } 30 | -------------------------------------------------------------------------------- /man/corpus_frame.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus_frame} 2 | \alias{as_corpus_frame} 3 | \alias{as_corpus_frame.character} 4 | \alias{as_corpus_frame.Corpus} 5 | \alias{as_corpus_frame.corpus} 6 | \alias{as_corpus_frame.corpus_json} 7 | \alias{as_corpus_frame.corpus_text} 8 | \alias{as_corpus_frame.data.frame} 9 | \alias{as_corpus_frame.default} 10 | \alias{corpus_frame} 11 | \alias{is_corpus_frame} 12 | \title{Corpus Data Frame} 13 | \description{ 14 | Create or test for corpus objects. 15 | } 16 | \usage{ 17 | corpus_frame(..., row.names = NULL, filter = NULL) 18 | 19 | as_corpus_frame(x, filter = NULL, ..., row.names = NULL) 20 | 21 | is_corpus_frame(x) 22 | } 23 | \arguments{ 24 | \item{\dots}{data frame columns for \code{corpus_frame}; 25 | further arguments passed to \code{as_corpus_text} from 26 | \code{as_corpus_frame}.} 27 | 28 | \item{row.names}{character vector of row names for the corpus object.} 29 | 30 | \item{filter}{text filter object for the \code{"text"} column in the 31 | corpus object.} 32 | 33 | \item{x}{object to be coerced or tested.} 34 | } 35 | \details{ 36 | These functions create or convert another object to a corpus object. 37 | A corpus object is just a data frame with special functions for 38 | printing, and a column named \code{"text"} of type \code{"corpus_text"}. 39 | 40 | \code{corpus_frame} has similar semantics to the \code{\link{data.frame}} 41 | function, except that string columns do not get converted to factors. 42 | 43 | \code{as_corpus_frame} converts another object to a corpus data frame 44 | object. By default, the method converts \code{x} to a data frame with 45 | a column named \code{"text"} of type \code{"corpus_text"}, and sets the 46 | class attribute of the result to \code{c("corpus_frame", "data.frame")}. 47 | 48 | \code{is_corpus_frame} tests whether \code{x} is a data frame with a column 49 | named \code{"text"} of type \code{"corpus_text"}. 50 | 51 | \code{as_corpus_frame} is generic: you can write methods to 52 | handle specific classes of objects. 53 | } 54 | \value{ 55 | \code{corpus_frame} creates a data frame with a column named \code{"text"} 56 | of type \code{"corpus_text"}, and a class attribute set to 57 | \code{c("corpus_frame", "data.frame")}. 58 | 59 | \code{as_corpus_frame} attempts to coerce its argument to a corpus 60 | data frame object, setting the \code{row.names} and calling 61 | \code{\link{as_corpus_text}} on the \code{"text"} column with 62 | the \code{filter} and \code{\dots} arguments. 63 | 64 | \code{is_corpus_frame} returns \code{TRUE} or \code{FALSE} depending on 65 | whether its argument is a valid corpus object or not. 66 | } 67 | \seealso{ 68 | \code{\link{corpus-package}}, \code{\link{print.corpus_frame}}, 69 | \code{\link{corpus_text}}, \code{\link{read_ndjson}}. 
70 | } 71 | \examples{ 72 | # convert a data frame: 73 | emoji <- data.frame(text = sapply(0x1f600 + 1:30, intToUtf8), 74 | stringsAsFactors = FALSE) 75 | as_corpus_frame(emoji) 76 | 77 | # construct directly (no need for stringsAsFactors = FALSE): 78 | corpus_frame(text = sapply(0x1f600 + 1:30, intToUtf8)) 79 | 80 | # convert a character vector: 81 | as_corpus_frame(c(a = "goodnight", b = "moon")) # keeps names 82 | as_corpus_frame(c(a = "goodnight", b = "moon"), row.names = NULL) # drops names 83 | } 84 | \keyword{classes} 85 | -------------------------------------------------------------------------------- /man/corpus_text.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus_text} 2 | \alias{as_corpus_text} 3 | \alias{as_corpus_text.character} 4 | \alias{as_corpus_text.Corpus} 5 | \alias{as_corpus_text.corpus} 6 | \alias{as_corpus_text.corpus_json} 7 | \alias{as_corpus_text.corpus_text} 8 | \alias{as_corpus_text.data.frame} 9 | \alias{as_corpus_text.default} 10 | \alias{corpus_text} 11 | \alias{is_corpus_text} 12 | \title{Text Objects} 13 | \description{ 14 | Create or test for text objects. 15 | } 16 | \usage{ 17 | as_corpus_text(x, filter = NULL, ..., names = NULL) 18 | 19 | is_corpus_text(x) 20 | } 21 | \arguments{ 22 | \item{x}{object to be coerced or tested.} 23 | 24 | \item{filter}{if non-\code{NULL}, a text filter for the converted result.} 25 | 26 | \item{\dots}{text filter properties to set on the result.} 27 | 28 | \item{names}{if non-\code{NULL}, a character vector of names for 29 | the converted result.} 30 | } 31 | \details{ 32 | The \code{corpus_text} type is a new data type provided by the \code{corpus} 33 | package suitable for processing international (Unicode) text. Text vectors 34 | behave like character vectors (and can be converted to them with the 35 | \code{as.character} function). They can be created using the 36 | \code{\link{read_ndjson}} function or by converting another object using the 37 | \code{as_corpus_text} function. 38 | 39 | All text objects have a \code{\link{text_filter}} property specifying how to 40 | transform the text into tokens or segment it into sentences. 41 | 42 | The default behavior for \code{as_corpus_text} is to proceed as follows: 43 | \enumerate{ 44 | \item If \code{x} is a \code{character} vector, then we create 45 | a new \code{text} vector from \code{x}. 46 | 47 | \item If \code{x} is a data frame, then we call \code{as_corpus_text} 48 | on \code{x$text} if a column named \code{"text"} exists in 49 | the data frame. If the data frame does not have a column 50 | named \code{"text"}, then we fail with an error message. 51 | 52 | \item If \code{x} is a \code{corpus_text} object, then we drop all 53 | attributes and we set the class to \code{"corpus_text"}. 54 | 55 | \item The default behavior, when none of the above conditions 56 | are true, is to call \code{as.character} on the object first, 57 | preserving the names, and then call \code{as_corpus_text} on 58 | the returned character object. 59 | } 60 | 61 | In all cases, when \code{names} is \code{NULL}, we set the result 62 | names to \code{names(x)} (or \code{rownames(x)} for a data frame 63 | argument). When \code{names} is a character vector, we set the result names 64 | to this vector of names. 65 | 66 | Similarly, when \code{filter} is \code{NULL}, we set the result text 67 | filter to \code{text_filter(x)}. When \code{filter} is non-\code{NULL}, 68 | we set the result text filter to this value. 
In either case, 69 | if there are additional named arguments, then we override the filter 70 | properties specified by the names of these arguments with the new values 71 | given. 72 | 73 | Note that the special handling for the names of the object differs 74 | from that of the other R conversion functions (\code{as.numeric}, 75 | \code{as.character}, etc.), which drop the names. 76 | 77 | \code{as_corpus_text} is generic: you can write methods to handle specific 78 | classes of objects. 79 | } 80 | \value{ 81 | \code{as_corpus_text} attempts to coerce its argument to \code{text} type and 82 | set its \code{names} and \code{text_filter} properties; it strips 83 | all other attributes. 84 | 85 | \code{is_corpus_text} returns \code{TRUE} or \code{FALSE} depending on 86 | whether its argument is of text type or not. 87 | } 88 | \seealso{ 89 | \code{\link{as_utf8}}, \code{\link{text_filter}}, \code{\link{read_ndjson}}. 90 | } 91 | \examples{ 92 | as_corpus_text("hello, world!") 93 | as_corpus_text(c(a = "goodnight", b = "moon")) # keeps names 94 | 95 | # set a filter property 96 | as_corpus_text(c(a = "goodnight", b = "moon"), stemmer = "english") 97 | 98 | is_corpus_text("hello") # FALSE, "hello" is character, not text 99 | } 100 | \keyword{classes} 101 | -------------------------------------------------------------------------------- /man/federalist.Rd: -------------------------------------------------------------------------------- 1 | \name{federalist} 2 | \docType{data} 3 | \alias{federalist} 4 | \title{The Federalist Papers} 5 | \description{ 6 | \cite{The Federalist Papers} comprise 85 articles published under the 7 | pseudonym \dQuote{Publius} in New York newspapers between 1787 and 8 | 1788, written to convince residents to ratify the \cite{Constitution}. 9 | John Jay wrote 5 papers, while Alexander Hamilton and James Madison 10 | wrote the remaining 80. Between the last two authors there are 11 | conflicting accounts of which author wrote which paper. Most sources 12 | agree on the authorships of 65 papers (51 by Hamilton and 14 by Madison), 13 | but 15 papers are in dispute. 14 | 15 | In one of the earliest examples of statistical text analysis, F. Mosteller 16 | and D. L. Wallace used a form of Naive Bayes classification to identify 17 | the authorships of the 15 disputed papers, finding strong evidence that 18 | Madison was the author of all of the disputed papers. 19 | } 20 | \usage{federalist} 21 | \format{A data frame with 85 rows, one for each paper.} 22 | \source{\url{http://www.gutenberg.org/ebooks/18}} 23 | \references{ 24 | Mosteller, F. and Wallace, D. L. (1963). 25 | Inference in an authorship problem. 26 | \emph{Journal of the American Statistical Association} 27 | \strong{58} 275--309. 
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/figures/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/banner.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/logo/01_make_logo.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("magick") 4 | 5 | slide <- image_read("logo-slide.tiff") 6 | banner <- image_crop(slide, "370x80+2+2") 7 | image_write(banner, "../banner.png", format = "png") 8 | 9 | ccap <- image_read("c-07.jpg") 10 | logo <- image_scale(ccap, "80x80") 11 | image_write(logo, "../logo.png", format = "png") 12 | -------------------------------------------------------------------------------- /man/figures/logo/README: -------------------------------------------------------------------------------- 1 | 2 | The logo was created in Mac OS Pages, then exported to TIFF. The script 3 | `01_make_logo.R` crops the TIFF and converts to PNG. 4 | 5 | -- 6 | 7 | C drop capital is public domain, from http://www.reusableart.com/c-07.html 8 | 9 | "This and numerous other stock print foundry images were included in the book 10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop & 11 | Company." 12 | 13 | 14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt). 15 | -------------------------------------------------------------------------------- /man/figures/logo/c-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/c-07.jpg -------------------------------------------------------------------------------- /man/figures/logo/logo-slide.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo-slide.tiff -------------------------------------------------------------------------------- /man/figures/logo/logo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo.key -------------------------------------------------------------------------------- /man/gutenberg_corpus.Rd: -------------------------------------------------------------------------------- 1 | \name{gutenberg_corpus} 2 | \alias{gutenberg_corpus} 3 | \title{Project Gutenberg Corpora} 4 | \description{ 5 | Get a corpus of texts from Project Gutenberg. 6 | } 7 | \usage{ 8 | gutenberg_corpus(ids, filter = NULL, mirror = NULL, verbose = TRUE, ...) 
9 | } 10 | \arguments{ 11 | \item{ids}{an integer vector of requested Gutenberg text IDs.} 12 | 13 | \item{filter}{a text filter to set on the corpus.} 14 | 15 | \item{mirror}{a character string URL for the Gutenberg mirror to use, 16 | or \code{NULL} to determine the mirror automatically.} 17 | 18 | \item{verbose}{a logical scalar indicating whether to print progress 19 | updates to the console.} 20 | 21 | \item{...}{additional arguments passed to \code{as_corpus}.} 22 | } 23 | \details{ 24 | \code{gutenberg_corpus} downloads a set of texts from Project Gutenberg, 25 | creating a corpus with the texts as rows. You specify the texts for inclusion 26 | using their Project Gutenberg IDs, passed to the function in the 27 | \code{ids} argument. 28 | 29 | You can search for Project Gutenberg texts and get their IDs using the 30 | \code{gutenberg_works} function from the \pkg{gutenbergr} package. 31 | } 32 | \value{ 33 | A corpus (data frame) with three columns: \code{"title"}, \code{"author"}, 34 | and \code{"text"}. 35 | } 36 | \seealso{ 37 | \code{\link{corpus_frame}}. 38 | } 39 | \examples{ 40 | # get the texts of George Eliot's novels 41 | \dontrun{eliot <- gutenberg_corpus(c(145, 550, 6688))} 42 | } 43 | -------------------------------------------------------------------------------- /man/new_stemmer.Rd: -------------------------------------------------------------------------------- 1 | \name{new_stemmer} 2 | \alias{new_stemmer} 3 | \title{Stemmer Construction} 4 | \description{ 5 | Make a stemmer from a set of (term, stem) pairs. 6 | } 7 | \usage{ 8 | new_stemmer(term, stem, default = NULL, duplicates = "first", 9 | vectorize = TRUE) 10 | } 11 | \arguments{ 12 | \item{term}{character vector of terms to stem.} 13 | 14 | \item{stem}{character vector the same length as \code{term} with entries 15 | giving the corresponding stems.} 16 | 17 | \item{default}{if non-\code{NULL}, a default value to use for terms 18 | that do not have a stem; \code{NULL} specifies that such terms 19 | should be left unchanged.} 20 | 21 | \item{duplicates}{action to take for duplicates in the \code{term} list. See 22 | \sQuote{Details}.} 23 | 24 | \item{vectorize}{whether to produce a vectorized stemmer that accepts and 25 | returns vector arguments.} 26 | } 27 | \details{ 28 | Given a list of terms and a corresponding list of stems, this produces a 29 | function that maps terms to their corresponding entry. If 30 | \code{default = NULL}, then values absent from the \code{term} argument 31 | get left as-is; otherwise, they get replaced by the \code{default} value. 32 | 33 | The \code{duplicates} argument indicates the action to take if 34 | there are duplicate entries in the \code{term} argument: 35 | \itemize{ 36 | \item \code{duplicates = "first"}: take the first matching entry in the 37 | \code{stem} list. 38 | 39 | \item \code{duplicates = "last"}: take the last matching entry in the 40 | \code{stem} list. 41 | 42 | \item \code{duplicates = "omit"}: use the \code{default} value for 43 | duplicated terms. 44 | 45 | \item \code{duplicates = "fail"}: raise an error if there are duplicated 46 | terms. 47 | } 48 | } 49 | \value{ 50 | By default, with \code{vectorize = TRUE}, the resulting stemmer accepts a 51 | character vector as input and returns a character vector of the same length 52 | with entries giving the stems of the corresponding input entries. 53 | 54 | Setting \code{vectorize = FALSE} gives a function that accepts a single input 55 | and returns a single output.
This can be more efficient when used as part of 56 | a \code{\link{text_filter}}. 57 | } 58 | \seealso{ 59 | \code{\link{stem_snowball}}, \code{\link{text_filter}}, \code{\link{text_tokens}}. 60 | } 61 | \examples{ 62 | # map uppercase to lowercase, leave others unchanged 63 | stemmer <- new_stemmer(LETTERS, letters) 64 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3")) 65 | 66 | # map uppercase to lowercase, map others to NA 67 | stemmer <- new_stemmer(LETTERS, letters, default = NA) 68 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3")) 69 | } 70 | -------------------------------------------------------------------------------- /man/print.corpus_frame.Rd: -------------------------------------------------------------------------------- 1 | \name{print.corpus_frame} 2 | \title{Corpus Data Frame Printing} 3 | \alias{format.corpus_frame} 4 | \alias{print.corpus_frame} 5 | \description{ 6 | Printing and formatting corpus data frames. 7 | } 8 | \usage{ 9 | \method{print}{corpus_frame}(x, rows = 20L, chars = NULL, digits = NULL, 10 | quote = FALSE, na.print = NULL, print.gap = NULL, right = FALSE, 11 | row.names = TRUE, max = NULL, display = TRUE, ...) 12 | 13 | \method{format}{corpus_frame}(x, chars = NULL, na.encode = TRUE, quote = FALSE, 14 | na.print = NULL, print.gap = NULL, ..., justify = "none") 15 | } 16 | \arguments{ 17 | \item{x}{data frame object to print or format.} 18 | 19 | \item{rows}{integer scalar giving the maximum number of rows to print 20 | before truncating the output. A negative or missing value indicates 21 | no upper limit.} 22 | 23 | \item{chars}{maximum number of character units to display; see 24 | \code{\link{utf8_format}}.} 25 | 26 | \item{digits}{minimal number of significant digits; see 27 | \code{\link{print.default}}.} 28 | 29 | \item{quote}{logical scalar indicating whether to put surrounding 30 | double-quotes (\code{'"'}) around character strings and escape 31 | internal double-quotes.} 32 | 33 | \item{na.print}{character string (or \code{NULL}) indicating 34 | the encoding for \code{NA} values. Ignored when 35 | \code{na.encode} is \code{FALSE}.} 36 | 37 | \item{print.gap}{non-negative integer (or \code{NULL}) giving the 38 | number of spaces in gaps between columns; set to \code{NULL} 39 | or \code{1} for a single space.} 40 | 41 | \item{right}{logical indicating whether to right-align columns 42 | (ignored for text, character, and factor columns).} 43 | 44 | \item{row.names}{logical indicating whether to print row names, or 45 | a character vector giving alternate row names to display.} 46 | 47 | \item{max}{maximum number of entries to print; defaults to 48 | \code{getOption("max.print")}.} 49 | 50 | \item{display}{logical scalar indicating whether to optimize the 51 | printing for display, not byte-for-byte data transmission; 52 | see \code{utf8_encode}.} 53 | 54 | \item{justify}{justification; one of \code{"left"}, \code{"right"}, 55 | \code{"centre"}, or \code{"none"}. Can be abbreviated.} 56 | 57 | \item{na.encode}{logical scalar indicating whether to encode 58 | \code{NA} values as character strings.} 59 | 60 | \item{...}{further arguments passed to or from other methods.} 61 | } 62 | \details{ 63 | The \code{"corpus_frame"} class is a subclass of \code{"data.frame"}, 64 | overriding the default print and format methods. To apply this 65 | class to a data frame, set its class to 66 | \code{c("corpus_frame", "data.frame")}. 67 | 68 | Corpus frame printing left-justifies character and text columns, 69 | truncates the output, and displays emoji on Mac OS.
70 | } 71 | \seealso{ 72 | \code{\link{corpus_frame}}, \code{\link{print.data.frame}}, 73 | \code{\link{utf8_print}} 74 | } 75 | \examples{ 76 | # default data frame printing 77 | x <- data.frame(text = c("hello world", intToUtf8(0x1f638 + 0:3), letters)) 78 | print(x) 79 | 80 | # corpus frame printing 81 | y <- x 82 | class(y) <- c("corpus_frame", "data.frame") 83 | print(y) 84 | 85 | print(y, 10) # change truncation limit 86 | } 87 | -------------------------------------------------------------------------------- /man/sentiment_afinn.Rd: -------------------------------------------------------------------------------- 1 | \name{sentiment_afinn} 2 | \docType{data} 3 | \alias{sentiment_afinn} 4 | \title{AFINN Sentiment Lexicon} 5 | \description{ 6 | The AFINN lexicon is a list of English terms manually rated 7 | for valence with an integer between -5 (negative) and +5 8 | (positive) by Finn \enc{Årup}{Aarup} Nielsen between 9 | 2009 and 2011. 10 | 11 | The original lexicon contains some multi-word phrases, but they 12 | are excluded here. 13 | 14 | The original lexicon is distributed under the 15 | \href{https://opendatacommons.org/licenses/odbl/1-0/}{Open Database License (ODbL) v1.0}. 16 | You are free to share, create works from, and adapt the lexicon, as long as you 17 | attribute the original lexicon in your work. If you adapt the lexicon, you 18 | must keep the adapted lexicon open and apply a similar license. 19 | } 20 | \usage{sentiment_afinn} 21 | \format{A data frame with one row for each term.} 22 | \source{\url{http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010}} 23 | \references{ 24 | Finn \enc{Årup}{Aarup} Nielsen (2011). 25 | A new ANEW: Evaluation of a word list for sentiment analysis in microblogs. 26 | \emph{Proceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages 718 in CEUR Workshop Proceedings} 27 | 93--98. 28 | \url{http://arxiv.org/abs/1103.2903}. 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/stem_snowball.Rd: -------------------------------------------------------------------------------- 1 | \name{stem_snowball} 2 | \alias{stem_snowball} 3 | \title{Snowball Stemmer} 4 | \description{ 5 | Stem a set of terms using one of the algorithms provided by the 6 | Snowball stemming library. 7 | } 8 | \usage{ 9 | stem_snowball(x, algorithm = "en") 10 | } 11 | \arguments{ 12 | \item{x}{character vector of terms to stem.} 13 | 14 | \item{algorithm}{stemming algorithm; see \sQuote{Details} for the valid 15 | choices.} 16 | } 17 | \details{ 18 | Apply a Snowball stemming algorithm to a vector of input terms, \code{x}, 19 | returning the result in a character vector of the same length with the 20 | same names. 21 | 22 | The \code{algorithm} argument specifies the stemming algorithm.
Valid choices 23 | include the following: 24 | \code{"ar"} (\code{"arabic"}), 25 | \code{"da"} (\code{"danish"}), 26 | \code{"de"} (\code{"german"}), 27 | \code{"en"} (\code{"english"}), 28 | \code{"es"} (\code{"spanish"}), 29 | \code{"fi"} (\code{"finnish"}), 30 | \code{"fr"} (\code{"french"}), 31 | \code{"hu"} (\code{"hungarian"}), 32 | \code{"it"} (\code{"italian"}), 33 | \code{"nl"} (\code{"dutch"}), 34 | \code{"no"} (\code{"norwegian"}), 35 | \code{"pt"} (\code{"portuguese"}), 36 | \code{"ro"} (\code{"romanian"}), 37 | \code{"ru"} (\code{"russian"}), 38 | \code{"sv"} (\code{"swedish"}), 39 | \code{"ta"} (\code{"tamil"}), 40 | \code{"tr"} (\code{"turkish"}), 41 | and \code{"porter"}. 42 | Setting \code{algorithm = NULL} gives a stemmer that returns its input 43 | unchanged. 44 | 45 | The function only stems single-word terms of kind \code{"letter"}; it leaves 46 | other inputs (multi-word terms, and terms of kind \code{"number"}, \code{"punct"}, and 47 | \code{"symbol"}) unchanged. 48 | 49 | The \href{http://snowballstem.org/algorithms/}{Snowball stemming library} 50 | provides the underlying implementation. The \code{wordStem} function from 51 | the \pkg{SnowballC} package provides a similar interface, but that function 52 | applies the algorithm to all input terms, regardless of the kind of the term. 53 | } 54 | \value{ 55 | A character vector with the same length and names as the input, \code{x}, whose 56 | entries contain the corresponding stems. 57 | } 58 | \seealso{ 59 | \code{\link{new_stemmer}}, \code{\link{text_filter}}. 60 | } 61 | \examples{ 62 | # apply the English stemming algorithm; don't stem non-letter terms 63 | stem_snowball(c("win", "winning", "winner", "#winning")) 64 | 65 | # compare with SnowballC, which stems terms of all kinds, not just "letter" 66 | \dontrun{SnowballC::wordStem(c("win", "winning", "winner", "#winning"), "en")} 67 | } 68 | -------------------------------------------------------------------------------- /man/stopwords.Rd: -------------------------------------------------------------------------------- 1 | \name{stopwords} 2 | \docType{data} 3 | \alias{stopwords} 4 | \alias{stopwords_da} 5 | \alias{stopwords_de} 6 | \alias{stopwords_en} 7 | \alias{stopwords_es} 8 | \alias{stopwords_fi} 9 | \alias{stopwords_fr} 10 | \alias{stopwords_hu} 11 | \alias{stopwords_it} 12 | \alias{stopwords_nl} 13 | \alias{stopwords_no} 14 | \alias{stopwords_pt} 15 | \alias{stopwords_ru} 16 | \alias{stopwords_sv} 17 | \title{Stop Words} 18 | \description{ 19 | Lists of common function words (\sQuote{stop} words). 20 | } 21 | \details{ 22 | The \code{stopwords_} objects are character vectors of case-folded 23 | \sQuote{stop} words. These are common function words that often get discarded 24 | before performing other text analysis tasks. 25 | 26 | There are lists available for the following languages: 27 | Danish (\code{stopwords_da}), Dutch (\code{stopwords_nl}), 28 | English (\code{stopwords_en}), Finnish (\code{stopwords_fi}), 29 | French (\code{stopwords_fr}), German (\code{stopwords_de}), 30 | Hungarian (\code{stopwords_hu}), Italian (\code{stopwords_it}), 31 | Norwegian (\code{stopwords_no}), Portuguese (\code{stopwords_pt}), 32 | Russian (\code{stopwords_ru}), Spanish (\code{stopwords_es}), 33 | and Swedish (\code{stopwords_sv}). 34 | 35 | These built-in word lists are reasonable defaults, but they may require 36 | further tailoring to suit your particular task. The original lists were 37 | compiled by the \href{http://snowballstem.org/}{Snowball stemming project}.
38 | Following the Quanteda text analysis software, we have tailored the original 39 | lists by adding the word "will" to the English list. 40 | } 41 | \usage{ 42 | stopwords_da 43 | stopwords_de 44 | stopwords_en 45 | stopwords_es 46 | stopwords_fi 47 | stopwords_fr 48 | stopwords_hu 49 | stopwords_it 50 | stopwords_nl 51 | stopwords_no 52 | stopwords_pt 53 | stopwords_ru 54 | stopwords_sv 55 | } 56 | \format{A character vector of unique stop words.} 57 | \seealso{ 58 | \code{\link{text_filter}} 59 | } 60 | \keyword{datasets} 61 | -------------------------------------------------------------------------------- /man/term_matrix.Rd: -------------------------------------------------------------------------------- 1 | \name{term_matrix} 2 | \alias{term_counts} 3 | \alias{term_matrix} 4 | \title{Term Frequency Tabulation} 5 | \description{ 6 | Tokenize a set of texts and compute a term frequency matrix. 7 | } 8 | \usage{ 9 | term_matrix(x, filter = NULL, ngrams = NULL, select = NULL, 10 | group = NULL, transpose = FALSE, ...) 11 | 12 | term_counts(x, filter = NULL, ngrams = NULL, select = NULL, 13 | group = NULL, ...) 14 | } 15 | \arguments{ 16 | \item{x}{a text vector to tokenize.} 17 | 18 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 19 | the default text filter for \code{x}.} 20 | 21 | \item{ngrams}{an integer vector of n-gram lengths to include, or 22 | \code{NULL} to use the \code{select} argument to determine the 23 | n-gram lengths.} 24 | 25 | \item{select}{a character vector of terms to count, or \code{NULL} to 26 | count all terms that appear in \code{x}.} 27 | 28 | \item{group}{if non-\code{NULL}, a factor, character, or 29 | integer vector with the same length as \code{x} specifying the grouping 30 | behavior.} 31 | 32 | \item{transpose}{a logical value indicating whether to transpose the 33 | result, putting terms as rows instead of columns.} 34 | 35 | \item{\dots}{additional properties to set on the text filter.} 36 | } 37 | \details{ 38 | \code{term_matrix} tokenizes a set of texts and computes the occurrence 39 | counts for each term, returning the result as a sparse matrix 40 | (texts-by-terms). \code{term_counts} returns the same information, but 41 | in a data frame. 42 | 43 | If \code{ngrams} is non-\code{NULL}, then multi-type n-grams are 44 | included in the output for all lengths appearing in the \code{ngrams} 45 | argument. If \code{ngrams} is \code{NULL} but \code{select} is 46 | non-\code{NULL}, then all n-grams appearing in the \code{select} set 47 | are included. If both \code{ngrams} and \code{select} are \code{NULL}, 48 | then only unigrams (single-type terms) are included. 49 | 50 | If \code{group} is \code{NULL}, then the output has one set of term 51 | counts for each input text. Otherwise, we convert \code{group} to 52 | a \code{factor} and compute one set of term counts for each level. 53 | Texts with \code{NA} values for \code{group} get skipped. 54 | } 55 | \value{ 56 | \code{term_matrix} with \code{transpose = FALSE} returns a sparse matrix 57 | in \code{"dgCMatrix"} format with one column for each term and one row for 58 | each input text or (if \code{group} is non-\code{NULL}) for each grouping 59 | level. If \code{filter$select} is non-\code{NULL}, then the column names 60 | will be equal to \code{filter$select}. Otherwise, the columns are assigned 61 | in arbitrary order. 62 | 63 | \code{term_matrix} with \code{transpose = TRUE} returns the transpose of 64 | the term matrix, in \code{"dgCMatrix"} format.
65 | 66 | \code{term_counts} with \code{group = NULL} returns a data frame with one 67 | row for each entry of the term matrix, and columns \code{"text"}, 68 | \code{"term"}, and \code{"count"} giving the text ID, term, and count. 69 | The \code{"term"} column is a factor with levels equal to the selected 70 | terms. The \code{"text"} 71 | column is a factor with levels equal to \code{names(as_corpus_text(x))}; 72 | calling \code{as.integer} on the \code{"text"} column converts from 73 | the factor values to the integer row index in the term matrix. 74 | 75 | \code{term_counts} with \code{group} non-\code{NULL} behaves similarly, 76 | but the result instead has columns named \code{"group"}, \code{"term"}, 77 | and \code{"count"}, with \code{"group"} giving the grouping level, as 78 | a factor. 79 | } 80 | \seealso{ 81 | \code{\link{text_tokens}}, \code{\link{term_stats}}. 82 | } 83 | \examples{ 84 | text <- c("A rose is a rose is a rose.", 85 | "A Rose is red, a violet is blue!", 86 | "A rose by any other name would smell as sweet.") 87 | term_matrix(text) 88 | 89 | # select certain terms 90 | term_matrix(text, select = c("rose", "red", "violet", "sweet")) 91 | 92 | # specify a grouping factor 93 | term_matrix(text, group = c("Good", "Bad", "Good")) 94 | 95 | # include higher-order n-grams 96 | term_matrix(text, ngrams = 1:3) 97 | 98 | # select certain multi-type terms 99 | term_matrix(text, select = c("a rose", "a violet", "sweet", "smell")) 100 | 101 | # transpose the result 102 | term_matrix(text, ngrams = 1:2, transpose = TRUE)[1:10, ] # first 10 rows 103 | 104 | # data frame 105 | head(term_counts(text), n = 10) # first 10 rows 106 | 107 | # with grouping 108 | term_counts(text, group = c("Good", "Bad", "Good")) 109 | 110 | # taking names from the input 111 | term_counts(c(a = "One sentence.", b = "Another", c = "!!")) 112 | } 113 | -------------------------------------------------------------------------------- /man/term_stats.Rd: -------------------------------------------------------------------------------- 1 | \name{term_stats} 2 | \alias{term_stats} 3 | \title{Term Statistics} 4 | \description{ 5 | Tokenize a set of texts and tabulate the term occurrence statistics. 6 | } 7 | \usage{ 8 | term_stats(x, filter = NULL, ngrams = NULL, 9 | min_count = NULL, max_count = NULL, 10 | min_support = NULL, max_support = NULL, types = FALSE, 11 | subset, ...) 
12 | } 13 | \arguments{ 14 | \item{x}{a text vector to tokenize.} 15 | 16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 17 | the default text filter for \code{x}.} 18 | 19 | \item{ngrams}{an integer vector of n-gram lengths to include, or 20 | \code{NULL} for length-1 n-grams only.} 21 | 22 | \item{min_count}{a numeric scalar giving the minimum term count to include 23 | in the output, or \code{NULL} for no minimum count.} 24 | 25 | \item{max_count}{a numeric scalar giving the maximum term count to include 26 | in the output, or \code{NULL} for no maximum count.} 27 | 28 | \item{min_support}{a numeric scalar giving the minimum term support to 29 | include in the output, or \code{NULL} for no minimum support.} 30 | 31 | \item{max_support}{a numeric scalar giving the maximum term support to 32 | include in the output, or \code{NULL} for no maximum support.} 33 | 34 | \item{types}{a logical value indicating whether to include columns for 35 | the types that make up the terms.} 36 | 37 | \item{subset}{logical expression indicating elements or rows to keep: 38 | missing values are taken as false.} 39 | 40 | \item{\dots}{additional properties to set on the text filter.} 41 | } 42 | \details{ 43 | \code{term_stats} tokenizes a set of texts and computes the occurrence 44 | counts and supports for each term. The \sQuote{count} is the number of 45 | occurrences of the term across all texts; the \sQuote{support} is the 46 | number of texts containing the term. Each appearance of a term 47 | increments its count by one. An appearance of a term in text 48 | \code{i}, by contrast, increments its support only once, regardless of 49 | the number of occurrences in that text. 50 | 51 | To include multi-type terms, specify the desired term lengths using 52 | the \code{ngrams} argument. 53 | } 54 | \value{ 55 | A data frame with columns named \code{term}, \code{count}, and 56 | \code{support}, with one row for each appearing term. Rows are sorted 57 | in descending order according to \code{support} and then \code{count}, 58 | with ties broken lexicographically by \code{term}, using the 59 | character ordering determined by the current locale 60 | (see \code{\link{Comparison}} for details). 61 | 62 | If \code{types = TRUE}, then the result also includes columns named 63 | \code{type1}, \code{type2}, etc. for the types that make up the 64 | term. 65 | } 66 | \seealso{ 67 | \code{\link{text_tokens}}, \code{\link{term_matrix}}. 68 | } 69 | \examples{ 70 | term_stats("A rose is a rose is a rose.") 71 | 72 | # remove punctuation and English stop words 73 | term_stats("A rose is a rose is a rose.", 74 | text_filter(drop_symbol = TRUE, drop = stopwords_en)) 75 | 76 | # unigrams, bigrams, and trigrams 77 | term_stats("A rose is a rose is a rose.", ngrams = 1:3) 78 | 79 | # also include the type information 80 | term_stats("A rose is a rose is a rose.", ngrams = 1:3, types = TRUE) 81 | } 82 | -------------------------------------------------------------------------------- /man/text_locate.Rd: -------------------------------------------------------------------------------- 1 | \name{text_locate} 2 | \alias{text_count} 3 | \alias{text_detect} 4 | \alias{text_locate} 5 | \alias{text_match} 6 | \alias{text_sample} 7 | \alias{text_subset} 8 | \title{Searching for Terms} 9 | \description{ 10 | Look for instances of one or more terms in a set of texts. 11 | } 12 | \usage{ 13 | text_locate(x, terms, filter = NULL, ...) 14 | 15 | text_count(x, terms, filter = NULL, ...) 16 | 17 | text_detect(x, terms, filter = NULL, ...)
18 | 19 | text_match(x, terms, filter = NULL, ...) 20 | 21 | text_sample(x, terms, size = NULL, filter = NULL, ...) 22 | 23 | text_subset(x, terms, filter = NULL, ...) 24 | } 25 | \arguments{ 26 | \item{x}{a text or character vector.} 27 | 28 | \item{terms}{a character vector of search terms.} 29 | 30 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 31 | the default text filter for \code{x}.} 32 | 33 | \item{size}{the maximum number of results to return, or \code{NULL}.} 34 | 35 | \item{\dots}{additional properties to set on the text filter.} 36 | } 37 | \details{ 38 | \code{text_locate} finds all instances of the search terms in the 39 | input text, along with their contexts. 40 | 41 | \code{text_count} counts the number of search term instances in 42 | each element of the text vector. 43 | 44 | \code{text_detect} indicates whether each text contains at least 45 | one of the search terms. 46 | 47 | \code{text_match} reports the matching instances as a factor variable 48 | with levels equal to the \code{terms} argument. 49 | 50 | \code{text_subset} returns the texts that contain the search terms. 51 | 52 | \code{text_sample} returns a random sample of the results from 53 | \code{text_locate}, in random order. This is useful for 54 | hand-inspecting a subset of the \code{text_locate} matches. 55 | } 56 | \value{ 57 | \code{text_count} and \code{text_detect} return a numeric vector and 58 | a logical vector, respectively, with length equal to the number of input 59 | texts and names equal to the text names. 60 | 61 | \code{text_locate} and \code{text_sample} both return a data frame with 62 | one row for each search result and columns named \sQuote{text}, \sQuote{before}, 63 | \sQuote{instance}, and \sQuote{after}. The \sQuote{text} column gives 64 | the name of the text containing the instance; \sQuote{before} and 65 | \sQuote{after} are text vectors giving the text before and after the 66 | instance. The \sQuote{instance} column gives the token or tokens matching 67 | the search term. 68 | 69 | \code{text_match} returns a data frame with one row for each search result, 70 | with columns named \sQuote{text} and \sQuote{term}. Both columns are 71 | factors. The \sQuote{text} column has levels equal to the text labels, 72 | and the \sQuote{term} column has levels equal to the \code{terms} argument. 73 | 74 | \code{text_subset} returns the subset of texts that contain the given 75 | search terms. The resulting text vector has its \code{text_filter} set to the 76 | passed-in \code{filter} argument. 77 | } 78 | \seealso{ 79 | \code{\link{term_stats}}, \code{\link{term_matrix}}. 80 | } 81 | \examples{ 82 | text <- c("Rose is a rose is a rose is a rose.", 83 | "A rose by any other name would smell as sweet.", 84 | "Snow White and Rose Red") 85 | 86 | text_count(text, "rose") 87 | text_detect(text, "rose") 88 | text_locate(text, "rose") 89 | text_match(text, "rose") 90 | text_sample(text, "rose", 3) 91 | text_subset(text, "a rose") 92 | 93 | # search for multiple terms 94 | text_locate(text, c("rose", "rose red", "snow white")) 95 | } 96 | -------------------------------------------------------------------------------- /man/text_stats.Rd: -------------------------------------------------------------------------------- 1 | \name{text_stats} 2 | \alias{text_stats} 3 | \title{Text Statistics} 4 | \description{ 5 | Report descriptive statistics for a set of texts. 6 | } 7 | \usage{ 8 | text_stats(x, filter = NULL, ...)
9 | } 10 | \arguments{ 11 | \item{x}{a text corpus.} 12 | 13 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 14 | the default text filter for \code{x}.} 15 | 16 | \item{\dots}{additional properties to set on the text filter.} 17 | } 18 | \details{ 19 | \code{text_stats} reports descriptive statistics for a set of texts: 20 | the number of tokens, unique types, and sentences. 21 | } 22 | \value{ 23 | A data frame with columns named \code{tokens}, \code{types}, and 24 | \code{sentences}, with one row for each text. 25 | } 26 | \seealso{ 27 | \code{\link{text_filter}}, \code{\link{term_stats}}. 28 | } 29 | \examples{ 30 | text_stats(c("A rose is a rose is a rose.", 31 | "A Rose is red. A violet is blue!")) 32 | } 33 | -------------------------------------------------------------------------------- /man/text_sub.Rd: -------------------------------------------------------------------------------- 1 | \name{text_sub} 2 | \alias{text_sub} 3 | \title{Text Subsequences} 4 | \description{ 5 | Extract token subsequences from a set of texts. 6 | } 7 | \usage{ 8 | text_sub(x, start = 1L, end = -1L, filter = NULL, ...) 9 | } 10 | \arguments{ 11 | \item{x}{text vector or corpus object.} 12 | 13 | \item{start}{integer vector giving the starting positions of the 14 | subsequences, or a two-column integer matrix giving the starting 15 | and ending positions.} 16 | 17 | \item{end}{integer vector giving the ending positions of the 18 | subsequences; ignored if \code{start} is a two-column matrix.} 19 | 20 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 21 | the default text filter for \code{x}.} 22 | 23 | \item{\dots}{additional properties to set on the text filter.} 24 | } 25 | \details{ 26 | \code{text_sub} extracts token subsequences from a set of texts. 27 | The \code{start} and \code{end} arguments specify the 28 | positions of the subsequences within the parent texts, as an inclusive 29 | range. Negative indices are interpreted as counting from the end of 30 | the text, with \code{-1L} referring to the last element. 31 | } 32 | \value{ 33 | A text vector with the same length and names as \code{x}, with the 34 | desired subsequences. 35 | } 36 | \seealso{ 37 | \code{\link{text_tokens}}, \code{\link{text_ntoken}}. 38 | } 39 | \examples{ 40 | x <- as_corpus_text(c("A man, a plan.", "A \"canal\"?", "Panama!"), 41 | drop_punct = TRUE) 42 | 43 | # entire text 44 | text_sub(x, 1, -1) 45 | 46 | # first three elements 47 | text_sub(x, 1, 3) 48 | 49 | # last two elements 50 | text_sub(x, -2, -1) 51 | } 52 | -------------------------------------------------------------------------------- /man/text_types.Rd: -------------------------------------------------------------------------------- 1 | \name{text_types} 2 | \alias{text_ntype} 3 | \alias{text_types} 4 | \title{Text Type Sets} 5 | \description{ 6 | Get or measure the set of types (unique token values). 7 | } 8 | \usage{ 9 | text_types(x, filter = NULL, collapse = FALSE, ...) 10 | 11 | text_ntype(x, filter = NULL, collapse = FALSE, ...)
12 | } 13 | \arguments{ 14 | \item{x}{a text or character vector.} 15 | 16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 17 | the default text filter for \code{x}.} 18 | 19 | \item{collapse}{a logical value indicating whether to collapse the 20 | aggregation over all rows of the input.} 21 | 22 | \item{\dots}{additional properties to set on the text filter.} 23 | } 24 | \details{ 25 | \code{text_ntype} counts the number of unique types in each text; 26 | \code{text_types} returns the set of unique types, as a character 27 | vector. Types are determined according to the \code{filter} argument. 28 | } 29 | \value{ 30 | If \code{collapse = FALSE}, then \code{text_ntype} produces a numeric 31 | vector with the same length and names as the input text, with the elements 32 | giving the number of unique types in the corresponding texts. For 33 | \code{text_types}, the result is a list of character vectors, with each 34 | vector giving the unique types in the corresponding text, ordered 35 | according to the \code{\link{sort}} function. 36 | 37 | If \code{collapse = TRUE}, then we aggregate over all rows of the input. 38 | In this case, \code{text_ntype} produces a scalar indicating the number 39 | of unique types in \code{x}, and \code{text_types} produces a character 40 | vector with the unique types. 41 | } 42 | \seealso{ 43 | \code{\link{text_filter}}, \code{\link{text_tokens}}. 44 | } 45 | \examples{ 46 | text <- c("I saw Mr. Jones today.", 47 | "Split across\na line.", 48 | "What. Are. You. Doing????", 49 | "She asked 'do you really mean that?' and I said 'yes.'") 50 | 51 | # count the number of unique types 52 | text_ntype(text) 53 | text_ntype(text, collapse = TRUE) 54 | 55 | # get the type sets 56 | text_types(text) 57 | text_types(text, collapse = TRUE) 58 | } 59 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CFLAGS = -Icorpus/src 2 | PKG_LIBS = -L.
-lccorpus 3 | 4 | SNOWBALL = corpus/lib/libstemmer_c 5 | STEMMER_O = $(SNOWBALL)/src_c/stem_UTF_8_arabic.o \ 6 | $(SNOWBALL)/src_c/stem_UTF_8_danish.o \ 7 | $(SNOWBALL)/src_c/stem_UTF_8_dutch.o \ 8 | $(SNOWBALL)/src_c/stem_UTF_8_english.o \ 9 | $(SNOWBALL)/src_c/stem_UTF_8_finnish.o \ 10 | $(SNOWBALL)/src_c/stem_UTF_8_french.o \ 11 | $(SNOWBALL)/src_c/stem_UTF_8_german.o \ 12 | $(SNOWBALL)/src_c/stem_UTF_8_hungarian.o \ 13 | $(SNOWBALL)/src_c/stem_UTF_8_italian.o \ 14 | $(SNOWBALL)/src_c/stem_UTF_8_norwegian.o \ 15 | $(SNOWBALL)/src_c/stem_UTF_8_porter.o \ 16 | $(SNOWBALL)/src_c/stem_UTF_8_portuguese.o \ 17 | $(SNOWBALL)/src_c/stem_UTF_8_romanian.o \ 18 | $(SNOWBALL)/src_c/stem_UTF_8_russian.o \ 19 | $(SNOWBALL)/src_c/stem_UTF_8_spanish.o \ 20 | $(SNOWBALL)/src_c/stem_UTF_8_swedish.o \ 21 | $(SNOWBALL)/src_c/stem_UTF_8_tamil.o \ 22 | $(SNOWBALL)/src_c/stem_UTF_8_turkish.o \ 23 | $(SNOWBALL)/runtime/api.o \ 24 | $(SNOWBALL)/runtime/utilities.o \ 25 | $(SNOWBALL)/libstemmer/libstemmer_utf8.o 26 | 27 | UTF8LITE = corpus/lib/utf8lite 28 | UTF8LITE_O = $(UTF8LITE)/src/array.o \ 29 | $(UTF8LITE)/src/char.o \ 30 | $(UTF8LITE)/src/encode.o \ 31 | $(UTF8LITE)/src/error.o \ 32 | $(UTF8LITE)/src/escape.o \ 33 | $(UTF8LITE)/src/graph.o \ 34 | $(UTF8LITE)/src/graphscan.o \ 35 | $(UTF8LITE)/src/normalize.o \ 36 | $(UTF8LITE)/src/render.o \ 37 | $(UTF8LITE)/src/text.o \ 38 | $(UTF8LITE)/src/textassign.o \ 39 | $(UTF8LITE)/src/textiter.o \ 40 | $(UTF8LITE)/src/textmap.o 41 | 42 | LIBCORPUS = corpus/lib/strntod.o corpus/lib/strntoimax.o \ 43 | corpus/src/array.o corpus/src/census.o corpus/src/data.o \ 44 | corpus/src/datatype.o corpus/src/error.o corpus/src/filebuf.o \ 45 | corpus/src/filter.o corpus/src/intset.o corpus/src/memory.o \ 46 | corpus/src/ngram.o corpus/src/search.o corpus/src/sentfilter.o \ 47 | corpus/src/sentscan.o corpus/src/stem.o corpus/src/stopword.o \ 48 | corpus/src/symtab.o corpus/src/table.o corpus/src/termset.o \ 49 | corpus/src/textset.o corpus/src/tree.o corpus/src/wordscan.o \ 50 | $(STEMMER_O) $(UTF8LITE_O) 51 | 52 | $(SHLIB): libccorpus.a 53 | 54 | libccorpus.a: $(LIBCORPUS) 55 | $(AR) rcs $@ $(LIBCORPUS) 56 | $(RANLIB) $@ 57 | 58 | clean: 59 | rm -f $(LIBCORPUS) $(SHLIB) $(OBJECTS) libccorpus.a 60 | -------------------------------------------------------------------------------- /src/context.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "rcorpus.h" 22 | 23 | #define CONTEXT_TAG install("corpus::context") 24 | 25 | 26 | struct context { 27 | void *data; 28 | void (*destroy_func)(void *); 29 | }; 30 | 31 | 32 | void free_context(SEXP x) 33 | { 34 | struct context *ctx = R_ExternalPtrAddr(x); 35 | R_SetExternalPtrAddr(x, NULL); 36 | if (ctx) { 37 | if (ctx->destroy_func) { 38 | (ctx->destroy_func)(ctx->data); 39 | } 40 | corpus_free(ctx->data); 41 | corpus_free(ctx); 42 | } 43 | } 44 | 45 | 46 | SEXP alloc_context(size_t size, void (*destroy_func)(void *)) 47 | { 48 | SEXP ans; 49 | struct context *ctx = NULL; 50 | void *obj = NULL; 51 | int err = 0; 52 | 53 | PROTECT(ans = R_MakeExternalPtr(NULL, CONTEXT_TAG, R_NilValue)); 54 | R_RegisterCFinalizerEx(ans, free_context, TRUE); 55 | 56 | TRY_ALLOC(obj = corpus_calloc(1, size == 0 ? 1 : size)); 57 | TRY_ALLOC(ctx = corpus_calloc(1, sizeof(*ctx))); 58 | 59 | ctx->data = obj; 60 | ctx->destroy_func = destroy_func; 61 | R_SetExternalPtrAddr(ans, ctx); 62 | ctx = NULL; 63 | obj = NULL; 64 | out: 65 | corpus_free(ctx); 66 | corpus_free(obj); 67 | CHECK_ERROR(err); 68 | UNPROTECT(1); 69 | return ans; 70 | } 71 | 72 | 73 | int is_context(SEXP x) 74 | { 75 | return ((TYPEOF(x) == EXTPTRSXP) 76 | && (R_ExternalPtrTag(x) == CONTEXT_TAG)); 77 | } 78 | 79 | 80 | void *as_context(SEXP x) 81 | { 82 | struct context *ctx; 83 | 84 | if (!is_context(x)) { 85 | error("invalid context object"); 86 | } 87 | 88 | ctx = R_ExternalPtrAddr(x); 89 | return ctx->data; 90 | } 91 | -------------------------------------------------------------------------------- /src/filebuf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include "corpus/src/memory.h" 20 | #include "corpus/src/filebuf.h" 21 | #include 22 | #include "rcorpus.h" 23 | 24 | #define FILEBUF_TAG install("corpus::filebuf") 25 | 26 | 27 | static struct corpus_filebuf *filebuf_new(const char *filename) 28 | { 29 | struct corpus_filebuf *obj = NULL; 30 | struct corpus_filebuf buf; 31 | 32 | errno = 0; 33 | 34 | if (corpus_filebuf_init(&buf, filename) == 0) { 35 | if (!(obj = corpus_malloc(sizeof(*obj)))) { 36 | corpus_filebuf_destroy(&buf); 37 | error("failed allocating memory"); 38 | } 39 | *obj = buf; 40 | } else { 41 | if (errno) { 42 | error("cannot open file '%s': %s", 43 | filename, strerror(errno)); 44 | } else { 45 | error("cannot open file '%s'", filename); 46 | } 47 | } 48 | 49 | return obj; 50 | } 51 | 52 | 53 | static void filebuf_free(struct corpus_filebuf *buf) 54 | { 55 | if (buf) { 56 | corpus_filebuf_destroy(buf); 57 | corpus_free(buf); 58 | } 59 | } 60 | 61 | 62 | static void free_filebuf(SEXP sbuf) 63 | { 64 | struct corpus_filebuf *buf = R_ExternalPtrAddr(sbuf); 65 | R_SetExternalPtrAddr(sbuf, NULL); 66 | filebuf_free(buf); 67 | } 68 | 69 | 70 | SEXP alloc_filebuf(SEXP sfile) 71 | { 72 | SEXP ans, sclass, shandle, snames; 73 | struct corpus_filebuf *buf; 74 | const char *file; 75 | 76 | if (!(isString(sfile) && LENGTH(sfile) == 1)) { 77 | error("invalid 'file' argument"); 78 | } 79 | 80 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0))); 81 | 82 | PROTECT(shandle = R_MakeExternalPtr(NULL, FILEBUF_TAG, R_NilValue)); 83 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE); 84 | 85 | buf = filebuf_new(file); 86 | R_SetExternalPtrAddr(shandle, buf); 87 | 88 | PROTECT(ans = allocVector(VECSXP, 2)); 89 | SET_VECTOR_ELT(ans, 0, shandle); 90 | SET_VECTOR_ELT(ans, 1, sfile); 91 | 92 | PROTECT(snames = allocVector(STRSXP, 2)); 93 | SET_STRING_ELT(snames, 0, mkChar("handle")); 94 | SET_STRING_ELT(snames, 1, mkChar("file")); 95 | setAttrib(ans, R_NamesSymbol, snames); 96 | 97 | PROTECT(sclass = allocVector(STRSXP, 1)); 98 | SET_STRING_ELT(sclass, 0, mkChar("filebuf")); 99 | setAttrib(ans, R_ClassSymbol, sclass); 100 | 101 | UNPROTECT(4); 102 | return ans; 103 | } 104 | 105 | 106 | int is_filebuf(SEXP sbuf) 107 | { 108 | SEXP handle, file; 109 | 110 | if (!isVectorList(sbuf)) { 111 | return 0; 112 | } 113 | 114 | handle = getListElement(sbuf, "handle"); 115 | if (handle == R_NilValue) { 116 | return 0; 117 | } 118 | 119 | file = getListElement(sbuf, "file"); 120 | if (file == R_NilValue) { 121 | return 0; 122 | } 123 | 124 | return ((TYPEOF(handle) == EXTPTRSXP) 125 | && (R_ExternalPtrTag(handle) == FILEBUF_TAG)); 126 | } 127 | 128 | 129 | struct corpus_filebuf *as_filebuf(SEXP sbuf) 130 | { 131 | SEXP shandle, sfile; 132 | struct corpus_filebuf *buf; 133 | const char *file; 134 | 135 | if (!is_filebuf(sbuf)) { 136 | error("invalid 'filebuf' object"); 137 | } 138 | 139 | shandle = getListElement(sbuf, "handle"); 140 | buf = R_ExternalPtrAddr(shandle); 141 | 142 | if (buf == NULL) { 143 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE); 144 | 145 | sfile = getListElement(sbuf, "file"); 146 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0))); 147 | buf = filebuf_new(file); 148 | 149 | if (buf == NULL) { 150 | if (errno) { 151 | error("cannot open file '%s': %s", file, 152 | strerror(errno)); 153 | } else { 154 | error("cannot open file '%s'", file); 155 | } 156 | } 157 | 158 | R_SetExternalPtrAddr(shandle, buf); 159 | } 160 | 161 | return buf; 162 | } 163 | 
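164 | /* 165 |  * Illustrative usage sketch (an editorial addition, not part of the 166 |  * original file): how a caller exercises the external-pointer lifecycle 167 |  * defined above. alloc_filebuf() opens the file and registers the 168 |  * free_filebuf() finalizer; as_filebuf() lazily re-opens the file when 169 |  * the handle pointer is NULL, which happens after the object has been 170 |  * serialized and reloaded. The function name below is hypothetical. 171 |  */ 172 | static void filebuf_usage_sketch(SEXP sfile) 173 | { 174 | 	SEXP sbuf; 175 | 	struct corpus_filebuf *buf; 176 | 177 | 	PROTECT(sbuf = alloc_filebuf(sfile)); /* open; register finalizer */ 178 | 	buf = as_filebuf(sbuf); /* pointer still live: a simple lookup */ 179 | 	(void)buf; /* ... read from the buffer here ... */ 180 | 	UNPROTECT(1); /* free_filebuf() closes the file at GC time */ 181 | }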
-------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "rcorpus.h" 22 | 23 | #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n} 24 | 25 | static const R_CallMethodDef CallEntries[] = { 26 | CALLDEF(abbreviations, 1), 27 | CALLDEF(alloc_text_handle, 0), 28 | CALLDEF(anyNA_text, 1), 29 | CALLDEF(as_character_json, 1), 30 | CALLDEF(as_character_text, 1), 31 | CALLDEF(as_integer_json, 1), 32 | CALLDEF(as_double_json, 1), 33 | CALLDEF(as_list_json, 1), 34 | CALLDEF(as_logical_json, 1), 35 | CALLDEF(as_text_character, 2), 36 | CALLDEF(as_text_filter_connector, 1), 37 | CALLDEF(as_text_json, 2), 38 | CALLDEF(dim_json, 1), 39 | CALLDEF(is_na_text, 1), 40 | CALLDEF(length_json, 1), 41 | CALLDEF(length_text, 1), 42 | CALLDEF(logging_off, 0), 43 | CALLDEF(logging_on, 0), 44 | CALLDEF(mmap_ndjson, 2), 45 | CALLDEF(names_json, 1), 46 | CALLDEF(names_text, 1), 47 | CALLDEF(print_json, 1), 48 | CALLDEF(read_ndjson, 2), 49 | CALLDEF(simplify_json, 1), 50 | CALLDEF(stem_snowball, 2), 51 | CALLDEF(stopwords, 1), 52 | CALLDEF(subscript_json, 2), 53 | CALLDEF(subset_json, 3), 54 | CALLDEF(term_stats, 7), 55 | CALLDEF(term_matrix, 4), 56 | CALLDEF(text_c, 3), 57 | CALLDEF(text_count, 2), 58 | CALLDEF(text_detect, 2), 59 | CALLDEF(text_locate, 2), 60 | CALLDEF(text_match, 2), 61 | CALLDEF(text_nsentence, 1), 62 | CALLDEF(text_ntoken, 1), 63 | CALLDEF(text_ntype, 2), 64 | CALLDEF(text_split_sentences, 2), 65 | CALLDEF(text_split_tokens, 2), 66 | CALLDEF(text_sub, 3), 67 | CALLDEF(text_trunc, 3), 68 | CALLDEF(text_tokens, 1), 69 | CALLDEF(text_types, 2), 70 | CALLDEF(text_valid, 1), 71 | {NULL, NULL, 0} 72 | }; 73 | 74 | 75 | void R_init_corpus(DllInfo *dll) 76 | { 77 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 78 | R_useDynamicSymbols(dll, FALSE); 79 | R_forceSymbols(dll, TRUE); 80 | } 81 | -------------------------------------------------------------------------------- /src/logging.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | #include "corpus/src/error.h" 19 | 20 | 21 | static void ignore_message(int code, const char *message) 22 | { 23 | (void)code; 24 | (void)message; 25 | } 26 | 27 | 28 | SEXP logging_off(void) 29 | { 30 | corpus_log_func = ignore_message; 31 | return R_NilValue; 32 | } 33 | 34 | 35 | SEXP logging_on(void) 36 | { 37 | corpus_log_func = NULL; 38 | return R_NilValue; 39 | } 40 | -------------------------------------------------------------------------------- /src/mkchar.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | 21 | static void mkchar_ensure(struct mkchar *mk, int nmin); 22 | 23 | 24 | void mkchar_init(struct mkchar *mk) 25 | { 26 | mk->buf = NULL; 27 | mk->size = 0; 28 | } 29 | 30 | 31 | SEXP mkchar_get(struct mkchar *mk, const struct utf8lite_text *text) 32 | { 33 | SEXP ans; 34 | uint8_t *ptr; 35 | size_t len = UTF8LITE_TEXT_SIZE(text); 36 | struct utf8lite_text_iter it; 37 | 38 | if (len > INT_MAX) { 39 | error("character string length exceeds maximum (%d)", INT_MAX); 40 | } 41 | 42 | if (text->ptr == NULL) { 43 | ans = NA_STRING; 44 | } else { 45 | if (UTF8LITE_TEXT_HAS_ESC(text)) { 46 | mkchar_ensure(mk, (int)len); 47 | 48 | utf8lite_text_iter_make(&it, text); 49 | ptr = mk->buf; 50 | while (utf8lite_text_iter_advance(&it)) { 51 | utf8lite_encode_utf8(it.current, &ptr); 52 | } 53 | len = (size_t)(ptr - mk->buf); 54 | ptr = mk->buf; 55 | } else { 56 | ptr = (uint8_t *)text->ptr; 57 | } 58 | 59 | ans = mkCharLenCE((char *)ptr, (int)len, CE_UTF8); 60 | } 61 | 62 | return ans; 63 | } 64 | 65 | 66 | static void mkchar_ensure(struct mkchar *mk, int nmin) 67 | { 68 | int size = mk->size; 69 | 70 | if (nmin <= size) { 71 | return; 72 | } 73 | 74 | corpus_array_size_add(&size, 1, 0, nmin); // can't overflow 75 | mk->buf = (void *)R_alloc(size, sizeof(uint8_t)); 76 | mk->size = size; 77 | } 78 | -------------------------------------------------------------------------------- /src/ndjson.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | 21 | SEXP mmap_ndjson(SEXP sfile, SEXP stext) 22 | { 23 | SEXP ans, sbuf; 24 | 25 | PROTECT(sbuf = alloc_filebuf(sfile)); 26 | PROTECT(ans = alloc_json(sbuf, R_NilValue, R_NilValue, stext)); 27 | as_json(ans); // force data load 28 | UNPROTECT(2); 29 | 30 | return ans; 31 | } 32 | 33 | 34 | SEXP read_ndjson(SEXP sbuffer, SEXP stext) 35 | { 36 | SEXP ans; 37 | 38 | assert(TYPEOF(sbuffer) == RAWSXP); 39 | 40 | PROTECT(ans = alloc_json(sbuffer, R_NilValue, R_NilValue, stext)); 41 | as_json(ans); // force data load 42 | UNPROTECT(1); 43 | 44 | return ans; 45 | } 46 | -------------------------------------------------------------------------------- /src/search.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | #define SEARCH_TAG install("corpus::search") 21 | 22 | 23 | static struct corpus_search *search_new(void); 24 | 25 | struct corpus_search *search_new(void) 26 | { 27 | struct corpus_search *obj; 28 | int err; 29 | 30 | TRY_ALLOC(obj = corpus_calloc(1, sizeof(*obj))); 31 | TRY(corpus_search_init(obj)); 32 | 33 | err = 0; 34 | out: 35 | if (err) { 36 | corpus_free(obj); 37 | Rf_error("memory allocation failure"); 38 | } 39 | 40 | return obj; 41 | } 42 | 43 | 44 | void corpus_search_free(struct corpus_search *obj) 45 | { 46 | if (!obj) { 47 | return; 48 | } 49 | 50 | corpus_search_destroy(obj); 51 | corpus_free(obj); 52 | } 53 | 54 | 55 | static void free_search(SEXP obj) 56 | { 57 | struct corpus_search *search = R_ExternalPtrAddr(obj); 58 | corpus_search_free(search); 59 | R_ClearExternalPtr(obj); 60 | } 61 | 62 | 63 | int is_search(SEXP ssearch) 64 | { 65 | return ((TYPEOF(ssearch) == EXTPTRSXP) 66 | && (R_ExternalPtrTag(ssearch) == SEARCH_TAG)); 67 | } 68 | 69 | 70 | struct corpus_search *as_search(SEXP ssearch) 71 | { 72 | if (!is_search(ssearch)) { 73 | Rf_error("invalid 'search' object"); 74 | } 75 | return R_ExternalPtrAddr(ssearch); 76 | } 77 | 78 | 79 | SEXP alloc_search(SEXP sterms, const char *name, struct corpus_filter *filter) 80 | { 81 | SEXP ans, sset, items; 82 | const struct corpus_termset_term *term; 83 | struct corpus_search *obj; 84 | struct termset *termset; 85 | int i, n; 86 | int err = 0, nprot; 87 | 88 | nprot = 0; 89 | 90 | obj = search_new(); 91 | PROTECT(ans = R_MakeExternalPtr(obj, SEARCH_TAG, R_NilValue)); nprot++; 92 | R_RegisterCFinalizerEx(ans, free_search, TRUE); 93 | 94 | PROTECT(sset = alloc_termset(sterms, name, filter, 1)); nprot++; 95 | termset = as_termset(sset); 96 | items = items_termset(sset); 97 | R_SetExternalPtrProtected(ans, items); 98 | 99 | n = termset->nitem; 100 | for (i = 0; i < n; i++) { 101 | RCORPUS_CHECK_INTERRUPT(i); 102 | term = &termset->set.items[i]; 103 | TRY(corpus_search_add(obj, term->type_ids, 104 | term->length, NULL)); 105 | } 106 | 107 | 
out: 108 | CHECK_ERROR(err); 109 | UNPROTECT(nprot); 110 | return ans; 111 | } 112 | 113 | 114 | SEXP items_search(SEXP ssearch) 115 | { 116 | return R_ExternalPtrProtected(ssearch); 117 | } 118 | -------------------------------------------------------------------------------- /src/text_methods.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "rcorpus.h" 23 | 24 | 25 | SEXP names_text(SEXP text) 26 | { 27 | if (!is_text(text)) { 28 | error("invalid text object"); 29 | } 30 | return getListElement(text, "names"); 31 | } 32 | 33 | 34 | SEXP filter_text(SEXP text) 35 | { 36 | if (!is_text(text)) { 37 | error("invalid text object"); 38 | } 39 | return getListElement(text, "filter"); 40 | } 41 | 42 | 43 | SEXP length_text(SEXP stext) 44 | { 45 | R_xlen_t len; 46 | as_text(stext, &len); 47 | return ScalarReal((double)len); 48 | } 49 | 50 | 51 | SEXP is_na_text(SEXP stext) 52 | { 53 | SEXP ans; 54 | struct utf8lite_text *text; 55 | R_xlen_t i, n; 56 | int *isna; 57 | 58 | text = as_text(stext, &n); 59 | PROTECT(ans = allocVector(LGLSXP, n)); 60 | isna = LOGICAL(ans); 61 | 62 | for (i = 0; i < n; i++) { 63 | RCORPUS_CHECK_INTERRUPT(i); 64 | 65 | if (text[i].ptr) { 66 | isna[i] = FALSE; 67 | } else { 68 | isna[i] = TRUE; 69 | } 70 | } 71 | 72 | UNPROTECT(1); 73 | return ans; 74 | } 75 | 76 | 77 | SEXP anyNA_text(SEXP stext) 78 | { 79 | struct utf8lite_text *text; 80 | R_xlen_t i, n; 81 | int anyNA; 82 | 83 | text = as_text(stext, &n); 84 | 85 | anyNA = FALSE; 86 | for (i = 0; i < n; i++) { 87 | RCORPUS_CHECK_INTERRUPT(i); 88 | 89 | if (!text[i].ptr) { 90 | anyNA = TRUE; 91 | break; 92 | } 93 | } 94 | 95 | return ScalarLogical(anyNA); 96 | } 97 | -------------------------------------------------------------------------------- /src/text_nunit.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "rcorpus.h" 21 | 22 | 23 | SEXP text_nsentence(SEXP sx) 24 | { 25 | SEXP ans, names; 26 | struct corpus_sentfilter *filter; 27 | const struct utf8lite_text *text; 28 | double *count; 29 | R_xlen_t i, n, nunit; 30 | int nprot, err = 0; 31 | 32 | nprot = 0; 33 | 34 | // x 35 | PROTECT(sx = coerce_text(sx)); nprot++; 36 | text = as_text(sx, &n); 37 | filter = text_sentfilter(sx); 38 | names = names_text(sx); 39 | 40 | PROTECT(ans = allocVector(REALSXP, n)); nprot++; 41 | setAttrib(ans, R_NamesSymbol, names); 42 | count = REAL(ans); 43 | 44 | for (i = 0; i < n; i++) { 45 | RCORPUS_CHECK_INTERRUPT(i); 46 | 47 | if (!text[i].ptr) { // missing value 48 | count[i] = NA_REAL; 49 | continue; 50 | } 51 | 52 | if (UTF8LITE_TEXT_SIZE(&text[i]) == 0) { // empty text 53 | count[i] = 0; 54 | continue; 55 | } 56 | 57 | TRY(corpus_sentfilter_start(filter, &text[i])); 58 | 59 | nunit = 0; 60 | while (corpus_sentfilter_advance(filter)) { 61 | nunit++; 62 | } 63 | TRY(filter->error); 64 | 65 | count[i] = (double)nunit; 66 | } 67 | 68 | out: 69 | CHECK_ERROR(err); 70 | UNPROTECT(nprot); 71 | return ans; 72 | } 73 | 74 | 75 | SEXP text_ntoken(SEXP sx) 76 | { 77 | SEXP ans, names; 78 | struct corpus_filter *filter; 79 | const struct utf8lite_text *text; 80 | double *count; 81 | R_xlen_t i, n, nunit; 82 | int nprot, err = 0; 83 | 84 | nprot = 0; 85 | 86 | PROTECT(sx = coerce_text(sx)); nprot++; 87 | text = as_text(sx, &n); 88 | names = names_text(sx); 89 | filter = text_filter(sx); 90 | 91 | PROTECT(ans = allocVector(REALSXP, n)); nprot++; 92 | setAttrib(ans, R_NamesSymbol, names); 93 | count = REAL(ans); 94 | 95 | for (i = 0; i < n; i++) { 96 | RCORPUS_CHECK_INTERRUPT(i); 97 | 98 | if (!text[i].ptr) { // missing text 99 | count[i] = NA_REAL; 100 | continue; 101 | } 102 | 103 | TRY(corpus_filter_start(filter, &text[i])); 104 | 105 | nunit = 0; 106 | 107 | while (corpus_filter_advance(filter)) { 108 | if (filter->type_id < 0) { 109 | continue; 110 | } 111 | nunit++; 112 | } 113 | TRY(filter->error); 114 | 115 | count[i] = (double)nunit; 116 | } 117 | 118 | out: 119 | UNPROTECT(nprot); 120 | CHECK_ERROR(err); 121 | return ans; 122 | } 123 | -------------------------------------------------------------------------------- /src/text_sub.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | 20 | static int text_len(const struct utf8lite_text *text, 21 | struct corpus_filter *filter) 22 | { 23 | int err = 0, len = 0; 24 | 25 | TRY(corpus_filter_start(filter, text)); 26 | while (corpus_filter_advance(filter)) { 27 | if (filter->type_id == CORPUS_TYPE_NONE) { 28 | continue; 29 | } 30 | len++; 31 | } 32 | TRY(filter->error); 33 | out: 34 | CHECK_ERROR(err); 35 | return len; 36 | } 37 | 38 | 39 | SEXP text_sub(SEXP sx, SEXP sstart, SEXP send) 40 | { 41 | SEXP ans, sources, table, tsource, trow, tstart, tstop, names, sfilter; 42 | const struct utf8lite_text *text; 43 | const uint8_t *base, *ptr; 44 | struct corpus_filter *filter; 45 | const int *start, *end; 46 | R_xlen_t i, n, nstart, nend; 47 | int err = 0, nprot = 0, s, e, j, m; 48 | 49 | text = as_text(sx, &n); 50 | filter = text_filter(sx); 51 | sources = getListElement(sx, "sources"); 52 | table = getListElement(sx, "table"); 53 | tsource = getListElement(table, "source"); 54 | trow = getListElement(table, "row"); 55 | tstart = getListElement(table, "start"); 56 | tstop = getListElement(table, "stop"); 57 | names = names_text(sx); 58 | sfilter = filter_text(sx); 59 | 60 | PROTECT(tstart = duplicate(tstart)); nprot++; 61 | PROTECT(tstop = duplicate(tstop)); nprot++; 62 | 63 | start = INTEGER(sstart); 64 | nstart = XLENGTH(sstart); 65 | 66 | end = INTEGER(send); 67 | nend = XLENGTH(send); 68 | 69 | for (i = 0; i < n; i++) { 70 | RCORPUS_CHECK_INTERRUPT(i); 71 | 72 | s = start[i % nstart]; 73 | e = end[i % nend]; 74 | 75 | // handle missing text, missing endpoints 76 | if (!text[i].ptr || s == NA_INTEGER || e == NA_INTEGER) { 77 | INTEGER(tstart)[i] = NA_INTEGER; 78 | INTEGER(tstop)[i] = NA_INTEGER; 79 | continue; 80 | } 81 | 82 | // convert negative indices to non-negative, 83 | // except for end = -1 84 | if (s < 0 || e < -1) { 85 | m = text_len(&text[i], filter); 86 | 87 | if (s < 0) { 88 | s = s + m + 1; 89 | if (s < 0) { 90 | s = 0; 91 | } 92 | } 93 | 94 | if (e < -1) { 95 | e = e + m + 1; 96 | if (e < 0) { 97 | e = 0; 98 | } 99 | } 100 | } 101 | 102 | // clip start to [1,Inf) 103 | if (s == 0) { 104 | s = 1; 105 | } 106 | 107 | base = text[i].ptr - (INTEGER(tstart)[i] - 1); 108 | 109 | // find start 110 | j = 0; 111 | TRY(corpus_filter_start(filter, &text[i])); 112 | while (j != s && corpus_filter_advance(filter)) { 113 | if (filter->type_id == CORPUS_TYPE_NONE) { 114 | // skip ignored 115 | continue; 116 | } 117 | j++; 118 | } 119 | TRY(filter->error); 120 | 121 | // handle case when start is after end of text 122 | if (j < s) { 123 | INTEGER(tstart)[i] = INTEGER(tstop)[i] + 1; 124 | continue; 125 | } 126 | 127 | // set subsequence start 128 | ptr = filter->current.ptr; 129 | INTEGER(tstart)[i] = (int)(ptr - base) + 1; 130 | 131 | // handle case when end is the last token 132 | if (e == -1) { 133 | continue; 134 | } 135 | 136 | // find end 137 | while (j != e + 1 && corpus_filter_advance(filter)) { 138 | if (filter->type_id == CORPUS_TYPE_NONE) { 139 | // skip ignored 140 | continue; 141 | } 142 | j++; 143 | } 144 | TRY(filter->error); 145 | 146 | // handle case when end is after end of text 147 | if (j < e + 1) { 148 | continue; 149 | } 150 | 151 | // set subsequence end 152 | ptr = filter->current.ptr; 153 | INTEGER(tstop)[i] = (int)(ptr - base); 154 | } 155 | 156 | PROTECT(ans = alloc_text(sources, tsource, trow, tstart, tstop, 157 | names, sfilter)); 158 | nprot++; 159 | 160 | out: 161 | UNPROTECT(nprot); 162 | CHECK_ERROR(err); 163 | return ans; 164 | } 165 | 
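The routine above backs the exported `text_sub()` R function (documented in man/text_sub.Rd). It never copies string data: it duplicates the text table's start/stop offset columns, walks token boundaries with the same corpus_filter used for tokenization, and rewrites the offsets so the result points back into the original sources. A minimal R-level sketch of the indexing semantics implemented here, assuming the documented `text_sub(x, start, end)` interface; outputs shown are illustrative:

    library(corpus)
    x <- as_corpus_text("one two three four")
    text_sub(x, 2, 3)    # tokens 2..3 -> "two three"
    text_sub(x, -2, -1)  # negative indices count back from the last token -> "three four"
    text_sub(x, 2)       # end = -1 (the default) means through the last token

Out-of-range starts are handled by the `j < s` branch above, which yields an empty subsequence rather than an error.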
-------------------------------------------------------------------------------- /src/text_trunc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | static SEXP trunc_left(struct mkchar *mk, const struct utf8lite_text *text, 20 | int chars) 21 | { 22 | struct utf8lite_graphscan scan; 23 | struct utf8lite_text sub; 24 | int err = 0, width = 0, w; 25 | 26 | sub.ptr = text->ptr; 27 | sub.attr = UTF8LITE_TEXT_BITS(text); 28 | 29 | utf8lite_graphscan_make(&scan, text); 30 | while (utf8lite_graphscan_advance(&scan)) { 31 | TRY(utf8lite_graph_measure(&scan.current, 0, &w)); 32 | if (w > 0) { 33 | if (width > chars - w) { 34 | break; 35 | } 36 | width += w; 37 | } 38 | } 39 | sub.attr |= (size_t)(scan.ptr - text->ptr); 40 | out: 41 | CHECK_ERROR(err); 42 | return mkchar_get(mk, &sub); 43 | } 44 | 45 | 46 | static SEXP trunc_right(struct mkchar *mk, const struct utf8lite_text *text, 47 | int chars) 48 | { 49 | struct utf8lite_graphscan scan; 50 | struct utf8lite_text sub; 51 | const uint8_t *end; 52 | int err = 0, width = 0, w; 53 | 54 | sub.ptr = NULL; 55 | sub.attr = UTF8LITE_TEXT_BITS(text); 56 | end = text->ptr + UTF8LITE_TEXT_SIZE(text); 57 | 58 | utf8lite_graphscan_make(&scan, text); 59 | utf8lite_graphscan_skip(&scan); 60 | while (utf8lite_graphscan_retreat(&scan)) { 61 | TRY(utf8lite_graph_measure(&scan.current, 0, &w)); 62 | if (w > 0) { 63 | if (width > chars - w) { 64 | break; 65 | } 66 | width += w; 67 | } 68 | } 69 | utf8lite_graphscan_retreat(&scan); 70 | sub.ptr = (uint8_t *)scan.ptr; 71 | sub.attr |= (size_t)(end - sub.ptr); 72 | out: 73 | CHECK_ERROR(err); 74 | return mkchar_get(mk, &sub); 75 | } 76 | 77 | 78 | SEXP text_trunc(SEXP sx, SEXP schars, SEXP sright) 79 | { 80 | SEXP ans, names, elt; 81 | struct mkchar mk; 82 | const struct utf8lite_text *text; 83 | R_xlen_t i, n; 84 | int nprot = 0, chars, right; 85 | 86 | text = as_text(sx, &n); 87 | chars = INTEGER(schars)[0]; 88 | right = LOGICAL(sright)[0] == TRUE; 89 | mkchar_init(&mk); 90 | 91 | PROTECT(ans = allocVector(STRSXP, n)); nprot++; 92 | PROTECT(names = names_text(sx)); nprot++; 93 | setAttrib(ans, R_NamesSymbol, names); 94 | 95 | for (i = 0; i < n; i++) { 96 | RCORPUS_CHECK_INTERRUPT(i); 97 | 98 | if (!text[i].ptr) { 99 | elt = NA_STRING; 100 | } else if (right) { 101 | elt = trunc_right(&mk, &text[i], chars); 102 | } else { 103 | elt = trunc_left(&mk, &text[i], chars); 104 | } 105 | SET_STRING_ELT(ans, i, elt); 106 | } 107 | 108 | UNPROTECT(nprot); 109 | return ans; 110 | } 111 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <inttypes.h> 18 | #include <stdint.h> 19 | #include <string.h> 20 | #include <Rdefines.h> 21 | #include "rcorpus.h" 22 | 23 | 24 | /* based on R-Exts Section 5.9.6 "handling lists" */ 25 | int findListElement(SEXP list, const char *str) 26 | { 27 | SEXP names; 28 | int i, n; 29 | int nprot = 0; 30 | int ans = -1; 31 | 32 | if (list == R_NilValue) { 33 | goto out; 34 | } 35 | 36 | PROTECT(names = getAttrib(list, R_NamesSymbol)); nprot++; 37 | if (names == R_NilValue) { 38 | goto out; 39 | } 40 | 41 | n = LENGTH(list); 42 | for (i = 0; i < n; i++) { 43 | if (strcmp(CHAR(STRING_ELT(names, i)), str) == 0) { 44 | ans = i; 45 | goto out; 46 | } 47 | } 48 | out: 49 | UNPROTECT(nprot); 50 | return ans; 51 | } 52 | 53 | 54 | SEXP getListElement(SEXP list, const char *str) 55 | { 56 | int i = findListElement(list, str); 57 | if (i < 0) { 58 | return R_NilValue; 59 | } 60 | return VECTOR_ELT(list, i); 61 | } 62 | 63 | 64 | double *as_weights(SEXP sweights, R_xlen_t n) 65 | { 66 | R_xlen_t n0; 67 | 68 | if (sweights == R_NilValue) { 69 | return NULL; 70 | } 71 | 72 | n0 = XLENGTH(sweights); 73 | if (n0 != n) { 74 | error("invalid 'weights' vector;" 75 | " length is %"PRIu64" but should be %"PRIu64, 76 | (uint64_t)n0, (uint64_t)n); 77 | } 78 | 79 | return REAL(sweights); 80 | } 81 | -------------------------------------------------------------------------------- /src/wordlist.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | 20 | static SEXP wordlist(const uint8_t **(*callback)(const char *, int *), 21 | SEXP skind) 22 | { 23 | SEXP ans; 24 | const char **strs; 25 | const char *kind; 26 | int i, n; 27 | 28 | if (skind == R_NilValue) { 29 | return R_NilValue; 30 | } 31 | 32 | PROTECT(skind = coerceVector(skind, STRSXP)); 33 | if (STRING_ELT(skind, 0) == NA_STRING) { 34 | UNPROTECT(1); 35 | return R_NilValue; 36 | } 37 | 38 | // assume utf8 encoding 39 | kind = CHAR(STRING_ELT(skind, 0)); 40 | strs = (const char **)callback(kind, &n); 41 | 42 | if (!strs) { 43 | error("unknown kind (\"%s\")", kind); 44 | } 45 | 46 | PROTECT(ans = allocVector(STRSXP, n)); 47 | for (i = 0; i < n; i++) { 48 | RCORPUS_CHECK_INTERRUPT(i); 49 | SET_STRING_ELT(ans, i, mkCharCE(strs[i], CE_UTF8)); 50 | } 51 | 52 | UNPROTECT(2); 53 | return ans; 54 | } 55 | 56 | 57 | SEXP abbreviations(SEXP skind) 58 | { 59 | return wordlist(corpus_sentsuppress_list, skind); 60 | } 61 | 62 | 63 | SEXP stopwords(SEXP skind) 64 | { 65 | return wordlist(corpus_stopword_list, skind); 66 | } 67 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(corpus) 3 | 4 | test_check("corpus") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-capture_output.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | if (utils::packageVersion("testthat") <= "1.0.2") { 4 | capture_output <- function(code, print = FALSE, width = 80) { 5 | oldwidth <- getOption("width") 6 | if (width != oldwidth) { 7 | options(width = width) 8 | on.exit(options(width = oldwidth), add = TRUE) 9 | } 10 | testthat::capture_output(code, print) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/testthat/helper-locale.R: -------------------------------------------------------------------------------- 1 | switch_ctype <- function(mode = c("C", "UTF-8")) 2 | { 3 | mode <- match.arg(mode) 4 | 5 | if (mode == "UTF-8") { 6 | sysname <- Sys.info()[["sysname"]] 7 | if (sysname == "Windows") { 8 | ctype <- "English_United States.1252" 9 | } else if (sysname == "Darwin") { 10 | ctype <- "UTF-8" 11 | } else { 12 | ctype <- "en_US.utf8" 13 | } 14 | } else { 15 | ctype <- "C" 16 | } 17 | 18 | ctype0 <- Sys.getlocale("LC_CTYPE") 19 | suppressWarnings({ 20 | Sys.setlocale("LC_CTYPE", ctype) 21 | }) 22 | if (Sys.getlocale("LC_CTYPE") != ctype) { 23 | skip(paste0("Cannot change locale to '", ctype, "'")) 24 | } 25 | if (mode == "UTF-8" && !utf8::output_utf8()) { 26 | skip("Cannot change to UTF-8 output") 27 | } 28 | 29 | ctype0 30 | } 31 | -------------------------------------------------------------------------------- /tests/testthat/helper-options.R: -------------------------------------------------------------------------------- 1 | 2 | options(encoding = "UTF-8") 3 | -------------------------------------------------------------------------------- /tests/testthat/test-frame-stats.R: -------------------------------------------------------------------------------- 1 | context("frame-stats") 2 | 3 | 4 | test_that("'na.fail' works", { 5 | data <- corpus_frame(x = 26:1, text = letters) 6 | expect_equal(na.fail(data), data) 7 | 8 | data <- corpus_frame(x = c(NA, 26:2), text = letters) 9 | expect_error(na.fail(data), "missing values in object") 10 | 11 | data 
<- corpus_frame(x = 1:26, text = c(NA, letters[-1])) 12 | expect_error(na.fail(data), "missing values in object") 13 | }) 14 | 15 | 16 | test_that("'na.omit' works", { 17 | data <- corpus_frame(text = c(NA, "a", "b", NA, "c"), x = 2:6) 18 | actual <- na.omit(data) 19 | expected <- corpus_frame(text = c("a", "b", "c"), x = c(3, 4, 6), 20 | row.names = c(2L, 3L, 5L)) 21 | omit <- c("1" = 1L, "4" = 4L) 22 | attr(omit, "class") <- "omit" 23 | attr(expected, "na.action") <- omit 24 | expect_equal(actual, expected) 25 | 26 | expect_equal(na.omit(corpus_frame(x = 1:26, text = letters)), 27 | corpus_frame(x = 1:26, text = letters)) 28 | }) 29 | 30 | 31 | test_that("'na.exclude' works", { 32 | data <- corpus_frame(text = letters[1:5], x = c(4, 3, NA, 1, -7), 33 | row.names = c("A", "B", "C", "D", "E")) 34 | actual <- na.exclude(data) 35 | expected <- corpus_frame(text = c("a", "b", "d", "e"), 36 | x = c(4, 3, 1, -7), 37 | row.names = c("A", "B", "D", "E")) 38 | exclude <- c("C" = 3L) 39 | attr(exclude, "class") <- "exclude" 40 | attr(expected, "na.action") <- exclude 41 | expect_equal(actual, expected) 42 | 43 | expect_equal(na.exclude(corpus_frame(x = 1:26, text = letters)), 44 | corpus_frame(x = 1:26, text = letters)) 45 | }) 46 | -------------------------------------------------------------------------------- /tests/testthat/test-gutenberg_corpus.R: -------------------------------------------------------------------------------- 1 | context("gutenberg_corpus") 2 | 3 | test_that("'gutenberg_corpus' can download Jules Verne in French", { 4 | if (!identical(Sys.getenv("TEST_WEB_RESOURCES"), "true")) { 5 | skip("Not running web resource tests") 6 | } 7 | 8 | data <- gutenberg_corpus(800, verbose = FALSE) 9 | expect_equal(data$title, "Le Tour du Monde en 80 Jours") 10 | expect_equal(data$author, "Jules Verne") 11 | expect_equal(data$language, "French") 12 | expect_equal(nchar(as.character(data$text)), 421335) 13 | }) 14 | 15 | 16 | test_that("'gutenberg_corpus' can handle NA", { 17 | data <- gutenberg_corpus(NA) 18 | expect_equal(data, corpus_frame(title = NA_character_, 19 | author = NA_character_, 20 | language = NA_character_, 21 | text = NA_character_)) 22 | }) 23 | -------------------------------------------------------------------------------- /tests/testthat/test-json_serialize.R: -------------------------------------------------------------------------------- 1 | context("json_serialize") 2 | 3 | 4 | test_that("serializing json works", { 5 | x <- c("S", "P", "Q", "R") 6 | file <- tempfile() 7 | writeLines(paste0('"', x, '"'), file) 8 | ds <- read_ndjson(file, simplify=FALSE) 9 | 10 | file2 <- tempfile() 11 | saveRDS(ds, file2) 12 | ds2 <- readRDS(file2) 13 | 14 | expect_equal(as.character(ds), as.character(ds2)) 15 | }) 16 | 17 | 18 | test_that("serializing mmapped json works", { 19 | x <- c("S", "P", "Q", "R") 20 | file <- tempfile() 21 | writeLines(paste0('"', x, '"'), file) 22 | ds <- read_ndjson(file, mmap=TRUE, simplify=FALSE) 23 | 24 | file2 <- tempfile() 25 | saveRDS(ds, file2) 26 | ds2 <- readRDS(file2) 27 | 28 | expect_equal(as.character(ds), as.character(ds2)) 29 | }) 30 | 31 | 32 | test_that("serializing mmapped json should use relative, not absolute path", { 33 | wd <- getwd() 34 | on.exit(setwd(wd)) 35 | 36 | x <- c("S", "P", "Q", "R") 37 | 38 | # create and change directory to dir/a 39 | dir <- tempfile() 40 | dir.create(dir) 41 | setwd(dir) 42 | dir.create("a") 43 | setwd("a") 44 | 45 | # save dir/a/data.json 46 | # save dir/a/obj.rds 47 | writeLines(paste0('{"x": "', x, '"}'), 
"data.json") 48 | ds <- read_ndjson("data.json", mmap=TRUE, simplify=FALSE) 49 | saveRDS(ds, "obj.rds") 50 | 51 | # move the files to 52 | # dir/data.json 53 | # dir/obj.rds 54 | file.rename(file.path(dir, "a", "data.json"), file.path(dir, "data.json")) 55 | file.rename(file.path(dir, "a", "obj.rds"), file.path(dir, "obj.rds")) 56 | 57 | # set the working directory to dir 58 | setwd(dir) 59 | unlink(file.path(dir, "a"), recursive=TRUE) 60 | 61 | # read obj.rds 62 | ds2 <- readRDS("obj.rds") 63 | expect_equal(as.character(ds2$x), x) 64 | }) 65 | 66 | 67 | test_that("serializing json subset works", { 68 | x <- LETTERS 69 | file <- tempfile() 70 | writeLines(paste0('"', x, '"'), file) 71 | ds <- read_ndjson(file, simplify=FALSE) 72 | 73 | i <- seq(2, 26, 2) 74 | ds <- ds[i] 75 | 76 | file2 <- tempfile() 77 | saveRDS(ds, file2) 78 | ds2 <- readRDS(file2) 79 | 80 | expect_equal(as.character(ds), as.character(ds2)) 81 | }) 82 | 83 | 84 | test_that("serializing json field works", { 85 | x <- LETTERS 86 | y <- 3.14 * seq_along(LETTERS) - 10 87 | file <- tempfile() 88 | writeLines(paste0('{"x": "', x, '", "z": { "y": ', y, "} }"), file) 89 | ds <- read_ndjson(file, simplify=FALSE) 90 | 91 | ds <- ds$z 92 | 93 | file2 <- tempfile() 94 | saveRDS(ds, file2) 95 | ds2 <- readRDS(file2) 96 | 97 | expect_equal(as.numeric(ds$y), as.numeric(ds2$y)) 98 | }) 99 | 100 | 101 | test_that("serializing json nested fields works", { 102 | x <- 1:10 103 | file <- tempfile() 104 | writeLines(paste0('{"f1": {"f2": {"f3": {"x": ', x, '}}}}'), file) 105 | ds <- read_ndjson(file, simplify=FALSE) 106 | 107 | ds <- ds$f1$f2$f3 108 | 109 | file2 <- tempfile() 110 | saveRDS(ds, file2) 111 | ds2 <- readRDS(file2) 112 | 113 | expect_equal(as.integer(ds$x), as.numeric(ds2$x)) 114 | }) 115 | 116 | 117 | test_that("serializing json field subset works", { 118 | x <- LETTERS 119 | y <- 3.14 * seq_along(LETTERS) - 10 120 | file <- tempfile() 121 | writeLines(paste0('{"z": {"x": "', x, '"}, "y": ', y, "}"), file) 122 | ds <- read_ndjson(file, simplify = FALSE) 123 | 124 | i <- c(20, 2, 9, 4, 6, 2) 125 | ds <- ds[i, "z"] 126 | 127 | file2 <- tempfile() 128 | saveRDS(ds, file2) 129 | ds2 <- readRDS(file2) 130 | 131 | expect_equal(as.character(ds$x), as.character(ds2$x)) 132 | }) 133 | -------------------------------------------------------------------------------- /tests/testthat/test-read_ndjson.R: -------------------------------------------------------------------------------- 1 | context("read_ndjson") 2 | 3 | 4 | test_that("reading a non-existent file should fail", { 5 | corpus:::logging_off() 6 | expect_error(read_ndjson("foobar", mmap=TRUE), "cannot open file 'foobar'") 7 | corpus:::logging_on() 8 | }) 9 | 10 | 11 | test_that("passing a connection when mmap is TRUE should fail", { 12 | tmp <- tempfile() 13 | writeLines(character(), tmp) 14 | file <- file(tmp) 15 | on.exit(close(file)) 16 | expect_error(read_ndjson(file, mmap = TRUE), 17 | "'file' must be a character string when 'mmap' is TRUE") 18 | }) 19 | 20 | 21 | test_that("passing a file name should succeed", { 22 | file <- tempfile() 23 | writeLines('"foo"', file) 24 | expect_equal(read_ndjson(file), "foo") 25 | }) 26 | 27 | 28 | test_that("passing a closed connection should succeed", { 29 | tmp <- tempfile() 30 | file <- file(tmp) 31 | writeLines('"foo"', file) 32 | expect_equal(read_ndjson(file), "foo") 33 | }) 34 | 35 | 36 | test_that("passing an empty file should succeed", { 37 | file <- tempfile() 38 | writeLines(character(), file) 39 | expect_equal(read_ndjson(file), 
NULL) 40 | }) 41 | 42 | 43 | test_that("passing a nonscalar should fail", { 44 | expect_error(read_ndjson(17), 45 | "'file' must be a character string or connection") 46 | }) 47 | -------------------------------------------------------------------------------- /tests/testthat/test-stemmer.R: -------------------------------------------------------------------------------- 1 | context("stemmer.R") 2 | 3 | test_that("can use a custom function as a stemmer", { 4 | x <- LETTERS 5 | stem <- function(word) "?" 6 | expect_equal(text_tokens(x, stemmer = stem), 7 | as.list(rep("?", length(x)))) 8 | }) 9 | 10 | 11 | test_that("handles stemmer logical NAs", { 12 | x <- paste(LETTERS, collapse = " ") 13 | stemmer <- 14 | function(x) { 15 | if (x %in% c("a", "e", "i", "o", "u")) { 16 | paste0(toupper(x), "*") 17 | } else { 18 | NA 19 | } 20 | } 21 | actual <- text_tokens(x, stemmer = stemmer) 22 | expected <- list(c("A*", "E*", "I*", "O*", "U*")) 23 | expect_equal(actual, expected) 24 | }) 25 | 26 | 27 | test_that("handles stemmer character NAs", { 28 | x <- paste(LETTERS, collapse = " ") 29 | stemmer <- function(x) NA_character_ 30 | actual <- text_tokens(x, stemmer = stemmer) 31 | expected <- list(character()) 32 | expect_equal(actual, expected) 33 | }) 34 | 35 | 36 | test_that("handles stemmer errors", { 37 | x <- LETTERS 38 | 39 | expect_error(text_tokens(x, stemmer = function(w) c("?", "?")), 40 | "'stemmer' returned multiple values for input \"a\"") 41 | 42 | expect_error(text_tokens(x, stemmer = function(w) character()), 43 | "'stemmer' did not return a value for input \"a\"") 44 | 45 | expect_error(text_tokens(x, stemmer = function(w) NULL), 46 | "'stemmer' did not return a value for input \"a\"") 47 | 48 | expect_error(text_tokens(x, stemmer = function(w) 7), 49 | "'stemmer' returned a non-string value for input \"a\"") 50 | }) 51 | 52 | 53 | test_that("handles internal stemmer errors", { 54 | expect_error(text_tokens("hello", stemmer = function(x) stop("what?")), 55 | "'stemmer' raised an error for input \"hello\"") 56 | }) 57 | 58 | 59 | test_that("'new_stemmer' can detect errors", { 60 | expect_error(new_stemmer(c("a", "b"), c("a")), 61 | "'term' argument length must equal 'stem' argument length") 62 | }) 63 | 64 | 65 | test_that("'new_stemmer' can handle empty inputs", { 66 | fn <- new_stemmer(NULL, NULL) 67 | expect_equal(fn("a"), "a") 68 | }) 69 | 70 | 71 | test_that("'new_stemmer' can use a default", { 72 | fn <- new_stemmer(LETTERS, letters, default = NA) 73 | expect_equal(fn("A"), "a") 74 | expect_equal(fn("AB"), NA_character_) 75 | }) 76 | 77 | 78 | test_that("'new_stemmer' can handle duplicates", { 79 | term <- c("a", "a", "b", "c", "c", "c", "d") 80 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d") 81 | 82 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = FALSE) 83 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 84 | c("a1", "a1", "b", "c1", "c1", "c1", "d")) 85 | 86 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = FALSE) 87 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 88 | c("a2", "a2", "b", "c3", "c3", "c3", "d")) 89 | 90 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = FALSE) 91 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 92 | c("a", "a", "b", "c", "c", "c", "d")) 93 | }) 94 | 95 | 96 | test_that("'new_stemmer' can vectorize ", { 97 | term <- c("a", "a", "b", "c", "c", "c", "d") 98 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d") 99 | 100 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = TRUE) 
101 | expect_equal(fn(term), c("a1", "a1", "b", "c1", "c1", "c1", "d")) 102 | 103 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = TRUE) 104 | expect_equal(fn(term), c("a2", "a2", "b", "c3", "c3", "c3", "d")) 105 | 106 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = TRUE) 107 | expect_equal(fn(term), c("a", "a", "b", "c", "c", "c", "d")) 108 | 109 | expect_error(new_stemmer(term, stem, duplicates = "fail", 110 | vectorize = TRUE), 111 | "'term' argument entries must be unique") 112 | }) 113 | 114 | 115 | test_that("'stem_snowball' can handle NULL algorithm", { 116 | x <- c("win", "winning", "winner", "#winning") 117 | expect_equal(stem_snowball(x, NULL), x) 118 | }) 119 | 120 | 121 | test_that("'stem_snowball' can handle NULL input", { 122 | expect_equal(stem_snowball(NULL), NULL) 123 | }) 124 | 125 | 126 | test_that("'stem_snowball' can handle stem input", { 127 | x <- c("win", "winning", "winner", "#winning") 128 | expect_equal(stem_snowball(x), 129 | c("win", "win", "winner", "#winning")) 130 | }) 131 | -------------------------------------------------------------------------------- /tests/testthat/test-term_counts.R: -------------------------------------------------------------------------------- 1 | context("term_counts") 2 | 3 | test_that("'term_counts' gives equivalent results to 'term_matrix'", { 4 | text <- c(a="A rose is a rose is a rose.", 5 | b="A Rose is red, a violet is blue!", 6 | c="A rose by any other name would smell as sweet.") 7 | x <- term_matrix(text) 8 | tf <- term_counts(text) 9 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text), 10 | j = as.integer(tf$term), 11 | x = tf$count, 12 | dimnames = list(levels(tf$text), 13 | colnames(x))) 14 | expect_equal(x, xtf) 15 | }) 16 | 17 | 18 | test_that("'term_counts' gives equivalent results to 'term_matrix' no names", { 19 | text <- c("A rose is a rose is a rose.", 20 | "A Rose is red, a violet is blue!", 21 | "A rose by any other name would smell as sweet.") 22 | x <- term_matrix(text) 23 | rownames(x) <- as.character(seq_along(text)) 24 | tf <- term_counts(text) 25 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text), 26 | j = as.integer(tf$term), 27 | x = tf$count, 28 | dimnames = list(levels(tf$text), 29 | colnames(x))) 30 | expect_equal(x, xtf) 31 | }) 32 | 33 | 34 | test_that("'term_counts' with group gives equivalent results to 'term_matrix'", { 35 | text <- c(a="A rose is a rose is a rose.", 36 | b="A Rose is red, a violet is blue!", 37 | c="A rose by any other name would smell as sweet.") 38 | g <- factor(c("X", "Y", "X")) 39 | x <- term_matrix(text, group = g) 40 | tf <- term_counts(text, group = g) 41 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$group), 42 | j = as.integer(tf$term), 43 | x = tf$count, 44 | dimnames = list(levels(tf$group), 45 | colnames(x))) 46 | expect_equal(x, xtf) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/test-text-stats.R: -------------------------------------------------------------------------------- 1 | context("text-stats") 2 | 3 | test_that("'na.fail' works", { 4 | x <- as_corpus_text(letters) 5 | expect_equal(na.fail(x), x) 6 | 7 | expect_error(na.fail(c(x, NA)), "missing values in object") 8 | }) 9 | 10 | 11 | test_that("'na.omit' works", { 12 | x <- as_corpus_text(c(NA, "a", "b", NA, "c")) 13 | actual <- na.omit(x) 14 | expected <- as_corpus_text(c("a", "b", "c")) 15 | omit <- c(1L, 4L) 16 | attr(omit, "class") <- "omit" 17 | attr(expected, "na.action") <- omit 18 | 
expect_equal(actual, expected) 19 | 20 | expect_equal(na.omit(as_corpus_text(letters)), 21 | as_corpus_text(letters)) 22 | }) 23 | 24 | 25 | test_that("'na.exclude' works", { 26 | x <- as_corpus_text(c(r = NA, s = "a", t = "b", u = NA, v = "c")) 27 | actual <- na.exclude(x) 28 | expected <- as_corpus_text(c(s = "a", t = "b", v = "c")) 29 | exclude <- c(r = 1L, u = 4L) 30 | attr(exclude, "class") <- "exclude" 31 | attr(expected, "na.action") <- exclude 32 | expect_equal(actual, expected) 33 | 34 | expect_equal(na.exclude(as_corpus_text(letters)), 35 | as_corpus_text(letters)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/test-text_c.R: -------------------------------------------------------------------------------- 1 | context("text_c") 2 | 3 | 4 | test_that("c should not drop text_filter", { 5 | x <- as_corpus_text("hello") 6 | text_filter(x) <- text_filter(map_case = FALSE) 7 | y <- as_corpus_text("world") 8 | z <- c(x, y) 9 | expect_equal(text_filter(z), text_filter(x)) 10 | }) 11 | 12 | 13 | test_that("c should work with named or unnamed args", { 14 | x <- as_corpus_text("hello") 15 | y <- as_corpus_text("world") 16 | expect_equal(c(x, y), as_corpus_text(c("hello", "world"))) 17 | expect_equal(c(a = x, b = y), as_corpus_text(c(a = "hello", b = "world"))) 18 | }) 19 | 20 | 21 | test_that("c should work with complex args", { 22 | x <- c(a=as_corpus_text(c("hello", NA, "world")), "!", c=7) 23 | expect_equal(x, as_corpus_text(c(a1 = "hello", a2 = NA, a3 = "world", 24 | "4" = "!", c = "7"))) 25 | }) 26 | 27 | 28 | test_that("c should work with a single unnamed argument", { 29 | x0 <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 30 | x <- x0 31 | text_filter(x) <- text_filter(map_case = FALSE) 32 | y <- c(x) 33 | expect_equal(y, x) 34 | }) 35 | 36 | 37 | test_that("c should work with a single named argument", { 38 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 39 | y <- c(a = x) 40 | expect_equal(y, as_corpus_text(c(a.a = "hello", a.b = "goodbye", a3 = "!"))) 41 | }) 42 | 43 | 44 | test_that("c should work with no names", { 45 | x <- as_corpus_text(c("hello", "goodbye", "!")) 46 | z <- c(x, x) 47 | expect_equal(names(z), NULL) 48 | expect_equal(as.character(z), c(as.character(x), as.character(x))) 49 | }) 50 | 51 | 52 | test_that("c should work with no arguments", { 53 | z <- c.corpus_text() 54 | expect_equal(z, as_corpus_text(c())) 55 | }) 56 | 57 | 58 | test_that("c should support use.names = FALSE", { 59 | z <- c(a=as_corpus_text("x"), y=c(z="z", "w"), use.names = FALSE) 60 | expect_equal(z, as_corpus_text(c("x", "z", "w"))) 61 | }) 62 | 63 | 64 | test_that("c should support lists with recursive = TRUE", { 65 | z <- c.corpus_text(list(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c", 66 | recursive = TRUE) 67 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c"))) 68 | }) 69 | 70 | 71 | test_that("c should support pairlists with recursive = TRUE", { 72 | z <- c.corpus_text(pairlist(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c", 73 | recursive = TRUE) 74 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c"))) 75 | }) 76 | 77 | 78 | test_that("c can handle NA after named", { 79 | z <- c(as_corpus_text(c(x = "a")), NA) 80 | expect_equal(z, as_corpus_text(c(x = "a", "2" = NA))) 81 | }) 82 | 83 | 84 | test_that("c should take filter from first value", { 85 | f <- text_filter(map_case = FALSE) 86 | x <- as_corpus_text(letters, filter = f) 87 | y <- as_corpus_text(LETTERS) 88 | 
z <- c(x, y) 89 | expect_equal(z, as_corpus_text(c(letters, LETTERS), filter = f)) 90 | 91 | z2 <- c(y, x) 92 | expect_equal(z2, as_corpus_text(c(LETTERS, letters))) 93 | }) 94 | 95 | 96 | test_that("c should work with duplicate names", { 97 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 98 | z <- c(x, x) 99 | expect_equal(names(z), c(names(x), paste0(names(x), ".1"))) 100 | expect_equal(as.character(z), c(as.character(x), as.character(x))) 101 | }) 102 | -------------------------------------------------------------------------------- /tests/testthat/test-text_names.R: -------------------------------------------------------------------------------- 1 | context("text_names") 2 | 3 | 4 | test_that("`names` should be NULL for new text", { 5 | x <- as_corpus_text(c("A", "B", "C")) 6 | expect_equal(names(x), NULL) 7 | 8 | expect_equal(names(as_corpus_text(character())), NULL) 9 | }) 10 | 11 | 12 | test_that("`names<-` should work on text", { 13 | x <- as_corpus_text(LETTERS) 14 | names(x) <- rev(LETTERS) 15 | expect_equal(names(x), rev(LETTERS)) 16 | }) 17 | 18 | 19 | test_that("setting `names<-` to NULL should restore defaults", { 20 | x <- as_corpus_text(c(a="x", b="y")) 21 | names(x) <- NULL 22 | expect_equal(names(x), NULL) 23 | }) 24 | 25 | 26 | test_that("`as_corpus_text` should not drop names", { 27 | x <- as_corpus_text(c(a="1", b="2")) 28 | expect_equal(names(x), c("a", "b")) 29 | }) 30 | 31 | 32 | test_that("`all.equal` should test names", { 33 | x <- as_corpus_text(1:3) 34 | y <- x 35 | names(y) <- c("a", "b", "c") 36 | expect_equal(all.equal(x, y), "names for current but not for target") 37 | expect_equal(all.equal(y, x), "names for target but not for current") 38 | }) 39 | 40 | 41 | test_that("`as_corpus_text` should not drop names", { 42 | x <- as_corpus_text(c(foo="hello")) 43 | y <- as_corpus_text(x) 44 | 45 | expect_equal(y, as_corpus_text(c(foo="hello"))) 46 | }) 47 | 48 | 49 | test_that("`as_corpus_text` should drop attributes", { 50 | x <- as_corpus_text("hello") 51 | attr(x, "foo") <- "bar" 52 | y <- as_corpus_text(x) 53 | 54 | expect_equal(y, as_corpus_text("hello")) 55 | }) 56 | 57 | 58 | test_that("`as_corpus_text` should drop attributes for JSON objects", { 59 | file <- tempfile() 60 | writeLines('{"text": "hello"}', file) 61 | x <- read_ndjson(file)$text 62 | 63 | attr(x, "foo") <- "bar" 64 | y <- as_corpus_text(x) 65 | 66 | expect_equal(y, as_corpus_text("hello")) 67 | }) 68 | 69 | 70 | test_that("`names<-` should not modify copies", { 71 | x <- as_corpus_text(1:3) 72 | y <- x 73 | names(y) <- c("a", "b", "c") 74 | expect_equal(names(x), NULL) 75 | expect_equal(names(y), c("a", "b", "c")) 76 | }) 77 | 78 | 79 | test_that("`names<-` should preserve attributes", { 80 | x <- as_corpus_text(1:3) 81 | attr(x, "foo") <- "bar" 82 | names(x) <- c("a", "b", "c") 83 | expect_equal(names(x), c("a", "b", "c")) 84 | expect_equal(attr(x, "foo"), "bar") 85 | }) 86 | 87 | 88 | test_that("`names<-` should not allow NA", { 89 | x <- as_corpus_text(1:3) 90 | expect_error(names(x) <- c("a", NA, "b"), 91 | "missing values in 'names' are not allowed") 92 | }) 93 | 94 | 95 | test_that("`names<-` should not allow duplicates", { 96 | x <- as_corpus_text(1:3) 97 | expect_error(names(x) <- c("a", "b", "a"), 98 | "duplicate 'names' are not allowed") 99 | }) 100 | 101 | 102 | test_that("names should error for non-text", { 103 | expect_error(names.corpus_text("hello"), "invalid text object") 104 | }) 105 | -------------------------------------------------------------------------------- 
/tests/testthat/test-text_nunit.R: -------------------------------------------------------------------------------- 1 | context("text_nunit") 2 | 3 | 4 | test_that("text_nsentence works on sentences", { 5 | text <- c(a="He said, 'Are you going?' John Shook his head.", 6 | b="'Are you going?' John asked", 7 | c="This. Is. A. Long. Sentence!!!", 8 | d="Why all the shouting??") 9 | n0 <- text_nsentence(text) 10 | split <- text_split(text, "sentences") 11 | n <- c(with(split, tapply(index, parent, length))) 12 | names(n) <- names(text) 13 | expect_equal(n, n0) 14 | }) 15 | 16 | 17 | test_that("text_nsentence handles NA and empty", { 18 | expect_equal(text_nsentence(c(NA, "")), c(NA, 0)) 19 | }) 20 | 21 | 22 | test_that("text_ntoken works on tokens", { 23 | text <- c(a="He said, 'Are you going?' John Shook his head.", 24 | b="'Are you going?' John asked", 25 | c="This. Is. A. Long. Sentence!!!", 26 | d="Why all the shouting??") 27 | n0 <- text_ntoken(text) 28 | split <- text_split(text, "tokens") 29 | n <- c(with(split, tapply(index, parent, length))) 30 | names(n) <- names(text) 31 | expect_equal(n, n0) 32 | }) 33 | 34 | 35 | test_that("text_ntoken handles NA and empty", { 36 | expect_equal(text_ntoken(c(NA, "")), c(NA, 0)) 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/test-text_primitive.R: -------------------------------------------------------------------------------- 1 | context("text_primitive") 2 | 3 | 4 | test_that("anyNA should work", { 5 | x <- as_corpus_text(c("a", NA, "", "b")) 6 | y <- as_corpus_text(c()) 7 | z <- as_corpus_text(letters) 8 | 9 | expect_true(anyNA(x)) 10 | expect_false(anyNA(y)) 11 | expect_false(anyNA(z)) 12 | }) 13 | 14 | 15 | test_that("converting to character should work", { 16 | x <- c("hello", NA, "world", "") 17 | y <- as_corpus_text(x) 18 | expect_equal(as.character(y), x) 19 | }) 20 | 21 | 22 | test_that("conversions should work", { 23 | expect_equal(as.complex(as_corpus_text("1+2i")), 1+2i) 24 | expect_equal(as.double(as_corpus_text("3.14")), 3.14) 25 | expect_equal(as.integer(as_corpus_text("3.14")), 3) 26 | expect_equal(as.logical(as_corpus_text(c("TRUE", "FALSE", "NA"))), 27 | c(TRUE, FALSE, NA)) 28 | expect_equal(as.numeric(as_corpus_text("3.14")), 3.14) 29 | expect_equal(as.raw(as_corpus_text("1")), as.raw("1")) 30 | 31 | expect_warning(x <- as.numeric(as_corpus_text("foo")), 32 | "NAs introduced by coercion") 33 | expect_equal(x, NA_real_) 34 | }) 35 | 36 | 37 | test_that("is.na should work", { 38 | x <- as_corpus_text(c("a", NA, "", "b")) 39 | expect_equal(is.na(x), c(FALSE, TRUE, FALSE, FALSE)) 40 | expect_equal(is.na(as_corpus_text(c())), logical()) 41 | }) 42 | 43 | 44 | test_that("rep should work", { 45 | x <- as_corpus_text(c("a", "b", "c")) 46 | y <- rep(x, 7) 47 | expect_equal(y, as_corpus_text(rep(c("a", "b", "c"), 7))) 48 | }) 49 | 50 | 51 | test_that("rep should work with names", { 52 | x <- as_corpus_text(c(x="a", y="b")) 53 | y <- rep(x, 2) 54 | expect_equal(y, as_corpus_text(c(x="a", y="b", x.1="a", y.1="b"))) 55 | }) 56 | 57 | 58 | test_that("invalid operations should error", { 59 | x <- as_corpus_text("hello") 60 | expect_error(x$names, "$ operator is invalid for text objects", 61 | fixed = TRUE) 62 | expect_error(x$names <- "foo", "$<- operator is invalid for text objects", 63 | fixed = TRUE) 64 | expect_error(as.environment(x), 65 | "'as.environment' is invalid for text objects") 66 | }) 67 | 68 | test_that("setting length on invalid text should fail", { 69 | x
<- letters 70 | expect_error(`length<-.corpus_text`(x, 5), "invalid text object") 71 | }) 72 | 73 | test_that("setting invalid length should fail", { 74 | x <- as_corpus_text(letters) 75 | expect_error(length(x) <- NULL, "'length' cannot be NULL") 76 | expect_error(length(x) <- "1", "'length' must be numeric") 77 | expect_error(length(x) <- c(1, 1), "'length' must have length 1") 78 | expect_error(length(x) <- NA, "'length' cannot be NA") 79 | expect_error(length(x) <- NaN, "'length' cannot be NaN") 80 | expect_error(length(x) <- -1, "'length' cannot be negative") 81 | expect_error(length(x) <- 2^53 + 2, "'length' cannot be above 2\\^53") 82 | }) 83 | 84 | 85 | test_that("setting short length should work", { 86 | x <- as_corpus_text(letters) 87 | length(x) <- 10 88 | expect_equal(x, as_corpus_text(letters[1:10])) 89 | }) 90 | 91 | 92 | test_that("setting same length should work", { 93 | x <- as_corpus_text(letters) 94 | length(x) <- 26 95 | expect_equal(x, as_corpus_text(letters)) 96 | }) 97 | 98 | 99 | test_that("setting long length should work", { 100 | x <- as_corpus_text(letters) 101 | length(x) <- 30 102 | expect_equal(x, as_corpus_text(c(letters, rep(NA, 4)))) 103 | }) 104 | -------------------------------------------------------------------------------- /tests/testthat/test-text_print.R: -------------------------------------------------------------------------------- 1 | context("text_print") 2 | 3 | test_that("'print.text' works without names", { 4 | ctype <- switch_ctype("C") 5 | on.exit(Sys.setlocale("LC_CTYPE", ctype)) 6 | 7 | x <- as_corpus_text(LETTERS) 8 | expected <- c( 9 | ' [1] "A" "B" "C" "D" "E" "F" "G" "H" "I"', 10 | '[10] "J" "K" "L" "M" "N" "O" "P" "Q" "R"', 11 | '[19] "S" "T"', 12 | '... (26 entries total)') 13 | 14 | expect_equal(strsplit(capture_output(print(x), width = 40), "\n")[[1]], 15 | expected) 16 | }) 17 | 18 | 19 | test_that("'print.text' works with names", { 20 | ctype <- switch_ctype("C") 21 | on.exit(Sys.setlocale("LC_CTYPE", ctype)) 22 | 23 | x <- as_corpus_text(LETTERS, names = paste0("foo", 1:26)) 24 | expected <- c( 25 | 'foo1 foo2 foo3 foo4 foo5 foo6 foo7 foo8 ', 26 | '"A" "B" "C" "D" "E" "F" "G" "H" ', 27 | 'foo9 foo10 foo11 foo12 foo13 foo14 foo15 foo16', 28 | '"I" "J" "K" "L" "M" "N" "O" "P" ', 29 | 'foo17 foo18 foo19 foo20', 30 | '"Q" "R" "S" "T" ', 31 | '... 
(26 entries total)') 32 | 33 | expect_equal(strsplit(capture_output(print(x), width = 50), "\n")[[1]], 34 | expected) 35 | }) 36 | 37 | 38 | test_that("'print.text' works for empty", { 39 | x <- as_corpus_text(character()) 40 | expect_equal(capture_output(print(x)), 41 | "text vector with 0 entries") 42 | }) 43 | 44 | 45 | test_that("'print.text' works for NULL", { 46 | expect_equal(print.corpus_text(NULL), NULL) 47 | }) 48 | 49 | 50 | test_that("'print.text' errors for invalid", { 51 | expect_error(print.corpus_text("hello"), "argument is not a valid text object") 52 | }) 53 | 54 | 55 | test_that("'print.text' with negative rows prints entire object", { 56 | x <- as_corpus_text(LETTERS) 57 | expect_equal(capture_output(print(x, -1)), 58 | capture_output(print(as.character(x)))) 59 | }) 60 | 61 | 62 | test_that("'print.text' errors for invalid inputs", { 63 | x <- as_corpus_text(LETTERS) 64 | expect_error(print(x, chars = -1), "'chars' must be non-negative") 65 | expect_error(print(x, chars = NA), "'chars' cannot be NA") 66 | expect_error(print(x, chars = c(1,1)), "'chars' must have length 1") 67 | expect_error(print(x, na.print = NA), "'na.print' cannot be NA") 68 | expect_error(print(x, print.gap = 1025), "'print.gap' must be less than or equal to 1024") 69 | }) 70 | 71 | 72 | test_that("'format.text' works for empty", { 73 | expect_equal(format(as_corpus_text(character())), character()) 74 | }) 75 | -------------------------------------------------------------------------------- /tests/testthat/test-text_split_tokens.R: -------------------------------------------------------------------------------- 1 | context("text_split_tokens") 2 | 3 | 4 | test_that("'split_tokens' can split into threes", { 5 | text <- c(paste(LETTERS, collapse = " "), 6 | paste(letters, collapse = " ")) 7 | 8 | expect_equal(text_split(text, "tokens", 3), 9 | structure(class = c("corpus_frame", "data.frame"), 10 | data.frame(parent = factor(as.character(c(rep(1, 9), rep(2, 9)))), 11 | index = c(1:9, 1:9), 12 | text = as_corpus_text(c("A B C ", "D E F ", "G H I ", "J K L ", 13 | "M N O ", "P Q R ", "S T U ", "V W X ", 14 | "Y Z", 15 | "a b c ", "d e f ", "g h i ", "j k l ", 16 | "m n o ", "p q r ", "s t u ", "v w x ", 17 | "y z")), 18 | row.names = NULL))) 19 | }) 20 | 21 | 22 | test_that("'split_tokens' doesn't count dropped tokens", { 23 | text <- c(paste(LETTERS, collapse = " "), 24 | paste(letters, collapse = " ")) 25 | f <- text_filter(drop = c("a", "e", "i", "o", "u")) 26 | 27 | expect_equal(text_split(text, "tokens", 5, filter = f), 28 | structure(class = c("corpus_frame", "data.frame"), 29 | data.frame(parent = factor(as.character(c(rep(1, 5), rep(2, 5)))), 30 | index = c(1:5, 1:5), 31 | text = as_corpus_text(c("A B C D E F G ", "H I J K L ", 32 | "M N O P Q ", "R S T U V ", "W X Y Z", 33 | "a b c d e f g ", "h i j k l ", 34 | "m n o p q ", "r s t u v ", 35 | "w x y z"), 36 | filter = f), 37 | row.names = NULL))) 38 | }) 39 | 40 | 41 | test_that("'split_tokens' keeps trailing whitespace", { 42 | expect_equal(text_split("abc ", "tokens", 2), 43 | structure(class = c("corpus_frame", "data.frame"), 44 | data.frame(parent = factor("1"), index = 1, 45 | text = as_corpus_text("abc "), row.names = NULL))) 46 | }) 47 | 48 | 49 | test_that("'split_tokens' handles whitespace-only text", { 50 | expect_equal(text_split(" ", "tokens", 1), 51 | structure(class = c("corpus_frame", "data.frame"), 52 | data.frame(parent = factor("1"), 53 | index = 1, text = as_corpus_text(" "), row.names = NULL))) 54 | }) 55 | 56 | 57 | 
test_that("'split_tokens' handles empty and missing text", { 58 | expect_equal(text_split(c("", NA, NA, "", "a"), "tokens", 1), 59 | structure(class = c("corpus_frame", "data.frame"), 60 | data.frame(parent = factor(c("1", "4", "5"), 61 | levels = as.character(1:5)), 62 | index = c(1, 1, 1), 63 | text = as_corpus_text(c("", "", "a")), 64 | row.names = NULL))) 65 | }) 66 | -------------------------------------------------------------------------------- /tests/testthat/test-text_stats.R: -------------------------------------------------------------------------------- 1 | context("text_stats") 2 | 3 | 4 | test_that("'text_stats' works on a simple example", { 5 | x <- c("A rose is a rose is a rose.", "A Rose is red. A violet is blue!") 6 | actual <- text_stats(x) 7 | expected <- data.frame(tokens = text_ntoken(x), 8 | types = text_ntype(x), 9 | sentences = text_nsentence(x)) 10 | class(expected) <- c("corpus_frame", "data.frame") 11 | expect_equal(actual, expected) 12 | }) 13 | -------------------------------------------------------------------------------- /tests/testthat/test-text_types.R: -------------------------------------------------------------------------------- 1 | context("text_types") 2 | 3 | test_that("'text_types' works elementwise", { 4 | text <- c("I saw Mr. Jones today.", 5 | NA, 6 | "", 7 | "Split across\na line.", 8 | "What. Are. You. Doing????", 9 | "She asked 'do you really mean that?' and I said 'yes.'") 10 | toks <- text_tokens(text) 11 | typs <- lapply(toks, function(x) unique(sort(x, method = "radix"))) 12 | typs_tot <- unique(sort(c(toks, recursive = TRUE), method = "radix")) 13 | 14 | expect_equal(text_types(text), typs) 15 | expect_equal(text_types(text, collapse = TRUE), typs_tot) 16 | }) 17 | 18 | 19 | test_that("text_ntype works on types", { 20 | expect_equal(text_ntype(LETTERS, collapse = TRUE), 26) 21 | 22 | expect_equal(text_ntype(paste(LETTERS, letters, LETTERS)), 23 | rep(1, 26)) 24 | }) 25 | 26 | 27 | test_that("text_ntype handles NA, empty", { 28 | expect_equal(text_ntype(c("", NA, "hello world")), 29 | c(0, NA, 2)) 30 | 31 | expect_equal(text_ntype(c("", NA, "hello world"), collapse = TRUE), 32 | NA_real_) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test-wordlist.R: -------------------------------------------------------------------------------- 1 | 2 | context("wordlist") 3 | 4 | 5 | test_that("'abbreviations' has common acronyms", { 6 | expect_true(all(c("Mr.", "Mrs.", "Ms.") %in% abbreviations_en)) 7 | expect_true(all(c("ap. J.-C.", "av. 
J.-C.") %in% abbreviations_fr)) 8 | }) 9 | 10 | 11 | test_that("'stopwords' has common function words", { 12 | expect_true(all(c("the", "and", "is") %in% stopwords_en)) 13 | }) 14 | -------------------------------------------------------------------------------- /vignettes/chinese-wordcloud-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/chinese-wordcloud-1.png -------------------------------------------------------------------------------- /vignettes/corpus-emotion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-emotion-1.png -------------------------------------------------------------------------------- /vignettes/corpus-heapslaw-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-heapslaw-1.png -------------------------------------------------------------------------------- /vignettes/corpus-witch-occurrences-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-witch-occurrences-1.png -------------------------------------------------------------------------------- /vignettes/gender-estimates-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates-1.png -------------------------------------------------------------------------------- /vignettes/gender-estimates_se-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates_se-1.png -------------------------------------------------------------------------------- /vignettes/gender-signif-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-signif-1.png --------------------------------------------------------------------------------