├── .Rbuildignore ├── .Rprofile ├── .gitignore ├── .gitmodules ├── .travis.yml ├── CONDUCT.md ├── DESCRIPTION ├── LICENSE ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── coerce.R ├── corpus_frame.R ├── deprecated.R ├── frame-stats.R ├── frame.R ├── gutenberg.R ├── json.R ├── logging.R ├── sentence.R ├── stem.R ├── style.R ├── term.R ├── text-base.R ├── text-primitive.R ├── text-stats.R ├── text-utils.R ├── text.R ├── text_filter.R ├── text_locate.R ├── text_split.R ├── text_stats.R ├── text_types.R ├── token.R ├── util.R └── wordlist.R ├── README.md ├── TODO.md ├── _pkgdown.yml ├── appveyor.yml ├── bench ├── bench-term_matrix.R ├── bench-term_matrix.Rout └── bench.R ├── data ├── abbreviations.rda ├── abbreviations │ └── 01_make_abbreviations.R ├── affect_wordnet.rda ├── affect_wordnet │ ├── 01_make_tsv.py │ ├── 02_make_rda.R │ ├── README │ ├── wn-affect-1.1 │ │ ├── a-hierarchy.xml │ │ ├── a-synsets.xml │ │ └── readme-wn-affect-1.1.txt │ ├── wnaffect.tsv │ └── wordnet-1.6 │ │ ├── LICENSE │ │ ├── dict │ │ ├── Makefile │ │ ├── adj.exc │ │ ├── adv.exc │ │ ├── cntlist │ │ ├── cousin.exc │ │ ├── cousin.tops │ │ ├── data.adj │ │ ├── data.adv │ │ ├── data.noun │ │ ├── data.verb │ │ ├── index.adj │ │ ├── index.adv │ │ ├── index.gloss │ │ ├── index.noun │ │ ├── index.sense │ │ ├── index.verb │ │ ├── lexnames │ │ ├── noun.exc │ │ ├── sentidx.vrb │ │ ├── sents.vrb │ │ ├── stoplist.pl │ │ └── verb.exc │ │ └── man │ │ └── html │ │ ├── binsrch.htm │ │ ├── cntlist.htm │ │ ├── glossidx.htm │ │ ├── lexnames.htm │ │ ├── morph.htm │ │ ├── morphy.htm │ │ ├── senseidx.htm │ │ ├── taglist.htm │ │ ├── uniqbeg.htm │ │ ├── wn.htm │ │ ├── wnb.htm │ │ ├── wndb.htm │ │ ├── wngloss.htm │ │ ├── wngroups.htm │ │ ├── wninput.htm │ │ ├── wnintro1.htm │ │ ├── wnintro3.htm │ │ ├── wnintro5.htm │ │ ├── wnintro7.htm │ │ ├── wnlicens.htm │ │ ├── wnpkgs.htm │ │ ├── wnsearch.htm │ │ ├── wnstats.htm │ │ └── wnutil.htm ├── federalist.rda ├── federalist │ ├── .gitignore │ ├── 01_download_raw.sh │ ├── 02_make_json.py │ └── 03_make_rda.R ├── sentiment_afinn.rda ├── sentiment_afinn │ ├── 01_download_raw.sh │ ├── 02_make_rda.R │ └── AFINN │ │ ├── AFINN-111.txt │ │ ├── AFINN-96.txt │ │ └── AFINN-README.txt ├── stopwords.rda └── stopwords │ └── 01_make_stopwords.R ├── docs ├── CNAME ├── LICENSE.html ├── articles │ ├── chinese-wordcloud-1.png │ ├── chinese.html │ ├── corpus-emotion-1.png │ ├── corpus-heapslaw-1.png │ ├── corpus-witch-occurrences-1.png │ ├── corpus.html │ ├── gender-estimates-1.png │ ├── gender-estimates_se-1.png │ ├── gender-signif-1.png │ ├── gender.html │ ├── index.html │ ├── stemmer.html │ ├── textdata.html │ └── unicode.html ├── authors.html ├── favicon.ico ├── index.html ├── jquery.sticky-kit.min.js ├── link.svg ├── logo.png ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js └── reference │ ├── abbreviations.html │ ├── affect_wordnet.html │ ├── corpus-deprecated.html │ ├── corpus-package.html │ ├── corpus_frame.html │ ├── corpus_text.html │ ├── federalist.html │ ├── figures │ ├── banner.png │ ├── logo.png │ └── logo │ │ ├── 01_make_logo.R │ │ ├── README │ │ ├── c-07.jpg │ │ ├── logo-slide.tiff │ │ └── logo.key │ ├── gutenberg_corpus.html │ ├── index.html │ ├── new_stemmer.html │ ├── print.corpus_frame.html │ ├── read_ndjson.html │ ├── sentiment_afinn.html │ ├── stem_snowball.html │ ├── stopwords.html │ ├── term_matrix.html │ ├── term_stats.html │ ├── text_filter.html │ ├── text_locate.html │ ├── text_split.html │ ├── text_stats.html │ ├── text_sub.html │ ├── text_tokens.html │ ├── text_types.html │ └── 
utf8.html ├── inst └── WORDLIST ├── man ├── abbreviations.Rd ├── affect_wordnet.Rd ├── corpus-deprecated.Rd ├── corpus-package.Rd ├── corpus_frame.Rd ├── corpus_text.Rd ├── federalist.Rd ├── figures │ ├── banner.png │ ├── logo.png │ └── logo │ │ ├── 01_make_logo.R │ │ ├── README │ │ ├── c-07.jpg │ │ ├── logo-slide.tiff │ │ └── logo.key ├── gutenberg_corpus.Rd ├── new_stemmer.Rd ├── print.corpus_frame.Rd ├── read_ndjson.Rd ├── sentiment_afinn.Rd ├── stem_snowball.Rd ├── stopwords.Rd ├── term_matrix.Rd ├── term_stats.Rd ├── text_filter.Rd ├── text_locate.Rd ├── text_split.Rd ├── text_stats.Rd ├── text_sub.Rd ├── text_tokens.Rd └── text_types.Rd ├── src ├── Makevars ├── context.c ├── decode.c ├── filebuf.c ├── init.c ├── json.c ├── logging.c ├── mkchar.c ├── ndjson.c ├── rcorpus.h ├── search.c ├── stemmer.c ├── term_matrix.c ├── term_stats.c ├── termset.c ├── text.c ├── text_c.c ├── text_filter.c ├── text_locate.c ├── text_methods.c ├── text_nunit.c ├── text_split.c ├── text_sub.c ├── text_tokens.c ├── text_trunc.c ├── text_types.c ├── util.c └── wordlist.c ├── tests ├── testthat.R └── testthat │ ├── helper-capture_output.R │ ├── helper-locale.R │ ├── helper-options.R │ ├── test-foreign.R │ ├── test-frame-stats.R │ ├── test-frame.R │ ├── test-gutenberg_corpus.R │ ├── test-json_record.R │ ├── test-json_scalar.R │ ├── test-json_serialize.R │ ├── test-read_ndjson.R │ ├── test-stemmer.R │ ├── test-term_counts.R │ ├── test-term_matrix.R │ ├── test-term_stats.R │ ├── test-text-stats.R │ ├── test-text.R │ ├── test-text_base.R │ ├── test-text_c.R │ ├── test-text_filter.R │ ├── test-text_format.R │ ├── test-text_index.R │ ├── test-text_locate.R │ ├── test-text_names.R │ ├── test-text_nunit.R │ ├── test-text_primitive.R │ ├── test-text_print.R │ ├── test-text_split_sentences.R │ ├── test-text_split_tokens.R │ ├── test-text_stats.R │ ├── test-text_sub.R │ ├── test-text_tokens.R │ ├── test-text_types.R │ └── test-wordlist.R └── vignettes ├── chinese-wordcloud-1.png ├── chinese.Rmd ├── chinese.Rmd.in ├── corpus-emotion-1.png ├── corpus-heapslaw-1.png ├── corpus-witch-occurrences-1.png ├── corpus.Rmd ├── corpus.Rmd.in ├── gender-estimates-1.png ├── gender-estimates_se-1.png ├── gender-signif-1.png ├── gender.Rmd ├── gender.Rmd.in ├── stemmer.Rmd ├── stemmer.Rmd.in ├── textdata.Rmd └── textdata.Rmd.in /.Rbuildignore: -------------------------------------------------------------------------------- 1 | [.]a$ 2 | [.]o$ 3 | [.]so$ 4 | ^[.]Rprofile$ 5 | ^[.]git 6 | ^[.]travis[.]yml$ 7 | ^_pkgdown[.]yml$ 8 | ^CONDUCT[.]md$ 9 | ^Makefile$ 10 | ^NEWS[.]md$ 11 | ^README[.]md$ 12 | ^TODO[.]md$ 13 | ^appveyor[.]yml$ 14 | ^bench$ 15 | ^data/abbreviations$ 16 | ^data/abbreviations/ 17 | ^data/affect_wordnet$ 18 | ^data/affect_wordnet/ 19 | ^data/federalist$ 20 | ^data/federalist/ 21 | ^data/sentiment_afinn$ 22 | ^data/sentiment_afinn/ 23 | ^data/stopwords$ 24 | ^data/stopwords/ 25 | ^dist$ 26 | ^docs$ 27 | ^man/figures/logo$ 28 | ^man/figures/logo/ 29 | ^src/corpus/[.]git$ 30 | ^src/corpus/[.]travis[.]yml$ 31 | ^src/corpus/Makefile$ 32 | ^src/corpus/data$ 33 | ^src/corpus/lib/utf8lite/[.]git$ 34 | ^src/corpus/lib/utf8lite/[.]travis[.]yml$ 35 | ^src/corpus/lib/utf8lite/Makefile$ 36 | ^src/corpus/lib/utf8lite/data$ 37 | ^vignettes/gender[.]Rmd 38 | ^vignettes/gender-.*png 39 | ^vignettes/.*[.]in$ 40 | ^_pkgdown\.yml$ 41 | -------------------------------------------------------------------------------- /.Rprofile: -------------------------------------------------------------------------------- 1 | if (interactive()) { 2 | 
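# development convenience: in interactive sessions, load the in-progress
# package source with devtools::load_all() when devtools is available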
if (requireNamespace("devtools", quietly = TRUE)) { 3 | devtools::load_all(".") 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *.so 4 | .Rhistory 5 | /NEWS 6 | /README 7 | /dist/ 8 | /docs/articles/*.Rmd.in 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/corpus"] 2 | path = src/corpus 3 | url = https://github.com/patperry/corpus.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: R 2 | cache: packages 3 | 4 | # valgrind (disabled; takes too long, and errors don't affect the build) 5 | # 6 | # r_check_args: '--use-valgrind' 7 | # 8 | # addons: 9 | # apt: 10 | # packages: 11 | # - valgrind 12 | 13 | r_github_packages: 14 | - jimhester/covr 15 | 16 | before_install: 17 | - make NEWS 18 | - make README 19 | 20 | # for devel version of 'utf8', put the following in `before_install`: 21 | # - pushd $(mktemp -d) 22 | # - git clone --recursive https://github.com/patperry/r-utf8.git 23 | # - Rscript -e 'devtools::install("r-utf8")' 24 | # - popd 25 | 26 | matrix: 27 | include: 28 | - os: linux 29 | r: oldrel 30 | - os: linux 31 | dist: trusty 32 | r: release 33 | env: R_CODECOV=true 34 | - os: linux 35 | r: devel 36 | 37 | warnings_are_errors: true 38 | 39 | after_success: 40 | - export LC_ALL="C" 41 | - export TEST_WEB_RESOURCES="true" 42 | - if [[ "${R_CODECOV}" ]]; then Rscript -e 'covr::codecov(line_exclusions = c("R/deprecated.R", "R/wordlist.R", "src/wordlist.c", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))'; fi 43 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, gender identity and expression, level of 9 | experience, nationality, personal appearance, race, religion, or sexual 10 | identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an 52 | appointed representative at an online or offline event. Representation of a 53 | project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by opening an issue or contacting one or more of the project 59 | maintainers. All complaints will be reviewed and investigated and will result 60 | in a response that is deemed necessary and appropriate to the circumstances. 61 | The project team is obligated to maintain confidentiality with regard to the 62 | reporter of an incident. Further details of specific enforcement policies may 63 | be posted separately. 64 | 65 | Project maintainers who do not follow or enforce the Code of Conduct in good 66 | faith may face temporary or permanent repercussions as determined by other 67 | members of the project's leadership. 
68 | 69 | ## Attribution 70 | 71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 72 | version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 73 | 74 | [homepage]: http://contributor-covenant.org 75 | [version]: http://contributor-covenant.org/version/1/4/ 76 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: corpus 2 | Version: 0.10.0.9000 3 | Title: Text Corpus Analysis 4 | Authors@R: c( 5 | person(c("Patrick", "O."), "Perry", 6 | role = c("aut", "cph", "cre"), 7 | email = "pperry@stern.nyu.edu"), 8 | person(c("Finn", "\u00c5rup"), "Nielsen", 9 | role = c("cph", "dtc"), 10 | comment = "AFINN Sentiment Lexicon"), 11 | person("Martin Porter and Richard Boulton", 12 | role = c("ctb", "cph", "dtc"), 13 | comment = "Snowball Stemmer and Stopword Lists"), 14 | person("The Regents of the University of California", 15 | role = c("ctb", "cph"), 16 | comment = "Strtod Library Procedure"), 17 | person("Carlo Strapparava and Alessandro Valitutti", 18 | role = c("cph", "dtc"), 19 | comment = "WordNet-Affect Lexicon"), 20 | person("Unicode, Inc.", 21 | role = c("cph", "dtc"), 22 | comment = "Unicode Character Database")) 23 | Depends: 24 | R (>= 3.3), 25 | Imports: 26 | stats, 27 | utf8 (>= 1.1.0) 28 | Suggests: 29 | knitr, 30 | Matrix, 31 | testthat 32 | Enhances: 33 | quanteda, 34 | tm 35 | Description: Text corpus data analysis, with full support for international text (Unicode). Functions for reading data from newline-delimited 'JSON' files, for normalizing and tokenizing text, for searching for term occurrences, and for computing term occurrence frequencies, including n-grams. 36 | License: Apache License (== 2.0) | file LICENSE 37 | URL: http://corpustext.com, 38 | https://github.com/patperry/r-corpus 39 | BugReports: https://github.com/patperry/r-corpus/issues 40 | LazyData: Yes 41 | Encoding: UTF-8 42 | VignetteBuilder: knitr 43 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RSCRIPT= Rscript --vanilla 2 | CORPUS_LIB= src/corpus.so 3 | BUILT_VIGNETTES= \ 4 | vignettes/chinese.Rmd vignettes/corpus.Rmd vignettes/gender.Rmd \ 5 | vignettes/stemmer.Rmd vignettes/textdata.Rmd 6 | 7 | all: $(CORPUS_LIB) $(BUILT_VIGNETTES) 8 | 9 | $(CORPUS_LIB): 10 | $(RSCRIPT) -e 'devtools::compile_dll(".")' 11 | 12 | NEWS: NEWS.md 13 | sed -e 's/^### //g; s/`//g' $< > $@ 14 | 15 | README: README.md 16 | sed -e '/\*Corpus\*/,$$!d' \ 17 | -e 's/…../.../' \ 18 | -e 's/..…/.../' \ 19 | -e 's/⋮/./' $< > $@ 20 | 21 | vignettes/%.Rmd: vignettes/%.Rmd.in 22 | $(RSCRIPT) -e 'devtools::load_all("."); setwd("vignettes"); knitr::knit(basename("$<"), basename("$@"))' 23 | 24 | bench: 25 | $(RSCRIPT) -e 'devtools::load_all("."); source("bench/bench.R")' 26 | 27 | check: $(CORPUS_LIB) 28 | $(RSCRIPT) -e 'devtools::test(".")' 29 | 30 | clean: 31 | $(RSCRIPT) -e 'devtools::clean_dll(".")' 32 | 33 | cov: 34 | $(RSCRIPT) -e 'covr::package_coverage(line_exclusions = c("R/deprecated.R", list.files("src/corpus", recursive = TRUE, full.names = TRUE)))' 35 | 36 | dist: $(BUILT_VIGNETTES) NEWS README 37 | mkdir -p dist && cd dist && R CMD build .. 
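# 'dist' builds a source tarball under ./dist; 'distclean' below also removes
# the vignettes generated from the .Rmd.in templates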
38 | 39 | distclean: clean 40 | rm -rf $(BUILT_VIGNETTES) 41 | 42 | doc: $(BUILT_VIGNETTES) NEWS README 43 | 44 | install: $(CORPUS_LIB) 45 | $(RSCRIPT) -e 'devtools::install(".")' 46 | 47 | site: $(BUILT_VIGNETTES) 48 | $(RSCRIPT) -e 'pkgdown::build_site(".")' 49 | 50 | .PHONY: all bench check clean con dist distclean doc install site 51 | -------------------------------------------------------------------------------- /R/deprecated.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /R/frame-stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | na.fail.corpus_frame <- function(object, ...) 16 | { 17 | if (!any(vapply(object, anyNA, FALSE))) 18 | object 19 | else stop("missing values in object") 20 | } 21 | 22 | 23 | na.omit.corpus_frame <- function(object, ...) 24 | { 25 | if (!any(vapply(object, anyNA, FALSE))) { 26 | return(object) 27 | } 28 | 29 | # find the missing entries; cast to a matrix 30 | na <- matrix(c(lapply(object, is.na), recursive = TRUE), 31 | ncol = length(object)) 32 | 33 | # find rows containing missing entries 34 | omit <- which(apply(na, 1, any)) 35 | names(omit) <- rownames(object)[omit] 36 | 37 | # drop the rows that miss observations 38 | object <- object[-omit,,drop = FALSE] 39 | attr(omit, "class") <- "omit" 40 | attr(object, "na.action") <- omit 41 | object 42 | } 43 | 44 | 45 | na.exclude.corpus_frame <- function(object, ...) 46 | { 47 | object <- na.omit.corpus_frame(object, ...) 48 | exclude <- attr(object, "na.action") 49 | if (is.null(exclude)) { 50 | return(object) 51 | } 52 | 53 | attr(exclude, "class") <- "exclude" 54 | attr(object, "na.action") <- exclude 55 | object 56 | } 57 | -------------------------------------------------------------------------------- /R/logging.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | logging_off <- function() 17 | { 18 | .Call(C_logging_off) 19 | } 20 | 21 | 22 | logging_on <- function() 23 | { 24 | .Call(C_logging_on) 25 | } 26 | -------------------------------------------------------------------------------- /R/sentence.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_nsentence <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | .Call(C_text_nsentence, x) 22 | } 23 | -------------------------------------------------------------------------------- /R/stem.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
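# Usage sketches for the stemmers defined below (illustrative only):
#
#   stem_snowball(c("running", "jumps"))     # "run" "jump" (default "en" algorithm)
#
#   stem <- new_stemmer(c("geese", "mice"), c("goose", "mouse"))
#   stem(c("geese", "cats"))                 # "goose" "cats"; terms without a
#                                            # match pass through unchanged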
14 | 15 | 16 | stem_snowball <- function(x, algorithm = "en") 17 | { 18 | with_rethrow({ 19 | x <- as_character_vector("x", x) 20 | algorithm <- as_snowball_algorithm("algorithm", algorithm) 21 | }) 22 | 23 | .Call(C_stem_snowball, x, algorithm) 24 | } 25 | 26 | 27 | new_stemmer<- function(term, stem, default = NULL, duplicates = "first", 28 | vectorize = TRUE) 29 | { 30 | call <- sys.call() 31 | with_rethrow({ 32 | term <- as_character_vector("term", term) 33 | stem <- as_character_vector("stem", stem) 34 | default <- as_character_scalar("default", default) 35 | duplicates <- as_enum("duplicates", duplicates, 36 | c("first", "last", "omit", "fail")) 37 | }) 38 | 39 | if (is.null(term)) { 40 | term <- character() 41 | } 42 | 43 | if (is.null(stem)) { 44 | stem <- character() 45 | } 46 | 47 | if (length(term) != length(stem)) { 48 | stop("'term' argument length must equal 'stem' argument length") 49 | } 50 | 51 | if (duplicates == "last") { 52 | term <- rev(term) 53 | stem <- rev(stem) 54 | duplicates <- "first" 55 | } 56 | 57 | if (duplicates != "first") { 58 | dup <- duplicated(term) 59 | if (duplicates == "omit") { 60 | dups <- term[dup] 61 | rm <- term %in% dups 62 | term <- term[!rm] 63 | stem <- stem[!rm] 64 | } else if (any(dup)) { # duplicates == "fail" 65 | stop("'term' argument entries must be unique") 66 | } 67 | } 68 | 69 | # parse dynamically so that we can add a comment with the function call 70 | comment <- paste(" #", deparse(call), collapse = "\n") 71 | if (is.null(default)) { 72 | src <- paste('function(x) {', 73 | comment, 74 | ' i <- match(x, term, 0L)', 75 | ' if (i > 0L)', 76 | ' stem[[i]]', 77 | ' else x', 78 | '}', 79 | sep = '\n') 80 | } else { 81 | src <- paste('function(x) {', 82 | comment, 83 | ' i <- match(x, term, 0L)', 84 | ' if (i > 0L)', 85 | ' stem[[i]]', 86 | ' else default', 87 | '}', 88 | sep = '\n') 89 | } 90 | 91 | env <- new.env() 92 | assign("term", term, env) 93 | assign("stem", stem, env) 94 | assign("default", default, env) 95 | stem_term <- eval(parse(text = src), env) 96 | 97 | if (vectorize) { 98 | vsrc <- paste('function(x) {', 99 | comment, 100 | ' vapply(x, stem_term, "", USE.NAMES = !is.null(names(x)))', 101 | '}', 102 | sep = '\n') 103 | assign("stem_term", stem_term, env) 104 | stem_term <- eval(parse(text = vsrc, keep.source = TRUE), env) 105 | } 106 | 107 | stem_term 108 | } 109 | -------------------------------------------------------------------------------- /R/style.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | # RStudio doesn't support ANSI faint, use gray instead 17 | style_faint <- "38;5;246" #666666 18 | 19 | # RStudio ANSI bold is broken, use color instead 20 | # https://github.com/rstudio/rstudio/issues/1721 21 | style_bold <- "38;5;203" #FF3333 22 | #style_bold <- "36" # cyan 23 | #style_bold <- "38;5;63" #3333FF 24 | -------------------------------------------------------------------------------- /R/text-stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | na.fail.corpus_text <- function(object, ...) 16 | { 17 | if (!anyNA(object)) 18 | object 19 | else stop("missing values in object") 20 | } 21 | 22 | 23 | na.omit.corpus_text <- function(object, ...) 24 | { 25 | if (!anyNA(object)) { 26 | return(object) 27 | } 28 | 29 | omit <- which(is.na(object)) 30 | names(omit) <- names(object)[omit] 31 | object <- object[-omit] 32 | attr(omit, "class") <- "omit" 33 | attr(object, "na.action") <- omit 34 | object 35 | } 36 | 37 | 38 | na.exclude.corpus_text <- function(object, ...) 39 | { 40 | object <- na.omit.corpus_text(object, ...) 41 | exclude <- attr(object, "na.action") 42 | if (is.null(exclude)) { 43 | return(object) 44 | } 45 | 46 | attr(exclude, "class") <- "exclude" 47 | attr(object, "na.action") <- exclude 48 | object 49 | } 50 | -------------------------------------------------------------------------------- /R/text-utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | str.corpus_text <- function(object, ...) 16 | { 17 | n <- length(object) 18 | if (n == 0) { 19 | "text(0)" 20 | } else { 21 | paste0("text [1:", n, "]") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /R/text.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | as_corpus_text <- function(x, filter = NULL, ..., names = NULL) 16 | { 17 | UseMethod("as_corpus_text") 18 | } 19 | 20 | 21 | as_corpus_text.default <- function(x, filter = NULL, ..., names = NULL) 22 | { 23 | if (length(dim(x)) > 1) { 24 | stop("cannot convert multi-dimensional array to text") 25 | } 26 | 27 | x <- structure(as.character(x), names = names(x)) 28 | as_corpus_text(x, filter = filter, ..., names = names) 29 | } 30 | 31 | 32 | as_corpus_text.character <- function(x, filter = NULL, ..., names = NULL) 33 | { 34 | if (length(dim(x)) > 1) { 35 | stop("cannot convert multi-dimensional array to text") 36 | } 37 | 38 | with_rethrow({ 39 | x <- as_utf8(x) 40 | }) 41 | 42 | if (is.null(names)) { 43 | names <- names(x) 44 | if (anyDuplicated(names)) { 45 | warning("renaming entries with duplicate names") 46 | names <- make.unique(names) 47 | } 48 | } 49 | 50 | x <- .Call(C_as_text_character, x, NULL) 51 | as_corpus_text(x, filter = filter, ..., names = names) 52 | } 53 | 54 | 55 | as_corpus_text.corpus_json <- function(x, filter = NULL, ..., names = NULL) 56 | { 57 | if (length(dim(x)) == 2) { 58 | if (!"text" %in% names(x)) { 59 | stop("no column named \"text\" in JSON object") 60 | } 61 | x <- x[["text"]] 62 | } else { 63 | x <- .Call(C_as_text_json, x, NULL) 64 | } 65 | as_corpus_text(x, filter = filter, ..., names = names) 66 | } 67 | 68 | 69 | as_corpus_text.corpus_text <- function(x, filter = NULL, ..., names = NULL) 70 | { 71 | if (!is_corpus_text(x)) { 72 | stop("argument is not a valid text object") 73 | } 74 | 75 | with_rethrow({ 76 | filter <- as_filter("filter", filter) 77 | names <- as_names("names", names, length(x)) 78 | }) 79 | 80 | attrs <- attributes(x) 81 | for (a in names(attrs)) { 82 | if (!a %in% c("class", "names")) { 83 | attr(x, a) <- NULL 84 | } 85 | } 86 | attr(x, "class") <- "corpus_text" 87 | 88 | if (!is.null(names)) { 89 | names(x) <- names 90 | } 91 | if (!is.null(filter)) { 92 | text_filter(x) <- filter 93 | } 94 | 95 | props <- list(...) 
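# any remaining named arguments in ... are treated as text filter
# properties and applied to the result below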
96 | if (length(props) > 0) { 97 | pnames <- names(props) 98 | if (is.null(pnames) || any(pnames == "")) { 99 | stop("unnamed arguments are not allowed") 100 | } 101 | f <- text_filter(x) 102 | for (name in names(props)) { 103 | f[[name]] <- props[[name]] 104 | } 105 | text_filter(x) <- f 106 | } 107 | 108 | x 109 | } 110 | 111 | 112 | as_corpus_text.data.frame <- function(x, filter = NULL, ..., names = NULL) 113 | { 114 | if (!is.data.frame(x)) { 115 | stop("argument is not a valid data frame") 116 | } 117 | if (!"text" %in% names(x)) { 118 | stop("no column named \"text\" in data frame") 119 | } 120 | 121 | text <- x[["text"]] 122 | if (.row_names_info(x) > 0) { 123 | names(text) <- row.names(x) 124 | } 125 | 126 | as_corpus_text(text, filter = filter, ..., names = names) 127 | } 128 | 129 | 130 | # tm::Corpus 131 | as_corpus_text.Corpus <- function(x, filter = NULL, ..., names = NULL) 132 | { 133 | with_package("tm", { 134 | x <- vapply(x, as.character, "") 135 | }) 136 | as_corpus_text(x, filter = filter, ..., names = names) 137 | } 138 | 139 | # quanteda::corpus 140 | as_corpus_text.corpus <- function(x, filter = NULL, ..., names = NULL) 141 | { 142 | with_package("quanteda", { 143 | text <- quanteda::texts(x) 144 | }) 145 | as_corpus_text(text, filter = filter, ..., names = names) 146 | } 147 | 148 | 149 | is_corpus_text <- function(x) 150 | { 151 | if (!inherits(x, "corpus_text")) { 152 | return(FALSE) 153 | } 154 | .Call(C_text_valid, x) 155 | } 156 | -------------------------------------------------------------------------------- /R/text_split.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_split <- function(x, units = "sentences", size = 1, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | units <- as_enum("units", units, choices = c("sentences", "tokens")) 21 | size <- as_size(size) 22 | }) 23 | 24 | if (units == "sentences") { 25 | ans <- .Call(C_text_split_sentences, x, size) 26 | } else { 27 | stopifnot(units == "tokens") 28 | ans <- .Call(C_text_split_tokens, x, size) 29 | } 30 | 31 | ans$parent <- structure(as.integer(ans$parent), class = "factor", 32 | levels = labels(x)) 33 | ans 34 | } 35 | -------------------------------------------------------------------------------- /R/text_stats.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_stats <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | 22 | ans <- data.frame(tokens = text_ntoken(x), 23 | types = text_ntype(x), 24 | sentences = text_nsentence(x), 25 | row.names = names(x)) 26 | class(ans) <- c("corpus_frame", "data.frame") 27 | ans 28 | } 29 | -------------------------------------------------------------------------------- /R/text_types.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_ntype <- function(x, filter = NULL, collapse = FALSE, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | collapse <- as_option("collapse", collapse) 21 | }) 22 | .Call(C_text_ntype, x, collapse) 23 | } 24 | 25 | 26 | text_types <- function(x, filter = NULL, collapse = FALSE, ...) 27 | { 28 | with_rethrow({ 29 | x <- as_corpus_text(x, filter, ...) 30 | collapse <- as_option("collapse", collapse) 31 | }) 32 | typs <- .Call(C_text_types, x, collapse) 33 | if (collapse) { 34 | typs <- sort(typs, method = "radix") 35 | } else { 36 | typs <- lapply(typs, sort, method = "radix") 37 | } 38 | typs 39 | } 40 | -------------------------------------------------------------------------------- /R/token.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | text_tokens <- function(x, filter = NULL, ...) 17 | { 18 | with_rethrow({ 19 | x <- as_corpus_text(x, filter, ...) 20 | }) 21 | .Call(C_text_tokens, x) 22 | } 23 | 24 | 25 | text_ntoken <- function(x, filter = NULL, ...) 26 | { 27 | with_rethrow({ 28 | x <- as_corpus_text(x, filter, ...) 29 | }) 30 | .Call(C_text_ntoken, x) 31 | } 32 | 33 | 34 | text_sub <- function(x, start = 1L, end = -1L, filter = NULL, ...) 
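# extracts token subsequences; 'start' is recycled along x and may be a
# two-column [start, end] matrix, in which case 'end' is ignored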
35 | { 36 | with_rethrow({ 37 | x <- as_corpus_text(x, filter, ...) 38 | }) 39 | n <- length(x) 40 | 41 | if (!(is.numeric(start) 42 | && (length(dim(start)) <= 1 43 | || is.matrix(start) && ncol(start) == 2))) { 44 | stop("'start' must be an integer vector or two-column matrix") 45 | } 46 | 47 | nstart <- if (is.matrix(start)) nrow(start) else length(start) 48 | if ((nstart == 0 && n > 0) || (nstart > 0 && n %% nstart != 0)) { 49 | stop("'start' length does not evenly divide argument length") 50 | } 51 | 52 | if (is.matrix(start)) { 53 | if (!missing(end)) { 54 | warning("'end' argument is ignored when 'start' is a two-column matrix") 55 | } 56 | end <- as.integer(start[,2]) 57 | start <- as.integer(start[,1]) 58 | } else { 59 | start <- as.integer(start) 60 | 61 | if (!(is.numeric(end) && length(dim(end)) <= 1)) { 62 | stop("'end' must be an integer vector") 63 | } 64 | 65 | nend <- length(end) 66 | if ((nend == 0 && n > 0) || (nend > 0 && n %% nend != 0)) { 67 | stop("'end' length does not evenly divide argument length") 68 | } 69 | end <- as.integer(end) 70 | } 71 | 72 | .Call(C_text_sub, x, start, end) 73 | } 74 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | with_rethrow <- function(expr) 17 | { 18 | parentcall <- sys.call(-1) 19 | eval(envir = parent.frame(), 20 | withCallingHandlers(expr, 21 | error = function(e, call = parentcall) { 22 | e$call <- call 23 | stop(e) 24 | }, 25 | warning = function(w, call = parentcall) { 26 | w$call <- call 27 | warning(w) 28 | invokeRestart("muffleWarning") 29 | }, 30 | message = function(m, call = parentcall) { 31 | m$call <- call 32 | } 33 | ) 34 | ) 35 | } 36 | 37 | 38 | with_package <- function(package, expr) 39 | { 40 | if (!isNamespaceLoaded(package)) { 41 | if (!requireNamespace(package, quietly = TRUE)) { 42 | stop(sprintf("Failed attaching name space for package '%s'", 43 | package)) 44 | } 45 | } 46 | 47 | force(expr) 48 | expr 49 | } 50 | -------------------------------------------------------------------------------- /R/wordlist.R: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Patrick O. Perry. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
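# deprecated word-list accessors: the functions below signal .Deprecated()
# in favor of the packaged data sets (abbreviations_*, stopwords_*)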
14 | 15 | 16 | abbreviations <- function(kind = "english") 17 | { 18 | .Deprecated("abbreviations_en") 19 | with_rethrow({ 20 | wordlist(kind, function(k) .Call(C_abbreviations, k)) 21 | }) 22 | } 23 | 24 | 25 | stopwords <- function(kind = "english") 26 | { 27 | .Deprecated("stopwords_en") 28 | with_rethrow({ 29 | wordlist(kind, function(k) .Call(C_stopwords, k)) 30 | }) 31 | } 32 | 33 | 34 | wordlist <- function(kind, call) 35 | { 36 | kind <- as_kind(kind) 37 | 38 | words <- character() 39 | for (k in kind) { 40 | wk <- call(k) 41 | words <- c(words, wk) 42 | } 43 | 44 | if (length(words) == 0) { 45 | return(NULL) 46 | } 47 | 48 | words <- unique(sort(words, method = "radix")) 49 | words 50 | } 51 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | To Do 2 | ===== 3 | 4 | Bugs 5 | ---- 6 | 7 | (no known bugs) 8 | 9 | 10 | Features 11 | -------- 12 | 13 | * Add a `text_untoken()` function to turn token sequence into text: 14 | insert word-joiner (U+2060) to keep multi-word phrases together; 15 | put specified space character (ZWSP or SP) between tokens 16 | 17 | * wrap.pad, width arguments to `utf8_print` 18 | 19 | * `token_kind` and `token_map` functions (?) 20 | 21 | * Add demonstration of dictionary scaling with `text_match`: 22 | 23 | m <- text_match(x, dict$term) 24 | score <- tapply(dict$score[m$term], m$text, mean, default = 0) 25 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | params: 3 | ganalytics: UA-4636081-3 4 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | - ps: Bootstrap 12 | - git submodule update --init --recursive 13 | 14 | cache: 15 | - C:\RLibrary 16 | 17 | # Adapt as necessary starting from here 18 | 19 | build_script: 20 | - travis-tool.sh install_deps 21 | 22 | test_script: 23 | - travis-tool.sh run_tests 24 | 25 | on_failure: 26 | - 7z a failure.zip *.Rcheck\* 27 | - appveyor PushArtifact failure.zip 28 | 29 | artifacts: 30 | - path: '*.Rcheck\**\*.log' 31 | name: Logs 32 | 33 | - path: '*.Rcheck\**\*.out' 34 | name: Logs 35 | 36 | - path: '*.Rcheck\**\*.fail' 37 | name: Logs 38 | 39 | - path: '*.Rcheck\**\*.Rout' 40 | name: Logs 41 | 42 | - path: '\*_*.tar.gz' 43 | name: Bits 44 | 45 | - path: '\*_*.zip' 46 | name: Bits 47 | -------------------------------------------------------------------------------- /bench/bench-term_matrix.R: -------------------------------------------------------------------------------- 1 | library("dplyr", warn.conflicts = FALSE) 2 | library("janeaustenr") 3 | library("magrittr") 4 | library("stringr") 5 | 6 | lines <- (austen_books() 7 | %>% group_by(book) 8 | %>% mutate( 9 | linenumber = row_number(), 10 | chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 11 | ignore_case = TRUE)))) 12 | %>% ungroup()) 13 | 14 | text <- c(tapply(lines$text, paste(lines$book, lines$chapter), 15 | 
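# collapse each book chapter's lines into a single text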
paste, collapse = "\n")) 16 | if (packageVersion("janeaustenr") < '0.1.5') { 17 | text <- iconv(text, "latin1", "UTF-8") 18 | } 19 | 20 | stop_words <- stopwords("english") 21 | 22 | make_matrix <- function(text, ngrams = 1) { 23 | f <- corpus::token_filter(stemmer = "english", drop_punct = TRUE, 24 | drop_number = TRUE, drop = stop_words) 25 | stats <- corpus::term_counts(text, f, ngrams = ngrams, min = 5) 26 | x <- corpus::term_matrix(text, f, select = stats$term) 27 | x 28 | } 29 | 30 | results <- microbenchmark::microbenchmark( 31 | unigrams = make_matrix(text, 1), 32 | bigrams = make_matrix(text, 1:2), 33 | trigrams = make_matrix(text, 1:3), 34 | "4-grams" = make_matrix(text, 1:4), 35 | "5-grams" = make_matrix(text, 1:5), 36 | times = 5 37 | ) 38 | 39 | print(results) 40 | -------------------------------------------------------------------------------- /bench/bench-term_matrix.Rout: -------------------------------------------------------------------------------- 1 | Unit: milliseconds 2 | expr min lq mean median uq max neval 3 | unigrams 236.2898 236.8613 270.4002 247.5848 257.6114 373.6539 5 4 | bigrams 297.9806 300.2708 309.5385 301.1933 321.1791 327.0687 5 5 | trigrams 302.7908 314.1854 322.2455 315.6736 318.7499 359.8280 5 6 | 4-grams 307.0178 318.1708 318.1261 318.5333 323.0232 323.8856 5 7 | 5-grams 305.0779 314.1598 316.7992 314.9060 321.3384 328.5137 5 8 | -------------------------------------------------------------------------------- /bench/bench.R: -------------------------------------------------------------------------------- 1 | 2 | Sys.setlocale(locale = "C") 3 | files <- dir("bench", "^bench-.*\\.[rR]$", full.names = TRUE) 4 | for (file in files) { 5 | name <- substr(file, 1, nchar(file) - 2) 6 | message("Running ", name, "...", appendLF = FALSE) 7 | time <- proc.time() 8 | sink(paste0(file, "out")) 9 | set.seed(0) 10 | NS <- new.env() 11 | source(file, local = NS) 12 | sink() 13 | new_time <- proc.time() 14 | diff <- summary(structure(new_time - time, class = "proc_time")) 15 | elapsed <- diff[["user"]] + diff[["system"]] 16 | message("done. 
(", elapsed, "s)") 17 | } 18 | -------------------------------------------------------------------------------- /data/abbreviations.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/abbreviations.rda -------------------------------------------------------------------------------- /data/abbreviations/01_make_abbreviations.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | kinds <- c(de = "german", 4 | en = "english", 5 | es = "spanish", 6 | fr = "french", 7 | it = "italian", 8 | pt = "portuguese", 9 | ru = "russian") 10 | 11 | for (lang in names(kinds)) { 12 | words <- suppressWarnings(corpus:::abbreviations(kinds[[lang]])) 13 | words <- stringr::str_sort(words, locale = lang) 14 | assign(paste0("abbreviations_", lang), words) 15 | } 16 | 17 | filename <- file.path("..", paste0("abbreviations.rda")) 18 | save(list = paste0("abbreviations_", names(kinds)), file = filename) 19 | tools::resaveRdaFiles(filename) 20 | 21 | -------------------------------------------------------------------------------- /data/affect_wordnet.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/affect_wordnet.rda -------------------------------------------------------------------------------- /data/affect_wordnet/02_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | raw <- read.table("wnaffect.tsv", header = TRUE, stringsAsFactors = FALSE) 4 | raw$pos <- factor(raw$pos, levels = c("NOUN", "ADJ", "VERB", "ADV")) 5 | raw$category <- factor(raw$category, levels = unique(raw$category)) 6 | raw$emotion <- factor(raw$emotion, levels = unique(raw$emotion)) 7 | 8 | affect_wordnet <- raw 9 | class(affect_wordnet) <- c("corpus_frame", "data.frame") 10 | 11 | save(affect_wordnet, file = "../affect_wordnet.rda") 12 | tools::resaveRdaFiles("../affect_wordnet.rda") 13 | -------------------------------------------------------------------------------- /data/affect_wordnet/README: -------------------------------------------------------------------------------- 1 | WordNet 1.6 2 | ----------- 3 | 4 | URL: https://wordnet.princeton.edu 5 | License: custom (MIT-like) 6 | Path: wordnet-1.6 7 | 8 | Dowloaded from https://wordnet.princeton.edu/wordnet/download/old-versions 9 | 10 | 11 | WordNet-Affect Lexicon 12 | ---------------------- 13 | 14 | URL: http://wndomains.fbk.eu/wnaffect.html 15 | License: Creative Commons Attribution 3.0 Unported License 16 | Path: wn-affect-1.1 17 | 18 | Downloaded from https://github.com/larsmans/wordnet-domains-sentiwords 19 | 20 | 21 | Notes 22 | ----- 23 | 24 | + wn-affect-1.1/a-hierarchy.xml organizes emotion aspects into a hierarchy 25 | 26 | + wn-affect-1.1/a-synsets.xml associates named aspects with wordnet-1.6 synsets 27 | 28 | + some named aspects do not have associated synsets, e.g. "merriment" 29 | 30 | + three categories in `a-synsets` do not appear in `a-hierarchy`. 
We remap 31 | them as follows: 32 | 33 | joy-pride -> self-pride 34 | levity-gaiety -> playfulness 35 | general-gaiety -> merriment 36 | 37 | + we merge neutral and ambiguous emotion categories into "neutral" 38 | -------------------------------------------------------------------------------- /data/affect_wordnet/wn-affect-1.1/readme-wn-affect-1.1.txt: -------------------------------------------------------------------------------- 1 | Characteristics of WordNet-Affect 1.1 2 | 3 | This version includes a smaller number of synsets but the semantic 4 | organization is more well-structured. 5 | 6 | a) Affective Hierarchy 7 | 8 | The affective label "emotion" is expanded in order to include a subset of 9 | new labels, identifying emotional states. These labels, named "affective 10 | categories", are hierarchically organized. 11 | 12 | 13 | b) Valence 14 | 15 | The hierarchy was initially obtained from the hyponym subtree of the 16 | synset "n#feeling#1", but some modifications were performed in order to 17 | classify affective synsets according to emotional valence. In particular, 18 | affective categories are partitioned in 4 classes: "positive" (e.g. joy), 19 | "negative" (e.g. sadness), "ambiguous" (e.g. surprise), and "neutral" 20 | (e.g. apathy). 21 | 22 | c) Causative/Stative Attribute 23 | 24 | Synsets of part of speech (pos) "adjective", "verb", and "adverb" 25 | present an addictional label representing their "causative" or "stative" 26 | semantic function. For example, an emotional adjective is "causative" 27 | if it refers to some emotion that is caused by the entity represented by 28 | the modified noun (e.g. "amusing movie"). On the other hand, an emotional 29 | adjective is "stative" if it refers to the emotion owned or felt by the 30 | subject denoted by the modified noun (e.g. "cheerful/happy boy"). 31 | 32 | --------------------------------------------------------------------- 33 | Differences with respect to WordNet-Affect 1.0 34 | 35 | - Source files are formatted in XML standard. 36 | 37 | - Previous affective labels were renamed and expressed without 38 | abbreviations. The mapping between previous and present labels is the 39 | following: 40 | 41 | phy -> physical-state 42 | beh -> behaviour 43 | sit -> (emotion eliciting) situation 44 | tra -> trait 45 | sen -> sensation 46 | cog -> cognitive-state 47 | moo -> mood 48 | emo -> emotion 49 | eds -> edonic-signal 50 | 51 | - We removed the label "core" (referring to manually annotated synsets) 52 | and other labels automatically added to synsets (by application of WordNet 53 | relations, such as "similar-to"). In fact, all synsets in WordNet-Affect 54 | 1.1 were manually reviewed and it is no more useful to trace how they 55 | were collected. 56 | 57 | - Synsets that are not tagged with the label "emo(tion)" in the previous 58 | version are not present in current release. In order to retrieve these 59 | synsets, you have to refer to source files of WordNet-Affect 1.0 60 | 61 | --------------------------------------------------------------------- 62 | File description 63 | 64 | a-hierarchy.xml: 65 | 66 | Includes the affective hierarchy. Each item has 2 attributes: 67 | 68 | name = affective category label 69 | isa = category parent in the hierarchy 70 | 71 | 72 | a-synsets.xml: 73 | 74 | Includes synsets associated with the affective hierarchy. Synsets are 75 | classified according to their pos. 
76 | 77 | Synsets of pos "noun" have the following attributes: 78 | 79 | id = label identifying current synset 80 | categ = affective category label 81 | 82 | Synsets of other pos ("adjective", "verb", and "adverb") have the 83 | following attributes: 84 | 85 | id = label identifying current synset 86 | noun-id = id of the noun synset from which the current one was derived 87 | causat-stat = causative/stative label 88 | 89 | The reason why not-noun synsets are connected to the affective categories 90 | via noun synsets is because this relation allows us to study to what 91 | extent the causative/stative character of adjectives, verbs and adverbs 92 | depends on the morphological variation of nouns. In the next release of 93 | WordNet-Affect, it is reasonable to characterize this semantic function 94 | of morphology. 95 | 96 | --------------------------------------------------------------------- 97 | Plans for the future 98 | 99 | The next version of WordNet-Affect will include all synsets that 100 | in WordNet-Affect 1.0 are annotated with labels different from "emo" 101 | (emotion) and that are not included in the current release. In particular, 102 | we want to distinguish labels representing mental states (e.g. cognitive 103 | states, attitudes), attributes of mental states (e.g. valence, intensity 104 | or level of arousal), and other semantic characteristics (e.c. behaviours, 105 | emotion-eliciting situations, emotional responces). Finally, we want to 106 | select only one label for each synset, taking into account its hypernyms. 107 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/LICENSE: -------------------------------------------------------------------------------- 1 | WordNet Release 1.6 2 | 3 | This software and database is being provided to you, the LICENSEE, by 4 | Princeton University under the following license. By obtaining, using 5 | and/or copying this software and database, you agree that you have 6 | read, understood, and will comply with these terms and conditions.: 7 | 8 | Permission to use, copy, modify and distribute this software and 9 | database and its documentation for any purpose and without fee or 10 | royalty is hereby granted, provided that you agree to comply with 11 | the following copyright notice and statements, including the disclaimer, 12 | and that the same appear on ALL copies of the software, database and 13 | documentation, including modifications that you make for internal 14 | use or for distribution. 15 | 16 | WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved. 17 | 18 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON 19 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 20 | IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON 21 | UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- 22 | ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE 23 | OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT 24 | INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 25 | OTHER RIGHTS. 26 | 27 | The name of Princeton University or Princeton may not be used in 28 | advertising or publicity pertaining to distribution of the software 29 | and/or database. Title to copyright in this software, database and 30 | any associated documentation shall at all times remain with 31 | Princeton University and LICENSEE agrees to preserve same. 
32 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | 3 | # Makefile for WordNet 1.6 database directory "dict" 4 | 5 | WN_ROOT = /usr/local/wordnet1.6 6 | WN_INSTALLDIR = $(WN_ROOT)/dict 7 | 8 | INSTALLCMD = cp 9 | INSTALLFLAGS = -p 10 | 11 | WN_FILES = data.noun data.verb data.adj data.adv index.noun index.verb index.adj index.adv noun.exc verb.exc adj.exc adv.exc cousin.exc cousin.tops index.sense cntlist lexnames index.gloss sentidx.vrb sents.vrb stoplist.pl 12 | 13 | all: $(WN_FILES) 14 | 15 | install: $(WN_FILES) 16 | @if [ ! -d $(WN_INSTALLDIR) ] ; then \ 17 | echo "Making directory $(WN_INSTALLDIR)" ; \ 18 | mkdir -p $(WN_INSTALLDIR) ; \ 19 | chmod 755 $(WN_INSTALLDIR) ; \ 20 | fi ; 21 | @echo "Installing database files in $(WN_INSTALLDIR)" 22 | @for file in $(WN_FILES) ; \ 23 | do \ 24 | filename=$(WN_INSTALLDIR)/$$file ; \ 25 | if [ -f $$filename ] ; then \ 26 | echo "Cannot install $$filename: file exists" ; \ 27 | else \ 28 | echo "Installing $$filename" ; \ 29 | $(INSTALLCMD) $(INSTALLFLAGS) $$file $$filename ; \ 30 | fi ; \ 31 | done ; 32 | @echo "Done installing database files in $(WN_INSTALLDIR)" 33 | 34 | uninstall: 35 | @echo "Cannot uninstall database files automatically" ; \ 36 | echo "You must delete them from $(WN_INSTALLDIR) manually" ; \ 37 | echo "This is dangerous if you set INSTALLCMD to 'mv'" ; \ 38 | echo "Since this is your only copy of WordNet" ; 39 | 40 | reallyuninstall: 41 | @echo "Uninstalling database files from $(WN_INSTALLDIR)" 42 | @for file in $(WN_FILES) ; \ 43 | do \ 44 | filename=$(WN_INSTALLDIR)/$$file ; \ 45 | if [ ! -f $$filename ] ; then \ 46 | echo "Cannot uninstall $$filename: not present" ; \ 47 | else \ 48 | echo "Uninstalling $$filename" ; \ 49 | rm -f $$filename ; \ 50 | fi ; \ 51 | done ; 52 | @echo "Done uninstalling database files from $(WN_INSTALLDIR)" 53 | 54 | clean: 55 | @rm -f *~ "#"* 56 | 57 | cleandbfiles: 58 | @echo "Removing WordNet 1.6 database files from `pwd`" 59 | @for file in $(WN_FILES) ; \ 60 | do \ 61 | if [ ! 
-f $$file ] ; then \ 62 | echo "Cannot remove $$file" ; \ 63 | else \ 64 | echo "Removing $$file" ; \ 65 | rm -f $$file ; \ 66 | fi ; \ 67 | done ; 68 | @echo "Done removing WordNet 1.6 database files" 69 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/adv.exc: -------------------------------------------------------------------------------- 1 | best well 2 | better well 3 | deeper deeply 4 | farther far 5 | further far 6 | harder hard 7 | hardest hard 8 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/dict/lexnames: -------------------------------------------------------------------------------- 1 | 00 adj.all 3 2 | 01 adj.pert 3 3 | 02 adv.all 4 4 | 03 noun.Tops 1 5 | 04 noun.act 1 6 | 05 noun.animal 1 7 | 06 noun.artifact 1 8 | 07 noun.attribute 1 9 | 08 noun.body 1 10 | 09 noun.cognition 1 11 | 10 noun.communication 1 12 | 11 noun.event 1 13 | 12 noun.feeling 1 14 | 13 noun.food 1 15 | 14 noun.group 1 16 | 15 noun.location 1 17 | 16 noun.motive 1 18 | 17 noun.object 1 19 | 18 noun.person 1 20 | 19 noun.phenomenon 1 21 | 20 noun.plant 1 22 | 21 noun.possession 1 23 | 22 noun.process 1 24 | 23 noun.quantity 1 25 | 24 noun.relation 1 26 | 25 noun.shape 1 27 | 26 noun.state 1 28 | 27 noun.substance 1 29 | 28 noun.time 1 30 | 29 verb.body 2 31 | 30 verb.change 2 32 | 31 verb.cognition 2 33 | 32 verb.communication 2 34 | 33 verb.competition 2 35 | 34 verb.consumption 2 36 | 35 verb.contact 2 37 | 36 verb.creation 2 38 | 37 verb.emotion 2 39 | 38 verb.motion 2 40 | 39 verb.perception 2 41 | 40 verb.possession 2 42 | 41 verb.social 2 43 | 42 verb.stative 2 44 | 43 verb.weather 2 45 | 44 adj.ppl 3 46 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/binsrch.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | BINSRCH(3WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | bin_search, copyfile, replace_line, insert_line 13 |

SYNOPSIS

14 |

15 | char *bin_search(char 16 | *key, FILE *fp);

17 | void copyfile(FILE *fromfp, FILE *tofp);

18 | char *replace_line(char 19 | *new_line, char *key, FILE *fp);

20 | 21 |

DESCRIPTION

22 |

23 | The WordNet library contains 24 | several general purpose functions for performing a binary search and modifying 25 | sorted files.

26 | bin_search() is the primary binary search algorithm to search 27 | for key as the first item on a line in the file pointed to by fp . The 28 | delimiter between the key and the rest of the fields on the line, if any, 29 | must be a space. A pointer to a static variable containing the entire 30 | line is returned. NULL 31 | is returned if a match is not found.

32 | The remaining 33 | functions are not used by WordNet, and are only briefly described.

34 | copyfile() 35 | copies the contents of one file to another.

36 | replace_line() replaces a line 37 | in a file having searchkey key with the contents of new_line . It returns 38 | the original line or NULL 39 | in case of error.

40 | insert_line() finds the proper 41 | place to insert the contents of new_line , having searchkey key in the 42 | sorted file pointed to by fp . It returns NULL 43 | if a line with this searchkey 44 | is already in the file. 45 |

NOTES

46 | The maximum length of key is 1024.

47 | The 48 | maximum line length in a file is 25K for Unix platforms, and 8K for the 49 | PC and Macintosh platforms.

50 | If there are no additional fields after the 51 | search key, the key must be followed by at least one space before the 52 | newline character. 53 |

SEE ALSO

54 | morph(3WN) 55 | , wnsearch(3WN) 56 | , wnutil(3WN) 57 | , wnintro(5WN) 58 | . 59 | 60 |

WARNINGS

61 | bin_search() returns a pointer to a static character buffer. 62 | The returned string should be copied by the caller if the results need 63 | to be saved, as a subsequent call will replace the contents of the static 64 | buffer.

65 |

66 | 67 |


68 | Table of Contents

69 |

77 | 78 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/morph.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | MORPH(3WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | morphinit, re_morphinit, morphstr, morphword 13 |

SYNOPSIS

14 |

15 | #include 16 | "wn.h"

17 | int morphinit(void);

18 | int re_morphinit(void);

19 | char *morphstr(char 20 | *origstr, int pos);

21 | char *morphword(char *word, int pos); 22 |

DESCRIPTION 23 |

24 |

25 | The WordNet morphological processor, Morphy, is accessed through these 26 | functions:

27 | morphinit() is used to open the exception list files. It returns 28 | 0 if successful, -1 otherwise. The exception list files must be opened 29 | before morphstr() or morphword() are called.

30 | re_morphinit() is used to 31 | close the exception list files and reopen them, and is used exclusively 32 | for WordNet development. Return codes are as described above.

33 | morphstr() 34 | is the basic user interface to Morphy. It tries to find the base form 35 | (lemma) of the word or collocation origstr in the specified pos . The 36 | first call (with origstr specified) returns a pointer to the first base 37 | form found. Subsequent calls requesting base forms of the same string 38 | must be made with the first argument of NULL. 39 | When no more base forms 40 | for origstr can be found, NULL 41 | is returned.

42 | morphword() tries to find 43 | the base form of word in the specified pos . This function is called by 44 | morphstr() for each individual word in a collocation. 45 |

NOTES

46 | morphinit() 47 | is called by wninit() and is not intended to be called directly by an 48 | application. Applications wishing to use WordNet and/or the morphological 49 | functions must call wninit() at the start of the program. See wnutil(3WN) 50 | 51 | for more information.

52 | origstr may be either a word or a collocation formed 53 | by joining individual words with underscore characters (_ ).

54 | Usually only 55 | morphstr() is called from applications, as it works on both words and 56 | collocations.

57 | pos must be one of the following:

58 |

1    NOUN
59 | 2    VERB
60 | 3    ADJECTIVE 61 |
62 | 4    ADVERB
63 | 5    ADJECTIVE SATELLITE
64 |
65 |

66 | If ADJECTIVE SATELLITE 67 | is passed, it is treated 68 | by morphstr() as ADJECTIVE. 69 | 70 |

SEE ALSO

71 | wnsearch(3WN) 72 | , wndb(5WN) 73 | , morphy(7WN) 74 | . 75 |

76 | 77 |

WARNINGS

78 | Passing an invalid part of speech will result in a core dump. 79 |

80 | The WordNet database files must be open to use morphstr() or morphword(). 81 |

82 | 83 |


84 | Table of Contents

85 |

93 | 94 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/uniqbeg.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | UNIQBEG(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | uniqbeg - unique beginners for noun hierarchies 13 |

DESCRIPTION

14 | All 15 | of the WordNet noun synsets are organized into hierarchies. Each synset 16 | is part of at least one hierarchy, headed by a synset called a unique 17 | beginner. All of these synsets originate in the lexicographer file noun.Tops 18 | . From any noun synset, except a unique beginner, the hypernym pointers 19 | can be traced up to one of the following synsets:

20 |

{ entity, something, 21 | (anything having existence (living or nonliving)) }
22 |

23 | { psychological_feature, 24 | (a feature of the mental life of a living organism) }
25 |

26 | { abstraction, 27 | (a general concept formed by extracting common features from specific 28 | examples) }
29 |

30 | { state, (the way something is with respect to its main 31 | attributes; "the current state of knowledge";
32 |    "his state of health"; 33 | "in a weak financial state") }
34 |

35 | { event, (something that happens at a 36 | given place and time) }
37 |

38 | { act, human_action, human_activity, (something 39 | that people do or cause to happen) }
40 |

41 | { group, grouping, (any number 42 | of entities (members) considered as a unit) }
43 |

44 | { possession, (anything 45 | owned or possessed) }
46 |

47 | { phenomenon, (any state or process known through 48 | the senses rather than by intuition or reasoning) }
49 |

50 | 51 |

NOTES

52 | The lexicographer 53 | files are not included in the WordNet package. 54 |

FILES

55 | 56 |
57 | 58 |
noun.Tops
59 |
unique 60 | beginners for nouns
61 |
62 | 63 |

SEE ALSO

64 | wndb(5WN) 65 | , wninput(5WN) 66 | , wngloss(7WN) 67 | .

68 | 69 |


70 | Table of Contents

71 |

78 | 79 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro1.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(1WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - WordNet user commands 13 |

SYNOPSIS

14 |

15 | escort - search semantic 16 | concordances for sentences containing semantic tags

17 | wn - command line 18 | interface to WordNet database

19 | wnb - window based WordNet browser 20 |

DESCRIPTION 21 |

22 | This section of the WordNet Reference Manual contains manual pages that 23 | describe commands available with the various WordNet system packages.

24 | 25 | The WordNet interfaces wn(1WN) 26 | and wnb(1WN) 27 | allow the user to search 28 | the WordNet database and display the information textually. escort(1WN) 29 | 30 | is a window based browser for searching the semantic concordances. 31 |

ENVIRONMENT 32 | VARIABLES

33 | 34 |
35 | 36 |
WNHOME
37 |
Base directory for WordNet. Unix default is /usr/local/wordnet1.6 38 | , PC default is C:\wn16 , Macintosh default is : .
39 | 40 |
WNSEARCHDIR
41 |
Directory 42 | in which the WordNet database has been installed. Unix default is WNHOME/dict 43 | , PC default is WNHOME\dict , Macintosh default is :Database .
44 | 45 |
WNDBVERSION 46 |
47 |
Indicates which format the WordNet database files in WNSEARCHDIR are 48 | in. The default is 1.6 . Setting WNDBVERSION to 1.5 allows the 1.6 commands 49 | to work with the 1.5 database files.
50 |
51 | 52 |

SEE ALSO

53 | wnintro(3WN) 54 | , wnintro(5WN) 55 | , 56 | wnintro(7WN) 57 | .

58 | Miller, G. A. (1990), ed. "WordNet: An On-Line Lexical Database" 59 | . International Journal of Lexicography, Volume 3, Number 4.

60 | Miller, G. 61 | A., et al. (1990, 1993). "Five Papers on WordNet" . Cognitive Science Laboratory 62 | Report 43. (Available from ftp://ftp.cogsci.princeton.edu/wordnet/ .)

63 | Fellbaum, 64 | C. (1998), ed. "WordNet: An Electronic Lexical Database" . MIT Press, Cambridge, 65 | MA. 66 |

AVAILABILITY

67 | WordNet has a World Wide Web site at http://www.cogsci.princeton.edu/~wn/ 68 | 69 | . From this web site users can learn about the WordNet project, run several 70 | different interfaces to the WordNet database, and download various WordNet 71 | system packages and "Five Papers on WordNet" .

72 | 73 |


74 | Table of Contents

75 |

83 | 84 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro5.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(5WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - introduction to descriptions of WordNet file formats 13 | 14 |

SYNOPSIS

15 |

16 | cntlist - format of cntlist file

17 | cxtfile - format of semantically 18 | tagged file

19 | glossidx - index of words found in synset glosses

20 | lexnames 21 | - list of lexicographer file names and numbers

22 | prologdb - description 23 | of Prolog database files

24 | senseidx - format of sense index file

25 | sensemap 26 | - mapping from senses in WordNet 1.5 to corresponding 1.6 senses

27 | taglist 28 | - format of taglist file

29 | wndb - format of WordNet database files

30 | wninput 31 | - format of WordNet lexicographer files 32 |

DESCRIPTION

33 | This section of the 34 | WordNet Reference Manual contains manual pages that describe the formats 35 | of the various files included in different WordNet 1.6 packages. 36 |

NOMENCLATURE 37 |

38 | All files are in ASCII. Fields are generally separated by one space, unless 39 | otherwise noted, and each line is terminated with a newline character. 40 | In the file format descriptions, terms in italics refer to field names. 41 | Characters or strings in boldface represent an actual character or string 42 | as it appears in the file. Items enclosed in italicized square brackets 43 | ([ ] ) may not be present. Since several files contain fields that have 44 | the identical meaning, field names are consistently defined. For example, 45 | several WordNet files contain one or more synset_offset fields. In each 46 | case, the definition of synset_offset is identical. 47 |

SEE ALSO

48 | wnintro(1WN) 49 | , 50 | wnintro(3WN) 51 | , wnintro(7WN) 52 | , wngloss(7WN) 53 | .

54 | Miller, G. A. (1990), ed. "WordNet: 55 | An On-Line Lexical Database" . International Journal of Lexicography, Volume 56 | 3, Number 4.

57 | Miller, G. A., et al. (1990, 1993). "Five Papers on WordNet" 58 | . Cognitive Science Laboratory Report 43. (Available from ftp://ftp.cogsci.princeton.edu/wordnet/ 59 | .)

60 | Fellbaum, C. (1998), ed. "WordNet: An Electronic Lexical Database" . MIT 61 | Press, Cambridge, MA.

62 |

63 | 64 |


65 | Table of Contents

66 |

73 | 74 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnintro7.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNINTRO(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnintro - introduction to miscellaneous WordNet information 13 |

SYNOPSIS 14 |

15 |

16 | morphy - discussion of WordNet's morphological processing

17 | semcor - discussion 18 | of semantic concordances

19 | uniqbeg - unique beginners for noun hierarchies 20 |

21 | wngloss - glossary of terms used in WordNet

22 | wngroups - discussion of WordNet 23 | search code to group similar senses

24 | wnlicens - text of WordNet license 25 | agreement

26 | wnpkgs - information about WordNet packages and distribution 27 |

28 | wnstats - database statistics 29 |

DESCRIPTION

30 | This section of the WordNet 31 | Reference Manual contains manual pages that describe various topics related 32 | to WordNet and the semantic concordances, and a glossary of terms. 33 |

SEE 34 | ALSO

35 | wnintro(1WN) 36 | , wnintro(3WN) 37 | , wnintro(5WN) 38 | , wngloss(7WN) 39 | .

40 | Miller, G. 41 | A. (1990), ed. "WordNet: An On-Line Lexical Database" . International Journal 42 | of Lexicography, Volume 3, Number 4.

43 | Miller, G. A., et al. (1990, 1993). 44 | "Five Papers on WordNet" . Cognitive Science Laboratory Report 43. (Available 45 | from ftp://ftp.cogsci.princeton.edu/wordnet/ .)

46 | Fellbaum, C. (1998), ed. "WordNet: 47 | An Electronic Lexical Database" . MIT Press, Cambridge, MA.

48 | 49 |


50 | Table of Contents

51 |

57 | 58 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnlicens.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNLICENS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnlicens - text of WordNet license 13 |

DESCRIPTION

14 | WordNet Release 1.6 15 |

16 | This software and database is being provided to you, the LICENSEE, by 17 | Princeton University under the following license. By obtaining, using 18 | and/or copying this software and database, you agree that you have 19 | read, understood, and will comply with these terms and conditions.: 20 | Permission to use, copy, modify and distribute this software and 21 | database and its documentation for any purpose and without fee or royalty 22 | is hereby granted, provided that you agree to comply with the following 23 | copyright notice and statements, including the disclaimer, and that 24 | the same appear on ALL copies of the software, database and documentation, 25 | including modifications that you make for internal use or for distribution. 26 | WordNet 1.6 Copyright 1997 by Princeton University. All rights reserved. 27 | THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY 28 | MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF 29 | EXAMPLE, BUT NOT LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS 30 | OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE 31 | OR THAT THE USE OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION 32 | WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR 33 | OTHER RIGHTS. The name of Princeton University or Princeton may 34 | not be used in advertising or publicity pertaining to distribution of 35 | the software and/or database. Title to copyright in this software, database 36 | and any associated documentation shall at all times remain with Princeton 37 | University and LICENSEE agrees to preserve same.

38 | 39 |


40 | Table of Contents

41 |

45 | 46 | -------------------------------------------------------------------------------- /data/affect_wordnet/wordnet-1.6/man/html/wnstats.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | WNSTATS(7WN) manual page 7 | 8 | 9 | Table of Contents

10 | 11 |

NAME

12 | wnstats - WordNet 1.6 database statistics 13 |

DESCRIPTION

14 | 15 |

Number of 16 | words, synsets, and senses

17 |
18 | 19 | 20 | 21 | 23 | 24 | 25 | 26 |
POS Unique Strings Synsets Total Senses
Noun 94474 66025 116317
Verb 10319 12127 22066
Adjective 22 | 20170 17915 29881
Adverb 4546 3575 5677
Totals 121962 99642 173941
27 |

28 | 29 |

Polysemy information

30 |

31 |

32 | 33 | 35 | 36 | 37 | 38 | 39 | 40 |
POS Monosemous Words
and Senses
Polysemous Words Polysemous Senses
Noun 81909 12564 34 | 34408
Verb 5751 4567 16315
Adjective 14795 5374 15086
Adverb 3795 750 1882
Totals 106250 23255 67691
41 |

42 |

43 | 46 | 47 | 48 | 49 | 50 | 51 |
POS Average 44 | Polysemy
Including Monosemous Words
Average 45 | Polysemy
Excluding Monosemous Words
Noun 1.23 2.73
Verb 2.13 3.57
Adjective 1.48 2.80
Adverb 1.24 2.50
52 | 53 |

NOTES

54 | Statistics for all types of adjectives 55 | and adjective satellites are combined.

56 | The total of all unique noun, 57 | verb, adjective, and adverb strings is greater than 121962. However, many 58 | strings are unique within a syntactic category, but are in more than one 59 | syntactic category. The figure in the table represents the unique strings 60 | when all syntactic categories are combined.

61 |

62 | 63 |


64 | Table of Contents

65 |

74 | 75 | -------------------------------------------------------------------------------- /data/federalist.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/federalist.rda -------------------------------------------------------------------------------- /data/federalist/.gitignore: -------------------------------------------------------------------------------- 1 | /federalist.json 2 | /pg18.txt 3 | -------------------------------------------------------------------------------- /data/federalist/01_download_raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FILE="pg18.txt" 6 | FILE_MD5="126eca27b879c078a189ba0783186330" 7 | 8 | if [ ! -f $FILE ]; 9 | then 10 | echo "Downloading raw data file '$FILE' from gutenberg.org" 11 | curl 'http://www.gutenberg.org/cache/epub/18/pg18.txt' -o ${FILE}.download 12 | mv ${FILE}.download ${FILE} 13 | fi 14 | 15 | echo "Checking raw data file '${FILE}'" 16 | md5sum -c - <<< "${FILE_MD5} ${FILE}" 17 | -------------------------------------------------------------------------------- /data/federalist/03_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("corpus") 4 | 5 | raw <- read_ndjson("federalist.json", text = NULL, stringsAsFactors = FALSE) 6 | 7 | name <- paste("Federalist No.", raw$paper_id) 8 | 9 | venue <- raw$venue 10 | venue[venue == "For the Independent Fournal"] <- "For the Independent Journal" 11 | venue[grep("^From M[cC]", venue)] <- "From McLean's Edition, New York" 12 | 13 | author <- raw$author 14 | author[author == "HAMILTON"] <- "Hamilton" 15 | author[author == "HAMILTON AND MADISON"] <- NA 16 | author[author == "HAMILTON OR MADISON"] <- NA 17 | author[author == "JAY"] <- "Jay" 18 | author[author == "MADISON"] <- "Madison" 19 | author[raw$paper_id == 58] <- NA # follow Mosteller and Wallace 20 | 21 | date <- raw$date 22 | date <- sub("^(Tuesday|Thursday|Friday),? ", "", date) 23 | 24 | invisible(Sys.setlocale("LC_TIME", "C")) 25 | date <- as.Date(date, "%B %d, %Y") 26 | 27 | federalist <- data.frame(name, 28 | title = raw$title, 29 | venue, 30 | date, 31 | author, 32 | text = raw$text, 33 | stringsAsFactors = FALSE) 34 | class(federalist) <- c("corpus_frame", "data.frame") 35 | 36 | save(federalist, file = "../federalist.rda") 37 | tools::resaveRdaFiles("../federalist.rda") 38 | -------------------------------------------------------------------------------- /data/sentiment_afinn.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/sentiment_afinn.rda -------------------------------------------------------------------------------- /data/sentiment_afinn/01_download_raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | FILE="imm6010.zip" 6 | FILE_MD5="ea6216f43d27188ea2b5bfadf068ff37" 7 | 8 | if [ ! -f $FILE ]; 9 | then 10 | echo "Downloading raw data file '$FILE' from www2.imm.dtu.dk" 11 | curl 'http://www2.imm.dtu.dk/pubdb/views/edoc_download.php/6010/zip/imm6010.zip' -o ${FILE}.download 12 | mv ${FILE}.download ${FILE} 13 | fi 14 | 15 | if [ ! 
-d AFINN ]; 16 | then 17 | unzip ${FILE} 18 | fi 19 | -------------------------------------------------------------------------------- /data/sentiment_afinn/02_make_rda.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | raw <- read.delim(file.path("AFINN", "AFINN-111.txt"), 4 | encoding = "UTF-8", header = FALSE, stringsAsFactors = FALSE) 5 | names(raw) <- c("term", "score") 6 | raw$term <- trimws(raw$term) 7 | 8 | # exclude multi-type terms 9 | multi <- grepl("[[:space:]]", raw$term) 10 | raw <- raw[!multi,] 11 | 12 | # discard row names 13 | rownames(raw) <- NULL 14 | 15 | sentiment_afinn <- raw 16 | class(sentiment_afinn) <- c("corpus_frame", "data.frame") 17 | 18 | save(sentiment_afinn, file = "../sentiment_afinn.rda") 19 | tools::resaveRdaFiles("../sentiment_afinn.rda") 20 | -------------------------------------------------------------------------------- /data/sentiment_afinn/AFINN/AFINN-README.txt: -------------------------------------------------------------------------------- 1 | AFINN is a list of English words rated for valence with an integer 2 | between minus five (negative) and plus five (positive). The words have 3 | been manually labeled by Finn Årup Nielsen in 2009-2011. The file 4 | is tab-separated. There are two versions: 5 | 6 | AFINN-111: Newest version with 2477 words and phrases. 7 | 8 | AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there 9 | are 1480 lines, as some words are listed twice. The word list is not 10 | entirely in alphabetical order. 11 | 12 | An evaluation of the word list is available in: 13 | 14 | Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for 15 | sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903 16 | 17 | The list was used in: 18 | 19 | Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni, 20 | Michael Etter, "Good Friends, Bad News - Affect and Virality in 21 | Twitter", The 2011 International Workshop on Social Computing, 22 | Network, and Services (SocialComNet 2011). 23 | 24 | 25 | This database of words is copyright protected and distributed under 26 | "Open Database License (ODbL) v1.0" 27 | http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar 28 | copyleft license.
29 | 30 | See comments on the word list here: 31 | http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis 32 | 33 | 34 | In Python the file may be read into a dictionary with: 35 | 36 | >>> afinn = dict(map(lambda (k,v): (k,int(v)), 37 | [ line.split('\t') for line in open("AFINN-111.txt") ])) 38 | >>> afinn["Good".lower()] 39 | 3 40 | >>> sum(map(lambda word: afinn.get(word, 0), "Rainy day but still in a good mood".lower().split())) 41 | 2 42 | 43 | 44 | -------------------------------------------------------------------------------- /data/stopwords.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/data/stopwords.rda -------------------------------------------------------------------------------- /data/stopwords/01_make_stopwords.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | kinds <- c(da = "danish", 4 | de = "german", 5 | en = "english", 6 | es = "spanish", 7 | fi = "finnish", 8 | fr = "french", 9 | hu = "hungarian", 10 | it = "italian", 11 | nl = "dutch", 12 | no = "norwegian", 13 | pt = "portuguese", 14 | ru = "russian", 15 | sv = "swedish") 16 | 17 | for (lang in names(kinds)) { 18 | words <- suppressWarnings(corpus:::stopwords(kinds[[lang]])) 19 | words <- stringr::str_sort(words, locale = lang) 20 | assign(paste0("stopwords_", lang), words) 21 | } 22 | 23 | filename <- file.path("..", paste0("stopwords.rda")) 24 | save(list = paste0("stopwords_", names(kinds)), file = filename) 25 | tools::resaveRdaFiles(filename) 26 | 27 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | corpustext.com -------------------------------------------------------------------------------- /docs/articles/chinese-wordcloud-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/chinese-wordcloud-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-emotion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-emotion-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-heapslaw-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-heapslaw-1.png -------------------------------------------------------------------------------- /docs/articles/corpus-witch-occurrences-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/corpus-witch-occurrences-1.png -------------------------------------------------------------------------------- /docs/articles/gender-estimates-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates-1.png 
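The AFINN-README.txt above scores a sentence against the word list in Python 2. The same computation can be sketched in R using this package's own sentiment_afinn data frame (built by 02_make_rda.R above) and the text_tokens() tokenizer; this is a minimal sketch, not part of the package sources, and assumes the corpus package is installed:

    library("corpus")
    data("sentiment_afinn")

    # named numeric vector mapping each term to its valence score
    afinn <- setNames(sentiment_afinn$score, sentiment_afinn$term)

    # tokenize the sentence (text_tokens lowercases by default) and
    # sum the scores of the tokens that appear in the lexicon
    toks <- text_tokens("Rainy day but still in a good mood")[[1]]
    sum(afinn[toks], na.rm = TRUE)

Tokens absent from the lexicon index as NA, so na.rm = TRUE treats them as zero, mirroring afinn.get(word, 0) in the Python snippet.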
-------------------------------------------------------------------------------- /docs/articles/gender-estimates_se-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-estimates_se-1.png -------------------------------------------------------------------------------- /docs/articles/gender-signif-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/articles/gender-signif-1.png -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/favicon.ico -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | http://leafo.net 3 | */ 4 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 5 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
"))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 6 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 8 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 9 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/logo.png -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticker footer */ 2 | body > .container { 3 | display: flex; 4 | padding-top: 60px; 5 | min-height: calc(100vh); 6 | flex-direction: column; 7 | } 8 | 9 | body > .container .row { 10 | flex: 1; 11 | } 12 | 13 | footer { 14 | margin-top: 45px; 15 | padding: 35px 0 36px; 16 | border-top: 1px solid #e5e5e5; 17 | color: #666; 18 | display: flex; 19 | } 20 | footer p { 21 | margin-bottom: 0; 22 | } 23 | footer div { 24 | flex: 1; 25 | } 26 | footer .pkgdown { 27 | text-align: right; 28 | } 29 | footer p { 30 | margin-bottom: 0; 31 | } 32 | 33 | img.icon { 34 | float: right; 35 | } 36 | 37 | img { 38 | max-width: 100%; 39 | } 40 | 41 | /* Section anchors ---------------------------------*/ 42 | 43 | a.anchor { 44 | margin-left: -30px; 45 | display:inline-block; 46 | width: 30px; 47 | height: 30px; 48 | visibility: hidden; 49 | 50 | background-image: url(./link.svg); 51 | background-repeat: no-repeat; 52 | background-size: 20px 20px; 53 | background-position: center center; 54 | } 55 | 56 | .hasAnchor:hover a.anchor { 57 | visibility: visible; 58 | } 59 | 60 | @media (max-width: 767px) { 61 | .hasAnchor:hover a.anchor { 62 | visibility: 
hidden; 63 | } 64 | } 65 | 66 | 67 | /* Fixes for fixed navbar --------------------------*/ 68 | 69 | .contents h1, .contents h2, .contents h3, .contents h4 { 70 | padding-top: 60px; 71 | margin-top: -60px; 72 | } 73 | 74 | /* Static header placement on mobile devices */ 75 | @media (max-width: 767px) { 76 | .navbar-fixed-top { 77 | position: absolute; 78 | } 79 | .navbar { 80 | padding: 0; 81 | } 82 | } 83 | 84 | 85 | /* Sidebar --------------------------*/ 86 | 87 | #sidebar { 88 | margin-top: 30px; 89 | } 90 | #sidebar h2 { 91 | font-size: 1.5em; 92 | margin-top: 1em; 93 | } 94 | 95 | #sidebar h2:first-child { 96 | margin-top: 0; 97 | } 98 | 99 | #sidebar .list-unstyled li { 100 | margin-bottom: 0.5em; 101 | } 102 | 103 | /* Reference index & topics ----------------------------------------------- */ 104 | 105 | .ref-index th {font-weight: normal;} 106 | .ref-index h2 {font-size: 20px;} 107 | 108 | .ref-index td {vertical-align: top;} 109 | .ref-index .alias {width: 40%;} 110 | .ref-index .title {width: 60%;} 111 | 112 | .ref-index .alias {width: 40%;} 113 | .ref-index .title {width: 60%;} 114 | 115 | .ref-arguments th {text-align: right; padding-right: 10px;} 116 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 117 | .ref-arguments .name {width: 20%;} 118 | .ref-arguments .desc {width: 80%;} 119 | 120 | /* Nice scrolling for wide elements --------------------------------------- */ 121 | 122 | table { 123 | display: block; 124 | overflow: auto; 125 | } 126 | 127 | /* Syntax highlighting ---------------------------------------------------- */ 128 | 129 | pre { 130 | word-wrap: normal; 131 | word-break: normal; 132 | border: 1px solid #eee; 133 | } 134 | 135 | pre, code { 136 | background-color: #f8f8f8; 137 | color: #333; 138 | } 139 | 140 | pre .img { 141 | margin: 5px 0; 142 | } 143 | 144 | pre .img img { 145 | background-color: #fff; 146 | display: block; 147 | height: auto; 148 | } 149 | 150 | code a, pre a { 151 | color: #375f84; 152 | } 153 | 154 | .fl {color: #1514b5;} 155 | .fu {color: #000000;} /* function */ 156 | .ch,.st {color: #036a07;} /* string */ 157 | .kw {color: #264D66;} /* keyword */ 158 | .co {color: #888888;} /* comment */ 159 | 160 | .message { color: black; font-weight: bolder;} 161 | .error { color: orange; font-weight: bolder;} 162 | .warning { color: #6A0366; font-weight: bolder;} 163 | 164 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | $("#sidebar").stick_in_parent({offset_top: 40}); 3 | $('body').scrollspy({ 4 | target: '#sidebar', 5 | offset: 60 6 | }); 7 | 8 | var cur_path = paths(location.pathname); 9 | $("#navbar ul li a").each(function(index, value) { 10 | if (value.text == "Home") 11 | return; 12 | if (value.getAttribute("href") === "#") 13 | return; 14 | 15 | var path = paths(value.pathname); 16 | if (is_prefix(cur_path, path)) { 17 | // Add class to parent
<li>, and enclosing <li> if in dropdown 18 | var menu_anchor = $(value); 19 | menu_anchor.parent().addClass("active"); 20 | menu_anchor.closest("li.dropdown").addClass("active"); 21 | } 22 | }); 23 | }); 24 | 25 | function paths(pathname) { 26 | var pieces = pathname.split("/"); 27 | pieces.shift(); // always starts with / 28 | 29 | var end = pieces[pieces.length - 1]; 30 | if (end === "index.html" || end === "") 31 | pieces.pop(); 32 | return(pieces); 33 | } 34 | 35 | function is_prefix(needle, haystack) { 36 | if (needle.length > haystack.length) 37 | return(false); 38 | 39 | for (var i = 0; i < needle.length; i++) { 40 | if (needle[i] != haystack[i]) 41 | return(false); 42 | } 43 | 44 | return(true); 45 | } 46 | -------------------------------------------------------------------------------- /docs/reference/figures/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/banner.png -------------------------------------------------------------------------------- /docs/reference/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo.png -------------------------------------------------------------------------------- /docs/reference/figures/logo/01_make_logo.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("magick") 4 | 5 | slide <- image_read("logo-slide.tiff") 6 | banner <- image_crop(slide, "370x80+2+2") 7 | image_write(banner, "../banner.png", format = "png") 8 | 9 | ccap <- image_read("c-07.jpg") 10 | logo <- image_scale(ccap, "80x80") 11 | image_write(logo, "../logo.png", format = "png") 12 | -------------------------------------------------------------------------------- /docs/reference/figures/logo/README: -------------------------------------------------------------------------------- 1 | 2 | The logo was created in Mac OS Pages, then exported to TIFF. The script 3 | `01_make_logo.R` crops the TIFF and converts to PNG. 4 | 5 | -- 6 | 7 | C drop capital is public domain, from http://www.reusableart.com/c-07.html 8 | 9 | "This and numerous other stock print foundry images were included in the book 10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop & 11 | Company." 12 | 13 | 14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt). 
15 | -------------------------------------------------------------------------------- /docs/reference/figures/logo/c-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/c-07.jpg -------------------------------------------------------------------------------- /docs/reference/figures/logo/logo-slide.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo-slide.tiff -------------------------------------------------------------------------------- /docs/reference/figures/logo/logo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/docs/reference/figures/logo/logo.key -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Aarup 2 | AFINN 3 | Årup 4 | Baidu 5 | Baum 6 | bigram 7 | bigrams 8 | blog 9 | CEUR 10 | deserialization 11 | deserialize 12 | deserialized 13 | dorothy 14 | doesn 15 | emoji 16 | Emoji 17 | ESWC 18 | Haiyan 19 | ignorables 20 | indices 21 | JSON 22 | kana 23 | knitr 24 | LF 25 | microblogs 26 | Microposts 27 | Mosteller 28 | NDJSON 29 | NFKC 30 | NFKD 31 | ODbL 32 | PDFs 33 | Publius 34 | Quanteda 35 | rmarkdown 36 | RTF 37 | Silge 38 | Spolsky 39 | stopt 40 | stopwords 41 | STSong 42 | tokenization 43 | Tokenization 44 | tokenize 45 | Tokenize 46 | tokenizer 47 | tokenizes 48 | tokenizing 49 | toto 50 | UAX 51 | unigrams 52 | Unported 53 | URLs 54 | Valitutti 55 | VignetteEncoding 56 | VignetteEngine 57 | VignetteIndexEntry 58 | wordcloud 59 | WordNet 60 | xa 61 | xff 62 | xffff 63 | york 64 | -------------------------------------------------------------------------------- /man/abbreviations.Rd: -------------------------------------------------------------------------------- 1 | \name{abbreviations} 2 | \docType{data} 3 | \alias{abbreviations} 4 | \alias{abbreviations_de} 5 | \alias{abbreviations_en} 6 | \alias{abbreviations_es} 7 | \alias{abbreviations_fr} 8 | \alias{abbreviations_it} 9 | \alias{abbreviations_pt} 10 | \alias{abbreviations_ru} 11 | \title{Abbreviations} 12 | \description{ 13 | Lists of common abbreviations. 14 | } 15 | \details{ 16 | The \code{abbreviations_} objects are character vectors of abbreviations. 17 | These are words or phrases containing full stops (periods, ambiguous sentence 18 | terminators) that require special handling for sentence detection and 19 | tokenization. 20 | 21 | The original lists were compiled by the 22 | \href{http://cldr.unicode.org/}{Unicode Common Locale Data Repository}. We 23 | have tailored the English list by adding single-letter abbreviations and 24 | making a few other additions. 25 | 26 | The built-in abbreviation lists are reasonable defaults, but they may require 27 | further tailoring to suit your particular task. 28 | } 29 | \usage{ 30 | abbreviations_de 31 | abbreviations_en 32 | abbreviations_es 33 | abbreviations_fr 34 | abbreviations_it 35 | abbreviations_pt 36 | abbreviations_ru 37 | } 38 | \format{A character vector of unique abbreviations.} 39 | \seealso{ 40 | \code{\link{text_filter}}. 
41 | } 42 | \keyword{datasets} 43 | -------------------------------------------------------------------------------- /man/affect_wordnet.Rd: -------------------------------------------------------------------------------- 1 | \name{affect_wordnet} 2 | \docType{data} 3 | \alias{affect_wordnet} 4 | \title{WordNet-Affect Lexicon} 5 | \description{ 6 | The WordNet-Affect Lexicon is a hand-curated collection of 7 | emotion-related words (nouns, verbs, adjectives, and adverbs), 8 | classified as \dQuote{Positive}, \dQuote{Negative}, 9 | \dQuote{Neutral}, or \dQuote{Ambiguous} and categorized into 10 | 28 subcategories (\dQuote{Joy}, \dQuote{Love}, \dQuote{Fear}, 11 | etc.). 12 | 13 | Terms can and do appear in multiple categories. 14 | 15 | The original lexicon contains multi-word phrases, but they 16 | are excluded here. Also, we removed the term \sQuote{thing} 17 | from the lexicon. 18 | 19 | The original WordNet-Affect lexicon is distributed as part 20 | of the WordNet Domains project, which is licensed under a 21 | \href{https://creativecommons.org/licenses/by/3.0/}{Creative Commons Attribution 3.0 Unported License}. 22 | You are free to share and adapt the lexicon, as long as you 23 | give attribution to the original authors. 24 | } 25 | \usage{affect_wordnet} 26 | \format{A data frame with one row for each term classification.} 27 | \source{\url{http://wndomains.fbk.eu/wnaffect.html}} 28 | \references{ 29 | Strapparava, C. and Valitutti, A. (2004). 30 | WordNet-Affect: an affective extension of WordNet. 31 | \emph{Proceedings of the 4th International Conference on Language 32 | Resources and Evaluation} 33 | 1083--1086. 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /man/corpus-deprecated.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus-deprecated} 2 | \alias{corpus-deprecated} 3 | \title{Deprecated Functions in Package \pkg{corpus}} 4 | \description{ 5 | These functions are provided for compatibility with older versions of 6 | \pkg{corpus} only, and may be defunct as soon as the next release. 7 | } 8 | %\usage{ 9 | %} 10 | %\arguments{ 11 | %} 12 | %\details{ 13 | %} 14 | \seealso{ 15 | \code{\link{Deprecated}} 16 | } 17 | \keyword{internal} 18 | \keyword{misc} 19 | -------------------------------------------------------------------------------- /man/corpus-package.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus-package} 2 | \alias{corpus-package} 3 | \alias{corpus} 4 | \docType{package} 5 | \title{ 6 | The Corpus Package 7 | } 8 | \description{ 9 | Text corpus analysis functions 10 | } 11 | \details{ 12 | This package contains functions for text corpus analysis. To create a text 13 | object, use the \code{\link{read_ndjson}} or \code{\link{as_corpus_text}} 14 | function. 15 | To split text into sentences or token blocks, use \code{\link{text_split}}. 16 | To specify preprocessing behavior for transforming a text into a 17 | token sequence, use \code{\link{text_filter}}. To tokenize text 18 | or compute term frequencies, use \code{\link{text_tokens}}, 19 | \code{\link{term_stats}} or \code{\link{term_matrix}}. 20 | To search for or count specific terms, 21 | use \code{\link{text_locate}}, \code{\link{text_count}}, or 22 | \code{\link{text_detect}}. 23 | 24 | For a complete list of functions, use \code{library(help = "corpus")}. 25 | } 26 | \author{ 27 | Patrick O. 
Perry 28 | } 29 | \keyword{ package } 30 | -------------------------------------------------------------------------------- /man/corpus_frame.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus_frame} 2 | \alias{as_corpus_frame} 3 | \alias{as_corpus_frame.character} 4 | \alias{as_corpus_frame.Corpus} 5 | \alias{as_corpus_frame.corpus} 6 | \alias{as_corpus_frame.corpus_json} 7 | \alias{as_corpus_frame.corpus_text} 8 | \alias{as_corpus_frame.data.frame} 9 | \alias{as_corpus_frame.default} 10 | \alias{corpus_frame} 11 | \alias{is_corpus_frame} 12 | \title{Corpus Data Frame} 13 | \description{ 14 | Create or test for corpus objects. 15 | } 16 | \usage{ 17 | corpus_frame(..., row.names = NULL, filter = NULL) 18 | 19 | as_corpus_frame(x, filter = NULL, ..., row.names = NULL) 20 | 21 | is_corpus_frame(x) 22 | } 23 | \arguments{ 24 | \item{\dots}{data frame columns for \code{corpus_frame}; 25 | further arguments passed to \code{as_corpus_text} from 26 | \code{as_corpus_frame}.} 27 | 28 | \item{row.names}{character vector of row names for the corpus object.} 29 | 30 | \item{filter}{text filter object for the \code{"text"} column in the 31 | corpus object.} 32 | 33 | \item{x}{object to be coerced or tested.} 34 | } 35 | \details{ 36 | These functions create or convert another object to a corpus object. 37 | A corpus object is just a data frame with special functions for 38 | printing, and a column named \code{"text"} of type \code{"corpus_text"}. 39 | 40 | \code{corpus_frame} has similar semantics to the \code{\link{data.frame}} 41 | function, except that string columns do not get converted to factors. 42 | 43 | \code{as_corpus_frame} converts another object to a corpus data frame 44 | object. By default, the method converts \code{x} to a data frame with 45 | a column named \code{"text"} of type \code{"corpus_text"}, and sets the 46 | class attribute of the result to \code{c("corpus_frame", "data.frame")}. 47 | 48 | \code{is_corpus_frame} tests whether \code{x} is a data frame with a column 49 | named \code{"text"} of type \code{"corpus_text"}. 50 | 51 | \code{as_corpus_frame} is generic: you can write methods to 52 | handle specific classes of objects. 53 | } 54 | \value{ 55 | \code{corpus_frame} creates a data frame with a column named \code{"text"} 56 | of type \code{"corpus_text"}, and a class attribute set to 57 | \code{c("corpus_frame", "data.frame")}. 58 | 59 | \code{as_corpus_frame} attempts to coerce its argument to a corpus 60 | data frame object, setting the \code{row.names} and calling 61 | \code{\link{as_corpus_text}} on the \code{"text"} column with 62 | the \code{filter} and \code{\dots} arguments. 63 | 64 | \code{is_corpus_frame} returns \code{TRUE} or \code{FALSE} depending on 65 | whether its argument is a valid corpus object or not. 66 | } 67 | \seealso{ 68 | \code{\link{corpus-package}}, \code{\link{print.corpus_frame}}, 69 | \code{\link{corpus_text}}, \code{\link{read_ndjson}}. 
70 | } 71 | \examples{ 72 | # convert a data frame: 73 | emoji <- data.frame(text = sapply(0x1f600 + 1:30, intToUtf8), 74 | stringsAsFactors = FALSE) 75 | as_corpus_frame(emoji) 76 | 77 | # construct directly (no need for stringsAsFactors = FALSE): 78 | corpus_frame(text = sapply(0x1f600 + 1:30, intToUtf8)) 79 | 80 | # convert a character vector: 81 | as_corpus_frame(c(a = "goodnight", b = "moon")) # keeps names 82 | as_corpus_frame(c(a = "goodnight", b = "moon"), row.names = NULL) # drops names 83 | } 84 | \keyword{classes} 85 | -------------------------------------------------------------------------------- /man/corpus_text.Rd: -------------------------------------------------------------------------------- 1 | \name{corpus_text} 2 | \alias{as_corpus_text} 3 | \alias{as_corpus_text.character} 4 | \alias{as_corpus_text.Corpus} 5 | \alias{as_corpus_text.corpus} 6 | \alias{as_corpus_text.corpus_json} 7 | \alias{as_corpus_text.corpus_text} 8 | \alias{as_corpus_text.data.frame} 9 | \alias{as_corpus_text.default} 10 | \alias{corpus_text} 11 | \alias{is_corpus_text} 12 | \title{Text Objects} 13 | \description{ 14 | Create or test for text objects. 15 | } 16 | \usage{ 17 | as_corpus_text(x, filter = NULL, ..., names = NULL) 18 | 19 | is_corpus_text(x) 20 | } 21 | \arguments{ 22 | \item{x}{object to be coerced or tested.} 23 | 24 | \item{filter}{if non-\code{NULL}, a text filter for the converted result.} 25 | 26 | \item{\dots}{text filter properties to set on the result.} 27 | 28 | \item{names}{if non-\code{NULL}, a character vector of names for 29 | the converted result.} 30 | } 31 | \details{ 32 | The \code{corpus_text} type is a new data type provided by the \code{corpus} 33 | package suitable for processing international (Unicode) text. Text vectors 34 | behave like character vectors (and can be converted to them with the 35 | \code{as.character} function). They can be created using the 36 | \code{\link{read_ndjson}} function or by converting another object using the 37 | \code{as_corpus_text} function. 38 | 39 | All text objects have a \code{\link{text_filter}} property specifying how to 40 | transform the text into tokens or segment it into sentences. 41 | 42 | The default behavior for \code{as_corpus_text} is to proceed as follows: 43 | \enumerate{ 44 | \item If \code{x} is a \code{character} vector, then we create 45 | a new \code{text} vector from \code{x}. 46 | 47 | \item If \code{x} is a data frame, then we call \code{as_corpus_text} 48 | on \code{x$text} if a column named \code{"text"} exists in 49 | the data frame. If the data frame does not have a column 50 | named \code{"text"}, then we fail with an error message. 51 | 52 | \item If \code{x} is a \code{corpus_text} object, then we drop all 53 | attributes and we set the class to \code{"corpus_text"}. 54 | 55 | \item The default behavior, when none of the above conditions 56 | are true, is to call \code{as.character} on the object first, 57 | preserving the names, and then call \code{as_corpus_text} on 58 | the returned character object. 59 | } 60 | 61 | In all cases, when \code{names} is \code{NULL}, we set the result 62 | names to \code{names(x)} (or \code{rownames(x)} for a data frame 63 | argument). When \code{names} is a character vector, we set the result names 64 | to this vector of names. 65 | 66 | Similarly, when \code{filter} is \code{NULL}, we set the result text 67 | filter to \code{text_filter(x)}. When \code{filter} is non-\code{NULL}, 68 | we set the result text filter to this value. 
In either case, 69 | if there are additional named arguments, then we override the filter 70 | properties specified by the names of these arguments with the new values 71 | given. 72 | 73 | Note that the special handling for the names of the object differs 74 | from that of the other R conversion functions (\code{as.numeric}, 75 | \code{as.character}, etc.), which drop the names. 76 | 77 | \code{as_corpus_text} is generic: you can write methods to handle specific 78 | classes of objects. 79 | } 80 | \value{ 81 | \code{as_corpus_text} attempts to coerce its argument to \code{text} type and 82 | set its \code{names} and \code{text_filter} properties; it strips 83 | all other attributes. 84 | 85 | \code{is_corpus_text} returns \code{TRUE} or \code{FALSE} depending on 86 | whether its argument is of text type or not. 87 | } 88 | \seealso{ 89 | \code{\link{as_utf8}}, \code{\link{text_filter}}, \code{\link{read_ndjson}}. 90 | } 91 | \examples{ 92 | as_corpus_text("hello, world!") 93 | as_corpus_text(c(a = "goodnight", b = "moon")) # keeps names 94 | 95 | # set a filter property 96 | as_corpus_text(c(a = "goodnight", b = "moon"), stemmer = "english") 97 | 98 | is_corpus_text("hello") # FALSE, "hello" is character, not text 99 | } 100 | \keyword{classes} 101 | -------------------------------------------------------------------------------- /man/federalist.Rd: -------------------------------------------------------------------------------- 1 | \name{federalist} 2 | \docType{data} 3 | \alias{federalist} 4 | \title{The Federalist Papers} 5 | \description{ 6 | \cite{The Federalist Papers} comprise 85 articles published under the 7 | pseudonym \dQuote{Publius} in New York newspapers between 1787 and 8 | 1788, written to convince residents to ratify the \cite{Constitution}. 9 | John Jay wrote 5 papers, while Alexander Hamilton and James Madison 10 | wrote the remaining 80. Between the last two authors there are 11 | conflicting accounts of which author wrote which paper. Most sources 12 | agree on the authorships of 65 papers (51 by Hamilton and 14 by Madison), 13 | but 15 papers are in dispute. 14 | 15 | In one of the earliest examples of statistical text analysis, F. Mosteller 16 | and D. L. Wallace used a form of Naive Bayes classification to identify 17 | the authorships of the 15 disputed papers, finding strong evidence that 18 | Madison was the author of all of the disputed papers. 19 | } 20 | \usage{federalist} 21 | \format{A data frame with 85 rows, one for each paper.} 22 | \source{\url{http://www.gutenberg.org/ebooks/18}} 23 | \references{ 24 | Mosteller, F. and Wallace, D. L. (1963). 25 | Inference in an authorship problem. 26 | \emph{Journal of the American Statistical Association} 27 | \strong{58} 275--309. 
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/figures/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/banner.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/logo/01_make_logo.R: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/Rscript --vanilla 2 | 3 | library("magick") 4 | 5 | slide <- image_read("logo-slide.tiff") 6 | banner <- image_crop(slide, "370x80+2+2") 7 | image_write(banner, "../banner.png", format = "png") 8 | 9 | ccap <- image_read("c-07.jpg") 10 | logo <- image_scale(ccap, "80x80") 11 | image_write(logo, "../logo.png", format = "png") 12 | -------------------------------------------------------------------------------- /man/figures/logo/README: -------------------------------------------------------------------------------- 1 | 2 | The logo was created in Mac OS Pages, then exported to TIFF. The script 3 | `01_make_logo.R` crops the TIFF and converts to PNG. 4 | 5 | -- 6 | 7 | C drop capital is public domain, from http://www.reusableart.com/c-07.html 8 | 9 | "This and numerous other stock print foundry images were included in the book 10 | The Proverbs of Scotland from 1868. It was published by Alexander Hislop & 11 | Company." 12 | 13 | 14 | Font is Matthew Butterick's "Equity" (Caps A Regular, 96pt). 15 | -------------------------------------------------------------------------------- /man/figures/logo/c-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/c-07.jpg -------------------------------------------------------------------------------- /man/figures/logo/logo-slide.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo-slide.tiff -------------------------------------------------------------------------------- /man/figures/logo/logo.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/man/figures/logo/logo.key -------------------------------------------------------------------------------- /man/gutenberg_corpus.Rd: -------------------------------------------------------------------------------- 1 | \name{gutenberg_corpus} 2 | \alias{gutenberg_corpus} 3 | \title{Project Gutenberg Corpora} 4 | \description{ 5 | Get a corpus of texts from Project Gutenberg. 6 | } 7 | \usage{ 8 | gutenberg_corpus(ids, filter = NULL, mirror = NULL, verbose = TRUE, ...) 
9 | } 10 | \arguments{ 11 | \item{ids}{an integer vector of requested Gutenberg text IDs.} 12 | 13 | \item{filter}{a text filter to set on the corpus.} 14 | 15 | \item{mirror}{a character string URL for the Gutenberg mirror to use, 16 | or \code{NULL} to determine the mirror automatically.} 17 | 18 | \item{verbose}{a logical scalar indicating whether to print progress 19 | updates to the console.} 20 | 21 | \item{...}{additional arguments passed to \code{as_corpus}.} 22 | } 23 | \details{ 24 | \code{gutenberg_corpus} downloads a set of texts from Project Gutenberg, 25 | creating a corpus with the texts as rows. You specify the texts for inclusion 26 | using their Project Gutenberg IDs, passed to the function in the 27 | \code{ids} argument. 28 | 29 | You can search for Project Gutenberg texts and get their IDs using the 30 | \code{gutenberg_works} function from the \pkg{gutenbergr} package. 31 | } 32 | \value{ 33 | A corpus (data frame) with three columns: \code{"title"}, \code{"author"}, 34 | and \code{"text"}. 35 | } 36 | \seealso{ 37 | \code{\link{corpus_frame}}. 38 | } 39 | \examples{ 40 | # get the texts of George Eliot's novels 41 | \dontrun{eliot <- gutenberg_corpus(c(145, 550, 6688))} 42 | } 43 | -------------------------------------------------------------------------------- /man/new_stemmer.Rd: -------------------------------------------------------------------------------- 1 | \name{new_stemmer} 2 | \alias{new_stemmer} 3 | \title{Stemmer Construction} 4 | \description{ 5 | Make a stemmer from a set of (term, stem) pairs. 6 | } 7 | \usage{ 8 | new_stemmer(term, stem, default = NULL, duplicates = "first", 9 | vectorize = TRUE) 10 | } 11 | \arguments{ 12 | \item{term}{character vector of terms to stem.} 13 | 14 | \item{stem}{character vector the same length as \code{term} with entries 15 | giving the corresponding stems.} 16 | 17 | \item{default}{if non-\code{NULL}, a default value to use for terms 18 | that do not have a stem; \code{NULL} specifies that such terms 19 | should be left unchanged.} 20 | 21 | \item{duplicates}{action to take for duplicates in the \code{term} list. See 22 | \sQuote{Details}.} 23 | 24 | \item{vectorize}{whether to produce a vectorized stemmer that accepts and 25 | returns vector arguments.} 26 | } 27 | \details{ 28 | Given a list of terms and a corresponding list of stems, this produces a 29 | function that maps terms to their corresponding entry. If 30 | \code{default = NULL}, then values absent from the \code{term} argument 31 | get left as-is; otherwise, they get replaced by the \code{default} value. 32 | 33 | The \code{duplicates} argument indicates the action to take if 34 | there are duplicate entries in the \code{term} argument: 35 | \itemize{ 36 | \item \code{duplicates = "first"}: take the first matching entry in the 37 | \code{stem} list. 38 | 39 | \item \code{duplicates = "last"}: take the last matching entry in the 40 | \code{stem} list. 41 | 42 | \item \code{duplicates = "omit"}: use the \code{default} value for 43 | duplicated terms. 44 | 45 | \item \code{duplicates = "fail"}: raise an error if there are duplicated 46 | terms. 47 | } 48 | } 49 | \value{ 50 | By default, with \code{vectorize = TRUE}, the resulting stemmer accepts a 51 | character vector as input and returns a character vector of the same length 52 | with entries giving the stems of the corresponding input entries. 53 | 54 | Setting \code{vectorize = FALSE} gives a function that accepts a single input 55 | and returns a single output.
This can be more efficient when used as part of 56 | a \code{\link{text_filter}}. 57 | } 58 | \seealso{ 59 | \code{\link{stem_snowball}}, \code{\link{text_filter}}, \code{\link{text_tokens}}. 60 | } 61 | \examples{ 62 | # map uppercase to lowercase, leave others unchanged 63 | stemmer <- new_stemmer(LETTERS, letters) 64 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3")) 65 | 66 | # map uppercase to lowercase, map others to NA 67 | stemmer <- new_stemmer(LETTERS, letters, default = NA) 68 | stemmer(c("A", "E", "I", "O", "U", "1", "2", "3")) 69 | } 70 | -------------------------------------------------------------------------------- /man/print.corpus_frame.Rd: -------------------------------------------------------------------------------- 1 | \name{print.corpus_frame} 2 | \title{Corpus Data Frame Printing} 3 | \alias{format.corpus_frame} 4 | \alias{print.corpus_frame} 5 | \description{ 6 | Printing and formatting corpus data frames. 7 | } 8 | \usage{ 9 | \method{print}{corpus_frame}(x, rows = 20L, chars = NULL, digits = NULL, 10 | quote = FALSE, na.print = NULL, print.gap = NULL, right = FALSE, 11 | row.names = TRUE, max = NULL, display = TRUE, ...) 12 | 13 | \method{format}{corpus_frame}(x, chars = NULL, na.encode = TRUE, quote = FALSE, 14 | na.print = NULL, print.gap = NULL, ..., justify = "none") 15 | } 16 | \arguments{ 17 | \item{x}{data frame object to print or format.} 18 | 19 | \item{rows}{integer scalar giving the maximum number of rows to print 20 | before truncating the output. A negative or missing value indicates 21 | no upper limit.} 22 | 23 | \item{chars}{maximum number of character units to display; see 24 | \code{\link{utf8_format}}.} 25 | 26 | \item{digits}{minimal number of significant digits; see 27 | \code{\link{print.default}}.} 28 | 29 | \item{quote}{logical scalar indicating whether to put surrounding 30 | double-quotes (\code{'"'}) around character strings and escape 31 | internal double-quotes.} 32 | 33 | \item{na.print}{character string (or \code{NULL}) indicating 34 | the encoding for \code{NA} values. Ignored when 35 | \code{na.encode} is \code{FALSE}.} 36 | 37 | \item{print.gap}{non-negative integer (or \code{NULL}) giving the 38 | number of spaces in gaps between columns; set to \code{NULL} 39 | or \code{1} for a single space.} 40 | 41 | \item{right}{logical indicating whether to right-align columns 42 | (ignored for text, character, and factor columns).} 43 | 44 | \item{row.names}{logical indicating whether to print row names, or 45 | a character vector giving alternate row names to display.} 46 | 47 | \item{max}{maximum number of entries to print; defaults to 48 | \code{getOption("max.print")}.} 49 | 50 | \item{display}{logical scalar indicating whether to optimize the 51 | printing for display, not byte-for-byte data transmission; 52 | see \code{utf8_encode}.} 53 | 54 | \item{justify}{justification; one of \code{"left"}, \code{"right"}, 55 | \code{"centre"}, or \code{"none"}. Can be abbreviated.} 56 | 57 | \item{na.encode}{logical scalar indicating whether to encode 58 | \code{NA} values as character strings.} 59 | 60 | \item{...}{further arguments passed to or from other methods.} 61 | } 62 | \details{ 63 | The \code{"corpus_frame"} class is a subclass of \code{"data.frame"}, 64 | overriding the default print and format methods. To apply this 65 | class to a data frame, set its class to 66 | \code{c("corpus_frame", "data.frame")}. 67 | 68 | Corpus frame printing left-justifies character and text columns, 69 | truncates the output, and displays emoji on Mac OS.
70 | } 71 | \seealso{ 72 | \code{\link{corpus_frame}}, \code{\link{print.data.frame}}, 73 | \code{\link{utf8_print}} 74 | } 75 | \examples{ 76 | # default data frame printing 77 | x <- data.frame(text = c("hello world", intToUtf8(0x1f638 + 0:3), letters)) 78 | print(x) 79 | 80 | # corpus frame printing 81 | y <- x 82 | class(y) <- c("corpus_frame", "data.frame") 83 | print(y) 84 | 85 | print(y, 10) # change truncation limit 86 | } 87 | -------------------------------------------------------------------------------- /man/sentiment_afinn.Rd: -------------------------------------------------------------------------------- 1 | \name{sentiment_afinn} 2 | \docType{data} 3 | \alias{sentiment_afinn} 4 | \title{AFINN Sentiment Lexicon} 5 | \description{ 6 | The AFINN lexicon is a list of English terms manually rated 7 | for valence with an integer between -5 (negative) and +5 8 | (positive) by Finn \enc{Årup}{Aarup} Nielsen between 9 | 2009 and 2011. 10 | 11 | The original lexicon contains some multi-word phrases, but they 12 | are excluded here. 13 | 14 | The original lexicon is distributed under the 15 | \href{https://opendatacommons.org/licenses/odbl/1-0/}{Open Database License (ODbL) v1.0}. 16 | You are free to share, create works from, and adapt the lexicon, as long as you 17 | attribute the original lexicon in your work. If you adapt the lexicon, you 18 | must keep the adapted lexicon open and apply a similar license. 19 | } 20 | \usage{sentiment_afinn} 21 | \format{A data frame with one row for each term.} 22 | \source{\url{http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010}} 23 | \references{ 24 | Finn \enc{Årup}{Aarup} Nielsen (2011). 25 | A new ANEW: Evaluation of a word list for sentiment analysis in microblogs. 26 | \emph{Proceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages 718 in CEUR Workshop Proceedings} 27 | 93--98. 28 | \url{http://arxiv.org/abs/1103.2903}. 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/stem_snowball.Rd: -------------------------------------------------------------------------------- 1 | \name{stem_snowball} 2 | \alias{stem_snowball} 3 | \title{Snowball Stemmer} 4 | \description{ 5 | Stem a set of terms using one of the algorithms provided by the 6 | Snowball stemming library. 7 | } 8 | \usage{ 9 | stem_snowball(x, algorithm = "en") 10 | } 11 | \arguments{ 12 | \item{x}{character vector of terms to stem.} 13 | 14 | \item{algorithm}{stemming algorithm; see \sQuote{Details} for the valid 15 | choices.} 16 | } 17 | \details{ 18 | Apply a Snowball stemming algorithm to a vector of input terms, \code{x}, 19 | returning the result in a character vector of the same length with the 20 | same names. 21 | 22 | The \code{algorithm} argument specifies the stemming algorithm.
Valid choices 23 | include the following: 24 | \code{"ar"} (\code{"arabic"}), 25 | \code{"da"} (\code{"danish"}), 26 | \code{"de"} (\code{"german"}), 27 | \code{"en"} (\code{"english"}), 28 | \code{"es"} (\code{"spanish"}), 29 | \code{"fi"} (\code{"finnish"}), 30 | \code{"fr"} (\code{"french"}), 31 | \code{"hu"} (\code{"hungarian"}), 32 | \code{"it"} (\code{"italian"}), 33 | \code{"nl"} (\code{"dutch"}), 34 | \code{"no"} (\code{"norwegian"}), 35 | \code{"pt"} (\code{"portuguese"}), 36 | \code{"ro"} (\code{"romanian"}), 37 | \code{"ru"} (\code{"russian"}), 38 | \code{"sv"} (\code{"swedish"}), 39 | \code{"ta"} (\code{"tamil"}), 40 | \code{"tr"} (\code{"turkish"}), 41 | and \code{"porter"}. 42 | Setting \code{algorithm = NULL} gives a stemmer that returns its input 43 | unchanged. 44 | 45 | The function only stems single-word terms of kind \code{"letter"}; it leaves 46 | other inputs (multi-word terms, and terms of kind \code{"number"}, \code{"punct"}, and 47 | \code{"symbol"}) unchanged. 48 | 49 | The \href{http://snowballstem.org/algorithms/}{Snowball stemming library} 50 | provides the underlying implementation. The \code{wordStem} function from 51 | the \pkg{SnowballC} package provides a similar interface, but that function 52 | applies the algorithm to all input terms, regardless of the kind of the term. 53 | } 54 | \value{ 55 | A character vector with the same length and names as the input, \code{x}, whose 56 | entries contain the corresponding stems. 57 | } 58 | \seealso{ 59 | \code{\link{new_stemmer}}, \code{\link{text_filter}}. 60 | } 61 | \examples{ 62 | # apply the English stemming algorithm; don't stem non-letter terms 63 | stem_snowball(c("win", "winning", "winner", "#winning")) 64 | 65 | # compare with SnowballC, which stems terms of all kinds, not just "letter" 66 | \dontrun{SnowballC::wordStem(c("win", "winning", "winner", "#winning"), "en")} 67 | } 68 | -------------------------------------------------------------------------------- /man/stopwords.Rd: -------------------------------------------------------------------------------- 1 | \name{stopwords} 2 | \docType{data} 3 | \alias{stopwords} 4 | \alias{stopwords_da} 5 | \alias{stopwords_de} 6 | \alias{stopwords_en} 7 | \alias{stopwords_es} 8 | \alias{stopwords_fi} 9 | \alias{stopwords_fr} 10 | \alias{stopwords_hu} 11 | \alias{stopwords_it} 12 | \alias{stopwords_nl} 13 | \alias{stopwords_no} 14 | \alias{stopwords_pt} 15 | \alias{stopwords_ru} 16 | \alias{stopwords_sv} 17 | \title{Stop Words} 18 | \description{ 19 | Lists of common function words (\sQuote{stop} words). 20 | } 21 | \details{ 22 | The \code{stopwords_} objects are character vectors of case-folded 23 | \sQuote{stop} words. These are common function words that often get discarded 24 | before performing other text analysis tasks. 25 | 26 | There are lists available for the following languages: 27 | Danish (\code{stopwords_da}), Dutch (\code{stopwords_nl}), 28 | English (\code{stopwords_en}), Finnish (\code{stopwords_fi}), 29 | French (\code{stopwords_fr}), German (\code{stopwords_de}), 30 | Hungarian (\code{stopwords_hu}), Italian (\code{stopwords_it}), 31 | Norwegian (\code{stopwords_no}), Portuguese (\code{stopwords_pt}), 32 | Russian (\code{stopwords_ru}), Spanish (\code{stopwords_es}), 33 | and Swedish (\code{stopwords_sv}). 34 | 35 | These built-in word lists are reasonable defaults, but they may require 36 | further tailoring to suit your particular task. The original lists were 37 | compiled by the \href{http://snowballstem.org/}{Snowball stemming project}.
38 | Following the Quanteda text analysis software, we have tailored the original 39 | lists by adding the word "will" to the English list. 40 | } 41 | \usage{ 42 | stopwords_da 43 | stopwords_de 44 | stopwords_en 45 | stopwords_es 46 | stopwords_fi 47 | stopwords_fr 48 | stopwords_hu 49 | stopwords_it 50 | stopwords_nl 51 | stopwords_no 52 | stopwords_pt 53 | stopwords_ru 54 | stopwords_sv 55 | } 56 | \format{A character vector of unique stop words.} 57 | \seealso{ 58 | \code{\link{text_filter}} 59 | } 60 | \keyword{datasets} 61 | -------------------------------------------------------------------------------- /man/term_matrix.Rd: -------------------------------------------------------------------------------- 1 | \name{term_matrix} 2 | \alias{term_counts} 3 | \alias{term_matrix} 4 | \title{Term Frequency Tabulation} 5 | \description{ 6 | Tokenize a set of texts and compute a term frequency matrix. 7 | } 8 | \usage{ 9 | term_matrix(x, filter = NULL, ngrams = NULL, select = NULL, 10 | group = NULL, transpose = FALSE, ...) 11 | 12 | term_counts(x, filter = NULL, ngrams = NULL, select = NULL, 13 | group = NULL, ...) 14 | } 15 | \arguments{ 16 | \item{x}{a text vector to tokenize.} 17 | 18 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 19 | the default text filter for \code{x}.} 20 | 21 | \item{ngrams}{an integer vector of n-gram lengths to include, or 22 | \code{NULL} to use the \code{select} argument to determine the 23 | n-gram lengths.} 24 | 25 | \item{select}{a character vector of terms to count, or \code{NULL} to 26 | count all terms that appear in \code{x}.} 27 | 28 | \item{group}{if non-\code{NULL}, a factor, character, or 29 | integer vector with the same length as \code{x} specifying the grouping 30 | behavior.} 31 | 32 | \item{transpose}{a logical value indicating whether to transpose the 33 | result, putting terms as rows instead of columns.} 34 | 35 | \item{\dots}{additional properties to set on the text filter.} 36 | } 37 | \details{ 38 | \code{term_matrix} tokenizes a set of texts and computes the occurrence 39 | counts for each term, returning the result as a sparse matrix 40 | (texts-by-terms). \code{term_counts} returns the same information, but 41 | in a data frame. 42 | 43 | If \code{ngrams} is non-\code{NULL}, then multi-type n-grams are 44 | included in the output for all lengths appearing in the \code{ngrams} 45 | argument. If \code{ngrams} is \code{NULL} but \code{select} is 46 | non-\code{NULL}, then all n-grams appearing in the \code{select} set 47 | are included. If both \code{ngrams} and \code{select} are \code{NULL}, 48 | then only unigrams (single-type terms) are included. 49 | 50 | If \code{group} is \code{NULL}, then the output has one set of term 51 | counts for each input text. Otherwise, we convert \code{group} to 52 | a \code{factor} and compute one set of term counts for each level. 53 | Texts with \code{NA} values for \code{group} get skipped. 54 | } 55 | \value{ 56 | \code{term_matrix} with \code{transpose = FALSE} returns a sparse matrix 57 | in \code{"dgCMatrix"} format with one column for each term and one row for 58 | each input text or (if \code{group} is non-\code{NULL}) for each grouping 59 | level. If \code{filter$select} is non-\code{NULL}, then the column names 60 | will be equal to \code{filter$select}. Otherwise, the columns are assigned 61 | in arbitrary order. 62 | 63 | \code{term_matrix} with \code{transpose = TRUE} returns the transpose of 64 | the term matrix, in \code{"dgCMatrix"} format.
65 | 66 | \code{term_counts} with \code{group = NULL} returns a data frame with one 67 | row for each entry of the term matrix, and columns \code{"text"}, 68 | \code{"term"}, and \code{"count"} giving the text ID, term, and count. 69 | The \code{"term"} column is a factor with levels equal to the selected 70 | terms. The \code{"text"} 71 | column is a factor with levels equal to \code{names(as_corpus_text(x))}; 72 | calling \code{as.integer} on the \code{"text"} column converts from 73 | the factor values to the integer row index in the term matrix. 74 | 75 | \code{term_counts} with \code{group} non-\code{NULL} behaves similarly, 76 | but the result instead has columns named \code{"group"}, \code{"term"}, 77 | and \code{"count"}, with \code{"group"} giving the grouping level, as 78 | a factor. 79 | } 80 | \seealso{ 81 | \code{\link{text_tokens}}, \code{\link{term_stats}}. 82 | } 83 | \examples{ 84 | text <- c("A rose is a rose is a rose.", 85 | "A Rose is red, a violet is blue!", 86 | "A rose by any other name would smell as sweet.") 87 | term_matrix(text) 88 | 89 | # select certain terms 90 | term_matrix(text, select = c("rose", "red", "violet", "sweet")) 91 | 92 | # specify a grouping factor 93 | term_matrix(text, group = c("Good", "Bad", "Good")) 94 | 95 | # include higher-order n-grams 96 | term_matrix(text, ngrams = 1:3) 97 | 98 | # select certain multi-type terms 99 | term_matrix(text, select = c("a rose", "a violet", "sweet", "smell")) 100 | 101 | # transpose the result 102 | term_matrix(text, ngrams = 1:2, transpose = TRUE)[1:10, ] # first 10 rows 103 | 104 | # data frame 105 | head(term_counts(text), n = 10) # first 10 rows 106 | 107 | # with grouping 108 | term_counts(text, group = c("Good", "Bad", "Good")) 109 | 110 | # taking names from the input 111 | term_counts(c(a = "One sentence.", b = "Another", c = "!!")) 112 | } 113 | -------------------------------------------------------------------------------- /man/term_stats.Rd: -------------------------------------------------------------------------------- 1 | \name{term_stats} 2 | \alias{term_stats} 3 | \title{Term Statistics} 4 | \description{ 5 | Tokenize a set of texts and tabulate the term occurrence statistics. 6 | } 7 | \usage{ 8 | term_stats(x, filter = NULL, ngrams = NULL, 9 | min_count = NULL, max_count = NULL, 10 | min_support = NULL, max_support = NULL, types = FALSE, 11 | subset, ...) 
12 | } 13 | \arguments{ 14 | \item{x}{a text vector to tokenize.} 15 | 16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 17 | the default text filter for \code{x}.} 18 | 19 | \item{ngrams}{an integer vector of n-gram lengths to include, or 20 | \code{NULL} for length-1 n-grams only.} 21 | 22 | \item{min_count}{a numeric scalar giving the minimum term count to include 23 | in the output, or \code{NULL} for no minimum count.} 24 | 25 | \item{max_count}{a numeric scalar giving the maximum term count to include 26 | in the output, or \code{NULL} for no maximum count.} 27 | 28 | \item{min_support}{a numeric scalar giving the minimum term support to 29 | include in the output, or \code{NULL} for no minimum support.} 30 | 31 | \item{max_support}{a numeric scalar giving the maximum term support to 32 | include in the output, or \code{NULL} for no maximum support.} 33 | 34 | \item{types}{a logical value indicating whether to include columns for 35 | the types that make up the terms.} 36 | 37 | \item{subset}{logical expression indicating elements or rows to keep: 38 | missing values are taken as false.} 39 | 40 | \item{\dots}{additional properties to set on the text filter.} 41 | } 42 | \details{ 43 | \code{term_stats} tokenizes a set of texts and computes the occurrence 44 | counts and supports for each term. The \sQuote{count} is the number of 45 | occurrences of the term across all texts; the \sQuote{support} is the 46 | number of texts containing the term. Each appearance of a term 47 | increments its count by one. An appearance of a term in text 48 | \code{i}, by contrast, increments its support only once, regardless of 49 | the number of occurrences in that text. 50 | 51 | To include multi-type terms, specify the desired term lengths using 52 | the \code{ngrams} argument. 53 | } 54 | \value{ 55 | A data frame with columns named \code{term}, \code{count}, and 56 | \code{support}, with one row for each appearing term. Rows are sorted 57 | in descending order according to \code{support} and then \code{count}, 58 | with ties broken lexicographically by \code{term}, using the 59 | character ordering determined by the current locale 60 | (see \code{\link{Comparison}} for details). 61 | 62 | If \code{types = TRUE}, then the result also includes columns named 63 | \code{type1}, \code{type2}, etc. for the types that make up the 64 | term. 65 | } 66 | \seealso{ 67 | \code{\link{text_tokens}}, \code{\link{term_matrix}}. 68 | } 69 | \examples{ 70 | term_stats("A rose is a rose is a rose.") 71 | 72 | # remove punctuation and English stop words 73 | term_stats("A rose is a rose is a rose.", 74 | text_filter(drop_symbol = TRUE, drop = stopwords_en)) 75 | 76 | # unigrams, bigrams, and trigrams 77 | term_stats("A rose is a rose is a rose.", ngrams = 1:3) 78 | 79 | # also include the type information 80 | term_stats("A rose is a rose is a rose.", ngrams = 1:3, types = TRUE) 81 | } 82 | -------------------------------------------------------------------------------- /man/text_locate.Rd: -------------------------------------------------------------------------------- 1 | \name{text_locate} 2 | \alias{text_count} 3 | \alias{text_detect} 4 | \alias{text_locate} 5 | \alias{text_match} 6 | \alias{text_sample} 7 | \alias{text_subset} 8 | \title{Searching for Terms} 9 | \description{ 10 | Look for instances of one or more terms in a set of texts. 11 | } 12 | \usage{ 13 | text_locate(x, terms, filter = NULL, ...) 14 | 15 | text_count(x, terms, filter = NULL, ...) 16 | 17 | text_detect(x, terms, filter = NULL, ...)
18 | 19 | text_match(x, terms, filter = NULL, ...) 20 | 21 | text_sample(x, terms, size = NULL, filter = NULL, ...) 22 | 23 | text_subset(x, terms, filter = NULL, ...) 24 | } 25 | \arguments{ 26 | \item{x}{a text or character vector.} 27 | 28 | \item{terms}{a character vector of search terms.} 29 | 30 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 31 | the default text filter for \code{x}.} 32 | 33 | \item{size}{the maximum number of results to return, or \code{NULL}.} 34 | 35 | \item{\dots}{additional properties to set on the text filter.} 36 | } 37 | \details{ 38 | \code{text_locate} finds all instances of the search terms in the 39 | input text, along with their contexts. 40 | 41 | \code{text_count} counts the number of search term instances in 42 | each element of the text vector. 43 | 44 | \code{text_detect} indicates whether each text contains at least 45 | one of the search terms. 46 | 47 | \code{text_match} reports the matching instances as a factor variable 48 | with levels equal to the \code{terms} argument. 49 | 50 | \code{text_subset} returns the texts that contain the search terms. 51 | 52 | \code{text_sample} returns a random sample of the results from 53 | \code{text_locate}, in random order. This is useful for 54 | hand-inspecting a subset of the \code{text_locate} matches. 55 | } 56 | \value{ 57 | \code{text_count} and \code{text_detect} return a numeric vector and 58 | a logical vector, respectively, with length equal to the number of input 59 | texts and names equal to the text names. 60 | 61 | \code{text_locate} and \code{text_sample} both return a data frame with 62 | one row for each search result and columns named \sQuote{text}, \sQuote{before}, 63 | \sQuote{instance}, and \sQuote{after}. The \sQuote{text} column gives 64 | the name of the text containing the instance; \sQuote{before} and 65 | \sQuote{after} are text vectors giving the text before and after the 66 | instance. The \sQuote{instance} column gives the token or tokens matching 67 | the search term. 68 | 69 | \code{text_match} returns a data frame with one row for each search result, 70 | with columns named \sQuote{text} and \sQuote{term}. Both columns are 71 | factors. The \sQuote{text} column has levels equal to the text labels, 72 | and the \sQuote{term} column has levels equal to the \code{terms} argument. 73 | 74 | \code{text_subset} returns the subset of texts that contain the given 75 | search terms. The resulting text vector has its \code{text_filter} set to the 76 | passed-in \code{filter} argument. 77 | } 78 | \seealso{ 79 | \code{\link{term_stats}}, \code{\link{term_matrix}}. 80 | } 81 | \examples{ 82 | text <- c("Rose is a rose is a rose is a rose.", 83 | "A rose by any other name would smell as sweet.", 84 | "Snow White and Rose Red") 85 | 86 | text_count(text, "rose") 87 | text_detect(text, "rose") 88 | text_locate(text, "rose") 89 | text_match(text, "rose") 90 | text_sample(text, "rose", 3) 91 | text_subset(text, "a rose") 92 | 93 | # search for multiple terms 94 | text_locate(text, c("rose", "rose red", "snow white")) 95 | } 96 | -------------------------------------------------------------------------------- /man/text_stats.Rd: -------------------------------------------------------------------------------- 1 | \name{text_stats} 2 | \alias{text_stats} 3 | \title{Text Statistics} 4 | \description{ 5 | Report descriptive statistics for a set of texts. 6 | } 7 | \usage{ 8 | text_stats(x, filter = NULL, ...)
9 | } 10 | \arguments{ 11 | \item{x}{a text corpus.} 12 | 13 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 14 | the default text filter for \code{x}.} 15 | 16 | \item{\dots}{additional properties to set on the text filter.} 17 | } 18 | \details{ 19 | \code{text_stats} reports descriptive statistics for a set of texts: 20 | the number of tokens, unique types, and sentences. 21 | } 22 | \value{ 23 | A data frame with columns named \code{tokens}, \code{types}, and 24 | \code{sentences}, with one row for each text. 25 | } 26 | \seealso{ 27 | \code{\link{text_filter}}, \code{\link{term_stats}}. 28 | } 29 | \examples{ 30 | text_stats(c("A rose is a rose is a rose.", 31 | "A Rose is red. A violet is blue!")) 32 | } 33 | -------------------------------------------------------------------------------- /man/text_sub.Rd: -------------------------------------------------------------------------------- 1 | \name{text_sub} 2 | \alias{text_sub} 3 | \title{Text Subsequences} 4 | \description{ 5 | Extract token subsequences from a set of texts. 6 | } 7 | \usage{ 8 | text_sub(x, start = 1L, end = -1L, filter = NULL, ...) 9 | } 10 | \arguments{ 11 | \item{x}{text vector or corpus object.} 12 | 13 | \item{start}{integer vector giving the starting positions of the 14 | subsequences, or a two-column integer matrix giving the starting 15 | and ending positions.} 16 | 17 | \item{end}{integer vector giving the ending positions of the 18 | subsequences; ignored if \code{start} is a two-column matrix.} 19 | 20 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 21 | the default text filter for \code{x}.} 22 | 23 | \item{\dots}{additional properties to set on the text filter.} 24 | } 25 | \details{ 26 | \code{text_sub} extracts token subsequences from a set of texts. 27 | The \code{start} and \code{end} arguments specify the 28 | positions of the subsequences within the parent texts, as an inclusive 29 | range. Negative indices are interpreted as counting from the end of 30 | the text, with \code{-1L} referring to the last element. 31 | } 32 | \value{ 33 | A text vector with the same length and names as \code{x}, with the 34 | desired subsequences. 35 | } 36 | \seealso{ 37 | \code{\link{text_tokens}}, \code{\link{text_ntoken}}. 38 | } 39 | \examples{ 40 | x <- as_corpus_text(c("A man, a plan.", "A \"canal\"?", "Panama!"), 41 | drop_punct = TRUE) 42 | 43 | # entire text 44 | text_sub(x, 1, -1) 45 | 46 | # first three elements 47 | text_sub(x, 1, 3) 48 | 49 | # last two elements 50 | text_sub(x, -2, -1) 51 | } 52 | -------------------------------------------------------------------------------- /man/text_types.Rd: -------------------------------------------------------------------------------- 1 | \name{text_types} 2 | \alias{text_ntype} 3 | \alias{text_types} 4 | \title{Text Type Sets} 5 | \description{ 6 | Get or measure the set of types (unique token values). 7 | } 8 | \usage{ 9 | text_types(x, filter = NULL, collapse = FALSE, ...) 10 | 11 | text_ntype(x, filter = NULL, collapse = FALSE, ...)
12 | } 13 | \arguments{ 14 | \item{x}{a text or character vector.} 15 | 16 | \item{filter}{if non-\code{NULL}, a text filter to use instead of 17 | the default text filter for \code{x}.} 18 | 19 | \item{collapse}{a logical value indicating whether to collapse the 20 | aggregation over all rows of the input.} 21 | 22 | \item{\dots}{additional properties to set on the text filter.} 23 | } 24 | \details{ 25 | \code{text_ntype} counts the number of unique types in each text; 26 | \code{text_types} returns the set of unique types, as a character 27 | vector. Types are determined according to the \code{filter} argument. 28 | } 29 | \value{ 30 | If \code{collapse = FALSE}, then \code{text_ntype} produces a numeric 31 | vector with the same length and names as the input text, with the elements 32 | giving the number of unique types in the corresponding texts. For 33 | \code{text_types}, the result is a list of character vectors, with each 34 | vector giving the unique types in the corresponding text, ordered 35 | according to the \code{\link{sort}} function. 36 | 37 | If \code{collapse = TRUE}, then we aggregate over all rows of the input. 38 | In this case, \code{text_ntype} produces a scalar indicating the number 39 | of unique types in \code{x}, and \code{text_types} produces a character 40 | vector with the unique types. 41 | } 42 | \seealso{ 43 | \code{\link{text_filter}}, \code{\link{text_tokens}}. 44 | } 45 | \examples{ 46 | text <- c("I saw Mr. Jones today.", 47 | "Split across\na line.", 48 | "What. Are. You. Doing????", 49 | "She asked 'do you really mean that?' and I said 'yes.'") 50 | 51 | # count the number of unique types 52 | text_ntype(text) 53 | text_ntype(text, collapse = TRUE) 54 | 55 | # get the type sets 56 | text_types(text) 57 | text_types(text, collapse = TRUE) 58 | } 59 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CFLAGS = -Icorpus/src 2 | PKG_LIBS = -L.
-lccorpus 3 | 4 | SNOWBALL = corpus/lib/libstemmer_c 5 | STEMMER_O = $(SNOWBALL)/src_c/stem_UTF_8_arabic.o \ 6 | $(SNOWBALL)/src_c/stem_UTF_8_danish.o \ 7 | $(SNOWBALL)/src_c/stem_UTF_8_dutch.o \ 8 | $(SNOWBALL)/src_c/stem_UTF_8_english.o \ 9 | $(SNOWBALL)/src_c/stem_UTF_8_finnish.o \ 10 | $(SNOWBALL)/src_c/stem_UTF_8_french.o \ 11 | $(SNOWBALL)/src_c/stem_UTF_8_german.o \ 12 | $(SNOWBALL)/src_c/stem_UTF_8_hungarian.o \ 13 | $(SNOWBALL)/src_c/stem_UTF_8_italian.o \ 14 | $(SNOWBALL)/src_c/stem_UTF_8_norwegian.o \ 15 | $(SNOWBALL)/src_c/stem_UTF_8_porter.o \ 16 | $(SNOWBALL)/src_c/stem_UTF_8_portuguese.o \ 17 | $(SNOWBALL)/src_c/stem_UTF_8_romanian.o \ 18 | $(SNOWBALL)/src_c/stem_UTF_8_russian.o \ 19 | $(SNOWBALL)/src_c/stem_UTF_8_spanish.o \ 20 | $(SNOWBALL)/src_c/stem_UTF_8_swedish.o \ 21 | $(SNOWBALL)/src_c/stem_UTF_8_tamil.o \ 22 | $(SNOWBALL)/src_c/stem_UTF_8_turkish.o \ 23 | $(SNOWBALL)/runtime/api.o \ 24 | $(SNOWBALL)/runtime/utilities.o \ 25 | $(SNOWBALL)/libstemmer/libstemmer_utf8.o 26 | 27 | UTF8LITE = corpus/lib/utf8lite 28 | UTF8LITE_O = $(UTF8LITE)/src/array.o \ 29 | $(UTF8LITE)/src/char.o \ 30 | $(UTF8LITE)/src/encode.o \ 31 | $(UTF8LITE)/src/error.o \ 32 | $(UTF8LITE)/src/escape.o \ 33 | $(UTF8LITE)/src/graph.o \ 34 | $(UTF8LITE)/src/graphscan.o \ 35 | $(UTF8LITE)/src/normalize.o \ 36 | $(UTF8LITE)/src/render.o \ 37 | $(UTF8LITE)/src/text.o \ 38 | $(UTF8LITE)/src/textassign.o \ 39 | $(UTF8LITE)/src/textiter.o \ 40 | $(UTF8LITE)/src/textmap.o 41 | 42 | LIBCORPUS = corpus/lib/strntod.o corpus/lib/strntoimax.o \ 43 | corpus/src/array.o corpus/src/census.o corpus/src/data.o \ 44 | corpus/src/datatype.o corpus/src/error.o corpus/src/filebuf.o \ 45 | corpus/src/filter.o corpus/src/intset.o corpus/src/memory.o \ 46 | corpus/src/ngram.o corpus/src/search.o corpus/src/sentfilter.o \ 47 | corpus/src/sentscan.o corpus/src/stem.o corpus/src/stopword.o \ 48 | corpus/src/symtab.o corpus/src/table.o corpus/src/termset.o \ 49 | corpus/src/textset.o corpus/src/tree.o corpus/src/wordscan.o \ 50 | $(STEMMER_O) $(UTF8LITE_O) 51 | 52 | $(SHLIB): libccorpus.a 53 | 54 | libccorpus.a: $(LIBCORPUS) 55 | $(AR) rcs $@ $(LIBCORPUS) 56 | $(RANLIB) $@ 57 | 58 | clean: 59 | rm -f $(LIBCORPUS) $(SHLIB) $(OBJECTS) libccorpus.a 60 | -------------------------------------------------------------------------------- /src/context.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "rcorpus.h" 22 | 23 | #define CONTEXT_TAG install("corpus::context") 24 | 25 | 26 | struct context { 27 | void *data; 28 | void (*destroy_func)(void *); 29 | }; 30 | 31 | 32 | void free_context(SEXP x) 33 | { 34 | struct context *ctx = R_ExternalPtrAddr(x); 35 | R_SetExternalPtrAddr(x, NULL); 36 | if (ctx) { 37 | if (ctx->destroy_func) { 38 | (ctx->destroy_func)(ctx->data); 39 | } 40 | corpus_free(ctx->data); 41 | corpus_free(ctx); 42 | } 43 | } 44 | 45 | 46 | SEXP alloc_context(size_t size, void (*destroy_func)(void *)) 47 | { 48 | SEXP ans; 49 | struct context *ctx = NULL; 50 | void *obj = NULL; 51 | int err = 0; 52 | 53 | PROTECT(ans = R_MakeExternalPtr(NULL, CONTEXT_TAG, R_NilValue)); 54 | R_RegisterCFinalizerEx(ans, free_context, TRUE); 55 | 56 | TRY_ALLOC(obj = corpus_calloc(1, size == 0 ? 1 : size)); 57 | TRY_ALLOC(ctx = corpus_calloc(1, sizeof(*ctx))); 58 | 59 | ctx->data = obj; 60 | ctx->destroy_func = destroy_func; 61 | R_SetExternalPtrAddr(ans, ctx); 62 | ctx = NULL; 63 | obj = NULL; 64 | out: 65 | corpus_free(ctx); 66 | corpus_free(obj); 67 | CHECK_ERROR(err); 68 | UNPROTECT(1); 69 | return ans; 70 | } 71 | 72 | 73 | int is_context(SEXP x) 74 | { 75 | return ((TYPEOF(x) == EXTPTRSXP) 76 | && (R_ExternalPtrTag(x) == CONTEXT_TAG)); 77 | } 78 | 79 | 80 | void *as_context(SEXP x) 81 | { 82 | struct context *ctx; 83 | 84 | if (!is_context(x)) { 85 | error("invalid context object"); 86 | } 87 | 88 | ctx = R_ExternalPtrAddr(x); 89 | return ctx->data; 90 | } 91 | -------------------------------------------------------------------------------- /src/filebuf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include "corpus/src/memory.h" 20 | #include "corpus/src/filebuf.h" 21 | #include 22 | #include "rcorpus.h" 23 | 24 | #define FILEBUF_TAG install("corpus::filebuf") 25 | 26 | 27 | static struct corpus_filebuf *filebuf_new(const char *filename) 28 | { 29 | struct corpus_filebuf *obj = NULL; 30 | struct corpus_filebuf buf; 31 | 32 | errno = 0; 33 | 34 | if (corpus_filebuf_init(&buf, filename) == 0) { 35 | if (!(obj = corpus_malloc(sizeof(*obj)))) { 36 | corpus_filebuf_destroy(&buf); 37 | error("failed allocating memory"); 38 | } 39 | *obj = buf; 40 | } else { 41 | if (errno) { 42 | error("cannot open file '%s': %s", 43 | filename, strerror(errno)); 44 | } else { 45 | error("cannot open file '%s'", filename); 46 | } 47 | } 48 | 49 | return obj; 50 | } 51 | 52 | 53 | static void filebuf_free(struct corpus_filebuf *buf) 54 | { 55 | if (buf) { 56 | corpus_filebuf_destroy(buf); 57 | corpus_free(buf); 58 | } 59 | } 60 | 61 | 62 | static void free_filebuf(SEXP sbuf) 63 | { 64 | struct corpus_filebuf *buf = R_ExternalPtrAddr(sbuf); 65 | R_SetExternalPtrAddr(sbuf, NULL); 66 | filebuf_free(buf); 67 | } 68 | 69 | 70 | SEXP alloc_filebuf(SEXP sfile) 71 | { 72 | SEXP ans, sclass, shandle, snames; 73 | struct corpus_filebuf *buf; 74 | const char *file; 75 | 76 | if (!(isString(sfile) && LENGTH(sfile) == 1)) { 77 | error("invalid 'file' argument"); 78 | } 79 | 80 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0))); 81 | 82 | PROTECT(shandle = R_MakeExternalPtr(NULL, FILEBUF_TAG, R_NilValue)); 83 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE); 84 | 85 | buf = filebuf_new(file); 86 | R_SetExternalPtrAddr(shandle, buf); 87 | 88 | PROTECT(ans = allocVector(VECSXP, 2)); 89 | SET_VECTOR_ELT(ans, 0, shandle); 90 | SET_VECTOR_ELT(ans, 1, sfile); 91 | 92 | PROTECT(snames = allocVector(STRSXP, 2)); 93 | SET_STRING_ELT(snames, 0, mkChar("handle")); 94 | SET_STRING_ELT(snames, 1, mkChar("file")); 95 | setAttrib(ans, R_NamesSymbol, snames); 96 | 97 | PROTECT(sclass = allocVector(STRSXP, 1)); 98 | SET_STRING_ELT(sclass, 0, mkChar("filebuf")); 99 | setAttrib(ans, R_ClassSymbol, sclass); 100 | 101 | UNPROTECT(4); 102 | return ans; 103 | } 104 | 105 | 106 | int is_filebuf(SEXP sbuf) 107 | { 108 | SEXP handle, file; 109 | 110 | if (!isVectorList(sbuf)) { 111 | return 0; 112 | } 113 | 114 | handle = getListElement(sbuf, "handle"); 115 | if (handle == R_NilValue) { 116 | return 0; 117 | } 118 | 119 | file = getListElement(sbuf, "file"); 120 | if (file == R_NilValue) { 121 | return 0; 122 | } 123 | 124 | return ((TYPEOF(handle) == EXTPTRSXP) 125 | && (R_ExternalPtrTag(handle) == FILEBUF_TAG)); 126 | } 127 | 128 | 129 | struct corpus_filebuf *as_filebuf(SEXP sbuf) 130 | { 131 | SEXP shandle, sfile; 132 | struct corpus_filebuf *buf; 133 | const char *file; 134 | 135 | if (!is_filebuf(sbuf)) { 136 | error("invalid 'filebuf' object"); 137 | } 138 | 139 | shandle = getListElement(sbuf, "handle"); 140 | buf = R_ExternalPtrAddr(shandle); 141 | 142 | if (buf == NULL) { 143 | R_RegisterCFinalizerEx(shandle, free_filebuf, TRUE); 144 | 145 | sfile = getListElement(sbuf, "file"); 146 | file = R_ExpandFileName(CHAR(STRING_ELT(sfile, 0))); 147 | buf = filebuf_new(file); 148 | 149 | if (buf == NULL) { 150 | if (errno) { 151 | error("cannot open file '%s': %s", file, 152 | strerror(errno)); 153 | } else { 154 | error("cannot open file '%s'", file); 155 | } 156 | } 157 | 158 | R_SetExternalPtrAddr(shandle, buf); 159 | } 160 | 161 | return buf; 162 | } 163 | 
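164 | /* 165 |  * Illustrative usage sketch (an editorial addition, not part of the 166 |  * original file): how a caller exercises the external-pointer lifecycle 167 |  * defined above. alloc_filebuf() opens the file and registers the 168 |  * free_filebuf() finalizer; as_filebuf() lazily re-opens the file when 169 |  * the handle pointer is NULL, which happens after the object has been 170 |  * serialized and reloaded. The function name below is hypothetical. 171 |  */ 172 | static void filebuf_usage_sketch(SEXP sfile) 173 | { 174 | 	SEXP sbuf; 175 | 	struct corpus_filebuf *buf; 176 | 177 | 	PROTECT(sbuf = alloc_filebuf(sfile)); /* open; register finalizer */ 178 | 	buf = as_filebuf(sbuf); /* pointer still live: a simple lookup */ 179 | 	(void)buf; /* ... read from the buffer here ... */ 180 | 	UNPROTECT(1); /* free_filebuf() closes the file at GC time */ 181 | }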
-------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "rcorpus.h" 22 | 23 | #define CALLDEF(name, n) {#name, (DL_FUNC) &name, n} 24 | 25 | static const R_CallMethodDef CallEntries[] = { 26 | CALLDEF(abbreviations, 1), 27 | CALLDEF(alloc_text_handle, 0), 28 | CALLDEF(anyNA_text, 1), 29 | CALLDEF(as_character_json, 1), 30 | CALLDEF(as_character_text, 1), 31 | CALLDEF(as_integer_json, 1), 32 | CALLDEF(as_double_json, 1), 33 | CALLDEF(as_list_json, 1), 34 | CALLDEF(as_logical_json, 1), 35 | CALLDEF(as_text_character, 2), 36 | CALLDEF(as_text_filter_connector, 1), 37 | CALLDEF(as_text_json, 2), 38 | CALLDEF(dim_json, 1), 39 | CALLDEF(is_na_text, 1), 40 | CALLDEF(length_json, 1), 41 | CALLDEF(length_text, 1), 42 | CALLDEF(logging_off, 0), 43 | CALLDEF(logging_on, 0), 44 | CALLDEF(mmap_ndjson, 2), 45 | CALLDEF(names_json, 1), 46 | CALLDEF(names_text, 1), 47 | CALLDEF(print_json, 1), 48 | CALLDEF(read_ndjson, 2), 49 | CALLDEF(simplify_json, 1), 50 | CALLDEF(stem_snowball, 2), 51 | CALLDEF(stopwords, 1), 52 | CALLDEF(subscript_json, 2), 53 | CALLDEF(subset_json, 3), 54 | CALLDEF(term_stats, 7), 55 | CALLDEF(term_matrix, 4), 56 | CALLDEF(text_c, 3), 57 | CALLDEF(text_count, 2), 58 | CALLDEF(text_detect, 2), 59 | CALLDEF(text_locate, 2), 60 | CALLDEF(text_match, 2), 61 | CALLDEF(text_nsentence, 1), 62 | CALLDEF(text_ntoken, 1), 63 | CALLDEF(text_ntype, 2), 64 | CALLDEF(text_split_sentences, 2), 65 | CALLDEF(text_split_tokens, 2), 66 | CALLDEF(text_sub, 3), 67 | CALLDEF(text_trunc, 3), 68 | CALLDEF(text_tokens, 1), 69 | CALLDEF(text_types, 2), 70 | CALLDEF(text_valid, 1), 71 | {NULL, NULL, 0} 72 | }; 73 | 74 | 75 | void R_init_corpus(DllInfo *dll) 76 | { 77 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 78 | R_useDynamicSymbols(dll, FALSE); 79 | R_forceSymbols(dll, TRUE); 80 | } 81 | -------------------------------------------------------------------------------- /src/logging.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | #include "corpus/src/error.h" 19 | 20 | 21 | static void ignore_message(int code, const char *message) 22 | { 23 | (void)code; 24 | (void)message; 25 | } 26 | 27 | 28 | SEXP logging_off(void) 29 | { 30 | corpus_log_func = ignore_message; 31 | return R_NilValue; 32 | } 33 | 34 | 35 | SEXP logging_on(void) 36 | { 37 | corpus_log_func = NULL; 38 | return R_NilValue; 39 | } 40 | -------------------------------------------------------------------------------- /src/mkchar.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | 21 | static void mkchar_ensure(struct mkchar *mk, int nmin); 22 | 23 | 24 | void mkchar_init(struct mkchar *mk) 25 | { 26 | mk->buf = NULL; 27 | mk->size = 0; 28 | } 29 | 30 | 31 | SEXP mkchar_get(struct mkchar *mk, const struct utf8lite_text *text) 32 | { 33 | SEXP ans; 34 | uint8_t *ptr; 35 | size_t len = UTF8LITE_TEXT_SIZE(text); 36 | struct utf8lite_text_iter it; 37 | 38 | if (len > INT_MAX) { 39 | error("character string length exceeds maximum (%d)", INT_MAX); 40 | } 41 | 42 | if (text->ptr == NULL) { 43 | ans = NA_STRING; 44 | } else { 45 | if (UTF8LITE_TEXT_HAS_ESC(text)) { 46 | mkchar_ensure(mk, (int)len); 47 | 48 | utf8lite_text_iter_make(&it, text); 49 | ptr = mk->buf; 50 | while (utf8lite_text_iter_advance(&it)) { 51 | utf8lite_encode_utf8(it.current, &ptr); 52 | } 53 | len = (size_t)(ptr - mk->buf); 54 | ptr = mk->buf; 55 | } else { 56 | ptr = (uint8_t *)text->ptr; 57 | } 58 | 59 | ans = mkCharLenCE((char *)ptr, (int)len, CE_UTF8); 60 | } 61 | 62 | return ans; 63 | } 64 | 65 | 66 | static void mkchar_ensure(struct mkchar *mk, int nmin) 67 | { 68 | int size = mk->size; 69 | 70 | if (nmin <= size) { 71 | return; 72 | } 73 | 74 | corpus_array_size_add(&size, 1, 0, nmin); // can't overflow 75 | mk->buf = (void *)R_alloc(size, sizeof(uint8_t)); 76 | mk->size = size; 77 | } 78 | -------------------------------------------------------------------------------- /src/ndjson.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | 21 | SEXP mmap_ndjson(SEXP sfile, SEXP stext) 22 | { 23 | SEXP ans, sbuf; 24 | 25 | PROTECT(sbuf = alloc_filebuf(sfile)); 26 | PROTECT(ans = alloc_json(sbuf, R_NilValue, R_NilValue, stext)); 27 | as_json(ans); // force data load 28 | UNPROTECT(2); 29 | 30 | return ans; 31 | } 32 | 33 | 34 | SEXP read_ndjson(SEXP sbuffer, SEXP stext) 35 | { 36 | SEXP ans; 37 | 38 | assert(TYPEOF(sbuffer) == RAWSXP); 39 | 40 | PROTECT(ans = alloc_json(sbuffer, R_NilValue, R_NilValue, stext)); 41 | as_json(ans); // force data load 42 | UNPROTECT(1); 43 | 44 | return ans; 45 | } 46 | -------------------------------------------------------------------------------- /src/search.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include "rcorpus.h" 19 | 20 | #define SEARCH_TAG install("corpus::search") 21 | 22 | 23 | static struct corpus_search *search_new(void); 24 | 25 | struct corpus_search *search_new(void) 26 | { 27 | struct corpus_search *obj; 28 | int err; 29 | 30 | TRY_ALLOC(obj = corpus_calloc(1, sizeof(*obj))); 31 | TRY(corpus_search_init(obj)); 32 | 33 | err = 0; 34 | out: 35 | if (err) { 36 | corpus_free(obj); 37 | Rf_error("memory allocation failure"); 38 | } 39 | 40 | return obj; 41 | } 42 | 43 | 44 | void corpus_search_free(struct corpus_search *obj) 45 | { 46 | if (!obj) { 47 | return; 48 | } 49 | 50 | corpus_search_destroy(obj); 51 | corpus_free(obj); 52 | } 53 | 54 | 55 | static void free_search(SEXP obj) 56 | { 57 | struct corpus_search *search = R_ExternalPtrAddr(obj); 58 | corpus_search_free(search); 59 | R_ClearExternalPtr(obj); 60 | } 61 | 62 | 63 | int is_search(SEXP ssearch) 64 | { 65 | return ((TYPEOF(ssearch) == EXTPTRSXP) 66 | && (R_ExternalPtrTag(ssearch) == SEARCH_TAG)); 67 | } 68 | 69 | 70 | struct corpus_search *as_search(SEXP ssearch) 71 | { 72 | if (!is_search(ssearch)) { 73 | Rf_error("invalid 'search' object"); 74 | } 75 | return R_ExternalPtrAddr(ssearch); 76 | } 77 | 78 | 79 | SEXP alloc_search(SEXP sterms, const char *name, struct corpus_filter *filter) 80 | { 81 | SEXP ans, sset, items; 82 | const struct corpus_termset_term *term; 83 | struct corpus_search *obj; 84 | struct termset *termset; 85 | int i, n; 86 | int err = 0, nprot; 87 | 88 | nprot = 0; 89 | 90 | obj = search_new(); 91 | PROTECT(ans = R_MakeExternalPtr(obj, SEARCH_TAG, R_NilValue)); nprot++; 92 | R_RegisterCFinalizerEx(ans, free_search, TRUE); 93 | 94 | PROTECT(sset = alloc_termset(sterms, name, filter, 1)); nprot++; 95 | termset = as_termset(sset); 96 | items = items_termset(sset); 97 | R_SetExternalPtrProtected(ans, items); 98 | 99 | n = termset->nitem; 100 | for (i = 0; i < n; i++) { 101 | RCORPUS_CHECK_INTERRUPT(i); 102 | term = &termset->set.items[i]; 103 | TRY(corpus_search_add(obj, term->type_ids, 104 | term->length, NULL)); 105 | } 106 | 107 | 
out: 108 | CHECK_ERROR(err); 109 | UNPROTECT(nprot); 110 | return ans; 111 | } 112 | 113 | 114 | SEXP items_search(SEXP ssearch) 115 | { 116 | return R_ExternalPtrProtected(ssearch); 117 | } 118 | -------------------------------------------------------------------------------- /src/text_methods.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "rcorpus.h" 23 | 24 | 25 | SEXP names_text(SEXP text) 26 | { 27 | if (!is_text(text)) { 28 | error("invalid text object"); 29 | } 30 | return getListElement(text, "names"); 31 | } 32 | 33 | 34 | SEXP filter_text(SEXP text) 35 | { 36 | if (!is_text(text)) { 37 | error("invalid text object"); 38 | } 39 | return getListElement(text, "filter"); 40 | } 41 | 42 | 43 | SEXP length_text(SEXP stext) 44 | { 45 | R_xlen_t len; 46 | as_text(stext, &len); 47 | return ScalarReal((double)len); 48 | } 49 | 50 | 51 | SEXP is_na_text(SEXP stext) 52 | { 53 | SEXP ans; 54 | struct utf8lite_text *text; 55 | R_xlen_t i, n; 56 | int *isna; 57 | 58 | text = as_text(stext, &n); 59 | PROTECT(ans = allocVector(LGLSXP, n)); 60 | isna = LOGICAL(ans); 61 | 62 | for (i = 0; i < n; i++) { 63 | RCORPUS_CHECK_INTERRUPT(i); 64 | 65 | if (text[i].ptr) { 66 | isna[i] = FALSE; 67 | } else { 68 | isna[i] = TRUE; 69 | } 70 | } 71 | 72 | UNPROTECT(1); 73 | return ans; 74 | } 75 | 76 | 77 | SEXP anyNA_text(SEXP stext) 78 | { 79 | struct utf8lite_text *text; 80 | R_xlen_t i, n; 81 | int anyNA; 82 | 83 | text = as_text(stext, &n); 84 | 85 | anyNA = FALSE; 86 | for (i = 0; i < n; i++) { 87 | RCORPUS_CHECK_INTERRUPT(i); 88 | 89 | if (!text[i].ptr) { 90 | anyNA = TRUE; 91 | break; 92 | } 93 | } 94 | 95 | return ScalarLogical(anyNA); 96 | } 97 | -------------------------------------------------------------------------------- /src/text_nunit.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include "rcorpus.h" 21 | 22 | 23 | SEXP text_nsentence(SEXP sx) 24 | { 25 | SEXP ans, names; 26 | struct corpus_sentfilter *filter; 27 | const struct utf8lite_text *text; 28 | double *count; 29 | R_xlen_t i, n, nunit; 30 | int nprot, err = 0; 31 | 32 | nprot = 0; 33 | 34 | // x 35 | PROTECT(sx = coerce_text(sx)); nprot++; 36 | text = as_text(sx, &n); 37 | filter = text_sentfilter(sx); 38 | names = names_text(sx); 39 | 40 | PROTECT(ans = allocVector(REALSXP, n)); nprot++; 41 | setAttrib(ans, R_NamesSymbol, names); 42 | count = REAL(ans); 43 | 44 | for (i = 0; i < n; i++) { 45 | RCORPUS_CHECK_INTERRUPT(i); 46 | 47 | if (!text[i].ptr) { // missing value 48 | count[i] = NA_REAL; 49 | continue; 50 | } 51 | 52 | if (UTF8LITE_TEXT_SIZE(&text[i]) == 0) { // empty text 53 | count[i] = 0; 54 | continue; 55 | } 56 | 57 | TRY(corpus_sentfilter_start(filter, &text[i])); 58 | 59 | nunit = 0; 60 | while (corpus_sentfilter_advance(filter)) { 61 | nunit++; 62 | } 63 | TRY(filter->error); 64 | 65 | count[i] = (double)nunit; 66 | } 67 | 68 | out: 69 | CHECK_ERROR(err); 70 | UNPROTECT(nprot); 71 | return ans; 72 | } 73 | 74 | 75 | SEXP text_ntoken(SEXP sx) 76 | { 77 | SEXP ans, names; 78 | struct corpus_filter *filter; 79 | const struct utf8lite_text *text; 80 | double *count; 81 | R_xlen_t i, n, nunit; 82 | int nprot, err = 0; 83 | 84 | nprot = 0; 85 | 86 | PROTECT(sx = coerce_text(sx)); nprot++; 87 | text = as_text(sx, &n); 88 | names = names_text(sx); 89 | filter = text_filter(sx); 90 | 91 | PROTECT(ans = allocVector(REALSXP, n)); nprot++; 92 | setAttrib(ans, R_NamesSymbol, names); 93 | count = REAL(ans); 94 | 95 | for (i = 0; i < n; i++) { 96 | RCORPUS_CHECK_INTERRUPT(i); 97 | 98 | if (!text[i].ptr) { // missing text 99 | count[i] = NA_REAL; 100 | continue; 101 | } 102 | 103 | TRY(corpus_filter_start(filter, &text[i])); 104 | 105 | nunit = 0; 106 | 107 | while (corpus_filter_advance(filter)) { 108 | if (filter->type_id < 0) { 109 | continue; 110 | } 111 | nunit++; 112 | } 113 | TRY(filter->error); 114 | 115 | count[i] = (double)nunit; 116 | } 117 | 118 | out: 119 | UNPROTECT(nprot); 120 | CHECK_ERROR(err); 121 | return ans; 122 | } 123 | -------------------------------------------------------------------------------- /src/text_sub.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | 20 | static int text_len(const struct utf8lite_text *text, 21 | struct corpus_filter *filter) 22 | { 23 | int err = 0, len = 0; 24 | 25 | TRY(corpus_filter_start(filter, text)); 26 | while (corpus_filter_advance(filter)) { 27 | if (filter->type_id == CORPUS_TYPE_NONE) { 28 | continue; 29 | } 30 | len++; 31 | } 32 | TRY(filter->error); 33 | out: 34 | CHECK_ERROR(err); 35 | return len; 36 | } 37 | 38 | 39 | SEXP text_sub(SEXP sx, SEXP sstart, SEXP send) 40 | { 41 | SEXP ans, sources, table, tsource, trow, tstart, tstop, names, sfilter; 42 | const struct utf8lite_text *text; 43 | const uint8_t *base, *ptr; 44 | struct corpus_filter *filter; 45 | const int *start, *end; 46 | R_xlen_t i, n, nstart, nend; 47 | int err = 0, nprot = 0, s, e, j, m; 48 | 49 | text = as_text(sx, &n); 50 | filter = text_filter(sx); 51 | sources = getListElement(sx, "sources"); 52 | table = getListElement(sx, "table"); 53 | tsource = getListElement(table, "source"); 54 | trow = getListElement(table, "row"); 55 | tstart = getListElement(table, "start"); 56 | tstop = getListElement(table, "stop"); 57 | names = names_text(sx); 58 | sfilter = filter_text(sx); 59 | 60 | PROTECT(tstart = duplicate(tstart)); nprot++; 61 | PROTECT(tstop = duplicate(tstop)); nprot++; 62 | 63 | start = INTEGER(sstart); 64 | nstart = XLENGTH(sstart); 65 | 66 | end = INTEGER(send); 67 | nend = XLENGTH(send); 68 | 69 | for (i = 0; i < n; i++) { 70 | RCORPUS_CHECK_INTERRUPT(i); 71 | 72 | s = start[i % nstart]; 73 | e = end[i % nend]; 74 | 75 | // handle missing text, missing endpoints 76 | if (!text[i].ptr || s == NA_INTEGER || e == NA_INTEGER) { 77 | INTEGER(tstart)[i] = NA_INTEGER; 78 | INTEGER(tstop)[i] = NA_INTEGER; 79 | continue; 80 | } 81 | 82 | // convert negative indices to non-negative, 83 | // except for end = -1 84 | if (s < 0 || e < -1) { 85 | m = text_len(&text[i], filter); 86 | 87 | if (s < 0) { 88 | s = s + m + 1; 89 | if (s < 0) { 90 | s = 0; 91 | } 92 | } 93 | 94 | if (e < -1) { 95 | e = e + m + 1; 96 | if (e < 0) { 97 | e = 0; 98 | } 99 | } 100 | } 101 | 102 | // clip start to [1,Inf) 103 | if (s == 0) { 104 | s = 1; 105 | } 106 | 107 | base = text[i].ptr - (INTEGER(tstart)[i] - 1); 108 | 109 | // find start 110 | j = 0; 111 | TRY(corpus_filter_start(filter, &text[i])); 112 | while (j != s && corpus_filter_advance(filter)) { 113 | if (filter->type_id == CORPUS_TYPE_NONE) { 114 | // skip ignored 115 | continue; 116 | } 117 | j++; 118 | } 119 | TRY(filter->error); 120 | 121 | // handle case when start is after end of text 122 | if (j < s) { 123 | INTEGER(tstart)[i] = INTEGER(tstop)[i] + 1; 124 | continue; 125 | } 126 | 127 | // set subsequence start 128 | ptr = filter->current.ptr; 129 | INTEGER(tstart)[i] = (int)(ptr - base) + 1; 130 | 131 | // handle case when end is the last token 132 | if (e == -1) { 133 | continue; 134 | } 135 | 136 | // find end 137 | while (j != e + 1 && corpus_filter_advance(filter)) { 138 | if (filter->type_id == CORPUS_TYPE_NONE) { 139 | // skip ignored 140 | continue; 141 | } 142 | j++; 143 | } 144 | TRY(filter->error); 145 | 146 | // handle case when end is after end of text 147 | if (j < e + 1) { 148 | continue; 149 | } 150 | 151 | // set subsequence end 152 | ptr = filter->current.ptr; 153 | INTEGER(tstop)[i] = (int)(ptr - base); 154 | } 155 | 156 | PROTECT(ans = alloc_text(sources, tsource, trow, tstart, tstop, 157 | names, sfilter)); 158 | nprot++; 159 | 160 | out: 161 | UNPROTECT(nprot); 162 | CHECK_ERROR(err); 163 | return ans; 164 | } 165 | 
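The routine above backs the exported `text_sub()` R function (documented in man/text_sub.Rd). It never copies string data: it duplicates the text table's start/stop offset columns, walks token boundaries with the same corpus_filter used for tokenization, and rewrites the offsets so the result points back into the original sources. A minimal R-level sketch of the indexing semantics implemented here, assuming the documented `text_sub(x, start, end)` interface; outputs shown are illustrative:

    library(corpus)
    x <- as_corpus_text("one two three four")
    text_sub(x, 2, 3)    # tokens 2..3 -> "two three"
    text_sub(x, -2, -1)  # negative indices count back from the last token -> "three four"
    text_sub(x, 2)       # end = -1 (the default) means through the last token

Out-of-range starts are handled by the `j < s` branch above, which yields an empty subsequence rather than an error.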
-------------------------------------------------------------------------------- /src/text_trunc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | static SEXP trunc_left(struct mkchar *mk, const struct utf8lite_text *text, 20 | int chars) 21 | { 22 | struct utf8lite_graphscan scan; 23 | struct utf8lite_text sub; 24 | int err = 0, width = 0, w; 25 | 26 | sub.ptr = text->ptr; 27 | sub.attr = UTF8LITE_TEXT_BITS(text); 28 | 29 | utf8lite_graphscan_make(&scan, text); 30 | while (utf8lite_graphscan_advance(&scan)) { 31 | TRY(utf8lite_graph_measure(&scan.current, 0, &w)); 32 | if (w > 0) { 33 | if (width > chars - w) { 34 | break; 35 | } 36 | width += w; 37 | } 38 | } 39 | sub.attr |= (size_t)(scan.ptr - text->ptr); 40 | out: 41 | CHECK_ERROR(err); 42 | return mkchar_get(mk, &sub); 43 | } 44 | 45 | 46 | static SEXP trunc_right(struct mkchar *mk, const struct utf8lite_text *text, 47 | int chars) 48 | { 49 | struct utf8lite_graphscan scan; 50 | struct utf8lite_text sub; 51 | const uint8_t *end; 52 | int err = 0, width = 0, w; 53 | 54 | sub.ptr = NULL; 55 | sub.attr = UTF8LITE_TEXT_BITS(text); 56 | end = text->ptr + UTF8LITE_TEXT_SIZE(text); 57 | 58 | utf8lite_graphscan_make(&scan, text); 59 | utf8lite_graphscan_skip(&scan); 60 | while (utf8lite_graphscan_retreat(&scan)) { 61 | TRY(utf8lite_graph_measure(&scan.current, 0, &w)); 62 | if (w > 0) { 63 | if (width > chars - w) { 64 | break; 65 | } 66 | width += w; 67 | } 68 | } 69 | utf8lite_graphscan_retreat(&scan); 70 | sub.ptr = (uint8_t *)scan.ptr; 71 | sub.attr |= (size_t)(end - sub.ptr); 72 | out: 73 | CHECK_ERROR(err); 74 | return mkchar_get(mk, &sub); 75 | } 76 | 77 | 78 | SEXP text_trunc(SEXP sx, SEXP schars, SEXP sright) 79 | { 80 | SEXP ans, names, elt; 81 | struct mkchar mk; 82 | const struct utf8lite_text *text; 83 | R_xlen_t i, n; 84 | int nprot = 0, chars, right; 85 | 86 | text = as_text(sx, &n); 87 | chars = INTEGER(schars)[0]; 88 | right = LOGICAL(sright)[0] == TRUE; 89 | mkchar_init(&mk); 90 | 91 | PROTECT(ans = allocVector(STRSXP, n)); nprot++; 92 | PROTECT(names = names_text(sx)); nprot++; 93 | setAttrib(ans, R_NamesSymbol, names); 94 | 95 | for (i = 0; i < n; i++) { 96 | RCORPUS_CHECK_INTERRUPT(i); 97 | 98 | if (!text[i].ptr) { 99 | elt = NA_STRING; 100 | } else if (right) { 101 | elt = trunc_right(&mk, &text[i], chars); 102 | } else { 103 | elt = trunc_left(&mk, &text[i], chars); 104 | } 105 | SET_STRING_ELT(ans, i, elt); 106 | } 107 | 108 | UNPROTECT(nprot); 109 | return ans; 110 | } 111 | -------------------------------------------------------------------------------- /src/util.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <inttypes.h> 18 | #include <stdint.h> 19 | #include <string.h> 20 | #include <Rdefines.h> 21 | #include "rcorpus.h" 22 | 23 | 24 | /* based on R-Exts Section 5.9.6 "handling lists" */ 25 | int findListElement(SEXP list, const char *str) 26 | { 27 | SEXP names; 28 | int i, n; 29 | int nprot = 0; 30 | int ans = -1; 31 | 32 | if (list == R_NilValue) { 33 | goto out; 34 | } 35 | 36 | PROTECT(names = getAttrib(list, R_NamesSymbol)); nprot++; 37 | if (names == R_NilValue) { 38 | goto out; 39 | } 40 | 41 | n = LENGTH(list); 42 | for (i = 0; i < n; i++) { 43 | if (strcmp(CHAR(STRING_ELT(names, i)), str) == 0) { 44 | ans = i; 45 | goto out; 46 | } 47 | } 48 | out: 49 | UNPROTECT(nprot); 50 | return ans; 51 | } 52 | 53 | 54 | SEXP getListElement(SEXP list, const char *str) 55 | { 56 | int i = findListElement(list, str); 57 | if (i < 0) { 58 | return R_NilValue; 59 | } 60 | return VECTOR_ELT(list, i); 61 | } 62 | 63 | 64 | double *as_weights(SEXP sweights, R_xlen_t n) 65 | { 66 | R_xlen_t n0; 67 | 68 | if (sweights == R_NilValue) { 69 | return NULL; 70 | } 71 | 72 | n0 = XLENGTH(sweights); 73 | if (n0 != n) { 74 | error("invalid 'weights' vector;" 75 | " length is %"PRIu64" but should be %"PRIu64, 76 | (uint64_t)n0, (uint64_t)n); 77 | } 78 | 79 | return REAL(sweights); 80 | } 81 | -------------------------------------------------------------------------------- /src/wordlist.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Patrick O. Perry. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "rcorpus.h" 18 | 19 | 20 | static SEXP wordlist(const uint8_t **(*callback)(const char *, int *), 21 | SEXP skind) 22 | { 23 | SEXP ans; 24 | const char **strs; 25 | const char *kind; 26 | int i, n; 27 | 28 | if (skind == R_NilValue) { 29 | return R_NilValue; 30 | } 31 | 32 | PROTECT(skind = coerceVector(skind, STRSXP)); 33 | if (STRING_ELT(skind, 0) == NA_STRING) { 34 | UNPROTECT(1); 35 | return R_NilValue; 36 | } 37 | 38 | // assume utf8 encoding 39 | kind = CHAR(STRING_ELT(skind, 0)); 40 | strs = (const char **)callback(kind, &n); 41 | 42 | if (!strs) { 43 | error("unknown kind (\"%s\")", kind); 44 | } 45 | 46 | PROTECT(ans = allocVector(STRSXP, n)); 47 | for (i = 0; i < n; i++) { 48 | RCORPUS_CHECK_INTERRUPT(i); 49 | SET_STRING_ELT(ans, i, mkCharCE(strs[i], CE_UTF8)); 50 | } 51 | 52 | UNPROTECT(2); 53 | return ans; 54 | } 55 | 56 | 57 | SEXP abbreviations(SEXP skind) 58 | { 59 | return wordlist(corpus_sentsuppress_list, skind); 60 | } 61 | 62 | 63 | SEXP stopwords(SEXP skind) 64 | { 65 | return wordlist(corpus_stopword_list, skind); 66 | } 67 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(corpus) 3 | 4 | test_check("corpus") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-capture_output.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | if (utils::packageVersion("testthat") <= "1.0.2") { 4 | capture_output <- function(code, print = FALSE, width = 80) { 5 | oldwidth <- getOption("width") 6 | if (width != oldwidth) { 7 | options(width = width) 8 | on.exit(options(width = oldwidth), add = TRUE) 9 | } 10 | testthat::capture_output(code, print) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/testthat/helper-locale.R: -------------------------------------------------------------------------------- 1 | switch_ctype <- function(mode = c("C", "UTF-8")) 2 | { 3 | mode <- match.arg(mode) 4 | 5 | if (mode == "UTF-8") { 6 | sysname <- Sys.info()[["sysname"]] 7 | if (sysname == "Windows") { 8 | ctype <- "English_United States.1252" 9 | } else if (sysname == "Darwin") { 10 | ctype <- "UTF-8" 11 | } else { 12 | ctype <- "en_US.utf8" 13 | } 14 | } else { 15 | ctype <- "C" 16 | } 17 | 18 | ctype0 <- Sys.getlocale("LC_CTYPE") 19 | suppressWarnings({ 20 | Sys.setlocale("LC_CTYPE", ctype) 21 | }) 22 | if (Sys.getlocale("LC_CTYPE") != ctype) { 23 | skip(paste0("Cannot change locale to '", ctype, "'")) 24 | } 25 | if (mode == "UTF-8" && !utf8::output_utf8()) { 26 | skip("Cannot change to UTF-8 output") 27 | } 28 | 29 | ctype0 30 | } 31 | -------------------------------------------------------------------------------- /tests/testthat/helper-options.R: -------------------------------------------------------------------------------- 1 | 2 | options(encoding = "UTF-8") 3 | -------------------------------------------------------------------------------- /tests/testthat/test-frame-stats.R: -------------------------------------------------------------------------------- 1 | context("frame-stats") 2 | 3 | 4 | test_that("'na.fail' works", { 5 | data <- corpus_frame(x = 26:1, text = letters) 6 | expect_equal(na.fail(data), data) 7 | 8 | data <- corpus_frame(x = c(NA, 26:2), text = letters) 9 | expect_error(na.fail(data), "missing values in object") 10 | 11 | data 
<- corpus_frame(x = 1:26, text = c(NA, letters[-1])) 12 | expect_error(na.fail(data), "missing values in object") 13 | }) 14 | 15 | 16 | test_that("'na.omit' works", { 17 | data <- corpus_frame(text = c(NA, "a", "b", NA, "c"), x = 2:6) 18 | actual <- na.omit(data) 19 | expected <- corpus_frame(text = c("a", "b", "c"), x = c(3, 4, 6), 20 | row.names = c(2L, 3L, 5L)) 21 | omit <- c("1" = 1L, "4" = 4L) 22 | attr(omit, "class") <- "omit" 23 | attr(expected, "na.action") <- omit 24 | expect_equal(actual, expected) 25 | 26 | expect_equal(na.omit(corpus_frame(x = 1:26, text = letters)), 27 | corpus_frame(x = 1:26, text = letters)) 28 | }) 29 | 30 | 31 | test_that("'na.exclude' works", { 32 | data <- corpus_frame(text = letters[1:5], x = c(4, 3, NA, 1, -7), 33 | row.names = c("A", "B", "C", "D", "E")) 34 | actual <- na.exclude(data) 35 | expected <- corpus_frame(text = c("a", "b", "d", "e"), 36 | x = c(4, 3, 1, -7), 37 | row.names = c("A", "B", "D", "E")) 38 | exclude <- c("C" = 3L) 39 | attr(exclude, "class") <- "exclude" 40 | attr(expected, "na.action") <- exclude 41 | expect_equal(actual, expected) 42 | 43 | expect_equal(na.exclude(corpus_frame(x = 1:26, text = letters)), 44 | corpus_frame(x = 1:26, text = letters)) 45 | }) 46 | -------------------------------------------------------------------------------- /tests/testthat/test-gutenberg_corpus.R: -------------------------------------------------------------------------------- 1 | context("gutenberg_corpus") 2 | 3 | test_that("'gutenberg_corpus' can download Jules Verne in French", { 4 | if (!identical(Sys.getenv("TEST_WEB_RESOURCES"), "true")) { 5 | skip("Not running web resource tests") 6 | } 7 | 8 | data <- gutenberg_corpus(800, verbose = FALSE) 9 | expect_equal(data$title, "Le Tour du Monde en 80 Jours") 10 | expect_equal(data$author, "Jules Verne") 11 | expect_equal(data$language, "French") 12 | expect_equal(nchar(as.character(data$text)), 421335) 13 | }) 14 | 15 | 16 | test_that("'gutenberg_corpus' can handle NA", { 17 | data <- gutenberg_corpus(NA) 18 | expect_equal(data, corpus_frame(title = NA_character_, 19 | author = NA_character_, 20 | language = NA_character_, 21 | text = NA_character_)) 22 | }) 23 | -------------------------------------------------------------------------------- /tests/testthat/test-json_serialize.R: -------------------------------------------------------------------------------- 1 | context("json_serialize") 2 | 3 | 4 | test_that("serializing json works", { 5 | x <- c("S", "P", "Q", "R") 6 | file <- tempfile() 7 | writeLines(paste0('"', x, '"'), file) 8 | ds <- read_ndjson(file, simplify=FALSE) 9 | 10 | file2 <- tempfile() 11 | saveRDS(ds, file2) 12 | ds2 <- readRDS(file2) 13 | 14 | expect_equal(as.character(ds), as.character(ds2)) 15 | }) 16 | 17 | 18 | test_that("serializing mmapped json works", { 19 | x <- c("S", "P", "Q", "R") 20 | file <- tempfile() 21 | writeLines(paste0('"', x, '"'), file) 22 | ds <- read_ndjson(file, mmap=TRUE, simplify=FALSE) 23 | 24 | file2 <- tempfile() 25 | saveRDS(ds, file2) 26 | ds2 <- readRDS(file2) 27 | 28 | expect_equal(as.character(ds), as.character(ds2)) 29 | }) 30 | 31 | 32 | test_that("serializing mmapped json should use relative, not absolute path", { 33 | wd <- getwd() 34 | on.exit(setwd(wd)) 35 | 36 | x <- c("S", "P", "Q", "R") 37 | 38 | # create and change directory to dir/a 39 | dir <- tempfile() 40 | dir.create(dir) 41 | setwd(dir) 42 | dir.create("a") 43 | setwd("a") 44 | 45 | # save dir/a/data.json 46 | # save dir/a/obj.rds 47 | writeLines(paste0('{"x": "', x, '"}'), 
"data.json") 48 | ds <- read_ndjson("data.json", mmap=TRUE, simplify=FALSE) 49 | saveRDS(ds, "obj.rds") 50 | 51 | # move the files to 52 | # dir/data.json 53 | # dir/obj.rds 54 | file.rename(file.path(dir, "a", "data.json"), file.path(dir, "data.json")) 55 | file.rename(file.path(dir, "a", "obj.rds"), file.path(dir, "obj.rds")) 56 | 57 | # set the working directory to dir 58 | setwd(dir) 59 | unlink(file.path(dir, "a"), recursive=TRUE) 60 | 61 | # read obj.rds 62 | ds2 <- readRDS("obj.rds") 63 | expect_equal(as.character(ds2$x), x) 64 | }) 65 | 66 | 67 | test_that("serializing json subset works", { 68 | x <- LETTERS 69 | file <- tempfile() 70 | writeLines(paste0('"', x, '"'), file) 71 | ds <- read_ndjson(file, simplify=FALSE) 72 | 73 | i <- seq(2, 26, 2) 74 | ds <- ds[i] 75 | 76 | file2 <- tempfile() 77 | saveRDS(ds, file2) 78 | ds2 <- readRDS(file2) 79 | 80 | expect_equal(as.character(ds), as.character(ds2)) 81 | }) 82 | 83 | 84 | test_that("serializing json field works", { 85 | x <- LETTERS 86 | y <- 3.14 * seq_along(LETTERS) - 10 87 | file <- tempfile() 88 | writeLines(paste0('{"x": "', x, '", "z": { "y": ', y, "} }"), file) 89 | ds <- read_ndjson(file, simplify=FALSE) 90 | 91 | ds <- ds$z 92 | 93 | file2 <- tempfile() 94 | saveRDS(ds, file2) 95 | ds2 <- readRDS(file2) 96 | 97 | expect_equal(as.numeric(ds$y), as.numeric(ds2$y)) 98 | }) 99 | 100 | 101 | test_that("serializing json nested fields works", { 102 | x <- 1:10 103 | file <- tempfile() 104 | writeLines(paste0('{"f1": {"f2": {"f3": {"x": ', x, '}}}}'), file) 105 | ds <- read_ndjson(file, simplify=FALSE) 106 | 107 | ds <- ds$f1$f2$f3 108 | 109 | file2 <- tempfile() 110 | saveRDS(ds, file2) 111 | ds2 <- readRDS(file2) 112 | 113 | expect_equal(as.integer(ds$x), as.numeric(ds2$x)) 114 | }) 115 | 116 | 117 | test_that("serializing json field subset works", { 118 | x <- LETTERS 119 | y <- 3.14 * seq_along(LETTERS) - 10 120 | file <- tempfile() 121 | writeLines(paste0('{"z": {"x": "', x, '"}, "y": ', y, "}"), file) 122 | ds <- read_ndjson(file, simplify = FALSE) 123 | 124 | i <- c(20, 2, 9, 4, 6, 2) 125 | ds <- ds[i, "z"] 126 | 127 | file2 <- tempfile() 128 | saveRDS(ds, file2) 129 | ds2 <- readRDS(file2) 130 | 131 | expect_equal(as.character(ds$x), as.character(ds2$x)) 132 | }) 133 | -------------------------------------------------------------------------------- /tests/testthat/test-read_ndjson.R: -------------------------------------------------------------------------------- 1 | context("read_ndjson") 2 | 3 | 4 | test_that("reading a non-existent file should fail", { 5 | corpus:::logging_off() 6 | expect_error(read_ndjson("foobar", mmap=TRUE), "cannot open file 'foobar'") 7 | corpus:::logging_on() 8 | }) 9 | 10 | 11 | test_that("passing a connection when mmap is TRUE should fail", { 12 | tmp <- tempfile() 13 | writeLines(character(), tmp) 14 | file <- file(tmp) 15 | on.exit(close(file)) 16 | expect_error(read_ndjson(file, mmap = TRUE), 17 | "'file' must be a character string when 'mmap' is TRUE") 18 | }) 19 | 20 | 21 | test_that("passing a file name should succeed", { 22 | file <- tempfile() 23 | writeLines('"foo"', file) 24 | expect_equal(read_ndjson(file), "foo") 25 | }) 26 | 27 | 28 | test_that("passing a closed connection should succeed", { 29 | tmp <- tempfile() 30 | file <- file(tmp) 31 | writeLines('"foo"', file) 32 | expect_equal(read_ndjson(file), "foo") 33 | }) 34 | 35 | 36 | test_that("passing an empty file should succeed", { 37 | file <- tempfile() 38 | writeLines(character(), file) 39 | expect_equal(read_ndjson(file), 
NULL) 40 | }) 41 | 42 | 43 | test_that("passing a nonscalar should fail", { 44 | expect_error(read_ndjson(17), 45 | "'file' must be a character string or connection") 46 | }) 47 | -------------------------------------------------------------------------------- /tests/testthat/test-stemmer.R: -------------------------------------------------------------------------------- 1 | context("stemmer.R") 2 | 3 | test_that("can use a custom function as a stemmer", { 4 | x <- LETTERS 5 | stem <- function(word) "?" 6 | expect_equal(text_tokens(x, stemmer = stem), 7 | as.list(rep("?", length(x)))) 8 | }) 9 | 10 | 11 | test_that("handles stemmer logical NAs", { 12 | x <- paste(LETTERS, collapse = " ") 13 | stemmer <- 14 | function(x) { 15 | if (x %in% c("a", "e", "i", "o", "u")) { 16 | paste0(toupper(x), "*") 17 | } else { 18 | NA 19 | } 20 | } 21 | actual <- text_tokens(x, stemmer = stemmer) 22 | expected <- list(c("A*", "E*", "I*", "O*", "U*")) 23 | expect_equal(actual, expected) 24 | }) 25 | 26 | 27 | test_that("handles stemmer character NAs", { 28 | x <- paste(LETTERS, collapse = " ") 29 | stemmer <- function(x) NA_character_ 30 | actual <- text_tokens(x, stemmer = stemmer) 31 | expected <- list(character()) 32 | expect_equal(actual, expected) 33 | }) 34 | 35 | 36 | test_that("handles stemmer errors", { 37 | x <- LETTERS 38 | 39 | expect_error(text_tokens(x, stemmer = function(w) c("?", "?")), 40 | "'stemmer' returned multiple values for input \"a\"") 41 | 42 | expect_error(text_tokens(x, stemmer = function(w) character()), 43 | "'stemmer' did not return a value for input \"a\"") 44 | 45 | expect_error(text_tokens(x, stemmer = function(w) NULL), 46 | "'stemmer' did not return a value for input \"a\"") 47 | 48 | expect_error(text_tokens(x, stemmer = function(w) 7), 49 | "'stemmer' returned a non-string value for input \"a\"") 50 | }) 51 | 52 | 53 | test_that("handles internal stemmer errors", { 54 | expect_error(text_tokens("hello", stemmer = function(x) stop("what?")), 55 | "'stemmer' raised an error for input \"hello\"") 56 | }) 57 | 58 | 59 | test_that("'new_stemmer' can detect errors", { 60 | expect_error(new_stemmer(c("a", "b"), c("a")), 61 | "'term' argument length must equal 'stem' argument length") 62 | }) 63 | 64 | 65 | test_that("'new_stemmer' can handle empty inputs", { 66 | fn <- new_stemmer(NULL, NULL) 67 | expect_equal(fn("a"), "a") 68 | }) 69 | 70 | 71 | test_that("'new_stemmer' can use a default", { 72 | fn <- new_stemmer(LETTERS, letters, default = NA) 73 | expect_equal(fn("A"), "a") 74 | expect_equal(fn("AB"), NA_character_) 75 | }) 76 | 77 | 78 | test_that("'new_stemmer' can handle duplicates", { 79 | term <- c("a", "a", "b", "c", "c", "c", "d") 80 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d") 81 | 82 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = FALSE) 83 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 84 | c("a1", "a1", "b", "c1", "c1", "c1", "d")) 85 | 86 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = FALSE) 87 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 88 | c("a2", "a2", "b", "c3", "c3", "c3", "d")) 89 | 90 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = FALSE) 91 | expect_equal(sapply(term, fn, USE.NAMES = FALSE), 92 | c("a", "a", "b", "c", "c", "c", "d")) 93 | }) 94 | 95 | 96 | test_that("'new_stemmer' can vectorize ", { 97 | term <- c("a", "a", "b", "c", "c", "c", "d") 98 | stem <- c("a1", "a2", "b", "c1", "c2", "c3", "d") 99 | 100 | fn <- new_stemmer(term, stem, duplicates = "first", vectorize = TRUE) 
101 | expect_equal(fn(term), c("a1", "a1", "b", "c1", "c1", "c1", "d")) 102 | 103 | fn <- new_stemmer(term, stem, duplicates = "last", vectorize = TRUE) 104 | expect_equal(fn(term), c("a2", "a2", "b", "c3", "c3", "c3", "d")) 105 | 106 | fn <- new_stemmer(term, stem, duplicates = "omit", vectorize = TRUE) 107 | expect_equal(fn(term), c("a", "a", "b", "c", "c", "c", "d")) 108 | 109 | expect_error(new_stemmer(term, stem, duplicates = "fail", 110 | vectorize = TRUE), 111 | "'term' argument entries must be unique") 112 | }) 113 | 114 | 115 | test_that("'stem_snowball' can handle NULL algorithm", { 116 | x <- c("win", "winning", "winner", "#winning") 117 | expect_equal(stem_snowball(x, NULL), x) 118 | }) 119 | 120 | 121 | test_that("'stem_snowball' can handle NULL input", { 122 | expect_equal(stem_snowball(NULL), NULL) 123 | }) 124 | 125 | 126 | test_that("'stem_snowball' can handle stem input", { 127 | x <- c("win", "winning", "winner", "#winning") 128 | expect_equal(stem_snowball(x), 129 | c("win", "win", "winner", "#winning")) 130 | }) 131 | -------------------------------------------------------------------------------- /tests/testthat/test-term_counts.R: -------------------------------------------------------------------------------- 1 | context("term_counts") 2 | 3 | test_that("'term_counts' gives equivalent results to 'term_matrix'", { 4 | text <- c(a="A rose is a rose is a rose.", 5 | b="A Rose is red, a violet is blue!", 6 | c="A rose by any other name would smell as sweet.") 7 | x <- term_matrix(text) 8 | tf <- term_counts(text) 9 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text), 10 | j = as.integer(tf$term), 11 | x = tf$count, 12 | dimnames = list(levels(tf$text), 13 | colnames(x))) 14 | expect_equal(x, xtf) 15 | }) 16 | 17 | 18 | test_that("'term_counts' gives equivalent results to 'term_matrix' no names", { 19 | text <- c("A rose is a rose is a rose.", 20 | "A Rose is red, a violet is blue!", 21 | "A rose by any other name would smell as sweet.") 22 | x <- term_matrix(text) 23 | rownames(x) <- as.character(seq_along(text)) 24 | tf <- term_counts(text) 25 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$text), 26 | j = as.integer(tf$term), 27 | x = tf$count, 28 | dimnames = list(levels(tf$text), 29 | colnames(x))) 30 | expect_equal(x, xtf) 31 | }) 32 | 33 | 34 | test_that("'term_counts' with group gives equivalent results to 'term_matrix'", { 35 | text <- c(a="A rose is a rose is a rose.", 36 | b="A Rose is red, a violet is blue!", 37 | c="A rose by any other name would smell as sweet.") 38 | g <- factor(c("X", "Y", "X")) 39 | x <- term_matrix(text, group = g) 40 | tf <- term_counts(text, group = g) 41 | xtf <- Matrix::sparseMatrix(i = as.integer(tf$group), 42 | j = as.integer(tf$term), 43 | x = tf$count, 44 | dimnames = list(levels(tf$group), 45 | colnames(x))) 46 | expect_equal(x, xtf) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/test-text-stats.R: -------------------------------------------------------------------------------- 1 | context("text-stats") 2 | 3 | test_that("'na.fail' works", { 4 | x <- as_corpus_text(letters) 5 | expect_equal(na.fail(x), x) 6 | 7 | expect_error(na.fail(c(x, NA)), "missing values in object") 8 | }) 9 | 10 | 11 | test_that("'na.omit' works", { 12 | x <- as_corpus_text(c(NA, "a", "b", NA, "c")) 13 | actual <- na.omit(x) 14 | expected <- as_corpus_text(c("a", "b", "c")) 15 | omit <- c(1L, 4L) 16 | attr(omit, "class") <- "omit" 17 | attr(expected, "na.action") <- omit 18 | 
expect_equal(actual, expected) 19 | 20 | expect_equal(na.omit(as_corpus_text(letters)), 21 | as_corpus_text(letters)) 22 | }) 23 | 24 | 25 | test_that("'na.exclude' works", { 26 | x <- as_corpus_text(c(r = NA, s = "a", t = "b", u = NA, v = "c")) 27 | actual <- na.exclude(x) 28 | expected <- as_corpus_text(c(s = "a", t = "b", v = "c")) 29 | exclude <- c(r = 1L, u = 4L) 30 | attr(exclude, "class") <- "exclude" 31 | attr(expected, "na.action") <- exclude 32 | expect_equal(actual, expected) 33 | 34 | expect_equal(na.exclude(as_corpus_text(letters)), 35 | as_corpus_text(letters)) 36 | }) 37 | -------------------------------------------------------------------------------- /tests/testthat/test-text_c.R: -------------------------------------------------------------------------------- 1 | context("text_c") 2 | 3 | 4 | test_that("c should not drop text_filter", { 5 | x <- as_corpus_text("hello") 6 | text_filter(x) <- text_filter(map_case = FALSE) 7 | y <- as_corpus_text("world") 8 | z <- c(x, y) 9 | expect_equal(text_filter(z), text_filter(x)) 10 | }) 11 | 12 | 13 | test_that("c should work with named or unnamed args", { 14 | x <- as_corpus_text("hello") 15 | y <- as_corpus_text("world") 16 | expect_equal(c(x, y), as_corpus_text(c("hello", "world"))) 17 | expect_equal(c(a = x, b = y), as_corpus_text(c(a = "hello", b = "world"))) 18 | }) 19 | 20 | 21 | test_that("c should work with complex args", { 22 | x <- c(a=as_corpus_text(c("hello", NA, "world")), "!", c=7) 23 | expect_equal(x, as_corpus_text(c(a1 = "hello", a2 = NA, a3 = "world", 24 | "4" = "!", c = "7"))) 25 | }) 26 | 27 | 28 | test_that("c should work with a single unnamed argument", { 29 | x0 <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 30 | x <- x0 31 | text_filter(x) <- text_filter(map_case = FALSE) 32 | y <- c(x) 33 | expect_equal(y, x) 34 | }) 35 | 36 | 37 | test_that("c should work with a single named argument", { 38 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 39 | y <- c(a = x) 40 | expect_equal(y, as_corpus_text(c(a.a = "hello", a.b = "goodbye", a3 = "!"))) 41 | }) 42 | 43 | 44 | test_that("c should work with no names", { 45 | x <- as_corpus_text(c("hello", "goodbye", "!")) 46 | z <- c(x, x) 47 | expect_equal(names(z), NULL) 48 | expect_equal(as.character(z), c(as.character(x), as.character(x))) 49 | }) 50 | 51 | 52 | test_that("c should work with no arguments", { 53 | z <- c.corpus_text() 54 | expect_equal(z, as_corpus_text(c())) 55 | }) 56 | 57 | 58 | test_that("c should support use.names = FALSE", { 59 | z <- c(a=as_corpus_text("x"), y=c(z="z", "w"), use.names = FALSE) 60 | expect_equal(z, as_corpus_text(c("x", "z", "w"))) 61 | }) 62 | 63 | 64 | test_that("c should support lists with recursive = TRUE", { 65 | z <- c.corpus_text(list(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c", 66 | recursive = TRUE) 67 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c"))) 68 | }) 69 | 70 | 71 | test_that("c should support pairlists with recursive = TRUE", { 72 | z <- c.corpus_text(pairlist(x = as_corpus_text("a"), y = as_corpus_text("b")), z = "c", 73 | recursive = TRUE) 74 | expect_equal(z, as_corpus_text(c(x = "a", y = "b", z = "c"))) 75 | }) 76 | 77 | 78 | test_that("c can handle NA after named", { 79 | z <- c(as_corpus_text(c(x = "a")), NA) 80 | expect_equal(z, as_corpus_text(c(x = "a", "2" = NA))) 81 | }) 82 | 83 | 84 | test_that("c should take filter from first value", { 85 | f <- text_filter(map_case = FALSE) 86 | x <- as_corpus_text(letters, filter = f) 87 | y <- as_corpus_text(LETTERS) 88 | 
z <- c(x, y) 89 | expect_equal(z, as_corpus_text(c(letters, LETTERS), filter = f)) 90 | 91 | z2 <- c(y, x) 92 | expect_equal(z2, as_corpus_text(c(LETTERS, letters))) 93 | }) 94 | 95 | 96 | test_that("c should work with duplicate names", { 97 | x <- as_corpus_text(c(a = "hello", b = "goodbye", "!")) 98 | z <- c(x, x) 99 | expect_equal(names(z), c(names(x), paste0(names(x), ".1"))) 100 | expect_equal(as.character(z), c(as.character(x), as.character(x))) 101 | }) 102 | -------------------------------------------------------------------------------- /tests/testthat/test-text_names.R: -------------------------------------------------------------------------------- 1 | context("text_names") 2 | 3 | 4 | test_that("`names` should be NULL for new text", { 5 | x <- as_corpus_text(c("A", "B", "C")) 6 | expect_equal(names(x), NULL) 7 | 8 | expect_equal(names(as_corpus_text(character())), NULL) 9 | }) 10 | 11 | 12 | test_that("`names<-` should work on text", { 13 | x <- as_corpus_text(LETTERS) 14 | names(x) <- rev(LETTERS) 15 | expect_equal(names(x), rev(LETTERS)) 16 | }) 17 | 18 | 19 | test_that("setting `names<-` to NULL should restore defaults", { 20 | x <- as_corpus_text(c(a="x", b="y")) 21 | names(x) <- NULL 22 | expect_equal(names(x), NULL) 23 | }) 24 | 25 | 26 | test_that("`as_corpus_text` should not drop names", { 27 | x <- as_corpus_text(c(a="1", b="2")) 28 | expect_equal(names(x), c("a", "b")) 29 | }) 30 | 31 | 32 | test_that("`all.equal` should test names", { 33 | x <- as_corpus_text(1:3) 34 | y <- x 35 | names(y) <- c("a", "b", "c") 36 | expect_equal(all.equal(x, y), "names for current but not for target") 37 | expect_equal(all.equal(y, x), "names for target but not for current") 38 | }) 39 | 40 | 41 | test_that("`as_corpus_text` should not drop names", { 42 | x <- as_corpus_text(c(foo="hello")) 43 | y <- as_corpus_text(x) 44 | 45 | expect_equal(y, as_corpus_text(c(foo="hello"))) 46 | }) 47 | 48 | 49 | test_that("`as_corpus_text` should drop attributes", { 50 | x <- as_corpus_text("hello") 51 | attr(x, "foo") <- "bar" 52 | y <- as_corpus_text(x) 53 | 54 | expect_equal(y, as_corpus_text("hello")) 55 | }) 56 | 57 | 58 | test_that("`as_corpus_text` should drop attributes for JSON objects", { 59 | file <- tempfile() 60 | writeLines('{"text": "hello"}', file) 61 | x <- read_ndjson(file)$text 62 | 63 | attr(x, "foo") <- "bar" 64 | y <- as_corpus_text(x) 65 | 66 | expect_equal(y, as_corpus_text("hello")) 67 | }) 68 | 69 | 70 | test_that("`names<-` should not modify copies", { 71 | x <- as_corpus_text(1:3) 72 | y <- x 73 | names(y) <- c("a", "b", "c") 74 | expect_equal(names(x), NULL) 75 | expect_equal(names(y), c("a", "b", "c")) 76 | }) 77 | 78 | 79 | test_that("`names<-` should preserve attributes", { 80 | x <- as_corpus_text(1:3) 81 | attr(x, "foo") <- "bar" 82 | names(x) <- c("a", "b", "c") 83 | expect_equal(names(x), c("a", "b", "c")) 84 | expect_equal(attr(x, "foo"), "bar") 85 | }) 86 | 87 | 88 | test_that("`names<-` should not allow NA", { 89 | x <- as_corpus_text(1:3) 90 | expect_error(names(x) <- c("a", NA, "b"), 91 | "missing values in 'names' are not allowed") 92 | }) 93 | 94 | 95 | test_that("`names<-` should not allow duplicates", { 96 | x <- as_corpus_text(1:3) 97 | expect_error(names(x) <- c("a", "b", "a"), 98 | "duplicate 'names' are not allowed") 99 | }) 100 | 101 | 102 | test_that("names should error for non-text", { 103 | expect_error(names.corpus_text("hello"), "invalid text object") 104 | }) 105 | -------------------------------------------------------------------------------- 
/tests/testthat/test-text_nunit.R: -------------------------------------------------------------------------------- 1 | context("text_nunit") 2 | 3 | 4 | test_that("text_nsentence works on sentences", { 5 | text <- c(a="He said, 'Are you going?' John Shook his head.", 6 | b="'Are you going?' John asked", 7 | c="This. Is. A. Long. Sentence!!!", 8 | d="Why all the shouting??") 9 | n0 <- text_nsentence(text) 10 | split <- text_split(text, "sentences") 11 | n <- c(with(split, tapply(index, parent, length))) 12 | names(n) <- names(text) 13 | expect_equal(n, n0) 14 | }) 15 | 16 | 17 | test_that("text_nsentence handles NA and empty", { 18 | expect_equal(text_nsentence(c(NA, "")), c(NA, 0)) 19 | }) 20 | 21 | 22 | test_that("text_ntoken works on tokens", { 23 | text <- c(a="He said, 'Are you going?' John Shook his head.", 24 | b="'Are you going?' John asked", 25 | c="This. Is. A. Long. Sentence!!!", 26 | d="Why all the shouting??") 27 | n0 <- text_ntoken(text) 28 | split <- text_split(text, "tokens") 29 | n <- c(with(split, tapply(index, parent, length))) 30 | names(n) <- names(text) 31 | expect_equal(n, n0) 32 | }) 33 | 34 | 35 | test_that("text_ntoken handles NA and empty", { 36 | expect_equal(text_ntoken(c(NA, "")), c(NA, 0)) 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/test-text_primitive.R: -------------------------------------------------------------------------------- 1 | context("text_primitive") 2 | 3 | 4 | test_that("anyNA should work", { 5 | x <- as_corpus_text(c("a", NA, "", "b")) 6 | y <- as_corpus_text(c()) 7 | z <- as_corpus_text(letters) 8 | 9 | expect_true(anyNA(x)) 10 | expect_false(anyNA(y)) 11 | expect_false(anyNA(z)) 12 | }) 13 | 14 | 15 | test_that("converting to character should work", { 16 | x <- c("hello", NA, "world", "") 17 | y <- as_corpus_text(x) 18 | expect_equal(as.character(y), x) 19 | }) 20 | 21 | 22 | test_that("conversions should work", { 23 | expect_equal(as.complex(as_corpus_text("1+2i")), 1+2i) 24 | expect_equal(as.double(as_corpus_text("3.14")), 3.14) 25 | expect_equal(as.integer(as_corpus_text("3.14")), 3) 26 | expect_equal(as.logical(as_corpus_text(c("TRUE", "FALSE", "NA"))), 27 | c(TRUE, FALSE, NA)) 28 | expect_equal(as.numeric(as_corpus_text("3.14")), 3.14) 29 | expect_equal(as.raw(as_corpus_text("1")), as.raw("1")) 30 | 31 | expect_warning(x <- as.numeric(as_corpus_text("foo")), 32 | "NAs introduced by coercion") 33 | expect_equal(x, NA_real_) 34 | }) 35 | 36 | 37 | test_that("is.na should work", { 38 | x <- as_corpus_text(c("a", NA, "", "b")) 39 | expect_equal(is.na(x), c(FALSE, TRUE, FALSE, FALSE)) 40 | expect_equal(is.na(as_corpus_text(c())), logical()) 41 | }) 42 | 43 | 44 | test_that("rep should work", { 45 | x <- as_corpus_text(c("a", "b", "c")) 46 | y <- rep(x, 7) 47 | expect_equal(y, as_corpus_text(rep(c("a", "b", "c"), 7))) 48 | }) 49 | 50 | 51 | test_that("rep should work with names", { 52 | x <- as_corpus_text(c(x="a", y="b")) 53 | y <- rep(x, 2) 54 | expect_equal(y, as_corpus_text(c(x="a", y="b", x.1="a", y.1="b"))) 55 | }) 56 | 57 | 58 | test_that("invalid operations should error", { 59 | x <- as_corpus_text("hello") 60 | expect_error(x$names, "$ operator is invalid for text objects", 61 | fixed = TRUE) 62 | expect_error(x$names <- "foo", "$<- operator is invalid for text objects", 63 | fixed = TRUE) 64 | expect_error(as.environment(x), 65 | "'as.environment' is invalid for text objects") 66 | }) 67 | 68 | test_that("setting length on invalid text should fail", { 69 | x
<- letters 70 | expect_error(`length<-.corpus_text`(x, 5), "invalid text object") 71 | }) 72 | 73 | test_that("setting invalid length should fail", { 74 | x <- as_corpus_text(letters) 75 | expect_error(length(x) <- NULL, "'length' cannot be NULL") 76 | expect_error(length(x) <- "1", "'length' must be numeric") 77 | expect_error(length(x) <- c(1, 1), "'length' must have length 1") 78 | expect_error(length(x) <- NA, "'length' cannot be NA") 79 | expect_error(length(x) <- NaN, "'length' cannot be NaN") 80 | expect_error(length(x) <- -1, "'length' cannot be negative") 81 | expect_error(length(x) <- 2^53 + 2, "'length' cannot be above 2\\^53") 82 | }) 83 | 84 | 85 | test_that("setting short length should work", { 86 | x <- as_corpus_text(letters) 87 | length(x) <- 10 88 | expect_equal(x, as_corpus_text(letters[1:10])) 89 | }) 90 | 91 | 92 | test_that("setting same length should work", { 93 | x <- as_corpus_text(letters) 94 | length(x) <- 26 95 | expect_equal(x, as_corpus_text(letters)) 96 | }) 97 | 98 | 99 | test_that("setting long length should work", { 100 | x <- as_corpus_text(letters) 101 | length(x) <- 30 102 | expect_equal(x, as_corpus_text(c(letters, rep(NA, 4)))) 103 | }) 104 | -------------------------------------------------------------------------------- /tests/testthat/test-text_print.R: -------------------------------------------------------------------------------- 1 | context("text_print") 2 | 3 | test_that("'print.text' works without names", { 4 | ctype <- switch_ctype("C") 5 | on.exit(Sys.setlocale("LC_CTYPE", ctype)) 6 | 7 | x <- as_corpus_text(LETTERS) 8 | expected <- c( 9 | ' [1] "A" "B" "C" "D" "E" "F" "G" "H" "I"', 10 | '[10] "J" "K" "L" "M" "N" "O" "P" "Q" "R"', 11 | '[19] "S" "T"', 12 | '... (26 entries total)') 13 | 14 | expect_equal(strsplit(capture_output(print(x), width = 40), "\n")[[1]], 15 | expected) 16 | }) 17 | 18 | 19 | test_that("'print.text' works with names", { 20 | ctype <- switch_ctype("C") 21 | on.exit(Sys.setlocale("LC_CTYPE", ctype)) 22 | 23 | x <- as_corpus_text(LETTERS, names = paste0("foo", 1:26)) 24 | expected <- c( 25 | 'foo1 foo2 foo3 foo4 foo5 foo6 foo7 foo8 ', 26 | '"A" "B" "C" "D" "E" "F" "G" "H" ', 27 | 'foo9 foo10 foo11 foo12 foo13 foo14 foo15 foo16', 28 | '"I" "J" "K" "L" "M" "N" "O" "P" ', 29 | 'foo17 foo18 foo19 foo20', 30 | '"Q" "R" "S" "T" ', 31 | '... 
(26 entries total)') 32 | 33 | expect_equal(strsplit(capture_output(print(x), width = 50), "\n")[[1]], 34 | expected) 35 | }) 36 | 37 | 38 | test_that("'print.text' works for empty", { 39 | x <- as_corpus_text(character()) 40 | expect_equal(capture_output(print(x)), 41 | "text vector with 0 entries") 42 | }) 43 | 44 | 45 | test_that("'print.text' works for NULL", { 46 | expect_equal(print.corpus_text(NULL), NULL) 47 | }) 48 | 49 | 50 | test_that("'print.text' errors for invalid", { 51 | expect_error(print.corpus_text("hello"), "argument is not a valid text object") 52 | }) 53 | 54 | 55 | test_that("'print.text' with negative rows prints entire object", { 56 | x <- as_corpus_text(LETTERS) 57 | expect_equal(capture_output(print(x, -1)), 58 | capture_output(print(as.character(x)))) 59 | }) 60 | 61 | 62 | test_that("'print.text' errors for invalid inputs", { 63 | x <- as_corpus_text(LETTERS) 64 | expect_error(print(x, chars = -1), "'chars' must be non-negative") 65 | expect_error(print(x, chars = NA), "'chars' cannot be NA") 66 | expect_error(print(x, chars = c(1,1)), "'chars' must have length 1") 67 | expect_error(print(x, na.print = NA), "'na.print' cannot be NA") 68 | expect_error(print(x, print.gap = 1025), "'print.gap' must be less than or equal to 1024") 69 | }) 70 | 71 | 72 | test_that("'format.text' works for empty", { 73 | expect_equal(format(as_corpus_text(character())), character()) 74 | }) 75 | -------------------------------------------------------------------------------- /tests/testthat/test-text_split_tokens.R: -------------------------------------------------------------------------------- 1 | context("text_split_tokens") 2 | 3 | 4 | test_that("'split_tokens' can split into threes", { 5 | text <- c(paste(LETTERS, collapse = " "), 6 | paste(letters, collapse = " ")) 7 | 8 | expect_equal(text_split(text, "tokens", 3), 9 | structure(class = c("corpus_frame", "data.frame"), 10 | data.frame(parent = factor(as.character(c(rep(1, 9), rep(2, 9)))), 11 | index = c(1:9, 1:9), 12 | text = as_corpus_text(c("A B C ", "D E F ", "G H I ", "J K L ", 13 | "M N O ", "P Q R ", "S T U ", "V W X ", 14 | "Y Z", 15 | "a b c ", "d e f ", "g h i ", "j k l ", 16 | "m n o ", "p q r ", "s t u ", "v w x ", 17 | "y z")), 18 | row.names = NULL))) 19 | }) 20 | 21 | 22 | test_that("'split_tokens' doesn't count dropped tokens", { 23 | text <- c(paste(LETTERS, collapse = " "), 24 | paste(letters, collapse = " ")) 25 | f <- text_filter(drop = c("a", "e", "i", "o", "u")) 26 | 27 | expect_equal(text_split(text, "tokens", 5, filter = f), 28 | structure(class = c("corpus_frame", "data.frame"), 29 | data.frame(parent = factor(as.character(c(rep(1, 5), rep(2, 5)))), 30 | index = c(1:5, 1:5), 31 | text = as_corpus_text(c("A B C D E F G ", "H I J K L ", 32 | "M N O P Q ", "R S T U V ", "W X Y Z", 33 | "a b c d e f g ", "h i j k l ", 34 | "m n o p q ", "r s t u v ", 35 | "w x y z"), 36 | filter = f), 37 | row.names = NULL))) 38 | }) 39 | 40 | 41 | test_that("'split_tokens' keeps trailing whitespace", { 42 | expect_equal(text_split("abc ", "tokens", 2), 43 | structure(class = c("corpus_frame", "data.frame"), 44 | data.frame(parent = factor("1"), index = 1, 45 | text = as_corpus_text("abc "), row.names = NULL))) 46 | }) 47 | 48 | 49 | test_that("'split_tokens' handles whitespace-only text", { 50 | expect_equal(text_split(" ", "tokens", 1), 51 | structure(class = c("corpus_frame", "data.frame"), 52 | data.frame(parent = factor("1"), 53 | index = 1, text = as_corpus_text(" "), row.names = NULL))) 54 | }) 55 | 56 | 57 | 
test_that("'split_tokens' handles empty and missing text", { 58 | expect_equal(text_split(c("", NA, NA, "", "a"), "tokens", 1), 59 | structure(class = c("corpus_frame", "data.frame"), 60 | data.frame(parent = factor(c("1", "4", "5"), 61 | levels = as.character(1:5)), 62 | index = c(1, 1, 1), 63 | text = as_corpus_text(c("", "", "a")), 64 | row.names = NULL))) 65 | }) 66 | -------------------------------------------------------------------------------- /tests/testthat/test-text_stats.R: -------------------------------------------------------------------------------- 1 | context("text_stats") 2 | 3 | 4 | test_that("'text_stats' works on a simple example", { 5 | x <- c("A rose is a rose is a rose.", "A Rose is red. A violet is blue!") 6 | actual <- text_stats(x) 7 | expected <- data.frame(tokens = text_ntoken(x), 8 | types = text_ntype(x), 9 | sentences = text_nsentence(x)) 10 | class(expected) <- c("corpus_frame", "data.frame") 11 | expect_equal(actual, expected) 12 | }) 13 | -------------------------------------------------------------------------------- /tests/testthat/test-text_types.R: -------------------------------------------------------------------------------- 1 | context("text_types") 2 | 3 | test_that("'text_types' works elementwise", { 4 | text <- c("I saw Mr. Jones today.", 5 | NA, 6 | "", 7 | "Split across\na line.", 8 | "What. Are. You. Doing????", 9 | "She asked 'do you really mean that?' and I said 'yes.'") 10 | toks <- text_tokens(text) 11 | typs <- lapply(toks, function(x) unique(sort(x, method = "radix"))) 12 | typs_tot <- unique(sort(c(toks, recursive = TRUE), method = "radix")) 13 | 14 | expect_equal(text_types(text), typs) 15 | expect_equal(text_types(text, collapse = TRUE), typs_tot) 16 | }) 17 | 18 | 19 | test_that("text_ntype works on types", { 20 | expect_equal(text_ntype(LETTERS, collapse = TRUE), 26) 21 | 22 | expect_equal(text_ntype(paste(LETTERS, letters, LETTERS)), 23 | rep(1, 26)) 24 | }) 25 | 26 | 27 | test_that("text_ntype handles NA, empty", { 28 | expect_equal(text_ntype(c("", NA, "hello world")), 29 | c(0, NA, 2)) 30 | 31 | expect_equal(text_ntype(c("", NA, "hello world"), collapse = TRUE), 32 | NA_real_) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test-wordlist.R: -------------------------------------------------------------------------------- 1 | 2 | context("wordlist") 3 | 4 | 5 | test_that("'abbreviations' has common acronyms", { 6 | expect_true(all(c("Mr.", "Mrs.", "Ms.") %in% abbreviations_en)) 7 | expect_true(all(c("ap. J.-C.", "av. 
J.-C.") %in% abbreviations_fr)) 8 | }) 9 | 10 | 11 | test_that("'stopwords' has common function words", { 12 | expect_true(all(c("the", "and", "is") %in% stopwords_en)) 13 | }) 14 | -------------------------------------------------------------------------------- /vignettes/chinese-wordcloud-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/chinese-wordcloud-1.png -------------------------------------------------------------------------------- /vignettes/corpus-emotion-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-emotion-1.png -------------------------------------------------------------------------------- /vignettes/corpus-heapslaw-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-heapslaw-1.png -------------------------------------------------------------------------------- /vignettes/corpus-witch-occurrences-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/corpus-witch-occurrences-1.png -------------------------------------------------------------------------------- /vignettes/gender-estimates-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates-1.png -------------------------------------------------------------------------------- /vignettes/gender-estimates_se-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-estimates_se-1.png -------------------------------------------------------------------------------- /vignettes/gender-signif-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patperry/r-corpus/db176c2fb02cf2125d3c08f58044093142e473fa/vignettes/gender-signif-1.png --------------------------------------------------------------------------------