├── man
    ├── .Rapp.history
    ├── tif-package.Rd
    ├── tif_is_corpus_character.Rd
    ├── tif_is_tokens_list.Rd
    ├── tif_is_dtm.Rd
    ├── tif_is_corpus_df.Rd
    ├── tif_is_tokens_df.Rd
    └── tif_as.Rd
├── .github
    ├── .gitignore
    └── workflows
    │   └── R-CMD-check.yaml
├── .gitignore
├── .Rbuildignore
├── tests
    ├── testthat.R
    └── testthat
    │   └── test-validators.R
├── inst
    └── examples
    │   ├── tif_is_dtm.R
    │   ├── tif_is_corpus_character.R
    │   ├── tif_is_tokens_list.R
    │   ├── tif_is_corpus_df.R
    │   ├── tif_as.R
    │   └── tif_is_tokens_df.R
├── R
    ├── pkg.R
    ├── coercion.R
    └── validators.R
├── NAMESPACE
├── DESCRIPTION
├── CONDUCT.md
├── NEWS.md
└── README.md


/man/.Rapp.history:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .Rproj.user
3 | tif.Rproj
4 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | CONDUCT.md
4 | ^\.github$
5 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)  # testing framework that provides test_check()
2 | library(tif)       # the package under test
3 | 
4 | test_check("tif")  # run the test files in tests/testthat/ for tif
5 | 


--------------------------------------------------------------------------------
/inst/examples/tif_is_dtm.R:
--------------------------------------------------------------------------------
1 | # Build an all-zero sparse matrix: 5 documents (rows) by 26 terms (columns).
2 | dtm <- Matrix::Matrix(0, ncol = 26, nrow = 5, sparse = TRUE)
3 | colnames(dtm) <- LETTERS                # term names
4 | rownames(dtm) <- sprintf("doc%d", 1:5)  # document ids
5 | 
6 | tif_is_dtm(dtm)
7 | 


--------------------------------------------------------------------------------
/inst/examples/tif_is_corpus_character.R:
--------------------------------------------------------------------------------
1 | corpus <- c("Aujourd'hui, maman est morte.",
2 |             "It was a pleasure to burn.",
3 |             "All this happened, more or less.")
4 | 
5 | tif_is_corpus_character(corpus)  # unnamed character vector
6 | 
7 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut")  # add unique names
8 | tif_is_corpus_character(corpus)  # same vector, now named
9 | 


--------------------------------------------------------------------------------
/inst/examples/tif_is_tokens_list.R:
--------------------------------------------------------------------------------
1 | tokens <- list(c("aujourd'hui", "maman", "est", "morte"),
2 |                c("it", "was", "a", "pleasure", "to", "burn"),
3 |                c("all", "this", "happened", "more", "or", "less"))
4 | tif_is_tokens_list(tokens)  # unnamed list of character vectors
5 | 
6 | names(tokens) <- c("doc1", "doc2", "doc3")  # attach document names
7 | tif_is_tokens_list(tokens)  # same tokens, now a named list
8 | 


--------------------------------------------------------------------------------
/inst/examples/tif_is_corpus_df.R:
--------------------------------------------------------------------------------
 1 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"),
 2 |                      text = c("Aujourd'hui, maman est morte.",
 3 |                       "It was a pleasure to burn.",
 4 |                       "All this happened, more or less."),
 5 |                      stringsAsFactors = FALSE)  # keep columns as character
 6 | 
 7 | tif_is_corpus_df(corpus)  # only the doc_id and text columns
 8 | 
 9 | corpus$author <- c("Camus", "Bradbury", "Vonnegut")  # extra metadata column
10 | tif_is_corpus_df(corpus)  # doc_id, text, plus the author column
11 | 


--------------------------------------------------------------------------------
/inst/examples/tif_as.R:
--------------------------------------------------------------------------------
 1 | # Coerce a corpus object between its character vector and data frame forms
 2 | corpus <- c("Aujourd'hui, maman est morte.",
 3 |             "It was a pleasure to burn.",
 4 |             "All this happened, more or less.")
 5 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut")
 6 | 
 7 | new <- tif_as_corpus_df(corpus)  # character vector -> data frame
 8 | new
 9 | tif_as_corpus_character(new)  # and back to a character vector
10 | 
11 | # Coerce a tokens object between its list and data frame forms
12 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"),
13 |                doc2 = c("it", "was", "a", "pleasure", "to", "burn"),
14 |                doc3 = c("all", "this", "happened", "more", "or", "less"))
15 | 
16 | new <- tif_as_tokens_df(tokens)  # list -> data frame
17 | new
18 | tif_as_tokens_list(new)  # and back to a list
19 | 
20 | 


--------------------------------------------------------------------------------
/inst/examples/tif_is_tokens_df.R:
--------------------------------------------------------------------------------
 1 | tokens <- data.frame(doc_id = c("doc1", "doc1", "doc1", "doc1",
 2 |                                 "doc2",  "doc2", "doc2", "doc2",
 3 |                                 "doc2", "doc2", "doc3", "doc3",
 4 |                                 "doc3", "doc3", "doc3", "doc3"),
 5 |                      token = c("aujourd'hui", "maman", "est",
 6 |                                "morte", "it", "was", "a", "pleasure",
 7 |                                "to", "burn", "all", "this", "happened",
 8 |                                "more", "or", "less"),
 9 |                      stringsAsFactors = FALSE)
10 | 
11 | tif_is_tokens_df(tokens)  # only the doc_id and token columns
12 | 
13 | tokens$pos <- "NOUN"  # extra token-level annotation columns
14 | tokens$NER <- ""
15 | tokens$sentiment <- runif(nrow(tokens))  # one value per token row
16 | tif_is_tokens_df(tokens)
17 | 


--------------------------------------------------------------------------------
/R/pkg.R:
--------------------------------------------------------------------------------
 1 | #' tif: Text Interchange Formats
 2 | #'
 3 | #' This package describes and validates formats for storing
 4 | #' common objects arising in text analysis as native R objects.
 5 | #' Representations of a text corpus, document term matrix, and
 6 | #' tokenized text are included. The corpus and tokens objects
 7 | #' have multiple valid formats. Packages compliant with the
 8 | #' tif proposal should accept all valid formats and should
 9 | #' either directly return at least one of the formats or
10 | #' provide functions for converting their outputs into one (when
11 | #' applicable). The tokenized text format is extensible to
12 | #' include other annotations such as part of speech tags and
13 | #' named entities.
14 | #'
15 | #' @import Matrix
16 | #'
17 | #' @docType package
18 | "_PACKAGE"
19 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(tif_as_corpus_character,character)
 4 | S3method(tif_as_corpus_character,data.frame)
 5 | S3method(tif_as_corpus_character,default)
 6 | S3method(tif_as_corpus_df,character)
 7 | S3method(tif_as_corpus_df,data.frame)
 8 | S3method(tif_as_corpus_df,default)
 9 | S3method(tif_as_tokens_df,data.frame)
10 | S3method(tif_as_tokens_df,default)
11 | S3method(tif_as_tokens_df,list)
12 | S3method(tif_as_tokens_list,data.frame)
13 | S3method(tif_as_tokens_list,default)
14 | S3method(tif_as_tokens_list,list)
15 | export(tif_as_corpus_character)
16 | export(tif_as_corpus_df)
17 | export(tif_as_tokens_df)
18 | export(tif_as_tokens_list)
19 | export(tif_is_corpus_character)
20 | export(tif_is_corpus_df)
21 | export(tif_is_dtm)
22 | export(tif_is_tokens_df)
23 | export(tif_is_tokens_list)
24 | import(Matrix)
25 | importFrom(Matrix,Matrix)
26 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: tif
 2 | Type: Package
 3 | Title: Text Interchange Format
 4 | Version: 0.4
 5 | Authors@R: c(person("Taylor", "Arnold", role = c("aut", "cre"),
 6 |                      email = "tarnold2@richmond.edu"),
 7 |               person("Ken", "Benoit", role = "aut",
 8 |                      email = "k.r.benoit@lse.ac.uk"),
 9 |               person("Lincoln", "Mullen", role = "aut",
10 |                      email = "lmullen@gmu.edu"),
11 |               person("Adam", "Obeng", role = "aut",
12 |                      email = "contact@adamobeng.com"),
13 |               person("rOpenSci Text Workshop Participants (2017)",
14 |                      role = "aut"))
15 | Maintainer: Taylor B. Arnold <tarnold2@richmond.edu>
16 | Description: Provides validation functions for common
17 |     interchange formats for representing text data in R.
18 |     Includes formats for corpus objects, document term
19 |     matrices, and tokens. Other annotations can be stored
20 |     by overloading the tokens structure.
21 | Imports: Matrix
22 | License: GPL-2
23 | Encoding: UTF-8
24 | URL: https://docs.ropensci.org/tif, https://github.com/ropensci/tif
25 | BugReports: https://github.com/ropensci/tif/issues
26 | RoxygenNote: 7.2.1
27 | Suggests: testthat
28 | 


--------------------------------------------------------------------------------
/man/tif-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pkg.R
 3 | \docType{package}
 4 | \name{tif-package}
 5 | \alias{tif}
 6 | \alias{tif-package}
 7 | \title{tif: Text Interchange Formats}
 8 | \description{
 9 | This package describes and validates formats for storing
10 | common objects arising in text analysis as native R objects.
11 | Representations of a text corpus, document term matrix, and
12 | tokenized text are included. The corpus and tokens objects
13 | have multiple valid formats. Packages compliant with the
14 | tif proposal should accept all valid formats and should
15 | directly return, or provide conversion functions, for
16 | converting outputs into at least one of the formats (when
17 | applicable). The tokenized text format is extensible to
18 | include other annotations such as part of speech tags and
19 | named entities.
20 | }
21 | \seealso{
22 | Useful links:
23 | \itemize{
24 |   \item \url{https://docs.ropensci.org/tif}
25 |   \item \url{https://github.com/ropensci/tif}
26 |   \item Report bugs at \url{http://github.com/ropensci/tif/issues}
27 | }
28 | 
29 | }
30 | \author{
31 | \strong{Maintainer}: Taylor Arnold \email{taylor.arnold@acm.org}
32 | 
33 | Authors:
34 | \itemize{
35 |   \item Ken Benoit \email{k.r.benoit@lse.ac.uk}
36 |   \item Lincoln Mullen \email{lmullen@gmu.edu}
37 |   \item Adam Obeng \email{contact@adamobeng.com}
38 |   \item rOpenSci Text Workshop Participants (2017)
39 | }
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Code of Conduct
 2 | 
 3 | As contributors and maintainers of this project, we pledge to respect all people who 
 4 | contribute through reporting issues, posting feature requests, updating documentation,
 5 | submitting pull requests or patches, and other activities.
 6 | 
 7 | We are committed to making participation in this project a harassment-free experience for
 8 | everyone, regardless of level of experience, gender, gender identity and expression,
 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 | 
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 | 
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
18 | from the project team.
19 | 
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
21 | opening an issue or contacting one or more of the project maintainers.
22 | 
23 | This Code of Conduct is adapted from the Contributor Covenant 
24 | (http://contributor-covenant.org), version 1.0.0, available at
25 | http://contributor-covenant.org/version/1/0/0/
26 | 


--------------------------------------------------------------------------------
/man/tif_is_corpus_character.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/validators.R
 3 | \name{tif_is_corpus_character}
 4 | \alias{tif_is_corpus_character}
 5 | \title{Validate Corpus Character Vector Object}
 6 | \usage{
 7 | tif_is_corpus_character(corpus, warn = FALSE)
 8 | }
 9 | \arguments{
10 | \item{corpus}{a corpus object to test for validity}
11 | 
12 | \item{warn}{logical. Should the function produce a
13 | verbose warning for the condition for which
14 | the validation fails. Useful for testing.}
15 | }
16 | \value{
17 | a logical vector of length one indicating
18 |                whether the input is a valid corpus
19 | }
20 | \description{
21 | A valid character vector corpus object is a character
22 | vector with UTF-8 encoding. If it has names, these should
23 | be unique character strings, also in UTF-8 encoding. No other
24 | attributes should be present.
25 | }
26 | \details{
27 | The tests are run sequentially and the function returns,
28 | with a warning if the warn flag is set, on the first test
29 | that fails. We use this implementation because some tests
30 | may fail entirely or be meaningless if the prior ones are
31 | not passed.
32 | }
33 | \examples{
34 | corpus <- c("Aujourd'hui, maman est morte.",
35 |             "It was a pleasure to burn.",
36 |             "All this happened, more or less.")
37 | 
38 | tif_is_corpus_character(corpus)
39 | 
40 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut")
41 | tif_is_corpus_character(corpus)
42 | }
43 | 


--------------------------------------------------------------------------------
/man/tif_is_tokens_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/validators.R
 3 | \name{tif_is_tokens_list}
 4 | \alias{tif_is_tokens_list}
 5 | \title{Validate Tokens List Object}
 6 | \usage{
 7 | tif_is_tokens_list(tokens, warn = FALSE)
 8 | }
 9 | \arguments{
10 | \item{tokens}{a tokens object to test for validity}
11 | 
12 | \item{warn}{logical. Should the function produce a
13 | verbose warning for the condition for which
14 | the validation fails. Useful for testing.}
15 | }
16 | \value{
17 | a logical vector of length one indicating
18 |                whether the input is a valid tokens
19 | }
20 | \description{
21 | A valid corpus tokens object is a (possibly named) list of
22 | character vectors. The character vectors, as well as
23 | names, should be in UTF-8 encoding. No other attributes
24 | should be present in either the list or any of its elements.
25 | }
26 | \details{
27 | The tests are run sequentially and the function returns,
28 | with a warning if the warn flag is set, on the first test
29 | that fails. We use this implementation because some tests
30 | may fail entirely or be meaningless if the prior ones are
31 | not passed.
32 | }
33 | \examples{
34 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"),
35 |                doc2 = c("it", "was", "a", "pleasure", "to", "burn"),
36 |                doc3 = c("all", "this", "happened", "more", "or", "less"))
37 | tif_is_tokens_list(tokens)
38 | 
39 | names(tokens) <- c("doc1", "doc2", "doc3")
40 | tif_is_tokens_list(tokens)
41 | }
42 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false  # let the other matrix jobs finish when one fails
19 |       matrix:
20 |         config:
21 |           - {os: macOS-latest,   r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest,   r: 'release'}
25 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v4  # bumped from the outdated v2 release
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 


--------------------------------------------------------------------------------
/man/tif_is_dtm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/validators.R
 3 | \name{tif_is_dtm}
 4 | \alias{tif_is_dtm}
 5 | \title{Validate Document Term Matrix Object}
 6 | \usage{
 7 | tif_is_dtm(dtm, warn = FALSE)
 8 | }
 9 | \arguments{
10 | \item{dtm}{a document term matrix object to test
11 | the validity of}
12 | 
13 | \item{warn}{logical. Should the function produce a
14 | verbose warning for the condition for which
15 | the validation fails. Useful for testing.}
16 | }
17 | \value{
18 | a logical vector of length one indicating
19 |               whether the input is a valid document term
20 |               matrix
21 | }
22 | \description{
23 | A valid document term matrix is a sparse matrix with
24 | the rows representing documents and columns representing
25 | terms. The row names are a character vector giving the
26 | document ids with no duplicated entries. The column
27 | names are a character vector giving the terms of the
28 | matrix with no duplicated entries. The sparse matrix
29 | should inherit from the Matrix class dgCMatrix.
30 | }
31 | \details{
32 | The tests are run sequentially and the function returns,
33 | with a warning if the warn flag is set, on the first test
34 | that fails. We use this implementation because some tests
35 | may fail entirely or be meaningless if the prior ones are
36 | not passed. For example, if the dtm object is not a matrix
37 | it may not contain row or column names.
38 | }
39 | \examples{
40 | #' @importFrom Matrix Matrix
41 | dtm <- Matrix::Matrix(0, ncol = 26, nrow = 5, sparse = TRUE)
42 | colnames(dtm) <- LETTERS
43 | rownames(dtm) <- sprintf("doc\%d", 1:5)
44 | 
45 | tif_is_dtm(dtm)
46 | }
47 | 


--------------------------------------------------------------------------------
/man/tif_is_corpus_df.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/validators.R
 3 | \name{tif_is_corpus_df}
 4 | \alias{tif_is_corpus_df}
 5 | \title{Validate Corpus Data Frame Object}
 6 | \usage{
 7 | tif_is_corpus_df(corpus, warn = FALSE)
 8 | }
 9 | \arguments{
10 | \item{corpus}{a corpus object to test for validity}
11 | 
12 | \item{warn}{logical. Should the function produce a
13 | verbose warning for the condition for which
14 | the validation fails. Useful for testing.}
15 | }
16 | \value{
17 | a logical vector of length one indicating
18 |                whether the input is a valid corpus
19 | }
20 | \description{
21 | A valid data frame corpus object is a data frame with at
22 | least two columns. One column must be called doc_id
23 | and be a character vector with UTF-8 encoding. Document
24 | ids must be unique. There must also be a column called text
25 | and must also be a character vector in UTF-8 encoding. Each
26 | individual document is represented by a single row in
27 | the data frame. Additional document-level metadata columns
28 | and corpus level attributes are allowed but not required.
29 | }
30 | \details{
31 | The tests are run sequentially and the function returns,
32 | with a warning if the warn flag is set, on the first test
33 | that fails. We use this implementation because some tests
34 | may fail entirely or be meaningless if the prior ones are
35 | not passed. For example, if the corpus object does not
36 | have a variable named "text" it does not make sense to
37 | check whether this column is a character vector.
38 | }
39 | \examples{
40 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"),
41 |                      text = c("Aujourd'hui, maman est morte.",
42 |                       "It was a pleasure to burn.",
43 |                       "All this happened, more or less."),
44 |                      stringsAsFactors = FALSE)
45 | 
46 | tif_is_corpus_df(corpus)
47 | 
48 | corpus$author <- c("Camus", "Bradbury", "Vonnegut")
49 | tif_is_corpus_df(corpus)
50 | }
51 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # tif 0.3.0
 2 | 
 3 | * Further discussion has led us to simplify the corpus and token data frame
 4 | formats. The doc_id, text, and token columns can be in any position within the
 5 | data frame.
 6 | 
 7 | # tif 0.2.0
 8 | 
 9 | * After a round of input for the initial version of the specification,
10 | we decided to allow two formats for corpus and tokens objects. In addition
11 | to the original data frame variants there is a character vector corpus
12 | object and a list-based tokens object. Converters between the various types
13 | are now included in the package.
14 | 
15 | ### New Functions
16 | 
17 | * `tif_is_corpus_character` returns TRUE or FALSE for whether the input
18 | is a valid character vector corpus object.
19 | 
20 | * `tif_is_tokens_list` returns TRUE or FALSE for whether the input
21 | is a valid list-based tokens object.
22 | 
23 | * `tif_as_corpus_character` takes a valid tif corpus object and returns
24 | a character vector corpus object.
25 | 
26 | * `tif_as_corpus_df` takes a valid tif corpus object and returns
27 | a data frame corpus object.
28 | 
29 | * `tif_as_tokens_list` takes a valid tif tokens object and returns
30 | a list-based tokens object.
31 | 
32 | * `tif_as_tokens_df` takes a valid tif tokens object and returns
33 | a data frame tokens object.
34 | 
35 | ### Renamed Functions
36 | 
37 | * The old validate functions have been renamed `tif_is_corpus_df`,
38 | `tif_is_dtm` and `tif_is_tokens_df`. This is more in line with base-R
39 | functions and separates the "df" version of the corpus and tokens from
40 | the alternative new forms.
41 | 
42 | # tif 0.1.0
43 | 
44 | * This is the initial implementation of the ideas discussed at
45 | the rOpenSci Text Workshop from 21-22 April 2017.
46 | 
47 | ### New Functions
48 | 
49 | * `tif_corpus_validate` returns TRUE or FALSE for whether the input
50 | is a valid corpus object.
51 | 
52 | * `tif_dtm_validate` returns TRUE or FALSE for whether the input is
53 | a valid document term matrix object.
54 | 
55 | * `tif_tokens_validate` returns TRUE or FALSE for whether the input is
56 | a valid tokens object.
57 | 
58 | ### Known issues
59 | 
60 | * do not yet have a test suite for the package
61 | 
62 | * encoding checking is not yet working
63 | 
64 | 


--------------------------------------------------------------------------------
/man/tif_is_tokens_df.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/validators.R
 3 | \name{tif_is_tokens_df}
 4 | \alias{tif_is_tokens_df}
 5 | \title{Validate Tokens Data Frame Object}
 6 | \usage{
 7 | tif_is_tokens_df(tokens, warn = FALSE)
 8 | }
 9 | \arguments{
10 | \item{tokens}{a tokens object to test for validity}
11 | 
12 | \item{warn}{logical. Should the function produce a
13 | verbose warning for the condition for which
14 | the validation fails. Useful for testing.}
15 | }
16 | \value{
17 | a logical vector of length one indicating
18 |                whether the input is a valid tokens object
19 | }
20 | \description{
21 | A valid tokens data frame object is a data frame or an
22 | object that inherits a data frame. It has no row names
23 | and has at least two columns. It must contain a column called
24 | doc_id that is a character vector with UTF-8 encoding.
25 | Document ids must be unique. It must also contain a column called
26 | token that must also be a character vector in UTF-8 encoding.
27 | Each individual token is represented by a single row in
28 | the data frame. Additional token-level metadata columns
29 | are allowed but not required.
30 | }
31 | \details{
32 | The tests are run sequentially and the function returns,
33 | with a warning if the warn flag is set, on the first test
34 | that fails. We use this implementation because some tests
35 | may fail entirely or be meaningless if the prior ones are
35 | not passed. For example, if the tokens object does not
37 | have a variable named "doc_id" it does not make sense to
38 | check whether this column is a character vector.
39 | }
40 | \examples{
41 | tokens <- data.frame(doc_id = c("doc1", "doc1", "doc1", "doc1",
42 |                                 "doc2",  "doc2", "doc2", "doc2",
43 |                                 "doc2", "doc2", "doc3", "doc3",
44 |                                 "doc3", "doc3", "doc3", "doc3"),
45 |                      token = c("aujourd'hui", "maman", "est",
46 |                                "morte", "it", "was", "a", "pleasure",
47 |                                "to", "burn", "all", "this", "happened",
48 |                                "more", "or", "less"),
49 |                      stringsAsFactors = FALSE)
50 | 
51 | tif_is_tokens_df(tokens)
52 | 
53 | tokens$pos <- "NOUN"
54 | tokens$NER <- ""
55 | tokens$sentiment <- runif(16L)
56 | tif_is_tokens_df(tokens)
57 | }
58 | 


--------------------------------------------------------------------------------
/man/tif_as.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/coercion.R
 3 | \name{tif_as}
 4 | \alias{tif_as}
 5 | \alias{tif_as_corpus_character}
 6 | \alias{tif_as_corpus_character.default}
 7 | \alias{tif_as_corpus_character.character}
 8 | \alias{tif_as_corpus_character.data.frame}
 9 | \alias{tif_as_corpus_df}
10 | \alias{tif_as_corpus_df.default}
11 | \alias{tif_as_corpus_df.character}
12 | \alias{tif_as_corpus_df.data.frame}
13 | \alias{tif_as_tokens_df}
14 | \alias{tif_as_tokens_df.default}
15 | \alias{tif_as_tokens_df.list}
16 | \alias{tif_as_tokens_df.data.frame}
17 | \alias{tif_as_tokens_list}
18 | \alias{tif_as_tokens_list.default}
19 | \alias{tif_as_tokens_list.list}
20 | \alias{tif_as_tokens_list.data.frame}
21 | \title{Coerce Between tif Object Specifications}
22 | \usage{
23 | tif_as_corpus_character(corpus)
24 | 
25 | \method{tif_as_corpus_character}{default}(corpus)
26 | 
27 | \method{tif_as_corpus_character}{character}(corpus)
28 | 
29 | \method{tif_as_corpus_character}{data.frame}(corpus)
30 | 
31 | tif_as_corpus_df(corpus)
32 | 
33 | \method{tif_as_corpus_df}{default}(corpus)
34 | 
35 | \method{tif_as_corpus_df}{character}(corpus)
36 | 
37 | \method{tif_as_corpus_df}{data.frame}(corpus)
38 | 
39 | tif_as_tokens_df(tokens)
40 | 
41 | \method{tif_as_tokens_df}{default}(tokens)
42 | 
43 | \method{tif_as_tokens_df}{list}(tokens)
44 | 
45 | \method{tif_as_tokens_df}{data.frame}(tokens)
46 | 
47 | tif_as_tokens_list(tokens)
48 | 
49 | \method{tif_as_tokens_list}{default}(tokens)
50 | 
51 | \method{tif_as_tokens_list}{list}(tokens)
52 | 
53 | \method{tif_as_tokens_list}{data.frame}(tokens)
54 | }
55 | \arguments{
56 | \item{corpus}{valid tif corpus object to coerce}
57 | 
58 | \item{tokens}{valid tif tokens object to coerce}
59 | }
60 | \description{
61 | These functions convert between the various valid
62 | formats for corpus and tokens objects. By using these
63 | in other packages, maintainers need to only handle
64 | whichever specific format they would like to work
65 | with, but gain the freedom to output (or convert
66 | into) the one most suited to their package's paradigm.
67 | }
68 | \details{
69 | No explicit checking is done on the input; the output
70 | is guaranteed to be valid only if the input is a valid
71 | format. In fact, we make an effort to not modify an
72 | object that appears to be in the required format already
73 | due to R's copy on modify semantics.
74 | }
75 | \examples{
76 | # coerce corpus object
77 | corpus <- c("Aujourd'hui, maman est morte.",
78 |             "It was a pleasure to burn.",
79 |             "All this happened, more or less.")
80 | names(corpus) <- c("Camus", "Bradbury", "Vonnegut")
81 | 
82 | new <- tif_as_corpus_df(corpus)
83 | new
84 | tif_as_corpus_character(new)
85 | 
86 | # coerce tokens object
87 | tokens <- list(doc1 = c("aujourd'hui", "maman", "est", "morte"),
88 |                doc2 = c("it", "was", "a", "pleasure", "to", "burn"),
89 |                doc3 = c("all", "this", "happened", "more", "or", "less"))
90 | 
91 | new <- tif_as_tokens_df(tokens)
92 | new
93 | tif_as_tokens_list(new)
94 | 
95 | }
96 | 


--------------------------------------------------------------------------------
/tests/testthat/test-validators.R:
--------------------------------------------------------------------------------
  1 | test_that("tif_is_corpus_df", {
  2 |   # A minimal valid corpus
  3 |   tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE)
  4 |   expect_true(tif_is_corpus_df(tc))
  5 | 
  6 |   # Corpus with an additional class
  7 |   tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE)
  8 |   class(tc) <- c("data.table", "data.frame")
  9 |   expect_true(tif_is_corpus_df(tc))
 10 | 
 11 |   # Corpora with incorrect classes
 12 |   tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE)
 13 |   expect_false(tif_is_corpus_df(as.matrix(tc)))
 14 |   expect_warning(
 15 |     tif_is_corpus_df(as.matrix(tc), warn = TRUE),
 16 |     "corpus object must inherit the data.frame class"
 17 |   )
 18 |   expect_false(tif_is_corpus_df(unclass(tc)))
 19 |   expect_warning(
 20 |     tif_is_corpus_df(unclass(tc), warn = TRUE),
 21 |     "corpus object must inherit the data.frame class"
 22 |   )
 23 | 
 24 |   # Corpora with only one column
 25 |   tc <- data.frame(doc_id = "1", stringsAsFactors = FALSE)
 26 |   expect_false(tif_is_corpus_df(tc))
 27 |   expect_warning(
 28 |     tif_is_corpus_df(tc, warn = TRUE),
 29 |     "corpus object must contain at least two columns"
 30 |   )
 31 |   tc <- data.frame(text = c("foobar"), stringsAsFactors = FALSE)
 32 |   expect_false(tif_is_corpus_df(tc))
 33 |   expect_warning(
 34 |     tif_is_corpus_df(tc, warn = TRUE),
 35 |     "corpus object must contain at least two columns"
 36 |   )
 37 | 
 38 |   # A corpus with rownames
 39 |   tc <- data.frame(doc_id = "1", text = "foobar", stringsAsFactors = FALSE)
 40 |   rownames(tc) <- "baz"
 41 |   expect_false(tif_is_corpus_df(tc))
 42 |   expect_warning(
 43 |     tif_is_corpus_df(tc, warn = TRUE),
 44 |     "corpus object should not contain row names"
 45 |   )
 46 | 
 47 |   # Corpora with incorrect column types
 48 |   tc <- data.frame(doc_id = 1, text = "foobar", stringsAsFactors = FALSE)
 49 |   expect_false(tif_is_corpus_df(tc))
 50 |   expect_warning(
 51 |     tif_is_corpus_df(tc, warn = TRUE),
 52 |     "doc_id must be a character vector"
 53 |   )
 54 |   tc <- data.frame(doc_id = as.factor("1"), text = "foobar",
 55 |                    stringsAsFactors = FALSE)
 56 |   expect_false(tif_is_corpus_df(tc))
 57 |   expect_warning(
 58 |     tif_is_corpus_df(tc, warn = TRUE),
 59 |     "doc_id must be a character vector"
 60 |   )
 61 | 
 62 |   tc <- data.frame(doc_id = "1", text = 1, stringsAsFactors = FALSE)
 63 |   expect_false(tif_is_corpus_df(tc))
 64 |   expect_warning(
 65 |     tif_is_corpus_df(tc, warn = TRUE),
 66 |     "text must be a character vector"
 67 |   )
 68 |   tc <- data.frame(doc_id = "1", text = as.factor("foobar"),
 69 |                    stringsAsFactors = FALSE)
 70 |   expect_false(tif_is_corpus_df(tc))
 71 |   expect_warning(
 72 |     tif_is_corpus_df(tc, warn = TRUE),
 73 |     "text must be a character vector"
 74 |   )
 75 | 
 76 |   # If both are incorrect (numeric id, factor text), warn about doc_id
 77 |   tc <- data.frame(doc_id = 1, text = as.factor("foobar"))
 78 |   expect_false(tif_is_corpus_df(tc))
 79 |   expect_warning(
 80 |     tif_is_corpus_df(tc, warn = TRUE),
 81 |     "doc_id must be a character vector"
 82 |   )
 83 |   # If both are incorrect (numeric id, numeric text), warn about doc_id
 84 |   tc <- data.frame(doc_id = 1, text = 1)
 85 |   expect_false(tif_is_corpus_df(tc))
 86 |   expect_warning(
 87 |     tif_is_corpus_df(tc, warn = TRUE),
 88 |     "doc_id must be a character vector"
 89 |   )
 90 | 
 91 |   # A corpus with duplicated doc_id s
 92 |   tc <- data.frame(doc_id = c("1", "2", "1"), text = rep("foobar", 3),
 93 |                    stringsAsFactors = FALSE)
 94 |   expect_false(tif_is_corpus_df(tc))
 95 |   expect_warning(
 96 |     tif_is_corpus_df(tc, warn = TRUE),
 97 |     "there are duplicated document ids in the corpus"
 98 |   )
 99 | 
100 | })
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## tif: Text Interchange Formats
  2 | 
  3 | <!-- badges: start -->
  4 | [![R-CMD-check](https://github.com/ropensci/tif/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/ropensci/tif/actions/workflows/R-CMD-check.yaml)
  5 | <!-- badges: end -->
  6 | 
  7 | This package describes and validates formats for storing
  8 | common objects arising in text analysis as native R objects.
  9 | Representations of a text corpus, document term matrix, and
 10 | tokenized text are included. The tokenized text format is
 11 | extensible to include other annotations. There are two versions
 12 | of the corpus and tokens objects; packages should accept
 13 | both and return or coerce to at least one of these.
 14 | 
 15 | ## Installation
 16 | 
 17 | You can install the development version using devtools:
 18 | 
 19 | ```{r}
 20 | devtools::install_github("ropensci/tif")
 21 | ```
 22 | 
 23 | ## Usage
 24 | 
 25 | The package can be used to check that a particular object is in a valid 
 26 | format. For example, here we see that the object `corpus` is a valid corpus
 27 | data frame:
 28 | 
 29 | ```{r}
 30 | library(tif)
 31 | corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"),
 32 |                      text = c("Aujourd'hui, maman est morte.",
 33 |                       "It was a pleasure to burn.",
 34 |                       "All this happened, more or less."),
 35 |                      stringsAsFactors = FALSE)
 36 | 
 37 | tif_is_corpus_df(corpus)
 38 | ```
 39 | ```
 40 | TRUE
 41 | ```
 42 | 
 43 | The package also has functions to convert between the list and data frame
 44 | formats for corpus and tokens objects. For example:
 45 | 
 46 | ```{r}
 47 | tif_as_corpus_character(corpus)
 48 | ```
 49 | ```
 50 |                               doc1                               doc2 
 51 |    "Aujourd'hui, maman est morte."       "It was a pleasure to burn." 
 52 |                               doc3 
 53 | "All this happened, more or less." 
 54 | ```
 55 | 
 56 | Note that extra meta data columns will be lost in the conversion from a data
 57 | frame to a named character vector.
 58 | 
 59 | ## Details
 60 | 
 61 | This package describes and validates formats for storing
 62 | common objects arising in text analysis as native R objects.
 63 | Representations of a text corpus, document term matrix, and
 64 | tokenized text are included. The tokenized text format is
 65 | extensible to include other annotations. There are two versions
 66 | of the corpus and tokens objects; packages should accept and return
 67 | at least one of these.
 68 | 
 69 | **corpus** (data frame) - A valid corpus data frame object
 70 | is a data frame with at least two columns. The first column
 71 | is called doc_id and is a character vector with UTF-8 encoding. Document
 72 | ids must be unique. The second column is called text and
 73 | must also be a character vector in UTF-8 encoding. Each
 74 | individual document is represented by a single row in
 75 | the data frame. Additional document-level metadata columns
 76 | and corpus level attributes are allowed but not required.
 77 | 
 78 | **corpus** (character vector) - A valid character vector corpus
 79 | object is a character vector with UTF-8 encoding. If it has
 80 | names, these should be unique character strings, also in UTF-8
 81 | encoding. No other attributes should be present.
 82 | 
 83 | **dtm** - A valid document term matrix is a sparse matrix with
 84 | the rows representing documents and columns representing
 85 | terms. The row names are a character vector giving the
 86 | document ids with no duplicated entries. The column
 87 | names are a character vector giving the terms of the
 88 | matrix with no duplicated entries. The sparse matrix
 89 | should inherit from the Matrix class dgCMatrix.
 90 | 
 91 | **tokens** (data frame) - A valid data frame tokens
 92 | object is a data frame with at least two columns. There must be
 93 | a column called doc_id that is a character vector
 94 | with UTF-8 encoding. Document ids must be unique.
 95 | There must also be a column called token that must also be a
 96 | character vector in UTF-8 encoding.
 97 | Each individual token is represented by a single row in
 98 | the data frame. Additional token-level metadata columns
 99 | are allowed but not required. 
100 | 
101 | **tokens** (list) - A valid corpus tokens object is a (possibly
102 | named) list of character vectors. The character vectors, as
103 | well as names, should be in UTF-8 encoding. No other
104 | attributes should be present in either the list or any of its
105 | elements.
106 | 


--------------------------------------------------------------------------------
/R/coercion.R:
--------------------------------------------------------------------------------
  1 | #' Coerce Between tif Object Specifications
  2 | #'
  3 | #' These functions convert between the various valid
  4 | #' formats for corpus and tokens objects. By using these
  5 | #' in other packages, maintainers need to only handle
  6 | #' whichever specific format they would like to work
  7 | #' with, but gain the freedom to output (or convert
  8 | #' into) the one most suited to their package's paradigm.
  9 | #'
 10 | #' @param corpus    valid tif corpus object to coerce
 11 | #' @param tokens    valid tif tokens object to coerce
 12 | #'
 13 | #' @details
 14 | #' No explicit checking is done on the input; the output
 15 | #' is guaranteed to be valid only if the input is a valid
 16 | #' format. In fact, we make an effort to not modify an
 17 | #' object that appears to be in the required format already
 18 | #' due to R's copy on modify semantics.
 19 | #'
 20 | #' @example inst/examples/tif_as.R
 21 | #' @name tif_as
 22 | NULL
 23 | 
 24 | #' @export
 25 | #' @rdname tif_as
 26 | tif_as_corpus_character <- function(corpus) {
 27 |   UseMethod("tif_as_corpus_character", corpus)
 28 | }
 29 | 
 30 | #' @rdname tif_as
 31 | #' @export
 32 | tif_as_corpus_character.default <- function(corpus) {
 33 |   # Vector-like input is coerced directly; 2-d input goes via a data frame.
 34 |   nd <- length(dim(corpus))
 35 |   if (nd <= 1L) {
 36 |     out <- as.character(corpus)
 37 |   } else if (nd == 2L) {
 38 |     out <- tif_as_corpus_character(as.data.frame(corpus))
 39 |   } else {
 40 |     stop(sprintf("Cannot convert object of class %s to tif corpus",
 41 |                  paste(class(corpus), collapse = "/")))
 42 |   }
 43 |   # `out` is the converted corpus, never the intermediate data frame
 44 |   return(out)
 45 | }
 46 | 
 47 | #' @rdname tif_as
 48 | #' @export
 49 | tif_as_corpus_character.character <- function(corpus) {
 50 |   corpus  # already a character corpus; hand it back untouched
 51 | }
 52 | 
 53 | 
 54 | #' @rdname tif_as
 55 | #' @export
 56 | tif_as_corpus_character.data.frame <- function(corpus) {
 57 |   # The text column supplies the values and doc_id supplies the names;
 58 |   # no other column is carried over.
 59 |   texts <- as.character(corpus$text)
 60 |   names(texts) <- corpus$doc_id
 61 |   texts
 62 | }
 63 | 
 64 | #' @export
 65 | #' @rdname tif_as
 66 | tif_as_corpus_df <- function(corpus) {
 67 |   UseMethod("tif_as_corpus_df", corpus)
 68 | }
 69 | 
 70 | #' @rdname tif_as
 71 | #' @export
 72 | tif_as_corpus_df.default <- function(corpus) {
 73 |   # Vector-like input becomes a character corpus first and is then
 74 |   # re-dispatched; 2-d input is converted straight to a data frame.
 75 |   nd <- length(dim(corpus))
 76 |   if (nd <= 1L) {
 77 |     out <- tif_as_corpus_df(as.character(corpus))
 78 |   } else if (nd == 2L) {
 79 |     out <- as.data.frame(corpus)
 80 |   } else {
 81 |     stop(sprintf("Cannot convert object of class %s to tif corpus",
 82 |                  paste(class(corpus), collapse = "/")))
 83 |   }
 84 | 
 85 |   return(out)
 86 | }
 87 | 
 88 | #' @rdname tif_as
 89 | #' @export
 90 | tif_as_corpus_df.character <- function(corpus) {
 91 |   # Use the vector's names as document ids; unnamed input gets the
 92 |   # generated ids "doc1", "doc2", ...
 93 |   ids <- names(corpus)
 94 |   if (is.null(ids)) {
 95 |     ids <- sprintf("doc%d", seq_along(corpus))
 96 |   }
 97 |   # as.character() drops the names attribute from the text column
 98 |   data.frame(doc_id = ids,
 99 |              text = as.character(corpus),
100 |              stringsAsFactors = FALSE)
101 | }
102 | 
103 | #' @rdname tif_as
104 | #' @export
105 | tif_as_corpus_df.data.frame <- function(corpus) {
106 |   corpus  # already a data frame corpus; hand it back untouched
107 | }
108 | 
109 | #' @export
110 | #' @rdname tif_as
111 | tif_as_tokens_df <- function(tokens) {
112 |   UseMethod("tif_as_tokens_df", tokens)
113 | }
114 | 
115 | #' @rdname tif_as
116 | #' @export
117 | tif_as_tokens_df.default <- function(tokens) {
118 |   # Only 2-d objects (e.g. matrices) can be read as a tokens table.
119 |   nd <- length(dim(tokens))
120 |   if (nd == 2L) {
121 |     # Keep the result of the re-dispatch instead of discarding it
122 |     out <- tif_as_tokens_df(as.data.frame(tokens))
123 |   } else {
124 |     stop("Cannot convert object of class ",
125 |          paste(class(tokens), collapse = "/"), " to tif tokens")
126 |   }
127 | 
128 |   return(out)
129 | }
130 | 
131 | #' @rdname tif_as
132 | #' @export
133 | tif_as_tokens_df.list <- function(tokens) {
134 |   # Unnamed lists get the generated ids "doc1", "doc2", ...
135 |   if (is.null(names(tokens))) {
136 |     doc_id <- sprintf("doc%d", seq_along(tokens))
137 |   } else {
138 |     doc_id <- names(tokens)
139 |   }
140 |   # One row per token: repeat each id once per token in that document.
141 |   # as.character() keeps the token column when `tokens` is empty, where
142 |   # unlist() would return NULL and data.frame() would drop the column.
143 |   data.frame(doc_id = rep(doc_id, lengths(tokens)),
144 |              token = as.character(unlist(tokens, use.names = FALSE)),
145 |              stringsAsFactors = FALSE)
146 | }
147 | 
148 | 
149 | #' @rdname tif_as
150 | #' @export
151 | tif_as_tokens_df.data.frame <- function(tokens) {
152 |   tokens  # already a data frame of tokens; hand it back untouched
153 | }
154 | 
155 | 
156 | #' @export
157 | #' @rdname tif_as
158 | tif_as_tokens_list <- function(tokens) {
159 |   UseMethod("tif_as_tokens_list", tokens)
160 | }
161 | 
162 | #' @rdname tif_as
163 | #' @export
164 | tif_as_tokens_list.default <- function(tokens) {
165 |   # Only 2-d objects (e.g. matrices) can be read as a tokens table.
166 |   nd <- length(dim(tokens))
167 |   if (nd == 2L) {
168 |     out <- tif_as_tokens_list(as.data.frame(tokens))
169 |   } else {
170 |     stop("Cannot convert object of class ",
171 |          paste(class(tokens), collapse = "/"), " to tif tokens")
172 |   }
173 |   # `out` is the re-dispatched result, not the intermediate data frame
174 |   return(out)
175 | }
176 | 
177 | #' @rdname tif_as
178 | #' @export
179 | tif_as_tokens_list.list <- function(tokens) {
180 |   tokens  # already a list of tokens; hand it back untouched
181 | }
182 | 
183 | 
184 | #' @rdname tif_as
185 | #' @export
186 | tif_as_tokens_list.data.frame <- function(tokens) {
187 |   # Fix the level order so documents keep their order of first appearance
188 |   split(tokens$token, factor(tokens$doc_id, levels = unique(tokens$doc_id)))
189 | }
190 | 


--------------------------------------------------------------------------------
/R/validators.R:
--------------------------------------------------------------------------------
  1 | #' Validate Corpus Data Frame Object
  2 | #'
  3 | #' A valid data frame corpus object is an object that
  4 | #' inherits a data frame. It has no row names and has at
  5 | #' least two columns. One column must be called doc_id
  6 | #' and be a character vector with UTF-8 encoding. Document
  7 | #' ids must be unique. There must also be a column called text
  8 | #' and must also be a character vector in UTF-8 encoding. Each
  9 | #' individual document is represented by a single row in
 10 | #' the data frame. Additional document-level metadata columns
 11 | #' and corpus level attributes are allowed but not required.
 12 | #'
 13 | #' @param corpus  a corpus object to test for validity
 14 | #' @param warn    logical. Should the function produce a
 15 | #'                verbose warning for the condition for which
 16 | #'                the validation fails. Useful for testing.
 17 | #' @return        a logical vector of length one indicating
 18 | #'                whether the input is a valid corpus
 19 | #'
 20 | #' @details
 21 | #' The tests are run sequentially and the function returns,
 22 | #' with a warning if the warn flag is set, on the first test
 23 | #' that fails. We use this implementation because some tests
 24 | #' may fail entirely or be meaningless if the prior ones are
 25 | #' not passed. For example, if the corpus object does not
 26 | #' have a variable named "text" it does not make sense to
 27 | #' check whether this column is a character vector.
 28 | #'
 29 | #' @example inst/examples/tif_is_corpus_df.R
 30 | #' @export
 31 | tif_is_corpus_df <- function(corpus, warn = FALSE) {
 32 |   # Checks run in order; the first failure (optionally) warns and stops.
 33 |   if (!is.data.frame(corpus)) {
 34 |     if (warn) warning("corpus object must inherit the data.frame class")
 35 |     return(FALSE)
 36 |   }
 37 |   # A corpus needs both an id column and a text column
 38 |   if (ncol(corpus) < 2L) {
 39 |     if (warn) warning("corpus object must contain at least two columns")
 40 |     return(FALSE)
 41 |   }
 42 |   required <- c("doc_id", "text")
 43 |   if (any(!(required %in% names(corpus)))) {
 44 |     if (warn) warning("corpus object must contain columns named ",
 45 |                       "'doc_id' and 'text'")
 46 |     return(FALSE)
 47 |   }
 48 |   # A positive count here means the row names are not automatic ones
 49 |   if (.row_names_info(corpus, type = 1) > 0) {
 50 |     if (warn) warning("corpus object should not contain row names")
 51 |     return(FALSE)
 52 |   }
 53 |   # Both mandatory columns must be character; doc_id is checked first
 54 |   if (!is.character(corpus$doc_id)) {
 55 |     if (warn) warning("doc_id must be a character vector")
 56 |     return(FALSE)
 57 |   }
 58 | 
 59 |   if (!is.character(corpus$text)) {
 60 |     if (warn) warning("text must be a character vector")
 61 |     return(FALSE)
 62 |   }
 63 |   # Encoding checks are currently disabled:
 64 |   # if (Encoding(corpus$doc_id) != "UTF-8") {
 65 |   #   if (warn) warning("doc_id column must be UTF-8 encoded")
 66 |   #   return(FALSE)
 67 |   # }
 68 | 
 69 |   # if (Encoding(corpus$text) != "UTF-8") {
 70 |   #   if (warn) warning("text column must be UTF-8 encoded")
 71 |   #   return(FALSE)
 72 |   # }
 73 |   # Each document id may appear on one row only
 74 |   if (anyDuplicated(corpus$doc_id) > 0L) {
 75 |     if (warn) warning("there are duplicated document ids in the corpus")
 76 |     return(FALSE)
 77 |   }
 78 | 
 79 |   TRUE
 80 | }
 81 | 
 82 | #' Validate Corpus Character Vector Object
 83 | #'
 84 | #' A valid character vector corpus object is a character
 85 | #' vector with UTF-8 encoding. If it has names, this should
 86 | #' be a unique character also in UTF-8 encoding. No other
 87 | #' attributes should be present.
 88 | #'
 89 | #' @param corpus  a corpus object to test for validity
 90 | #' @param warn    logical. Should the function produce a
 91 | #'                verbose warning for the condition for which
 92 | #'                the validation fails. Useful for testing.
 93 | #' @return        a logical vector of length one indicating
 94 | #'                whether the input is a valid corpus
 95 | #'
 96 | #' @details
 97 | #' The tests are run sequentially and the function returns,
 98 | #' with a warning if the warn flag is set, on the first test
 99 | #' that fails. We use this implementation because some tests
100 | #' may fail entirely or be meaningless if the prior ones are
101 | #' not passed.
102 | #'
103 | #' @example inst/examples/tif_is_corpus_character.R
104 | #' @export
105 | tif_is_corpus_character <- function(corpus, warn = FALSE) {
106 |   # Checks run in order; the first failure (optionally) warns and stops.
107 |   if (!is.character(corpus)) {
108 |     if (warn) warning("corpus object must be a character vector")
109 |     return(FALSE)
110 |   }
111 |   nms <- names(corpus)
112 |   if (!is.null(nms) && anyDuplicated(nms) > 0L) {
113 |     if (warn) warning("names of corpus object must not be duplicated")
114 |     return(FALSE)
115 |   }
116 |   # "names" is the only attribute a character corpus may carry
117 |   extra <- setdiff(names(attributes(corpus)), "names")
118 |   if (length(extra) > 0L) {
119 |     if (warn) warning("corpus object should only have 'names' attribute")
120 |     return(FALSE)
121 |   }
122 | 
123 |   if (!is.null(nms) && !is.character(nms)) {
124 |     if (warn) warning("corpus object names should be a character vector")
125 |     return(FALSE)
126 |   }
127 |   # Encoding checks are currently disabled:
128 |   # if (Encoding(corpus) != "UTF-8") {
129 |   #   if (warn) warning("corpus must be UTF-8 encoded")
130 |   #   return(FALSE)
131 |   # }
132 | 
133 |   # if (!is.null(names(corpus)) && Encoding(names(corpus)) != "UTF-8") {
134 |   #   if (warn) warning("corpus names must be UTF-8 encoded")
135 |   #   return(FALSE)
136 |   # }
137 | 
138 |   TRUE
139 | }
140 | 
141 | 
142 | #' Validate Document Term Matrix Object
143 | #'
144 | #' A valid document term matrix is a sparse matrix with
145 | #' the row representing documents and columns representing
146 | #' terms. The row names is a character vector giving the
147 | #' document ids with no duplicated entries. The column
148 | #' names is a character vector giving the terms of the
149 | #' matrix with no duplicated entries. The sparse matrix
150 | #' should inherit from the Matrix class dgCMatrix.
151 | #'
152 | #' @param dtm    a document term matrix object to test
153 | #'               the validity of
154 | #' @param warn   logical. Should the function produce a
155 | #'               verbose warning for the condition for which
156 | #'               the validation fails. Useful for testing.
157 | #' @return       a logical vector of length one indicating
158 | #'               whether the input is a valid document term
159 | #'               matrix
160 | #'
161 | #' @details
162 | #' The tests are run sequentially and the function returns,
163 | #' with a warning if the warn flag is set, on the first test
164 | #' that fails. We use this implementation because some tests
165 | #' may fail entirely or be meaningless if the prior ones are
166 | #' not passed. For example, if the dtm object is not a matrix
167 | #' it may not contain row or column names.
168 | #'
169 | #' @example inst/examples/tif_is_dtm.R
170 | #' @importFrom Matrix Matrix
171 | #' @export
172 | tif_is_dtm <- function(dtm, warn = FALSE) {
173 |   # Checks run in order; the first failure (optionally) warns and stops.
174 |   if (!inherits(dtm, "dgCMatrix")) {
175 |     if (warn) warning("document term matrix object must inherit ",
176 |                       "the dgCMatrix class")
177 |     return(FALSE)
178 |   }
179 |   # Both sets of dimnames must be present before they can be inspected
180 |   if (is.null(colnames(dtm))) {
181 |     if (warn) warning("document term matrix object must have column names")
182 |     return(FALSE)
183 |   }
184 | 
185 |   if (is.null(rownames(dtm))) {
186 |     if (warn) warning("document term matrix object must have row names")
187 |     return(FALSE)
188 |   }
189 | 
190 |   if (!is.character(rownames(dtm))) {
191 |     if (warn) warning("document term matrix object must have character ",
192 |                       "row names")
193 |     return(FALSE)
194 |   }
195 | 
196 |   if (!is.character(colnames(dtm))) {
197 |     if (warn) warning("document term matrix object must have character ",
198 |                       "column names")
199 |     return(FALSE)
200 |   }
201 |   # Row names (document ids) and column names (terms) must each be unique
202 |   if (any(duplicated(rownames(dtm)))) {
203 |     if (warn) warning("document term matrix object has duplicated row names")
204 |     return(FALSE)
205 |   }
206 | 
207 |   if (any(duplicated(colnames(dtm)))) {
208 |     if (warn) warning("document term matrix object has duplicated column ",
209 |                       "names")
210 |     return(FALSE)
211 |   }
212 | 
213 |   return(TRUE)
214 | }
215 | 
216 | #' Validate Tokens Data Frame Object
217 | #'
218 | #' A valid tokens data frame object is a data frame or an
219 | #' object that inherits a data frame. It has no row names
220 | #' and has at least two columns. It must contain a column called
221 | #' doc_id that is a character vector with UTF-8 encoding.
222 | #' Document ids must be unique. It must also contain a column called
223 | #' token that must also be a character vector in UTF-8 encoding.
224 | #' Each individual token is represented by a single row in
225 | #' the data frame. Additional token-level metadata columns
226 | #' are allowed but not required.
227 | #'
228 | #' @param tokens  a tokens object to test for validity
229 | #' @param warn    logical. Should the function produce a
230 | #'                verbose warning for the condition for which
231 | #'                the validation fails. Useful for testing.
232 | #' @return        a logical vector of length one indicating
233 | #'                whether the input is a valid tokens object
234 | #'
235 | #' @details
236 | #' The tests are run sequentially and the function returns,
237 | #' with a warning if the warn flag is set, on the first test
238 | #' that fails. We use this implementation because some tests
239 | #' may fail entirely or be meaningless if the prior ones are
240 | #' not passed. For example, if the tokens object does not
241 | #' have a variable named "doc_id" it does not make sense to
242 | #' check whether this column is a character vector.
243 | #'
244 | #' @example inst/examples/tif_is_tokens_df.R
245 | #' @export
246 | tif_is_tokens_df <- function(tokens, warn = FALSE) {
247 |   # Checks run in order; the first failure (optionally) warns and stops.
248 |   if (!inherits(tokens, "data.frame")) {
249 |     if (warn) warning("tokens object must inherit the data.frame class")
250 |     return(FALSE)
251 |   }
252 | 
253 |   if (ncol(tokens) <= 1L) {
254 |     if (warn) warning("tokens object must contain at least two columns")
255 |     return(FALSE)
256 |   }
257 | 
258 |   # The two mandatory columns are looked up by name, not by position
259 |   if (!all(c("doc_id", "token") %in% names(tokens))) {
260 |     if (warn) warning("data frame must contain columns named ",
261 |                       "'doc_id' and 'token'")
262 |     return(FALSE)
263 |   }
264 | 
265 |   if (.row_names_info(tokens, type = 1) > 0) {
266 |     if (warn) warning("tokens object should not contain row names")
267 |     return(FALSE)
268 |   }
269 | 
270 |   if (!is.character(tokens$doc_id)) {
271 |     if (warn) warning("doc_id must be a character vector")
272 |     return(FALSE)
273 |   }
274 | 
275 |   if (!is.character(tokens$token)) {
276 |     if (warn) warning("token must be a character vector")
277 |     return(FALSE)
278 |   }
279 | 
280 |   # if (Encoding(tokens$doc_id) != "UTF-8") {
281 |   #   if (warn) warning("doc_id column must be UTF-8 encoded")
282 |   #   return(FALSE)
283 |   # }
284 | 
285 |   # if (Encoding(tokens$token) != "UTF-8") {
286 |   #   if (warn) warning("token column must be UTF-8 encoded")
287 |   #   return(FALSE)
288 |   # }
289 | 
290 |   return(TRUE)
291 | }
292 | 
293 | #' Validate Tokens List Object
294 | #'
295 | #' A valid corpus tokens object is a (possibly named) list of
296 | #' character vectors. The character vectors, as well as
297 | #' names, should be in UTF-8 encoding. No other attributes
298 | #' should be present in either the list or any of its elements.
299 | #'
300 | #' @param tokens  a tokens object to test for validity
301 | #' @param warn    logical. Should the function produce a
302 | #'                verbose warning for the condition for which
303 | #'                the validation fails. Useful for testing.
304 | #' @return        a logical vector of length one indicating
305 | #'                whether the input is a valid tokens
306 | #'
307 | #' @details
308 | #' The tests are run sequentially and the function returns,
309 | #' with a warning if the warn flag is set, on the first test
310 | #' that fails. We use this implementation because some tests
311 | #' may fail entirely or be meaningless if the prior ones are
312 | #' not passed.
313 | #'
314 | #' @example inst/examples/tif_is_tokens_list.R
315 | #' @export
316 | tif_is_tokens_list <- function(tokens, warn = FALSE) {
317 |   # Checks run in order; the first failure (optionally) warns and stops.
318 |   if (!is.list(tokens)) {
319 |     if (warn) warning("tokens object must be a list")
320 |     return(FALSE)
321 |   }
322 |   nms <- names(tokens)
323 |   if (!is.null(nms) && anyDuplicated(nms) > 0L) {
324 |     if (warn) warning("names of tokens object must not be duplicated")
325 |     return(FALSE)
326 |   }
327 |   # "names" is the only attribute the list itself may carry
328 |   extra <- setdiff(names(attributes(tokens)), "names")
329 |   if (length(extra) > 0L) {
330 |     if (warn) warning("tokens object should only have 'names' attribute")
331 |     return(FALSE)
332 |   }
333 | 
334 |   if (!is.null(nms) && !is.character(nms)) {
335 |     if (warn) warning("tokens object names should be a character vector")
336 |     return(FALSE)
337 |   }
338 |   # Element-wise checks: no NULLs, all character, no attributes at all
339 |   if (any(vapply(tokens, is.null, logical(1)))) {
340 |     if (warn) warning("no elements of tokens should be 'NULL'")
341 |     return(FALSE)
342 |   }
343 | 
344 |   if (!all(vapply(tokens, is.character, logical(1)))) {
345 |     if (warn) warning("elements of tokens should all be a character vectors")
346 |     return(FALSE)
347 |   }
348 | 
349 |   has_attrs <- vapply(tokens, function(el) !is.null(attributes(el)), logical(1))
350 |   if (any(has_attrs)) {
351 |     if (warn)
352 |       warning("elements of tokens should have no additional attributes")
353 |     return(FALSE)
354 |   }
355 |   # Encoding checks are currently disabled:
356 |   # if (!all(sapply(tokens, Encoding) == "UTF-8")) {
357 |   #   if (warn) warning("elements of tokens must be UTF-8 encoded")
358 |   #   return(FALSE)
359 |   # }
360 | 
361 |   # if (!is.null(names(tokens)) && Encoding(names(tokens)) != "UTF-8") {
362 |   #   if (warn) warning("tokens names must be UTF-8 encoded")
363 |   #   return(FALSE)
364 |   # }
365 | 
366 |   TRUE
367 | }
368 | 


--------------------------------------------------------------------------------