├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── NAMESPACE ├── R ├── bind_list.R ├── bind_table.R ├── bind_vector.R ├── change_index.R ├── cluster_matrix.R ├── column_to_rownames.R ├── combine.R ├── duration.R ├── flatten.R ├── from_to.R ├── grab_index.R ├── grab_match.R ├── mtabulate.R ├── split_index.R ├── split_match.R ├── split_match_regex_to_transcript.R ├── split_portion.R ├── split_run.R ├── split_sentence.R ├── split_sentence_token.R ├── split_speaker.R ├── split_token.R ├── split_transcript.R ├── split_word.R ├── textshape-package.R ├── tidy_colo_dtm.R ├── tidy_dtm.R ├── tidy_list.R ├── tidy_matrix.R ├── tidy_table.R ├── tidy_vector.R ├── unique_pairs.R ├── unnest_text.R └── utils.R ├── README.Rmd ├── README.md ├── data ├── DATA.rda ├── golden_rules.rda ├── hamlet.rda └── simple_dtm.rda ├── inst ├── build.R ├── docs │ └── Simpsons_Roasting_on_an_Open_Fire_Script.pdf ├── extra_statdoc │ └── readme.R ├── make_data │ └── golden_rules.R ├── staticdocs │ └── index.R └── supporting_docs │ └── LaverGarry.zip ├── man ├── DATA.Rd ├── bind_list.Rd ├── bind_table.Rd ├── bind_vector.Rd ├── change_index.Rd ├── cluster_matrix.Rd ├── column_to_rownames.Rd ├── combine.Rd ├── duration.Rd ├── flatten.Rd ├── from_to.Rd ├── golden_rules.Rd ├── grab_index.Rd ├── grab_match.Rd ├── hamlet.Rd ├── mtabulate.Rd ├── simple_dtm.Rd ├── split_index.Rd ├── split_match.Rd ├── split_portion.Rd ├── split_run.Rd ├── split_sentence.Rd ├── split_sentence_token.Rd ├── split_speaker.Rd ├── split_token.Rd ├── split_transcript.Rd ├── split_word.Rd ├── textshape.Rd ├── tidy_colo_dtm.Rd ├── tidy_dtm.Rd ├── tidy_list.Rd ├── tidy_matrix.Rd ├── tidy_table.Rd ├── tidy_vector.Rd ├── unique_pairs.Rd └── unnest_text.Rd ├── tests ├── testthat.R └── testthat │ ├── test-change_index.R │ └── test-mtabulate.R └── tools ├── figure ├── unnamed-chunk-10-1.png ├── unnamed-chunk-16-1.png ├── unnamed-chunk-17-1.png ├── unnamed-chunk-18-1.png ├── unnamed-chunk-19-1.png ├── 
unnamed-chunk-21-1.png ├── unnamed-chunk-22-1.png ├── unnamed-chunk-23-1.png ├── unnamed-chunk-27-1.png ├── unnamed-chunk-32-1.png ├── unnamed-chunk-44-1.png ├── unnamed-chunk-45-1.png ├── unnamed-chunk-46-1.png ├── unnamed-chunk-55-1.png ├── unnamed-chunk-8-1.png └── unnamed-chunk-9-1.png └── textshape_logo ├── r_textshape.png ├── r_textshape.pptx ├── r_textshapea.png └── resize_icon.txt /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.gitignore 4 | NEWS.md 5 | FAQ.md 6 | NEWS.html 7 | FAQ.html 8 | ^\.travis\.yml$ 9 | travis-tool.sh 10 | inst/web 11 | contributors.geojson 12 | inst/build.R 13 | ^.*\.Rprofile$ 14 | README.Rmd 15 | README.R 16 | travis.yml 17 | tools/textshape_logo/r_textshapea.png 18 | tools/textshape_logo/r_textshape.pptx 19 | tools/textshape_logo/resize_icon.txt 20 | inst/staticdocs 21 | inst/extra_statdoc 22 | inst/maintenance.R 23 | Thumbs.db 24 | inst/make_data 25 | inst/supporting_docs 26 | ^CODE_OF_CONDUCT\.md$ 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | # Example code in package build process 4 | *-Ex.R 5 | .Rprofile 6 | .Rproj.user 7 | textshape.Rproj 8 | inst/maintenance.R 9 | Thumbs.db 10 | .DS_Store 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | r: 3 | - release 4 | - devel 5 | sudo: false 6 | cache: packages 7 | 8 | 9 | r_github_packages: jimhester/covr 10 | 11 | before_install: 12 | - sh -e /etc/init.d/xvfb start 13 | 14 | after_success: 15 | - Rscript -e 'covr::coveralls()' 16 | 17 | r_build_args: "--resave-data=best" 18 | r_check_args: "--as-cran" 19 | 20 | env: 21 | global: 22 | - DISPLAY=:99.0 23 | - BOOTSTRAP_LATEX=1 
24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 
22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http://contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: textshape 2 | Title: Tools for Reshaping Text 3 | Version: 1.7.6 4 | Authors@R: c( 5 | person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), 6 | person("Joran", "Elias", role = "ctb"), 7 | person("Matthew", "Flickinger", role = "ctb"), 8 | person('Paul', 'Foster', role = "ctb") 9 | ) 10 | Maintainer: Tyler Rinker 11 | Description: Tools that can be used to reshape and restructure text data. 12 | Depends: R (>= 3.4.0) 13 | Imports: data.table, slam, stats, stringi, utils 14 | Suggests: testthat 15 | License: GPL-2 16 | LazyData: TRUE 17 | RoxygenNote: 7.2.3 18 | Encoding: UTF-8 19 | URL: https://github.com/trinker/textshape 20 | BugReports: https://github.com/trinker/textshape/issues 21 | Collate: 22 | 'bind_list.R' 23 | 'bind_table.R' 24 | 'bind_vector.R' 25 | 'change_index.R' 26 | 'cluster_matrix.R' 27 | 'column_to_rownames.R' 28 | 'combine.R' 29 | 'duration.R' 30 | 'flatten.R' 31 | 'from_to.R' 32 | 'grab_index.R' 33 | 'grab_match.R' 34 | 'mtabulate.R' 35 | 'split_index.R' 36 | 'split_match.R' 37 | 'split_match_regex_to_transcript.R' 38 | 'split_portion.R' 39 | 'split_run.R' 40 | 'split_sentence.R' 41 | 'split_sentence_token.R' 42 | 'split_speaker.R' 43 | 'split_token.R' 44 | 'split_transcript.R' 45 | 'split_word.R' 46 | 'textshape-package.R' 47 | 'tidy_colo_dtm.R' 48 | 'utils.R' 49 | 'tidy_dtm.R' 50 | 'tidy_list.R' 51 | 'tidy_matrix.R' 52 | 'tidy_table.R' 53 | 'tidy_vector.R' 54 | 'unique_pairs.R' 55 | 'unnest_text.R' 56 | -------------------------------------------------------------------------------- /NAMESPACE: 
-------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(combine,data.frame) 4 | S3method(combine,default) 5 | S3method(duration,data.frame) 6 | S3method(duration,default) 7 | S3method(duration,numeric) 8 | S3method(from_to,character) 9 | S3method(from_to,data.frame) 10 | S3method(from_to,default) 11 | S3method(from_to,factor) 12 | S3method(from_to,numeric) 13 | S3method(grab_index,character) 14 | S3method(grab_index,data.frame) 15 | S3method(grab_index,default) 16 | S3method(grab_index,list) 17 | S3method(grab_index,matrix) 18 | S3method(grab_match,character) 19 | S3method(grab_match,data.frame) 20 | S3method(grab_match,list) 21 | S3method(split_index,character) 22 | S3method(split_index,data.frame) 23 | S3method(split_index,default) 24 | S3method(split_index,factor) 25 | S3method(split_index,list) 26 | S3method(split_index,matrix) 27 | S3method(split_index,numeric) 28 | S3method(split_run,data.frame) 29 | S3method(split_run,default) 30 | S3method(split_sentence,data.frame) 31 | S3method(split_sentence,default) 32 | S3method(split_sentence_token,data.frame) 33 | S3method(split_sentence_token,default) 34 | S3method(split_token,data.frame) 35 | S3method(split_token,default) 36 | S3method(split_word,data.frame) 37 | S3method(split_word,default) 38 | S3method(unique_pairs,data.table) 39 | S3method(unique_pairs,default) 40 | export(as_list) 41 | export(bind_list) 42 | export(bind_table) 43 | export(bind_vector) 44 | export(change_index) 45 | export(cluster_matrix) 46 | export(column_to_rownames) 47 | export(combine) 48 | export(duration) 49 | export(ends) 50 | export(flatten) 51 | export(from_to) 52 | export(from_to_summarize) 53 | export(grab_index) 54 | export(grab_match) 55 | export(mtabulate) 56 | export(split_index) 57 | export(split_match) 58 | export(split_match_regex) 59 | export(split_portion) 60 | export(split_run) 61 | export(split_sentence) 62 | 
export(split_sentence_token) 63 | export(split_speaker) 64 | export(split_token) 65 | export(split_transcript) 66 | export(split_word) 67 | export(starts) 68 | export(tidy_adjacency_matrix) 69 | export(tidy_colo_dtm) 70 | export(tidy_colo_tdm) 71 | export(tidy_dtm) 72 | export(tidy_list) 73 | export(tidy_matrix) 74 | export(tidy_table) 75 | export(tidy_tdm) 76 | export(tidy_vector) 77 | export(unique_pairs) 78 | export(unnest_text) 79 | importFrom(data.table,":=") 80 | importFrom(data.table,.N) 81 | -------------------------------------------------------------------------------- /R/bind_list.R: -------------------------------------------------------------------------------- 1 | #' Row Bind a List of Named Dataframes or Vectors 2 | #' 3 | #' Deprecated, use \code{\link[textshape]{tidy_list}} instead. 4 | #' 5 | #' @param x A named \code{\link[base]{list}} of 6 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}. 7 | #' @param id.name The name to use for the column created from the \code{\link[base]{list}}. 8 | #' @param content.name The name to use for the column created from the \code{\link[base]{list}} 9 | #' of \code{\link[base]{vector}}s (only used if \code{x} is \code{\link[base]{vector}}). 10 | #' @param \ldots ignored. 11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 12 | #' from the \code{\link[base]{list}} as an \code{id} column. 
13 | #' @export 14 | #' @examples 15 | #' \dontrun{ 16 | #' bind_list(list(p=1:500, r=letters)) 17 | #' bind_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars)) 18 | #' 19 | #' ## 2015 Vice-Presidential Debates Example 20 | #' if (!require("pacman")) install.packages("pacman") 21 | #' pacman::p_load(rvest, magrittr, xml2) 22 | #' 23 | #' debates <- c( 24 | #' wisconsin = "110908", 25 | #' boulder = "110906", 26 | #' california = "110756", 27 | #' ohio = "110489" 28 | #' ) 29 | #' 30 | #' lapply(debates, function(x){ 31 | #' xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>% 32 | #' rvest::html_nodes("p") %>% 33 | #' rvest::html_text() %>% 34 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>% 35 | #' textshape::combine() %>% 36 | #' textshape::split_transcript() %>% 37 | #' textshape::split_sentence() 38 | #' }) %>% 39 | #' textshape::bind_list("location") 40 | #' } 41 | bind_list <- function(x, id.name= "id", content.name = "content", ...){ 42 | 43 | warning( 44 | paste0( 45 | "Deprecated, use textshape::tidy_list() instead.\n`bind_list()` ", 46 | "will be removed in the next version." 47 | ), 48 | call. 
= FALSE 49 | ) 50 | 51 | 52 | if (is.data.frame(x[[1]])){ 53 | bind_list_df(x = x, id.name = id.name) 54 | } else { 55 | 56 | if (is.vector(x[[1]])){ 57 | bind_list_vector(x = x, id.name = id.name, content.name = content.name) 58 | } else { 59 | stop("`x` must be a list of `data.frame`s or `vector`s") 60 | } 61 | } 62 | } 63 | 64 | 65 | 66 | bind_list_df <- function (x, id.name = "id"){ 67 | if (is.null(names(x))) { 68 | names(x) <- seq_along(x) 69 | } 70 | list.names <- rep(names(x), sapply2(x, nrow)) 71 | x <- lapply(x, data.table::as.data.table) 72 | x[['fill']] <- TRUE 73 | out <- data.frame(list.names, do.call(rbind, x), 74 | row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE) 75 | colnames(out)[1] <- id.name 76 | data.table::data.table(out) 77 | } 78 | 79 | bind_list_vector <- function(x, id.name= "id", content.name = "content"){ 80 | if (is.null(names(x))) { 81 | names(x) <- seq_along(x) 82 | } 83 | dat <- data.frame( 84 | rep(names(x), sapply2(x, length)), 85 | unlist(x, use.names = FALSE), 86 | stringsAsFactors = FALSE, 87 | check.names = FALSE, 88 | row.names = NULL 89 | ) 90 | colnames(dat) <- c(id.name, content.name) 91 | data.table::data.table(dat) 92 | } 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /R/bind_table.R: -------------------------------------------------------------------------------- 1 | #' Column Bind a Table's Values with Its Names 2 | #' 3 | #' Deprecated, use \code{\link[textshape]{tidy_table}} instead. 4 | #' 5 | #' @param x A \code{\link[base]{table}}. 6 | #' @param id.name The name to use for the column created from the \code{\link[base]{table}} 7 | #' \code{\link[base]{names}}. 8 | #' @param content.name The name to use for the column created from the \code{\link[base]{table}} 9 | #' values. 10 | #' @param \ldots ignored. 
11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 12 | #' from the \code{\link[base]{table}} as an \code{id} column. 13 | #' @export 14 | #' @examples 15 | #' \dontrun{ 16 | #' x <- table(sample(LETTERS[1:6], 1000, TRUE)) 17 | #' bind_table(x) 18 | #' } 19 | bind_table <- function(x, id.name= "id", content.name = "content", ...){ 20 | 21 | warning( 22 | paste0( 23 | "Deprecated, use textshape::tidy_table() instead.\n`bind_table()` ", 24 | "will be removed in the next version." 25 | ), 26 | call. = FALSE 27 | ) 28 | 29 | stopifnot(is.table(x)) 30 | 31 | out <- data.table::data.table(x = names(x), y = unname(c(x))) 32 | data.table::setnames(out, c(id.name, content.name)) 33 | out 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /R/bind_vector.R: -------------------------------------------------------------------------------- 1 | #' Column Bind an Atomic Vector's Values with Its Names 2 | #' 3 | #' Deprecated, use \code{\link[textshape]{tidy_vector}} instead. 4 | #' 5 | #' @param x A named atomic \code{\link[base]{vector}}. 6 | #' @param id.name The name to use for the column created from the \code{\link[base]{vector}} 7 | #' \code{\link[base]{names}}. 8 | #' @param content.name The name to use for the column created from the \code{\link[base]{vector}} 9 | #' values. 10 | #' @param \ldots ignored. 11 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 12 | #' from the \code{\link[base]{vector}} as an \code{id} column. 13 | #' @export 14 | #' @examples 15 | #' \dontrun{ 16 | #' x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE)) 17 | #' bind_vector(x) 18 | #' } 19 | bind_vector <- function(x, id.name= "id", content.name = "content", ...){ 20 | 21 | warning( 22 | paste0( 23 | "Deprecated, use textshape::tidy_vector() instead.\n`bind_vector()` ", 24 | "will be removed in the next version." 
25 | ), 26 | call. = FALSE 27 | ) 28 | 29 | stopifnot(is.atomic(x)) 30 | 31 | if (is.null(names(x))) { 32 | out <- data.table::as.data.table(x) 33 | data.table::setnames(out, id.name) 34 | } else { 35 | out <- data.table::data.table(x = names(x), y = unname(x)) 36 | data.table::setnames(out, c(id.name, content.name)) 37 | } 38 | out 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /R/change_index.R: -------------------------------------------------------------------------------- 1 | #' Indexing of Changes in Runs 2 | #' 3 | #' Find the indices of changes in runs in a vector. This function pairs well 4 | #' with \code{split_index} and is the default for the \code{indices} in all 5 | #' \code{split_index} functions that act on atomic vectors. 6 | #' 7 | #' @param x A vector. 8 | #' @param \ldots ignored. 9 | #' @return Returns a vector of integer indices of where a vector initially 10 | #' changes. 11 | #' @export 12 | #' @seealso \code{\link[textshape]{split_index}} 13 | #' @examples 14 | #' set.seed(10) 15 | #' (x <- sample(0:1, 20, TRUE)) 16 | #' change_index(x) 17 | #' split_index(x, change_index(x)) 18 | #' 19 | #' 20 | #' (p_chng <- change_index(CO2[["Plant"]])) 21 | #' split_index(CO2[["Plant"]], p_chng) 22 | change_index <- function (x, ...) { 23 | utils::head(1 + cumsum(rle(as.character(x))[[1]]), -1) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /R/cluster_matrix.R: -------------------------------------------------------------------------------- 1 | #' Reorder a Matrix Based on Hierarchical Clustering 2 | #' 3 | #' Reorder matrix rows, columns, or both via hierarchical clustering. 4 | #' 5 | #' @param x A matrix. 6 | #' @param dim The dimension to reorder (cluster); must be set to "row", "col", 7 | #' or "both". 8 | #' @param method The agglomeration method to be used (see 9 | #' \code{\link[stats]{hclust}}). 10 | #' @param \ldots ignored.
11 | #' @return Returns a reordered matrix. 12 | #' @export 13 | #' @seealso \code{\link[stats]{hclust}} 14 | #' @examples 15 | #' cluster_matrix(mtcars) 16 | #' cluster_matrix(mtcars, dim = 'row') 17 | #' cluster_matrix(mtcars, dim = 'col') 18 | #' 19 | #' \dontrun{ 20 | #' if (!require("pacman")) install.packages("pacman") 21 | #' pacman::p_load(tidyverse, viridis, gridExtra) 22 | #' 23 | #' ## plot heatmap w/o clustering 24 | #' wo <- mtcars %>% 25 | #' cor() %>% 26 | #' tidy_matrix('car', 'var') %>% 27 | #' ggplot(aes(var, car, fill = value)) + 28 | #' geom_tile() + 29 | #' scale_fill_viridis(name = expression(r[xy])) + 30 | #' theme( 31 | #' axis.text.y = element_text(size = 8), 32 | #' axis.text.x = element_text( 33 | #' size = 8, 34 | #' hjust = 1, 35 | #' vjust = 1, 36 | #' angle = 45 37 | #' ), 38 | #' legend.position = 'bottom', 39 | #' legend.key.height = grid::unit(.1, 'cm'), 40 | #' legend.key.width = grid::unit(.5, 'cm') 41 | #' ) + 42 | #' labs(subtitle = "Without Clustering") 43 | #' 44 | #' ## plot heatmap w clustering 45 | #' w <- mtcars %>% 46 | #' cor() %>% 47 | #' cluster_matrix() %>% 48 | #' tidy_matrix('car', 'var') %>% 49 | #' mutate( 50 | #' var = factor(var, levels = unique(var)), 51 | #' car = factor(car, levels = unique(car)) 52 | #' ) %>% 53 | #' group_by(var) %>% 54 | #' ggplot(aes(var, car, fill = value)) + 55 | #' geom_tile() + 56 | #' scale_fill_viridis(name = expression(r[xy])) + 57 | #' theme( 58 | #' axis.text.y = element_text(size = 8), 59 | #' axis.text.x = element_text( 60 | #' size = 8, 61 | #' hjust = 1, 62 | #' vjust = 1, 63 | #' angle = 45 64 | #' ), 65 | #' legend.position = 'bottom', 66 | #' legend.key.height = grid::unit(.1, 'cm'), 67 | #' legend.key.width = grid::unit(.5, 'cm') 68 | #' ) + 69 | #' labs(subtitle = "With Clustering") 70 | #' 71 | #' gridExtra::grid.arrange(wo, w, ncol = 2) 72 | #' } 73 | cluster_matrix <- function(x, dim = 'both', method = "ward.D2", ...){ 74 | 75 | stopifnot(is.matrix(x) |
is.data.frame(x)) 76 | 77 | switch(dim, 78 | row = { 79 | hc1 <- stats::hclust(stats::dist(x), method = method) 80 | x[hc1$order, ] 81 | }, 82 | col = { 83 | hc2 <- stats::hclust(stats::dist(t(x)), method = method) 84 | x[, hc2$order] 85 | }, 86 | both = { 87 | hc1 <- stats::hclust(stats::dist(x), method = method) 88 | hc2 <- stats::hclust(stats::dist(t(x)), method = method) 89 | x[hc1$order, hc2$order] 90 | }, 91 | stop('`dim` must be set to "row", "col", or "both"') 92 | ) 93 | 94 | } 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /R/column_to_rownames.R: -------------------------------------------------------------------------------- 1 | #' Add a Column as Rownames 2 | #' 3 | #' Takes an existing column and uses it as rownames instead. This is useful 4 | #' when turning a \code{\link[base]{data.frame}} into a \code{\link[base]{matrix}}. 5 | #' Inspired by the \pkg{tibble} package's \code{column_to_rownames} which is now 6 | #' deprecated if done on a \pkg{tibble} object. By coercing to a 7 | #' \code{\link[base]{data.frame}} this problem is avoided. 8 | #' 9 | #' @param x An object that can be coerced to a \code{\link[base]{data.frame}}. 10 | #' @param loc The column location as either an integer or string index location. 11 | #' The column's values must be unique as they become the row names. 12 | #' @return Returns a \code{\link[base]{data.frame}} with the specified column 13 | #' moved to rownames.
14 | #' @export 15 | #' @examples 16 | #' state_dat <- data.frame(state.name, state.area, state.center, state.division) 17 | #' column_to_rownames(state_dat) 18 | #' column_to_rownames(state_dat, 'state.name') 19 | column_to_rownames <- function(x, loc = 1){ 20 | 21 | x <- as.data.frame(x, check.names = FALSE, stringsAsFactors = FALSE) 22 | if (!is.numeric(loc)) loc <- which(names(x) %in% loc)[1] 23 | rownames(x) <- x[[loc]] 24 | x[[loc]] <- NULL 25 | x 26 | 27 | } 28 | -------------------------------------------------------------------------------- /R/combine.R: -------------------------------------------------------------------------------- 1 | #' Combine Elements 2 | #' 3 | #' Combine (\code{\link[base]{paste}}) elements (\code{\link[base]{vector}}s, 4 | #' \code{\link[base]{list}}s, or \code{\link[base]{data.frame}}s) together 5 | #' with \code{collapse = TRUE}. 6 | #' 7 | #' @param x A \code{\link[base]{data.frame}} or character vector with runs. 8 | #' @param text.var The name of the text variable. 9 | #' @param fix.punctuation logical. If \code{TRUE} spaces before/after punctuation 10 | #' that should not be there are removed (regex used: 11 | #' \code{"(\\s+(?=[,.?!;:\%-]))|((?<=[$-])\\s+)"}). 12 | #' @param \ldots Ignored. 13 | #' @export 14 | #' @rdname combine 15 | #' @return Returns a vector (if given a list/vector) or an expanded 16 | #' \code{\link[data.table]{data.table}} with elements pasted together. 17 | #' @examples 18 | #' (x <- split_token(DATA[["state"]][1], FALSE)) 19 | #' combine(x) 20 | #' 21 | #' (x2 <- split_token(DATA[["state"]], FALSE)) 22 | #' combine(x2) 23 | #' 24 | #' (x3 <- split_sentence(DATA)) 25 | #' 26 | #' ## without dropping the non-group variable column 27 | #' combine(x3) 28 | #' 29 | #' ## Dropping the non-group variable column 30 | #' combine(x3[, 1:5, with=FALSE]) 31 | combine <- function(x, ...)
{ 32 | UseMethod("combine") 33 | } 34 | 35 | #' @export 36 | #' @rdname combine 37 | #' @method combine default 38 | combine.default <- function(x, fix.punctuation = TRUE, ...) { 39 | 40 | if(!is.list(x)) x <- list(x) 41 | x <- unlist(lapply(x, paste, collapse = " ")) 42 | if (isTRUE(fix.punctuation)){ 43 | x <- gsub("(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", x, perl = TRUE) 44 | } 45 | unname(x) 46 | } 47 | 48 | #' @export 49 | #' @rdname combine 50 | #' @method combine data.frame 51 | combine.data.frame <- function(x, text.var = TRUE, ...) { 52 | 53 | nms <- colnames(x) 54 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 55 | 56 | text.var <- detect_text_column(x, text.var) 57 | 58 | group.vars <- nms[!nms %in% text.var] 59 | 60 | express1 <- parse(text= 61 | paste0( 62 | "list(", 63 | text.var, 64 | " = paste(", 65 | text.var, 66 | ", collapse = \" \"))" 67 | ) 68 | ) 69 | z <- z[, eval(express1), by = group.vars] 70 | data.table::setcolorder(z, nms) 71 | z 72 | } 73 | 74 | -------------------------------------------------------------------------------- /R/duration.R: -------------------------------------------------------------------------------- 1 | #' Duration of Turns of Talk 2 | #' 3 | #' \code{duration} - Calculate duration (start and end times) for duration of 4 | #' turns of talk measured in words. 5 | #' 6 | #' @param x A \code{\link[base]{data.frame}} or character vector with a text 7 | #' variable or a numeric vector. 8 | #' @param text.var The name of the text variable. If \code{TRUE} 9 | #' \code{duration} tries to detect the text column. 10 | #' @param grouping.var The grouping variables. Default \code{NULL} generates 11 | #' one word list for all text. Also takes a single grouping variable or a list 12 | #' of 1 or more grouping variables. 13 | #' @param \ldots Ignored. 14 | #' @export 15 | #' @rdname duration 16 | #' @importFrom data.table .N := 17 | #' @return Returns a vector or data frame of starts and/or ends. 
18 | #' @examples 19 | #' (x <- c( 20 | #' "Mr. Brown comes! He says hello. i give him coffee.", 21 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!", 22 | #' "go there" 23 | #' )) 24 | #' duration(x) 25 | #' group <- c("A", "B", "A") 26 | #' duration(x, group) 27 | #' 28 | #' groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green")) 29 | #' duration(x, groups) 30 | #' 31 | #' data(DATA) 32 | #' duration(DATA) 33 | #' 34 | #' ## Larger data set 35 | #' duration(hamlet) 36 | #' 37 | #' ## Integer values 38 | #' x <- sample(1:10, 10) 39 | #' duration(x) 40 | #' starts(x) 41 | #' ends(x) 42 | duration <- function(x, ...) { 43 | UseMethod("duration") 44 | } 45 | 46 | #' @export 47 | #' @rdname duration 48 | #' @method duration default 49 | duration.default <- function(x, grouping.var = NULL, ...) { 50 | 51 | if(is.null(grouping.var)) { 52 | G <- "all" 53 | ilen <- 1 54 | } else { 55 | if (is.list(grouping.var)) { 56 | m <- unlist(as.character(substitute(grouping.var))[-1]) 57 | m <- sapply2(strsplit(m, "$", fixed=TRUE), function(x) { 58 | x[length(x)] 59 | } 60 | ) 61 | ilen <- length(grouping.var) 62 | G <- paste(m, collapse="&") 63 | } else { 64 | G <- as.character(substitute(grouping.var)) 65 | ilen <- length(G) 66 | G <- G[length(G)] 67 | } 68 | } 69 | if(is.null(grouping.var)){ 70 | grouping <- rep("all", length(x)) 71 | } else { 72 | if (is.list(grouping.var) & length(grouping.var)>1) { 73 | grouping <- grouping.var 74 | } else { 75 | grouping <- unlist(grouping.var) 76 | } 77 | } 78 | if (G == "") G <- paste(names(grouping.var), collapse="&") 79 | 80 | dat <- stats::setNames( 81 | data.frame(as.data.frame(grouping), x), 82 | c(strsplit(G, "&")[[1]], "text.var") 83 | ) 84 | 85 | duration.data.frame(dat, "text.var") 86 | 87 | } 88 | 89 | #' @export 90 | #' @rdname duration 91 | #' @method duration data.frame 92 | duration.data.frame <- function(x, text.var = TRUE, ...) 
{ 93 | 94 | word.count <- NULL 95 | nms <- colnames(x) 96 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 97 | 98 | text.var <- detect_text_column(x, text.var) 99 | 100 | express1 <- parse( 101 | text=paste0("word.count := stringi::stri_count_words(", text.var, ")") 102 | ) 103 | 104 | z[, eval(express1)][, 105 | 'word.count' := ifelse(is.na(word.count), 0, word.count)][, 106 | 'end' := cumsum(word.count)] 107 | 108 | z[["start"]] <- c(1, utils::head(z[["end"]] + 1, -1)) 109 | 110 | colord <- c(nms[!nms %in% text.var], "word.count", "start", "end", text.var) 111 | data.table::setcolorder(z, colord) 112 | 113 | z[] 114 | 115 | } 116 | 117 | 118 | #' @export 119 | #' @rdname duration 120 | #' @method duration numeric 121 | duration.numeric <- function(x, ...){ 122 | dat <- data.frame(x = x, end = cumsum(x)) 123 | dat[["start"]] <- c(1, utils::head(dat[["end"]] + 1, -1)) 124 | dat[c(1, 3:2)] 125 | } 126 | 127 | 128 | #' Duration of Turns of Talk 129 | #' 130 | #' \code{starts} - Calculate start times from a numeric vector. 131 | #' 132 | #' @rdname duration 133 | #' @export 134 | starts <- function(x, ...) c(1, utils::head(ends(x) + 1, -1)) 135 | 136 | 137 | 138 | #' Duration of Turns of Talk 139 | #' 140 | #' \code{ends} - Calculate end times from a numeric vector. 141 | #' 142 | #' @rdname duration 143 | #' @export 144 | ends <- function(x, ...) cumsum(x) 145 | 146 | -------------------------------------------------------------------------------- /R/flatten.R: -------------------------------------------------------------------------------- 1 | #' Flatten a Nested List of Vectors Into a Single Tier List of Vectors 2 | #' 3 | #' Flatten a named, nested list of atomic vectors to a single level using the 4 | #' concatenated list/atomic vector names as the names of the single tiered 5 | #' list. 6 | #' 7 | #' @param x A nested, named list of vectors.
8 | #' @param sep A separator to use for the concatenation of the names from the 9 | #' nested list. 10 | #' @param \ldots ignored. 11 | #' @return Returns a flattened list. 12 | #' @author StackOverflow user @@Michael and Paul Foster and Tyler 13 | #' Rinker . 14 | #' @export 15 | #' @note The order of the list is sorted alphabetically. Pull requests for the 16 | #' option to return the original order would be appreciated. 17 | #' @references \url{https://stackoverflow.com/a/41882883/1000343} \cr 18 | #' \url{https://stackoverflow.com/a/48357114/1000343} 19 | #' @examples 20 | #' x <- list( 21 | #' urban = list( 22 | #' cars = c('volvo', 'ford'), 23 | #' food.dining = list( 24 | #' local.business = c('carls'), 25 | #' chain.business = c('dennys', 'panera') 26 | #' ) 27 | #' ), 28 | #' rural = list( 29 | #' land.use = list( 30 | #' farming =list( 31 | #' dairy = c('cows'), 32 | #' vegie.plan = c('carrots') 33 | #' ) 34 | #' ), 35 | #' social.rec = list( 36 | #' community.center = c('town.square') 37 | #' ), 38 | #' people.type = c('good', 'bad', 'in.between') 39 | #' ), 40 | #' other.locales = c('suburban'), 41 | #' missing = list( 42 | #' unknown = c(), 43 | #' known = c() 44 | #' ), 45 | #' end = c('wow') 46 | #' ) 47 | #' 48 | #' x 49 | #' 50 | #' flatten(x) 51 | #' flatten(x, ' -> ') 52 | flatten <- function(x , sep = '_', ...){ 53 | 54 | stopifnot(is.list(x)) 55 | 56 | x <- fix_names(x) 57 | 58 | out<- flatten_h(x) 59 | 60 | names(out) <- gsub('\\.', sep, names(out)) 61 | 62 | names(out) <- gsub('unlikelystringtodupe', '.', names(out), fixed = TRUE) 63 | 64 | out[order(names(out))] 65 | 66 | } 67 | 68 | flatten_h <- function(x){ 69 | 70 | z <- unlist(lapply(x, function(y) class(y)[1] == "list")) 71 | 72 | out <- c(x[!z], unlist(x[z], recursive=FALSE)) 73 | 74 | if (sum(z)){ 75 | Recall(out) 76 | } else { 77 | out 78 | } 79 | } 80 | 81 | fix_names <- function(x) { 82 | 83 | if (is.list(x)) { 84 | names(x) <- gsub('\\.', 'unlikelystringtodupe', names(x)) 85 | 
lapply(x, fix_names) 86 | } else { 87 | 88 | x 89 | } 90 | } 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /R/from_to.R: -------------------------------------------------------------------------------- 1 | #' Prepare Discourse Data for Network Plotting 2 | #' 3 | #' \code{from_to} - Add the next speaker as the from variable in a to/from 4 | #' network data structure. Assumes that the flow of discourse is coming from 5 | #' person A to person B, or at the very least the talk is taken up by person B. 6 | #' Works by taking the vector of speakers and shifting everything down one and 7 | #' then adding a closing element. 8 | #' 9 | #' @param x A data form \code{vector} or \code{data.frame}. 10 | #' @param final The name of the closing element or node. 11 | #' @param \ldots Ignored. 12 | #' @param from.var A character string naming the column to be considered the 13 | #' origin of the talk. 14 | #' @param id.vars The variables that correspond to the speaker or are attributes 15 | #' of the speaker (from variable). 16 | #' @param text.var The name of the text variable. If \code{TRUE} 17 | #' \code{duration} tries to detect the text column. 18 | #' @return Returns a vector (if given a vector) or an augmented 19 | #' \code{\link[data.table]{data.table}}. 
20 | #' @rdname from_to 21 | #' @export 22 | #' @examples 23 | #' from_to(DATA, 'person') 24 | #' from_to_summarize(DATA, 'person') 25 | #' from_to_summarize(DATA, 'person', c('sex', 'adult')) 26 | #' \dontrun{ 27 | #' if (!require("pacman")) install.packages("pacman"); library(pacman) 28 | #' p_load(dplyr, geomnet, qdap, stringi, scales) 29 | #' p_load_current_gh('trinker/textshape') 30 | #' 31 | #' dat <- from_to_summarize(DATA, 'person', c('sex', 'adult')) %>% 32 | #'     mutate(words = rescale(word.count, c(.5, 1.5))) 33 | #' 34 | #' dat %>% 35 | #'     ggplot(aes(from_id = from, to_id = to)) + 36 | #'         geom_net( 37 | #'             aes(linewidth = words), 38 | #'             layout.alg = "fruchtermanreingold", 39 | #'             directed = TRUE, 40 | #'             labelon = TRUE, 41 | #'             size = 1, 42 | #'             labelcolour = 'black', 43 | #'             ecolour = "grey70", 44 | #'             arrowsize = 1, 45 | #'             curvature = .1 46 | #'         ) + 47 | #'         theme_net() + 48 | #'         xlim(c(-0.05, 1.05)) 49 | #' } 50 | from_to <- function(x, ...){ 51 |     UseMethod("from_to") 52 | } 53 | 54 | #' @export 55 | #' @method from_to default 56 | #' @rdname from_to 57 | from_to.default <- function(x, final = 'End', ...){ 58 |     c(x[-1], final) 59 | } 60 | 61 | #' @export 62 | #' @method from_to character 63 | #' @rdname from_to 64 | from_to.character <- function(x, final = 'End', ...){ 65 |     c(x[-1], final) 66 | } 67 | 68 | #' @export 69 | #' @method from_to factor 70 | #' @rdname from_to 71 | from_to.factor <- function(x, final = 'End', ...){ 72 |     factor(c(as.character(x[-1]), final), levels = c(levels(x), final)) 73 | } 74 | 75 | #' @export 76 | #' @method from_to numeric 77 | #' @rdname from_to 78 | from_to.numeric <- function(x, final = 'End', ...){ 79 |     c(as.character(x[-1]), final) 80 | } 81 | 82 | #' @export 83 | #' @method from_to data.frame 84 | #' @rdname from_to 85 | from_to.data.frame <- function(x, from.var, final = 'End', ...){ 86 | 87 |     data.table::data.table(data.frame( 88 |         from = x[[from.var]], 89 |         to = from_to(x[[from.var]], final = final), 90 |         x, 91 |
stringsAsFactors = FALSE 92 |     )) 93 | 94 | } 95 | 96 | #' Prepare Discourse Data for Network Plotting 97 | #' 98 | #' \code{from_to_summarize} - A wrapper for \code{from_to.data.frame} that 99 | #' adds a \code{word.count} column and then combines duplicate rows. 100 | #' 101 | #' @rdname from_to 102 | #' @export 103 | from_to_summarize <- function(x, from.var, id.vars = NULL, text.var = TRUE, 104 |     ...){ 105 | 106 |     word.count <- NULL 107 | 108 |     z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 109 |     if (!is.null(id.vars)) { 110 |         w <- unique(z[, c(from.var, id.vars), with=FALSE]) 111 |     } 112 | 113 |     text.var <- detect_text_column(x, text.var) 114 | 115 |     express1 <- parse( 116 |         text=paste0("word.count := stringi::stri_count_words(", text.var, ")") 117 |     ) 118 | 119 |     z <- z[, eval(express1)][, 120 |         'word.count' := ifelse(is.na(word.count), 0, word.count)][] 121 | 122 |     out <- from_to(z, from.var)[, 123 |         list(word.count = sum(word.count)), c('from', 'to')] 124 | 125 |     if (!is.null(id.vars)) { 126 |         out <- merge(out, w, all.x=TRUE, by.x = 'from', by.y = from.var) 127 |     } 128 | 129 |     out 130 | } 131 | -------------------------------------------------------------------------------- /R/grab_index.R: -------------------------------------------------------------------------------- 1 | #' Get Elements Between 2 Points 2 | #' 3 | #' Use integer indices to get all the elements/rows between two points. 4 | #' 5 | #' @param x A character vector, \code{\link[base]{data.frame}}, or list. 6 | #' @param from An integer to start from (if \code{NULL} defaults to the first 7 | #' element/row). 8 | #' @param to An integer to get up to (if \code{NULL} defaults to the last 9 | #' element/row). 10 | #' @param \ldots ignored. 11 | #' @return Returns a subset of the original data set.
12 | #' @export 13 | #' @examples 14 | #' grab_index(DATA, from = 2, to = 4) 15 | #' grab_index(DATA$state, from = 2, to = 4) 16 | #' grab_index(DATA$state, from = 2) 17 | #' grab_index(DATA$state, to = 4) 18 | #' grab_index(matrix(1:100, nrow = 10), 2, 4) 19 | grab_index <- function(x, from = NULL, to = NULL, ...){ 20 | 21 |     UseMethod('grab_index') 22 | 23 | } 24 | 25 | #' @export 26 | #' @rdname grab_index 27 | #' @method grab_index character 28 | grab_index.character <- function(x, from = NULL, to = NULL, ...){ 29 | 30 |     grab_index.default(x, from = from, to = to, ...) 31 | 32 | } 33 | 34 | #' @export 35 | #' @rdname grab_index 36 | #' @method grab_index default 37 | grab_index.default <- function(x, from = NULL, to = NULL, ...){ 38 | 39 |     if (is.null(from)) from <- 1 40 |     if (is.null(to)) to <- length(x) 41 | 42 |     if (from < 1 | from > length(x)) stop('`from` must be >= 1 & <= length(x)') 43 |     if (to < 1 | to > length(x)) stop('`to` must be >= 1 & <= length(x)') 44 |     x[from:to] 45 | 46 | } 47 | 48 | #' @export 49 | #' @rdname grab_index 50 | #' @method grab_index list 51 | grab_index.list <- function(x, from = NULL, to = NULL, ...){ 52 | 53 |     grab_index.default(x, from = from, to = to, ...)
54 | 55 | } 56 | 57 | #' @export 58 | #' @rdname grab_index 59 | #' @method grab_index data.frame 60 | grab_index.data.frame <- function(x, from = NULL, to = NULL, ...){ 61 | 62 |     if (is.null(from)) from <- 1 63 |     if (is.null(to)) to <- nrow(x) 64 |     if (from < 1 | from > nrow(x)) stop('`from` must be >= 1 & <= nrow(x)') 65 |     if (to < 1 | to > nrow(x)) stop('`to` must be >= 1 & <= nrow(x)') 66 |     x[from:to, , drop = FALSE] 67 | 68 | } 69 | 70 | 71 | ## Helper function(s) 72 | #' @export 73 | #' @rdname grab_index 74 | #' @method grab_index matrix 75 | grab_index.matrix <- function(x, from = NULL, to = NULL, ...){ 76 | 77 |     if (is.null(from)) from <- 1 78 |     if (is.null(to)) to <- nrow(x) 79 |     if (from < 1 | from > nrow(x)) stop('`from` must be >= 1 & <= nrow(x)') 80 |     if (to < 1 | to > nrow(x)) stop('`to` must be >= 1 & <= nrow(x)') 81 |     x[from:to, , drop = FALSE] 82 | 83 | } 84 | 85 | -------------------------------------------------------------------------------- /R/grab_match.R: -------------------------------------------------------------------------------- 1 | #' Get Elements Matching Between 2 Points 2 | #' 3 | #' Use regexes to get all the elements between two points. 4 | #' 5 | #' @param x A character vector, \code{\link[base]{data.frame}}, or list. 6 | #' @param from A regex to start getting from (if \code{NULL} defaults to the 7 | #' first element/row). 8 | #' @param to A regex to get up to (if \code{NULL} defaults to the last element/row). 9 | #' @param from.n If more than one element matches \code{from} this dictates 10 | #' which one should be used.  Must be an integer up to the number of possible 11 | #' matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match 12 | #' possible), or \code{'n'} (the same as \code{'last'}). 13 | #' @param to.n If more than one element matches \code{to} this dictates 14 | #' which one should be used.  Must be an integer up to the number of possible 15 | #' matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match 16 | #' possible), or \code{'n'} (the same as \code{'last'}).
17 | #' @param \ldots Other arguments passed to \code{\link[base]{grep}}, the most 18 | #' notable being \code{ignore.case}. 19 | #' @param text.var The name of the text variable with matches.  If \code{TRUE} 20 | #' \code{grab_match} tries to detect the text column. 21 | #' @return Returns a subset of the original data set. 22 | #' @export 23 | #' @examples 24 | #' grab_match(DATA$state, from = 'dumb', to = 'liar') 25 | #' grab_match(DATA$state, from = 'dumb') 26 | #' grab_match(DATA$state, to = 'liar') 27 | #' grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE) 28 | #' grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE, 29 | #'     from.n = 'first', to.n = 'last') 30 | #' grab_match(as.list(DATA$state), from = 'dumb', to = 'liar') 31 | #' 32 | #' ## Data.frame: attempts to find text.var 33 | #' grab_match(DATA, from = 'dumb', to = 'liar') 34 | grab_match <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){ 35 | 36 |     UseMethod('grab_match') 37 | 38 | } 39 | 40 | #' @export 41 | #' @rdname grab_match 42 | #' @method grab_match character 43 | grab_match.character <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){ 44 | 45 |     locs <- grab_match_helper(x = x, from = from, to = to, from.n = from.n, to.n = to.n, ...) 46 | 47 |     grab_index(x, locs[['from.ind']], locs[['to.ind']]) 48 | } 49 | 50 | #' @export 51 | #' @rdname grab_match 52 | #' @method grab_match list 53 | grab_match.list <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...){ 54 | 55 |     locs <- grab_match_helper(x = lapply(x, unlist), from = from, to = to, from.n = from.n, to.n = to.n, ...) 56 | 57 |     grab_index(x, locs[['from.ind']], locs[['to.ind']]) 58 | } 59 | 60 | #' @export 61 | #' @rdname grab_match 62 | #' @method grab_match data.frame 63 | grab_match.data.frame <- function(x, from = NULL, to = NULL, from.n = 1, to.n = 1, 64 |     text.var = TRUE, ...){ 65 | 66 |     text.var <- detect_text_column(x, text.var) 67 | 68 |     locs <- grab_match_helper(x =
x[[text.var]], from = from, to = to, from.n = from.n, to.n = to.n, ...) 69 | 70 |     grab_index(x, locs[['from.ind']], locs[['to.ind']]) 71 | } 72 | 73 | 74 | 75 | 76 | 77 | grab_match_helper <- function(x, from, to, from.n, to.n, ...){ 78 | 79 |     from.n <- nth(from.n) 80 |     to.n <- nth(to.n) 81 | 82 |     if (is.null(from)) { 83 |         fi <- 1 84 |     } else { 85 |         fi <- get_index(from.n, match_to_index(x, from, use = 'from', ...), use = 'from.n') 86 |     } 87 | 88 | 89 |     if (is.null(to)) { 90 |         ti <- length(x) 91 |     } else { 92 |         ti <- get_index(to.n, match_to_index(x, to, use = 'to', ...), use = 'to.n') 93 |     } 94 | 95 |     c( 96 |         from.ind = fi, 97 |         to.ind = ti 98 |     ) 99 | 100 | } 101 | 102 | 103 | get_index <- function(desired.i, possible.i, use, ...){ 104 | 105 |     if (!is.infinite(desired.i) && desired.i > length(possible.i)) { 106 |         warning(sprintf('desired `%s` exceeds number of matches; using first match instead', use)) 107 |         return(possible.i[1]) 108 |     } 109 |     if (is.infinite(desired.i)){ 110 |         return(possible.i[length(possible.i)]) 111 |     } 112 |     possible.i[desired.i] 113 | } 114 | 115 | 116 | match_to_index <- function(x, regex, use, ...){ 117 | 118 |     if (is.null(regex)) return(NULL) 119 | 120 |     out <- grep(regex, x, perl = TRUE, ...) 121 |     if (length(out) == 0) stop(sprintf('`%s` did not have any matches', use)) 122 |     out 123 | } 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /R/mtabulate.R: -------------------------------------------------------------------------------- 1 | #' Tabulate Frequency Counts for Multiple Vectors 2 | #' 3 | #' \code{mtabulate} - Similar to \code{\link[base]{tabulate}} but works on 4 | #' multiple vectors. 5 | #' 6 | #' @param vects A \code{\link[base]{vector}}, \code{\link[base]{list}}, or 7 | #' \code{\link[base]{data.frame}} of named/unnamed vectors.
8 | #' @keywords tabulate frequency 9 | #' @export 10 | #' @seealso \code{\link[base]{tabulate}} 11 | #' @return \code{mtabulate} - Returns a \code{\link[base]{data.frame}} with 12 | #' columns equal to the number of unique elements and the number of rows equal to 13 | #' the original length of the \code{\link[base]{vector}}, 14 | #' \code{\link[base]{list}}, or \code{\link[base]{data.frame}} (length equals 15 | #' number of columns in \code{\link[base]{data.frame}}).  If the list of vectors is 16 | #' named, these will be the rownames of the dataframe. 17 | #' @author Joran Elias and Tyler Rinker . 18 | #' @rdname mtabulate 19 | #' @references \url{https://stackoverflow.com/a/9961324/1000343} 20 | #' @examples 21 | #' mtabulate(list(w=letters[1:10], x=letters[1:5], z=letters)) 22 | #' mtabulate(list(mtcars$cyl[1:10])) 23 | #' 24 | #' ## Dummy coding 25 | #' mtabulate(mtcars$cyl[1:10]) 26 | #' mtabulate(CO2[, "Plant"]) 27 | #' 28 | #' dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3)) 29 | #' mtabulate(dat) 30 | #' as_list(mtabulate(dat)) 31 | #' t(mtabulate(dat)) 32 | #' as_list(t(mtabulate(dat))) 33 | mtabulate <- function(vects) { 34 | 35 |     lev <- sort(unique(unlist(vects))) 36 |     dat <- do.call(rbind, lapply(vects, function(x, lev){ 37 |         tabulate(factor(x, levels = lev, ordered = TRUE), 38 |         nbins = length(lev))}, lev = lev)) 39 |     colnames(dat) <- lev 40 |     data.frame(dat, check.names = FALSE) 41 | 42 | } 43 | 44 | 45 | 46 | #' Tabulate Frequency Counts for Multiple Vectors 47 | #' 48 | #' \code{as_list} - Convert a count matrix to a named list of elements.  The 49 | #' semantic inverse of \code{mtabulate}. 50 | #' 51 | #' @param mat A matrix of counts. 52 | #' @param nm A character vector of names to assign to the list. 53 | #' @rdname mtabulate 54 | #' @return \code{as_list} - Returns a list of elements.
55 | #' @export 56 | as_list <- function(mat, nm = rownames(mat)) { 57 | 58 |     nms <- colnames(mat) 59 | 60 |     lst <- lapply(seq_len(nrow(mat)), function(i) rep(nms, mat[i, , drop = FALSE])) 61 | 62 |     #if (nrow(mat) == 1) lst <- list(c(lst)) 63 |     if (!is.list(lst) & is.atomic(lst)) lst <- as.list(lst) 64 |     if (!is.list(lst)) lst <- lapply(seq_len(ncol(lst)), function(i) lst[, i]) 65 |     stats::setNames(lst, nm = nm) 66 | 67 | } 68 | 69 | -------------------------------------------------------------------------------- /R/split_index.R: -------------------------------------------------------------------------------- 1 | #' Split Data Forms at Specified Indices 2 | #' 3 | #' Split data forms at specified integer indices. 4 | #' 5 | #' @param x A data form (\code{list}, \code{vector}, \code{data.frame}, 6 | #' \code{matrix}). 7 | #' @param indices A vector of integer indices to split at.  If \code{indices} 8 | #' contains the index 1, it will be silently dropped.  The default value when 9 | #' \code{x} evaluates to \code{TRUE} for \code{\link[base]{is.atomic}} is to use 10 | #' \code{\link[textshape]{change_index}(x)}. 11 | #' @param names Optional vector of names to give to the list elements. 12 | #' @param \ldots Ignored. 13 | #' @return Returns a list of data forms broken at the \code{indices}. 14 | #' @note Two dimensional objects will retain their dimensions (i.e., \code{drop = FALSE} 15 | #' is used).
16 | #' @seealso \code{\link[textshape]{change_index}} 17 | #' @export 18 | #' @examples 19 | #' ## character 20 | #' split_index(LETTERS, c(4, 10, 16)) 21 | #' split_index(LETTERS, c(4, 10, 16), c("dog", "cat", "chicken", "rabbit")) 22 | #' 23 | #' ## numeric 24 | #' split_index(1:100, c(33, 66)) 25 | #' 26 | #' ## factor 27 | #' (p_chng <- change_index(CO2[["Plant"]])) 28 | #' split_index(CO2[["Plant"]], p_chng) 29 | #' #`change_index` was unnecessary as it is the default of atomic vectors 30 | #' split_index(CO2[["Plant"]]) 31 | #' 32 | #' ## list 33 | #' split_index(as.list(LETTERS), c(4, 10, 16)) 34 | #' 35 | #' ## data.frame 36 | #' (vs_change <- change_index(mtcars[["vs"]])) 37 | #' split_index(mtcars, vs_change) 38 | #' 39 | #' ## matrix 40 | #' (mat <- matrix(1:50, nrow=10)) 41 | #' split_index(mat, c(3, 6, 10)) 42 | split_index <- function(x, 43 | indices = if (is.atomic(x)) {NULL} else {change_index(x)}, names = NULL, 44 | ...) { 45 | 46 | indices 47 | names 48 | UseMethod("split_index") 49 | 50 | } 51 | 52 | #' @export 53 | #' @method split_index list 54 | #' @rdname split_index 55 | split_index.list <- function(x, indices, names = NULL, ...) { 56 | 57 | names <- name_len_check(indices, names) 58 | out <- split_index_vector(x, indices, ...) 59 | if(!is.null(names)) names(out) <- names 60 | out 61 | } 62 | 63 | #' @export 64 | #' @method split_index data.frame 65 | #' @rdname split_index 66 | split_index.data.frame <- function(x, indices, names = NULL, ...) { 67 | 68 | names <- name_len_check(indices, names) 69 | out <- split_index_mat(x, indices, ...) 70 | if(!is.null(names)) names(out) <- names 71 | out 72 | } 73 | 74 | #' @export 75 | #' @method split_index matrix 76 | #' @rdname split_index 77 | split_index.matrix <- function(x, indices, names = NULL, ...) { 78 | 79 | names <- name_len_check(indices, names) 80 | out <- split_index_mat(x, indices, ...) 
81 |     if(!is.null(names)) names(out) <- names 82 |     out 83 | } 84 | 85 | #' @export 86 | #' @method split_index numeric 87 | #' @rdname split_index 88 | split_index.numeric <- function(x, indices = change_index(x), names = NULL, 89 |     ...) { 90 | 91 |     names <- name_len_check(indices, names) 92 |     out <- split_index_vector(x, indices, ...) 93 |     if(!is.null(names)) names(out) <- names 94 |     out 95 | } 96 | 97 | #' @export 98 | #' @method split_index factor 99 | #' @rdname split_index 100 | split_index.factor <- function(x, indices = change_index(x), names = NULL, 101 |     ...) { 102 | 103 |     names <- name_len_check(indices, names) 104 |     out <- split_index_vector(x, indices, ...) 105 |     if(!is.null(names)) names(out) <- names 106 |     out 107 | } 108 | 109 | #' @export 110 | #' @method split_index character 111 | #' @rdname split_index 112 | split_index.character <- function(x, indices = change_index(x), 113 |     names = NULL, ...) { 114 | 115 |     names <- name_len_check(indices, names) 116 |     out <- split_index_vector(x, indices, ...) 117 |     if(!is.null(names)) names(out) <- names 118 |     out 119 | } 120 | 121 | #' @export 122 | #' @method split_index default 123 | #' @rdname split_index 124 | split_index.default <- function(x, indices = change_index(x), 125 |     names = NULL, ...) { 126 | 127 |     names <- name_len_check(indices, names) 128 |     out <- split_index_vector(x, indices, ...) 129 |     if(!is.null(names)) names(out) <- names 130 |     out 131 | } 132 | 133 | 134 | 135 | 136 | split_index_vector <- function(x, indices, ...){ 137 |     if (any(indices %in% "1")) indices <- indices[!indices %in% "1"] 138 |     starts <- c(1, indices) 139 |     Map(function(s, e) {x[s:e]}, starts, c(indices - 1, length(x))) 140 | } 141 | 142 | 143 | split_index_mat <- function(x, indices, names = NULL, ...) { 144 | 145 |     indices <- indices[!indices %in% "1"] 146 |     len <- nrow(x) 147 |     if (len < max(indices)) { 148 |         stop( 149 |             "One or more `indices` elements exceeds nrow of `x`", 150 |             call.
= FALSE 151 |         ) 152 |     } 153 | 154 |     starts <- c(1, indices) 155 |     Map(function(s, e) {x[s:e, , drop = FALSE]}, starts, c(indices - 1, nrow(x))) 156 | 157 | } 158 | 159 | 160 | 161 | name_len_check <- function(indices, names) { 162 | 163 |     if (is.null(names)) return(names) 164 |     check <- length(indices) + 1 == length(names) 165 |     if (!check) { 166 |         warning( 167 |             paste( 168 |                 "length of `names` must be equal to length", 169 |                 "of `indices` + 1; ignoring `names`" 170 |             ), 171 |             call. = FALSE 172 |         ) 173 |     } 174 |     if (!check) NULL else names 175 | } 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /R/split_match.R: -------------------------------------------------------------------------------- 1 | #' Split a Vector By Split Points 2 | #' 3 | #' \code{split_match} - Splits a \code{vector} into a list of vectors based on 4 | #' split points. 5 | #' 6 | #' @param x A vector with split points. 7 | #' @param split A vector of places (elements) to split on or a regular 8 | #' expression if \code{regex} argument is \code{TRUE}. 9 | #' @param include An integer of \code{0} (\code{split} character(s) are not 10 | #' included in the output), \code{1} (\code{split} character(s) are included at 11 | #' the beginning of the output), or \code{2} (\code{split} character(s) are 12 | #' included at the end of the output). 13 | #' @param regex logical.  If \code{TRUE} regular expressions will be enabled for 14 | #' \code{split} argument. 15 | #' @param \ldots other arguments passed to \code{\link[base]{grep}} and 16 | #' \code{\link[base]{grepl}}. 17 | #' @return Returns a list of vectors. 18 | #' @author Matthew Flickinger and Tyler Rinker .
19 | #' @references \url{https://stackoverflow.com/a/24319217/1000343} 20 | #' @export 21 | #' @rdname split_match 22 | #' @examples 23 | #' set.seed(15) 24 | #' x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10))) 25 | #' 26 | #' split_match(x) 27 | #' split_match(x, "C") 28 | #' split_match(x, c("", "C")) 29 | #' 30 | #' split_match(x, include = 0) 31 | #' split_match(x, include = 1) 32 | #' split_match(x, include = 2) 33 | #' 34 | #' set.seed(15) 35 | #' x <- sample(1:11, 25, TRUE, prob=c(.2, rep(.08, 10))) 36 | #' split_match(x, 1) 37 | split_match <- function(x, split = "", include = FALSE, regex = FALSE, ...) { 38 | 39 | include <- as.numeric(include) 40 | 41 | if (length(include) != 1 || !include %in% 0:2) { 42 | stop("Supply 0, 1, or 2 to `include`") 43 | } 44 | 45 | if (include %in% 0:1){ 46 | if (!regex){ 47 | breaks <- x %in% split 48 | } else { 49 | breaks <- grepl(split, x, ...) 50 | } 51 | 52 | if(include == 1) { 53 | inds <- rep(TRUE, length(breaks)) 54 | } else { 55 | inds <- !breaks 56 | } 57 | out <- split(x[inds], cumsum(breaks)[inds]) 58 | names(out) <- seq_along(out) 59 | out 60 | 61 | } else { 62 | if (!regex){ 63 | locs <- which(x %in% split) 64 | } else { 65 | locs <- grep(split, x, ...) 66 | } 67 | 68 | start <- c(1, locs + 1) 69 | end <- c(locs, length(x)) 70 | 71 | lapply(Map(":", start, end), function(ind){ 72 | x[ind] 73 | }) 74 | } 75 | } 76 | 77 | 78 | #' Split a Vector By Split Points 79 | #' 80 | #' \code{split_match_regex} - \code{split_match} with \code{regex = TRUE}. 81 | #' 82 | #' @export 83 | #' @rdname split_match 84 | split_match_regex <- function(x, split = "", include = FALSE, ...){ 85 | split_match(x =x, split = split, include = include, regex = TRUE, ...) 
86 | } 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /R/split_match_regex_to_transcript.R: -------------------------------------------------------------------------------- 1 | #' #' Split Text by Regex Into a Transcript 2 | #' #' 3 | #' #' A wrapper for \code{\link[textshape]{split_match_regex}} and 4 | #' #' \pkg{textreadr}'s \code{as_transript} to detect person variable, split the 5 | #' #' text into turns of talk, and convert to a data.frame with \code{person} and 6 | #' #' \code{dialogue} variables. There is a bit of cleansing that is closer to 7 | #' #' \code{as_transript} than \code{\link[textshape]{split_transcript}}. 8 | #' #' 9 | #' #' @param x A vector with split points. 10 | #' #' @param person.regex A vector of places (elements) to split on or a regular 11 | #' #' expression if \code{regex} argument is \code{TRUE}. 12 | #' #' @param col.names A character vector specifying the column names of the 13 | #' #' transcript columns. 14 | #' #' @param dash A character string to replace the en and em dashes special 15 | #' #' characters (default is to remove). 16 | #' #' @param ellipsis A character string to replace the ellipsis special 17 | #' #' characters. 18 | #' #' @param quote2bracket logical. If \code{TRUE} replaces curly quotes with 19 | #' #' curly braces (default is \code{FALSE}). If \code{FALSE} curly quotes are 20 | #' #' removed. 21 | #' #' @param rm.empty.rows logical. If \code{TRUE} 22 | #' #' \code{\link[textreadr]{read_transcript}} attempts to remove empty rows. 23 | #' #' @param skip Integer; the number of lines of the data file to skip before 24 | #' #' beginning to read data. 25 | #' #' @param \ldots ignored. 26 | #' #' @return Returns a data.frame of dialogue and people. 
27 | #' #' @export 28 | #' split_match_regex_to_transcript <- function (x, person.regex = "^[A-Z]{3,}", 29 | #' col.names = c("Person", "Dialogue"), dash = "", ellipsis = "...", 30 | #' quote2bracket = FALSE, rm.empty.rows = TRUE, skip = 0, ...) { 31 | #' 32 | #' text2transcript( 33 | #' combine_list( 34 | #' split_match(x, split = person.regex, include = TRUE, regex = TRUE) 35 | #' ), 36 | #' person.regex = person.regex, 37 | #' col.names = col.names, 38 | #' dash = dash, 39 | #' ellipsis = ellipsis, 40 | #' quote2bracket = quote2bracket, 41 | #' rm.empty.rows = rm.empty.rows, 42 | #' skip = skip, 43 | #' ... 44 | #' ) 45 | #' 46 | #' } 47 | #' 48 | #' 49 | #' combine_list <- function (x, fix.punctuation = TRUE, ...) { 50 | #' 51 | #' if (!is.list(x)) x <- list(x) 52 | #' x <- unlist(lapply(x, paste, collapse = " ")) 53 | #' if (isTRUE(fix.punctuation)) { 54 | #' x <- gsub("(\\s+(?=[,.?!;:%-]))|((?<=[$-])\\s+)", "", x, perl = TRUE) 55 | #' } 56 | #' unname(x) 57 | #' 58 | #' } 59 | #' 60 | #' 61 | #' 62 | #' text2transcript <- function(text, person.regex = NULL, 63 | #' col.names = c("Person", "Dialogue"), text.var = NULL, 64 | #' merge.broke.tot = TRUE, header = FALSE, dash = "", ellipsis = "...", 65 | #' quote2bracket = FALSE, rm.empty.rows = TRUE, na = "", skip = 0, ...) 
{ 66 | #' 67 | #' sep <- ":" 68 | #' text <- unlist(strsplit(text, "\n")) 69 | #' text <- paste( 70 | #' gsub( 71 | #' paste0('(', person.regex, ')'), 72 | #' "\\1SEP_PLACE_HOLDER", 73 | #' text, 74 | #' perl = TRUE 75 | #' ), 76 | #' collapse = "\n" 77 | #' ) 78 | #' 79 | #' text <- gsub(":", "SYMBOL_PLACE_HOLDER", text) 80 | #' text <- gsub("SEP_PLACE_HOLDER", ":", text, fixed = TRUE) 81 | #' 82 | #' ## Use read.table to split read the text as a table 83 | #' x <- utils::read.table( 84 | #' text=text, 85 | #' header = header, 86 | #' sep = sep, 87 | #' skip=skip, 88 | #' quote = "" 89 | #' ) 90 | #' 91 | #' x[[2]] <- gsub("SYMBOL_PLACE_HOLDER", ":", x[[2]], fixed = TRUE) 92 | #' 93 | #' if (!is.null(text.var) & !is.numeric(text.var)) { 94 | #' text.var <- which(colnames(x) == text.var) 95 | #' } else { 96 | #' text.col <- function(dataframe) { 97 | #' dial <- function(x) { 98 | #' if(is.factor(x) | is.character(x)) { 99 | #' n <- max(nchar(as.character(x)), na.rm = TRUE) 100 | #' } else { 101 | #' n <- NA 102 | #' } 103 | #' } 104 | #' which.max(unlist(lapply(dataframe, dial))) 105 | #' } 106 | #' text.var <- text.col(x) 107 | #' } 108 | #' 109 | #' x[[text.var]] <- trimws( 110 | #' iconv(as.character(x[[text.var]]), "", "ASCII", "byte") 111 | #' ) 112 | #' 113 | #' if (is.logical(quote2bracket)) { 114 | #' if (quote2bracket) { 115 | #' rbrac <- "}" 116 | #' lbrac <- "{" 117 | #' } else { 118 | #' lbrac <- rbrac <- "" 119 | #' } 120 | #' } else { 121 | #' rbrac <- quote2bracket[2] 122 | #' lbrac <- quote2bracket[1] 123 | #' } 124 | #' 125 | #' ser <- c("<80><9c>", "<80><9d>", "<80><98>", "<80><99>", 126 | #' "<80><9b>", "<87>", "<80>", "<80><93>", 127 | #' "<80><94>", "", "", "") 128 | #' 129 | #' reps <- c(lbrac, rbrac, "'", "'", "'", "'", ellipsis, dash, dash, "a", "e", 130 | #' "half") 131 | #' 132 | #' Encoding(x[[text.var]]) <-"latin1" 133 | #' x[[text.var]] <- clean(.mgsub(ser, reps, x[[text.var]])) 134 | #' if(rm.empty.rows) { 135 | #' x <- rm_empty_row(x) 
136 | #' } 137 | #' if (!is.null(col.names)) { 138 | #' colnames(x) <- col.names 139 | #' } 140 | #' 141 | #' x <- as.data.frame(x, stringsAsFactors = FALSE) 142 | #' 143 | #' if (merge.broke.tot) { 144 | #' x <- combine_tot(x) 145 | #' } 146 | #' x <- rm_na_row(x, rm.empty.rows) 147 | #' class(x) <- c("textreadr", "data.frame") 148 | #' x 149 | #' } 150 | #' 151 | #' clean <- function (text.var) { 152 | #' gsub("\\s+", " ", gsub("\\\\r|\\\\n|\\n|\\\\t", " ", text.var)) 153 | #' } 154 | #' 155 | #' rm_na_row <- function(x, remove = TRUE) { 156 | #' if (!remove) return(x) 157 | #' x[rowSums(is.na(x)) != ncol(x), ] 158 | #' } 159 | #' 160 | #' rm_empty_row <- function(dataframe) { 161 | #' x <- paste2(dataframe, sep = "") 162 | #' x <- gsub("\\s+", "", x) 163 | #' ind <- x != "" 164 | #' return(dataframe[ind, , drop = FALSE]) 165 | #' } 166 | #' 167 | #' #Helper function used in read.transcript 168 | #' #' @importFrom data.table := 169 | #' combine_tot <- function(x){ 170 | #' person <- NULL 171 | #' nms <- colnames(x) 172 | #' colnames(x) <- c('person', 'z') 173 | #' x <- data.table::data.table(x) 174 | #' 175 | #' exp <- parse(text='list(text = paste(z, collapse = " "))')[[1]] 176 | #' out <- x[, eval(exp), 177 | #' by = list(person, 'new' = data.table::rleid(person))][, 178 | #' 'new' := NULL][] 179 | #' data.table::setnames(out, nms) 180 | #' out 181 | #' } 182 | #' 183 | -------------------------------------------------------------------------------- /R/split_portion.R: -------------------------------------------------------------------------------- 1 | #' Break Text Into Ordered Word Chunks 2 | #' 3 | #' Some visualizations and algorithms require text to be broken into chunks of 4 | #' ordered words. \code{split_portion} breaks text, optionally by grouping 5 | #' variables, into equal chunks. The chunk size can be specified by giving 6 | #' number of words to be in each chunk or the number of chunks. 
7 | #' 8 | #' @param text.var The text variable 9 | #' @param grouping.var The grouping variables. Default \code{NULL} generates 10 | #' one word list for all text. Also takes a single grouping variable or a list 11 | #' of 1 or more grouping variables. 12 | #' @param n.words An integer specifying the number of words in each chunk (must 13 | #' specify n.chunks or n.words). 14 | #' @param n.chunks An integer specifying the number of chunks (must specify 15 | #' n.chunks or n.words). 16 | #' @param as.string logical. If \code{TRUE} the chunks are returned as a single 17 | #' string. If \code{FALSE} the chunks are returned as a vector of single words. 18 | #' @param rm.unequal logical. If \code{TRUE} final chunks that are unequal in 19 | #' length to the other chunks are removed. 20 | #' @param as.table logical. If \code{TRUE} the list output is coerced to 21 | #' \code{\link[data.table]{data.table}} or \pkg{tibble}. 22 | #' @param \ldots Ignored. 23 | #' @return Returns a list or \code{\link[data.table]{data.table}} of text chunks. 
24 | #' @keywords chunks group text 25 | #' @export 26 | #' @examples 27 | #' with(DATA, split_portion(state, n.chunks = 10)) 28 | #' with(DATA, split_portion(state, n.words = 10)) 29 | #' with(DATA, split_portion(state, n.chunks = 10, as.string=FALSE)) 30 | #' with(DATA, split_portion(state, n.chunks = 10, rm.unequal=TRUE)) 31 | #' with(DATA, split_portion(state, person, n.chunks = 10)) 32 | #' with(DATA, split_portion(state, list(sex, adult), n.words = 10)) 33 | #' with(DATA, split_portion(state, person, n.words = 10, rm.unequal=TRUE)) 34 | #' 35 | #' ## Bigger data 36 | #' with(hamlet, split_portion(dialogue, person, n.chunks = 10)) 37 | #' with(hamlet, split_portion(dialogue, list(act, scene, person), n.chunks = 10)) 38 | #' with(hamlet, split_portion(dialogue, person, n.words = 300)) 39 | #' with(hamlet, split_portion(dialogue, list(act, scene, person), n.words = 300)) 40 | split_portion <- function(text.var, grouping.var = NULL, n.words, n.chunks, 41 | as.string = TRUE, rm.unequal = FALSE, as.table = TRUE, ...){ 42 | 43 | if (missing(n.chunks) && missing(n.words)) { 44 | stop("Must supply either `n.chunks` or `n.words`") 45 | } 46 | 47 | if(is.null(grouping.var)) { 48 | G <- "all" 49 | ilen <- 1 50 | } else { 51 | if (is.list(grouping.var)) { 52 | m <- unlist(as.character(substitute(grouping.var))[-1]) 53 | m <- sapply2(strsplit(m, "$", fixed=TRUE), function(x) { 54 | x[length(x)] 55 | } 56 | ) 57 | ilen <- length(grouping.var) 58 | G <- paste(m, collapse="&") 59 | } else { 60 | G <- as.character(substitute(grouping.var)) 61 | ilen <- length(G) 62 | G <- G[length(G)] 63 | } 64 | } 65 | if(is.null(grouping.var)){ 66 | grouping <- rep("all", length(text.var)) 67 | } else { 68 | if (is.list(grouping.var) & length(grouping.var)>1) { 69 | grouping <- paste2(grouping.var) 70 | } else { 71 | grouping <- unlist(grouping.var) 72 | } 73 | } 74 | 75 | ## split into ordered words by grouping variable 76 | dat <- lapply(split(as.character(text.var), grouping), function(x) 
{ 77 | unlist(stringi::stri_split_regex(x, "\\s+")) 78 | }) 79 | 80 | 81 | if (!missing(n.chunks)){ 82 | 83 | ## Check that n.chunks is integer 84 | if (!is.Integer(n.chunks)){ 85 | stop("`n.chunks` must be an integer") 86 | } 87 | out <- lapply( 88 | dat, 89 | split_portion_help_groups, 90 | N = n.chunks, 91 | ub = as.string, 92 | rme = rm.unequal 93 | ) 94 | 95 | } else { 96 | 97 | ## Check that n.words is integer 98 | if (!is.Integer(n.words)){ 99 | stop("`n.words` must be an integer") 100 | } 101 | out <- lapply( 102 | dat, 103 | split_portion_help_words, 104 | N = n.words, 105 | ub = as.string, 106 | rme = rm.unequal 107 | ) 108 | } 109 | 110 | grpvar <- stats::setNames( 111 | as.data.frame( 112 | do.call( 113 | rbind, 114 | strsplit(names(unlist(out)), "\\.") 115 | ), 116 | stringsAsFactors = FALSE 117 | ), c(strsplit(G, "&")[[1]], "index") 118 | ) 119 | 120 | if (isTRUE(as.table) & isTRUE(as.string)){ 121 | out <- data.frame( 122 | grpvar, 123 | text.var = unlist(out), 124 | stringsAsFactors = FALSE, 125 | row.names=NULL 126 | ) 127 | data.table::setDT(out) 128 | out <- out 129 | } 130 | out 131 | } 132 | 133 | 134 | split_portion_help_groups <- function(x, N, ub, rme){ 135 | 136 | len <- length(x) 137 | size <- floor(len/N) 138 | 139 | ## make the groups, leftover are unequal sized last group 140 | grabs <- rep(seq_len(N), each = size) 141 | if (N * size < len){ 142 | leftover <- rep(N + 1, len - N * size) 143 | grabs <- c(grabs, leftover) 144 | } 145 | 146 | y <- suppressWarnings(split(x, grabs)) 147 | if (rme){ 148 | ylen <- length(y) 149 | ## if there is only one chunk it is returned 150 | if (ylen != 1) { 151 | lens <- lengths(y) 152 | if (!Reduce("==", utils::tail(lens, 2))) y <- y[1:(ylen-1)] 153 | } 154 | } 155 | if (ub) y <- lapply(y, paste, collapse = " ") 156 | y 157 | } 158 | 159 | split_portion_help_words <- function(x, N, ub, rme){ 160 | 161 | len <- length(x) 162 | groups <- ceiling(len/N) 163 | y <- suppressWarnings(split(x, 
rep(seq_len(groups), each = N))) 164 | if (rme){ 165 | ylen <- length(y) 166 | if (ylen == 1) return(NULL) 167 | if (length(y[[ylen]]) != N) y <- y[1:(ylen-1)] 168 | } 169 | if (ub) y <- lapply(y, paste, collapse=" ") 170 | y 171 | } 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /R/split_run.R: -------------------------------------------------------------------------------- 1 | #' Split Runs 2 | #' 3 | #' Split runs of consecutive characters. 4 | #' 5 | #' @param x A \code{\link[base]{data.frame}} or character vector with runs. 6 | #' @param text.var The name of the text variable with runs. If \code{TRUE} 7 | #' \code{split_run} tries to detect the text column with runs. 8 | #' @param \ldots Ignored. 9 | #' @export 10 | #' @rdname split_run 11 | #' @importFrom data.table .N := 12 | #' @return Returns a list of vectors of runs or an expanded 13 | #' \code{\link[data.table]{data.table}} with runs split apart. 14 | #' @examples 15 | #' x1 <- c( 16 | #' "122333444455555666666", 17 | #' NA, 18 | #' "abbcccddddeeeeeffffff", 19 | #' "sddfg", 20 | #' "11112222333" 21 | #' ) 22 | #' 23 | #' x <- c(rep(x1, 2), ">>???,,,,....::::;[[") 24 | #' 25 | #' split_run(x) 26 | #' 27 | #' 28 | #' DATA[["run.col"]] <- x 29 | #' split_run(DATA, "run.col") 30 | split_run <- function(x, ...) { 31 | UseMethod("split_run") 32 | } 33 | 34 | #' @export 35 | #' @rdname split_run 36 | #' @method split_run default 37 | split_run.default <- function(x, ...) { 38 | strsplit(x, "(?<=([\\S^]))(?!\\1)", perl = TRUE) 39 | } 40 | 41 | #' @export 42 | #' @rdname split_run 43 | #' @method split_run data.frame 44 | split_run.data.frame <- function(x, text.var = TRUE, ...)
{ 45 | 46 | element_id <- NULL 47 | nms <- colnames(x) 48 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 49 | 50 | text.var <- detect_text_column(x, text.var) 51 | 52 | z[, element_id := 1:.N] 53 | express1 <- parse( 54 | text=paste0(text.var, " := list(split_run.default(", text.var, "))") 55 | ) 56 | z[, eval(express1)] 57 | 58 | express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))")) 59 | z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][, 60 | c(nms, "element_id"), with = FALSE] 61 | z[, 'sentence_id' := 1:.N, by = list(element_id)][] 62 | 63 | } 64 | -------------------------------------------------------------------------------- /R/split_sentence.R: -------------------------------------------------------------------------------- 1 | #' Split Sentences 2 | #' 3 | #' Split sentences. 4 | #' 5 | #' @param x A \code{\link[base]{data.frame}} or character vector with sentences. 6 | #' @param text.var The name of the text variable. If \code{TRUE} 7 | #' \code{split_sentence} tries to detect the column with sentences. 8 | #' @param \ldots Ignored. 9 | #' @export 10 | #' @rdname split_sentence 11 | #' @importFrom data.table .N := 12 | #' @return Returns a list of vectors of sentences or an expanded 13 | #' \code{\link[base]{data.frame}} with sentences split apart. 14 | #' @examples 15 | #' (x <- c(paste0( 16 | #' "Mr. Brown comes! He says hello. i give him coffee. i will ", 17 | #' "go at 5 p. m. eastern time. Or somewhere in between!go there" 18 | #' ), 19 | #' paste0( 20 | #' "Marvin K. Mooney Will You Please Go Now!", "The time has come.", 21 | #' "The time has come. The time is now. Just go. Go. GO!", 22 | #' "I don't care how." 23 | #' ))) 24 | #' split_sentence(x) 25 | #' 26 | #' data(DATA) 27 | #' split_sentence(DATA) 28 | #' 29 | #' \dontrun{ 30 | #' ## Kevin S.
Dias' sentence boundary disambiguation test set 31 | #' data(golden_rules) 32 | #' library(magrittr) 33 | #' 34 | #' golden_rules %$% 35 | #' split_sentence(Text) 36 | #' } 37 | split_sentence <- function(x, ...) { 38 | UseMethod("split_sentence") 39 | } 40 | 41 | #' @export 42 | #' @rdname split_sentence 43 | #' @method split_sentence default 44 | split_sentence.default <- function(x, ...) { 45 | get_sents2(x) 46 | } 47 | 48 | #' @export 49 | #' @rdname split_sentence 50 | #' @method split_sentence data.frame 51 | split_sentence.data.frame <- function(x, text.var = TRUE, ...) { 52 | 53 | element_id <- NULL 54 | nms <- colnames(x) 55 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 56 | 57 | text.var <- detect_text_column(x, text.var) 58 | 59 | z[, element_id := 1:.N] 60 | express1 <- parse( 61 | text=paste0(text.var, " := list(get_sents2(", text.var, "))") 62 | ) 63 | z[, eval(express1)] 64 | 65 | express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))")) 66 | z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][, 67 | c(nms, "element_id"), with = FALSE] 68 | z[, 'sentence_id' := 1:.N, by = list(element_id)][] 69 | 70 | } 71 | 72 | 73 | 74 | 75 | 76 | #get_sents <- function(x) { 77 | # x <- stringi::stri_replace_all_regex(stringi::stri_trans_tolower(x), sent_regex, "") 78 | # stringi::stri_split_regex(x, "(?>>$3<<>>' 154 | ) 155 | y <- stringi::stri_replace_all_regex(y, sent_regex, "<<>>") 156 | y <- stringi::stri_replace_all_regex( 157 | y, 158 | '(\\b[Nn]o)(\\.)(\\s+\\d)', 159 | '$1<<>>$3' 160 | ) 161 | y <- stringi::stri_replace_all_regex( 162 | y, 163 | '(\\b\\d+\\s+in)(\\.)(\\s[a-z])', 164 | '$1<<>>$3' 165 | ) 166 | y <- stringi::stri_replace_all_regex( 167 | y, 168 | '([?.!]+)([\'])([^,])', 169 | '<<>>$1 $3' 170 | ) 171 | y <- stringi::stri_replace_all_regex( 172 | y, 173 | '([?.!]+)(["])([^,])', 174 | '<<>>$1 $3' 175 | ) 176 | ## middle name handling 177 | y <- stringi::stri_replace_all_regex(y, 178
| '(\\b[A-Z][a-z]+\\s[A-Z])(\\.)(\\s[A-Z][a-z]+\\b)', 179 | '$1<<>>$3' 180 | ) 181 | 182 | #2 middle names 183 | y <- stringi::stri_replace_all_regex(y, 184 | '(\\b[A-Z][a-z]+\\s[A-Z])(\\.)(\\s[A-Z])(\\.)(\\s[A-Z][a-z]+\\b)', 185 | '$1<<>>$3<<>>$5' 186 | ) 187 | y <- stringi::stri_split_regex( 188 | y, 189 | paste0( 190 | "((?>>", ".") 203 | y <- stringi::stri_replace_all_regex(y, "(<<>>)([?.!]+)", "$2\"") 204 | y <- stringi::stri_replace_all_regex(y, "(<<>>)([?.!]+)", "$2'") 205 | 206 | split_index(y, locs) 207 | } 208 | 209 | get_sentences2 <- function(x, ...) { 210 | lapply( 211 | lapply( 212 | get_sents2(trimws(x)), 213 | function(x) { 214 | gsub("<<>>", ".", x) 215 | } 216 | ), 217 | function(x) { 218 | gsub("^\\s+|\\s+$", "", x) 219 | } 220 | ) 221 | } 222 | 223 | 224 | -------------------------------------------------------------------------------- /R/split_sentence_token.R: -------------------------------------------------------------------------------- 1 | #' Split Sentences & Tokens 2 | #' 3 | #' Split sentences and tokens. 4 | #' 5 | #' @param x A \code{\link[base]{data.frame}} or character vector with sentences. 6 | #' @param text.var The name of the text variable. If \code{TRUE} 7 | #' \code{split_sentence_token} tries to detect the column with sentences. 8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case. 9 | #' @param \ldots Ignored. 10 | #' @export 11 | #' @rdname split_sentence_token 12 | #' @importFrom data.table .N := 13 | #' @return Returns a list of vectors of sentences or an expanded 14 | #' \code{\link[base]{data.frame}} with sentences split apart. 15 | #' @examples 16 | #' (x <- c(paste0( 17 | #' "Mr. Brown comes! He says hello. i give him coffee. i will ", 18 | #' "go at 5 p. m. eastern time. Or somewhere in between!go there" 19 | #' ), 20 | #' paste0( 21 | #' "Marvin K. Mooney Will You Please Go Now!", "The time has come.", 22 | #' "The time has come. The time is now. Just go. Go. 
GO!", 23 | #' "I don't care how." 24 | #' ))) 25 | #' split_sentence_token(x) 26 | #' 27 | #' data(DATA) 28 | #' split_sentence_token(DATA) 29 | #' 30 | #' \dontrun{ 31 | #' ## Kevin S. Dias' sentence boundary disambiguation test set 32 | #' data(golden_rules) 33 | #' library(magrittr) 34 | #' 35 | #' golden_rules %$% 36 | #' split_sentence_token(Text) 37 | #' } 38 | split_sentence_token <- function(x, ...) { 39 | UseMethod("split_sentence_token") 40 | } 41 | 42 | #' @export 43 | #' @rdname split_sentence_token 44 | #' @method split_sentence_token default 45 | split_sentence_token.default <- function(x, lower = TRUE, ...) { 46 | 47 | split_sentence_token.data.frame(data.frame(text = x, 48 | stringsAsFactors = FALSE), lower = lower, ...) 49 | 50 | } 51 | 52 | #' @export 53 | #' @rdname split_sentence_token 54 | #' @method split_sentence_token data.frame 55 | split_sentence_token.data.frame <- function(x, text.var = TRUE, 56 | lower = TRUE, ...) { 57 | 58 | element_id <- NULL 59 | z <- split_sentence(x, text.var = text.var, ...) 60 | 61 | nms <- colnames(z) 62 | 63 | text.var <- detect_text_column(z, text.var) 64 | 65 | express1 <- parse( 66 | text=paste0( 67 | text.var, 68 | " := list(split_token.default(", 69 | text.var, 70 | ", lower = ", 71 | lower, 72 | "))" 73 | ) 74 | ) 75 | 76 | z[, eval(express1)] 77 | 78 | express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))")) 79 | z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][, 80 | c(nms), with = FALSE] 81 | z[, 'token_id' := 1:.N, by = list(element_id)][] 82 | 83 | } 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /R/split_speaker.R: -------------------------------------------------------------------------------- 1 | #' Break and Stretch if Multiple Persons per Cell 2 | #' 3 | #' Look for cells with multiple people and create separate rows for each person.
4 | #' 5 | #' @param dataframe A dataframe that contains the person variable. 6 | #' @param speaker.var The person variable to be stretched. 7 | #' @param sep The separator(s) to search for and break on. Default is: 8 | #' c("and", "&", ",") 9 | #' @param \ldots Ignored. 10 | #' @return Returns an expanded dataframe with person variable stretched and 11 | #' accompanying rows repeated. 12 | #' @export 13 | #' @examples 14 | #' \dontrun{ 15 | #' DATA$person <- as.character(DATA$person) 16 | #' DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam", 17 | #' "greg, sally", "sam and sally") 18 | #' 19 | #' split_speaker(DATA) 20 | #' 21 | #' DATA$person[c(1, 4, 6)] <- c("greg_sally_sam", 22 | #' "greg.sally", "sam; sally") 23 | #' 24 | #' split_speaker(DATA, sep = c(".", "_", ";")) 25 | #' 26 | #' DATA <- textshape::DATA #reset DATA 27 | #' } 28 | split_speaker <- function (dataframe, speaker.var = 1, sep = c("and", "&", ","), 29 | ...){ 30 | 31 | element_id <- NULL 32 | nms <- colnames(dataframe) 33 | speaker.var <- colnames(dataframe[,speaker.var, drop=FALSE]) 34 | z <- data.table::data.table(data.frame(dataframe, stringsAsFactors = FALSE)) 35 | 36 | z[, element_id := 1:.N] 37 | express1 <- parse(text= 38 | paste0( 39 | speaker.var, 40 | " := list(splittify(", 41 | speaker.var, 42 | ", c(", 43 | paste(paste0("\"", sep, "\""), collapse=", "), 44 | ")))" 45 | ) 46 | ) 47 | 48 | z[, eval(express1)] 49 | 50 | express2 <- parse( 51 | text=paste0(".(", speaker.var, "=unlist(", speaker.var, "))") 52 | ) 53 | 54 | z <- z[, 55 | eval(express2), 56 | by = c(colnames(z)[!colnames(z) %in% speaker.var]) 57 | ][, 58 | c(nms, "element_id"), with = FALSE] 59 | z[, 'split_id' := 1:.N, by = list(element_id)][] 60 | 61 | } 62 | 63 | 64 | splittify <- function(x, y) { 65 | 66 | y <- .mgsub(esc, paste0('\\', esc), y, perl = FALSE) 67 | 68 | lapply(x, function(z) { 69 | trimws( 70 | grep("^\\s*$", 71 | strsplit(as.character(z), paste(paste(y), collapse="|"))[[1]], 72 | value=TRUE, 73 | invert 
= TRUE 74 | ) 75 | ) 76 | }) 77 | } 78 | 79 | esc <- c(".", "|", "(", ")", "[", "{", "^", "$", "*", "+", "?") 80 | -------------------------------------------------------------------------------- /R/split_token.R: -------------------------------------------------------------------------------- 1 | #' Split Tokens 2 | #' 3 | #' Split tokens. 4 | #' 5 | #' @param x A \code{\link[base]{data.frame}} or character vector with tokens. 6 | #' @param text.var The name of the text variable. If \code{TRUE} 7 | #' \code{split_token} tries to detect the text column with tokens. 8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case. 9 | #' @param \ldots Ignored. 10 | #' @export 11 | #' @rdname split_token 12 | #' @importFrom data.table .N := 13 | #' @return Returns a list of vectors of tokens or an expanded 14 | #' \code{\link[data.table]{data.table}} with tokens split apart. 15 | #' @examples 16 | #' (x <- c( 17 | #' "Mr. Brown comes! He says hello. i give him coffee.", 18 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!", 19 | #' "go there" 20 | #' )) 21 | #' split_token(x) 22 | #' split_token(x, lower=FALSE) 23 | #' 24 | #' data(DATA) 25 | #' split_token(DATA) 26 | #' split_token(DATA, lower=FALSE) 27 | #' 28 | #' ## Larger data set 29 | #' split_token(hamlet) 30 | split_token <- function(x, ...) { 31 | UseMethod("split_token") 32 | } 33 | 34 | #' @export 35 | #' @rdname split_token 36 | #' @method split_token default 37 | split_token.default <- function(x, lower = TRUE, ...) 
{ 38 | 39 | if (lower) { 40 | x <- stringi::stri_trans_tolower(x) 41 | } 42 | x <- stringi::stri_replace_all_regex(x, "(^|\\s)(')(\\w.+?)(')(\\s|$)", "$1$2 $3 $4$5") 43 | x <- trimws(stringi::stri_replace_all_regex(x, "([^'[:^punct:]])", " $1 ")) 44 | stringi::stri_split_regex(x, "\\s+") 45 | } 46 | 47 | 48 | #' @export 49 | #' @rdname split_token 50 | #' @method split_token data.frame 51 | split_token.data.frame <- function(x, text.var = TRUE, lower = TRUE, ...) { 52 | 53 | element_id <- NULL 54 | nms <- colnames(x) 55 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 56 | 57 | text.var <- detect_text_column(x, text.var) 58 | 59 | z[, element_id := 1:.N] 60 | express1 <- parse( 61 | text=paste0( 62 | text.var, 63 | " := list(split_token.default(", 64 | text.var, 65 | ", lower = ", 66 | lower, 67 | "))" 68 | ) 69 | ) 70 | z[, eval(express1)] 71 | 72 | express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))")) 73 | z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][, 74 | c(nms, "element_id"), with = FALSE] 75 | z[, 'token_id' := 1:.N, by = list(element_id)][] 76 | 77 | } 78 | -------------------------------------------------------------------------------- /R/split_transcript.R: -------------------------------------------------------------------------------- 1 | #' Split a Transcript Style Vector on Delimiter & Coerce to Dataframe 2 | #' 3 | #' Split a transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}) 4 | #' into a name and dialogue vector that is coerced to a \code{\link[data.table]{data.table}}. 5 | #' Leading/trailing white space in the columns is stripped out. 6 | #' 7 | #' @param x A transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}). 8 | #' @param delim The delimiter to split on. 9 | #' @param colnames The column names to use for the \code{\link[data.table]{data.table}} 10 | #' output.
11 | #' @param max.delim An integer stating how many characters may come before a 12 | #' delimiter is found. This is useful for the case when a colon is the delimiter 13 | #' but time stamps are also found in the text. 14 | #' @param \ldots Ignored. 15 | #' @return Returns a 2 column \code{\link[data.table]{data.table}}. 16 | #' @export 17 | #' @examples 18 | #' split_transcript(c("greg: Who me", "sarah: yes you!")) 19 | #' 20 | #' \dontrun{ 21 | #' ## 2015 Vice-Presidential Debates Example 22 | #' if (!require("pacman")) install.packages("pacman") 23 | #' pacman::p_load(rvest, magrittr, xml2) 24 | #' 25 | #' debates <- c( 26 | #' wisconsin = "110908", 27 | #' boulder = "110906", 28 | #' california = "110756", 29 | #' ohio = "110489" 30 | #' ) 31 | #' 32 | #' lapply(debates, function(x){ 33 | #' xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>% 34 | #' rvest::html_nodes("p") %>% 35 | #' rvest::html_text() %>% 36 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>% 37 | #' textshape::combine() %>% 38 | #' textshape::split_transcript() %>% 39 | #' textshape::split_sentence() 40 | #' }) 41 | #' } 42 | split_transcript <- function(x, delim = ":", colnames = c("person", "dialogue"), 43 | max.delim = 15, ...){ 44 | 45 | V1 <- V2 <- NULL 46 | 47 | if (!is.null(max.delim)) { 48 | x <- gsub( 49 | paste0('(^[^',delim, ']{0,', max.delim, '})([', delim, '])'), 50 | "\\1textshapesplithere", 51 | x, 52 | perl = TRUE 53 | ) 54 | } else { 55 | x <- sub(delim, "textshapesplithere", x) 56 | } 57 | 58 | dat <- data.table::data.table( 59 | do.call(rbind, strsplit(x , "textshapesplithere")) 60 | )[, 61 | 'V1' := trimws(V1)][, 62 | 'V2' := trimws(V2)][] 63 | 64 | data.table::setnames(dat, c("V1", "V2"), colnames) 65 | dat 66 | } 67 | -------------------------------------------------------------------------------- /R/split_word.R: -------------------------------------------------------------------------------- 1 | #' Split Words 2 | #' 3 | #' Split 
words. 4 | #' 5 | #' @param x A \code{\link[base]{data.frame}} or character vector with words. 6 | #' @param text.var The name of the text variable. If \code{TRUE} 7 | #' \code{split_word} tries to detect the text column with words. 8 | #' @param lower logical. If \code{TRUE} the words are converted to lower case. 9 | #' @param \ldots Ignored. 10 | #' @export 11 | #' @rdname split_word 12 | #' @importFrom data.table .N := 13 | #' @return Returns a list of vectors of words or an expanded 14 | #' \code{\link[data.table]{data.table}} with words split apart. 15 | #' @examples 16 | #' (x <- c( 17 | #' "Mr. Brown comes! He says hello. i give him coffee.", 18 | #' "I'll go at 5 p. m. eastern time. Or somewhere in between!", 19 | #' "go there" 20 | #' )) 21 | #' split_word(x) 22 | #' split_word(x, lower=FALSE) 23 | #' 24 | #' data(DATA) 25 | #' split_word(DATA) 26 | #' split_word(DATA, lower=FALSE) 27 | #' 28 | #' ## Larger data set 29 | #' split_word(hamlet) 30 | split_word <- function(x, ...) { 31 | UseMethod("split_word") 32 | } 33 | 34 | #' @export 35 | #' @rdname split_word 36 | #' @method split_word default 37 | split_word.default <- function(x, lower = TRUE, ...) { 38 | if (lower) { 39 | x <- stringi::stri_trans_tolower(x) 40 | } 41 | stringi::stri_extract_all_words(x) 42 | } 43 | 44 | #' @export 45 | #' @rdname split_word 46 | #' @method split_word data.frame 47 | split_word.data.frame <- function(x, text.var = TRUE, lower = TRUE, ...) 
{ 48 | 49 | element_id <- NULL 50 | nms <- colnames(x) 51 | z <- data.table::data.table(data.frame(x, stringsAsFactors = FALSE)) 52 | 53 | text.var <- detect_text_column(x, text.var) 54 | 55 | z[, element_id := 1:.N] 56 | express1 <- parse( 57 | text=paste0( 58 | text.var, 59 | " := list(split_word.default(", 60 | text.var, 61 | ", lower = ", 62 | lower, 63 | "))" 64 | ) 65 | ) 66 | z[, eval(express1)] 67 | 68 | express2 <- parse(text=paste0(".(", text.var, "=unlist(", text.var, "))")) 69 | z <- z[, eval(express2), by = c(colnames(z)[!colnames(z) %in% text.var])][, 70 | c(nms, "element_id"), with = FALSE] 71 | z[, 'word_id' := 1:.N, by = list(element_id)][] 72 | 73 | } 74 | -------------------------------------------------------------------------------- /R/textshape-package.R: -------------------------------------------------------------------------------- 1 | #' Tools for Reshaping Text 2 | #' 3 | #' Tools that can be used to reshape and restructure text data. 4 | #' @docType package 5 | #' @name textshape 6 | #' @aliases textshape package-textshape 7 | NULL 8 | 9 | #' Fictitious Classroom Dialogue 10 | #' 11 | #' A fictitious dataset useful for small demonstrations. 12 | #' 13 | #' @details 14 | #' \itemize{ 15 | #' \item person. Speaker 16 | #' \item sex. Gender 17 | #' \item adult. Dummy coded adult (0-no; 1-yes) 18 | #' \item state. Statement (dialogue) 19 | #' \item code. Dialogue coding scheme 20 | #' } 21 | #' 22 | #' @docType data 23 | #' @keywords datasets 24 | #' @name DATA 25 | #' @usage data(DATA) 26 | #' @format A data frame with 11 rows and 5 variables 27 | NULL 28 | 29 | #' Hamlet (Complete & Split by Sentence) 30 | #' 31 | #' A dataset containing the complete dialogue of Hamlet with turns of talk split 32 | #' into sentences. 33 | #' 34 | #' @details 35 | #' \itemize{ 36 | #' \item act. The act (akin to repeated measures) 37 | #' \item tot. The turn of talk 38 | #' \item scene. The scene (nested within an act) 39 | #' \item location. 
Location of the scene 40 | #' \item person. Character in the play 41 | #' \item died. Logical coded death variable; if yes, the character dies in 42 | #' the play 43 | #' \item dialogue. The spoken dialogue 44 | #' } 45 | #' 46 | #' @docType data 47 | #' @keywords datasets 48 | #' @name hamlet 49 | #' @usage data(hamlet) 50 | #' @format A data frame with 2007 rows and 7 variables 51 | #' @references 52 | #' http://www.gutenberg.org 53 | NULL 54 | 55 | 56 | 57 | #' Simple \code{\link[tm]{DocumentTermMatrix}} 58 | #' 59 | #' A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}. 60 | #' 61 | #' @details 62 | #' \describe{ 63 | #' \item{i}{The document locations} 64 | #' \item{j}{The term locations} 65 | #' \item{v}{The count of terms for that particular element position} 66 | #' \item{nrow}{The number of rows} 67 | #' \item{ncol}{The number of columns} 68 | #' \item{dimnames}{document and terms} 69 | #' } 70 | #' 71 | #' @docType data 72 | #' @keywords datasets 73 | #' @name simple_dtm 74 | #' @usage data(simple_dtm) 75 | #' @format A list with 6 elements 76 | NULL 77 | 78 | 79 | #' Sentence Boundary Disambiguation Edge Cases 80 | #' 81 | #' A slightly filtered dataset containing Dias's sentence boundary 82 | #' disambiguation edge cases. This is a nested data set with the outcome 83 | #' column as a nested list of desired splits. The non-ASCII cases and spaced 84 | #' ellipsis examples have been removed. 85 | #' 86 | #' @details 87 | #' \itemize{ 88 | #' \item Rule. The name of the rule to test 89 | #' \item Text. The testing text 90 | #' \item Outcome. The desired outcome of the sentence disambiguation 91 | #' } 92 | #' 93 | #' @docType data 94 | #' @keywords datasets 95 | #' @name golden_rules 96 | #' @usage data(golden_rules) 97 | #' @format A data frame with 45 rows and 3 variables 98 | #' @references Dias, Kevin S. 2015. Golden Rules (English).
99 | #' Retrieved: https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt 100 | NULL 101 | -------------------------------------------------------------------------------- /R/tidy_colo_dtm.R: -------------------------------------------------------------------------------- 1 | #' Convert a 2 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 3 | #' into Collocating Words in Tidy Form 4 | #' 5 | #' Converts non-zero elements of a 6 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 7 | #' into a tidy data set made of collocating words. 8 | #' 9 | #' @param x A 10 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}. 11 | #' @param \ldots Ignored. 12 | #' @return Returns a tidied data.frame. 13 | #' @rdname tidy_colo_dtm 14 | #' @export 15 | #' @seealso \code{\link[textshape]{unique_pairs}} 16 | #' @examples 17 | #' data(simple_dtm) 18 | #' 19 | #' tidied <- tidy_colo_dtm(simple_dtm) 20 | #' tidied 21 | #' unique_pairs(tidied) 22 | #' 23 | #' \dontrun{ 24 | #' if (!require("pacman")) install.packages("pacman") 25 | #' pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon') 26 | #' pacman::p_load(tidyverse, magrittr, ggstance) 27 | #' 28 | #' my_dtm <- with( 29 | #' presidential_debates_2012, 30 | #' q_dtm(dialogue, paste(time, tot, sep = "_")) 31 | #' ) 32 | #' 33 | #' tidy_colo_dtm(my_dtm) %>% 34 | #' tbl_df() %>% 35 | #' filter(!term_1 %in% c('i', lexicon::sw_onix) & 36 | #' !term_2 %in% lexicon::sw_onix 37 | #' ) %>% 38 | #' filter(term_1 != term_2) %>% 39 | #' unique_pairs() %>% 40 | #' filter(n > 15) %>% 41 | #' complete(term_1, term_2, fill = list(n = 0)) %>% 42 | #' ggplot(aes(x = term_1, y = term_2, fill = n)) + 43 | #' geom_tile() + 44 | #' scale_fill_gradient(low= 'white', high = 'red') + 45 | #' theme(axis.text.x = element_text(angle = 45, hjust = 1)) 46 | #' } 47 | tidy_colo_tdm <- function(x, ...){ 48 | 49 | term_1 <- NULL 50 | 51 | x <- 
slam::as.simple_triplet_matrix( 52 | slam::tcrossprod_simple_triplet_matrix(x, y = NULL) 53 | ) 54 | 55 | data.table::data.table( 56 | term_1 = x[['dimnames']][['Terms']][x[['i']]], 57 | term_2 = x[['dimnames']][['Terms']][x[['j']]], 58 | n = x[['v']] 59 | )[order(term_1), ] 60 | } 61 | 62 | 63 | 64 | #' @rdname tidy_colo_dtm 65 | #' @export 66 | tidy_colo_dtm <- function(x, ...){ 67 | 68 | term_1 <- NULL 69 | 70 | x <- slam::as.simple_triplet_matrix( 71 | slam::crossprod_simple_triplet_matrix(x, y = NULL) 72 | ) 73 | 74 | data.table::data.table( 75 | term_1 = x[['dimnames']][['Terms']][x[['i']]], 76 | term_2 = x[['dimnames']][['Terms']][x[['j']]], 77 | n = x[['v']] 78 | )[order(term_1), ] 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /R/tidy_dtm.R: -------------------------------------------------------------------------------- 1 | #' Convert a 2 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 3 | #' into Tidy Form 4 | #' 5 | #' Converts non-zero elements of a 6 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 7 | #' into a tidy data set. 8 | #' 9 | #' 10 | #' @param x A 11 | #' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}. 12 | #' @param \ldots ignored. 13 | #' @return Returns a tidied data.frame. 
14 | #' @rdname tidy_dtm 15 | #' @include utils.R 16 | #' @export 17 | #' @examples 18 | #' data(simple_dtm) 19 | #' 20 | #' tidy_dtm(simple_dtm) 21 | #' 22 | #' \dontrun{ 23 | #' if (!require("pacman")) install.packages("pacman") 24 | #' pacman::p_load_current_gh('trinker/gofastr') 25 | #' pacman::p_load(tidyverse, magrittr, ggstance) 26 | #' 27 | #' my_dtm <- with( 28 | #' presidential_debates_2012, 29 | #' q_dtm(dialogue, paste(time, tot, sep = "_")) 30 | #' ) 31 | #' 32 | #' tidy_dtm(my_dtm) %>% 33 | #' tidyr::extract( 34 | #' col = doc, 35 | #' into = c("time", "turn", "sentence"), 36 | #' regex = "(\\d)_(\\d+)\\.(\\d+)" 37 | #' ) %>% 38 | #' mutate( 39 | #' time = as.numeric(time), 40 | #' turn = as.numeric(turn), 41 | #' sentence = as.numeric(sentence) 42 | #' ) %>% 43 | #' tbl_df() %T>% 44 | #' print() %>% 45 | #' group_by(time, term) %>% 46 | #' summarize(n = sum(n)) %>% 47 | #' group_by(time) %>% 48 | #' arrange(desc(n)) %>% 49 | #' slice(1:10) %>% 50 | #' ungroup() %>% 51 | #' mutate( 52 | #' term = factor(paste(term, time, sep = "__"), 53 | #' levels = rev(paste(term, time, sep = "__"))) 54 | #' ) %>% 55 | #' ggplot(aes(x = n, y = term)) + 56 | #' geom_barh(stat='identity') + 57 | #' facet_wrap(~time, ncol=2, scales = 'free_y') + 58 | #' scale_y_discrete(labels = function(x) gsub("__.+$", "", x)) 59 | #' } 60 | tidy_dtm <- function(x, ...){ 61 | 62 | doc <- NULL 63 | 64 | docfun <- function(docs) { 65 | if (is_numeric_doc_names(x)) {as.integer(docs)} else {docs} 66 | } 67 | 68 | data.table::data.table( 69 | doc = x[['dimnames']][['Docs']][x[['i']]], 70 | term = x[['dimnames']][['Terms']][x[['j']]], 71 | n = x[['v']], 72 | i = x[['i']], 73 | j = x[['j']] 74 | )[, doc := docfun(doc)][order(doc), ][] 75 | } 76 | 77 | 78 | 79 | #' @rdname tidy_dtm 80 | #' @export 81 | tidy_tdm <- function(x, ...){ 82 | 83 | doc <- NULL 84 | 85 | docfun <- function(docs) { 86 | if (is_numeric_doc_names(x)) {as.integer(docs)} else {docs} 87 | } 88 | 89 | 
data.table::data.table( 90 | doc = x[['dimnames']][['Docs']][x[['j']]], 91 | term = x[['dimnames']][['Terms']][x[['i']]], 92 | n = x[['v']], 93 | i = x[['j']], 94 | j = x[['i']] 95 | )[, doc := docfun(doc)][order(doc), ][] 96 | } 97 | -------------------------------------------------------------------------------- /R/tidy_list.R: -------------------------------------------------------------------------------- 1 | #' Tidy a List of Named Dataframes or Named Vectors or Vectors 2 | #' 3 | #' \code{\link[base]{rbind}} a named \code{\link[base]{list}} of 4 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}s to 5 | #' output a single \code{\link[base]{data.frame}} with the 6 | #' \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id} 7 | #' column. 8 | #' 9 | #' @param x A named \code{\link[base]{list}} of 10 | #' \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}. 11 | #' @param id.name The name to use for the column created from the 12 | #' \code{\link[base]{list}}. 13 | #' @param content.name The name to use for the column created from the 14 | #' \code{\link[base]{list}} of \code{\link[base]{vector}}s (only used if 15 | #' \code{x} is \code{\link[base]{vector}}). 16 | #' @param content.attribute.name The name to use for the column created from the 17 | #' \code{\link[base]{list}} of names given to the \code{\link[base]{vector}}s 18 | #' (only used if \code{x} is named \code{\link[base]{vector}}). 19 | #' @param \ldots Ignored. 20 | #' @return Returns a \code{\link[data.table]{data.table}} with the 21 | #' \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id} 22 | #' column. 
23 | #' @export 24 | #' @examples 25 | #' tidy_list(list(p=1:500, r=letters)) 26 | #' tidy_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars)) 27 | #' 28 | #' x <- list( 29 | #' a = setNames(c(1:4), LETTERS[1:4]), 30 | #' b = setNames(c(7:9), LETTERS[7:9]), 31 | #' c = setNames(c(10:15), LETTERS[10:15]), 32 | #' d = c(x=4, y=6, 4), 33 | #' e = setNames(1:10, sample(state.abb, 10, TRUE)), 34 | #' f = setNames(1:10, sample(month.abb, 10, TRUE)) 35 | #' ) 36 | #' 37 | #' tidy_list(x) 38 | #' 39 | #' \dontrun{ 40 | #' ## 2015 Vice-Presidential Debates Example 41 | #' if (!require("pacman")) install.packages("pacman") 42 | #' pacman::p_load(rvest, magrittr, xml2) 43 | #' 44 | #' debates <- c( 45 | #' wisconsin = "110908", 46 | #' boulder = "110906", 47 | #' california = "110756", 48 | #' ohio = "110489" 49 | #' ) 50 | #' 51 | #' lapply(debates, function(x){ 52 | #' paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x) %>% 53 | #' xml2::read_html() %>% 54 | #' rvest::html_nodes("p") %>% 55 | #' rvest::html_text() %>% 56 | #' textshape::split_index(grep("^[A-Z]+:", .)) %>% 57 | #' textshape::combine() %>% 58 | #' textshape::split_transcript() %>% 59 | #' textshape::split_sentence() 60 | #' }) %>% 61 | #' textshape::tidy_list("location") 62 | #' } 63 | tidy_list <- function(x, id.name= "id", content.name = "content", 64 | content.attribute.name = 'attribute', ...){ 65 | 66 | if (is.data.frame(x[[1]])){ 67 | tidy_list_df(x = x, id.name = id.name) 68 | } else { 69 | 70 | if (is.atomic(x[[1]])){ 71 | 72 | tidy_list_vector(x = x, id.name = id.name, 73 | content.name = content.name, 74 | content.attribute.name = content.attribute.name 75 | ) 76 | 77 | } else { 78 | stop("`x` must be a list of `data.frame`s or atomic `vector`s") 79 | } 80 | } 81 | } 82 | 83 | 84 | 85 | tidy_list_df <- function (x, id.name = "id"){ 86 | if (is.null(names(x))) { 87 | names(x) <- seq_along(x) 88 | } 89 | list.names <- rep(names(x), sapply2(x, nrow)) 90 | x <- lapply(x, 
data.table::as.data.table) 91 | x[['fill']] <- TRUE 92 | out <- data.frame(list.names, do.call(rbind, x), 93 | row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE) 94 | colnames(out)[1] <- id.name 95 | data.table::data.table(out) 96 | } 97 | 98 | tidy_list_vector <- function(x, id.name = "id", 99 | content.attribute.name = 'attribute', content.name = "content"){ 100 | 101 | if (is.null(names(x))) { 102 | names(x) <- seq_along(x) 103 | } 104 | 105 | if (all(!sapply2(x, function(y) is.null(names(y))))){ 106 | 107 | tidy_list( 108 | lapply(x, tidy_vector, content.attribute.name , content.name ), 109 | id.name 110 | ) 111 | 112 | } else { 113 | 114 | dat <- data.frame( 115 | rep(names(x), sapply2(x, length)), 116 | unlist(x, use.names = FALSE), 117 | stringsAsFactors = FALSE, 118 | check.names = FALSE, 119 | row.names = NULL 120 | ) 121 | colnames(dat) <- c(id.name, content.name) 122 | data.table::data.table(dat) 123 | 124 | } 125 | } 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /R/tidy_matrix.R: -------------------------------------------------------------------------------- 1 | #' Convert a Matrix into Tidy Form 2 | #' 3 | #' \code{tidy_matrix} - Converts matrices into a tidy data set. Essentially, a 4 | #' stacking of the matrix columns and repeating row/column names as necessary. 5 | #' 6 | #' @param x A matrix. 7 | #' @param row.name A string to use for the row names that are now a column. 8 | #' @param col.name A string to use for the column names that are now a column. 9 | #' @param value.name A string to use for the values that are now a column. 10 | #' @param \ldots ignored. 11 | #' @return Returns a tidied \code{data.frame}. 
12 | #' @export 13 | #' @rdname tidy_matrix 14 | #' @examples 15 | #' mat <- matrix(1:16, nrow = 4, 16 | #' dimnames = list(LETTERS[1:4], LETTERS[23:26]) 17 | #' ) 18 | #' 19 | #' mat 20 | #' tidy_matrix(mat) 21 | #' 22 | #' 23 | #' data(simple_dtm) 24 | #' tidy_matrix(as.matrix(simple_dtm), 'doc', 'term', 'n') 25 | #' 26 | #' X <- as.matrix(simple_dtm[1:10, 1:10]) 27 | #' tidy_adjacency_matrix(crossprod(X)) 28 | #' tidy_adjacency_matrix(crossprod(t(X))) 29 | tidy_matrix <- function(x, row.name = 'row', col.name = 'col', 30 | value.name = 'value', ...){ 31 | 32 | stopifnot(is.matrix(x)) 33 | 34 | out <- data.table::data.table(x = rep(rownames(x), ncol(x)), 35 | y = rep(colnames(x), each = nrow(x)), 36 | z = c(x)) 37 | data.table::setnames(out, c(row.name, col.name, value.name)) 38 | 39 | out 40 | 41 | } 42 | 43 | #' Convert a Matrix into Tidy Form 44 | #' 45 | #' \code{tidy_adjacency_matrix} - A wrapper for \code{tidy_matrix} with the 46 | #' \code{row.name}, \code{col.name}, & \code{value.name} all set to 47 | #' \code{"from"},\code{"to"}, & \code{"n"}, assuming preparation for network 48 | #' analysis. 49 | #' 50 | #' @export 51 | #' @rdname tidy_matrix 52 | tidy_adjacency_matrix <- function(x, ...){ 53 | 54 | tidy_matrix(x, row.name = 'from', col.name = 'to', value.name = 'n', ...) 55 | 56 | } 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /R/tidy_table.R: -------------------------------------------------------------------------------- 1 | #' Tidy a Table: Bind Its Values with Its Names 2 | #' 3 | #' \code{\link[base]{cbind}} a \code{\link[base]{table}}'s values with its 4 | #' \code{\link[base]{names}} to form \code{id} (from the names) and 5 | #' \code{content} columns. 6 | #' 7 | #' @param x A \code{\link[base]{table}}. 8 | #' @param id.name The name to use for the column created from the \code{\link[base]{table}} 9 | #' \code{\link[base]{names}}. 
10 | #' @param content.name The name to use for the column created from the \code{\link[base]{table}} 11 | #' values. 12 | #' @param \ldots ignored. 13 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 14 | #' from the \code{\link[base]{table}} as an \code{id} column. 15 | #' @export 16 | #' @examples 17 | #' x <- table(sample(LETTERS[1:6], 1000, TRUE)) 18 | #' tidy_table(x) 19 | tidy_table <- function(x, id.name= "id", content.name = "content", ...){ 20 | 21 | stopifnot(is.table(x)) 22 | out <- data.table::data.table(x = names(x), y = unname(c(x))) 23 | data.table::setnames(out, c(id.name, content.name)) 24 | out 25 | 26 | } 27 | 28 | -------------------------------------------------------------------------------- /R/tidy_vector.R: -------------------------------------------------------------------------------- 1 | #' Tidy a Named Atomic Vector: Bind Its Values with Its Names 2 | #' 3 | #' \code{\link[base]{cbind}} a named atomic \code{\link[base]{vector}}'s values 4 | #' with its \code{\link[base]{names}} to form \code{id} (from the names) and 5 | #' \code{content} columns. 6 | #' 7 | #' @param x A named atomic \code{\link[base]{vector}}. 8 | #' @param id.name The name to use for the column created from the \code{\link[base]{vector}} 9 | #' \code{\link[base]{names}}. 10 | #' @param content.name The name to use for the column created from the \code{\link[base]{vector}} 11 | #' values. 12 | #' @param \ldots ignored. 13 | #' @return Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 14 | #' from the \code{\link[base]{vector}} as an \code{id} column. 
15 | #' @export 16 | #' @examples 17 | #' x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE)) 18 | #' tidy_vector(x) 19 | tidy_vector <- function(x, id.name = "id", content.name = "content", ...){ 20 | 21 | stopifnot(is.atomic(x)) 22 | if (is.null(names(x))) { 23 | out <- data.table::as.data.table(x) 24 | data.table::setnames(out, id.name) 25 | } else { 26 | out <- data.table::data.table(x = names(x), y = unname(x)) 27 | data.table::setnames(out, c(id.name, content.name)) 28 | } 29 | out 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /R/unique_pairs.R: -------------------------------------------------------------------------------- 1 | #' Extract Only Unique Pairs of Collocating Words in 2 | #' \code{\link[textshape]{tidy_colo_dtm}} 3 | #' 4 | #' \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate 5 | #' the tidied data.frame. This means that the upper and lower triangles are 6 | #' used redundantly. This function eliminates this redundancy by dropping one 7 | #' set of the pairs from a tidied data.frame. 8 | #' 9 | #' @param x A \code{\link[base]{data.frame}} with two columns that contain 10 | #' redundant pairs. 11 | #' @param col1 A string naming column 1. 12 | #' @param col2 A string naming column 2. 13 | #' @param \ldots ignored. 14 | #' @return Returns a filtered \code{\link[base]{data.frame}}. 15 | #' @export 16 | #' @seealso \code{\link[textshape]{tidy_colo_dtm}} 17 | #' @examples 18 | #' dat <- data.frame( 19 | #' term_1 = LETTERS[1:10], 20 | #' term_2 = LETTERS[10:1], 21 | #' stringsAsFactors = FALSE 22 | #' ) 23 | #' 24 | #' unique_pairs(dat) 25 | unique_pairs <- function(x, col1 = 'term_1', col2 = 'term_2', ...) { 26 | 27 | UseMethod('unique_pairs') 28 | } 29 | 30 | #' @export 31 | #' @rdname unique_pairs 32 | #' @method unique_pairs default 33 | unique_pairs.default <- function(x, col1 = 'term_1', col2 = 'term_2', ...)
{ 34 | 35 | x[!duplicated(apply( 36 | data.table::data.table(x[, c(col1, col2)]), 37 | 1, 38 | sorter 39 | )),] 40 | } 41 | 42 | #' @export 43 | #' @rdname unique_pairs 44 | #' @method unique_pairs data.table 45 | unique_pairs.data.table <- function(x, col1 = 'term_1', col2 = 'term_2', ...) { 46 | 47 | x[!duplicated(apply( 48 | data.table::data.table(x[, c(col1, col2), with = FALSE]), 49 | 1, 50 | sorter 51 | )),] 52 | } 53 | 54 | sorter <- function(x) paste(sort(x), collapse = "") 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /R/unnest_text.R: -------------------------------------------------------------------------------- 1 | #' Un-nest Nested Text Columns 2 | #' 3 | #' Un-nest nested text columns in a data.frame. Attempts to locate the nested 4 | #' text column without specifying. 5 | #' 6 | #' @param dataframe A dataframe object. 7 | #' @param column Column name to search for markers/terms. 8 | #' @param integer.rownames logical. If \code{TRUE} then the rownames are 9 | #' numbered 1 through number of rows, otherwise the original row number is 10 | #' retained followed by a period and the element number from the list. 11 | #' @param \ldots ignored. 12 | #' @return Returns an un-nested data.frame. 
13 | #' @export 14 | #' @examples 15 | #' dat <- DATA 16 | #' 17 | #' ## Add a nested/list text column 18 | #' dat$split <- lapply(dat$state, function(x) { 19 | #' unlist(strsplit(x, '(?<=[?!.])\\s+', perl = TRUE)) 20 | #' }) 21 | #' 22 | #' unnest_text(dat) 23 | #' unnest_text(dat, integer.rownames = FALSE) 24 | #' 25 | #' ## Add a second nested integer column 26 | #' dat$d <- lapply(dat$split, nchar) 27 | #' \dontrun{ 28 | #' unnest_text(dat) # causes error, must supply column explicitly 29 | #' } 30 | #' unnest_text(dat, 'split') 31 | #' 32 | #' ## As a data.table 33 | #' library(data.table) 34 | #' dt_dat <- data.table::as.data.table(data.table::copy(dat)) 35 | #' unnest_text(dt_dat, 'split') 36 | #' \dontrun{ 37 | #' unnest_text(dt_dat, 'd') 38 | #' } 39 | #' 40 | #' \dontrun{ 41 | #' ## As a tibble 42 | #' library(tibble) 43 | #' t_dat <- tibble::as_tibble(dat) 44 | #' unnest_text(t_dat, 'split') 45 | #' } 46 | unnest_text <- function(dataframe, column, integer.rownames = TRUE, ...){ 47 | 48 | if (missing(column)) { 49 | 50 | column <- names(dataframe)[!unlist(lapply(as.data.frame(dataframe), 51 | is.atomic))] 52 | if (length(column) == 0) { 53 | stop(paste( 54 | "There appear to be no nested columns.", 55 | "Please supply `column` explicitly." 56 | ), call. = FALSE) 57 | } 58 | 59 | if (length(column) > 1) { 60 | stop(paste( 61 | "There appear to be multiple nested columns.", 62 | "Please supply `column` explicitly." 63 | ), call. = FALSE) 64 | } 65 | message(sprintf('Nested column detected, un-nesting: %s', column)) 66 | 67 | } 68 | 69 | nms <- colnames(dataframe) 70 | 71 | lens <- lengths(dataframe[[column]]) 72 | col <- unlist(dataframe[[column]]) 73 | 74 | if (!is.character(col)) { 75 | warning( 76 | sprintf( 77 | paste0( 78 | 'Un-nesting: `%s`\nThis is not a character column.\n\n', 79 | 'Perhaps you want to use `tidyr::unnest` instead?' 80 | ), 81 | column 82 | ), 83 | call.
= FALSE 84 | ) 85 | } 86 | 87 | dataframe[[column]] <- NA 88 | 89 | dataframe <- dataframe[rep(seq_len(nrow(dataframe)), lens),] 90 | 91 | dataframe[[column]] <- col 92 | if (isTRUE(integer.rownames)) { 93 | rownames(dataframe) <- NULL 94 | } else { 95 | rnms <- rownames(dataframe) 96 | rnms <- ifelse(grepl('\\.', rnms), rnms, paste0(rnms, '.0')) 97 | 98 | rownames(dataframe) <- paste0( 99 | gsub('\\.+$', '', rnms), 100 | '.', 101 | as.integer(gsub('^\\d+\\.', '', rnms)) + 1 102 | ) 103 | } 104 | 105 | dataframe 106 | 107 | } 108 | 109 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | .mgsub <- function (pattern, replacement, text.var, fixed = TRUE, 2 | order.pattern = fixed, perl = !fixed, ...) { 3 | 4 | if (fixed && order.pattern) { 5 | ord <- rev(order(nchar(pattern))) 6 | pattern <- pattern[ord] 7 | if (length(replacement) != 1) replacement <- replacement[ord] 8 | } 9 | 10 | if (length(replacement) == 1) { 11 | replacement <- rep(replacement, length(pattern)) 12 | } 13 | 14 | for (i in seq_along(pattern)){ 15 | text.var <- gsub(pattern[i], replacement[i], text.var, fixed = fixed, 16 | perl = perl, ...) 
17 | } 18 | 19 | text.var 20 | } 21 | 22 | 23 | nth <- function(x, ...){ 24 | if (is.null(x)) return(1) 25 | if (is.numeric(x)) x <- as.integer(x) 26 | if (is.integer(x) && x < 1) stop('`from.n` and `to.n` must be >= 1') 27 | if (is.integer(x)) return(x) 28 | if (is.character(x)) { 29 | switch(x, 30 | first = 1, 31 | last =, 32 | n = Inf, 33 | stop('If supplying a string to `from.n` or `to.n`, it must be one of: "first", "last", or "n"') 34 | ) 35 | } 36 | } 37 | 38 | mark_start <- function(class){ 39 | sprintf("<mark class=\"%s\">", class) 40 | } 41 | 42 | mark_end <- "</mark>" 43 | 44 | 45 | 46 | 47 | 48 | add_row_id <- function(x){ 49 | lens <- lapply(x, length) 50 | rep(seq_along(lens), unlist(lens)) 51 | } 52 | 53 | title_tag_table <- function (mat) { 54 | unlist(lapply(seq_len(nrow(mat)), function(i) { 55 | x <- unlist(mat[i, , drop = FALSE]) 56 | names(x[x > 0][1]) 57 | })) 58 | } 59 | 60 | 61 | 62 | paste2 <- function (multi.columns, sep = ".", handle.na = TRUE, trim = TRUE) { 63 | if (is.matrix(multi.columns)) { 64 | multi.columns <- data.frame(multi.columns, stringsAsFactors = FALSE) 65 | } 66 | if (trim) 67 | multi.columns <- lapply(multi.columns, function(x) { 68 | gsub("^\\s+|\\s+$", "", x) 69 | }) 70 | if (!is.data.frame(multi.columns) && is.list(multi.columns)) { 71 | multi.columns <- do.call("cbind", multi.columns) 72 | } 73 | if (handle.na) { 74 | m <- apply(multi.columns, 1, function(x) { 75 | if (any(is.na(x))) { 76 | NA 77 | } else { 78 | paste(x, collapse = sep) 79 | } 80 | }) 81 | } else { 82 | m <- apply(multi.columns, 1, paste, collapse = sep) 83 | } 84 | names(m) <- NULL 85 | return(m) 86 | } 87 | 88 | is.Integer <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol 89 | 90 | pad <- function (x, padding = max(nchar(as.character(x))), sort = TRUE, 91 | type = "detect") { 92 | 93 | poss <- c("detect", "numeric", "character", "d", "s") 94 | 95 | if (!type %in% poss){ 96 | stop( 97 | paste( 98 | "type must be: \"detect\",", 99 | "\"numeric\"\\\"d\" or \"character\"\\\"s\"" 100 | ), 101 | call. = FALSE 102 | ) 103 | } 104 | 105 | Rel <- c(NA, "d", "s", "d", "s") 106 | type <- Rel[poss %in% type] 107 | if (is.na(type)) { 108 | type <- ifelse(is.numeric(x), "d", "s") 109 | } 110 | x <- sprintf_ish(x, padding, type) 111 | if (sort) { 112 | x <- sort(x) 113 | } 114 | x 115 | } 116 | 117 | sprintf_ish <- function(x, padding, type){ 118 | OS <- Sys.info()[['sysname']] 119 | 120 | if (OS %in% c("Windows", "Linux")) { 121 | sprintf(paste0("%0", padding, type), x) 122 | } else { 123 | type <- ifelse(type == "s", " ", "0") 124 | pads <- sapply2(padding - nchar(x), function(i) { 125 | if (i == 0) return("") 126 | paste(rep(type, i), collapse = "") 127 | }) 128 | paste0(pads, x) 129 | 130 | } 131 | } 132 | 133 | 134 | is_numeric_doc_names <- function(x, ...){ 135 | UseMethod('is_numeric_doc_names') 136 | } 137 | 138 | 139 | is_numeric_doc_names.TermDocumentMatrix <- function(x, ...){ 140 | colnames_numeric <- suppressWarnings(as.integer(colnames(x))) 141 | !anyNA(colnames_numeric) && 142 | isTRUE(all.equal(stats::sd(diff(colnames_numeric)), 0)) 143 | } 144 | 145 | 146 | is_numeric_doc_names.DocumentTermMatrix <- function(x, ...){ 147 | rownames_numeric <- suppressWarnings(as.integer(rownames(x))) 148 | !anyNA(rownames_numeric) && 149 | isTRUE(all.equal(stats::sd(diff(rownames_numeric)), 0)) 150 | } 151 | 152 | ## function to detect text columns 153 | detect_text_column <- function(dat, text.var){ 154 | 155 | if (isTRUE(text.var)) { 156 | 157 | dat <- as.data.frame(dat, stringsAsFactors = FALSE) 158 | 159 | mean_lens <- unlist(lapply(dat, function(y) { 160 | 161 | if (!is.character(y) && !is.factor(y)) return(0) 162 | mean(nchar(as.character(y)), na.rm = TRUE) 163 | 164 | })) 165 | 166 | max_cols <- which.max(mean_lens) 167 | 168 | text.var <- colnames(dat)[max_cols[1]] 169 | 170 | if (length(text.var) == 0 || sum(as.integer(mean_lens)) == 0) { 171 | stop( 172 | paste( 173 | "Could not detect `text.var`.", 174 | "Please supply
`text.var` explicitly." 175 | ), 176 | call. = FALSE 177 | ) 178 | } 179 | 180 | if (length(max_cols) > 1) { 181 | warning( 182 | sprintf( 183 | 'More than one text column detected...using `%s`', 184 | text.var 185 | ), 186 | call. = FALSE 187 | ) 188 | } 189 | } 190 | 191 | text.var 192 | 193 | } 194 | 195 | 196 | sapply2 <- function (X, FUN, ...) { 197 | unlist(lapply(X, FUN, ...)) 198 | } 199 | 200 | 201 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "textshape" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | md_document: 6 | toc: true 7 | --- 8 | 9 | ```{r, echo=FALSE, message=FALSE, warning=FALSE} 10 | library(knitr) 11 | knitr::opts_chunk$set(fig.path = "tools/figure/") 12 | combine <- textshape::combine 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | library(ggstance) 17 | library(textshape) 18 | library(gridExtra) 19 | library(viridis) 20 | library(quanteda) 21 | library(gofastr) 22 | 23 | ## desc <- suppressWarnings(readLines("DESCRIPTION")) 24 | ## regex <- "(^Version:\\s+)(\\d+\\.\\d+\\.\\d+)" 25 | ## loc <- grep(regex, desc) 26 | ## ver <- gsub(regex, "\\2", desc[loc]) 27 | ## verbadge <- sprintf('Version', ver, ver) 28 | ## verbadge <- '' 29 | ``` 30 | 31 | 32 | ```{r, echo=FALSE} 33 | knit_hooks$set(htmlcap = function(before, options, envir) { 34 | if(!before) { 35 | paste('<p class="caption">',options$htmlcap,"</p>",sep="") 36 | } 37 | }) 38 | knitr::opts_knit$set(self.contained = TRUE, cache = FALSE) 39 | ``` 40 | 41 | [![Project Status: Inactive – The project has reached a stable, usable state but is no longer being actively developed; support/maintenance will be provided as time allows.](http://www.repostatus.org/badges/latest/inactive.svg)](https://www.repostatus.org/) 42 | [![](http://cranlogs.r-pkg.org/badges/textshape)](https://cran.r-project.org/package=textshape) 43 | 44 | 45 | ![](tools/textshape_logo/r_textshape.png) 46 | 47 | **textshape** is a small suite of text reshaping and restructuring functions. Many of these functions are descended from tools in the [**qdapTools**](https://github.com/trinker/qdapTools) package. This brings the reshaping tools under one roof, with the package's functionality focused specifically on text reshaping. 48 | 49 | Other R packages provide some of the same functionality. **textshape** differs from these packages in that it is designed to help the user take unstructured (or implicitly structured) data, extract it into a structured format, and then restructure it into common text analysis formats for use in the next stage of the text analysis pipeline. The implicit structure of seemingly unstructured data is often detectable/expressible by the researcher. **textshape** provides tools (e.g., `split_match`) to enable the researcher to convert this tacit knowledge into a form that can be used to reformat data into more structured formats. This package is meant to be used jointly with the [**textclean**](https://github.com/trinker/textclean) package, which provides cleaning and text normalization functionality. 50 | 51 | # Functions 52 | 53 | Most of the functions split, expand, grab, or tidy a `vector`, `list`, `data.frame`, or `DocumentTermMatrix`. The `combine`, `duration`, `mtabulate`, & `flatten` functions are notable exceptions.
The table below describes the functions and their use: 54 | 55 | | Function | Used On | Description | 56 | |------------------|--------------------------------|--------------------------------------------------------------| 57 | | `combine` | `vector`, `list`, `data.frame` | Combine and collapse elements | 58 | | `tidy_list` | `list` of `vector`s or `data.frame`s | Row bind a list and repeat list names as id column | 59 | | `tidy_vector` | `vector` | Column bind a named atomic `vector`'s names and values | 60 | | `tidy_table` | `table` | Column bind a `table`'s names and values | 61 | | `tidy_matrix` | `matrix` | Stack values, repeat column row names accordingly | 62 | | `tidy_dtm`/`tidy_tdm` | `DocumentTermMatrix` | Tidy format `DocumentTermMatrix`/`TermDocumentMatrix` | 63 | | `tidy_colo_dtm`/`tidy_colo_tdm` | `DocumentTermMatrix` | Tidy format of collocating words from a `DocumentTermMatrix`/`TermDocumentMatrix` | 64 | | `duration` | `vector`, `data.frame` | Get duration (start-end times) for turns of talk in n words | 65 | | `from_to` | `vector`, `data.frame` | Prepare speaker data for a flow network | 66 | | `mtabulate` | `vector`, `list`, `data.frame` | Dataframe/list version of `tabulate` to produce count matrix | 67 | | `flatten` | `list` | Flatten nested, named list to single tier | 68 | | `unnest_text` | `data.frame` | Unnest a nested text column | 69 | | `split_index` | `vector`, `list`, `data.frame` | Split at specified indices | 70 | | `split_match` | `vector` | Split vector at specified character/regex match | 71 | | `split_portion` | `vector`\* | Split data into portioned chunks | 72 | | `split_run` | `vector`, `data.frame` | Split runs (e.g., "aaabbbbcdddd") | 73 | | `split_sentence` | `vector`, `data.frame` | Split sentences | 74 | | `split_speaker` | `data.frame` | Split combined speakers (e.g., "Josh, Jake, Jim") | 75 | | `split_token` | `vector`, `data.frame` | Split words and punctuation | 76 | | `split_transcript` | `vector` | Split speaker 
and dialogue (e.g., "greg: Who me") | 77 | | `split_word` | `vector`, `data.frame` | Split words | 78 | | `grab_index` | `vector`, `data.frame`, `list` | Grab from an index up to a second index | 79 | | `grab_match` | `vector`, `data.frame`, `list` | Grab from a regex match up to a second regex match | 80 | | `column_to_rownames` | `data.frame` | Add a column as rownames | 81 | | `cluster_matrix` | `matrix` | Reorder column/rows of a matrix via hierarchical clustering | 82 | 83 | 84 | \****Note:*** *Text vector accompanied by aggregating `grouping.var` argument, which can be in the form of a `vector`, `list`, or `data.frame`* 85 | 86 | 87 | 88 | # Installation 89 | 90 | To download the development version of **textshape**: 91 | 92 | Download the [zip ball](https://github.com/trinker/textshape/zipball/master) or [tar ball](https://github.com/trinker/textshape/tarball/master), decompress and run `R CMD INSTALL` on it, or use the **pacman** package to install the development version: 93 | 94 | ```r 95 | if (!require("pacman")) install.packages("pacman") 96 | pacman::p_load_gh("trinker/textshape") 97 | ``` 98 | 99 | # Contact 100 | 101 | You are welcome to: 102 | 103 | * submit suggestions and bug-reports at: 104 | 105 | 106 | # Contributing 107 | 108 | Contributions are welcome from anyone subject to the following rules: 109 | 110 | - Abide by the [code of conduct](https://github.com/trinker/textshape/blob/master/CODE_OF_CONDUCT.md). 111 | - Follow the style conventions of the package (indentation, function & argument naming, commenting, etc.) 112 | - All contributions must be consistent with the package license (GPL-2) 113 | - Submit contributions as a pull request. Clearly state what the changes are and try to keep the number of changes per pull request as low as possible. 114 | - If you make big changes, add your name to the 'Author' field. 
115 | 116 | # Examples 117 | 118 | The main shaping functions can be broken into the categories of (a) binding, (b) combining, (c) tabulating, (d) spanning, (e) splitting, (f) grabbing, & (g) tidying. The majority of functions in **textshape** fall into the categories of splitting and expanding (the semantic opposite of combining). These sections provide example uses of the functions from **textshape** within these categories. 119 | 120 | # Loading Dependencies 121 | 122 | ```r 123 | if (!require("pacman")) install.packages("pacman") 124 | pacman::p_load(tidyverse, magrittr, ggstance, viridis, gridExtra, quanteda) 125 | pacman::p_load_current_gh('trinker/gofastr', 'trinker/textshape') 126 | ``` 127 | 128 | ## Tidying 129 | 130 | The `tidy_xxx` functions convert untidy structures into [tidy format](https://www.jstatsoft.org/article/view/v059i10). Tidy formatted text data structures are particularly useful for interfacing with **ggplot2**, which expects this form. 131 | 132 | The `tidy_list` function is used in the style of `do.call(rbind, list(x1, x2))` as a convenient way to bind together multiple named `data.frame`s or `vector`s into a single `data.frame` with the `list` `names` acting as an id column. The `data.frame` bind is particularly useful for binding transcripts from different observations. Additionally, `tidy_vector` and `tidy_table` are provided for `cbind`ing a `table`'s or named atomic `vector`'s values and names as separate columns in a `data.frame`. Lastly, `tidy_dtm`/`tidy_tdm` provide convenient ways to tidy a `DocumentTermMatrix` or `TermDocumentMatrix`.
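Before the structure-specific examples, a minimal sketch (not taken from the package examples) contrasts `tidy_list` with a plain `do.call(rbind, ...)` on a small named list of `data.frame`s:

```r
library(textshape)

x <- list(a = data.frame(n = 1:2), b = data.frame(n = 3:4))

## Base R: the list names survive only as rownames ("a.1", "a.2", ...)
do.call(rbind, x)

## textshape: the list names become an explicit `id` column,
## ready for grouping or plotting
tidy_list(x)
```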
133 | 134 | #### A Vector 135 | 136 | ```{r} 137 | x <- list(p=1:500, r=letters) 138 | tidy_list(x) 139 | ``` 140 | 141 | #### A Dataframe 142 | 143 | ```{r} 144 | x <- list(p=mtcars, r=mtcars, z=mtcars, d=mtcars) 145 | tidy_list(x) 146 | ``` 147 | 148 | #### A Named Vector 149 | 150 | ```{r} 151 | x <- setNames( 152 | sample(LETTERS[1:6], 1000, TRUE), 153 | sample(state.name[1:5], 1000, TRUE) 154 | ) 155 | tidy_vector(x) 156 | ``` 157 | 158 | #### A Table 159 | 160 | ```{r} 161 | x <- table(sample(LETTERS[1:6], 1000, TRUE)) 162 | tidy_table(x) 163 | ``` 164 | 165 | #### A Matrix 166 | 167 | ```{r} 168 | mat <- matrix(1:16, nrow = 4, 169 | dimnames = list(LETTERS[1:4], LETTERS[23:26]) 170 | ) 171 | 172 | mat 173 | 174 | tidy_matrix(mat) 175 | ``` 176 | 177 | The same heatmap can be made with clustering (column and row reordering) via the `cluster_matrix` function. 178 | 179 | ```{r} 180 | ## plot heatmap w/o clustering 181 | wo <- mtcars %>% 182 | cor() %>% 183 | tidy_matrix('car', 'var') %>% 184 | ggplot(aes(var, car, fill = value)) + 185 | geom_tile() + 186 | scale_fill_viridis(name = expression(r[xy])) + 187 | theme( 188 | axis.text.y = element_text(size = 8), 189 | axis.text.x = element_text(size = 8, hjust = 1, vjust = 1, angle = 45), 190 | legend.position = 'bottom', 191 | legend.key.height = grid::unit(.1, 'cm'), 192 | legend.key.width = grid::unit(.5, 'cm') 193 | ) + 194 | labs(subtitle = "Without Clustering") 195 | 196 | ## plot heatmap w clustering 197 | w <- mtcars %>% 198 | cor() %>% 199 | cluster_matrix() %>% 200 | tidy_matrix('car', 'var') %>% 201 | mutate( 202 | var = factor(var, levels = unique(var)), 203 | car = factor(car, levels = unique(car)) 204 | ) %>% 205 | group_by(var) %>% 206 | ggplot(aes(var, car, fill = value)) + 207 | geom_tile() + 208 | scale_fill_viridis(name = expression(r[xy])) + 209 | theme( 210 | axis.text.y = element_text(size = 8), 211 | axis.text.x = element_text(size = 8, hjust = 1, vjust = 1, angle = 45), 212 | legend.position = 'bottom', 213 |
legend.key.height = grid::unit(.1, 'cm'), 214 | legend.key.width = grid::unit(.5, 'cm') 215 | ) + 216 | labs(subtitle = "With Clustering") 217 | 218 | grid.arrange(wo, w, ncol = 2) 219 | 220 | ``` 221 | 222 | 223 | #### A DocumentTermMatrix 224 | 225 | The `tidy_dtm` and `tidy_tdm` functions convert a `DocumentTermMatrix` or `TermDocumentMatrix` into a tidied data set. 226 | 227 | ```{r, warning=FALSE} 228 | my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_"))) 229 | 230 | tidy_dtm(my_dtm) %>% 231 | tidyr::extract(doc, c("time", "turn", "sentence"), "(\\d)_(\\d+)\\.(\\d+)") %>% 232 | mutate( 233 | time = as.numeric(time), 234 | turn = as.numeric(turn), 235 | sentence = as.numeric(sentence) 236 | ) %>% 237 | tbl_df() %T>% 238 | print() %>% 239 | group_by(time, term) %>% 240 | summarize(n = sum(n)) %>% 241 | group_by(time) %>% 242 | arrange(desc(n)) %>% 243 | slice(1:10) %>% 244 | mutate(term = factor(paste(term, time, sep = "__"), levels = rev(paste(term, time, sep = "__")))) %>% 245 | ggplot(aes(x = n, y = term)) + 246 | geom_barh(stat='identity') + 247 | facet_wrap(~time, ncol=2, scales = 'free_y') + 248 | scale_y_discrete(labels = function(x) gsub("__.+$", "", x)) 249 | ``` 250 | 251 | 252 | #### A DocumentTermMatrix of Collocations 253 | 254 | The `tidy_colo_dtm` and `tidy_colo_tdm` functions convert a `DocumentTermMatrix` or `TermDocumentMatrix` into a collocation matrix and then a tidied data set. 
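The debate-scale example below pulls in several extra data packages; as a smaller, self-contained sketch, the package's bundled `simple_dtm` data shows the shape of the output and why `unique_pairs` is helpful:

```r
library(textshape)
data(simple_dtm)

## Every co-occurring term pair shows up twice (upper and lower triangle)
colo <- tidy_colo_dtm(simple_dtm)
colo

## Keep only one copy of each mirrored pair
unique_pairs(colo)
```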
255 | 256 | ```{r} 257 | my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_"))) 258 | sw <- unique(c( 259 | lexicon::sw_jockers, 260 | lexicon::sw_loughran_mcdonald_long, 261 | lexicon::sw_fry_1000 262 | )) 263 | 264 | tidy_colo_dtm(my_dtm) %>% 265 | tbl_df() %>% 266 | filter(!term_1 %in% c('i', sw) & !term_2 %in% sw) %>% 267 | filter(term_1 != term_2) %>% 268 | unique_pairs() %>% 269 | filter(n > 15) %>% 270 | complete(term_1, term_2, fill = list(n = 0)) %>% 271 | ggplot(aes(x = term_1, y = term_2, fill = n)) + 272 | geom_tile() + 273 | scale_fill_gradient(low= 'white', high = 'red') + 274 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 275 | ``` 276 | 277 | 278 | ## Combining 279 | 280 | The `combine` function acts like `paste(x, collapse=" ")` on vectors and lists of vectors. On dataframes multiple text cells are pasted together within grouping variables. 281 | 282 | #### A Vector 283 | 284 | ```{r} 285 | x <- c("Computer", "is", "fun", ".", "Not", "too", "fun", ".") 286 | combine(x) 287 | ``` 288 | 289 | #### A Dataframe 290 | 291 | ```{r} 292 | (dat <- split_sentence(DATA)) 293 | combine(dat[, 1:5, with=FALSE]) 294 | ``` 295 | 296 | ## Tabulating 297 | 298 | `mtabulate` allows the user to transform data types into a dataframe of counts. 299 | 300 | #### A Vector 301 | 302 | ```{r} 303 | (x <- list(w=letters[1:10], x=letters[1:5], z=letters)) 304 | mtabulate(x) 305 | 306 | ## Dummy coding 307 | mtabulate(mtcars$cyl[1:10]) 308 | ``` 309 | 310 | 311 | #### A Dataframe 312 | 313 | ```{r} 314 | (dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3))) 315 | mtabulate(dat) 316 | t(mtabulate(dat)) 317 | ``` 318 | 319 | ## Spanning 320 | 321 | Often it is useful to know the duration (start-end) of turns of talk. The `duration` function calculates start-end durations as n words. 322 | 323 | 324 | #### A Vector 325 | 326 | ```{r} 327 | (x <- c( 328 | "Mr. Brown comes! He says hello. 
i give him coffee.", 329 | "I'll go at 5 p. m. eastern time. Or somewhere in between!", 330 | "go there" 331 | )) 332 | duration(x) 333 | # With grouping variables 334 | groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green")) 335 | duration(x, groups) 336 | ``` 337 | 338 | 339 | #### A Dataframe 340 | 341 | ```{r} 342 | duration(DATA) 343 | ``` 344 | 345 | #### Gantt Plot 346 | 347 | ```{r} 348 | library(ggplot2) 349 | ggplot(duration(DATA), aes(x = start, xend = end, y = person, yend = person, color = sex)) + 350 | geom_segment(size=4) + 351 | xlab("Duration (Words)") + 352 | ylab("Person") 353 | ``` 354 | 355 | ## Splitting 356 | 357 | The following section provides examples of available splitting functions. 358 | 359 | ### Indices 360 | 361 | `split_index` allows the user to supply the integer indices of where to split a data type. 362 | 363 | #### A Vector 364 | 365 | ```{r} 366 | split_index( 367 | LETTERS, 368 | indices = c(4, 10, 16), 369 | names = c("dog", "cat", "chicken", "rabbit") 370 | ) 371 | ``` 372 | 373 | #### A Dataframe 374 | 375 | Here I calculate the indices of every time the `vs` variable in the `mtcars` data set changes and then split the dataframe on those indices. The `change_index` function is handy for extracting the indices of changes in runs within an atomic vector. 376 | 377 | ```{r} 378 | (vs_change <- change_index(mtcars[["vs"]])) 379 | split_index(mtcars, indices = vs_change) 380 | ``` 381 | 382 | ### Matches 383 | 384 | `split_match` splits on elements that match exactly or via a regular expression match. 
385 | 386 | #### Exact Match 387 | 388 | ```{r} 389 | set.seed(15) 390 | (x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10)))) 391 | 392 | split_match(x) 393 | split_match(x, split = "C") 394 | split_match(x, split = c("", "C")) 395 | ## Don't include 396 | split_match(x, include = 0) 397 | ## Include at beginning 398 | split_match(x, include = 1) 399 | ## Include at end 400 | split_match(x, include = 2) 401 | ``` 402 | 403 | #### Regex Match 404 | 405 | Here I use the regex `"^I"` to split on any elements that begin with the capital letter I. 406 | 407 | ```{r} 408 | split_match(DATA[["state"]], split = "^I", regex=TRUE, include = 1) 409 | ``` 410 | 411 | 412 | ### Portions 413 | 414 | At times it is useful to split texts into portioned chunks, operate on the chunks, and aggregate the results. `split_portion` allows the user to do this sort of text shaping. We can split into n chunks per grouping variable (via `n.chunks`) or into chunks of n length (via `n.words`). 415 | 416 | #### A Vector 417 | 418 | 419 | ```{r} 420 | with(DATA, split_portion(state, n.chunks = 10)) 421 | with(DATA, split_portion(state, n.words = 10)) 422 | ``` 423 | 424 | #### A Dataframe 425 | 426 | ```{r} 427 | with(DATA, split_portion(state, list(sex, adult), n.words = 10)) 428 | ``` 429 | 430 | 431 | ### Runs 432 | 433 | `split_run` allows the user to split up runs of identical characters. 434 | 435 | ```{r} 436 | x1 <- c( 437 | "122333444455555666666", 438 | NA, 439 | "abbcccddddeeeeeffffff", 440 | "sddfg", 441 | "11112222333" 442 | ) 443 | 444 | x <- c(rep(x1, 2), ">>???,,,,....::::;[[") 445 | 446 | split_run(x) 447 | ``` 448 | 449 | #### Dataframe 450 | 451 | ```{r} 452 | DATA[["run.col"]] <- x 453 | split_run(DATA) 454 | ## Reset the DATA dataset 455 | DATA <- textshape::DATA 456 | ``` 457 | 458 | ### Sentences 459 | 460 | `split_sentence` provides a mapping + regex approach to splitting sentences.
It is less accurate than the Stanford parser but more accurate than a simple regular expression approach alone. 461 | 462 | #### A Vector 463 | 464 | ```{r} 465 | (x <- paste0( 466 | "Mr. Brown comes! He says hello. i give him coffee. i will ", 467 | "go at 5 p. m. eastern time. Or somewhere in between!go there" 468 | )) 469 | split_sentence(x) 470 | ``` 471 | 472 | #### A Dataframe 473 | 474 | ```{r} 475 | split_sentence(DATA) 476 | ``` 477 | 478 | ### Speakers 479 | 480 | Speakers may talk in unison; this is often displayed in a single cell as a comma-separated string of speakers. Some analyses may require this information to be parsed out and replicated as one turn per speaker. The `split_speaker` function accomplishes this. 481 | 482 | ```{r} 483 | DATA$person <- as.character(DATA$person) 484 | DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam", 485 | "greg, sally", "sam and sally") 486 | DATA 487 | 488 | split_speaker(DATA) 489 | ## Reset the DATA dataset 490 | DATA <- textshape::DATA 491 | ``` 492 | 493 | ### Tokens 494 | 495 | The `split_token` function splits data into words and punctuation. 496 | 497 | #### A Vector 498 | 499 | ```{r} 500 | (x <- c( 501 | "Mr. Brown comes! He says hello. i give him coffee.", 502 | "I'll go at 5 p. m. eastern time. Or somewhere in between!", 503 | "go there" 504 | )) 505 | split_token(x) 506 | ``` 507 | 508 | #### A Dataframe 509 | 510 | ```{r} 511 | split_token(DATA) 512 | ``` 513 | 514 | 515 | ### Transcript 516 | 517 | The `split_transcript` function splits `vector`s with speaker prefixes (e.g., `c("greg: Who me", "sarah: yes you!")`) into a two-column `data.frame`.
518 | 519 | #### A Vector 520 | 521 | ```{r} 522 | (x <- c( 523 | "greg: Who me", 524 | "sarah: yes you!", 525 | "greg: well why didn't you say so?", 526 | "sarah: I did but you weren't listening.", 527 | "greg: oh :-/ I see...", 528 | "dan: Ok let's meet at 4:30 pm for drinks" 529 | )) 530 | 531 | split_transcript(x) 532 | ``` 533 | 534 | ### Words 535 | 536 | The `split_word` function splits data into words. 537 | 538 | #### A Vector 539 | 540 | ```{r} 541 | (x <- c( 542 | "Mr. Brown comes! He says hello. i give him coffee.", 543 | "I'll go at 5 p. m. eastern time. Or somewhere in between!", 544 | "go there" 545 | )) 546 | split_word(x) 547 | ``` 548 | 549 | #### A Dataframe 550 | 551 | ```{r} 552 | split_word(DATA) 553 | ``` 554 | 555 | ## Grabbing 556 | 557 | The following section provides examples of available grabbing (from a starting point up to an ending point) functions. 558 | 559 | ### Indices 560 | 561 | `grab_index` allows the user to supply the integer indices of where to grab (from - up to) a data type. 562 | 563 | #### A Vector 564 | 565 | ```{r} 566 | grab_index(DATA$state, from = 2, to = 4) 567 | grab_index(DATA$state, from = 9) 568 | grab_index(DATA$state, to = 3) 569 | ``` 570 | 571 | #### A Dataframe 572 | 573 | ```{r} 574 | grab_index(DATA, from = 2, to = 4) 575 | ``` 576 | 577 | #### A List 578 | 579 | ```{r} 580 | grab_index(as.list(DATA$state), from = 2, to = 4) 581 | ``` 582 | 583 | ### Matches 584 | 585 | `grab_match` grabs (from - up to) elements that match a regular expression. 
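Conceptually, the regex matches are converted to integer positions and then handed off to an index subset. A rough base R sketch of the idea (a simplification using first matches only; not the package's actual code):

```{r}
## Hypothetical sketch: subset the elements between the first match of two regexes
grab_between <- function(x, from, to, ...) {
    i <- grep(from, x, ...)[1]  # index of the first element matching `from`
    j <- grep(to, x, ...)[1]    # index of the first element matching `to`
    x[i:j]
}
grab_between(c("alpha", "begin", "middle", "end", "omega"), "^begin$", "^end$")
```

The real `grab_match` additionally supports `from.n`/`to.n` for picking among multiple matches.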
586 | 587 | 588 | #### A Vector 589 | 590 | ```{r} 591 | grab_match(DATA$state, from = 'dumb', to = 'liar') 592 | grab_match(DATA$state, from = '^What are') 593 | grab_match(DATA$state, to = 'we do[?]') 594 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE, 595 | from.n = 'last', to.n = 'first') 596 | ``` 597 | 598 | #### A Dataframe 599 | 600 | ```{r} 601 | grab_match(DATA, from = 'dumb', to = 'liar') 602 | ``` 603 | 604 | #### A List 605 | 606 | ```{r} 607 | grab_match(as.list(DATA$state), from = 'dumb', to = 'liar') 608 | ``` 609 | 610 | ## Putting It Together 611 | 612 | Eduardo Flores blogged about [What the candidates say, analyzing republican debates using R](https://www.r-bloggers.com/2015/11/what-the-candidates-say-analyzing-republican-debates-using-r/), where he demonstrated some scraping and analysis techniques. Here I highlight how **textshape** tools can be combined to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/tidyverse/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores's scraping code with **textshape** replacements.
613 | 614 | ```{r} 615 | if (!require("pacman")) install.packages("pacman") 616 | pacman::p_load(rvest, magrittr, xml2) 617 | 618 | debates <- c( 619 | wisconsin = "110908", 620 | boulder = "110906", 621 | california = "110756", 622 | ohio = "110489" 623 | ) 624 | 625 | lapply(debates, function(x){ 626 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) %>% 627 | rvest::html_nodes("p") %>% 628 | rvest::html_text() %>% 629 | textshape::split_index(., grep("^[A-Z]+:", .)) %>% 630 | #textshape::split_match("^[A-Z]+:", TRUE, TRUE) %>% #equal to line above 631 | textshape::combine() %>% 632 | textshape::split_transcript() %>% 633 | textshape::split_sentence() 634 | }) %>% 635 | textshape::tidy_list("location") 636 | ``` 637 | 638 | 639 | 640 | -------------------------------------------------------------------------------- /data/DATA.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/DATA.rda -------------------------------------------------------------------------------- /data/golden_rules.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/golden_rules.rda -------------------------------------------------------------------------------- /data/hamlet.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/hamlet.rda -------------------------------------------------------------------------------- /data/simple_dtm.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/data/simple_dtm.rda 
-------------------------------------------------------------------------------- /inst/build.R: -------------------------------------------------------------------------------- 1 | root <- Sys.getenv("USERPROFILE") 2 | pack <- basename(getwd()) 3 | 4 | quick <- TRUE 5 | pdf <- FALSE 6 | 7 | unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE) 8 | devtools::document() 9 | devtools::install(quick = quick, build_vignettes = FALSE, dependencies = TRUE) 10 | 11 | if(pdf){ 12 | path <- find.package(pack) 13 | system(paste(shQuote(file.path(R.home("bin"), "R")), "CMD", "Rd2pdf", shQuote(path))) 14 | file.copy(paste0(pack, '.pdf'), file.path(root,"Desktop", paste0(pack, '.pdf'))) 15 | while (file.exists(paste0(pack, ".pdf"))) {unlink(paste0(pack, ".pdf"), recursive = TRUE, force = TRUE)} 16 | empts <- grep("^\\.Rd", dir(all.files = TRUE), value = TRUE) 17 | unlink(empts, recursive = TRUE, force = TRUE) 18 | } 19 | 20 | message("Done!") 21 | 22 | 23 | -------------------------------------------------------------------------------- /inst/docs/Simpsons_Roasting_on_an_Open_Fire_Script.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/inst/docs/Simpsons_Roasting_on_an_Open_Fire_Script.pdf -------------------------------------------------------------------------------- /inst/extra_statdoc/readme.R: -------------------------------------------------------------------------------- 1 |


2 |

textshape is a...

3 |

Download the development version of textshape here 4 | -------------------------------------------------------------------------------- /inst/make_data/golden_rules.R: -------------------------------------------------------------------------------- 1 | pacman::p_load(textshape, textclean, tidyverse, stringi, magrittr) 2 | 3 | golden_rules <- 'https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt' %>% 4 | readLines() %>% 5 | `[`(3:197) %>% 6 | # drop_element('^\\s*$') %>% 7 | split_match('^\\s*$', FALSE, TRUE) %>% 8 | map(function(x){ 9 | 10 | tibble::tibble( 11 | Rule = x %>% 12 | `[`(1) %>% 13 | stringi::stri_replace_all_regex('^\\d+\\)\\s*', '') %>% 14 | trimws(), 15 | Text = x[2], 16 | Outcome = x %>% 17 | `[`(3) %>% 18 | stringi::stri_replace_all_regex('^\\[', 'c(') %>% 19 | stringi::stri_replace_all_regex('\\]$', ')') %>% 20 | {parse(text = .)} %>% 21 | eval() 22 | ) 23 | }) %>% 24 | bind_rows() %>% 25 | nest(-c(Rule, Text), .key = Outcome) %>% 26 | filter(!grepl("[^ -~]", Text)) 27 | 28 | # golden_rules %$% 29 | # split_sentence(Text) 30 | 31 | # pax::new_data(golden_rules) 32 | #golden_rules$Text 33 | -------------------------------------------------------------------------------- /inst/staticdocs/index.R: -------------------------------------------------------------------------------- 1 | library(staticdocs) 2 | 3 | sd_section("", 4 | "Function for...", 5 | c( 6 | "myfun" 7 | ) 8 | ) -------------------------------------------------------------------------------- /inst/supporting_docs/LaverGarry.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/inst/supporting_docs/LaverGarry.zip -------------------------------------------------------------------------------- /man/DATA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit 
documentation in R/textshape-package.R 3 | \docType{data} 4 | \name{DATA} 5 | \alias{DATA} 6 | \title{Fictitious Classroom Dialogue} 7 | \format{ 8 | A data frame with 11 rows and 5 variables 9 | } 10 | \usage{ 11 | data(DATA) 12 | } 13 | \description{ 14 | A fictitious dataset useful for small demonstrations. 15 | } 16 | \details{ 17 | \itemize{ 18 | \item person. Speaker 19 | \item sex. Gender 20 | \item adult. Dummy coded adult (0-no; 1-yes) 21 | \item state. Statement (dialogue) 22 | \item code. Dialogue coding scheme 23 | } 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/bind_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind_list.R 3 | \name{bind_list} 4 | \alias{bind_list} 5 | \title{Row Bind a List of Named Dataframes or Vectors} 6 | \usage{ 7 | bind_list(x, id.name = "id", content.name = "content", ...) 8 | } 9 | \arguments{ 10 | \item{x}{A named \code{\link[base]{list}} of 11 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}.} 12 | 13 | \item{id.name}{The name to use for the column created from the \code{\link[base]{list}}.} 14 | 15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{list}} 16 | of \code{\link[base]{vector}}s (only used if \code{x} is \code{\link[base]{vector}}).} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 22 | from the \code{\link[base]{list}} as an \code{id} column. 23 | } 24 | \description{ 25 | Deprecated, use \code{\link[textshape]{tidy_list}} instead. 
26 | } 27 | \examples{ 28 | \dontrun{ 29 | bind_list(list(p=1:500, r=letters)) 30 | bind_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars)) 31 | 32 | ## 2015 Vice-Presidential Debates Example 33 | if (!require("pacman")) install.packages("pacman") 34 | pacman::p_load(rvest, magrittr, xml2) 35 | 36 | debates <- c( 37 | wisconsin = "110908", 38 | boulder = "110906", 39 | california = "110756", 40 | ohio = "110489" 41 | ) 42 | 43 | lapply(debates, function(x){ 44 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) \%>\% 45 | rvest::html_nodes("p") \%>\% 46 | rvest::html_text() \%>\% 47 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\% 48 | textshape::combine() \%>\% 49 | textshape::split_transcript() \%>\% 50 | textshape::split_sentence() 51 | }) \%>\% 52 | textshape::bind_list("location") 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /man/bind_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind_table.R 3 | \name{bind_table} 4 | \alias{bind_table} 5 | \title{Column Bind a Table's Values with Its Names} 6 | \usage{ 7 | bind_table(x, id.name = "id", content.name = "content", ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{\link[base]{table}}.} 11 | 12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{table}} 13 | \code{\link[base]{names}}.} 14 | 15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{table}} 16 | values.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 22 | from the \code{\link[base]{table}} as an \code{id} column. 23 | } 24 | \description{ 25 | Deprecated, use \code{\link[textshape]{tidy_table}} instead. 
26 | } 27 | \examples{ 28 | \dontrun{ 29 | x <- table(sample(LETTERS[1:6], 1000, TRUE)) 30 | bind_table(x) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/bind_vector.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bind_vector.R 3 | \name{bind_vector} 4 | \alias{bind_vector} 5 | \title{Column Bind an Atomic Vector's Values with Its Names} 6 | \usage{ 7 | bind_vector(x, id.name = "id", content.name = "content", ...) 8 | } 9 | \arguments{ 10 | \item{x}{A named atomic \code{\link[base]{vector}}.} 11 | 12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{vector}} 13 | \code{\link[base]{names}}.} 14 | 15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{vector}} 16 | values.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 22 | from the \code{\link[base]{vector}} as an \code{id} column. 23 | } 24 | \description{ 25 | Deprecated, use \code{\link[textshape]{tidy_vector}} instead. 26 | } 27 | \examples{ 28 | \dontrun{ 29 | x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE)) 30 | bind_vector(x) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/change_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/change_index.R 3 | \name{change_index} 4 | \alias{change_index} 5 | \title{Indexing of Changes in Runs} 6 | \usage{ 7 | change_index(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A vector.} 11 | 12 | \item{\ldots}{ignored.} 13 | } 14 | \value{ 15 | Returns a vector of integer indices of where a vector initially 16 | changes. 
17 | } 18 | \description{ 19 | Find the indices of changes in runs in a vector. This function pairs well 20 | with \code{split_index} and is the default for the \code{indices} in all 21 | \code{split_index} functions that act on atomic vectors. 22 | } 23 | \examples{ 24 | set.seed(10) 25 | (x <- sample(0:1, 20, TRUE)) 26 | change_index(x) 27 | split_index(x, change_index(x)) 28 | 29 | 30 | (p_chng <- change_index(CO2[["Plant"]])) 31 | split_index(CO2[["Plant"]], p_chng) 32 | } 33 | \seealso{ 34 | \code{\link[textshape]{split_index}} 35 | } 36 | -------------------------------------------------------------------------------- /man/cluster_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster_matrix.R 3 | \name{cluster_matrix} 4 | \alias{cluster_matrix} 5 | \title{Reorder a Matrix Based on Hierarchical Clustering} 6 | \usage{ 7 | cluster_matrix(x, dim = "both", method = "ward.D2", ...) 8 | } 9 | \arguments{ 10 | \item{x}{A matrix.} 11 | 12 | \item{dim}{The dimension to reorder (cluster); must be set to "row", "col", 13 | or "both".} 14 | 15 | \item{method}{The agglomeration method to be used (see 16 | \code{\link[stats]{hclust}}).} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a reordered matrix. 22 | } 23 | \description{ 24 | Reorder matrix rows, columns, or both via hierarchical clustering. 
25 | } 26 | \examples{ 27 | cluster_matrix(mtcars) 28 | cluster_matrix(mtcars, dim = 'row') 29 | cluster_matrix(mtcars, dim = 'col') 30 | 31 | \dontrun{ 32 | if (!require("pacman")) install.packages("pacman") 33 | pacman::p_load(tidyverse, viridis, gridExtra) 34 | 35 | ## plot heatmap w/o clustering 36 | wo <- mtcars \%>\% 37 | cor() \%>\% 38 | tidy_matrix('car', 'var') \%>\% 39 | ggplot(aes(var, car, fill = value)) + 40 | geom_tile() + 41 | scale_fill_viridis(name = expression(r[xy])) + 42 | theme( 43 | axis.text.y = element_text(size = 8) , 44 | axis.text.x = element_text( 45 | size = 8, 46 | hjust = 1, 47 | vjust = 1, 48 | angle = 45 49 | ), 50 | legend.position = 'bottom', 51 | legend.key.height = grid::unit(.1, 'cm'), 52 | legend.key.width = grid::unit(.5, 'cm') 53 | ) + 54 | labs(subtitle = "With Out Clustering") 55 | 56 | ## plot heatmap w clustering 57 | w <- mtcars \%>\% 58 | cor() \%>\% 59 | cluster_matrix() \%>\% 60 | tidy_matrix('car', 'var') \%>\% 61 | mutate( 62 | var = factor(var, levels = unique(var)), 63 | car = factor(car, levels = unique(car)) 64 | ) \%>\% 65 | group_by(var) \%>\% 66 | ggplot(aes(var, car, fill = value)) + 67 | geom_tile() + 68 | scale_fill_viridis(name = expression(r[xy])) + 69 | theme( 70 | axis.text.y = element_text(size = 8) , 71 | axis.text.x = element_text( 72 | size = 8, 73 | hjust = 1, 74 | vjust = 1, 75 | angle = 45 76 | ), 77 | legend.position = 'bottom', 78 | legend.key.height = grid::unit(.1, 'cm'), 79 | legend.key.width = grid::unit(.5, 'cm') 80 | ) + 81 | labs(subtitle = "With Clustering") 82 | 83 | gridExtra::grid.arrange(wo, w, ncol = 2) 84 | } 85 | } 86 | \seealso{ 87 | \code{\link[stats]{hclust}} 88 | } 89 | -------------------------------------------------------------------------------- /man/column_to_rownames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/column_to_rownames.R 3 | 
\name{column_to_rownames} 4 | \alias{column_to_rownames} 5 | \title{Add a Column as Rownames} 6 | \usage{ 7 | column_to_rownames(x, loc = 1) 8 | } 9 | \arguments{ 10 | \item{x}{An object that can be coerced to a \code{\link[base]{data.frame}}.} 11 | 12 | \item{loc}{The column location as either an integer or string index location. 13 | Must be unique row names.} 14 | } 15 | \value{ 16 | Returns a \code{\link[base]{data.frame}} with the specified column 17 | moved to rownames. 18 | } 19 | \description{ 20 | Takes an existing column and uses it as rownames instead. This is useful 21 | when turning a \code{\link[base]{data.frame}} into a \code{\link[base]{matrix}}. 22 | Inspired by the \pkg{tibble} package's \code{column_to_row} which is now 23 | deprecated if done on a \pkg{tibble} object. By coercing to a 24 | \code{\link[base]{data.frame}} this problem is avoided. 25 | } 26 | \examples{ 27 | state_dat <- data.frame(state.name, state.area, state.center, state.division) 28 | column_to_rownames(state_dat) 29 | column_to_rownames(state_dat, 'state.name') 30 | } 31 | -------------------------------------------------------------------------------- /man/combine.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/combine.R 3 | \name{combine} 4 | \alias{combine} 5 | \alias{combine.default} 6 | \alias{combine.data.frame} 7 | \title{Combine Elements} 8 | \usage{ 9 | combine(x, ...) 10 | 11 | \method{combine}{default}(x, fix.punctuation = TRUE, ...) 12 | 13 | \method{combine}{data.frame}(x, text.var = TRUE, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with runs.} 17 | 18 | \item{fix.punctuation}{logical If \code{TRUE} spaces before/after punctuation 19 | that should not be are a removed (regex used: 20 | \code{"(\\s+(?=[,.?!;:\%-]))|((?<=[$-])\\s+)"}).} 21 | 22 | \item{text.var}{The name of the text variable.} 23 | 24 | \item{\ldots}{Ignored.} 25 | } 26 | \value{ 27 | Returns a vector (if given a list/vector) or an expanded 28 | \code{\link[data.table]{data.table}} with elements pasted together. 29 | } 30 | \description{ 31 | Combine (\code{\link[base]{paste}}) elements (\code{\link[base]{vector}}s, 32 | \code{\link[base]{list}}s, or \code{\link[base]{data.frame}}s) together 33 | with \code{collapse = TRUE}. 34 | } 35 | \examples{ 36 | (x <- split_token(DATA[["state"]][1], FALSE)) 37 | combine(x) 38 | 39 | (x2 <- split_token(DATA[["state"]], FALSE)) 40 | combine(x2) 41 | 42 | (x3 <- split_sentence(DATA)) 43 | 44 | ## without dropping the non-group variable column 45 | combine(x3) 46 | 47 | ## Dropping the non-group variable column 48 | combine(x3[, 1:5, with=FALSE]) 49 | } 50 | -------------------------------------------------------------------------------- /man/duration.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/duration.R 3 | \name{duration} 4 | \alias{duration} 5 | \alias{duration.default} 6 | \alias{duration.data.frame} 7 | \alias{duration.numeric} 8 | \alias{starts} 9 | \alias{ends} 10 | \title{Duration of Turns of Talk} 11 | \usage{ 12 | duration(x, ...) 13 | 14 | \method{duration}{default}(x, grouping.var = NULL, ...) 15 | 16 | \method{duration}{data.frame}(x, text.var = TRUE, ...) 17 | 18 | \method{duration}{numeric}(x, ...) 19 | 20 | starts(x, ...) 21 | 22 | ends(x, ...) 
23 | } 24 | \arguments{ 25 | \item{x}{A \code{\link[base]{data.frame}} or character vector with a text 26 | variable or a numeric vector.} 27 | 28 | \item{grouping.var}{The grouping variables. Default \code{NULL} generates 29 | one word list for all text. Also takes a single grouping variable or a list 30 | of 1 or more grouping variables.} 31 | 32 | \item{text.var}{The name of the text variable. If \code{TRUE} 33 | \code{duration} tries to detect the text column.} 34 | 35 | \item{\ldots}{Ignored.} 36 | } 37 | \value{ 38 | Returns a vector or data frame of starts and/or ends. 39 | } 40 | \description{ 41 | \code{duration} - Calculate the duration (start and end times) of turns of 42 | talk measured in words. 43 | 44 | \code{starts} - Calculate start times from a numeric vector. 45 | 46 | \code{ends} - Calculate end times from a numeric vector. 47 | } 48 | \examples{ 49 | (x <- c( 50 | "Mr. Brown comes! He says hello. i give him coffee.", 51 | "I'll go at 5 p. m. eastern time. Or somewhere in between!", 52 | "go there" 53 | )) 54 | duration(x) 55 | group <- c("A", "B", "A") 56 | duration(x, group) 57 | 58 | groups <- list(group1 = c("A", "B", "A"), group2 = c("red", "red", "green")) 59 | duration(x, groups) 60 | 61 | data(DATA) 62 | duration(DATA) 63 | 64 | ## Larger data set 65 | duration(hamlet) 66 | 67 | ## Integer values 68 | x <- sample(1:10, 10) 69 | duration(x) 70 | starts(x) 71 | ends(x) 72 | } 73 | -------------------------------------------------------------------------------- /man/flatten.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/flatten.R 3 | \name{flatten} 4 | \alias{flatten} 5 | \title{Flatten a Nested List of Vectors Into a Single Tier List of Vectors} 6 | \usage{ 7 | flatten(x, sep = "_", ...)
8 | } 9 | \arguments{ 10 | \item{x}{A nested, named list of vectors.} 11 | 12 | \item{sep}{A separator to use for the concatenation of the names from the 13 | nested list.} 14 | 15 | \item{\ldots}{ignored.} 16 | } 17 | \value{ 18 | Returns a flattened list. 19 | } 20 | \description{ 21 | Flatten a named, nested list of atomic vectors to a single level using the 22 | concatenated list/atomic vector names as the names of the single tiered 23 | list. 24 | } 25 | \note{ 26 | The order of the list is sorted alphabetically. Pull requests for the 27 | option to return the original order would be appreciated. 28 | } 29 | \examples{ 30 | x <- list( 31 | urban = list( 32 | cars = c('volvo', 'ford'), 33 | food.dining = list( 34 | local.business = c('carls'), 35 | chain.business = c('dennys', 'panera') 36 | ) 37 | ), 38 | rural = list( 39 | land.use = list( 40 | farming =list( 41 | dairy = c('cows'), 42 | vegie.plan = c('carrots') 43 | ) 44 | ), 45 | social.rec = list( 46 | community.center = c('town.square') 47 | ), 48 | people.type = c('good', 'bad', 'in.between') 49 | ), 50 | other.locales = c('suburban'), 51 | missing = list( 52 | unknown = c(), 53 | known = c() 54 | ), 55 | end = c('wow') 56 | ) 57 | 58 | x 59 | 60 | flatten(x) 61 | flatten(x, ' -> ') 62 | } 63 | \references{ 64 | \url{https://stackoverflow.com/a/41882883/1000343} \cr 65 | \url{https://stackoverflow.com/a/48357114/1000343} 66 | } 67 | \author{ 68 | StackOverflow user @Michael and Paul Foster and Tyler 69 | Rinker . 
70 | } 71 | -------------------------------------------------------------------------------- /man/from_to.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/from_to.R 3 | \name{from_to} 4 | \alias{from_to} 5 | \alias{from_to.default} 6 | \alias{from_to.character} 7 | \alias{from_to.factor} 8 | \alias{from_to.numeric} 9 | \alias{from_to.data.frame} 10 | \alias{from_to_summarize} 11 | \title{Prepare Discourse Data for Network Plotting} 12 | \usage{ 13 | from_to(x, ...) 14 | 15 | \method{from_to}{default}(x, final = "End", ...) 16 | 17 | \method{from_to}{character}(x, final = "End", ...) 18 | 19 | \method{from_to}{factor}(x, final = "End", ...) 20 | 21 | \method{from_to}{numeric}(x, final = "End", ...) 22 | 23 | \method{from_to}{data.frame}(x, from.var, final = "End", ...) 24 | 25 | from_to_summarize(x, from.var, id.vars = NULL, text.var = TRUE, ...) 26 | } 27 | \arguments{ 28 | \item{x}{A data form \code{vector} or \code{data.frame}.} 29 | 30 | \item{final}{The name of the closing element or node.} 31 | 32 | \item{from.var}{A character string naming the column to be considered the 33 | origin of the talk.} 34 | 35 | \item{id.vars}{The variables that correspond to the speaker or are attributes 36 | of the speaker (from variable).} 37 | 38 | \item{text.var}{The name of the text variable. If \code{TRUE} 39 | \code{duration} tries to detect the text column.} 40 | 41 | \item{\ldots}{Ignored.} 42 | } 43 | \value{ 44 | Returns a vector (if given a vector) or an augmented 45 | \code{\link[data.table]{data.table}}. 46 | } 47 | \description{ 48 | \code{from_to} - Add the next speaker as the from variable in a to/from 49 | network data structure. Assumes that the flow of discourse is coming from 50 | person A to person B, or at the very least the talk is taken up by person B. 
51 | Works by taking the vector of speakers and shifting everything down one and 52 | then adding a closing element. 53 | 54 | \code{from_to_summarize} - A wrapper for \code{from_to.data.frame} that 55 | adds a \code{word.count} column and then combines duplicate rows. 56 | } 57 | \examples{ 58 | from_to(DATA, 'person') 59 | from_to_summarize(DATA, 'person') 60 | from_to_summarize(DATA, 'person', c('sex', 'adult')) 61 | \dontrun{ 62 | if (!require("pacman")) install.packages("pacman"); library(pacman) 63 | p_load(dplyr, geomnet, qdap, stringi, scales) 64 | p_load_current_gh('trinker/textshape') 65 | 66 | dat <- from_to_summarize(DATA, 'person', c('sex', 'adult')) \%>\% 67 | mutate(words = rescale(word.count, c(.5, 1.5))) 68 | 69 | dat \%>\% 70 | ggplot(aes(from_id = from, to_id = to)) + 71 | geom_net( 72 | aes(linewidth = words), 73 | layout.alg = "fruchtermanreingold", 74 | directed = TRUE, 75 | labelon = TRUE, 76 | size = 1, 77 | labelcolour = 'black', 78 | ecolour = "grey70", 79 | arrowsize = 1, 80 | curvature = .1 81 | ) + 82 | theme_net() + 83 | xlim(c(-0.05, 1.05)) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /man/golden_rules.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textshape-package.R 3 | \docType{data} 4 | \name{golden_rules} 5 | \alias{golden_rules} 6 | \title{Sentence Boundary Disambiguation Edge Cases} 7 | \format{ 8 | A data frame with 45 rows and 3 variables 9 | } 10 | \usage{ 11 | data(golden_rules) 12 | } 13 | \description{ 14 | A slightly filtered dataset containing Dias's sentence boundary 15 | disambiguation edge cases. This is a nested data set with the outcome 16 | column as a nested list of desired splits. The non-ASCII cases and spaced 17 | ellipsis examples have been removed. 18 | } 19 | \details{ 20 | \itemize{ 21 | \item Rule.
The name of the rule to test 22 | \item Text. The testing text 23 | \item Outcome. The desired outcome of the sentence disambiguation 24 | } 25 | } 26 | \references{ 27 | Dias, Kevin S. 2015. Golden Rules (English). 28 | Retrieved: https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/grab_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/grab_index.R 3 | \name{grab_index} 4 | \alias{grab_index} 5 | \alias{grab_index.character} 6 | \alias{grab_index.default} 7 | \alias{grab_index.list} 8 | \alias{grab_index.data.frame} 9 | \alias{grab_index.matrix} 10 | \title{Get Elements Matching Between 2 Points} 11 | \usage{ 12 | grab_index(x, from = NULL, to = NULL, ...) 13 | 14 | \method{grab_index}{character}(x, from = NULL, to = NULL, ...) 15 | 16 | \method{grab_index}{default}(x, from = NULL, to = NULL, ...) 17 | 18 | \method{grab_index}{list}(x, from = NULL, to = NULL, ...) 19 | 20 | \method{grab_index}{data.frame}(x, from = NULL, to = NULL, ...) 21 | 22 | \method{grab_index}{matrix}(x, from = NULL, to = NULL, ...) 23 | } 24 | \arguments{ 25 | \item{x}{A character vector, \code{\link[base]{data.frame}}, or list.} 26 | 27 | \item{from}{An integer to start from (if \code{NULL} defaults to the first 28 | element/row).} 29 | 30 | \item{to}{An integer to get up to (if \code{NULL} defaults to the last 31 | element/row).} 32 | 33 | \item{\ldots}{ignored.} 34 | } 35 | \value{ 36 | Returns a subset of the original data set. 37 | } 38 | \description{ 39 | Use integer indices to get all the elements between two points.
40 | } 41 | \examples{ 42 | grab_index(DATA, from = 2, to = 4) 43 | grab_index(DATA$state, from = 2, to = 4) 44 | grab_index(DATA$state, from = 2) 45 | grab_index(DATA$state, to = 4) 46 | grab_index(matrix(1:100, nrow = 10), 2, 4) 47 | } 48 | -------------------------------------------------------------------------------- /man/grab_match.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/grab_match.R 3 | \name{grab_match} 4 | \alias{grab_match} 5 | \alias{grab_match.character} 6 | \alias{grab_match.list} 7 | \alias{grab_match.data.frame} 8 | \title{Get Elements Matching Between 2 Points} 9 | \usage{ 10 | grab_match(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...) 11 | 12 | \method{grab_match}{character}(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...) 13 | 14 | \method{grab_match}{list}(x, from = NULL, to = NULL, from.n = 1, to.n = 1, ...) 15 | 16 | \method{grab_match}{data.frame}( 17 | x, 18 | from = NULL, 19 | to = NULL, 20 | from.n = 1, 21 | to.n = 1, 22 | text.var = TRUE, 23 | ... 24 | ) 25 | } 26 | \arguments{ 27 | \item{x}{A character vector, \code{\link[base]{data.frame}}, or list.} 28 | 29 | \item{from}{A regex to start getting from (if \code{NULL} defaults to the 30 | first element/row).} 31 | 32 | \item{to}{A regex to get up to (if \code{NULL} defaults to the last element/row).} 33 | 34 | \item{from.n}{If more than one element matches \code{from} this dictates 35 | which one should be used. Must be an integer up to the number of possible 36 | matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match 37 | possible), or \code{'n'} (the same as \code{'last'}).} 38 | 39 | \item{to.n}{If more than one element matches \code{to} this dictates 40 | which one should be used. 
Must be an integer up to the number of possible 41 | matches, \code{'first'} (equal to \code{1}), \code{'last'} (the last match 42 | possible), or \code{'n'} (the same as \code{'last'}).} 43 | 44 | \item{text.var}{The name of the text variable with matches. If \code{TRUE} 45 | \code{grab_match} tries to detect the text column.} 46 | 47 | \item{\ldots}{Other arguments passed to \code{\link[base]{grep}}, most notable 48 | is \code{ignore.case}.} 49 | } 50 | \value{ 51 | Returns a subset of the original data set. 52 | } 53 | \description{ 54 | Use regexes to get all the elements between two points. 55 | } 56 | \examples{ 57 | grab_match(DATA$state, from = 'dumb', to = 'liar') 58 | grab_match(DATA$state, from = 'dumb') 59 | grab_match(DATA$state, to = 'liar') 60 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE) 61 | grab_match(DATA$state, from = 'no', to = 'the', ignore.case = TRUE, 62 | from.n = 'first', to.n = 'last') 63 | grab_match(as.list(DATA$state), from = 'dumb', to = 'liar') 64 | 65 | ## Data.frame: attempts to find text.var 66 | grab_match(DATA, from = 'dumb', to = 'liar') 67 | } 68 | -------------------------------------------------------------------------------- /man/hamlet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textshape-package.R 3 | \docType{data} 4 | \name{hamlet} 5 | \alias{hamlet} 6 | \title{Hamlet (Complete & Split by Sentence)} 7 | \format{ 8 | A data frame with 2007 rows and 7 variables 9 | } 10 | \usage{ 11 | data(hamlet) 12 | } 13 | \description{ 14 | A dataset containing the complete dialogue of Hamlet with turns of talk split 15 | into sentences. 16 | } 17 | \details{ 18 | \itemize{ 19 | \item act. The act (akin to repeated measures) 20 | \item tot. The turn of talk 21 | \item scene. The scene (nested within an act) 22 | \item location. Location of the scene 23 | \item person. 
Character in the play 24 | \item died. Logically coded death variable; if \code{TRUE} the character dies in the 25 | play 26 | \item dialogue. The spoken dialogue 27 | } 28 | } 29 | \references{ 30 | \url{http://www.gutenberg.org} 31 | } 32 | \keyword{datasets} 33 | -------------------------------------------------------------------------------- /man/mtabulate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mtabulate.R 3 | \name{mtabulate} 4 | \alias{mtabulate} 5 | \alias{as_list} 6 | \title{Tabulate Frequency Counts for Multiple Vectors} 7 | \usage{ 8 | mtabulate(vects) 9 | 10 | as_list(mat, nm = rownames(mat)) 11 | } 12 | \arguments{ 13 | \item{vects}{A \code{\link[base]{vector}}, \code{\link[base]{list}}, or 14 | \code{\link[base]{data.frame}} of named/unnamed vectors.} 15 | 16 | \item{mat}{A matrix of counts.} 17 | 18 | \item{nm}{A character vector of names to assign to the list.} 19 | } 20 | \value{ 21 | \code{mtabulate} - Returns a \code{\link[base]{data.frame}} with 22 | columns equal to the number of unique elements and the number of rows equal to 23 | the original length of the \code{\link[base]{vector}}, 24 | \code{\link[base]{list}}, or \code{\link[base]{data.frame}} (length equals 25 | number of columns in \code{\link[base]{data.frame}}). If the list of vectors is 26 | named, these names become the rownames of the dataframe. 27 | 28 | \code{as_list} - Returns a list of elements. 29 | } 30 | \description{ 31 | \code{mtabulate} - Similar to \code{\link[base]{tabulate}} but works on 32 | multiple vectors. 33 | 34 | \code{as_list} - Convert a count matrix to a named list of elements. The 35 | semantic inverse of \code{mtabulate}.
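The row-per-vector tabulation described above can be sketched in a few lines of base R. This is an illustration only, not the package implementation, and `mini_mtabulate` is a hypothetical name:

```r
## A simplified sketch of what mtabulate() computes: one row per vector,
## one column per unique element, cells holding frequency counts.
mini_mtabulate <- function(vects) {
  lvls <- sort(unique(unlist(vects)))
  counts <- lapply(vects, function(v) as.vector(table(factor(v, levels = lvls))))
  out <- do.call(rbind, counts)
  colnames(out) <- lvls
  as.data.frame(out)
}

mini_mtabulate(list(w = letters[1:3], x = letters[2:4]))
```

The real `mtabulate()` additionally accepts data.frames and unnamed input, and `as_list()` reverses the operation.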
36 | } 37 | \examples{ 38 | mtabulate(list(w=letters[1:10], x=letters[1:5], z=letters)) 39 | mtabulate(list(mtcars$cyl[1:10])) 40 | 41 | ## Dummy coding 42 | mtabulate(mtcars$cyl[1:10]) 43 | mtabulate(CO2[, "Plant"]) 44 | 45 | dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3)) 46 | mtabulate(dat) 47 | as_list(mtabulate(dat)) 48 | t(mtabulate(dat)) 49 | as_list(t(mtabulate(dat))) 50 | } 51 | \references{ 52 | \url{https://stackoverflow.com/a/9961324/1000343} 53 | } 54 | \seealso{ 55 | \code{\link[base]{tabulate}} 56 | } 57 | \author{ 58 | Joran Elias and Tyler Rinker. 59 | } 60 | \keyword{frequency} 61 | \keyword{tabulate} 62 | -------------------------------------------------------------------------------- /man/simple_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textshape-package.R 3 | \docType{data} 4 | \name{simple_dtm} 5 | \alias{simple_dtm} 6 | \title{Simple \code{\link[tm]{DocumentTermMatrix}}} 7 | \format{ 8 | A list with 6 elements 9 | } 10 | \usage{ 11 | data(simple_dtm) 12 | } 13 | \description{ 14 | A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}.
15 | } 16 | \details{ 17 | \describe{ 18 | \item{i}{The document locations} 19 | \item{j}{The term locations} 20 | \item{v}{The count of terms for that particular element position} 21 | \item{nrow}{The number of rows} 22 | \item{ncol}{The number of columns} 23 | \item{dimnames}{The document and term names} 24 | } 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/split_index.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_index.R 3 | \name{split_index} 4 | \alias{split_index} 5 | \alias{split_index.list} 6 | \alias{split_index.data.frame} 7 | \alias{split_index.matrix} 8 | \alias{split_index.numeric} 9 | \alias{split_index.factor} 10 | \alias{split_index.character} 11 | \alias{split_index.default} 12 | \title{Split Data Forms at Specified Indices} 13 | \usage{ 14 | split_index( 15 | x, 16 | indices = if (is.atomic(x)) { 17 | NULL 18 | } else { 19 | change_index(x) 20 | }, 21 | names = NULL, 22 | ... 23 | ) 24 | 25 | \method{split_index}{list}(x, indices, names = NULL, ...) 26 | 27 | \method{split_index}{data.frame}(x, indices, names = NULL, ...) 28 | 29 | \method{split_index}{matrix}(x, indices, names = NULL, ...) 30 | 31 | \method{split_index}{numeric}(x, indices = change_index(x), names = NULL, ...) 32 | 33 | \method{split_index}{factor}(x, indices = change_index(x), names = NULL, ...) 34 | 35 | \method{split_index}{character}(x, indices = change_index(x), names = NULL, ...) 36 | 37 | \method{split_index}{default}(x, indices = change_index(x), names = NULL, ...) 38 | } 39 | \arguments{ 40 | \item{x}{A data form (\code{list}, \code{vector}, \code{data.frame}, 41 | \code{matrix}).} 42 | 43 | \item{indices}{A vector of integer indices to split at. If \code{indices} 44 | contains the index 1, it will be silently dropped.
The default value when 45 | \code{x} evaluates to \code{TRUE} for \code{\link[base]{is.atomic}} is to use 46 | \code{\link[textshape]{change_index}(x)}.} 47 | 48 | \item{names}{Optional vector of names to give to the list elements.} 49 | 50 | \item{\ldots}{Ignored.} 51 | } 52 | \value{ 53 | Returns a list of data forms broken at the \code{indices}. 54 | } 55 | \description{ 56 | Split data forms at specified integer indices. 57 | } 58 | \note{ 59 | Two dimensional objects will retain their dimensions (i.e., \code{drop = FALSE} 60 | is used). 61 | } 62 | \examples{ 63 | ## character 64 | split_index(LETTERS, c(4, 10, 16)) 65 | split_index(LETTERS, c(4, 10, 16), c("dog", "cat", "chicken", "rabbit")) 66 | 67 | ## numeric 68 | split_index(1:100, c(33, 66)) 69 | 70 | ## factor 71 | (p_chng <- change_index(CO2[["Plant"]])) 72 | split_index(CO2[["Plant"]], p_chng) 73 | #`change_index` is unnecessary as it is the default for atomic vectors 74 | split_index(CO2[["Plant"]]) 75 | 76 | ## list 77 | split_index(as.list(LETTERS), c(4, 10, 16)) 78 | 79 | ## data.frame 80 | (vs_change <- change_index(mtcars[["vs"]])) 81 | split_index(mtcars, vs_change) 82 | 83 | ## matrix 84 | (mat <- matrix(1:50, nrow=10)) 85 | split_index(mat, c(3, 6, 10)) 86 | } 87 | \seealso{ 88 | \code{\link[textshape]{change_index}} 89 | } 90 | -------------------------------------------------------------------------------- /man/split_match.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_match.R 3 | \name{split_match} 4 | \alias{split_match} 5 | \alias{split_match_regex} 6 | \title{Split a Vector By Split Points} 7 | \usage{ 8 | split_match(x, split = "", include = FALSE, regex = FALSE, ...) 9 | 10 | split_match_regex(x, split = "", include = FALSE, ...)
11 | } 12 | \arguments{ 13 | \item{x}{A vector with split points.} 14 | 15 | \item{split}{A vector of places (elements) to split on or a regular 16 | expression if the \code{regex} argument is \code{TRUE}.} 17 | 18 | \item{include}{An integer of \code{1} (\code{split} character(s) are not 19 | included in the output), \code{2} (\code{split} character(s) are included at 20 | the beginning of the output), or \code{3} (\code{split} character(s) are 21 | included at the end of the output).} 22 | 23 | \item{regex}{logical. If \code{TRUE} regular expressions will be enabled for the 24 | \code{split} argument.} 25 | 26 | \item{\ldots}{other arguments passed to \code{\link[base]{grep}} and 27 | \code{\link[base]{grepl}}.} 28 | } 29 | \value{ 30 | Returns a list of vectors. 31 | } 32 | \description{ 33 | \code{split_match} - Splits a \code{vector} into a list of vectors based on 34 | split points. 35 | 36 | \code{split_match_regex} - \code{split_match} with \code{regex = TRUE}. 37 | } 38 | \examples{ 39 | set.seed(15) 40 | x <- sample(c("", LETTERS[1:10]), 25, TRUE, prob=c(.2, rep(.08, 10))) 41 | 42 | split_match(x) 43 | split_match(x, "C") 44 | split_match(x, c("", "C")) 45 | 46 | split_match(x, include = 0) 47 | split_match(x, include = 1) 48 | split_match(x, include = 2) 49 | 50 | set.seed(15) 51 | x <- sample(1:11, 25, TRUE, prob=c(.2, rep(.08, 10))) 52 | split_match(x, 1) 53 | } 54 | \references{ 55 | \url{https://stackoverflow.com/a/24319217/1000343} 56 | } 57 | \author{ 58 | Matthew Flickinger and Tyler Rinker.
59 | } 60 | -------------------------------------------------------------------------------- /man/split_portion.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_portion.R 3 | \name{split_portion} 4 | \alias{split_portion} 5 | \title{Break Text Into Ordered Word Chunks} 6 | \usage{ 7 | split_portion( 8 | text.var, 9 | grouping.var = NULL, 10 | n.words, 11 | n.chunks, 12 | as.string = TRUE, 13 | rm.unequal = FALSE, 14 | as.table = TRUE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{text.var}{The text variable.} 20 | 21 | \item{grouping.var}{The grouping variables. Default \code{NULL} generates 22 | one word list for all text. Also takes a single grouping variable or a list 23 | of 1 or more grouping variables.} 24 | 25 | \item{n.words}{An integer specifying the number of words in each chunk (must 26 | specify \code{n.chunks} or \code{n.words}).} 27 | 28 | \item{n.chunks}{An integer specifying the number of chunks (must specify 29 | \code{n.chunks} or \code{n.words}).} 30 | 31 | \item{as.string}{logical. If \code{TRUE} the chunks are returned as a single 32 | string. If \code{FALSE} the chunks are returned as a vector of single words.} 33 | 34 | \item{rm.unequal}{logical. If \code{TRUE} final chunks that are unequal in 35 | length to the other chunks are removed.} 36 | 37 | \item{as.table}{logical. If \code{TRUE} the list output is coerced to 38 | \code{\link[data.table]{data.table}} or \pkg{tibble}.} 39 | 40 | \item{\ldots}{Ignored.} 41 | } 42 | \value{ 43 | Returns a list or \code{\link[data.table]{data.table}} of text chunks. 44 | } 45 | \description{ 46 | Some visualizations and algorithms require text to be broken into chunks of 47 | ordered words. \code{split_portion} breaks text, optionally by grouping 48 | variables, into equal chunks. The chunk size can be specified by giving 49 | the number of words to be in each chunk or the number of chunks.
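The `n.words` chunking described above can be sketched in base R. This is illustrative only; `chunk_words` is a hypothetical helper, and the real function also supports grouping variables and `n.chunks`:

```r
## Flatten text into words, then group every n.words consecutive words
## into one chunk (the last chunk may be shorter).
chunk_words <- function(text, n.words) {
  words <- unlist(strsplit(text, "\\s+"))
  idx <- ceiling(seq_along(words) / n.words)
  lapply(split(words, idx), paste, collapse = " ")
}

chunk_words(c("a b c d", "e f g"), n.words = 3)
```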
50 | } 51 | \examples{ 52 | with(DATA, split_portion(state, n.chunks = 10)) 53 | with(DATA, split_portion(state, n.words = 10)) 54 | with(DATA, split_portion(state, n.chunks = 10, as.string=FALSE)) 55 | with(DATA, split_portion(state, n.chunks = 10, rm.unequal=TRUE)) 56 | with(DATA, split_portion(state, person, n.chunks = 10)) 57 | with(DATA, split_portion(state, list(sex, adult), n.words = 10)) 58 | with(DATA, split_portion(state, person, n.words = 10, rm.unequal=TRUE)) 59 | 60 | ## Bigger data 61 | with(hamlet, split_portion(dialogue, person, n.chunks = 10)) 62 | with(hamlet, split_portion(dialogue, list(act, scene, person), n.chunks = 10)) 63 | with(hamlet, split_portion(dialogue, person, n.words = 300)) 64 | with(hamlet, split_portion(dialogue, list(act, scene, person), n.words = 300)) 65 | } 66 | \keyword{chunks} 67 | \keyword{group} 68 | \keyword{text} 69 | -------------------------------------------------------------------------------- /man/split_run.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_run.R 3 | \name{split_run} 4 | \alias{split_run} 5 | \alias{split_run.default} 6 | \alias{split_run.data.frame} 7 | \title{Split Runs} 8 | \usage{ 9 | split_run(x, ...) 10 | 11 | \method{split_run}{default}(x, ...) 12 | 13 | \method{split_run}{data.frame}(x, text.var = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with runs.} 17 | 18 | \item{text.var}{The name of the text variable with runs. If \code{TRUE} 19 | \code{split_run} tries to detect the text column with runs.} 20 | 21 | \item{\ldots}{Ignored.} 22 | } 23 | \value{ 24 | Returns a list of vectors of runs or an expanded 25 | \code{\link[data.table]{data.table}} with runs split apart. 26 | } 27 | \description{ 28 | Split runs of consecutive characters.
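The run splitting described above boils down to a single backreference regex in base R. A sketch only; `split_runs` is a hypothetical name, not the package code:

```r
## Each match of "(.)\\1*" is a maximal run of one repeated character,
## so regmatches() extracts the runs in order.
split_runs <- function(x) {
  regmatches(x, gregexpr("(.)\\1*", x))
}

split_runs("122333444455555666666")
```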
29 | } 30 | \examples{ 31 | x1 <- c( 32 | "122333444455555666666", 33 | NA, 34 | "abbcccddddeeeeeffffff", 35 | "sddfg", 36 | "11112222333" 37 | ) 38 | 39 | x <- c(rep(x1, 2), ">>???,,,,....::::;[[") 40 | 41 | split_run(x) 42 | 43 | 44 | DATA[["run.col"]] <- x 45 | split_run(DATA, "run.col") 46 | } 47 | -------------------------------------------------------------------------------- /man/split_sentence.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_sentence.R 3 | \name{split_sentence} 4 | \alias{split_sentence} 5 | \alias{split_sentence.default} 6 | \alias{split_sentence.data.frame} 7 | \title{Split Sentences} 8 | \usage{ 9 | split_sentence(x, ...) 10 | 11 | \method{split_sentence}{default}(x, ...) 12 | 13 | \method{split_sentence}{data.frame}(x, text.var = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with sentences.} 17 | 18 | \item{text.var}{The name of the text variable. If \code{TRUE} 19 | \code{split_sentence} tries to detect the column with sentences.} 20 | 21 | \item{\ldots}{Ignored.} 22 | } 23 | \value{ 24 | Returns a list of vectors of sentences or an expanded 25 | \code{\link[base]{data.frame}} with sentences split apart. 26 | } 27 | \description{ 28 | Split sentences. 29 | } 30 | \examples{ 31 | (x <- c(paste0( 32 | "Mr. Brown comes! He says hello. i give him coffee. i will ", 33 | "go at 5 p. m. eastern time. Or somewhere in between!go there" 34 | ), 35 | paste0( 36 | "Marvin K. Mooney Will You Please Go Now!", "The time has come.", 37 | "The time has come. The time is now. Just go. Go. GO!", 38 | "I don't care how." 39 | ))) 40 | split_sentence(x) 41 | 42 | data(DATA) 43 | split_sentence(DATA) 44 | 45 | \dontrun{ 46 | ## Kevin S.
Dias' sentence boundary disambiguation test set 47 | data(golden_rules) 48 | library(magrittr) 49 | 50 | golden_rules \%$\% 51 | split_sentence(Text) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /man/split_sentence_token.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_sentence_token.R 3 | \name{split_sentence_token} 4 | \alias{split_sentence_token} 5 | \alias{split_sentence_token.default} 6 | \alias{split_sentence_token.data.frame} 7 | \title{Split Sentences & Tokens} 8 | \usage{ 9 | split_sentence_token(x, ...) 10 | 11 | \method{split_sentence_token}{default}(x, lower = TRUE, ...) 12 | 13 | \method{split_sentence_token}{data.frame}(x, text.var = TRUE, lower = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with sentences.} 17 | 18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.} 19 | 20 | \item{text.var}{The name of the text variable. If \code{TRUE} 21 | \code{split_sentence_token} tries to detect the column with sentences.} 22 | 23 | \item{\ldots}{Ignored.} 24 | } 25 | \value{ 26 | Returns a list of vectors of sentences or an expanded 27 | \code{\link[base]{data.frame}} with sentences split apart. 28 | } 29 | \description{ 30 | Split sentences and tokens. 31 | } 32 | \examples{ 33 | (x <- c(paste0( 34 | "Mr. Brown comes! He says hello. i give him coffee. i will ", 35 | "go at 5 p. m. eastern time. Or somewhere in between!go there" 36 | ), 37 | paste0( 38 | "Marvin K. Mooney Will You Please Go Now!", "The time has come.", 39 | "The time has come. The time is now. Just go. Go. GO!", 40 | "I don't care how." 41 | ))) 42 | split_sentence_token(x) 43 | 44 | data(DATA) 45 | split_sentence_token(DATA) 46 | 47 | \dontrun{ 48 | ## Kevin S.
Dias' sentence boundary disambiguation test set 49 | data(golden_rules) 50 | library(magrittr) 51 | 52 | golden_rules \%$\% 53 | split_sentence_token(Text) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /man/split_speaker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_speaker.R 3 | \name{split_speaker} 4 | \alias{split_speaker} 5 | \title{Break and Stretch if Multiple Persons per Cell} 6 | \usage{ 7 | split_speaker(dataframe, speaker.var = 1, sep = c("and", "&", ","), ...) 8 | } 9 | \arguments{ 10 | \item{dataframe}{A dataframe that contains the person variable.} 11 | 12 | \item{speaker.var}{The person variable to be stretched.} 13 | 14 | \item{sep}{The separator(s) to search for and break on. Default is 15 | \code{c("and", "&", ",")}.} 16 | 17 | \item{\ldots}{Ignored.} 18 | } 19 | \value{ 20 | Returns an expanded dataframe with the person variable stretched and 21 | accompanying rows repeated. 22 | } 23 | \description{ 24 | Look for cells with multiple people and create separate rows for each person.
25 | } 26 | \examples{ 27 | \dontrun{ 28 | DATA$person <- as.character(DATA$person) 29 | DATA$person[c(1, 4, 6)] <- c("greg, sally, & sam", 30 | "greg, sally", "sam and sally") 31 | 32 | split_speaker(DATA) 33 | 34 | DATA$person[c(1, 4, 6)] <- c("greg_sally_sam", 35 | "greg.sally", "sam; sally") 36 | 37 | split_speaker(DATA, sep = c(".", "_", ";")) 38 | 39 | DATA <- textshape::DATA #reset DATA 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/split_token.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_token.R 3 | \name{split_token} 4 | \alias{split_token} 5 | \alias{split_token.default} 6 | \alias{split_token.data.frame} 7 | \title{Split Tokens} 8 | \usage{ 9 | split_token(x, ...) 10 | 11 | \method{split_token}{default}(x, lower = TRUE, ...) 12 | 13 | \method{split_token}{data.frame}(x, text.var = TRUE, lower = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with tokens.} 17 | 18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.} 19 | 20 | \item{text.var}{The name of the text variable. If \code{TRUE} 21 | \code{split_token} tries to detect the text column with tokens.} 22 | 23 | \item{\ldots}{Ignored.} 24 | } 25 | \value{ 26 | Returns a list of vectors of tokens or an expanded 27 | \code{\link[data.table]{data.table}} with tokens split apart. 28 | } 29 | \description{ 30 | Split tokens. 31 | } 32 | \examples{ 33 | (x <- c( 34 | "Mr. Brown comes! He says hello. i give him coffee.", 35 | "I'll go at 5 p. m. eastern time. 
Or somewhere in between!", 36 | "go there" 37 | )) 38 | split_token(x) 39 | split_token(x, lower=FALSE) 40 | 41 | data(DATA) 42 | split_token(DATA) 43 | split_token(DATA, lower=FALSE) 44 | 45 | ## Larger data set 46 | split_token(hamlet) 47 | } 48 | -------------------------------------------------------------------------------- /man/split_transcript.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_transcript.R 3 | \name{split_transcript} 4 | \alias{split_transcript} 5 | \title{Split a Transcript Style Vector on Delimiter & Coerce to Dataframe} 6 | \usage{ 7 | split_transcript( 8 | x, 9 | delim = ":", 10 | colnames = c("person", "dialogue"), 11 | max.delim = 15, 12 | ... 13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}).} 17 | 18 | \item{delim}{The delimiter to split on.} 19 | 20 | \item{colnames}{The column names to use for the \code{\link[data.table]{data.table}} 21 | output.} 22 | 23 | \item{max.delim}{An integer stating how many characters may come before a 24 | delimiter is found. This is useful for the case when a colon is the delimiter 25 | but time stamps are also found in the text.} 26 | 27 | \item{\ldots}{Ignored.} 28 | } 29 | \value{ 30 | Returns a 2 column \code{\link[data.table]{data.table}}. 31 | } 32 | \description{ 33 | Split a transcript style vector (e.g., \code{c("greg: Who me", "sarah: yes you!")}) 34 | into a name and dialogue vector that is coerced to a \code{\link[data.table]{data.table}}. 35 | Leading/trailing white space in the columns is stripped out.
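The delimiter split described above can be sketched in base R. `split_transcript_sketch` is a hypothetical illustration; the real function also honors `max.delim` and returns a data.table:

```r
## Break each element at the first delimiter and trim surrounding
## white space from both resulting columns.
split_transcript_sketch <- function(x, delim = ":") {
  m <- regexpr(delim, x, fixed = TRUE)
  data.frame(
    person   = trimws(substr(x, 1, m - 1)),
    dialogue = trimws(substr(x, m + 1, nchar(x))),
    stringsAsFactors = FALSE
  )
}

split_transcript_sketch(c("greg: Who me", "sarah: yes you!"))
```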
36 | } 37 | \examples{ 38 | split_transcript(c("greg: Who me", "sarah: yes you!")) 39 | 40 | \dontrun{ 41 | ## 2015 Vice-Presidential Debates Example 42 | if (!require("pacman")) install.packages("pacman") 43 | pacman::p_load(rvest, magrittr, xml2) 44 | 45 | debates <- c( 46 | wisconsin = "110908", 47 | boulder = "110906", 48 | california = "110756", 49 | ohio = "110489" 50 | ) 51 | 52 | lapply(debates, function(x){ 53 | xml2::read_html(paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x)) \%>\% 54 | rvest::html_nodes("p") \%>\% 55 | rvest::html_text() \%>\% 56 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\% 57 | textshape::combine() \%>\% 58 | textshape::split_transcript() \%>\% 59 | textshape::split_sentence() 60 | }) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /man/split_word.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/split_word.R 3 | \name{split_word} 4 | \alias{split_word} 5 | \alias{split_word.default} 6 | \alias{split_word.data.frame} 7 | \title{Split Words} 8 | \usage{ 9 | split_word(x, ...) 10 | 11 | \method{split_word}{default}(x, lower = TRUE, ...) 12 | 13 | \method{split_word}{data.frame}(x, text.var = TRUE, lower = TRUE, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A \code{\link[base]{data.frame}} or character vector with words.} 17 | 18 | \item{lower}{logical. If \code{TRUE} the words are converted to lower case.} 19 | 20 | \item{text.var}{The name of the text variable. If \code{TRUE} 21 | \code{split_word} tries to detect the text column with words.} 22 | 23 | \item{\ldots}{Ignored.} 24 | } 25 | \value{ 26 | Returns a list of vectors of words or an expanded 27 | \code{\link[data.table]{data.table}} with words split apart. 28 | } 29 | \description{ 30 | Split words. 31 | } 32 | \examples{ 33 | (x <- c( 34 | "Mr. Brown comes! He says hello. 
i give him coffee.", 35 | "I'll go at 5 p. m. eastern time. Or somewhere in between!", 36 | "go there" 37 | )) 38 | split_word(x) 39 | split_word(x, lower=FALSE) 40 | 41 | data(DATA) 42 | split_word(DATA) 43 | split_word(DATA, lower=FALSE) 44 | 45 | ## Larger data set 46 | split_word(hamlet) 47 | } 48 | -------------------------------------------------------------------------------- /man/textshape.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/textshape-package.R 3 | \docType{package} 4 | \name{textshape} 5 | \alias{textshape} 6 | \alias{package-textshape} 7 | \title{Tools for Reshaping Text} 8 | \description{ 9 | Tools that can be used to reshape and restructure text data. 10 | } 11 | -------------------------------------------------------------------------------- /man/tidy_colo_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_colo_dtm.R 3 | \name{tidy_colo_tdm} 4 | \alias{tidy_colo_tdm} 5 | \alias{tidy_colo_dtm} 6 | \title{Convert a 7 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 8 | into Collocating Words in Tidy Form} 9 | \usage{ 10 | tidy_colo_tdm(x, ...) 11 | 12 | tidy_colo_dtm(x, ...) 13 | } 14 | \arguments{ 15 | \item{x}{A 16 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.} 17 | 18 | \item{\ldots}{Ignored.} 19 | } 20 | \value{ 21 | Returns a tidied data.frame. 22 | } 23 | \description{ 24 | Converts non-zero elements of a 25 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 26 | into a tidy data set made of collocating words. 
27 | } 28 | \examples{ 29 | data(simple_dtm) 30 | 31 | tidied <- tidy_colo_dtm(simple_dtm) 32 | tidied 33 | unique_pairs(tidied) 34 | 35 | \dontrun{ 36 | if (!require("pacman")) install.packages("pacman") 37 | pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon') 38 | pacman::p_load(tidyverse, magrittr, ggstance) 39 | 40 | my_dtm <- with( 41 | presidential_debates_2012, 42 | q_dtm(dialogue, paste(time, tot, sep = "_")) 43 | ) 44 | 45 | tidy_colo_dtm(my_dtm) \%>\% 46 | tbl_df() \%>\% 47 | filter(!term_1 \%in\% c('i', lexicon::sw_onix) & 48 | !term_2 \%in\% lexicon::sw_onix 49 | ) \%>\% 50 | filter(term_1 != term_2) \%>\% 51 | unique_pairs() \%>\% 52 | filter(n > 15) \%>\% 53 | complete(term_1, term_2, fill = list(n = 0)) \%>\% 54 | ggplot(aes(x = term_1, y = term_2, fill = n)) + 55 | geom_tile() + 56 | scale_fill_gradient(low= 'white', high = 'red') + 57 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 58 | } 59 | } 60 | \seealso{ 61 | \code{\link[textshape]{unique_pairs}} 62 | } 63 | -------------------------------------------------------------------------------- /man/tidy_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_dtm.R 3 | \name{tidy_dtm} 4 | \alias{tidy_dtm} 5 | \alias{tidy_tdm} 6 | \title{Convert a 7 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 8 | into Tidy Form} 9 | \usage{ 10 | tidy_dtm(x, ...) 11 | 12 | tidy_tdm(x, ...) 13 | } 14 | \arguments{ 15 | \item{x}{A 16 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a tidied data.frame. 22 | } 23 | \description{ 24 | Converts non-zero elements of a 25 | \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} 26 | into a tidy data set. 
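The tidying described above amounts to stacking the non-zero cells of the matrix alongside their dimension names. A base-R sketch on a small dense matrix (illustrative only; `tidy_dtm_sketch` and `dtm` are hypothetical names, and the real functions operate on \pkg{tm}'s sparse classes):

```r
## Stack the non-zero cells of a (dense) document-term matrix into
## doc/term/n columns.
tidy_dtm_sketch <- function(mat) {
  idx <- which(mat != 0, arr.ind = TRUE)
  data.frame(
    doc  = rownames(mat)[idx[, "row"]],
    term = colnames(mat)[idx[, "col"]],
    n    = mat[idx],
    stringsAsFactors = FALSE
  )
}

dtm <- matrix(c(1, 0, 2, 0), nrow = 2,
              dimnames = list(c("d1", "d2"), c("cat", "dog")))
tidy_dtm_sketch(dtm)
```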
27 | } 28 | \examples{ 29 | data(simple_dtm) 30 | 31 | tidy_dtm(simple_dtm) 32 | 33 | \dontrun{ 34 | if (!require("pacman")) install.packages("pacman") 35 | pacman::p_load_current_gh('trinker/gofastr') 36 | pacman::p_load(tidyverse, magrittr, ggstance) 37 | 38 | my_dtm <- with( 39 | presidential_debates_2012, 40 | q_dtm(dialogue, paste(time, tot, sep = "_")) 41 | ) 42 | 43 | tidy_dtm(my_dtm) \%>\% 44 | tidyr::extract( 45 | col = doc, 46 | into = c("time", "turn", "sentence"), 47 | regex = "(\\\\d)_(\\\\d+)\\\\.(\\\\d+)" 48 | ) \%>\% 49 | mutate( 50 | time = as.numeric(time), 51 | turn = as.numeric(turn), 52 | sentence = as.numeric(sentence) 53 | ) \%>\% 54 | tbl_df() \%T>\% 55 | print() \%>\% 56 | group_by(time, term) \%>\% 57 | summarize(n = sum(n)) \%>\% 58 | group_by(time) \%>\% 59 | arrange(desc(n)) \%>\% 60 | slice(1:10) \%>\% 61 | ungroup() \%>\% 62 | mutate( 63 | term = factor(paste(term, time, sep = "__"), 64 | levels = rev(paste(term, time, sep = "__"))) 65 | ) \%>\% 66 | ggplot(aes(x = n, y = term)) + 67 | geom_barh(stat='identity') + 68 | facet_wrap(~time, ncol=2, scales = 'free_y') + 69 | scale_y_discrete(labels = function(x) gsub("__.+$", "", x)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /man/tidy_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_list.R 3 | \name{tidy_list} 4 | \alias{tidy_list} 5 | \title{Tidy a List of Named Dataframes or Named Vectors or Vectors} 6 | \usage{ 7 | tidy_list( 8 | x, 9 | id.name = "id", 10 | content.name = "content", 11 | content.attribute.name = "attribute", 12 | ... 
13 | ) 14 | } 15 | \arguments{ 16 | \item{x}{A named \code{\link[base]{list}} of 17 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}s.} 18 | 19 | \item{id.name}{The name to use for the column created from the 20 | \code{\link[base]{list}}.} 21 | 22 | \item{content.name}{The name to use for the column created from the 23 | \code{\link[base]{list}} of \code{\link[base]{vector}}s (only used if 24 | \code{x} is a \code{\link[base]{vector}}).} 25 | 26 | \item{content.attribute.name}{The name to use for the column created from the 27 | \code{\link[base]{list}} of names given to the \code{\link[base]{vector}}s 28 | (only used if \code{x} is a named \code{\link[base]{vector}}).} 29 | 30 | \item{\ldots}{Ignored.} 31 | } 32 | \value{ 33 | Returns a \code{\link[data.table]{data.table}} with the 34 | \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id} 35 | column. 36 | } 37 | \description{ 38 | \code{\link[base]{rbind}} a named \code{\link[base]{list}} of 39 | \code{\link[base]{data.frame}}s or \code{\link[base]{vector}}s to 40 | output a single \code{\link[base]{data.frame}} with the 41 | \code{\link[base]{names}} from the \code{\link[base]{list}} as an \code{id} 42 | column.
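For a named list of vectors, the rbind-with-id behavior described above reduces to a few lines of base R. A sketch only; `tidy_list_sketch` is a hypothetical name, and the real function returns a data.table and also handles data.frames and named vectors:

```r
## Stack the list elements and carry the list names along as an id column;
## rep(..., lengths(x)) repeats each name once per element of its vector.
tidy_list_sketch <- function(x) {
  data.frame(
    id      = rep(names(x), lengths(x)),
    content = unlist(x, use.names = FALSE),
    stringsAsFactors = FALSE
  )
}

tidy_list_sketch(list(p = 1:3, r = letters[1:2]))
```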
43 | } 44 | \examples{ 45 | tidy_list(list(p=1:500, r=letters)) 46 | tidy_list(list(p=mtcars, r=mtcars, z=mtcars, d=mtcars)) 47 | 48 | x <- list( 49 | a = setNames(c(1:4), LETTERS[1:4]), 50 | b = setNames(c(7:9), LETTERS[7:9]), 51 | c = setNames(c(10:15), LETTERS[10:15]), 52 | d = c(x=4, y=6, 4), 53 | e = setNames(1:10, sample(state.abb, 10, TRUE)), 54 | f = setNames(1:10, sample(month.abb, 10, TRUE)) 55 | ) 56 | 57 | tidy_list(x) 58 | 59 | \dontrun{ 60 | ## 2015 Vice-Presidential Debates Example 61 | if (!require("pacman")) install.packages("pacman") 62 | pacman::p_load(rvest, magrittr, xml2) 63 | 64 | debates <- c( 65 | wisconsin = "110908", 66 | boulder = "110906", 67 | california = "110756", 68 | ohio = "110489" 69 | ) 70 | 71 | lapply(debates, function(x){ 72 | paste0("http://www.presidency.ucsb.edu/ws/index.php?pid=", x) \%>\% 73 | xml2::read_html() \%>\% 74 | rvest::html_nodes("p") \%>\% 75 | rvest::html_text() \%>\% 76 | textshape::split_index(grep("^[A-Z]+:", .)) \%>\% 77 | textshape::combine() \%>\% 78 | textshape::split_transcript() \%>\% 79 | textshape::split_sentence() 80 | }) \%>\% 81 | textshape::tidy_list("location") 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /man/tidy_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_matrix.R 3 | \name{tidy_matrix} 4 | \alias{tidy_matrix} 5 | \alias{tidy_adjacency_matrix} 6 | \title{Convert a Matrix into Tidy Form} 7 | \usage{ 8 | tidy_matrix(x, row.name = "row", col.name = "col", value.name = "value", ...) 9 | 10 | tidy_adjacency_matrix(x, ...) 
11 | } 12 | \arguments{ 13 | \item{x}{A matrix.} 14 | 15 | \item{row.name}{A string to use for the row names that are now a column.} 16 | 17 | \item{col.name}{A string to use for the column names that are now a column.} 18 | 19 | \item{value.name}{A string to use for the values that are now a column.} 20 | 21 | \item{\ldots}{ignored.} 22 | } 23 | \value{ 24 | Returns a tidied \code{data.frame}. 25 | } 26 | \description{ 27 | \code{tidy_matrix} - Converts matrices into a tidy data set. Essentially, a 28 | stacking of the matrix columns and repeating row/column names as necessary. 29 | 30 | \code{tidy_adjacency_matrix} - A wrapper for \code{tidy_matrix} with the 31 | \code{row.name}, \code{col.name}, & \code{value.name} all set to 32 | \code{"from"}, \code{"to"}, & \code{"n"}, assuming preparation for network 33 | analysis. 34 | } 35 | \examples{ 36 | mat <- matrix(1:16, nrow = 4, 37 | dimnames = list(LETTERS[1:4], LETTERS[23:26]) 38 | ) 39 | 40 | mat 41 | tidy_matrix(mat) 42 | 43 | 44 | data(simple_dtm) 45 | tidy_matrix(as.matrix(simple_dtm), 'doc', 'term', 'n') 46 | 47 | X <- as.matrix(simple_dtm[1:10, 1:10]) 48 | tidy_adjacency_matrix(crossprod(X)) 49 | tidy_adjacency_matrix(crossprod(t(X))) 50 | } 51 | -------------------------------------------------------------------------------- /man/tidy_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_table.R 3 | \name{tidy_table} 4 | \alias{tidy_table} 5 | \title{Tidy a Table: Bind Its Values with Its Names} 6 | \usage{ 7 | tidy_table(x, id.name = "id", content.name = "content", ...)
8 | } 9 | \arguments{ 10 | \item{x}{A \code{\link[base]{table}}.} 11 | 12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{table}} 13 | \code{\link[base]{names}}.} 14 | 15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{table}} 16 | values.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 22 | from the \code{\link[base]{table}} as an \code{id} column. 23 | } 24 | \description{ 25 | \code{\link[base]{cbind}} a \code{\link[base]{table}}'s values with its 26 | \code{\link[base]{names}} to form \code{id} (from the names) and 27 | \code{content} columns. 28 | } 29 | \examples{ 30 | x <- table(sample(LETTERS[1:6], 1000, TRUE)) 31 | tidy_table(x) 32 | } 33 | -------------------------------------------------------------------------------- /man/tidy_vector.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidy_vector.R 3 | \name{tidy_vector} 4 | \alias{tidy_vector} 5 | \title{Tidy a Named Atomic Vector: Bind Its Values with Its Names} 6 | \usage{ 7 | tidy_vector(x, id.name = "id", content.name = "content", ...) 8 | } 9 | \arguments{ 10 | \item{x}{A named atomic \code{\link[base]{vector}}.} 11 | 12 | \item{id.name}{The name to use for the column created from the \code{\link[base]{vector}} 13 | \code{\link[base]{names}}.} 14 | 15 | \item{content.name}{The name to use for the column created from the \code{\link[base]{vector}} 16 | values.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns a \code{\link[data.table]{data.table}} with the \code{\link[base]{names}} 22 | from the \code{\link[base]{vector}} as an \code{id} column. 
23 | } 24 | \description{ 25 | \code{\link[base]{cbind}} a named atomic \code{\link[base]{vector}}'s values 26 | with its \code{\link[base]{names}} to form \code{id} (from the names) and 27 | \code{content} columns. 28 | } 29 | \examples{ 30 | x <- setNames(sample(LETTERS[1:6], 1000, TRUE), sample(state.name[1:5], 1000, TRUE)) 31 | tidy_vector(x) 32 | } 33 | -------------------------------------------------------------------------------- /man/unique_pairs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/unique_pairs.R 3 | \name{unique_pairs} 4 | \alias{unique_pairs} 5 | \alias{unique_pairs.default} 6 | \alias{unique_pairs.data.table} 7 | \title{Extract Only Unique Pairs of Collocating Words in 8 | \code{\link[textshape]{tidy_colo_dtm}}} 9 | \usage{ 10 | unique_pairs(x, col1 = "term_1", col2 = "term_2", ...) 11 | 12 | \method{unique_pairs}{default}(x, col1 = "term_1", col2 = "term_2", ...) 13 | 14 | \method{unique_pairs}{data.table}(x, col1 = "term_1", col2 = "term_2", ...) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{\link[base]{data.frame}} with two columns that contain 18 | redundant pairs.} 19 | 20 | \item{col1}{A string naming column 1.} 21 | 22 | \item{col2}{A string naming column 2.} 23 | 24 | \item{\ldots}{ignored.} 25 | } 26 | \value{ 27 | Returns a filtered \code{\link[base]{data.frame}}. 28 | } 29 | \description{ 30 | \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate 31 | the tidied data.frame. This means that the upper and lower triangles are 32 | used redundantly. This function eliminates this redundancy by dropping one 33 | set of the pairs from a tidied data.frame. 
34 | } 35 | \examples{ 36 | dat <- data.frame( 37 | term_1 = LETTERS[1:10], 38 | term_2 = LETTERS[10:1], 39 | stringsAsFactors = FALSE 40 | ) 41 | 42 | unique_pairs(dat) 43 | } 44 | \seealso{ 45 | \code{\link[textshape]{tidy_colo_dtm}} 46 | } 47 | -------------------------------------------------------------------------------- /man/unnest_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/unnest_text.R 3 | \name{unnest_text} 4 | \alias{unnest_text} 5 | \title{Un-nest Nested Text Columns} 6 | \usage{ 7 | unnest_text(dataframe, column, integer.rownames = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{dataframe}{A dataframe object.} 11 | 12 | \item{column}{The name of the nested text column to un-nest; detected automatically if not supplied.} 13 | 14 | \item{integer.rownames}{logical. If \code{TRUE} then the rownames are 15 | numbered 1 through the number of rows, otherwise the original row number is 16 | retained followed by a period and the element number from the list.} 17 | 18 | \item{\ldots}{ignored.} 19 | } 20 | \value{ 21 | Returns an un-nested data.frame. 22 | } 23 | \description{ 24 | Un-nest nested text columns in a data.frame. Attempts to locate the nested 25 | text column automatically if it is not specified.
26 | } 27 | \examples{ 28 | dat <- DATA 29 | 30 | ## Add a nested/list text column 31 | dat$split <- lapply(dat$state, function(x) { 32 | unlist(strsplit(x, '(?<=[?!.])\\\\s+', perl = TRUE)) 33 | }) 34 | 35 | unnest_text(dat) 36 | unnest_text(dat, integer.rownames = FALSE) 37 | 38 | ## Add a second nested integer column 39 | dat$d <- lapply(dat$split, nchar) 40 | \dontrun{ 41 | unnest_text(dat) # causes error, must supply column explicitly 42 | } 43 | unnest_text(dat, 'split') 44 | 45 | ## As a data.table 46 | library(data.table) 47 | dt_dat <- data.table::as.data.table(data.table::copy(dat)) 48 | unnest_text(dt_dat, 'split') 49 | \dontrun{ 50 | unnest_text(dt_dat, 'd') 51 | } 52 | 53 | \dontrun{ 54 | ## As a tibble 55 | library(tibble) 56 | t_dat <- tibble::as_tibble(dat) 57 | unnest_text(t_dat, 'split') 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library("testthat") 2 | library("textshape") 3 | 4 | test_check("textshape") -------------------------------------------------------------------------------- /tests/testthat/test-change_index.R: -------------------------------------------------------------------------------- 1 | context("Checking change_index") 2 | 3 | test_that("change_index ...",{ 4 | 5 | 6 | }) 7 | 8 | -------------------------------------------------------------------------------- /tests/testthat/test-mtabulate.R: -------------------------------------------------------------------------------- 1 | context("Checking mtabulate") 2 | 3 | test_that("mtabulate ...",{ 4 | 5 | 6 | }) 7 | 8 | -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-10-1.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-32-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-32-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-44-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-44-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-45-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-45-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-46-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-46-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-55-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-55-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /tools/figure/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/figure/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /tools/textshape_logo/r_textshape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshape.png -------------------------------------------------------------------------------- /tools/textshape_logo/r_textshape.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshape.pptx -------------------------------------------------------------------------------- /tools/textshape_logo/r_textshapea.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trinker/textshape/61dc5f1e0ba817571d97d87751959f78613ad92c/tools/textshape_logo/r_textshapea.png -------------------------------------------------------------------------------- /tools/textshape_logo/resize_icon.txt: -------------------------------------------------------------------------------- 1 | cd C:\Users\Tyler\GitHub\textshape\tools\textshape_logo 2 | ffmpeg -i r_textshapea.png -vf scale=150:-1 r_textshape.png --------------------------------------------------------------------------------