├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CONDUCT.md
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
    ├── collapse_rows.R
    ├── path_string.R
    ├── pmc_caption.R
    ├── pmc_metadata.R
    ├── pmc_reference.R
    ├── pmc_table.R
    ├── pmc_text.R
    ├── pmc_xml.R
    ├── repeat_sub.R
    ├── separate_genes.R
    ├── separate_refs.R
    ├── separate_tags.R
    ├── separate_text.R
    └── tidypmc-package.R
├── README.Rmd
├── README.html
├── README.md
├── codecov.yml
├── codemeta.json
├── inst
    └── extdata
    │   ├── PMC2231364.xml
    │   ├── PMC6095483.xml
    │   ├── PMC6358576_PMC6358589.xml
    │   └── PMC6385181.xml
├── man
    ├── collapse_rows.Rd
    ├── pmc_caption.Rd
    ├── pmc_metadata.Rd
    ├── pmc_reference.Rd
    ├── pmc_table.Rd
    ├── pmc_text.Rd
    ├── pmc_xml.Rd
    ├── separate_genes.Rd
    ├── separate_refs.Rd
    ├── separate_tags.Rd
    ├── separate_text.Rd
    └── tidypmc.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── tests-pmc_other.R
    │   ├── tests-pmc_table.R
    │   ├── tests-pmc_text.R
    │   └── tests-separate.R
└── vignettes
    ├── pmcftp.Rmd
    ├── pmcftp.md
    ├── tidypmc.Rmd
    └── tidypmc.md


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | CONDUCT\.md$
2 | ^codecov\.yml$
3 | ^\.travis\.yml$
4 | ^Meta$
5 | ^doc$
6 | ^README*
7 | codemeta.json
8 | 
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | Meta
 2 | doc
 3 | # History files
 4 | .Rhistory
 5 | .Rapp.history
 6 | 
 7 | # Session Data files
 8 | .RData
 9 | .DS_Store
10 | # Example code in package build process
11 | *-Ex.R
12 | # Output files from R CMD build
13 | /*.tar.gz
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | # RStudio files
17 | .Rproj.user/
18 | # produced vignettes
19 | vignettes/*.html
20 | vignettes/*.pdf
21 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
22 | .httr-oauth
23 | # knitr and R markdown default cache directories
24 | /*_cache/
25 | /cache/
26 | # Temporary files created by R markdown
27 | *.utf8.md
28 | *.knit.md
29 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
30 | rsconnect/
31 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 | 
3 | language: R
4 | sudo: false
5 | cache: packages
6 | 
7 | after_success:
8 |   - Rscript -e 'covr::codecov()'
9 | 


--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Code of Conduct
 2 | 
 3 | As contributors and maintainers of this project, we pledge to respect all people who 
 4 | contribute through reporting issues, posting feature requests, updating documentation,
 5 | submitting pull requests or patches, and other activities.
 6 | 
 7 | We are committed to making participation in this project a harassment-free experience for
 8 | everyone, regardless of level of experience, gender, gender identity and expression,
 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 | 
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 | 
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
18 | from the project team.
19 | 
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
21 | opening an issue or contacting one or more of the project maintainers.
22 | 
23 | This Code of Conduct is adapted from the Contributor Covenant 
24 | (http://contributor-covenant.org), version 1.0.0, available at 
25 | http://contributor-covenant.org/version/1/0/0/
26 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: tidypmc
 2 | Type: Package
 3 | Title: Parse Full Text XML Documents from PubMed Central
 4 | Version: 1.8
 5 | Authors@R: person("Chris", "Stubben", role = c("aut", "cre"), email = "chris.stubben@hci.utah.edu")
 6 | Description: Parse XML documents from the Open Access subset of Europe PubMed Central <https://europepmc.org>
 7 |     including section paragraphs, tables, captions and references.
 8 | URL: https://docs.ropensci.org/tidypmc, https://github.com/ropensci/tidypmc
 9 | BugReports: https://github.com/ropensci/tidypmc/issues
10 | License: GPL-3
11 | Encoding: UTF-8
12 | VignetteBuilder: knitr
13 | Imports:
14 |     xml2,
15 |     tokenizers,
16 |     stringr,
17 |     tibble,
18 |     dplyr,
19 |     readr
20 | Suggests:
21 |     europepmc,
22 |     tidytext,
23 |     rmarkdown,
24 |     knitr,
25 |     testthat,
26 |     covr
27 | RoxygenNote: 6.1.1
28 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(collapse_rows)
 4 | export(pmc_caption)
 5 | export(pmc_metadata)
 6 | export(pmc_reference)
 7 | export(pmc_table)
 8 | export(pmc_text)
 9 | export(pmc_xml)
10 | export(separate_genes)
11 | export(separate_refs)
12 | export(separate_tags)
13 | export(separate_text)
14 | importFrom(dplyr,"%>%")
15 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | tidypmc 1.8 (dev)
 2 | =========================
 3 | 
 4 | ### DOCUMENTATION FIXES
 5 | 
 6 |   * Added a NEWS.md file (#2)
 7 | 
 8 | 
 9 | tidypmc 1.7 (2019-08-01)
10 | =========================
11 | 
12 | ### NEW FEATURES
13 | 
14 |   * released to CRAN
15 | 


--------------------------------------------------------------------------------
/R/collapse_rows.R:
--------------------------------------------------------------------------------
 1 | #' Collapse a list of PubMed Central tables
 2 | #'
 3 | #' Collapse each table row into one semi-colon delimited string of
 4 | #' \code{column name=cell value} pairs, skipping empty cells.
 5 | #'
 6 | #' @param pmc a list of tables, usually from \code{\link{pmc_table}}. A single
 7 | #'   table that is not wrapped in a list is labelled "Table".
 8 | #' @param na.string optional vector of additional cell values to skip. Cells
 9 | #'   that are NA, "" or a no-break space are always skipped.
10 | #'
11 | #' @return A tibble with table, row number and collapsed text, or NULL if
12 | #'   \code{pmc} is NULL. Tables without rows do not contribute any rows.
13 | #'
14 | #' @author Chris Stubben
15 | #'
16 | #' @examples
17 | #' x <- data.frame(
18 | #'   genes = c("aroB", "glnP", "ndhA", "pyrF"),
19 | #'   fold_change = c(2.5, 1.7, -3.1, -2.6)
20 | #' )
21 | #' collapse_rows(list(`Table 1` = x))
22 | #' @export
23 | 
24 | collapse_rows <- function(pmc, na.string) {
25 |   if (is.null(pmc)) {
26 |     return(NULL)
27 |   }
28 |   if (class(pmc)[1] != "list") pmc <- list(Table = pmc)
29 |   if (length(pmc) == 0 || !is.data.frame(pmc[[1]])) {
30 |     stop("pmc should be a list of tables from pmc_table")
31 |   }
32 |   n1 <- length(pmc)
33 |   tbls <- vector("list", n1)
34 |   names(tbls) <- names(pmc)
35 |   for (i in seq_len(n1)) {
36 |     x <- data.frame(pmc[[i]], check.names = FALSE)
37 |     n <- nrow(x)
38 |     # Keep the pre-allocated NULL slot for a table without rows. Assigning
39 |     # NULL with [[<- would delete the element and shift the names of all
40 |     # following tables; bind_rows() below ignores NULL elements.
41 |     if (n == 0) next
42 |     y <- names(x)
43 |     ## convert factors to character
44 |     f1 <- vapply(x, is.factor, logical(1))
45 |     if (any(f1)) for (k in which(f1)) x[, k] <- as.character(x[, k])
46 |     # combine (and skip empty fields)
47 |     cx <- vector("character", n)
48 |     for (j in seq_len(n)) {
49 |       n2 <- is.na(x[j, ]) | as.character(x[j, ]) == "" | x[j, ] == "\u00A0"
50 |       # %in% so that na.string may hold more than one value to skip
51 |       if (!missing(na.string)) n2 <- n2 | as.character(x[j, ]) %in% na.string
52 |       cx[j] <- paste(paste(y[!n2], x[j, !n2], sep = "="), collapse = "; ")
53 |     }
54 |     tbls[[i]] <- tibble::tibble(row = seq_along(cx), text = cx)
55 |   }
56 |   dplyr::bind_rows(tbls, .id = "table")
57 | }
58 | 


--------------------------------------------------------------------------------
/R/path_string.R:
--------------------------------------------------------------------------------
 1 | #' Print a hierarchical path string
 2 | #'
 3 | #' Print a hierarchical path string from a vector of names and levels
 4 | #'
 5 | #' @param x a vector of names
 6 | #' @param n a vector of numbers with indentation level
 7 | #'
 8 | #' @return a character vector with one "; " delimited path per name
 9 | #'
10 | #' @note Used by \code{\link{pmc_text}} to print full path to subsection title
11 | #'
12 | #' @author Chris Stubben
13 | #'
14 | #' @examples
15 | #' x <- c("carnivores", "bears", "polar", "grizzly", "cats", "tiger", "rodents")
16 | #' n <- c(1, 2, 3, 3, 2, 3, 1)
17 | #' path_string(x, n)
18 | #' @noRd
19 | 
20 | path_string <- function(x, n) {
21 |   n2 <- length(n)
22 |   if (is.factor(x)) x <- as.character(x)
23 |   if (!is.numeric(n)) stop("n should be a vector of numbers")
24 |   if (n2 != length(x)) stop("x and n should be the same length")
25 |   # nothing to build (and min() would warn on an empty vector)
26 |   if (n2 == 0) return(character(0))
27 |   z <- character(n2)
28 |   ## shift levels so that the smallest level is 1
29 |   if (min(n) > 1) n <- n - min(n) + 1
30 |   ## start with empty vector
31 |   path <- ""
32 |   for (i in seq_len(n2)) {
33 |     ## add name at position n[i]
34 |     path[n[i]] <- x[i]
35 |     ## drop names if n[i] decreases
36 |     path <- path[seq_len(n[i])]
37 |     ## levels that were skipped are NA. Drop them before pasting; removing
38 |     ## "NA; " from the pasted string also truncated names ending in NA
39 |     ## (for example "DNA; methods" became "Dmethods")
40 |     skipped <- is.na(path) & seq_along(path) < length(path)
41 |     z[i] <- paste(path[!skipped], collapse = "; ")
42 |   }
43 |   z
44 | }
45 | 


--------------------------------------------------------------------------------
/R/pmc_caption.R:
--------------------------------------------------------------------------------
  1 | #' Split captions into sentences
  2 | #'
  3 | #' Split figure, table and supplementary material captions into sentences
  4 | #'
  5 | #' @param doc \code{xml_document} from PubMed Central
  6 | #'
  7 | #' @return a tibble with tag, label, sentence number and text, or NULL if
  8 | #' no caption text is found
  9 | #' @author Chris Stubben
 10 | #' @note Messages report how many figures, tables and supplements are found
 11 | #' @examples
 12 | #' # doc <- pmc_xml("PMC2231364") # OR
 13 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
 14 | #'   package = "tidypmc"
 15 | #' ))
 16 | #' x <- pmc_caption(doc)
 17 | #' x
 18 | #' dplyr::filter(x, sentence == 1)
 19 | #' @export
 20 | 
 21 | pmc_caption <- function(doc) {
 22 |   if (class(doc)[1] != "xml_document") {
 23 |     stop("doc should be an xml_document from PubMed Central")
 24 |   }
 25 |   ### Figures
 26 |   z <- xml2::xml_find_all(doc, "//fig")
 27 |   # cat(as.character(z[[1]]))
 28 |   if (length(z) > 0) {
 29 |     n <- length(z)
 30 |     message("Found ", n, ifelse(n > 1, " figures", " figure"))
 31 |     ## label of each figure, NA if there is no ./label child
 32 |     f1 <- vapply(z, function(x) xml2::xml_text(
 33 |         xml2::xml_find_first(x, "./label"),
 34 |         trim = TRUE
 35 |       ), character(1))
 36 |     # get caption /title and /p tags together since some caption titles are
 37 |     # missing, in bold tags or have very long titles that should be split.
 38 |     # paste ./caption/* with a space to avoid joining /title and /p sentences
 39 |     f2 <- vapply(z, function(x) paste(xml2::xml_text(
 40 |         xml2::xml_find_all(x, "./caption/*")
 41 |       ), collapse = " "), character(1))
 42 |     if (all(is.na(f1)) & all(f2 == "")) {
 43 |       ## ANY label and ANY paragraph: first .//label and .//p below /fig
 44 |       f1 <- vapply(z, function(x) xml2::xml_text(
 45 |           xml2::xml_find_first(x, ".//label"),
 46 |           trim = TRUE
 47 |         ), character(1))
 48 |       f2 <- vapply(z, function(x) xml2::xml_text(
 49 |           xml2::xml_find_first(x, ".//p")
 50 |         ), character(1))
 51 |     }
 52 |     names(f2) <- gsub("\\.$", "", f1)
 53 |     ## drop figures without caption text, e.g. fig tags with media only
 54 |     f2 <- f2[f2 != ""]
 55 |     #  nothing to split if no figure has caption text
 56 |     if (length(f2) == 0) {
 57 |       message(" No figure /caption or /p tag to parse - link to image only?")
 58 |       figs <- NULL
 59 |     } else {
 60 |       x1 <- vapply(f2, tokenizers::tokenize_sentences, list(1))
 61 |       figs <- dplyr::bind_rows(
 62 |         lapply(x1, function(z)
 63 |           tibble::tibble(sentence = seq_along(z), text = z)),
 64 |         .id = "label"
 65 |       )
 66 |     }
 67 |   } else {
 68 |     figs <- NULL
 69 |   }
 70 |   ### Tables
 71 |   z <- xml2::xml_find_all(doc, "//table-wrap")
 72 |   if (length(z) > 0) {
 73 |     n <- length(z)
 74 |     message("Found ", n, ifelse(n > 1, " tables", " table"))
 75 |     ## label of each table, NA if there is no ./label child
 76 |     f1 <- vapply(z, function(x) xml2::xml_text(
 77 |         xml2::xml_find_first(x, "./label"),
 78 |         trim = TRUE
 79 |       ), character(1))
 80 |     # some with long subcaptions, so paste all ./caption/* children
 81 |     f2 <- vapply(z, function(x) paste(xml2::xml_text(
 82 |         xml2::xml_find_all(x, "./caption/*")
 83 |       ), collapse = " "), character(1))
 84 |     names(f2) <- gsub("\\.$", "", f1)
 85 |     ## drop tables without caption text
 86 |     f2 <- f2[f2 != ""]
 87 |     x1 <- vapply(f2, tokenizers::tokenize_sentences, list(1))
 88 |     tbls <- dplyr::bind_rows(
 89 |       lapply(x1, function(z)
 90 |         tibble::tibble(sentence = seq_along(z), text = z)),
 91 |       .id = "label"
 92 |     )
 93 |   } else {
 94 |     tbls <- NULL
 95 |   }
 96 |   ### Supplements
 97 |   z <- xml2::xml_find_all(doc, "//supplementary-material")
 98 |   if (length(z) > 0) {
 99 |     if (!all(xml2::xml_text(z, trim = TRUE) == "")) {
100 |       n <- length(z)
101 |       message("Found ", n, ifelse(n > 1, " supplements", " supplement"))
102 |       ## label often missing (NA)
103 |       f1 <- vapply(z, function(x) xml2::xml_text(
104 |           xml2::xml_find_first(x, "./label"),
105 |           trim = TRUE
106 |         ), character(1))
107 |       # use paste ./caption/* to avoid mashing together title and p like
108 |       # Additional file 1Figure S1
109 |       f2 <- vapply(z, function(x) paste(xml2::xml_text(
110 |           xml2::xml_find_all(x, "./caption/*")
111 |         ), collapse = " "), character(1))
112 |       # mBio with /p tags only, others with media/captions only
113 |       if (all(f2 == "")) {
114 |         f2 <- vapply(z, function(x) xml2::xml_text(
115 |             xml2::xml_find_first(x, ".//p")
116 |           ), character(1))
117 |         f2[is.na(f2)] <- ""
118 |       }
119 |       # remove period to avoid splitting (DOC), (XLSX) into new sentences -
120 |       # misses (XLSX 32 kb)
121 |       f2 <- gsub("\\.( \\([A-Z]+\\))", "\\1", f2)
122 |       x1 <- vapply(f2, tokenizers::tokenize_sentences,
123 |         list(1),
124 |         USE.NAMES = FALSE
125 |       )
126 |       if (all(is.na(f1))) {
127 |         y <- vapply(x1, function(x) x[1], character(1))
128 |         # use the first sentence as label if every supplement has more than
129 |         # 1 sentence and all first sentences are shorter than 40 characters
130 |         if (all(vapply(x1, length, integer(1)) > 1) & all(nchar(y) < 40)) {
131 |           f1 <- y
132 |           x1 <- lapply(x1, function(x) x[-1])
133 |         } else {
134 |           if (length(y) == 1) {
135 |             message(" Missing supplement label tag, using File S1")
136 |           } else {
137 |             message(
138 |               " Missing supplement label tag, using File S1 to S",
139 |               length(y)
140 |             )
141 |           }
142 |           f1 <- paste0("File S", seq_along(y))
143 |         }
144 |       }
145 |       names(x1) <- gsub("\\.$", "", f1)
146 |       sups <- dplyr::bind_rows(
147 |         lapply(x1, function(z)
148 |           tibble::tibble(sentence = seq_along(z), text = z)),
149 |         .id = "label"
150 |       )
151 |     } else {
152 |       message(" No text found in supplement tag")
153 |       sups <- NULL
154 |     }
155 |   } else {
156 |     sups <- NULL
157 |   }
158 |   x <- dplyr::bind_rows(list(figure = figs, table = tbls, supplement = sups),
159 |     .id = "tag"
160 |   )
161 |   if (nrow(x) == 0) {
162 |     message("No caption tags found")
163 |     x <- NULL
164 |   }
165 |   x
166 | }
167 | 


--------------------------------------------------------------------------------
/R/pmc_metadata.R:
--------------------------------------------------------------------------------
  1 | #' Get article metadata
  2 | #'
  3 | #' Get a list of journal and article metadata in /front tag
  4 | #'
  5 | #' @param doc \code{xml_document} from PubMed Central
  6 | #'
  7 | #' @return a list, or NULL if no article title is found. Title and Authors
  8 | #'   are always included and Pages is NA if there is no fpage or
  9 | #'   elocation-id tag. PMCID, Year, Journal, Volume, Issue, Published online,
 10 | #'   Date received, DOI and Publisher are only included if the tag is found.
 11 | #'
 12 | #' @author Chris Stubben
 13 | #'
 14 | #' @examples
 15 | #' # doc <- pmc_xml("PMC2231364") # OR
 16 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
 17 | #'   package = "tidypmc"
 18 | #' ))
 19 | #' pmc_metadata(doc)
 20 | #' @export
 21 | 
 22 | pmc_metadata <- function(doc) {
 23 |   if (class(doc)[1] != "xml_document") {
 24 |     stop("doc should be an xml_document from PubMed Central")
 25 |   }
 26 |   ## text of the first node matching xpath, NA if there is no match
 27 |   first_text <- function(xpath, trim = FALSE) {
 28 |     xml2::xml_text(xml2::xml_find_first(doc, xpath), trim = trim)
 29 |   }
 30 |   ## text of every node matching xpath
 31 |   all_text <- function(xpath) {
 32 |     xml2::xml_text(xml2::xml_find_all(doc, xpath))
 33 |   }
 34 |   ## //front has journal-meta and article-meta
 35 |   t1 <- first_text("//front//article-title", trim = TRUE)
 36 |   if (is.na(t1)) {
 37 |     message("No title found. Not a PMC XML document?")
 38 |     return(NULL)
 39 |   }
 40 |   z <- vector("list")
 41 |   pmcid <- first_text("//front//article-id[@pub-id-type='pmcid']")
 42 |   ## add the PMC prefix, without doubling it if the id already has one
 43 |   if (!is.na(pmcid)) z[["PMCID"]] <- paste0("PMC", sub("^PMC", "", pmcid))
 44 |   z[["Title"]] <- t1
 45 |   name_path <- "//front//contrib[not(@contrib-type='editor')]/name/"
 46 |   a1 <- all_text(paste0(name_path, "given-names"))
 47 |   a2 <- all_text(paste0(name_path, "surname"))
 48 |   if (length(a1) != length(a2)) {
 49 |     message("WARNING: Check author names -missing first or last tag")
 50 |   }
 51 |   ## comma-delimited string (easier to bind_rows with multiple pmcids)
 52 |   z[["Authors"]] <- paste(paste(a1, a2), collapse = ", ")
 53 |   ## Year published, first of collection, ppub and epub with a year tag
 54 |   year <- NA_character_
 55 |   for (type in c("collection", "ppub", "epub")) {
 56 |     year <- first_text(
 57 |       paste0("//front//pub-date[@pub-type='", type, "']/year")
 58 |     )
 59 |     if (!is.na(year)) break
 60 |   }
 61 |   if (!is.na(year)) z[["Year"]] <- as.integer(year)
 62 |   # Journal meta
 63 |   journal <- first_text("//front//journal-meta//journal-title")
 64 |   if (!is.na(journal)) z[["Journal"]] <- journal
 65 |   ## volume and issue in article metadata
 66 |   volume <- first_text("//front//article-meta/volume")
 67 |   if (!is.na(volume)) z[["Volume"]] <- volume
 68 |   issue <- first_text("//front//article-meta/issue")
 69 |   if (!is.na(issue)) z[["Issue"]] <- issue
 70 |   # PAGES
 71 |   p1 <- first_text("//front//article-meta/fpage")
 72 |   if (!is.na(p1)) {
 73 |     p2 <- first_text("//front//article-meta/lpage")
 74 |     ## lpage may be missing; if (p1 != p2) stops with an error when p2 is NA
 75 |     if (!is.na(p2) && p1 != p2) p1 <- paste(p1, p2, sep = "-")
 76 |   } else {
 77 |     p1 <- first_text("//front//article-meta/elocation-id")
 78 |   }
 79 |   z[["Pages"]] <- p1
 80 |   # More PUB Dates - child tags pasted in reverse order, which gives
 81 |   # year-month-day if the tags are sorted day, month, year
 82 |   epub <- all_text("//front//pub-date[@pub-type='epub']/*")
 83 |   if (length(epub) > 0) {
 84 |     z[["Published online"]] <- paste(rev(epub), collapse = "-")
 85 |   }
 86 |   rec <- all_text("//front//history/date[@date-type='received']/*")
 87 |   if (length(rec) > 0) z[["Date received"]] <- paste(rev(rec), collapse = "-")
 88 |   ## DOI
 89 |   doi <- first_text("//front//article-id[@pub-id-type='doi']")
 90 |   if (!is.na(doi)) z[["DOI"]] <- doi
 91 |   # Publisher
 92 |   x <- first_text("//front//journal-meta//publisher-name")
 93 |   if (!is.na(x)) z[["Publisher"]] <- x
 94 |   z
 95 | }
 96 | 


--------------------------------------------------------------------------------
/R/pmc_reference.R:
--------------------------------------------------------------------------------
  1 | #' Format references cited
  2 | #'
  3 | #' @param doc \code{xml_document} from PubMed Central
  4 | #'
  5 | #' @return a tibble with id, pmid, authors, year, title, journal, volume, pages,
  6 | #' and doi, or NULL if there are no /ref tags. Tags missing from a reference
  7 | #' are NA. The year is an integer only if every reference has an all-digit year.
  8 | #'
  9 | #' @author Chris Stubben
 10 | #'
 11 | #' @note Mixed citations without any child tags are added to the author column.
 12 | #'
 13 | #' @examples
 14 | #' # doc <- pmc_xml("PMC2231364")
 15 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
 16 | #'   package = "tidypmc"
 17 | #' ))
 18 | #' x <- pmc_reference(doc)
 19 | #' x
 20 | #' @export
 21 | 
 22 | pmc_reference <- function(doc) {
 23 |   if (class(doc)[1] != "xml_document") {
 24 |     stop("doc should be an xml_document from PubMed Central")
 25 |   }
 26 |   z <- xml2::xml_find_all(doc, "//ref")
 27 |   # cat(as.character(z[[1]]))
 28 |   if (length(z) == 0) {
 29 |     message("No /ref tags")
 30 |     return(NULL)
 31 |   }
 32 |   ## trimmed text of the first node matching xpath within each /ref
 33 |   ## (xml2::xml_find_first returns NA for missing values)
 34 |   first_text <- function(xpath) {
 35 |     vapply(z, function(x) xml2::xml_text(
 36 |         xml2::xml_find_first(x, xpath),
 37 |         trim = TRUE
 38 |       ), character(1))
 39 |   }
 40 |   ## trimmed text of all nodes matching xpath within each /ref, as a list
 41 |   all_text <- function(xpath) {
 42 |     lapply(z, function(x) xml2::xml_text(
 43 |         xml2::xml_find_all(x, xpath),
 44 |         trim = TRUE
 45 |       ))
 46 |   }
 47 |   ## report the child tags of /ref, ignoring label and note
 48 |   n <- lapply(z, function(x) xml2::xml_name(xml2::xml_find_all(x, "./*")))
 49 |   x <- as.vector(unlist(n))
 50 |   x <- table(x[!x %in% c("label", "note")])
 51 |   message("Found ", paste(x, names(x), collapse = " and "), " tags")
 52 |   pmid <- first_text(".//pub-id[@pub-id-type='pmid']")
 53 |   doi <- first_text(".//pub-id[@pub-id-type='doi']")
 54 |   a1 <- all_text(".//surname")
 55 |   a2 <- all_text(".//given-names")
 56 |   # if all references have same number of authors, use SIMPLIFY=FALSE,
 57 |   # see PMC6369050
 58 |   authors <- vapply(
 59 |     mapply(paste, a1, a2, SIMPLIFY = FALSE),
 60 |     function(x) paste(x, collapse = ", "), character(1)
 61 |   )
 62 |   authors[authors == ""] <- NA
 63 |   # use character for same authors published twice in same year, 2012a 2012b
 64 |   year <- first_text(".//year")
 65 |   if (all(grepl("^[0-9]+$", year))) year <- as.integer(year)
 66 |   # new lines in title PMC4909105
 67 |   title <- gsub("\n *", " ", first_text(".//article-title"))
 68 |   journal <- first_text(".//source")
 69 |   volume <- first_text(".//volume")
 70 |   p1 <- first_text(".//fpage")
 71 |   p2 <- first_text(".//lpage")
 72 |   ## first-last page, or whichever page is present. Build the range from the
 73 |   ## non-missing values only: pasting NA and then deleting "-NA" left the
 74 |   ## string "NA" instead of a missing value when both pages were absent
 75 |   pages <- p1
 76 |   both <- !is.na(p1) & !is.na(p2)
 77 |   pages[both] <- paste(p1[both], p2[both], sep = "-")
 78 |   last_only <- is.na(p1) & !is.na(p2)
 79 |   pages[last_only] <- p2[last_only]
 80 |   x <- tibble::tibble(
 81 |     id = seq_along(pmid), pmid, authors, year, title, journal,
 82 |     volume, pages, doi
 83 |   )
 84 |   ## references without author and title tags get the full /ref text in the
 85 |   ## authors column
 86 |   n <- which(is.na(x$authors) & is.na(x$title))
 87 |   if (length(n) > 0) {
 88 |     if (nrow(x) == length(n)) {
 89 |       message(" References are missing author and title tags")
 90 |     } else {
 91 |       message(" ", length(n), " references are missing author and title tags")
 92 |     }
 93 |     message(" Adding /ref string to author column")
 94 |     x$authors[n] <- vapply(z[n], xml2::xml_text, character(1))
 95 |   }
 96 |   x
 97 | }
 98 | 


--------------------------------------------------------------------------------
/R/pmc_table.R:
--------------------------------------------------------------------------------
  1 | #' Convert table nodes to tibbles
  2 | #'
  3 | #' Convert PubMed Central table nodes into a list of tibbles
  4 | #'
  5 | #' @param doc \code{xml_document} from PubMed Central
  6 | #'
  7 | #' @return a list of tibbles
  8 | #'
  9 | #' @note Saves the caption and footnotes as attributes and collapses multiline
 10 | #' headers, expands all rowspan and colspan attributes and adds
 11 | #' subheadings to column one.
 12 | #'
 13 | #' @author Chris Stubben
 14 | #'
 15 | #' @examples
 16 | #' # doc <- pmc_xml("PMC2231364")
 17 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
 18 | #'   package = "tidypmc"
 19 | #' ))
 20 | #' x <- pmc_table(doc)
 21 | #' sapply(x, dim)
 22 | #' x
 23 | #' attributes(x[[1]])
 24 | #' @export
 25 | 
 26 | pmc_table <- function(doc) {
 27 |   if (class(doc)[1] != "xml_document") {
 28 |     stop("doc should be an xml_document from PubMed Central")
 29 |   }
 30 |   twn <- length(xml2::xml_find_all(doc, "//table-wrap"))
 31 |   ## Avoid table-wrap without table node, usually link to image only
 32 |   z <- xml2::xml_find_all(doc, "//table-wrap/table/..")
 33 |   if (length(z) == 0) {
 34 |     message("No tables found")
 35 |     if (twn > 0) message("Table-wrap with link to image?")
 36 |     tbls <- NULL
 37 |   } else {
 38 |     tbl_nodes <- xml2::xml_find_all(z, "./table")
 39 |     message("Parsing ", length(z), " tables")
 40 |     if (twn > length(z)) {
 41 |       message(twn - length(n), " /table-wrap with link to image?")
 42 |     }
 43 |     ## START table function
 44 |     #  t1 <- xml2::xml_find_all(doc, "//table")[1]
 45 |     tbls <- lapply(tbl_nodes, function(t1) {
 46 |       # PARSE HEADER
 47 |       x <- xml2::xml_find_all(t1, ".//thead/tr")
 48 |       # cat(as.character(x))
 49 |       ## missing header
 50 |       if (length(x) == 0) {
 51 |         thead <- NA
 52 |         ## 1 header row...
 53 |       } else if (length(x) == 1) {
 54 |         colspan <- as.numeric(xml2::xml_attr(
 55 |           xml2::xml_find_all(x, ".//td|.//th"), "colspan",
 56 |           default = "1"
 57 |         ))
 58 |         thead <- xml2::xml_text(xml2::xml_find_all(x, ".//td|.//th"))
 59 |         # repeat across colspan
 60 |         if (any(colspan > 1)) {
 61 |           thead <- rep(thead, colspan)
 62 |         }
 63 |         # multiline header - collapse into single row
 64 |         # SEE  tables 1 and 2 in PMC3109299
 65 |       } else {
 66 |         nr <- length(x)
 67 |         nc <- max(vapply(x, function(y) sum(as.numeric(xml2::xml_attr(
 68 |             xml2::xml_find_all(y, ".//td|.//th"), "colspan",
 69 |             default = "1"
 70 |           ))), double(1)))
 71 |         c2 <- data.frame(matrix(NA, nrow = nr, ncol = nc))
 72 |         for (i in seq_len(nr)) {
 73 |           rowspan <- as.numeric(xml2::xml_attr(xml2::xml_find_all(
 74 |             x[[i]], ".//td|.//th"
 75 |           ), "rowspan", default = "1"))
 76 |           colspan <- as.numeric(xml2::xml_attr(xml2::xml_find_all(
 77 |             x[[i]], ".//td|.//th"
 78 |           ), "colspan", default = "1"))
 79 |           thead <- xml2::xml_text(xml2::xml_find_all(
 80 |             x[[i]], ".//td|.//th"
 81 |           ))
 82 |           if (any(colspan > 1)) {
 83 |             thead <- rep(thead, colspan)
 84 |             rowspan <- rep(rowspan, colspan)
 85 |           }
 86 |           # fill values into empty cells
 87 |           n <- which(is.na(c2[i, ]))
 88 |           ## truncate to avoid warning - see PMC3119406
 89 |           if (length(thead) != length(n)) thead <- thead[seq_along(n)]
 90 |           c2[ i, n] <- thead
 91 |           if (any(rowspan > 1)) {
 92 |             for (j in seq_along(rowspan)) {
 93 |               if (rowspan[j] > 1) {
 94 |                 ## repeat value down column
 95 |                 c2[(i + 1):(i + (rowspan[j] - 1)), n[j]] <- thead[j]
 96 |               }
 97 |             }
 98 |           }
 99 |         }
100 |         ## COLLAPSE into single row...
101 |         ## some rowspans may extend past nr!  see table 1 PMC3109299
102 |         if (nrow(c2) > nr) c2 <- c2[seq_len(nr), ]
103 |         ## collapse the header rows in each column, using ": " as separator
104 |         thead <- apply(c2, 2, function(x)
105 |           paste(unique(x), collapse = ": "))
106 |         # some multiline rows with horizontal lines only
107 |         thead <- gsub(": : ", ": ", thead)
108 |         thead <- gsub("^: ", "", thead)
109 |         thead <- gsub(": $", "", thead)
110 |       }
111 |       #-------------------------------------------------------------------
112 |       # PARSE TABLE
113 |       # Do not repeat values with colspans across rows (usually table
114 |       # subheaders). Repeat values with rowspan down columns
115 |       x <- xml2::xml_find_all(t1, ".//tbody/tr")
116 |       # number of rows
117 |       nr <- length(x)
118 |       nc <- max(vapply(x, function(y) sum(as.numeric(xml2::xml_attr(
119 |           xml2::xml_find_all(y, ".//td|.//th"), "colspan",
120 |           default = "1"
121 |         ))), double(1)))
122 |       c2 <- data.frame(matrix(NA, nrow = nr, ncol = nc))
123 |       for (i in seq_len(nr)) {
124 |         ## some table use //th  see table1 PMC3031304
125 |         rowspan <- xml2::xml_attr(xml2::xml_find_all(
126 |           x[[i]], ".//td|.//th"
127 |         ), "rowspan", default = "1")
128 |         colspan <- xml2::xml_attr(xml2::xml_find_all(
129 |           x[[i]], ".//td|.//th"
130 |         ), "colspan", default = "1")
131 |         # PMC6358641 with rowspan=""
132 |         rowspan <- as.numeric(ifelse(rowspan == "", 1, rowspan))
133 |         colspan <- as.numeric(ifelse(colspan == "", 1, colspan))
134 |         val <- xml2::xml_text(xml2::xml_find_all(x[[i]], ".//td|.//th"))
135 |         # NO-BREAK, EN or EM SPACE
136 |         val <- gsub("\u00A0|\u2002|\u2003", " ", val)
137 |         val <- trimws(val)
138 |         if (any(colspan > 1)) {
139 |           val <- rep(val, colspan)
140 |           ##  only display subheader in column 1?
141 |           val[-1][val[-1] == val[-length(val)]] <- NA
142 |           rowspan <- rep(rowspan, colspan)
143 |         }
144 |         # fill values into empty cells
145 |         n <- which(is.na(c2[i, ]))
146 | 
147 |         # some tables have extra td tags  see table 2  PMC3109299
148 |         # <td align="left" rowspan="1" colspan="1"/>
149 |         # truncate to avoid warning??
150 |         if (length(val) != length(n)) {
151 |           val <- val[seq_along(n) ]
152 |         }
153 |         c2[ i, n] <- val
154 |         if (any(rowspan > 1)) {
155 |           for (j in seq_along(rowspan)) {
156 |             if (rowspan[j] > 1) {
157 |               ## repeat value down column
158 |               c2[ (i + 1):(i + (rowspan[j] - 1)), n[j]] <- val[j]
159 |             }
160 |           }
161 |         }
162 |       }
163 |       x <- c2
164 |       #-------------------------------------
165 |       if (!is.na(thead[1])) {
166 |         thead[thead == ""] <- "X"
167 |         tbn <- ncol(x)
168 |         thn <- length(thead)
169 |         if (tbn != thn) {
170 |           message("Warning: number of column in /thead and /tbody do not match")
171 |           if (tbn > thn) {
172 |             thead <- append(thead, rep("X", tbn - thn))
173 |           } else {
174 |             ## see table 3 from PMC3020393
175 |             thead <- thead[seq_len(tbn)]
176 |           }
177 |         }
178 |         thead <- gsub("\n", " ", thead)
179 |         thead <- make.unique(thead)
180 |         colnames(x) <- thead
181 |       }
182 |       # DELETE empty rows  -
183 |       if (nrow(x) > 1) {
184 |         nX <- apply(x, 1, function(y) sum(!(is.na(y) | y == "")))
185 |         x <- x[nX != 0, , FALSE] # use FALSE in case only 1 column in TABLE
186 |       }
187 |       # FIX column typess
188 |       ## errors if newlines and tabs in cells(or colnames!)
189 |       colnames(x) <- gsub("\n *", "", colnames(x))
190 |       x <- tibble::as_tibble(x)
191 |       x <- suppressMessages(repeat_sub(x))
192 |       x
193 |     })
194 |     ### END table functino
195 |     #----------------------------------------------------
196 |     ## should have label and caption?
197 |     f1 <- vapply(z, function(x) xml2::xml_text(
198 |         xml2::xml_find_first(x, "./label")
199 |       ), character(1))
200 |     f2 <- vapply(z, function(x) xml2::xml_text(
201 |         xml2::xml_find_first(x, "./caption")
202 |       ), character(1))
203 |     # check length, some table-wrap with more than 1 /table tag
204 |     if (length(f1) == length(tbls)) {
205 |       names(tbls) <- f1
206 |     }
207 |     else {
208 |       message("Number of /table nodes is not the sampe as /table-wrap")
209 |     }
210 |     if (length(f2) == length(tbls)) {
211 |       for (i in seq_along(tbls)) {
212 |         attr(tbls[[i]], "caption") <- f2[i]
213 |       }
214 |     }
215 |     ## footnotes
216 |     fn <- vapply(z, function(x) xml2::xml_text(
217 |         xml2::xml_find_first(x, "./table-wrap-foot")
218 |       ), character(1))
219 |     n <- which(!is.na(fn))
220 |     if (length(n) > 0) {
221 |       message("Adding footnotes to Table ", paste(n, collapse = ","))
222 |       for (i in n) {
223 |         attr(tbls[[i]], "footnotes") <- fn[i]
224 |       }
225 |     }
226 |   }
227 |   tbls
228 | }
229 | 


--------------------------------------------------------------------------------
/R/pmc_text.R:
--------------------------------------------------------------------------------
  1 | #' Split section paragraphs into sentences
  2 | #'
  3 | #' Split section paragraph tags into a table with subsection titles and
  4 | #' sentences using \code{tokenize_sentences}
  5 | #'
  6 | #' @param doc \code{xml_document} from PubMed Central
  7 | #'
  8 | #' @return a tibble with section, paragraph and sentence number and text, or
  9 | #' \code{NULL} if no title or abstract is found
 10 | #'
 11 | #' @note Subsections may be nested to arbitrary depths and this function will
 12 | #' return the entire path to the subsection title as a delimited string like
 13 | #' "Results; Predicted functions; Pathogenicity".  Tables, figures and
 14 | #' formulas that are nested in section paragraphs are removed, superscripted
 15 | #' references are replaced with brackets, and any other superscripts or
 16 | #' subscripts are separated with ^ and _.  Sections without a title are
 17 | #' labelled "[untitled sec/p]".
 18 | #'
 19 | #' @author Chris Stubben
 20 | #'
 21 | #' @examples
 22 | #' # doc <- pmc_xml("PMC2231364")
 23 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
 24 | #'   package = "tidypmc"
 25 | #' ))
 26 | #' txt <- pmc_text(doc)
 27 | #' txt
 28 | #' dplyr::count(txt, section, sort = TRUE)
 29 | #' @export
 30 | 
 31 | pmc_text <- function(doc) {
 32 |   if (!inherits(doc, "xml_document")) {
 33 |     stop("doc should be an xml_document from PubMed Central")
 34 |   }
 35 |   ## create new document to remove nodes
 36 |   doc2 <- xml2::xml_new_root(doc)
 37 |   z <- vector("list")
 38 |   ## Main title
 39 |   t1 <- xml2::xml_text(xml2::xml_find_first(
 40 |     doc2, "//front//article-title"
 41 |   ), trim = TRUE)
 42 |   if (!is.na(t1)) z[["Title"]] <- t1
 43 |   ## Abstract
 44 |   a1 <- xml2::xml_text(xml2::xml_find_all(
 45 |     doc2, "//abstract[not(@abstract-type='summary')]//p"
 46 |   ))
 47 |   if (length(a1) > 0) z[["Abstract"]] <- a1
 48 | 
 49 |   ## Author summary: read each summary abstract separately so that a document
 50 |   ## with several summaries gets one entry per titled summary; a summary
 51 |   ## without a usable title is skipped
 52 |   summaries <- xml2::xml_find_all(
 53 |     doc2, "//abstract[@abstract-type='summary']"
 54 |   )
 55 |   for (i in seq_along(summaries)) {
 56 |     sum_title <- xml2::xml_text(
 57 |       xml2::xml_find_first(summaries[[i]], "./title")
 58 |     )
 59 |     sum_p <- xml2::xml_text(xml2::xml_find_all(summaries[[i]], ".//p"))
 60 |     if (!is.na(sum_title) && nzchar(sum_title) && length(sum_p) > 0) {
 61 |       z[[sum_title]] <- sum_p
 62 |     }
 63 |   }
 64 |   if (length(z) == 0) {
 65 |     message("No title or abstract found. Not a PMC XML document?")
 66 |     return(NULL)
 67 |   }
 68 |   ## remove tables, figures and formulas nested within <sec/p> tags
 69 |   ## (formulas may include very long MathType encoding strings)
 70 |   for (tag in c("table-wrap", "fig", "disp-formula")) {
 71 |     n <- xml2::xml_find_all(doc2, paste0("//sec/p/", tag))
 72 |     if (length(n) > 0) {
 73 |       message("Note: removing ", tag, " nested in sec/p tag")
 74 |       xml2::xml_remove(n)
 75 |     }
 76 |   }
 77 |   # DROP any sections with supplementary materials (often with nested
 78 |   # sections missing titles)
 79 |   n <- xml2::xml_find_all(
 80 |     doc2, "//body//sec[@sec-type='supplementary-material']"
 81 |   )
 82 |   if (length(n) > 0) xml2::xml_remove(n)
 83 |   ## Add brackets to numbered references with superscript tags
 84 |   add_bracket <- FALSE
 85 |   bib <- xml2::xml_find_all(doc2, "//sup//xref[@ref-type='bibr']")
 86 |   if (length(bib) > 0) {
 87 |     message("Adding brackets to numbered references in /sup tags")
 88 |     add_bracket <- TRUE
 89 |     xml2::xml_text(bib) <- paste0(" [", xml2::xml_text(bib), "]")
 90 |   }
 91 |   ## Add ^ and _ to /sup and /sub tags
 92 |   sup <- xml2::xml_find_all(doc2, "//sup[not(xref)]")
 93 |   if (length(sup) > 0) {
 94 |     xml2::xml_text(sup) <- paste0("^", xml2::xml_text(sup))
 95 |   }
 96 |   subs <- xml2::xml_find_all(doc2, "//sub")
 97 |   if (length(subs) > 0) {
 98 |     xml2::xml_text(subs) <- paste0("_", xml2::xml_text(subs))
 99 |   }
100 | 
101 |   ## parse text from Sections
102 |   sec <- xml2::xml_find_all(doc2, "//body//sec")
103 |   ## paragraphs directly under body
104 |   ## (Emerging infectious diseases has both body/p and body/sec)
105 |   intro <- xml2::xml_text(xml2::xml_find_all(doc2, "//body/p"))
106 |   if (length(sec) == 0) {
107 |     message("NOTE: No sections found, using all text in main body/p")
108 |     z[["[Main]"]] <- intro
109 |   } else {
110 |     if (length(intro) > 0) {
111 |       message(
112 |         "NOTE: Body has both /p and /sec tags - untitled Introduction?"
113 |       )
114 |       z[["[Introduction]"]] <- intro
115 |     }
116 |     ## One title per section (NA when the /title tag is absent) so titles,
117 |     ## depths and paragraphs always refer to the same section
118 |     t1 <- xml2::xml_text(xml2::xml_find_first(sec, "./title"))
119 |     if (anyNA(t1)) {
120 |       message("Warning: some sections are missing /title tags")
121 |     }
122 |     # fix sections without title  ... PMC6360207
123 |     untitled <- is.na(t1) | t1 == ""
124 |     if (any(untitled)) {
125 |       message("Missing ", sum(untitled), " title in sec/p tag")
126 |       t1[untitled] <- "[untitled sec/p]"
127 |     }
128 |     ## indentation level of subsections: a title is a direct child of its
129 |     ## section, so its depth is one more than the depth of the sec node
130 |     n <- stringr::str_count(xml2::xml_path(sec), "/") + 1
131 |     ## full path to subsection title
132 |     path <- path_string(t1, n)
133 |     ## section paragraphs (get sec/p and not any //p)
134 |     secP <- lapply(sec, function(x) xml2::xml_text(
135 |         xml2::xml_find_all(x, "./p")
136 |       ))
137 |     ## LOOP through subsections and skip sections missing /p tags
138 |     for (i in seq_len(min(length(path), length(secP)))) {
139 |       if (length(secP[[i]]) == 0) next
140 |       subT <- gsub("\\.$", "", path[i])
141 |       # in case of nested sec tags,  replace "; ; ; "
142 |       subT <- gsub("[; ]{3,}", "; ", subT)
143 |       ## don't split Fig. 1 into two sentences, probably many others
144 |       z[[subT]] <- as.list(gsub("([ (][Ff]ig)\\.", "\\1", secP[[i]]))
145 |     }
146 |   }
147 |   x <- lapply(z, tokenizers::tokenize_sentences)
148 |   x1 <- lapply(x, function(y) dplyr::bind_rows(
149 |       lapply(y, function(z) if (length(z) > 0) {
150 |           tibble::tibble(sentence = seq_along(z), text = z)
151 |         }),
152 |       .id = "paragraph"
153 |     ))
154 |   x <- dplyr::bind_rows(x1, .id = "section")
155 |   x <- dplyr::mutate(x, paragraph = as.integer(paragraph))
156 |   # replace en dash, em dash, etc to separate ranges
157 |   x$text <- gsub("\u2011|\u2012|\u2013|\u2014", "-", x$text)
158 |   ## FIX if brackets added to superscripted references
159 |   if (add_bracket) x$text <- gsub("]- [", "-", x$text, fixed = TRUE)
160 |   x
161 | }
162 | 


--------------------------------------------------------------------------------
/R/pmc_xml.R:
--------------------------------------------------------------------------------
 1 | #' Download XML from PubMed Central
 2 | #'
 3 | #' @param id a single PMC id starting with 'PMC', for example "PMC2231364"
 4 | #'
 5 | #' @return \code{xml_document}
 6 | #'
 7 | #' @source \url{https://europepmc.org/RestfulWebService}
 8 | #'
 9 | #' @examples
10 | #' \dontrun{
11 | #' doc <- pmc_xml("PMC2231364")
12 | #' }
13 | #'
14 | #' @export
15 | 
16 | pmc_xml <- function(id) {
17 |   # Check the length before grepl(): NULL or a vector of ids would otherwise
18 |   # give `if` a condition that is not a single TRUE/FALSE and fail with an
19 |   # unrelated error instead of the message below.
20 |   if (length(id) != 1 || !grepl("^PMC[0-9]+$", id)) {
21 |     stop("id should be a valid PMC id like PMC2231364")
22 |   }
23 |   # full text XML endpoint of the Europe PMC REST service
24 |   url1 <- paste0(
25 |     "https://www.ebi.ac.uk/europepmc/webservices/rest/", id, "/fullTextXML"
26 |   )
27 |   xml2::read_xml(url1)
28 | }
29 | 


--------------------------------------------------------------------------------
/R/repeat_sub.R:
--------------------------------------------------------------------------------
 1 | #' Repeat table subheadings
 2 | #'
 3 | #' Repeat table subheadings in a new column
 4 | #'
 5 | #' A subheading is a row whose first column is filled and whose remaining
 6 | #' columns are all empty.  Subheading rows are dropped and their value is
 7 | #' repeated down a new column for the rows that follow.
 8 | #'
 9 | #' @param x a tibble with subheadings
10 | #' @param column new column name, default subheading
11 | #' @param first add subheader as first column, default TRUE
12 | #'
13 | #' @return a tibble; \code{x} is returned unchanged (with a message) when no
14 | #' usable subheadings are found
15 | #'
16 | #' @author Chris Stubben
17 | #'
18 | #' @examples
19 | #' x <- data.frame(
20 | #'   genes = c("Up", "aroB", "glnP", "Down", "ndhA", "pyrF"),
21 | #'   fold_change = c(NA, 2.5, 1.7, NA, -3.1, -2.6)
22 | #' )
23 | #' x
24 | #' repeat_sub(x)
25 | #' repeat_sub(x, "regulated", first = FALSE)
26 | #' @noRd
27 | 
28 | repeat_sub <- function(x, column = "subheading", first = TRUE) {
29 |   if (!is.data.frame(x)) {
30 |     stop("x should be a table")
31 |   }
32 |   if (ncol(x) == 1) {
33 |     message("Only one column in table")
34 |     return(x)
35 |   }
36 |   ## TRUE for rows where columns 2 to ncol(x) are all empty
37 |   ## (\u00A0 is non-breaking space)
38 |   is_sub <- apply(
39 |     x[, -1, FALSE], 1,
40 |     function(cells) {
41 |       all(is.na(cells) | cells == "NA" | cells == "" | cells == "\u00A0")
42 |     }
43 |   )
44 |   if (sum(is_sub) == 0) {
45 |     message("No subheaders found")
46 |     return(x)
47 |   }
48 |   sub_rows <- which(is_sub)
49 |   ## check for consecutive subheaders (and then probably not subheaders)
50 |   ## SEE PMC3334355
51 |   if (sum(diff(sub_rows) == 1) > 1) {
52 |     message("Too many subheaders in consecutive rows")
53 |     return(x)
54 |   }
55 |   if (sub_rows[1] != 1) {
56 |     message("No subheader in row 1")
57 |     return(x)
58 |   }
59 |   ## each subheader covers the rows up to the next subheader (or table end);
60 |   ## unlist() is needed for tibbles
61 |   run_length <- diff(c(sub_rows, nrow(x) + 1))
62 |   x[[column]] <- rep(unlist(x[is_sub, 1]), times = run_length)
63 |   ## drop rows with subheader only, then re-guess the column types
64 |   out <- suppressMessages(readr::type_convert(x[!is_sub, ]))
65 |   if (first) {
66 |     last <- ncol(out)
67 |     out <- out[, c(last, seq_len(last - 1))]
68 |   }
69 |   out
70 | }
71 | 


--------------------------------------------------------------------------------
/R/separate_genes.R:
--------------------------------------------------------------------------------
 1 | #' Separate genes and operons into multiple rows
 2 | #'
 3 | #' Separate genes and operons mentioned in full text into multiple rows
 4 | #'
 5 | #' @param txt a table
 6 | #' @param pattern regular expression to match genes, default is to match
 7 | #' microbial genes like AbcD, default [A-Za-z][a-z]{2}[A-Z0-9]+
 8 | #' @param genes an optional vector of genes, set pattern to NA to only match
 9 | #' this list.
10 | #' @param operon operon length, default 6. Split genes with 6 or more letters
11 | #' into separate genes, for example AbcDEF is split into abcD, abcE and abcF.
12 | #' @param column column name to search, default "text"
13 | #'
14 | #' @note Check for genes in italics using \code{xml_text(xml_find_all(doc,
15 | #' "//sec//p//italic"))} and update the pattern or add additional genes as an
16 | #' optional vector if needed
17 | #'
18 | #' @return a tibble with gene name, matching text and rows, or \code{NULL}
19 | #' if the pattern does not match any text.
20 | #'
21 | #' @author Chris Stubben
22 | #'
23 | #' @examples
24 | #' x <- data.frame(row = 1, text = "Genes like YacK, hmu and sufABC")
25 | #' separate_genes(x)
26 | #' separate_genes(x, genes = "hmu")
27 | #' @export
28 | 
29 | separate_genes <- function(txt, pattern = "\\b[A-Za-z][a-z]{2}[A-Z0-9]+\\b",
30 |                            genes, operon = 6, column = "text") {
31 |   if (!operon > 4) stop("Operon length should be 5 or more")
32 |   ## optional gene list is added to (or replaces) the pattern as whole words
33 |   if (!missing(genes)) {
34 |     x1 <- paste0("\\b", paste(genes, collapse = "\\b|\\b"), "\\b")
35 |     if (pattern %in% c("", NA)) {
36 |       pattern <- x1
37 |     } else {
38 |       pattern <- paste(pattern, x1, sep = "|")
39 |     }
40 |   }
41 |   x <- separate_text(txt, pattern, column)
42 |   if (is.null(x)) {
43 |     x1 <- NULL
44 |   } else {
45 |     ## add option to exclude common matches
46 |     x <- dplyr::filter(x, !match %in% c(
47 |       "TraDIS", "taqDNA", "log2", "log10",
48 |       "ecoRI", "bamHI", "chr1", "chr2"
49 |     ))
50 |     if (nrow(x) == 0) stop("No match to genes")
51 |     ## operons are long matches; don't split locus tags like ypo2995
52 |     ## (digits only after the third character)
53 |     suffix <- substring(x$match, 4)
54 |     is_operon <- nchar(x$match) >= operon & !grepl("^[0-9]+$", suffix)
55 |     ## Build one character vector per match with lapply.  mapply() would
56 |     ## simplify equal-length results into a matrix, and ifelse() would then
57 |     ## pick single cells from it, losing genes whenever every match is an
58 |     ## operon of the same length (e.g. a single match like sufABC).
59 |     y <- lapply(seq_along(x$match), function(i) {
60 |       if (is_operon[i]) {
61 |         ## sufABC -> sufA, sufB, sufC
62 |         paste0(
63 |           tolower(substr(x$match[i], 1, 3)),
64 |           strsplit(suffix[i], "")[[1]]
65 |         )
66 |       } else {
67 |         ## YacK -> yacK
68 |         paste0(tolower(substr(x$match[i], 1, 1)), substring(x$match[i], 2))
69 |       }
70 |     })
71 |     ## repeat each matching row once per gene
72 |     n <- vapply(y, length, integer(1))
73 |     x1 <- dplyr::bind_cols(gene = unlist(y), x[ rep(seq_len(nrow(x)), n), ])
74 |   }
75 |   x1
76 | }
77 | 


--------------------------------------------------------------------------------
/R/separate_refs.R:
--------------------------------------------------------------------------------
 1 | #' Separate references cited into multiple rows
 2 | #'
 3 | #' Separates references cited in brackets or parentheses into multiple rows and
 4 | #' splits the comma-delimited numeric strings and expands ranges like 7-9 into
 5 | #' new rows.  Incomplete ranges such as "-35" or "7-" are not expanded; only
 6 | #' the numbers that are present are returned.
 7 | #'
 8 | #' @param txt a table
 9 | #' @param column column name, default "text"
10 | #'
11 | #' @return a tibble, or \code{NULL} if no references are found
12 | #'
13 | #' @author Chris Stubben
14 | #'
15 | #' @examples
16 | #' x <- data.frame(row = 1, text = "some important studies [7-9,15]")
17 | #' separate_refs(x)
18 | #' @export
19 | 
20 | separate_refs <- function(txt, column = "text") {
21 |   pattern <- "(\\(|\\[)[0-9, -]+(\\]|\\))"
22 |   x <- separate_text(txt, pattern, column)
23 |   if (is.null(x)) {
24 |     x1 <- NULL
25 |   } else {
26 |     # remove any parentheses, spaces and brackets
27 |     y <- gsub("[)( ]|\\]|\\[", "", x$match)
28 |     ## Expand one comma-delimited piece like "15" or "7-9".  The pattern also
29 |     ## matches text like "(-35)", which splits into an empty endpoint, so
30 |     ## seq() is only called when both ends are finite numbers; otherwise the
31 |     ## numbers that are present are kept and empty endpoints are dropped.
32 |     expand_piece <- function(piece) {
33 |       ends <- as.numeric(strsplit(piece, "-")[[1]])
34 |       if (length(ends) == 2 && all(is.finite(ends))) {
35 |         seq(ends[1], ends[2])
36 |       } else {
37 |         ends[!is.na(ends)]
38 |       }
39 |     }
40 |     ## split commas, then expand every piece of every match
41 |     y <- lapply(strsplit(y, ","), function(pieces) {
42 |       unlist(lapply(pieces, expand_piece))
43 |     })
44 |     ## repeat each matching row once per reference number
45 |     n <- vapply(y, length, integer(1))
46 |     x1 <- dplyr::bind_cols(id = unlist(y), x[ rep(seq_len(nrow(x)), n), ])
47 |   }
48 |   x1
49 | }
50 | 


--------------------------------------------------------------------------------
/R/separate_tags.R:
--------------------------------------------------------------------------------
 1 | #' Separate locus tag into multiple rows
 2 | #'
 3 | #' Separates locus tags mentioned in full text and expands ranges like
 4 | #' YPO1970-74 into new rows
 5 | #'
 6 | #' @param txt a table
 7 | #' @param pattern regular expression to match locus tags like YPO[0-9-]+ or
 8 | #'  the locus tag prefix like YPO.
 9 | #' @param column column name to search, default "text"
10 | #'
11 | #' @return a tibble with locus tag, matching text and rows, or \code{NULL}
12 | #' if the pattern does not match any text.
13 | #'
14 | #' @author Chris Stubben
15 | #'
16 | #' @examples
17 | #' x <- data.frame(row = 1, text = "some genes like YPO1002 and YPO1970-74")
18 | #' separate_tags(x, "YPO")
19 | #' @export
20 | 
21 | separate_tags <- function(txt, pattern, column = "text") {
22 |   ## if prefix only (no numbers)  also match YPO1854-YPO1856?
23 |   if (!grepl("[0-9]", pattern)) {
24 |     # pattern <- paste0(pattern, "[0-9-]+")
25 |     pattern <- paste0(pattern, "[0-9", pattern, "-]+")
26 |   }
27 |   x <- separate_text(txt, pattern, column)
28 |   if (is.null(x)) {
29 |     x1 <- NULL
30 |   } else {
31 |     ## avoid YPO1854-YPO1856-YPO1858
32 |     if (any(stringr::str_count(x$match, "-") > 1)) {
33 |       stop("pattern matches 3 or more tags")
34 |     }
35 |     ## a trailing hyphen is not a range
36 |     x$match <- gsub("-$", "", x$match)
37 |     # Expand range if matching "-"
38 |     y <- lapply(x$match, function(id) {
39 |       if (grepl("-", id)) {
40 |         pre <- stringr::str_extract(id, "^[^0-9]+")
41 |         ## split range into its two numeric ends
42 |         ends <- strsplit(gsub("[^0-9-]", "", id), "-")[[1]]
43 |         ## width of the first number, used to restore leading zeros
44 |         n <- nchar(ends[1])
45 |         ends <- as.numeric(ends)
46 |         ## Matches like YPO-1970 or YPO1854-YPO have a missing endpoint;
47 |         ## leave those as matched instead of comparing against NA
48 |         if (length(ends) == 2 && !anyNA(ends)) {
49 |           ## check if 2nd number is less than 1st... YPO1970-80
50 |           if (ends[2] < ends[1]) {
51 |             ## borrow the leading digits of the first number: 1970, 80 -> 1980
52 |             ends[2] <- as.numeric(paste0(
53 |               substring(ends[1], 1, nchar(ends[1]) - nchar(ends[2])), ends[2]
54 |             ))
55 |           }
56 |           id <- seq(ends[1], ends[2])
57 |           id <- stringr::str_pad(id, n, pad = "0")
58 |           id <- paste0(pre, id)
59 |         }
60 |       }
61 |       id
62 |     })
63 |     ## repeat each matching row once per expanded tag
64 |     n <- vapply(y, length, integer(1))
65 |     x1 <- dplyr::bind_cols(id = unlist(y), x[ rep(seq_len(nrow(x)), n), ])
66 |   }
67 |   x1
68 | }
69 | 


--------------------------------------------------------------------------------
/R/separate_text.R:
--------------------------------------------------------------------------------
 1 | #' Separate all matching text into multiple rows
 2 | #'
 3 | #' @param txt a tibble, usually results from \code{pmc_text}
 4 | #' @param pattern either a regular expression or a vector of words to find in
 5 | #' text
 6 | #' @param column column name, default "text"
 7 | #'
 8 | #' @return a tibble with one row per unique match in each row of \code{txt},
 9 | #' or \code{NULL} if there is no match
10 | #'
11 | #' @note passed to \code{grepl} and \code{str_extract_all}
12 | #'
13 | #' @author Chris Stubben
14 | #'
15 | #' @examples
16 | #' # doc <- pmc_xml("PMC2231364")
17 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
18 | #'         package = "tidypmc"))
19 | #' txt <- pmc_text(doc)
20 | #' separate_text(txt, "[ATCGN]{5,}")
21 | #' separate_text(txt, "\\([A-Z]{3,6}s?\\)")
22 | #' # pattern can be a vector of words
23 | #' separate_text(txt, c("hmu", "ybt", "yfe", "yfu"))
24 | #' # wrappers for separate_text with extra step to expand matched ranges
25 | #' separate_refs(txt)
26 | #' separate_genes(txt)
27 | #' separate_tags(txt, "YPO")
28 | #'
29 | #' @export
30 | 
31 | separate_text <- function(txt, pattern, column = "text") {
32 |   if (!is.data.frame(txt)) {
33 |     stop("txt should be a tibble")
34 |   }
35 |   if (!column %in% names(txt)) {
36 |     stop("column ", column, " is not found")
37 |   }
38 |   ## several words are joined into one alternation with word boundaries
39 |   if (length(pattern) > 1) {
40 |     words <- paste(pattern, collapse = "\\b|\\b")
41 |     pattern <- paste0("\\b", words, "\\b")
42 |   }
43 |   ## keep only the rows that contain the pattern
44 |   hits <- dplyr::filter(txt, grepl(pattern, txt[[column]]))
45 |   if (nrow(hits) == 0) {
46 |     message("No match to ", pattern)
47 |     return(NULL)
48 |   }
49 |   ## unique matches per row, then repeat each row once per match
50 |   found <- lapply(stringr::str_extract_all(hits[[column]], pattern), unique)
51 |   times <- vapply(found, length, integer(1))
52 |   row_id <- rep(seq_len(nrow(hits)), times)
53 |   dplyr::bind_cols(match = unlist(found), hits[row_id, ])
54 | }
55 | 


--------------------------------------------------------------------------------
/R/tidypmc-package.R:
--------------------------------------------------------------------------------
 1 | #' \code{tidypmc} package
 2 | #'
 3 | #' Parse full text XML documents from PubMed Central
 4 | #'
 5 | #' See the Github page for details at \url{https://github.com/ropensci/tidypmc}
 6 | #'
 7 | #' @docType package
 8 | #' @name tidypmc
 9 | #' @importFrom dplyr %>%
10 | #' @keywords internal
11 | NULL
12 | 
13 | #  Register "paragraph" as a global variable (R CMD check workaround) from
14 | #  https://github.com/jennybc/googlesheets/blob/master/R/googlesheets.R
15 | if(getRversion() >= "2.15.1")  utils::globalVariables(c("paragraph"))
16 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | ```{r setup, include = FALSE}
  6 | knitr::opts_chunk$set(
  7 |   collapse = TRUE,
  8 |   comment = "# "
  9 | )
 10 | ```
 11 | 
 12 | [![Build Status](https://travis-ci.org/ropensci/tidypmc.svg?branch=master)](https://travis-ci.org/ropensci/tidypmc)
 13 | [![Coverage status](https://codecov.io/gh/ropensci/tidypmc/branch/master/graph/badge.svg)](https://codecov.io/github/ropensci/tidypmc?branch=master)
 14 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/tidypmc)](https://cran.r-project.org/package=tidypmc)
 15 | [![Downloads](https://cranlogs.r-pkg.org/badges/tidypmc)](https://CRAN.R-project.org/package=tidypmc)
 16 | [![Total Downloads](https://cranlogs.r-pkg.org/badges/grand-total/tidypmc?color=orange)](https://CRAN.R-project.org/package=tidypmc)
 17 | 
 18 | # tidypmc
 19 | 
 20 | The [Open Access subset] of [Pubmed Central] (PMC) includes 2.5 million articles
 21 | from biomedical and life sciences journals.  The full text XML files are freely
 22 | available for text mining from the [REST service] or [FTP site] but can be
 23 | challenging to parse. For example, section tags are nested to arbitrary depths,
 24 | formulas and tables may return incomprehensible text blobs and superscripted
 25 | references are pasted at the end of words.  The functions in the `tidypmc`
 26 | package are intended to return readable text and maintain the document
 27 | structure, so gene names and other terms can be associated with specific
 28 | sections, paragraphs, sentences or table rows.
 29 | 
 30 | 
 31 | ## Installation
 32 | 
 33 | Use [remotes] to install the package.
 34 | 
 35 | ```{r install, eval=FALSE}
 36 | remotes::install_github("ropensci/tidypmc")
 37 | ```
 38 | 
 39 | ## Load XML
 40 | 
 41 | Download a single XML document like [PMC2231364] from the [REST service] using
 42 | the `pmc_xml` function.
 43 | 
 44 | ```{r pmc_xml, message=FALSE, echo=-1}
 45 | options(width=100)
 46 | library(tidypmc)
 47 | library(tidyverse)
 48 | doc <- pmc_xml("PMC2231364")
 49 | doc
 50 | ```
 51 | 
 52 | The [europepmc] package includes additional functions to search PMC
 53 | and download full text.  Be sure to include the `OPEN_ACCESS` field in
 54 | the search since these are the only articles with full text XML available.
 55 | 
 56 | ```{r epmc, echo=-1}
 57 | options(width=100)
 58 | library(europepmc)
 59 | yp <- epmc_search("title:(Yersinia pestis virulence) OPEN_ACCESS:Y")
 60 | select(yp, pmcid, pubYear, title) %>%
 61 |   print(n=5)
 62 | ```
 63 | 
 64 | 
 65 | Save all `r nrow(yp)` results to a list of XML documents using the `epmc_ftxt` or `pmc_xml` function.
 66 | 
 67 | ```{r purrr, eval=FALSE}
 68 | docs <- map(yp$pmcid, epmc_ftxt)
 69 | ```
 70 | 
 71 | 
 72 | See the [PMC FTP vignette] for details on parsing the large XML files on the [FTP site]
 73 | with 10,000 articles each.
 74 | 
 75 | 
 76 | ## Parse XML
 77 | 
 78 | 
 79 | The package includes five functions to parse the `xml_document`.
 80 | 
 81 | 
 82 | |R function     |Description                                                                |
 83 | |:--------------|:--------------------------------------------------------------------------|
 84 | |`pmc_text`     |Split section paragraphs into sentences with full path to subsection titles|
 85 | |`pmc_caption`  |Split figure, table and supplementary material captions into sentences     |
 86 | |`pmc_table`    |Convert table nodes into a list of tibbles                                 |
 87 | |`pmc_reference`|Format references cited into a tibble                                      |
 88 | |`pmc_metadata` |List journal and article metadata in front node                            |
 89 | 
 90 | 
 91 | The `pmc_text` function uses the [tokenizers] package to split section paragraphs into
 92 | sentences.  The function also removes any tables, figures or formulas that are nested
 93 | within paragraph tags, replaces superscripted references with brackets, adds carets and
 94 | underscores to other superscripts and subscripts and includes the full path to the
 95 | subsection title.
 96 | 
 97 | ```{r pmc_text, echo=-1}
 98 | options(width=110)
 99 | txt <- pmc_text(doc)
100 | txt
101 | count(txt, section, sort=TRUE)
102 | ```
103 | 
104 | 
105 | Load the [tidytext] package for further text processing.
106 | 
107 | ```{r tidytext, echo=-1}
108 | options(width=110)
109 | library(tidytext)
110 | x1 <- unnest_tokens(txt, word, text) %>%
111 |   anti_join(stop_words) %>%
112 |   filter(!word %in% 1:100)
113 | filter(x1, str_detect(section, "^Results"))
114 | filter(x1, str_detect(section, "^Results")) %>%
115 |   count(word, sort = TRUE)
116 | ```
117 | 
118 | 
119 | 
120 | The `pmc_table` function formats tables by collapsing multiline headers,
121 | expanding rowspan and colspan attributes and adding subheadings into a new column.
122 | 
123 | ```{r pmc_table, echo=-1}
124 | options(width=110)
125 | tbls <- pmc_table(doc)
126 | map_int(tbls, nrow)
127 | tbls[[1]]
128 | ```
129 | 
130 | Use `collapse_rows` to join column names and cell values in a semi-colon delimited string (and
131 | then search using functions in the next section).
132 | 
133 | ```{r collapserows, echo=-1}
134 | options(width=110)
135 | collapse_rows(tbls, na.string="-")
136 | ```
137 | 
138 | The other three `pmc` functions are described in the package [vignette].
139 | 
140 | 
141 | ## Searching text
142 | 
143 | There are a few functions to search within the `pmc_text` or collapsed
144 | `pmc_table` output.  `separate_text` uses the [stringr] package to extract any
145 | regular expression or vector of words.
146 | 
147 | 
148 | ```{r separate_text, echo=-1}
149 | options(width=110)
150 | separate_text(txt, "[ATCGN]{5,}")
151 | ```
152 | 
153 | A few wrappers search pre-defined patterns and add an extra step to expand
154 | matched ranges. `separate_refs` matches references within brackets or
155 | parentheses using `(\\(|\\[)[0-9, -]+(\\]|\\))` and expands ranges like `[7-9]`.
156 | 
157 | ```{r separate_refs, echo=-1}
158 | options(width=110)
159 | separate_refs(txt)
160 | ```
161 | 
162 | `separate_genes` will find microbial genes like tauD (with a
163 | capitalized 4th letter)  and expand operons like `tauABCD` into
164 | four genes.  `separate_tags` will find and expand locus tag ranges below.
165 | 
166 | 
167 | ```{r locus_tags, echo=-1}
168 | options(width=110)
169 | collapse_rows(tbls, na="-") %>%
170 |   separate_tags("YPO") %>%
171 |   filter(id == "YPO1855")
172 | ```
173 | 
174 | 
175 | See the [vignette] for more details including code to parse
176 | XML documents using the [xml2] package.  The [PMC FTP vignette]
177 | has details on parsing XML files at the Europe PMC [FTP site].
178 | 
179 | 
180 | ### Community Guidelines
181 | 
182 | This project is released with a [Contributor Code of Conduct](CONDUCT.md). By
183 | participating in this project you agree to abide by its terms. Feedback, bug
184 | reports, and feature requests are welcome
185 | [here](https://github.com/ropensci/tidypmc/issues).
186 | 
187 | 
188 | [remotes]: https://github.com/r-lib/remotes
189 | [PMC2231364]: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC2231364/fullTextXML
190 | [Open Access subset]: https://europepmc.org/downloads/openaccess
191 | [REST service]: https://europepmc.org/RestfulWebService
192 | [FTP site]: https://europepmc.org/ftp/oa/
193 | [tidytext]: https://www.tidytextmining.com/
194 | [stringr]: https://stringr.tidyverse.org/
195 | [vignette]: https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md
196 | [PMC FTP vignette]: https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md
197 | [tokenizers]: https://lincolnmullen.com/software/tokenizers/
198 | [xml2]: https://github.com/r-lib/xml2
199 | [europepmc]: https://github.com/ropensci/europepmc
200 | [Pubmed Central]: https://europepmc.org
201 | 


--------------------------------------------------------------------------------
/README.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html xmlns="http://www.w3.org/1999/xhtml">
  4 | 
  5 | <head>
  6 | 
  7 | <meta charset="utf-8">
  8 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  9 | <meta name="generator" content="pandoc" />
 10 | <meta name="viewport" content="width=device-width, initial-scale=1">
 11 | 
 12 | <style type="text/css">
 13 | @font-face {
 14 | font-family: octicons-link;
 15 | src: url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAAAZwABAAAAAACFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEU0lHAAAGaAAAAAgAAAAIAAAAAUdTVUIAAAZcAAAACgAAAAoAAQAAT1MvMgAAAyQAAABJAAAAYFYEU3RjbWFwAAADcAAAAEUAAACAAJThvmN2dCAAAATkAAAABAAAAAQAAAAAZnBnbQAAA7gAAACyAAABCUM+8IhnYXNwAAAGTAAAABAAAAAQABoAI2dseWYAAAFsAAABPAAAAZwcEq9taGVhZAAAAsgAAAA0AAAANgh4a91oaGVhAAADCAAAABoAAAAkCA8DRGhtdHgAAAL8AAAADAAAAAwGAACfbG9jYQAAAsAAAAAIAAAACABiATBtYXhwAAACqAAAABgAAAAgAA8ASm5hbWUAAAToAAABQgAAAlXu73sOcG9zdAAABiwAAAAeAAAAME3QpOBwcmVwAAAEbAAAAHYAAAB/aFGpk3jaTY6xa8JAGMW/O62BDi0tJLYQincXEypYIiGJjSgHniQ6umTsUEyLm5BV6NDBP8Tpts6F0v+k/0an2i+itHDw3v2+9+DBKTzsJNnWJNTgHEy4BgG3EMI9DCEDOGEXzDADU5hBKMIgNPZqoD3SilVaXZCER3/I7AtxEJLtzzuZfI+VVkprxTlXShWKb3TBecG11rwoNlmmn1P2WYcJczl32etSpKnziC7lQyWe1smVPy/Lt7Kc+0vWY/gAgIIEqAN9we0pwKXreiMasxvabDQMM4riO+qxM2ogwDGOZTXxwxDiycQIcoYFBLj5K3EIaSctAq2kTYiw+ymhce7vwM9jSqO8JyVd5RH9gyTt2+J/yUmYlIR0s04n6+7Vm1ozezUeLEaUjhaDSuXHwVRgvLJn1tQ7xiuVv/ocTRF42mNgZGBgYGbwZOBiAAFGJBIMAAizAFoAAABiAGIAznjaY2BkYGAA4in8zwXi+W2+MjCzMIDApSwvXzC97Z4Ig8N/BxYGZgcgl52BCSQKAA3jCV8CAABfAAAAAAQAAEB42mNgZGBg4f3vACQZQABIMjKgAmYAKEgBXgAAeNpjYGY6wTiBgZWBg2kmUxoDA4MPhGZMYzBi1AHygVLYQUCaawqDA4PChxhmh/8ODDEsvAwHgMKMIDnGL0x7gJQCAwMAJd4MFwAAAHjaY2BgYGaA4DAGRgYQkAHyGMF8NgYrIM3JIAGVYYDT+AEjAwuDFpBmA9KMDEwMCh9i/v8H8sH0/4dQc1iAmAkALaUKLgAAAHjaTY9LDsIgEIbtgqHUPpDi3gPoBVyRTmTddOmqTXThEXqrob2gQ1FjwpDvfwCBdmdXC5AVKFu3e5MfNFJ29KTQT48Ob9/lqYwOGZxeUelN2U2R6+cArgtCJpauW7UQBqnFkUsjAY/kOU1cP+DAgvxwn1chZDwUbd6CFimGXwzwF6tPbFIcjEl+vvmM/byA48e6tWrKArm4ZJlCbdsrxksL1AwWn/yBSJKpYbq8AXaaTb8AAHja28jAwOC00ZrBeQNDQOWO//sdBBgYGRiYWYAEELEwMTE4uzo5Zzo5b2BxdnFOcALxNjA6b2ByTswC8jYwg0VlNuoCTWAMqNzMzsoK1rEhNqByEyerg5PMJlYuVueETKcd/89uBpnpvIEVomeHLoMsAAe1Id4AAAAAAAB42oWQT07CQBTGv0JBhagk7HQzKxca2sJCE1hDt4QF+9JOS0nbaaYDCQfwCJ7Au3AHj+LO13FMmm6cl7785vven0kBjHCBhfpYuNa5Ph1c0e2Xu3jEvWG7UdPDLZ4N92nOm+EBXuAbHmIMSRMs+4aUEd4Nd3CHD8NdvOLTsA2GL8M9PODbcL+hD7C1xoaHeLJSEao0FEW14ckxC+TU8TxvsY6X0eLPmRhry2WVioLpkrbp84LLQPGI7c6sOiUzpWIWS5GzlSgUzzLBSikOP
FTOXqly7rqx0Z1Q5BAIoZBSFihQYQOOBEdkCOgXTOHA07HAGjGWiIjaPZNW13/+lm6S9FT7rLHFJ6fQbkATOG1j2OFMucKJJsxIVfQORl+9Jyda6Sl1dUYhSCm1dyClfoeDve4qMYdLEbfqHf3O/AdDumsjAAB42mNgYoAAZQYjBmyAGYQZmdhL8zLdDEydARfoAqIAAAABAAMABwAKABMAB///AA8AAQAAAAAAAAAAAAAAAAABAAAAAA==) format('woff');
 16 | }
 17 | body {
 18 | -webkit-text-size-adjust: 100%;
 19 | text-size-adjust: 100%;
 20 | color: #333;
 21 | font-family: "Helvetica Neue", Helvetica, "Segoe UI", Arial, freesans, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
 22 | font-size: 16px;
 23 | line-height: 1.6;
 24 | word-wrap: break-word;
 25 | }
 26 | a {
 27 | background-color: transparent;
 28 | }
 29 | a:active,
 30 | a:hover {
 31 | outline: 0;
 32 | }
 33 | strong {
 34 | font-weight: bold;
 35 | }
 36 | h1 {
 37 | font-size: 2em;
 38 | margin: 0.67em 0;
 39 | }
 40 | img {
 41 | border: 0;
 42 | }
 43 | hr {
 44 | box-sizing: content-box;
 45 | height: 0;
 46 | }
 47 | pre {
 48 | overflow: auto;
 49 | }
 50 | code,
 51 | kbd,
 52 | pre {
 53 | font-family: monospace, monospace;
 54 | font-size: 1em;
 55 | }
 56 | input {
 57 | color: inherit;
 58 | font: inherit;
 59 | margin: 0;
 60 | }
 61 | html input[disabled] {
 62 | cursor: default;
 63 | }
 64 | input {
 65 | line-height: normal;
 66 | }
 67 | input[type="checkbox"] {
 68 | box-sizing: border-box;
 69 | padding: 0;
 70 | }
 71 | table {
 72 | border-collapse: collapse;
 73 | border-spacing: 0;
 74 | }
 75 | td,
 76 | th {
 77 | padding: 0;
 78 | }
 79 | * {
 80 | box-sizing: border-box;
 81 | }
 82 | input {
 83 | font: 13px / 1.4 Helvetica, arial, nimbussansl, liberationsans, freesans, clean, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
 84 | }
 85 | a {
 86 | color: #4078c0;
 87 | text-decoration: none;
 88 | }
 89 | a:hover,
 90 | a:active {
 91 | text-decoration: underline;
 92 | }
 93 | hr {
 94 | height: 0;
 95 | margin: 15px 0;
 96 | overflow: hidden;
 97 | background: transparent;
 98 | border: 0;
 99 | border-bottom: 1px solid #ddd;
100 | }
101 | hr:before {
102 | display: table;
103 | content: "";
104 | }
105 | hr:after {
106 | display: table;
107 | clear: both;
108 | content: "";
109 | }
110 | h1,
111 | h2,
112 | h3,
113 | h4,
114 | h5,
115 | h6 {
116 | margin-top: 15px;
117 | margin-bottom: 15px;
118 | line-height: 1.1;
119 | }
120 | h1 {
121 | font-size: 30px;
122 | }
123 | h2 {
124 | font-size: 21px;
125 | }
126 | h3 {
127 | font-size: 16px;
128 | }
129 | h4 {
130 | font-size: 14px;
131 | }
132 | h5 {
133 | font-size: 12px;
134 | }
135 | h6 {
136 | font-size: 11px;
137 | }
138 | blockquote {
139 | margin: 0;
140 | }
141 | ul,
142 | ol {
143 | padding: 0;
144 | margin-top: 0;
145 | margin-bottom: 0;
146 | }
147 | ol ol,
148 | ul ol {
149 | list-style-type: lower-roman;
150 | }
151 | ul ul ol,
152 | ul ol ol,
153 | ol ul ol,
154 | ol ol ol {
155 | list-style-type: lower-alpha;
156 | }
157 | dd {
158 | margin-left: 0;
159 | }
160 | code {
161 | font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
162 | font-size: 12px;
163 | }
164 | pre {
165 | margin-top: 0;
166 | margin-bottom: 0;
167 | font: 12px Consolas, "Liberation Mono", Menlo, Courier, monospace;
168 | }
169 | .select::-ms-expand {
170 | opacity: 0;
171 | }
172 | .octicon {
173 | font: normal normal normal 16px/1 octicons-link;
174 | display: inline-block;
175 | text-decoration: none;
176 | text-rendering: auto;
177 | -webkit-font-smoothing: antialiased;
178 | -moz-osx-font-smoothing: grayscale;
179 | -webkit-user-select: none;
180 | -moz-user-select: none;
181 | -ms-user-select: none;
182 | user-select: none;
183 | }
184 | .octicon-link:before {
185 | content: '\f05c';
186 | }
187 | .markdown-body:before {
188 | display: table;
189 | content: "";
190 | }
191 | .markdown-body:after {
192 | display: table;
193 | clear: both;
194 | content: "";
195 | }
196 | .markdown-body>*:first-child {
197 | margin-top: 0 !important;
198 | }
199 | .markdown-body>*:last-child {
200 | margin-bottom: 0 !important;
201 | }
202 | a:not([href]) {
203 | color: inherit;
204 | text-decoration: none;
205 | }
206 | .anchor {
207 | display: inline-block;
208 | padding-right: 2px;
209 | margin-left: -18px;
210 | }
211 | .anchor:focus {
212 | outline: none;
213 | }
214 | h1,
215 | h2,
216 | h3,
217 | h4,
218 | h5,
219 | h6 {
220 | margin-top: 1em;
221 | margin-bottom: 16px;
222 | font-weight: bold;
223 | line-height: 1.4;
224 | }
225 | h1 .octicon-link,
226 | h2 .octicon-link,
227 | h3 .octicon-link,
228 | h4 .octicon-link,
229 | h5 .octicon-link,
230 | h6 .octicon-link {
231 | color: #000;
232 | vertical-align: middle;
233 | visibility: hidden;
234 | }
235 | h1:hover .anchor,
236 | h2:hover .anchor,
237 | h3:hover .anchor,
238 | h4:hover .anchor,
239 | h5:hover .anchor,
240 | h6:hover .anchor {
241 | text-decoration: none;
242 | }
243 | h1:hover .anchor .octicon-link,
244 | h2:hover .anchor .octicon-link,
245 | h3:hover .anchor .octicon-link,
246 | h4:hover .anchor .octicon-link,
247 | h5:hover .anchor .octicon-link,
248 | h6:hover .anchor .octicon-link {
249 | visibility: visible;
250 | }
251 | h1 {
252 | padding-bottom: 0.3em;
253 | font-size: 2.25em;
254 | line-height: 1.2;
255 | border-bottom: 1px solid #eee;
256 | }
257 | h1 .anchor {
258 | line-height: 1;
259 | }
260 | h2 {
261 | padding-bottom: 0.3em;
262 | font-size: 1.75em;
263 | line-height: 1.225;
264 | border-bottom: 1px solid #eee;
265 | }
266 | h2 .anchor {
267 | line-height: 1;
268 | }
269 | h3 {
270 | font-size: 1.5em;
271 | line-height: 1.43;
272 | }
273 | h3 .anchor {
274 | line-height: 1.2;
275 | }
276 | h4 {
277 | font-size: 1.25em;
278 | }
279 | h4 .anchor {
280 | line-height: 1.2;
281 | }
282 | h5 {
283 | font-size: 1em;
284 | }
285 | h5 .anchor {
286 | line-height: 1.1;
287 | }
288 | h6 {
289 | font-size: 1em;
290 | color: #777;
291 | }
292 | h6 .anchor {
293 | line-height: 1.1;
294 | }
295 | p,
296 | blockquote,
297 | ul,
298 | ol,
299 | dl,
300 | table,
301 | pre {
302 | margin-top: 0;
303 | margin-bottom: 16px;
304 | }
305 | hr {
306 | height: 4px;
307 | padding: 0;
308 | margin: 16px 0;
309 | background-color: #e7e7e7;
310 | border: 0 none;
311 | }
312 | ul,
313 | ol {
314 | padding-left: 2em;
315 | }
316 | ul ul,
317 | ul ol,
318 | ol ol,
319 | ol ul {
320 | margin-top: 0;
321 | margin-bottom: 0;
322 | }
323 | li>p {
324 | margin-top: 16px;
325 | }
326 | dl {
327 | padding: 0;
328 | }
329 | dl dt {
330 | padding: 0;
331 | margin-top: 16px;
332 | font-size: 1em;
333 | font-style: italic;
334 | font-weight: bold;
335 | }
336 | dl dd {
337 | padding: 0 16px;
338 | margin-bottom: 16px;
339 | }
340 | blockquote {
341 | padding: 0 15px;
342 | color: #777;
343 | border-left: 4px solid #ddd;
344 | }
345 | blockquote>:first-child {
346 | margin-top: 0;
347 | }
348 | blockquote>:last-child {
349 | margin-bottom: 0;
350 | }
351 | table {
352 | display: block;
353 | width: 100%;
354 | overflow: auto;
355 | word-break: normal;
356 | word-break: keep-all;
357 | }
358 | table th {
359 | font-weight: bold;
360 | }
361 | table th,
362 | table td {
363 | padding: 6px 13px;
364 | border: 1px solid #ddd;
365 | }
366 | table tr {
367 | background-color: #fff;
368 | border-top: 1px solid #ccc;
369 | }
370 | table tr:nth-child(2n) {
371 | background-color: #f8f8f8;
372 | }
373 | img {
374 | max-width: 100%;
375 | box-sizing: content-box;
376 | background-color: #fff;
377 | }
378 | code {
379 | padding: 0;
380 | padding-top: 0.2em;
381 | padding-bottom: 0.2em;
382 | margin: 0;
383 | font-size: 85%;
384 | background-color: rgba(0,0,0,0.04);
385 | border-radius: 3px;
386 | }
387 | code:before,
388 | code:after {
389 | letter-spacing: -0.2em;
390 | content: "\00a0";
391 | }
392 | pre>code {
393 | padding: 0;
394 | margin: 0;
395 | font-size: 100%;
396 | word-break: normal;
397 | white-space: pre;
398 | background: transparent;
399 | border: 0;
400 | }
401 | .highlight {
402 | margin-bottom: 16px;
403 | }
404 | .highlight pre,
405 | pre {
406 | padding: 16px;
407 | overflow: auto;
408 | font-size: 85%;
409 | line-height: 1.45;
410 | background-color: #f7f7f7;
411 | border-radius: 3px;
412 | }
413 | .highlight pre {
414 | margin-bottom: 0;
415 | word-break: normal;
416 | }
417 | pre {
418 | word-wrap: normal;
419 | }
420 | pre code {
421 | display: inline;
422 | max-width: initial;
423 | padding: 0;
424 | margin: 0;
425 | overflow: initial;
426 | line-height: inherit;
427 | word-wrap: normal;
428 | background-color: transparent;
429 | border: 0;
430 | }
431 | pre code:before,
432 | pre code:after {
433 | content: normal;
434 | }
435 | kbd {
436 | display: inline-block;
437 | padding: 3px 5px;
438 | font-size: 11px;
439 | line-height: 10px;
440 | color: #555;
441 | vertical-align: middle;
442 | background-color: #fcfcfc;
443 | border: solid 1px #ccc;
444 | border-bottom-color: #bbb;
445 | border-radius: 3px;
446 | box-shadow: inset 0 -1px 0 #bbb;
447 | }
448 | .pl-c {
449 | color: #969896;
450 | }
451 | .pl-c1,
452 | .pl-s .pl-v {
453 | color: #0086b3;
454 | }
455 | .pl-e,
456 | .pl-en {
457 | color: #795da3;
458 | }
459 | .pl-s .pl-s1,
460 | .pl-smi {
461 | color: #333;
462 | }
463 | .pl-ent {
464 | color: #63a35c;
465 | }
466 | .pl-k {
467 | color: #a71d5d;
468 | }
469 | .pl-pds,
470 | .pl-s,
471 | .pl-s .pl-pse .pl-s1,
472 | .pl-sr,
473 | .pl-sr .pl-cce,
474 | .pl-sr .pl-sra,
475 | .pl-sr .pl-sre {
476 | color: #183691;
477 | }
478 | .pl-v {
479 | color: #ed6a43;
480 | }
481 | .pl-id {
482 | color: #b52a1d;
483 | }
484 | .pl-ii {
485 | background-color: #b52a1d;
486 | color: #f8f8f8;
487 | }
488 | .pl-sr .pl-cce {
489 | color: #63a35c;
490 | font-weight: bold;
491 | }
492 | .pl-ml {
493 | color: #693a17;
494 | }
495 | .pl-mh,
496 | .pl-mh .pl-en,
497 | .pl-ms {
498 | color: #1d3e81;
499 | font-weight: bold;
500 | }
501 | .pl-mq {
502 | color: #008080;
503 | }
504 | .pl-mi {
505 | color: #333;
506 | font-style: italic;
507 | }
508 | .pl-mb {
509 | color: #333;
510 | font-weight: bold;
511 | }
512 | .pl-md {
513 | background-color: #ffecec;
514 | color: #bd2c00;
515 | }
516 | .pl-mi1 {
517 | background-color: #eaffea;
518 | color: #55a532;
519 | }
520 | .pl-mdr {
521 | color: #795da3;
522 | font-weight: bold;
523 | }
524 | .pl-mo {
525 | color: #1d3e81;
526 | }
527 | kbd {
528 | display: inline-block;
529 | padding: 3px 5px;
530 | font: 11px Consolas, "Liberation Mono", Menlo, Courier, monospace;
531 | line-height: 10px;
532 | color: #555;
533 | vertical-align: middle;
534 | background-color: #fcfcfc;
535 | border: solid 1px #ccc;
536 | border-bottom-color: #bbb;
537 | border-radius: 3px;
538 | box-shadow: inset 0 -1px 0 #bbb;
539 | }
540 | .task-list-item {
541 | list-style-type: none;
542 | }
543 | .task-list-item+.task-list-item {
544 | margin-top: 3px;
545 | }
546 | .task-list-item input {
547 | margin: 0 0.35em 0.25em -1.6em;
548 | vertical-align: middle;
549 | }
550 | :checked+.radio-label {
551 | z-index: 1;
552 | position: relative;
553 | border-color: #4078c0;
554 | }
555 | .sourceLine {
556 | display: inline-block;
557 | }
558 | code .kw { color: #000000; }
559 | code .dt { color: #ed6a43; }
560 | code .dv { color: #009999; }
561 | code .bn { color: #009999; }
562 | code .fl { color: #009999; }
563 | code .ch { color: #009999; }
564 | code .st { color: #183691; }
565 | code .co { color: #969896; }
566 | code .ot { color: #0086b3; }
567 | code .al { color: #a61717; }
568 | code .fu { color: #63a35c; }
569 | code .er { color: #a61717; background-color: #e3d2d2; }
570 | code .wa { color: #000000; }
571 | code .cn { color: #008080; }
572 | code .sc { color: #008080; }
573 | code .vs { color: #183691; }
574 | code .ss { color: #183691; }
575 | code .im { color: #000000; }
576 | code .va {color: #008080; }
577 | code .cf { color: #000000; }
578 | code .op { color: #000000; }
579 | code .bu { color: #000000; }
580 | code .ex { color: #000000; }
581 | code .pp { color: #999999; }
582 | code .at { color: #008080; }
583 | code .do { color: #969896; }
584 | code .an { color: #008080; }
585 | code .cv { color: #008080; }
586 | code .in { color: #008080; }
587 | </style>
588 | <style>
589 | body {
590 |   box-sizing: border-box;
591 |   min-width: 200px;
592 |   max-width: 980px;
593 |   margin: 0 auto;
594 |   padding: 45px;
595 |   padding-top: 0px;
596 | }
597 | </style>
598 | 
599 | </head>
600 | 
601 | <body>
602 | 
603 | <p><a href="https://travis-ci.org/ropensci/tidypmc"><img src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI5MCIgaGVpZ2h0PSIyMCI+PGxpbmVhckdyYWRpZW50IGlkPSJhIiB4Mj0iMCIgeTI9IjEwMCUiPjxzdG9wIG9mZnNldD0iMCIgc3RvcC1jb2xvcj0iI2JiYiIgc3RvcC1vcGFjaXR5PSIuMSIvPjxzdG9wIG9mZnNldD0iMSIgc3RvcC1vcGFjaXR5PSIuMSIvPjwvbGluZWFyR3JhZGllbnQ+PHJlY3Qgcng9IjMiIHdpZHRoPSI5MCIgaGVpZ2h0PSIyMCIgZmlsbD0iIzU1NSIvPjxyZWN0IHJ4PSIzIiB4PSIzNyIgd2lkdGg9IjUzIiBoZWlnaHQ9IjIwIiBmaWxsPSIjNGMxIi8+PHBhdGggZmlsbD0iIzRjMSIgZD0iTTM3IDBoNHYyMGgtNHoiLz48cmVjdCByeD0iMyIgd2lkdGg9IjkwIiBoZWlnaHQ9IjIwIiBmaWxsPSJ1cmwoI2EpIi8+PGcgZmlsbD0iI2ZmZiIgdGV4dC1hbmNob3I9Im1pZGRsZSIgZm9udC1mYW1pbHk9IkRlamFWdSBTYW5zLFZlcmRhbmEsR2VuZXZhLHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEiPjx0ZXh0IHg9IjE5LjUiIHk9IjE1IiBmaWxsPSIjMDEwMTAxIiBmaWxsLW9wYWNpdHk9Ii4zIj5idWlsZDwvdGV4dD48dGV4dCB4PSIxOS41IiB5PSIxNCI+YnVpbGQ8L3RleHQ+PHRleHQgeD0iNjIuNSIgeT0iMTUiIGZpbGw9IiMwMTAxMDEiIGZpbGwtb3BhY2l0eT0iLjMiPnBhc3Npbmc8L3RleHQ+PHRleHQgeD0iNjIuNSIgeT0iMTQiPnBhc3Npbmc8L3RleHQ+PC9nPjwvc3ZnPg==" alt="Build Status" /></a> <a href="https://codecov.io/github/ropensci/tidypmc?branch=master"><img 
src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMTIiIGhlaWdodD0iMjAiPgogICAgPGxpbmVhckdyYWRpZW50IGlkPSJiIiB4Mj0iMCIgeTI9IjEwMCUiPgogICAgICAgIDxzdG9wIG9mZnNldD0iMCIgc3RvcC1jb2xvcj0iI2JiYiIgc3RvcC1vcGFjaXR5PSIuMSIgLz4KICAgICAgICA8c3RvcCBvZmZzZXQ9IjEiIHN0b3Atb3BhY2l0eT0iLjEiIC8+CiAgICA8L2xpbmVhckdyYWRpZW50PgogICAgPG1hc2sgaWQ9ImEiPgogICAgICAgIDxyZWN0IHdpZHRoPSIxMTIiIGhlaWdodD0iMjAiIHJ4PSIzIiBmaWxsPSIjZmZmIiAvPgogICAgPC9tYXNrPgogICAgPGcgbWFzaz0idXJsKCNhKSI+CiAgICAgICAgPHBhdGggZmlsbD0iIzU1NSIgZD0iTTAgMGg3NnYyMEgweiIgLz4KICAgICAgICA8cGF0aCBmaWxsPSIjZjg4ZTI5IiBkPSJNNzYgMGgzNnYyMEg3NnoiIC8+CiAgICAgICAgPHBhdGggZmlsbD0idXJsKCNiKSIgZD0iTTAgMGgxMTJ2MjBIMHoiIC8+CiAgICA8L2c+CiAgICA8ZyBmaWxsPSIjZmZmIiB0ZXh0LWFuY2hvcj0ibWlkZGxlIiBmb250LWZhbWlseT0iRGVqYVZ1IFNhbnMsVmVyZGFuYSxHZW5ldmEsc2Fucy1zZXJpZiIgZm9udC1zaXplPSIxMSI+CiAgICAgICAgPHRleHQgeD0iNDYiIHk9IjE1IiBmaWxsPSIjMDEwMTAxIiBmaWxsLW9wYWNpdHk9Ii4zIj5jb2RlY292PC90ZXh0PgogICAgICAgIDx0ZXh0IHg9IjQ2IiB5PSIxNCI+Y29kZWNvdjwvdGV4dD4KICAgICAgICA8dGV4dCB4PSI5MyIgeT0iMTUiIGZpbGw9IiMwMTAxMDEiIGZpbGwtb3BhY2l0eT0iLjMiPjc4JTwvdGV4dD4KICAgICAgICA8dGV4dCB4PSI5MyIgeT0iMTQiPjc4JTwvdGV4dD4KICAgIDwvZz4KICAgIDxzdmcgdmlld0JveD0iMTIwIC04IDYwIDYwIj4KICAgICAgICA8cGF0aCBkPSJNMjMuMDEzIDBDMTAuMzMzLjAwOS4wMSAxMC4yMiAwIDIyLjc2MnYuMDU4bDMuOTE0IDIuMjc1LjA1My0uMDM2YTExLjI5MSAxMS4yOTEgMCAwIDEgOC4zNTItMS43NjcgMTAuOTExIDEwLjkxMSAwIDAgMSA1LjUgMi43MjZsLjY3My42MjQuMzgtLjgyOGMuMzY4LS44MDIuNzkzLTEuNTU2IDEuMjY0LTIuMjQuMTktLjI3Ni4zOTgtLjU1NC42MzctLjg1MWwuMzkzLS40OS0uNDg0LS40MDRhMTYuMDggMTYuMDggMCAwIDAtNy40NTMtMy40NjYgMTYuNDgyIDE2LjQ4MiAwIDAgMC03LjcwNS40NDlDNy4zODYgMTAuNjgzIDE0LjU2IDUuMDE2IDIzLjAzIDUuMDFjNC43NzkgMCA5LjI3MiAxLjg0IDEyLjY1MSA1LjE4IDIuNDEgMi4zODIgNC4wNjkgNS4zNSA0LjgwNyA4LjU5MWExNi41MyAxNi41MyAwIDAgMC00Ljc5Mi0uNzIzbC0uMjkyLS4wMDJhMTYuNzA3IDE2LjcwNyAwIDAgMC0xLjkwMi4xNGwtLjA4LjAxMmMtLjI4LjAzNy0uNTI0LjA3NC0uNzQ4LjExNS0uMTEuMDE5LS4yMTguMDQxLS4zMjcuMDYzLS4yNTcuMDUyLS41MS4xMDgtLjc1LjE2OWwtLjI2NS4wNjdhMTYuMzkgMTYuMzkgMCAwIDAtL
jkyNi4yNzZsLS4wNTYuMDE4Yy0uNjgyLjIzLTEuMzYuNTExLTIuMDE2LjgzOGwtLjA1Mi4wMjZjLS4yOS4xNDUtLjU4NC4zMDUtLjg5OS40OWwtLjA2OS4wNGExNS41OTYgMTUuNTk2IDAgMCAwLTQuMDYxIDMuNDY2bC0uMTQ1LjE3NWMtLjI5LjM2LS41MjEuNjY2LS43MjMuOTYtLjE3LjI0Ny0uMzQuNTEzLS41NTIuODY0bC0uMTE2LjE5OWMtLjE3LjI5Mi0uMzIuNTctLjQ0OS44MjRsLS4wMy4wNTdhMTYuMTE2IDE2LjExNiAwIDAgMC0uODQzIDIuMDI5bC0uMDM0LjEwMmExNS42NSAxNS42NSAwIDAgMC0uNzg2IDUuMTc0bC4wMDMuMjE0YTIxLjUyMyAyMS41MjMgMCAwIDAgLjA0Ljc1NGMuMDA5LjExOS4wMi4yMzcuMDMyLjM1NS4wMTQuMTQ1LjAzMi4yOS4wNDkuNDMybC4wMS4wOGMuMDEuMDY3LjAxNy4xMzMuMDI2LjE5Ny4wMzQuMjQyLjA3NC40OC4xMTkuNzIuNDYzIDIuNDE5IDEuNjIgNC44MzYgMy4zNDUgNi45OWwuMDc4LjA5OC4wOC0uMDk1Yy42ODgtLjgxIDIuMzk1LTMuMzggMi41MzktNC45MjJsLjAwMy0uMDI5LS4wMTQtLjAyNWExMC43MjcgMTAuNzI3IDAgMCAxLTEuMjI2LTQuOTU2YzAtNS43NiA0LjU0NS0xMC41NDQgMTAuMzQzLTEwLjg5bC4zODEtLjAxNGExMS40MDMgMTEuNDAzIDAgMCAxIDYuNjUxIDEuOTU3bC4wNTQuMDM2IDMuODYyLTIuMjM3LjA1LS4wM3YtLjA1NmMuMDA2LTYuMDgtMi4zODQtMTEuNzkzLTYuNzI5LTE2LjA4OUMzNC45MzIgMi4zNjEgMjkuMTYgMCAyMy4wMTMgMCIgZmlsbD0iI0YwMUY3QSIgZmlsbC1ydWxlPSJldmVub2RkIi8+CiAgICA8L3N2Zz4KPC9zdmc+" alt="Coverage status" /></a> <a href="https://cran.r-project.org/package=tidypmc"><img src="data:image/svg+xml; 
charset=utf-8;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSI3NiIgaGVpZ2h0PSIyMCI+CiAgPGxpbmVhckdyYWRpZW50IGlkPSJiIiB4Mj0iMCIgeTI9IjEwMCUiPgogICAgPHN0b3Agb2Zmc2V0PSIwIiBzdG9wLWNvbG9yPSIjYmJiIiBzdG9wLW9wYWNpdHk9Ii4xIi8+CiAgICA8c3RvcCBvZmZzZXQ9IjEiIHN0b3Atb3BhY2l0eT0iLjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogIDxtYXNrIGlkPSJhIj4KICAgIDxyZWN0IHdpZHRoPSI3NiIgaGVpZ2h0PSIyMCIgcng9IjMiIGZpbGw9IiNmZmYiLz4KICA8L21hc2s+CiAgPGcgbWFzaz0idXJsKCNhKSI+CiAgICA8cGF0aCBmaWxsPSIjNTU1IiBkPSJNMCAwaDQzdjIwSDB6Ii8+CiAgICA8cGF0aCBmaWxsPSIjNGMxIiBkPSJNNDMgMGg1Mi41djIwSDQzeiIvPgogICAgPHBhdGggZmlsbD0idXJsKCNiKSIgZD0iTTAgMGg3NnYyMEgweiIvPgogIDwvZz4KICA8ZyBmaWxsPSIjZmZmIiB0ZXh0LWFuY2hvcj0ibWlkZGxlIgogICAgIGZvbnQtZmFtaWx5PSJEZWphVnUgU2FucyxWZXJkYW5hLEdlbmV2YSxzYW5zLXNlcmlmIiBmb250LXNpemU9IjExIj4KICAgIDx0ZXh0IHg9IjIxLjUiIHk9IjE1IiBmaWxsPSIjMDEwMTAxIiBmaWxsLW9wYWNpdHk9Ii4zIj4KICAgICAgQ1JBTgogICAgPC90ZXh0PgogICAgPHRleHQgeD0iMjEuNSIgeT0iMTQiPgogICAgICBDUkFOCiAgICA8L3RleHQ+CiAgICA8dGV4dCB4PSI1OC41IiB5PSIxNSIgZmlsbD0iIzAxMDEwMSIgZmlsbC1vcGFjaXR5PSIuMyI+CiAgICAgIDEuNwogICAgPC90ZXh0PgogICAgPHRleHQgeD0iNTguNSIgeT0iMTQiPgogICAgICAxLjcKICAgIDwvdGV4dD4KICA8L2c+Cjwvc3ZnPg==" alt="CRAN_Status_Badge" /></a> <a href="https://CRAN.R-project.org/package=tidypmc"><img src="data:image/svg+xml; 
charset=utf-8;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxNDkiIGhlaWdodD0iMjAiPgogIDxsaW5lYXJHcmFkaWVudCBpZD0iYiIgeDI9IjAiIHkyPSIxMDAlIj4KICAgIDxzdG9wIG9mZnNldD0iMCIgc3RvcC1jb2xvcj0iI2JiYiIgc3RvcC1vcGFjaXR5PSIuMSIvPgogICAgPHN0b3Agb2Zmc2V0PSIxIiBzdG9wLW9wYWNpdHk9Ii4xIi8+CiAgPC9saW5lYXJHcmFkaWVudD4KICA8bWFzayBpZD0iYSI+CiAgICA8cmVjdCB3aWR0aD0iMTQ5IiBoZWlnaHQ9IjIwIiByeD0iMyIgZmlsbD0iI2ZmZiIvPgogIDwvbWFzaz4KICA8ZyBtYXNrPSJ1cmwoI2EpIj48cGF0aCBmaWxsPSIjNTU1IiBkPSJNMCAwaDcwdjIwSDB6Ii8+CiAgICA8cGF0aCBmaWxsPSIjMDA3ZWM2IiBkPSJNNzAgMGg3OXYyMEg3MHoiLz4KICAgIDxwYXRoIGZpbGw9InVybCgjYikiIGQ9Ik0wIDBoMTQ5djIwSDB6Ii8+CiAgPC9nPgogIDxnIGZpbGw9IiNmZmYiIHRleHQtYW5jaG9yPSJtaWRkbGUiCiAgICAgZm9udC1mYW1pbHk9IkRlamFWdSBTYW5zLFZlcmRhbmEsR2VuZXZhLHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEiPgogICAgPHRleHQgeD0iMzYiIHk9IjE1IiBmaWxsPSIjMDEwMTAxIiBmaWxsLW9wYWNpdHk9Ii4zIj4KICAgICAgZG93bmxvYWRzCiAgICA8L3RleHQ+CiAgICA8dGV4dCB4PSIzNiIgeT0iMTQiPgogICAgICBkb3dubG9hZHMKICAgIDwvdGV4dD4KICAgIDx0ZXh0IHg9IjEwOC41IiB5PSIxNSIgZmlsbD0iIzAxMDEwMSIgZmlsbC1vcGFjaXR5PSIuMyI+CiAgICAgIDI0L21vbnRoCiAgICA8L3RleHQ+CiAgICA8dGV4dCB4PSIxMDguNSIgeT0iMTQiPgogICAgICAyNC9tb250aAogICAgPC90ZXh0PgogIDwvZz4KPC9zdmc+" alt="Downloads" /></a> <a href="https://CRAN.R-project.org/package=tidypmc"><img src="data:image/svg+xml; 
charset=utf-8;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMDgiIGhlaWdodD0iMjAiPgogIDxsaW5lYXJHcmFkaWVudCBpZD0iYiIgeDI9IjAiIHkyPSIxMDAlIj4KICAgIDxzdG9wIG9mZnNldD0iMCIgc3RvcC1jb2xvcj0iI2JiYiIgc3RvcC1vcGFjaXR5PSIuMSIvPgogICAgPHN0b3Agb2Zmc2V0PSIxIiBzdG9wLW9wYWNpdHk9Ii4xIi8+CiAgPC9saW5lYXJHcmFkaWVudD4KICA8bWFzayBpZD0iYSI+CiAgICA8cmVjdCB3aWR0aD0iMTA4IiBoZWlnaHQ9IjIwIiByeD0iMyIgZmlsbD0iI2ZmZiIvPgogIDwvbWFzaz4KICA8ZyBtYXNrPSJ1cmwoI2EpIj48cGF0aCBmaWxsPSIjNTU1IiBkPSJNMCAwaDcwdjIwSDB6Ii8+CiAgICA8cGF0aCBmaWxsPSIjZmU3ZDM3IiBkPSJNNzAgMGgzOHYyMEg3MHoiLz4KICAgIDxwYXRoIGZpbGw9InVybCgjYikiIGQ9Ik0wIDBoMTA4djIwSDB6Ii8+CiAgPC9nPgogIDxnIGZpbGw9IiNmZmYiIHRleHQtYW5jaG9yPSJtaWRkbGUiCiAgICAgZm9udC1mYW1pbHk9IkRlamFWdSBTYW5zLFZlcmRhbmEsR2VuZXZhLHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEiPgogICAgPHRleHQgeD0iMzYiIHk9IjE1IiBmaWxsPSIjMDEwMTAxIiBmaWxsLW9wYWNpdHk9Ii4zIj4KICAgICAgZG93bmxvYWRzCiAgICA8L3RleHQ+CiAgICA8dGV4dCB4PSIzNiIgeT0iMTQiPgogICAgICBkb3dubG9hZHMKICAgIDwvdGV4dD4KICAgIDx0ZXh0IHg9Ijg4IiB5PSIxNSIgZmlsbD0iIzAxMDEwMSIgZmlsbC1vcGFjaXR5PSIuMyI+CiAgICAgIDI0CiAgICA8L3RleHQ+CiAgICA8dGV4dCB4PSI4OCIgeT0iMTQiPgogICAgICAyNAogICAgPC90ZXh0PgogIDwvZz4KPC9zdmc+" alt="Total Downloads" /></a></p>
604 | <h1 id="tidypmc">tidypmc</h1>
605 | <p>The <a href="https://europepmc.org/downloads/openaccess">Open Access subset</a> of <a href="https://europepmc.org">PubMed Central</a> (PMC) includes 2.5 million articles from biomedical and life sciences journals. The full text XML files are freely available for text mining from the <a href="https://europepmc.org/RestfulWebService">REST service</a> or <a href="https://europepmc.org/ftp/oa/">FTP site</a> but can be challenging to parse. For example, section tags are nested to arbitrary depths, formulas and tables may return incomprehensible text blobs and superscripted references are pasted at the end of words. The functions in the <code>tidypmc</code> package are intended to return readable text and maintain the document structure, so gene names and other terms can be associated with specific sections, paragraphs, sentences or table rows.</p>
606 | <h2 id="installation">Installation</h2>
607 | <p>Use <a href="https://github.com/r-lib/remotes">remotes</a> to install the package.</p>
608 | <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb1-1" data-line-number="1">remotes<span class="op">::</span><span class="kw">install_github</span>(<span class="st">&quot;ropensci/tidypmc&quot;</span>)</a></code></pre></div>
609 | <h2 id="load-xml">Load XML</h2>
610 | <p>Download a single XML document like <a href="https://www.ebi.ac.uk/europepmc/webservices/rest/PMC2231364/fullTextXML">PMC2231364</a> from the <a href="https://europepmc.org/RestfulWebService">REST service</a> using the <code>pmc_xml</code> function.</p>
611 | <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb2-1" data-line-number="1"><span class="kw">library</span>(tidypmc)</a>
612 | <a class="sourceLine" id="cb2-2" data-line-number="2"><span class="kw">library</span>(tidyverse)</a>
613 | <a class="sourceLine" id="cb2-3" data-line-number="3">doc &lt;-<span class="st"> </span><span class="kw">pmc_xml</span>(<span class="st">&quot;PMC2231364&quot;</span>)</a>
614 | <a class="sourceLine" id="cb2-4" data-line-number="4">doc</a>
615 | <a class="sourceLine" id="cb2-5" data-line-number="5"><span class="co">#  {xml_document}</span></a>
616 | <a class="sourceLine" id="cb2-6" data-line-number="6"><span class="co">#  &lt;article article-type=&quot;research-article&quot; xmlns:xlink=&quot;http://www.w3.org/1999/xlink&quot;&gt;</span></a>
617 | <a class="sourceLine" id="cb2-7" data-line-number="7"><span class="co">#  [1] &lt;front&gt;\n  &lt;journal-meta&gt;\n    &lt;journal-id journal-id-type=&quot;nlm-ta&quot;&gt;BMC Microbiol&lt;/journal-id ...</span></a>
618 | <a class="sourceLine" id="cb2-8" data-line-number="8"><span class="co">#  [2] &lt;body&gt;\n  &lt;sec&gt;\n    &lt;title&gt;Background&lt;/title&gt;\n    &lt;p&gt;&lt;italic&gt;Yersinia pestis &lt;/italic&gt;is th ...</span></a>
619 | <a class="sourceLine" id="cb2-9" data-line-number="9"><span class="co">#  [3] &lt;back&gt;\n  &lt;ack&gt;\n    &lt;sec&gt;\n      &lt;title&gt;Acknowledgements&lt;/title&gt;\n      &lt;p&gt;We thank Dr. Chen ...</span></a></code></pre></div>
620 | <p>The <a href="https://github.com/ropensci/europepmc">europepmc</a> package includes additional functions to search PMC and download full text. Be sure to include the <code>OPEN_ACCESS</code> field in the search, since open access articles are the only ones with full text XML available.</p>
621 | <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb3-1" data-line-number="1"><span class="kw">library</span>(europepmc)</a>
622 | <a class="sourceLine" id="cb3-2" data-line-number="2">yp &lt;-<span class="st"> </span><span class="kw">epmc_search</span>(<span class="st">&quot;title:(Yersinia pestis virulence) OPEN_ACCESS:Y&quot;</span>)</a>
623 | <a class="sourceLine" id="cb3-3" data-line-number="3"><span class="co">#  19 records found, returning 19</span></a>
624 | <a class="sourceLine" id="cb3-4" data-line-number="4"><span class="kw">select</span>(yp, pmcid, pubYear, title) <span class="op">%&gt;%</span></a>
625 | <a class="sourceLine" id="cb3-5" data-line-number="5"><span class="st">  </span><span class="kw">print</span>(<span class="dt">n=</span><span class="dv">5</span>)</a>
626 | <a class="sourceLine" id="cb3-6" data-line-number="6"><span class="co">#  # A tibble: 19 x 3</span></a>
627 | <a class="sourceLine" id="cb3-7" data-line-number="7"><span class="co">#    pmcid      pubYear title                                                                          </span></a>
628 | <a class="sourceLine" id="cb3-8" data-line-number="8"><span class="co">#    &lt;chr&gt;      &lt;chr&gt;   &lt;chr&gt;                                                                          </span></a>
629 | <a class="sourceLine" id="cb3-9" data-line-number="9"><span class="co">#  1 PMC5505154 2017    Crystal structure of Yersinia pestis virulence factor YfeA reveals two polyspe…</span></a>
630 | <a class="sourceLine" id="cb3-10" data-line-number="10"><span class="co">#  2 PMC3521224 2012    Omics strategies for revealing Yersinia pestis virulence.                      </span></a>
631 | <a class="sourceLine" id="cb3-11" data-line-number="11"><span class="co">#  3 PMC2704395 2009    Involvement of the post-transcriptional regulator Hfq in Yersinia pestis virul…</span></a>
632 | <a class="sourceLine" id="cb3-12" data-line-number="12"><span class="co">#  4 PMC2736372 2009    The NlpD lipoprotein is a novel Yersinia pestis virulence factor essential for…</span></a>
633 | <a class="sourceLine" id="cb3-13" data-line-number="13"><span class="co">#  5 PMC3109262 2011    A comprehensive study on the role of the Yersinia pestis virulence markers in …</span></a>
634 | <a class="sourceLine" id="cb3-14" data-line-number="14"><span class="co">#  # … with 14 more rows</span></a></code></pre></div>
635 | <p>Save all 19 results to a list of XML documents using the <code>epmc_ftxt</code> or <code>pmc_xml</code> function.</p>
636 | <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb4-1" data-line-number="1">docs &lt;-<span class="st"> </span><span class="kw">map</span>(yp<span class="op">$</span>pmcid, epmc_ftxt)</a></code></pre></div>
637 | <p>See the <a href="https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md">PMC FTP vignette</a> for details on parsing the large XML files on the <a href="https://europepmc.org/ftp/oa/">FTP site</a> with 10,000 articles each.</p>
638 | <h2 id="parse-xml">Parse XML</h2>
639 | <p>The package includes five functions to parse the <code>xml_document</code>.</p>
640 | <table>
641 | <thead>
642 | <tr class="header">
643 | <th align="left">R function</th>
644 | <th align="left">Description</th>
645 | </tr>
646 | </thead>
647 | <tbody>
648 | <tr class="odd">
649 | <td align="left"><code>pmc_text</code></td>
650 | <td align="left">Split section paragraphs into sentences with full path to subsection titles</td>
651 | </tr>
652 | <tr class="even">
653 | <td align="left"><code>pmc_caption</code></td>
654 | <td align="left">Split figure, table and supplementary material captions into sentences</td>
655 | </tr>
656 | <tr class="odd">
657 | <td align="left"><code>pmc_table</code></td>
658 | <td align="left">Convert table nodes into a list of tibbles</td>
659 | </tr>
660 | <tr class="even">
661 | <td align="left"><code>pmc_reference</code></td>
662 | <td align="left">Format references cited into a tibble</td>
663 | </tr>
664 | <tr class="odd">
665 | <td align="left"><code>pmc_metadata</code></td>
666 | <td align="left">List journal and article metadata in front node</td>
667 | </tr>
668 | </tbody>
669 | </table>
670 | <p>The <code>pmc_text</code> function uses the <a href="https://lincolnmullen.com/software/tokenizers/">tokenizers</a> package to split section paragraphs into sentences. The function also removes any tables, figures or formulas that are nested within paragraph tags, replaces superscripted references with brackets, adds carets and underscores to other superscripts and subscripts and includes the full path to the subsection title.</p>
671 | <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb5-1" data-line-number="1">txt &lt;-<span class="st"> </span><span class="kw">pmc_text</span>(doc)</a>
672 | <a class="sourceLine" id="cb5-2" data-line-number="2"><span class="co">#  Note: removing disp-formula nested in sec/p tag</span></a>
673 | <a class="sourceLine" id="cb5-3" data-line-number="3">txt</a>
674 | <a class="sourceLine" id="cb5-4" data-line-number="4"><span class="co">#  # A tibble: 194 x 4</span></a>
675 | <a class="sourceLine" id="cb5-5" data-line-number="5"><span class="co">#     section    paragraph sentence text                                                                         </span></a>
676 | <a class="sourceLine" id="cb5-6" data-line-number="6"><span class="co">#     &lt;chr&gt;          &lt;int&gt;    &lt;int&gt; &lt;chr&gt;                                                                        </span></a>
677 | <a class="sourceLine" id="cb5-7" data-line-number="7"><span class="co">#   1 Title              1        1 Comparative transcriptomics in Yersinia pestis: a global view of environment…</span></a>
678 | <a class="sourceLine" id="cb5-8" data-line-number="8"><span class="co">#   2 Abstract           1        1 Environmental modulation of gene expression in Yersinia pestis is critical f…</span></a>
679 | <a class="sourceLine" id="cb5-9" data-line-number="9"><span class="co">#   3 Abstract           1        2 Using cDNA microarray technology, we have analyzed the global gene expressio…</span></a>
680 | <a class="sourceLine" id="cb5-10" data-line-number="10"><span class="co">#   4 Abstract           2        1 To provide us with a comprehensive view of environmental modulation of globa…</span></a>
681 | <a class="sourceLine" id="cb5-11" data-line-number="11"><span class="co">#   5 Abstract           2        2 Almost all known virulence genes of Y. pestis were differentially regulated …</span></a>
682 | <a class="sourceLine" id="cb5-12" data-line-number="12"><span class="co">#   6 Abstract           2        3 Clustering enabled us to functionally classify co-expressed genes, including…</span></a>
683 | <a class="sourceLine" id="cb5-13" data-line-number="13"><span class="co">#   7 Abstract           2        4 Collections of operons were predicted from the microarray data, and some of …</span></a>
684 | <a class="sourceLine" id="cb5-14" data-line-number="14"><span class="co">#   8 Abstract           2        5 Several regulatory DNA motifs, probably recognized by the regulatory protein…</span></a>
685 | <a class="sourceLine" id="cb5-15" data-line-number="15"><span class="co">#   9 Abstract           3        1 The comparative transcriptomics analysis we present here not only benefits o…</span></a>
686 | <a class="sourceLine" id="cb5-16" data-line-number="16"><span class="co">#  10 Background         1        1 Yersinia pestis is the etiological agent of plague, alternatively growing in…</span></a>
687 | <a class="sourceLine" id="cb5-17" data-line-number="17"><span class="co">#  # … with 184 more rows</span></a>
688 | <a class="sourceLine" id="cb5-18" data-line-number="18"><span class="kw">count</span>(txt, section, <span class="dt">sort=</span><span class="ot">TRUE</span>)</a>
689 | <a class="sourceLine" id="cb5-19" data-line-number="19"><span class="co">#  # A tibble: 21 x 2</span></a>
690 | <a class="sourceLine" id="cb5-20" data-line-number="20"><span class="co">#     section                                                                                                   n</span></a>
691 | <a class="sourceLine" id="cb5-21" data-line-number="21"><span class="co">#     &lt;chr&gt;                                                                                                 &lt;int&gt;</span></a>
692 | <a class="sourceLine" id="cb5-22" data-line-number="22"><span class="co">#   1 Results and Discussion; Clustering analysis and functional classification of co-expressed gene clust…    22</span></a>
693 | <a class="sourceLine" id="cb5-23" data-line-number="23"><span class="co">#   2 Background                                                                                               20</span></a>
694 | <a class="sourceLine" id="cb5-24" data-line-number="24"><span class="co">#   3 Results and Discussion; Virulence genes in response to multiple environmental stresses                   20</span></a>
695 | <a class="sourceLine" id="cb5-25" data-line-number="25"><span class="co">#   4 Methods; Collection of microarray expression data                                                        17</span></a>
696 | <a class="sourceLine" id="cb5-26" data-line-number="26"><span class="co">#   5 Results and Discussion; Computational discovery of regulatory DNA motifs                                 16</span></a>
697 | <a class="sourceLine" id="cb5-27" data-line-number="27"><span class="co">#   6 Methods; Gel mobility shift analysis of Fur binding                                                      13</span></a>
698 | <a class="sourceLine" id="cb5-28" data-line-number="28"><span class="co">#   7 Results and Discussion; Verification of predicted operons by RT-PCR                                      10</span></a>
699 | <a class="sourceLine" id="cb5-29" data-line-number="29"><span class="co">#   8 Abstract                                                                                                  8</span></a>
700 | <a class="sourceLine" id="cb5-30" data-line-number="30"><span class="co">#   9 Methods; Discovery of regulatory DNA motifs                                                               8</span></a>
701 | <a class="sourceLine" id="cb5-31" data-line-number="31"><span class="co">#  10 Methods; Clustering analysis                                                                              7</span></a>
702 | <a class="sourceLine" id="cb5-32" data-line-number="32"><span class="co">#  # … with 11 more rows</span></a></code></pre></div>
703 | <p>Load the <a href="https://www.tidytextmining.com/">tidytext</a> package for further text processing.</p>
704 | <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb6-1" data-line-number="1"><span class="kw">library</span>(tidytext)</a>
705 | <a class="sourceLine" id="cb6-2" data-line-number="2">x1 &lt;-<span class="st"> </span><span class="kw">unnest_tokens</span>(txt, word, text) <span class="op">%&gt;%</span></a>
706 | <a class="sourceLine" id="cb6-3" data-line-number="3"><span class="st">  </span><span class="kw">anti_join</span>(stop_words) <span class="op">%&gt;%</span></a>
707 | <a class="sourceLine" id="cb6-4" data-line-number="4"><span class="st">  </span><span class="kw">filter</span>(<span class="op">!</span>word <span class="op">%in%</span><span class="st"> </span><span class="dv">1</span><span class="op">:</span><span class="dv">100</span>)</a>
708 | <a class="sourceLine" id="cb6-5" data-line-number="5"><span class="co">#  Joining, by = &quot;word&quot;</span></a>
709 | <a class="sourceLine" id="cb6-6" data-line-number="6"><span class="kw">filter</span>(x1, <span class="kw">str_detect</span>(section, <span class="st">&quot;^Results&quot;</span>))</a>
710 | <a class="sourceLine" id="cb6-7" data-line-number="7"><span class="co">#  # A tibble: 1,269 x 4</span></a>
711 | <a class="sourceLine" id="cb6-8" data-line-number="8"><span class="co">#     section                paragraph sentence word         </span></a>
712 | <a class="sourceLine" id="cb6-9" data-line-number="9"><span class="co">#     &lt;chr&gt;                      &lt;int&gt;    &lt;int&gt; &lt;chr&gt;        </span></a>
713 | <a class="sourceLine" id="cb6-10" data-line-number="10"><span class="co">#   1 Results and Discussion         1        1 comprehensive</span></a>
714 | <a class="sourceLine" id="cb6-11" data-line-number="11"><span class="co">#   2 Results and Discussion         1        1 analysis     </span></a>
715 | <a class="sourceLine" id="cb6-12" data-line-number="12"><span class="co">#   3 Results and Discussion         1        1 sets         </span></a>
716 | <a class="sourceLine" id="cb6-13" data-line-number="13"><span class="co">#   4 Results and Discussion         1        1 microarray   </span></a>
717 | <a class="sourceLine" id="cb6-14" data-line-number="14"><span class="co">#   5 Results and Discussion         1        1 expression   </span></a>
718 | <a class="sourceLine" id="cb6-15" data-line-number="15"><span class="co">#   6 Results and Discussion         1        1 data         </span></a>
719 | <a class="sourceLine" id="cb6-16" data-line-number="16"><span class="co">#   7 Results and Discussion         1        1 dissect      </span></a>
720 | <a class="sourceLine" id="cb6-17" data-line-number="17"><span class="co">#   8 Results and Discussion         1        1 bacterial    </span></a>
721 | <a class="sourceLine" id="cb6-18" data-line-number="18"><span class="co">#   9 Results and Discussion         1        1 adaptation   </span></a>
722 | <a class="sourceLine" id="cb6-19" data-line-number="19"><span class="co">#  10 Results and Discussion         1        1 environments </span></a>
723 | <a class="sourceLine" id="cb6-20" data-line-number="20"><span class="co">#  # … with 1,259 more rows</span></a>
724 | <a class="sourceLine" id="cb6-21" data-line-number="21"><span class="kw">filter</span>(x1, <span class="kw">str_detect</span>(section, <span class="st">&quot;^Results&quot;</span>)) <span class="op">%&gt;%</span></a>
725 | <a class="sourceLine" id="cb6-22" data-line-number="22"><span class="st">  </span><span class="kw">count</span>(word, <span class="dt">sort =</span> <span class="ot">TRUE</span>)</a>
726 | <a class="sourceLine" id="cb6-23" data-line-number="23"><span class="co">#  # A tibble: 595 x 2</span></a>
727 | <a class="sourceLine" id="cb6-24" data-line-number="24"><span class="co">#     word           n</span></a>
728 | <a class="sourceLine" id="cb6-25" data-line-number="25"><span class="co">#     &lt;chr&gt;      &lt;int&gt;</span></a>
729 | <a class="sourceLine" id="cb6-26" data-line-number="26"><span class="co">#   1 genes         45</span></a>
730 | <a class="sourceLine" id="cb6-27" data-line-number="27"><span class="co">#   2 cluster       24</span></a>
731 | <a class="sourceLine" id="cb6-28" data-line-number="28"><span class="co">#   3 expression    21</span></a>
732 | <a class="sourceLine" id="cb6-29" data-line-number="29"><span class="co">#   4 pestis        21</span></a>
733 | <a class="sourceLine" id="cb6-30" data-line-number="30"><span class="co">#   5 data          19</span></a>
734 | <a class="sourceLine" id="cb6-31" data-line-number="31"><span class="co">#   6 dna           15</span></a>
735 | <a class="sourceLine" id="cb6-32" data-line-number="32"><span class="co">#   7 gene          15</span></a>
736 | <a class="sourceLine" id="cb6-33" data-line-number="33"><span class="co">#   8 figure        13</span></a>
737 | <a class="sourceLine" id="cb6-34" data-line-number="34"><span class="co">#   9 fur           12</span></a>
738 | <a class="sourceLine" id="cb6-35" data-line-number="35"><span class="co">#  10 operons       12</span></a>
739 | <a class="sourceLine" id="cb6-36" data-line-number="36"><span class="co">#  # … with 585 more rows</span></a></code></pre></div>
740 | <p>The <code>pmc_table</code> function formats tables by collapsing multiline headers, expanding rowspan and colspan attributes and adding subheadings into a new column.</p>
741 | <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb7-1" data-line-number="1">tbls &lt;-<span class="st"> </span><span class="kw">pmc_table</span>(doc)</a>
742 | <a class="sourceLine" id="cb7-2" data-line-number="2"><span class="co">#  Parsing 4 tables</span></a>
743 | <a class="sourceLine" id="cb7-3" data-line-number="3"><span class="co">#  Adding footnotes to Table 1</span></a>
744 | <a class="sourceLine" id="cb7-4" data-line-number="4"><span class="kw">map_int</span>(tbls, nrow)</a>
745 | <a class="sourceLine" id="cb7-5" data-line-number="5"><span class="co">#  Table 1 Table 2 Table 3 Table 4 </span></a>
746 | <a class="sourceLine" id="cb7-6" data-line-number="6"><span class="co">#       39      23       4      34</span></a>
747 | <a class="sourceLine" id="cb7-7" data-line-number="7">tbls[[<span class="dv">1</span>]]</a>
748 | <a class="sourceLine" id="cb7-8" data-line-number="8"><span class="co">#  # A tibble: 39 x 5</span></a>
749 | <a class="sourceLine" id="cb7-9" data-line-number="9"><span class="co">#     subheading              `Potential operon (r va… `Gene ID`   `Putative or predicted functi… `Reference (s)`</span></a>
750 | <a class="sourceLine" id="cb7-10" data-line-number="10"><span class="co">#     &lt;chr&gt;                   &lt;chr&gt;                    &lt;chr&gt;       &lt;chr&gt;                          &lt;chr&gt;          </span></a>
751 | <a class="sourceLine" id="cb7-11" data-line-number="11"><span class="co">#   1 Iron uptake or heme sy… yfeABCD operon* (r &gt; 0.… YPO2439-24… Transport/binding chelated ir… yfeABCD [54]   </span></a>
752 | <a class="sourceLine" id="cb7-12" data-line-number="12"><span class="co">#   2 Iron uptake or heme sy… hmuRSTUV operon (r &gt; 0.… YPO0279-02… Transport/binding hemin        hmuRSTUV [55]  </span></a>
753 | <a class="sourceLine" id="cb7-13" data-line-number="13"><span class="co">#   3 Iron uptake or heme sy… ysuJIHG* (r &gt; 0.95)      YPO1529-15… Iron uptake                    -              </span></a>
754 | <a class="sourceLine" id="cb7-14" data-line-number="14"><span class="co">#   4 Iron uptake or heme sy… sufABCDS* (r &gt; 0.90)     YPO2400-24… Iron-regulated Fe-S cluster a… -              </span></a>
755 | <a class="sourceLine" id="cb7-15" data-line-number="15"><span class="co">#   5 Iron uptake or heme sy… YPO1854-1856* (r &gt; 0.97) YPO1854-18… Iron uptake or heme synthesis? -              </span></a>
756 | <a class="sourceLine" id="cb7-16" data-line-number="16"><span class="co">#   6 Sulfur metabolism       tauABCD operon (r &gt; 0.9… YPO0182-01… Transport/binding taurine      tauABCD [56]   </span></a>
757 | <a class="sourceLine" id="cb7-17" data-line-number="17"><span class="co">#   7 Sulfur metabolism       ssuEADCB operon (r &gt; 0.… YPO3623-36… Sulphur metabolism             ssu operon [57]</span></a>
758 | <a class="sourceLine" id="cb7-18" data-line-number="18"><span class="co">#   8 Sulfur metabolism       cys operon (r &gt; 0.92)    YPO3010-30… Cysteine synthesis             -              </span></a>
759 | <a class="sourceLine" id="cb7-19" data-line-number="19"><span class="co">#   9 Sulfur metabolism       YPO1317-1319 (r &gt; 0.97)  YPO1317-13… Sulfur metabolism?             -              </span></a>
760 | <a class="sourceLine" id="cb7-20" data-line-number="20"><span class="co">#  10 Sulfur metabolism       YPO4109-4111 (r &gt; 0.90)  YPO4109-41… Sulfur metabolism?             -              </span></a>
761 | <a class="sourceLine" id="cb7-21" data-line-number="21"><span class="co">#  # … with 29 more rows</span></a></code></pre></div>
762 | <p>Use <code>collapse_rows</code> to join column names and cell values in a semicolon-delimited string (and then search using functions in the next section).</p>
763 | <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb8-1" data-line-number="1"><span class="kw">collapse_rows</span>(tbls, <span class="dt">na.string=</span><span class="st">&quot;-&quot;</span>)</a>
764 | <a class="sourceLine" id="cb8-2" data-line-number="2"><span class="co">#  # A tibble: 100 x 3</span></a>
765 | <a class="sourceLine" id="cb8-3" data-line-number="3"><span class="co">#     table     row text                                                                                         </span></a>
766 | <a class="sourceLine" id="cb8-4" data-line-number="4"><span class="co">#     &lt;chr&gt;   &lt;int&gt; &lt;chr&gt;                                                                                        </span></a>
767 | <a class="sourceLine" id="cb8-5" data-line-number="5"><span class="co">#   1 Table 1     1 subheading=Iron uptake or heme synthesis; Potential operon (r value)=yfeABCD operon* (r &gt; 0.…</span></a>
768 | <a class="sourceLine" id="cb8-6" data-line-number="6"><span class="co">#   2 Table 1     2 subheading=Iron uptake or heme synthesis; Potential operon (r value)=hmuRSTUV operon (r &gt; 0.…</span></a>
769 | <a class="sourceLine" id="cb8-7" data-line-number="7"><span class="co">#   3 Table 1     3 subheading=Iron uptake or heme synthesis; Potential operon (r value)=ysuJIHG* (r &gt; 0.95); Ge…</span></a>
770 | <a class="sourceLine" id="cb8-8" data-line-number="8"><span class="co">#   4 Table 1     4 subheading=Iron uptake or heme synthesis; Potential operon (r value)=sufABCDS* (r &gt; 0.90); G…</span></a>
771 | <a class="sourceLine" id="cb8-9" data-line-number="9"><span class="co">#   5 Table 1     5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1854-1856* (r &gt; 0.97…</span></a>
772 | <a class="sourceLine" id="cb8-10" data-line-number="10"><span class="co">#   6 Table 1     6 subheading=Sulfur metabolism; Potential operon (r value)=tauABCD operon (r &gt; 0.90); Gene ID=…</span></a>
773 | <a class="sourceLine" id="cb8-11" data-line-number="11"><span class="co">#   7 Table 1     7 subheading=Sulfur metabolism; Potential operon (r value)=ssuEADCB operon (r &gt; 0.97); Gene ID…</span></a>
774 | <a class="sourceLine" id="cb8-12" data-line-number="12"><span class="co">#   8 Table 1     8 subheading=Sulfur metabolism; Potential operon (r value)=cys operon (r &gt; 0.92); Gene ID=YPO3…</span></a>
775 | <a class="sourceLine" id="cb8-13" data-line-number="13"><span class="co">#   9 Table 1     9 subheading=Sulfur metabolism; Potential operon (r value)=YPO1317-1319 (r &gt; 0.97); Gene ID=YP…</span></a>
776 | <a class="sourceLine" id="cb8-14" data-line-number="14"><span class="co">#  10 Table 1    10 subheading=Sulfur metabolism; Potential operon (r value)=YPO4109-4111 (r &gt; 0.90); Gene ID=YP…</span></a>
777 | <a class="sourceLine" id="cb8-15" data-line-number="15"><span class="co">#  # … with 90 more rows</span></a></code></pre></div>
778 | <p>The other three <code>pmc</code> functions are described in the package <a href="https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md">vignette</a>.</p>
779 | <h2 id="searching-text">Searching text</h2>
780 | <p>There are a few functions to search within the <code>pmc_text</code> or collapsed <code>pmc_table</code> output. <code>separate_text</code> uses the <a href="https://stringr.tidyverse.org/">stringr</a> package to extract matches to any regular expression or vector of words.</p>
781 | <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb9-1" data-line-number="1"><span class="kw">separate_text</span>(txt, <span class="st">&quot;[ATCGN]{5,}&quot;</span>)</a>
782 | <a class="sourceLine" id="cb9-2" data-line-number="2"><span class="co">#  # A tibble: 9 x 5</span></a>
783 | <a class="sourceLine" id="cb9-3" data-line-number="3"><span class="co">#    match        section                         paragraph sentence text                                        </span></a>
784 | <a class="sourceLine" id="cb9-4" data-line-number="4"><span class="co">#    &lt;chr&gt;        &lt;chr&gt;                               &lt;int&gt;    &lt;int&gt; &lt;chr&gt;                                       </span></a>
785 | <a class="sourceLine" id="cb9-5" data-line-number="5"><span class="co">#  1 ACGCAATCGTT… Results and Discussion; Comput…         2        3 A 16 basepair (bp) box (5'-ACGCAATCGTTTTCNT…</span></a>
786 | <a class="sourceLine" id="cb9-6" data-line-number="6"><span class="co">#  2 AAACGTTTNCGT Results and Discussion; Comput…         2        4 It is very similar to the E. coli PurR box …</span></a>
787 | <a class="sourceLine" id="cb9-7" data-line-number="7"><span class="co">#  3 TGATAATGATT… Results and Discussion; Comput…         2        5 A 21 bp box (5'-TGATAATGATTATCATTATCA-3') w…</span></a>
788 | <a class="sourceLine" id="cb9-8" data-line-number="8"><span class="co">#  4 GATAATGATAA… Results and Discussion; Comput…         2        6 It is a 10-1-10 inverted repeat that resemb…</span></a>
789 | <a class="sourceLine" id="cb9-9" data-line-number="9"><span class="co">#  5 TGANNNNNNTC… Results and Discussion; Comput…         2        7 A 15 bp box (5'-TGANNNNNNTCAA-3') was found…</span></a>
790 | <a class="sourceLine" id="cb9-10" data-line-number="10"><span class="co">#  6 TTGATN       Results and Discussion; Comput…         2        8 It is a part of the E. coli Fnr box (5'-AAW…</span></a>
791 | <a class="sourceLine" id="cb9-11" data-line-number="11"><span class="co">#  7 NATCAA       Results and Discussion; Comput…         2        8 It is a part of the E. coli Fnr box (5'-AAW…</span></a>
792 | <a class="sourceLine" id="cb9-12" data-line-number="12"><span class="co">#  8 GTTAATTAA    Results and Discussion; Comput…         3        4 The ArcA regulator can recognize a relative…</span></a>
793 | <a class="sourceLine" id="cb9-13" data-line-number="13"><span class="co">#  9 GTTAATTAATGT Results and Discussion; Comput…         3        5 An ArcA-box-like sequence (5'-GTTAATTAATGT-…</span></a></code></pre></div>
794 | <p>A few wrappers search pre-defined patterns and add an extra step to expand matched ranges. <code>separate_refs</code> matches references within brackets using <code>\\[[0-9, -]+\\]</code> and expands ranges like <code>[7-9]</code>.</p>
795 | <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb10-1" data-line-number="1"><span class="kw">separate_refs</span>(txt)</a>
796 | <a class="sourceLine" id="cb10-2" data-line-number="2"><span class="co">#  # A tibble: 93 x 6</span></a>
797 | <a class="sourceLine" id="cb10-3" data-line-number="3"><span class="co">#        id match section   paragraph sentence text                                                              </span></a>
798 | <a class="sourceLine" id="cb10-4" data-line-number="4"><span class="co">#     &lt;dbl&gt; &lt;chr&gt; &lt;chr&gt;         &lt;int&gt;    &lt;int&gt; &lt;chr&gt;                                                             </span></a>
799 | <a class="sourceLine" id="cb10-5" data-line-number="5"><span class="co">#   1     1 [1]   Backgrou…         1        1 Yersinia pestis is the etiological agent of plague, alternatively…</span></a>
800 | <a class="sourceLine" id="cb10-6" data-line-number="6"><span class="co">#   2     2 [2]   Backgrou…         1        3 To produce a transmissible infection, Y. pestis colonizes the fle…</span></a>
801 | <a class="sourceLine" id="cb10-7" data-line-number="7"><span class="co">#   3     3 [3]   Backgrou…         1        9 However, a few bacilli are taken up by tissue macrophages, provid…</span></a>
802 | <a class="sourceLine" id="cb10-8" data-line-number="8"><span class="co">#   4     4 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's resistanc…</span></a>
803 | <a class="sourceLine" id="cb10-9" data-line-number="9"><span class="co">#   5     5 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's resistanc…</span></a>
804 | <a class="sourceLine" id="cb10-10" data-line-number="10"><span class="co">#   6     6 [6]   Backgrou…         2        1 A DNA microarray is able to determine simultaneous changes in all…</span></a>
805 | <a class="sourceLine" id="cb10-11" data-line-number="11"><span class="co">#   7     7 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…</span></a>
806 | <a class="sourceLine" id="cb10-12" data-line-number="12"><span class="co">#   8     8 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…</span></a>
807 | <a class="sourceLine" id="cb10-13" data-line-number="13"><span class="co">#   9     9 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…</span></a>
808 | <a class="sourceLine" id="cb10-14" data-line-number="14"><span class="co">#  10    10 [10]  Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…</span></a>
809 | <a class="sourceLine" id="cb10-15" data-line-number="15"><span class="co">#  # … with 83 more rows</span></a></code></pre></div>
810 | <p><code>separate_genes</code> will find microbial genes like tauD (with a capitalized 4th letter) and expand operons like <code>tauABCD</code> into four genes. <code>separate_tags</code> will find and expand locus tag ranges, as shown in the example below.</p>
811 | <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb11-1" data-line-number="1"><span class="kw">collapse_rows</span>(tbls, <span class="dt">na=</span><span class="st">&quot;-&quot;</span>) <span class="op">%&gt;%</span></a>
812 | <a class="sourceLine" id="cb11-2" data-line-number="2"><span class="st">  </span><span class="kw">separate_tags</span>(<span class="st">&quot;YPO&quot;</span>) <span class="op">%&gt;%</span></a>
813 | <a class="sourceLine" id="cb11-3" data-line-number="3"><span class="st">  </span><span class="kw">filter</span>(id <span class="op">==</span><span class="st"> &quot;YPO1855&quot;</span>)</a>
814 | <a class="sourceLine" id="cb11-4" data-line-number="4"><span class="co">#  # A tibble: 3 x 5</span></a>
815 | <a class="sourceLine" id="cb11-5" data-line-number="5"><span class="co">#    id      match        table    row text                                                                      </span></a>
816 | <a class="sourceLine" id="cb11-6" data-line-number="6"><span class="co">#    &lt;chr&gt;   &lt;chr&gt;        &lt;chr&gt;  &lt;int&gt; &lt;chr&gt;                                                                     </span></a>
817 | <a class="sourceLine" id="cb11-7" data-line-number="7"><span class="co">#  1 YPO1855 YPO1854-1856 Table…     5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1…</span></a>
818 | <a class="sourceLine" id="cb11-8" data-line-number="8"><span class="co">#  2 YPO1855 YPO1854-1856 Table…    21 subheading=Category C: Hypothetical; Gene ID=YPO1854-1856; Description=Pu…</span></a>
819 | <a class="sourceLine" id="cb11-9" data-line-number="9"><span class="co">#  3 YPO1855 YPO1854-YPO… Table…     2 Cluster=Cluster II; Genes or operons for motif discovery=hmuRSTUV, YPO068…</span></a></code></pre></div>
820 | <p>See the <a href="https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md">vignette</a> for more details including code to parse XML documents using the <a href="https://github.com/r-lib/xml2">xml2</a> package. The <a href="https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md">PMC FTP vignette</a> has details on parsing XML files at the Europe PMC <a href="https://europepmc.org/ftp/oa/">FTP site</a>.</p>
821 | <h3 id="community-guidelines">Community Guidelines</h3>
822 | <p>This project is released with a <a href="CONDUCT.md">Contributor Code of Conduct</a>. By participating in this project you agree to abide by its terms. Feedback, bug reports, and feature requests are welcome <a href="https://github.com/ropensci/tidypmc/issues">here</a>.</p>
823 | 
824 | </body>
825 | </html>
826 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | [![Build
  3 | Status](https://travis-ci.org/ropensci/tidypmc.svg?branch=master)](https://travis-ci.org/ropensci/tidypmc)
  4 | [![Coverage
  5 | status](https://codecov.io/gh/ropensci/tidypmc/branch/master/graph/badge.svg)](https://codecov.io/github/ropensci/tidypmc?branch=master)
  6 | [![CRAN\_Status\_Badge](https://www.r-pkg.org/badges/version/tidypmc)](https://cran.r-project.org/package=tidypmc)
  7 | [![Downloads](https://cranlogs.r-pkg.org/badges/tidypmc)](https://CRAN.R-project.org/package=tidypmc)
  8 | [![Total
  9 | Downloads](https://cranlogs.r-pkg.org/badges/grand-total/tidypmc?color=orange)](https://CRAN.R-project.org/package=tidypmc)
 10 | 
 11 | # tidypmc
 12 | 
 13 | The [Open Access subset](https://europepmc.org/downloads/openaccess) of
 14 | [PubMed Central](https://europepmc.org) (PMC) includes 2.5 million
 15 | articles from biomedical and life sciences journals. The full text XML
 16 | files are freely available for text mining from the [REST
 17 | service](https://europepmc.org/RestfulWebService) or [FTP
 18 | site](https://europepmc.org/ftp/oa/) but can be challenging to parse.
 19 | For example, section tags are nested to arbitrary depths, formulas and
 20 | tables may return incomprehensible text blobs and superscripted
 21 | references are pasted at the end of words. The functions in the
 22 | `tidypmc` package are intended to return readable text and maintain the
 23 | document structure, so gene names and other terms can be associated with
 24 | specific sections, paragraphs, sentences or table rows.
 25 | 
 26 | ## Installation
 27 | 
 28 | Use [remotes](https://github.com/r-lib/remotes) to install the package.
 29 | 
 30 | ``` r
 31 | remotes::install_github("ropensci/tidypmc")
 32 | ```
 33 | 
 34 | ## Load XML
 35 | 
 36 | Download a single XML document like
 37 | [PMC2231364](https://www.ebi.ac.uk/europepmc/webservices/rest/PMC2231364/fullTextXML)
 38 | from the [REST service](https://europepmc.org/RestfulWebService) using
 39 | the `pmc_xml` function.
 40 | 
 41 | ``` r
 42 | library(tidypmc)
 43 | library(tidyverse)
 44 | doc <- pmc_xml("PMC2231364")
 45 | doc
 46 | #  {xml_document}
 47 | #  <article article-type="research-article" xmlns:xlink="http://www.w3.org/1999/xlink">
 48 | #  [1] <front>\n  <journal-meta>\n    <journal-id journal-id-type="nlm-ta">BMC Microbiol</journal-id ...
 49 | #  [2] <body>\n  <sec>\n    <title>Background</title>\n    <p><italic>Yersinia pestis </italic>is th ...
 50 | #  [3] <back>\n  <ack>\n    <sec>\n      <title>Acknowledgements</title>\n      <p>We thank Dr. Chen ...
 51 | ```
 52 | 
 53 | The [europepmc](https://github.com/ropensci/europepmc) package includes
 54 | additional functions to search PMC and download full text. Be sure to
 55 | include the `OPEN_ACCESS` field in the search since open access articles
 56 | are the only ones with full text XML available.
 57 | 
 58 | ``` r
 59 | library(europepmc)
 60 | yp <- epmc_search("title:(Yersinia pestis virulence) OPEN_ACCESS:Y")
 61 | #  19 records found, returning 19
 62 | select(yp, pmcid, pubYear, title) %>%
 63 |   print(n=5)
 64 | #  # A tibble: 19 x 3
 65 | #    pmcid      pubYear title                                                                          
 66 | #    <chr>      <chr>   <chr>                                                                          
 67 | #  1 PMC5505154 2017    Crystal structure of Yersinia pestis virulence factor YfeA reveals two polyspe…
 68 | #  2 PMC3521224 2012    Omics strategies for revealing Yersinia pestis virulence.                      
 69 | #  3 PMC2704395 2009    Involvement of the post-transcriptional regulator Hfq in Yersinia pestis virul…
 70 | #  4 PMC2736372 2009    The NlpD lipoprotein is a novel Yersinia pestis virulence factor essential for…
 71 | #  5 PMC3109262 2011    A comprehensive study on the role of the Yersinia pestis virulence markers in …
 72 | #  # … with 14 more rows
 73 | ```
 74 | 
 75 | Save all 19 results to a list of XML documents using the `epmc_ftxt` or
 76 | `pmc_xml` function.
 77 | 
 78 | ``` r
 79 | docs <- map(yp$pmcid, epmc_ftxt)
 80 | ```
 81 | 
 82 | See the [PMC FTP
 83 | vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md)
 84 | for details on parsing the large XML files on the [FTP
 85 | site](https://europepmc.org/ftp/oa/) with 10,000 articles each.
 86 | 
 87 | ## Parse XML
 88 | 
 89 | The package includes five functions to parse the
 90 | `xml_document`.
 91 | 
 92 | | R function      | Description                                                                 |
 93 | | :-------------- | :-------------------------------------------------------------------------- |
 94 | | `pmc_text`      | Split section paragraphs into sentences with full path to subsection titles |
 95 | | `pmc_caption`   | Split figure, table and supplementary material captions into sentences      |
 96 | | `pmc_table`     | Convert table nodes into a list of tibbles                                  |
 97 | | `pmc_reference` | Format references cited into a tibble                                       |
 98 | | `pmc_metadata`  | List journal and article metadata in front node                             |
 99 | 
100 | The `pmc_text` function uses the
101 | [tokenizers](https://lincolnmullen.com/software/tokenizers/) package to
102 | split section paragraphs into sentences. The function also removes any
103 | tables, figures or formulas that are nested within paragraph tags,
104 | replaces superscripted references with brackets, adds carets and
105 | underscores to other superscripts and subscripts and includes the full
106 | path to the subsection title.
107 | 
108 | ``` r
109 | txt <- pmc_text(doc)
110 | #  Note: removing disp-formula nested in sec/p tag
111 | txt
112 | #  # A tibble: 194 x 4
113 | #     section    paragraph sentence text                                                                         
114 | #     <chr>          <int>    <int> <chr>                                                                        
115 | #   1 Title              1        1 Comparative transcriptomics in Yersinia pestis: a global view of environment…
116 | #   2 Abstract           1        1 Environmental modulation of gene expression in Yersinia pestis is critical f…
117 | #   3 Abstract           1        2 Using cDNA microarray technology, we have analyzed the global gene expressio…
118 | #   4 Abstract           2        1 To provide us with a comprehensive view of environmental modulation of globa…
119 | #   5 Abstract           2        2 Almost all known virulence genes of Y. pestis were differentially regulated …
120 | #   6 Abstract           2        3 Clustering enabled us to functionally classify co-expressed genes, including…
121 | #   7 Abstract           2        4 Collections of operons were predicted from the microarray data, and some of …
122 | #   8 Abstract           2        5 Several regulatory DNA motifs, probably recognized by the regulatory protein…
123 | #   9 Abstract           3        1 The comparative transcriptomics analysis we present here not only benefits o…
124 | #  10 Background         1        1 Yersinia pestis is the etiological agent of plague, alternatively growing in…
125 | #  # … with 184 more rows
126 | count(txt, section, sort=TRUE)
127 | #  # A tibble: 21 x 2
128 | #     section                                                                                                   n
129 | #     <chr>                                                                                                 <int>
130 | #   1 Results and Discussion; Clustering analysis and functional classification of co-expressed gene clust…    22
131 | #   2 Background                                                                                               20
132 | #   3 Results and Discussion; Virulence genes in response to multiple environmental stresses                   20
133 | #   4 Methods; Collection of microarray expression data                                                        17
134 | #   5 Results and Discussion; Computational discovery of regulatory DNA motifs                                 16
135 | #   6 Methods; Gel mobility shift analysis of Fur binding                                                      13
136 | #   7 Results and Discussion; Verification of predicted operons by RT-PCR                                      10
137 | #   8 Abstract                                                                                                  8
138 | #   9 Methods; Discovery of regulatory DNA motifs                                                               8
139 | #  10 Methods; Clustering analysis                                                                              7
140 | #  # … with 11 more rows
141 | ```
142 | 
143 | Load the [tidytext](https://www.tidytextmining.com/) package for further
144 | text processing.
145 | 
146 | ``` r
147 | library(tidytext)
148 | x1 <- unnest_tokens(txt, word, text) %>%
149 |   anti_join(stop_words) %>%
150 |   filter(!word %in% 1:100)
151 | #  Joining, by = "word"
152 | filter(x1, str_detect(section, "^Results"))
153 | #  # A tibble: 1,269 x 4
154 | #     section                paragraph sentence word         
155 | #     <chr>                      <int>    <int> <chr>        
156 | #   1 Results and Discussion         1        1 comprehensive
157 | #   2 Results and Discussion         1        1 analysis     
158 | #   3 Results and Discussion         1        1 sets         
159 | #   4 Results and Discussion         1        1 microarray   
160 | #   5 Results and Discussion         1        1 expression   
161 | #   6 Results and Discussion         1        1 data         
162 | #   7 Results and Discussion         1        1 dissect      
163 | #   8 Results and Discussion         1        1 bacterial    
164 | #   9 Results and Discussion         1        1 adaptation   
165 | #  10 Results and Discussion         1        1 environments 
166 | #  # … with 1,259 more rows
167 | filter(x1, str_detect(section, "^Results")) %>%
168 |   count(word, sort = TRUE)
169 | #  # A tibble: 595 x 2
170 | #     word           n
171 | #     <chr>      <int>
172 | #   1 genes         45
173 | #   2 cluster       24
174 | #   3 expression    21
175 | #   4 pestis        21
176 | #   5 data          19
177 | #   6 dna           15
178 | #   7 gene          15
179 | #   8 figure        13
180 | #   9 fur           12
181 | #  10 operons       12
182 | #  # … with 585 more rows
183 | ```
184 | 
185 | The `pmc_table` function formats tables by collapsing multiline headers,
186 | expanding rowspan and colspan attributes and adding subheadings into a
187 | new column.
188 | 
189 | ``` r
190 | tbls <- pmc_table(doc)
191 | #  Parsing 4 tables
192 | #  Adding footnotes to Table 1
193 | map_int(tbls, nrow)
194 | #  Table 1 Table 2 Table 3 Table 4 
195 | #       39      23       4      34
196 | tbls[[1]]
197 | #  # A tibble: 39 x 5
198 | #     subheading              `Potential operon (r va… `Gene ID`   `Putative or predicted functi… `Reference (s)`
199 | #     <chr>                   <chr>                    <chr>       <chr>                          <chr>          
200 | #   1 Iron uptake or heme sy… yfeABCD operon* (r > 0.… YPO2439-24… Transport/binding chelated ir… yfeABCD [54]   
201 | #   2 Iron uptake or heme sy… hmuRSTUV operon (r > 0.… YPO0279-02… Transport/binding hemin        hmuRSTUV [55]  
202 | #   3 Iron uptake or heme sy… ysuJIHG* (r > 0.95)      YPO1529-15… Iron uptake                    -              
203 | #   4 Iron uptake or heme sy… sufABCDS* (r > 0.90)     YPO2400-24… Iron-regulated Fe-S cluster a… -              
204 | #   5 Iron uptake or heme sy… YPO1854-1856* (r > 0.97) YPO1854-18… Iron uptake or heme synthesis? -              
205 | #   6 Sulfur metabolism       tauABCD operon (r > 0.9… YPO0182-01… Transport/binding taurine      tauABCD [56]   
206 | #   7 Sulfur metabolism       ssuEADCB operon (r > 0.… YPO3623-36… Sulphur metabolism             ssu operon [57]
207 | #   8 Sulfur metabolism       cys operon (r > 0.92)    YPO3010-30… Cysteine synthesis             -              
208 | #   9 Sulfur metabolism       YPO1317-1319 (r > 0.97)  YPO1317-13… Sulfur metabolism?             -              
209 | #  10 Sulfur metabolism       YPO4109-4111 (r > 0.90)  YPO4109-41… Sulfur metabolism?             -              
210 | #  # … with 29 more rows
211 | ```
212 | 
213 | Use `collapse_rows` to join column names and cell values in a semi-colon
214 | delimited string (and then search using functions in the next section).
215 | 
216 | ``` r
217 | collapse_rows(tbls, na.string="-")
218 | #  # A tibble: 100 x 3
219 | #     table     row text                                                                                         
220 | #     <chr>   <int> <chr>                                                                                        
221 | #   1 Table 1     1 subheading=Iron uptake or heme synthesis; Potential operon (r value)=yfeABCD operon* (r > 0.…
222 | #   2 Table 1     2 subheading=Iron uptake or heme synthesis; Potential operon (r value)=hmuRSTUV operon (r > 0.…
223 | #   3 Table 1     3 subheading=Iron uptake or heme synthesis; Potential operon (r value)=ysuJIHG* (r > 0.95); Ge…
224 | #   4 Table 1     4 subheading=Iron uptake or heme synthesis; Potential operon (r value)=sufABCDS* (r > 0.90); G…
225 | #   5 Table 1     5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1854-1856* (r > 0.97…
226 | #   6 Table 1     6 subheading=Sulfur metabolism; Potential operon (r value)=tauABCD operon (r > 0.90); Gene ID=…
227 | #   7 Table 1     7 subheading=Sulfur metabolism; Potential operon (r value)=ssuEADCB operon (r > 0.97); Gene ID…
228 | #   8 Table 1     8 subheading=Sulfur metabolism; Potential operon (r value)=cys operon (r > 0.92); Gene ID=YPO3…
229 | #   9 Table 1     9 subheading=Sulfur metabolism; Potential operon (r value)=YPO1317-1319 (r > 0.97); Gene ID=YP…
230 | #  10 Table 1    10 subheading=Sulfur metabolism; Potential operon (r value)=YPO4109-4111 (r > 0.90); Gene ID=YP…
231 | #  # … with 90 more rows
232 | ```
233 | 
234 | The other three `pmc` functions are described in the package
235 | [vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md).
236 | 
237 | ## Searching text
238 | 
239 | There are a few functions to search within the `pmc_text` or collapsed
240 | `pmc_table` output. `separate_text` uses the
241 | [stringr](https://stringr.tidyverse.org/) package to extract any regular
242 | expression or vector of words.
243 | 
244 | ``` r
245 | separate_text(txt, "[ATCGN]{5,}")
246 | #  # A tibble: 9 x 5
247 | #    match        section                         paragraph sentence text                                        
248 | #    <chr>        <chr>                               <int>    <int> <chr>                                       
249 | #  1 ACGCAATCGTT… Results and Discussion; Comput…         2        3 A 16 basepair (bp) box (5'-ACGCAATCGTTTTCNT…
250 | #  2 AAACGTTTNCGT Results and Discussion; Comput…         2        4 It is very similar to the E. coli PurR box …
251 | #  3 TGATAATGATT… Results and Discussion; Comput…         2        5 A 21 bp box (5'-TGATAATGATTATCATTATCA-3') w…
252 | #  4 GATAATGATAA… Results and Discussion; Comput…         2        6 It is a 10-1-10 inverted repeat that resemb…
253 | #  5 TGANNNNNNTC… Results and Discussion; Comput…         2        7 A 15 bp box (5'-TGANNNNNNTCAA-3') was found…
254 | #  6 TTGATN       Results and Discussion; Comput…         2        8 It is a part of the E. coli Fnr box (5'-AAW…
255 | #  7 NATCAA       Results and Discussion; Comput…         2        8 It is a part of the E. coli Fnr box (5'-AAW…
256 | #  8 GTTAATTAA    Results and Discussion; Comput…         3        4 The ArcA regulator can recognize a relative…
257 | #  9 GTTAATTAATGT Results and Discussion; Comput…         3        5 An ArcA-box-like sequence (5'-GTTAATTAATGT-…
258 | ```
259 | 
260 | A few wrappers search pre-defined patterns and add an extra step to
261 | expand matched ranges. `separate_refs` matches references within
262 | brackets using `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`.
263 | 
264 | ``` r
265 | separate_refs(txt)
266 | #  # A tibble: 93 x 6
267 | #        id match section   paragraph sentence text                                                              
268 | #     <dbl> <chr> <chr>         <int>    <int> <chr>                                                             
269 | #   1     1 [1]   Backgrou…         1        1 Yersinia pestis is the etiological agent of plague, alternatively…
270 | #   2     2 [2]   Backgrou…         1        3 To produce a transmissible infection, Y. pestis colonizes the fle…
271 | #   3     3 [3]   Backgrou…         1        9 However, a few bacilli are taken up by tissue macrophages, provid…
272 | #   4     4 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's resistanc…
273 | #   5     5 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's resistanc…
274 | #   6     6 [6]   Backgrou…         2        1 A DNA microarray is able to determine simultaneous changes in all…
275 | #   7     7 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…
276 | #   8     8 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…
277 | #   9     9 [7-9] Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…
278 | #  10    10 [10]  Backgrou…         2        2 We and others have measured the gene expression profiles of Y. pe…
279 | #  # … with 83 more rows
280 | ```
281 | 
282 | `separate_genes` will find microbial genes like tauD (with a capitalized
283 | 4th letter) and expand operons like `tauABCD` into four genes.
284 | `separate_tags` will find and expand locus tag ranges, as shown below.
285 | 
286 | ``` r
287 | collapse_rows(tbls, na.string="-") %>%
288 |   separate_tags("YPO") %>%
289 |   filter(id == "YPO1855")
290 | #  # A tibble: 3 x 5
291 | #    id      match        table    row text                                                                      
292 | #    <chr>   <chr>        <chr>  <int> <chr>                                                                     
293 | #  1 YPO1855 YPO1854-1856 Table…     5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1…
294 | #  2 YPO1855 YPO1854-1856 Table…    21 subheading=Category C: Hypothetical; Gene ID=YPO1854-1856; Description=Pu…
295 | #  3 YPO1855 YPO1854-YPO… Table…     2 Cluster=Cluster II; Genes or operons for motif discovery=hmuRSTUV, YPO068…
296 | ```
297 | 
298 | See the
299 | [vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md)
300 | for more details including code to parse XML documents using the
301 | [xml2](https://github.com/r-lib/xml2) package. The [PMC FTP
302 | vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md)
303 | has details on parsing XML files at the Europe PMC [FTP
304 | site](https://europepmc.org/ftp/oa/).
305 | 
306 | ### Community Guidelines
307 | 
308 | This project is released with a [Contributor Code of
309 | Conduct](CONDUCT.md). By participating in this project you agree to
310 | abide by its terms. Feedback, bug reports, and feature requests are
311 | welcome [here](https://github.com/ropensci/tidypmc/issues).
312 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | 
 3 | coverage:
 4 |   status:
 5 |     project:
 6 |       default:
 7 |         target: auto
 8 |         threshold: 1%
 9 |     patch:
10 |       default:
11 |         target: auto
12 |         threshold: 1%
13 | 


--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "@context": ["https://doi.org/10.5063/schema/codemeta-2.0", "http://schema.org"],
  3 |   "@type": "SoftwareSourceCode",
  4 |   "identifier": "tidypmc",
  5 |   "description": "This package parses section paragraphs, captions, tables, references and metadata from XML documents in the Open Access subset of PubMed Central. Additional functions are available to search text and expand ranges of the references cited, locus tags and operons.",
  6 |   "name": "tidypmc: Parse full text XML documents from PMC",
  7 |   "codeRepository": "https://github.com/ropensci/tidypmc",
  8 |   "license": "https://spdx.org/licenses/GPL-3.0",
  9 |   "version": "1.1",
 10 |   "programmingLanguage": {
 11 |     "@type": "ComputerLanguage",
 12 |     "name": "R",
 13 |     "version": "3.5.2",
 14 |     "url": "https://r-project.org"
 15 |   },
 16 |   "runtimePlatform": "R version 3.5.2 (2018-12-20)",
 17 |   "author": {},
 18 |   "contributor": {},
 19 |   "copyrightHolder": {},
 20 |   "funder": {},
 21 |   "maintainer": [
 22 |     {
 23 |       "@type": "Person",
 24 |       "givenName": "Chris",
 25 |       "familyName": "Stubben",
 26 |       "email": "chris.stubben@hci.utah.edu"
 27 |     }
 28 |   ],
 29 |   "softwareSuggestions": [
 30 |     {
 31 |       "@type": "SoftwareApplication",
 32 |       "identifier": "europepmc",
 33 |       "name": "europepmc",
 34 |       "provider": {
 35 |         "@id": "https://cran.r-project.org",
 36 |         "@type": "Organization",
 37 |         "name": "Comprehensive R Archive Network (CRAN)",
 38 |         "url": "https://cran.r-project.org"
 39 |       },
 40 |       "sameAs": "https://CRAN.R-project.org/package=europepmc"
 41 |     },
 42 |     {
 43 |       "@type": "SoftwareApplication",
 44 |       "identifier": "tidytext",
 45 |       "name": "tidytext",
 46 |       "provider": {
 47 |         "@id": "https://cran.r-project.org",
 48 |         "@type": "Organization",
 49 |         "name": "Comprehensive R Archive Network (CRAN)",
 50 |         "url": "https://cran.r-project.org"
 51 |       },
 52 |       "sameAs": "https://CRAN.R-project.org/package=tidytext"
 53 |     }
 54 |   ],
 55 |   "softwareRequirements": [
 56 |     {
 57 |       "@type": "SoftwareApplication",
 58 |       "identifier": "xml2",
 59 |       "name": "xml2",
 60 |       "provider": {
 61 |         "@id": "https://cran.r-project.org",
 62 |         "@type": "Organization",
 63 |         "name": "Comprehensive R Archive Network (CRAN)",
 64 |         "url": "https://cran.r-project.org"
 65 |       },
 66 |       "sameAs": "https://CRAN.R-project.org/package=xml2"
 67 |     },
 68 |     {
 69 |       "@type": "SoftwareApplication",
 70 |       "identifier": "tokenizers",
 71 |       "name": "tokenizers",
 72 |       "provider": {
 73 |         "@id": "https://cran.r-project.org",
 74 |         "@type": "Organization",
 75 |         "name": "Comprehensive R Archive Network (CRAN)",
 76 |         "url": "https://cran.r-project.org"
 77 |       },
 78 |       "sameAs": "https://CRAN.R-project.org/package=tokenizers"
 79 |     },
 80 |     {
 81 |       "@type": "SoftwareApplication",
 82 |       "identifier": "stringr",
 83 |       "name": "stringr",
 84 |       "provider": {
 85 |         "@id": "https://cran.r-project.org",
 86 |         "@type": "Organization",
 87 |         "name": "Comprehensive R Archive Network (CRAN)",
 88 |         "url": "https://cran.r-project.org"
 89 |       },
 90 |       "sameAs": "https://CRAN.R-project.org/package=stringr"
 91 |     },
 92 |     {
 93 |       "@type": "SoftwareApplication",
 94 |       "identifier": "tibble",
 95 |       "name": "tibble",
 96 |       "provider": {
 97 |         "@id": "https://cran.r-project.org",
 98 |         "@type": "Organization",
 99 |         "name": "Comprehensive R Archive Network (CRAN)",
100 |         "url": "https://cran.r-project.org"
101 |       },
102 |       "sameAs": "https://CRAN.R-project.org/package=tibble"
103 |     },
104 |     {
105 |       "@type": "SoftwareApplication",
106 |       "identifier": "dplyr",
107 |       "name": "dplyr",
108 |       "provider": {
109 |         "@id": "https://cran.r-project.org",
110 |         "@type": "Organization",
111 |         "name": "Comprehensive R Archive Network (CRAN)",
112 |         "url": "https://cran.r-project.org"
113 |       },
114 |       "sameAs": "https://CRAN.R-project.org/package=dplyr"
115 |     },
116 |     {
117 |       "@type": "SoftwareApplication",
118 |       "identifier": "readr",
119 |       "name": "readr",
120 |       "provider": {
121 |         "@id": "https://cran.r-project.org",
122 |         "@type": "Organization",
123 |         "name": "Comprehensive R Archive Network (CRAN)",
124 |         "url": "https://cran.r-project.org"
125 |       },
126 |       "sameAs": "https://CRAN.R-project.org/package=readr"
127 |     }
128 |   ],
129 |   "fileSize": "84.221KB",
130 |   "citation": [
131 |     {
132 |       "@type": "SoftwareSourceCode",
133 |       "author": [
134 |         {
135 |           "@type": "Person",
136 |           "givenName": "Chris",
137 |           "familyName": "Stubben"
138 |         }
139 |       ],
140 |       "name": "tidypmc: Parse full text XML documents from PMC",
141 |       "url": "https://github.com/ropensci/tidypmc",
142 |       "description": "R package version 1.6"
143 |     }
144 |   ]
145 | }
146 | 


--------------------------------------------------------------------------------
/man/collapse_rows.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/collapse_rows.R
 3 | \name{collapse_rows}
 4 | \alias{collapse_rows}
 5 | \title{Collapse a list of PubMed Central tables}
 6 | \usage{
 7 | collapse_rows(pmc, na.string)
 8 | }
 9 | \arguments{
10 | \item{pmc}{a list of tables, usually from \code{\link{pmc_table}}}
11 | 
12 | \item{na.string}{additional cell values to skip, default is NA and ""}
13 | }
14 | \value{
15 | A tibble with table and row number and collapsed text
16 | }
17 | \description{
18 | Collapse rows into a semi-colon delimited list with column names and cell
19 | values
20 | }
21 | \examples{
22 | x <- data.frame(
23 |   genes = c("aroB", "glnP", "ndhA", "pyrF"),
24 |   fold_change = c(2.5, 1.7, -3.1, -2.6)
25 | )
26 | collapse_rows(list(`Table 1` = x))
27 | }
28 | \author{
29 | Chris Stubben
30 | }
31 | 


--------------------------------------------------------------------------------
/man/pmc_caption.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_caption.R
 3 | \name{pmc_caption}
 4 | \alias{pmc_caption}
 5 | \title{Split captions into sentences}
 6 | \usage{
 7 | pmc_caption(doc)
 8 | }
 9 | \arguments{
10 | \item{doc}{\code{xml_document} from PubMed Central}
11 | }
12 | \value{
13 | a tibble with tag, label, sentence number and text
14 | }
15 | \description{
16 | Split figure, table and supplementary material captions into sentences
17 | }
18 | \examples{
19 | # doc <- pmc_xml("PMC2231364") # OR
20 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
21 |   package = "tidypmc"
22 | ))
23 | x <- pmc_caption(doc)
24 | x
25 | dplyr::filter(x, sentence == 1)
26 | }
27 | \author{
28 | Chris Stubben
29 | }
30 | 


--------------------------------------------------------------------------------
/man/pmc_metadata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_metadata.R
 3 | \name{pmc_metadata}
 4 | \alias{pmc_metadata}
 5 | \title{Get article metadata}
 6 | \usage{
 7 | pmc_metadata(doc)
 8 | }
 9 | \arguments{
10 | \item{doc}{\code{xml_document} from PubMed Central}
11 | }
12 | \value{
13 | a list
14 | }
15 | \description{
16 | Get a list of journal and article metadata in /front tag
17 | }
18 | \examples{
19 | # doc <- pmc_xml("PMC2231364") # OR
20 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
21 |   package = "tidypmc"
22 | ))
23 | pmc_metadata(doc)
24 | }
25 | \author{
26 | Chris Stubben
27 | }
28 | 


--------------------------------------------------------------------------------
/man/pmc_reference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_reference.R
 3 | \name{pmc_reference}
 4 | \alias{pmc_reference}
 5 | \title{Format references cited}
 6 | \usage{
 7 | pmc_reference(doc)
 8 | }
 9 | \arguments{
10 | \item{doc}{\code{xml_document} from PubMed Central}
11 | }
12 | \value{
13 | a tibble with id, pmid, authors, year, title, journal, volume, pages,
14 | and doi.
15 | }
16 | \description{
17 | Format references cited
18 | }
19 | \note{
20 | Mixed citations without any child tags are added to the author column.
21 | }
22 | \examples{
23 | # doc <- pmc_xml("PMC2231364")
24 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
25 |   package = "tidypmc"
26 | ))
27 | x <- pmc_reference(doc)
28 | x
29 | }
30 | \author{
31 | Chris Stubben
32 | }
33 | 


--------------------------------------------------------------------------------
/man/pmc_table.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_table.R
 3 | \name{pmc_table}
 4 | \alias{pmc_table}
 5 | \title{Convert table nodes to tibbles}
 6 | \usage{
 7 | pmc_table(doc)
 8 | }
 9 | \arguments{
10 | \item{doc}{\code{xml_document} from PubMed Central}
11 | }
12 | \value{
13 | a list of tibbles
14 | }
15 | \description{
16 | Convert PubMed Central table nodes into a list of tibbles
17 | }
18 | \note{
19 | Saves the caption and footnotes as attributes and collapses multiline
20 | headers, expands all rowspan and colspan attributes and adds
21 | subheadings to column one.
22 | }
23 | \examples{
24 | # doc <- pmc_xml("PMC2231364")
25 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
26 |   package = "tidypmc"
27 | ))
28 | x <- pmc_table(doc)
29 | sapply(x, dim)
30 | x
31 | attributes(x[[1]])
32 | }
33 | \author{
34 | Chris Stubben
35 | }
36 | 


--------------------------------------------------------------------------------
/man/pmc_text.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_text.R
 3 | \name{pmc_text}
 4 | \alias{pmc_text}
 5 | \title{Split section paragraphs into sentences}
 6 | \usage{
 7 | pmc_text(doc)
 8 | }
 9 | \arguments{
10 | \item{doc}{\code{xml_document} from PubMed Central}
11 | }
12 | \value{
13 | a tibble with section, paragraph and sentence number and text
14 | }
15 | \description{
16 | Split section paragraph tags into a table with subsection titles and
17 | sentences using \code{tokenize_sentences}
18 | }
19 | \note{
20 | Subsections may be nested to arbitrary depths and this function will
21 | return the entire path to the subsection title as a delimited string like
22 | "Results; Predicted functions; Pathogenicity".  Tables, figures and
23 | formulas that are nested in section paragraphs are removed, superscripted
24 | references are replaced with brackets, and any other superscripts or
 25 | subscripts are separated with ^ and _.
26 | }
27 | \examples{
28 | # doc <- pmc_xml("PMC2231364")
29 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
30 |   package = "tidypmc"
31 | ))
32 | txt <- pmc_text(doc)
33 | txt
34 | dplyr::count(txt, section, sort = TRUE)
35 | }
36 | \author{
37 | Chris Stubben
38 | }
39 | 


--------------------------------------------------------------------------------
/man/pmc_xml.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pmc_xml.R
 3 | \name{pmc_xml}
 4 | \alias{pmc_xml}
 5 | \title{Download XML from PubMed Central}
 6 | \source{
 7 | \url{https://europepmc.org/RestfulWebService}
 8 | }
 9 | \usage{
10 | pmc_xml(id)
11 | }
12 | \arguments{
13 | \item{id}{a PMC id starting with 'PMC'}
14 | }
15 | \value{
16 | \code{xml_document}
17 | }
18 | \description{
19 | Download XML from PubMed Central
20 | }
21 | \examples{
22 | \dontrun{
23 | doc <- pmc_xml("PMC2231364")
24 | }
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/man/separate_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/separate_genes.R
 3 | \name{separate_genes}
 4 | \alias{separate_genes}
 5 | \title{Separate genes and operons into multiple rows}
 6 | \usage{
 7 | separate_genes(txt, pattern = "\\\\b[A-Za-z][a-z]{2}[A-Z0-9]+\\\\b",
 8 |   genes, operon = 6, column = "text")
 9 | }
10 | \arguments{
11 | \item{txt}{a table}
12 | 
 13 | \item{pattern}{regular expression to match genes. The default matches
 14 | microbial genes like AbcD using [A-Za-z][a-z]{2}[A-Z0-9]+ between word boundaries}
15 | 
16 | \item{genes}{an optional vector of genes, set pattern to NA to only match
17 | this list.}
18 | 
19 | \item{operon}{operon length, default 6. Split genes with 6 or more letters
20 | into separate genes, for example AbcDEF is split into abcD, abcE and abcF.}
21 | 
22 | \item{column}{column name to search, default "text"}
23 | }
24 | \value{
25 | a tibble with gene name, matching text and rows.
26 | }
27 | \description{
28 | Separate genes and operons mentioned in full text into multiple rows
29 | }
30 | \note{
31 | Check for genes in italics using \code{xml_text(xml_find_all(doc,
32 | "//sec//p//italic"))} and update the pattern or add additional genes as an
33 | optional vector if needed
34 | }
35 | \examples{
36 | x <- data.frame(row = 1, text = "Genes like YacK, hmu and sufABC")
37 | separate_genes(x)
38 | separate_genes(x, genes = "hmu")
39 | }
40 | \author{
41 | Chris Stubben
42 | }
43 | 


--------------------------------------------------------------------------------
/man/separate_refs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/separate_refs.R
 3 | \name{separate_refs}
 4 | \alias{separate_refs}
 5 | \title{Separate references cited into multiple rows}
 6 | \usage{
 7 | separate_refs(txt, column = "text")
 8 | }
 9 | \arguments{
10 | \item{txt}{a table}
11 | 
12 | \item{column}{column name, default "text"}
13 | }
14 | \value{
15 | a tibble
16 | }
17 | \description{
18 | Separates references cited in brackets or parentheses into multiple rows and
19 | splits the comma-delimited numeric strings and expands ranges like 7-9 into
20 | new rows
21 | }
22 | \examples{
23 | x <- data.frame(row = 1, text = "some important studies [7-9,15]")
24 | separate_refs(x)
25 | }
26 | \author{
27 | Chris Stubben
28 | }
29 | 


--------------------------------------------------------------------------------
/man/separate_tags.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/separate_tags.R
 3 | \name{separate_tags}
 4 | \alias{separate_tags}
 5 | \title{Separate locus tag into multiple rows}
 6 | \usage{
 7 | separate_tags(txt, pattern, column = "text")
 8 | }
 9 | \arguments{
10 | \item{txt}{a table}
11 | 
12 | \item{pattern}{regular expression to match locus tags like YPO[0-9-]+ or
13 | the locus tag prefix like YPO.}
14 | 
15 | \item{column}{column name to search, default "text"}
16 | }
17 | \value{
18 | a tibble with locus tag, matching text and rows.
19 | }
20 | \description{
21 | Separates locus tags mentioned in full text and expands ranges like
22 | YPO1970-74 into new rows
23 | }
24 | \examples{
25 | x <- data.frame(row = 1, text = "some genes like YPO1002 and YPO1970-74")
26 | separate_tags(x, "YPO")
27 | }
28 | \author{
29 | Chris Stubben
30 | }
31 | 


--------------------------------------------------------------------------------
/man/separate_text.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/separate_text.R
 3 | \name{separate_text}
 4 | \alias{separate_text}
 5 | \title{Separate all matching text into multiple rows}
 6 | \usage{
 7 | separate_text(txt, pattern, column = "text")
 8 | }
 9 | \arguments{
10 | \item{txt}{a tibble, usually results from \code{pmc_text}}
11 | 
12 | \item{pattern}{either a regular expression or a vector of words to find in
13 | text}
14 | 
15 | \item{column}{column name, default "text"}
16 | }
17 | \value{
18 | a tibble
19 | }
20 | \description{
21 | Separate all matching text into multiple rows
22 | }
23 | \note{
24 | passed to \code{grepl} and \code{str_extract_all}
25 | }
26 | \examples{
27 | # doc <- pmc_xml("PMC2231364")
28 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
29 |         package = "tidypmc"))
30 | txt <- pmc_text(doc)
31 | separate_text(txt, "[ATCGN]{5,}")
32 | separate_text(txt, "\\\\([A-Z]{3,6}s?\\\\)")
33 | # pattern can be a vector of words
34 | separate_text(txt, c("hmu", "ybt", "yfe", "yfu"))
35 | # wrappers for separate_text with extra step to expand matched ranges
36 | separate_refs(txt)
37 | separate_genes(txt)
38 | separate_tags(txt, "YPO")
39 | 
40 | }
41 | \author{
42 | Chris Stubben
43 | }
44 | 


--------------------------------------------------------------------------------
/man/tidypmc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tidypmc-package.R
 3 | \docType{package}
 4 | \name{tidypmc}
 5 | \alias{tidypmc}
 6 | \alias{tidypmc-package}
 7 | \title{\code{tidypmc} package}
 8 | \description{
 9 | Parse full text XML documents from PubMed Central
10 | }
11 | \details{
12 | See the Github page for details at \url{https://github.com/ropensci/tidypmc}
13 | }
14 | \keyword{internal}
15 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(tidypmc)
3 | 
4 | test_check("tidypmc")  # entry point: runs the package's testthat test files
5 | 


--------------------------------------------------------------------------------
/tests/testthat/tests-pmc_other.R:
--------------------------------------------------------------------------------
 1 | context("Parse other")
 2 | 
 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc"))  # article XML bundled in the package's extdata
 4 | doc2 <- xml2::read_xml("<p>This is some text</p>")  # bare <p> document used for the NULL-return expectations
 5 | 
 6 | test_that("pmc_caption works", {  # tbl_df for the bundled article, error on a character string, NULL for the bare <p> document
 7 |     expect_is(pmc_caption(doc), "tbl_df")
 8 |     expect_error(pmc_caption("a vector") )
 9 |     expect_equal(pmc_caption(doc2), NULL)
10 | })
11 | 
12 | test_that("pmc_reference works", {  # tbl_df for the bundled article, error on a character string, NULL for the bare <p> document
13 |     expect_is(pmc_reference(doc), "tbl_df")
14 |     expect_error(pmc_reference("a vector") )
15 |     expect_equal(pmc_reference(doc2), NULL)
16 | })
17 | 
18 | test_that("pmc_metadata works", {  # list for the bundled article, error on a character string, NULL for the bare <p> document
19 |     expect_is(pmc_metadata(doc), "list")
20 |     expect_error(pmc_metadata("a vector") )
21 |     expect_equal(pmc_metadata(doc2), NULL)
22 | })
23 | 
24 | test_that("pmc_xml works", {  # only the failure path is checked here
25 |     expect_error(pmc_xml("not ID"))  # a string that is not a PMC ID must raise an error
26 | })
27 | 


--------------------------------------------------------------------------------
/tests/testthat/tests-pmc_table.R:
--------------------------------------------------------------------------------
 1 | context("Parse tables")
 2 | 
 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc"))  # article XML bundled in the package's extdata
 4 | doc2 <- xml2::read_xml("<p>This is some text</p>")  # bare <p> document used for the NULL-return expectation
 5 | t1 <- pmc_table(doc)  # parsed once and shared by the tests below
 6 | 
 7 | 
 8 | test_that("pmc_table works", {  # list for the bundled article, error on a character string, NULL for the bare <p> document
 9 |     expect_is(t1, "list")
10 |     expect_error(pmc_table("a vector") )
11 |     expect_equal(pmc_table(doc2), NULL)  # exercises pmc_table itself on a document without tables
12 | })
13 | 
14 | test_that("collapse rows works", {  # accepts both the full pmc_table result and a single table from it
15 |     expect_is(collapse_rows(t1), "tbl_df")
16 |     expect_is(collapse_rows(t1[[1]]), "tbl_df")
17 |     expect_error(collapse_rows("a vector") )  # a character string must raise an error
18 | })
19 | 
20 | test_that("repeat subheading works", {  # tbl_df for the first parsed table, error on a character string
21 |     expect_is(repeat_sub(t1[[1]]), "tbl_df")
22 |     expect_error(repeat_sub("a vector") )
23 | })
24 | 


--------------------------------------------------------------------------------
/tests/testthat/tests-pmc_text.R:
--------------------------------------------------------------------------------
 1 | context("Parse text")
 2 | 
 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc"))  # article XML bundled in the package's extdata
 4 | txt <- pmc_text(doc)  # parsed once and checked in the pmc_text test below
 5 | doc2 <- xml2::read_xml("<p>This is some text</p>")  # bare <p> document used for the NULL-return expectation
 6 | 
 7 | test_that("path string formats", {  # character result for (x, n); swapping the two arguments must error
 8 |    x <- c("carnivores", "bears", "polar", "grizzly", "cats", "tiger")
 9 |    n <- c(1,2,3,3,2,3)
10 |    expect_is(path_string(x, n), "character")
11 |    expect_error(path_string(n, x))
12 | })
13 | 
14 | test_that("pmc_text works", {  # tbl_df for the bundled article, error on a character string, NULL for the bare <p> document
15 |     expect_is(txt, "tbl_df")
16 |     expect_error(pmc_text("a vector") )
17 |     expect_equal(pmc_text(doc2), NULL)
18 | })
19 | 


--------------------------------------------------------------------------------
/tests/testthat/tests-separate.R:
--------------------------------------------------------------------------------
 1 | context("Search text")
 2 | 
 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc"))  # article XML bundled in the package's extdata
 4 | txt <- pmc_text(doc)  # sentence table searched by every test below
 5 | 
 6 | test_that("Separate text", {  # tbl_df when the pattern matches, NULL when nothing matches
 7 |  expect_is(separate_text(txt, "[ATCGN]{5,}"), "tbl_df")
 8 |  expect_equal(separate_text(txt, "missing string"), NULL)
 9 | })
10 | 
11 | test_that("Separate refs", {  # tbl_df for the full text, NULL for a subset without references
12 |  expect_is(separate_refs(txt), "tbl_df")
13 |  # no refs in Abstract
14 |  a1 <- separate_refs(dplyr::filter(txt, section=="Abstract"))
15 |  expect_equal(a1, NULL)
16 | })
17 | 
18 | test_that("Separate genes", {  # tbl_df for the full text, NULL for the Conclusion section
19 |  expect_is(separate_genes(txt), "tbl_df")
20 |  a1 <- separate_genes(dplyr::filter(txt, section=="Conclusion"))
21 |  expect_equal(a1, NULL)
22 | })
23 | 
24 | test_that("Separate locus tags", {  # tbl_df for the full text, NULL for the Abstract section
25 |  expect_is(separate_tags(txt, "YPO"), "tbl_df")
26 |  a1 <- separate_tags(dplyr::filter(txt, section=="Abstract"), "YPO")
27 |  expect_equal(a1, NULL)
28 | })
29 | 


--------------------------------------------------------------------------------
/vignettes/pmcftp.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Parsing Europe PMC FTP files"
 3 | author: "Chris Stubben"
 4 | date: '`r gsub("  ", " ", format(Sys.time(), "%B %e, %Y"))`'
 5 | output: rmarkdown::html_vignette
 6 | vignette: >
 7 |   %\VignetteEngine{knitr::rmarkdown}
 8 |   %\VignetteIndexEntry{Parse PMC FTP files}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ```{r setup, include = FALSE}
13 | knitr::opts_chunk$set(
14 |   collapse = TRUE,
15 |   comment = "# "
16 | )
17 | ```
18 | 
19 | 
20 | The [Europe PMC FTP] includes 2.5 million open access articles separated into
21 | files with 10K articles each.  Download and unzip a recent series of PMC ids
22 | and load into R using the `readr` package.   A sample file with the first 10
23 | articles is included in the `tidypmc` package.
24 | 
25 | ```{r load}
26 | library(readr)
27 | pmcfile <- system.file("extdata/PMC6358576_PMC6358589.xml", package = "tidypmc")
28 | pmc <- read_lines(pmcfile)
29 | ```
30 | 
31 | 
32 | Find the start of the article nodes.
33 | 
34 | ```{r startnode}
35 | a1 <- grep("^<article ", pmc)
36 | head(a1)
37 | n <- length(a1)
38 | n
39 | ```
40 | 
41 | Read a single article by collapsing its lines into a newline-separated string.
42 | 
43 | 
44 | ```{r read1, echo=-1}
45 | options(width=100)
46 | library(xml2)
47 | x1 <- paste(pmc[2:29], collapse="\n")
48 | doc <- read_xml(x1)
49 | doc
50 | ```
51 | 
52 | 
53 | Loop through the articles and save the metadata and text below.
54 | Parsing all 10K articles takes about 10 minutes on a Mac laptop and returns
55 | 1.7M sentences.
56 | 
57 | 
58 | ```{r loop}
59 | library(tidypmc)
60 | a1 <- c(a1, length(pmc))
61 | met1 <- vector("list", n)
62 | txt1 <- vector("list", n)
63 | for(i in seq_len(n)){
64 |   doc <- read_xml(paste(pmc[a1[i]:(a1[i+1]-1)], collapse="\n"))
65 |   m1 <- pmc_metadata(doc)
66 |   id <- m1$PMCID
67 |   message("Parsing ", i, ". ", id)
68 |   met1[[i]] <- m1
69 |   txt1[[i]] <- pmc_text(doc)
70 | }
71 | ```
72 | 
73 | 
74 | Combine the list of metadata and text into tables.
75 | 
76 | 
77 | ```{r combine, echo=-1, message=FALSE}
78 | options(width=100)
79 | library(dplyr)
80 | met <- bind_rows(met1)
81 | names(txt1) <- met$PMCID
82 | txt <- bind_rows(txt1, .id="PMCID")
83 | met
84 | txt
85 | ```
86 | 
87 | 
88 | 
89 | 
90 | [Europe PMC FTP]: https://europepmc.org/ftp/oa/
91 | 


--------------------------------------------------------------------------------
/vignettes/pmcftp.md:
--------------------------------------------------------------------------------
  1 | Parsing Europe PMC FTP files
  2 | ================
  3 | Chris Stubben
  4 | August 6, 2019
  5 | 
  6 | The [Europe PMC FTP](https://europepmc.org/ftp/oa/) includes 2.5 million
  7 | open access articles separated into files with 10K articles each.
  8 | Download and unzip a recent series of PMC ids and load into R using the
  9 | `readr` package. A sample file with the first 10 articles is included in
 10 | the `tidypmc` package.
 11 | 
 12 | ``` r
 13 | library(readr)
 14 | pmcfile <- system.file("extdata/PMC6358576_PMC6358589.xml", package = "tidypmc")
 15 | pmc <- read_lines(pmcfile)
 16 | ```
 17 | 
 18 | Find the start of the article nodes.
 19 | 
 20 | ``` r
 21 | a1 <- grep("^<article ", pmc)
 22 | head(a1)
 23 | #  [1]  2 30 38 52 62 69
 24 | n <- length(a1)
 25 | n
 26 | #  [1] 10
 27 | ```
 28 | 
 29 | Read a single article by collapsing its lines into a newline-separated
 30 | string.
 31 | 
 32 | ``` r
 33 | library(xml2)
 34 | x1 <- paste(pmc[2:29], collapse="\n")
 35 | doc <- read_xml(x1)
 36 | doc
 37 | #  {xml_document}
 38 | #  <article article-type="case-report" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML">
 39 | #  [1] <front>\n  <journal-meta>\n    <journal-id journal-id-type="nlm-ta">ACG Case Rep J</journal-i ...
 40 | #  [2] <body>\n  <sec sec-type="intro" id="sec1">\n    <title>Introduction</title>\n    <p>Bezoars a ...
 41 | #  [3] <back>\n  <ref-list>\n    <title>References</title>\n    <ref id="B1">\n      <label>1.</labe ...
 42 | ```
 43 | 
 44 | Loop through the articles and save the metadata and text below. Parsing
 45 | all 10K articles takes about 10 minutes on a Mac laptop and returns 1.7M
 46 | sentences.
 47 | 
 48 | ``` r
 49 | library(tidypmc)
 50 | a1 <- c(a1, length(pmc))
 51 | met1 <- vector("list", n)
 52 | txt1 <- vector("list", n)
 53 | for(i in seq_len(n)){
 54 |   doc <- read_xml(paste(pmc[a1[i]:(a1[i+1]-1)], collapse="\n"))
 55 |   m1 <- pmc_metadata(doc)
 56 |   id <- m1$PMCID
 57 |   message("Parsing ", i, ". ", id)
 58 |   met1[[i]] <- m1
 59 |   txt1[[i]] <- pmc_text(doc)
 60 | }
 61 | #  Parsing 1. PMC6358576
 62 | #  Parsing 2. PMC6358577
 63 | #  Parsing 3. PMC6358578
 64 | #  Parsing 4. PMC6358579
 65 | #  Parsing 5. PMC6358580
 66 | #  Parsing 6. PMC6358581
 67 | #  Parsing 7. PMC6358585
 68 | #  Note: removing table-wrap nested in sec/p tag
 69 | #  Note: removing fig nested in sec/p tag
 70 | #  Parsing 8. PMC6358587
 71 | #  Note: removing table-wrap nested in sec/p tag
 72 | #  Note: removing fig nested in sec/p tag
 73 | #  Parsing 9. PMC6358588
 74 | #  Note: removing fig nested in sec/p tag
 75 | #  Parsing 10. PMC6358589
 76 | #  Note: removing table-wrap nested in sec/p tag
 77 | #  Note: removing fig nested in sec/p tag
 78 | ```
 79 | 
 80 | Combine the list of metadata and text into tables.
 81 | 
 82 | ``` r
 83 | library(dplyr)
 84 | met <- bind_rows(met1)
 85 | names(txt1) <- met$PMCID
 86 | txt <- bind_rows(txt1, .id="PMCID")
 87 | met
 88 | #  # A tibble: 10 x 12
 89 | #     PMCID Title Authors  Year Journal Volume Pages `Published onli… `Date received` DOI   Publisher
 90 | #     <chr> <chr> <chr>   <int> <chr>   <chr>  <chr> <chr>            <chr>           <chr> <chr>    
 91 | #   1 PMC6… Endo… Dana B…  2018 ACG Ca… 5      e87   2018-12-5        2018-7-8        10.1… American…
 92 | #   2 PMC6… Chro… Scott …  2018 ACG Ca… 5      e94   2018-12-5        2018-5-5        10.1… American…
 93 | #   3 PMC6… Bile… Steffi…  2018 ACG Ca… 5      e88   2018-12-5        2018-5-7        10.1… American…
 94 | #   4 PMC6… New … Gordon…  2018 ACG Ca… 5      e92   2018-12-5        2018-3-3        10.1… American…
 95 | #   5 PMC6… Bile… Michae…  2018 ACG Ca… 5      e89   2018-12-5        2017-11-3       10.1… American…
 96 | #   6 PMC6… Fuso… Akshay…  2018 ACG Ca… 5      e99   2018-12-19       2018-3-8        10.1… American…
 97 | #   7 PMC6… Chor… Marcia…  2019 Genes … 20     56-68 2018-1-24        2017-9-1        10.1… Nature P…
 98 | #   8 PMC6… The … Tao Zh…  2019 Spinal… 57     141-… 2018-8-8         2017-12-19      10.1… Nature P…
 99 | #   9 PMC6… Natu… Marjol…  2019 Molecu… 20     115-… 2018-12-16       2018-10-22      10.1… Elsevier 
100 | #  10 PMC6… Pred… Yury O…  2019 Molecu… 20     63-78 2018-11-16       2018-9-10       10.1… Elsevier 
101 | #  # … with 1 more variable: Issue <chr>
102 | txt
103 | #  # A tibble: 1,083 x 5
104 | #     PMCID    section    paragraph sentence text                                                      
105 | #     <chr>    <chr>          <int>    <int> <chr>                                                     
106 | #   1 PMC6358… Title              1        1 Endoscopic versus Surgical Intervention for Jejunal Bezoa…
107 | #   2 PMC6358… Abstract           1        1 Bezoar-induced small bowel obstruction is a rare entity, …
108 | #   3 PMC6358… Abstract           1        2 The cornerstone of treatment for intestinal bezoars has b…
109 | #   4 PMC6358… Abstract           1        3 We present a patient with obstructive jejunal phytobezoar…
110 | #   5 PMC6358… Introduct…         1        1 Bezoars are aggregates of undigested foreign material tha…
111 | #   6 PMC6358… Introduct…         1        2 There are currently four classifications of bezoars: phyt…
112 | #   7 PMC6358… Introduct…         1        3 Endoscopic treatment of bezoars causing intestinal obstru…
113 | #   8 PMC6358… Case Repo…         1        1 A 60-year old diabetic woman with a past cholecystectomy …
114 | #   9 PMC6358… Case Repo…         1        2 Physical examination revealed mild diffuse abdominal tend…
115 | #  10 PMC6358… Case Repo…         1        3 Computed tomography (CT) of the abdomen and pelvis reveal…
116 | #  # … with 1,073 more rows
117 | ```
118 | 


--------------------------------------------------------------------------------
/vignettes/tidypmc.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Introduction to tidypmc"
  3 | author: "Chris Stubben"
  4 | date: '`r gsub("  ", " ", format(Sys.time(), "%B %e, %Y"))`'
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteEngine{knitr::rmarkdown}
  8 |   %\VignetteIndexEntry{Introduction to tidypmc}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r setup, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   collapse = TRUE,
 15 |   comment = "# "
 16 | )
 17 | ```
 18 | 
 19 | The `tidypmc` package parses XML documents in the Open Access subset of [PubMed Central].
 20 | Download the full text using `pmc_xml`.
 21 | 
 22 | ```{r epmc_ftxt}
 23 | library(tidypmc)
 24 | doc <- pmc_xml("PMC2231364")
 25 | doc
 26 | ```
 27 | 
 28 | The package includes five functions to parse the `xml_document`.
 29 | 
 30 | 
 31 | |R function     |Description                                                                |
 32 | |:--------------|:--------------------------------------------------------------------------|
 33 | |`pmc_text`     |Split section paragraphs into sentences with full path to subsection titles|
 34 | |`pmc_caption`  |Split figure, table and supplementary material captions into sentences     |
 35 | |`pmc_table`    |Convert table nodes into a list of tibbles                                 |
 36 | |`pmc_reference`|Format references cited into a tibble                                      |
 37 | |`pmc_metadata` |List journal and article metadata in front node                            |
 38 | 
 39 | 
 40 | 
 41 | `pmc_text` splits paragraphs into sentences and  removes any tables, figures or
 42 | formulas that are nested within paragraph tags, replaces superscripted
 43 | references with brackets, adds carets and underscores to other superscripts and
 44 | subscripts and includes the full path to the subsection title.
 45 | 
 46 | ```{r pmc_text, message=FALSE, echo=-1}
 47 | options(width=100)
 48 | library(dplyr)
 49 | txt <- pmc_text(doc)
 50 | txt
 51 | count(txt, section)
 52 | ```
 53 | 
 54 | `pmc_caption` splits figure, table and supplementary material captions into sentences.
 55 | 
 56 | 
 57 | ```{r pmc_caption, echo=-1}
 58 | options(width=100)
 59 | cap1 <- pmc_caption(doc)
 60 | filter(cap1, sentence == 1)
 61 | ```
 62 | 
 63 | `pmc_table` formats tables by collapsing multiline headers, expanding rowspan and
 64 | colspan attributes and adding subheadings into a new column.
 65 | 
 66 | ```{r pmc_table, echo=-1}
 67 | options(width=100)
 68 | tab1 <- pmc_table(doc)
 69 | sapply(tab1, nrow)
 70 | tab1[[1]]
 71 | ```
 72 | 
 73 | Captions and footnotes are added as attributes.
 74 | 
 75 | ```{r attributes}
 76 | attributes(tab1[[1]])
 77 | ```
 78 | 
 79 | 
 80 | Use `collapse_rows` to join column names and cell values in a semi-colon delimited string (and
 81 | then search using functions in the next section).
 82 | 
 83 | ```{r collapserows, echo=-1}
 84 | options(width=100)
 85 | collapse_rows(tab1, na.string="-")
 86 | ```
 87 | 
 88 | 
 89 | `pmc_reference` extracts the id, pmid, authors, year, title, journal, volume, pages,
 90 | and DOIs from reference tags.
 91 | 
 92 | 
 93 | ```{r pmc_ref, echo=-1}
 94 | options(width=100)
 95 | ref1 <- pmc_reference(doc)
 96 | ref1
 97 | ```
 98 | 
 99 | 
100 | Finally, `pmc_metadata` saves journal and article metadata to a list.
101 | 
102 | ```{r pmc_metadata}
103 | pmc_metadata(doc)
104 | ```
105 | 
106 | 
107 | ## Searching text
108 | 
109 | There are a few functions to search within the `pmc_text` or collapsed `pmc_table` output.
110 | `separate_text` uses the [stringr]  package to extract any matching regular expression.
111 | 
112 | 
113 | ```{r separate_text, echo=-1}
114 | options(width=100)
115 | separate_text(txt, "[ATCGN]{5,}")
116 | ```
117 | 
118 | A few wrappers search pre-defined patterns and add an extra step to expand matched ranges. `separate_refs`
119 | matches references within brackets using `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`.
120 | 
121 | ```{r separate_refs, echo=-1}
122 | options(width=100)
123 | x <- separate_refs(txt)
124 | x
125 | filter(x, id == 8)
126 | ```
127 | 
128 | `separate_genes` expands microbial gene operons like `hmsHFRS` into four separate genes.
129 | 
130 | ```{r separate_genes, echo=-1}
131 | options(width=100)
132 | separate_genes(txt)
133 | ```
134 | 
135 | Finally, `separate_tags` expands locus tag ranges.
136 | 
137 | 
138 | ```{r locus_tags, echo=-1}
139 | options(width=100)
140 | collapse_rows(tab1, na.string="-") %>%
141 |   separate_tags("YPO")
142 | ```
143 | 
144 | 
145 | ### Using `xml2`
146 | 
147 | The `pmc_*` functions use the [xml2] package for parsing and may fail in some situations, so
148 | it helps to know how to parse `xml_documents`.  Use `cat` and `as.character` to view nodes
149 | returned by `xml_find_all`.
150 | 
151 | ```{r catchar}
152 | library(xml2)
153 | refs <- xml_find_all(doc, "//ref")
154 | refs[1]
155 | cat(as.character(refs[1]))
156 | ```
157 | 
158 | 
159 | Many journals use superscripts for references cited, so in the extracted text
160 | the reference numbers run into the preceding word, like `results9` below.
161 | 
162 | ```{r pmcdoc1, message=FALSE}
163 | # doc1 <- pmc_xml("PMC6385181")
164 | doc1 <- read_xml(system.file("extdata/PMC6385181.xml", package = "tidypmc"))
165 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2])
166 | ```
167 | 
168 | Find the tags using `xml_find_all` and then update the nodes by adding brackets
169 | or other text.
170 | 
171 | ```{r bib}
172 | bib <- xml_find_all(doc1, "//xref[@ref-type='bibr']")
173 | bib[1]
174 | xml_text(bib) <- paste0(" [", xml_text(bib), "]")
175 | bib[1]
176 | ```
177 | 
178 | The text is now separated from the reference.  Note the `pmc_text` function adds the brackets by default.
179 | 
180 | ```{r pmc_text2, message=FALSE}
181 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2])
182 | ```
183 | 
184 | 
185 | Genes, species and many other terms are often included within italic tags.  You
186 | can mark these nodes using the same code above or simply list all the names
187 | in italics and search text or tables for matches, for example three letter gene
188 | names in text below.
189 | 
190 | 
191 | ```{r italicgenes}
192 | library(tibble)
193 | x <- xml_name(xml_find_all(doc, "//*"))
194 | tibble(tag=x) %>%
195 |   count(tag, sort=TRUE)
196 | it <- xml_text(xml_find_all(doc, "//sec//p//italic"), trim=TRUE)
197 | it2 <- tibble(italic=it) %>%
198 |   count(italic, sort=TRUE)
199 | it2
200 | filter(it2, nchar(italic) == 3)
201 | separate_text(txt, c("fur", "cys", "hmu", "ybt", "yfe", "yfu", "ymt"))
202 | ```
203 | 
204 | 
205 | 
206 | 
207 | [stringr]: https://stringr.tidyverse.org/
208 | [xml2]: https://github.com/r-lib/xml2
209 | [europepmc]: https://github.com/ropensci/europepmc
210 | [Pubmed Central]: https://europepmc.org
211 | 


--------------------------------------------------------------------------------
/vignettes/tidypmc.md:
--------------------------------------------------------------------------------
  1 | Introduction to tidypmc
  2 | ================
  3 | Chris Stubben
  4 | August 6, 2019
  5 | 
  6 | The `tidypmc` package parses XML documents in the Open Access subset of
  7 | [PubMed Central](https://europepmc.org). Download the full text using
  8 | `pmc_xml`.
  9 | 
 10 | ``` r
 11 | library(tidypmc)
 12 | doc <- pmc_xml("PMC2231364")
 13 | doc
 14 | #  {xml_document}
 15 | #  <article article-type="research-article" xmlns:xlink="http://www.w3.org/1999/xlink">
 16 | #  [1] <front>\n  <journal-meta>\n    <journal-id journal-id-type="nlm-ta"> ...
 17 | #  [2] <body>\n  <sec>\n    <title>Background</title>\n    <p><italic>Yersi ...
 18 | #  [3] <back>\n  <ack>\n    <sec>\n      <title>Acknowledgements</title>\n  ...
 19 | ```
 20 | 
 21 | The package includes five functions to parse the
 22 | `xml_document`.
 23 | 
 24 | | R function      | Description                                                                 |
 25 | | :-------------- | :-------------------------------------------------------------------------- |
 26 | | `pmc_text`      | Split section paragraphs into sentences with full path to subsection titles |
 27 | | `pmc_caption`   | Split figure, table and supplementary material captions into sentences      |
 28 | | `pmc_table`     | Convert table nodes into a list of tibbles                                  |
 29 | | `pmc_reference` | Format references cited into a tibble                                       |
 30 | | `pmc_metadata`  | List journal and article metadata in front node                             |
 31 | 
 32 | `pmc_text` splits paragraphs into sentences and removes any tables,
 33 | figures or formulas that are nested within paragraph tags, replaces
 34 | superscripted references with brackets, adds carets and underscores to
 35 | other superscripts and subscripts and includes the full path to the
 36 | subsection title.
 37 | 
 38 | ``` r
 39 | library(dplyr)
 40 | txt <- pmc_text(doc)
 41 | txt
 42 | #  # A tibble: 194 x 4
 43 | #     section    paragraph sentence text                                                               
 44 | #     <chr>          <int>    <int> <chr>                                                              
 45 | #   1 Title              1        1 Comparative transcriptomics in Yersinia pestis: a global view of e…
 46 | #   2 Abstract           1        1 Environmental modulation of gene expression in Yersinia pestis is …
 47 | #   3 Abstract           1        2 Using cDNA microarray technology, we have analyzed the global gene…
 48 | #   4 Abstract           2        1 To provide us with a comprehensive view of environmental modulatio…
 49 | #   5 Abstract           2        2 Almost all known virulence genes of Y. pestis were differentially …
 50 | #   6 Abstract           2        3 Clustering enabled us to functionally classify co-expressed genes,…
 51 | #   7 Abstract           2        4 Collections of operons were predicted from the microarray data, an…
 52 | #   8 Abstract           2        5 Several regulatory DNA motifs, probably recognized by the regulato…
 53 | #   9 Abstract           3        1 The comparative transcriptomics analysis we present here not only …
 54 | #  10 Background         1        1 Yersinia pestis is the etiological agent of plague, alternatively …
 55 | #  # … with 184 more rows
 56 | count(txt, section)
 57 | #  # A tibble: 21 x 2
 58 | #     section                                                  n
 59 | #     <chr>                                                <int>
 60 | #   1 Abstract                                                 8
 61 | #   2 Authors' contributions                                   6
 62 | #   3 Background                                              20
 63 | #   4 Conclusion                                               3
 64 | #   5 Methods; Clustering analysis                             7
 65 | #   6 Methods; Collection of microarray expression data       17
 66 | #   7 Methods; Discovery of regulatory DNA motifs              8
 67 | #   8 Methods; Gel mobility shift analysis of Fur binding     13
 68 | #   9 Methods; Operon prediction                               5
 69 | #  10 Methods; Verification of predicted operons by RT-PCR     7
 70 | #  # … with 11 more rows
 71 | ```
 72 | 
 73 | `pmc_caption` splits figure, table and supplementary material captions
 74 | into sentences.
 75 | 
 76 | ``` r
 77 | cap1 <- pmc_caption(doc)
 78 | #  Found 5 figures
 79 | #  Found 4 tables
 80 | #  Found 3 supplements
 81 | filter(cap1, sentence == 1)
 82 | #  # A tibble: 12 x 4
 83 | #     tag      label               sentence text                                                       
 84 | #     <chr>    <chr>                  <int> <chr>                                                      
 85 | #   1 figure   Figure 1                   1 Environmental modulation of expression of virulence genes. 
 86 | #   2 figure   Figure 2                   1 RT-PCR analysis of potential operons.                      
 87 | #   3 figure   Figure 3                   1 Schematic representation of the clustered microarray data. 
 88 | #   4 figure   Figure 4                   1 Graphical representation of the consensus patterns by moti…
 89 | #   5 figure   Figure 5                   1 EMSA analysis of the binding of Fur protein to promoter DN…
 90 | #   6 table    Table 1                    1 Stress-responsive operons in Y. pestis predicted from micr…
 91 | #   7 table    Table 2                    1 Classification of the gene members of the cluster II in Fi…
 92 | #   8 table    Table 3                    1 Motif discovery for the clustering genes                   
 93 | #   9 table    Table 4                    1 Designs for expression profiling of Y. pestis              
 94 | #  10 supplem… Additional file 1 …        1 Growth curves of Y. pestis strain 201 under different cond…
 95 | #  11 supplem… Additional file 2 …        1 All the transcriptional changes of 4005 genes of Y. pestis…
 96 | #  12 supplem… Additional file 3 …        1 List of oligonucleotide primers used in this study.
 97 | ```
 98 | 
 99 | `pmc_table` formats tables by collapsing multiline headers, expanding
100 | rowspan and colspan attributes and adding subheadings into a new column.
101 | 
102 | ``` r
103 | tab1 <- pmc_table(doc)
104 | #  Parsing 4 tables
105 | #  Adding footnotes to Table 1
106 | sapply(tab1, nrow)
107 | #  Table 1 Table 2 Table 3 Table 4 
108 | #       39      23       4      34
109 | tab1[[1]]
110 | #  # A tibble: 39 x 5
111 | #     subheading           `Potential operon (r … `Gene ID`  `Putative or predicted fu… `Reference (s)`
112 | #     <chr>                <chr>                  <chr>      <chr>                      <chr>          
113 | #   1 Iron uptake or heme… yfeABCD operon* (r > … YPO2439-2… Transport/binding chelate… yfeABCD [54]   
114 | #   2 Iron uptake or heme… hmuRSTUV operon (r > … YPO0279-0… Transport/binding hemin    hmuRSTUV [55]  
115 | #   3 Iron uptake or heme… ysuJIHG* (r > 0.95)    YPO1529-1… Iron uptake                -              
116 | #   4 Iron uptake or heme… sufABCDS* (r > 0.90)   YPO2400-2… Iron-regulated Fe-S clust… -              
117 | #   5 Iron uptake or heme… YPO1854-1856* (r > 0.… YPO1854-1… Iron uptake or heme synth… -              
118 | #   6 Sulfur metabolism    tauABCD operon (r > 0… YPO0182-0… Transport/binding taurine  tauABCD [56]   
119 | #   7 Sulfur metabolism    ssuEADCB operon (r > … YPO3623-3… Sulphur metabolism         ssu operon [57]
120 | #   8 Sulfur metabolism    cys operon (r > 0.92)  YPO3010-3… Cysteine synthesis         -              
121 | #   9 Sulfur metabolism    YPO1317-1319 (r > 0.9… YPO1317-1… Sulfur metabolism?         -              
122 | #  10 Sulfur metabolism    YPO4109-4111 (r > 0.9… YPO4109-4… Sulfur metabolism?         -              
123 | #  # … with 29 more rows
124 | ```
125 | 
126 | Captions and footnotes are added as attributes.
127 | 
128 | ``` r
129 | attributes(tab1[[1]])
130 | #  $names
131 | #  [1] "subheading"                     "Potential operon (r value)"    
132 | #  [3] "Gene ID"                        "Putative or predicted function"
133 | #  [5] "Reference (s)"                 
134 | #  
135 | #  $row.names
136 | #   [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
137 | #  [33] 33 34 35 36 37 38 39
138 | #  
139 | #  $class
140 | #  [1] "tbl_df"     "tbl"        "data.frame"
141 | #  
142 | #  $caption
143 | #  [1] "Stress-responsive operons in Y. pestis predicted from microarray expression data"
144 | #  
145 | #  $footnotes
146 | #  [1] "'r' represents the correlation coefficient of adjacent genes; '*' represent the defined operon has the similar expression pattern in two other published microarray datasets [7, 21]; '?' inferred functions of uncharacterized genes; '-' means the corresponding operons have not been experimentally validated in other bacteria."
147 | ```
148 | 
149 | Use `collapse_rows` to join column names and cell values in a semi-colon
150 | delimited string (and then search using functions in the next section).
151 | 
152 | ``` r
153 | collapse_rows(tab1, na.string="-")
154 | #  # A tibble: 100 x 3
155 | #     table     row text                                                                               
156 | #     <chr>   <int> <chr>                                                                              
157 | #   1 Table 1     1 subheading=Iron uptake or heme synthesis; Potential operon (r value)=yfeABCD opero…
158 | #   2 Table 1     2 subheading=Iron uptake or heme synthesis; Potential operon (r value)=hmuRSTUV oper…
159 | #   3 Table 1     3 subheading=Iron uptake or heme synthesis; Potential operon (r value)=ysuJIHG* (r >…
160 | #   4 Table 1     4 subheading=Iron uptake or heme synthesis; Potential operon (r value)=sufABCDS* (r …
161 | #   5 Table 1     5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1854-1856*…
162 | #   6 Table 1     6 subheading=Sulfur metabolism; Potential operon (r value)=tauABCD operon (r > 0.90)…
163 | #   7 Table 1     7 subheading=Sulfur metabolism; Potential operon (r value)=ssuEADCB operon (r > 0.97…
164 | #   8 Table 1     8 subheading=Sulfur metabolism; Potential operon (r value)=cys operon (r > 0.92); Ge…
165 | #   9 Table 1     9 subheading=Sulfur metabolism; Potential operon (r value)=YPO1317-1319 (r > 0.97); …
166 | #  10 Table 1    10 subheading=Sulfur metabolism; Potential operon (r value)=YPO4109-4111 (r > 0.90); …
167 | #  # … with 90 more rows
168 | ```
169 | 
170 | `pmc_reference` extracts the id, pmid, authors, year, title, journal,
171 | volume, pages, and DOIs from reference tags.
172 | 
173 | ``` r
174 | ref1 <- pmc_reference(doc)
175 | #  Found 76 citation tags
176 | ref1
177 | #  # A tibble: 76 x 9
178 | #        id pmid   authors                  year title                 journal   volume pages doi      
179 | #     <int> <chr>  <chr>                   <int> <chr>                 <chr>     <chr>  <chr> <chr>    
180 | #   1     1 89938… Perry RD, Fetherston JD  1997 Yersinia pestis--eti… Clin Mic… 10     35-66 <NA>     
181 | #   2     2 16053… Hinnebusch BJ            2005 The evolution of fle… Curr Iss… 7      197-… <NA>     
182 | #   3     3 64693… Straley SC, Harmon PA    1984 Yersinia pestis grow… Infect I… 45     655-… <NA>     
183 | #   4     4 15557… Huang XZ, Lindler LE     2004 The pH 6 antigen is … Infect I… 72     7212… 10.1128/…
184 | #   5     5 15721… Pujol C, Bliska JB       2005 Turning Yersinia pat… Clin Imm… 114    216-… 10.1016/…
185 | #   6     6 12732… Rhodius VA, LaRossa RA   2003 Uses and pitfalls of… Curr Opi… 6      114-… 10.1016/…
186 | #   7     7 15342… Motin VL, Georgescu AM…  2004 Temporal global chan… J Bacter… 186    6298… 10.1128/…
187 | #   8     8 15557… Han Y, Zhou D, Pang X,…  2004 Microarray analysis … Microbio… 48     791-… <NA>     
188 | #   9     9 15777… Han Y, Zhou D, Pang X,…  2005 DNA microarray analy… Microbes… 7      335-… 10.1016/…
189 | #  10    10 15808… Han Y, Zhou D, Pang X,…  2005 Comparative transcri… Res Micr… 156    403-… 10.1016/…
190 | #  # … with 66 more rows
191 | ```
192 | 
193 | Finally, `pmc_metadata` saves journal and article metadata to a list.
194 | 
195 | ``` r
196 | pmc_metadata(doc)
197 | #  $PMCID
198 | #  [1] "PMC2231364"
199 | #  
200 | #  $Title
201 | #  [1] "Comparative transcriptomics in Yersinia pestis: a global view of environmental modulation of gene expression"
202 | #  
203 | #  $Authors
204 | #  [1] "Yanping Han, Jingfu Qiu, Zhaobiao Guo, He Gao, Yajun Song, Dongsheng Zhou, Ruifu Yang"
205 | #  
206 | #  $Year
207 | #  [1] 2007
208 | #  
209 | #  $Journal
210 | #  [1] "BMC Microbiology"
211 | #  
212 | #  $Volume
213 | #  [1] "7"
214 | #  
215 | #  $Pages
216 | #  [1] "96"
217 | #  
218 | #  $`Published online`
219 | #  [1] "2007-10-29"
220 | #  
221 | #  $`Date received`
222 | #  [1] "2007-6-2"
223 | #  
224 | #  $DOI
225 | #  [1] "10.1186/1471-2180-7-96"
226 | #  
227 | #  $Publisher
228 | #  [1] "BioMed Central"
229 | ```
230 | 
231 | ## Searching text
232 | 
233 | There are a few functions to search within the `pmc_text` or collapsed
234 | `pmc_table` output. `separate_text` uses the
235 | [stringr](https://stringr.tidyverse.org/) package to extract any
236 | text matching a regular expression.
237 | 
238 | ``` r
239 | separate_text(txt, "[ATCGN]{5,}")
240 | #  # A tibble: 9 x 5
241 | #    match       section                       paragraph sentence text                                 
242 | #    <chr>       <chr>                             <int>    <int> <chr>                                
243 | #  1 ACGCAATCGT… Results and Discussion; Comp…         2        3 A 16 basepair (bp) box (5'-ACGCAATCG…
244 | #  2 AAACGTTTNC… Results and Discussion; Comp…         2        4 It is very similar to the E. coli Pu…
245 | #  3 TGATAATGAT… Results and Discussion; Comp…         2        5 A 21 bp box (5'-TGATAATGATTATCATTATC…
246 | #  4 GATAATGATA… Results and Discussion; Comp…         2        6 It is a 10-1-10 inverted repeat that…
247 | #  5 TGANNNNNNT… Results and Discussion; Comp…         2        7 A 15 bp box (5'-TGANNNNNNTCAA-3') wa…
248 | #  6 TTGATN      Results and Discussion; Comp…         2        8 It is a part of the E. coli Fnr box …
249 | #  7 NATCAA      Results and Discussion; Comp…         2        8 It is a part of the E. coli Fnr box …
250 | #  8 GTTAATTAA   Results and Discussion; Comp…         3        4 The ArcA regulator can recognize a r…
251 | #  9 GTTAATTAAT… Results and Discussion; Comp…         3        5 An ArcA-box-like sequence (5'-GTTAAT…
252 | ```
253 | 
254 | A few wrappers search pre-defined patterns and add an extra step to
255 | expand matched ranges. `separate_refs` matches references within
256 | brackets using `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`.
257 | 
258 | ``` r
259 | x <- separate_refs(txt)
260 | x
261 | #  # A tibble: 93 x 6
262 | #        id match section   paragraph sentence text                                                    
263 | #     <dbl> <chr> <chr>         <int>    <int> <chr>                                                   
264 | #   1     1 [1]   Backgrou…         1        1 Yersinia pestis is the etiological agent of plague, alt…
265 | #   2     2 [2]   Backgrou…         1        3 To produce a transmissible infection, Y. pestis coloniz…
266 | #   3     3 [3]   Backgrou…         1        9 However, a few bacilli are taken up by tissue macrophag…
267 | #   4     4 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's…
268 | #   5     5 [4,5] Backgrou…         1       10 Residence in this niche also facilitates the bacteria's…
269 | #   6     6 [6]   Backgrou…         2        1 A DNA microarray is able to determine simultaneous chan…
270 | #   7     7 [7-9] Backgrou…         2        2 We and others have measured the gene expression profile…
271 | #   8     8 [7-9] Backgrou…         2        2 We and others have measured the gene expression profile…
272 | #   9     9 [7-9] Backgrou…         2        2 We and others have measured the gene expression profile…
273 | #  10    10 [10]  Backgrou…         2        2 We and others have measured the gene expression profile…
274 | #  # … with 83 more rows
275 | filter(x, id == 8)
276 | #  # A tibble: 5 x 6
277 | #       id match    section                         paragraph sentence text                            
278 | #    <dbl> <chr>    <chr>                               <int>    <int> <chr>                           
279 | #  1     8 [7-9]    Background                              2        2 We and others have measured the…
280 | #  2     8 [8-13,1… Background                              2        4 In order to acquire more regula…
281 | #  3     8 [7-13,1… Results and Discussion                  2        1 Recently, many signature expres…
282 | #  4     8 [7-9]    Results and Discussion; Virule…         3        1 As described previously, expres…
283 | #  5     8 [8-10]   Methods; Collection of microar…         1        6 The genome-wide transcriptional…
284 | ```
285 | 
286 | `separate_genes` expands microbial gene operons like `hmsHFRS` into four
287 | separate genes.
288 | 
289 | ``` r
290 | separate_genes(txt)
291 | #  # A tibble: 103 x 6
292 | #     gene  match  section                         paragraph sentence text                             
293 | #     <chr> <chr>  <chr>                               <int>    <int> <chr>                            
294 | #   1 purR  PurR   Abstract                                2        5 Several regulatory DNA motifs, p…
295 | #   2 phoP  PhoP   Background                              2        3 We also identified the regulons …
296 | #   3 ompR  OmpR   Background                              2        3 We also identified the regulons …
297 | #   4 oxyR  OxyR   Background                              2        3 We also identified the regulons …
298 | #   5 csrA  CsrA   Results and Discussion                  1        3 After the determination of the C…
299 | #   6 slyA  SlyA   Results and Discussion                  1        3 After the determination of the C…
300 | #   7 phoPQ PhoPQ  Results and Discussion                  1        3 After the determination of the C…
301 | #   8 hmsH  hmsHF… Results and Discussion; Virule…         3        3 For example, the hemin storage l…
302 | #   9 hmsF  hmsHF… Results and Discussion; Virule…         3        3 For example, the hemin storage l…
303 | #  10 hmsR  hmsHF… Results and Discussion; Virule…         3        3 For example, the hemin storage l…
304 | #  # … with 93 more rows
305 | ```
306 | 
307 | Finally, `separate_tags` expands locus tag ranges.
308 | 
309 | ``` r
310 | collapse_rows(tab1, na.string="-") %>%
311 |   separate_tags("YPO")
312 | #  # A tibble: 270 x 5
313 | #     id      match      table    row text                                                             
314 | #     <chr>   <chr>      <chr>  <int> <chr>                                                            
315 | #   1 YPO2439 YPO2439-2… Table…     1 subheading=Iron uptake or heme synthesis; Potential operon (r va…
316 | #   2 YPO2440 YPO2439-2… Table…     1 subheading=Iron uptake or heme synthesis; Potential operon (r va…
317 | #   3 YPO2441 YPO2439-2… Table…     1 subheading=Iron uptake or heme synthesis; Potential operon (r va…
318 | #   4 YPO2442 YPO2439-2… Table…     1 subheading=Iron uptake or heme synthesis; Potential operon (r va…
319 | #   5 YPO0279 YPO0279-0… Table…     2 subheading=Iron uptake or heme synthesis; Potential operon (r va…
320 | #   6 YPO0280 YPO0279-0… Table…     2 subheading=Iron uptake or heme synthesis; Potential operon (r va…
321 | #   7 YPO0281 YPO0279-0… Table…     2 subheading=Iron uptake or heme synthesis; Potential operon (r va…
322 | #   8 YPO0282 YPO0279-0… Table…     2 subheading=Iron uptake or heme synthesis; Potential operon (r va…
323 | #   9 YPO0283 YPO0279-0… Table…     2 subheading=Iron uptake or heme synthesis; Potential operon (r va…
324 | #  10 YPO1529 YPO1529-1… Table…     3 subheading=Iron uptake or heme synthesis; Potential operon (r va…
325 | #  # … with 260 more rows
326 | ```
327 | 
328 | ### Using `xml2`
329 | 
330 | The `pmc_*` functions use the [xml2](https://github.com/r-lib/xml2)
331 | package for parsing and may fail in some situations, so it helps to know
332 | how to parse an `xml_document`. Use `cat` and `as.character` to view nodes
333 | returned by `xml_find_all`.
334 | 
335 | ``` r
336 | library(xml2)
337 | refs <- xml_find_all(doc, "//ref")
338 | refs[1]
339 | #  {xml_nodeset (1)}
340 | #  [1] <ref id="B1">\n  <citation citation-type="journal">\n    <person-group person-group-type="aut ...
341 | cat(as.character(refs[1]))
342 | #  <ref id="B1">
343 | #    <citation citation-type="journal">
344 | #      <person-group person-group-type="author">
345 | #        <name>
346 | #          <surname>Perry</surname>
347 | #          <given-names>RD</given-names>
348 | #        </name>
349 | #        <name>
350 | #          <surname>Fetherston</surname>
351 | #          <given-names>JD</given-names>
352 | #        </name>
353 | #      </person-group>
354 | #      <article-title>Yersinia pestis--etiologic agent of plague</article-title>
355 | #      <source>Clin Microbiol Rev</source>
356 | #      <year>1997</year>
357 | #      <volume>10</volume>
358 | #      <fpage>35</fpage>
359 | #      <lpage>66</lpage>
360 | #      <pub-id pub-id-type="pmid">8993858</pub-id>
361 | #    </citation>
362 | #  </ref>
363 | ```
364 | 
365 | Many journals use superscripts for cited references, so the reference
366 | numbers are attached directly to the preceding word, like `results9` below.
367 | 
368 | ``` r
369 | # doc1 <- pmc_xml("PMC6385181")
370 | doc1 <- read_xml(system.file("extdata/PMC6385181.xml", package = "tidypmc"))
371 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2])
372 | #  [1] "RNA-seq identifies the most relevant genes and RT-qPCR validates its results9, especially in the field of environmental and host adaptation10,11 and antimicrobial response12."
373 | ```
374 | 
375 | Find the tags using `xml_find_all` and then update the nodes by adding
376 | brackets or other text.
377 | 
378 | ``` r
379 | bib <- xml_find_all(doc1, "//xref[@ref-type='bibr']")
380 | bib[1]
381 | #  {xml_nodeset (1)}
382 | #  [1] <xref ref-type="bibr" rid="CR1">1</xref>
383 | xml_text(bib) <- paste0(" [", xml_text(bib), "]")
384 | bib[1]
385 | #  {xml_nodeset (1)}
386 | #  [1] <xref ref-type="bibr" rid="CR1"> [1]</xref>
387 | ```
388 | 
389 | The text is now separated from the reference. Note that the `pmc_text`
390 | function adds the brackets by default.
391 | 
392 | ``` r
393 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2])
394 | #  [1] "RNA-seq identifies the most relevant genes and RT-qPCR validates its results [9], especially in the field of environmental and host adaptation [10], [11] and antimicrobial response [12]."
395 | ```
396 | 
397 | Genes, species and many other terms are often included within italic
398 | tags. You can mark these nodes using the same code above or simply list
399 | all the names in italics and search text or tables for matches, for
400 | example the three-letter gene names in the text below.
401 | 
402 | ``` r
403 | library(tibble)
404 | x <- xml_name(xml_find_all(doc, "//*"))
405 | tibble(tag=x) %>%
406 |   count(tag, sort=TRUE)
407 | #  # A tibble: 84 x 2
408 | #     tag               n
409 | #     <chr>         <int>
410 | #   1 td              398
411 | #   2 given-names     388
412 | #   3 name            388
413 | #   4 surname         388
414 | #   5 italic          235
415 | #   6 pub-id          129
416 | #   7 tr              117
417 | #   8 xref            108
418 | #   9 year             80
419 | #  10 article-title    77
420 | #  # … with 74 more rows
421 | it <- xml_text(xml_find_all(doc, "//sec//p//italic"), trim=TRUE)
422 | it2 <- tibble(italic=it) %>%
423 |   count(italic, sort=TRUE)
424 | it2
425 | #  # A tibble: 53 x 2
426 | #     italic        n
427 | #     <chr>     <int>
428 | #   1 Y. pestis    46
429 | #   2 in vitro      5
430 | #   3 E. coli       4
431 | #   4 psaEFABC      3
432 | #   5 r             3
433 | #   6 cis           2
434 | #   7 fur           2
435 | #   8 n             2
436 | #   9 nrdHIEF       2
437 | #  10 sufABCDSE     2
438 | #  # … with 43 more rows
439 | filter(it2, nchar(italic) == 3)
440 | #  # A tibble: 8 x 2
441 | #    italic     n
442 | #    <chr>  <int>
443 | #  1 cis        2
444 | #  2 fur        2
445 | #  3 cys        1
446 | #  4 hmu        1
447 | #  5 ybt        1
448 | #  6 yfe        1
449 | #  7 yfu        1
450 | #  8 ymt        1
451 | separate_text(txt, c("fur", "cys", "hmu", "ybt", "yfe", "yfu", "ymt"))
452 | #  # A tibble: 9 x 5
453 | #    match section                               paragraph sentence text                               
454 | #    <chr> <chr>                                     <int>    <int> <chr>                              
455 | #  1 ymt   Results and Discussion; Virulence ge…         3        4 The ymt gene encoding Yersinia mur…
456 | #  2 fur   Results and Discussion; Clustering a…         3        2 It is noticeable that almost all o…
457 | #  3 yfe   Results and Discussion; Clustering a…         3        4 Genes in category A (yfe, hmu, yfu…
458 | #  4 hmu   Results and Discussion; Clustering a…         3        4 Genes in category A (yfe, hmu, yfu…
459 | #  5 yfu   Results and Discussion; Clustering a…         3        4 Genes in category A (yfe, hmu, yfu…
460 | #  6 ybt   Results and Discussion; Clustering a…         3        4 Genes in category A (yfe, hmu, yfu…
461 | #  7 cys   Results and Discussion; Clustering a…         4        2 Genes responsible for sulfur uptak…
462 | #  8 cys   Results and Discussion; Clustering a…         4        3 Cluster III contains members of th…
463 | #  9 fur   Methods; Gel mobility shift analysis…         1        1 The entire coding region of the fu…
464 | ```
465 | 


--------------------------------------------------------------------------------