├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CONDUCT.md ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── collapse_rows.R ├── path_string.R ├── pmc_caption.R ├── pmc_metadata.R ├── pmc_reference.R ├── pmc_table.R ├── pmc_text.R ├── pmc_xml.R ├── repeat_sub.R ├── separate_genes.R ├── separate_refs.R ├── separate_tags.R ├── separate_text.R └── tidypmc-package.R ├── README.Rmd ├── README.html ├── README.md ├── codecov.yml ├── codemeta.json ├── inst └── extdata │ ├── PMC2231364.xml │ ├── PMC6095483.xml │ ├── PMC6358576_PMC6358589.xml │ └── PMC6385181.xml ├── man ├── collapse_rows.Rd ├── pmc_caption.Rd ├── pmc_metadata.Rd ├── pmc_reference.Rd ├── pmc_table.Rd ├── pmc_text.Rd ├── pmc_xml.Rd ├── separate_genes.Rd ├── separate_refs.Rd ├── separate_tags.Rd ├── separate_text.Rd └── tidypmc.Rd ├── tests ├── testthat.R └── testthat │ ├── tests-pmc_other.R │ ├── tests-pmc_table.R │ ├── tests-pmc_text.R │ └── tests-separate.R └── vignettes ├── pmcftp.Rmd ├── pmcftp.md ├── tidypmc.Rmd └── tidypmc.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | CONDUCT\.md$ 2 | ^codecov\.yml$ 3 | ^\.travis\.yml$ 4 | ^Meta$ 5 | ^doc$ 6 | ^README* 7 | codemeta.json 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | doc 3 | # History files 4 | .Rhistory 5 | .Rapp.history 6 | 7 | # Session Data files 8 | .RData 9 | .DS_Store 10 | # Example code in package build process 11 | *-Ex.R 12 | # Output files from R CMD build 13 | /*.tar.gz 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | # RStudio files 17 | .Rproj.user/ 18 | # produced vignettes 19 | vignettes/*.html 20 | vignettes/*.pdf 21 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 22 | .httr-oauth 23 | # knitr and R markdown default cache directories 24 | /*_cache/ 25 | /cache/ 26 | # Temporary files created by R 
markdown 27 | *.utf8.md 28 | *.knit.md 29 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 30 | rsconnect/ 31 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | after_success: 8 | - Rscript -e 'covr::codecov()' 9 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 
22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http://contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tidypmc 2 | Type: Package 3 | Title: Parse Full Text XML Documents from PubMed Central 4 | Version: 1.8 5 | Authors@R: person("Chris", "Stubben", role = c("aut", "cre"), email = "chris.stubben@hci.utah.edu") 6 | Description: Parse XML documents from the Open Access subset of Europe PubMed Central 7 | including section paragraphs, tables, captions and references. 8 | URL: https://docs.ropensci.org/tidypmc, https://github.com/ropensci/tidypmc 9 | BugReports: https://github.com/ropensci/tidypmc/issues 10 | License: GPL-3 11 | Encoding: UTF-8 12 | VignetteBuilder: knitr 13 | Imports: 14 | xml2, 15 | tokenizers, 16 | stringr, 17 | tibble, 18 | dplyr, 19 | readr 20 | Suggests: 21 | europepmc, 22 | tidytext, 23 | rmarkdown, 24 | knitr, 25 | testthat, 26 | covr 27 | RoxygenNote: 6.1.1 28 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(collapse_rows) 4 | export(pmc_caption) 5 | export(pmc_metadata) 6 | export(pmc_reference) 7 | export(pmc_table) 8 | export(pmc_text) 9 | export(pmc_xml) 10 | export(separate_genes) 11 | export(separate_refs) 12 | export(separate_tags) 13 | export(separate_text) 14 | importFrom(dplyr,"%>%") 15 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | tidypmc 1.8 (dev) 2 | ========================= 3 | 4 | ### DOCUMENTATION FIXES 5 | 6 | * Added a NEWS.md file 
(#2) 7 | 8 | 9 | tidypmc 1.7 (2019-08-01) 10 | ========================= 11 | 12 | ### NEW FEATURES 13 | 14 | * released to CRAN 15 | -------------------------------------------------------------------------------- /R/collapse_rows.R: -------------------------------------------------------------------------------- 1 | #' Collapse a list of PubMed Central tables 2 | #' 3 | #' Collapse rows into a semi-colon delimited list with column names and cell 4 | #' values 5 | #' 6 | #' @param pmc a list of tables, usually from \code{\link{pmc_table}} 7 | #' @param na.string additional cell values to skip, default is NA and "" 8 | #' 9 | #' @return A tibble with table and row number and collapsed text 10 | #' 11 | #' @author Chris Stubben 12 | #' 13 | #' @examples 14 | #' x <- data.frame( 15 | #' genes = c("aroB", "glnP", "ndhA", "pyrF"), 16 | #' fold_change = c(2.5, 1.7, -3.1, -2.6) 17 | #' ) 18 | #' collapse_rows(list(`Table 1` = x)) 19 | #' @export 20 | 21 | collapse_rows <- function(pmc, na.string) { 22 | if (is.null(pmc)) { 23 | cr1 <- NULL 24 | } else { 25 | if (class(pmc)[1] != "list") pmc <- list(Table = pmc) 26 | if (!is.data.frame(pmc[[1]])) { 27 | stop("pmc should be a list of tables from pmc_table") 28 | } 29 | n1 <- length(pmc) 30 | tbls <- vector("list", n1) 31 | names(tbls) <- names(pmc) 32 | for (i in seq_len(n1)) { 33 | x <- data.frame(pmc[[i]], check.names = FALSE) 34 | y <- names(x) 35 | n <- nrow(x) 36 | if (nrow(x) == 0) { 37 | tbls[[i]] <- NULL 38 | } else { 39 | ## convert factors to character 40 | f1 <- vapply(x, is.factor, logical(1)) 41 | if (any(f1)) for (k in which(f1)) x[, k] <- as.character(x[, k]) 42 | # combine (and skip empty fields) 43 | cx <- vector("character", n) 44 | for (j in seq_len(n)) { 45 | n2 <- is.na(x[j, ]) | as.character(x[j, ]) == "" | x[j, ] == "\u00A0" 46 | if (!missing(na.string)) n2 <- n2 | as.character(x[j, ]) == na.string 47 | rowx <- paste(paste(y[!n2], x[j, !n2], sep = "="), collapse = "; ") 48 | cx[j] <- rowx 49 | } 50 | 
#' Print a hierarchical path string
#'
#' Build the full "parent; child; grandchild" path for every entry of an
#' outline that is given as parallel vectors of names and nesting levels.
#'
#' @param x a vector of names (factors are converted to character)
#' @param n a numeric vector with the indentation level of each name, the
#'   same length as \code{x}. If the smallest level is larger than 1, all
#'   levels are shifted so that it becomes 1.
#'
#' @return a character vector the same length as \code{x}, with the names on
#'   the path to each entry joined by "; ". Levels that were skipped (for
#'   example a jump from level 1 to level 3) are left out of the path.
#'
#' @note Used by \code{\link{pmc_text}} to print full path to subsection title
#'
#' @author Chris Stubben
#'
#' @examples
#' x <- c("carnivores", "bears", "polar", "grizzly", "cats", "tiger", "rodents")
#' n <- c(1, 2, 3, 3, 2, 3, 1)
#' path_string(x, n)
#' @noRd

path_string <- function(x, n) {
  n2 <- length(n)
  if (is.factor(x)) x <- as.character(x)
  if (!is.numeric(n)) stop("n should be a vector of numbers")
  if (n2 != length(x)) stop("x and n should be the same length")
  ## nothing to build, and min() would warn on a zero-length vector
  if (n2 == 0) {
    return(character(0))
  }
  if (min(n) > 1) n <- n - min(n) + 1
  z <- character(n2)
  ## start with empty vector
  path <- ""
  for (i in seq_len(n2)) {
    ## add name at position n[i]; skipped levels in between become NA
    path[n[i]] <- x[i]
    ## drop names if n[i] decreases
    path <- path[seq_len(n[i])]
    ## Leave out the NA placeholders of skipped levels. This is done on the
    ## vector rather than with gsub("NA; ", "") on the pasted string, which
    ## also removed the end of real titles such as "Isolation of RNA".
    gap <- is.na(path)
    ## the current name is always kept, as before
    gap[length(gap)] <- FALSE
    ### paste together names
    z[i] <- paste(path[!gap], collapse = "; ")
  }
  z
}
38 | # use node() to avoid pasting /title and /p sentences without a space 39 | f2 <- vapply(z, function(x) paste(xml2::xml_text( 40 | xml2::xml_find_all(x, "./caption/*") 41 | ), collapse = " "), character(1)) 42 | if (all(is.na(f1)) & all(f2 == "")) { 43 | ## ANY label and ANY paragrah 44 | f1 <- vapply(z, function(x) xml2::xml_text( 45 | xml2::xml_find_first(x, ".//label"), 46 | trim = TRUE 47 | ), character(1)) 48 | f2 <- vapply(z, function(x) xml2::xml_text( 49 | xml2::xml_find_first(x, ".//p") 50 | ), character(1)) 51 | } 52 | names(f2) <- gsub("\\.$", "", f1) 53 | ## only some fig tags with media only 54 | f2 <- f2[f2 != ""] 55 | # text in media/ tag 56 | if (length(f2) == 0) { 57 | message(" No figure /caption or /p tag to parse - link to image only?") 58 | figs <- NULL 59 | } else { 60 | x1 <- vapply(f2, tokenizers::tokenize_sentences, list(1)) 61 | figs <- dplyr::bind_rows( 62 | lapply(x1, function(z) 63 | tibble::tibble(sentence = seq_along(z), text = z)), 64 | .id = "label" 65 | ) 66 | } 67 | } else { 68 | figs <- NULL 69 | } 70 | ### Tables 71 | z <- xml2::xml_find_all(doc, "//table-wrap") 72 | if (length(z) > 0) { 73 | n <- length(z) 74 | message("Found ", n, ifelse(n > 1, " tables", " table")) 75 | ## should have label and caption? 
76 | f1 <- vapply(z, function(x) xml2::xml_text( 77 | xml2::xml_find_first(x, "./label"), 78 | trim = TRUE 79 | ), character(1)) 80 | # some with long subcaptions 81 | f2 <- vapply(z, function(x) paste(xml2::xml_text( 82 | xml2::xml_find_all(x, "./caption/*") 83 | ), collapse = " "), character(1)) 84 | names(f2) <- gsub("\\.$", "", f1) 85 | ## only some table tags with media only 86 | f2 <- f2[f2 != ""] 87 | x1 <- vapply(f2, tokenizers::tokenize_sentences, list(1)) 88 | tbls <- dplyr::bind_rows( 89 | lapply(x1, function(z) 90 | tibble::tibble(sentence = seq_along(z), text = z)), 91 | .id = "label" 92 | ) 93 | } else { 94 | tbls <- NULL 95 | } 96 | ### Supplements 97 | z <- xml2::xml_find_all(doc, "//supplementary-material") 98 | if (length(z) > 0) { 99 | if (!all(xml2::xml_text(z, trim = TRUE) == "")) { 100 | n <- length(z) 101 | message("Found ", n, ifelse(n > 1, " supplements", " supplement")) 102 | ## label often missing 103 | f1 <- vapply(z, function(x) xml2::xml_text( 104 | xml2::xml_find_first(x, "./label"), 105 | trim = TRUE 106 | ), character(1)) 107 | # use paste ./caption/* to avoid mashing together title and p like 108 | # Additional file 1Figure S1 109 | f2 <- vapply(z, function(x) paste(xml2::xml_text( 110 | xml2::xml_find_all(x, "./caption/*") 111 | ), collapse = " "), character(1)) 112 | # mBio with /p tags only, others with media/captions only 113 | if (all(f2 == "")) { 114 | f2 <- vapply(z, function(x) xml2::xml_text( 115 | xml2::xml_find_first(x, ".//p") 116 | ), character(1)) 117 | f2[is.na(f2)] <- "" 118 | } 119 | # remove period to avoid splitting (DOC), (XLSX) into new sentences - 120 | # misses (XLSX 32 kb) 121 | f2 <- gsub("\\.( \\([A-Z]+\\))", "\\1", f2) 122 | x1 <- vapply(f2, tokenizers::tokenize_sentences, 123 | list(1), 124 | USE.NAMES = FALSE 125 | ) 126 | if (all(is.na(f1))) { 127 | y <- vapply(x1, function(x) x[1], character(1)) 128 | # if all have more than 1 sentence, then use first for label if all 129 | # are less than 40 
#' Get article metadata
#'
#' Get a list of journal and article metadata in /front tag
#'
#' @param doc \code{xml_document} from PubMed Central
#'
#' @return a named list with the elements that were found among PMCID, Title,
#'   Authors (one comma-delimited string), Year, Journal, Volume, Issue,
#'   Published online, Date received, DOI and Publisher, plus Pages (NA if
#'   neither page numbers nor an elocation-id are present). Returns NULL with
#'   a message if no article title is found.
#'
#' @author Chris Stubben
#'
#' @examples
#' # doc <- pmc_xml("PMC2231364") # OR
#' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
#'   package = "tidypmc"
#' ))
#' pmc_metadata(doc)
#' @export

pmc_metadata <- function(doc) {
  if (!inherits(doc, "xml_document")) {
    stop("doc should be an xml_document from PubMed Central")
  }
  ## text of the first matching node, NA if the node is missing
  first_text <- function(xpath, trim = FALSE) {
    xml2::xml_text(xml2::xml_find_first(doc, xpath), trim = trim)
  }
  ## text of all matching nodes, character(0) if there are none
  all_text <- function(xpath) {
    xml2::xml_text(xml2::xml_find_all(doc, xpath))
  }
  z <- vector("list")
  ## //front has journal-meta and article-meta
  pmcid <- first_text("//front//article-id[@pub-id-type='pmcid']")
  if (!is.na(pmcid)) {
    ## add the PMC prefix only if the id does not already have it
    z[["PMCID"]] <- if (grepl("^PMC", pmcid)) pmcid else paste0("PMC", pmcid)
  }
  t1 <- first_text("//front//article-title", trim = TRUE)
  if (is.na(t1)) {
    message("No title found. Not a PMC XML document?")
    return(NULL)
  }
  z[["Title"]] <- t1
  a1 <- all_text(
    "//front//contrib[not(@contrib-type='editor')]/name/given-names"
  )
  a2 <- all_text(
    "//front//contrib[not(@contrib-type='editor')]/name/surname"
  )
  if (length(a1) != length(a2)) {
    message("WARNING: Check author names -missing first or last tag")
  }
  ## comma-delimited string (easier to bind_rows with multiple pmcids)
  z[["Authors"]] <- paste(paste(a1, a2), collapse = ", ")
  ## Year published: collection year, else print year, else online year
  year <- first_text("//front//pub-date[@pub-type='collection']/year")
  if (is.na(year)) {
    year <- first_text("//front//pub-date[@pub-type='ppub']/year")
  }
  if (is.na(year)) {
    year <- first_text("//front//pub-date[@pub-type='epub']/year")
  }
  if (!is.na(year)) z[["Year"]] <- as.integer(year)
  # Journal meta
  journal <- first_text("//front//journal-meta//journal-title")
  if (!is.na(journal)) z[["Journal"]] <- journal
  ## volume and issue in article metadata
  volume <- first_text("//front//article-meta/volume")
  if (!is.na(volume)) z[["Volume"]] <- volume
  issue <- first_text("//front//article-meta/issue")
  if (!is.na(issue)) z[["Issue"]] <- issue
  # PAGES
  p1 <- first_text("//front//article-meta/fpage")
  if (!is.na(p1)) {
    p2 <- first_text("//front//article-meta/lpage")
    ## lpage may be missing: comparing with NA would make if() fail
    if (!is.na(p2) && p1 != p2) p1 <- paste(p1, p2, sep = "-")
  } else {
    p1 <- first_text("//front//article-meta/elocation-id")
  }
  z[["Pages"]] <- p1
  # More PUB Dates - child tags are reversed and joined with "-"
  # NOTE(review): assumes the tags are ordered day, month, year - confirm
  epub <- all_text("//front//pub-date[@pub-type='epub']/*")
  if (length(epub) > 0) {
    z[["Published online"]] <- paste(rev(epub), collapse = "-")
  }
  rec <- all_text("//front//history/date[@date-type='received']/*")
  if (length(rec) > 0) z[["Date received"]] <- paste(rev(rec), collapse = "-")
  ## DOI
  doi <- first_text("//front//article-id[@pub-id-type='doi']")
  if (!is.na(doi)) z[["DOI"]] <- doi
  # Publisher
  publisher <- first_text("//front//journal-meta//publisher-name")
  if (!is.na(publisher)) z[["Publisher"]] <- publisher
  z
}
11 | #' 12 | #' @examples 13 | #' # doc <- pmc_xml("PMC2231364") 14 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 15 | #' package = "tidypmc" 16 | #' )) 17 | #' x <- pmc_reference(doc) 18 | #' x 19 | #' @export 20 | 21 | pmc_reference <- function(doc) { 22 | if (class(doc)[1] != "xml_document") { 23 | stop("doc should be an xml_document from PubMed Central") 24 | } 25 | z <- xml2::xml_find_all(doc, "//ref") 26 | # cat(as.character(z[[1]])) 27 | if (length(z) > 0) { 28 | n <- lapply(z, function(x) xml2::xml_name(xml2::xml_find_all(x, "./*"))) 29 | x <- as.vector(unlist(n)) 30 | x <- table(x[!x %in% c("label", "note")]) 31 | message("Found ", paste(x, names(x), collapse = " and "), " tags") 32 | ## xml2::xml_find_first returns NA for missing values 33 | pmid <- vapply(z, function(x) xml2::xml_text( 34 | xml2::xml_find_first(x, ".//pub-id[@pub-id-type='pmid']"), 35 | trim = TRUE 36 | ), character(1)) 37 | doi <- vapply(z, function(x) xml2::xml_text( 38 | xml2::xml_find_first(x, ".//pub-id[@pub-id-type='doi']"), 39 | trim = TRUE 40 | ), character(1)) 41 | a1 <- lapply(z, function(x) xml2::xml_text( 42 | xml2::xml_find_all(x, ".//surname"), 43 | trim = TRUE 44 | )) 45 | a2 <- lapply(z, function(x) xml2::xml_text( 46 | xml2::xml_find_all(x, ".//given-names"), 47 | trim = TRUE 48 | )) 49 | # if all references have same number of authors, use SIMPLIFY=FALSE, 50 | # see PMC6369050 51 | authors <- vapply( 52 | mapply(paste, a1, a2, SIMPLIFY = FALSE), 53 | function(x) paste(x, collapse = ", "), character(1) 54 | ) 55 | authors[authors == ""] <- NA 56 | # use character for same authors published twice in same year, 2012a 2012b 57 | year <- vapply(z, function(x) xml2::xml_text( 58 | xml2::xml_find_first(x, ".//year"), 59 | trim = TRUE 60 | ), character(1)) 61 | if (all(grepl("^[0-9]+$", year))) year <- as.integer(year) 62 | title <- vapply(z, function(x) xml2::xml_text( 63 | xml2::xml_find_first(x, ".//article-title"), 64 | trim = TRUE 65 | ), character(1)) 
#' Convert table nodes to tibbles
#'
#' Convert PubMed Central table nodes into a list of tibbles
#'
#' @param doc \code{xml_document} from PubMed Central
#'
#' @return a list of tibbles, one per /table node found inside a
#'   /table-wrap, named by the table-wrap label when the number of tables and
#'   table-wraps agree. Returns NULL with a message if no tables are found.
#'
#' @note Saves the caption and footnotes as attributes and collapses multiline
#' headers, expands all rowspan and colspan attributes and adds
#' subheadings to column one. Values spanning several columns in the table
#' body are shown in the first spanned column only.
#'
#' @author Chris Stubben
#'
#' @examples
#' # doc <- pmc_xml("PMC2231364")
#' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml",
#'   package = "tidypmc"
#' ))
#' x <- pmc_table(doc)
#' sapply(x, dim)
#' x
#' attributes(x[[1]])
#' @export

pmc_table <- function(doc) {
  if (!inherits(doc, "xml_document")) {
    stop("doc should be an xml_document from PubMed Central")
  }
  twn <- length(xml2::xml_find_all(doc, "//table-wrap"))
  ## Avoid table-wrap without table node, usually link to image only
  z <- xml2::xml_find_all(doc, "//table-wrap/table/..")
  if (length(z) == 0) {
    message("No tables found")
    if (twn > 0) message("Table-wrap with link to image?")
    return(NULL)
  }
  tbl_nodes <- xml2::xml_find_all(z, "./table")
  message("Parsing ", length(z), " tables")
  if (twn > length(z)) {
    ## was length(n), but n is not defined at this point
    message(twn - length(z), " /table-wrap with link to image?")
  }

  ## all cells in a row; some tables use th in the body, see PMC3031304
  row_cells <- function(tr) xml2::xml_find_all(tr, ".//td|.//th")

  ## rowspan or colspan of each cell as a number; a missing or empty
  ## attribute (PMC6358641 has rowspan="") counts as 1
  span <- function(cells, attr) {
    s <- xml2::xml_attr(cells, attr, default = "1")
    s[!nzchar(s)] <- "1"
    as.numeric(s)
  }

  ## Lay out /tr rows in a data.frame grid, repeating values across colspans
  ## and down rowspans. With body = TRUE, cell text is trimmed and the
  ## repeated copies of a colspan value are set to NA.
  fill_grid <- function(rows, body) {
    nr <- length(rows)
    nc <- max(vapply(
      rows, function(tr) sum(span(row_cells(tr), "colspan")), double(1)
    ))
    grid <- data.frame(matrix(NA, nrow = nr, ncol = nc))
    for (i in seq_len(nr)) {
      cells <- row_cells(rows[[i]])
      rowspan <- span(cells, "rowspan")
      colspan <- span(cells, "colspan")
      val <- xml2::xml_text(cells)
      if (body) {
        # NO-BREAK, EN or EM SPACE
        val <- trimws(gsub("\u00A0|\u2002|\u2003", " ", val))
      }
      if (any(colspan > 1)) {
        ## TRUE for the 2nd, 3rd, ... copy of a value that spans columns
        copy <- sequence(colspan) > 1
        val <- rep(val, colspan)
        ## Only display a spanning value (usually a subheader) in its first
        ## column. Blank the copies only, so equal values in neighboring
        ## cells of the same row are kept.
        if (body) val[copy] <- NA
        rowspan <- rep(rowspan, colspan)
      }
      # fill values into empty cells (others were filled by a rowspan above)
      n <- which(is.na(grid[i, ]))
      ## some tables have extra td tags, see table 2 in PMC3109299 and
      ## PMC3119406; truncate (or pad with NA) to the free cells
      if (length(val) != length(n)) val <- val[seq_along(n)]
      if (length(n) > 0) grid[i, n] <- val
      for (j in which(rowspan > 1)) {
        ## cells past the free columns were truncated above
        if (j > length(n)) break
        ## repeat value down column; some rowspans extend past the last
        ## row, see table 1 PMC3109299, so stop at nr
        down <- i + seq_len(min(rowspan[j] - 1, nr - i))
        if (length(down) > 0) grid[down, n[j]] <- val[j]
      }
    }
    grid
  }

  ## START table function
  tbls <- lapply(tbl_nodes, function(t1) {
    # PARSE HEADER
    hrows <- xml2::xml_find_all(t1, ".//thead/tr")
    if (length(hrows) == 0) {
      ## missing header
      thead <- NA
    } else if (length(hrows) == 1) {
      ## 1 header row, repeat names across colspan
      cells <- row_cells(hrows)
      thead <- rep(xml2::xml_text(cells), span(cells, "colspan"))
    } else {
      # multiline header - collapse into single row
      # SEE tables 1 and 2 in PMC3109299
      hgrid <- fill_grid(hrows, body = FALSE)
      ## collapse the unique values in each column using ": " as separator
      thead <- apply(hgrid, 2, function(x)
        paste(unique(x), collapse = ": "))
      # some multiline rows with horizontal lines only
      thead <- gsub(": : ", ": ", thead)
      thead <- gsub("^: ", "", thead)
      thead <- gsub(": $", "", thead)
    }
    #-------------------------------------------------------------------
    # PARSE TABLE
    # Do not repeat values with colspans across rows (usually table
    # subheaders). Repeat values with rowspan down columns
    rows <- xml2::xml_find_all(t1, ".//tbody/tr")
    ## rows may also be direct children of /table when there is no /tbody
    if (length(rows) == 0) rows <- xml2::xml_find_all(t1, "./tr")
    if (length(rows) == 0) stop("No table rows found in /table node")
    x <- fill_grid(rows, body = TRUE)
    #-------------------------------------
    if (!is.na(thead[1])) {
      thead[thead == ""] <- "X"
      tbn <- ncol(x)
      thn <- length(thead)
      if (tbn != thn) {
        message("Warning: number of column in /thead and /tbody do not match")
        if (tbn > thn) {
          thead <- append(thead, rep("X", tbn - thn))
        } else {
          ## see table 3 from PMC3020393
          thead <- thead[seq_len(tbn)]
        }
      }
      thead <- gsub("\n", " ", thead)
      thead <- make.unique(thead)
      colnames(x) <- thead
    }
    # DELETE empty rows -
    if (nrow(x) > 1) {
      nX <- apply(x, 1, function(y) sum(!(is.na(y) | y == "")))
      # drop = FALSE in case only 1 column in TABLE
      x <- x[nX != 0, , drop = FALSE]
    }
    # FIX column types
    ## errors if newlines and tabs in cells (or colnames!)
    colnames(x) <- gsub("\n *", "", colnames(x))
    x <- tibble::as_tibble(x)
    # repeat_sub is defined elsewhere in the package
    x <- suppressMessages(repeat_sub(x))
    x
  })
  ### END table function
  #----------------------------------------------------
  ## should have label and caption?
  f1 <- vapply(z, function(x) xml2::xml_text(
      xml2::xml_find_first(x, "./label")
    ), character(1))
  f2 <- vapply(z, function(x) xml2::xml_text(
      xml2::xml_find_first(x, "./caption")
    ), character(1))
  # check length, some table-wrap with more than 1 /table tag
  if (length(f1) == length(tbls)) {
    names(tbls) <- f1
  } else {
    message("Number of /table nodes is not the sampe as /table-wrap")
  }
  if (length(f2) == length(tbls)) {
    for (i in seq_along(tbls)) {
      attr(tbls[[i]], "caption") <- f2[i]
    }
  }
  ## footnotes
  ## NOTE(review): indexed by table-wrap, so a footnote may be attached to
  ## the wrong table when a table-wrap holds more than one /table - confirm
  fn <- vapply(z, function(x) xml2::xml_text(
      xml2::xml_find_first(x, "./table-wrap-foot")
    ), character(1))
  n <- which(!is.na(fn))
  if (length(n) > 0) {
    message("Adding footnotes to Table ", paste(n, collapse = ","))
    for (i in n) {
      attr(tbls[[i]], "footnotes") <- fn[i]
    }
  }
  tbls
}
16 | #' 17 | #' @author Chris Stubben 18 | #' 19 | #' @examples 20 | #' # doc <- pmc_xml("PMC2231364") 21 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 22 | #' package = "tidypmc" 23 | #' )) 24 | #' txt <- pmc_text(doc) 25 | #' txt 26 | #' dplyr::count(txt, section, sort = TRUE) 27 | #' @export 28 | 29 | pmc_text <- function(doc) { 30 | if (class(doc)[1] != "xml_document") { 31 | stop("doc should be an xml_document from PubMed Central") 32 | } 33 | ## create new document to remove nodes 34 | doc2 <- xml2::xml_new_root(doc) 35 | z <- vector("list") 36 | ## Main title 37 | t1 <- xml2::xml_text(xml2::xml_find_first( 38 | doc2, "//front//article-title" 39 | ), trim = TRUE) 40 | if (!is.na(t1)) z[["Title"]] <- t1 41 | ## Abstract 42 | a1 <- xml2::xml_text(xml2::xml_find_all( 43 | doc2, "//abstract[not(@abstract-type='summary')]//p" 44 | )) 45 | if (length(a1) > 0) z[["Abstract"]] <- a1 46 | 47 | ## Author summary 48 | author_sum <- xml2::xml_text(xml2::xml_find_all( 49 | doc2, "//abstract[@abstract-type='summary']/title" 50 | )) 51 | if (length(author_sum) > 0) { 52 | z[[author_sum]] <- xml2::xml_text(xml2::xml_find_all( 53 | doc2, "//abstract[@abstract-type='summary']//p" 54 | )) 55 | } 56 | if (length(z) == 0) { 57 | message("No title or abstract found. 
Not a PMC XML document?") 58 | x <- NULL 59 | } else { 60 | ## check for tables, figures, formula within tags 61 | n <- xml2::xml_find_all(doc2, "//sec/p/table-wrap") 62 | if (length(n) > 0) { 63 | message("Note: removing table-wrap nested in sec/p tag") 64 | xml2::xml_remove(n) 65 | } 66 | n <- xml2::xml_find_all(doc2, "//sec/p/fig") 67 | if (length(n) > 0) { 68 | message("Note: removing fig nested in sec/p tag") 69 | xml2::xml_remove(n) 70 | } 71 | # formulas may include very long MathType encoding strings 72 | n <- xml2::xml_find_all(doc2, "//sec/p/disp-formula") 73 | if (length(n) > 0) { 74 | message("Note: removing disp-formula nested in sec/p tag") 75 | xml2::xml_remove(n) 76 | } 77 | # DROP any sections with supplementary materials (often with nested 78 | # sections missing titles) 79 | n <- xml2::xml_find_all( 80 | doc2, "//body//sec[@sec-type='supplementary-material']" 81 | ) 82 | if (length(n) > 0) xml2::xml_remove(n) 83 | ## Add brackets to numbered references with superscript tags 84 | add_bracket <- FALSE 85 | bib <- xml2::xml_find_all(doc2, "//sup//xref[@ref-type='bibr']") 86 | if (length(bib) > 0) { 87 | message("Adding brackets to numbered references in /sup tags") 88 | add_bracket <- TRUE 89 | xml2::xml_text(bib) <- paste0(" [", xml2::xml_text(bib), "]") 90 | } 91 | ## Add ^ and _ to /sup and /sub tags 92 | sup <- xml2::xml_find_all(doc2, "//sup[not(xref)]") 93 | if (length(sup) > 0) { 94 | xml2::xml_text(sup) <- paste0("^", xml2::xml_text(sup)) 95 | } 96 | subs <- xml2::xml_find_all(doc2, "//sub") 97 | if (length(subs) > 0) { 98 | xml2::xml_text(subs) <- paste0("_", xml2::xml_text(subs)) 99 | } 100 | 101 | ## parse text from Sections 102 | sec <- xml2::xml_find_all(doc2, "//body//sec") 103 | if (length(sec) == 0) { 104 | message("NOTE: No sections found, using all text in main body/p") 105 | z[["[Main]"]] <- xml2::xml_text(xml2::xml_find_all(doc2, "//body/p")) 106 | } else { 107 | ## Emerging infectious diseases has both body/p and body/sec 108 | 
intro <- xml2::xml_text(xml2::xml_find_all(doc2, "//body/p")) 109 | if (length(intro) > 0) { 110 | message( 111 | "NOTE: Body has both /p and /sec tags - untitled Introduction?" 112 | ) 113 | z[["[Introduction]"]] <- xml2::xml_text( 114 | xml2::xml_find_all(doc2, "//body/p") 115 | ) 116 | } 117 | # /sec should have both title and p? 118 | t1 <- xml2::xml_text(xml2::xml_find_all(doc2, "//body//sec/title")) 119 | # fix sections without title ... PMC6360207 120 | if ("" %in% t1) { 121 | message("Missing ", sum(t1 == ""), " title in sec/p tag") 122 | t1[t1 == ""] <- "[untitled sec/p]" 123 | } 124 | ## indentation level of subsections 125 | n <- stringr::str_count(xml2::xml_path( 126 | xml2::xml_find_all(doc2, "//body//sec/title") 127 | ), "/") 128 | ## full path to subsection title 129 | path <- path_string(t1, n) 130 | ## section paragraphs (get sec/p and not any //p) 131 | secP <- lapply(sec, function(x) xml2::xml_text( 132 | xml2::xml_find_all(x, "./p") 133 | )) 134 | if (length(path) != length(secP)) { 135 | message("Warning: some sections are missing /title tags") 136 | } 137 | minP <- min(length(path), length(secP)) 138 | ## LOOP through subsections and skip sections missing /p tags 139 | for (i in seq_len(minP)) { 140 | subT <- path[i] 141 | subT <- gsub("\\.$", "", subT) 142 | # in case of nested sec tags, replace "; ; ; " 143 | subT <- gsub("[; ]{3,}", "; ", subT) 144 | if (length(secP[[i]]) > 0) { 145 | ## don't split Fig. 
1 into two sentences, probably many others 146 | p1 <- lapply( 147 | secP[[i]], 148 | function(x) gsub("([ (][Ff]ig)\\.", "\\1", x) 149 | ) 150 | z[[subT]] <- p1 151 | } 152 | } 153 | } 154 | x <- lapply(z, tokenizers::tokenize_sentences) 155 | x1 <- lapply(x, function(y) dplyr::bind_rows( 156 | lapply(y, function(z) if (length(z) > 0) { 157 | tibble::tibble(sentence = seq_along(z), text = z) 158 | }), 159 | .id = "paragraph" 160 | )) 161 | x <- dplyr::bind_rows(x1, .id = "section") 162 | x <- dplyr::mutate(x, paragraph = as.integer(paragraph)) 163 | # replace en dash, em dash, etc to separate ranges 164 | x$text <- gsub("\u2011|\u2012|\u2013|\u2014", "-", x$text) 165 | ## FIX if brackets added to superscripted references 166 | if (add_bracket) x$text <- gsub("]- [", "-", x$text, fixed = TRUE) 167 | } 168 | x 169 | } 170 | -------------------------------------------------------------------------------- /R/pmc_xml.R: -------------------------------------------------------------------------------- 1 | #' Download XML from PubMed Central 2 | #' 3 | #' @param id a PMC id starting with 'PMC' 4 | #' 5 | #' @return \code{xml_document} 6 | #' 7 | #' @source \url{https://europepmc.org/RestfulWebService} 8 | #' 9 | #' @examples 10 | #' \dontrun{ 11 | #' doc <- pmc_xml("PMC2231364") 12 | #' } 13 | #' 14 | #' @export 15 | 16 | pmc_xml <- function(id) { 17 | if (!grepl("^PMC[0-9]+$", id)) { 18 | stop("id should be a valid PMC id like PMC2231364") 19 | } 20 | url1 <- paste0( 21 | "https://www.ebi.ac.uk/europepmc/webservices/rest/", id, "/fullTextXML" 22 | ) 23 | xml2::read_xml(url1) 24 | } 25 | -------------------------------------------------------------------------------- /R/repeat_sub.R: -------------------------------------------------------------------------------- 1 | #' Repeat table subheadings 2 | #' 3 | #' Repeat table subheadings in a new column 4 | #' 5 | #' Identifies subheadings in a data frame by checking for rows with a non-empty 6 | #' first column and all other 
columns are empty. Removes subheader rows and 7 | #' repeats values down a new column. 8 | #' 9 | #' @param x a tibble with subheadings 10 | #' @param column new column name, default subheading 11 | #' @param first add subheader as first column, default TRUE 12 | #' 13 | #' @return a tibble 14 | #' 15 | #' @author Chris Stubben 16 | #' 17 | #' @examples 18 | #' x <- data.frame( 19 | #' genes = c("Up", "aroB", "glnP", "Down", "ndhA", "pyrF"), 20 | #' fold_change = c(NA, 2.5, 1.7, NA, -3.1, -2.6) 21 | #' ) 22 | #' x 23 | #' repeat_sub(x) 24 | #' repeat_sub(x, "regulated", first = FALSE) 25 | #' @noRd 26 | 27 | repeat_sub <- function(x, column = "subheading", first = TRUE) { 28 | if (!is.data.frame(x)) { 29 | stop("x should be a table") 30 | } 31 | if (ncol(x) == 1) { 32 | message("Only one column in table") 33 | } else { 34 | ## columns 2 to ncol(x) should be empty 35 | ## \u00A0 is non-breaking space 36 | n <- apply( 37 | x[, -1, FALSE], 1, 38 | function(z) all(is.na(z) | z == "NA" | z == "" | z == "\u00A0") 39 | ) 40 | if (sum(n) == 0) { 41 | message("No subheaders found") 42 | } else if (sum(diff(which(n)) == 1) > 1) { 43 | ## check for consecutive subheaders (and then probably not subheaders) 44 | ## SEE PMC3334355 45 | message("Too many subheaders in consecutive rows") 46 | } else if (which(n)[1] != 1) { 47 | message("No subheader in row 1") 48 | } else { 49 | # keep copy of original table 50 | y <- x 51 | ## add unlist() for tibbles 52 | x[[column]] <- rep(unlist(x[n,1]), times = diff(c(which(n), nrow(x) + 1))) 53 | # drop rows with subheader only 54 | y <- x[!n, ] 55 | # rownames(y)<-NULL 56 | y <- suppressMessages(readr::type_convert(y)) 57 | if (first) y <- y[, c(ncol(y), seq_len(ncol(y) - 1))] 58 | x <- y 59 | } 60 | } 61 | x 62 | } 63 | -------------------------------------------------------------------------------- /R/separate_genes.R: -------------------------------------------------------------------------------- 1 | #' Separate genes and operons into 
multiple rows 2 | #' 3 | #' Separate genes and operons mentioned in full text into multiple rows 4 | #' 5 | #' @param txt a table 6 | #' @param pattern regular expression to match genes, default is to match 7 | #' microbial genes like AbcD, default [A-Za-z][a-z]{2}[A-Z0-9]+ 8 | #' @param genes an optional vector of genes, set pattern to NA to only match 9 | #' this list. 10 | #' @param operon operon length, default 6. Split genes with 6 or more letters 11 | #' into separate genes, for example AbcDEF is split into abcD, abcE and abcF. 12 | #' @param column column name to search, default "text" 13 | #' 14 | #' @note Check for genes in italics using \code{xml_text(xml_find_all(doc, 15 | #' "//sec//p//italic"))} and update the pattern or add additional genes as an 16 | #' optional vector if needed 17 | #' 18 | #' @return a tibble with gene name, matching text and rows. 19 | #' 20 | #' @author Chris Stubben 21 | #' 22 | #' @examples 23 | #' x <- data.frame(row = 1, text = "Genes like YacK, hmu and sufABC") 24 | #' separate_genes(x) 25 | #' separate_genes(x, genes = "hmu") 26 | #' @export 27 | 28 | separate_genes <- function(txt, pattern = "\\b[A-Za-z][a-z]{2}[A-Z0-9]+\\b", 29 | genes, operon = 6, column = "text") { 30 | if (!operon > 4) stop("Operon length should be 5 or more") 31 | if (!missing(genes)) { 32 | x1 <- paste0("\\b", paste(genes, collapse = "\\b|\\b"), "\\b") 33 | if (pattern %in% c("", NA)) { 34 | pattern <- x1 35 | } else { 36 | pattern <- paste(pattern, x1, sep = "|") 37 | } 38 | } 39 | x <- separate_text(txt, pattern, column) 40 | if (is.null(x)) { 41 | x1 <- NULL 42 | } else { 43 | ## add option to exclue common matches 44 | x <- dplyr::filter(x, !match %in% c( 45 | "TraDIS", "taqDNA", "log2", "log10", 46 | "ecoRI", "bamHI", "chr1", "chr2" 47 | )) 48 | if (nrow(x) == 0) stop("No match to genes") 49 | ## don't split locus tags like ypo2995 50 | y <- ifelse(nchar(x$match) >= operon & !grepl( 51 | "^[0-9]+$", 52 | substring(x$match, 4) 53 | ), 54 | 
mapply( 55 | paste0, tolower(substr(x$match, 1, 3)), 56 | strsplit(substring(x$match, 4), "") 57 | ), 58 | paste0(tolower(substr(x$match, 1, 1)), substring(x$match, 2)) 59 | ) 60 | n <- vapply(y, length, integer(1)) 61 | x1 <- dplyr::bind_cols(gene = unlist(y), x[ rep(seq_len(nrow(x)), n), ]) 62 | } 63 | x1 64 | } 65 | -------------------------------------------------------------------------------- /R/separate_refs.R: -------------------------------------------------------------------------------- 1 | #' Separate references cited into multiple rows 2 | #' 3 | #' Separates references cited in brackets or parentheses into multiple rows and 4 | #' splits the comma-delimited numeric strings and expands ranges like 7-9 into 5 | #' new rows 6 | #' 7 | #' @param txt a table 8 | #' @param column column name, default "text" 9 | #' 10 | #' @return a tibble 11 | #' 12 | #' @author Chris Stubben 13 | #' 14 | #' @examples 15 | #' x <- data.frame(row = 1, text = "some important studies [7-9,15]") 16 | #' separate_refs(x) 17 | #' @export 18 | 19 | separate_refs <- function(txt, column = "text") { 20 | pattern <- "(\\(|\\[)[0-9, -]+(\\]|\\))" 21 | x <- separate_text(txt, pattern, column) 22 | if (is.null(x)) { 23 | x1 <- NULL 24 | } else { 25 | # remove any parentheses, spaces and brackets 26 | y <- gsub("[)( ]|\\]|\\[", "", x$match) 27 | ## split commas 28 | y <- strsplit(y, ",") 29 | ## split ranges 30 | z <- lapply(y, strsplit, "-") 31 | ## apply seq if length is 2 32 | y <- lapply(z, function(x) unlist( 33 | lapply(x, function(x1) 34 | if (length(x1) == 2) seq(x1[1], x1[2]) else as.numeric(x1)) 35 | )) 36 | n <- vapply(y, length, integer(1)) 37 | x1 <- dplyr::bind_cols(id = unlist(y), x[ rep(seq_len(nrow(x)), n), ]) 38 | } 39 | x1 40 | } 41 | -------------------------------------------------------------------------------- /R/separate_tags.R: -------------------------------------------------------------------------------- 1 | #' Separate locus tag into multiple rows 2 | #' 3 | 
#' Separates locus tags mentioned in full text and expands ranges like 4 | #' YPO1970-74 into new rows 5 | #' 6 | #' @param txt a table 7 | #' @param pattern regular expression to match locus tags like YPO[0-9-]+ or 8 | #' the locus tag prefix like YPO. 9 | #' @param column column name to search, default "text" 10 | #' 11 | #' @return a tibble with locus tag, matching text and rows. 12 | #' 13 | #' @author Chris Stubben 14 | #' 15 | #' @examples 16 | #' x <- data.frame(row = 1, text = "some genes like YPO1002 and YPO1970-74") 17 | #' separate_tags(x, "YPO") 18 | #' @export 19 | 20 | separate_tags <- function(txt, pattern, column = "text") { 21 | ## if prefix only (no numbers) also match YPO1854-YPO1856? 22 | if (!grepl("[0-9]", pattern)) { 23 | # pattern <- paste0(pattern, "[0-9-]+") 24 | pattern <- paste0(pattern, "[0-9", pattern, "-]+") 25 | } 26 | x <- separate_text(txt, pattern, column) 27 | if (is.null(x)) { 28 | x1 <- NULL 29 | } else { 30 | ## avoid YPO1854-YPO1856-YPO1858 31 | if (any(stringr::str_count(x$match, "-") > 1)) { 32 | stop("pattern matches 3 or more tags") 33 | } 34 | if (any(grepl("-$", x$match))) x$match <- gsub("-$", "", x$match) 35 | # Expand range if matching "-" 36 | y <- lapply(x$match, function(id) { 37 | if (grepl("-", id)) { 38 | pre <- stringr::str_extract(id, "^[^0-9]+") 39 | ## split range 40 | x <- strsplit(gsub("[^0-9-]", "", id), "-")[[1]] 41 | n <- nchar(x[1]) 42 | x <- as.numeric(x) 43 | ## check if 2nd number is less than 1st... 
YPO1970-80 44 | if (x[2] < x[1]) { 45 | x[2] <- paste0( 46 | substring(x[1], 1, nchar(x[1]) - nchar(x[2])), x[2] 47 | ) 48 | } 49 | id <- seq(x[1], x[2]) 50 | id <- stringr::str_pad(id, n, pad = "0") 51 | id <- paste0(pre, id) 52 | } 53 | id 54 | }) 55 | n <- vapply(y, length, integer(1)) 56 | x1 <- dplyr::bind_cols(id = unlist(y), x[ rep(seq_len(nrow(x)), n), ]) 57 | } 58 | x1 59 | } 60 | -------------------------------------------------------------------------------- /R/separate_text.R: -------------------------------------------------------------------------------- 1 | #' Separate all matching text into multiple rows 2 | #' 3 | #' @param txt a tibble, usually results from \code{pmc_text} 4 | #' @param pattern either a regular expression or a vector of words to find in 5 | #' text 6 | #' @param column column name, default "text" 7 | #' 8 | #' @return a tibble 9 | #' 10 | #' @note passed to \code{grepl} and \code{str_extract_all} 11 | #' 12 | #' @author Chris Stubben 13 | #' 14 | #' @examples 15 | #' # doc <- pmc_xml("PMC2231364") 16 | #' doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 17 | #' package = "tidypmc")) 18 | #' txt <- pmc_text(doc) 19 | #' separate_text(txt, "[ATCGN]{5,}") 20 | #' separate_text(txt, "\\([A-Z]{3,6}s?\\)") 21 | #' # pattern can be a vector of words 22 | #' separate_text(txt, c("hmu", "ybt", "yfe", "yfu")) 23 | #' # wrappers for separate_text with extra step to expand matched ranges 24 | #' separate_refs(txt) 25 | #' separate_genes(txt) 26 | #' separate_tags(txt, "YPO") 27 | #' 28 | #' @export 29 | 30 | separate_text <- function(txt, pattern, column = "text"){ 31 | if (!is.data.frame(txt)) stop("txt should be a tibble") 32 | if (!column %in% names(txt)) stop("column ", column, " is not found") 33 | ## paste words into | delimited string with word boundaries 34 | if (length(pattern) > 1) { 35 | pattern <- paste0("\\b", paste(pattern, collapse = "\\b|\\b"), "\\b") 36 | } 37 | x <- dplyr::filter(txt, grepl(pattern, 
txt[[column]])) 38 | if (nrow(x) == 0) { 39 | message("No match to ", pattern) 40 | txt2 <- NULL 41 | } else { 42 | y <- stringr::str_extract_all(x[[column]], pattern) 43 | y <- lapply(y, unique) 44 | n <- vapply(y, length, integer(1)) 45 | txt2 <- dplyr::bind_cols(match = unlist(y), x[rep(seq_len(nrow(x)), n),]) 46 | } 47 | txt2 48 | } 49 | -------------------------------------------------------------------------------- /R/tidypmc-package.R: -------------------------------------------------------------------------------- 1 | #' \code{tidypmc} package 2 | #' 3 | #' Parse full text XML documents from PubMed Central 4 | #' 5 | #' See the Github page for details at \url{https://github.com/ropensci/tidypmc} 6 | #' 7 | #' @docType package 8 | #' @name tidypmc 9 | #' @importFrom dplyr %>% 10 | #' @keywords internal 11 | NULL 12 | 13 | # R CMD check error from 14 | # https://github.com/jennybc/googlesheets/blob/master/R/googlesheets.R 15 | if(getRversion() >= "2.15.1") utils::globalVariables(c("paragraph")) 16 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r setup, include = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "# " 9 | ) 10 | ``` 11 | 12 | [![Build Status](https://travis-ci.org/ropensci/tidypmc.svg?branch=master)](https://travis-ci.org/ropensci/tidypmc) 13 | [![Coverage status](https://codecov.io/gh/ropensci/tidypmc/branch/master/graph/badge.svg)](https://codecov.io/github/ropensci/tidypmc?branch=master) 14 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/tidypmc)](https://cran.r-project.org/package=tidypmc) 15 | [![Downloads](https://cranlogs.r-pkg.org/badges/tidypmc)](https://CRAN.R-project.org/package=tidypmc) 16 | [![Total 
Downloads](https://cranlogs.r-pkg.org/badges/grand-total/tidypmc?color=orange)](https://CRAN.R-project.org/package=tidypmc) 17 | 18 | # tidypmc 19 | 20 | The [Open Access subset] of [Pubmed Central] (PMC) includes 2.5 million articles 21 | from biomedical and life sciences journals. The full text XML files are freely 22 | available for text mining from the [REST service] or [FTP site] but can be 23 | challenging to parse. For example, section tags are nested to arbitrary depths, 24 | formulas and tables may return incomprehensible text blobs and superscripted 25 | references are pasted at the end of words. The functions in the `tidypmc` 26 | package are intended to return readable text and maintain the document 27 | structure, so gene names and other terms can be associated with specific 28 | sections, paragraphs, sentences or table rows. 29 | 30 | 31 | ## Installation 32 | 33 | Use [remotes] to install the package. 34 | 35 | ```{r install, eval=FALSE} 36 | remotes::install_github("ropensci/tidypmc") 37 | ``` 38 | 39 | ## Load XML 40 | 41 | Download a single XML document like [PMC2231364] from the [REST service] using 42 | the `pmc_xml` function. 43 | 44 | ```{r pmc_xml, message=FALSE, echo=-1} 45 | options(width=100) 46 | library(tidypmc) 47 | library(tidyverse) 48 | doc <- pmc_xml("PMC2231364") 49 | doc 50 | ``` 51 | 52 | The [europepmc] package includes additional functions to search PMC 53 | and download full text. Be sure to include the `OPEN_ACCESS` field in 54 | the search since these are the only articles with full text XML available. 55 | 56 | ```{r epmc, echo=-1} 57 | options(width=100) 58 | library(europepmc) 59 | yp <- epmc_search("title:(Yersinia pestis virulence) OPEN_ACCESS:Y") 60 | select(yp, pmcid, pubYear, title) %>% 61 | print(n=5) 62 | ``` 63 | 64 | 65 | Save all `r nrow(yp)` results to a list of XML documents using the `epmc_ftxt` or `pmc_xml` function. 
66 | 67 | ```{r purrr, eval=FALSE} 68 | docs <- map(yp$pmcid, epmc_ftxt) 69 | ``` 70 | 71 | 72 | See the [PMC FTP vignette] for details on parsing the large XML files on the [FTP site] 73 | with 10,000 articles each. 74 | 75 | 76 | ## Parse XML 77 | 78 | 79 | The package includes five functions to parse the `xml_document`. 80 | 81 | 82 | |R function |Description | 83 | |:--------------|:--------------------------------------------------------------------------| 84 | |`pmc_text` |Split section paragraphs into sentences with full path to subsection titles| 85 | |`pmc_caption` |Split figure, table and supplementary material captions into sentences | 86 | |`pmc_table` |Convert table nodes into a list of tibbles | 87 | |`pmc_reference`|Format references cited into a tibble | 88 | |`pmc_metadata` |List journal and article metadata in front node | 89 | 90 | 91 | The `pmc_text` function uses the [tokenizers] package to split section paragraphs into 92 | sentences. The function also removes any tables, figures or formulas that are nested 93 | within paragraph tags, replaces superscripted references with brackets, adds carets and 94 | underscores to other superscripts and subscripts and includes the full path to the 95 | subsection title. 96 | 97 | ```{r pmc_text, echo=-1} 98 | options(width=110) 99 | txt <- pmc_text(doc) 100 | txt 101 | count(txt, section, sort=TRUE) 102 | ``` 103 | 104 | 105 | Load the [tidytext] package for further text processing. 106 | 107 | ```{r tidytext, echo=-1} 108 | options(width=110) 109 | library(tidytext) 110 | x1 <- unnest_tokens(txt, word, text) %>% 111 | anti_join(stop_words) %>% 112 | filter(!word %in% 1:100) 113 | filter(x1, str_detect(section, "^Results")) 114 | filter(x1, str_detect(section, "^Results")) %>% 115 | count(word, sort = TRUE) 116 | ``` 117 | 118 | 119 | 120 | The `pmc_table` function formats tables by collapsing multiline headers, 121 | expanding rowspan and colspan attributes and adding subheadings into a new column. 
122 | 123 | ```{r pmc_table, echo=-1} 124 | options(width=110) 125 | tbls <- pmc_table(doc) 126 | map_int(tbls, nrow) 127 | tbls[[1]] 128 | ``` 129 | 130 | Use `collapse_rows` to join column names and cell values in a semi-colon delimited string (and 131 | then search using functions in the next section). 132 | 133 | ```{r collapserows, echo=-1} 134 | options(width=110) 135 | collapse_rows(tbls, na.string="-") 136 | ``` 137 | 138 | The other three `pmc` functions are described in the package [vignette]. 139 | 140 | 141 | ## Searching text 142 | 143 | There are a few functions to search within the `pmc_text` or collapsed 144 | `pmc_table` output. `separate_text` uses the [stringr] package to extract any 145 | regular expression or vector of words. 146 | 147 | 148 | ```{r separate_text, echo=-1} 149 | options(width=110) 150 | separate_text(txt, "[ATCGN]{5,}") 151 | ``` 152 | 153 | A few wrappers search pre-defined patterns and add an extra step to expand 154 | matched ranges. `separate_refs` matches references within brackets using 155 | `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`. 156 | 157 | ```{r separate_refs, echo=-1} 158 | options(width=110) 159 | separate_refs(txt) 160 | ``` 161 | 162 | `separate_genes` will find microbial genes like tauD (with a 163 | capitalized 4th letter) and expand operons like `tauABCD` into 164 | four genes. `separate_tags` will find and expand locus tag ranges below. 165 | 166 | 167 | ```{r locus_tags, echo=-1} 168 | options(width=110) 169 | collapse_rows(tbls, na="-") %>% 170 | separate_tags("YPO") %>% 171 | filter(id == "YPO1855") 172 | ``` 173 | 174 | 175 | See the [vignette] for more details including code to parse 176 | XML documents using the [xml2] package. The [PMC FTP vignette] 177 | has details on parsing XML files at the Europe PMC [FTP site]. 178 | 179 | 180 | ### Community Guidelines 181 | 182 | This project is released with a [Contributor Code of Conduct](CONDUCT.md). 
By 183 | participating in this project you agree to abide by its terms. Feedback, bug 184 | reports, and feature requests are welcome 185 | [here](https://github.com/ropensci/tidypmc/issues). 186 | 187 | 188 | [remotes]: https://github.com/r-lib/remotes 189 | [PMC2231364]: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC2231364/fullTextXML 190 | [Open Access subset]: https://europepmc.org/downloads/openaccess 191 | [REST service]: https://europepmc.org/RestfulWebService 192 | [FTP site]: https://europepmc.org/ftp/oa/ 193 | [tidytext]: https://www.tidytextmining.com/ 194 | [stringr]: https://stringr.tidyverse.org/ 195 | [vignette]: https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md 196 | [PMC FTP vignette]: https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md 197 | [tokenizers]: https://lincolnmullen.com/software/tokenizers/ 198 | [xml2]: https://github.com/r-lib/xml2 199 | [europepmc]: https://github.com/ropensci/europepmc 200 | [Pubmed Central]: https://europepmc.org 201 | -------------------------------------------------------------------------------- /README.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 588 | 598 | 599 | 600 | 601 | 602 | 603 |

Build Status Coverage status CRAN_Status_Badge Downloads Total Downloads

604 |

tidypmc

605 |

The Open Access subset of PubMed Central (PMC) includes 2.5 million articles from biomedical and life sciences journals. The full text XML files are freely available for text mining from the REST service or FTP site but can be challenging to parse. For example, section tags are nested to arbitrary depths, formulas and tables may return incomprehensible text blobs, and superscripted references are pasted at the end of words. The functions in the tidypmc package are intended to return readable text and maintain the document structure, so gene names and other terms can be associated with specific sections, paragraphs, sentences or table rows.

606 |

Installation

607 |

Use remotes to install the package.

608 | 609 |

Load XML

610 |

Download a single XML document like PMC2231364 from the REST service using the pmc_xml function.

611 | 620 |

The europepmc package includes additional functions to search PMC and download full text. Be sure to include the OPEN_ACCESS field in the search, since open access articles are the only ones with full text XML available.

621 | 635 |

Save all 19 results to a list of XML documents using the epmc_ftxt or pmc_xml function.

636 | 637 |

See the PMC FTP vignette for details on parsing the large XML files on the FTP site with 10,000 articles each.

638 |

Parse XML

639 |

The package includes five functions to parse the xml_document.

640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 |
| R function | Description |
|:--|:--|
| pmc_text | Split section paragraphs into sentences with full path to subsection titles |
| pmc_caption | Split figure, table and supplementary material captions into sentences |
| pmc_table | Convert table nodes into a list of tibbles |
| pmc_reference | Format references cited into a tibble |
| pmc_metadata | List journal and article metadata in front node |
670 |

The pmc_text function uses the tokenizers package to split section paragraphs into sentences. The function also removes any tables, figures or formulas that are nested within paragraph tags, replaces superscripted references with brackets, adds carets and underscores to other superscripts and subscripts and includes the full path to the subsection title.

671 |
txt <- pmc_text(doc)
672 | #  Note: removing disp-formula nested in sec/p tag
673 | txt
674 | #  # A tibble: 194 x 4
675 | #     section    paragraph sentence text                                                                         
676 | #     <chr>          <int>    <int> <chr>                                                                        
677 | #   1 Title              1        1 Comparative transcriptomics in Yersinia pestis: a global view of environment…
678 | #   2 Abstract           1        1 Environmental modulation of gene expression in Yersinia pestis is critical f…
679 | #   3 Abstract           1        2 Using cDNA microarray technology, we have analyzed the global gene expressio…
680 | #   4 Abstract           2        1 To provide us with a comprehensive view of environmental modulation of globa…
681 | #   5 Abstract           2        2 Almost all known virulence genes of Y. pestis were differentially regulated …
682 | #   6 Abstract           2        3 Clustering enabled us to functionally classify co-expressed genes, including…
683 | #   7 Abstract           2        4 Collections of operons were predicted from the microarray data, and some of …
684 | #   8 Abstract           2        5 Several regulatory DNA motifs, probably recognized by the regulatory protein…
685 | #   9 Abstract           3        1 The comparative transcriptomics analysis we present here not only benefits o…
686 | #  10 Background         1        1 Yersinia pestis is the etiological agent of plague, alternatively growing in…
687 | #  # … with 184 more rows
688 | count(txt, section, sort=TRUE)
689 | #  # A tibble: 21 x 2
690 | #     section                                                                                                   n
691 | #     <chr>                                                                                                 <int>
692 | #   1 Results and Discussion; Clustering analysis and functional classification of co-expressed gene clust…    22
693 | #   2 Background                                                                                               20
694 | #   3 Results and Discussion; Virulence genes in response to multiple environmental stresses                   20
695 | #   4 Methods; Collection of microarray expression data                                                        17
696 | #   5 Results and Discussion; Computational discovery of regulatory DNA motifs                                 16
697 | #   6 Methods; Gel mobility shift analysis of Fur binding                                                      13
698 | #   7 Results and Discussion; Verification of predicted operons by RT-PCR                                      10
699 | #   8 Abstract                                                                                                  8
700 | #   9 Methods; Discovery of regulatory DNA motifs                                                               8
701 | #  10 Methods; Clustering analysis                                                                              7
702 | #  # … with 11 more rows
703 |

Load the tidytext package for further text processing.

704 | 740 |

The pmc_table function formats tables by collapsing multiline headers, expanding rowspan and colspan attributes and adding subheadings into a new column.

741 | 762 |

Use collapse_rows to join column names and cell values in a semi-colon delimited string (and then search using functions in the next section).

763 | 778 |

The other three pmc functions are described in the package vignette.

779 |

Searching text

780 |

There are a few functions to search within the pmc_text or collapsed pmc_table output. separate_text uses the stringr package to extract any regular expression or vector of words.

781 | 794 |

A few wrappers search pre-defined patterns and add an extra step to expand matched ranges. separate_refs matches references within brackets or parentheses using (\\(|\\[)[0-9, -]+(\\]|\\)) and expands ranges like [7-9].

795 | 810 |

separate_genes will find microbial genes like tauD (with a capitalized 4th letter) and expand operons like tauABCD into four genes. separate_tags will find locus tags and expand ranges like YPO1970-74 into individual tags, as shown below.

811 | 820 |

See the vignette for more details including code to parse XML documents using the xml2 package. The PMC FTP vignette has details on parsing XML files at the Europe PMC FTP site.

821 |

Community Guidelines

822 |

This project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms. Feedback, bug reports, and feature requests are welcome here.

823 | 824 | 825 | 826 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Build 3 | Status](https://travis-ci.org/ropensci/tidypmc.svg?branch=master)](https://travis-ci.org/ropensci/tidypmc) 4 | [![Coverage 5 | status](https://codecov.io/gh/ropensci/tidypmc/branch/master/graph/badge.svg)](https://codecov.io/github/ropensci/tidypmc?branch=master) 6 | [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/tidypmc)](https://cran.r-project.org/package=tidypmc) 7 | [![Downloads](https://cranlogs.r-pkg.org/badges/tidypmc)](https://CRAN.R-project.org/package=tidypmc) 8 | [![Total 9 | Downloads](https://cranlogs.r-pkg.org/badges/grand-total/tidypmc?color=orange)](https://CRAN.R-project.org/package=tidypmc) 10 | 11 | # tidypmc 12 | 13 | The [Open Access subset](https://europepmc.org/downloads/openaccess) of 14 | [Pubmed Central](https://europepmc.org) (PMC) includes 2.5 million 15 | articles from biomedical and life sciences journals. The full text XML 16 | files are freely available for text mining from the [REST 17 | service](https://europepmc.org/RestfulWebService) or [FTP 18 | site](https://europepmc.org/ftp/oa/) but can be challenging to parse. 19 | For example, section tags are nested to arbitrary depths, formulas and 20 | tables may return incomprehensible text blobs and superscripted 21 | references are pasted at the end of words. The functions in the 22 | `tidypmc` package are intended to return readable text and maintain the 23 | document structure, so gene names and other terms can be associated with 24 | specific sections, paragraphs, sentences or table rows. 25 | 26 | ## Installation 27 | 28 | Use [remotes](https://github.com/r-lib/remotes) to install the package. 
29 | 30 | ``` r 31 | remotes::install_github("ropensci/tidypmc") 32 | ``` 33 | 34 | ## Load XML 35 | 36 | Download a single XML document like 37 | [PMC2231364](https://www.ebi.ac.uk/europepmc/webservices/rest/PMC2231364/fullTextXML) 38 | from the [REST service](https://europepmc.org/RestfulWebService) using 39 | the `pmc_xml` function. 40 | 41 | ``` r 42 | library(tidypmc) 43 | library(tidyverse) 44 | doc <- pmc_xml("PMC2231364") 45 | doc 46 | # {xml_document} 47 | #
48 | # [1] \n \n BMC Microbiol\n \n Background\n

Yersinia pestis is th ... 50 | # [3] \n \n \n Acknowledgements\n

We thank Dr. Chen ... 51 | ``` 52 | 53 | The [europepmc](https://github.com/ropensci/europepmc) package includes 54 | additional functions to search PMC and download full text. Be sure to 55 | include the `OPEN_ACCESS` field in the search since these are the only 56 | articles with full text XML available. 57 | 58 | ``` r 59 | library(europepmc) 60 | yp <- epmc_search("title:(Yersinia pestis virulence) OPEN_ACCESS:Y") 61 | # 19 records found, returning 19 62 | select(yp, pmcid, pubYear, title) %>% 63 | print(n=5) 64 | # # A tibble: 19 x 3 65 | # pmcid pubYear title 66 | # 67 | # 1 PMC5505154 2017 Crystal structure of Yersinia pestis virulence factor YfeA reveals two polyspe… 68 | # 2 PMC3521224 2012 Omics strategies for revealing Yersinia pestis virulence. 69 | # 3 PMC2704395 2009 Involvement of the post-transcriptional regulator Hfq in Yersinia pestis virul… 70 | # 4 PMC2736372 2009 The NlpD lipoprotein is a novel Yersinia pestis virulence factor essential for… 71 | # 5 PMC3109262 2011 A comprehensive study on the role of the Yersinia pestis virulence markers in … 72 | # # … with 14 more rows 73 | ``` 74 | 75 | Save all 19 results to a list of XML documents using the `epmc_ftxt` or 76 | `pmc_xml` function. 77 | 78 | ``` r 79 | docs <- map(yp$pmcid, epmc_ftxt) 80 | ``` 81 | 82 | See the [PMC FTP 83 | vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md) 84 | for details on parsing the large XML files on the [FTP 85 | site](https://europepmc.org/ftp/oa/) with 10,000 articles each. 86 | 87 | ## Parse XML 88 | 89 | The package includes five functions to parse the 90 | `xml_document`. 
91 | 92 | | R function | Description | 93 | | :-------------- | :-------------------------------------------------------------------------- | 94 | | `pmc_text` | Split section paragraphs into sentences with full path to subsection titles | 95 | | `pmc_caption` | Split figure, table and supplementary material captions into sentences | 96 | | `pmc_table` | Convert table nodes into a list of tibbles | 97 | | `pmc_reference` | Format references cited into a tibble | 98 | | `pmc_metadata` | List journal and article metadata in front node | 99 | 100 | The `pmc_text` function uses the 101 | [tokenizers](https://lincolnmullen.com/software/tokenizers/) package to 102 | split section paragraphs into sentences. The function also removes any 103 | tables, figures or formulas that are nested within paragraph tags, 104 | replaces superscripted references with brackets, adds carets and 105 | underscores to other superscripts and subscripts and includes the full 106 | path to the subsection title. 107 | 108 | ``` r 109 | txt <- pmc_text(doc) 110 | # Note: removing disp-formula nested in sec/p tag 111 | txt 112 | # # A tibble: 194 x 4 113 | # section paragraph sentence text 114 | # 115 | # 1 Title 1 1 Comparative transcriptomics in Yersinia pestis: a global view of environment… 116 | # 2 Abstract 1 1 Environmental modulation of gene expression in Yersinia pestis is critical f… 117 | # 3 Abstract 1 2 Using cDNA microarray technology, we have analyzed the global gene expressio… 118 | # 4 Abstract 2 1 To provide us with a comprehensive view of environmental modulation of globa… 119 | # 5 Abstract 2 2 Almost all known virulence genes of Y. 
pestis were differentially regulated … 120 | # 6 Abstract 2 3 Clustering enabled us to functionally classify co-expressed genes, including… 121 | # 7 Abstract 2 4 Collections of operons were predicted from the microarray data, and some of … 122 | # 8 Abstract 2 5 Several regulatory DNA motifs, probably recognized by the regulatory protein… 123 | # 9 Abstract 3 1 The comparative transcriptomics analysis we present here not only benefits o… 124 | # 10 Background 1 1 Yersinia pestis is the etiological agent of plague, alternatively growing in… 125 | # # … with 184 more rows 126 | count(txt, section, sort=TRUE) 127 | # # A tibble: 21 x 2 128 | # section n 129 | # 130 | # 1 Results and Discussion; Clustering analysis and functional classification of co-expressed gene clust… 22 131 | # 2 Background 20 132 | # 3 Results and Discussion; Virulence genes in response to multiple environmental stresses 20 133 | # 4 Methods; Collection of microarray expression data 17 134 | # 5 Results and Discussion; Computational discovery of regulatory DNA motifs 16 135 | # 6 Methods; Gel mobility shift analysis of Fur binding 13 136 | # 7 Results and Discussion; Verification of predicted operons by RT-PCR 10 137 | # 8 Abstract 8 138 | # 9 Methods; Discovery of regulatory DNA motifs 8 139 | # 10 Methods; Clustering analysis 7 140 | # # … with 11 more rows 141 | ``` 142 | 143 | Load the [tidytext](https://www.tidytextmining.com/) package for further 144 | text processing. 
145 | 146 | ``` r 147 | library(tidytext) 148 | x1 <- unnest_tokens(txt, word, text) %>% 149 | anti_join(stop_words) %>% 150 | filter(!word %in% 1:100) 151 | # Joining, by = "word" 152 | filter(x1, str_detect(section, "^Results")) 153 | # # A tibble: 1,269 x 4 154 | # section paragraph sentence word 155 | # 156 | # 1 Results and Discussion 1 1 comprehensive 157 | # 2 Results and Discussion 1 1 analysis 158 | # 3 Results and Discussion 1 1 sets 159 | # 4 Results and Discussion 1 1 microarray 160 | # 5 Results and Discussion 1 1 expression 161 | # 6 Results and Discussion 1 1 data 162 | # 7 Results and Discussion 1 1 dissect 163 | # 8 Results and Discussion 1 1 bacterial 164 | # 9 Results and Discussion 1 1 adaptation 165 | # 10 Results and Discussion 1 1 environments 166 | # # … with 1,259 more rows 167 | filter(x1, str_detect(section, "^Results")) %>% 168 | count(word, sort = TRUE) 169 | # # A tibble: 595 x 2 170 | # word n 171 | # 172 | # 1 genes 45 173 | # 2 cluster 24 174 | # 3 expression 21 175 | # 4 pestis 21 176 | # 5 data 19 177 | # 6 dna 15 178 | # 7 gene 15 179 | # 8 figure 13 180 | # 9 fur 12 181 | # 10 operons 12 182 | # # … with 585 more rows 183 | ``` 184 | 185 | The `pmc_table` function formats tables by collapsing multiline headers, 186 | expanding rowspan and colspan attributes and adding subheadings into a 187 | new column. 
188 | 189 | ``` r 190 | tbls <- pmc_table(doc) 191 | # Parsing 4 tables 192 | # Adding footnotes to Table 1 193 | map_int(tbls, nrow) 194 | # Table 1 Table 2 Table 3 Table 4 195 | # 39 23 4 34 196 | tbls[[1]] 197 | # # A tibble: 39 x 5 198 | # subheading `Potential operon (r va… `Gene ID` `Putative or predicted functi… `Reference (s)` 199 | # 200 | # 1 Iron uptake or heme sy… yfeABCD operon* (r > 0.… YPO2439-24… Transport/binding chelated ir… yfeABCD [54] 201 | # 2 Iron uptake or heme sy… hmuRSTUV operon (r > 0.… YPO0279-02… Transport/binding hemin hmuRSTUV [55] 202 | # 3 Iron uptake or heme sy… ysuJIHG* (r > 0.95) YPO1529-15… Iron uptake - 203 | # 4 Iron uptake or heme sy… sufABCDS* (r > 0.90) YPO2400-24… Iron-regulated Fe-S cluster a… - 204 | # 5 Iron uptake or heme sy… YPO1854-1856* (r > 0.97) YPO1854-18… Iron uptake or heme synthesis? - 205 | # 6 Sulfur metabolism tauABCD operon (r > 0.9… YPO0182-01… Transport/binding taurine tauABCD [56] 206 | # 7 Sulfur metabolism ssuEADCB operon (r > 0.… YPO3623-36… Sulphur metabolism ssu operon [57] 207 | # 8 Sulfur metabolism cys operon (r > 0.92) YPO3010-30… Cysteine synthesis - 208 | # 9 Sulfur metabolism YPO1317-1319 (r > 0.97) YPO1317-13… Sulfur metabolism? - 209 | # 10 Sulfur metabolism YPO4109-4111 (r > 0.90) YPO4109-41… Sulfur metabolism? - 210 | # # … with 29 more rows 211 | ``` 212 | 213 | Use `collapse_rows` to join column names and cell values in a semi-colon 214 | delimited string (and then search using functions in the next section). 
215 | 216 | ``` r 217 | collapse_rows(tbls, na.string="-") 218 | # # A tibble: 100 x 3 219 | # table row text 220 | # 221 | # 1 Table 1 1 subheading=Iron uptake or heme synthesis; Potential operon (r value)=yfeABCD operon* (r > 0.… 222 | # 2 Table 1 2 subheading=Iron uptake or heme synthesis; Potential operon (r value)=hmuRSTUV operon (r > 0.… 223 | # 3 Table 1 3 subheading=Iron uptake or heme synthesis; Potential operon (r value)=ysuJIHG* (r > 0.95); Ge… 224 | # 4 Table 1 4 subheading=Iron uptake or heme synthesis; Potential operon (r value)=sufABCDS* (r > 0.90); G… 225 | # 5 Table 1 5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1854-1856* (r > 0.97… 226 | # 6 Table 1 6 subheading=Sulfur metabolism; Potential operon (r value)=tauABCD operon (r > 0.90); Gene ID=… 227 | # 7 Table 1 7 subheading=Sulfur metabolism; Potential operon (r value)=ssuEADCB operon (r > 0.97); Gene ID… 228 | # 8 Table 1 8 subheading=Sulfur metabolism; Potential operon (r value)=cys operon (r > 0.92); Gene ID=YPO3… 229 | # 9 Table 1 9 subheading=Sulfur metabolism; Potential operon (r value)=YPO1317-1319 (r > 0.97); Gene ID=YP… 230 | # 10 Table 1 10 subheading=Sulfur metabolism; Potential operon (r value)=YPO4109-4111 (r > 0.90); Gene ID=YP… 231 | # # … with 90 more rows 232 | ``` 233 | 234 | The other three `pmc` functions are described in the package 235 | [vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md). 236 | 237 | ## Searching text 238 | 239 | There are a few functions to search within the `pmc_text` or collapsed 240 | `pmc_table` output. `separate_text` uses the 241 | [stringr](https://stringr.tidyverse.org/) package to extract any regular 242 | expression or vector of words. 
243 | 244 | ``` r 245 | separate_text(txt, "[ATCGN]{5,}") 246 | # # A tibble: 9 x 5 247 | # match section paragraph sentence text 248 | # 249 | # 1 ACGCAATCGTT… Results and Discussion; Comput… 2 3 A 16 basepair (bp) box (5'-ACGCAATCGTTTTCNT… 250 | # 2 AAACGTTTNCGT Results and Discussion; Comput… 2 4 It is very similar to the E. coli PurR box … 251 | # 3 TGATAATGATT… Results and Discussion; Comput… 2 5 A 21 bp box (5'-TGATAATGATTATCATTATCA-3') w… 252 | # 4 GATAATGATAA… Results and Discussion; Comput… 2 6 It is a 10-1-10 inverted repeat that resemb… 253 | # 5 TGANNNNNNTC… Results and Discussion; Comput… 2 7 A 15 bp box (5'-TGANNNNNNTCAA-3') was found… 254 | # 6 TTGATN Results and Discussion; Comput… 2 8 It is a part of the E. coli Fnr box (5'-AAW… 255 | # 7 NATCAA Results and Discussion; Comput… 2 8 It is a part of the E. coli Fnr box (5'-AAW… 256 | # 8 GTTAATTAA Results and Discussion; Comput… 3 4 The ArcA regulator can recognize a relative… 257 | # 9 GTTAATTAATGT Results and Discussion; Comput… 3 5 An ArcA-box-like sequence (5'-GTTAATTAATGT-… 258 | ``` 259 | 260 | A few wrappers search pre-defined patterns and add an extra step to 261 | expand matched ranges. `separate_refs` matches references within 262 | brackets using `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`. 263 | 264 | ``` r 265 | separate_refs(txt) 266 | # # A tibble: 93 x 6 267 | # id match section paragraph sentence text 268 | # 269 | # 1 1 [1] Backgrou… 1 1 Yersinia pestis is the etiological agent of plague, alternatively… 270 | # 2 2 [2] Backgrou… 1 3 To produce a transmissible infection, Y. 
pestis colonizes the fle… 271 | # 3 3 [3] Backgrou… 1 9 However, a few bacilli are taken up by tissue macrophages, provid… 272 | # 4 4 [4,5] Backgrou… 1 10 Residence in this niche also facilitates the bacteria's resistanc… 273 | # 5 5 [4,5] Backgrou… 1 10 Residence in this niche also facilitates the bacteria's resistanc… 274 | # 6 6 [6] Backgrou… 2 1 A DNA microarray is able to determine simultaneous changes in all… 275 | # 7 7 [7-9] Backgrou… 2 2 We and others have measured the gene expression profiles of Y. pe… 276 | # 8 8 [7-9] Backgrou… 2 2 We and others have measured the gene expression profiles of Y. pe… 277 | # 9 9 [7-9] Backgrou… 2 2 We and others have measured the gene expression profiles of Y. pe… 278 | # 10 10 [10] Backgrou… 2 2 We and others have measured the gene expression profiles of Y. pe… 279 | # # … with 83 more rows 280 | ``` 281 | 282 | `separate_genes` will find microbial genes like tauD (with a capitalized 283 | 4th letter) and expand operons like `tauABCD` into four genes. 284 | `separate_tags` will find and expand locus tag ranges below. 285 | 286 | ``` r 287 | collapse_rows(tbls, na="-") %>% 288 | separate_tags("YPO") %>% 289 | filter(id == "YPO1855") 290 | # # A tibble: 3 x 5 291 | # id match table row text 292 | # 293 | # 1 YPO1855 YPO1854-1856 Table… 5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1… 294 | # 2 YPO1855 YPO1854-1856 Table… 21 subheading=Category C: Hypothetical; Gene ID=YPO1854-1856; Description=Pu… 295 | # 3 YPO1855 YPO1854-YPO… Table… 2 Cluster=Cluster II; Genes or operons for motif discovery=hmuRSTUV, YPO068… 296 | ``` 297 | 298 | See the 299 | [vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/tidypmc.md) 300 | for more details including code to parse XML documents using the 301 | [xml2](https://github.com/r-lib/xml2) package. 
The [PMC FTP 302 | vignette](https://github.com/ropensci/tidypmc/blob/master/vignettes/pmcftp.md) 303 | has details on parsing XML files at the Europe PMC [FTP 304 | site](https://europepmc.org/ftp/oa/). 305 | 306 | ### Community Guidelines 307 | 308 | This project is released with a [Contributor Code of 309 | Conduct](CONDUCT.md). By participating in this project you agree to 310 | abide by its terms. Feedback, bug reports, and feature requests are 311 | welcome [here](https://github.com/ropensci/tidypmc/issues). 312 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": ["https://doi.org/10.5063/schema/codemeta-2.0", "http://schema.org"], 3 | "@type": "SoftwareSourceCode", 4 | "identifier": "tidypmc", 5 | "description": "This package parses section paragraphs, captions, tables, references and metadata from XML documents in the Open Access subset of Pubmed Central. 
Additional functions are available to search text and expand ranges of the references cited, locus tags and operons.", 6 | "name": "tidypmc: Parse full text XML documents from PMC", 7 | "codeRepository": "https://github.com/ropensci/tidypmc", 8 | "license": "https://spdx.org/licenses/GPL-3.0", 9 | "version": "1.1", 10 | "programmingLanguage": { 11 | "@type": "ComputerLanguage", 12 | "name": "R", 13 | "version": "3.5.2", 14 | "url": "https://r-project.org" 15 | }, 16 | "runtimePlatform": "R version 3.5.2 (2018-12-20)", 17 | "author": {}, 18 | "contributor": {}, 19 | "copyrightHolder": {}, 20 | "funder": {}, 21 | "maintainer": [ 22 | { 23 | "@type": "Person", 24 | "givenName": "Chris", 25 | "familyName": "Stubben", 26 | "email": "chris.stubben@hci.utah.edu" 27 | } 28 | ], 29 | "softwareSuggestions": [ 30 | { 31 | "@type": "SoftwareApplication", 32 | "identifier": "europepmc", 33 | "name": "europepmc", 34 | "provider": { 35 | "@id": "https://cran.r-project.org", 36 | "@type": "Organization", 37 | "name": "Comprehensive R Archive Network (CRAN)", 38 | "url": "https://cran.r-project.org" 39 | }, 40 | "sameAs": "https://CRAN.R-project.org/package=europepmc" 41 | }, 42 | { 43 | "@type": "SoftwareApplication", 44 | "identifier": "tidytext", 45 | "name": "tidytext", 46 | "provider": { 47 | "@id": "https://cran.r-project.org", 48 | "@type": "Organization", 49 | "name": "Comprehensive R Archive Network (CRAN)", 50 | "url": "https://cran.r-project.org" 51 | }, 52 | "sameAs": "https://CRAN.R-project.org/package=tidytext" 53 | } 54 | ], 55 | "softwareRequirements": [ 56 | { 57 | "@type": "SoftwareApplication", 58 | "identifier": "xml2", 59 | "name": "xml2", 60 | "provider": { 61 | "@id": "https://cran.r-project.org", 62 | "@type": "Organization", 63 | "name": "Comprehensive R Archive Network (CRAN)", 64 | "url": "https://cran.r-project.org" 65 | }, 66 | "sameAs": "https://CRAN.R-project.org/package=xml2" 67 | }, 68 | { 69 | "@type": "SoftwareApplication", 70 | "identifier": 
"tokenizers", 71 | "name": "tokenizers", 72 | "provider": { 73 | "@id": "https://cran.r-project.org", 74 | "@type": "Organization", 75 | "name": "Comprehensive R Archive Network (CRAN)", 76 | "url": "https://cran.r-project.org" 77 | }, 78 | "sameAs": "https://CRAN.R-project.org/package=tokenizers" 79 | }, 80 | { 81 | "@type": "SoftwareApplication", 82 | "identifier": "stringr", 83 | "name": "stringr", 84 | "provider": { 85 | "@id": "https://cran.r-project.org", 86 | "@type": "Organization", 87 | "name": "Comprehensive R Archive Network (CRAN)", 88 | "url": "https://cran.r-project.org" 89 | }, 90 | "sameAs": "https://CRAN.R-project.org/package=stringr" 91 | }, 92 | { 93 | "@type": "SoftwareApplication", 94 | "identifier": "tibble", 95 | "name": "tibble", 96 | "provider": { 97 | "@id": "https://cran.r-project.org", 98 | "@type": "Organization", 99 | "name": "Comprehensive R Archive Network (CRAN)", 100 | "url": "https://cran.r-project.org" 101 | }, 102 | "sameAs": "https://CRAN.R-project.org/package=tibble" 103 | }, 104 | { 105 | "@type": "SoftwareApplication", 106 | "identifier": "dplyr", 107 | "name": "dplyr", 108 | "provider": { 109 | "@id": "https://cran.r-project.org", 110 | "@type": "Organization", 111 | "name": "Comprehensive R Archive Network (CRAN)", 112 | "url": "https://cran.r-project.org" 113 | }, 114 | "sameAs": "https://CRAN.R-project.org/package=dplyr" 115 | }, 116 | { 117 | "@type": "SoftwareApplication", 118 | "identifier": "readr", 119 | "name": "readr", 120 | "provider": { 121 | "@id": "https://cran.r-project.org", 122 | "@type": "Organization", 123 | "name": "Comprehensive R Archive Network (CRAN)", 124 | "url": "https://cran.r-project.org" 125 | }, 126 | "sameAs": "https://CRAN.R-project.org/package=readr" 127 | } 128 | ], 129 | "fileSize": "84.221KB", 130 | "citation": [ 131 | { 132 | "@type": "SoftwareSourceCode", 133 | "author": [ 134 | { 135 | "@type": "Person", 136 | "givenName": "Chris", 137 | "familyName": "Stubben" 138 | } 139 | ], 140 | 
"name": "tidypmc: Parse full text XML documents from PMC", 141 | "url": "https://github.com/ropensci/tidypmc", 142 | "description": "R package version 1.6" 143 | } 144 | ] 145 | } 146 | -------------------------------------------------------------------------------- /man/collapse_rows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/collapse_rows.R 3 | \name{collapse_rows} 4 | \alias{collapse_rows} 5 | \title{Collapse a list of PubMed Central tables} 6 | \usage{ 7 | collapse_rows(pmc, na.string) 8 | } 9 | \arguments{ 10 | \item{pmc}{a list of tables, usually from \code{\link{pmc_table}}} 11 | 12 | \item{na.string}{additional cell values to skip, default is NA and ""} 13 | } 14 | \value{ 15 | A tibble with table and row number and collapsed text 16 | } 17 | \description{ 18 | Collapse rows into a semi-colon delimited list with column names and cell 19 | values 20 | } 21 | \examples{ 22 | x <- data.frame( 23 | genes = c("aroB", "glnP", "ndhA", "pyrF"), 24 | fold_change = c(2.5, 1.7, -3.1, -2.6) 25 | ) 26 | collapse_rows(list(`Table 1` = x)) 27 | } 28 | \author{ 29 | Chris Stubben 30 | } 31 | -------------------------------------------------------------------------------- /man/pmc_caption.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_caption.R 3 | \name{pmc_caption} 4 | \alias{pmc_caption} 5 | \title{Split captions into sentences} 6 | \usage{ 7 | pmc_caption(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{\code{xml_document} from PubMed Central} 11 | } 12 | \value{ 13 | a tibble with tag, label, sentence number and text 14 | } 15 | \description{ 16 | Split figure, table and supplementary material captions into sentences 17 | } 18 | \examples{ 19 | # doc <- pmc_xml("PMC2231364") # OR 20 | doc <- 
xml2::read_xml(system.file("extdata/PMC2231364.xml", 21 | package = "tidypmc" 22 | )) 23 | x <- pmc_caption(doc) 24 | x 25 | dplyr::filter(x, sentence == 1) 26 | } 27 | \author{ 28 | Chris Stubben 29 | } 30 | -------------------------------------------------------------------------------- /man/pmc_metadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_metadata.R 3 | \name{pmc_metadata} 4 | \alias{pmc_metadata} 5 | \title{Get article metadata} 6 | \usage{ 7 | pmc_metadata(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{\code{xml_document} from PubMed Central} 11 | } 12 | \value{ 13 | a list 14 | } 15 | \description{ 16 | Get a list of journal and article metadata in /front tag 17 | } 18 | \examples{ 19 | # doc <- pmc_xml("PMC2231364") # OR 20 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 21 | package = "tidypmc" 22 | )) 23 | pmc_metadata(doc) 24 | } 25 | \author{ 26 | Chris Stubben 27 | } 28 | -------------------------------------------------------------------------------- /man/pmc_reference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_reference.R 3 | \name{pmc_reference} 4 | \alias{pmc_reference} 5 | \title{Format references cited} 6 | \usage{ 7 | pmc_reference(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{\code{xml_document} from PubMed Central} 11 | } 12 | \value{ 13 | a tibble with id, pmid, authors, year, title, journal, volume, pages, 14 | and doi. 15 | } 16 | \description{ 17 | Format references cited 18 | } 19 | \note{ 20 | Mixed citations without any child tags are added to the author column. 
21 | } 22 | \examples{ 23 | # doc <- pmc_xml("PMC2231364") 24 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 25 | package = "tidypmc" 26 | )) 27 | x <- pmc_reference(doc) 28 | x 29 | } 30 | \author{ 31 | Chris Stubben 32 | } 33 | -------------------------------------------------------------------------------- /man/pmc_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_table.R 3 | \name{pmc_table} 4 | \alias{pmc_table} 5 | \title{Convert table nodes to tibbles} 6 | \usage{ 7 | pmc_table(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{\code{xml_document} from PubMed Central} 11 | } 12 | \value{ 13 | a list of tibbles 14 | } 15 | \description{ 16 | Convert PubMed Central table nodes into a list of tibbles 17 | } 18 | \note{ 19 | Saves the caption and footnotes as attributes and collapses multiline 20 | headers, expands all rowspan and colspan attributes and adds 21 | subheadings to column one. 
22 | } 23 | \examples{ 24 | # doc <- pmc_xml("PMC2231364") 25 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 26 | package = "tidypmc" 27 | )) 28 | x <- pmc_table(doc) 29 | sapply(x, dim) 30 | x 31 | attributes(x[[1]]) 32 | } 33 | \author{ 34 | Chris Stubben 35 | } 36 | -------------------------------------------------------------------------------- /man/pmc_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_text.R 3 | \name{pmc_text} 4 | \alias{pmc_text} 5 | \title{Split section paragraphs into sentences} 6 | \usage{ 7 | pmc_text(doc) 8 | } 9 | \arguments{ 10 | \item{doc}{\code{xml_document} from PubMed Central} 11 | } 12 | \value{ 13 | a tibble with section, paragraph and sentence number and text 14 | } 15 | \description{ 16 | Split section paragraph tags into a table with subsection titles and 17 | sentences using \code{tokenize_sentences} 18 | } 19 | \note{ 20 | Subsections may be nested to arbitrary depths and this function will 21 | return the entire path to the subsection title as a delimited string like 22 | "Results; Predicted functions; Pathogenicity". Tables, figures and 23 | formulas that are nested in section paragraphs are removed, superscripted 24 | references are replaced with brackets, and any other superscripts or 25 | subscripts are separated with ^ and _. 
26 | } 27 | \examples{ 28 | # doc <- pmc_xml("PMC2231364") 29 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 30 | package = "tidypmc" 31 | )) 32 | txt <- pmc_text(doc) 33 | txt 34 | dplyr::count(txt, section, sort = TRUE) 35 | } 36 | \author{ 37 | Chris Stubben 38 | } 39 | -------------------------------------------------------------------------------- /man/pmc_xml.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pmc_xml.R 3 | \name{pmc_xml} 4 | \alias{pmc_xml} 5 | \title{Download XML from PubMed Central} 6 | \source{ 7 | \url{https://europepmc.org/RestfulWebService} 8 | } 9 | \usage{ 10 | pmc_xml(id) 11 | } 12 | \arguments{ 13 | \item{id}{a PMC id starting with 'PMC'} 14 | } 15 | \value{ 16 | \code{xml_document} 17 | } 18 | \description{ 19 | Download XML from PubMed Central 20 | } 21 | \examples{ 22 | \dontrun{ 23 | doc <- pmc_xml("PMC2231364") 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/separate_genes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/separate_genes.R 3 | \name{separate_genes} 4 | \alias{separate_genes} 5 | \title{Separate genes and operons into multiple rows} 6 | \usage{ 7 | separate_genes(txt, pattern = "\\\\b[A-Za-z][a-z]{2}[A-Z0-9]+\\\\b", 8 | genes, operon = 6, column = "text") 9 | } 10 | \arguments{ 11 | \item{txt}{a table} 12 | 13 | \item{pattern}{regular expression to match genes, default is to match 14 | microbial genes like AbcD, default [A-Za-z][a-z]{2}[A-Z0-9]+} 15 | 16 | \item{genes}{an optional vector of genes, set pattern to NA to only match 17 | this list.} 18 | 19 | \item{operon}{operon length, default 6. 
Split genes with 6 or more letters 20 | into separate genes, for example AbcDEF is split into abcD, abcE and abcF.} 21 | 22 | \item{column}{column name to search, default "text"} 23 | } 24 | \value{ 25 | a tibble with gene name, matching text and rows. 26 | } 27 | \description{ 28 | Separate genes and operons mentioned in full text into multiple rows 29 | } 30 | \note{ 31 | Check for genes in italics using \code{xml_text(xml_find_all(doc, 32 | "//sec//p//italic"))} and update the pattern or add additional genes as an 33 | optional vector if needed 34 | } 35 | \examples{ 36 | x <- data.frame(row = 1, text = "Genes like YacK, hmu and sufABC") 37 | separate_genes(x) 38 | separate_genes(x, genes = "hmu") 39 | } 40 | \author{ 41 | Chris Stubben 42 | } 43 | -------------------------------------------------------------------------------- /man/separate_refs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/separate_refs.R 3 | \name{separate_refs} 4 | \alias{separate_refs} 5 | \title{Separate references cited into multiple rows} 6 | \usage{ 7 | separate_refs(txt, column = "text") 8 | } 9 | \arguments{ 10 | \item{txt}{a table} 11 | 12 | \item{column}{column name, default "text"} 13 | } 14 | \value{ 15 | a tibble 16 | } 17 | \description{ 18 | Separates references cited in brackets or parentheses into multiple rows and 19 | splits the comma-delimited numeric strings and expands ranges like 7-9 into 20 | new rows 21 | } 22 | \examples{ 23 | x <- data.frame(row = 1, text = "some important studies [7-9,15]") 24 | separate_refs(x) 25 | } 26 | \author{ 27 | Chris Stubben 28 | } 29 | -------------------------------------------------------------------------------- /man/separate_tags.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/separate_tags.R 3 | \name{separate_tags} 4 | \alias{separate_tags} 5 | \title{Separate locus tag into multiple rows} 6 | \usage{ 7 | separate_tags(txt, pattern, column = "text") 8 | } 9 | \arguments{ 10 | \item{txt}{a table} 11 | 12 | \item{pattern}{regular expression to match locus tags like YPO[0-9-]+ or 13 | the locus tag prefix like YPO.} 14 | 15 | \item{column}{column name to search, default "text"} 16 | } 17 | \value{ 18 | a tibble with locus tag, matching text and rows. 19 | } 20 | \description{ 21 | Separates locus tags mentioned in full text and expands ranges like 22 | YPO1970-74 into new rows 23 | } 24 | \examples{ 25 | x <- data.frame(row = 1, text = "some genes like YPO1002 and YPO1970-74") 26 | separate_tags(x, "YPO") 27 | } 28 | \author{ 29 | Chris Stubben 30 | } 31 | -------------------------------------------------------------------------------- /man/separate_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/separate_text.R 3 | \name{separate_text} 4 | \alias{separate_text} 5 | \title{Separate all matching text into multiple rows} 6 | \usage{ 7 | separate_text(txt, pattern, column = "text") 8 | } 9 | \arguments{ 10 | \item{txt}{a tibble, usually results from \code{pmc_text}} 11 | 12 | \item{pattern}{either a regular expression or a vector of words to find in 13 | text} 14 | 15 | \item{column}{column name, default "text"} 16 | } 17 | \value{ 18 | a tibble 19 | } 20 | \description{ 21 | Separate all matching text into multiple rows 22 | } 23 | \note{ 24 | passed to \code{grepl} and \code{str_extract_all} 25 | } 26 | \examples{ 27 | # doc <- pmc_xml("PMC2231364") 28 | doc <- xml2::read_xml(system.file("extdata/PMC2231364.xml", 29 | package = "tidypmc")) 30 | txt <- pmc_text(doc) 31 | separate_text(txt, "[ATCGN]{5,}") 32 | separate_text(txt, "\\\\([A-Z]{3,6}s?\\\\)") 33 | # pattern can be a vector of words 34 | 
separate_text(txt, c("hmu", "ybt", "yfe", "yfu")) 35 | # wrappers for separate_text with extra step to expand matched ranges 36 | separate_refs(txt) 37 | separate_genes(txt) 38 | separate_tags(txt, "YPO") 39 | 40 | } 41 | \author{ 42 | Chris Stubben 43 | } 44 | -------------------------------------------------------------------------------- /man/tidypmc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidypmc-package.R 3 | \docType{package} 4 | \name{tidypmc} 5 | \alias{tidypmc} 6 | \alias{tidypmc-package} 7 | \title{\code{tidypmc} package} 8 | \description{ 9 | Parse full text XML documents from PubMed Central 10 | } 11 | \details{ 12 | See the Github page for details at \url{https://github.com/ropensci/tidypmc} 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tidypmc) 3 | 4 | test_check("tidypmc") 5 | -------------------------------------------------------------------------------- /tests/testthat/tests-pmc_other.R: -------------------------------------------------------------------------------- 1 | context("Parse other") 2 | 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc")) 4 | doc2 <- xml2::read_xml("

This is some text

") 5 | 6 | test_that("pmc_caption works", { 7 | expect_is(pmc_caption(doc), "tbl_df") 8 | expect_error(pmc_caption("a vector") ) 9 | expect_equal(pmc_caption(doc2), NULL) 10 | }) 11 | 12 | test_that("pmc_reference works", { 13 | expect_is(pmc_reference(doc), "tbl_df") 14 | expect_error(pmc_reference("a vector") ) 15 | expect_equal(pmc_reference(doc2), NULL) 16 | }) 17 | 18 | test_that("pmc_metadata works", { 19 | expect_is(pmc_metadata(doc), "list") 20 | expect_error(pmc_metadata("a vector") ) 21 | expect_equal(pmc_metadata(doc2), NULL) 22 | }) 23 | 24 | test_that("pmc_xml works", { 25 | expect_error(pmc_xml("not ID")) 26 | }) 27 | -------------------------------------------------------------------------------- /tests/testthat/tests-pmc_table.R: -------------------------------------------------------------------------------- 1 | context("Parse tables") 2 | 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc")) 4 | doc2 <- xml2::read_xml("

This is some text

") 5 | t1 <- pmc_table(doc) 6 | 7 | 8 | test_that("pmc_table works", { 9 | expect_is(t1, "list") 10 | expect_error(pmc_table("a vector") ) 11 | expect_equal(pmc_table(doc2), NULL) 12 | }) 13 | 14 | test_that("collapse rows works", { 15 | expect_is(collapse_rows(t1), "tbl_df") 16 | expect_is(collapse_rows(t1[[1]]), "tbl_df") 17 | expect_error(collapse_rows("a vector") ) 18 | }) 19 | 20 | test_that("repeat subheading works", { 21 | expect_is(repeat_sub(t1[[1]]), "tbl_df") 22 | expect_error(repeat_sub("a vector") ) 23 | }) 24 | -------------------------------------------------------------------------------- /tests/testthat/tests-pmc_text.R: -------------------------------------------------------------------------------- 1 | context("Parse text") 2 | 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc")) 4 | txt <- pmc_text(doc) 5 | doc2 <- xml2::read_xml("

This is some text

") 6 | 7 | test_that("path string formats", { 8 | x <- c("carnivores", "bears", "polar", "grizzly", "cats", "tiger") 9 | n <- c(1,2,3,3,2,3) 10 | expect_is(path_string(x, n), "character") 11 | expect_error(path_string(n, x)) 12 | }) 13 | 14 | test_that("pmc_text works", { 15 | expect_is(txt, "tbl_df") 16 | expect_error(pmc_text("a vector") ) 17 | expect_equal(pmc_text(doc2), NULL) 18 | }) 19 | -------------------------------------------------------------------------------- /tests/testthat/tests-separate.R: -------------------------------------------------------------------------------- 1 | context("Search text") 2 | 3 | doc <-xml2::read_xml(system.file("extdata/PMC2231364.xml", package = "tidypmc")) 4 | txt <- pmc_text(doc) 5 | 6 | test_that("Separate text", { 7 | expect_is(separate_text(txt, "[ATCGN]{5,}"), "tbl_df") 8 | expect_equal(separate_text(txt, "missing string"), NULL) 9 | }) 10 | 11 | test_that("Separate refs", { 12 | expect_is(separate_refs(txt), "tbl_df") 13 | # no refs in Abstract 14 | a1 <- separate_refs(dplyr::filter(txt, section=="Abstract")) 15 | expect_equal(a1, NULL) 16 | }) 17 | 18 | test_that("Separate genes", { 19 | expect_is(separate_genes(txt), "tbl_df") 20 | a1 <- separate_genes(dplyr::filter(txt, section=="Conclusion")) 21 | expect_equal(a1, NULL) 22 | }) 23 | 24 | test_that("Separate locus tags", { 25 | expect_is(separate_tags(txt, "YPO"), "tbl_df") 26 | a1 <- separate_tags(dplyr::filter(txt, section=="Abstract"), "YPO") 27 | expect_equal(a1, NULL) 28 | }) 29 | -------------------------------------------------------------------------------- /vignettes/pmcftp.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Parsing Europe PMC FTP files" 3 | author: "Chris Stubben" 4 | date: '`r gsub(" ", " ", format(Sys.time(), "%B %e, %Y"))`' 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteIndexEntry{Parse PMC FTP files} 9 | 
%\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "# " 16 | ) 17 | ``` 18 | 19 | 20 | The [Europe PMC FTP] includes 2.5 million open access articles separated into 21 | files with 10K articles each. Download and unzip a recent series of PMC ids 22 | and load into R using the `readr` package. A sample file with the first 10 23 | articles is included in the `tidypmc` package. 24 | 25 | ```{r load} 26 | library(readr) 27 | pmcfile <- system.file("extdata/PMC6358576_PMC6358589.xml", package = "tidypmc") 28 | pmc <- read_lines(pmcfile) 29 | ``` 30 | 31 | 32 | Find the start of the article nodes. 33 | 34 | ```{r startnode} 35 | a1 <- grep("^
39 | # [1] \n \n ACG Case Rep J\n \n Introduction\n

Bezoars a ... 41 | # [3] \n \n References\n \n

16 | # [1] \n \n ... 17 | # [2] \n \n Background\n

Yersi ... 18 | # [3] \n \n \n Acknowledgements\n ... 19 | ``` 20 | 21 | The package includes five functions to parse the 22 | `xml_document`. 23 | 24 | | R function | Description | 25 | | :-------------- | :-------------------------------------------------------------------------- | 26 | | `pmc_text` | Split section paragraphs into sentences with full path to subsection titles | 27 | | `pmc_caption` | Split figure, table and supplementary material captions into sentences | 28 | | `pmc_table` | Convert table nodes into a list of tibbles | 29 | | `pmc_reference` | Format references cited into a tibble | 30 | | `pmc_metadata` | List journal and article metadata in front node | 31 | 32 | `pmc_text` splits paragraphs into sentences and removes any tables, 33 | figures or formulas that are nested within paragraph tags, replaces 34 | superscripted references with brackets, adds carets and underscores to 35 | other superscripts and subscripts and includes the full path to the 36 | subsection title. 37 | 38 | ``` r 39 | library(dplyr) 40 | txt <- pmc_text(doc) 41 | txt 42 | # # A tibble: 194 x 4 43 | # section paragraph sentence text 44 | # 45 | # 1 Title 1 1 Comparative transcriptomics in Yersinia pestis: a global view of e… 46 | # 2 Abstract 1 1 Environmental modulation of gene expression in Yersinia pestis is … 47 | # 3 Abstract 1 2 Using cDNA microarray technology, we have analyzed the global gene… 48 | # 4 Abstract 2 1 To provide us with a comprehensive view of environmental modulatio… 49 | # 5 Abstract 2 2 Almost all known virulence genes of Y. 
pestis were differentially … 50 | # 6 Abstract 2 3 Clustering enabled us to functionally classify co-expressed genes,… 51 | # 7 Abstract 2 4 Collections of operons were predicted from the microarray data, an… 52 | # 8 Abstract 2 5 Several regulatory DNA motifs, probably recognized by the regulato… 53 | # 9 Abstract 3 1 The comparative transcriptomics analysis we present here not only … 54 | # 10 Background 1 1 Yersinia pestis is the etiological agent of plague, alternatively … 55 | # # … with 184 more rows 56 | count(txt, section) 57 | # # A tibble: 21 x 2 58 | # section n 59 | # 60 | # 1 Abstract 8 61 | # 2 Authors' contributions 6 62 | # 3 Background 20 63 | # 4 Conclusion 3 64 | # 5 Methods; Clustering analysis 7 65 | # 6 Methods; Collection of microarray expression data 17 66 | # 7 Methods; Discovery of regulatory DNA motifs 8 67 | # 8 Methods; Gel mobility shift analysis of Fur binding 13 68 | # 9 Methods; Operon prediction 5 69 | # 10 Methods; Verification of predicted operons by RT-PCR 7 70 | # # … with 11 more rows 71 | ``` 72 | 73 | `pmc_caption` splits figure, table and supplementary material captions 74 | into sentences. 75 | 76 | ``` r 77 | cap1 <- pmc_caption(doc) 78 | # Found 5 figures 79 | # Found 4 tables 80 | # Found 3 supplements 81 | filter(cap1, sentence == 1) 82 | # # A tibble: 12 x 4 83 | # tag label sentence text 84 | # 85 | # 1 figure Figure 1 1 Environmental modulation of expression of virulence genes. 86 | # 2 figure Figure 2 1 RT-PCR analysis of potential operons. 87 | # 3 figure Figure 3 1 Schematic representation of the clustered microarray data. 88 | # 4 figure Figure 4 1 Graphical representation of the consensus patterns by moti… 89 | # 5 figure Figure 5 1 EMSA analysis of the binding of Fur protein to promoter DN… 90 | # 6 table Table 1 1 Stress-responsive operons in Y. 
pestis predicted from micr… 91 | # 7 table Table 2 1 Classification of the gene members of the cluster II in Fi… 92 | # 8 table Table 3 1 Motif discovery for the clustering genes 93 | # 9 table Table 4 1 Designs for expression profiling of Y. pestis 94 | # 10 supplem… Additional file 1 … 1 Growth curves of Y. pestis strain 201 under different cond… 95 | # 11 supplem… Additional file 2 … 1 All the transcriptional changes of 4005 genes of Y. pestis… 96 | # 12 supplem… Additional file 3 … 1 List of oligonucleotide primers used in this study. 97 | ``` 98 | 99 | `pmc_table` formats tables by collapsing multiline headers, expanding 100 | rowspan and colspan attributes and adding subheadings into a new column. 101 | 102 | ``` r 103 | tab1 <- pmc_table(doc) 104 | # Parsing 4 tables 105 | # Adding footnotes to Table 1 106 | sapply(tab1, nrow) 107 | # Table 1 Table 2 Table 3 Table 4 108 | # 39 23 4 34 109 | tab1[[1]] 110 | # # A tibble: 39 x 5 111 | # subheading `Potential operon (r … `Gene ID` `Putative or predicted fu… `Reference (s)` 112 | # 113 | # 1 Iron uptake or heme… yfeABCD operon* (r > … YPO2439-2… Transport/binding chelate… yfeABCD [54] 114 | # 2 Iron uptake or heme… hmuRSTUV operon (r > … YPO0279-0… Transport/binding hemin hmuRSTUV [55] 115 | # 3 Iron uptake or heme… ysuJIHG* (r > 0.95) YPO1529-1… Iron uptake - 116 | # 4 Iron uptake or heme… sufABCDS* (r > 0.90) YPO2400-2… Iron-regulated Fe-S clust… - 117 | # 5 Iron uptake or heme… YPO1854-1856* (r > 0.… YPO1854-1… Iron uptake or heme synth… - 118 | # 6 Sulfur metabolism tauABCD operon (r > 0… YPO0182-0… Transport/binding taurine tauABCD [56] 119 | # 7 Sulfur metabolism ssuEADCB operon (r > … YPO3623-3… Sulphur metabolism ssu operon [57] 120 | # 8 Sulfur metabolism cys operon (r > 0.92) YPO3010-3… Cysteine synthesis - 121 | # 9 Sulfur metabolism YPO1317-1319 (r > 0.9… YPO1317-1… Sulfur metabolism? - 122 | # 10 Sulfur metabolism YPO4109-4111 (r > 0.9… YPO4109-4… Sulfur metabolism? 
- 123 | # # … with 29 more rows 124 | ``` 125 | 126 | Captions and footnotes are added as attributes. 127 | 128 | ``` r 129 | attributes(tab1[[1]]) 130 | # $names 131 | # [1] "subheading" "Potential operon (r value)" 132 | # [3] "Gene ID" "Putative or predicted function" 133 | # [5] "Reference (s)" 134 | # 135 | # $row.names 136 | # [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 137 | # [33] 33 34 35 36 37 38 39 138 | # 139 | # $class 140 | # [1] "tbl_df" "tbl" "data.frame" 141 | # 142 | # $caption 143 | # [1] "Stress-responsive operons in Y. pestis predicted from microarray expression data" 144 | # 145 | # $footnotes 146 | # [1] "'r' represents the correlation coefficient of adjacent genes; '*' represent the defined operon has the similar expression pattern in two other published microarray datasets [7, 21]; '?' inferred functions of uncharacterized genes; '-' means the corresponding operons have not been experimentally validated in other bacteria." 147 | ``` 148 | 149 | Use `collapse_rows` to join column names and cell values in a semi-colon 150 | delimited string (and then search using functions in the next section). 
151 | 152 | ``` r 153 | collapse_rows(tab1, na.string="-") 154 | # # A tibble: 100 x 3 155 | # table row text 156 | # 157 | # 1 Table 1 1 subheading=Iron uptake or heme synthesis; Potential operon (r value)=yfeABCD opero… 158 | # 2 Table 1 2 subheading=Iron uptake or heme synthesis; Potential operon (r value)=hmuRSTUV oper… 159 | # 3 Table 1 3 subheading=Iron uptake or heme synthesis; Potential operon (r value)=ysuJIHG* (r >… 160 | # 4 Table 1 4 subheading=Iron uptake or heme synthesis; Potential operon (r value)=sufABCDS* (r … 161 | # 5 Table 1 5 subheading=Iron uptake or heme synthesis; Potential operon (r value)=YPO1854-1856*… 162 | # 6 Table 1 6 subheading=Sulfur metabolism; Potential operon (r value)=tauABCD operon (r > 0.90)… 163 | # 7 Table 1 7 subheading=Sulfur metabolism; Potential operon (r value)=ssuEADCB operon (r > 0.97… 164 | # 8 Table 1 8 subheading=Sulfur metabolism; Potential operon (r value)=cys operon (r > 0.92); Ge… 165 | # 9 Table 1 9 subheading=Sulfur metabolism; Potential operon (r value)=YPO1317-1319 (r > 0.97); … 166 | # 10 Table 1 10 subheading=Sulfur metabolism; Potential operon (r value)=YPO4109-4111 (r > 0.90); … 167 | # # … with 90 more rows 168 | ``` 169 | 170 | `pmc_reference` extracts the id, pmid, authors, year, title, journal, 171 | volume, pages, and DOIs from reference tags. 
172 | 173 | ``` r 174 | ref1 <- pmc_reference(doc) 175 | # Found 76 citation tags 176 | ref1 177 | # # A tibble: 76 x 9 178 | # id pmid authors year title journal volume pages doi 179 | # 180 | # 1 1 89938… Perry RD, Fetherston JD 1997 Yersinia pestis--eti… Clin Mic… 10 35-66 181 | # 2 2 16053… Hinnebusch BJ 2005 The evolution of fle… Curr Iss… 7 197-… 182 | # 3 3 64693… Straley SC, Harmon PA 1984 Yersinia pestis grow… Infect I… 45 655-… 183 | # 4 4 15557… Huang XZ, Lindler LE 2004 The pH 6 antigen is … Infect I… 72 7212… 10.1128/… 184 | # 5 5 15721… Pujol C, Bliska JB 2005 Turning Yersinia pat… Clin Imm… 114 216-… 10.1016/… 185 | # 6 6 12732… Rhodius VA, LaRossa RA 2003 Uses and pitfalls of… Curr Opi… 6 114-… 10.1016/… 186 | # 7 7 15342… Motin VL, Georgescu AM… 2004 Temporal global chan… J Bacter… 186 6298… 10.1128/… 187 | # 8 8 15557… Han Y, Zhou D, Pang X,… 2004 Microarray analysis … Microbio… 48 791-… 188 | # 9 9 15777… Han Y, Zhou D, Pang X,… 2005 DNA microarray analy… Microbes… 7 335-… 10.1016/… 189 | # 10 10 15808… Han Y, Zhou D, Pang X,… 2005 Comparative transcri… Res Micr… 156 403-… 10.1016/… 190 | # # … with 66 more rows 191 | ``` 192 | 193 | Finally, `pmc_metadata` saves journal and article metadata to a list. 
194 | 195 | ``` r 196 | pmc_metadata(doc) 197 | # $PMCID 198 | # [1] "PMC2231364" 199 | # 200 | # $Title 201 | # [1] "Comparative transcriptomics in Yersinia pestis: a global view of environmental modulation of gene expression" 202 | # 203 | # $Authors 204 | # [1] "Yanping Han, Jingfu Qiu, Zhaobiao Guo, He Gao, Yajun Song, Dongsheng Zhou, Ruifu Yang" 205 | # 206 | # $Year 207 | # [1] 2007 208 | # 209 | # $Journal 210 | # [1] "BMC Microbiology" 211 | # 212 | # $Volume 213 | # [1] "7" 214 | # 215 | # $Pages 216 | # [1] "96" 217 | # 218 | # $`Published online` 219 | # [1] "2007-10-29" 220 | # 221 | # $`Date received` 222 | # [1] "2007-6-2" 223 | # 224 | # $DOI 225 | # [1] "10.1186/1471-2180-7-96" 226 | # 227 | # $Publisher 228 | # [1] "BioMed Central" 229 | ``` 230 | 231 | ## Searching text 232 | 233 | There are a few functions to search within the `pmc_text` or collapsed 234 | `pmc_table` output. `separate_text` uses the 235 | [stringr](https://stringr.tidyverse.org/) package to extract any 236 | matching regular expression. 237 | 238 | ``` r 239 | separate_text(txt, "[ATCGN]{5,}") 240 | # # A tibble: 9 x 5 241 | # match section paragraph sentence text 242 | # 243 | # 1 ACGCAATCGT… Results and Discussion; Comp… 2 3 A 16 basepair (bp) box (5'-ACGCAATCG… 244 | # 2 AAACGTTTNC… Results and Discussion; Comp… 2 4 It is very similar to the E. coli Pu… 245 | # 3 TGATAATGAT… Results and Discussion; Comp… 2 5 A 21 bp box (5'-TGATAATGATTATCATTATC… 246 | # 4 GATAATGATA… Results and Discussion; Comp… 2 6 It is a 10-1-10 inverted repeat that… 247 | # 5 TGANNNNNNT… Results and Discussion; Comp… 2 7 A 15 bp box (5'-TGANNNNNNTCAA-3') wa… 248 | # 6 TTGATN Results and Discussion; Comp… 2 8 It is a part of the E. coli Fnr box … 249 | # 7 NATCAA Results and Discussion; Comp… 2 8 It is a part of the E. 
coli Fnr box … 250 | # 8 GTTAATTAA Results and Discussion; Comp… 3 4 The ArcA regulator can recognize a r… 251 | # 9 GTTAATTAAT… Results and Discussion; Comp… 3 5 An ArcA-box-like sequence (5'-GTTAAT… 252 | ``` 253 | 254 | A few wrappers search pre-defined patterns and add an extra step to 255 | expand matched ranges. `separate_refs` matches references within 256 | brackets using `\\[[0-9, -]+\\]` and expands ranges like `[7-9]`. 257 | 258 | ``` r 259 | x <- separate_refs(txt) 260 | x 261 | # # A tibble: 93 x 6 262 | # id match section paragraph sentence text 263 | # 264 | # 1 1 [1] Backgrou… 1 1 Yersinia pestis is the etiological agent of plague, alt… 265 | # 2 2 [2] Backgrou… 1 3 To produce a transmissible infection, Y. pestis coloniz… 266 | # 3 3 [3] Backgrou… 1 9 However, a few bacilli are taken up by tissue macrophag… 267 | # 4 4 [4,5] Backgrou… 1 10 Residence in this niche also facilitates the bacteria's… 268 | # 5 5 [4,5] Backgrou… 1 10 Residence in this niche also facilitates the bacteria's… 269 | # 6 6 [6] Backgrou… 2 1 A DNA microarray is able to determine simultaneous chan… 270 | # 7 7 [7-9] Backgrou… 2 2 We and others have measured the gene expression profile… 271 | # 8 8 [7-9] Backgrou… 2 2 We and others have measured the gene expression profile… 272 | # 9 9 [7-9] Backgrou… 2 2 We and others have measured the gene expression profile… 273 | # 10 10 [10] Backgrou… 2 2 We and others have measured the gene expression profile… 274 | # # … with 83 more rows 275 | filter(x, id == 8) 276 | # # A tibble: 5 x 6 277 | # id match section paragraph sentence text 278 | # 279 | # 1 8 [7-9] Background 2 2 We and others have measured the… 280 | # 2 8 [8-13,1… Background 2 4 In order to acquire more regula… 281 | # 3 8 [7-13,1… Results and Discussion 2 1 Recently, many signature expres… 282 | # 4 8 [7-9] Results and Discussion; Virule… 3 1 As described previously, expres… 283 | # 5 8 [8-10] Methods; Collection of microar… 1 6 The genome-wide transcriptional… 284 | ``` 
285 | 286 | `separate_genes` expands microbial gene operons like `hmsHFRS` into four 287 | separate genes. 288 | 289 | ``` r 290 | separate_genes(txt) 291 | # # A tibble: 103 x 6 292 | # gene match section paragraph sentence text 293 | # 294 | # 1 purR PurR Abstract 2 5 Several regulatory DNA motifs, p… 295 | # 2 phoP PhoP Background 2 3 We also identified the regulons … 296 | # 3 ompR OmpR Background 2 3 We also identified the regulons … 297 | # 4 oxyR OxyR Background 2 3 We also identified the regulons … 298 | # 5 csrA CsrA Results and Discussion 1 3 After the determination of the C… 299 | # 6 slyA SlyA Results and Discussion 1 3 After the determination of the C… 300 | # 7 phoPQ PhoPQ Results and Discussion 1 3 After the determination of the C… 301 | # 8 hmsH hmsHF… Results and Discussion; Virule… 3 3 For example, the hemin storage l… 302 | # 9 hmsF hmsHF… Results and Discussion; Virule… 3 3 For example, the hemin storage l… 303 | # 10 hmsR hmsHF… Results and Discussion; Virule… 3 3 For example, the hemin storage l… 304 | # # … with 93 more rows 305 | ``` 306 | 307 | Finally, `separate_tags` expands locus tag ranges. 
308 | 309 | ``` r 310 | collapse_rows(tab1, na.string="-") %>% 311 | separate_tags("YPO") 312 | # # A tibble: 270 x 5 313 | # id match table row text 314 | # 315 | # 1 YPO2439 YPO2439-2… Table… 1 subheading=Iron uptake or heme synthesis; Potential operon (r va… 316 | # 2 YPO2440 YPO2439-2… Table… 1 subheading=Iron uptake or heme synthesis; Potential operon (r va… 317 | # 3 YPO2441 YPO2439-2… Table… 1 subheading=Iron uptake or heme synthesis; Potential operon (r va… 318 | # 4 YPO2442 YPO2439-2… Table… 1 subheading=Iron uptake or heme synthesis; Potential operon (r va… 319 | # 5 YPO0279 YPO0279-0… Table… 2 subheading=Iron uptake or heme synthesis; Potential operon (r va… 320 | # 6 YPO0280 YPO0279-0… Table… 2 subheading=Iron uptake or heme synthesis; Potential operon (r va… 321 | # 7 YPO0281 YPO0279-0… Table… 2 subheading=Iron uptake or heme synthesis; Potential operon (r va… 322 | # 8 YPO0282 YPO0279-0… Table… 2 subheading=Iron uptake or heme synthesis; Potential operon (r va… 323 | # 9 YPO0283 YPO0279-0… Table… 2 subheading=Iron uptake or heme synthesis; Potential operon (r va… 324 | # 10 YPO1529 YPO1529-1… Table… 3 subheading=Iron uptake or heme synthesis; Potential operon (r va… 325 | # # … with 260 more rows 326 | ``` 327 | 328 | ### Using `xml2` 329 | 330 | The `pmc_*` functions use the [xml2](https://github.com/r-lib/xml2) 331 | package for parsing and may fail in some situations, so it helps to know 332 | how to parse `xml_documents`. Use `cat` and `as.character` to view nodes 333 | returned by `xml_find_all`. 
334 | 335 | ``` r 336 | library(xml2) 337 | refs <- xml_find_all(doc, "//ref") 338 | refs[1] 339 | # {xml_nodeset (1)} 340 | # [1] \n \n 343 | # 344 | # 345 | # 346 | # Perry 347 | # RD 348 | # 349 | # 350 | # Fetherston 351 | # JD 352 | # 353 | # 354 | # Yersinia pestis--etiologic agent of plague 355 | # Clin Microbiol Rev 356 | # 1997 357 | # 10 358 | # 35 359 | # 66 360 | # 8993858 361 | # 362 | # 363 | ``` 364 | 365 | Many journals use superscripts for references cited so they usually 366 | appear after words like `results9` below. 367 | 368 | ``` r 369 | # doc1 <- pmc_xml("PMC6385181") 370 | doc1 <- read_xml(system.file("extdata/PMC6385181.xml", package = "tidypmc")) 371 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2]) 372 | # [1] "RNA-seq identifies the most relevant genes and RT-qPCR validates its results9, especially in the field of environmental and host adaptation10,11 and antimicrobial response12." 373 | ``` 374 | 375 | Find the tags using `xml_find_all` and then update the nodes by adding 376 | brackets or other text. 377 | 378 | ``` r 379 | bib <- xml_find_all(doc1, "//xref[@ref-type='bibr']") 380 | bib[1] 381 | # {xml_nodeset (1)} 382 | # [1] 1 383 | xml_text(bib) <- paste0(" [", xml_text(bib), "]") 384 | bib[1] 385 | # {xml_nodeset (1)} 386 | # [1] [1] 387 | ``` 388 | 389 | The text is now separated from the reference. Note the `pmc_text` 390 | function adds the brackets by default. 391 | 392 | ``` r 393 | gsub(".*\\. ", "", xml_text( xml_find_all(doc1, "//sec/p"))[2]) 394 | # [1] "RNA-seq identifies the most relevant genes and RT-qPCR validates its results [9], especially in the field of environmental and host adaptation [10], [11] and antimicrobial response [12]." 395 | ``` 396 | 397 | Genes, species and many other terms are often included within italic 398 | tags. 
You can mark these nodes using the same code as above or simply list 399 | all the names in italics and search text or tables for matches, for 400 | example three-letter gene names in the text below. 401 | 402 | ``` r 403 | library(tibble) 404 | x <- xml_name(xml_find_all(doc, "//*")) 405 | tibble(tag=x) %>% 406 | count(tag, sort=TRUE) 407 | # # A tibble: 84 x 2 408 | # tag n 409 | # 410 | # 1 td 398 411 | # 2 given-names 388 412 | # 3 name 388 413 | # 4 surname 388 414 | # 5 italic 235 415 | # 6 pub-id 129 416 | # 7 tr 117 417 | # 8 xref 108 418 | # 9 year 80 419 | # 10 article-title 77 420 | # # … with 74 more rows 421 | it <- xml_text(xml_find_all(doc, "//sec//p//italic"), trim=TRUE) 422 | it2 <- tibble(italic=it) %>% 423 | count(italic, sort=TRUE) 424 | it2 425 | # # A tibble: 53 x 2 426 | # italic n 427 | # 428 | # 1 Y. pestis 46 429 | # 2 in vitro 5 430 | # 3 E. coli 4 431 | # 4 psaEFABC 3 432 | # 5 r 3 433 | # 6 cis 2 434 | # 7 fur 2 435 | # 8 n 2 436 | # 9 nrdHIEF 2 437 | # 10 sufABCDSE 2 438 | # # … with 43 more rows 439 | filter(it2, nchar(italic) == 3) 440 | # # A tibble: 8 x 2 441 | # italic n 442 | # 443 | # 1 cis 2 444 | # 2 fur 2 445 | # 3 cys 1 446 | # 4 hmu 1 447 | # 5 ybt 1 448 | # 6 yfe 1 449 | # 7 yfu 1 450 | # 8 ymt 1 451 | separate_text(txt, c("fur", "cys", "hmu", "ybt", "yfe", "yfu", "ymt")) 452 | # # A tibble: 9 x 5 453 | # match section paragraph sentence text 454 | # 455 | # 1 ymt Results and Discussion; Virulence ge… 3 4 The ymt gene encoding Yersinia mur… 456 | # 2 fur Results and Discussion; Clustering a… 3 2 It is noticeable that almost all o… 457 | # 3 yfe Results and Discussion; Clustering a… 3 4 Genes in category A (yfe, hmu, yfu… 458 | # 4 hmu Results and Discussion; Clustering a… 3 4 Genes in category A (yfe, hmu, yfu… 459 | # 5 yfu Results and Discussion; Clustering a… 3 4 Genes in category A (yfe, hmu, yfu… 460 | # 6 ybt Results and Discussion; Clustering a… 3 4 Genes in category A (yfe, hmu, yfu… 461 | # 7 cys Results and Discussion; 
Clustering a… 4 2 Genes responsible for sulfur uptak… 462 | # 8 cys Results and Discussion; Clustering a… 4 3 Cluster III contains members of th… 463 | # 9 fur Methods; Gel mobility shift analysis… 1 1 The entire coding region of the fu… 464 | ``` 465 | --------------------------------------------------------------------------------