├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── cluster.R ├── complement.R ├── intersect.R ├── join_closest.R └── subtract.R ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── docs ├── articles │ ├── index.html │ ├── intro.html │ └── resources │ │ ├── genome_cluster_docu.png │ │ ├── genome_complement_docu.png │ │ ├── genome_intersect_docu.png │ │ ├── genome_join_closest_docu.png │ │ ├── genome_join_docu.png │ │ └── genome_subtract_docu.png ├── authors.html ├── docsearch.css ├── docsearch.js ├── index.html ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml └── reference │ ├── cluster_interval.html │ ├── genome_cluster.html │ ├── genome_complement.html │ ├── genome_intersect.html │ ├── genome_join_closest.html │ ├── genome_subtract.html │ └── index.html ├── man ├── cluster_interval.Rd ├── genome_cluster.Rd ├── genome_complement.Rd ├── genome_intersect.Rd ├── genome_join_closest.Rd └── genome_subtract.Rd ├── src ├── .gitignore ├── RcppExports.cpp ├── cluster_interval.cpp └── tidygenomics_init.c ├── tests ├── testthat.R └── testthat │ ├── test_cluster.R │ ├── test_complement.R │ ├── test_intersect.R │ ├── test_issue.R │ ├── test_join_closest.R │ └── test_subtract.R ├── tidygenomics.Rproj └── vignettes ├── intro.Rmd └── resources ├── genome_cluster_docu.png ├── genome_complement_docu.png ├── genome_intersect_docu.png ├── genome_join_closest_docu.png ├── genome_join_docu.png └── genome_subtract_docu.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | cran-comments.md 4 | ^_pkgdown\.yml$ 5 | ^docs$ 6 | ^pkgdown$ 7 | ^CRAN-RELEASE$ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | 
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tidygenomics 2 | Type: Package 3 | Title: Tidy Verbs for Dealing with Genomic Data Frames 4 | Version: 0.1.2 5 | Authors@R: c(person("Constantin", "Ahlmann-Eltze", email = "artjom31415@googlemail.com", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0002-3762-068X")), 7 | person("Stan Developers", role="cph", 8 | comment="Code from the Stan Math library is reused in 'cluster_interval.cpp'"), 9 | person("David", "Robinson", role="cph", 10 | comment="Code from the fuzzyjoin package is reused")) 11 | Description: Handle genomic data within data frames just as you would with 'GRanges'. 12 | This package provides methods to deal with genomic intervals the "tidy-way" which makes 13 | it simpler to integrate into the general data munging process. The API is inspired by the 14 | popular 'bedtools' and the genome_join() method from the 'fuzzyjoin' package. 
15 | URL: https://github.com/const-ae/tidygenomics 16 | License: GPL-3 17 | Encoding: UTF-8 18 | LazyData: true 19 | Imports: 20 | dplyr, 21 | rlang, 22 | purrr, 23 | tidyr, 24 | fuzzyjoin (>= 0.1.3), 25 | IRanges, 26 | Rcpp 27 | Suggests: testthat, 28 | knitr, 29 | rmarkdown 30 | RoxygenNote: 6.1.1 31 | LinkingTo: 32 | Rcpp 33 | VignetteBuilder: knitr 34 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(cluster_interval) 4 | export(genome_anti_join_closest) 5 | export(genome_cluster) 6 | export(genome_complement) 7 | export(genome_full_join_closest) 8 | export(genome_inner_join_closest) 9 | export(genome_intersect) 10 | export(genome_join_closest) 11 | export(genome_left_join_closest) 12 | export(genome_right_join_closest) 13 | export(genome_semi_join_closest) 14 | export(genome_subtract) 15 | importFrom(Rcpp,sourceCpp) 16 | importFrom(dplyr,"%>%") 17 | importFrom(dplyr,"n") 18 | importFrom(rlang,":=") 19 | importFrom(rlang,"sym") 20 | useDynLib(tidygenomics, .registration = TRUE) 21 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | # v0.1.2 3 | 4 | * Remove remaining calls to deprecated tidyr functions to become compatible with tidyr v1.0.0. 
5 | Thanks to @jennybc for the pull request (#6) 6 | 7 | # v0.1.1 8 | 9 | * Fix issue #5 10 | - the genome_cluster method assigned all chunks to cluster zero if their end was smaller 11 | than the end of the first entry 12 | 13 | * Port dplyr calls to new tidyeval API 14 | - This avoids plenty of deprecation warnings 15 | 16 | * Add pkgdown webpage: https://const-ae.github.io/tidygenomics/ 17 | 18 | # Initial Release (v0.1.0) 19 | 20 | First acceptance on CRAN 21 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | sort_indices <- function(x) { 5 | .Call(`_tidygenomics_sort_indices`, x) 6 | } 7 | 8 | #' Cluster ranges which are implemented as 2 equal-length numeric vectors. 9 | #' @param starts A numeric vector that defines the starts of each interval 10 | #' @param ends A numeric vector that defines the ends of each interval 11 | #' @param max_distance The maximum distance up to which intervals are still considered to be 12 | #' the same cluster. Default: 0. 
13 | #' @examples 14 | #' starts <- c(50, 100, 120) 15 | #' ends <- c(75, 130, 150) 16 | #' j <- cluster_interval(starts, ends) 17 | #' j == c(0,1,1) 18 | #' @export 19 | cluster_interval <- function(starts, ends, max_distance = 0L) { 20 | .Call(`_tidygenomics_cluster_interval`, starts, ends, max_distance) 21 | } 22 | 23 | -------------------------------------------------------------------------------- /R/cluster.R: -------------------------------------------------------------------------------- 1 | 2 | #' @useDynLib tidygenomics, .registration = TRUE 3 | #' @importFrom Rcpp sourceCpp 4 | NULL 5 | 6 | .onUnload <- function (libpath) { 7 | library.dynam.unload("tidygenomics", libpath) 8 | } 9 | 10 | #' Cluster the intervals of a data frame based on chromosome, start and end. 11 | #' 12 | #' @param x A dataframe. 13 | #' @param by A character vector with 3 entries which are the chromosome, start and end column. 14 | #' For example: \code{by=c("chr", "start", "end")} 15 | #' @param max_distance The maximum distance up to which intervals are still considered to be 16 | #' the same cluster. Default: 0. 
17 | #' @param cluster_column_name A string that is used as the new column name 18 | #' @return The dataframe with an additional column, named by \code{cluster_column_name}, that holds the zero-based cluster id 19 | #' @examples 20 | #' 21 | #' library(dplyr) 22 | #' 23 | #' x1 <- data.frame(id = 1:4, bla=letters[1:4], 24 | #' chromosome = c("chr1", "chr1", "chr2", "chr1"), 25 | #' start = c(100, 120, 300, 260), 26 | #' end = c(150, 250, 350, 450)) 27 | #' genome_cluster(x1, by=c("chromosome", "start", "end")) 28 | #' genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10) 29 | #' @export 30 | genome_cluster <- function(x, by=NULL, max_distance=0, cluster_column_name="cluster_id"){ 31 | # remember the grouping of x so that it can be restored on the result 32 | x_groups <- dplyr::groups(x) 33 | x <- dplyr::ungroup(x) 34 | regroup <- function(d) { 35 | if (is.null(x_groups)) { 36 | return(d) 37 | } 38 | g <- purrr::map_chr(x_groups, as.character) 39 | missing <- !(g %in% colnames(d)) 40 | g[missing] <- paste0(g[missing], ".x") 41 | dplyr::group_by_at(d, g) 42 | } 43 | 44 | if (is.null(by) || length(by) != 3) { 45 | stop("genome_cluster must join on exactly three columns") 46 | } 47 | # cluster per chromosome first, then turn the (chromosome, cluster) pairs into one zero-based id 48 | ret <- x %>% 49 | dplyr::group_by(!!sym(by[1])) %>% 50 | dplyr::mutate(!! cluster_column_name := cluster_interval(!!sym(by[2]), !!sym(by[3]), max_distance = max_distance)) %>% 51 | dplyr::ungroup() %>% 52 | dplyr::mutate(!! cluster_column_name := as.numeric(as.factor(paste0(!!sym(by[1]), "-", !!sym(cluster_column_name))))-1) 53 | 54 | ret <- regroup(ret) 55 | return(ret) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /R/complement.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | #' Calculates the complement to the intervals covered by the intervals in 6 | #' a data frame. 
It can optionally take a \code{chromosome_size} data frame 7 | #' that contains 2 or 3 columns: the first holds the chromosome names, followed either 8 | #' by the chromosome size (2 columns) or by the start index and the end index 9 | #' on the chromosome (3 columns). 10 | #' @param x A data frame for which the complement is calculated 11 | #' @param chromosome_size A dataframe with at least 2 columns that contains 12 | #' first the chromosome name and then the size of that chromosome. Can be NULL 13 | #' in which case the largest value per chromosome from \code{x} is used. 14 | #' @param by A character vector with 3 entries which are the chromosome, start and end column. 15 | #' For example: \code{by=c("chr", "start", "end")} 16 | #' @examples 17 | #' 18 | #' library(dplyr) 19 | #' 20 | #' x1 <- data.frame(id = 1:4, bla=letters[1:4], 21 | #' chromosome = c("chr1", "chr1", "chr2", "chr1"), 22 | #' start = c(100, 200, 300, 400), 23 | #' end = c(150, 250, 350, 450)) 24 | #' 25 | #' genome_complement(x1, by=c("chromosome", "start", "end")) 26 | #' @export 27 | genome_complement <- function(x, chromosome_size=NULL, by=NULL){ 28 | 29 | if (is.null(by) || length(by) != 3) { 30 | stop("genome_complement must work on exactly three columns") 31 | } 32 | 33 | # without explicit sizes every chromosome spans from 1 to the largest end found in x 34 | if(is.null(chromosome_size)){ 35 | chromosome_size <- x %>% 36 | dplyr::group_by(!! sym(by[1])) %>% 37 | dplyr::summarize(start = 1, 38 | end = max(!! 
sym(by[3]))) 39 | }else if(ncol(chromosome_size) == 2){ 40 | chromosome_size <- cbind(chromosome_size[, 1, drop=FALSE], data.frame(start=1), chromosome_size[, -1, drop=FALSE]) 41 | } 42 | # the first three columns get the names given in by, which is also used for the subtraction 43 | colnames(chromosome_size)[1:3] <- by 44 | 45 | chromosome_size %>% 46 | genome_subtract(x, by=by) 47 | } 48 | -------------------------------------------------------------------------------- /R/intersect.R: -------------------------------------------------------------------------------- 1 | 2 | #' @importFrom dplyr "%>%" "n" 3 | #' @importFrom rlang "sym" ":=" 4 | NULL 5 | 6 | 7 | ## quiets concerns of R CMD check about the helper column names that are used unquoted in dplyr verbs 8 | if(getRversion() >= "2.15.1") utils::globalVariables(c("..start", "..end", "..id", "..distance")) 9 | 10 | 11 | 12 | #' Intersect data frames based on chromosome, start and end. 13 | #' 14 | #' @param x A dataframe. 15 | #' @param y A dataframe. 16 | #' @param by A character vector with 3 entries which are used to match the chromosome, start and end column. 17 | #' For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")} 18 | #' @param mode One of "both", "left", "right" or "anti". 19 | #' @return The intersected dataframe of \code{x} and \code{y} with the new boundaries. 
20 | #' @examples 21 | #' 22 | #' library(dplyr) 23 | #' 24 | #' x1 <- data.frame(id = 1:4, bla=letters[1:4], 25 | #' chromosome = c("chr1", "chr1", "chr2", "chr2"), 26 | #' start = c(100, 200, 300, 400), 27 | #' end = c(150, 250, 350, 450)) 28 | #' 29 | #' x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 30 | #' chromosome = c("chr1", "chr2", "chr2", "chr1"), 31 | #' start = c(140, 210, 400, 300), 32 | #' end = c(160, 240, 415, 320)) 33 | #' j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 34 | #' print(j) 35 | #' 36 | #' 37 | #' 38 | #' @export 39 | genome_intersect <- function(x, y, by=NULL, mode= "both"){ 40 | 41 | # Much of this code is copied from https://github.com/dgrtwo/fuzzyjoin 42 | # remember the grouping of x so that it can be restored on the result 43 | x_groups <- dplyr::groups(x) 44 | x <- dplyr::ungroup(x) 45 | regroup <- function(d) { 46 | if (is.null(x_groups)) { 47 | return(d) 48 | } 49 | g <- purrr::map_chr(x_groups, as.character) 50 | missing <- !(g %in% colnames(d)) 51 | g[missing] <- paste0(g[missing], ".x") 52 | dplyr::group_by_at(d, g) 53 | } 54 | 55 | mode <- match.arg(mode, c("both", "left", "right", "anti")) 56 | 57 | by <- dplyr::common_by(by, x, y) 58 | 59 | if (length(by$x) != 3) { 60 | stop("genome_intersect must join on exactly three columns") 61 | } 62 | 63 | # returns the row indices of overlapping intervals in x and y plus the boundaries of their intersection 64 | index_match_fun <- function(x,y){ 65 | # nest around the chromosome column 66 | x$..index <- seq_len(nrow(x)) 67 | y$..index <- seq_len(nrow(y)) 68 | nested_x <- dplyr::group_by_at(x, 1) %>% tidyr::nest() 69 | nested_y <- dplyr::group_by_at(y, 1) %>% tidyr::nest() 70 | by <- c(colnames(nested_y)[1]) 71 | names(by) <- colnames(nested_x)[1] 72 | 73 | joined <- dplyr::inner_join(nested_x, nested_y, by = by) 74 | 75 | # find matching ranges in each 76 | find_overlaps <- function(xd, yd) { 77 | r1 <- IRanges::IRanges(xd[[1]], xd[[2]]) 78 | r2 <- IRanges::IRanges(yd[[1]], yd[[2]]) 79 | o <- as.data.frame(IRanges::findOverlaps(r1, r2)) 80 | intersection <- IRanges::pintersect(r1[o$queryHits], r2[o$subjectHits]) 81 | 
data.frame(x = xd$..index[o$queryHits], y = yd$..index[o$subjectHits], 82 | ..start=IRanges::start(intersection), ..end=IRanges::end(intersection)) 83 | } 84 | 85 | ret <- purrr::map2_df(joined$data.x, joined$data.y, find_overlaps) 86 | ret 87 | } 88 | 89 | d1 <- x[, by$x, drop = FALSE] 90 | d2 <- y[, by$y, drop = FALSE] 91 | matches <- index_match_fun(d1, d2) 92 | 93 | matches$i <- NULL 94 | if (mode == "anti") { 95 | if (nrow(matches) == 0) { 96 | return(regroup(x)) 97 | } 98 | return(regroup(x[-sort(unique(matches$x)), ])) 99 | } 100 | if (mode == "left") { 101 | ret <- x %>% 102 | dplyr::select(- dplyr::one_of(by$x[-1])) %>% 103 | dplyr::mutate(..id=seq_len(n())) %>% 104 | dplyr::inner_join(matches[, c("x", "..start", "..end")], by=c("..id"="x")) %>% 105 | dplyr::rename(!! by$x[2] := `..start`, !! by$x[3] := `..end`) %>% 106 | dplyr::select(- `..id`) %>% 107 | regroup() 108 | return(ret) 109 | } 110 | else if (mode == "right") { 111 | ret <- y %>% 112 | dplyr::select(- dplyr::one_of(by$y[-1])) %>% 113 | dplyr::mutate(..id=seq_len(n())) %>% 114 | dplyr::inner_join(matches[,c("y", "..start", "..end")], by=c("..id"="y")) %>% 115 | dplyr::rename(!! by$y[2] := `..start`, !! by$y[3] := `..end`) %>% 116 | dplyr::select(- `..id`) %>% 117 | regroup() 118 | return(ret) 119 | } 120 | # mode "both": columns that exist in x and y but are not join columns get a .x / .y suffix 121 | matches <- dplyr::arrange(matches, x, y) 122 | for (n in intersect(colnames(x), colnames(y))) { 123 | if(! n %in% by$x){ 124 | x <- dplyr::rename(x, !! paste0(n, ".x") := !! sym(n)) 125 | } 126 | if(! n %in% by$y){ 127 | y <- dplyr::rename(y, !! paste0(n, ".y") := !! sym(n)) 128 | } 129 | } 130 | 131 | ret <- dplyr::bind_cols(x[matches$x, , drop = FALSE] %>% dplyr::select(- dplyr::one_of(by$x[-1])), 132 | y[matches$y, , drop = FALSE] %>% dplyr::select(- dplyr::one_of(by$y))) 133 | if (ncol(matches) > 2) { 134 | extra_cols <- matches[, -(1:2), drop = FALSE] 135 | ret <- dplyr::bind_cols(ret, extra_cols) %>% 136 | dplyr::rename(!! by$x[2] := `..start`, !! 
by$x[3] := `..end`) 137 | } 138 | regroup(ret) 139 | 140 | 141 | } 142 | -------------------------------------------------------------------------------- /R/join_closest.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Join intervals on chromosomes in data frames, to the closest partner 4 | #' 5 | #' @param x A dataframe. 6 | #' @param y A dataframe. 7 | #' @param by A character vector with 3 entries which are used to match the chromosome, start and end column. 8 | #' For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")} 9 | #' @param mode One of "inner", "full", "left", "right", "semi" or "anti". 10 | #' @param distance_column_name A string that is used as the new column name with the distance. 11 | #' If \code{NULL} no new column is added. 12 | #' @param max_distance Two entries are only joined if their distance is smaller than this value. 13 | #' @param select A string that is passed on to \code{IRanges::distanceToNearest}, can either be 14 | #' all which means that in case that multiple intervals have the same distance all are reported, or 15 | #' arbitrary which means in that case one would be chosen at random. 16 | #' @param ... Additional arguments passed on to genome_join_closest. 17 | #' @return The joined dataframe of \code{x} and \code{y}. 
18 | #' @examples 19 | #' 20 | #' library(dplyr) 21 | #' 22 | #' x1 <- data.frame(id = 1:4, bla=letters[1:4], 23 | #' chromosome = c("chr1", "chr1", "chr2", "chr2"), 24 | #' start = c(100, 200, 300, 400), 25 | #' end = c(150, 250, 350, 450)) 26 | #' 27 | #' x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 28 | #' chromosome = c("chr1", "chr2", "chr2", "chr1"), 29 | #' start = c(140, 210, 400, 300), 30 | #' end = c(160, 240, 415, 320)) 31 | #' j <- genome_join_closest(x1, x2, by=c("chromosome", "start", "end"), distance_column_name="distance", mode="left") 32 | #' print(j) 33 | #' @export 34 | genome_join_closest <- function(x, y, by=NULL, mode = "inner", 35 | distance_column_name=NULL, max_distance=Inf, select="all"){ 36 | 37 | # Nearly all of this code is copied from https://github.com/dgrtwo/fuzzyjoin 38 | 39 | if (!requireNamespace("IRanges", quietly = TRUE)) { 40 | stop("genome_join_closest requires the IRanges package: ", 41 | "https://bioconductor.org/packages/release/bioc/html/IRanges.html") 42 | } 43 | 44 | select <- match.arg(select, c("all", "arbitrary")) 45 | 46 | by <- dplyr::common_by(by, x, y) 47 | if (length(by$x) != 3) { 48 | stop("genome_join_closest must join on exactly three columns") 49 | } 50 | 51 | f <- function(x, y) { 52 | # nest around the chromosome column 53 | x$..index <- seq_len(nrow(x)) 54 | y$..index <- seq_len(nrow(y)) 55 | 56 | nested_x <- dplyr::group_by_at(x, 1) %>% tidyr::nest() 57 | nested_y <- dplyr::group_by_at(y, 1) %>% tidyr::nest() 58 | by <- c(colnames(nested_y)[1]) 59 | names(by) <- colnames(nested_x)[1] 60 | 61 | joined <- dplyr::inner_join(nested_x, nested_y, by = by) 62 | 63 | # find the nearest y range for the x ranges of each chromosome and keep the pairs below max_distance 64 | find_closest <- function(xd, yd) { 65 | r1 <- IRanges::IRanges(xd[[1]], xd[[2]]) 66 | r2 <- IRanges::IRanges(yd[[1]], yd[[2]]) 67 | o <- as.data.frame(IRanges::distanceToNearest(r1, r2, select=select)) 68 | 69 | data.frame(x = xd$..index[o$queryHits], y = yd$..index[o$subjectHits], ..distance=o$distance) %>% 70 | dplyr::filter(`..distance` < 
max_distance) 71 | } 72 | 73 | ret <- purrr::map2_df(joined$data.x, joined$data.y, find_closest) 74 | 75 | if(! is.null(distance_column_name)){ 76 | ret[[distance_column_name]] <- ret$..distance 77 | } 78 | ret$..distance <- NULL 79 | 80 | ret 81 | } 82 | 83 | fuzzyjoin::fuzzy_join(x, y, mode = mode, index_match_fun = f, multi_by = by) 84 | 85 | } 86 | 87 | 88 | #' @rdname genome_join_closest 89 | #' @export 90 | genome_inner_join_closest <- function(x, y, by = NULL, ...) { 91 | genome_join_closest (x, y, by, mode = "inner", ...) 92 | } 93 | 94 | 95 | #' @rdname genome_join_closest 96 | #' @export 97 | genome_left_join_closest <- function(x, y, by = NULL, ...) { 98 | genome_join_closest (x, y, by, mode = "left", ...) 99 | } 100 | 101 | 102 | #' @rdname genome_join_closest 103 | #' @export 104 | genome_right_join_closest <- function(x, y, by = NULL, ...) { 105 | genome_join_closest (x, y, by, mode = "right", ...) 106 | } 107 | 108 | 109 | #' @rdname genome_join_closest 110 | #' @export 111 | genome_full_join_closest <- function(x, y, by = NULL, ...) { 112 | genome_join_closest (x, y, by, mode = "full", ...) 113 | } 114 | 115 | 116 | #' @rdname genome_join_closest 117 | #' @export 118 | genome_semi_join_closest <- function(x, y, by = NULL, ...) { 119 | genome_join_closest (x, y, by, mode = "semi", ...) 120 | } 121 | 122 | 123 | #' @rdname genome_join_closest 124 | #' @export 125 | genome_anti_join_closest <- function(x, y, by = NULL, ...) { 126 | genome_join_closest (x, y, by, mode = "anti", ...) 127 | } 128 | 129 | 130 | -------------------------------------------------------------------------------- /R/subtract.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' Subtract one data frame from another based on chromosome, start and end. 5 | #' 6 | #' @param x A dataframe. 7 | #' @param y A dataframe. 8 | #' @param by A character vector with 3 entries which are used to match the chromosome, start and end column. 
9 | #' For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")} 10 | #' @return The subtracted dataframe of \code{x} and \code{y} with the new boundaries. 11 | #' @examples 12 | #' 13 | #' library(dplyr) 14 | #' 15 | #' x1 <- data.frame(id = 1:4, bla=letters[1:4], 16 | #' chromosome = c("chr1", "chr1", "chr2", "chr1"), 17 | #' start = c(100, 200, 300, 400), 18 | #' end = c(150, 250, 350, 450)) 19 | #' 20 | #' x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 21 | #' chromosome = c("chr1", "chr2", "chr1", "chr1"), 22 | #' start = c(120, 210, 300, 400), 23 | #' end = c(125, 240, 320, 415)) 24 | #' 25 | #' j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 26 | #' print(j) 27 | #' 28 | #' 29 | #' @export 30 | genome_subtract <- function(x, y, by=NULL){ 31 | 32 | # Much of this code is copied from https://github.com/dgrtwo/fuzzyjoin 33 | # remember the grouping of x so that it can be restored on the result 34 | x_groups <- dplyr::groups(x) 35 | x <- dplyr::ungroup(x) 36 | regroup <- function(d) { 37 | if (is.null(x_groups)) { 38 | return(d) 39 | } 40 | g <- purrr::map_chr(x_groups, as.character) 41 | missing <- !(g %in% colnames(d)) 42 | g[missing] <- paste0(g[missing], ".x") 43 | dplyr::group_by_at(d, g) 44 | } 45 | 46 | by <- dplyr::common_by(by, x, y) 47 | 48 | if (length(by$x) != 3) { 49 | stop("genome_subtract must join on exactly three columns") 50 | } 51 | 52 | # returns for each remaining piece the row index in x and the new boundaries 53 | f <- function(x,y){ 54 | # nest around the chromosome column 55 | x$..index <- seq_len(nrow(x)) 56 | y$..index <- seq_len(nrow(y)) 57 | nested_x <- dplyr::group_by_at(x, 1) %>% tidyr::nest() 58 | nested_y <- dplyr::group_by_at(y, 1) %>% tidyr::nest() 59 | by <- c(colnames(nested_y)[1]) 60 | names(by) <- colnames(nested_x)[1] 61 | 62 | joined <- dplyr::inner_join(nested_x, nested_y, by = by) 63 | 64 | # subtract the y ranges from the x ranges of each chromosome and map the pieces back to the x rows 65 | find_subtractions <- function(xd, yd) { 66 | r1 <- IRanges::IRanges(xd[[1]], xd[[2]]) 67 | r2 <- IRanges::IRanges(yd[[1]], yd[[2]]) 68 | 69 | subtraction <- IRanges::setdiff(r1, r2) 70 | 71 | o <- 
as.data.frame(IRanges::findOverlaps(subtraction, r1)) 72 | data.frame(x = xd$..index[o$subjectHits], 73 | ..start=pmax(IRanges::start(subtraction)[o$queryHits], IRanges::start(r1)[o$subjectHits]), 74 | ..end=pmin(IRanges::end(subtraction)[o$queryHits], IRanges::end(r1)[o$subjectHits])) 75 | } 76 | 77 | ret <- purrr::map2_df(joined$data.x, joined$data.y, find_subtractions) 78 | ret 79 | } 80 | 81 | d1 <- x[, by$x, drop = FALSE] 82 | d2 <- y[, by$y, drop = FALSE] 83 | 84 | matches <- f(d1, d2) 85 | ret <- x %>% 86 | dplyr::select(- dplyr::one_of(by$x[-1])) %>% 87 | dplyr::mutate(..id=seq_len(n())) %>% 88 | dplyr::inner_join(matches[, c("x", "..start", "..end")], by=c("..id"="x")) %>% 89 | dplyr::rename(!! by$x[2] := `..start`, !! by$x[3] := `..end`) %>% 90 | dplyr::select(- `..id`) %>% 91 | regroup() 92 | return(ret) 93 | 94 | } 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tidygenomics 2 | 3 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/tidygenomics)](https://cran.r-project.org/package=tidygenomics) 4 | 5 | Tidy Verbs for Dealing with Genomic Data Frames 6 | 7 | ## Description 8 | 9 | Handle genomic data within data frames just as you would with `GRanges`. 10 | This package provides methods to deal with genomic intervals the "tidy-way" which makes 11 | it simpler to integrate into the general data munging process. The API is inspired by the 12 | popular bedtools and the genome_join() method from the fuzzyjoin package. 13 | 14 | ## Installation 15 | 16 | ``` 17 | install.packages("tidygenomics") 18 | ``` 19 | 20 | Or to get the latest development version 21 | ``` 22 | devtools::install_github("const-ae/tidygenomics") 23 | ``` 24 | 25 | ## Documentation 26 | 27 | 28 | #### genome_intersect 29 | 30 | Joins 2 data frames based on their genomic overlap. 
Unlike the `genome_join` function it updates the boundaries to reflect 31 | the overlap of the regions. 32 | 33 | genome_intersect 34 | 35 | 36 | ```{r} 37 | x1 <- data.frame(id = 1:4, 38 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 39 | start = c(100, 200, 300, 400), 40 | end = c(150, 250, 350, 450)) 41 | 42 | x2 <- data.frame(id = 1:4, 43 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 44 | start = c(140, 210, 400, 300), 45 | end = c(160, 240, 415, 320)) 46 | 47 | genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 48 | ``` 49 | 50 | | id.x|chromosome | id.y| start| end| 51 | |----:|:----------|----:|-----:|---:| 52 | | 1|chr1 | 1| 140| 150| 53 | | 4|chr2 | 3| 400| 415| 54 | 55 | #### genome_subtract 56 | 57 | Subtracts one data frame from the other. This can be used to split the x data frame into smaller areas. 58 | 59 | genome_subtract 60 | 61 | ```{r} 62 | x1 <- data.frame(id = 1:4, 63 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 64 | start = c(100, 200, 300, 400), 65 | end = c(150, 250, 350, 450)) 66 | 67 | x2 <- data.frame(id = 1:4, 68 | chromosome = c("chr1", "chr2", "chr1", "chr1"), 69 | start = c(120, 210, 300, 400), 70 | end = c(125, 240, 320, 415)) 71 | 72 | genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 73 | ``` 74 | 75 | | id|chromosome | start| end| 76 | |--:|:----------|-----:|---:| 77 | | 1|chr1 | 100| 119| 78 | | 1|chr1 | 126| 150| 79 | | 2|chr1 | 200| 250| 80 | | 3|chr2 | 300| 350| 81 | | 4|chr1 | 416| 450| 82 | 83 | 84 | #### genome_join_closest 85 | 86 | Joins 2 data frames based on their genomic location. If no exact overlap is found the next closest interval is used. 
87 | 88 | genome_join_closest 89 | 90 | ```{r} 91 | x1 <- data_frame(id = 1:4, 92 | chr = c("chr1", "chr1", "chr2", "chr3"), 93 | start = c(100, 200, 300, 400), 94 | end = c(150, 250, 350, 450)) 95 | 96 | x2 <- data_frame(id = 1:4, 97 | chr = c("chr1", "chr1", "chr1", "chr2"), 98 | start = c(220, 210, 300, 400), 99 | end = c(225, 240, 320, 415)) 100 | genome_join_closest(x1, x2, by=c("chr", "start", "end"), distance_column_name="distance", mode="left") 101 | ``` 102 | 103 | | id.x|chr.x | start.x| end.x| id.y|chr.y | start.y| end.y| distance| 104 | |----:|:-----|-------:|-----:|----:|:-----|-------:|-----:|--------:| 105 | | 1|chr1 | 100| 150| 2|chr1 | 210| 240| 59| 106 | | 2|chr1 | 200| 250| 1|chr1 | 220| 225| 0| 107 | | 2|chr1 | 200| 250| 2|chr1 | 210| 240| 0| 108 | | 3|chr2 | 300| 350| 4|chr2 | 400| 415| 49| 109 | | 4|chr3 | 400| 450| NA|NA | NA| NA| NA| 110 | 111 | #### genome_cluster 112 | 113 | Add a new column with the cluster if 2 intervals are overlapping or are within the `max_distance`. 114 | 115 | genome_cluster 116 | 117 | ```{r} 118 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 119 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 120 | start = c(100, 120, 300, 260), 121 | end = c(150, 250, 350, 450)) 122 | genome_cluster(x1, by=c("chromosome", "start", "end")) 123 | ``` 124 | 125 | | id|bla |chromosome | start| end| cluster_id| 126 | |--:|:---|:----------|-----:|---:|----------:| 127 | | 1|a |chr1 | 100| 150| 0| 128 | | 2|b |chr1 | 120| 250| 0| 129 | | 3|c |chr2 | 300| 350| 2| 130 | | 4|d |chr1 | 260| 450| 1| 131 | 132 | ```{r} 133 | genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10) 134 | ``` 135 | 136 | | id|bla |chromosome | start| end| cluster_id| 137 | |--:|:---|:----------|-----:|---:|----------:| 138 | | 1|a |chr1 | 100| 150| 0| 139 | | 2|b |chr1 | 120| 250| 0| 140 | | 3|c |chr2 | 300| 350| 1| 141 | | 4|d |chr1 | 260| 450| 0| 142 | 143 | #### genome_complement 144 | 145 | Calculates the complement of a genomic region. 
146 | 147 | genome_complement 148 | 149 | ```{r} 150 | x1 <- data.frame(id = 1:4, 151 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 152 | start = c(100, 200, 300, 400), 153 | end = c(150, 250, 350, 450)) 154 | 155 | genome_complement(x1, by=c("chromosome", "start", "end")) 156 | ``` 157 | 158 | |chromosome | start| end| 159 | |:----------|-----:|---:| 160 | |chr1 | 1| 99| 161 | |chr1 | 151| 199| 162 | |chr1 | 251| 399| 163 | |chr2 | 1| 299| 164 | 165 | 166 | #### genome_join 167 | 168 | Classical join function based on the overlap of the interval. Implemented and maintained in the 169 | [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin) package and documented here only for completeness. 170 | 171 | genome_join 172 | 173 | ```{r} 174 | x1 <- data_frame(id = 1:4, 175 | chr = c("chr1", "chr1", "chr2", "chr3"), 176 | start = c(100, 200, 300, 400), 177 | end = c(150, 250, 350, 450)) 178 | 179 | x2 <- data_frame(id = 1:4, 180 | chr = c("chr1", "chr1", "chr1", "chr2"), 181 | start = c(220, 210, 300, 400), 182 | end = c(225, 240, 320, 415)) 183 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="inner") 184 | ``` 185 | 186 | | id.x|chr.x | start.x| end.x| id.y|chr.y | start.y| end.y| 187 | |----:|:-----|-------:|-----:|----:|:-----|-------:|-----:| 188 | | 2|chr1 | 200| 250| 1|chr1 | 220| 225| 189 | | 2|chr1 | 200| 250| 2|chr1 | 210| 240| 190 | 191 | ```{r} 192 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="left") 193 | ``` 194 | 195 | | id.x|chr.x | start.x| end.x| id.y|chr.y | start.y| end.y| 196 | |----:|:-----|-------:|-----:|----:|:-----|-------:|-----:| 197 | | 1|chr1 | 100| 150| NA|NA | NA| NA| 198 | | 2|chr1 | 200| 250| 1|chr1 | 220| 225| 199 | | 2|chr1 | 200| 250| 2|chr1 | 210| 240| 200 | | 3|chr2 | 300| 350| NA|NA | NA| NA| 201 | | 4|chr3 | 400| 450| NA|NA | NA| NA| 202 | 203 | ```{r} 204 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="anti") 205 | ``` 206 | 207 | | id|chr | start| end| 208 | 
|--:|:----|-----:|---:| 209 | | 1|chr1 | 100| 150| 210 | | 3|chr2 | 300| 350| 211 | | 4|chr3 | 400| 450| 212 | 213 | 214 | 215 | ## Inspiration 216 | 217 | - [tidyverse](http://tidyverse.org/) 218 | - [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin) 219 | - [GenomicRanges](http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html) 220 | - [bedtools](http://bedtools.readthedocs.io) 221 | 222 | If you have any additional questions or encounter issues please raise them on the [GitHub page](https://github.com/const-ae/tidygenomics). 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | destination: docs 2 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | # Bug fix 2 | 3 | In this release I have fixed a bug and ported code to new dplyr API 4 | 5 | ## Test environments 6 | * macOS Mojave: R 3.6.1 7 | * R-Hub (Fedora R-devel, Ubuntu R-release) 8 | * winbuilder (R-devel, R-release) 9 | 10 | ## R CMD check results 11 | There were no ERRORs or WARNINGs 12 | 13 | ## Downstream dependencies 14 | 15 | There is no downstream dependency yet 16 | 17 | -------------------------------------------------------------------------------- /docs/articles/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Articles • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 107 | 108 | 109 |
110 | 111 |
112 |
113 | 116 | 117 |
118 |

All vignettes

119 |

120 | 121 | 124 |
125 |
126 |
127 | 128 | 137 |
138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /docs/articles/resources/genome_cluster_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_cluster_docu.png -------------------------------------------------------------------------------- /docs/articles/resources/genome_complement_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_complement_docu.png -------------------------------------------------------------------------------- /docs/articles/resources/genome_intersect_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_intersect_docu.png -------------------------------------------------------------------------------- /docs/articles/resources/genome_join_closest_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_join_closest_docu.png -------------------------------------------------------------------------------- /docs/articles/resources/genome_join_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_join_docu.png -------------------------------------------------------------------------------- /docs/articles/resources/genome_subtract_docu.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/docs/articles/resources/genome_subtract_docu.png -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Authors • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 107 | 108 | 109 |
110 | 111 |
112 |
113 | 116 | 117 |
    118 |
  • 119 |

    Constantin Ahlmann-Eltze. Author, maintainer. ORCID 120 |

    121 |
  • 122 |
  • 123 |

    Stan Developers. Copyright holder. 124 |
    Code from the Stan Math library is reused in 'cluster_interval.cpp'

    125 |
  • 126 |
  • 127 |

    David Robinson. Copyright holder. 128 |
    Code from the fuzzyjoin package is reused

    129 |
  • 130 |
131 | 132 |
133 | 134 |
135 | 136 | 137 | 146 |
147 | 148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /docs/docsearch.css: -------------------------------------------------------------------------------- 1 | /* Docsearch -------------------------------------------------------------- */ 2 | /* 3 | Source: https://github.com/algolia/docsearch/ 4 | License: MIT 5 | */ 6 | 7 | .algolia-autocomplete { 8 | display: block; 9 | -webkit-box-flex: 1; 10 | -ms-flex: 1; 11 | flex: 1 12 | } 13 | 14 | .algolia-autocomplete .ds-dropdown-menu { 15 | width: 100%; 16 | min-width: none; 17 | max-width: none; 18 | padding: .75rem 0; 19 | background-color: #fff; 20 | background-clip: padding-box; 21 | border: 1px solid rgba(0, 0, 0, .1); 22 | box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); 23 | } 24 | 25 | @media (min-width:768px) { 26 | .algolia-autocomplete .ds-dropdown-menu { 27 | width: 175% 28 | } 29 | } 30 | 31 | .algolia-autocomplete .ds-dropdown-menu::before { 32 | display: none 33 | } 34 | 35 | .algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { 36 | padding: 0; 37 | background-color: rgb(255,255,255); 38 | border: 0; 39 | max-height: 80vh; 40 | } 41 | 42 | .algolia-autocomplete .ds-dropdown-menu .ds-suggestions { 43 | margin-top: 0 44 | } 45 | 46 | .algolia-autocomplete .algolia-docsearch-suggestion { 47 | padding: 0; 48 | overflow: visible 49 | } 50 | 51 | .algolia-autocomplete .algolia-docsearch-suggestion--category-header { 52 | padding: .125rem 1rem; 53 | margin-top: 0; 54 | font-size: 1.3em; 55 | font-weight: 500; 56 | color: #00008B; 57 | border-bottom: 0 58 | } 59 | 60 | .algolia-autocomplete .algolia-docsearch-suggestion--wrapper { 61 | float: none; 62 | padding-top: 0 63 | } 64 | 65 | .algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { 66 | float: none; 67 | width: auto; 68 | padding: 0; 69 | text-align: left 70 | } 71 | 72 | .algolia-autocomplete .algolia-docsearch-suggestion--content { 73 | float: none; 74 | 
width: auto; 75 | padding: 0 76 | } 77 | 78 | .algolia-autocomplete .algolia-docsearch-suggestion--content::before { 79 | display: none 80 | } 81 | 82 | .algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { 83 | padding-top: .75rem; 84 | margin-top: .75rem; 85 | border-top: 1px solid rgba(0, 0, 0, .1) 86 | } 87 | 88 | .algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { 89 | display: block; 90 | padding: .1rem 1rem; 91 | margin-bottom: 0.1; 92 | font-size: 1.0em; 93 | font-weight: 400 94 | /* display: none */ 95 | } 96 | 97 | .algolia-autocomplete .algolia-docsearch-suggestion--title { 98 | display: block; 99 | padding: .25rem 1rem; 100 | margin-bottom: 0; 101 | font-size: 0.9em; 102 | font-weight: 400 103 | } 104 | 105 | .algolia-autocomplete .algolia-docsearch-suggestion--text { 106 | padding: 0 1rem .5rem; 107 | margin-top: -.25rem; 108 | font-size: 0.8em; 109 | font-weight: 400; 110 | line-height: 1.25 111 | } 112 | 113 | .algolia-autocomplete .algolia-docsearch-footer { 114 | width: 110px; 115 | height: 20px; 116 | z-index: 3; 117 | margin-top: 10.66667px; 118 | float: right; 119 | font-size: 0; 120 | line-height: 0; 121 | } 122 | 123 | .algolia-autocomplete .algolia-docsearch-footer--logo { 124 | background-image: url("data:image/svg+xml;utf8,"); 125 | background-repeat: no-repeat; 126 | background-position: 50%; 127 | background-size: 100%; 128 | overflow: hidden; 129 | text-indent: -9000px; 130 | width: 100%; 131 | height: 100%; 132 | display: block; 133 | transform: translate(-8px); 134 | } 135 | 136 | .algolia-autocomplete .algolia-docsearch-suggestion--highlight { 137 | color: #FF8C00; 138 | background: rgba(232, 189, 54, 0.1) 139 | } 140 | 141 | 142 | .algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { 143 | box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) 144 | } 145 | 146 | .algolia-autocomplete 
.ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { 147 | background-color: rgba(192, 192, 192, .15) 148 | } 149 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Tidy Verbs for Dealing with Genomic Data Frames • tidygenomics 9 | 10 | 11 | 12 | 13 | 17 | 18 | 22 | 23 | 24 |
25 |
80 | 81 | 82 | 83 |
84 |
85 |
86 | 88 | 89 |

Tidy Verbs for Dealing with Genomic Data Frames

90 |
91 |

92 | Description

93 |

Handle genomic data within data frames just as you would with GRanges. This package provides methods to deal with genomic intervals the “tidy-way”, which makes it simpler to integrate them into the general data munging process. The API is inspired by the popular bedtools and the genome_join() method from the fuzzyjoin package.

94 |
95 | 103 |
104 |

105 | Documentation

106 |
107 |

108 | genome_intersect

109 |

Joins 2 data frames based on their genomic overlap. Unlike the genome_join function it updates the boundaries to reflect the overlap of the regions.

110 |

genome_intersect

111 |
x1 <- data.frame(id = 1:4, 
112 |                 chromosome = c("chr1", "chr1", "chr2", "chr2"),
113 |                 start = c(100, 200, 300, 400),
114 |                 end = c(150, 250, 350, 450))
115 | 
116 | x2 <- data.frame(id = 1:4,
117 |                  chromosome = c("chr1", "chr2", "chr2", "chr1"),
118 |                  start = c(140, 210, 400, 300),
119 |                  end = c(160, 240, 415, 320))
120 | 
121 | genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both")
122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 |
id.xchromosomeid.ystartend
1chr11140150
4chr23400415
147 |
148 |
149 |

150 | genome_subtract

151 |

Subtracts one data frame from the other. This can be used to split the x data frame into smaller areas.

152 |

genome_subtract

153 |
x1 <- data.frame(id = 1:4,
154 |                 chromosome = c("chr1", "chr1", "chr2", "chr1"),
155 |                 start = c(100, 200, 300, 400),
156 |                 end = c(150, 250, 350, 450))
157 | 
158 | x2 <- data.frame(id = 1:4,
159 |                 chromosome = c("chr1", "chr2", "chr1", "chr1"),
160 |                 start = c(120, 210, 300, 400),
161 |                 end = c(125, 240, 320, 415))
162 | 
163 | genome_subtract(x1, x2, by=c("chromosome", "start", "end"))
164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 |
idchromosomestartend
1chr1100119
1chr1126150
2chr1200250
3chr2300350
4chr1416450
204 |
205 |
206 |

207 | genome_join_closest

208 |

Joins 2 data frames based on their genomic location. If no exact overlap is found the next closest interval is used.

209 |

genome_join_closest

210 |
x1 <- data_frame(id = 1:4, 
211 |                  chr = c("chr1", "chr1", "chr2", "chr3"),
212 |                  start = c(100, 200, 300, 400),
213 |                  end = c(150, 250, 350, 450))
214 | 
215 | x2 <- data_frame(id = 1:4,
216 |                  chr = c("chr1", "chr1", "chr1", "chr2"),
217 |                  start = c(220, 210, 300, 400),
218 |                  end = c(225, 240, 320, 415))
219 | genome_join_closest(x1, x2, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 |
id.xchr.xstart.xend.xid.ychr.ystart.yend.ydistance
1chr11001502chr121024059
2chr12002501chr12202250
2chr12002502chr12102400
3chr23003504chr240041549
4chr3400450NANANANANA
290 |
291 |
292 |

293 | genome_cluster

294 |

Adds a new column with a cluster id: intervals that overlap or lie within max_distance of each other are assigned to the same cluster.

295 |

genome_cluster

296 |
x1 <- data.frame(id = 1:4, bla=letters[1:4],
297 |                 chromosome = c("chr1", "chr1", "chr2", "chr1"),
298 |                 start = c(100, 120, 300, 260),
299 |                 end = c(150, 250, 350, 450))
300 | genome_cluster(x1, by=c("chromosome", "start", "end"))
301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 |
idblachromosomestartendcluster_id
1achr11001500
2bchr11202500
3cchr23003502
4dchr12604501
345 |
genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10)
346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 |
idblachromosomestartendcluster_id
1achr11001500
2bchr11202500
3cchr23003501
4dchr12604500
390 |
391 |
392 |

393 | genome_complement

394 |

Calculates the complement of a genomic region.

395 |

genome_complement

396 |
x1 <- data.frame(id = 1:4,
397 |                  chromosome = c("chr1", "chr1", "chr2", "chr1"),
398 |                  start = c(100, 200, 300, 400),
399 |                  end = c(150, 250, 350, 450))
400 | 
401 | genome_complement(x1, by=c("chromosome", "start", "end"))
402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 |
chromosomestartend
chr1199
chr1151199
chr1251399
chr21299
431 |
432 |
433 |

434 | genome_join

435 |

Classical join function based on the overlap of the interval. Implemented and maintained in the fuzzyjoin package and documented here only for completeness.

436 |

genome_join

437 |
x1 <- data_frame(id = 1:4, 
438 |                  chr = c("chr1", "chr1", "chr2", "chr3"),
439 |                  start = c(100, 200, 300, 400),
440 |                  end = c(150, 250, 350, 450))
441 | 
442 | x2 <- data_frame(id = 1:4,
443 |                  chr = c("chr1", "chr1", "chr1", "chr2"),
444 |                  start = c(220, 210, 300, 400),
445 |                  end = c(225, 240, 320, 415))
446 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="inner")
447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 |
id.xchr.xstart.xend.xid.ychr.ystart.yend.y
2chr12002501chr1220225
2chr12002502chr1210240
481 |
fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="left")
482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 |
id.xchr.xstart.xend.xid.ychr.ystart.yend.y
1chr1100150NANANANA
2chr12002501chr1220225
2chr12002502chr1210240
3chr2300350NANANANA
4chr3400450NANANANA
546 |
fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="anti")
547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 |
idchrstartend
1chr1100150
3chr2300350
4chr3400450
575 |
576 |
577 |
578 |

579 | Inspiration

580 | 586 |

If you have any additional questions or encounter issues, please raise them on the GitHub page.

587 |
588 |
589 |
590 | 591 | 622 |
623 | 624 | 632 |
633 | 634 | 635 | 636 | 637 | 638 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /docs/news/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Changelog • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
51 |
52 | 107 | 108 | 109 |
110 | 111 |
112 |
113 | 117 | 118 |
119 | 120 | 127 | 128 |
129 | 130 | 139 |
140 | 141 | 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body > .container { 21 | display: flex; 22 | height: 100%; 23 | flex-direction: column; 24 | 25 | padding-top: 60px; 26 | } 27 | 28 | body > .container .row { 29 | flex: 1 0 auto; 30 | } 31 | 32 | footer { 33 | margin-top: 45px; 34 | padding: 35px 0 36px; 35 | border-top: 1px solid #e5e5e5; 36 | color: #666; 37 | display: flex; 38 | flex-shrink: 0; 39 | } 40 | footer p { 41 | margin-bottom: 0; 42 | } 43 | footer div { 44 | flex: 1; 45 | } 46 | footer .pkgdown { 47 | text-align: right; 48 | } 49 | footer p { 50 | margin-bottom: 0; 51 | } 52 | 53 | img.icon { 54 | float: right; 55 | } 56 | 57 | img { 58 | max-width: 100%; 59 | } 60 | 61 | /* Fix bug in bootstrap (only seen in firefox) */ 62 | summary { 63 | display: list-item; 64 | } 65 | 66 | /* Typographic tweaking ---------------------------------*/ 67 | 68 | .contents .page-header { 69 | margin-top: calc(-60px + 1em); 70 | } 71 | 72 | /* Section anchors ---------------------------------*/ 73 | 74 | a.anchor { 75 | margin-left: -30px; 76 | display:inline-block; 77 | width: 30px; 78 | height: 30px; 79 | visibility: hidden; 80 | 81 | background-image: url(./link.svg); 82 | background-repeat: no-repeat; 83 | background-size: 20px 20px; 84 | background-position: center center; 
85 | } 86 | 87 | .hasAnchor:hover a.anchor { 88 | visibility: visible; 89 | } 90 | 91 | @media (max-width: 767px) { 92 | .hasAnchor:hover a.anchor { 93 | visibility: hidden; 94 | } 95 | } 96 | 97 | 98 | /* Fixes for fixed navbar --------------------------*/ 99 | 100 | .contents h1, .contents h2, .contents h3, .contents h4 { 101 | padding-top: 60px; 102 | margin-top: -40px; 103 | } 104 | 105 | /* Static header placement on mobile devices */ 106 | @media (max-width: 767px) { 107 | .navbar-fixed-top { 108 | position: absolute; 109 | } 110 | .navbar { 111 | padding: 0; 112 | } 113 | } 114 | 115 | 116 | /* Sidebar --------------------------*/ 117 | 118 | #sidebar { 119 | margin-top: 30px; 120 | } 121 | #sidebar h2 { 122 | font-size: 1.5em; 123 | margin-top: 1em; 124 | } 125 | 126 | #sidebar h2:first-child { 127 | margin-top: 0; 128 | } 129 | 130 | #sidebar .list-unstyled li { 131 | margin-bottom: 0.5em; 132 | } 133 | 134 | .orcid { 135 | height: 16px; 136 | vertical-align: middle; 137 | } 138 | 139 | /* Reference index & topics ----------------------------------------------- */ 140 | 141 | .ref-index th {font-weight: normal;} 142 | 143 | .ref-index td {vertical-align: top;} 144 | .ref-index .icon {width: 40px;} 145 | .ref-index .alias {width: 40%;} 146 | .ref-index-icons .alias {width: calc(40% - 40px);} 147 | .ref-index .title {width: 60%;} 148 | 149 | .ref-arguments th {text-align: right; padding-right: 10px;} 150 | .ref-arguments th, .ref-arguments td {vertical-align: top;} 151 | .ref-arguments .name {width: 20%;} 152 | .ref-arguments .desc {width: 80%;} 153 | 154 | /* Nice scrolling for wide elements --------------------------------------- */ 155 | 156 | table { 157 | display: block; 158 | overflow: auto; 159 | } 160 | 161 | /* Syntax highlighting ---------------------------------------------------- */ 162 | 163 | pre { 164 | word-wrap: normal; 165 | word-break: normal; 166 | border: 1px solid #eee; 167 | } 168 | 169 | pre, code { 170 | background-color: #f8f8f8; 
171 | color: #333; 172 | } 173 | 174 | pre code { 175 | overflow: auto; 176 | word-wrap: normal; 177 | white-space: pre; 178 | } 179 | 180 | pre .img { 181 | margin: 5px 0; 182 | } 183 | 184 | pre .img img { 185 | background-color: #fff; 186 | display: block; 187 | height: auto; 188 | } 189 | 190 | code a, pre a { 191 | color: #375f84; 192 | } 193 | 194 | a.sourceLine:hover { 195 | text-decoration: none; 196 | } 197 | 198 | .fl {color: #1514b5;} 199 | .fu {color: #000000;} /* function */ 200 | .ch,.st {color: #036a07;} /* string */ 201 | .kw {color: #264D66;} /* keyword */ 202 | .co {color: #888888;} /* comment */ 203 | 204 | .message { color: black; font-weight: bolder;} 205 | .error { color: orange; font-weight: bolder;} 206 | .warning { color: #6A0366; font-weight: bolder;} 207 | 208 | /* Clipboard --------------------------*/ 209 | 210 | .hasCopyButton { 211 | position: relative; 212 | } 213 | 214 | .btn-copy-ex { 215 | position: absolute; 216 | right: 0; 217 | top: 0; 218 | visibility: hidden; 219 | } 220 | 221 | .hasCopyButton:hover button.btn-copy-ex { 222 | visibility: visible; 223 | } 224 | 225 | /* mark.js ----------------------------*/ 226 | 227 | mark { 228 | background-color: rgba(255, 255, 51, 0.5); 229 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 230 | padding: 1px; 231 | } 232 | 233 | /* vertical spacing after htmlwidgets */ 234 | .html-widget { 235 | margin-bottom: 10px; 236 | } 237 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $("#sidebar") 6 | .stick_in_parent({offset_top: 40}) 7 | .on('sticky_kit:bottom', function(e) { 8 | $(this).parent().css('position', 'static'); 9 | }) 10 | .on('sticky_kit:unbottom', function(e) { 11 | $(this).parent().css('position', 'relative'); 12 | }); 13 | 14 | 
$('body').scrollspy({ 15 | target: '#sidebar', 16 | offset: 60 17 | }); 18 | 19 | $('[data-toggle="tooltip"]').tooltip(); 20 | 21 | var cur_path = paths(location.pathname); 22 | var links = $("#navbar ul li a"); 23 | var max_length = -1; 24 | var pos = -1; 25 | for (var i = 0; i < links.length; i++) { 26 | if (links[i].getAttribute("href") === "#") 27 | continue; 28 | // Ignore external links 29 | if (links[i].host !== location.host) 30 | continue; 31 | 32 | var nav_path = paths(links[i].pathname); 33 | 34 | var length = prefix_length(nav_path, cur_path); 35 | if (length > max_length) { 36 | max_length = length; 37 | pos = i; 38 | } 39 | } 40 | 41 | // Add class to parent
  • , and enclosing
  • if in dropdown 42 | if (pos >= 0) { 43 | var menu_anchor = $(links[pos]); 44 | menu_anchor.parent().addClass("active"); 45 | menu_anchor.closest("li.dropdown").addClass("active"); 46 | } 47 | }); 48 | 49 | function paths(pathname) { 50 | var pieces = pathname.split("/"); 51 | pieces.shift(); // always starts with / 52 | 53 | var end = pieces[pieces.length - 1]; 54 | if (end === "index.html" || end === "") 55 | pieces.pop(); 56 | return(pieces); 57 | } 58 | 59 | // Returns -1 if not found 60 | function prefix_length(needle, haystack) { 61 | if (needle.length > haystack.length) 62 | return(-1); 63 | 64 | // Special case for length-0 haystack, since for loop won't run 65 | if (haystack.length === 0) { 66 | return(needle.length === 0 ? 0 : -1); 67 | } 68 | 69 | for (var i = 0; i < haystack.length; i++) { 70 | if (needle[i] != haystack[i]) 71 | return(i); 72 | } 73 | 74 | return(haystack.length); 75 | } 76 | 77 | /* Clipboard --------------------------*/ 78 | 79 | function changeTooltipMessage(element, msg) { 80 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 81 | element.setAttribute('data-original-title', msg); 82 | $(element).tooltip('show'); 83 | element.setAttribute('data-original-title', tooltipOriginalTitle); 84 | } 85 | 86 | if(ClipboardJS.isSupported()) { 87 | $(document).ready(function() { 88 | var copyButton = ""; 89 | 90 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 91 | 92 | // Insert copy buttons: 93 | $(copyButton).prependTo(".hasCopyButton"); 94 | 95 | // Initialize tooltips: 96 | $('.btn-copy-ex').tooltip({container: 'body'}); 97 | 98 | // Initialize clipboard: 99 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 100 | text: function(trigger) { 101 | return trigger.parentNode.textContent; 102 | } 103 | }); 104 | 105 | clipboardBtnCopies.on('success', function(e) { 106 | changeTooltipMessage(e.trigger, 'Copied!'); 107 | e.clearSelection(); 108 | }); 109 | 110 | clipboardBtnCopies.on('error', 
function() { 111 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 112 | }); 113 | }); 114 | } 115 | })(window.jQuery || window.$) 116 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.3.1 2 | pkgdown: 1.3.0 3 | pkgdown_sha: ~ 4 | articles: 5 | intro: intro.html 6 | 7 | -------------------------------------------------------------------------------- /docs/reference/cluster_interval.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Cluster ranges which are implemented as 2 equal-length numeric vectors. — cluster_interval • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 110 | 111 | 112 |
    113 | 114 |
    115 |
    116 | 121 | 122 |
    123 | 124 |

    Cluster ranges which are implemented as 2 equal-length numeric vectors.

    125 | 126 |
    127 | 128 |
    cluster_interval(starts, ends, max_distance = 0L)
    129 | 130 |

    Arguments

    131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 145 | 146 |
    starts

    A numeric vector that defines the starts of each interval

    ends

    A numeric vector that defines the ends of each interval

    max_distance

    The maximum distance up to which intervals are still considered to be 144 | the same cluster. Default: 0.

    147 | 148 | 149 |

    Examples

    150 |
    starts <- c(50, 100, 120) 151 | ends <- c(75, 130, 150) 152 | j <- cluster_interval(starts, ends) 153 | j == c(0,1,1)
    #> [1] TRUE TRUE TRUE
    154 |
    155 | 164 |
    165 | 166 |
    167 | 170 | 171 |
    172 |

    Site built with pkgdown 1.3.0.

    173 |
    174 |
    175 |
    176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /docs/reference/genome_cluster.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Intersect data frames based on chromosome, start and end. — genome_cluster • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 110 | 111 | 112 |
    113 | 114 |
    115 |
    116 | 121 | 122 |
    123 | 124 |

    Cluster the intervals of a data frame based on chromosome, start and end.

    125 | 126 |
    127 | 128 |
    genome_cluster(x, by = NULL, max_distance = 0,
    129 |   cluster_column_name = "cluster_id")
    130 | 131 |

    Arguments

    132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 142 | 143 | 144 | 145 | 147 | 148 | 149 | 150 | 151 | 152 |
    x

    A dataframe.

    by

    A character vector with 3 entries which are the chromosome, start and end column. 141 | For example: by=c("chr", "start", "end")

    max_distance

    The maximum distance up to which intervals are still considered to be 146 | the same cluster. Default: 0.

    cluster_column_name

    A string that is used as the new column name

    153 | 154 |

    Value

    155 | 156 |

    The dataframe with the additional column of the cluster

    157 | 158 | 159 |

    Examples

    160 |
    161 | library(dplyr)
    #> 162 | #> Attaching package: ‘dplyr’
    #> The following object is masked from ‘package:testthat’: 163 | #> 164 | #> matches
    #> The following objects are masked from ‘package:stats’: 165 | #> 166 | #> filter, lag
    #> The following objects are masked from ‘package:base’: 167 | #> 168 | #> intersect, setdiff, setequal, union
    169 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 170 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 171 | start = c(100, 120, 300, 260), 172 | end = c(150, 250, 350, 450)) 173 | genome_cluster(x1, by=c("chromosome", "start", "end"))
    #> # A tibble: 4 x 6 174 | #> id bla chromosome start end cluster_id 175 | #> <int> <fct> <fct> <dbl> <dbl> <dbl> 176 | #> 1 1 a chr1 100 150 0 177 | #> 2 2 b chr1 120 250 0 178 | #> 3 3 c chr2 300 350 2 179 | #> 4 4 d chr1 260 450 1
    genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10)
    #> # A tibble: 4 x 6 180 | #> id bla chromosome start end cluster_id 181 | #> <int> <fct> <fct> <dbl> <dbl> <dbl> 182 | #> 1 1 a chr1 100 150 0 183 | #> 2 2 b chr1 120 250 0 184 | #> 3 3 c chr2 300 350 1 185 | #> 4 4 d chr1 260 450 0
    186 |
    187 | 198 |
    199 | 200 |
    201 | 204 | 205 |
    206 |

    Site built with pkgdown 1.3.0.

    207 |
    208 |
    209 |
    210 | 211 | 212 | 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /docs/reference/genome_complement.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Calculates the complement to the intervals covered by the intervals in 10 | a data frame. It can optionally take a <code>chromosome_size</code> data frame 11 | that contains 2 or 3 columns, the first the names of chromosome and in case 12 | there are 2 columns the size or first the start index and lastly the end index 13 | on the chromosome. — genome_complement • tidygenomics 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 42 | 43 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 60 | 61 | 62 | 63 | 64 | 65 |
    66 |
    67 | 122 | 123 | 124 |
    125 | 126 |
    127 |
    128 | 137 | 138 |
    139 | 140 |

    Calculates the complement of the intervals in a data frame, i.e. the regions of each chromosome that are not covered by any interval. It can optionally take a chromosome_size data frame that contains 2 or 3 columns. The first column always holds the chromosome names. If there are 2 columns, the second column is the size of the chromosome; if there are 3 columns, the second and third columns are the start index and the end index on the chromosome.

    145 | 146 |
    147 | 148 |
    genome_complement(x, chromosome_size = NULL, by = NULL)
    149 | 150 |

    Arguments

    151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 162 | 163 | 164 | 165 | 167 | 168 |
    x

    A data frame for which the complement is calculated

    chromosome_size

    A dataframe with at least 2 columns that contains 160 | first the chromosome name and then the size of that chromosome. Can be NULL 161 | in which case the largest value per chromosome from x is used.

    by

    A character vector with 3 entries which are the chromosome, start and end column. 166 | For example: by=c("chr", "start", "end")

    169 | 170 | 171 |

    Examples

    172 |
    173 | library(dplyr) 174 | 175 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 176 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 177 | start = c(100, 200, 300, 400), 178 | end = c(150, 250, 350, 450)) 179 | 180 | genome_complement(x1, by=c("chromosome", "start", "end"))
    #> # A tibble: 4 x 3 181 | #> chromosome start end 182 | #> <fct> <int> <int> 183 | #> 1 chr1 1 99 184 | #> 2 chr1 151 199 185 | #> 3 chr1 251 399 186 | #> 4 chr2 1 299
    187 |
    188 | 197 |
    198 | 199 |
    200 | 203 | 204 |
    205 |

    Site built with pkgdown 1.3.0.

    206 |
    207 |
    208 |
    209 | 210 | 211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- /docs/reference/genome_intersect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Intersect data frames based on chromosome, start and end. — genome_intersect • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 110 | 111 | 112 |
    113 | 114 |
    115 |
    116 | 121 | 122 |
    123 | 124 |

    Intersect data frames based on chromosome, start and end.

    125 | 126 |
    127 | 128 |
    genome_intersect(x, y, by = NULL, mode = "both")
    129 | 130 |

    Arguments

    131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 145 | 146 | 147 | 148 | 149 | 150 |
    x

    A dataframe.

    y

    A dataframe.

    by

    A character vector with 3 entries which are used to match the chromosome, start and end column. 144 | For example: by=c("Chromosome"="chr", "Start"="start", "End"="end")

    mode

    One of "both", "left", "right" or "anti".

    151 | 152 |

    Value

    153 | 154 |

    The intersected dataframe of x and y with the new boundaries.

    155 | 156 | 157 |

    Examples

    158 |
    159 | library(dplyr) 160 | 161 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 162 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 163 | start = c(100, 200, 300, 400), 164 | end = c(150, 250, 350, 450)) 165 | 166 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 167 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 168 | start = c(140, 210, 400, 300), 169 | end = c(160, 240, 415, 320)) 170 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 171 | print(j)
    #> id.x bla chromosome id.y BLA start end 172 | #> 1 1 a chr1 1 A 140 150 173 | #> 2 4 d chr2 3 C 400 415
    174 | 175 | 176 |
    177 |
    178 | 189 |
    190 | 191 |
    192 | 195 | 196 |
    197 |

    Site built with pkgdown 1.3.0.

    198 |
    199 |
    200 |
    201 | 202 | 203 | 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /docs/reference/genome_join_closest.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Join intervals on chromosomes in data frames, to the closest partner — genome_join_closest • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 110 | 111 | 112 |
    113 | 114 |
    115 |
    116 | 121 | 122 |
    123 | 124 |

    Join intervals on chromosomes in data frames, to the closest partner

    125 | 126 |
    127 | 128 |
    genome_join_closest(x, y, by = NULL, mode = "inner",
    129 |   distance_column_name = NULL, max_distance = Inf, select = "all")
    130 | 
    131 | genome_inner_join_closest(x, y, by = NULL, ...)
    132 | 
    133 | genome_left_join_closest(x, y, by = NULL, ...)
    134 | 
    135 | genome_right_join_closest(x, y, by = NULL, ...)
    136 | 
    137 | genome_full_join_closest(x, y, by = NULL, ...)
    138 | 
    139 | genome_semi_join_closest(x, y, by = NULL, ...)
    140 | 
    141 | genome_anti_join_closest(x, y, by = NULL, ...)
    142 | 143 |

    Arguments

    144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 177 | 178 | 179 | 180 | 181 | 182 |
    x

    A dataframe.

    y

    A dataframe.

    by

    A character vector with 3 entries which are used to match the chromosome, start and end column. 157 | For example: by=c("Chromosome"="chr", "Start"="start", "End"="end")

    mode

    One of "inner", "full", "left", "right", "semi" or "anti".

    distance_column_name

    A string that is used as the new column name with the distance. 166 | If NULL no new column is added.

    max_distance

    The maximum distance that is allowed to join 2 entries.

    select

    A string that is passed on to IRanges::distanceToNearest. It can either be "all", which means that if multiple intervals have the same distance all of them are reported, or "arbitrary", which means that in that case one of them is chosen arbitrarily.

    ...

    Additional arguments passed on to genome_join_closest.

    183 | 184 |

    Value

    185 | 186 |

    The joined dataframe of x and y.

    187 | 188 | 189 |

    Examples

    190 |
    191 | library(dplyr) 192 | 193 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 194 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 195 | start = c(100, 200, 300, 400), 196 | end = c(150, 250, 350, 450)) 197 | 198 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 199 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 200 | start = c(140, 210, 400, 300), 201 | end = c(160, 240, 415, 320)) 202 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 203 | print(j)
    #> id.x bla chromosome id.y BLA start end 204 | #> 1 1 a chr1 1 A 140 150 205 | #> 2 4 d chr2 3 C 400 415
    206 |
    207 | 218 |
    219 | 220 |
    221 | 224 | 225 |
    226 |

    Site built with pkgdown 1.3.0.

    227 |
    228 |
    229 |
    230 | 231 | 232 | 233 | 234 | 235 | 236 | -------------------------------------------------------------------------------- /docs/reference/genome_subtract.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Subtract one data frame from another based on chromosome, start and end. — genome_subtract • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 48 | 49 | 50 | 51 | 52 | 53 |
    54 |
    55 | 110 | 111 | 112 |
    113 | 114 |
    115 |
    116 | 121 | 122 |
    123 | 124 |

    Subtract one data frame from another based on chromosome, start and end.

    125 | 126 |
    127 | 128 |
    genome_subtract(x, y, by = NULL)
    129 | 130 |

    Arguments

    131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 145 | 146 |
    x

    A dataframe.

    y

    A dataframe.

    by

    A character vector with 3 entries which are used to match the chromosome, start and end column. 144 | For example: by=c("Chromosome"="chr", "Start"="start", "End"="end")

    147 | 148 |

    Value

    149 | 150 |

    The subtracted dataframe of x and y with the new boundaries.

    151 | 152 | 153 |

    Examples

    154 |
    155 | library(dplyr) 156 | 157 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 158 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 159 | start = c(100, 200, 300, 400), 160 | end = c(150, 250, 350, 450)) 161 | 162 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 163 | chromosome = c("chr1", "chr2", "chr1", "chr1"), 164 | start = c(120, 210, 300, 400), 165 | end = c(125, 240, 320, 415)) 166 | 167 | j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 168 | print(j)
    #> id bla chromosome start end 169 | #> 1 1 a chr1 100 119 170 | #> 2 1 a chr1 126 150 171 | #> 3 2 b chr1 200 250 172 | #> 4 3 c chr2 300 350 173 | #> 5 4 d chr1 416 450
    174 | 175 |
    176 |
    177 | 188 |
    189 | 190 |
    191 | 194 | 195 |
    196 |

    Site built with pkgdown 1.3.0.

    197 |
    198 |
    199 |
    200 | 201 | 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Function reference • tidygenomics 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 45 | 46 | 47 | 48 | 49 | 50 |
    51 |
    52 | 107 | 108 | 109 |
    110 | 111 |
    112 |
    113 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 131 | 132 | 133 | 134 | 137 | 138 | 139 | 140 | 143 | 144 | 145 | 146 | 149 | 154 | 155 | 156 | 159 | 160 | 161 | 162 | 165 | 166 | 167 | 168 | 171 | 172 | 173 | 174 |
    128 |

    All functions

    129 |

    130 |
    135 |

    cluster_interval()

    136 |

    Cluster ranges which are implemented as 2 equal-length numeric vectors.

    141 |

    genome_cluster()

    142 |

    Cluster the intervals of a data frame based on chromosome, start and end.

    147 |

    genome_complement()

    148 |

    Calculates the complement of the intervals in a data frame, i.e. the regions of each chromosome that are not covered by any interval. It can optionally take a chromosome_size data frame that contains 2 or 3 columns. The first column always holds the chromosome names. If there are 2 columns, the second column is the size of the chromosome; if there are 3 columns, the second and third columns are the start index and the end index on the chromosome.

    157 |

    genome_intersect()

    158 |

    Intersect data frames based on chromosome, start and end.

    163 |

    genome_join_closest() genome_inner_join_closest() genome_left_join_closest() genome_right_join_closest() genome_full_join_closest() genome_semi_join_closest() genome_anti_join_closest()

    164 |

    Join intervals on chromosomes in data frames, to the closest partner

    169 |

    genome_subtract()

    170 |

    Subtract one data frame from another based on chromosome, start and end.

    175 |
    176 | 177 | 183 |
    184 | 185 |
    186 | 189 | 190 |
    191 |

    Site built with pkgdown 1.3.0.

    192 |
    193 |
    194 |
    195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /man/cluster_interval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{cluster_interval} 4 | \alias{cluster_interval} 5 | \title{Cluster ranges which are implemented as 2 equal-length numeric vectors.} 6 | \usage{ 7 | cluster_interval(starts, ends, max_distance = 0L) 8 | } 9 | \arguments{ 10 | \item{starts}{A numeric vector that defines the starts of each interval} 11 | 12 | \item{ends}{A numeric vector that defines the ends of each interval} 13 | 14 | \item{max_distance}{The maximum distance up to which intervals are still considered to be 15 | the same cluster. Default: 0.} 16 | } 17 | \description{ 18 | Cluster ranges which are implemented as 2 equal-length numeric vectors. 19 | } 20 | \examples{ 21 | starts <- c(50, 100, 120) 22 | ends <- c(75, 130, 150) 23 | j <- cluster_interval(starts, ends) 24 | j == c(0,1,1) 25 | } 26 | -------------------------------------------------------------------------------- /man/genome_cluster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cluster.R 3 | \name{genome_cluster} 4 | \alias{genome_cluster} 5 | \title{Intersect data frames based on chromosome, start and end.} 6 | \usage{ 7 | genome_cluster(x, by = NULL, max_distance = 0, 8 | cluster_column_name = "cluster_id") 9 | } 10 | \arguments{ 11 | \item{x}{A dataframe.} 12 | 13 | \item{by}{A character vector with 3 entries which are the chromosome, start and end column. 14 | For example: \code{by=c("chr", "start", "end")}} 15 | 16 | \item{max_distance}{The maximum distance up to which intervals are still considered to be 17 | the same cluster. 
Default: 0.} 18 | 19 | \item{cluster_column_name}{A string that is used as the new column name} 20 | } 21 | \value{ 22 | The dataframe with the additional column of the cluster 23 | } 24 | \description{ 25 | Intersect data frames based on chromosome, start and end. 26 | } 27 | \examples{ 28 | 29 | library(dplyr) 30 | 31 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 32 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 33 | start = c(100, 120, 300, 260), 34 | end = c(150, 250, 350, 450)) 35 | genome_cluster(x1, by=c("chromosome", "start", "end")) 36 | genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10) 37 | } 38 | -------------------------------------------------------------------------------- /man/genome_complement.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/complement.R 3 | \name{genome_complement} 4 | \alias{genome_complement} 5 | \title{Calculates the complement to the intervals covered by the intervals in 6 | a data frame. It can optionally take a \code{chromosome_size} data frame 7 | that contains 2 or 3 columns, the first the names of chromosome and in case 8 | there are 2 columns the size or first the start index and lastly the end index 9 | on the chromosome.} 10 | \usage{ 11 | genome_complement(x, chromosome_size = NULL, by = NULL) 12 | } 13 | \arguments{ 14 | \item{x}{A data frame for which the complement is calculated} 15 | 16 | \item{chromosome_size}{A dataframe with at least 2 columns that contains 17 | first the chromosome name and then the size of that chromosome. Can be NULL 18 | in which case the largest value per chromosome from \code{x} is used.} 19 | 20 | \item{by}{A character vector with 3 entries which are the chromosome, start and end column. 
21 | For example: \code{by=c("chr", "start", "end")}} 22 | } 23 | \description{ 24 | Calculates the complement to the intervals covered by the intervals in 25 | a data frame. It can optionally take a \code{chromosome_size} data frame 26 | that contains 2 or 3 columns, the first the names of chromosome and in case 27 | there are 2 columns the size or first the start index and lastly the end index 28 | on the chromosome. 29 | } 30 | \examples{ 31 | 32 | library(dplyr) 33 | 34 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 35 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 36 | start = c(100, 200, 300, 400), 37 | end = c(150, 250, 350, 450)) 38 | 39 | genome_complement(x1, by=c("chromosome", "start", "end")) 40 | } 41 | -------------------------------------------------------------------------------- /man/genome_intersect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/intersect.R 3 | \name{genome_intersect} 4 | \alias{genome_intersect} 5 | \title{Intersect data frames based on chromosome, start and end.} 6 | \usage{ 7 | genome_intersect(x, y, by = NULL, mode = "both") 8 | } 9 | \arguments{ 10 | \item{x}{A dataframe.} 11 | 12 | \item{y}{A dataframe.} 13 | 14 | \item{by}{A character vector with 3 entries which are used to match the chromosome, start and end column. 15 | For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")}} 16 | 17 | \item{mode}{One of "both", "left", "right" or "anti".} 18 | } 19 | \value{ 20 | The intersected dataframe of \code{x} and \code{y} with the new boundaries. 21 | } 22 | \description{ 23 | Intersect data frames based on chromosome, start and end. 
24 | } 25 | \examples{ 26 | 27 | library(dplyr) 28 | 29 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 30 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 31 | start = c(100, 200, 300, 400), 32 | end = c(150, 250, 350, 450)) 33 | 34 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 35 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 36 | start = c(140, 210, 400, 300), 37 | end = c(160, 240, 415, 320)) 38 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 39 | print(j) 40 | 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /man/genome_join_closest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/join_closest.R 3 | \name{genome_join_closest} 4 | \alias{genome_join_closest} 5 | \alias{genome_inner_join_closest} 6 | \alias{genome_left_join_closest} 7 | \alias{genome_right_join_closest} 8 | \alias{genome_full_join_closest} 9 | \alias{genome_semi_join_closest} 10 | \alias{genome_anti_join_closest} 11 | \title{Join intervals on chromosomes in data frames, to the closest partner} 12 | \usage{ 13 | genome_join_closest(x, y, by = NULL, mode = "inner", 14 | distance_column_name = NULL, max_distance = Inf, select = "all") 15 | 16 | genome_inner_join_closest(x, y, by = NULL, ...) 17 | 18 | genome_left_join_closest(x, y, by = NULL, ...) 19 | 20 | genome_right_join_closest(x, y, by = NULL, ...) 21 | 22 | genome_full_join_closest(x, y, by = NULL, ...) 23 | 24 | genome_semi_join_closest(x, y, by = NULL, ...) 25 | 26 | genome_anti_join_closest(x, y, by = NULL, ...) 27 | } 28 | \arguments{ 29 | \item{x}{A dataframe.} 30 | 31 | \item{y}{A dataframe.} 32 | 33 | \item{by}{A character vector with 3 entries which are used to match the chromosome, start and end column. 
34 | For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")}} 35 | 36 | \item{mode}{One of "inner", "full", "left", "right", "semi" or "anti".} 37 | 38 | \item{distance_column_name}{A string that is used as the new column name with the distance. 39 | If \code{NULL} no new column is added.} 40 | 41 | \item{max_distance}{The maximum distance that is allowed to join 2 entries.} 42 | 43 | \item{select}{A string that is passed on to \code{IRanges::distanceToNearest}, can either be 44 | all which means that in case that multiple intervals have the same distance all are reported, or 45 | arbitrary which means in that case one would be chosen at random.} 46 | 47 | \item{...}{Additional arguments parsed on to genome_join_closest.} 48 | } 49 | \value{ 50 | The joined dataframe of \code{x} and \code{y}. 51 | } 52 | \description{ 53 | Join intervals on chromosomes in data frames, to the closest partner 54 | } 55 | \examples{ 56 | 57 | library(dplyr) 58 | 59 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 60 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 61 | start = c(100, 200, 300, 400), 62 | end = c(150, 250, 350, 450)) 63 | 64 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 65 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 66 | start = c(140, 210, 400, 300), 67 | end = c(160, 240, 415, 320)) 68 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 69 | print(j) 70 | } 71 | -------------------------------------------------------------------------------- /man/genome_subtract.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/subtract.R 3 | \name{genome_subtract} 4 | \alias{genome_subtract} 5 | \title{Subtract one data frame from another based on chromosome, start and end.} 6 | \usage{ 7 | genome_subtract(x, y, by = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{A dataframe.} 11 | 12 | \item{y}{A dataframe.} 13 | 
14 | \item{by}{A character vector with 3 entries which are used to match the chromosome, start and end column. 15 | For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")}} 16 | } 17 | \value{ 18 | The subtracted dataframe of \code{x} and \code{y} with the new boundaries. 19 | } 20 | \description{ 21 | Subtract one data frame from another based on chromosome, start and end. 22 | } 23 | \examples{ 24 | 25 | library(dplyr) 26 | 27 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 28 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 29 | start = c(100, 200, 300, 400), 30 | end = c(150, 250, 350, 450)) 31 | 32 | x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4], 33 | chromosome = c("chr1", "chr2", "chr1", "chr1"), 34 | start = c(120, 210, 300, 400), 35 | end = c(125, 240, 320, 415)) 36 | 37 | j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 38 | print(j) 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // sort_indices 9 | IntegerVector sort_indices(NumericVector x); 10 | RcppExport SEXP _tidygenomics_sort_indices(SEXP xSEXP) { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); 15 | rcpp_result_gen = Rcpp::wrap(sort_indices(x)); 16 | return rcpp_result_gen; 17 | END_RCPP 18 | } 19 | // cluster_interval 20 | IntegerVector cluster_interval(NumericVector starts, NumericVector ends, int 
max_distance); 21 | RcppExport SEXP _tidygenomics_cluster_interval(SEXP startsSEXP, SEXP endsSEXP, SEXP max_distanceSEXP) { 22 | BEGIN_RCPP 23 | Rcpp::RObject rcpp_result_gen; 24 | Rcpp::RNGScope rcpp_rngScope_gen; 25 | Rcpp::traits::input_parameter< NumericVector >::type starts(startsSEXP); 26 | Rcpp::traits::input_parameter< NumericVector >::type ends(endsSEXP); 27 | Rcpp::traits::input_parameter< int >::type max_distance(max_distanceSEXP); 28 | rcpp_result_gen = Rcpp::wrap(cluster_interval(starts, ends, max_distance)); 29 | return rcpp_result_gen; 30 | END_RCPP 31 | } 32 | -------------------------------------------------------------------------------- /src/cluster_interval.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | using namespace Rcpp; 5 | using namespace std; 6 | 7 | 8 | // The following code was copied from the stan math library 9 | // https://github.com/stan-dev/stan/blob/e118db2b78ed33c40f7b5c774f3ce5b85aa5dfdf/src/stan/math/matrix/sort_indices.hpp 10 | 11 | /* 12 | * Copyright (c) 2011--2015, Stan Developers and their Assignees 13 | All rights reserved. 14 | 15 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 16 | 17 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 18 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 19 | * Neither the name of Columbia University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | */ 23 | 24 | template 25 | class index_comparator { 26 | const C& xs_; 27 | public: 28 | /** 29 | * Construct an index comparator holding a reference 30 | * to the specified container. 31 | * 32 | * @patam xs Container 33 | */ 34 | index_comparator(const C& xs) : xs_(xs) { } 35 | 36 | /** 37 | * Return true if the value at the first index is sorted in 38 | * front of the value at the second index; this will depend 39 | * on the template parameter ascending. 40 | * 41 | * @param i Index of first value for comparison 42 | * @param j Index of second value for comparison 43 | */ 44 | bool operator()(int i, int j) const { 45 | if (ascending) 46 | return xs_[i] < xs_[j]; 47 | else 48 | return xs_[i] > xs_[j]; 49 | } 50 | }; 51 | 52 | 53 | /** 54 | * Return an integer array of indices of the specified container 55 | * sorting the values in ascending or descending order based on 56 | * the value of the first template prameter. 
57 | * 58 | * @tparam ascending true if sort is in ascending order 59 | * @tparam C type of container 60 | * @param xs Container to sort 61 | * @return sorted version of container 62 | */ 63 | template 64 | std::vector sort_indices(const C& xs) { 65 | typename C::size_type size = xs.size(); 66 | std::vector idxs; 67 | idxs.resize(size); 68 | for (typename C::size_type i = 0; i < size; ++i) 69 | idxs[i] = i; 70 | index_comparator comparator(xs); 71 | std::sort(idxs.begin(), idxs.end(), comparator); 72 | return idxs; 73 | } 74 | 75 | 76 | // [[Rcpp::export]] 77 | IntegerVector sort_indices(NumericVector x){ 78 | return wrap(sort_indices(as >(x))); 79 | } 80 | 81 | 82 | //' Cluster ranges which are implemented as 2 equal-length numeric vectors. 83 | //' @param starts A numeric vector that defines the starts of each interval 84 | //' @param ends A numeric vector that defines the ends of each interval 85 | //' @param max_distance The maximum distance up to which intervals are still considered to be 86 | //' the same cluster. Default: 0. 
87 | //' @examples 88 | //' starts <- c(50, 100, 120) 89 | //' ends <- c(75, 130, 150) 90 | //' j <- cluster_interval(starts, ends) 91 | //' j == c(0,1,1) 92 | //' @export 93 | // [[Rcpp::export]] 94 | IntegerVector cluster_interval(NumericVector starts, NumericVector ends, int max_distance=0) { 95 | 96 | // Require that starts and ends are the same length 97 | 98 | // The implementation is inspired by the bedtools implementation: 99 | // https://github.com/arq5x/bedtools2/blob/14fbbb8aed5c6a04685da2cee3f11b98d70304a7/src/clusterBed/clusterBed.cpp 100 | IntegerVector result(starts.size()); 101 | int cluster_id = -1; 102 | int prev_end = std::numeric_limits::min(); 103 | vector indices = sort_indices(as >(starts)); 104 | for (int j = 0; j < indices.size(); j++) { 105 | int i = indices[j]; 106 | Rcpp::checkUserInterrupt(); 107 | if(starts[i] - prev_end > max_distance){ 108 | cluster_id++; 109 | prev_end = ends[i]; 110 | }else{ 111 | if(ends[i] > prev_end){ 112 | prev_end = ends[i]; 113 | } 114 | } 115 | result[i] = cluster_id; 116 | } 117 | 118 | return result; 119 | } 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /src/tidygenomics_init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP _tidygenomics_cluster_interval(SEXP, SEXP, SEXP); 12 | extern SEXP _tidygenomics_sort_indices(SEXP); 13 | 14 | static const R_CallMethodDef CallEntries[] = { 15 | {"_tidygenomics_cluster_interval", (DL_FUNC) &_tidygenomics_cluster_interval, 3}, 16 | {"_tidygenomics_sort_indices", (DL_FUNC) &_tidygenomics_sort_indices, 1}, 17 | {NULL, NULL, 0} 18 | }; 19 | 20 | void R_init_tidygenomics(DllInfo *dll) 21 | { 22 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 23 | R_useDynamicSymbols(dll, FALSE); 24 | } 25 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tidygenomics) 3 | 4 | test_check("tidygenomics") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_cluster.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_cluster") 3 | 4 | library(dplyr) 5 | 6 | x1 <- tibble(id = 1:4, bla=letters[1:4], 7 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 8 | start = c(100, 120, 300, 260), 9 | end = c(150, 250, 350, 450)) 10 | 11 | 12 | test_that("genome_clustering assings that correct clusters", { 13 | j <- genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=5) 14 | 15 | print(j) 16 | 17 | expect_equal(j$cluster_id, c(0,0,2,1)) 18 | }) 19 | 20 | 21 | test_that("cluster_interval works", { 22 | starts <- c(50, 100, 120) 23 | ends <- c(75, 130, 150) 24 | j <- cluster_interval(starts, ends) 25 | expect_equal(j, c(0,1,1)) 26 | expect_equal(cluster_interval(starts, ends, max_distance = 24), c(0,1,1)) 27 | expect_equal(cluster_interval(starts, ends, max_distance = 25), c(0,0,0)) 28 | 29 | starts <- c(50, 100, 120, 180, 350) 30 | ends <- c(75, 200, 150, 210, 400) 31 | expect_equal(cluster_interval(starts, ends), c(0,1,1,1,2)) 32 | 33 | 
starts <- c(500, 300, 150) 34 | ends <- c(510, 310, 160) 35 | expect_equal(cluster_interval(starts, ends), c(2,1,0)) 36 | 37 | expect_equal(cluster_interval(numeric(0), numeric(0)), numeric(0)) 38 | }) 39 | -------------------------------------------------------------------------------- /tests/testthat/test_complement.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_complement") 3 | 4 | library(dplyr) 5 | 6 | x1 <- tibble(id = 1:4, bla=letters[1:4], 7 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 8 | start = c(100, 200, 300, 400), 9 | end = c(150, 250, 350, 450)) 10 | 11 | test_that("Calculating the complement of a sequence works", { 12 | j <- genome_complement(x1, by=c("chromosome", "start", "end")) 13 | print(j) 14 | expect_equal(j$chromosome, c("chr1", "chr1", "chr1", "chr2")) 15 | expect_equal(j$start, c(1,151, 251,1)) 16 | expect_equal(j$end, c(99,199, 399, 299)) 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test_intersect.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_intersect") 3 | 4 | suppressPackageStartupMessages(library(dplyr)) 5 | 6 | x1 <- tibble(id = 1:4, bla=letters[1:4], 7 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 8 | start = c(100, 200, 300, 400), 9 | end = c(150, 250, 350, 450)) 10 | 11 | x2 <- tibble(id = 1:4, BLA=LETTERS[1:4], 12 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 13 | start = c(140, 210, 400, 300), 14 | end = c(160, 240, 415, 320)) 15 | 16 | test_that("Intersection (both) of 2 data frames works as expected", { 17 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 18 | # print(j) 19 | expect_equal(colnames(j), c("id.x", "bla", "chromosome", "id.y", "BLA", "start", "end")) 20 | expect_equal(j$start, c(140, 400)) 21 | expect_equal(j$end, c(150, 415)) 22 | }) 23 | 24 | test_that("Intersection of 2 data frames 
works for multi-overlap ranges", { 25 | x2 <- tibble(id = 1, BLA=LETTERS[1], 26 | chromosome = c("chr1"), 27 | start = c(140), 28 | end = c(220)) 29 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 30 | # print(j) 31 | expect_equal(colnames(j), c("id.x", "bla", "chromosome", "id.y", "BLA", "start", "end")) 32 | expect_equal(j$start, c(140, 200)) 33 | expect_equal(j$end, c(150, 220)) 34 | expect_equal(j$id.x, c(1,2)) 35 | expect_equal(j$id.y, c(1,1)) 36 | 37 | }) 38 | 39 | 40 | 41 | test_that("Intersection of 2 data frames works for multi-overlap ranges the other way around", { 42 | x1 <- tibble(id = 1, bla=letters[1], 43 | chromosome = c("chr1"), 44 | start = c(100), 45 | end = c(420)) 46 | j <- genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 47 | # print(j) 48 | expect_equal(colnames(j), c("id.x", "bla", "chromosome", "id.y", "BLA", "start", "end")) 49 | expect_equal(j$start, c(140, 300)) 50 | expect_equal(j$end, c(160, 320)) 51 | expect_equal(j$id.x, c(1,1)) 52 | expect_equal(j$id.y, c(1,4)) 53 | 54 | }) 55 | 56 | 57 | test_that("Intersect and findOverlap always match", { 58 | r1 <- IRanges::IRanges(start=c(1,3,24), end=c(1,130,24)) 59 | r2 <- IRanges::IRanges(start=c(1,20,100), end=c(10,30,110)) 60 | o <- as.data.frame(IRanges::findOverlaps(r1, r2)) 61 | intersection <- IRanges::pintersect(r1[o$queryHits], r2[o$subjectHits]) 62 | expect_equal(length(o$queryHits), length(intersection)) 63 | expect_true(all(IRanges::poverlaps(intersection, r1[o$queryHits]))) 64 | }) 65 | 66 | 67 | -------------------------------------------------------------------------------- /tests/testthat/test_issue.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_issue") 3 | 4 | 5 | suppressPackageStartupMessages(library(dplyr)) 6 | 7 | 8 | test_that("Latest issue", { 9 | 10 | }) 11 | -------------------------------------------------------------------------------- 
/tests/testthat/test_join_closest.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_join_closest") 3 | 4 | library(dplyr) 5 | 6 | x1 <- tibble(id = 1:4, bla=letters[1:4], 7 | chromosome = c("chr1", "chr1", "chr2", "chr3"), 8 | start = c(100, 200, 300, 400), 9 | end = c(150, 250, 350, 450)) 10 | 11 | x2 <- tibble(id = 1:4, BLA=LETTERS[1:4], 12 | chromosome = c("chr1", "chr1", "chr1", "chr2"), 13 | start = c(220, 210, 300, 400), 14 | end = c(225, 240, 320, 415)) 15 | 16 | test_that("Joining with closest works as expected", { 17 | j <- genome_join_closest(x1, x2, by=c("chromosome", "start", "end"), distance_column_name="distance", mode="left") 18 | print(j) 19 | expect_equal(colnames(j), c("id.x", "bla", "chromosome.x", "start.x", "end.x", 20 | "id.y", "BLA", "chromosome.y", "start.y", "end.y", "distance")) 21 | expect_equal(j$start.y, c(210, 220, 210, 400, NA)) 22 | expect_equal(j$distance, c(59, 0, 0, 49, NA)) 23 | }) 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/testthat/test_subtract.R: -------------------------------------------------------------------------------- 1 | 2 | context("genome_subtract") 3 | 4 | 5 | suppressPackageStartupMessages(library(dplyr)) 6 | 7 | x1 <- tibble(id = 1:4, bla=letters[1:4], 8 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 9 | start = c(100, 200, 300, 400), 10 | end = c(150, 250, 350, 450)) 11 | 12 | x2 <- tibble(id = 1:4, BLA=LETTERS[1:4], 13 | chromosome = c("chr1", "chr2", "chr1", "chr1"), 14 | start = c(120, 210, 300, 400), 15 | end = c(125, 240, 320, 415)) 16 | 17 | test_that("Subtraction of 2 data frames works as expected", { 18 | j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 19 | # print(j) 20 | expect_equal(colnames(j), c("id", "bla", "chromosome", "start", "end")) 21 | expect_equal(j$start, c(100, 126, 200, 300, 416)) 22 | expect_equal(j$end, c(119, 150, 250, 350, 450)) 23 | }) 24 | 25 | 
26 | 27 | test_that("Edge cases of subtraction of 2 data frames works as expected", { 28 | x1 <- tibble(id = 1:2, bla=letters[1:2], 29 | chromosome = c("chr1", "chr1"), 30 | start = c(100, 200), 31 | end = c(150, 250)) 32 | 33 | x2 <- tibble(id = 1:4, BLA=LETTERS[1:4], 34 | chromosome = c("chr1", "chr1", "chr1", "chr1"), 35 | start = c(120, 110, 190, 400), 36 | end = c(125, 122, 320, 415)) 37 | 38 | j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 39 | print(j) 40 | expect_equal(colnames(j), c("id", "bla", "chromosome", "start", "end")) 41 | expect_equal(j$start, c(100, 126)) 42 | expect_equal(j$end, c(109, 150)) 43 | }) 44 | 45 | 46 | test_that("during subtraction the intervals are not unified", { 47 | x1 <- tibble(id = 1:3, bla=letters[1:3], 48 | chromosome = c("chr1", "chr1", "chr1"), 49 | start = c(100, 115, 200), 50 | end = c(150, 160, 250)) 51 | 52 | x2 <- tibble(id = 1, BLA=LETTERS[1], 53 | chromosome = c("chr1"), 54 | start = c(110), 55 | end = c(130)) 56 | 57 | j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 58 | print(j) 59 | expect_equal(colnames(j), c("id", "bla", "chromosome", "start", "end")) 60 | expect_equal(j$start, c(100, 131, 131, 200)) 61 | expect_equal(j$end, c(109, 150, 160, 250)) 62 | }) 63 | -------------------------------------------------------------------------------- /tidygenomics.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- 
/vignettes/intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidy Genomics" 3 | author: "Constantin Ahlmann-Eltze" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | fig_caption: yes 8 | vignette: > 9 | %\VignetteIndexEntry{Tidy Genomics} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | The most dramatic impact on programming in R in recent years was the development of the [tidyverse](http://tidyverse.org/) by Hadley Wickham et al. 15 | which, combined with the ingenious `%>%` from magrittr, provides a uniform philosophy for handling data. 16 | 17 | The genomics community has an alternative set of approaches, for which [bioconductor](http://bioconductor.org/) and the 18 | [GenomicRanges](http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html) package provide the basis. The `GenomicRanges` and 19 | the underlying `IRanges` package provide a great set of methods for dealing with intervals as they are typically encountered in genomics. 20 | 21 | Unfortunately it is not always easy to combine those two worlds: many common operations in `GenomicRanges` focus solely on the 22 | ranges and lose the additional metadata columns. On the other hand the `tidyverse` does not provide a unified set of methods 23 | to do common set operations with intervals. 24 | 25 | At least until recently, when the [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin) package was extended with the `genome_join` 26 | method for combining genomic data stored in a `data.frame`. It demonstrated that genomic data could appropriately be handled 27 | with the _tidy_-philosophy. 28 | 29 | The `tidygenomics` package extends the limited set of methods provided by the `fuzzyjoin` package for dealing with genomic 30 | data.
Its API is inspired by the very popular [bedtools](http://bedtools.readthedocs.io/en/latest/index.html): 31 | 32 | 33 | - `genome_intersect` 34 | - `genome_subtract` 35 | - `genome_join_closest` 36 | - `genome_cluster` 37 | - `genome_complement` 38 | - `genome_join` _Provided by the fuzzyjoin package_ 39 | 40 | ```{r, message=FALSE, warning=FALSE, echo=FALSE} 41 | library(dplyr) 42 | library(tidygenomics) 43 | ``` 44 | 45 | 46 | ## genome_intersect 47 | 48 | Joins 2 data frames based on their genomic overlap. Unlike the `genome_join` function it updates the boundaries to reflect 49 | the overlap of the regions. 50 | 51 | genome_intersect 52 | 53 | 54 | ```{r} 55 | x1 <- data.frame(id = 1:4, 56 | chromosome = c("chr1", "chr1", "chr2", "chr2"), 57 | start = c(100, 200, 300, 400), 58 | end = c(150, 250, 350, 450)) 59 | 60 | x2 <- data.frame(id = 1:4, 61 | chromosome = c("chr1", "chr2", "chr2", "chr1"), 62 | start = c(140, 210, 400, 300), 63 | end = c(160, 240, 415, 320)) 64 | 65 | genome_intersect(x1, x2, by=c("chromosome", "start", "end"), mode="both") 66 | ``` 67 | 68 | 69 | ## genome_subtract 70 | 71 | Subtracts one data frame from the other. This can be used to split the x data frame into smaller areas. 72 | 73 | genome_subtract 74 | 75 | ```{r} 76 | x1 <- data.frame(id = 1:4, 77 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 78 | start = c(100, 200, 300, 400), 79 | end = c(150, 250, 350, 450)) 80 | 81 | x2 <- data.frame(id = 1:4, 82 | chromosome = c("chr1", "chr2", "chr1", "chr1"), 83 | start = c(120, 210, 300, 400), 84 | end = c(125, 240, 320, 415)) 85 | 86 | genome_subtract(x1, x2, by=c("chromosome", "start", "end")) 87 | ``` 88 | 89 | 90 | 91 | 92 | ## genome_join_closest 93 | 94 | Joins 2 data frames based on their genomic location. If no exact overlap is found the next closest interval is used. 
95 | 96 | genome_join_closest 97 | 98 | ```{r} 99 | x1 <- tibble(id = 1:4, 100 | chr = c("chr1", "chr1", "chr2", "chr3"), 101 | start = c(100, 200, 300, 400), 102 | end = c(150, 250, 350, 450)) 103 | 104 | x2 <- tibble(id = 1:4, 105 | chr = c("chr1", "chr1", "chr1", "chr2"), 106 | start = c(220, 210, 300, 400), 107 | end = c(225, 240, 320, 415)) 108 | genome_join_closest(x1, x2, by=c("chr", "start", "end"), distance_column_name="distance", mode="left") 109 | ``` 110 | 111 | 112 | ## genome_cluster 113 | 114 | Adds a new column with the cluster ID. Two intervals are assigned to the same cluster if they overlap or lie within `max_distance` of each other. 115 | 116 | genome_cluster 117 | 118 | ```{r} 119 | x1 <- data.frame(id = 1:4, bla=letters[1:4], 120 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 121 | start = c(100, 120, 300, 260), 122 | end = c(150, 250, 350, 450)) 123 | genome_cluster(x1, by=c("chromosome", "start", "end")) 124 | genome_cluster(x1, by=c("chromosome", "start", "end"), max_distance=10) 125 | ``` 126 | 127 | ## genome_complement 128 | 129 | Calculates the complement of a genomic region. 130 | 131 | genome_complement 132 | 133 | ```{r} 134 | x1 <- data.frame(id = 1:4, 135 | chromosome = c("chr1", "chr1", "chr2", "chr1"), 136 | start = c(100, 200, 300, 400), 137 | end = c(150, 250, 350, 450)) 138 | 139 | genome_complement(x1, by=c("chromosome", "start", "end")) 140 | ``` 141 | 142 | 143 | 144 | ## genome_join 145 | 146 | Classical join function based on the overlap of the intervals. Implemented and maintained in the 147 | [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin) package and documented here only for completeness.
148 | 149 | genome_join 150 | 151 | ```{r} 152 | x1 <- tibble(id = 1:4, 153 | chr = c("chr1", "chr1", "chr2", "chr3"), 154 | start = c(100, 200, 300, 400), 155 | end = c(150, 250, 350, 450)) 156 | 157 | x2 <- tibble(id = 1:4, 158 | chr = c("chr1", "chr1", "chr1", "chr2"), 159 | start = c(220, 210, 300, 400), 160 | end = c(225, 240, 320, 415)) 161 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="inner") 162 | 163 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="left") 164 | 165 | fuzzyjoin::genome_join(x1, x2, by=c("chr", "start", "end"), mode="anti") 166 | ``` 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /vignettes/resources/genome_cluster_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_cluster_docu.png -------------------------------------------------------------------------------- /vignettes/resources/genome_complement_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_complement_docu.png -------------------------------------------------------------------------------- /vignettes/resources/genome_intersect_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_intersect_docu.png -------------------------------------------------------------------------------- /vignettes/resources/genome_join_closest_docu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_join_closest_docu.png -------------------------------------------------------------------------------- /vignettes/resources/genome_join_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_join_docu.png -------------------------------------------------------------------------------- /vignettes/resources/genome_subtract_docu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/const-ae/tidygenomics/23737e99f7ff9893f485e2b6b48c1d15c13a5623/vignettes/resources/genome_subtract_docu.png --------------------------------------------------------------------------------