├── .gitignore ├── IGDr ├── DESCRIPTION ├── NAMESPACE ├── R │ ├── .Rhistory │ ├── IGDr.R │ └── create.R ├── man │ ├── IGDr.Rd │ ├── createIGD.Rd │ ├── createIGD_f.Rd │ ├── get_binData.Rd │ ├── get_binLen.Rd │ ├── get_binSize.Rd │ ├── get_ctgId.Rd │ ├── get_nCtgs.Rd │ ├── get_nFiles.Rd │ ├── search_1r.Rd │ ├── search_nr.Rd │ └── search_qfile.Rd └── src │ ├── IGDr.so │ ├── RcppExports.o │ ├── igd_base.c │ ├── igd_base.h │ ├── igd_base.o │ ├── igd_create.c │ ├── igd_create.h │ ├── igd_create.o │ ├── igd_search.c │ ├── igd_search.h │ ├── igd_search.o │ ├── khash.h │ └── kseq.h ├── IGDr_0408 ├── DESCRIPTION ├── NAMESPACE ├── R │ ├── .Rhistory │ ├── IGDr.R │ └── create.R ├── man │ ├── IGDr.Rd │ ├── createIGD.Rd │ ├── createIGD_f.Rd │ ├── get_binData.Rd │ ├── get_binLen.Rd │ ├── get_binSize.Rd │ ├── get_ctgId.Rd │ ├── get_nCtgs.Rd │ ├── get_nFiles.Rd │ ├── search_1r.Rd │ ├── search_nr.Rd │ └── search_qfile.Rd └── src │ ├── IGDr.so │ ├── RcppExports.o │ ├── igd_base.c │ ├── igd_base.h │ ├── igd_base.o │ ├── igd_create.c │ ├── igd_create.h │ ├── igd_create.o │ ├── igd_search.c │ ├── igd_search.h │ ├── igd_search.o │ ├── khash.h │ └── kseq.h ├── LICENSE.txt ├── Makefile ├── README.md ├── src ├── igd.c ├── igd_base.c ├── igd_base.h ├── igd_create.c ├── igd_create.h ├── igd_search.c ├── igd_search.h ├── khash.h └── kseq.h ├── src_py ├── igd_base.c ├── igd_base.h ├── igd_create.c ├── igd_create.h ├── igd_py.c ├── igd_py.pyx ├── igd_search.c ├── igd_search.h ├── igd_test.py ├── khash.h ├── kseq.h └── setup.py └── vignettes ├── .Rhistory └── using_igd.md /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | bin 3 | obj -------------------------------------------------------------------------------- /IGDr/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: IGDr 2 | Type: Package 3 | Title: R wrapper of IGD 4 | Version: 0.1.0 5 | Author: Jianglin (John) Feng 6 | 
Maintainer: John Feng 7 | Description: Provides an R wrapper for high performance search engine IGD--integrated genomic database. 8 | Create igd database from a collection of .bed files and then search it. 9 | NeedsCompilation: yes 10 | License: GPL (>=2) 11 | Encoding: UTF-8 12 | LazyData: true 13 | Imports: methods 14 | RoxygenNote: 7.0.2 15 | -------------------------------------------------------------------------------- /IGDr/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | useDynLib(IGDr) 3 | exportClasses(IGDr) 4 | export(IGDr) 5 | -------------------------------------------------------------------------------- /IGDr/R/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr/R/.Rhistory -------------------------------------------------------------------------------- /IGDr/R/IGDr.R: -------------------------------------------------------------------------------- 1 | # S4 class of iGD 2 | 3 | #' IGDr Construction 4 | setClass("IGDr", 5 | representation(ref="externalptr") 6 | ) 7 | 8 | #' Function to open/load an igd database for search 9 | #' 10 | #' @param igd_file the path to the igd database file 11 | #' @return an IGDr object 12 | #' @export 13 | #' @examples 14 | #' \dontrun{library(IGDr) 15 | #' igd_file <- "testigd/roadmap_b14.igd" 16 | #' igdr <- IGDr(igd_file)} 17 | IGDr <- function(igd_file) 18 | { 19 | #check if igd file exist 20 | if(!file.exists(igd_file)) 21 | stop("File '", igd_file, "' is not found. ") 22 | tsv_file = paste(substr(igd_file, 1, nchar(igd_file)-4),"_index.tsv", sep="") 23 | if(!file.exists(tsv_file)) 24 | stop("IGD tsv file '", tsv_file, "' not found. 
") 25 | 26 | ans <- .Call("iGD_new", igd_file, PACKAGE = "IGDr") 27 | } 28 | 29 | #' Function to search the igd database for a single query 30 | #' 31 | #' @param igdr an igd database object (loaded) 32 | #' @param chrm the chromosome name of the query in format of chr1, chrX, chrY, chrM, ... 33 | #' @param qs the start location of the query 34 | #' @param qe the end location of the query 35 | #' @return hits: number of intersections to each database source file 36 | #' @export 37 | #' @examples 38 | #' \dontrun{ 39 | #' hits <- search_1r(igdr, "chr1", 1000000, 1100000)} 40 | search_1r <- function(igdr, chrm, qs, qe) 41 | { 42 | hits <- .Call("search_1r", igdr@ref, as.character(chrm), as.integer(qs), as.integer(qe), PACKAGE = "IGDr") 43 | } 44 | 45 | #' Function to search the igd database for multiple queries 46 | #' 47 | #' @param igdr an igd database object (loaded) 48 | #' @param n number of queries to be searched 49 | #' @param chrm vector of chromosome names 50 | #' @param qs vector of the start locations of the queries 51 | #' @param qe vector of the end locations of the queries 52 | #' @return hits: number of intersections to each database source file 53 | #' @export 54 | search_nr <- function(igdr, n, chrm, qs, qe) 55 | { 56 | hits <- .Call("search_nr", igdr@ref, as.integer(n), as.character(chrm), as.integer(qs), as.integer(qe), PACKAGE = "IGDr") 57 | } 58 | 59 | #' Function to search the igd database for a query set from a file 60 | #' 61 | #' @param igdr an igd database object (loaded) 62 | #' @param qfile path to the query file (file type of .bed or .bed.gz) 63 | #' @return hits: number of intersections to each database source file 64 | #' @export 65 | #' @examples 66 | #' \dontrun{ 67 | #' hits <- search_qfile(igdr, "data/r10000.bed")} 68 | search_qfile <- function(igdr, qfile) { #int32 for counts 69 | if(!file.exists(qfile)) 70 | stop("File '", qfile, "' is not found. 
") 71 | qinfo <- read.csv(file=qfile, head=FALSE, sep="\t") #BED columns used: 1 = chrom, 2 = start, 3 = end 72 | nfiles <- length(qinfo[,1]) 73 | hits <- .Call("search_nr", igdr@ref, as.integer(nfiles), as.character(qinfo[,1]), as.integer(qinfo[,2]), as.integer(qinfo[,3]), PACKAGE = "IGDr") 74 | } 75 | 76 | #' Function to get the contig id of a chromosome name 77 | #' 78 | #' @param igdr an igd database object (loaded) 79 | #' @param chrm chromosome name ("chr1", "chrX", ...) 80 | #' @return ichr: 0-based contig index (-1 if the contig is not in the database) 81 | #' @export 82 | #' @examples 83 | #' \dontrun{ 84 | #' ichr <- get_ctgId(igdr, "chrX") 85 | #' } 86 | get_ctgId <- function(igdr, chrm) 87 | { 88 | ichr <- .Call("get_cid", igdr@ref, as.character(chrm), PACKAGE = "IGDr") 89 | } 90 | 91 | #' Function to get the number of contigs in an igd database 92 | #' 93 | #' @param igdr an igd database object (loaded) 94 | #' @return nCtgs: number of contigs 95 | #' @export 96 | #' @examples 97 | #' \dontrun{ 98 | #' nCtgs <- get_nCtgs(igdr)} 99 | get_nCtgs <- function(igdr) 100 | { 101 | nCtgs <- .Call("get_nCtgs", igdr@ref, PACKAGE = "IGDr") 102 | } 103 | 104 | #' Function to get the number of source files in an igd database 105 | #' 106 | #' @param igdr an igd database object (loaded) 107 | #' @return nFiles: number of source files 108 | #' @export 109 | #' @examples 110 | #' \dontrun{ 111 | #' nFiles <- get_nFiles(igdr)} 112 | get_nFiles <- function(igdr) 113 | { 114 | nFiles <- .Call("get_nfiles", igdr@ref, PACKAGE = "IGDr") 115 | } 116 | 117 | #' Function to get the bin size of an igd database 118 | #' 119 | #' @param igdr an igd database object (loaded) 120 | #' @return binSize 121 | #' @export 122 | #' @examples 123 | #' \dontrun{ 124 | #' binSize <- get_binSize(igdr) 125 | #' } 126 | get_binSize <- function(igdr) 127 | { 128 | binSize <- .Call("get_nbp", igdr@ref, PACKAGE = "IGDr") 129 | } 130 | 131 | #' Function to get the number of regions in a given bin and a given contig 132 | #' 133 | #' @param 
igdr an igd database object (loaded) 134 | #' @param ichr contig id (1-based; the C entry point subtracts 1) 135 | #' @param binID the bin number (1-based; the C entry point subtracts 1) 136 | #' @return binLen (NULL if ichr or binID is out of range) 137 | #' @export 138 | #' @examples 139 | #' \dontrun{ 140 | #' binLen <- get_binLen(igdr, 10, 123)} 141 | get_binLen <- function(igdr, ichr, binID) 142 | { 143 | binLen <- .Call("get_binLen", igdr@ref, as.integer(ichr), as.integer(binID), PACKAGE = "IGDr") 144 | } 145 | 146 | #' Function to get region data in the given bin of a given contig 147 | #' 148 | #' @param igdr an igd database object (loaded) 149 | #' @param ichr contig id 150 | #' @param binID the bin number 151 | #' @return binData: vector of regions start, end, source_id 152 | #' @export 153 | #' @examples 154 | #' \dontrun{ 155 | #' binData <- get_binData(igdr, 10, 123)} 156 | get_binData <- function(igdr, ichr, binID) 157 | { 158 | binData <- .Call("get_binData", igdr@ref, as.integer(ichr), as.integer(binID), PACKAGE = "IGDr") 159 | } 160 | -------------------------------------------------------------------------------- /IGDr/R/create.R: -------------------------------------------------------------------------------- 1 | #==========================iGD in R======================================================== 2 | #----------------- Copyright (C) 2019 Jianglin Feng -------------------------------------- 3 | # 4 | # This file is a part of the package IGDr. This function creates an igd 5 | # database from a folder of .bed files 6 | # 7 | # The IGDr package is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 2 of the License, or 10 | # any later version. 11 | # 12 | # The IGDr package is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty 14 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 
16 | #----------------------------------------------------------------------------------------- 17 | 18 | #' Function to create an IGD database from a folder of .bed or .bed.gz files, 19 | #' or a list of such folders 20 | #' 21 | #' @param iPath folder where your input files are stored 22 | #' @param oPath the folder that the created IGD database will be stored 23 | #' @param igdName the name you give to the IGD database (.igd will be added to it) 24 | #' @param binsize the size in basepairs for the bin (block) used in the database: 25 | #' usually 8192, 16384, 32768, ... as a power of 2; default 16384 26 | #' @return an igd database will be created in the specified folder 27 | #' @export 28 | #' @examples 29 | #' \dontrun{ 30 | #' library("IGDr") 31 | #' iPath <- system.file("extdata", "rme3", package = "IGDr") 32 | #' IGDr::createIGD(iPath, "testigd", "roadmap_b14") 33 | #' } 34 | createIGD <- function(iPath, oPath, igdName, binsize=16384) { 35 | .C("create_iGD", as.character(iPath), as.character(oPath), as.character(igdName), as.integer(binsize), PACKAGE = "IGDr") 36 | } 37 | 38 | #' Function to create an IGD database from a list of source files (.bed or .bed.gz) 39 | #' 40 | #' @param iPath path to a txt file that lists the paths of all the source files 41 | #' @param oPath the folder that the created IGD database will be stored 42 | #' @param igdName the name you give to the IGD database (.igd will be added to it) 43 | #' @param binsize the size in basepairs for the bin (block) used in the database: 44 | #' usually 8192, 16384, 32768, ... 
as a power of 2 45 | #' @return an igd database will be created in the specified folder 46 | #' @export 47 | createIGD_f <- function(iPath, oPath, igdName, binsize=16384) { 48 | .C("create_iGD_f", as.character(iPath), as.character(oPath), as.character(igdName), as.integer(binsize), PACKAGE = "IGDr") 49 | } 50 | -------------------------------------------------------------------------------- /IGDr/man/IGDr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{IGDr} 4 | \alias{IGDr} 5 | \title{Function to open/load an igd database for search} 6 | \usage{ 7 | IGDr(igd_file) 8 | } 9 | \arguments{ 10 | \item{igd_file}{the path to the igd database file} 11 | } 12 | \value{ 13 | an IGDr object 14 | } 15 | \description{ 16 | Function to open/load an igd database for search 17 | } 18 | \examples{ 19 | \dontrun{library(IGDr) 20 | igd_file <- "testigd/roadmap_b14.igd" 21 | igdr <- IGDr(igd_file)} 22 | } 23 | -------------------------------------------------------------------------------- /IGDr/man/createIGD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create.R 3 | \name{createIGD} 4 | \alias{createIGD} 5 | \title{Function to create an IGD database from a folder of .bed or .bed.gz files, 6 | or a list of such folders} 7 | \usage{ 8 | createIGD(iPath, oPath, igdName, binsize = 16384) 9 | } 10 | \arguments{ 11 | \item{iPath}{folder where your input files are stored} 12 | 13 | \item{oPath}{the folder that the created IGD database will be stored} 14 | 15 | \item{igdName}{the name you give to the IGD database (.igd will be added to it)} 16 | 17 | \item{binsize}{the size in basepairs for the bin (block) used in the database: 18 | usually 8192, 16384, 32768, ... 
as a power of 2; default 16384} 19 | } 20 | \value{ 21 | an igd database will be created in the specified folder 22 | } 23 | \description{ 24 | Function to create an IGD database from a folder of .bed or .bed.gz files, 25 | or a list of such folders 26 | } 27 | \examples{ 28 | \dontrun{ 29 | library("IGDr") 30 | iPath <- system.file("extdata", "rme3", package = "IGDr") 31 | IGDr::createIGD(iPath, "testigd", "roadmap_b14") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /IGDr/man/createIGD_f.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create.R 3 | \name{createIGD_f} 4 | \alias{createIGD_f} 5 | \title{Function to create an IGD database from a list of source files (.bed or .bed.gz)} 6 | \usage{ 7 | createIGD_f(iPath, oPath, igdName, binsize = 16384) 8 | } 9 | \arguments{ 10 | \item{iPath}{path to a txt file that lists the paths of all the source files} 11 | 12 | \item{oPath}{the folder that the created IGD database will be stored} 13 | 14 | \item{igdName}{the name you give to the IGD database (.igd will be added to it)} 15 | 16 | \item{binsize}{the size in basepairs for the bin (block) used in the database: 17 | usually 8192, 16384, 32768, ... 
as a power of 2} 18 | } 19 | \value{ 20 | an igd database will be created in the specified folder 21 | } 22 | \description{ 23 | Function to create an IGD database from a list of source files (.bed or .bed.gz) 24 | } 25 | -------------------------------------------------------------------------------- /IGDr/man/get_binData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binData} 4 | \alias{get_binData} 5 | \title{Function to get region data in the given bin of a given contig} 6 | \usage{ 7 | get_binData(igdr, ichr, binID) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{ichr}{contig id} 13 | 14 | \item{binID}{the bin number} 15 | } 16 | \value{ 17 | binData: vector of regions start, end, source_id 18 | } 19 | \description{ 20 | Function to get region data in the given bin of a given contig 21 | } 22 | \examples{ 23 | \dontrun{ 24 | binData <- get_binData(igdr, 10, 123)} 25 | } 26 | -------------------------------------------------------------------------------- /IGDr/man/get_binLen.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binLen} 4 | \alias{get_binLen} 5 | \title{Function to get the number of regions in a given bin and a given contig} 6 | \usage{ 7 | get_binLen(igdr, ichr, binID) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{ichr}{contig id} 13 | 14 | \item{binID}{the bin number} 15 | } 16 | \value{ 17 | binLen 18 | } 19 | \description{ 20 | Function to get the number of regions in a given bin and a given contig 21 | } 22 | \examples{ 23 | \dontrun{ 24 | binLen <- get_binLen(igdr, 10, 123)} 25 | } 26 | -------------------------------------------------------------------------------- 
/IGDr/man/get_binSize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binSize} 4 | \alias{get_binSize} 5 | \title{Function to get the bin size of an igd database} 6 | \usage{ 7 | get_binSize(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | binSize 14 | } 15 | \description{ 16 | Function to get the bin size of an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | binSize <- get_binSize(igdr) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /IGDr/man/get_ctgId.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_ctgId} 4 | \alias{get_ctgId} 5 | \title{Function to get the contig id of a chromosome name} 6 | \usage{ 7 | get_ctgId(igdr, chrm) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{chrm}{chromosome name ("chr1", "chrX", ...)} 13 | } 14 | \value{ 15 | ichr (0 if not exist) 16 | } 17 | \description{ 18 | Function to get the contig id of a chromosome name 19 | } 20 | \examples{ 21 | \dontrun{ 22 | ichr <- get_ctgId(igdr, "chrX") 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /IGDr/man/get_nCtgs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_nCtgs} 4 | \alias{get_nCtgs} 5 | \title{Function to get the number of contigs in an igd database} 6 | \usage{ 7 | get_nCtgs(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | nCtgs: number of contigs 14 | } 15 | \description{ 16 | Function to get the number 
of contigs in an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | nCtgs <- get_nCtgs(igdr)} 21 | } 22 | -------------------------------------------------------------------------------- /IGDr/man/get_nFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_nFiles} 4 | \alias{get_nFiles} 5 | \title{Function to get the number of source files in an igd database} 6 | \usage{ 7 | get_nFiles(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | nFiles: number of source files 14 | } 15 | \description{ 16 | Function to get the number of source files in an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | nFiles <- get_nFiles(igdr)} 21 | } 22 | -------------------------------------------------------------------------------- /IGDr/man/search_1r.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_1r} 4 | \alias{search_1r} 5 | \title{Function to search the igd database for a single query} 6 | \usage{ 7 | search_1r(igdr, chrm, qs, qe) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{chrm}{the chromosome name of the query in format of chr1, chrX, chrY, chrM, ...} 13 | 14 | \item{qs}{the start location of the query} 15 | 16 | \item{qe}{the end location of the query} 17 | } 18 | \value{ 19 | hits: number of intersections to each database source file 20 | } 21 | \description{ 22 | Function to search the igd database for a single query 23 | } 24 | \examples{ 25 | \dontrun{ 26 | hits <- search_1r(igdr, "chr1", 1000000, 1100000)} 27 | } 28 | -------------------------------------------------------------------------------- /IGDr/man/search_nr.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_nr} 4 | \alias{search_nr} 5 | \title{Function to search the igd database for multiple queries} 6 | \usage{ 7 | search_nr(igdr, n, chrm, qs, qe) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{n}{number of queries to be searched} 13 | 14 | \item{chrm}{vector of chromosome names} 15 | 16 | \item{qs}{vector of the start locations of the queries} 17 | 18 | \item{qe}{vector of the end locations of the queries} 19 | } 20 | \value{ 21 | hits: number of intersections to each database source file 22 | } 23 | \description{ 24 | Function to search the igd database for multiple queries 25 | } 26 | -------------------------------------------------------------------------------- /IGDr/man/search_qfile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_qfile} 4 | \alias{search_qfile} 5 | \title{Function to search the igd database for a query set from a file} 6 | \usage{ 7 | search_qfile(igdr, qfile) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{qfile}{path to the query file (file type of .bed or .bed.gz)} 13 | } 14 | \value{ 15 | hits: number of intersections to each database source file 16 | } 17 | \description{ 18 | Function to search the igd database for a query set from a file 19 | } 20 | \examples{ 21 | \dontrun{ 22 | hits <- search_qfile(igdr, "data/r10000.bed")} 23 | } 24 | -------------------------------------------------------------------------------- /IGDr/src/IGDr.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr/src/IGDr.so 
-------------------------------------------------------------------------------- /IGDr/src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr/src/RcppExports.o -------------------------------------------------------------------------------- /IGDr/src/igd_base.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Common igd struct, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_base.h" 7 | 8 | #define gdata_t_key(r) ((r).start) 9 | KRADIX_SORT_INIT(intv, gdata_t, gdata_t_key, 4) 10 | KHASH_MAP_INIT_STR(str, int32_t) 11 | typedef khash_t(str) strhash_t; 12 | 13 | void str_splits( char* str, int *nmax, char **splits) 14 | { //tsv 15 | splits[*nmax] = NULL; 16 | splits[0] = str; 17 | char *ch = str; 18 | int ns = 1; 19 | do { 20 | if (*ch == '\t'){ 21 | splits[ns++] = &ch[1]; 22 | *ch = '\0'; 23 | } 24 | ch++; 25 | } while (*ch != '\0' && ns < *nmax+1); 26 | *nmax = ns; 27 | } 28 | 29 | char *parse_bed(char *s, int32_t *st_, int32_t *en_) 30 | { 31 | char *p, *q, *ctg = 0; 32 | int32_t i, st = -1, en = -1; 33 | for (i = 0, p = q = s;; ++q) { 34 | if (*q == '\t' || *q == '\0') { 35 | int c = *q; 36 | *q = 0; 37 | if (i == 0) ctg = p; 38 | else if (i == 1) st = atol(p); 39 | else if (i == 2) en = atol(p); 40 | ++i, p = q + 1; 41 | if (c == '\0') break; 42 | } 43 | } 44 | *st_ = st, *en_ = en; 45 | return i >= 3? 
ctg : 0; 46 | } 47 | 48 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe) 49 | { //find tE: index of the last item satisfying .start < qe from right 50 | //assuming gdata sorted by start 51 | int32_t tL=t0, tR=tc, tM, tE = -1; 52 | if(gdata[tR].start < qe) 53 | return tR; 54 | else if(gdata[tL].start >= qe) 55 | return -1; 56 | while(tL= qe) 59 | tR = tM-1; 60 | else 61 | tL = tM; 62 | } 63 | if(gdata[tR].start < qe) 64 | tE = tR; 65 | else if(gdata[tL].start < qe) 66 | tE = tL; 67 | return tE; 68 | } 69 | 70 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx) 71 | { //layers: igd->ctg->gTile->gdata(list) 72 | if(s >= e)return; 73 | int absent; 74 | khint_t k; 75 | strhash_t *h = (strhash_t*)igd->hc; 76 | k = kh_put(str, h, chrm, &absent); 77 | int32_t n1 = s/igd->nbp; 78 | int32_t n2 = (e-1)/igd->nbp; 79 | if (absent) { 80 | //printf("%s %i %i %i\n", chrm, n1, n2, k); 81 | //igd 82 | if (igd->nctg == igd->mctg) 83 | EXPAND(igd->ctg, igd->mctg); 84 | kh_val(h, k) = igd->nctg; 85 | //ctg: initialize 86 | ctg_t *p = &igd->ctg[igd->nctg++]; 87 | p->name = strdup(chrm); 88 | p->mTiles= 1 + n2; 89 | p->gTile = malloc(p->mTiles*sizeof(tile_t)); 90 | kh_key(h, k) = p->name; 91 | //tile: initialize 92 | for(int i=0;imTiles;i++){ 93 | tile_t *tile = &p->gTile[i]; 94 | tile->ncnts = 0; //each batch 95 | tile->nCnts = 0; //total 96 | tile->mcnts = 4; 97 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 98 | } 99 | } 100 | int32_t kk = kh_val(h, k); 101 | ctg_t *p = &igd->ctg[kk]; 102 | if (n2+1>=p->mTiles){ 103 | int32_t tt = p->mTiles; 104 | p->mTiles = n2+1; 105 | p->gTile = realloc(p->gTile, p->mTiles*sizeof(tile_t)); 106 | //initialize new tiles 107 | for(int i=tt;imTiles;i++){ 108 | tile_t *tile = &p->gTile[i]; 109 | tile->ncnts = 0; //each batch 110 | tile->nCnts = 0; //total 111 | tile->mcnts = 16; 112 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 113 | } 114 | } 115 | //add data elements 116 | 
for(int i=n1;i<=n2;i++){ 117 | tile_t *tile = &p->gTile[i]; 118 | if(tile->ncnts == tile->mcnts) 119 | EXPAND(tile->gList, tile->mcnts); 120 | gdata_t *gdata = &tile->gList[tile->ncnts++]; 121 | gdata->start = s; 122 | gdata->end = e; 123 | gdata->value = v; 124 | gdata->idx = idx; 125 | igd->total++; 126 | } 127 | return; 128 | } 129 | 130 | info_t* get_fileinfo(char *ifName, int32_t *nFiles) 131 | { //read head file __index.tsv to get info 132 | FILE *fp = fopen(ifName, "r"); 133 | if(fp==NULL){ 134 | printf("file not found:%s\n", ifName); 135 | return NULL; 136 | } 137 | char buf[1024], *s0, *s1, *s2, *s3; 138 | int nfiles=0; 139 | char* rtn = fgets(buf, 1024, fp); 140 | while(fgets(buf, 1024, fp)!=NULL) 141 | nfiles++; 142 | 143 | info_t *fi = (info_t*)malloc(nfiles*sizeof(info_t)); 144 | fseek(fp, 0, SEEK_SET); 145 | int i=0; 146 | rtn = fgets(buf, 1024, fp); //header 147 | while(fgets(buf, 1024, fp)!=NULL){ 148 | s0 = strtok(buf, "\t"); 149 | s1 = strtok(NULL, "\t"); 150 | fi[i].fileName = strdup(s1); 151 | s2 = strtok(NULL, "\t"); 152 | fi[i].nr = atol(s2); 153 | //s3 = strtok(NULL, "\t"); 154 | //fi[i].md = (double)atol(s3); 155 | i++; 156 | } 157 | *nFiles = (int32_t)nfiles; 158 | fclose(fp); 159 | return fi; 160 | } 161 | 162 | iGD_t* open_iGD(char *igdFile) 163 | { 164 | iGD_t* iGD = iGD_init(); 165 | char tmp[256]; 166 | strcpy(tmp, igdFile); 167 | tmp[strrchr(tmp, '.')-tmp] = '\0'; 168 | strcpy(iGD->fname, tmp); 169 | char *idFile = tmp; //str_split(tmp, '.', &nCols)[0]; 170 | strcat(idFile, "_index.tsv"); 171 | iGD->finfo = get_fileinfo(idFile, &iGD->nFiles); 172 | FILE *fp = fopen(igdFile, "rb"); 173 | if(fp == NULL) 174 | printf("Can't open file %s", igdFile); 175 | long rtn = fread(&iGD->nbp, sizeof(int32_t), 1, fp); 176 | rtn = fread(&iGD->gType, sizeof(int32_t), 1, fp); 177 | rtn = fread(&iGD->nCtg, sizeof(int32_t), 1, fp); 178 | int i, k; 179 | int32_t gdsize; 180 | gdsize = sizeof(gdata_t); 181 | int32_t tileS, m = iGD->nCtg; //the idx of a 
tile in the chrom 182 | //------------------------------------------ 183 | iGD->nTile = malloc(m*sizeof(int32_t)); 184 | rtn = fread(iGD->nTile, sizeof(int32_t)*m, 1, fp); 185 | int64_t chr_loc = 12 + 44*m; //header size in bytes 186 | for(i=0;inTile[i]*4; 187 | //------------------------------------------ 188 | iGD->nCnt = malloc(m*sizeof(int32_t*)); 189 | iGD->tIdx = malloc(m*sizeof(int64_t*)); 190 | for(i=0;inTile[i]; 192 | iGD->nCnt[i] = calloc(k, sizeof(int32_t)); 193 | rtn = fread(iGD->nCnt[i], sizeof(int32_t)*k, 1, fp); 194 | //-------------------------------------- 195 | iGD->tIdx[i] = calloc(k, sizeof(int64_t)); 196 | iGD->tIdx[i][0] = chr_loc; 197 | for(int j=1; jtIdx[i][j] = iGD->tIdx[i][j-1]+iGD->nCnt[i][j-1]*gdsize; 199 | chr_loc = iGD->tIdx[i][k-1]+iGD->nCnt[i][k-1]*gdsize; 200 | } 201 | 202 | iGD->cName = malloc(m*sizeof(char*)); 203 | for(i=0;icName[i] = malloc(40*sizeof(char)); 205 | rtn = fread(iGD->cName[i], 40, 1, fp); 206 | } 207 | iGD->fP = fp; 208 | 209 | //setup hc 210 | iGD->hc = kh_init(str); 211 | int absent; 212 | for(i=0;inCtg;i++){ 213 | khint_t k; 214 | strhash_t *h = (strhash_t*)iGD->hc; 215 | k = kh_put(str, h, iGD->cName[i], &absent); 216 | kh_val(h, k) = i; 217 | kh_key(h, k) = iGD->cName[i]; 218 | } 219 | iGD->gData = malloc(1*sizeof(gdata_t)); 220 | iGD->preIdx = -1; 221 | iGD->preChr = -1; 222 | return iGD; 223 | } 224 | 225 | int32_t get_id(iGD_t *iGD, const char *chrm) 226 | { //for search 227 | khint_t k; 228 | strhash_t *h = (strhash_t*)iGD->hc; 229 | k = kh_get(str, h, chrm); 230 | return k == kh_end(h)? 
-1 : kh_val(h, k); 231 | } 232 | 233 | int32_t get_nFiles(iGD_t *iGD) 234 | { 235 | return iGD->nFiles; 236 | } 237 | 238 | void igd_saveT(igd_t *igd, char *oPath) 239 | { //Save/append tiles to disc, add cnts tp Cnts 240 | char idFile[256]; 241 | for (int i = 0; i < igd->nctg; i++){ 242 | ctg_t *ctg = &igd->ctg[i]; 243 | for(int j=0; j< ctg->mTiles; j++){ 244 | tile_t *tile = &ctg->gTile[j]; 245 | //--------------------------------------- 246 | if(tile->ncnts>0){ 247 | sprintf(idFile, "%s%s%s_%i", oPath, "data0/", ctg->name, j); 248 | FILE *fp = fopen(idFile, "ab"); 249 | if(fp==NULL) 250 | printf("Can't open file %s", idFile); 251 | fwrite(tile->gList, sizeof(gdata_t), tile->ncnts, fp); 252 | fclose(fp); 253 | } 254 | tile->nCnts += tile->ncnts; 255 | tile->ncnts = 0; 256 | free(tile->gList); 257 | tile->mcnts = 16;//MAX(16, tile->mcnts/16); 258 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 259 | //tile->gList = realloc(tile->gList, tile->mcnts*sizeof(gdata_t));? 260 | } 261 | } 262 | igd->total = 0; //batch total 263 | } 264 | 265 | void igd_save(igd_t *igd, char *oPath, char *igdName) 266 | { 267 | char idFile[256], iname[256]; 268 | //1. Save iGD data info: ctg string length 40 269 | int32_t i, j, n, m = igd->nctg; 270 | sprintf(idFile, "%s%s%s", oPath, igdName, ".igd"); 271 | FILE *fp = fopen(idFile, "wb"); 272 | if(fp==NULL) 273 | printf("Can't open file %s", idFile); 274 | fwrite(&igd->nbp, sizeof(int32_t), 1, fp); //4 bytes 275 | fwrite(&igd->gType, sizeof(int32_t), 1, fp); //4 276 | fwrite(&m, sizeof(int32_t), 1, fp); //4 277 | //----------------- 278 | for(i=0;ictg[i].mTiles, sizeof(int32_t), 1, fp); 280 | for(i=0;ictg[i]; 282 | n = p->mTiles; 283 | for(j=0;jgTile[j].nCnts, sizeof(int32_t), 1, fp); 285 | } 286 | //write string array 287 | for(i=0;ictg[i].name, 40, 1, fp); 289 | 290 | //2. 
Sort and save tiles data 291 | for(i=0;ictg[i]; 293 | n = p->mTiles; 294 | for(j=0;jgTile[j]; 296 | int32_t nrec = q->nCnts, gdsize; 297 | if(nrec>0){ 298 | sprintf(iname, "%s%s%s_%i", oPath, "data0/", p->name, j); 299 | FILE *fp0 = fopen(iname, "rb"); 300 | if(fp0 == NULL) 301 | printf("Can't open file %s", iname); 302 | gdsize = nrec*sizeof(gdata_t); 303 | gdata_t *gdata = malloc(gdsize); 304 | long rtn = fread(gdata, gdsize, 1, fp0); 305 | fclose(fp0); 306 | radix_sort_intv(gdata, gdata+nrec); 307 | fwrite(gdata, gdsize, 1, fp); 308 | free(gdata); 309 | remove(iname); 310 | } 311 | } 312 | } 313 | fclose(fp); 314 | } 315 | 316 | igd_t *igd_init(int tile_size) 317 | { 318 | igd_t *igd = malloc(1*sizeof(igd_t)); 319 | igd->gType = 1; 320 | igd->nbp = tile_size; 321 | igd->hc = kh_init(str); 322 | igd->nctg = 0; 323 | igd->mctg = 32; 324 | igd->ctg = malloc(igd->mctg*sizeof(ctg_t)); 325 | igd->total = 0; 326 | return igd; 327 | } 328 | 329 | void igd_destroy(igd_t *igd) 330 | { 331 | if (igd == 0) return; 332 | for (int i = 0; i < igd->nctg; ++i){ 333 | free(igd->ctg[i].name); 334 | for(int j=0; j< igd->ctg[i].mTiles; j++) 335 | free(igd->ctg[i].gTile[j].gList); 336 | } 337 | free(igd->ctg); 338 | kh_destroy(str, (strhash_t*)igd->hc); 339 | free(igd); 340 | } 341 | 342 | iGD_t *iGD_init() 343 | { 344 | iGD_t *iGD = (iGD_t *) malloc(1*sizeof(iGD_t)); 345 | iGD->nbp = 16384; 346 | iGD->gType = 1; 347 | iGD->nCtg = 24; 348 | return iGD; 349 | } 350 | 351 | void close_iGD(iGD_t *iGD) 352 | { 353 | if(iGD==0) return; 354 | fclose(iGD->fP); 355 | free(iGD->gData); 356 | free(iGD->nTile); 357 | kh_destroy(str, (strhash_t*)iGD->hc); 358 | for(int i=0;inCtg;i++){ 359 | free(iGD->nCnt[i]); 360 | free(iGD->tIdx[i]); 361 | } 362 | free(iGD->nCnt); 363 | free(iGD->tIdx); 364 | free(iGD->cName); 365 | free(iGD->finfo); 366 | free(iGD); 367 | } 368 | 369 | //--------------------------------------------------------------------------------- 370 | //.Call entry point 371 | 
//--------------------------------------------------------------------------------- 372 | SEXP iGD_free(SEXP igdr) 373 | { 374 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 375 | if(iGD==NULL) 376 | error("iGD_free: iGDr external pointer is NULL"); 377 | close_iGD(iGD); 378 | R_SetExternalPtrAddr(igdr, NULL); 379 | return(R_NilValue); 380 | } 381 | 382 | SEXP iGD_new(SEXP igd_file) 383 | { //new a class that contains an externalPtr (iGD_t structure) 384 | const char *igdFile = CHAR(STRING_ELT(igd_file, 0)); 385 | iGD_t *iGD = open_iGD(igdFile); 386 | SEXP igdr, klass, obj; 387 | PROTECT(igdr = R_MakeExternalPtr(iGD, R_NilValue, R_NilValue)); 388 | R_RegisterCFinalizer(igdr, (R_CFinalizer_t)iGD_free); 389 | klass = PROTECT(MAKE_CLASS("IGDr")); 390 | PROTECT(obj = NEW_OBJECT(klass)); 391 | SET_SLOT(obj, Rf_install("ref"), igdr); 392 | UNPROTECT(3); 393 | return(obj); 394 | } 395 | 396 | SEXP get_cid(SEXP igdr, SEXP chrom) 397 | { //chrom id 398 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 399 | if(iGD==NULL) 400 | error("iGD_free: iGDr external pointer is NULL"); 401 | const char *chrm = CHAR(STRING_ELT(chrom, 0)); 402 | int32_t tid = get_id(iGD, chrm); 403 | SEXP cid; 404 | PROTECT(cid = allocVector(INTSXP, 1)); 405 | INTEGER(cid)[0] = tid; 406 | UNPROTECT(1); 407 | return(cid); 408 | } 409 | 410 | SEXP get_nbp(SEXP igdr) 411 | { //chrom id 412 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 413 | if(iGD==NULL) 414 | error("iGD_free: iGDr external pointer is NULL"); 415 | SEXP t_nbp; 416 | PROTECT(t_nbp = allocVector(INTSXP, 1)); 417 | INTEGER(t_nbp)[0] = iGD->nbp; 418 | UNPROTECT(1); 419 | return(t_nbp); 420 | } 421 | 422 | SEXP get_nfiles(SEXP igdr) 423 | { 424 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 425 | if(iGD==NULL) 426 | error("iGD_free: iGDr external pointer is NULL"); 427 | SEXP nfile; 428 | PROTECT(nfile = allocVector(INTSXP, 1)); 429 | INTEGER(nfile)[0] = iGD->nFiles; 430 | UNPROTECT(1); 431 | return(nfile); 432 | } 433 | 434 | SEXP 
get_nCtgs(SEXP igdr) 435 | { //chrom id 436 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 437 | if(iGD==NULL) 438 | error("iGD_free: iGDr external pointer is NULL"); 439 | SEXP n_ctgs; 440 | PROTECT(n_ctgs = allocVector(INTSXP, 1)); 441 | INTEGER(n_ctgs)[0] = iGD->nCtg; 442 | UNPROTECT(1); 443 | return(n_ctgs); 444 | } 445 | 446 | SEXP get_binLen(SEXP igdr, SEXP ichr, SEXP bin) 447 | { //not really necessary 448 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 449 | if(iGD==NULL) 450 | error("iGD_free: iGDr external pointer is NULL"); 451 | SEXP binLen; 452 | int ichr0 = INTEGER(ichr)[0]-1; 453 | int j = INTEGER(bin)[0]-1; 454 | if(ichr0 >= iGD->nCtg || ichr0<0 || j<0 || j>=iGD->nTile[ichr0]) 455 | return(R_NilValue); 456 | PROTECT(binLen = allocVector(INTSXP, 1)); 457 | INTEGER(binLen)[0] = iGD->nCnt[ichr0][j]; 458 | UNPROTECT(1); 459 | return(binLen); 460 | } 461 | 462 | -------------------------------------------------------------------------------- /IGDr/src/igd_base.h: -------------------------------------------------------------------------------- 1 | //================================================================================= 2 | //Common structs, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //re-designed 7/1/2019 5 | //database intervals sorted by _start: 8/12/2019 6 | //--------------------------------------------------------------------------------- 7 | #ifndef __IGD_BASE_H__ 8 | #define __IGD_BASE_H__ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "khash.h" 28 | #include "kseq.h" 29 | 30 | #define PROGRAM_NAME "igd" 31 | #define MAJOR_VERSION "0" 32 | #define MINOR_VERSION "1" 33 | #define REVISION_VERSION "1" 34 | #define BUILD_VERSION "0" 35 | #define VERSION MAJOR_VERSION "." MINOR_VERSION "." 
REVISION_VERSION 36 | #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) 37 | #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) 38 | #define maxCount 268435456 //16* = 4GB memory 39 | //--------------------------------------------------------------------------------- 40 | typedef struct{ //default 41 | int32_t idx; //genomic object--data set index 42 | int32_t start; //region start 43 | int32_t end; //region end 44 | int32_t value; 45 | } gdata_t; 46 | 47 | typedef struct{ 48 | char* fileName; //dataset file 49 | int32_t nr; //number regions/dataset 50 | double md; //average width of the regions 51 | } info_t; 52 | 53 | typedef struct{ 54 | int32_t ncnts, nCnts, mcnts; //batch counts, total, max 55 | gdata_t *gList; 56 | } tile_t; 57 | 58 | typedef struct{ 59 | char *name; //name of the contig 60 | int32_t mTiles; //determined by the interval start and end 61 | tile_t *gTile; //tile data 62 | } ctg_t; 63 | 64 | typedef struct{ 65 | int32_t nbp, gType, nctg, mctg; // number of base pairs, data type: 0, 1, 2 etc; size differs 66 | int64_t total; //total region in each ctg 67 | ctg_t *ctg; //list of contigs (of size _n_ctg_) 68 | void *hc; //dict for converting contig names to int 69 | } igd_t; //For creation: internal... 70 | 71 | typedef struct{ //For search: external... 
72 | int32_t nbp, gType, nCtg, nFiles; 73 | int32_t preIdx, preChr; 74 | char fname[64]; 75 | char **cName; //name of ctgs 76 | int32_t *nTile; //num of tiles in each ctg 77 | int32_t **nCnt; //num of counts in each tile 78 | int64_t **tIdx; //tile index *sizeof -> location in .igd file 79 | gdata_t *gData; 80 | info_t *finfo; 81 | FILE *fP; 82 | void *hc; 83 | } iGD_t; 84 | 85 | //--------------------------------------------------------------------------------- 86 | //Parse a line of BED file 87 | void str_splits( char* str, int *nmax, char **splits); 88 | char *parse_bed(char *s, int32_t *st_, int32_t *en_); 89 | 90 | //Binary search 91 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe); 92 | 93 | //Add an interval 94 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx); 95 | 96 | //Get id from igd dict 97 | int32_t get_id(iGD_t *iGD, const char *chrm); 98 | 99 | //Get nFiles from iGD 100 | int32_t get_nFiles(iGD_t *iGD); 101 | 102 | //Get file info from .tsv 103 | info_t *get_fileinfo(char *ifName, int32_t *nFiles); 104 | 105 | //Get igd info from .igd 106 | iGD_t *open_iGD(char *igdFile); 107 | 108 | //Initialize igd_t 109 | igd_t *igd_init(int tile_size); 110 | 111 | //Initialize iGD_t 112 | iGD_t *iGD_init(); 113 | 114 | //Save tile data 115 | void igd_saveT(igd_t *igd, char *oPath); 116 | 117 | //Sort and save igd 118 | void igd_save(igd_t *igd, char *oPath, char *igdName); 119 | 120 | //Free igd data 121 | void igd_destroy(igd_t *igd); 122 | 123 | //Free iGD data 124 | void close_iGD(iGD_t *iGD); 125 | 126 | //--------------------------------------------------------------------------------- 127 | //The following section taken from Dr Heng Li's cgranges 128 | // (https://github.com/lh3/cgranges) 129 | 130 | KSTREAM_INIT(gzFile, gzread, 0x10000) 131 | /************** 132 | * Radix sort * 133 | **************/ 134 | #define RS_MIN_SIZE 64 135 | #define RS_MAX_BITS 8 136 | 137 | #define 
KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ 138 | typedef struct { \ 139 | rstype_t *b, *e; \ 140 | } rsbucket_##name##_t; \ 141 | void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ 142 | { \ 143 | rstype_t *i; \ 144 | for (i = beg + 1; i < end; ++i) \ 145 | if (rskey(*i) < rskey(*(i - 1))) { \ 146 | rstype_t *j, tmp = *i; \ 147 | for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ 148 | *j = *(j - 1); \ 149 | *j = tmp; \ 150 | } \ 151 | } \ 152 | void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ 153 | { \ 154 | rstype_t *i; \ 155 | int size = 1<b = k->e = beg; \ 159 | for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ 160 | for (k = b + 1; k != be; ++k) \ 161 | k->e += (k-1)->e - beg, k->b = (k-1)->e; \ 162 | for (k = b; k != be;) { \ 163 | if (k->b != k->e) { \ 164 | rsbucket_##name##_t *l; \ 165 | if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ 166 | rstype_t tmp = *k->b, swap; \ 167 | do { \ 168 | swap = tmp; tmp = *l->b; *l->b++ = swap; \ 169 | l = b + (rskey(tmp)>>s&m); \ 170 | } while (l != k); \ 171 | *k->b++ = tmp; \ 172 | } else ++k->b; \ 173 | } else ++k; \ 174 | } \ 175 | for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ 176 | if (s) { \ 177 | s = s > n_bits? 
s - n_bits : 0; \ 178 | for (k = b; k != be; ++k) \ 179 | if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ 180 | else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ 181 | } \ 182 | } \ 183 | void radix_sort_##name(rstype_t *beg, rstype_t *end) \ 184 | { \ 185 | if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ 186 | else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \ 187 | } 188 | 189 | /********************* 190 | * Convenient macros * 191 | *********************/ 192 | 193 | #ifndef kroundup32 194 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 195 | #endif 196 | 197 | #define CALLOC(type, len) ((type*)calloc((len), sizeof(type))) 198 | #define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) 199 | 200 | #define EXPAND(a, m) do { \ 201 | (m) = (m)? (m) + ((m)>>1) : 16; \ 202 | REALLOC((a), (m)); \ 203 | }while (0) 204 | 205 | #endif 206 | 207 | -------------------------------------------------------------------------------- /IGDr/src/igd_base.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr/src/igd_base.o -------------------------------------------------------------------------------- /IGDr/src/igd_create.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Read igd region data and query data, and then find all overlaps 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_create.h" 7 | 8 | int create_help(int exit_code) 9 | { 10 | printf( 11 | "%s, v%s\n" 12 | "usage: %s create [options] \n" 13 | " -b \n" 14 | " 
-c < .BED column as value >=4 (default 4) \n", 15 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 16 | return exit_code; 17 | } 18 | 19 | void create_iGD(char **i_path, char **o_path, char **igd_name, int *tile_size) 20 | { //.C call using pointers to pass arguments!!! 21 | char iPath[256]; 22 | char oPath[256]; 23 | char igdName[64]; 24 | strcpy(iPath, *i_path); 25 | strcpy(oPath, *o_path); 26 | strcpy(igdName, *igd_name); 27 | int rtn, binSize = *tile_size; 28 | 29 | //Check if the subfolders exist: 30 | char ftmp[255]; 31 | struct stat st = {0}; 32 | if(oPath[strlen(oPath)-1]!='/'){ 33 | strcat(oPath, "/"); 34 | } 35 | sprintf(ftmp, "%s%s%s", oPath, igdName, ".igd"); 36 | if(stat(ftmp, &st) == 0){ 37 | printf("The igd database file %s exists!\n", ftmp); 38 | return; 39 | } 40 | else{ 41 | if (stat(oPath, &st) == -1){ 42 | mkdir(oPath, 0777); 43 | } 44 | sprintf(ftmp, "%s%s", oPath, "data0"); 45 | if (stat(ftmp, &st) == -1) 46 | mkdir(ftmp, 0777); 47 | } 48 | 49 | //check if iPath a file or directory 50 | stat(iPath, &st); 51 | glob_t gResult; 52 | if(S_ISREG(st.st_mode)){ 53 | FILE *fp = fopen(iPath, "r"); 54 | if(fgets(ftmp, 255, fp)){ 55 | if(ftmp[strlen(iPath)-1]=='/'){ 56 | strcat(ftmp, "*"); 57 | } 58 | else if(ftmp[strlen(ftmp)-1]!='*'){ 59 | strcat(ftmp, "/*"); 60 | } 61 | rtn = glob(ftmp, 0, NULL, &gResult); 62 | } 63 | while(fgets(ftmp, 255, fp)){ 64 | if(ftmp[strlen(iPath)-1]=='/'){ 65 | strcat(ftmp, "*"); 66 | } 67 | else if(ftmp[strlen(ftmp)-1]!='*'){ 68 | strcat(ftmp, "/*"); 69 | } 70 | rtn = glob(ftmp, GLOB_APPEND, NULL, &gResult); 71 | } 72 | fclose(fp); 73 | } 74 | else{ //S_ISDIR(..) 75 | if(iPath[strlen(iPath)-1]=='/'){ 76 | strcat(iPath, "*"); 77 | } 78 | else if(iPath[strlen(iPath)-1]!='*'){ 79 | strcat(iPath, "/*"); 80 | } 81 | rtn = glob(iPath, 0, NULL, &gResult); 82 | if(rtn!=0){ 83 | printf("wrong dir path: %s", iPath); 84 | return; 85 | } 86 | } 87 | 88 | //0. 
Initialize igd 89 | igd_t *igd = igd_init(binSize); 90 | //printf("igd_create 0\n"); 91 | 92 | //1. Get the files 93 | char** file_ids = gResult.gl_pathv; 94 | int32_t n_files = gResult.gl_pathc; 95 | if(n_files<1) 96 | printf("Too few files (add to path /*): %i\n", n_files); 97 | 98 | int32_t *nr = calloc(n_files, sizeof(int32_t)); 99 | double *avg = calloc(n_files, sizeof(double)); 100 | printf("igd_create 1: %i\n", n_files); 101 | 102 | //2. Read files 103 | int nCols=32; 104 | unsigned char buffer[256]; 105 | int32_t i, j, k, ig, i0=0, i1=0, L0=0, L1=1, m, nL; //int64_t? 106 | char **splits = malloc((nCols+1)*sizeof(char *)); 107 | while(i00 defines breaks when reading maxCount 114 | //printf("%i, %i, %i, %s\n", i0, ig, nL, file_ids[ig]); 115 | gzFile fp; 116 | if ((fp = gzopen(file_ids[ig], "r")) == 0) 117 | return; 118 | nL = 0; 119 | if(ig==i0 && L0>0){ //pass L0 lines of a big file 120 | while(nL4) va = atol(splits[4]); 127 | igd_add(igd, splits[0], st, en, va, ig); 128 | nr[ig]++; 129 | avg[ig]+=en-st; 130 | nL++; 131 | if(igd->total>maxCount){ 132 | m = 1; 133 | i1 = ig; 134 | L1 = nL; //number of total lines or next line 135 | } 136 | } 137 | gzclose(fp); 138 | if(m==0) ig++; 139 | } 140 | //2.3 Save/append tiles to disc, add cnts tp Cnts 141 | 142 | igd_saveT(igd, oPath); 143 | i0 = ig; 144 | L0 = L1; 145 | L1 = 0; 146 | } 147 | //printf("igd_create 2\n"); 148 | 149 | //3. 
save _index.tsv: 4 columns--index, filename, nr, avg 150 | //Also has a header line: 151 | char idFile[128]; 152 | char *tchr; 153 | sprintf(idFile, "%s%s%s", oPath, igdName, "_index.tsv"); 154 | FILE *fpi = fopen(idFile, "w"); 155 | if(fpi==NULL) 156 | printf("Can't open file %s", idFile); 157 | fprintf(fpi, "Index\tFile\tNumber of regions\tAvg size\n"); 158 | for(i=0; i0 defines breaks when reading maxCount 258 | //printf("%i, %i, %i, %s\n", i0, ig, nL, file_ids[ig]); 259 | gzFile fp; 260 | if ((fp = gzopen(file_ids[ig], "r")) == 0) 261 | return; 262 | nL = 0; 263 | if(ig==i0 && L0>0){ //pass L0 lines of a big file 264 | while(nL4) va = atol(splits[4]); 271 | igd_add(igd, splits[0], st, en, va, ig); 272 | nr[ig]++; 273 | avg[ig]+=en-st; 274 | nL++; 275 | if(igd->total>maxCount){ 276 | m = 1; 277 | i1 = ig; 278 | L1 = nL; //number of total lines or next line 279 | } 280 | } 281 | gzclose(fp); 282 | if(m==0) ig++; 283 | } 284 | //2.3 Save/append tiles to disc, add cnts tp Cnts 285 | 286 | igd_saveT(igd, oPath); 287 | i0 = ig; 288 | L0 = L1; 289 | L1 = 0; 290 | } 291 | //printf("igd_create 2\n"); 292 | 293 | //3. save _index.tsv: 4 columns--index, filename, nr, avg 294 | //Also has a header line: 295 | char idFile[128]; 296 | char *tchr; 297 | sprintf(idFile, "%s%s%s", oPath, igdName, "_index.tsv"); 298 | FILE *fpi = fopen(idFile, "w"); 299 | if(fpi==NULL) 300 | printf("Can't open file %s", idFile); 301 | fprintf(fpi, "Index\tFile\tNumber of regions\tAvg size\n"); 302 | for(i=0; i [options]\n" 15 | " options:\n" 16 | " -q \n" 17 | " -r \n" 18 | " -v \n" 19 | " -o \n" 20 | " -c display all intersects\n", 21 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 22 | return exit_code; 23 | } 24 | 25 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits) 26 | { 27 | int ichr = get_id(iGD, chrm); 28 | if(ichr<0) 29 | return; 30 | int i, j, n1 = qs/iGD->nbp, n2 = (qe-1)/iGD->nbp; //define boundary! 
31 | int32_t tE, tS, tL, tR, tM, tmpi, tmpi1, mlen, mTile = iGD->nTile[ichr]-1; 32 | if(n1>mTile) 33 | return; 34 | n2 = MIN(n2, mTile); 35 | tmpi = iGD->nCnt[ichr][n1]; 36 | tmpi1 = tmpi-1; 37 | long rtn; 38 | if(tmpi>0){ 39 | if(n1!=iGD->preIdx || ichr!=iGD->preChr){ 40 | fseek(iGD->fP, iGD->tIdx[ichr][n1], SEEK_SET); 41 | free(iGD->gData); 42 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 43 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 44 | iGD->preIdx = n1; 45 | iGD->preChr = ichr; 46 | } 47 | if(qe>iGD->gData[0].start){ //sorted by start 48 | //find the 1st rs < qe 49 | tL = 0, tR=tmpi1; 50 | while(tLgData[tM].start < qe) //right side: 53 | tL = tM; 54 | else 55 | tR = tM; //left side 56 | } 57 | if(iGD->gData[tR].start=0; i--){ 60 | if(iGD->gData[i].end>qs){ 61 | hits[iGD->gData[i].idx]++; 62 | } 63 | } 64 | } 65 | if(n2>n1){ //n2>n1 66 | int32_t bd = iGD->nbp*(n1+1); //only keep the first 67 | for(j=n1+1; j<=n2; j++){ //n2 inclusive!!! 68 | tmpi = iGD->nCnt[ichr][j]; 69 | tmpi1 = tmpi-1; 70 | if(tmpi>0){ 71 | if(j!=iGD->preIdx || ichr!=iGD->preChr){ 72 | fseek(iGD->fP, iGD->tIdx[ichr][j], SEEK_SET); 73 | free(iGD->gData); 74 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 75 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 76 | iGD->preIdx = j; 77 | iGD->preChr = ichr; 78 | } 79 | if(qe>iGD->gData[0].start){ 80 | tS = 0; 81 | while(tSgData[tS].startgData[tM].start < qe) //right side: 86 | tL = tM; 87 | else 88 | tR = tM; //left side 89 | } 90 | if(iGD->gData[tR].start=tS; i--){ 93 | if(iGD->gData[i].end>qs){ 94 | hits[iGD->gData[i].idx]++; 95 | } 96 | } 97 | } 98 | } 99 | bd+=iGD->nbp; 100 | } 101 | } 102 | } 103 | } 104 | 105 | void get_overlaps32(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int32_t *hits) 106 | { 107 | //printf("%i\t%i\t%s\n", qs, qe, chrm); 108 | //printf("%i\t%i\t%i\t%i\n", hits[0], hits[1], hits[2], hits[3]); 109 | int ichr = get_id(iGD, chrm); 110 | if(ichr<0) 111 | return; 112 | int i, j, n1 = qs/iGD->nbp, n2 
= (qe-1)/iGD->nbp; //define boundary! 113 | int32_t tE, tS, tL, tR, tM, tmpi, tmpi1, mlen, mTile = iGD->nTile[ichr]-1; 114 | if(n1>mTile) 115 | return; 116 | n2 = MIN(n2, mTile); 117 | tmpi = iGD->nCnt[ichr][n1]; 118 | tmpi1 = tmpi-1; 119 | long rtn; 120 | if(tmpi>0){ 121 | if(n1!=iGD->preIdx || ichr!=iGD->preChr){ 122 | fseek(iGD->fP, iGD->tIdx[ichr][n1], SEEK_SET); 123 | free(iGD->gData); 124 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 125 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 126 | iGD->preIdx = n1; 127 | iGD->preChr = ichr; 128 | } 129 | if(qe>iGD->gData[0].start){ //sorted by start 130 | //find the 1st rs < qe 131 | tL = 0, tR=tmpi1; 132 | while(tLgData[tM].start < qe) //right side: 135 | tL = tM; 136 | else 137 | tR = tM; //left side 138 | } 139 | if(iGD->gData[tR].start=0; i--){ 142 | if(iGD->gData[i].end>qs){ 143 | hits[iGD->gData[i].idx]++; 144 | } 145 | } 146 | } 147 | if(n2>n1){ //n2>n1 148 | int32_t bd = iGD->nbp*(n1+1); //only keep the first 149 | for(j=n1+1; j<=n2; j++){ //n2 inclusive!!! 
150 | tmpi = iGD->nCnt[ichr][j]; 151 | tmpi1 = tmpi-1; 152 | if(tmpi>0){ 153 | if(j!=iGD->preIdx || ichr!=iGD->preChr){ 154 | fseek(iGD->fP, iGD->tIdx[ichr][j], SEEK_SET); 155 | free(iGD->gData); 156 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 157 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 158 | iGD->preIdx = j; 159 | iGD->preChr = ichr; 160 | } 161 | if(qe>iGD->gData[0].start){ 162 | tS = 0; 163 | while(tSgData[tS].startgData[tM].start < qe) //right side: 168 | tL = tM; 169 | else 170 | tR = tM; //left side 171 | } 172 | if(iGD->gData[tR].start=tS; i--){ 175 | if(iGD->gData[i].end>qs){ 176 | hits[iGD->gData[i].idx]++; 177 | } 178 | } 179 | } 180 | } 181 | bd+=iGD->nbp; 182 | } 183 | } 184 | } 185 | //printf("%i\t%i\t%i\t%i\n", hits[0], hits[1], hits[2], hits[3]); 186 | } 187 | 188 | 189 | void search_1(char **igdFile, char **qchr, int32_t *qs, int32_t *qe, int64_t *hits) 190 | { 191 | iGD_t *iGD = open_iGD(*igdFile); 192 | get_overlaps(iGD, *qchr, *qs, *qe, hits); 193 | close_iGD(iGD); 194 | } 195 | 196 | void getOverlaps(char **igdFile, char **qFile, int64_t *hits) 197 | { 198 | iGD_t *iGD = open_iGD(*igdFile); 199 | gzFile fp; 200 | kstream_t *ks; 201 | kstring_t str = {0,0,0}; 202 | if ((fp = gzopen(*qFile, "r")) == 0) 203 | return; 204 | ks = ks_init(fp); 205 | char *chrm; 206 | int32_t st, en, nl=0; 207 | while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) { 208 | chrm = parse_bed(str.s, &st, &en); 209 | if (chrm) { 210 | get_overlaps(iGD, chrm, st, en, hits); 211 | } 212 | } 213 | free(str.s); 214 | ks_destroy(ks); 215 | gzclose(fp); 216 | close_iGD(iGD); 217 | } 218 | 219 | //---Hash table for R 220 | int32_t hash(const char *key, int32_t htsize) 221 | { 222 | int32_t i=0, hash=0; 223 | while(key && key[i]){ 224 | hash = (hash+key[i])%htsize; 225 | i++; 226 | } 227 | return hash; 228 | } 229 | 230 | hTable *ht_init(int32_t htsize) 231 | { 232 | hTable *ht; 233 | if(htsize<1) 234 | return NULL; 235 | ht = malloc(sizeof(hTable)); 236 | 
ht->nodes = malloc(htsize*sizeof(htNode)); 237 | memset(ht->nodes, 0, htsize*sizeof(htNode)); 238 | ht->size = htsize; 239 | return ht; 240 | } 241 | 242 | int ht_put(hTable *ht, const char *key, int32_t value) 243 | { 244 | htNode *node = malloc(sizeof(htNode)); 245 | node->key = strdup(key); 246 | node->value = value; 247 | int32_t i = hash(key, value); 248 | htNode *tmp = ht->nodes[i];//linkList[i] 249 | if(tmp!=NULL){ 250 | while(tmp!=NULL){ 251 | if(strcmp(tmp->key, node->key)==0) 252 | break; 253 | tmp = tmp->next; 254 | } 255 | if(tmp==NULL){ //already filled 256 | node->next = ht->nodes[i]; 257 | ht->nodes[i] = node; 258 | } 259 | else{ //alrady exist 260 | tmp->value = node->value; 261 | free(node->key); 262 | free(node); 263 | } 264 | } 265 | else{ 266 | node->next = NULL; 267 | ht->nodes[i] = node; 268 | } 269 | } 270 | 271 | int32_t ht_get(hTable *ht, const char *key) 272 | { 273 | char *key1 = strdup(key); 274 | int32_t i = hash(key, ht->size); 275 | htNode *tmp = ht->nodes[i]; 276 | while(tmp!=NULL){ 277 | if(strcmp(tmp->key, key1)==0) 278 | break; 279 | tmp = tmp->next; 280 | } 281 | free(key1); 282 | if(tmp==NULL) 283 | return -1; 284 | return tmp->value; 285 | } 286 | 287 | void ht_free(hTable* ht) 288 | { 289 | htNode *tmp=NULL; 290 | if(ht==NULL)return; 291 | for(int i=0;isize;i++){ 292 | if(ht->nodes[i]!=NULL){ 293 | while(ht->nodes[i]!=NULL){ 294 | tmp = ht->nodes[i]->next; 295 | free(ht->nodes[i]->key); 296 | free(ht->nodes[i]); 297 | ht->nodes[i] = tmp; 298 | } 299 | free(ht->nodes[i]); 300 | } 301 | } 302 | free(ht->nodes); 303 | free(ht); 304 | } 305 | 306 | //--------------------------------------------------------------------------- 307 | SEXP search_1r(SEXP igdr, SEXP qchrm, SEXP qs, SEXP qe) 308 | { //NO need to supply output vector!!! 
309 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 310 | if(iGD==NULL) 311 | error("iGD_free: iGDr external pointer is NULL"); 312 | const char *chrm = CHAR(STRING_ELT(qchrm, 0)); 313 | SEXP hits; 314 | PROTECT(hits = allocVector(INTSXP, iGD->nFiles));//not initialized 315 | memset(INTEGER(hits), 0, iGD->nFiles * sizeof(int)); 316 | get_overlaps32(iGD, chrm, INTEGER(qs)[0], INTEGER(qe)[0], INTEGER(hits)); 317 | UNPROTECT(1); 318 | return(hits); 319 | } 320 | 321 | /*SEXP search_nr(SEXP igdr, SEXP n, SEXP qchrm, SEXP qs, SEXP qe) 322 | { //NO need to supply output vector!!! 323 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 324 | if(iGD==NULL) 325 | error("iGD_free: iGDr external pointer is NULL"); 326 | SEXP hits; 327 | const char *chrm; 328 | int32_t *tmp = calloc(iGD->nFiles, sizeof(int32_t)); 329 | for(int i=0; inFiles));//not initialized 334 | memcpy(INTEGER(hits), tmp, iGD->nFiles * sizeof(int32_t)); 335 | UNPROTECT(1); 336 | free(tmp); 337 | return(hits); 338 | }*/ 339 | 340 | SEXP search_nr(SEXP igdr, SEXP n, SEXP qchrm, SEXP qs, SEXP qe) 341 | { //NO need to supply output vector!!! 
342 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 343 | if(iGD==NULL) 344 | error("iGD_free: iGDr external pointer is NULL"); 345 | SEXP hits; 346 | const char *chrm; 347 | PROTECT(hits = allocVector(INTSXP, iGD->nFiles));//not initialized 348 | memset(INTEGER(hits), 0, iGD->nFiles * sizeof(int)); 349 | for(int i=0; i=iGD->nTile[ichr0] || j<0){ 365 | printf("Max bin number is %i\n", iGD->nTile[ichr0]); 366 | return(R_NilValue); 367 | } 368 | int ncnt = iGD->nCnt[ichr0][j]; 369 | if(ncnt<1){ 370 | printf("No records in bin %i \n", j); 371 | return(R_NilValue); 372 | } 373 | SEXP starts = PROTECT(allocVector(INTSXP, ncnt)); 374 | SEXP ends = PROTECT(allocVector(INTSXP, ncnt)); 375 | SEXP idx = PROTECT(allocVector(INTSXP, ncnt)); 376 | //-------------------------------------------- 377 | gdata_t *gd = malloc(ncnt*sizeof(gdata_t)); 378 | fseek(iGD->fP, iGD->tIdx[ichr0][j], SEEK_SET); 379 | long rtn = fread(gd, sizeof(gdata_t)*ncnt, 1, iGD->fP); 380 | //-------------------------------------------- 381 | for(int i=0;i location in .igd file 37 | //gdata_t *gData; 38 | //info_t *finfo; 39 | //FILE *fP; 40 | //void *hc; 41 | } igdr_t; 42 | 43 | int32_t hash(const char *key, int32_t htsize); //hash function 44 | hTable *ht_init(int32_t htsize); //initialize 45 | int ht_put(hTable *ht, const char *key, int32_t value); 46 | int32_t ht_get(hTable *ht, const char *key); 47 | void ht_free(hTable* ht); 48 | 49 | //Single query 50 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits); 51 | 52 | //32bit 53 | void get_overlaps32(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int32_t *hits); 54 | 55 | void search_1(char **igdFile, char **qchr, int32_t *qs, int32_t *qe, int64_t *hits); 56 | 57 | //query file: call _r 58 | void getOverlaps(char **igdFile, char **qFile, int64_t *hits); 59 | 60 | #endif 61 | 62 | -------------------------------------------------------------------------------- /IGDr/src/igd_search.o: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr/src/igd_search.o -------------------------------------------------------------------------------- /IGDr/src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 68 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 69 | 70 | #ifndef KSTRING_T 71 | #define KSTRING_T kstring_t 72 | typedef struct __kstring_t { 73 | unsigned l, m; 74 | char *s; 75 | } kstring_t; 76 | #endif 77 | 78 | #ifndef kroundup32 79 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 80 | #endif 81 | 82 | #define __KS_GETUNTIL(SCOPE, __read) \ 83 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 84 | { \ 85 | if (dret) *dret = 0; \ 86 | str->l = append? 
str->l : 0; \ 87 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 88 | for (;;) { \ 89 | int i; \ 90 | if (ks->begin >= ks->end) { \ 91 | if (!ks->is_eof) { \ 92 | ks->begin = 0; \ 93 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 94 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 95 | if (ks->end == 0) break; \ 96 | } else break; \ 97 | } \ 98 | if (delimiter == KS_SEP_LINE) { \ 99 | for (i = ks->begin; i < ks->end; ++i) \ 100 | if (ks->buf[i] == '\n') break; \ 101 | } else if (delimiter > KS_SEP_MAX) { \ 102 | for (i = ks->begin; i < ks->end; ++i) \ 103 | if (ks->buf[i] == delimiter) break; \ 104 | } else if (delimiter == KS_SEP_SPACE) { \ 105 | for (i = ks->begin; i < ks->end; ++i) \ 106 | if (isspace(ks->buf[i])) break; \ 107 | } else if (delimiter == KS_SEP_TAB) { \ 108 | for (i = ks->begin; i < ks->end; ++i) \ 109 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 110 | } else i = 0; /* never come to here! */ \ 111 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 112 | str->m = str->l + (i - ks->begin) + 1; \ 113 | kroundup32(str->m); \ 114 | str->s = (char*)realloc(str->s, str->m); \ 115 | } \ 116 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 117 | str->l = str->l + (i - ks->begin); \ 118 | ks->begin = i + 1; \ 119 | if (i < ks->end) { \ 120 | if (dret) *dret = ks->buf[i]; \ 121 | break; \ 122 | } \ 123 | } \ 124 | if (str->s == 0) { \ 125 | str->m = 1; \ 126 | str->s = (char*)calloc(1, 1); \ 127 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 128 | str->s[str->l] = '\0'; \ 129 | return str->l; \ 130 | } 131 | 132 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 133 | __KS_TYPE(type_t) \ 134 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 135 | __KS_GETUNTIL(SCOPE, __read) \ 136 | __KS_INLINED(__read) 137 | 138 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 139 | 140 | #define KSTREAM_DECLARE(type_t, __read) \ 141 | 
__KS_TYPE(type_t) \ 142 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 143 | extern kstream_t *ks_init(type_t f); \ 144 | extern void ks_destroy(kstream_t *ks); \ 145 | __KS_INLINED(__read) 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /IGDr_0408/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: IGDr 2 | Type: Package 3 | Title: R wrapper of IGD 4 | Version: 0.1.0 5 | Author: Jianglin (John) Feng 6 | Maintainer: John Feng 7 | Description: Provides an R wrapper for high performance search engine IGD--integrated genomic database. 8 | Create igd database from a collection of .bed files and then search it. 9 | NeedsCompilation: yes 10 | License: GPL (>=2) 11 | Encoding: UTF-8 12 | LazyData: true 13 | Imports: methods 14 | RoxygenNote: 7.0.2 15 | -------------------------------------------------------------------------------- /IGDr_0408/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | useDynLib(IGDr) 3 | exportClasses(IGDr) 4 | export(IGDr) 5 | -------------------------------------------------------------------------------- /IGDr_0408/R/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr_0408/R/.Rhistory -------------------------------------------------------------------------------- /IGDr_0408/R/IGDr.R: -------------------------------------------------------------------------------- 1 | # S4 class of iGD 2 | 3 | #' IGDr Construction 4 | setClass("IGDr", 5 | representation(ref="externalptr") 6 | ) 7 | 8 | #' Function to open/load an igd database for search 9 | #' 10 | #' @param igd_file the path to the igd database file 11 | #' @return an IGDr object 12 | #' @export 13 | #' @examples 14 | #' 
#' \dontrun{library(IGDr)
#' igd_file <- "testigd/roadmap_b14.igd"
#' igdr <- IGDr(igd_file)}
IGDr <- function(igd_file) {
  # Validate in R so the user gets a readable error instead of a failure
  # inside the compiled loader.
  if (!is.character(igd_file) || length(igd_file) != 1L || is.na(igd_file)) {
    stop("'igd_file' must be a single file path.")
  }
  if (!file.exists(igd_file)) {
    stop("File '", igd_file, "' is not found. ")
  }
  # The index lives next to the database: <stem>.<ext> -> <stem>_index.tsv.
  # Cut at the last dot (as the C loader open_iGD does) rather than assuming
  # the extension is exactly four characters long.
  tsv_file <- paste0(sub("\\.[^.]*$", "", igd_file), "_index.tsv")
  if (!file.exists(tsv_file)) {
    stop("IGD tsv file '", tsv_file, "' not found. ")
  }

  # Last expression is the return value, so the object is returned visibly.
  .Call("iGD_new", igd_file, PACKAGE = "IGDr")
}

#' Function to search the igd database for a single query
#'
#' @param igdr an igd database object (loaded)
#' @param chrm the chromosome name of the query in format of chr1, chrX, chrY, chrM, ...
#' @param qs the start location of the query
#' @param qe the end location of the query
#' @return hits: number of intersections to each database source file
#' @export
#' @examples
#' \dontrun{
#' hits <- search_1r(igdr, "chr1", 1000000, 1100000)}
search_1r <- function(igdr, chrm, qs, qe) {
  .Call("search_1r", igdr@ref, as.character(chrm), as.integer(qs),
        as.integer(qe), PACKAGE = "IGDr")
}

#' Function to search the igd database for multiple queries
#'
#' @param igdr an igd database object (loaded)
#' @param n number of queries to be searched
#' @param chrm vector of chromosome names
#' @param qs vector of the start locations of the queries
#' @param qe vector of the end locations of the queries
#' @return hits: number of intersections to each database source file
#' @export
#' @examples
#' \dontrun{
#' hits <- search_nr(igdr, 2, c("chr1", "chrX"), c(1000, 5000), c(2000, 9000))}
search_nr <- function(igdr, n, chrm, qs, qe) {
  n <- as.integer(n)
  chrm <- as.character(chrm)
  qs <- as.integer(qs)
  qe <- as.integer(qe)
  if (length(n) != 1L || is.na(n) || n < 0L) {
    stop("'n' must be a single non-negative integer.")
  }
  # `n` is handed to the compiled routine separately from the vectors;
  # presumably it reads `n` elements from each (the C side is not in this
  # file), so refuse vectors that are too short.
  if (min(length(chrm), length(qs), length(qe)) < n) {
    stop("'chrm', 'qs' and 'qe' must each have at least 'n' elements.")
  }
  .Call("search_nr", igdr@ref, n, chrm, qs, qe, PACKAGE = "IGDr")
}

#' Function to search the igd database for a query set from a file
#'
#' @param igdr an igd database object (loaded)
#' @param qfile path to the query file (file type of
#' .bed or .bed.gz)
#' @return hits: number of intersections to each database source file
#' @export
#' @examples
#' \dontrun{
#' hits <- search_qfile(igdr, "data/r10000.bed")}
search_qfile <- function(igdr, qfile) {
  if (!file.exists(qfile)) {
    stop("File '", qfile, "' is not found. ")
  }
  # Tab-separated query regions: column 1 = chromosome, 2 = start, 3 = end.
  # Any further columns are ignored.
  qinfo <- read.delim(qfile, header = FALSE, stringsAsFactors = FALSE)
  if (ncol(qinfo) < 3L) {
    stop("Query file '", qfile, "' must have at least 3 tab-separated ",
         "columns (chrom, start, end).")
  }
  .Call("search_nr", igdr@ref, nrow(qinfo), as.character(qinfo[[1]]),
        as.integer(qinfo[[2]]), as.integer(qinfo[[3]]), PACKAGE = "IGDr")
}

#' Function to get the contig id of a chromosome name
#'
#' @param igdr an igd database object (loaded)
#' @param chrm chromosome name ("chr1", "chrX", ...)
#' @return ichr: the contig index reported by the C layer, which is 0-based
#'   and -1 when the chromosome is not in the database. Note that
#'   \code{get_binLen} expects a 1-based contig id.
#' @export
#' @examples
#' \dontrun{
#' ichr <- get_ctgId(igdr, "chrX")
#' }
get_ctgId <- function(igdr, chrm) {
  .Call("get_cid", igdr@ref, as.character(chrm), PACKAGE = "IGDr")
}

#' Function to get the number of contigs in an igd database
#'
#' @param igdr an igd database object (loaded)
#' @return nCtgs: number of contigs
#' @export
#' @examples
#' \dontrun{
#' nCtgs <- get_nCtgs(igdr)}
get_nCtgs <- function(igdr) {
  .Call("get_nCtgs", igdr@ref, PACKAGE = "IGDr")
}

#' Function to get the number of source files in an igd database
#'
#' @param igdr an igd database object (loaded)
#' @return nFiles: number of source files
#' @export
#' @examples
#' \dontrun{
#' nFiles <- get_nFiles(igdr)}
get_nFiles <- function(igdr) {
  # The .Call entry point is `get_nfiles` (lower-case f). The C symbol
  # `get_nFiles` is an internal helper taking a raw iGD_t pointer and must
  # not be invoked through .Call.
  .Call("get_nfiles", igdr@ref, PACKAGE = "IGDr")
}

#' Function to get the bin size of an igd database
#'
#' @param igdr an igd database object (loaded)
#' @return
#' binSize
#' @export
#' @examples
#' \dontrun{
#' binSize <- get_binSize(igdr)
#' }
get_binSize <- function(igdr) {
  .Call("get_nbp", igdr@ref, PACKAGE = "IGDr")
}

#' Function to get the number of regions in a given bin and a given contig
#'
#' @param igdr an igd database object (loaded)
#' @param ichr contig id (1-based)
#' @param binID the bin number (1-based)
#' @return binLen, or NULL when \code{ichr} or \code{binID} is out of range
#' @export
#' @examples
#' \dontrun{
#' binLen <- get_binLen(igdr, 10, 123)}
get_binLen <- function(igdr, ichr, binID) {
  .Call("get_binLen", igdr@ref, as.integer(ichr), as.integer(binID),
        PACKAGE = "IGDr")
}

#' Function to get region data in the given bin of a given contig
#'
#' @param igdr an igd database object (loaded)
#' @param ichr contig id
#' @param binID the bin number
#' @return binData: vector of regions start, end, source_id
#' @export
#' @examples
#' \dontrun{
#' binData <- get_binData(igdr, 10, 123)}
get_binData <- function(igdr, ichr, binID) {
  .Call("get_binData", igdr@ref, as.integer(ichr), as.integer(binID),
        PACKAGE = "IGDr")
}
-------------------------------------------------------------------------------- /IGDr_0408/R/create.R: -------------------------------------------------------------------------------- 1 | #==========================iGD in R======================================================== 2 | #----------------- Copyright (C) 2019 Jianglin Feng -------------------------------------- 3 | # 4 | # This file is a part of the package IGDr. 
This function creates a igd 5 | # database from a folder of .bed files 6 | # 7 | # The IGDr package is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 2 of the License, or 10 | # any later version. 11 | # 12 | # The IGDr package is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty 14 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | #----------------------------------------------------------------------------------------- 17 | 18 | #' Function to create an IGD database from a folder of .bed or .bed.gz files, 19 | #' or a list of such folders 20 | #' 21 | #' @param iPath folder where your input files are stored 22 | #' @param oPath the folder that the created IGD database will be stored 23 | #' @param igdName the name you give to the IGD database (.igd will be added to it) 24 | #' @param binsize the size in basepairs for the bin (block) used in the database: 25 | #' usually 8192, 16384, 32768, ... 
#' as a power of 2; default 16384
#' @return an igd database will be created in the specified folder
#' @export
#' @examples
#' \dontrun{
#' library("IGDr")
#' iPath <- system.file("extdata", "rme3", package = "IGDr")
#' IGDr::createIGD(iPath, "testigd", "roadmap_b14")
#' }
createIGD <- function(iPath, oPath, igdName, binsize=16384) {
  # Coerce each argument to the type passed to the compiled routine.
  src_dir <- as.character(iPath)
  out_dir <- as.character(oPath)
  db_name <- as.character(igdName)
  bin_bp <- as.integer(binsize)
  .C("create_iGD", src_dir, out_dir, db_name, bin_bp, PACKAGE = "IGDr")
}

#' Function to create an IGD database from a list of source files (.bed or .bed.gz)
#'
#' @param iPath path to a txt file that lists the paths of all the source files
#' @param oPath the folder that the created IGD database will be stored
#' @param igdName the name you give to the IGD database (.igd will be added to it)
#' @param binsize the size in basepairs for the bin (block) used in the database:
#' usually 8192, 16384, 32768, ... as a power of 2
#' @return an igd database will be created in the specified folder
#' @export
createIGD_f <- function(iPath, oPath, igdName, binsize=16384) {
  # Same coercions as createIGD(); only the compiled entry point differs.
  list_file <- as.character(iPath)
  out_dir <- as.character(oPath)
  db_name <- as.character(igdName)
  bin_bp <- as.integer(binsize)
  .C("create_iGD_f", list_file, out_dir, db_name, bin_bp, PACKAGE = "IGDr")
}
-------------------------------------------------------------------------------- /IGDr_0408/man/IGDr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{IGDr} 4 | \alias{IGDr} 5 | \title{Function to open/load an igd database for search} 6 | \usage{ 7 | IGDr(igd_file) 8 | } 9 | \arguments{ 10 | \item{igd_file}{the path to the igd database file} 11 | } 12 | \value{ 13 | an IGDr object 14 | } 15 | \description{ 16 | Function to open/load an igd database for search 17 | } 18 | \examples{ 19 | \dontrun{library(IGDr) 20 | igd_file <- 
"testigd/roadmap_b14.igd") 21 | igdr <- IGDr(igd_file)} 22 | } 23 | -------------------------------------------------------------------------------- /IGDr_0408/man/createIGD.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create.R 3 | \name{createIGD} 4 | \alias{createIGD} 5 | \title{Function to create an IGD database from a folder of .bed or .bed.gz files, 6 | or a list of such folders} 7 | \usage{ 8 | createIGD(iPath, oPath, igdName, binsize = 16384) 9 | } 10 | \arguments{ 11 | \item{iPath}{folder where your input files are stored} 12 | 13 | \item{oPath}{the folder that the created IGD database will be stored} 14 | 15 | \item{igdName}{the name you give to the IGD database (.igd will be added to it)} 16 | 17 | \item{binsize}{the size in basepairs for the bin (block) used in the database: 18 | usually 8192, 16384, 32768, ... as a power of 2; default 16384} 19 | } 20 | \value{ 21 | an igd database will be created in the specified folder 22 | } 23 | \description{ 24 | Function to create an IGD database from a folder of .bed or .bed.gz files, 25 | or a list of such folders 26 | } 27 | \examples{ 28 | \dontrun{ 29 | library("IGDr") 30 | iPath <- system.file("extdata", "rme3", package = "IGDr") 31 | IGDr::createIGD(iPath, "testigd", "roadmap_b14") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /IGDr_0408/man/createIGD_f.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create.R 3 | \name{createIGD_f} 4 | \alias{createIGD_f} 5 | \title{Function to create an IGD database from a list of source files (.bed or .bed.gz)} 6 | \usage{ 7 | createIGD_f(iPath, oPath, igdName, binsize = 16384) 8 | } 9 | \arguments{ 10 | \item{iPath}{path to a txt file that lists the paths of all the source 
files} 11 | 12 | \item{oPath}{the folder that the created IGD database will be stored} 13 | 14 | \item{igdName}{the name you give to the IGD database (.igd will be added to it)} 15 | 16 | \item{binsize}{the size in basepairs for the bin (block) used in the database: 17 | usually 8192, 16384, 32768, ... as a power of 2} 18 | } 19 | \value{ 20 | an igd database will be created in the specified folder 21 | } 22 | \description{ 23 | Function to create an IGD database from a list of source files (.bed or .bed.gz) 24 | } 25 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_binData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binData} 4 | \alias{get_binData} 5 | \title{Function to get region data in the given bin of a given contig} 6 | \usage{ 7 | get_binData(igdr, ichr, binID) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{ichr}{contig id} 13 | 14 | \item{binID}{the bin number} 15 | } 16 | \value{ 17 | binData: vector of regions start, end, source_id 18 | } 19 | \description{ 20 | Function to get region data in the given bin of a given contig 21 | } 22 | \examples{ 23 | \dontrun{ 24 | binData <- get_binData(igdr, 10, 123)} 25 | } 26 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_binLen.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binLen} 4 | \alias{get_binLen} 5 | \title{Function to get the number of regions in a given bin and a given contig} 6 | \usage{ 7 | get_binLen(igdr, ichr, binID) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{ichr}{contig id} 13 | 14 | \item{binID}{the bin number} 15 | } 16 
| \value{ 17 | binLen 18 | } 19 | \description{ 20 | Function to get the number of regions in a given bin and a given contig 21 | } 22 | \examples{ 23 | \dontrun{ 24 | binLen <- get_binLen(igdr, 10, 123)} 25 | } 26 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_binSize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_binSize} 4 | \alias{get_binSize} 5 | \title{Function to get the bin size of an igd database} 6 | \usage{ 7 | get_binSize(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | binSize 14 | } 15 | \description{ 16 | Function to get the bin size of an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | binSize <- get_binSize(igdr) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_ctgId.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_ctgId} 4 | \alias{get_ctgId} 5 | \title{Function to get the contig id of a chromosome name} 6 | \usage{ 7 | get_ctgId(igdr, chrm) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{chrm}{chromosome name ("chr1", "chrX", ...)} 13 | } 14 | \value{ 15 | ichr (0 if not exist) 16 | } 17 | \description{ 18 | Function to get the contig id of a chromosome name 19 | } 20 | \examples{ 21 | \dontrun{ 22 | ichr <- get_ctgId(igdr, "chrX") 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_nCtgs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | 
\name{get_nCtgs} 4 | \alias{get_nCtgs} 5 | \title{Function to get the number of contigs in an igd database} 6 | \usage{ 7 | get_nCtgs(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | nCtgs: number of contigs 14 | } 15 | \description{ 16 | Function to get the number of contigs in an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | nCtgs <- get_nCtgs(igdr)} 21 | } 22 | -------------------------------------------------------------------------------- /IGDr_0408/man/get_nFiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{get_nFiles} 4 | \alias{get_nFiles} 5 | \title{Function to get the number of source files in an igd database} 6 | \usage{ 7 | get_nFiles(igdr) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | } 12 | \value{ 13 | nFiles: number of source files 14 | } 15 | \description{ 16 | Function to get the number of source files in an igd database 17 | } 18 | \examples{ 19 | \dontrun{ 20 | nFiles <- get_nFiles(igdr)} 21 | } 22 | -------------------------------------------------------------------------------- /IGDr_0408/man/search_1r.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_1r} 4 | \alias{search_1r} 5 | \title{Function to search the igd database for a single query} 6 | \usage{ 7 | search_1r(igdr, chrm, qs, qe) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{chrm}{the chromosome name of the query in format of chr1, chrX, chrY, chrM, ...} 13 | 14 | \item{qs}{the start location of the query} 15 | 16 | \item{qe}{the end location of the query} 17 | } 18 | \value{ 19 | hits: number of intersections to each database source file 20 | } 21 | \description{ 22 | 
Function to search the igd database for a single query 23 | } 24 | \examples{ 25 | \dontrun{ 26 | hits <- search_1r(igdr, "chr1", 1000000, 1100000)} 27 | } 28 | -------------------------------------------------------------------------------- /IGDr_0408/man/search_nr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_nr} 4 | \alias{search_nr} 5 | \title{Function to search the igd database for multiple queries} 6 | \usage{ 7 | search_nr(igdr, n, chrm, qs, qe) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{n}{number of queries to be searched} 13 | 14 | \item{chrm}{vector of chromosome names} 15 | 16 | \item{qs}{vector of the start locations of the queries} 17 | 18 | \item{qe}{vector of the end locations of the queries} 19 | } 20 | \value{ 21 | hits: number of intersections to each database source file 22 | } 23 | \description{ 24 | Function to search the igd database for multiple queries 25 | } 26 | -------------------------------------------------------------------------------- /IGDr_0408/man/search_qfile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/IGDr.R 3 | \name{search_qfile} 4 | \alias{search_qfile} 5 | \title{Function to search the igd database for a query set from a file} 6 | \usage{ 7 | search_qfile(igdr, qfile) 8 | } 9 | \arguments{ 10 | \item{igdr}{an igd database object (loaded)} 11 | 12 | \item{qfile}{path to the query file (file type of .bed or .bed.gz)} 13 | } 14 | \value{ 15 | hits: number of intersections to each database source file 16 | } 17 | \description{ 18 | Function to search the igd database for a query set from a file 19 | } 20 | \examples{ 21 | \dontrun{ 22 | hits <- search_qfile(igdr, "data/r10000.bed")} 23 | } 24 | 
-------------------------------------------------------------------------------- /IGDr_0408/src/IGDr.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr_0408/src/IGDr.so -------------------------------------------------------------------------------- /IGDr_0408/src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr_0408/src/RcppExports.o -------------------------------------------------------------------------------- /IGDr_0408/src/igd_base.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Common igd struct, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_base.h" 7 | 8 | #define gdata_t_key(r) ((r).start) 9 | KRADIX_SORT_INIT(intv, gdata_t, gdata_t_key, 4) 10 | KHASH_MAP_INIT_STR(str, int32_t) 11 | typedef khash_t(str) strhash_t; 12 | 13 | void str_splits( char* str, int *nmax, char **splits) 14 | { //tsv 15 | splits[*nmax] = NULL; 16 | splits[0] = str; 17 | char *ch = str; 18 | int ns = 1; 19 | do { 20 | if (*ch == '\t'){ 21 | splits[ns++] = &ch[1]; 22 | *ch = '\0'; 23 | } 24 | ch++; 25 | } while (*ch != '\0' && ns < *nmax+1); 26 | *nmax = ns; 27 | } 28 | 29 | char *parse_bed(char *s, int32_t *st_, int32_t *en_) 30 | { 31 | char *p, *q, *ctg = 0; 32 | int32_t i, st = -1, en = -1; 33 | for (i = 0, p = q = s;; ++q) { 34 | if (*q == '\t' || *q == '\0') { 35 | int c = *q; 36 | *q = 0; 37 | if (i == 0) ctg = p; 38 | else if (i == 1) st = atol(p); 39 | else if (i == 2) en = atol(p); 40 | ++i, p = q + 1; 41 | if (c == 
'\0') break; 42 | } 43 | } 44 | *st_ = st, *en_ = en; 45 | return i >= 3? ctg : 0; 46 | } 47 | 48 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe) 49 | { //find tE: index of the last item satisfying .start < qe from right 50 | //assuming gdata sorted by start 51 | int32_t tL=t0, tR=tc, tM, tE = -1; 52 | if(gdata[tR].start < qe) 53 | return tR; 54 | else if(gdata[tL].start >= qe) 55 | return -1; 56 | while(tL= qe) 59 | tR = tM-1; 60 | else 61 | tL = tM; 62 | } 63 | if(gdata[tR].start < qe) 64 | tE = tR; 65 | else if(gdata[tL].start < qe) 66 | tE = tL; 67 | return tE; 68 | } 69 | 70 | <<<<<<< HEAD 71 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx) 72 | { //layers: igd->ctg->gTile->gdata(list) 73 | if(s >= e)return; 74 | int absent; 75 | ======= 76 | /** 77 | * @brief Primary function for adding regions to an IGD object 78 | * 79 | * Adds a region to an IGD database? 80 | * 81 | * @param *igd a pointer to the IGD object (of type igd_t) 82 | * @param *chrm Chromosome ??? 83 | * @param s Start coordinate 84 | * @param e End coordinate 85 | * @param v Value ??? 86 | * @param idx ??? 
87 | * @return Null 88 | */ 89 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx) 90 | { //layers: igd->ctg->gTile->gdata(list) 91 | if(s >= e)return; 92 | int absent; //return value from kh_put function 93 | >>>>>>> 42a3ce8e28829decf75d9ff19eef2dbf843ddee3 94 | khint_t k; 95 | strhash_t *h = (strhash_t*)igd->hc; 96 | k = kh_put(str, h, chrm, &absent); 97 | int32_t n1 = s/igd->nbp; 98 | int32_t n2 = (e-1)/igd->nbp; 99 | if (absent) { 100 | //printf("%s %i %i %i\n", chrm, n1, n2, k); 101 | //igd 102 | if (igd->nctg == igd->mctg) 103 | EXPAND(igd->ctg, igd->mctg); 104 | kh_val(h, k) = igd->nctg; 105 | //ctg: initialize 106 | ctg_t *p = &igd->ctg[igd->nctg++]; 107 | p->name = strdup(chrm); 108 | p->mTiles= 1 + n2; 109 | p->gTile = malloc(p->mTiles*sizeof(tile_t)); 110 | kh_key(h, k) = p->name; 111 | //tile: initialize 112 | for(int i=0;imTiles;i++){ 113 | tile_t *tile = &p->gTile[i]; 114 | tile->ncnts = 0; //each batch 115 | tile->nCnts = 0; //total 116 | tile->mcnts = 4; 117 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 118 | } 119 | } 120 | int32_t kk = kh_val(h, k); 121 | ctg_t *p = &igd->ctg[kk]; 122 | if (n2+1>=p->mTiles){ 123 | int32_t tt = p->mTiles; 124 | p->mTiles = n2+1; 125 | p->gTile = realloc(p->gTile, p->mTiles*sizeof(tile_t)); 126 | //initialize new tiles 127 | for(int i=tt;imTiles;i++){ 128 | tile_t *tile = &p->gTile[i]; 129 | tile->ncnts = 0; //each batch 130 | tile->nCnts = 0; //total 131 | tile->mcnts = 16; 132 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 133 | } 134 | } 135 | //add data elements 136 | for(int i=n1;i<=n2;i++){ 137 | tile_t *tile = &p->gTile[i]; 138 | if(tile->ncnts == tile->mcnts) 139 | EXPAND(tile->gList, tile->mcnts); 140 | gdata_t *gdata = &tile->gList[tile->ncnts++]; 141 | gdata->start = s; 142 | gdata->end = e; 143 | gdata->value = v; 144 | gdata->idx = idx; 145 | igd->total++; 146 | } 147 | return; 148 | } 149 | 150 | info_t* get_fileinfo(char *ifName, int32_t *nFiles) 151 
| { //read head file __index.tsv to get info 152 | FILE *fp = fopen(ifName, "r"); 153 | if(fp==NULL){ 154 | printf("file not found:%s\n", ifName); 155 | return NULL; 156 | } 157 | char buf[1024], *s0, *s1, *s2, *s3; 158 | int nfiles=0; 159 | char* rtn = fgets(buf, 1024, fp); 160 | while(fgets(buf, 1024, fp)!=NULL) 161 | nfiles++; 162 | 163 | info_t *fi = (info_t*)malloc(nfiles*sizeof(info_t)); 164 | fseek(fp, 0, SEEK_SET); 165 | int i=0; 166 | rtn = fgets(buf, 1024, fp); //header 167 | while(fgets(buf, 1024, fp)!=NULL){ 168 | s0 = strtok(buf, "\t"); 169 | s1 = strtok(NULL, "\t"); 170 | fi[i].fileName = strdup(s1); 171 | s2 = strtok(NULL, "\t"); 172 | fi[i].nr = atol(s2); 173 | //s3 = strtok(NULL, "\t"); 174 | //fi[i].md = (double)atol(s3); 175 | i++; 176 | } 177 | *nFiles = (int32_t)nfiles; 178 | fclose(fp); 179 | return fi; 180 | } 181 | 182 | iGD_t* open_iGD(char *igdFile) 183 | { 184 | iGD_t* iGD = iGD_init(); 185 | char tmp[256]; 186 | strcpy(tmp, igdFile); 187 | tmp[strrchr(tmp, '.')-tmp] = '\0'; 188 | strcpy(iGD->fname, tmp); 189 | char *idFile = tmp; //str_split(tmp, '.', &nCols)[0]; 190 | strcat(idFile, "_index.tsv"); 191 | iGD->finfo = get_fileinfo(idFile, &iGD->nFiles); 192 | FILE *fp = fopen(igdFile, "rb"); 193 | if(fp == NULL) 194 | printf("Can't open file %s", igdFile); 195 | long rtn = fread(&iGD->nbp, sizeof(int32_t), 1, fp); 196 | rtn = fread(&iGD->gType, sizeof(int32_t), 1, fp); 197 | rtn = fread(&iGD->nCtg, sizeof(int32_t), 1, fp); 198 | int i, k; 199 | int32_t gdsize; 200 | gdsize = sizeof(gdata_t); 201 | int32_t tileS, m = iGD->nCtg; //the idx of a tile in the chrom 202 | //------------------------------------------ 203 | iGD->nTile = malloc(m*sizeof(int32_t)); 204 | rtn = fread(iGD->nTile, sizeof(int32_t)*m, 1, fp); 205 | int64_t chr_loc = 12 + 44*m; //header size in bytes 206 | for(i=0;inTile[i]*4; 207 | //------------------------------------------ 208 | iGD->nCnt = malloc(m*sizeof(int32_t*)); 209 | iGD->tIdx = malloc(m*sizeof(int64_t*)); 210 
| for(i=0;inTile[i]; 212 | iGD->nCnt[i] = calloc(k, sizeof(int32_t)); 213 | rtn = fread(iGD->nCnt[i], sizeof(int32_t)*k, 1, fp); 214 | //-------------------------------------- 215 | iGD->tIdx[i] = calloc(k, sizeof(int64_t)); 216 | iGD->tIdx[i][0] = chr_loc; 217 | for(int j=1; jtIdx[i][j] = iGD->tIdx[i][j-1]+iGD->nCnt[i][j-1]*gdsize; 219 | chr_loc = iGD->tIdx[i][k-1]+iGD->nCnt[i][k-1]*gdsize; 220 | } 221 | 222 | iGD->cName = malloc(m*sizeof(char*)); 223 | for(i=0;icName[i] = malloc(40*sizeof(char)); 225 | rtn = fread(iGD->cName[i], 40, 1, fp); 226 | } 227 | iGD->fP = fp; 228 | 229 | //setup hc 230 | iGD->hc = kh_init(str); 231 | int absent; 232 | for(i=0;inCtg;i++){ 233 | khint_t k; 234 | strhash_t *h = (strhash_t*)iGD->hc; 235 | k = kh_put(str, h, iGD->cName[i], &absent); 236 | kh_val(h, k) = i; 237 | kh_key(h, k) = iGD->cName[i]; 238 | } 239 | iGD->gData = malloc(1*sizeof(gdata_t)); 240 | iGD->preIdx = -1; 241 | iGD->preChr = -1; 242 | return iGD; 243 | } 244 | 245 | int32_t get_id(iGD_t *iGD, const char *chrm) 246 | { //for search 247 | khint_t k; 248 | strhash_t *h = (strhash_t*)iGD->hc; 249 | k = kh_get(str, h, chrm); 250 | return k == kh_end(h)? 
-1 : kh_val(h, k); 251 | } 252 | 253 | int32_t get_nFiles(iGD_t *iGD) 254 | { 255 | return iGD->nFiles; 256 | } 257 | 258 | void igd_saveT(igd_t *igd, char *oPath) 259 | { //Save/append tiles to disc, add cnts tp Cnts 260 | char idFile[256]; 261 | for (int i = 0; i < igd->nctg; i++){ 262 | ctg_t *ctg = &igd->ctg[i]; 263 | for(int j=0; j< ctg->mTiles; j++){ 264 | tile_t *tile = &ctg->gTile[j]; 265 | //--------------------------------------- 266 | if(tile->ncnts>0){ 267 | sprintf(idFile, "%s%s%s_%i", oPath, "data0/", ctg->name, j); 268 | FILE *fp = fopen(idFile, "ab"); 269 | if(fp==NULL) 270 | printf("Can't open file %s", idFile); 271 | fwrite(tile->gList, sizeof(gdata_t), tile->ncnts, fp); 272 | fclose(fp); 273 | } 274 | tile->nCnts += tile->ncnts; 275 | tile->ncnts = 0; 276 | free(tile->gList); 277 | tile->mcnts = 16;//MAX(16, tile->mcnts/16); 278 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 279 | //tile->gList = realloc(tile->gList, tile->mcnts*sizeof(gdata_t));? 280 | } 281 | } 282 | igd->total = 0; //batch total 283 | } 284 | 285 | void igd_save(igd_t *igd, char *oPath, char *igdName) 286 | { 287 | char idFile[256], iname[256]; 288 | //1. Save iGD data info: ctg string length 40 289 | int32_t i, j, n, m = igd->nctg; 290 | sprintf(idFile, "%s%s%s", oPath, igdName, ".igd"); 291 | FILE *fp = fopen(idFile, "wb"); 292 | if(fp==NULL) 293 | printf("Can't open file %s", idFile); 294 | fwrite(&igd->nbp, sizeof(int32_t), 1, fp); //4 bytes 295 | fwrite(&igd->gType, sizeof(int32_t), 1, fp); //4 296 | fwrite(&m, sizeof(int32_t), 1, fp); //4 297 | //----------------- 298 | for(i=0;ictg[i].mTiles, sizeof(int32_t), 1, fp); 300 | for(i=0;ictg[i]; 302 | n = p->mTiles; 303 | for(j=0;jgTile[j].nCnts, sizeof(int32_t), 1, fp); 305 | } 306 | //write string array 307 | for(i=0;ictg[i].name, 40, 1, fp); 309 | 310 | //2. 
Sort and save tiles data 311 | for(i=0;ictg[i]; 313 | n = p->mTiles; 314 | for(j=0;jgTile[j]; 316 | int32_t nrec = q->nCnts, gdsize; 317 | if(nrec>0){ 318 | sprintf(iname, "%s%s%s_%i", oPath, "data0/", p->name, j); 319 | FILE *fp0 = fopen(iname, "rb"); 320 | if(fp0 == NULL) 321 | printf("Can't open file %s", iname); 322 | gdsize = nrec*sizeof(gdata_t); 323 | gdata_t *gdata = malloc(gdsize); 324 | long rtn = fread(gdata, gdsize, 1, fp0); 325 | fclose(fp0); 326 | radix_sort_intv(gdata, gdata+nrec); 327 | fwrite(gdata, gdsize, 1, fp); 328 | free(gdata); 329 | remove(iname); 330 | } 331 | } 332 | } 333 | fclose(fp); 334 | } 335 | 336 | igd_t *igd_init(int tile_size) 337 | { 338 | igd_t *igd = malloc(1*sizeof(igd_t)); 339 | igd->gType = 1; 340 | igd->nbp = tile_size; 341 | igd->hc = kh_init(str); 342 | igd->nctg = 0; 343 | igd->mctg = 32; 344 | igd->ctg = malloc(igd->mctg*sizeof(ctg_t)); 345 | igd->total = 0; 346 | return igd; 347 | } 348 | 349 | void igd_destroy(igd_t *igd) 350 | { 351 | if (igd == 0) return; 352 | for (int i = 0; i < igd->nctg; ++i){ 353 | free(igd->ctg[i].name); 354 | for(int j=0; j< igd->ctg[i].mTiles; j++) 355 | free(igd->ctg[i].gTile[j].gList); 356 | } 357 | free(igd->ctg); 358 | kh_destroy(str, (strhash_t*)igd->hc); 359 | free(igd); 360 | } 361 | 362 | iGD_t *iGD_init() 363 | { 364 | iGD_t *iGD = (iGD_t *) malloc(1*sizeof(iGD_t)); 365 | iGD->nbp = 16384; 366 | iGD->gType = 1; 367 | iGD->nCtg = 24; 368 | return iGD; 369 | } 370 | 371 | void close_iGD(iGD_t *iGD) 372 | { 373 | if(iGD==0) return; 374 | fclose(iGD->fP); 375 | free(iGD->gData); 376 | free(iGD->nTile); 377 | kh_destroy(str, (strhash_t*)iGD->hc); 378 | for(int i=0;inCtg;i++){ 379 | free(iGD->nCnt[i]); 380 | free(iGD->tIdx[i]); 381 | } 382 | free(iGD->nCnt); 383 | free(iGD->tIdx); 384 | free(iGD->cName); 385 | free(iGD->finfo); 386 | free(iGD); 387 | } 388 | 389 | //--------------------------------------------------------------------------------- 390 | //.Call entry point 391 | 
//--------------------------------------------------------------------------------- 392 | SEXP iGD_free(SEXP igdr) 393 | { 394 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 395 | if(iGD==NULL) 396 | error("iGD_free: iGDr external pointer is NULL"); 397 | close_iGD(iGD); 398 | R_SetExternalPtrAddr(igdr, NULL); 399 | return(R_NilValue); 400 | } 401 | 402 | SEXP iGD_new(SEXP igd_file) 403 | { //new a class that contains an externalPtr (iGD_t structure) 404 | const char *igdFile = CHAR(STRING_ELT(igd_file, 0)); 405 | iGD_t *iGD = open_iGD(igdFile); 406 | SEXP igdr, klass, obj; 407 | PROTECT(igdr = R_MakeExternalPtr(iGD, R_NilValue, R_NilValue)); 408 | R_RegisterCFinalizer(igdr, (R_CFinalizer_t)iGD_free); 409 | klass = PROTECT(MAKE_CLASS("IGDr")); 410 | PROTECT(obj = NEW_OBJECT(klass)); 411 | SET_SLOT(obj, Rf_install("ref"), igdr); 412 | UNPROTECT(3); 413 | return(obj); 414 | } 415 | 416 | SEXP get_cid(SEXP igdr, SEXP chrom) 417 | { //chrom id 418 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 419 | if(iGD==NULL) 420 | error("iGD_free: iGDr external pointer is NULL"); 421 | const char *chrm = CHAR(STRING_ELT(chrom, 0)); 422 | int32_t tid = get_id(iGD, chrm); 423 | SEXP cid; 424 | PROTECT(cid = allocVector(INTSXP, 1)); 425 | INTEGER(cid)[0] = tid; 426 | UNPROTECT(1); 427 | return(cid); 428 | } 429 | 430 | SEXP get_nbp(SEXP igdr) 431 | { //chrom id 432 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 433 | if(iGD==NULL) 434 | error("iGD_free: iGDr external pointer is NULL"); 435 | SEXP t_nbp; 436 | PROTECT(t_nbp = allocVector(INTSXP, 1)); 437 | INTEGER(t_nbp)[0] = iGD->nbp; 438 | UNPROTECT(1); 439 | return(t_nbp); 440 | } 441 | 442 | SEXP get_nfiles(SEXP igdr) 443 | { 444 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 445 | if(iGD==NULL) 446 | error("iGD_free: iGDr external pointer is NULL"); 447 | SEXP nfile; 448 | PROTECT(nfile = allocVector(INTSXP, 1)); 449 | INTEGER(nfile)[0] = iGD->nFiles; 450 | UNPROTECT(1); 451 | return(nfile); 452 | } 453 | 454 | SEXP 
get_nCtgs(SEXP igdr) 455 | { //chrom id 456 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 457 | if(iGD==NULL) 458 | error("iGD_free: iGDr external pointer is NULL"); 459 | SEXP n_ctgs; 460 | PROTECT(n_ctgs = allocVector(INTSXP, 1)); 461 | INTEGER(n_ctgs)[0] = iGD->nCtg; 462 | UNPROTECT(1); 463 | return(n_ctgs); 464 | } 465 | 466 | SEXP get_binLen(SEXP igdr, SEXP ichr, SEXP bin) 467 | { //not really necessary 468 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 469 | if(iGD==NULL) 470 | error("iGD_free: iGDr external pointer is NULL"); 471 | SEXP binLen; 472 | int ichr0 = INTEGER(ichr)[0]-1; 473 | int j = INTEGER(bin)[0]-1; 474 | if(ichr0 >= iGD->nCtg || ichr0<0 || j<0 || j>=iGD->nTile[ichr0]) 475 | return(R_NilValue); 476 | PROTECT(binLen = allocVector(INTSXP, 1)); 477 | INTEGER(binLen)[0] = iGD->nCnt[ichr0][j]; 478 | UNPROTECT(1); 479 | return(binLen); 480 | } 481 | 482 | -------------------------------------------------------------------------------- /IGDr_0408/src/igd_base.h: -------------------------------------------------------------------------------- 1 | //================================================================================= 2 | //Common structs, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //re-designed 7/1/2019 5 | //database intervals sorted by _start: 8/12/2019 6 | //--------------------------------------------------------------------------------- 7 | #ifndef __IGD_BASE_H__ 8 | #define __IGD_BASE_H__ 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "khash.h" 28 | #include "kseq.h" 29 | 30 | #define PROGRAM_NAME "igd" 31 | #define MAJOR_VERSION "0" 32 | #define MINOR_VERSION "1" 33 | #define REVISION_VERSION "1" 34 | #define BUILD_VERSION "0" 35 | #define VERSION MAJOR_VERSION "." MINOR_VERSION "." 
REVISION_VERSION 36 | #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) 37 | #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) 38 | #define maxCount 268435456 //16* = 4GB memory 39 | //--------------------------------------------------------------------------------- 40 | typedef struct{ //default 41 | int32_t idx; //genomic object--data set index 42 | int32_t start; //region start 43 | int32_t end; //region end 44 | int32_t value; 45 | } gdata_t; 46 | 47 | typedef struct{ 48 | char* fileName; //dataset file 49 | int32_t nr; //number regions/dataset 50 | double md; //average width of the regions 51 | } info_t; 52 | 53 | typedef struct{ 54 | int32_t ncnts, nCnts, mcnts; //batch counts, total, max 55 | gdata_t *gList; 56 | } tile_t; 57 | 58 | typedef struct{ 59 | char *name; //name of the contig 60 | int32_t mTiles; //determined by the interval start and end 61 | tile_t *gTile; //tile data 62 | } ctg_t; 63 | 64 | typedef struct{ 65 | int32_t nbp, gType, nctg, mctg; // number of base pairs, data type: 0, 1, 2 etc; size differs 66 | int64_t total; //total region in each ctg 67 | ctg_t *ctg; //list of contigs (of size _n_ctg_) 68 | void *hc; //dict for converting contig names to int 69 | } igd_t; //For creation: internal... 70 | 71 | typedef struct{ //For search: external... 
72 | int32_t nbp, gType, nCtg, nFiles; 73 | int32_t preIdx, preChr; 74 | char fname[64]; 75 | char **cName; //name of ctgs 76 | int32_t *nTile; //num of tiles in each ctg 77 | int32_t **nCnt; //num of counts in each tile 78 | int64_t **tIdx; //tile index *sizeof -> location in .igd file 79 | gdata_t *gData; 80 | info_t *finfo; 81 | FILE *fP; 82 | void *hc; 83 | } iGD_t; 84 | 85 | //--------------------------------------------------------------------------------- 86 | //Parse a line of BED file 87 | void str_splits( char* str, int *nmax, char **splits); 88 | char *parse_bed(char *s, int32_t *st_, int32_t *en_); 89 | 90 | //Binary search 91 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe); 92 | 93 | //Add an interval 94 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx); 95 | 96 | //Get id from igd dict 97 | int32_t get_id(iGD_t *iGD, const char *chrm); 98 | 99 | //Get nFiles from iGD 100 | int32_t get_nFiles(iGD_t *iGD); 101 | 102 | //Get file info from .tsv 103 | info_t *get_fileinfo(char *ifName, int32_t *nFiles); 104 | 105 | //Get igd info from .igd 106 | iGD_t *open_iGD(char *igdFile); 107 | 108 | //Initialize igd_t 109 | igd_t *igd_init(int tile_size); 110 | 111 | //Initialize iGD_t 112 | iGD_t *iGD_init(); 113 | 114 | //Save tile data 115 | void igd_saveT(igd_t *igd, char *oPath); 116 | 117 | //Sort and save igd 118 | void igd_save(igd_t *igd, char *oPath, char *igdName); 119 | 120 | //Free igd data 121 | void igd_destroy(igd_t *igd); 122 | 123 | //Free iGD data 124 | void close_iGD(iGD_t *iGD); 125 | 126 | //--------------------------------------------------------------------------------- 127 | //The following section taken from Dr Heng Li's cgranges 128 | // (https://github.com/lh3/cgranges) 129 | 130 | KSTREAM_INIT(gzFile, gzread, 0x10000) 131 | /************** 132 | * Radix sort * 133 | **************/ 134 | #define RS_MIN_SIZE 64 135 | #define RS_MAX_BITS 8 136 | 137 | #define 
KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ 138 | typedef struct { \ 139 | rstype_t *b, *e; \ 140 | } rsbucket_##name##_t; \ 141 | void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ 142 | { \ 143 | rstype_t *i; \ 144 | for (i = beg + 1; i < end; ++i) \ 145 | if (rskey(*i) < rskey(*(i - 1))) { \ 146 | rstype_t *j, tmp = *i; \ 147 | for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ 148 | *j = *(j - 1); \ 149 | *j = tmp; \ 150 | } \ 151 | } \ 152 | void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ 153 | { \ 154 | rstype_t *i; \ 155 | int size = 1<b = k->e = beg; \ 159 | for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ 160 | for (k = b + 1; k != be; ++k) \ 161 | k->e += (k-1)->e - beg, k->b = (k-1)->e; \ 162 | for (k = b; k != be;) { \ 163 | if (k->b != k->e) { \ 164 | rsbucket_##name##_t *l; \ 165 | if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ 166 | rstype_t tmp = *k->b, swap; \ 167 | do { \ 168 | swap = tmp; tmp = *l->b; *l->b++ = swap; \ 169 | l = b + (rskey(tmp)>>s&m); \ 170 | } while (l != k); \ 171 | *k->b++ = tmp; \ 172 | } else ++k->b; \ 173 | } else ++k; \ 174 | } \ 175 | for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ 176 | if (s) { \ 177 | s = s > n_bits? 
s - n_bits : 0; \ 178 | for (k = b; k != be; ++k) \ 179 | if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ 180 | else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ 181 | } \ 182 | } \ 183 | void radix_sort_##name(rstype_t *beg, rstype_t *end) \ 184 | { \ 185 | if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ 186 | else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \ 187 | } 188 | 189 | /********************* 190 | * Convenient macros * 191 | *********************/ 192 | 193 | #ifndef kroundup32 194 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 195 | #endif 196 | 197 | #define CALLOC(type, len) ((type*)calloc((len), sizeof(type))) 198 | #define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) 199 | 200 | #define EXPAND(a, m) do { \ 201 | (m) = (m)? (m) + ((m)>>1) : 16; \ 202 | REALLOC((a), (m)); \ 203 | }while (0) 204 | 205 | #endif 206 | 207 | -------------------------------------------------------------------------------- /IGDr_0408/src/igd_base.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr_0408/src/igd_base.o -------------------------------------------------------------------------------- /IGDr_0408/src/igd_create.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Read igd region data and query data, and then find all overlaps 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_create.h" 7 | 8 | int create_help(int exit_code) 9 | { 10 | printf( 11 | "%s, v%s\n" 12 | "usage: %s create [options] \n" 13 | " 
-b \n" 14 | " -c < .BED column as value >=4 (default 4) \n", 15 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 16 | return exit_code; 17 | } 18 | 19 | void create_iGD(char **i_path, char **o_path, char **igd_name, int *tile_size) 20 | { //.C call using pointers to pass arguments!!! 21 | char iPath[256]; 22 | char oPath[256]; 23 | char igdName[64]; 24 | strcpy(iPath, *i_path); 25 | strcpy(oPath, *o_path); 26 | strcpy(igdName, *igd_name); 27 | int rtn, binSize = *tile_size; 28 | 29 | //Check if the subfolders exist: 30 | char ftmp[255]; 31 | struct stat st = {0}; 32 | if(oPath[strlen(oPath)-1]!='/'){ 33 | strcat(oPath, "/"); 34 | } 35 | sprintf(ftmp, "%s%s%s", oPath, igdName, ".igd"); 36 | if(stat(ftmp, &st) == 0){ 37 | printf("The igd database file %s exists!\n", ftmp); 38 | return; 39 | } 40 | else{ 41 | if (stat(oPath, &st) == -1){ 42 | mkdir(oPath, 0777); 43 | } 44 | sprintf(ftmp, "%s%s", oPath, "data0"); 45 | if (stat(ftmp, &st) == -1) 46 | mkdir(ftmp, 0777); 47 | } 48 | 49 | //check if iPath a file or directory 50 | stat(iPath, &st); 51 | glob_t gResult; 52 | if(S_ISREG(st.st_mode)){ 53 | FILE *fp = fopen(iPath, "r"); 54 | if(fgets(ftmp, 255, fp)){ 55 | if(ftmp[strlen(iPath)-1]=='/'){ 56 | strcat(ftmp, "*"); 57 | } 58 | else if(ftmp[strlen(ftmp)-1]!='*'){ 59 | strcat(ftmp, "/*"); 60 | } 61 | rtn = glob(ftmp, 0, NULL, &gResult); 62 | } 63 | while(fgets(ftmp, 255, fp)){ 64 | if(ftmp[strlen(iPath)-1]=='/'){ 65 | strcat(ftmp, "*"); 66 | } 67 | else if(ftmp[strlen(ftmp)-1]!='*'){ 68 | strcat(ftmp, "/*"); 69 | } 70 | rtn = glob(ftmp, GLOB_APPEND, NULL, &gResult); 71 | } 72 | fclose(fp); 73 | } 74 | else{ //S_ISDIR(..) 75 | if(iPath[strlen(iPath)-1]=='/'){ 76 | strcat(iPath, "*"); 77 | } 78 | else if(iPath[strlen(iPath)-1]!='*'){ 79 | strcat(iPath, "/*"); 80 | } 81 | rtn = glob(iPath, 0, NULL, &gResult); 82 | if(rtn!=0){ 83 | printf("wrong dir path: %s", iPath); 84 | return; 85 | } 86 | } 87 | 88 | //0. 
Initialize igd 89 | igd_t *igd = igd_init(binSize); 90 | //printf("igd_create 0\n"); 91 | 92 | //1. Get the files 93 | char** file_ids = gResult.gl_pathv; 94 | int32_t n_files = gResult.gl_pathc; 95 | if(n_files<1) 96 | printf("Too few files (add to path /*): %i\n", n_files); 97 | 98 | int32_t *nr = calloc(n_files, sizeof(int32_t)); 99 | double *avg = calloc(n_files, sizeof(double)); 100 | printf("igd_create 1: %i\n", n_files); 101 | 102 | //2. Read files 103 | int nCols=32; 104 | unsigned char buffer[256]; 105 | int32_t i, j, k, ig, i0=0, i1=0, L0=0, L1=1, m, nL; //int64_t? 106 | char **splits = malloc((nCols+1)*sizeof(char *)); 107 | while(i00 defines breaks when reading maxCount 114 | //printf("%i, %i, %i, %s\n", i0, ig, nL, file_ids[ig]); 115 | gzFile fp; 116 | if ((fp = gzopen(file_ids[ig], "r")) == 0) 117 | return; 118 | nL = 0; 119 | if(ig==i0 && L0>0){ //pass L0 lines of a big file 120 | while(nL4) va = atol(splits[4]); 127 | igd_add(igd, splits[0], st, en, va, ig); 128 | nr[ig]++; 129 | avg[ig]+=en-st; 130 | nL++; 131 | if(igd->total>maxCount){ 132 | m = 1; 133 | i1 = ig; 134 | L1 = nL; //number of total lines or next line 135 | } 136 | } 137 | gzclose(fp); 138 | if(m==0) ig++; 139 | } 140 | //2.3 Save/append tiles to disc, add cnts tp Cnts 141 | 142 | igd_saveT(igd, oPath); 143 | i0 = ig; 144 | L0 = L1; 145 | L1 = 0; 146 | } 147 | //printf("igd_create 2\n"); 148 | 149 | //3. 
save _index.tsv: 4 columns--index, filename, nr, avg 150 | //Also has a header line: 151 | char idFile[128]; 152 | char *tchr; 153 | sprintf(idFile, "%s%s%s", oPath, igdName, "_index.tsv"); 154 | FILE *fpi = fopen(idFile, "w"); 155 | if(fpi==NULL) 156 | printf("Can't open file %s", idFile); 157 | fprintf(fpi, "Index\tFile\tNumber of regions\tAvg size\n"); 158 | for(i=0; i0 defines breaks when reading maxCount 258 | //printf("%i, %i, %i, %s\n", i0, ig, nL, file_ids[ig]); 259 | gzFile fp; 260 | if ((fp = gzopen(file_ids[ig], "r")) == 0) 261 | return; 262 | nL = 0; 263 | if(ig==i0 && L0>0){ //pass L0 lines of a big file 264 | while(nL4) va = atol(splits[4]); 271 | igd_add(igd, splits[0], st, en, va, ig); 272 | nr[ig]++; 273 | avg[ig]+=en-st; 274 | nL++; 275 | if(igd->total>maxCount){ 276 | m = 1; 277 | i1 = ig; 278 | L1 = nL; //number of total lines or next line 279 | } 280 | } 281 | gzclose(fp); 282 | if(m==0) ig++; 283 | } 284 | //2.3 Save/append tiles to disc, add cnts tp Cnts 285 | 286 | igd_saveT(igd, oPath); 287 | i0 = ig; 288 | L0 = L1; 289 | L1 = 0; 290 | } 291 | //printf("igd_create 2\n"); 292 | 293 | //3. save _index.tsv: 4 columns--index, filename, nr, avg 294 | //Also has a header line: 295 | char idFile[128]; 296 | char *tchr; 297 | sprintf(idFile, "%s%s%s", oPath, igdName, "_index.tsv"); 298 | FILE *fpi = fopen(idFile, "w"); 299 | if(fpi==NULL) 300 | printf("Can't open file %s", idFile); 301 | fprintf(fpi, "Index\tFile\tNumber of regions\tAvg size\n"); 302 | for(i=0; i [options]\n" 15 | " options:\n" 16 | " -q \n" 17 | " -r \n" 18 | " -v \n" 19 | " -o \n" 20 | " -c display all intersects\n", 21 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 22 | return exit_code; 23 | } 24 | 25 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits) 26 | { 27 | int ichr = get_id(iGD, chrm); 28 | if(ichr<0) 29 | return; 30 | int i, j, n1 = qs/iGD->nbp, n2 = (qe-1)/iGD->nbp; //define boundary! 
31 | int32_t tE, tS, tL, tR, tM, tmpi, tmpi1, mlen, mTile = iGD->nTile[ichr]-1; 32 | if(n1>mTile) 33 | return; 34 | n2 = MIN(n2, mTile); 35 | tmpi = iGD->nCnt[ichr][n1]; 36 | tmpi1 = tmpi-1; 37 | long rtn; 38 | if(tmpi>0){ 39 | if(n1!=iGD->preIdx || ichr!=iGD->preChr){ 40 | fseek(iGD->fP, iGD->tIdx[ichr][n1], SEEK_SET); 41 | free(iGD->gData); 42 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 43 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 44 | iGD->preIdx = n1; 45 | iGD->preChr = ichr; 46 | } 47 | if(qe>iGD->gData[0].start){ //sorted by start 48 | //find the 1st rs < qe 49 | tL = 0, tR=tmpi1; 50 | while(tLgData[tM].start < qe) //right side: 53 | tL = tM; 54 | else 55 | tR = tM; //left side 56 | } 57 | if(iGD->gData[tR].start=0; i--){ 60 | if(iGD->gData[i].end>qs){ 61 | hits[iGD->gData[i].idx]++; 62 | } 63 | } 64 | } 65 | if(n2>n1){ //n2>n1 66 | int32_t bd = iGD->nbp*(n1+1); //only keep the first 67 | for(j=n1+1; j<=n2; j++){ //n2 inclusive!!! 68 | tmpi = iGD->nCnt[ichr][j]; 69 | tmpi1 = tmpi-1; 70 | if(tmpi>0){ 71 | if(j!=iGD->preIdx || ichr!=iGD->preChr){ 72 | fseek(iGD->fP, iGD->tIdx[ichr][j], SEEK_SET); 73 | free(iGD->gData); 74 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 75 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 76 | iGD->preIdx = j; 77 | iGD->preChr = ichr; 78 | } 79 | if(qe>iGD->gData[0].start){ 80 | tS = 0; 81 | while(tSgData[tS].startgData[tM].start < qe) //right side: 86 | tL = tM; 87 | else 88 | tR = tM; //left side 89 | } 90 | if(iGD->gData[tR].start=tS; i--){ 93 | if(iGD->gData[i].end>qs){ 94 | hits[iGD->gData[i].idx]++; 95 | } 96 | } 97 | } 98 | } 99 | bd+=iGD->nbp; 100 | } 101 | } 102 | } 103 | } 104 | 105 | void get_overlaps32(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int32_t *hits) 106 | { 107 | //printf("%i\t%i\t%s\n", qs, qe, chrm); 108 | //printf("%i\t%i\t%i\t%i\n", hits[0], hits[1], hits[2], hits[3]); 109 | int ichr = get_id(iGD, chrm); 110 | if(ichr<0) 111 | return; 112 | int i, j, n1 = qs/iGD->nbp, n2 
= (qe-1)/iGD->nbp; //define boundary! 113 | int32_t tE, tS, tL, tR, tM, tmpi, tmpi1, mlen, mTile = iGD->nTile[ichr]-1; 114 | if(n1>mTile) 115 | return; 116 | n2 = MIN(n2, mTile); 117 | tmpi = iGD->nCnt[ichr][n1]; 118 | tmpi1 = tmpi-1; 119 | long rtn; 120 | if(tmpi>0){ 121 | if(n1!=iGD->preIdx || ichr!=iGD->preChr){ 122 | fseek(iGD->fP, iGD->tIdx[ichr][n1], SEEK_SET); 123 | free(iGD->gData); 124 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 125 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 126 | iGD->preIdx = n1; 127 | iGD->preChr = ichr; 128 | } 129 | if(qe>iGD->gData[0].start){ //sorted by start 130 | //find the 1st rs < qe 131 | tL = 0, tR=tmpi1; 132 | while(tLgData[tM].start < qe) //right side: 135 | tL = tM; 136 | else 137 | tR = tM; //left side 138 | } 139 | if(iGD->gData[tR].start=0; i--){ 142 | if(iGD->gData[i].end>qs){ 143 | hits[iGD->gData[i].idx]++; 144 | } 145 | } 146 | } 147 | if(n2>n1){ //n2>n1 148 | int32_t bd = iGD->nbp*(n1+1); //only keep the first 149 | for(j=n1+1; j<=n2; j++){ //n2 inclusive!!! 
150 | tmpi = iGD->nCnt[ichr][j]; 151 | tmpi1 = tmpi-1; 152 | if(tmpi>0){ 153 | if(j!=iGD->preIdx || ichr!=iGD->preChr){ 154 | fseek(iGD->fP, iGD->tIdx[ichr][j], SEEK_SET); 155 | free(iGD->gData); 156 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 157 | rtn = fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 158 | iGD->preIdx = j; 159 | iGD->preChr = ichr; 160 | } 161 | if(qe>iGD->gData[0].start){ 162 | tS = 0; 163 | while(tSgData[tS].startgData[tM].start < qe) //right side: 168 | tL = tM; 169 | else 170 | tR = tM; //left side 171 | } 172 | if(iGD->gData[tR].start=tS; i--){ 175 | if(iGD->gData[i].end>qs){ 176 | hits[iGD->gData[i].idx]++; 177 | } 178 | } 179 | } 180 | } 181 | bd+=iGD->nbp; 182 | } 183 | } 184 | } 185 | //printf("%i\t%i\t%i\t%i\n", hits[0], hits[1], hits[2], hits[3]); 186 | } 187 | 188 | 189 | void search_1(char **igdFile, char **qchr, int32_t *qs, int32_t *qe, int64_t *hits) 190 | { 191 | iGD_t *iGD = open_iGD(*igdFile); 192 | get_overlaps(iGD, *qchr, *qs, *qe, hits); 193 | close_iGD(iGD); 194 | } 195 | 196 | void getOverlaps(char **igdFile, char **qFile, int64_t *hits) 197 | { 198 | iGD_t *iGD = open_iGD(*igdFile); 199 | gzFile fp; 200 | kstream_t *ks; 201 | kstring_t str = {0,0,0}; 202 | if ((fp = gzopen(*qFile, "r")) == 0) 203 | return; 204 | ks = ks_init(fp); 205 | char *chrm; 206 | int32_t st, en, nl=0; 207 | while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) { 208 | chrm = parse_bed(str.s, &st, &en); 209 | if (chrm) { 210 | get_overlaps(iGD, chrm, st, en, hits); 211 | } 212 | } 213 | free(str.s); 214 | ks_destroy(ks); 215 | gzclose(fp); 216 | close_iGD(iGD); 217 | } 218 | 219 | //---Hash table for R 220 | int32_t hash(const char *key, int32_t htsize) 221 | { 222 | int32_t i=0, hash=0; 223 | while(key && key[i]){ 224 | hash = (hash+key[i])%htsize; 225 | i++; 226 | } 227 | return hash; 228 | } 229 | 230 | hTable *ht_init(int32_t htsize) 231 | { 232 | hTable *ht; 233 | if(htsize<1) 234 | return NULL; 235 | ht = malloc(sizeof(hTable)); 236 | 
ht->nodes = malloc(htsize*sizeof(htNode)); 237 | memset(ht->nodes, 0, htsize*sizeof(htNode)); 238 | ht->size = htsize; 239 | return ht; 240 | } 241 | 242 | int ht_put(hTable *ht, const char *key, int32_t value) 243 | { 244 | htNode *node = malloc(sizeof(htNode)); 245 | node->key = strdup(key); 246 | node->value = value; 247 | int32_t i = hash(key, value); 248 | htNode *tmp = ht->nodes[i];//linkList[i] 249 | if(tmp!=NULL){ 250 | while(tmp!=NULL){ 251 | if(strcmp(tmp->key, node->key)==0) 252 | break; 253 | tmp = tmp->next; 254 | } 255 | if(tmp==NULL){ //already filled 256 | node->next = ht->nodes[i]; 257 | ht->nodes[i] = node; 258 | } 259 | else{ //alrady exist 260 | tmp->value = node->value; 261 | free(node->key); 262 | free(node); 263 | } 264 | } 265 | else{ 266 | node->next = NULL; 267 | ht->nodes[i] = node; 268 | } 269 | } 270 | 271 | int32_t ht_get(hTable *ht, const char *key) 272 | { 273 | char *key1 = strdup(key); 274 | int32_t i = hash(key, ht->size); 275 | htNode *tmp = ht->nodes[i]; 276 | while(tmp!=NULL){ 277 | if(strcmp(tmp->key, key1)==0) 278 | break; 279 | tmp = tmp->next; 280 | } 281 | free(key1); 282 | if(tmp==NULL) 283 | return -1; 284 | return tmp->value; 285 | } 286 | 287 | void ht_free(hTable* ht) 288 | { 289 | htNode *tmp=NULL; 290 | if(ht==NULL)return; 291 | for(int i=0;isize;i++){ 292 | if(ht->nodes[i]!=NULL){ 293 | while(ht->nodes[i]!=NULL){ 294 | tmp = ht->nodes[i]->next; 295 | free(ht->nodes[i]->key); 296 | free(ht->nodes[i]); 297 | ht->nodes[i] = tmp; 298 | } 299 | free(ht->nodes[i]); 300 | } 301 | } 302 | free(ht->nodes); 303 | free(ht); 304 | } 305 | 306 | //--------------------------------------------------------------------------- 307 | SEXP search_1r(SEXP igdr, SEXP qchrm, SEXP qs, SEXP qe) 308 | { //NO need to supply output vector!!! 
309 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 310 | if(iGD==NULL) 311 | error("iGD_free: iGDr external pointer is NULL"); 312 | const char *chrm = CHAR(STRING_ELT(qchrm, 0)); 313 | SEXP hits; 314 | PROTECT(hits = allocVector(INTSXP, iGD->nFiles));//not initialized 315 | memset(INTEGER(hits), 0, iGD->nFiles * sizeof(int)); 316 | get_overlaps32(iGD, chrm, INTEGER(qs)[0], INTEGER(qe)[0], INTEGER(hits)); 317 | UNPROTECT(1); 318 | return(hits); 319 | } 320 | 321 | /*SEXP search_nr(SEXP igdr, SEXP n, SEXP qchrm, SEXP qs, SEXP qe) 322 | { //NO need to supply output vector!!! 323 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 324 | if(iGD==NULL) 325 | error("iGD_free: iGDr external pointer is NULL"); 326 | SEXP hits; 327 | const char *chrm; 328 | int32_t *tmp = calloc(iGD->nFiles, sizeof(int32_t)); 329 | for(int i=0; inFiles));//not initialized 334 | memcpy(INTEGER(hits), tmp, iGD->nFiles * sizeof(int32_t)); 335 | UNPROTECT(1); 336 | free(tmp); 337 | return(hits); 338 | }*/ 339 | 340 | SEXP search_nr(SEXP igdr, SEXP n, SEXP qchrm, SEXP qs, SEXP qe) 341 | { //NO need to supply output vector!!! 
342 | iGD_t *iGD = (iGD_t *) R_ExternalPtrAddr(igdr); 343 | if(iGD==NULL) 344 | error("iGD_free: iGDr external pointer is NULL"); 345 | SEXP hits; 346 | const char *chrm; 347 | PROTECT(hits = allocVector(INTSXP, iGD->nFiles));//not initialized 348 | memset(INTEGER(hits), 0, iGD->nFiles * sizeof(int)); 349 | for(int i=0; i=iGD->nTile[ichr0] || j<0){ 365 | printf("Max bin number is %i\n", iGD->nTile[ichr0]); 366 | return(R_NilValue); 367 | } 368 | int ncnt = iGD->nCnt[ichr0][j]; 369 | if(ncnt<1){ 370 | printf("No records in bin %i \n", j); 371 | return(R_NilValue); 372 | } 373 | SEXP starts = PROTECT(allocVector(INTSXP, ncnt)); 374 | SEXP ends = PROTECT(allocVector(INTSXP, ncnt)); 375 | SEXP idx = PROTECT(allocVector(INTSXP, ncnt)); 376 | //-------------------------------------------- 377 | gdata_t *gd = malloc(ncnt*sizeof(gdata_t)); 378 | fseek(iGD->fP, iGD->tIdx[ichr0][j], SEEK_SET); 379 | long rtn = fread(gd, sizeof(gdata_t)*ncnt, 1, iGD->fP); 380 | //-------------------------------------------- 381 | for(int i=0;i location in .igd file 37 | //gdata_t *gData; 38 | //info_t *finfo; 39 | //FILE *fP; 40 | //void *hc; 41 | } igdr_t; 42 | 43 | int32_t hash(const char *key, int32_t htsize); //hash function 44 | hTable *ht_init(int32_t htsize); //initialize 45 | int ht_put(hTable *ht, const char *key, int32_t value); 46 | int32_t ht_get(hTable *ht, const char *key); 47 | void ht_free(hTable* ht); 48 | 49 | //Single query 50 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits); 51 | 52 | //32bit 53 | void get_overlaps32(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int32_t *hits); 54 | 55 | void search_1(char **igdFile, char **qchr, int32_t *qs, int32_t *qe, int64_t *hits); 56 | 57 | //query file: call _r 58 | void getOverlaps(char **igdFile, char **qFile, int64_t *hits); 59 | 60 | #endif 61 | 62 | -------------------------------------------------------------------------------- /IGDr_0408/src/igd_search.o: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/IGDr_0408/src/igd_search.o -------------------------------------------------------------------------------- /IGDr_0408/src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 68 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 69 | 70 | #ifndef KSTRING_T 71 | #define KSTRING_T kstring_t 72 | typedef struct __kstring_t { 73 | unsigned l, m; 74 | char *s; 75 | } kstring_t; 76 | #endif 77 | 78 | #ifndef kroundup32 79 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 80 | #endif 81 | 82 | #define __KS_GETUNTIL(SCOPE, __read) \ 83 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 84 | { \ 85 | if (dret) *dret = 0; \ 86 | str->l = append? 
str->l : 0; \ 87 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 88 | for (;;) { \ 89 | int i; \ 90 | if (ks->begin >= ks->end) { \ 91 | if (!ks->is_eof) { \ 92 | ks->begin = 0; \ 93 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 94 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 95 | if (ks->end == 0) break; \ 96 | } else break; \ 97 | } \ 98 | if (delimiter == KS_SEP_LINE) { \ 99 | for (i = ks->begin; i < ks->end; ++i) \ 100 | if (ks->buf[i] == '\n') break; \ 101 | } else if (delimiter > KS_SEP_MAX) { \ 102 | for (i = ks->begin; i < ks->end; ++i) \ 103 | if (ks->buf[i] == delimiter) break; \ 104 | } else if (delimiter == KS_SEP_SPACE) { \ 105 | for (i = ks->begin; i < ks->end; ++i) \ 106 | if (isspace(ks->buf[i])) break; \ 107 | } else if (delimiter == KS_SEP_TAB) { \ 108 | for (i = ks->begin; i < ks->end; ++i) \ 109 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 110 | } else i = 0; /* never come to here! */ \ 111 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 112 | str->m = str->l + (i - ks->begin) + 1; \ 113 | kroundup32(str->m); \ 114 | str->s = (char*)realloc(str->s, str->m); \ 115 | } \ 116 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 117 | str->l = str->l + (i - ks->begin); \ 118 | ks->begin = i + 1; \ 119 | if (i < ks->end) { \ 120 | if (dret) *dret = ks->buf[i]; \ 121 | break; \ 122 | } \ 123 | } \ 124 | if (str->s == 0) { \ 125 | str->m = 1; \ 126 | str->s = (char*)calloc(1, 1); \ 127 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 128 | str->s[str->l] = '\0'; \ 129 | return str->l; \ 130 | } 131 | 132 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 133 | __KS_TYPE(type_t) \ 134 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 135 | __KS_GETUNTIL(SCOPE, __read) \ 136 | __KS_INLINED(__read) 137 | 138 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 139 | 140 | #define KSTREAM_DECLARE(type_t, __read) \ 141 | 
__KS_TYPE(type_t) \ 142 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 143 | extern kstream_t *ks_init(type_t f); \ 144 | extern void ks_destroy(kstream_t *ks); \ 145 | __KS_INLINED(__read) 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2020 Jianglin Feng and Nathan Sheffield 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS = -g -lz -lm -O2 2 | BIN = bin 3 | OBJ = obj 4 | VPATH = src 5 | LIB = igd_base.o igd_create.o igd_search.o igd.o 6 | OBJS = $(addprefix $(OBJ)/, $(LIB)) 7 | 8 | $(OBJ)/%.o: %.c 9 | cc -c $(CFLAGS) $< -o $@ 10 | 11 | igd_dev1: $(OBJS) 12 | cc -o $(BIN)/igd $(OBJS) $(CFLAGS) 13 | all: $(OBJS) 14 | 15 | $(OBJS): | $(OBJ) $(BIN) 16 | 17 | $(OBJ): 18 | mkdir -p $(OBJ) 19 | 20 | $(BIN): 21 | mkdir -p $(BIN) 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf $(BIN)/* 26 | rm -rf $(OBJ)/* 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IGD: A high-performance search engine for large-scale genomic interval datasets 2 | 3 | ## Summary 4 | 5 | Databases of large-scale genome projects now contain thousands of genomic interval datasets. These data are a critical resource for understanding the function of DNA. However, our ability to examine and integrate interval data of this scale is limited. Here, we introduce the integrated genome database (IGD), a method and tool for searching genome interval datasets more than three orders of magnitude faster than existing approaches, while using only one hundredth of the memory. IGD uses a novel linear binning method that allows us to scale analysis to billions of genomic regions. 6 | 7 | ## Citation 8 | 9 | If you use IGD in your research, please cite: 10 | 11 | 12 | Jianglin Feng, Nathan C Sheffield. IGD: high-performance search for large-scale genomic interval datasets. 
*Bioinformatics*, Volume 37, Issue 1, 1 January 2021, Pages 118–120, https://doi.org/10.1093/bioinformatics/btaa1062 13 | 14 | Preprint: https://www.biorxiv.org/content/10.1101/2020.06.08.139758v1 15 | 16 | 17 | ## How to build iGD 18 | 19 | If zlib (including its development headers) is not already installed, install it: 20 | ``` 21 | sudo apt-get install zlib1g-dev 22 | ``` 23 | Then: 24 | ``` 25 | git clone https://github.com/databio/iGD.git 26 | cd iGD 27 | make 28 | ``` 29 | The executable `igd` is built in the subfolder `bin`; copy it to `/usr/local/bin` to make it available on your PATH. 30 | 31 | ## How to run iGD 32 | 33 | ### 1. Create iGD database 34 | 35 | #### 1.1 Create iGD database from a genome data source folder 36 | ``` 37 | igd create "/path/to/data_source_folder/*" "/path/to/igd_folder/" "databaseName" [option] 38 | 39 | where: 40 | 41 | - "path/to/data_source_folder/" is the path of the folder that contains .bed.gz or .bed data files. 42 | 43 | - "path/to/igd_folder/" is the path to the output igd folder; 44 | 45 | - "databaseName" is the name you give to the database, for example, "roadmap" 46 | 47 | option: 48 | 49 | -b: bin-size (power of 2; default 14, which is 16384 bp) 50 | ``` 51 | #### 1.2 Create iGD database from a list of source files 52 | 53 | ``` 54 | igd create "/path/to/source-list file" "/path/to/igd_folder/" "databaseName" -f [option] 55 | 56 | where: 57 | 58 | - "/path/to/source-list file" is the path to the file that lists the source files 59 | 60 | - "path/to/igd_folder/" is the path to the output igd folder; 61 | 62 | - "databaseName" is the name you give to the database, for example, "roadmap" 63 | 64 | option: 65 | 66 | -b: bin-size (power of 2; default 14, which is 16384 bp) 67 | ``` 68 | 69 | 70 | ### 2.
Search iGD for overlaps 71 | ``` 72 | igd search "path/to/igd_data_file" -q "path/to/query_file" 73 | 74 | where: 75 | 76 | - path/to/igd_data_file is the path to the igd data 77 | 78 | - path/to/query_file is the path to the query file (.bed or .bed.gz) 79 | 80 | other options: 81 | 82 | -r (a single query) 83 | 84 | -v (signal value > v) 85 | 86 | -o 87 | 88 | -s (output Seqpare similarity) 89 | 90 | -f (output full overlaps, for -q and -r only) 91 | 92 | -m (hitsmap of igd datasets) 93 | 94 | ``` 95 | 96 | For a detailed example, please check out the `vignettes`. 97 | 98 | ## R-wrapper of IGD 99 | 100 | ### 1. Create iGD database 101 | #### 1.1 from a genome data source 102 | ``` 103 | > library(IGDr) 104 | > createIGD("/path/to/data_source_folder/*", "/path/to/igd_folder/", "databaseName", [option]) 105 | 106 | where: 107 | 108 | - "path/to/data_source_folder/" is the path of the folder that contains .bed.gz or .bed data files. 109 | 110 | - "path/to/igd_folder/" is the path to the output igd folder; 111 | 112 | - "databaseName" is the name you give to the database, for example, "roadmap" 113 | 114 | options: 115 | 116 | -b: bin size in bp (default 16384) 117 | ``` 118 | #### 1.2 from a file that contains the list of genome data source files 119 | ``` 120 | > library(IGDr) 121 | > createIGD_f("/path/to/source-list file", "/path/to/igd_folder/", "databaseName", [option]) 122 | 123 | where: 124 | 125 | - "/path/to/source-list file" is the path to the file that lists the .bed.gz or .bed data files. 126 | 127 | - "path/to/igd_folder/" is the path to the output igd folder; 128 | 129 | - "databaseName" is the name you give to the database, for example, "roadmap" 130 | 131 | options: 132 | 133 | -b: bin size in bp (default 16384) 134 | ``` 135 | 136 | ### 2.
Search the igd database in R (an example for a created igd file) 137 | 138 | Search the igd database with a single query: 139 | ``` 140 | > igd_file = "igdr_b14/roadmap.igd" 141 | > library(IGDr) 142 | > igd <- IGDr::IGDr(igd_file) 143 | > hits <- search_1r(igd, "chr6", 1000000, 10000000) 144 | > hits 145 | ``` 146 | Search the igd database with n queries: 147 | ``` 148 | > igd_file = "igdr_b14/roadmap.igd" 149 | > library(IGDr) 150 | > igd <- IGDr::IGDr(igd_file) 151 | > chrms = c("chr6", "chr1", "chr2") 152 | > starts = c(10000, 100000, 1000000) 153 | > ends = c(100000, 1000000, 10000000) 154 | > hits <- search_nr(igd, 3, chrms, starts, ends) 155 | > hits 156 | ``` 157 | Search the igd database with a whole query file (`r10000.bed` in this example): 158 | ``` 159 | > igd_file = "igdr_b14/roadmap.igd" 160 | > query_file = "r10000.bed" 161 | > library(bit64) 162 | > library(IGDr) 163 | > fi = IGDr::getFInfo(igd_file) 164 | > hits = integer64(fi$nFiles) 165 | > ret = IGDr::search_all(igd_file, query_file, hits) 166 | > for(i in 1:fi$nFiles){ 167 | cat(i, "\t", toString(ret[i]), "\t", toString(fi$fInfo[i,2]), "\n") 168 | } 169 | > 170 | ``` 171 | -------------------------------------------------------------------------------- /src/igd.c: -------------------------------------------------------------------------------- 1 | //===================================================================================== 2 | //Common igd struct, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | // 5 | //01/24/2019: Change definition of a region/interval to be half-open, 0-based 6 | // Implement AIList for in-bin search: an option 7 | // Test with new data 8 | //------------------------------------------------------------------------------------- 9 | #include "igd_base.h" 10 | #include "igd_create.h" 11 | #include "igd_search.h" 12 | 13 | int igd_help(int argc, char **argv, int exit_code); 14 | void *hc; //extern from igd_base.h 15 | iGD_t *IGD; 16 | gdata_t *gData = NULL; 17 | gdata0_t *gData0 = NULL; 18 |
int32_t preIdx, preChr, tile_size; 19 | FILE *fP; 20 | 21 | int main(int argc, char **argv) 22 | { 23 | if (argc < 2) return igd_help(argc, argv, 0); 24 | char *cmd = argv[1]; 25 | 26 | if (strcmp(cmd, "create") == 0){ 27 | return igd_create(argc, argv); 28 | } 29 | else if (strcmp(cmd, "search") == 0){ 30 | return igd_search(argc, argv); 31 | if(gData!=NULL)free(gData); 32 | if(fP!=NULL)fclose(fP); 33 | } 34 | else { 35 | fprintf(stderr, "Unknown command\n"); 36 | return igd_help(argc, argv, EX_USAGE); 37 | } 38 | } 39 | 40 | int igd_help(int argc, char **argv, int exit_code) 41 | { 42 | fprintf(stderr, 43 | "%s, v%s\n" 44 | "usage: %s [options]\n" 45 | " create Create an igd database\n" 46 | " search Search an igd database\n", 47 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 48 | return exit_code; 49 | } 50 | -------------------------------------------------------------------------------- /src/igd_base.h: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Common structs, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //re-designed 7/1/2019 5 | //database intervals sorted by _start: 8/12/2019 6 | //----------------------------------------------------------------------------------- 7 | #ifndef __IGD_BASE_H__ 8 | #define __IGD_BASE_H__ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "khash.h" 27 | #include "kseq.h" 28 | 29 | #define PROGRAM_NAME "igd" 30 | #define MAJOR_VERSION "0" 31 | #define MINOR_VERSION "1" 32 | #define REVISION_VERSION "1" 33 | #define BUILD_VERSION "0" 34 | #define VERSION MAJOR_VERSION "." MINOR_VERSION "." REVISION_VERSION 35 | #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) 36 | #define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) 37 | #define maxCount 268435456 //16* = 4GB memory 38 | #define MAXC 10 //max number of components 39 | 40 | //----------------------------------------------------------------------------------- 41 | typedef struct{ //default 42 | int32_t idx; //genomic object--data set index 43 | int32_t start; //region start 44 | int32_t end; //region end 45 | int32_t value; 46 | } gdata_t; 47 | 48 | typedef struct{ //default 49 | int32_t idx; //genomic object--data set index 50 | int32_t start; //region start 51 | int32_t end; //region end 52 | } gdata0_t; 53 | 54 | typedef struct{ 55 | char* fileName; //dataset file 56 | int32_t nr; //number regions/dataset 57 | double md; //average width of the regions 58 | } info_t; 59 | 60 | typedef struct{ 61 | int32_t ncnts, nCnts, mcnts;//batch counts, total, max 62 | gdata_t *gList; 63 | } tile_t; 64 | 65 | typedef struct{ 66 | int32_t ncnts, nCnts, mcnts;//batch counts, total, max 67 | gdata0_t *gList; 68 | } tile0_t; 69 | 70 | typedef struct{ 71 | char *name; //name of the contig 72 | int32_t mTiles; //determined by the interval start and end 73 | tile_t *gTile; //tile data 74 | } ctg_t; 75 | 76 | typedef struct{ 77 | char *name; //name of the contig 78 | int32_t mTiles; //determined by the interval start and end 79 | tile0_t *gTile; //tile data 80 | } ctg0_t; 81 | 82 | 83 | 84 | typedef struct{ 85 | int32_t nbp, gType, nctg, mctg; // number of base pairs, data type: 0, 1, 2 etc; size differs 86 | int64_t total; //total region in each ctg 87 | ctg_t *ctg; //list of contigs (of size _n_ctg_) 88 | } igd_t; 89 | 90 | typedef struct{ 91 | int32_t nbp, gType, nctg, mctg; // number of base pairs, data type: 0, 1, 2 etc; size differs 92 | int64_t total; //total region in each ctg 93 | ctg0_t *ctg; //list of contigs (of size _n_ctg_) 94 | } igd0_t; 95 | 96 | typedef struct{ //for retrieving from disk file 97 | int32_t nFiles; 98 | info_t *finfo; 99 | char fname[64]; 100 | int32_t nbp, gType, nCtg; //data type: 0, 1, 2 etc; size 
differs 101 | char **cName; //name of ctgs 102 | int32_t *nTile; //num of tiles in each ctg 103 | int32_t **nCnt; //num of counts in each tile 104 | int64_t **tIdx; //tile index *sizeof -> location in .igd file 105 | } iGD_t; 106 | 107 | //--------------------------------------------------------------------------------- 108 | //for seqpare index 109 | typedef struct{ //for each query: query set search 110 | int32_t idx_t; //tile index 111 | int32_t idx_g; //gdata index: 64bit? 112 | int32_t idx_f; //from gdata 113 | float sm; //similarity 114 | } overlap_t; 115 | 116 | typedef struct{ //save space 117 | int32_t nn, mm; //number of regions 118 | overlap_t *olist; //regions data 119 | } overlaps_t; 120 | 121 | typedef struct{ 122 | char *name; //name of the contig 123 | int64_t nr, mr; //number of regions 124 | gdata0_t *glist; //regions data 125 | } chrom_t; 126 | 127 | typedef struct { 128 | chrom_t *ctg; // list of contigs (of size _n_ctg_) 129 | int32_t nctg, mctg; // number and max number of contigs 130 | void *hc; // dict for converting contig names to int 131 | } ailist_t; 132 | 133 | //--------------------------------------------------------------------------------- 134 | //---------Globals----------------------------------------------------------------- 135 | extern void *hc; //dict for converting contig names to int 136 | extern iGD_t *IGD; 137 | extern gdata_t *gData; 138 | extern gdata0_t *gData0; 139 | extern int32_t preIdx, preChr, tile_size; 140 | extern FILE *fP; 141 | //--------------------------------------------------------------------------------- 142 | //Parse a line of BED file 143 | void str_splits( char* str, int *nmax, char **splits); 144 | char *parse_bed(char *s, int32_t *st_, int32_t *en_); 145 | 146 | int compare_rstart(const void *a, const void *b); 147 | int compare_qstart(const void *a, const void *b);//gdata0 148 | int compare_fidx(const void *a, const void *b); 149 | 150 | //Binary search 151 | int32_t bSearch(gdata_t *gdata, int32_t 
t0, int32_t tc, int32_t qe); 152 | int32_t bSearch0(gdata0_t *gdata, int32_t t0, int32_t tc, int32_t qe); 153 | 154 | //Add an interval 155 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx); 156 | void igd0_add(igd0_t *igd, const char *chrm, int32_t s, int32_t e, int32_t idx); 157 | 158 | //Get id from igd dict 159 | int32_t get_id(const char *chrm); 160 | 161 | //Get file info from .tsv 162 | info_t *get_fileinfo(char *ifName, int32_t *nFiles); 163 | 164 | //Get igd info from .igd 165 | iGD_t *get_igdinfo(char *igdFile); 166 | 167 | //Initialize igd_t 168 | igd_t *igd_init(void); 169 | igd0_t *igd0_init(void); 170 | 171 | //Save tile data 172 | void igd_saveT(igd_t *igd, char *oPath); 173 | void igd0_saveT(igd0_t *igd, char *oPath); 174 | 175 | //Sort and save igd 176 | void igd_save(igd_t *igd, char *oPath, char *igdName); 177 | void igd0_save(igd0_t *igd, char *oPath, char *igdName); 178 | 179 | //Free ailist data 180 | void igd_destroy(igd_t *igd); 181 | void igd0_destroy(igd0_t *igd); 182 | 183 | //From AIList: 184 | ailist_t *ailist_init(void); 185 | void ailist_destroy(ailist_t *ail); 186 | void ailist_add(ailist_t *ail, const char *chr, uint32_t s, uint32_t e, int32_t v); 187 | ailist_t* readBED(const char* fn); 188 | //--------------------------------------------------------------------------------- 189 | //The following section taken from Dr Heng Li's cgranges 190 | // (https://github.com/lh3/cgranges) 191 | 192 | KSTREAM_INIT(gzFile, gzread, 0x10000) 193 | /************** 194 | * Radix sort * 195 | **************/ 196 | #define RS_MIN_SIZE 64 197 | #define RS_MAX_BITS 8 198 | 199 | #define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ 200 | typedef struct { \ 201 | rstype_t *b, *e; \ 202 | } rsbucket_##name##_t; \ 203 | void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ 204 | { \ 205 | rstype_t *i; \ 206 | for (i = beg + 1; i < end; ++i) \ 207 | if (rskey(*i) < rskey(*(i - 1))) { \ 208 | rstype_t 
*j, tmp = *i; \ 209 | for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ 210 | *j = *(j - 1); \ 211 | *j = tmp; \ 212 | } \ 213 | } \ 214 | void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ 215 | { \ 216 | rstype_t *i; \ 217 | int size = 1<b = k->e = beg; \ 221 | for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ 222 | for (k = b + 1; k != be; ++k) \ 223 | k->e += (k-1)->e - beg, k->b = (k-1)->e; \ 224 | for (k = b; k != be;) { \ 225 | if (k->b != k->e) { \ 226 | rsbucket_##name##_t *l; \ 227 | if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ 228 | rstype_t tmp = *k->b, swap; \ 229 | do { \ 230 | swap = tmp; tmp = *l->b; *l->b++ = swap; \ 231 | l = b + (rskey(tmp)>>s&m); \ 232 | } while (l != k); \ 233 | *k->b++ = tmp; \ 234 | } else ++k->b; \ 235 | } else ++k; \ 236 | } \ 237 | for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ 238 | if (s) { \ 239 | s = s > n_bits? s - n_bits : 0; \ 240 | for (k = b; k != be; ++k) \ 241 | if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ 242 | else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ 243 | } \ 244 | } \ 245 | void radix_sort_##name(rstype_t *beg, rstype_t *end) \ 246 | { \ 247 | if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ 248 | else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \ 249 | } 250 | 251 | /********************* 252 | * Convenient macros * 253 | *********************/ 254 | 255 | #ifndef kroundup32 256 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 257 | #endif 258 | 259 | #define CALLOC(type, len) ((type*)calloc((len), sizeof(type))) 260 | #define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) 261 | 262 | #define EXPAND(a, m) do { \ 263 | (m) = (m)? 
(m) + (2+(m)/8) : 16; \ 264 | REALLOC((a), (m)); \ 265 | }while (0) 266 | 267 | #endif 268 | -------------------------------------------------------------------------------- /src/igd_create.h: -------------------------------------------------------------------------------- 1 | #ifndef __IGD_CREATE_H__ 2 | #define __IGD_CREATE_H__ 3 | 4 | //===================================================================================== 5 | //Create igd database 6 | //by Jianglin Feng 05/12/2018 7 | //------------------------------------------------------------------------------------- 8 | #include "igd_base.h" 9 | 10 | //create igd from .bed.gz files 11 | void create_igd(char *iPath, char *oPath, char *igdName); 12 | void create_igd0(char *iPath, char *oPath, char *igdName); 13 | void create_igd_f(char *iPath, char *oPath, char *igdName);//create from a file list 14 | void create_igd_bed4(char *iPath, char *oPath, char *igdName); 15 | 16 | //main 17 | int igd_create(int argc, char **argv); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /src/igd_search.h: -------------------------------------------------------------------------------- 1 | #ifndef __IGD_SEARCH_H__ 2 | #define __IGD_SEARCH_H__ 3 | 4 | //===================================================================================== 5 | //Search the igd database for all overlaps with queries 6 | //by Jianglin Feng 05/12/2018 7 | // 8 | //time ./igd_search Test110000.bed /media/john/Extra/ucsc_igd/ucsc.igd 9 | //database intervals sorted by _start: 8/12/2019 10 | //Reorganized: 11/06/2019 11 | //------------------------------------------------------------------------------------- 12 | #include "igd_base.h" 13 | //------------------------------------------------------------------------------------- 14 | //for gdata0_t only----- 15 | int32_t get_overlaps0(char *chrm, int32_t qs, int32_t qe, int64_t *hits); 16 | int64_t getOverlaps0(char *qFile, int64_t *hits); 17 | 18 | 
//for seqpare 19 | void seq_overlaps(char *chrm, int32_t qs, int32_t qe, overlaps_t *olp); 20 | void seqOverlaps(char *qFile, double *sm); 21 | 22 | //Single query 23 | int32_t get_overlaps(char *chrm, int32_t qs, int32_t qe, int64_t *hits); 24 | int32_t get_overlaps_v(char *chrm, int32_t qs, int32_t qe, int32_t v, int64_t *hits); 25 | 26 | //query file: call _r 27 | int64_t getOverlaps(char *qFile, int64_t *hits); 28 | int64_t getOverlaps_v(char *qFile, int64_t *hits, int32_t v); 29 | 30 | //mapping 31 | int64_t getMap(uint32_t **hitmap); 32 | int64_t getMap_v(uint32_t **hitmap, int32_t v); 33 | 34 | //full output 35 | int32_t get_overlaps_f0(char *chrm, int32_t qs, int32_t qe); 36 | int32_t get_overlaps_f1(char *chrm, int32_t qs, int32_t qe); 37 | int64_t getOverlaps_f0(char *qFile); 38 | int64_t getOverlaps_f1(char *qFile); 39 | 40 | //search main 41 | int igd_search(int argc, char **argv); 42 | 43 | #endif 44 | 45 | -------------------------------------------------------------------------------- /src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' ' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 68 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 69 | 70 | #ifndef KSTRING_T 71 | #define KSTRING_T kstring_t 72 | typedef struct __kstring_t { 73 | unsigned l, m; 74 | char *s; 75 | } kstring_t; 76 | #endif 77 | 78 | #ifndef 
kroundup32 79 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 80 | #endif 81 | 82 | #define __KS_GETUNTIL(SCOPE, __read) \ 83 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 84 | { \ 85 | if (dret) *dret = 0; \ 86 | str->l = append? str->l : 0; \ 87 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 88 | for (;;) { \ 89 | int i; \ 90 | if (ks->begin >= ks->end) { \ 91 | if (!ks->is_eof) { \ 92 | ks->begin = 0; \ 93 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 94 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 95 | if (ks->end == 0) break; \ 96 | } else break; \ 97 | } \ 98 | if (delimiter == KS_SEP_LINE) { \ 99 | for (i = ks->begin; i < ks->end; ++i) \ 100 | if (ks->buf[i] == '\n') break; \ 101 | } else if (delimiter > KS_SEP_MAX) { \ 102 | for (i = ks->begin; i < ks->end; ++i) \ 103 | if (ks->buf[i] == delimiter) break; \ 104 | } else if (delimiter == KS_SEP_SPACE) { \ 105 | for (i = ks->begin; i < ks->end; ++i) \ 106 | if (isspace(ks->buf[i])) break; \ 107 | } else if (delimiter == KS_SEP_TAB) { \ 108 | for (i = ks->begin; i < ks->end; ++i) \ 109 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 110 | } else i = 0; /* never come to here! 
*/ \ 111 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 112 | str->m = str->l + (i - ks->begin) + 1; \ 113 | kroundup32(str->m); \ 114 | str->s = (char*)realloc(str->s, str->m); \ 115 | } \ 116 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 117 | str->l = str->l + (i - ks->begin); \ 118 | ks->begin = i + 1; \ 119 | if (i < ks->end) { \ 120 | if (dret) *dret = ks->buf[i]; \ 121 | break; \ 122 | } \ 123 | } \ 124 | if (str->s == 0) { \ 125 | str->m = 1; \ 126 | str->s = (char*)calloc(1, 1); \ 127 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 128 | str->s[str->l] = '\0'; \ 129 | return str->l; \ 130 | } 131 | 132 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 133 | __KS_TYPE(type_t) \ 134 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 135 | __KS_GETUNTIL(SCOPE, __read) \ 136 | __KS_INLINED(__read) 137 | 138 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 139 | 140 | #define KSTREAM_DECLARE(type_t, __read) \ 141 | __KS_TYPE(type_t) \ 142 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 143 | extern kstream_t *ks_init(type_t f); \ 144 | extern void ks_destroy(kstream_t *ks); \ 145 | __KS_INLINED(__read) 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src_py/igd_base.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Common igd struct, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_base.h" 7 | #define gdata_t_key(r) ((r).start) 8 | KRADIX_SORT_INIT(intv, gdata_t, gdata_t_key, 4) 9 | KHASH_MAP_INIT_STR(str, int32_t) 10 | typedef khash_t(str) 
strhash_t; 11 | 12 | void str_splits( char* str, int *nmax, char **splits) 13 | { //tsv 14 | splits[*nmax] = NULL; 15 | splits[0] = str; 16 | char *ch = str; 17 | int ns = 1; 18 | do { 19 | if (*ch == '\t'){ 20 | splits[ns++] = &ch[1]; 21 | *ch = '\0'; 22 | } 23 | ch++; 24 | } while (*ch != '\0' && ns < *nmax+1); 25 | *nmax = ns; 26 | } 27 | 28 | char *parse_bed(char *s, int32_t *st_, int32_t *en_) 29 | { 30 | char *p, *q, *ctg = 0; 31 | int32_t i, st = -1, en = -1; 32 | for (i = 0, p = q = s;; ++q) { 33 | if (*q == '\t' || *q == '\0') { 34 | int c = *q; 35 | *q = 0; 36 | if (i == 0) ctg = p; 37 | else if (i == 1) st = atol(p); 38 | else if (i == 2) en = atol(p); 39 | ++i, p = q + 1; 40 | if (c == '\0') break; 41 | } 42 | } 43 | *st_ = st, *en_ = en; 44 | return i >= 3? ctg : 0; 45 | } 46 | 47 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe) 48 | { //find tE: index of the last item satisfying .start < qe from right 49 | //assuming gdata sorted by start 50 | int32_t tL=t0, tR=tc, tM, tE = -1; 51 | if(gdata[tR].start < qe) 52 | return tR; 53 | else if(gdata[tL].start >= qe) 54 | return -1; 55 | while(tL= qe) 58 | tR = tM-1; 59 | else 60 | tL = tM; 61 | } 62 | if(gdata[tR].start < qe) 63 | tE = tR; 64 | else if(gdata[tL].start < qe) 65 | tE = tL; 66 | return tE; 67 | } 68 | 69 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx) 70 | { //layers: igd->ctg->gTile->gdata(list) 71 | if(s >= e)return; 72 | int absent; 73 | khint_t k; 74 | strhash_t *h = (strhash_t*)igd->hc; 75 | k = kh_put(str, h, chrm, &absent); 76 | int32_t n1 = s/igd->nbp; 77 | int32_t n2 = (e-1)/igd->nbp; 78 | if (absent) { 79 | //printf("%s %i %i %i\n", chrm, n1, n2, k); 80 | //igd 81 | if (igd->nctg == igd->mctg) 82 | EXPAND(igd->ctg, igd->mctg); 83 | kh_val(h, k) = igd->nctg; 84 | //ctg: initialize 85 | ctg_t *p = &igd->ctg[igd->nctg++]; 86 | p->name = strdup(chrm); 87 | p->mTiles= 1 + n2; 88 | p->gTile = malloc(p->mTiles*sizeof(tile_t)); 89 
| kh_key(h, k) = p->name; 90 | //tile: initialize 91 | for(int i=0;imTiles;i++){ 92 | tile_t *tile = &p->gTile[i]; 93 | tile->ncnts = 0; //each batch 94 | tile->nCnts = 0; //total 95 | tile->mcnts = 4; 96 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 97 | } 98 | } 99 | int32_t kk = kh_val(h, k); 100 | ctg_t *p = &igd->ctg[kk]; 101 | if (n2+1>=p->mTiles){ 102 | int32_t tt = p->mTiles; 103 | p->mTiles = n2+1; 104 | p->gTile = realloc(p->gTile, p->mTiles*sizeof(tile_t)); 105 | //initialize new tiles 106 | for(int i=tt;imTiles;i++){ 107 | tile_t *tile = &p->gTile[i]; 108 | tile->ncnts = 0; //each batch 109 | tile->nCnts = 0; //total 110 | tile->mcnts = 16; 111 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 112 | } 113 | } 114 | //add data elements 115 | for(int i=n1;i<=n2;i++){ 116 | tile_t *tile = &p->gTile[i]; 117 | if(tile->ncnts == tile->mcnts) 118 | EXPAND(tile->gList, tile->mcnts); 119 | gdata_t *gdata = &tile->gList[tile->ncnts++]; 120 | gdata->start = s; 121 | gdata->end = e; 122 | gdata->value = v; 123 | gdata->idx = idx; 124 | igd->total++; 125 | } 126 | return; 127 | } 128 | 129 | info_t* get_fileinfo(char *ifName, int32_t *nFiles) 130 | { //read head file __index.tsv to get info 131 | FILE *fp = fopen(ifName, "r"); 132 | if(fp==NULL){ 133 | printf("file not found:%s\n", ifName); 134 | return NULL; 135 | } 136 | char buf[1024], *s0, *s1, *s2, *s3; 137 | int nfiles=0; 138 | fgets(buf, 1024, fp); 139 | while(fgets(buf, 1024, fp)!=NULL) 140 | nfiles++; 141 | 142 | info_t *fi = (info_t*)malloc(nfiles*sizeof(info_t)); 143 | fseek(fp, 0, SEEK_SET); 144 | int i=0; 145 | fgets(buf, 1024, fp); //header 146 | while(fgets(buf, 1024, fp)!=NULL){ 147 | s0 = strtok(buf, "\t"); 148 | s1 = strtok(NULL, "\t"); 149 | fi[i].fileName = strdup(s1); 150 | s2 = strtok(NULL, "\t"); 151 | fi[i].nr = atol(s2); 152 | //s3 = strtok(NULL, "\t"); 153 | //fi[i].md = (double)atol(s3); 154 | i++; 155 | } 156 | *nFiles = (int32_t)nfiles; 157 | fclose(fp); 158 | return fi; 159 | 
} 160 | 161 | void open_iGD(iGD_t *iGD, char *igdFile) 162 | { 163 | char tmp[128]; 164 | strcpy(tmp, igdFile); 165 | tmp[strrchr(tmp, '.')-tmp] = '\0'; 166 | strcpy(iGD->fname, tmp); 167 | char *idFile = tmp; //str_split(tmp, '.', &nCols)[0]; 168 | strcat(idFile, "_index.tsv"); 169 | iGD->finfo = get_fileinfo(idFile, &iGD->nFiles); 170 | 171 | FILE *fp = fopen(igdFile, "rb"); 172 | if(fp == NULL) 173 | printf("Can't open file %s", igdFile); 174 | fread(&iGD->nbp, sizeof(int32_t), 1, fp); 175 | fread(&iGD->gType, sizeof(int32_t), 1, fp); 176 | fread(&iGD->nCtg, sizeof(int32_t), 1, fp); 177 | int i, k; 178 | int32_t gdsize; 179 | gdsize = sizeof(gdata_t); 180 | int32_t tileS, m = iGD->nCtg; //the idx of a tile in the chrom 181 | //------------------------------------------ 182 | iGD->nTile = malloc(m*sizeof(int32_t)); 183 | fread(iGD->nTile, sizeof(int32_t)*m, 1, fp); 184 | int64_t chr_loc = 12 + 44*m; //header size in bytes 185 | for(i=0;inTile[i]*4; 186 | //------------------------------------------ 187 | iGD->nCnt = malloc(m*sizeof(int32_t*)); 188 | iGD->tIdx = malloc(m*sizeof(int64_t*)); 189 | for(i=0;inTile[i]; 191 | iGD->nCnt[i] = calloc(k, sizeof(int32_t)); 192 | fread(iGD->nCnt[i], sizeof(int32_t)*k, 1, fp); 193 | //-------------------------------------- 194 | iGD->tIdx[i] = calloc(k, sizeof(int64_t)); 195 | iGD->tIdx[i][0] = chr_loc; 196 | for(int j=1; jtIdx[i][j] = iGD->tIdx[i][j-1]+iGD->nCnt[i][j-1]*gdsize; 198 | chr_loc = iGD->tIdx[i][k-1]+iGD->nCnt[i][k-1]*gdsize; 199 | } 200 | 201 | iGD->cName = malloc(m*sizeof(char*)); 202 | for(i=0;icName[i] = malloc(40*sizeof(char)); 204 | fread(iGD->cName[i], 40, 1, fp); 205 | } 206 | iGD->fP = fp; 207 | 208 | //setup hc 209 | iGD->hc = kh_init(str); 210 | int absent; 211 | for(i=0;inCtg;i++){ 212 | khint_t k; 213 | strhash_t *h = (strhash_t*)iGD->hc; 214 | k = kh_put(str, h, iGD->cName[i], &absent); 215 | kh_val(h, k) = i; 216 | kh_key(h, k) = iGD->cName[i]; 217 | } 218 | iGD->gData = malloc(1*sizeof(gdata_t)); 
219 | iGD->preIdx = -1; 220 | iGD->preChr = -1; 221 | return iGD; 222 | } 223 | 224 | int32_t get_id(iGD_t *iGD, const char *chrm) 225 | { //for search 226 | khint_t k; 227 | strhash_t *h = (strhash_t*)iGD->hc; 228 | k = kh_get(str, h, chrm); 229 | return k == kh_end(h)? -1 : kh_val(h, k); 230 | } 231 | 232 | int32_t get_nFiles(iGD_t *iGD) 233 | { 234 | return iGD->nFiles; 235 | } 236 | 237 | void igd_saveT(igd_t *igd, char *oPath) 238 | { //Save/append tiles to disc, add cnts tp Cnts 239 | char idFile[128]; 240 | for (int i = 0; i < igd->nctg; i++){ 241 | ctg_t *ctg = &igd->ctg[i]; 242 | for(int j=0; j< ctg->mTiles; j++){ 243 | tile_t *tile = &ctg->gTile[j]; 244 | //--------------------------------------- 245 | if(tile->ncnts>0){ 246 | sprintf(idFile, "%s%s%s_%i", oPath, "data0/", ctg->name, j); 247 | FILE *fp = fopen(idFile, "ab"); 248 | if(fp==NULL) 249 | printf("Can't open file %s", idFile); 250 | fwrite(tile->gList, sizeof(gdata_t), tile->ncnts, fp); 251 | fclose(fp); 252 | } 253 | tile->nCnts += tile->ncnts; 254 | tile->ncnts = 0; 255 | free(tile->gList); 256 | tile->mcnts = 16;//MAX(16, tile->mcnts/16); 257 | tile->gList = malloc(tile->mcnts*sizeof(gdata_t)); 258 | //tile->gList = realloc(tile->gList, tile->mcnts*sizeof(gdata_t));? 259 | } 260 | } 261 | igd->total = 0; //batch total 262 | } 263 | 264 | void igd_save(igd_t *igd, char *oPath, char *igdName) 265 | { 266 | char idFile[128], iname[128]; 267 | //1. 
Save iGD data info: ctg string length 40 268 | int32_t i, j, n, m = igd->nctg; 269 | sprintf(idFile, "%s%s%s", oPath, igdName, ".igd"); 270 | FILE *fp = fopen(idFile, "wb"); 271 | if(fp==NULL) 272 | printf("Can't open file %s", idFile); 273 | fwrite(&igd->nbp, sizeof(int32_t), 1, fp); //4 bytes 274 | fwrite(&igd->gType, sizeof(int32_t), 1, fp); //4 275 | fwrite(&m, sizeof(int32_t), 1, fp); //4 276 | //----------------- 277 | for(i=0;ictg[i].mTiles, sizeof(int32_t), 1, fp); 279 | for(i=0;ictg[i]; 281 | n = p->mTiles; 282 | for(j=0;jgTile[j].nCnts, sizeof(int32_t), 1, fp); 284 | } 285 | //write string array 286 | for(i=0;ictg[i].name, 40, 1, fp); 288 | 289 | //2. Sort and save tiles data 290 | for(i=0;ictg[i]; 292 | n = p->mTiles; 293 | for(j=0;jgTile[j]; 295 | int32_t nrec = q->nCnts, gdsize; 296 | if(nrec>0){ 297 | sprintf(iname, "%s%s%s_%i", oPath, "data0/", p->name, j); 298 | FILE *fp0 = fopen(iname, "rb"); 299 | if(fp0 == NULL) 300 | printf("Can't open file %s", iname); 301 | gdsize = nrec*sizeof(gdata_t); 302 | gdata_t *gdata = malloc(gdsize); 303 | fread(gdata, gdsize, 1, fp0); 304 | fclose(fp0); 305 | radix_sort_intv(gdata, gdata+nrec); 306 | fwrite(gdata, gdsize, 1, fp); 307 | free(gdata); 308 | remove(iname); 309 | } 310 | } 311 | } 312 | fclose(fp); 313 | } 314 | 315 | igd_t *igd_init(int tile_size) 316 | { 317 | igd_t *igd = malloc(1*sizeof(igd_t)); 318 | igd->gType = 1; 319 | igd->nbp = tile_size; 320 | igd->hc = kh_init(str); 321 | igd->nctg = 0; 322 | igd->mctg = 32; 323 | igd->ctg = malloc(igd->mctg*sizeof(ctg_t)); 324 | igd->total = 0; 325 | return igd; 326 | } 327 | 328 | void igd_destroy(igd_t *igd) 329 | { 330 | if (igd == 0) return; 331 | for (int i = 0; i < igd->nctg; ++i){ 332 | free(igd->ctg[i].name); 333 | for(int j=0; j< igd->ctg[i].mTiles; j++) 334 | free(igd->ctg[i].gTile[j].gList); 335 | } 336 | free(igd->ctg); 337 | kh_destroy(str, (strhash_t*)igd->hc); 338 | free(igd); 339 | } 340 | 341 | iGD_t *iGD_init() 342 | { 343 | iGD_t *iGD = 
malloc(1*sizeof(iGD_t)); 344 | iGD->nbp = 16384; 345 | iGD->gType = 1; 346 | iGD->nCtg = 24; 347 | return iGD; 348 | } 349 | 350 | void close_iGD(iGD_t *iGD) 351 | { 352 | if(iGD==0) return; 353 | fclose(iGD->fP); 354 | free(iGD->gData); 355 | free(iGD->nTile); 356 | kh_destroy(str, (strhash_t*)iGD->hc); 357 | for(int i=0;inCtg;i++){ 358 | free(iGD->nCnt[i]); 359 | free(iGD->tIdx[i]); 360 | } 361 | free(iGD->nCnt); 362 | free(iGD->tIdx); 363 | free(iGD->cName); 364 | free(iGD->finfo); 365 | free(iGD); 366 | } 367 | 368 | -------------------------------------------------------------------------------- /src_py/igd_base.h: -------------------------------------------------------------------------------- 1 | //================================================================================= 2 | //Common structs, parameters, functions 3 | //by Jianglin Feng 05/12/2018 4 | //re-designed 7/1/2019 5 | //database intervals sorted by _start: 8/12/2019 6 | //--------------------------------------------------------------------------------- 7 | #ifndef __IGD_BASE_H__ 8 | #define __IGD_BASE_H__ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "khash.h" 26 | #include "kseq.h" 27 | 28 | #define PROGRAM_NAME "igd" 29 | #define MAJOR_VERSION "0" 30 | #define MINOR_VERSION "1" 31 | #define REVISION_VERSION "1" 32 | #define BUILD_VERSION "0" 33 | #define VERSION MAJOR_VERSION "." MINOR_VERSION "." REVISION_VERSION 34 | #define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) 35 | #define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) 36 | #define maxCount 268435456 //16* = 4GB memory 37 | #define MAXC 10 //max number of components 38 | //--------------------------------------------------------------------------------- 39 | typedef struct{ //default 40 | int32_t idx; //genomic object--data set index 41 | int32_t start; //region start 42 | int32_t end; //region end 43 | int32_t value; 44 | } gdata_t; 45 | 46 | typedef struct{ 47 | char* fileName; //dataset file 48 | int32_t nr; //number regions/dataset 49 | double md; //average width of the regions 50 | } info_t; 51 | 52 | typedef struct{ 53 | int32_t ncnts, nCnts, mcnts; //batch counts, total, max 54 | gdata_t *gList; 55 | } tile_t; 56 | 57 | typedef struct{ 58 | char *name; //name of the contig 59 | int32_t mTiles; //determined by the interval start and end 60 | tile_t *gTile; //tile data 61 | } ctg_t; 62 | 63 | typedef struct{ 64 | int32_t nbp, gType, nctg, mctg; // number of base pairs, data type: 0, 1, 2 etc; size differs 65 | int64_t total; //total region in each ctg 66 | ctg_t *ctg; //list of contigs (of size _n_ctg_) 67 | void *hc; //dict for converting contig names to int 68 | } igd_t; //For creation: internal... 69 | 70 | typedef struct{ //For search: external... 
71 | gdata_t *gData; 72 | int32_t preIdx, preChr; 73 | FILE *fP; 74 | void *hc; 75 | //-------------------------------- 76 | int32_t nFiles; 77 | info_t *finfo; 78 | char fname[64]; 79 | int32_t nbp, gType, nCtg; //data type: 0, 1, 2 etc; size differs 80 | char **cName; //name of ctgs 81 | int32_t *nTile; //num of tiles in each ctg 82 | int32_t **nCnt; //num of counts in each tile 83 | int64_t **tIdx; //tile index *sizeof -> location in .igd file 84 | } iGD_t; 85 | 86 | //--------------------------------------------------------------------------------- 87 | //Parse a line of BED file 88 | void str_splits( char* str, int *nmax, char **splits); 89 | char *parse_bed(char *s, int32_t *st_, int32_t *en_); 90 | 91 | //Binary search 92 | int32_t bSearch(gdata_t *gdata, int32_t t0, int32_t tc, int32_t qe); 93 | 94 | //Add an interval 95 | void igd_add(igd_t *igd, const char *chrm, int32_t s, int32_t e, int32_t v, int32_t idx); 96 | 97 | //Get id from igd dict 98 | int32_t get_id(iGD_t *iGD, const char *chrm); 99 | 100 | //Get nFiles from iGD 101 | int32_t get_nFiles(iGD_t *iGD); 102 | 103 | //Get file info from .tsv 104 | info_t *get_fileinfo(char *ifName, int32_t *nFiles); 105 | 106 | //Get igd info from .igd 107 | void open_iGD(iGD_t *iGD, char *igdFile); 108 | 109 | //Initialize igd_t 110 | igd_t *igd_init(int tile_size); 111 | 112 | //Initialize iGD_t 113 | iGD_t *iGD_init(); 114 | 115 | //Save tile data 116 | void igd_saveT(igd_t *igd, char *oPath); 117 | 118 | //Sort and save igd 119 | void igd_save(igd_t *igd, char *oPath, char *igdName); 120 | 121 | //Free igd data 122 | void igd_destroy(igd_t *igd); 123 | 124 | //Free iGD data 125 | void close_iGD(iGD_t *iGD); 126 | //--------------------------------------------------------------------------------- 127 | //The following section taken from Dr Heng Li's cgranges 128 | // (https://github.com/lh3/cgranges) 129 | 130 | KSTREAM_INIT(gzFile, gzread, 0x10000) 131 | /************** 132 | * Radix sort * 133 | 
**************/ 134 | #define RS_MIN_SIZE 64 135 | #define RS_MAX_BITS 8 136 | 137 | #define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ 138 | typedef struct { \ 139 | rstype_t *b, *e; \ 140 | } rsbucket_##name##_t; \ 141 | void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ 142 | { \ 143 | rstype_t *i; \ 144 | for (i = beg + 1; i < end; ++i) \ 145 | if (rskey(*i) < rskey(*(i - 1))) { \ 146 | rstype_t *j, tmp = *i; \ 147 | for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ 148 | *j = *(j - 1); \ 149 | *j = tmp; \ 150 | } \ 151 | } \ 152 | void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ 153 | { \ 154 | rstype_t *i; \ 155 | int size = 1<b = k->e = beg; \ 159 | for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ 160 | for (k = b + 1; k != be; ++k) \ 161 | k->e += (k-1)->e - beg, k->b = (k-1)->e; \ 162 | for (k = b; k != be;) { \ 163 | if (k->b != k->e) { \ 164 | rsbucket_##name##_t *l; \ 165 | if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ 166 | rstype_t tmp = *k->b, swap; \ 167 | do { \ 168 | swap = tmp; tmp = *l->b; *l->b++ = swap; \ 169 | l = b + (rskey(tmp)>>s&m); \ 170 | } while (l != k); \ 171 | *k->b++ = tmp; \ 172 | } else ++k->b; \ 173 | } else ++k; \ 174 | } \ 175 | for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ 176 | if (s) { \ 177 | s = s > n_bits? 
s - n_bits : 0; \ 178 | for (k = b; k != be; ++k) \ 179 | if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ 180 | else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ 181 | } \ 182 | } \ 183 | void radix_sort_##name(rstype_t *beg, rstype_t *end) \ 184 | { \ 185 | if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ 186 | else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \ 187 | } 188 | 189 | /********************* 190 | * Convenient macros * 191 | *********************/ 192 | 193 | #ifndef kroundup32 194 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 195 | #endif 196 | 197 | #define CALLOC(type, len) ((type*)calloc((len), sizeof(type))) 198 | #define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) 199 | 200 | #define EXPAND(a, m) do { \ 201 | (m) = (m)? (m) + ((m)>>1) : 16; \ 202 | REALLOC((a), (m)); \ 203 | }while (0) 204 | 205 | #endif 206 | 207 | -------------------------------------------------------------------------------- /src_py/igd_create.c: -------------------------------------------------------------------------------- 1 | //=================================================================================== 2 | //Read igd region data and query data, and then find all overlaps 3 | //by Jianglin Feng 05/12/2018 4 | //database intervals sorted by _start: 8/12/2019 5 | //----------------------------------------------------------------------------------- 6 | #include "igd_create.h" 7 | 8 | int create_help(int exit_code) 9 | { 10 | fprintf(stderr, 11 | "%s, v%s\n" 12 | "usage: %s create [options] \n" 13 | " -b \n" 14 | " -c < .BED column as value >=4 (default 4) \n", 15 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 16 | return exit_code; 17 | } 18 | 19 | void create_iGD(iGD_t *iGD, char *iPath, char *oPath, char *igdName, int tile_size) 20 | { 21 | //*. 
Check if the subfolders exist: 22 | if(oPath[strlen(oPath)-1]!='/'){ 23 | strcat(oPath, "/"); 24 | } 25 | 26 | if(iPath[strlen(iPath)-1]=='/'){ 27 | strcat(iPath, "*"); 28 | } 29 | else if(iPath[strlen(iPath)-1]!='*'){ 30 | strcat(iPath, "/*"); 31 | } 32 | 33 | char ftmp[128]; 34 | struct stat st = {0}; 35 | 36 | sprintf(ftmp, "%s%s%s", oPath, igdName, ".igd"); 37 | if(stat(ftmp, &st) == 0){ 38 | printf("The igd database file %s exists!\n", ftmp); 39 | return EX_OK; 40 | } 41 | else{ 42 | if (stat(oPath, &st) == -1){ 43 | mkdir(oPath, 0777); 44 | } 45 | sprintf(ftmp, "%s%s", oPath, "data0"); 46 | if (stat(ftmp, &st) == -1) 47 | mkdir(ftmp, 0777); 48 | } 49 | 50 | //0. Initialize igd 51 | igd_t *igd = igd_init(tile_size); 52 | printf("igd_create 0\n"); 53 | 54 | //1. Get the files 55 | glob_t gResult; 56 | int rtn = glob(iPath, 0, NULL, &gResult); 57 | if(rtn!=0){ 58 | printf("wrong dir path: %s", iPath); 59 | return; 60 | } 61 | char** file_ids = gResult.gl_pathv; 62 | int32_t n_files = gResult.gl_pathc; 63 | if(n_files<1) 64 | printf("Too few files (add to path /*): %i\n", n_files); 65 | int32_t *nr = calloc(n_files, sizeof(int32_t)); 66 | double *avg = calloc(n_files, sizeof(double)); 67 | printf("igd_create 1: %i\n", n_files); 68 | //2. Read files 69 | int nCols=16; 70 | unsigned char buffer[256]; 71 | int32_t i, j, k, ig, i0=0, i1=0, L0=0, L1=1, m, nL; //int64_t? 
72 | while(i00 defines breaks when reading maxCount 79 | //printf("%i, %i, %i, %s\n", i0, ig, nL, file_ids[ig]); 80 | gzFile fp; 81 | if ((fp = gzopen(file_ids[ig], "r")) == 0) 82 | return; 83 | nL = 0; 84 | if(ig==i0 && L0>0){ //pass L0 lines of a big file 85 | while(nL4) va = atol(splits[4]); 92 | igd_add(igd, splits[0], st, en, va, ig); 93 | nr[ig]++; 94 | avg[ig]+=en-st; 95 | nL++; 96 | if(igd->total>maxCount){ 97 | m = 1; 98 | i1 = ig; 99 | L1 = nL; //number of total lines or next line 100 | } 101 | } 102 | gzclose(fp); 103 | if(m==0) ig++; 104 | } 105 | //2.3 Save/append tiles to disc, add cnts tp Cnts 106 | free(splits); 107 | igd_saveT(igd, oPath); 108 | i0 = ig; 109 | L0 = L1; 110 | L1 = 0; 111 | } 112 | printf("igd_create 2\n"); 113 | 114 | //3. save _index.tsv: 4 columns--index, filename, nr, avg 115 | //Also has a header line: 116 | char idFile[128]; 117 | char *tchr; 118 | sprintf(idFile, "%s%s%s", oPath, igdName, "_index.tsv"); 119 | FILE *fpi = fopen(idFile, "w"); 120 | if(fpi==NULL) 121 | printf("Can't open file %s", idFile); 122 | fprintf(fpi, "Index\tFile\tNumber of regions\tAvg size\n"); 123 | for(i=0; i [options]\n" 15 | " options:\n" 16 | " -q \n" 17 | " -r \n" 18 | " -v \n" 19 | " -o \n" 20 | " -c display all intersects\n", 21 | PROGRAM_NAME, VERSION, PROGRAM_NAME); 22 | return exit_code; 23 | } 24 | 25 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits) 26 | { 27 | int ichr = get_id(iGD, chrm); 28 | if(ichr<0) 29 | return 0; 30 | int i, j, n1 = qs/iGD->nbp, n2 = (qe-1)/iGD->nbp; //define boundary! 
31 | int32_t tE, tS, tL, tR, tM, tmpi, tmpi1, mlen, mTile = iGD->nTile[ichr]-1; 32 | if(n1>mTile) 33 | return 0; 34 | n2 = MIN(n2, mTile); 35 | tmpi = iGD->nCnt[ichr][n1]; 36 | tmpi1 = tmpi-1; 37 | if(tmpi>0){ 38 | if(n1!=iGD->preIdx || ichr!=iGD->preChr){ 39 | fseek(iGD->fP, iGD->tIdx[ichr][n1], SEEK_SET); 40 | free(iGD->gData); 41 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 42 | fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 43 | iGD->preIdx = n1; 44 | iGD->preChr = ichr; 45 | } 46 | if(qe>iGD->gData[0].start){ //sorted by start 47 | //find the 1st rs < qe 48 | tL = 0, tR=tmpi1; 49 | while(tLgData[tM].start < qe) //right side: 52 | tL = tM; 53 | else 54 | tR = tM; //left side 55 | } 56 | if(iGD->gData[tR].start=0; i--){ 59 | if(iGD->gData[i].end>qs){ 60 | hits[iGD->gData[i].idx]++; 61 | } 62 | } 63 | } 64 | if(n2>n1){ //n2>n1 65 | int32_t bd = iGD->nbp*(n1+1); //only keep the first 66 | for(j=n1+1; j<=n2; j++){ //n2 inclusive!!! 67 | tmpi = iGD->nCnt[ichr][j]; 68 | tmpi1 = tmpi-1; 69 | if(tmpi>0){ 70 | if(j!=iGD->preIdx || ichr!=iGD->preChr){ 71 | fseek(iGD->fP, iGD->tIdx[ichr][j], SEEK_SET); 72 | free(iGD->gData); 73 | iGD->gData = malloc(tmpi*sizeof(gdata_t)); 74 | fread(iGD->gData, sizeof(gdata_t)*tmpi, 1, iGD->fP); 75 | iGD->preIdx = j; 76 | iGD->preChr = ichr; 77 | } 78 | if(qe>iGD->gData[0].start){ 79 | tS = 0; 80 | while(tSgData[tS].startgData[tM].start < qe) //right side: 85 | tL = tM; 86 | else 87 | tR = tM; //left side 88 | } 89 | if(iGD->gData[tR].start=tS; i--){ 92 | if(iGD->gData[i].end>qs){ 93 | hits[iGD->gData[i].idx]++; 94 | } 95 | } 96 | } 97 | } 98 | bd+=iGD->nbp; 99 | } 100 | } 101 | } 102 | } 103 | 104 | int64_t getOverlaps(iGD_t *iGD, char *qFile, int64_t *hits) 105 | { 106 | gzFile fp; 107 | kstream_t *ks; 108 | kstring_t str = {0,0,0}; 109 | if ((fp = gzopen(qFile, "r")) == 0) 110 | return 0; 111 | ks = ks_init(fp); 112 | uint64_t ols = 0; 113 | char *chrm; 114 | int32_t st, en, nl; 115 | while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) 
>= 0) { 116 | chrm = parse_bed(str.s, &st, &en); 117 | if (chrm) { 118 | get_overlaps(iGD, chrm, st, en, hits); 119 | } 120 | } 121 | free(str.s); 122 | ks_destroy(ks); 123 | gzclose(fp); 124 | int64_t nols=0; 125 | for(int i=0;inFiles;i++) 126 | nols+=hits[i]; 127 | return nols; 128 | } 129 | 130 | -------------------------------------------------------------------------------- /src_py/igd_search.h: -------------------------------------------------------------------------------- 1 | #ifndef __IGD_SEARCH_H__ 2 | #define __IGD_SEARCH_H__ 3 | 4 | //===================================================================================== 5 | //Search the igd database for all overlaps with queries 6 | //by Jianglin Feng 05/12/2018 7 | // 8 | //database intervals sorted by _start: 8/12/2019 9 | //Reorganized: 11/06/2019 10 | //------------------------------------------------------------------------------------- 11 | #include "igd_base.h" 12 | //------------------------------------------------------------------------------------- 13 | //Single query 14 | void get_overlaps(iGD_t *iGD, char *chrm, int32_t qs, int32_t qe, int64_t *hits); 15 | 16 | //query file: call _r 17 | int64_t getOverlaps(iGD_t *iGD, char *qFile, int64_t *hits); 18 | 19 | #endif 20 | 21 | -------------------------------------------------------------------------------- /src_py/igd_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import igd_py as iGD 4 | from timeit import default_timer as timer 5 | import sys 6 | import numpy as np 7 | import pandas as pd 8 | 9 | def main(argv): 10 | if len(argv) < 4: 11 | print("To create: igd_test.py create \n \ 12 | To search: igd_test.py search ") 13 | sys.exit(1) 14 | 15 | igd = iGD.igd_py() 16 | if argv[1]=="create" and len(argv)>=5: 17 | igd.create(argv[2], argv[3], argv[4], 16384) 18 | 19 | elif argv[1]=="search" and len(argv)>=4: 20 | igd.open(argv[2]) 21 | nFiles = igd.get_nFiles() 22 | 
hits = np.zeros(nFiles, dtype='int64') 23 | total = igd.search_n(argv[3], hits) 24 | print("Total: ", total, "\n") 25 | 26 | print("nFiles: ", igd.get_nFiles(), "\n") 27 | 28 | if __name__ == "__main__": 29 | main(sys.argv) 30 | -------------------------------------------------------------------------------- /src_py/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 68 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 69 | 70 | #ifndef KSTRING_T 71 | #define KSTRING_T kstring_t 72 | typedef struct __kstring_t { 73 | unsigned l, m; 74 | char *s; 75 | } kstring_t; 76 | #endif 77 | 78 | #ifndef kroundup32 79 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 80 | #endif 81 | 82 | #define __KS_GETUNTIL(SCOPE, __read) \ 83 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 84 | { \ 85 | if (dret) *dret = 0; \ 86 | str->l = append? 
str->l : 0; \ 87 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 88 | for (;;) { \ 89 | int i; \ 90 | if (ks->begin >= ks->end) { \ 91 | if (!ks->is_eof) { \ 92 | ks->begin = 0; \ 93 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 94 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 95 | if (ks->end == 0) break; \ 96 | } else break; \ 97 | } \ 98 | if (delimiter == KS_SEP_LINE) { \ 99 | for (i = ks->begin; i < ks->end; ++i) \ 100 | if (ks->buf[i] == '\n') break; \ 101 | } else if (delimiter > KS_SEP_MAX) { \ 102 | for (i = ks->begin; i < ks->end; ++i) \ 103 | if (ks->buf[i] == delimiter) break; \ 104 | } else if (delimiter == KS_SEP_SPACE) { \ 105 | for (i = ks->begin; i < ks->end; ++i) \ 106 | if (isspace(ks->buf[i])) break; \ 107 | } else if (delimiter == KS_SEP_TAB) { \ 108 | for (i = ks->begin; i < ks->end; ++i) \ 109 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 110 | } else i = 0; /* never come to here! */ \ 111 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 112 | str->m = str->l + (i - ks->begin) + 1; \ 113 | kroundup32(str->m); \ 114 | str->s = (char*)realloc(str->s, str->m); \ 115 | } \ 116 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 117 | str->l = str->l + (i - ks->begin); \ 118 | ks->begin = i + 1; \ 119 | if (i < ks->end) { \ 120 | if (dret) *dret = ks->buf[i]; \ 121 | break; \ 122 | } \ 123 | } \ 124 | if (str->s == 0) { \ 125 | str->m = 1; \ 126 | str->s = (char*)calloc(1, 1); \ 127 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 128 | str->s[str->l] = '\0'; \ 129 | return str->l; \ 130 | } 131 | 132 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 133 | __KS_TYPE(type_t) \ 134 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 135 | __KS_GETUNTIL(SCOPE, __read) \ 136 | __KS_INLINED(__read) 137 | 138 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 139 | 140 | #define KSTREAM_DECLARE(type_t, __read) \ 141 | 
__KS_TYPE(type_t) \ 142 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 143 | extern kstream_t *ks_init(type_t f); \ 144 | extern void ks_destroy(kstream_t *ks); \ 145 | __KS_INLINED(__read) 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src_py/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup, Extension 3 | except ImportError: 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | import os 7 | import sysconfig 8 | 9 | cmdclass = {} 10 | 11 | try: 12 | from Cython.Build import build_ext 13 | except ImportError: # without Cython 14 | module_src = 'igd_py.c' 15 | else: # with Cython 16 | module_src = 'igd_py.pyx' 17 | cmdclass['build_ext'] = build_ext 18 | 19 | extra_compile_args = sysconfig.get_config_var('CFLAGS').split() 20 | extra_compile_args += ["-lm", "-lz"] 21 | 22 | import sys, platform 23 | 24 | sys.path.append('python') 25 | 26 | include_dirs = [".", "/usr/local/zlib/include", "/home/john/anaconda3/include"] 27 | 28 | setup( 29 | name = 'igd_py', 30 | version = '0.1', 31 | url = 'https://github.com/databio/igd', 32 | description = 'Augmented Interval List for genomic interval overlap', 33 | author = 'Jianglin Feng', 34 | author_email = 'jf.xtable@gmail.com', 35 | license = 'GPL2', 36 | keywords = 'interval', 37 | ext_modules = [Extension('igd_py', 38 | sources = [module_src, 'igd_base.c', 'igd_create.c', 'igd_search.c'], 39 | depends = ['igd_base.h', 'igd_create.h', 'igd_search.h', 'khash.h', 'kseq.h', 'igd.pyx'], 40 | libraries = ['z'], 41 | #library_dirs = ['/usr/lib32/','/usr/lib/x86_64-linux-gnu/'], 42 | #extra_link_args = ['-L/usr/lib/x86_64-linux-gnu/'], 43 | extra_compile_args = extra_compile_args, 44 | include_dirs = include_dirs)], 45 | classifiers = [ 46 | 'Development Status :: Beta', 47 | 'License :: OSI Approved :: GPL2 
License', 48 | 'Operating System :: POSIX', 49 | 'Programming Language :: C', 50 | 'Programming Language :: Cython', 51 | 'Programming Language :: Python :: 2.7', 52 | 'Programming Language :: Python :: 3', 53 | 'Intended Audience :: Science/Research', 54 | 'Topic :: Scientific/Engineering :: Bio-Informatics'], 55 | cmdclass = cmdclass) 56 | -------------------------------------------------------------------------------- /vignettes/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/IGD/4197c239380059e7e3d6eafce10011fc96047f5a/vignettes/.Rhistory -------------------------------------------------------------------------------- /vignettes/using_igd.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using iGD" 3 | author: "John Feng" 4 | date: "06/18/2018" 5 | updated: "5/8/2020" 6 | --- 7 | 8 | This vignette shows how to create an iGD database from roadmap dataset and then search the iGD database for overlaps with a query file. Make sure your computer memory is at least 4GB. 9 | 10 | ## Create roadmap iGD database 11 | 12 | First download the roadmap data source [rme.tgz](http://big.databio.org/igd/data/rme.tgz) and extract it: 13 | ``` 14 | wget http://big.databio.org/igd/data/rme.tgz 15 | tar -xzf rme.tgz 16 | ``` 17 | The `rme` folder contains 1905 `.bed.gz` data files. 18 | 19 | Then: 20 | ``` 21 | mkdir rme_igd 22 | igd create "rme/*" "rme_igd/" "roadmap" 23 | ``` 24 | 25 | This will generate the following in the output folder `rme_igd`: 26 | a single igd database file (mode 1) `roadmap.igd` and dataset index file `roadmap_index.tsv`. 27 | 28 | 29 | ## Search iGD for overlaps 30 | 31 | Download a sample query file [query100.bed](http://big.databio.org/igd/data/query100.bed) to the same directory as above. 
32 | 33 | Then: 34 | ``` 35 | wget http://big.databio.org/igd/data/query100.bed 36 | igd search rme_igd/roadmap.igd -q query100.bed 37 | ``` 38 | 39 | --------------------------------------------------------------------------------