├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── random_intervals.rs ├── docs.sh ├── examples └── intersect_bed_count.rs ├── src ├── bedder_bed.rs ├── bedder_vcf.rs ├── chrom_ordering.rs ├── intersection.rs ├── interval.rs ├── lib.rs ├── main.rs ├── position.rs ├── sniff.rs └── string.rs └── tests ├── test.bam └── test.sam /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: Continuous integration 4 | 5 | jobs: 6 | check: 7 | name: Check 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions-rs/toolchain@v1 12 | with: 13 | profile: minimal 14 | toolchain: stable 15 | override: true 16 | - uses: actions-rs/cargo@v1 17 | with: 18 | command: check 19 | 20 | test: 21 | name: Test Suite 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v2 25 | - uses: actions-rs/toolchain@v1 26 | with: 27 | profile: minimal 28 | toolchain: stable 29 | override: true 30 | - uses: actions-rs/cargo@v1 31 | with: 32 | command: test 33 | - uses: actions-rs/cargo@v1 34 | with: 35 | command: test 36 | args: --features smartstring 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | docs/ 4 | *.bed 5 | *.bed.gz 6 | *.vcf 7 | .vscode 8 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bedder" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | [lib] 8 | name = "bedder" 9 | path = "src/lib.rs" 10 | 11 | [[bin]] 12 | name = "bedder" 13 | path = "src/main.rs" 14 | 15 | 16 | 
[dependencies] 17 | rand = "0.8.5" 18 | smartstring = { version = "1.0.1", optional = true } 19 | smol_str = { version = "0.2.0", optional = true } 20 | compact_str = { version = "0.7.0", optional = true } 21 | kstring = { version = "2.0.0", optional = true } 22 | noodles = { version = "0.52.0" } 23 | flate2 = "1.0.26" 24 | clap = { version = "4.2.7", features = ['derive'] } 25 | env_logger = "0.10.0" 26 | log = "0.4.19" 27 | linear-map = "1.2.0" 28 | hashbrown = "0.14.0" 29 | xvcf = { version = "0.1.4", git = "https://github.com/brentp/xvcf-rs" } 30 | 31 | [features] 32 | default = ["bed", "vcf", "bcf", "csi", "core", "bam", "sam", "bgzf"] 33 | bam = ["noodles/bam"] 34 | bed = ["noodles/bed"] 35 | bgzf = ["noodles/bgzf"] 36 | #cram = ["noodles/cram"] 37 | sam = ["noodles/sam"] 38 | vcf = ["noodles/vcf"] 39 | csi = ["noodles/csi"] 40 | core = ["noodles/core"] 41 | bcf = ["noodles/bcf"] 42 | # allow a Box in the enum to support user-specified types. 43 | dyn_positioned = [] 44 | 45 | [dev-dependencies] 46 | criterion = { version = "0.4", features = ["html_reports"] } 47 | clap = { version = "4.2.7", features = ["derive"] } 48 | 49 | [[bench]] 50 | name = "random_intervals" 51 | harness = false 52 | 53 | [profile.release] 54 | lto = true 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Brent Pedersen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 8 | 9 | [![status](https://github.com/quinlan-lab/bedder-rs/actions/workflows/rust.yml/badge.svg)](https://github.com/quinlan-lab/bedder-rs/actions/) 10 | 11 | # bedder (tools) 12 | 13 | This is an early release of the library for feedback, especially from rust practitioners. 
If interested, 14 | read below and then, for example, have a look at [issue 2](https://github.com/quinlan-lab/bedder-rs/issues/2) and the associated [discussion](https://github.com/quinlan-lab/bedder-rs/discussions/3) 15 | 16 | This library aims to provide: 17 | 18 | - [x] an abstraction so any interval types from sorted sources can be intersected together 19 | - [x] the rust implementation of the heap and Queue to find intersections with minimal overhead 20 | - [ ] bedder wrappers for: 21 | + [x] bed 22 | + [x] vcf/bcf 23 | + [ ] sam/bam/cram 24 | + [ ] gff/gtf 25 | + [ ] generalized tabixed/csi files 26 | - [ ] downstream APIs to perform operations on the intersections 27 | - [ ] a python library to interact with the intersections 28 | 29 | The API looks as follows 30 | 31 | Any genomic position from any data source can be intersected by this library as long as it implements this trait: 32 | 33 | ```rust 34 | 35 | pub trait Positioned { 36 | fn chrom(&self) -> &str; 37 | fn start(&self) -> u64; 38 | fn stop(&self) -> u64; 39 | 40 | // extract a value from the Positioned object Field 41 | fn value(&self, f: Field) -> Result; 42 | } 43 | 44 | /// Value can be any number of Ints, Floats, or Strings. 45 | pub enum Value { 46 | Ints(Vec), 47 | Floats(Vec), 48 | Strings(Vec), 49 | } 50 | 51 | /// Field is either an integer: the i'th column. 52 | /// Or a String, e.g. "INFO.DP". 53 | pub enum Field { 54 | String(String), 55 | Int(usize), 56 | } 57 | 58 | pub enum FieldError { 59 | InvalidFieldIndex(usize), 60 | InvalidFieldName(String), 61 | } 62 | 63 | ``` 64 | 65 | Then each file-type (VCF/BAM/etc) would implement this trait 66 | 67 | ```rust 68 | // something that generates Positioned things (BED/VCF/BAM/GFF/etc.) 69 | pub trait PositionedIterator { 70 | type Item: Positioned; 71 | 72 | /// Q can be ignored. See below for more detail. 
73 | fn next_position(&mut self, q: Option<&dyn Positioned>) -> Option; 74 | 75 | /// A name for the iterator (likely filename) used by this library when logging. 76 | fn name(&self) 77 | } 78 | ``` 79 | 80 | Anything that can create a `PositionedIterator` can be used by the library. 81 | 82 | Note the `q` argument to `next_position`. This can be ignored by implementers but can be used to skip. 83 | For each query interval, we may make many calls to `next_position`. On the first of those calls, `q` 84 | is `Some(query_position)`. The implementer can choose to use this information to skip (rather than stream) 85 | for example with an index (or search) to the first interval that overlaps the `q`. Subsequent calls for the 86 | same interval will be called with `q` of `None`. The implementer must: 87 | 88 | 1. Always return an interval (unless EOF is reached) 89 | 1. Always return intervals in order. 90 | 1. Never return an interval that was returned previously (even if the same query interval appears multiple times). 91 | 92 | # Implementation Brief 93 | 94 | All Positioned structs are pulled through a min-heap. Each time an interval (with the smallest genomic position) is pulled from the min heap, 95 | a new struct is pulled from the file where that interval originated. Then the pulled interval is pushed onto a `queue` (actually a deque because that's what is in the rust standard library). 96 | We then know the queue is in order. For each query interval, we drop from the queue any interval that is strictly _before_ the interval, 97 | then pull into the Intersection result any interval that is not _after_ the interval. Then return the result from the `next` call. 98 | We use `Rc` because each database interval may be attached to more than one query interval. 99 | 100 | # Acknowledgements 101 | 102 | - We received very valuable `rust` feedback and code from @sstadick. 103 | - We leverage the excellent [noodles](https://github.com/zaeleus/noodles) library. 
104 | -------------------------------------------------------------------------------- /benches/random_intervals.rs: -------------------------------------------------------------------------------- 1 | use bedder::chrom_ordering::parse_genome; 2 | use bedder::intersection::IntersectionIterator; 3 | use bedder::interval::Interval; 4 | use bedder::position::{Position, PositionedIterator}; 5 | use bedder::string::String; 6 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 7 | use rand::Rng; 8 | use std::io; 9 | 10 | struct Intervals { 11 | i: usize, 12 | name: String, 13 | n_intervals: usize, 14 | curr_max: f64, 15 | rng: rand::rngs::ThreadRng, 16 | interval_len: u64, 17 | saved_chrom: String, 18 | } 19 | 20 | impl Intervals { 21 | fn new(name: String, n_intervals: usize, interval_len: u64) -> Self { 22 | Intervals { 23 | i: 0, 24 | name: name, 25 | n_intervals, 26 | curr_max: 1.0, 27 | rng: rand::thread_rng(), 28 | interval_len: interval_len, 29 | saved_chrom: String::from("chr1"), 30 | } 31 | } 32 | } 33 | 34 | impl PositionedIterator for Intervals { 35 | fn name(&self) -> String { 36 | String::from(format!("{}:{}", self.name, self.i)) 37 | } 38 | 39 | fn next_position(&mut self, _q: Option<&Position>) -> Option> { 40 | if self.i < self.n_intervals { 41 | self.i += 1; 42 | let r: f64 = self.rng.gen(); 43 | self.curr_max *= r.powf(self.i as f64); 44 | let start = ((1.0 - self.curr_max) * (MAX_POSITION as f64)) as u64; 45 | Some(Ok(Position::Interval(Interval { 46 | chrom: self.saved_chrom.clone(), 47 | start: start, 48 | stop: start + self.interval_len, 49 | ..Default::default() 50 | }))) 51 | } else { 52 | None 53 | } 54 | } 55 | } 56 | 57 | const MAX_POSITION: u64 = 10_000; 58 | 59 | pub fn intersection_benchmark(c: &mut Criterion) { 60 | let genome_str = "chr1\nchr2\n"; 61 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 62 | 63 | c.bench_function("simple intersection", |b| { 64 | b.iter_with_large_drop(|| { 65 | let a_ivs 
= Box::new(Intervals::new(String::from("a"), 100, 1000)); 66 | let b_ivs = Box::new(Intervals::new(String::from("b"), 100_000, 100)); 67 | let iter = IntersectionIterator::new(a_ivs, vec![b_ivs], &chrom_order) 68 | .expect("error getting iterator"); 69 | 70 | iter.for_each(|intersection| { 71 | let intersection = intersection.expect("error getting intersection"); 72 | black_box(intersection.overlapping); 73 | }); 74 | }); 75 | }); 76 | } 77 | 78 | criterion_group!(benches, intersection_benchmark); 79 | criterion_main!(benches); 80 | -------------------------------------------------------------------------------- /docs.sh: -------------------------------------------------------------------------------- 1 | # https://dev.to/deciduously/prepare-your-rust-api-docs-for-github-pages-2n5i 2 | cargo doc --no-deps 3 | rm -rf ./docs target/doc/ 4 | echo "" > target/doc/index.html 5 | cp -r target/doc ./docs 6 | -------------------------------------------------------------------------------- /examples/intersect_bed_count.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::io::{self, BufReader, BufWriter, Write}; 3 | use std::path::PathBuf; 4 | 5 | use bedder::sniff; 6 | use clap::Parser; 7 | extern crate bedder; 8 | use crate::bedder::chrom_ordering::parse_genome; 9 | use crate::bedder::intersection::IntersectionIterator; 10 | 11 | #[derive(Parser, Debug)] 12 | struct Args { 13 | a: PathBuf, 14 | b: PathBuf, 15 | 16 | fai: PathBuf, 17 | } 18 | 19 | fn main() -> io::Result<()> { 20 | let args = Args::parse(); 21 | 22 | // sniff determines the file type (bam/cram/bcf/vcf/bed/gff/gtf) 23 | // and returns a PositionIterator 24 | let ai = sniff::open_file(&args.a)?; 25 | let bi = sniff::open_file(&args.b)?; 26 | 27 | // bedder always requires a hashmap that indicates the chromosome order 28 | let fh = BufReader::new(fs::File::open(args.fai)?); 29 | let h = parse_genome(fh)?; 30 | 31 | // we can have any number of b 
(other_iterators). 32 | let it = IntersectionIterator::new(ai, vec![bi], &h)?; 33 | 34 | // we need to use buffered stdout or performance is determined by 35 | // file IO 36 | let mut stdout = BufWriter::new(io::stdout()); 37 | 38 | for intersection in it { 39 | let intersection = intersection?; 40 | writeln!( 41 | &mut stdout, 42 | "{}\t{}\t{}\t{}", 43 | intersection.base_interval.chrom(), 44 | intersection.base_interval.start(), 45 | intersection.base_interval.stop(), 46 | intersection.overlapping.len() 47 | )?; 48 | } 49 | 50 | Ok(()) 51 | } 52 | -------------------------------------------------------------------------------- /src/bedder_bed.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::useless_conversion)] // these are needed to support e.g. smartstring 2 | 3 | use crate::position::{Field, FieldError, Position, Positioned, Value, Valued}; 4 | use crate::string::String; 5 | pub use bed::record::Record; 6 | pub use noodles::bed; 7 | use std::io::{self, BufRead}; 8 | use std::result; 9 | 10 | impl crate::position::Positioned for bed::record::Record<3> { 11 | #[inline] 12 | fn chrom(&self) -> &str { 13 | self.reference_sequence_name() 14 | } 15 | 16 | #[inline] 17 | fn start(&self) -> u64 { 18 | // noodles position is 1-based. 
19 | self.start_position().get() as u64 - 1 20 | } 21 | 22 | #[inline] 23 | fn stop(&self) -> u64 { 24 | self.end_position().get() as u64 25 | } 26 | } 27 | 28 | impl Valued for bed::record::Record<3> { 29 | fn value(&self, v: crate::position::Field) -> result::Result { 30 | match v { 31 | Field::String(s) => Ok(Value::Strings(vec![s])), 32 | Field::Int(i) => match i { 33 | 0 => Ok(Value::Strings(vec![String::from(self.chrom())])), 34 | 1 => Ok(Value::Ints(vec![self.start() as i64])), 35 | 2 => Ok(Value::Ints(vec![self.stop() as i64])), 36 | _ => Err(FieldError::InvalidFieldIndex(i)), 37 | }, 38 | } 39 | } 40 | } 41 | 42 | struct Last { 43 | chrom: String, 44 | start: u64, 45 | stop: u64, 46 | } 47 | 48 | pub struct BedderBed 49 | where 50 | R: BufRead, 51 | { 52 | reader: bed::Reader, 53 | buf: std::string::String, 54 | last_record: Option, 55 | line_number: u64, 56 | } 57 | 58 | impl BedderBed 59 | where 60 | R: BufRead, 61 | { 62 | pub fn new(r: R) -> BedderBed { 63 | BedderBed { 64 | reader: bed::Reader::new(r), 65 | buf: std::string::String::new(), 66 | last_record: None, 67 | line_number: 0, 68 | } 69 | } 70 | } 71 | 72 | impl crate::position::PositionedIterator for BedderBed 73 | where 74 | R: BufRead, 75 | { 76 | fn next_position( 77 | &mut self, 78 | _q: Option<&crate::position::Position>, 79 | ) -> Option> { 80 | self.buf.clear(); 81 | loop { 82 | self.line_number += 1; 83 | return match self.reader.read_line(&mut self.buf) { 84 | Ok(0) => None, 85 | Ok(_) => { 86 | if self.buf.starts_with('#') || self.buf.is_empty() { 87 | continue; 88 | } 89 | let record: bed::record::Record<3> = match self.buf.parse() { 90 | Err(e) => { 91 | let msg = format!( 92 | "line#{:?}:{:?} error: {:?}", 93 | self.line_number, &self.buf, e 94 | ); 95 | return Some(Err(io::Error::new(io::ErrorKind::InvalidData, msg))); 96 | } 97 | Ok(r) => r, 98 | }; 99 | 100 | match &mut self.last_record { 101 | None => { 102 | self.last_record = Some(Last { 103 | chrom: 
String::from(record.chrom()), 104 | start: record.start(), 105 | stop: record.stop(), 106 | }) 107 | } 108 | Some(r) => { 109 | if r.chrom != record.chrom() { 110 | r.chrom = String::from(record.chrom()) 111 | } 112 | r.start = record.start(); 113 | r.stop = record.stop(); 114 | } 115 | } 116 | 117 | Some(Ok(Position::Bed(record))) 118 | } 119 | Err(e) => Some(Err(e)), 120 | }; 121 | } 122 | } 123 | 124 | fn name(&self) -> String { 125 | String::from(format!("bed:{}", self.line_number)) 126 | } 127 | } 128 | 129 | #[cfg(test)] 130 | mod tests { 131 | use super::*; 132 | use crate::chrom_ordering::Chromosome; 133 | use crate::intersection::IntersectionIterator; 134 | use hashbrown::HashMap; 135 | use std::io::Cursor; 136 | 137 | #[test] 138 | fn test_bed_read() { 139 | // write a test for bed from a string using BufRead 140 | let ar = BedderBed::new(Cursor::new("chr1\t20\t30\nchr1\t21\t33")); 141 | let br = BedderBed::new(Cursor::new("chr1\t21\t30\nchr1\t22\t33")); 142 | 143 | let chrom_order = HashMap::from([ 144 | ( 145 | String::from("chr1"), 146 | Chromosome { 147 | index: 0usize, 148 | length: None, 149 | }, 150 | ), 151 | ( 152 | String::from("chr2"), 153 | Chromosome { 154 | index: 1usize, 155 | length: None, 156 | }, 157 | ), 158 | ]); 159 | 160 | let it = IntersectionIterator::new(Box::new(ar), vec![Box::new(br)], &chrom_order) 161 | .expect("error creating iterator"); 162 | 163 | let mut n = 0; 164 | it.for_each(|int| { 165 | let int = int.expect("error getting intersection"); 166 | //dbg!(&int.overlapping); 167 | assert!(int.overlapping.len() == 2); 168 | n += 1; 169 | }); 170 | assert!(n == 2); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/bedder_vcf.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::useless_conversion)] // these are needed to support e.g. 
smartstring 2 | use crate::position::{Field, FieldError, Position, Positioned, Value}; 3 | use crate::string::String; 4 | use noodles::core::Region; 5 | use noodles::vcf::{self, record::Chromosome}; 6 | use std::io::{self, Read, Seek}; 7 | use std::result; 8 | use vcf::record::info::field; 9 | use vcf::record::QualityScore; 10 | pub use vcf::Record; 11 | pub use xvcf; 12 | use xvcf::Skip; 13 | 14 | pub struct BedderVCF { 15 | reader: xvcf::Reader, 16 | record_number: u64, 17 | header: vcf::Header, 18 | } 19 | 20 | impl BedderVCF 21 | where 22 | R: Read + 'static, 23 | { 24 | pub fn new(r: xvcf::Reader) -> io::Result> { 25 | let h = r.header().clone(); 26 | let v = BedderVCF { 27 | reader: r, 28 | record_number: 0, 29 | header: h, 30 | }; 31 | Ok(v) 32 | } 33 | } 34 | 35 | pub fn match_info_value(info: &vcf::record::Info, name: &str) -> result::Result { 36 | //let info = record.info(); 37 | let key: vcf::record::info::field::Key = name 38 | .parse() 39 | .map_err(|_| FieldError::InvalidFieldName(String::from(name)))?; 40 | 41 | match info.get(&key) { 42 | Some(value) => match value { 43 | Some(field::Value::Integer(i)) => Ok(Value::Ints(vec![*i as i64])), 44 | Some(field::Value::Float(f)) => Ok(Value::Floats(vec![*f as f64])), 45 | Some(field::Value::String(s)) => Ok(Value::Strings(vec![String::from(s)])), 46 | Some(field::Value::Character(c)) => { 47 | Ok(Value::Strings(vec![String::from(c.to_string())])) 48 | } 49 | //Some(field::Value::Flag) => Ok(Value::Strings(vec![String::from("true")])), 50 | Some(field::Value::Array(arr)) => { 51 | match arr { 52 | field::value::Array::Integer(arr) => Ok(Value::Ints( 53 | arr.iter().flatten().map(|&v| v as i64).collect(), 54 | )), 55 | field::value::Array::Float(arr) => Ok(Value::Floats( 56 | arr.iter().flatten().map(|&v| v as f64).collect(), 57 | )), 58 | field::value::Array::String(arr) => Ok(Value::Strings( 59 | arr.iter().flatten().map(String::from).collect(), 60 | )), 61 | field::value::Array::Character(arr) => 
Ok(Value::Strings( 62 | arr.iter().flatten().map(|v| v.to_string().into()).collect(), 63 | )), 64 | //field::Value::Flag => Ok(Value::Strings(vec![String::from("true")])), 65 | } 66 | } 67 | 68 | _ => Err(FieldError::InvalidFieldName(String::from(name))), 69 | }, 70 | None => Err(FieldError::InvalidFieldName(String::from(name))), 71 | } 72 | } 73 | 74 | pub fn match_value(record: &vcf::record::Record, f: Field) -> result::Result { 75 | match f { 76 | Field::String(s) => match s.as_str() { 77 | "chrom" => Ok(Value::Strings(vec![String::from(Positioned::chrom( 78 | record, 79 | ))])), 80 | "start" => Ok(Value::Ints(vec![Positioned::start(record) as i64])), 81 | "stop" => Ok(Value::Ints(vec![Positioned::stop(record) as i64])), 82 | "ID" => Ok(Value::Strings( 83 | record.ids().iter().map(|s| s.to_string().into()).collect(), 84 | )), 85 | "FILTER" => Ok(Value::Strings( 86 | record 87 | .filters() 88 | .iter() 89 | .map(|s| String::from(s.to_string())) 90 | .collect(), 91 | )), 92 | "QUAL" => Ok(Value::Floats(vec![f32::from( 93 | record 94 | .quality_score() 95 | .unwrap_or(QualityScore::try_from(0f32).expect("error getting quality score")), 96 | ) as f64])), 97 | _ => { 98 | if s.len() > 5 && &s[0..5] == "INFO." 
{ 99 | match_info_value(record.info(), &s[5..]) 100 | } else { 101 | // TODO: format 102 | unimplemented!(); 103 | } 104 | } 105 | }, 106 | 107 | Field::Int(i) => Err(FieldError::InvalidFieldIndex(i)), 108 | } 109 | } 110 | 111 | impl Positioned for vcf::record::Record { 112 | #[inline] 113 | fn chrom(&self) -> &str { 114 | match self.chromosome() { 115 | Chromosome::Name(s) => s, 116 | Chromosome::Symbol(s) => s, 117 | } 118 | } 119 | 120 | #[inline] 121 | fn start(&self) -> u64 { 122 | usize::from(self.position()) as u64 123 | } 124 | 125 | #[inline] 126 | fn stop(&self) -> u64 { 127 | usize::from(self.end().expect("error getting end from vcf record")) as u64 128 | } 129 | } 130 | 131 | impl crate::position::PositionedIterator for BedderVCF 132 | where 133 | R: Read + Seek + 'static, 134 | { 135 | fn next_position( 136 | &mut self, 137 | q: Option<&crate::position::Position>, 138 | ) -> Option> { 139 | if let Some(q) = q { 140 | let s = noodles::core::Position::new(q.start() as usize + 1)?; 141 | let e = noodles::core::Position::new(q.stop() as usize + 1)?; 142 | let region = Region::new(q.chrom(), s..=e); 143 | match self.reader.skip_to(&self.header, ®ion) { 144 | Ok(_) => (), 145 | Err(e) => return Some(Err(e)), 146 | } 147 | } 148 | 149 | // take self.reader.variant if it's there 150 | if let Some(v) = self.reader.take() { 151 | self.record_number += 1; 152 | return Some(Ok(Position::Vcf(Box::new(v)))); 153 | } 154 | 155 | let mut v = vcf::Record::default(); 156 | 157 | match self.reader.next_record(&self.header, &mut v) { 158 | Ok(0) => None, // EOF 159 | Ok(_) => { 160 | self.record_number += 1; 161 | Some(Ok(Position::Vcf(Box::new(v)))) 162 | } 163 | Err(e) => Some(Err(e)), 164 | } 165 | } 166 | fn name(&self) -> String { 167 | String::from("vcf line number:".to_owned() + self.record_number.to_string().as_str()) 168 | } 169 | } 170 | 171 | // tests 172 | #[cfg(test)] 173 | mod tests { 174 | use super::*; 175 | 176 | #[test] 177 | fn test_match_info() { 178 
| let key: field::Key = "AAA".parse().expect("error parsing key"); 179 | 180 | let info: vcf::record::Info = [(key, Some(field::Value::Integer(1)))] 181 | .into_iter() 182 | .collect(); 183 | 184 | // write a test to extract the value using match_info_value 185 | let value = match_info_value(&info, "AAA").unwrap(); 186 | assert!(matches!(value, Value::Ints(_))); 187 | } 188 | 189 | #[test] 190 | fn test_match_info_vector() { 191 | let key: field::Key = "AAA".parse().expect("error parsing key"); 192 | 193 | let info: vcf::record::Info = [( 194 | key, 195 | Some(field::Value::Array(field::value::Array::Integer(vec![ 196 | Some(-1), 197 | Some(2), 198 | Some(3), 199 | None, 200 | Some(496), 201 | ]))), 202 | )] 203 | .into_iter() 204 | .collect(); 205 | 206 | // write a test to extract the value using match_info_value 207 | let value = match_info_value(&info, "AAA").unwrap(); 208 | assert!(matches!(value, Value::Ints(_))); 209 | 210 | if let Value::Ints(v) = value { 211 | assert_eq!(v.len(), 4); 212 | assert_eq!(v[0], -1); 213 | assert_eq!(v[1], 2); 214 | assert_eq!(v[2], 3); 215 | assert_eq!(v[3], 496); 216 | } else { 217 | panic!("error getting value"); 218 | } 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/chrom_ordering.rs: -------------------------------------------------------------------------------- 1 | use crate::string::String; 2 | use hashbrown::HashMap; 3 | use std::io::{self, BufRead, Read}; 4 | 5 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 6 | pub struct Chromosome { 7 | pub(crate) index: usize, 8 | pub(crate) length: Option, 9 | } 10 | /// A genome is a map from chromosome name to index with an optional chromosome length. 11 | 12 | pub fn parse_genome(reader: R) -> io::Result> 13 | where 14 | R: Read, 15 | { 16 | let mut reader = io::BufReader::new(reader); 17 | let mut genome = HashMap::default(); 18 | let mut line = std::string::String::new(); 19 | while reader.read_line(&mut line)? 
> 0 { 20 | if line.trim().is_empty() || line.starts_with('#') { 21 | line.clear(); 22 | continue; 23 | } 24 | let mut fields = line.split_whitespace(); 25 | match fields.next() { 26 | Some(chrom) => { 27 | let length = fields.next().map(|s| s.parse::()); 28 | let l = length.and_then(|c| match c { 29 | Ok(l) => Some(l), 30 | Err(_) => { 31 | log::warn!( 32 | "invalid length for chromosome {} with line: {}", 33 | chrom, 34 | line 35 | ); 36 | None 37 | } 38 | }); 39 | genome.insert( 40 | String::from(chrom), 41 | Chromosome { 42 | index: genome.len(), 43 | length: l, 44 | }, 45 | ); 46 | } 47 | None => { 48 | return Err(io::Error::new( 49 | io::ErrorKind::InvalidData, 50 | format!("invalid genome file line: {}", line), 51 | )) 52 | } 53 | } 54 | //.expect("require at least one column in genome file"); 55 | line.clear(); 56 | } 57 | Ok(genome) 58 | } 59 | 60 | #[cfg(test)] 61 | mod tests { 62 | use super::*; 63 | 64 | #[test] 65 | fn test_parse_genome() { 66 | let genome_str = "chr1\nchr2\t43\nchr3\n"; 67 | let genome = parse_genome(genome_str.as_bytes()).unwrap(); 68 | assert_eq!(genome.len(), 3); 69 | assert_eq!( 70 | genome.get("chr1"), 71 | Some(&Chromosome { 72 | index: 0, 73 | length: None 74 | }) 75 | ); 76 | assert_eq!( 77 | genome.get("chr2"), 78 | Some(&Chromosome { 79 | index: 1, 80 | length: Some(43) 81 | }) 82 | ); 83 | assert_eq!( 84 | genome.get("chr3"), 85 | Some(&Chromosome { 86 | index: 2, 87 | length: None 88 | }) 89 | ); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/intersection.rs: -------------------------------------------------------------------------------- 1 | use crate::chrom_ordering::Chromosome; 2 | use crate::string::String; 3 | use hashbrown::HashMap; 4 | use std::cmp::Ordering; 5 | use std::collections::{vec_deque::VecDeque, BinaryHeap}; 6 | use std::io; 7 | use std::io::{Error, ErrorKind}; 8 | use std::rc::Rc; 9 | //use std::sync::Arc as Rc; 10 | 11 | use 
crate::position::{Position, PositionedIterator}; 12 | 13 | /// An iterator that returns the intersection of multiple iterators. 14 | pub struct IntersectionIterator<'a> { 15 | base_iterator: Box, 16 | other_iterators: Vec>, 17 | min_heap: BinaryHeap, 18 | chromosome_order: &'a HashMap, 19 | // because multiple intervals from each stream can overlap a single base interval 20 | // and each interval from others may overlap many base intervals, we must keep a cache (Q) 21 | // we always add intervals in order with push_back and therefore remove with pop_front. 22 | // As soon as the front interval in cache is stricly less than the query interval, then we can pop it. 23 | dequeue: VecDeque, 24 | 25 | // this is only kept for error checking so we can track if intervals are out of order. 26 | previous_interval: Option>, 27 | 28 | // this tracks which iterators have been called with Some(Positioned) for a given interval 29 | // so that calls after the first are called with None. 30 | called: Vec, 31 | 32 | // we call this on the first iteration of pull_through_heap 33 | heap_initialized: bool, 34 | } 35 | 36 | /// An Intersection wraps the Positioned that was intersected with a unique identifier. 37 | /// The u32 identifier matches the index of the database that was intersected. 38 | #[derive(Debug)] 39 | pub struct Intersection { 40 | /// the Positioned that was intersected 41 | pub interval: Rc, 42 | /// a unique identifier indicating the source of this interval. 43 | pub id: u32, 44 | } 45 | 46 | /// An Intersections wraps the base interval and a vector of overlapping intervals. 47 | #[derive(Debug)] 48 | pub struct Intersections { 49 | pub base_interval: Rc, 50 | pub overlapping: Vec, 51 | } 52 | 53 | struct ReverseOrderPosition { 54 | position: Position, 55 | chromosome_index: usize, // index order of chrom. 
id: usize, // file_index
}

/// Two heap entries are equal when they have the same chromosome index, start and stop.
/// The source-file `id` is not part of the comparison (this matches `Ord` below).
impl PartialEq for ReverseOrderPosition {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        self.position.start() == other.position.start()
            && self.position.stop() == other.position.stop()
            && self.chromosome_index == other.chromosome_index
    }
}

impl Eq for ReverseOrderPosition {}

impl PartialOrd for ReverseOrderPosition {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// Reversed genomic ordering: the entry with the *smallest*
/// (chromosome index, start, stop) compares as the *greatest*, so that
/// `std::collections::BinaryHeap` (a max-heap) pops the earliest interval first.
impl Ord for ReverseOrderPosition {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        // Comparing `other` against `self` is the same as reversing each comparison.
        other
            .chromosome_index
            .cmp(&self.chromosome_index)
            .then_with(|| other.position.start().cmp(&self.position.start()))
            .then_with(|| other.position.stop().cmp(&self.position.stop()))
    }
}

/// cmp will return Less if a is before b, Greater if a is after b, Equal if they overlap.
///
/// Intervals are half-open, so book-ended intervals (`a.stop() == b.start()`) do not overlap.
///
/// # Panics
///
/// Panics if the intervals are on different chromosomes and either chromosome is
/// missing from `chromosome_order` (the map is indexed directly).
#[inline(always)]
fn cmp(a: &Position, b: &Position, chromosome_order: &HashMap<String, Chromosome>) -> Ordering {
    if a.chrom() != b.chrom() {
        return chromosome_order[a.chrom()]
            .index
            .cmp(&chromosome_order[b.chrom()].index);
    }
    // same chrom.
    if a.stop() <= b.start() {
        return Ordering::Less;
    }
    if a.start() >= b.stop() {
        return Ordering::Greater;
    }
    // Equal simply means they overlap.
    Ordering::Equal
}

/// Format a position as a 1-based, inclusive region string: `chrom:start-stop`.
fn region_str(p: &Position) -> std::string::String {
    format!("{}:{}-{}", p.chrom(), p.start() + 1, p.stop())
}

/// An iterator that returns the intersection of multiple iterators for each query interval
impl<'a> Iterator for IntersectionIterator<'a> {
    type Item = io::Result<Intersections>;

    /// Pull the next base (query) interval and collect every queued interval that overlaps it.
    ///
    /// Yields `Some(Err(..))` when the base iterator reports an error, when the base
    /// interval is on an unknown chromosome or extends past a known chromosome length,
    /// when base intervals are out of order, or when pulling from the other iterators fails.
    fn next(&mut self) -> Option<Self::Item> {
        // if the base iterator yields an error return the Result here
        let base_interval = match self.base_iterator.next_position(None)? {
            Err(e) => return Some(Err(e)),
            Ok(p) => Rc::new(p),
        };

        match self.chromosome_order.get(base_interval.chrom()) {
            None => {
                let msg = format!("invalid chromosome: {}", region_str(base_interval.as_ref()));
                return Some(Err(Error::new(ErrorKind::Other, msg)));
            }
            Some(chrom) => {
                if let Some(chrom_len) = chrom.length {
                    if base_interval.stop() > chrom_len as u64 {
                        let msg = format!(
                            "interval beyond end of chromosome: {}",
                            region_str(base_interval.as_ref())
                        );
                        return Some(Err(Error::new(ErrorKind::Other, msg)));
                    }
                }
            }
        }

        if self.out_of_order(Rc::clone(&base_interval)) {
            let p = self
                .previous_interval
                .as_ref()
                .expect("we know previous interval is_some from out_of_order");
            let msg = format!(
                "intervals from {} out of order {} should be before {}",
                self.base_iterator.name(),
                region_str(p),
                region_str(base_interval.as_ref()),
            );
            return Some(Err(Error::new(ErrorKind::Other, msg)));
        }

        self.previous_interval = Some(Rc::clone(&base_interval));

        // drop intervals from Q that are strictly before the base interval.
        self.pop_front(Rc::clone(&base_interval));

        // pull intervals through the min-heap until the base interval is strictly less than the
        // last pulled interval.
        // we want all intervals to pass through the min_heap so that they are ordered across files
        if let Err(e) = self.pull_through_heap(Rc::clone(&base_interval)) {
            return Some(Err(e));
        }

        let mut overlapping_positions = Vec::new();
        // de-Q contains all intervals that can overlap with the base interval.
        // de-Q is sorted.
        // We iterate through (again) and add those to overlapping positions.
        for o in self.dequeue.iter() {
            match cmp(
                o.interval.as_ref(),
                base_interval.as_ref(),
                self.chromosome_order,
            ) {
                Ordering::Less => continue,
                Ordering::Greater => break,
                Ordering::Equal => overlapping_positions.push(Intersection {
                    // NOTE: we're effectively making a copy here, but it's only incrementing the Rc and a u32...
                    // we could avoid this by keeping the entire intersection in an Rc.
                    interval: Rc::clone(&o.interval),
                    id: o.id,
                }),
            }
        }

        Some(Ok(Intersections {
            base_interval,
            overlapping: overlapping_positions,
        }))
    }
}

/// Create a new IntersectionIterator given a query (base) and a vector of other positioned iterators.
197 | impl<'a> IntersectionIterator<'a> { 198 | pub fn new( 199 | base_iterator: Box, 200 | other_iterators: Vec>, 201 | chromosome_order: &'a HashMap, 202 | ) -> io::Result { 203 | let min_heap = BinaryHeap::new(); 204 | let called = vec![false; other_iterators.len()]; 205 | Ok(IntersectionIterator { 206 | base_iterator, 207 | other_iterators, 208 | min_heap, 209 | chromosome_order, 210 | dequeue: VecDeque::new(), 211 | previous_interval: None, 212 | called, 213 | heap_initialized: false, 214 | }) 215 | } 216 | 217 | fn init_heap(&mut self, base_interval: Rc) -> io::Result<()> { 218 | assert!(!self.heap_initialized); 219 | for (i, iter) in self.other_iterators.iter_mut().enumerate() { 220 | if let Some(positioned) = iter.next_position(Some(base_interval.as_ref())) { 221 | let positioned = positioned?; 222 | let chromosome_index = match self.chromosome_order.get(positioned.chrom()) { 223 | Some(c) => c.index, 224 | None => { 225 | let msg = format!( 226 | "invalid chromosome: {} in iterator {}", 227 | region_str(&positioned), 228 | self.other_iterators[i].name() 229 | ); 230 | return Err(Error::new(ErrorKind::Other, msg)); 231 | } 232 | }; 233 | self.min_heap.push(ReverseOrderPosition { 234 | position: positioned, 235 | chromosome_index, 236 | id: i, 237 | }); 238 | } 239 | } 240 | self.heap_initialized = true; 241 | Ok(()) 242 | } 243 | 244 | /// drop intervals from Q that are strictly before the base interval. 245 | fn pop_front(&mut self, base_interval: Rc) { 246 | while !self.dequeue.is_empty() 247 | && Ordering::Less 248 | == cmp( 249 | self.dequeue[0].interval.as_ref(), 250 | base_interval.as_ref(), 251 | self.chromosome_order, 252 | ) 253 | { 254 | _ = self.dequeue.pop_front(); 255 | } 256 | } 257 | 258 | fn out_of_order(&self, interval: Rc) -> bool { 259 | return match &self.previous_interval { 260 | None => false, // first interval in file. 
261 | Some(previous_interval) => { 262 | if previous_interval.chrom() != interval.chrom() { 263 | let pci = self.chromosome_order[previous_interval.chrom()].index; 264 | let ici = self.chromosome_order[interval.chrom()].index; 265 | return pci > ici; 266 | } 267 | previous_interval.start() > interval.start() 268 | || (previous_interval.start() == interval.start() 269 | && previous_interval.stop() > interval.stop()) 270 | } 271 | }; 272 | } 273 | // reset the array that tracks which iterators have been called with Some(Positioned) 274 | #[inline] 275 | fn zero_called(&mut self) { 276 | let ptr = self.called.as_mut_ptr(); 277 | unsafe { ptr.write_bytes(0, self.called.len()) }; 278 | } 279 | 280 | fn pull_through_heap(&mut self, base_interval: Rc) -> io::Result<()> { 281 | self.zero_called(); 282 | if !self.heap_initialized { 283 | // we wait til first iteration here to call init heap 284 | // because we need the base interval. 285 | self.init_heap(Rc::clone(&base_interval))?; 286 | } 287 | let other_iterators = self.other_iterators.as_mut_slice(); 288 | 289 | while let Some(ReverseOrderPosition { 290 | position, 291 | chromosome_index, 292 | id: file_index, 293 | .. 294 | }) = self.min_heap.pop() 295 | { 296 | // must always pull into the heap. 297 | let f = other_iterators 298 | .get_mut(file_index) 299 | .expect("expected interval iterator at file index"); 300 | // for a given base_interval, we make sure to call next_position with Some, only once. 301 | // subsequent calls will be with None. 
302 | let arg: Option<&Position> = if !self.called[file_index] { 303 | self.called[file_index] = true; 304 | Some(base_interval.as_ref()) 305 | } else { 306 | None 307 | }; 308 | if let Some(next_position) = f.next_position(arg) { 309 | let next_position = next_position?; 310 | let next_chromosome = match self.chromosome_order.get(next_position.chrom()) { 311 | Some(c) => c, 312 | None => { 313 | let msg = format!( 314 | "invalid chromosome: {} in iterator {}", 315 | region_str(&next_position), 316 | other_iterators[file_index].name() 317 | ); 318 | return Err(Error::new(ErrorKind::Other, msg)); 319 | } 320 | }; 321 | 322 | // check that intervals within a file are in order. 323 | if !(position.start() <= next_position.start() 324 | || chromosome_index < next_chromosome.index) 325 | { 326 | let msg = format!( 327 | "database intervals out of order ({} -> {}) in iterator: {}", 328 | region_str(&position), 329 | region_str(&next_position), 330 | other_iterators[file_index].name() 331 | ); 332 | return Err(Error::new(ErrorKind::Other, msg)); 333 | } 334 | self.min_heap.push(ReverseOrderPosition { 335 | position: next_position, 336 | chromosome_index, 337 | id: file_index, 338 | }); 339 | } 340 | 341 | // and we must always add the position to the Q 342 | let rc_pos = Rc::new(position); 343 | let intersection = Intersection { 344 | interval: rc_pos.clone(), 345 | id: file_index as u32, 346 | }; 347 | self.dequeue.push_back(intersection); 348 | 349 | // if this position is after base_interval, we can stop pulling through heap. 
350 | if cmp( 351 | base_interval.as_ref(), 352 | rc_pos.as_ref(), 353 | self.chromosome_order, 354 | ) == Ordering::Greater 355 | { 356 | break; 357 | } 358 | } 359 | Ok(()) 360 | } 361 | } 362 | 363 | #[cfg(test)] 364 | mod tests { 365 | use super::*; 366 | use crate::chrom_ordering::parse_genome; 367 | use crate::interval::Interval; 368 | 369 | struct Intervals { 370 | i: usize, 371 | name: String, 372 | ivs: Vec, 373 | } 374 | 375 | impl Intervals { 376 | fn new(name: String, ivs: Vec) -> Self { 377 | Intervals { 378 | i: 0, 379 | name, 380 | ivs: ivs 381 | .into_iter() 382 | .map(|i| Position::Interval(i)) 383 | .collect::>(), 384 | } 385 | } 386 | fn add(&mut self, iv: Interval) { 387 | self.ivs.push(Position::Interval(iv)); 388 | } 389 | } 390 | impl PositionedIterator for Intervals { 391 | fn name(&self) -> String { 392 | String::from(format!("{}:{}", self.name, self.i)) 393 | } 394 | 395 | fn next_position(&mut self, _o: Option<&Position>) -> Option> { 396 | if self.i >= self.ivs.len() { 397 | return None; 398 | } 399 | let p = self.ivs.remove(0); 400 | Some(Ok(p)) 401 | } 402 | } 403 | 404 | #[test] 405 | fn many_intervals() { 406 | let chrom_order = HashMap::from([ 407 | ( 408 | String::from("chr1"), 409 | Chromosome { 410 | index: 0, 411 | length: None, 412 | }, 413 | ), 414 | ( 415 | String::from("chr2"), 416 | Chromosome { 417 | index: 1, 418 | length: None, 419 | }, 420 | ), 421 | ]); 422 | let mut a_ivs = Intervals::new(String::from("A"), Vec::new()); 423 | let mut b_ivs = Intervals::new(String::from("B"), Vec::new()); 424 | let n_intervals = 100; 425 | let times = 3; 426 | for i in 0..n_intervals { 427 | let iv = Interval { 428 | chrom: String::from("chr1"), 429 | start: i, 430 | stop: i + 1, 431 | ..Default::default() 432 | }; 433 | a_ivs.add(iv); 434 | for _ in 0..times { 435 | let iv = Interval { 436 | chrom: String::from("chr1"), 437 | start: i, 438 | stop: i + 1, 439 | ..Default::default() 440 | }; 441 | b_ivs.add(iv); 442 | } 443 | } 444 | 
445 | b_ivs.ivs.sort_by(|a, b| a.start().cmp(&b.start())); 446 | 447 | let a_ivs: Box = Box::new(a_ivs); 448 | 449 | let mut iter = IntersectionIterator::new(a_ivs, vec![Box::new(b_ivs)], &chrom_order) 450 | .expect("error getting iterator"); 451 | let mut n = 0; 452 | assert!(iter.all(|intersection| { 453 | let intersection = intersection.expect("error getting intersection"); 454 | n += 1; 455 | assert!(intersection 456 | .overlapping 457 | .iter() 458 | .all(|p| p.interval.start() == intersection.base_interval.start())); 459 | intersection.overlapping.len() == times 460 | })); 461 | assert_eq!(n, n_intervals) 462 | } 463 | 464 | #[test] 465 | fn bookend_and_chrom() { 466 | let genome_str = "chr1\nchr2\nchr3\n"; 467 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 468 | let chrom = String::from("chr1"); 469 | let a_ivs = Intervals::new( 470 | String::from("A"), 471 | vec![ 472 | Interval { 473 | chrom: chrom.clone(), 474 | start: 0, 475 | stop: 10, 476 | ..Default::default() 477 | }, 478 | Interval { 479 | chrom: chrom.clone(), 480 | start: 0, 481 | stop: 10, 482 | ..Default::default() 483 | }, 484 | ], 485 | ); 486 | 487 | let b_ivs = Intervals::new( 488 | String::from("B"), 489 | vec![ 490 | Interval { 491 | chrom: chrom.clone(), 492 | start: 0, 493 | stop: 5, 494 | ..Default::default() 495 | }, 496 | Interval { 497 | chrom: chrom.clone(), 498 | start: 0, 499 | stop: 10, 500 | ..Default::default() 501 | }, 502 | Interval { 503 | // this interval should not overlap. 504 | chrom: chrom.clone(), 505 | start: 10, 506 | stop: 20, 507 | ..Default::default() 508 | }, 509 | Interval { 510 | // this interval should not overlap. 
511 | chrom: String::from("chr2"), 512 | start: 1, 513 | stop: 20, 514 | ..Default::default() 515 | }, 516 | ], 517 | ); 518 | 519 | let iter = IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order) 520 | .expect("error getting iterator"); 521 | iter.for_each(|intersection| { 522 | let intersection = intersection.expect("intersection"); 523 | assert_eq!(intersection.overlapping.len(), 2); 524 | assert!(intersection 525 | .overlapping 526 | .iter() 527 | .all(|p| { p.interval.start() == 0 })); 528 | }) 529 | } 530 | 531 | #[test] 532 | fn interval_beyond_end_of_chrom() { 533 | let genome_str = "chr1\t22\n"; 534 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 535 | let a_ivs = Intervals::new( 536 | String::from("A"), 537 | vec![ 538 | Interval { 539 | chrom: String::from("chr1"), 540 | start: 10, 541 | stop: 22, 542 | ..Default::default() 543 | }, 544 | Interval { 545 | chrom: String::from("chr1"), 546 | start: 1, 547 | stop: 23, 548 | ..Default::default() 549 | }, 550 | ], 551 | ); 552 | let mut iter = IntersectionIterator::new(Box::new(a_ivs), vec![], &chrom_order) 553 | .expect("error getting iterator"); 554 | 555 | let e = iter.nth(1).expect("error getting next"); 556 | assert!(e.is_err()); 557 | let e = e.err().unwrap(); 558 | assert!(e.to_string().contains("beyond end of chromosome")); 559 | } 560 | 561 | #[test] 562 | fn ordering_error() { 563 | let genome_str = "chr1\nchr2\nchr3\n"; 564 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 565 | let a_ivs = Intervals::new( 566 | String::from("A"), 567 | vec![ 568 | Interval { 569 | chrom: String::from("chr1"), 570 | start: 10, 571 | stop: 1, 572 | ..Default::default() 573 | }, 574 | Interval { 575 | chrom: String::from("chr1"), 576 | start: 1, 577 | stop: 2, 578 | ..Default::default() 579 | }, 580 | ], 581 | ); 582 | let mut iter = IntersectionIterator::new(Box::new(a_ivs), vec![], &chrom_order) 583 | .expect("error getting iterator"); 584 | 585 | let e = 
iter.nth(1).expect("error getting next"); 586 | assert!(e.is_err()); 587 | let e = e.err().unwrap(); 588 | assert!(e.to_string().contains("out of order")); 589 | 590 | // now repeat with database out of order. 591 | let a_ivs = Intervals::new( 592 | String::from("A"), 593 | vec![ 594 | Interval { 595 | chrom: String::from("chr1"), 596 | start: 1, 597 | stop: 2, 598 | ..Default::default() 599 | }, 600 | Interval { 601 | chrom: String::from("chr1"), 602 | start: 1, 603 | stop: 2, 604 | ..Default::default() 605 | }, 606 | ], 607 | ); 608 | // now repeat with database out of order. 609 | let b_ivs = Intervals::new( 610 | String::from("B"), 611 | vec![ 612 | Interval { 613 | chrom: String::from("chr1"), 614 | start: 1, 615 | stop: 2, 616 | ..Default::default() 617 | }, 618 | Interval { 619 | chrom: String::from("chr1"), 620 | start: 0, 621 | stop: 2, 622 | ..Default::default() 623 | }, 624 | ], 625 | ); 626 | 627 | let mut iter = 628 | IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order) 629 | .expect("error getting iterator"); 630 | let e = iter.next().expect("error getting next"); 631 | assert!(e.is_err()); 632 | let e = e.err().unwrap(); 633 | assert!(e.to_string().contains("out of order")); 634 | } 635 | 636 | #[test] 637 | fn multiple_sources() { 638 | let genome_str = "chr1\nchr2\nchr3\n"; 639 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 640 | let a_ivs = Intervals::new( 641 | String::from("A"), 642 | vec![Interval { 643 | chrom: String::from("chr1"), 644 | start: 0, 645 | stop: 1, 646 | ..Default::default() 647 | }], 648 | ); 649 | let b_ivs = Intervals::new( 650 | String::from("B"), 651 | vec![Interval { 652 | chrom: String::from("chr1"), 653 | start: 0, 654 | stop: 1, 655 | ..Default::default() 656 | }], 657 | ); 658 | let c_ivs = Intervals::new( 659 | String::from("c"), 660 | vec![Interval { 661 | chrom: String::from("chr1"), 662 | start: 0, 663 | stop: 1, 664 | ..Default::default() 665 | }], 666 | ); 667 | let 
iter = IntersectionIterator::new( 668 | Box::new(a_ivs), 669 | vec![Box::new(b_ivs), Box::new(c_ivs)], 670 | &chrom_order, 671 | ) 672 | .expect("error getting iterator"); 673 | let c = iter 674 | .map(|intersection| { 675 | let intersection = intersection.expect("error getting intersection"); 676 | dbg!(&intersection.overlapping); 677 | assert_eq!(intersection.overlapping.len(), 2); 678 | // check that we got from source 1 and source 2. 679 | assert_ne!( 680 | intersection.overlapping[0].id, 681 | intersection.overlapping[1].id 682 | ); 683 | 1 684 | }) 685 | .sum::(); 686 | assert_eq!(c, 1); 687 | } 688 | 689 | #[test] 690 | #[ignore] 691 | fn zero_length() { 692 | let genome_str = "chr1\nchr2\nchr3\n"; 693 | let chrom_order = parse_genome(genome_str.as_bytes()).unwrap(); 694 | let a_ivs = Intervals::new( 695 | String::from("A"), 696 | vec![Interval { 697 | chrom: String::from("chr1"), 698 | start: 1, 699 | stop: 1, 700 | ..Default::default() 701 | }], 702 | ); 703 | let b_ivs = Intervals::new( 704 | String::from("B"), 705 | vec![Interval { 706 | chrom: String::from("chr1"), 707 | start: 1, 708 | stop: 1, 709 | ..Default::default() 710 | }], 711 | ); 712 | let iter = IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order) 713 | .expect("error getting iterator"); 714 | // check that it overlapped by asserting that the loop ran and also that there was an overlap within the loop. 715 | let c = iter 716 | .map(|intersection| { 717 | let intersection = intersection.expect("error getting intersection"); 718 | assert!(intersection.overlapping.len() == 1); 719 | 1 720 | }) 721 | .sum::(); 722 | // NOTE this fails as we likely need to fix the lt function. 
723 | assert_eq!(c, 1); 724 | } 725 | } 726 | -------------------------------------------------------------------------------- /src/interval.rs: -------------------------------------------------------------------------------- 1 | use crate::position::{Field, FieldError, Value}; 2 | use crate::string::String; 3 | /// Interval type is a simple struct that can be used as a default interval type. 4 | /// It has a chromosome, start, and stop field along with a (linear) HashMap of Values. 5 | use linear_map::LinearMap; 6 | use std::fmt::Debug; 7 | 8 | #[derive(Debug, Default)] 9 | pub struct Interval { 10 | pub chrom: String, 11 | pub start: u64, 12 | pub stop: u64, 13 | pub fields: LinearMap, 14 | } 15 | 16 | impl Interval { 17 | #[inline] 18 | pub fn start(&self) -> u64 { 19 | self.start 20 | } 21 | #[inline] 22 | pub fn stop(&self) -> u64 { 23 | self.stop 24 | } 25 | #[inline] 26 | pub fn chrom(&self) -> &str { 27 | &self.chrom 28 | } 29 | 30 | #[inline] 31 | pub fn value(&self, f: Field) -> Result { 32 | match f { 33 | Field::String(name) => match self.fields.get(&name) { 34 | None => Err(FieldError::InvalidFieldName(name)), 35 | Some(v) => match v { 36 | Value::Strings(s) => Ok(Value::Strings(s.clone())), 37 | Value::Ints(i) => Ok(Value::Ints(i.clone())), 38 | Value::Floats(f) => Ok(Value::Floats(f.clone())), 39 | }, 40 | }, 41 | Field::Int(i) => { 42 | let name = self.fields.keys().nth(i); 43 | match name { 44 | None => Err(FieldError::InvalidFieldIndex(i)), 45 | Some(name) => match self.fields.get(name) { 46 | None => Err(FieldError::InvalidFieldName(name.clone())), 47 | Some(v) => match v { 48 | Value::Strings(s) => Ok(Value::Strings(s.clone())), 49 | Value::Ints(i) => Ok(Value::Ints(i.clone())), 50 | Value::Floats(f) => Ok(Value::Floats(f.clone())), 51 | }, 52 | }, 53 | } 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/lib.rs: 
--------------------------------------------------------------------------------
//! Bedder is a library for intersecting genomic data.

/// Intersection iterators and data structures.
pub mod intersection;

/// Position traits.
pub mod position;

/// Interval type.
pub mod interval;

/// a std::String::String unless other string features are enabled.
pub mod string;

/// File format and compression detection, and opening files as positioned iterators.
pub mod sniff;

/// Chromosome ordering parsed from a genome file.
pub mod chrom_ordering;

#[cfg(feature = "bed")]
/// Bed parser implementing the PositionedIterator trait.
pub mod bedder_bed;

#[cfg(feature = "vcf")]
/// Vcf parser implementing the PositionedIterator trait.
pub mod bedder_vcf;

--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
extern crate bedder;
use bedder::sniff;
use clap::Parser;
use std::env;
use std::path::PathBuf;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about=None)]
struct Args {
    #[arg(help = "input file", short = 'a')]
    query_path: PathBuf,
    #[arg(help = "other file", short = 'b', required = true)]
    other_paths: Vec<PathBuf>,
    #[arg(
        help = "genome file for chromosome ordering",
        short = 'g',
        required = true
    )]
    genome_file: PathBuf,
}

/// Intersect the query file (`-a`) with every other file (`-b`), using the genome
/// file (`-g`) for chromosome order, and print each intersection to stdout.
///
/// Errors from opening files or from the intersection itself are returned to the
/// caller rather than panicking.
pub fn main() -> Result<(), Box<dyn std::error::Error>> {
    // default to info-level logging for this crate unless the user chose otherwise.
    if env::var("RUST_LOG").is_err() {
        env::set_var("RUST_LOG", "bedder=info");
    }
    env_logger::init();
    log::info!("starting up");
    let args = Args::parse();

    let chrom_order =
        bedder::chrom_ordering::parse_genome(std::fs::File::open(&args.genome_file)?)?;

    let a_iter = sniff::open_file(&args.query_path)?;
    let b_iters = args
        .other_paths
        .iter()
        .map(|p| sniff::open_file(p))
        .collect::<Result<Vec<_>, _>>()?;

    let ii = bedder::intersection::IntersectionIterator::new(a_iter, b_iters, &chrom_order)?;
    // iterate over the intersections
    for intersection in ii {
        let intersection = intersection?;
        println!("{:?}", intersection);
    }
    Ok(())
}

--------------------------------------------------------------------------------
/src/position.rs:
--------------------------------------------------------------------------------
use crate::string::String;
use std::fmt::{self, Debug};
use std::io;
use std::result;

/// A Value is a vector of integers, floats, or strings.
/// Often this will be a single value.
// NOTE(review): the element types were lost when this file was extracted;
// `i64` / `f64` / `String` are assumed here — confirm against the repository.
#[derive(Debug)]
pub enum Value {
    Ints(Vec<i64>),
    Floats(Vec<f64>),
    Strings(Vec<String>),
}

/// Field is either an integer, as in a bed column
/// or a string, as in a vcf info field.
#[derive(Debug)]
pub enum Field {
    String(String),
    Int(usize),
}

/// Error returned when a field is not found.
#[derive(Debug)]
pub enum FieldError {
    InvalidFieldIndex(usize),
    InvalidFieldName(String),
}

impl fmt::Display for FieldError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            FieldError::InvalidFieldIndex(i) => write!(f, "invalid column index: {}", i),
            FieldError::InvalidFieldName(s) => write!(f, "invalid column name: {}", s),
        }
    }
}

impl std::error::Error for FieldError {}

/// A Positioned has a position in the genome. It is a bed-like (half-open) interval.
pub trait Positioned: Debug {
    fn chrom(&self) -> &str;
    /// 0-based start position.
    fn start(&self) -> u64;
    /// non-inclusive end;
    fn stop(&self) -> u64;

    // get back the original line?
    //fn line(&self) -> &'a str;
}

/// A Valued has a means to extract values from integer or string columns.
pub trait Valued {
    // extract a value from the Positioned object Col
    fn value(&self, b: Field) -> result::Result<Value, FieldError>;
}

#[derive(Debug)]
pub enum Position {
    Bed(crate::bedder_bed::Record<3>),
    // Note: we use a Box here because a vcf Record is large.
    // NOTE(review): the boxed type was lost in extraction; `BedderRecord` is assumed — confirm.
    Vcf(Box<crate::bedder_vcf::BedderRecord>),
    Interval(crate::interval::Interval),
    // catch-all in case we have another interval type.
    #[cfg(feature = "dyn_positioned")]
    Other(Box<dyn Positioned>),
}

impl Position {
    /// Chromosome name of the wrapped record.
    #[inline]
    pub fn chrom(&self) -> &str {
        match self {
            Position::Bed(b) => b.chrom(),
            Position::Vcf(v) => v.chrom(),
            Position::Interval(i) => &i.chrom,
            #[cfg(feature = "dyn_positioned")]
            Position::Other(o) => o.chrom(),
        }
    }

    /// Start of the wrapped record.
    #[inline]
    pub fn start(&self) -> u64 {
        match self {
            Position::Bed(b) => b.start(),
            Position::Vcf(v) => v.start(),
            Position::Interval(i) => i.start,
            #[cfg(feature = "dyn_positioned")]
            Position::Other(o) => o.start(),
        }
    }

    /// Stop of the wrapped record.
    #[inline]
    pub fn stop(&self) -> u64 {
        match self {
            Position::Bed(b) => b.stop(),
            Position::Vcf(v) => v.stop(),
            Position::Interval(i) => i.stop,
            #[cfg(feature = "dyn_positioned")]
            Position::Other(o) => o.stop(),
        }
    }
}

#[cfg(feature = "dyn_positioned")]
impl Valued for Box<dyn Valued> {
    #[inline]
    fn value(&self, f: Field) -> result::Result<Value, FieldError> {
        // Dispatch to the boxed object. A plain `self.value(f)` resolves to this very
        // method (receiver `&Box<dyn Valued>`) and would recurse without end.
        (**self).value(f)
    }
}

// Delegate the boxed version of this trait object to the inner object.
114 | impl Positioned for Box { 115 | fn chrom(&self) -> &str { 116 | self.as_ref().chrom() 117 | } 118 | 119 | fn start(&self) -> u64 { 120 | self.as_ref().start() 121 | } 122 | 123 | fn stop(&self) -> u64 { 124 | self.as_ref().stop() 125 | } 126 | } 127 | 128 | impl PartialEq for dyn Positioned { 129 | fn eq(&self, other: &dyn Positioned) -> bool { 130 | self.start() == other.start() 131 | && self.stop() == other.stop() 132 | && self.chrom() == other.chrom() 133 | } 134 | } 135 | 136 | /// PositionedIterator is an iterator over Positioned objects. 137 | pub trait PositionedIterator { 138 | /// A name for the iterator. This is most often the file path, perhaps with the line number appended. 139 | /// Used to provide informative messages to the user. 140 | fn name(&self) -> String; 141 | 142 | /// return the next Positioned from the iterator. 143 | /// It is fine for implementers to ignore `q`; 144 | /// Some iterators may improve performance by using `q` to index-skip. 145 | /// `q` will be Some only on the first call for a given query interval. 146 | /// Calls where `q` is None should return the next Positioned in the iterator (file) that has not 147 | /// been returned previously. Intervals should only be returned once (even across many identical query intervals) 148 | /// and they should always be returned in order (querys will always be in order). 149 | /// Thus, if the implementer heeds `q` it should check that the returned Positioned is greater than the previously 150 | /// returned position (Positioned equal to previously returned position should have already been returned). 
151 | fn next_position( 152 | &mut self, 153 | q: Option<&Position>, 154 | ) -> Option>; 155 | } 156 | -------------------------------------------------------------------------------- /src/sniff.rs: -------------------------------------------------------------------------------- 1 | use flate2::read::GzDecoder; 2 | use std::io::{BufRead, Read, Seek}; 3 | use std::path::Path; 4 | 5 | use crate::bedder_bed::BedderBed; 6 | use crate::bedder_vcf::BedderVCF; 7 | use crate::position::PositionedIterator; 8 | use noodles::bgzf; 9 | 10 | /// File formats supported by this file detector. 11 | #[derive(Debug, PartialEq)] 12 | pub enum FileFormat { 13 | VCF, 14 | BCF, 15 | BAM, 16 | CRAM, 17 | SAM, 18 | BED, 19 | CSI, 20 | Unknown, 21 | } 22 | 23 | /// Possible Compression formats. 24 | #[derive(Debug, PartialEq)] 25 | pub enum Compression { 26 | None, 27 | GZ, 28 | BGZF, 29 | RAZF, 30 | } 31 | 32 | pub fn open_file

(path: P) -> std::io::Result> 33 | where 34 | P: AsRef, 35 | { 36 | let file = std::fs::File::open(&path)?; 37 | let r = open_reader(file, path); 38 | r 39 | } 40 | 41 | pub fn open_reader(reader: R, path: P) -> std::io::Result> 42 | where 43 | R: Read + Seek + 'static, 44 | P: AsRef, 45 | { 46 | let mut reader = std::io::BufReader::new(reader); 47 | let (format, compression) = detect_file_format(&mut reader, &path)?; 48 | log::info!( 49 | "path: {:?}, format: {:?} compression: {:?}", 50 | path.as_ref(), 51 | format, 52 | compression 53 | ); 54 | /* 55 | */ 56 | match format { 57 | FileFormat::VCF | FileFormat::BCF => { 58 | // get &str from path 59 | let path = path.as_ref().to_str().unwrap(); 60 | let x = xvcf::Reader::from_reader(Box::new(reader), Some(path))?; 61 | let bed_vcf = BedderVCF::new(x)?; 62 | Ok(Box::new(bed_vcf)) 63 | } 64 | _ => { 65 | let br: Box = match compression { 66 | Compression::None => Box::new(reader), 67 | Compression::GZ => Box::new(std::io::BufReader::new(GzDecoder::new(reader))), 68 | Compression::BGZF => match format { 69 | // BCF|BAM will appear as bgzf so we don't want to do this outside 70 | FileFormat::BCF | FileFormat::BAM => Box::new(reader), 71 | _ => Box::new(bgzf::Reader::new(reader)), 72 | }, 73 | Compression::RAZF => unimplemented!(), 74 | }; 75 | 76 | match format { 77 | FileFormat::BED => { 78 | let reader = BedderBed::new(br); 79 | Ok(Box::new(reader)) 80 | } 81 | _ => unimplemented!("{format:?} not yet supported"), 82 | } 83 | } 84 | } 85 | } 86 | 87 | /// detect the file format of a reader. 
88 | pub fn detect_file_format>( 89 | reader: &mut R, 90 | path: P, 91 | ) -> std::io::Result<(FileFormat, Compression)> { 92 | let buf = reader.fill_buf()?; 93 | let mut dec_buf = vec![0; buf.len()]; 94 | 95 | let is_gzipped = &buf[0..2] == b"\x1f\x8b"; 96 | let (compression, dec_buf) = if is_gzipped && buf[3] & 4 != 0 && buf.len() >= 18 { 97 | let c = match &buf[12..16] { 98 | // BGZF magic number 99 | b"BC\x02\x00" => Compression::BGZF, 100 | // RAZF magic number 101 | b"RAZF" => Compression::RAZF, 102 | _ => Compression::GZ, 103 | }; 104 | 105 | let mut gz = GzDecoder::new(buf); 106 | // it's ok if we have an unexepected EOF here 107 | match gz.read_exact(&mut dec_buf) { 108 | Ok(_) => {} 109 | Err(e) => { 110 | if e.kind() != std::io::ErrorKind::UnexpectedEof { 111 | return Err(e); 112 | } 113 | } 114 | } 115 | (c, dec_buf.as_slice()) 116 | } else { 117 | ( 118 | if is_gzipped { 119 | Compression::GZ 120 | } else { 121 | Compression::None 122 | }, 123 | buf, 124 | ) 125 | }; 126 | 127 | let format = if dec_buf.starts_with(b"BAM\x01") { 128 | FileFormat::BAM 129 | } else if &dec_buf[0..3] == b"BCF" && (dec_buf[3] == 0x2 || dec_buf[3] == 0x4) { 130 | FileFormat::BCF 131 | } else if dec_buf.starts_with(b"##fileformat=VCF") { 132 | FileFormat::VCF 133 | } else if dec_buf.starts_with(b"CRAM") { 134 | FileFormat::CRAM 135 | } else if dec_buf.len() > 3 136 | && (&dec_buf[0..4] == b"@HD\t" 137 | || &dec_buf[0..4] == b"@SQ\t" 138 | || &dec_buf[0..4] == b"@RG\t" 139 | || &dec_buf[0..4] == b"@PG\t" 140 | || &dec_buf[0..4] == b"@CO\t") 141 | { 142 | FileFormat::SAM 143 | } else { 144 | let p = path.as_ref(); 145 | if p.ends_with(".bed") || p.ends_with(".bed.gz") || p.ends_with(".bed.bgz") { 146 | FileFormat::BED 147 | } else { 148 | FileFormat::Unknown 149 | } 150 | }; 151 | 152 | if matches!(format, FileFormat::Unknown) { 153 | let s = String::from_utf8_lossy(dec_buf); 154 | let mut lines = s 155 | .lines() 156 | .filter(|l| !l.is_empty() && !l.starts_with('#')) 157 | 
.collect::>(); 158 | if lines 159 | .last() 160 | .map(|l| !l.ends_with('\n') && l.split('\t').collect::>().len() < 3) 161 | .unwrap_or(false) 162 | { 163 | // drop the final incomplete line 164 | lines.pop(); 165 | } 166 | 167 | if !lines.is_empty() && lines.iter().all(|&line| is_bed_line(line)) { 168 | return Ok((FileFormat::BED, compression)); 169 | } 170 | } 171 | 172 | Ok((format, compression)) 173 | } 174 | 175 | fn is_bed_line(s: &str) -> bool { 176 | if s.starts_with('#') { 177 | return true; 178 | } 179 | let cols: Vec<_> = s.split('\t').collect(); 180 | if cols.len() < 3 { 181 | return false; 182 | } 183 | // check that 2nd and 3rd cols are integers 184 | cols[1].parse::().is_ok() && cols[2].parse::().is_ok() 185 | } 186 | 187 | #[cfg(test)] 188 | mod tests { 189 | 190 | use super::*; 191 | use noodles::bam; 192 | use noodles::sam; 193 | 194 | #[test] 195 | fn test_detect_format_bam() { 196 | let file_path = "tests/test.bam"; 197 | let mut fs = std::fs::File::open(file_path).unwrap(); 198 | let mut rdr = std::io::BufReader::new(&mut fs); 199 | let (format, compression) = detect_file_format(&mut rdr, file_path).unwrap(); 200 | assert_eq!(compression, Compression::BGZF); 201 | assert_eq!(format, FileFormat::BAM); 202 | 203 | let mut b = bam::reader::Reader::new(&mut rdr); 204 | let h = b.read_header().expect("eror reading header"); 205 | for r in b.records(&h) { 206 | let r = r.expect("error reading record"); 207 | eprintln!("{:?}", r); 208 | } 209 | } 210 | 211 | #[test] 212 | fn test_detect_format_sam() { 213 | let file_path = "tests/test.sam"; 214 | let mut fs = std::fs::File::open(file_path).unwrap(); 215 | let mut rdr = std::io::BufReader::new(&mut fs); 216 | let (format, compression) = detect_file_format(&mut rdr, file_path).unwrap(); 217 | assert_eq!(compression, Compression::None); 218 | assert_eq!(format, FileFormat::SAM); 219 | 220 | let mut b = sam::reader::Reader::new(&mut rdr); 221 | let h = b.read_header().expect("eror reading header"); 222 | 
for r in b.records(&h) { 223 | let r = r.expect("error reading record"); 224 | eprintln!("{:?}", r); 225 | } 226 | } 227 | 228 | #[test] 229 | fn test_is_bed_line() { 230 | // Test valid BED line 231 | let valid_bed_line = "chr1\t100\t200\tname\t0\t+\t50\t150\t0\t2\t10,20\t0,80"; 232 | assert!(is_bed_line(valid_bed_line)); 233 | 234 | // Test invalid BED line with missing columns 235 | let invalid_bed_line = "chr1\t100"; 236 | assert!(!is_bed_line(invalid_bed_line)); 237 | 238 | // Test invalid BED line with non-integer columns 239 | let invalid_bed_line = "chr1\ta\tb\tname\t0\t+\t50\t150\t0\t2\t10,20\t0,80"; 240 | assert!(!is_bed_line(invalid_bed_line)); 241 | 242 | // Test comment line 243 | let comment_line = "# This is a comment"; 244 | assert!(is_bed_line(comment_line)); 245 | 246 | // single interval with no newline. 247 | let valid_bed_line = "chr1\t100\t200"; 248 | assert!(is_bed_line(valid_bed_line)); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /src/string.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "smol_str")] 2 | pub use smol_str::SmolStr as String; 3 | 4 | #[cfg(feature = "smartstring")] 5 | pub use smartstring::alias::String; 6 | 7 | #[cfg(feature = "compact_str")] 8 | pub use compact_str::CompactString as String; 9 | 10 | #[cfg(feature = "kstring")] 11 | pub use kstring::KString as String; 12 | 13 | #[cfg(all( 14 | not(feature = "smartstring"), 15 | not(feature = "smol_str"), 16 | not(feature = "compact_str"), 17 | not(feature = "kstring"), 18 | ))] 19 | pub use std::string::String; 20 | -------------------------------------------------------------------------------- /tests/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quinlan-lab/bedder-rs/ca1ff342611ed2b968729526cdd75ae988c4ea3b/tests/test.bam 
-------------------------------------------------------------------------------- /tests/test.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.6 SO:coordinate 2 | @SQ SN:chr1 LN:1009800 3 | A 16 chr1 999901 42 100M * 0 0 ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU 4 | B 0 chr1 999914 42 100M * 0 0 TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU 5 | --------------------------------------------------------------------------------