├── .github
    └── workflows
    │   └── rust.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── benches
    └── random_intervals.rs
├── docs.sh
├── examples
    └── intersect_bed_count.rs
├── src
    ├── bedder_bed.rs
    ├── bedder_vcf.rs
    ├── chrom_ordering.rs
    ├── intersection.rs
    ├── interval.rs
    ├── lib.rs
    ├── main.rs
    ├── position.rs
    ├── sniff.rs
    └── string.rs
└── tests
    ├── test.bam
    └── test.sam


/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
 1 | on: [push, pull_request]
 2 | 
 3 | name: Continuous integration
 4 | 
 5 | jobs:
 6 |   check:
 7 |     name: Check
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 |       - uses: actions-rs/toolchain@v1
12 |         with:
13 |           profile: minimal
14 |           toolchain: stable
15 |           override: true
16 |       - uses: actions-rs/cargo@v1
17 |         with:
18 |           command: check
19 | 
20 |   test:
21 |     name: Test Suite
22 |     runs-on: ubuntu-latest
23 |     steps:
24 |       - uses: actions/checkout@v2
25 |       - uses: actions-rs/toolchain@v1
26 |         with:
27 |           profile: minimal
28 |           toolchain: stable
29 |           override: true
30 |       - uses: actions-rs/cargo@v1
31 |         with:
32 |           command: test
33 |       - uses: actions-rs/cargo@v1
34 |         with:
35 |           command: test
36 |           args: --features smartstring
37 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
3 | docs/
4 | *.bed
5 | *.bed.gz
6 | *.vcf
7 | .vscode
8 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "bedder"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | [lib]
 8 | name = "bedder"
 9 | path = "src/lib.rs"
10 | 
11 | [[bin]]
12 | name = "bedder"
13 | path = "src/main.rs"
14 | 
15 | 
16 | [dependencies]
17 | rand = "0.8.5"
18 | smartstring = { version = "1.0.1", optional = true }
19 | smol_str = { version = "0.2.0", optional = true }
20 | compact_str = { version = "0.7.0", optional = true }
21 | kstring = { version = "2.0.0", optional = true }
22 | noodles = { version = "0.52.0" }
23 | flate2 = "1.0.26"
24 | clap = { version = "4.2.7", features = ['derive'] }
25 | env_logger = "0.10.0"
26 | log = "0.4.19"
27 | linear-map = "1.2.0"
28 | hashbrown = "0.14.0"
29 | xvcf = { version = "0.1.4", git = "https://github.com/brentp/xvcf-rs" }
30 | 
31 | [features]
32 | default = ["bed", "vcf", "bcf", "csi", "core", "bam", "sam", "bgzf"]
33 | bam = ["noodles/bam"]
34 | bed = ["noodles/bed"]
35 | bgzf = ["noodles/bgzf"]
36 | #cram = ["noodles/cram"]
37 | sam = ["noodles/sam"]
38 | vcf = ["noodles/vcf"]
39 | csi = ["noodles/csi"]
40 | core = ["noodles/core"]
41 | bcf = ["noodles/bcf"]
42 | # allow a Box<dyn Positioned> in the enum to support user-specified types.
43 | dyn_positioned = []
44 | 
45 | [dev-dependencies]
46 | criterion = { version = "0.4", features = ["html_reports"] }
47 | clap = { version = "4.2.7", features = ["derive"] }
48 | 
49 | [[bench]]
50 | name = "random_intervals"
51 | harness = false
52 | 
53 | [profile.release]
54 | lto = true
55 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Brent Pedersen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!--- 
  2 | # build
  3 | target=x86_64-unknown-linux-gnu
  4 | export RUSTFLAGS="-C target-feature=-crt-static -C relocation-model=pie"
  5 | cargo test --release --target $target \
  6 | && cargo build --release --target $target
  7 | --->
  8 | 
  9 | [![status](https://github.com/quinlan-lab/bedder-rs/actions/workflows/rust.yml/badge.svg)](https://github.com/quinlan-lab/bedder-rs/actions/)
 10 | 
 11 | # bedder (tools)
 12 | 
 13 | This is an early release of the library for feedback, especially from rust practitioners. If interested,
 14 | read below and then, for example, have a look at [issue 2](https://github.com/quinlan-lab/bedder-rs/issues/2) and the associated [discussion](https://github.com/quinlan-lab/bedder-rs/discussions/3)
 15 | 
 16 | This library aims to provide:
 17 | 
 18 | - [x] an abstraction so any interval types from sorted sources can be intersected together
 19 | - [x] the rust implementation of the heap and Queue to find intersections with minimal overhead
 20 | - [ ] bedder wrappers for:
 21 |   + [x] bed
 22 |   + [x] vcf/bcf
 23 |   + [ ] sam/bam/cram
 24 |   + [ ] gff/gtf
 25 |   + [ ] generalized tabixed/csi files
 26 | - [ ] downstream APIs to perform operations on the intersections
 27 | - [ ] a python library to interact with the intersections
 28 | 
 29 | The API looks as follows
 30 | 
 31 | Any genomic position from any data source can be intersected by this library as long as it implements this trait:
 32 | 
 33 | ```rust
 34 | 
 35 | pub trait Positioned {
 36 |     fn chrom(&self) -> &str;
 37 |     fn start(&self) -> u64;
 38 |     fn stop(&self) -> u64;
 39 | 
 40 |     // extract a value from the Positioned object Field
 41 |     fn value(&self, f: Field) -> Result<Value, FieldError>;
 42 | }
 43 | 
 44 | /// Value can be any number of Ints, Floats, or Strings.
 45 | pub enum Value {
 46 |     Ints(Vec<i64>),
 47 |     Floats(Vec<f64>),
 48 |     Strings(Vec<String>),
 49 | }
 50 | 
 51 | /// Field is either an integer: the i'th column.
 52 | /// Or a String, e.g. "INFO.DP".
 53 | pub enum Field {
 54 |     String(String),
 55 |     Int(usize),
 56 | }
 57 | 
 58 | pub enum FieldError {
 59 |     InvalidFieldIndex(usize),
 60 |     InvalidFieldName(String),
 61 | }
 62 | 
 63 | ```
 64 | 
 65 | Then each file-type (VCF/BAM/etc) would implement this trait
 66 | 
 67 | ```rust
 68 | // something that generates Positioned things (BED/VCF/BAM/GFF/etc.)
 69 | pub trait PositionedIterator {
 70 |     type Item: Positioned;
 71 | 
 72 |     /// Q can be ignored. See below for more detail.
 73 |     fn next_position(&mut self, q: Option<&dyn Positioned>) -> Option<Self::Item>;
 74 | 
 75 |     /// A name for the iterator (likely filename) used by this library when logging.
 76 |     fn name(&self) -> String;
 77 | }
 78 | ```
 79 | 
 80 | Anything that can create a `PositionedIterator` can be used by the library.
 81 | 
 82 | Note the `q` argument to `next_position`. This can be ignored by implementers but can be used to skip.
 83 | For each query interval, we may make many calls to `next_position`. On the first of those calls, `q`
 84 | is `Some(query_position)`. The implementer can choose to use this information to skip (rather than stream)
 85 | for example with an index (or search) to the first interval that overlaps the `q`. Subsequent calls for the
 86 | same interval will be called with `q` of `None`. The implementer must:
 87 | 
 88 | 1. Always return an interval (unless EOF is reached)
 89 | 1. Always return intervals in order.
 90 | 1. Never return an interval that was returned previously (even if the same query interval appears multiple times).
 91 | 
 92 | # Implementation Brief
 93 | 
 94 | All Positioned structs are pulled through a min-heap. Each time an interval (with the smallest genomic position) is pulled from the min heap,
 95 | a new struct is pulled from the file where that interval originated. Then the pulled interval is pushed onto a `queue` (actually a deque because that's what is in the rust standard library).
 96 | We then know the queue is in order. For each query interval, we drop from the queue any interval that is strictly _before_ the interval,
 97 | then pull into the Intersection result any interval that is not _after_ the interval. Then return the result from the `next` call.
 98 | We use `Rc` because each database interval may be attached to more than one query interval.
 99 | 
100 | # Acknowledgements
101 | 
102 | - We received very valuable `rust` feedback and code from @sstadick.
103 | - We leverage the excellent [noodles](https://github.com/zaeleus/noodles) library.
104 | 


--------------------------------------------------------------------------------
/benches/random_intervals.rs:
--------------------------------------------------------------------------------
 1 | use bedder::chrom_ordering::parse_genome;
 2 | use bedder::intersection::IntersectionIterator;
 3 | use bedder::interval::Interval;
 4 | use bedder::position::{Position, PositionedIterator};
 5 | use bedder::string::String;
 6 | use criterion::{black_box, criterion_group, criterion_main, Criterion};
 7 | use rand::Rng;
 8 | use std::io;
 9 | 
10 | struct Intervals {
11 |     i: usize,
12 |     name: String,
13 |     n_intervals: usize,
14 |     curr_max: f64,
15 |     rng: rand::rngs::ThreadRng,
16 |     interval_len: u64,
17 |     saved_chrom: String,
18 | }
19 | 
20 | impl Intervals {
21 |     fn new(name: String, n_intervals: usize, interval_len: u64) -> Self {
22 |         Intervals {
23 |             i: 0,
24 |             name: name,
25 |             n_intervals,
26 |             curr_max: 1.0,
27 |             rng: rand::thread_rng(),
28 |             interval_len: interval_len,
29 |             saved_chrom: String::from("chr1"),
30 |         }
31 |     }
32 | }
33 | 
34 | impl PositionedIterator for Intervals { // synthetic interval source used only by this benchmark
35 |     fn name(&self) -> String { // label = configured name plus the number of intervals produced so far
36 |         String::from(format!("{}:{}", self.name, self.i))
37 |     }
38 | 
39 |     fn next_position(&mut self, _q: Option<&Position>) -> Option<io::Result<Position>> { // the query hint `_q` is ignored
40 |         if self.i < self.n_intervals { // produce exactly `n_intervals` items, then None
41 |             self.i += 1;
42 |             let r: f64 = self.rng.gen();
43 |             self.curr_max *= r.powf(self.i as f64); // assuming gen() yields values in [0, 1), curr_max never grows, so `start` below never decreases
44 |             let start = ((1.0 - self.curr_max) * (MAX_POSITION as f64)) as u64; // map (1 - curr_max) onto 0..=MAX_POSITION
45 |             Some(Ok(Position::Interval(Interval {
46 |                 chrom: self.saved_chrom.clone(), // every interval is on the single saved chromosome
47 |                 start: start,
48 |                 stop: start + self.interval_len, // fixed-width intervals
49 |                 ..Default::default()
50 |             })))
51 |         } else {
52 |             None
53 |         }
54 |     }
55 | }
56 | 
57 | const MAX_POSITION: u64 = 10_000;
58 | 
59 | pub fn intersection_benchmark(c: &mut Criterion) {
60 |     let genome_str = "chr1\nchr2\n";
61 |     let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
62 | 
63 |     c.bench_function("simple intersection", |b| {
64 |         b.iter_with_large_drop(|| {
65 |             let a_ivs = Box::new(Intervals::new(String::from("a"), 100, 1000));
66 |             let b_ivs = Box::new(Intervals::new(String::from("b"), 100_000, 100));
67 |             let iter = IntersectionIterator::new(a_ivs, vec![b_ivs], &chrom_order)
68 |                 .expect("error getting iterator");
69 | 
70 |             iter.for_each(|intersection| {
71 |                 let intersection = intersection.expect("error getting intersection");
72 |                 black_box(intersection.overlapping);
73 |             });
74 |         });
75 |     });
76 | }
77 | 
78 | criterion_group!(benches, intersection_benchmark);
79 | criterion_main!(benches);
80 | 


--------------------------------------------------------------------------------
/docs.sh:
--------------------------------------------------------------------------------
1 | # https://dev.to/deciduously/prepare-your-rust-api-docs-for-github-pages-2n5i
2 | rm -rf ./docs target/doc/  # clear stale output BEFORE building; doing this after `cargo doc` deleted the freshly generated docs
3 | cargo doc --no-deps  # regenerate API docs into target/doc/
4 | echo "<meta http-equiv=\"refresh\" content=\"0; url=bedder/index.html\">" > target/doc/index.html  # top-level page redirects to the bedder crate docs
5 | cp -r target/doc ./docs  # publish the generated tree as ./docs
6 | 


--------------------------------------------------------------------------------
/examples/intersect_bed_count.rs:
--------------------------------------------------------------------------------
 1 | use std::fs;
 2 | use std::io::{self, BufReader, BufWriter, Write};
 3 | use std::path::PathBuf;
 4 | 
 5 | use bedder::sniff;
 6 | use clap::Parser;
 7 | extern crate bedder;
 8 | use crate::bedder::chrom_ordering::parse_genome;
 9 | use crate::bedder::intersection::IntersectionIterator;
10 | 
11 | #[derive(Parser, Debug)]
12 | struct Args {
13 |     a: PathBuf,
14 |     b: PathBuf,
15 | 
16 |     fai: PathBuf,
17 | }
18 | 
19 | fn main() -> io::Result<()> { // for each interval in `a`, print it with the number of overlapping `b` intervals
20 |     let args = Args::parse();
21 | 
22 |     // sniff determines the file type (bam/cram/bcf/vcf/bed/gff/gtf)
23 |     // and returns a PositionedIterator
24 |     let ai = sniff::open_file(&args.a)?;
25 |     let bi = sniff::open_file(&args.b)?;
26 | 
27 |     // bedder always requires a hashmap that indicates the chromosome order
28 |     let fh = BufReader::new(fs::File::open(args.fai)?);
29 |     let h = parse_genome(fh)?;
30 | 
31 |     // we can have any number of b (other_iterators).
32 |     let it = IntersectionIterator::new(ai, vec![bi], &h)?;
33 | 
34 |     // we need to use buffered stdout or performance is determined by
35 |     // file IO
36 |     let mut stdout = BufWriter::new(io::stdout());
37 | 
38 |     for intersection in it {
39 |         let intersection = intersection?; // any error from the iterator aborts the program
40 |         writeln!(
41 |             &mut stdout,
42 |             "{}\t{}\t{}\t{}",
43 |             intersection.base_interval.chrom(), // column 1: chromosome of the query interval
44 |             intersection.base_interval.start(), // column 2: query start
45 |             intersection.base_interval.stop(), // column 3: query stop
46 |             intersection.overlapping.len() // column 4: number of overlapping intervals found
47 |         )?;
48 |     }
49 | 
50 |     Ok(())
51 | }
52 | 


--------------------------------------------------------------------------------
/src/bedder_bed.rs:
--------------------------------------------------------------------------------
  1 | #![allow(clippy::useless_conversion)] // these are needed to support e.g. smartstring
  2 | 
  3 | use crate::position::{Field, FieldError, Position, Positioned, Value, Valued};
  4 | use crate::string::String;
  5 | pub use bed::record::Record;
  6 | pub use noodles::bed;
  7 | use std::io::{self, BufRead};
  8 | use std::result;
  9 | 
 10 | impl crate::position::Positioned for bed::record::Record<3> {
 11 |     #[inline]
 12 |     fn chrom(&self) -> &str {
 13 |         self.reference_sequence_name()
 14 |     }
 15 | 
 16 |     #[inline]
 17 |     fn start(&self) -> u64 {
 18 |         // noodles position is 1-based.
 19 |         self.start_position().get() as u64 - 1
 20 |     }
 21 | 
 22 |     #[inline]
 23 |     fn stop(&self) -> u64 {
 24 |         self.end_position().get() as u64
 25 |     }
 26 | }
 27 | 
 28 | impl Valued for bed::record::Record<3> { // field lookup for 3-column BED records
 29 |     fn value(&self, v: crate::position::Field) -> result::Result<Value, FieldError> {
 30 |         match v {
 31 |             Field::String(s) => Ok(Value::Strings(vec![s])), // NOTE(review): returns the requested name itself as the value — confirm this is intended rather than an InvalidFieldName error
 32 |             Field::Int(i) => match i { // integer fields address the columns by 0-based index
 33 |                 0 => Ok(Value::Strings(vec![String::from(self.chrom())])), // column 0: chromosome
 34 |                 1 => Ok(Value::Ints(vec![self.start() as i64])), // column 1: start as reported by Positioned::start
 35 |                 2 => Ok(Value::Ints(vec![self.stop() as i64])), // column 2: stop as reported by Positioned::stop
 36 |                 _ => Err(FieldError::InvalidFieldIndex(i)), // only three columns are handled here
 37 |             },
 38 |         }
 39 |     }
 40 | }
 41 | 
 42 | struct Last {
 43 |     chrom: String,
 44 |     start: u64,
 45 |     stop: u64,
 46 | }
 47 | 
 48 | pub struct BedderBed<R>
 49 | where
 50 |     R: BufRead,
 51 | {
 52 |     reader: bed::Reader<R>,
 53 |     buf: std::string::String,
 54 |     last_record: Option<Last>,
 55 |     line_number: u64,
 56 | }
 57 | 
 58 | impl<R> BedderBed<R>
 59 | where
 60 |     R: BufRead,
 61 | {
 62 |     pub fn new(r: R) -> BedderBed<R> {
 63 |         BedderBed {
 64 |             reader: bed::Reader::new(r),
 65 |             buf: std::string::String::new(),
 66 |             last_record: None,
 67 |             line_number: 0,
 68 |         }
 69 |     }
 70 | }
 71 | 
 72 | impl<R> crate::position::PositionedIterator for BedderBed<R>
 73 | where
 74 |     R: BufRead,
 75 | {
 76 |     fn next_position( // yields the next BED record as a Position, skipping '#' and empty lines; None at EOF
 77 |         &mut self,
 78 |         _q: Option<&crate::position::Position>, // query hint is unused: this reader only streams
 79 |     ) -> Option<std::result::Result<Position, std::io::Error>> {
 80 |         loop {
 81 |             self.buf.clear(); // reset per line so a skipped '#' line never lingers in front of (and hides) the next record
 82 |             self.line_number += 1; // counts every line read, including skipped ones
 83 |             return match self.reader.read_line(&mut self.buf) {
 84 |                 Ok(0) => None, // EOF
 85 |                 Ok(_) => {
 86 |                     if self.buf.starts_with('#') || self.buf.is_empty() {
 87 |                         continue; // comment or empty line: read the next one
 88 |                     }
 89 |                     let record: bed::record::Record<3> = match self.buf.parse() {
 90 |                         Err(e) => {
 91 |                             let msg = format!(
 92 |                                 "line#{:?}:{:?} error: {:?}",
 93 |                                 self.line_number, &self.buf, e
 94 |                             );
 95 |                             return Some(Err(io::Error::new(io::ErrorKind::InvalidData, msg))); // unparsable line is reported as InvalidData with its line number
 96 |                         }
 97 |                         Ok(r) => r,
 98 |                     };
 99 | 
100 |                     match &mut self.last_record { // remember the coordinates of the most recent record
101 |                         None => {
102 |                             self.last_record = Some(Last {
103 |                                 chrom: String::from(record.chrom()),
104 |                                 start: record.start(),
105 |                                 stop: record.stop(),
106 |                             })
107 |                         }
108 |                         Some(r) => {
109 |                             if r.chrom != record.chrom() { // only re-allocate the name when the chromosome changes
110 |                                 r.chrom = String::from(record.chrom())
111 |                             }
112 |                             r.start = record.start();
113 |                             r.stop = record.stop();
114 |                         }
115 |                     }
116 | 
117 |                     Some(Ok(Position::Bed(record)))
118 |                 }
119 |                 Err(e) => Some(Err(e)), // I/O errors are passed through unchanged
120 |             };
121 |         }
122 |     }
123 | 
124 |     fn name(&self) -> String {
125 |         String::from(format!("bed:{}", self.line_number))
126 |     }
127 | }
128 | 
129 | #[cfg(test)]
130 | mod tests {
131 |     use super::*;
132 |     use crate::chrom_ordering::Chromosome;
133 |     use crate::intersection::IntersectionIterator;
134 |     use hashbrown::HashMap;
135 |     use std::io::Cursor;
136 | 
137 |     #[test]
138 |     fn test_bed_read() {
139 |         // write a test for bed from a string using BufRead
140 |         let ar = BedderBed::new(Cursor::new("chr1\t20\t30\nchr1\t21\t33"));
141 |         let br = BedderBed::new(Cursor::new("chr1\t21\t30\nchr1\t22\t33"));
142 | 
143 |         let chrom_order = HashMap::from([
144 |             (
145 |                 String::from("chr1"),
146 |                 Chromosome {
147 |                     index: 0usize,
148 |                     length: None,
149 |                 },
150 |             ),
151 |             (
152 |                 String::from("chr2"),
153 |                 Chromosome {
154 |                     index: 1usize,
155 |                     length: None,
156 |                 },
157 |             ),
158 |         ]);
159 | 
160 |         let it = IntersectionIterator::new(Box::new(ar), vec![Box::new(br)], &chrom_order)
161 |             .expect("error creating iterator");
162 | 
163 |         let mut n = 0;
164 |         it.for_each(|int| {
165 |             let int = int.expect("error getting intersection");
166 |             //dbg!(&int.overlapping);
167 |             assert!(int.overlapping.len() == 2);
168 |             n += 1;
169 |         });
170 |         assert!(n == 2);
171 |     }
172 | }
173 | 


--------------------------------------------------------------------------------
/src/bedder_vcf.rs:
--------------------------------------------------------------------------------
  1 | #![allow(clippy::useless_conversion)] // these are needed to support e.g. smartstring
  2 | use crate::position::{Field, FieldError, Position, Positioned, Value};
  3 | use crate::string::String;
  4 | use noodles::core::Region;
  5 | use noodles::vcf::{self, record::Chromosome};
  6 | use std::io::{self, Read, Seek};
  7 | use std::result;
  8 | use vcf::record::info::field;
  9 | use vcf::record::QualityScore;
 10 | pub use vcf::Record;
 11 | pub use xvcf;
 12 | use xvcf::Skip;
 13 | 
 14 | pub struct BedderVCF<R> {
 15 |     reader: xvcf::Reader<R>,
 16 |     record_number: u64,
 17 |     header: vcf::Header,
 18 | }
 19 | 
 20 | impl<R> BedderVCF<R>
 21 | where
 22 |     R: Read + 'static,
 23 | {
 24 |     pub fn new(r: xvcf::Reader<R>) -> io::Result<BedderVCF<R>> {
 25 |         let h = r.header().clone();
 26 |         let v = BedderVCF {
 27 |             reader: r,
 28 |             record_number: 0,
 29 |             header: h,
 30 |         };
 31 |         Ok(v)
 32 |     }
 33 | }
 34 | 
 35 | pub fn match_info_value(info: &vcf::record::Info, name: &str) -> result::Result<Value, FieldError> { // look up INFO key `name` and convert it to a Value
 36 |     //let info = record.info();
 37 |     let key: vcf::record::info::field::Key = name
 38 |         .parse()
 39 |         .map_err(|_| FieldError::InvalidFieldName(String::from(name)))?; // a name that does not parse as a key is an InvalidFieldName
 40 | 
 41 |     match info.get(&key) {
 42 |         Some(value) => match value {
 43 |             Some(field::Value::Integer(i)) => Ok(Value::Ints(vec![*i as i64])), // scalars become one-element vectors
 44 |             Some(field::Value::Float(f)) => Ok(Value::Floats(vec![*f as f64])),
 45 |             Some(field::Value::String(s)) => Ok(Value::Strings(vec![String::from(s)])),
 46 |             Some(field::Value::Character(c)) => {
 47 |                 Ok(Value::Strings(vec![String::from(c.to_string())])) // characters are reported as strings
 48 |             }
 49 |             //Some(field::Value::Flag) => Ok(Value::Strings(vec![String::from("true")])),
 50 |             Some(field::Value::Array(arr)) => {
 51 |                 match arr { // array elements are optional; `flatten` drops the missing (None) entries
 52 |                     field::value::Array::Integer(arr) => Ok(Value::Ints(
 53 |                         arr.iter().flatten().map(|&v| v as i64).collect(),
 54 |                     )),
 55 |                     field::value::Array::Float(arr) => Ok(Value::Floats(
 56 |                         arr.iter().flatten().map(|&v| v as f64).collect(),
 57 |                     )),
 58 |                     field::value::Array::String(arr) => Ok(Value::Strings(
 59 |                         arr.iter().flatten().map(String::from).collect(),
 60 |                     )),
 61 |                     field::value::Array::Character(arr) => Ok(Value::Strings(
 62 |                         arr.iter().flatten().map(|v| v.to_string().into()).collect(),
 63 |                     )),
 64 |                     //field::Value::Flag => Ok(Value::Strings(vec![String::from("true")])),
 65 |                 }
 66 |             }
 67 | 
 68 |             _ => Err(FieldError::InvalidFieldName(String::from(name))), // anything not matched above (e.g. a key present with no value) is reported as InvalidFieldName
 69 |         },
 70 |         None => Err(FieldError::InvalidFieldName(String::from(name))), // key absent from this record's INFO
 71 |     }
 72 | }
 73 | 
 74 | pub fn match_value(record: &vcf::record::Record, f: Field) -> result::Result<Value, FieldError> { // resolve a named field of a VCF record to a Value; unknown names are errors, never panics
 75 |     match f {
 76 |         Field::String(s) => match s.as_str() {
 77 |             "chrom" => Ok(Value::Strings(vec![String::from(Positioned::chrom(
 78 |                 record,
 79 |             ))])),
 80 |             "start" => Ok(Value::Ints(vec![Positioned::start(record) as i64])),
 81 |             "stop" => Ok(Value::Ints(vec![Positioned::stop(record) as i64])),
 82 |             "ID" => Ok(Value::Strings(
 83 |                 record.ids().iter().map(|s| s.to_string().into()).collect(),
 84 |             )),
 85 |             "FILTER" => Ok(Value::Strings(
 86 |                 record
 87 |                     .filters()
 88 |                     .iter()
 89 |                     .map(|s| String::from(s.to_string()))
 90 |                     .collect(),
 91 |             )),
 92 |             "QUAL" => Ok(Value::Floats(vec![f32::from(
 93 |                 record
 94 |                     .quality_score()
 95 |                     .unwrap_or(QualityScore::try_from(0f32).expect("error getting quality score")), // a missing QUAL is reported as 0
 96 |             ) as f64])),
 97 |             _ => {
 98 |                 if s.len() > 5 && s.as_str().starts_with("INFO.") { // prefix test instead of byte slicing: `&s[0..5]` could panic on a non-ASCII name
 99 |                     match_info_value(record.info(), &s[5..]) // safe slice: the 5-byte ASCII prefix was just matched
100 |                 } else {
101 |                     // TODO: FORMAT fields are not supported yet; report the name instead of panicking.
102 |                     Err(FieldError::InvalidFieldName(s.clone()))
103 |                 }
104 |             }
105 |         },
106 | 
107 |         Field::Int(i) => Err(FieldError::InvalidFieldIndex(i)), // VCF fields are addressed by name only
108 |     }
109 | }
110 | 
111 | impl Positioned for vcf::record::Record { // exposes a VCF record's chromosome and coordinates
112 |     #[inline]
113 |     fn chrom(&self) -> &str {
114 |         match self.chromosome() { // both chromosome variants carry the name as a string
115 |             Chromosome::Name(s) => s,
116 |             Chromosome::Symbol(s) => s,
117 |         }
118 |     }
119 | 
120 |     #[inline]
121 |     fn start(&self) -> u64 {
122 |         usize::from(self.position()) as u64 // NOTE(review): the BED impl subtracts 1 from the noodles position to get a 0-based start; no adjustment is made here — confirm the intended convention
123 |     }
124 | 
125 |     #[inline]
126 |     fn stop(&self) -> u64 {
127 |         usize::from(self.end().expect("error getting end from vcf record")) as u64 // panics if the record's end cannot be determined
128 |     }
129 | }
130 | 
131 | impl<R> crate::position::PositionedIterator for BedderVCF<R> // streams VCF records as Positions, optionally skipping ahead to a query region
132 | where
133 |     R: Read + Seek + 'static,
134 | {
135 |     fn next_position(
136 |         &mut self,
137 |         q: Option<&crate::position::Position>, // when Some, the reader is first asked to skip to this query's region
138 |     ) -> Option<std::result::Result<Position, std::io::Error>> {
139 |         if let Some(q) = q {
140 |             let s = noodles::core::Position::new(q.start() as usize + 1)?; // `?` on an Option: a rejected coordinate returns None, which callers see as end of input
141 |             let e = noodles::core::Position::new(q.stop() as usize + 1)?; // NOTE(review): stop is also shifted by +1 — confirm the intended region end
142 |             let region = Region::new(q.chrom(), s..=e);
143 |             match self.reader.skip_to(&self.header, &region) {
144 |                 Ok(_) => (),
145 |                 Err(e) => return Some(Err(e)), // a failed skip is surfaced to the caller
146 |             }
147 |         }
148 | 
149 |         // take self.reader.variant if it's there
150 |         if let Some(v) = self.reader.take() {
151 |             self.record_number += 1;
152 |             return Some(Ok(Position::Vcf(Box::new(v))));
153 |         }
154 | 
155 |         let mut v = vcf::Record::default(); // otherwise read the next record from the stream
156 | 
157 |         match self.reader.next_record(&self.header, &mut v) {
158 |             Ok(0) => None, // EOF
159 |             Ok(_) => {
160 |                 self.record_number += 1;
161 |                 Some(Ok(Position::Vcf(Box::new(v))))
162 |             }
163 |             Err(e) => Some(Err(e)),
164 |         }
165 |     }
166 |     fn name(&self) -> String { // label includes the count of records returned so far
167 |         String::from("vcf line number:".to_owned() + self.record_number.to_string().as_str())
168 |     }
169 | }
170 | 
171 | // tests
172 | #[cfg(test)]
173 | mod tests {
174 |     use super::*;
175 | 
176 |     #[test]
177 |     fn test_match_info() {
178 |         let key: field::Key = "AAA".parse().expect("error parsing key");
179 | 
180 |         let info: vcf::record::Info = [(key, Some(field::Value::Integer(1)))]
181 |             .into_iter()
182 |             .collect();
183 | 
184 |         // write a test to extract the value using match_info_value
185 |         let value = match_info_value(&info, "AAA").unwrap();
186 |         assert!(matches!(value, Value::Ints(_)));
187 |     }
188 | 
189 |     #[test]
190 |     fn test_match_info_vector() {
191 |         let key: field::Key = "AAA".parse().expect("error parsing key");
192 | 
193 |         let info: vcf::record::Info = [(
194 |             key,
195 |             Some(field::Value::Array(field::value::Array::Integer(vec![
196 |                 Some(-1),
197 |                 Some(2),
198 |                 Some(3),
199 |                 None,
200 |                 Some(496),
201 |             ]))),
202 |         )]
203 |         .into_iter()
204 |         .collect();
205 | 
206 |         // write a test to extract the value using match_info_value
207 |         let value = match_info_value(&info, "AAA").unwrap();
208 |         assert!(matches!(value, Value::Ints(_)));
209 | 
210 |         if let Value::Ints(v) = value {
211 |             assert_eq!(v.len(), 4);
212 |             assert_eq!(v[0], -1);
213 |             assert_eq!(v[1], 2);
214 |             assert_eq!(v[2], 3);
215 |             assert_eq!(v[3], 496);
216 |         } else {
217 |             panic!("error getting value");
218 |         }
219 |     }
220 | }
221 | 


--------------------------------------------------------------------------------
/src/chrom_ordering.rs:
--------------------------------------------------------------------------------
 1 | use crate::string::String;
 2 | use hashbrown::HashMap;
 3 | use std::io::{self, BufRead, Read};
 4 | 
 5 | #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 6 | pub struct Chromosome {
 7 |     pub(crate) index: usize,
 8 |     pub(crate) length: Option<usize>,
 9 | }
10 | /// A genome is a map from chromosome name to index with an optional chromosome length.
11 | 
12 | pub fn parse_genome<R>(reader: R) -> io::Result<HashMap<String, Chromosome>> // builds the name -> Chromosome map from a whitespace-delimited listing, one chromosome per line
13 | where
14 |     R: Read,
15 | {
16 |     let mut reader = io::BufReader::new(reader);
17 |     let mut genome = HashMap::default();
18 |     let mut line = std::string::String::new();
19 |     while reader.read_line(&mut line)? > 0 { // read errors propagate; 0 bytes read means EOF
20 |         if line.trim().is_empty() || line.starts_with('#') { // skip blank lines and '#' comment lines
21 |             line.clear();
22 |             continue;
23 |         }
24 |         let mut fields = line.split_whitespace();
25 |         match fields.next() {
26 |             Some(chrom) => { // first column: chromosome name
27 |                 let length = fields.next().map(|s| s.parse::<usize>()); // optional second column: chromosome length
28 |                 let l = length.and_then(|c| match c {
29 |                     Ok(l) => Some(l),
30 |                     Err(_) => { // an unparsable length is logged and treated as absent, not as an error
31 |                         log::warn!(
32 |                             "invalid length for chromosome {} with line: {}",
33 |                             chrom,
34 |                             line
35 |                         );
36 |                         None
37 |                     }
38 |                 });
39 |                 genome.insert(
40 |                     String::from(chrom),
41 |                     Chromosome {
42 |                         index: genome.len(), // index = number of names stored so far; NOTE(review): a repeated name replaces its entry and the next new name can then reuse the same index — confirm inputs are unique
43 |                         length: l,
44 |                     },
45 |                 );
46 |             }
47 |             None => { // defensive: blank lines were already skipped above
48 |                 return Err(io::Error::new(
49 |                     io::ErrorKind::InvalidData,
50 |                     format!("invalid genome file line: {}", line),
51 |                 ))
52 |             }
53 |         }
54 |         //.expect("require at least one column in genome file");
55 |         line.clear(); // read_line appends, so the buffer must be emptied before the next line
56 |     }
57 |     Ok(genome)
58 | }
59 | 
60 | #[cfg(test)]
61 | mod tests {
62 |     use super::*;
63 | 
64 |     /// Names receive consecutive indexes in file order; only `chr2` has a length column.
65 |     #[test]
66 |     fn test_parse_genome() {
67 |         let parsed = parse_genome("chr1\nchr2\t43\nchr3\n".as_bytes()).unwrap();
68 |         assert_eq!(parsed.len(), 3);
69 | 
70 |         // (name, expected index, expected length)
71 |         let expected = [("chr1", 0, None), ("chr2", 1, Some(43)), ("chr3", 2, None)];
72 |         for (name, index, length) in expected {
73 |             assert_eq!(parsed.get(name), Some(&Chromosome { index, length }));
74 |         }
75 |     }
76 | }
92 | 


--------------------------------------------------------------------------------
/src/intersection.rs:
--------------------------------------------------------------------------------
  1 | use crate::chrom_ordering::Chromosome;
  2 | use crate::string::String;
  3 | use hashbrown::HashMap;
  4 | use std::cmp::Ordering;
  5 | use std::collections::{vec_deque::VecDeque, BinaryHeap};
  6 | use std::io;
  7 | use std::io::{Error, ErrorKind};
  8 | use std::rc::Rc;
  9 | //use std::sync::Arc as Rc;
 10 | 
 11 | use crate::position::{Position, PositionedIterator};
 12 | 
 13 | /// An iterator that returns the intersection of multiple iterators.
 14 | ///
 15 | /// Each call to `next` takes one interval from `base_iterator` and yields it together with
 16 | /// the intervals from `other_iterators` that overlap it.
 17 | pub struct IntersectionIterator<'a> {
 18 |     // source of the query (base) intervals.
 19 |     base_iterator: Box<dyn PositionedIterator>,
 20 |     // the iterators that are intersected with the base; an `Intersection::id` is an index into this vector.
 21 |     other_iterators: Vec<Box<dyn PositionedIterator>>,
 22 |     // intervals from `other_iterators` pass through this heap so that they are ordered across files.
 23 |     min_heap: BinaryHeap<ReverseOrderPosition>,
 24 |     // chromosome name -> index (rank) and optional length.
 25 |     chromosome_order: &'a HashMap<String, Chromosome>,
 26 |     // because multiple intervals from each stream can overlap a single base interval
 27 |     // and each interval from others may overlap many base intervals, we must keep a cache (Q)
 28 |     // we always add intervals in order with push_back and therefore remove with pop_front.
 29 |     // As soon as the front interval in cache is strictly less than the query interval, then we can pop it.
 30 |     dequeue: VecDeque<Intersection>,
 31 | 
 32 |     // this is only kept for error checking so we can track if intervals are out of order.
 33 |     previous_interval: Option<Rc<Position>>,
 34 | 
 35 |     // this tracks which iterators have been called with Some(Positioned) for a given interval
 36 |     // so that calls after the first are called with None.
 37 |     called: Vec<bool>,
 38 | 
 39 |     // false until init_heap has run; it is called on the first iteration of pull_through_heap
 40 |     heap_initialized: bool,
 41 | }
 35 | 
 36 | /// An Intersection wraps the Position that was intersected with a unique identifier.
 37 | /// The u32 identifier matches the index of the database that was intersected.
 38 | #[derive(Debug)]
 39 | pub struct Intersection {
 40 |     /// the Position that was intersected, shared through an `Rc`.
 41 |     pub interval: Rc<Position>,
 42 |     /// a unique identifier indicating the source of this interval:
 43 |     /// the index of the originating iterator among the non-base iterators.
 44 |     pub id: u32,
 45 | }
 45 | 
 46 | /// An Intersections wraps the base interval and a vector of overlapping intervals.
 47 | #[derive(Debug)]
 48 | pub struct Intersections {
 49 |     /// the query interval taken from the base iterator.
 50 |     pub base_interval: Rc<Position>,
 51 |     /// the intervals from the other iterators that overlap `base_interval`.
 52 |     pub overlapping: Vec<Intersection>,
 53 | }
 52 | 
 53 | /// Heap entry: a position pulled from one of the non-base iterators.
 54 | /// Its `Ord` impl is reversed so that `BinaryHeap::pop` yields the smallest
 55 | /// (chromosome index, start, stop) first.
 56 | struct ReverseOrderPosition {
 57 |     position: Position,
 58 |     chromosome_index: usize, // index order of chrom.
 59 |     id: usize,               // file_index
 60 | }
 58 | 
 59 | impl PartialEq for ReverseOrderPosition {
 60 |     /// Two entries are equal when start, stop and chromosome index all match;
 61 |     /// the source `id` takes no part in the comparison.
 62 |     #[inline]
 63 |     fn eq(&self, other: &Self) -> bool {
 64 |         let key = |p: &Self| (p.position.start(), p.position.stop(), p.chromosome_index);
 65 |         key(self) == key(other)
 66 |     }
 67 | }
 68 | 
 69 | impl Eq for ReverseOrderPosition {}
 69 | 
 70 | impl PartialOrd for ReverseOrderPosition {
 71 |     /// Always `Some`: defers to the total order defined by the `Ord` impl.
 72 |     #[inline]
 73 |     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
 74 |         Some(self.cmp(other))
 75 |     }
 76 | }
 76 | 
 77 | impl Ord for ReverseOrderPosition {
 78 |     /// Reversed ordering on (chromosome index, start, stop): the entry with the smaller
 79 |     /// key compares as the greater one.
 80 |     #[inline]
 81 |     fn cmp(&self, other: &Self) -> Ordering {
 82 |         // the operands are swapped (other against self) to obtain the reversed order.
 83 |         other
 84 |             .chromosome_index
 85 |             .cmp(&self.chromosome_index)
 86 |             .then_with(|| other.position.start().cmp(&self.position.start()))
 87 |             .then_with(|| other.position.stop().cmp(&self.position.stop()))
 88 |     }
 89 | }
 91 | 
 92 | /// Positional three-way comparison of `a` against `b`.
 93 | ///
 94 | /// `Less` means `a` lies entirely before `b`, `Greater` means entirely after, and `Equal`
 95 | /// means the two overlap. On different chromosomes the result is the comparison of their
 96 | /// indexes in `chromosome_order`; looking up a chromosome that is not in the map panics.
 97 | #[inline(always)]
 98 | fn cmp(a: &Position, b: &Position, chromosome_order: &HashMap<String, Chromosome>) -> Ordering {
 99 |     if a.chrom() == b.chrom() {
100 |         if a.stop() <= b.start() {
101 |             Ordering::Less
102 |         } else if a.start() >= b.stop() {
103 |             Ordering::Greater
104 |         } else {
105 |             // neither strictly before nor strictly after: they overlap.
106 |             Ordering::Equal
107 |         }
108 |     } else {
109 |         let rank_a = chromosome_order[a.chrom()].index;
110 |         let rank_b = chromosome_order[b.chrom()].index;
111 |         rank_a.cmp(&rank_b)
112 |     }
113 | }
110 | 
111 | /// Format a position as `chrom:start-stop`, printing the start plus one.
112 | fn region_str(p: &Position) -> std::string::String {
113 |     let shown_start = p.start() + 1;
114 |     format!("{}:{}-{}", p.chrom(), shown_start, p.stop())
115 | }
114 | 
115 | /// Yields every interval of the base iterator together with the intervals from the other
116 | /// iterators that overlap it.
117 | impl<'a> Iterator for IntersectionIterator<'a> {
118 |     type Item = io::Result<Intersections>;
119 | 
120 |     /// Advance the base iterator by one interval and collect its overlaps.
121 |     ///
122 |     /// Returns `None` once the base iterator is exhausted and `Some(Err(_))` when the base
123 |     /// iterator fails, the chromosome is unknown, the interval ends beyond the chromosome
124 |     /// length, the base intervals are out of order, or pulling from the other iterators fails.
125 |     fn next(&mut self) -> Option<Self::Item> {
126 |         let base_interval = match self.base_iterator.next_position(None)? {
127 |             Ok(p) => Rc::new(p),
128 |             Err(e) => return Some(Err(e)),
129 |         };
130 | 
131 |         // the chromosome must be known; if it has a length the interval must end within it.
132 |         match self.chromosome_order.get(base_interval.chrom()) {
133 |             None => {
134 |                 let msg = format!("invalid chromosome: {}", region_str(base_interval.as_ref()));
135 |                 return Some(Err(Error::new(ErrorKind::Other, msg)));
136 |             }
137 |             Some(chrom) => match chrom.length {
138 |                 Some(chrom_len) if base_interval.stop() > chrom_len as u64 => {
139 |                     let msg = format!(
140 |                         "interval beyond end of chromosome: {}",
141 |                         region_str(base_interval.as_ref())
142 |                     );
143 |                     return Some(Err(Error::new(ErrorKind::Other, msg)));
144 |                 }
145 |                 _ => {}
146 |             },
147 |         }
148 | 
149 |         if self.out_of_order(Rc::clone(&base_interval)) {
150 |             let previous = self
151 |                 .previous_interval
152 |                 .as_ref()
153 |                 .expect("we know previous interval is_some from out_of_order");
154 |             let msg = format!(
155 |                 "intervals from {} out of order {} should be before {}",
156 |                 self.base_iterator.name(),
157 |                 region_str(previous),
158 |                 region_str(base_interval.as_ref()),
159 |             );
160 |             return Some(Err(Error::new(ErrorKind::Other, msg)));
161 |         }
162 |         self.previous_interval = Some(Rc::clone(&base_interval));
163 | 
164 |         // discard cached intervals that lie strictly before the base interval ...
165 |         self.pop_front(Rc::clone(&base_interval));
166 | 
167 |         // ... then pull more intervals into the cache; they all pass through the min-heap
168 |         // so that they are ordered across files.
169 |         if let Err(e) = self.pull_through_heap(Rc::clone(&base_interval)) {
170 |             return Some(Err(e));
171 |         }
172 | 
173 |         // the cache is sorted: skip entries before the base interval, keep the overlapping
174 |         // ones and stop at the first entry that lies after it.
175 |         let chromosome_order = self.chromosome_order;
176 |         let base = base_interval.as_ref();
177 |         let overlapping = self
178 |             .dequeue
179 |             .iter()
180 |             .map(|o| (o, cmp(o.interval.as_ref(), base, chromosome_order)))
181 |             .take_while(|(_, ord)| *ord != Ordering::Greater)
182 |             .filter(|(_, ord)| *ord == Ordering::Equal)
183 |             .map(|(o, _)| Intersection {
184 |                 // only the Rc count is incremented here; the position itself is shared.
185 |                 interval: Rc::clone(&o.interval),
186 |                 id: o.id,
187 |             })
188 |             .collect::<Vec<_>>();
189 | 
190 |         Some(Ok(Intersections {
191 |             base_interval,
192 |             overlapping,
193 |         }))
194 |     }
195 | }
195 | 
196 | /// Construction and internal bookkeeping for [`IntersectionIterator`].
197 | impl<'a> IntersectionIterator<'a> {
198 |     /// Create a new IntersectionIterator given a query (base) and a vector of other positioned iterators.
199 |     ///
200 |     /// `chromosome_order` supplies the index (and optional length) of each chromosome.
201 |     /// No iterator is advanced here; reading starts with the first call to `next`.
202 |     pub fn new(
203 |         base_iterator: Box<dyn PositionedIterator>,
204 |         other_iterators: Vec<Box<dyn PositionedIterator>>,
205 |         chromosome_order: &'a HashMap<String, Chromosome>,
206 |     ) -> io::Result<Self> {
207 |         let called = vec![false; other_iterators.len()];
208 |         Ok(IntersectionIterator {
209 |             base_iterator,
210 |             other_iterators,
211 |             min_heap: BinaryHeap::new(),
212 |             chromosome_order,
213 |             dequeue: VecDeque::new(),
214 |             previous_interval: None,
215 |             called,
216 |             heap_initialized: false,
217 |         })
218 |     }
219 | 
220 |     /// Seed the heap with the first position of each of the other iterators.
221 |     ///
222 |     /// Every iterator is called once with `Some(base_interval)`. Panics if the heap was
223 |     /// already initialized.
224 |     ///
225 |     /// # Errors
226 |     /// Propagates iterator errors and rejects a position whose chromosome is not in
227 |     /// `chromosome_order`.
228 |     fn init_heap(&mut self, base_interval: Rc<Position>) -> io::Result<()> {
229 |         assert!(!self.heap_initialized);
230 |         for (i, iter) in self.other_iterators.iter_mut().enumerate() {
231 |             if let Some(positioned) = iter.next_position(Some(base_interval.as_ref())) {
232 |                 let positioned = positioned?;
233 |                 let chromosome_index = match self.chromosome_order.get(positioned.chrom()) {
234 |                     Some(c) => c.index,
235 |                     None => {
236 |                         let msg = format!(
237 |                             "invalid chromosome: {} in iterator {}",
238 |                             region_str(&positioned),
239 |                             iter.name()
240 |                         );
241 |                         return Err(Error::new(ErrorKind::Other, msg));
242 |                     }
243 |                 };
244 |                 self.min_heap.push(ReverseOrderPosition {
245 |                     position: positioned,
246 |                     chromosome_index,
247 |                     id: i,
248 |                 });
249 |             }
250 |         }
251 |         self.heap_initialized = true;
252 |         Ok(())
253 |     }
254 | 
255 |     /// drop intervals from Q that are strictly before the base interval.
256 |     fn pop_front(&mut self, base_interval: Rc<Position>) {
257 |         while let Some(front) = self.dequeue.front() {
258 |             let order = cmp(
259 |                 front.interval.as_ref(),
260 |                 base_interval.as_ref(),
261 |                 self.chromosome_order,
262 |             );
263 |             if order != Ordering::Less {
264 |                 break;
265 |             }
266 |             self.dequeue.pop_front();
267 |         }
268 |     }
269 | 
270 |     /// `true` when `interval` sorts before the previously seen base interval, comparing
271 |     /// chromosome index first, then start, then stop. Always `false` for the first interval.
272 |     fn out_of_order(&self, interval: Rc<Position>) -> bool {
273 |         let previous_interval = match &self.previous_interval {
274 |             None => return false, // first interval in file.
275 |             Some(previous_interval) => previous_interval,
276 |         };
277 |         if previous_interval.chrom() != interval.chrom() {
278 |             let pci = self.chromosome_order[previous_interval.chrom()].index;
279 |             let ici = self.chromosome_order[interval.chrom()].index;
280 |             return pci > ici;
281 |         }
282 |         previous_interval.start() > interval.start()
283 |             || (previous_interval.start() == interval.start()
284 |                 && previous_interval.stop() > interval.stop())
285 |     }
286 | 
287 |     /// reset the array that tracks which iterators have been called with Some(Positioned)
288 |     #[inline]
289 |     fn zero_called(&mut self) {
290 |         // safe equivalent of zeroing the buffer byte-wise.
291 |         self.called.fill(false);
292 |     }
293 | 
294 |     /// Move positions from the other iterators through the heap into the Q until one of
295 |     /// them lies strictly after `base_interval` (or the heap runs empty).
296 |     ///
297 |     /// Each popped position is replaced in the heap by the next position of the iterator it
298 |     /// came from, so the heap always orders positions across all iterators.
299 |     ///
300 |     /// # Errors
301 |     /// Propagates iterator errors, rejects positions on an unknown chromosome and reports
302 |     /// an iterator whose positions are not sorted by (chromosome index, start).
303 |     fn pull_through_heap(&mut self, base_interval: Rc<Position>) -> io::Result<()> {
304 |         self.zero_called();
305 |         if !self.heap_initialized {
306 |             // we wait til first iteration here to call init heap
307 |             // because we need the base interval.
308 |             self.init_heap(Rc::clone(&base_interval))?;
309 |         }
310 |         let other_iterators = self.other_iterators.as_mut_slice();
311 | 
312 |         while let Some(ReverseOrderPosition {
313 |             position,
314 |             chromosome_index,
315 |             id: file_index,
316 |         }) = self.min_heap.pop()
317 |         {
318 |             // must always pull into the heap.
319 |             let f = other_iterators
320 |                 .get_mut(file_index)
321 |                 .expect("expected interval iterator at file index");
322 |             // for a given base_interval, we make sure to call next_position with Some, only once.
323 |             // subsequent calls will be with None.
324 |             let arg: Option<&Position> = if !self.called[file_index] {
325 |                 self.called[file_index] = true;
326 |                 Some(base_interval.as_ref())
327 |             } else {
328 |                 None
329 |             };
330 |             if let Some(next_position) = f.next_position(arg) {
331 |                 let next_position = next_position?;
332 |                 let next_chromosome = match self.chromosome_order.get(next_position.chrom()) {
333 |                     Some(c) => c,
334 |                     None => {
335 |                         let msg = format!(
336 |                             "invalid chromosome: {} in iterator {}",
337 |                             region_str(&next_position),
338 |                             f.name()
339 |                         );
340 |                         return Err(Error::new(ErrorKind::Other, msg));
341 |                     }
342 |                 };
343 | 
344 |                 // check that intervals within a file are in order: the chromosome may only
345 |                 // advance, and within a chromosome the start may not decrease.
346 |                 let in_order = match next_chromosome.index.cmp(&chromosome_index) {
347 |                     Ordering::Greater => true,
348 |                     Ordering::Equal => position.start() <= next_position.start(),
349 |                     Ordering::Less => false,
350 |                 };
351 |                 if !in_order {
352 |                     let msg = format!(
353 |                         "database intervals out of order ({} -> {}) in iterator: {}",
354 |                         region_str(&position),
355 |                         region_str(&next_position),
356 |                         f.name()
357 |                     );
358 |                     return Err(Error::new(ErrorKind::Other, msg));
359 |                 }
360 |                 // the heap entry must carry the chromosome index of the new position,
361 |                 // not the one of the position that was just popped.
362 |                 self.min_heap.push(ReverseOrderPosition {
363 |                     position: next_position,
364 |                     chromosome_index: next_chromosome.index,
365 |                     id: file_index,
366 |                 });
367 |             }
368 | 
369 |             // and we must always add the position to the Q
370 |             let rc_pos = Rc::new(position);
371 |             let intersection = Intersection {
372 |                 interval: Rc::clone(&rc_pos),
373 |                 id: file_index as u32,
374 |             };
375 |             self.dequeue.push_back(intersection);
376 | 
377 |             // if this position is after base_interval, we can stop pulling through heap.
378 |             // `cmp(base, position)` is `Less` exactly when the base interval lies strictly
379 |             // before the position; positions come out of the heap in order, so every
380 |             // later one lies after the base interval as well.
381 |             if cmp(
382 |                 base_interval.as_ref(),
383 |                 rc_pos.as_ref(),
384 |                 self.chromosome_order,
385 |             ) == Ordering::Less
386 |             {
387 |                 break;
388 |             }
389 |         }
390 |         Ok(())
391 |     }
392 | }
362 | 
363 | #[cfg(test)]
364 | mod tests {
365 |     use super::*;
366 |     use crate::chrom_ordering::parse_genome;
367 |     use crate::interval::Interval;
368 | 
369 |     /// In-memory `PositionedIterator` over a fixed list of positions.
370 |     struct Intervals {
371 |         i: usize,
372 |         name: String,
373 |         ivs: Vec<Position>,
374 |     }
375 | 
376 |     /// Shorthand for an `Interval` with default `fields`.
377 |     fn interval(chrom: &str, start: u64, stop: u64) -> Interval {
378 |         Interval {
379 |             chrom: String::from(chrom),
380 |             start,
381 |             stop,
382 |             ..Default::default()
383 |         }
384 |     }
385 | 
386 |     impl Intervals {
387 |         fn new(name: String, ivs: Vec<Interval>) -> Self {
388 |             let ivs: Vec<Position> = ivs.into_iter().map(Position::Interval).collect();
389 |             Intervals { i: 0, name, ivs }
390 |         }
391 |         fn add(&mut self, iv: Interval) {
392 |             self.ivs.push(Position::Interval(iv));
393 |         }
394 |     }
395 |     impl PositionedIterator for Intervals {
396 |         fn name(&self) -> String {
397 |             String::from(format!("{}:{}", self.name, self.i))
398 |         }
399 | 
400 |         // positions are handed out front to back; the query hint is ignored.
401 |         fn next_position(&mut self, _o: Option<&Position>) -> Option<io::Result<Position>> {
402 |             if self.ivs.len() <= self.i {
403 |                 None
404 |             } else {
405 |                 Some(Ok(self.ivs.remove(0)))
406 |             }
407 |         }
408 |     }
409 | 
410 |     /// 100 unit-length base intervals, each matched by exactly three identical database intervals.
411 |     #[test]
412 |     fn many_intervals() {
413 |         let chrom_order = HashMap::from([
414 |             (
415 |                 String::from("chr1"),
416 |                 Chromosome {
417 |                     index: 0,
418 |                     length: None,
419 |                 },
420 |             ),
421 |             (
422 |                 String::from("chr2"),
423 |                 Chromosome {
424 |                     index: 1,
425 |                     length: None,
426 |                 },
427 |             ),
428 |         ]);
429 |         let n_intervals = 100;
430 |         let times = 3;
431 |         let mut a_ivs = Intervals::new(String::from("A"), Vec::new());
432 |         let mut b_ivs = Intervals::new(String::from("B"), Vec::new());
433 |         for start in 0..n_intervals {
434 |             a_ivs.add(interval("chr1", start, start + 1));
435 |             (0..times).for_each(|_| b_ivs.add(interval("chr1", start, start + 1)));
436 |         }
437 | 
438 |         b_ivs.ivs.sort_by_key(|p| p.start());
439 | 
440 |         let a_ivs: Box<dyn PositionedIterator> = Box::new(a_ivs);
441 | 
442 |         let iter = IntersectionIterator::new(a_ivs, vec![Box::new(b_ivs)], &chrom_order)
443 |             .expect("error getting iterator");
444 |         let mut n = 0;
445 |         for intersection in iter {
446 |             let intersection = intersection.expect("error getting intersection");
447 |             n += 1;
448 |             let base_start = intersection.base_interval.start();
449 |             assert!(intersection
450 |                 .overlapping
451 |                 .iter()
452 |                 .all(|p| p.interval.start() == base_start));
453 |             assert_eq!(intersection.overlapping.len(), times);
454 |         }
455 |         assert_eq!(n, n_intervals)
456 |     }
457 | 
458 |     /// Book-ended intervals and intervals on another chromosome are not reported as overlaps.
459 |     #[test]
460 |     fn bookend_and_chrom() {
461 |         let chrom_order = parse_genome("chr1\nchr2\nchr3\n".as_bytes()).unwrap();
462 |         let a_ivs = Intervals::new(
463 |             String::from("A"),
464 |             vec![interval("chr1", 0, 10), interval("chr1", 0, 10)],
465 |         );
466 | 
467 |         let b_ivs = Intervals::new(
468 |             String::from("B"),
469 |             vec![
470 |                 interval("chr1", 0, 5),
471 |                 interval("chr1", 0, 10),
472 |                 // the next two intervals should not overlap.
473 |                 interval("chr1", 10, 20),
474 |                 interval("chr2", 1, 20),
475 |             ],
476 |         );
477 | 
478 |         let iter = IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order)
479 |             .expect("error getting iterator");
480 |         for intersection in iter {
481 |             let intersection = intersection.expect("intersection");
482 |             assert_eq!(intersection.overlapping.len(), 2);
483 |             assert!(intersection
484 |                 .overlapping
485 |                 .iter()
486 |                 .all(|p| p.interval.start() == 0));
487 |         }
488 |     }
489 | 
490 |     /// A base interval that ends past the chromosome length is reported as an error.
491 |     #[test]
492 |     fn interval_beyond_end_of_chrom() {
493 |         let chrom_order = parse_genome("chr1\t22\n".as_bytes()).unwrap();
494 |         let a_ivs = Intervals::new(
495 |             String::from("A"),
496 |             vec![interval("chr1", 10, 22), interval("chr1", 1, 23)],
497 |         );
498 |         let mut iter = IntersectionIterator::new(Box::new(a_ivs), vec![], &chrom_order)
499 |             .expect("error getting iterator");
500 | 
501 |         let second = iter.nth(1).expect("error getting next");
502 |         assert!(second.is_err());
503 |         let message = second.err().unwrap().to_string();
504 |         assert!(message.contains("beyond end of chromosome"));
505 |     }
506 | 
507 |     /// Unsorted input is reported for the base iterator as well as for a database iterator.
508 |     #[test]
509 |     fn ordering_error() {
510 |         let chrom_order = parse_genome("chr1\nchr2\nchr3\n".as_bytes()).unwrap();
511 | 
512 |         // base intervals out of order.
513 |         let a_ivs = Intervals::new(
514 |             String::from("A"),
515 |             vec![interval("chr1", 10, 1), interval("chr1", 1, 2)],
516 |         );
517 |         let mut iter = IntersectionIterator::new(Box::new(a_ivs), vec![], &chrom_order)
518 |             .expect("error getting iterator");
519 | 
520 |         let second = iter.nth(1).expect("error getting next");
521 |         assert!(second.is_err());
522 |         let message = second.err().unwrap().to_string();
523 |         assert!(message.contains("out of order"));
524 | 
525 |         // now repeat with database out of order.
526 |         let a_ivs = Intervals::new(
527 |             String::from("A"),
528 |             vec![interval("chr1", 1, 2), interval("chr1", 1, 2)],
529 |         );
530 |         let b_ivs = Intervals::new(
531 |             String::from("B"),
532 |             vec![interval("chr1", 1, 2), interval("chr1", 0, 2)],
533 |         );
534 | 
535 |         let mut iter =
536 |             IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order)
537 |                 .expect("error getting iterator");
538 |         let first = iter.next().expect("error getting next");
539 |         assert!(first.is_err());
540 |         let message = first.err().unwrap().to_string();
541 |         assert!(message.contains("out of order"));
542 |     }
543 | 
544 |     /// Overlaps coming from two different database iterators carry different ids.
545 |     #[test]
546 |     fn multiple_sources() {
547 |         let chrom_order = parse_genome("chr1\nchr2\nchr3\n".as_bytes()).unwrap();
548 |         let a_ivs = Intervals::new(String::from("A"), vec![interval("chr1", 0, 1)]);
549 |         let b_ivs = Intervals::new(String::from("B"), vec![interval("chr1", 0, 1)]);
550 |         let c_ivs = Intervals::new(String::from("c"), vec![interval("chr1", 0, 1)]);
551 |         let iter = IntersectionIterator::new(
552 |             Box::new(a_ivs),
553 |             vec![Box::new(b_ivs), Box::new(c_ivs)],
554 |             &chrom_order,
555 |         )
556 |         .expect("error getting iterator");
557 |         let mut c: usize = 0;
558 |         for intersection in iter {
559 |             let intersection = intersection.expect("error getting intersection");
560 |             dbg!(&intersection.overlapping);
561 |             assert_eq!(intersection.overlapping.len(), 2);
562 |             // check that we got from source 1 and source 2.
563 |             assert_ne!(
564 |                 intersection.overlapping[0].id,
565 |                 intersection.overlapping[1].id
566 |             );
567 |             c += 1;
568 |         }
569 |         assert_eq!(c, 1);
570 |     }
571 | 
572 |     /// Two identical zero-length intervals are expected to overlap (test is ignored for now).
573 |     #[test]
574 |     #[ignore]
575 |     fn zero_length() {
576 |         let chrom_order = parse_genome("chr1\nchr2\nchr3\n".as_bytes()).unwrap();
577 |         let a_ivs = Intervals::new(String::from("A"), vec![interval("chr1", 1, 1)]);
578 |         let b_ivs = Intervals::new(String::from("B"), vec![interval("chr1", 1, 1)]);
579 |         let iter = IntersectionIterator::new(Box::new(a_ivs), vec![Box::new(b_ivs)], &chrom_order)
580 |             .expect("error getting iterator");
581 |         // check that it overlapped by asserting that the loop ran and also that there was an overlap within the loop.
582 |         let mut c: usize = 0;
583 |         for intersection in iter {
584 |             let intersection = intersection.expect("error getting intersection");
585 |             assert!(intersection.overlapping.len() == 1);
586 |             c += 1;
587 |         }
588 |         // NOTE this fails: `cmp` reports `Less` when `a.stop() <= b.start()`, which holds for
589 |         // two zero-length intervals at the same position, so they are not counted as overlapping.
590 |         assert_eq!(c, 1);
591 |     }
592 | }
726 | 


--------------------------------------------------------------------------------
/src/interval.rs:
--------------------------------------------------------------------------------
 1 | use crate::position::{Field, FieldError, Value};
 2 | use crate::string::String;
 3 | /// Interval type is a simple struct that can be used as a default interval type.
 4 | /// It has a chromosome, start, and stop field along with a (linear) HashMap of Values.
 5 | use linear_map::LinearMap;
 6 | use std::fmt::Debug;
 7 | 
 8 | /// A simple interval: chromosome, start, stop and a (linear) map of named values.
 9 | #[derive(Debug, Default)]
10 | pub struct Interval {
11 |     /// chromosome name.
12 |     pub chrom: String,
13 |     /// start coordinate.
14 |     pub start: u64,
15 |     /// stop coordinate.
16 |     pub stop: u64,
17 |     /// additional named values attached to the interval.
18 |     pub fields: LinearMap<String, Value>,
19 | }
15 | 
16 | impl Interval {
17 |     /// Start coordinate of the interval.
18 |     #[inline]
19 |     pub fn start(&self) -> u64 {
20 |         self.start
21 |     }
22 |     /// Stop coordinate of the interval.
23 |     #[inline]
24 |     pub fn stop(&self) -> u64 {
25 |         self.stop
26 |     }
27 |     /// Chromosome name of the interval.
28 |     #[inline]
29 |     pub fn chrom(&self) -> &str {
30 |         &self.chrom
31 |     }
32 | 
33 |     /// Copy a value variant by variant, cloning the data it holds.
34 |     #[inline]
35 |     fn copy_value(v: &Value) -> Value {
36 |         match v {
37 |             Value::Strings(s) => Value::Strings(s.clone()),
38 |             Value::Ints(i) => Value::Ints(i.clone()),
39 |             Value::Floats(f) => Value::Floats(f.clone()),
40 |         }
41 |     }
42 | 
43 |     /// Look up a field by name (`Field::String`) or by position among the keys
44 |     /// (`Field::Int`) and return a copy of its value.
45 |     ///
46 |     /// # Errors
47 |     /// `FieldError::InvalidFieldName` when no field has the requested name and
48 |     /// `FieldError::InvalidFieldIndex` when there is no key at the requested position.
49 |     #[inline]
50 |     pub fn value(&self, f: Field) -> Result<Value, FieldError> {
51 |         match f {
52 |             Field::String(name) => {
53 |                 if let Some(v) = self.fields.get(&name) {
54 |                     Ok(Self::copy_value(v))
55 |                 } else {
56 |                     Err(FieldError::InvalidFieldName(name))
57 |                 }
58 |             }
59 |             Field::Int(i) => match self.fields.keys().nth(i) {
60 |                 None => Err(FieldError::InvalidFieldIndex(i)),
61 |                 // the key was just taken from the map, then looked up again by name.
62 |                 Some(name) => self
63 |                     .fields
64 |                     .get(name)
65 |                     .map(Self::copy_value)
66 |                     .ok_or_else(|| FieldError::InvalidFieldName(name.clone())),
67 |             },
68 |         }
69 |     }
70 | }
58 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
//! Bedder is a library for intersecting genomic data.

/// Intersection iterators and data structures.
pub mod intersection;

/// Position traits.
pub mod position;

/// A simple default interval type.
pub mod interval;

/// A `std::string::String` unless other string features are enabled.
pub mod string;

/// File format / compression detection and opening of input files.
pub mod sniff;

/// Chromosome ordering, parsed from a genome file.
pub mod chrom_ordering;

// NOTE(review): `position` and `sniff` reference `crate::bedder_bed` and
// `crate::bedder_vcf` unconditionally, so the crate presumably only builds
// with both the "bed" and "vcf" features enabled — confirm against Cargo.toml.
#[cfg(feature = "bed")]
/// Bed parser implementing the PositionedIterator trait.
pub mod bedder_bed;

#[cfg(feature = "vcf")]
/// Vcf parser implementing the PositionedIterator trait.
pub mod bedder_vcf;


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
 1 | extern crate bedder;
 2 | use bedder::sniff;
 3 | use clap::Parser;
 4 | use std::env;
 5 | use std::path::PathBuf;
 6 | 
// Command-line arguments, parsed by clap's derive API.
// Plain `//` comments are used on purpose: `///` doc comments on a clap
// derive struct are picked up as help/about text.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about=None)]
struct Args {
    // `-a`: the query file.
    #[arg(help = "input file", short = 'a')]
    query_path: PathBuf,
    // `-b`: one or more files to intersect with the query.
    #[arg(help = "other file", short = 'b', required = true)]
    other_paths: Vec<PathBuf>,
    // `-g`: genome file; `main` passes it to `chrom_ordering::parse_genome`.
    #[arg(
        help = "genome file for chromosome ordering",
        short = 'g',
        required = true
    )]
    genome_file: PathBuf,
}
21 | 
22 | pub fn main() -> Result<(), Box<dyn std::error::Error>> {
23 |     if env::var("RUST_LOG").is_err() {
24 |         env::set_var("RUST_LOG", "bedder=info");
25 |     }
26 |     env_logger::init();
27 |     log::info!("starting up");
28 |     let args = Args::parse();
29 | 
30 |     let chrom_order =
31 |         bedder::chrom_ordering::parse_genome(std::fs::File::open(&args.genome_file)?)?;
32 | 
33 |     let a_iter = sniff::open_file(&args.query_path)?;
34 |     let b_iters: Vec<_> = args
35 |         .other_paths
36 |         .iter()
37 |         .map(|p| sniff::open_file(p).expect("error opening file"))
38 |         .collect();
39 | 
40 |     let ii = bedder::intersection::IntersectionIterator::new(a_iter, b_iters, &chrom_order)?;
41 |     // iterate over the intersections
42 |     ii.for_each(|intersection| {
43 |         let intersection = intersection.expect("error getting intersection");
44 |         println!("{:?}", intersection);
45 |     });
46 |     Ok(())
47 | }
48 | 


--------------------------------------------------------------------------------
/src/position.rs:
--------------------------------------------------------------------------------
  1 | use crate::string::String;
  2 | use std::fmt::{self, Debug};
  3 | use std::io;
  4 | use std::result;
  5 | 
  6 | /// A Value is a vector of integers, floats, or strings.
  7 | /// Often this will be a single value.
  8 | #[derive(Debug)]
  9 | pub enum Value {
 10 |     Ints(Vec<i64>),
 11 |     Floats(Vec<f64>),
 12 |     Strings(Vec<String>),
 13 | }
 14 | 
/// Field is either an integer, as in a bed column
/// or a string, as in a vcf info field.
/// It is the selector passed to `Valued::value`.
#[derive(Debug)]
pub enum Field {
    /// Select a field by name.
    String(String),
    /// Select a field by index.
    Int(usize),
}
 22 | 
 23 | /// Error returned when a field is not found.
 24 | #[derive(Debug)]
 25 | pub enum FieldError {
 26 |     InvalidFieldIndex(usize),
 27 |     InvalidFieldName(String),
 28 | }
 29 | 
 30 | impl fmt::Display for FieldError {
 31 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 32 |         match self {
 33 |             FieldError::InvalidFieldIndex(i) => write!(f, "invalid column index: {}", i),
 34 |             FieldError::InvalidFieldName(s) => write!(f, "invalid column name: {}", s),
 35 |         }
 36 |     }
 37 | }
 38 | 
 39 | impl std::error::Error for FieldError {}
 40 | 
/// A Positioned has a position in the genome. It is a bed-like (half-open) interval.
/// Extracting values from integer or string columns is handled by the
/// separate `Valued` trait, not by this one.
pub trait Positioned: Debug {
    /// Chromosome name.
    fn chrom(&self) -> &str;
    /// 0-based start position.
    fn start(&self) -> u64;
    /// Non-inclusive end.
    fn stop(&self) -> u64;

    // get back the original line?
    //fn line(&self) -> &'a str;
}
 53 | 
/// Valued types can hand back the value stored for a given field.
pub trait Valued {
    /// Extract a value from the object for the field `b` (a column index or a
    /// field name). Returns a `FieldError` when the field is not found.
    fn value(&self, b: Field) -> result::Result<Value, FieldError>;
}
 58 | 
/// A genomic position coming from one of the supported record types.
/// `chrom`, `start` and `stop` dispatch to the wrapped record.
#[derive(Debug)]
pub enum Position {
    /// A record from the bed parser.
    Bed(crate::bedder_bed::Record<3>),
    // Note: we use a Box here because a vcf Record is large.
    /// A record from the vcf parser.
    Vcf(Box<crate::bedder_vcf::Record>),
    /// The crate's own simple interval type.
    Interval(crate::interval::Interval),
    // catch-all in case we have another interval type.
    /// Any other `Positioned` implementation (only with the `dyn_positioned` feature).
    #[cfg(feature = "dyn_positioned")]
    Other(Box<dyn Positioned>),
}
 69 | 
 70 | impl Position {
 71 |     #[inline]
 72 |     pub fn chrom(&self) -> &str {
 73 |         match self {
 74 |             Position::Bed(b) => b.chrom(),
 75 |             Position::Vcf(v) => v.chrom(),
 76 |             Position::Interval(i) => &i.chrom,
 77 |             #[cfg(feature = "dyn_positioned")]
 78 |             Position::Other(o) => o.chrom(),
 79 |         }
 80 |     }
 81 | 
 82 |     #[inline]
 83 |     pub fn start(&self) -> u64 {
 84 |         match self {
 85 |             Position::Bed(b) => b.start(),
 86 |             Position::Vcf(v) => v.start(),
 87 |             Position::Interval(i) => i.start,
 88 |             #[cfg(feature = "dyn_positioned")]
 89 |             Position::Other(o) => o.start(),
 90 |         }
 91 |     }
 92 | 
 93 |     #[inline]
 94 |     pub fn stop(&self) -> u64 {
 95 |         match self {
 96 |             Position::Bed(b) => b.stop(),
 97 |             Position::Vcf(v) => v.stop(),
 98 |             Position::Interval(i) => i.stop,
 99 |             #[cfg(feature = "dyn_positioned")]
100 |             Position::Other(o) => o.stop(),
101 |         }
102 |     }
103 | }
104 | 
/// `Valued` for a boxed `Positioned` trait object.
///
/// `Positioned` exposes only chrom/start/stop, so a `dyn Positioned` has no
/// fields to look up: every request is reported as an invalid field.
/// (The previous body called `self.value(f)`, which resolved back to this
/// same method and recursed without end.)
#[cfg(feature = "dyn_positioned")]
impl Valued for Box<dyn Positioned> {
    #[inline]
    fn value(&self, f: Field) -> result::Result<Value, FieldError> {
        match f {
            Field::Int(i) => Err(FieldError::InvalidFieldIndex(i)),
            Field::String(name) => Err(FieldError::InvalidFieldName(name)),
        }
    }
}
112 | 
113 | // Delegate the boxed version of this trait object to the inner object.
114 | impl Positioned for Box<dyn Positioned> {
115 |     fn chrom(&self) -> &str {
116 |         self.as_ref().chrom()
117 |     }
118 | 
119 |     fn start(&self) -> u64 {
120 |         self.as_ref().start()
121 |     }
122 | 
123 |     fn stop(&self) -> u64 {
124 |         self.as_ref().stop()
125 |     }
126 | }
127 | 
128 | impl PartialEq for dyn Positioned {
129 |     fn eq(&self, other: &dyn Positioned) -> bool {
130 |         self.start() == other.start()
131 |             && self.stop() == other.stop()
132 |             && self.chrom() == other.chrom()
133 |     }
134 | }
135 | 
/// PositionedIterator is an iterator over Positioned objects.
pub trait PositionedIterator {
    /// A name for the iterator. This is most often the file path, perhaps with the line number appended.
    /// Used to provide informative messages to the user.
    fn name(&self) -> String;

    /// Return the next Positioned from the iterator.
    /// It is fine for implementers to ignore `q`;
    /// some iterators may improve performance by using `q` to index-skip.
    /// `q` will be Some only on the first call for a given query interval.
    /// Calls where `q` is None should return the next Positioned in the iterator (file) that has not
    /// been returned previously. Intervals should only be returned once (even across many identical query intervals)
    /// and they should always be returned in order (queries will always be in order).
    /// Thus, if the implementer heeds `q`, it should check that the returned Positioned is greater than the previously
    /// returned position (a Positioned equal to the previously returned position should have already been returned).
    ///
    /// Returns `None` when the iterator is exhausted and `Some(Err(_))` on an I/O error.
    fn next_position(
        &mut self,
        q: Option<&Position>,
    ) -> Option<std::result::Result<Position, io::Error>>;
}
156 | 


--------------------------------------------------------------------------------
/src/sniff.rs:
--------------------------------------------------------------------------------
  1 | use flate2::read::GzDecoder;
  2 | use std::io::{BufRead, Read, Seek};
  3 | use std::path::Path;
  4 | 
  5 | use crate::bedder_bed::BedderBed;
  6 | use crate::bedder_vcf::BedderVCF;
  7 | use crate::position::PositionedIterator;
  8 | use noodles::bgzf;
  9 | 
/// File formats supported by this file detector.
#[derive(Debug, PartialEq)]
pub enum FileFormat {
    /// Content starts with `##fileformat=VCF`.
    VCF,
    /// Content starts with `BCF` followed by a version byte.
    BCF,
    /// Content starts with `BAM\x01`.
    BAM,
    /// Content starts with `CRAM`.
    CRAM,
    /// Content starts with a SAM header tag such as `@HD` or `@SQ`.
    SAM,
    /// Recognised by file extension or by tab-separated lines with integer 2nd/3rd columns.
    BED,
    // NOTE(review): `detect_file_format` in this file never returns CSI — confirm whether it is used elsewhere.
    CSI,
    /// Nothing above matched.
    Unknown,
}
 22 | 
/// Possible Compression formats.
#[derive(Debug, PartialEq)]
pub enum Compression {
    /// No gzip magic bytes at the start of the data.
    None,
    /// Gzip data that is neither BGZF nor RAZF.
    GZ,
    /// Gzip data whose extra field carries the BGZF magic (`BC\x02\x00`).
    BGZF,
    /// Gzip data whose extra field carries the `RAZF` magic.
    RAZF,
}
 31 | 
 32 | pub fn open_file<P>(path: P) -> std::io::Result<Box<dyn PositionedIterator>>
 33 | where
 34 |     P: AsRef<Path>,
 35 | {
 36 |     let file = std::fs::File::open(&path)?;
 37 |     let r = open_reader(file, path);
 38 |     r
 39 | }
 40 | 
 41 | pub fn open_reader<R, P>(reader: R, path: P) -> std::io::Result<Box<dyn PositionedIterator>>
 42 | where
 43 |     R: Read + Seek + 'static,
 44 |     P: AsRef<Path>,
 45 | {
 46 |     let mut reader = std::io::BufReader::new(reader);
 47 |     let (format, compression) = detect_file_format(&mut reader, &path)?;
 48 |     log::info!(
 49 |         "path: {:?}, format: {:?} compression: {:?}",
 50 |         path.as_ref(),
 51 |         format,
 52 |         compression
 53 |     );
 54 |     /*
 55 |      */
 56 |     match format {
 57 |         FileFormat::VCF | FileFormat::BCF => {
 58 |             // get &str from path
 59 |             let path = path.as_ref().to_str().unwrap();
 60 |             let x = xvcf::Reader::from_reader(Box::new(reader), Some(path))?;
 61 |             let bed_vcf = BedderVCF::new(x)?;
 62 |             Ok(Box::new(bed_vcf))
 63 |         }
 64 |         _ => {
 65 |             let br: Box<dyn BufRead> = match compression {
 66 |                 Compression::None => Box::new(reader),
 67 |                 Compression::GZ => Box::new(std::io::BufReader::new(GzDecoder::new(reader))),
 68 |                 Compression::BGZF => match format {
 69 |                     // BCF|BAM will appear as bgzf so we don't want to do this outside
 70 |                     FileFormat::BCF | FileFormat::BAM => Box::new(reader),
 71 |                     _ => Box::new(bgzf::Reader::new(reader)),
 72 |                 },
 73 |                 Compression::RAZF => unimplemented!(),
 74 |             };
 75 | 
 76 |             match format {
 77 |                 FileFormat::BED => {
 78 |                     let reader = BedderBed::new(br);
 79 |                     Ok(Box::new(reader))
 80 |                 }
 81 |                 _ => unimplemented!("{format:?} not yet supported"),
 82 |             }
 83 |         }
 84 |     }
 85 | }
 86 | 
 87 | /// detect the file format of a reader.
 88 | pub fn detect_file_format<R: BufRead, P: AsRef<Path>>(
 89 |     reader: &mut R,
 90 |     path: P,
 91 | ) -> std::io::Result<(FileFormat, Compression)> {
 92 |     let buf = reader.fill_buf()?;
 93 |     let mut dec_buf = vec![0; buf.len()];
 94 | 
 95 |     let is_gzipped = &buf[0..2] == b"\x1f\x8b";
 96 |     let (compression, dec_buf) = if is_gzipped && buf[3] & 4 != 0 && buf.len() >= 18 {
 97 |         let c = match &buf[12..16] {
 98 |             // BGZF magic number
 99 |             b"BC\x02\x00" => Compression::BGZF,
100 |             // RAZF magic number
101 |             b"RAZF" => Compression::RAZF,
102 |             _ => Compression::GZ,
103 |         };
104 | 
105 |         let mut gz = GzDecoder::new(buf);
106 |         // it's ok if we have an unexepected EOF here
107 |         match gz.read_exact(&mut dec_buf) {
108 |             Ok(_) => {}
109 |             Err(e) => {
110 |                 if e.kind() != std::io::ErrorKind::UnexpectedEof {
111 |                     return Err(e);
112 |                 }
113 |             }
114 |         }
115 |         (c, dec_buf.as_slice())
116 |     } else {
117 |         (
118 |             if is_gzipped {
119 |                 Compression::GZ
120 |             } else {
121 |                 Compression::None
122 |             },
123 |             buf,
124 |         )
125 |     };
126 | 
127 |     let format = if dec_buf.starts_with(b"BAM\x01") {
128 |         FileFormat::BAM
129 |     } else if &dec_buf[0..3] == b"BCF" && (dec_buf[3] == 0x2 || dec_buf[3] == 0x4) {
130 |         FileFormat::BCF
131 |     } else if dec_buf.starts_with(b"##fileformat=VCF") {
132 |         FileFormat::VCF
133 |     } else if dec_buf.starts_with(b"CRAM") {
134 |         FileFormat::CRAM
135 |     } else if dec_buf.len() > 3
136 |         && (&dec_buf[0..4] == b"@HD\t"
137 |             || &dec_buf[0..4] == b"@SQ\t"
138 |             || &dec_buf[0..4] == b"@RG\t"
139 |             || &dec_buf[0..4] == b"@PG\t"
140 |             || &dec_buf[0..4] == b"@CO\t")
141 |     {
142 |         FileFormat::SAM
143 |     } else {
144 |         let p = path.as_ref();
145 |         if p.ends_with(".bed") || p.ends_with(".bed.gz") || p.ends_with(".bed.bgz") {
146 |             FileFormat::BED
147 |         } else {
148 |             FileFormat::Unknown
149 |         }
150 |     };
151 | 
152 |     if matches!(format, FileFormat::Unknown) {
153 |         let s = String::from_utf8_lossy(dec_buf);
154 |         let mut lines = s
155 |             .lines()
156 |             .filter(|l| !l.is_empty() && !l.starts_with('#'))
157 |             .collect::<Vec<_>>();
158 |         if lines
159 |             .last()
160 |             .map(|l| !l.ends_with('\n') && l.split('\t').collect::<Vec<_>>().len() < 3)
161 |             .unwrap_or(false)
162 |         {
163 |             // drop the final incomplete line
164 |             lines.pop();
165 |         }
166 | 
167 |         if !lines.is_empty() && lines.iter().all(|&line| is_bed_line(line)) {
168 |             return Ok((FileFormat::BED, compression));
169 |         }
170 |     }
171 | 
172 |     Ok((format, compression))
173 | }
174 | 
/// Return true if `s` looks like a BED line: either a `#` comment, or a
/// tab-separated line with at least three columns whose 2nd and 3rd columns
/// parse as unsigned integers.
///
/// Coordinates are parsed as `u64` (the type used for positions elsewhere in
/// the crate), so positions above `i32::MAX` are accepted and negative values
/// are rejected.
fn is_bed_line(s: &str) -> bool {
    if s.starts_with('#') {
        return true;
    }
    // Skip the chromosome column; only start and stop need inspecting.
    let mut cols = s.split('\t').skip(1);
    match (cols.next(), cols.next()) {
        (Some(start), Some(stop)) => start.parse::<u64>().is_ok() && stop.parse::<u64>().is_ok(),
        // fewer than three columns
        _ => false,
    }
}
186 | 
#[cfg(test)]
mod tests {

    use super::*;
    use noodles::bam;
    use noodles::sam;

    /// The BAM fixture is sniffed as BGZF-compressed BAM and is still readable
    /// from the start afterwards.
    #[test]
    fn test_detect_format_bam() {
        let file_path = "tests/test.bam";
        let mut rdr = std::io::BufReader::new(std::fs::File::open(file_path).unwrap());
        let (format, compression) = detect_file_format(&mut rdr, file_path).unwrap();
        assert_eq!(compression, Compression::BGZF);
        assert_eq!(format, FileFormat::BAM);

        let mut bam_reader = bam::reader::Reader::new(&mut rdr);
        let header = bam_reader.read_header().expect("eror reading header");
        bam_reader.records(&header).into_iter().for_each(|record| {
            let record = record.expect("error reading record");
            eprintln!("{:?}", record);
        });
    }

    /// The SAM fixture is sniffed as uncompressed SAM and is still readable
    /// from the start afterwards.
    #[test]
    fn test_detect_format_sam() {
        let file_path = "tests/test.sam";
        let mut rdr = std::io::BufReader::new(std::fs::File::open(file_path).unwrap());
        let (format, compression) = detect_file_format(&mut rdr, file_path).unwrap();
        assert_eq!(compression, Compression::None);
        assert_eq!(format, FileFormat::SAM);

        let mut sam_reader = sam::reader::Reader::new(&mut rdr);
        let header = sam_reader.read_header().expect("eror reading header");
        sam_reader.records(&header).into_iter().for_each(|record| {
            let record = record.expect("error reading record");
            eprintln!("{:?}", record);
        });
    }

    #[test]
    fn test_is_bed_line() {
        let cases = [
            // valid BED line
            ("chr1\t100\t200\tname\t0\t+\t50\t150\t0\t2\t10,20\t0,80", true),
            // invalid BED line with missing columns
            ("chr1\t100", false),
            // invalid BED line with non-integer columns
            ("chr1\ta\tb\tname\t0\t+\t50\t150\t0\t2\t10,20\t0,80", false),
            // comment line
            ("# This is a comment", true),
            // single interval with no newline.
            ("chr1\t100\t200", true),
        ];
        for (line, expected) in cases.iter() {
            assert!(is_bed_line(line) == *expected);
        }
    }
}
251 | 


--------------------------------------------------------------------------------
/src/string.rs:
--------------------------------------------------------------------------------
// Selects the concrete `String` type used throughout the crate.
// Each optional feature re-exports a different string implementation under
// the name `String`; with none of them enabled, `std::string::String` is used.
// NOTE(review): enabling more than one of these features at once would
// re-export `String` twice — presumably at most one is meant to be enabled.
#[cfg(feature = "smol_str")]
pub use smol_str::SmolStr as String;

#[cfg(feature = "smartstring")]
pub use smartstring::alias::String;

#[cfg(feature = "compact_str")]
pub use compact_str::CompactString as String;

#[cfg(feature = "kstring")]
pub use kstring::KString as String;

// Fallback: the standard library string when no alternative is selected.
#[cfg(all(
    not(feature = "smartstring"),
    not(feature = "smol_str"),
    not(feature = "compact_str"),
    not(feature = "kstring"),
))]
pub use std::string::String;
20 | 


--------------------------------------------------------------------------------
/tests/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quinlan-lab/bedder-rs/ca1ff342611ed2b968729526cdd75ae988c4ea3b/tests/test.bam


--------------------------------------------------------------------------------
/tests/test.sam:
--------------------------------------------------------------------------------
1 | @HD	VN:1.6	SO:coordinate
2 | @SQ	SN:chr1	LN:1009800
3 | A	16	chr1	999901	42	100M	*	0	0	ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG	CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:100	YT:Z:UU
4 | B	0	chr1	999914	42	100M	*	0	0	TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:100	YT:Z:UU
5 | 


--------------------------------------------------------------------------------