├── .gitignore ├── .cargo └── config.toml ├── data ├── subset.bq ├── subset.vbq ├── subset_R1.bq ├── subset_R2.bq ├── subset_R1.fastq.gz └── subset_R2.fastq.gz ├── src ├── prelude.rs ├── context │ ├── mod.rs │ ├── traits.rs │ └── structs.rs ├── lib.rs ├── record.rs ├── vbq │ ├── mod.rs │ ├── header.rs │ └── index.rs ├── parallel.rs ├── policy.rs ├── bq │ ├── mod.rs │ ├── header.rs │ ├── writer.rs │ └── reader.rs └── error.rs ├── Cargo.toml ├── README.md ├── examples ├── streaming.rs ├── grep.rs ├── network_streaming.rs ├── example.rs ├── parallel_processing.rs ├── parallel_range.rs └── read_write.rs └── .github └── workflows └── ci.yml /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | Cargo.lock 4 | data/test* 5 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /data/subset.bq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset.bq -------------------------------------------------------------------------------- /data/subset.vbq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset.vbq -------------------------------------------------------------------------------- /data/subset_R1.bq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R1.bq -------------------------------------------------------------------------------- /data/subset_R2.bq: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R2.bq -------------------------------------------------------------------------------- /data/subset_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R1.fastq.gz -------------------------------------------------------------------------------- /data/subset_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R2.fastq.gz -------------------------------------------------------------------------------- /src/prelude.rs: -------------------------------------------------------------------------------- 1 | pub use super::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader}; 2 | 3 | pub use crate::context::{ 4 | Context, Ctx, HeaderContext, QualityContext, SeqCtx, SeqHeaderCtx, SeqQualCtx, SequenceContext, 5 | }; 6 | -------------------------------------------------------------------------------- /src/context/mod.rs: -------------------------------------------------------------------------------- 1 | /// Instances of common contexts 2 | mod structs; 3 | 4 | /// Traits for different context behaviors 5 | mod traits; 6 | 7 | pub use structs::{Ctx, SeqCtx, SeqHeaderCtx, SeqQualCtx}; 8 | pub use traits::{Context, HeaderContext, QualityContext, SequenceContext}; 9 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "binseq" 3 | version = "0.8.1" 4 | edition = "2021" 5 | description = "A high efficiency binary format for sequencing data" 6 | license = "MIT" 7 | authors = ["Noam Teyssier "] 8 | repository = "https://github.com/arcinstitute/binseq" 9 | documentation = "https://docs.rs/binseq" 10 | categories = 
["science::bioinformatics", "encoding", "data-structures"] 11 | keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"] 12 | 13 | [dependencies] 14 | anyhow = "1.0.100" 15 | auto_impl = "1.3.0" 16 | bitnuc = "0.3.0" 17 | bytemuck = "1.24.0" 18 | byteorder = "1.5.0" 19 | itoa = "1.0.15" 20 | memmap2 = "0.9.9" 21 | num_cpus = "1.17.0" 22 | rand = { version = "0.9.2", features = ["small_rng"] } 23 | thiserror = "2.0.17" 24 | zstd = { version = "0.13.3", features = ["zstdmt"] } 25 | 26 | [dev-dependencies] 27 | nucgen = "0.2.0" 28 | niffler = "3.0.0" 29 | seq_io = "0.3.4" 30 | parking_lot = "0.12.5" 31 | itoa = "1.0.15" 32 | memchr = "2.7.6" 33 | 34 | [lints.clippy] 35 | pedantic = { level = "warn", priority = -1 } 36 | cast_possible_truncation = "allow" 37 | missing_errors_doc = "allow" 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BINSEQ Format Specification 2 | 3 | [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE.md) 4 | ![actions status](https://github.com/arcinstitute/binseq/workflows/CI/badge.svg) 5 | [![Crates.io](https://img.shields.io/crates/d/binseq?color=orange&label=crates.io)](https://crates.io/crates/binseq) 6 | [![docs.rs](https://img.shields.io/docsrs/binseq?color=green&label=docs.rs)](https://docs.rs/binseq/latest/binseq/) 7 | 8 | ## Overview 9 | 10 | BINSEQ is a binary file format family designed for efficient storage and processing of DNA sequences. 11 | They make use of two-bit encoding for nucleotides and are optimized for high-performance parallel processing. 12 | 13 | BINSEQ currently has two flavors: 14 | 15 | 1. **BQ**: (`*.bq`) files are for _fixed-length_ records **without** quality scores. 16 | 2. **VBQ**: (`*.vbq`) files are for _variable-length_ records **with optional** quality scores and headers. 
17 | 18 | Both flavors support both single and paired sequences. 19 | 20 | ## Getting Started 21 | 22 | This is a **library** for reading and writing BINSEQ files, for a **command-line interface** see [bqtools](https://github.com/arcinstitute/bqtools). 23 | 24 | To get started please refer to our [documentation](https://docs.rs/binseq/latest/binseq/). 25 | For example programs which make use of the library check out our [examples directory](https://github.com/arcinstitute/binseq/tree/main/examples). 26 | 27 | For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 28 | -------------------------------------------------------------------------------- /examples/streaming.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, Cursor}; 2 | 3 | use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 4 | use binseq::{BinseqRecord, Policy, Result}; 5 | 6 | fn main() -> Result<()> { 7 | // Create a header for sequences of length 100 8 | let header = BinseqHeaderBuilder::new().slen(100).build()?; 9 | 10 | // Create some example sequence data 11 | let sequence = b"ACGT".repeat(25); // 100 nucleotides 12 | 13 | // Create a stream writer with a memory buffer as destination 14 | let mut writer = StreamWriterBuilder::default() 15 | .header(header) 16 | .policy(Policy::RandomDraw) // Use random nucleotides for invalid bases 17 | .buffer_capacity(4096) // Use 4K buffer 18 | .build(Cursor::new(Vec::new()))?; 19 | 20 | // Write the sequence with flag 0 21 | writer.write_record(Some(0), &sequence)?; 22 | 23 | // Write the sequence with flag 1 24 | writer.write_record(Some(1), &sequence)?; 25 | 26 | // Flush and get the buffer 27 | let buffer = writer.into_inner()?; 28 | let buffer_inner = buffer.into_inner(); 29 | 30 | println!("Wrote {} bytes to buffer", buffer_inner.len()); 31 | 32 | // Now read from the buffer using the 
streaming reader 33 | let cursor = Cursor::new(buffer_inner); 34 | let buf_reader = BufReader::new(cursor); 35 | 36 | // Create a stream reader 37 | let mut reader = StreamReader::new(buf_reader); 38 | 39 | // Read and display the header 40 | let header = reader.read_header()?; 41 | println!("Read header: sequence length = {}", header.slen); 42 | 43 | // Read records one by one 44 | let mut count = 0; 45 | while let Some(record) = reader.next_record() { 46 | let record = record?; 47 | println!("Record {}: flag = {:?}", count, record.flag()); 48 | count += 1; 49 | } 50 | 51 | println!("Read {count} records"); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | env: 6 | CARGO_TERM_COLOR: always 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Build 14 | run: cargo build --verbose 15 | - name: Run tests 16 | run: cargo test --verbose 17 | - name: Run tests (release) 18 | run: cargo test --verbose --release 19 | 20 | fmt_lint: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Formatting 25 | run: cargo fmt --check 26 | - name: Linting 27 | run: cargo clippy --verbose 28 | 29 | example_read_write: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | - name: run example 34 | run: cargo run --release --example read_write 35 | 36 | example_parallel: 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v3 40 | - name: run example 41 | run: cargo run --release --example parallel_processing 42 | 43 | example_example: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - uses: actions/checkout@v3 47 | - name: run example 48 | run: cargo run --release --example example 49 | 50 | example_grep: 51 | runs-on: ubuntu-latest 52 | steps: 
53 | - uses: actions/checkout@v3 54 | - name: run example bq 55 | run: cargo run --release --example grep ./data/subset.bq 56 | - name: run example vbq 57 | run: cargo run --release --example grep ./data/subset.vbq 58 | 59 | example_range: 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v3 63 | - name: run example (bq) 64 | run: cargo run --release --example parallel_range -- ./data/subset.bq 4 30 200 65 | - name: run example (vbq) 66 | run: cargo run --release --example parallel_range -- ./data/subset.vbq 4 30 200 67 | -------------------------------------------------------------------------------- /examples/grep.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use anyhow::Result; 4 | use binseq::{context::SeqCtx, prelude::*}; 5 | use memchr::memmem::Finder; 6 | use parking_lot::Mutex; 7 | 8 | #[derive(Clone)] 9 | pub struct GrepCounter { 10 | // (thread) local variables 11 | ctx: SeqCtx, 12 | local_count: usize, 13 | 14 | // search pattern (using memchr::memmem::Finder for fast searching) 15 | pattern: Finder<'static>, 16 | 17 | // global variables 18 | count: Arc>, 19 | } 20 | impl GrepCounter { 21 | #[must_use] 22 | pub fn new(pattern: &[u8]) -> Self { 23 | Self { 24 | ctx: SeqCtx::default(), 25 | pattern: Finder::new(pattern).into_owned(), 26 | local_count: 0, 27 | count: Arc::new(Mutex::new(0)), 28 | } 29 | } 30 | 31 | fn match_sequence(&self, seq: &[u8]) -> bool { 32 | self.pattern.find(seq).is_some() 33 | } 34 | 35 | fn pprint(&self) { 36 | println!("Matching records: {}", self.count.lock()); 37 | } 38 | } 39 | impl ParallelProcessor for GrepCounter { 40 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 41 | self.ctx.fill(&record)?; 42 | 43 | if self.match_sequence(&self.ctx.sbuf()) || self.match_sequence(&self.ctx.xbuf()) { 44 | self.local_count += 1; 45 | } 46 | 47 | Ok(()) 48 | } 49 | 50 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 51 | 
*self.count.lock() += self.local_count; 52 | self.local_count = 0; 53 | Ok(()) 54 | } 55 | } 56 | 57 | fn main() -> Result<()> { 58 | let path = std::env::args() 59 | .nth(1) 60 | .unwrap_or("./data/subset.bq".to_string()); 61 | let pattern = std::env::args() 62 | .nth(2) 63 | .unwrap_or("ACGT".to_string()) 64 | .as_bytes() 65 | .to_vec(); 66 | let n_threads = std::env::args().nth(3).unwrap_or("1".to_string()).parse()?; 67 | 68 | let reader = BinseqReader::new(&path)?; 69 | let counter = GrepCounter::new(&pattern); 70 | reader.process_parallel(counter.clone(), n_threads)?; 71 | counter.pprint(); 72 | 73 | Ok(()) 74 | } 75 | -------------------------------------------------------------------------------- /examples/network_streaming.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, BufWriter}; 2 | use std::net::{TcpListener, TcpStream}; 3 | use std::thread; 4 | 5 | use binseq::bq::{BinseqHeader, BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 6 | use binseq::{BinseqRecord, Policy, Result}; 7 | 8 | fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> { 9 | // Create a listener on localhost:3000 10 | let listener = TcpListener::bind("127.0.0.1:3000").expect("Failed to bind to address"); 11 | println!("Server listening on 127.0.0.1:3000"); 12 | 13 | // Accept one connection 14 | let (stream, _) = listener.accept().expect("Failed to accept connection"); 15 | println!("Client connected"); 16 | 17 | let stream = BufWriter::new(stream); 18 | 19 | // Create a stream writer with the network stream as destination 20 | let mut writer = StreamWriterBuilder::default() 21 | .header(header) 22 | .policy(Policy::RandomDraw) 23 | .buffer_capacity(16384) // Larger buffer for network I/O 24 | .build(stream)?; 25 | 26 | // Write sequences in a loop 27 | for i in 0..10 { 28 | writer.write_record(Some(i), sequence)?; 29 | println!("Server: Sent record {i}"); 30 | 31 | // Simulate delay between records 32 | 
thread::sleep(std::time::Duration::from_millis(100)); 33 | } 34 | 35 | // Ensure flush on drop 36 | writer.flush()?; 37 | println!("Server: All records sent"); 38 | 39 | Ok(()) 40 | } 41 | 42 | fn client() -> Result<()> { 43 | // Wait a moment for the server to start 44 | thread::sleep(std::time::Duration::from_millis(500)); 45 | 46 | // Connect to the server 47 | let stream = TcpStream::connect("127.0.0.1:3000").expect("Failed to connect to server"); 48 | println!("Connected to server"); 49 | 50 | // Create a buffered reader for the stream 51 | let reader = BufReader::new(stream); 52 | 53 | // Create a streaming reader 54 | let mut reader = StreamReader::new(reader); 55 | 56 | // Read the header 57 | let header = reader.read_header()?; 58 | println!( 59 | "Client: Received header with sequence length = {}", 60 | header.slen 61 | ); 62 | 63 | // Read records as they arrive 64 | let mut count = 0; 65 | while let Some(record) = reader.next_record() { 66 | let record = record?; 67 | println!( 68 | "Client: Received record {} with flag = {:?}", 69 | count, 70 | record.flag() 71 | ); 72 | count += 1; 73 | } 74 | 75 | println!("Client: Received {count} records total"); 76 | 77 | Ok(()) 78 | } 79 | 80 | fn main() -> Result<()> { 81 | // Create a header for sequences of length 100 82 | let header = BinseqHeaderBuilder::new().slen(100).build()?; 83 | 84 | // Create some example sequence data 85 | let sequence = b"ACGT".repeat(25); // 100 nucleotides 86 | 87 | // Spawn the server in a separate thread 88 | let server_thread = thread::spawn(move || { 89 | if let Err(e) = server(header, &sequence) { 90 | eprintln!("Server error: {e:?}"); 91 | } 92 | }); 93 | 94 | // Run the client in the main thread 95 | if let Err(e) = client() { 96 | eprintln!("Client error: {e:?}"); 97 | } 98 | 99 | // Wait for the server to finish 100 | server_thread.join().unwrap(); 101 | 102 | Ok(()) 103 | } 104 | -------------------------------------------------------------------------------- 
/examples/example.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{stdout, BufWriter, Write}; 3 | use std::sync::Arc; 4 | 5 | use anyhow::Result; 6 | use binseq::prelude::*; 7 | 8 | use parking_lot::Mutex; 9 | 10 | /// A struct for decoding BINSEQ data back to FASTQ format. 11 | #[derive(Clone)] 12 | pub struct Decoder { 13 | /// Reusable context 14 | ctx: Ctx, 15 | 16 | /// local output buffer 17 | local_writer: Vec, 18 | 19 | /// global output buffer 20 | global_writer: Arc>>, 21 | 22 | /// Local count of records 23 | local_count: usize, 24 | 25 | /// global count of records 26 | global_count: Arc>, 27 | } 28 | 29 | impl Decoder { 30 | #[must_use] 31 | pub fn new(writer: Box) -> Self { 32 | let global_writer = Arc::new(Mutex::new(writer)); 33 | Decoder { 34 | local_writer: Vec::new(), 35 | ctx: Ctx::default(), 36 | local_count: 0, 37 | global_writer, 38 | global_count: Arc::new(Mutex::new(0)), 39 | } 40 | } 41 | 42 | #[must_use] 43 | pub fn num_records(&self) -> usize { 44 | *self.global_count.lock() 45 | } 46 | } 47 | impl ParallelProcessor for Decoder { 48 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 49 | self.ctx.fill(&record)?; 50 | write_fastq_parts( 51 | &mut self.local_writer, 52 | self.ctx.sheader(), 53 | self.ctx.sbuf(), 54 | self.ctx.squal(), 55 | )?; 56 | 57 | // write extended fastq to local buffer 58 | if record.is_paired() { 59 | write_fastq_parts( 60 | &mut self.local_writer, 61 | self.ctx.xheader(), 62 | &self.ctx.xbuf(), 63 | self.ctx.xqual(), 64 | )?; 65 | } 66 | 67 | self.local_count += 1; 68 | Ok(()) 69 | } 70 | 71 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 72 | // Lock the mutex to write to the global buffer 73 | { 74 | let mut lock = self.global_writer.lock(); 75 | lock.write_all(&self.local_writer)?; 76 | lock.flush()?; 77 | } 78 | // Lock the mutex to update the number of records 79 | { 80 | let mut global_count = 
self.global_count.lock(); 81 | *global_count += self.local_count; 82 | } 83 | 84 | // Clear the local buffer and reset the local record count 85 | self.local_writer.clear(); 86 | self.local_count = 0; 87 | Ok(()) 88 | } 89 | } 90 | 91 | #[allow(clippy::missing_errors_doc)] 92 | pub fn write_fastq_parts( 93 | writer: &mut W, 94 | index: &[u8], 95 | sequence: &[u8], 96 | quality: &[u8], 97 | ) -> Result<(), std::io::Error> { 98 | writer.write_all(b"@seq.")?; 99 | writer.write_all(index)?; 100 | writer.write_all(b"\n")?; 101 | writer.write_all(sequence)?; 102 | writer.write_all(b"\n+\n")?; 103 | writer.write_all(quality)?; 104 | writer.write_all(b"\n")?; 105 | Ok(()) 106 | } 107 | 108 | fn match_output(path: Option<&str>) -> Result> { 109 | if let Some(path) = path { 110 | let writer = File::create(path).map(BufWriter::new)?; 111 | Ok(Box::new(writer)) 112 | } else { 113 | let stdout = stdout(); 114 | Ok(Box::new(BufWriter::new(stdout))) 115 | } 116 | } 117 | 118 | fn main() -> Result<()> { 119 | let file = std::env::args() 120 | .nth(1) 121 | .unwrap_or("./data/subset.bq".to_string()); 122 | let n_threads = std::env::args().nth(2).unwrap_or("1".to_string()).parse()?; 123 | 124 | let reader = BinseqReader::new(&file)?; 125 | let writer = match_output(None)?; 126 | let proc = Decoder::new(writer); 127 | 128 | reader.process_parallel(proc.clone(), n_threads)?; 129 | eprintln!("Read {} records", proc.num_records()); 130 | 131 | Ok(()) 132 | } 133 | -------------------------------------------------------------------------------- /src/context/traits.rs: -------------------------------------------------------------------------------- 1 | use crate::{BinseqRecord, Result}; 2 | 3 | pub const DEFAULT_QUALITY: u8 = b'?'; 4 | 5 | /// Trait for handling reusable buffers in decoding BINSEQ records. 6 | pub trait Context: Clone + Default { 7 | /// Replaces the contents of the context with the contents of the given record. 
8 | /// 9 | /// This will clear all existing data and fill the context with the contents of the record. 10 | fn fill(&mut self, record: &R) -> Result<()>; 11 | } 12 | 13 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on nucleotide sequences. 14 | pub trait SequenceContext { 15 | fn sbuf(&self) -> &[u8]; 16 | fn xbuf(&self) -> &[u8]; 17 | fn sbuf_mut(&mut self) -> &mut Vec; 18 | fn xbuf_mut(&mut self) -> &mut Vec; 19 | #[inline] 20 | fn clear_sequences(&mut self) { 21 | self.sbuf_mut().clear(); 22 | self.xbuf_mut().clear(); 23 | } 24 | #[inline] 25 | #[allow(deprecated)] 26 | fn fill_sequences(&mut self, record: &R) -> Result<()> { 27 | self.clear_sequences(); 28 | record.decode_s(self.sbuf_mut())?; 29 | if record.is_paired() { 30 | record.decode_x(self.xbuf_mut())?; 31 | } 32 | Ok(()) 33 | } 34 | } 35 | 36 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on quality data. 37 | pub trait QualityContext { 38 | fn squal(&self) -> &[u8]; 39 | fn xqual(&self) -> &[u8]; 40 | fn squal_mut(&mut self) -> &mut Vec; 41 | fn xqual_mut(&mut self) -> &mut Vec; 42 | #[inline] 43 | fn clear_qualities(&mut self) { 44 | self.squal_mut().clear(); 45 | self.xqual_mut().clear(); 46 | } 47 | #[inline] 48 | fn fill_qualities(&mut self, record: &R) -> Result<()> { 49 | if record.has_quality() { 50 | let slen = record.slen() as usize; 51 | let squal = self.squal_mut(); 52 | if squal.len() != slen { 53 | squal.resize(slen, DEFAULT_QUALITY); 54 | } 55 | squal.copy_from_slice(record.squal()); 56 | 57 | if record.is_paired() { 58 | let xlen = record.xlen() as usize; 59 | let xqual = self.xqual_mut(); 60 | if xqual.len() != xlen { 61 | xqual.resize(xlen, DEFAULT_QUALITY); 62 | } 63 | xqual.copy_from_slice(record.xqual()); 64 | } 65 | } else { 66 | self.ensure_quality_capacity(record); 67 | } 68 | Ok(()) 69 | } 70 | #[inline] 71 | fn ensure_quality_capacity(&mut self, record: &R) { 72 | let slen = record.slen() as usize; 73 | let xlen = 
record.xlen() as usize; 74 | 75 | // only resize if its not the right size 76 | let squal = self.squal_mut(); 77 | if squal.len() != slen { 78 | squal.resize(slen, DEFAULT_QUALITY); 79 | } 80 | 81 | // Only resize if there's an extended sequence and it's not already the right size 82 | let xqual = self.xqual_mut(); 83 | if xqual.len() != xlen { 84 | xqual.resize(xlen, DEFAULT_QUALITY); 85 | } 86 | } 87 | } 88 | 89 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on header data. 90 | pub trait HeaderContext { 91 | fn sheader(&self) -> &[u8]; 92 | fn sheader_mut(&mut self) -> &mut Vec; 93 | fn xheader(&self) -> &[u8]; 94 | fn xheader_mut(&mut self) -> &mut Vec; 95 | #[inline] 96 | fn clear_headers(&mut self) { 97 | self.sheader_mut().clear(); 98 | self.xheader_mut().clear(); 99 | } 100 | 101 | #[inline] 102 | fn fill_headers(&mut self, record: &R) { 103 | self.clear_headers(); 104 | self.sheader_mut().extend_from_slice(record.sheader()); 105 | if record.is_paired() { 106 | self.xheader_mut().extend_from_slice(record.xheader()); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | //! 3 | //! # BINSEQ 4 | //! 5 | //! The `binseq` library provides efficient APIs for working with the [BINSEQ](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1) file format family. 6 | //! 7 | //! It offers methods to read and write BINSEQ files, providing: 8 | //! 9 | //! - Compact multi-bit encoding and decoding of nucleotide sequences through [`bitnuc`](https://docs.rs/bitnuc/latest/bitnuc/) 10 | //! - Memory-mapped file access for efficient reading ([`bq::MmapReader`] and [`vbq::MmapReader`]) 11 | //! - Parallel processing capabilities for arbitrary tasks through the [`ParallelProcessor`] trait. 12 | //! 
- Configurable [`Policy`] for handling invalid nucleotides 13 | //! - Support for both single and paired-end sequences 14 | //! - Optional sequence headers/identifiers (VBQ format) 15 | //! - Abstract [`BinseqRecord`] trait for representing records from both `.bq` and `.vbq` files. 16 | //! - Abstract [`BinseqReader`] enum for processing records from both `.bq` and `.vbq` files. 17 | //! 18 | //! ## Recent VBQ Format Changes (v0.7.0+) 19 | //! 20 | //! The VBQ format has undergone significant improvements: 21 | //! 22 | //! - **Embedded Index**: VBQ files now contain their index data embedded at the end of the file, 23 | //! eliminating separate `.vqi` index files and improving portability. 24 | //! - **Headers Support**: Optional sequence identifiers/headers can be stored with each record. 25 | //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records. 26 | //! - **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. 27 | //! 28 | //! Legacy VBQ files are automatically migrated to the new format when accessed. 29 | //! 30 | //! ## Crate Organization 31 | //! 32 | //! This library is split into 3 major parts. 33 | //! 34 | //! There are the [`bq`] and [`vbq`] modules, which provide tools for reading and writing `BQ` and `VBQ` files respectively. 35 | //! Then there are traits and utilities that are ubiquitous across the library which are available at the top-level of the crate. 36 | //! 37 | //! # Example: Memory-mapped Access 38 | //! 39 | //! ``` 40 | //! use binseq::Result; 41 | //! use binseq::prelude::*; 42 | //! 43 | //! #[derive(Clone, Default)] 44 | //! pub struct Processor { 45 | //! // Define fields here 46 | //! } 47 | //! 48 | //! impl ParallelProcessor for Processor { 49 | //! fn process_record(&mut self, record: B) -> Result<()> { 50 | //! // Implement per-record logic here 51 | //! Ok(()) 52 | //! } 53 | //! 54 | //! fn on_batch_complete(&mut self) -> Result<()> { 55 | //! 
// Implement per-batch logic here 56 | //! Ok(()) 57 | //! } 58 | //! } 59 | //! 60 | //! fn main() -> Result<()> { 61 | //! // provide an input path (*.bq or *.vbq) 62 | //! let path = "./data/subset.bq"; 63 | //! 64 | //! // open a reader 65 | //! let reader = BinseqReader::new(path)?; 66 | //! 67 | //! // initialize a processor 68 | //! let processor = Processor::default(); 69 | //! 70 | //! // process the records in parallel with 8 threads 71 | //! reader.process_parallel(processor, 8)?; 72 | //! Ok(()) 73 | //! } 74 | //! ``` 75 | 76 | #![allow(clippy::module_inception)] 77 | 78 | /// BQ - fixed length records, no quality scores 79 | pub mod bq; 80 | 81 | /// Error definitions 82 | pub mod error; 83 | 84 | /// Parallel processing 85 | mod parallel; 86 | 87 | /// Invalid nucleotide policy 88 | mod policy; 89 | 90 | /// Record trait shared between BINSEQ variants 91 | mod record; 92 | 93 | /// VBQ - Variable length records, optional quality scores, compressed blocks 94 | pub mod vbq; 95 | 96 | /// Prelude - Commonly used types and traits 97 | pub mod prelude; 98 | 99 | /// Context - Reusable state for parallel processing 100 | pub mod context; 101 | 102 | pub use error::{Error, IntoBinseqError, Result}; 103 | pub use parallel::{BinseqReader, ParallelProcessor, ParallelReader}; 104 | pub use policy::{Policy, RNG_SEED}; 105 | pub use record::BinseqRecord; 106 | 107 | /// Re-export `bitnuc::BitSize` 108 | pub use bitnuc::BitSize; 109 | -------------------------------------------------------------------------------- /src/record.rs: -------------------------------------------------------------------------------- 1 | use auto_impl::auto_impl; 2 | use bitnuc::BitSize; 3 | 4 | use super::Result; 5 | 6 | /// Record trait shared between BINSEQ variants. 7 | /// 8 | /// Exposes public methods for accessing internal data. 9 | /// Interfaces with the [`bitnuc`] crate for decoding sequences. 
10 | /// 11 | /// Implemented by [`bq::RefRecord`](crate::bq::RefRecord) and [`vbq::RefRecord`](crate::vbq::RefRecord). 12 | /// 13 | /// Used to interact with [`ParallelProcessor`](crate::ParallelProcessor) for easy parallel processing. 14 | #[auto_impl(&, &mut)] 15 | pub trait BinseqRecord { 16 | /// Returns the bitsize of the record (number of bits per nucleotide) 17 | fn bitsize(&self) -> BitSize; 18 | 19 | /// Returns the global index of the record. 20 | fn index(&self) -> u64; 21 | 22 | /// Returns the flag value of this record 23 | fn flag(&self) -> Option; 24 | 25 | /// Returns the header of this record 26 | fn sheader(&self) -> &[u8]; 27 | 28 | /// Returns the header of the extended/paired sequence (empty if not paired) 29 | fn xheader(&self) -> &[u8]; 30 | 31 | /// Returns the length of the primary sequence of this record 32 | fn slen(&self) -> u64; 33 | 34 | /// Returns the length of the extended sequence of this record 35 | fn xlen(&self) -> u64; 36 | 37 | /// Returns a reference to the **encoded** primary sequence of this record 38 | fn sbuf(&self) -> &[u64]; 39 | 40 | /// Returns a reference to the **encoded** extended sequence of this record. 41 | /// 42 | /// Empty if no extended sequence is present. 43 | fn xbuf(&self) -> &[u64]; 44 | 45 | /// Returns a reference to the quality scores of the primary sequence of this record. 46 | /// 47 | /// Empty if no quality scores are present. 48 | fn squal(&self) -> &[u8] { 49 | &[] 50 | } 51 | 52 | /// Returns a reference to the quality scores of the extended sequence of this record. 53 | /// 54 | /// Empty if no quality scores are present. 55 | fn xqual(&self) -> &[u8] { 56 | &[] 57 | } 58 | 59 | /// Decodes the primary sequence of this record into the provided buffer. 60 | fn decode_s(&self, buf: &mut Vec) -> Result<()> { 61 | self.bitsize() 62 | .decode(self.sbuf(), self.slen() as usize, buf)?; 63 | Ok(()) 64 | } 65 | 66 | /// Decodes the extended sequence of this record into the provided buffer. 
67 | fn decode_x(&self, buf: &mut Vec) -> Result<()> { 68 | self.bitsize() 69 | .decode(self.xbuf(), self.xlen() as usize, buf)?; 70 | Ok(()) 71 | } 72 | 73 | /// Returns a reference to the primary decoded sequence of this record. 74 | /// 75 | /// This is not available on all types that implement the `Record` trait. 76 | /// It should be available on types that implement it in this library however. 77 | fn sseq(&self) -> &[u8] { 78 | unimplemented!("This record does not implement direct sequence access"); 79 | } 80 | 81 | /// Returns a reference to the extended decoded sequence of this record. 82 | /// 83 | /// This may not be available on all types that implement the `Record` trait. 84 | /// It should be available on types that implement it in this library however. 85 | fn xseq(&self) -> &[u8] { 86 | unimplemented!("This record does not implement direct sequence access"); 87 | } 88 | 89 | /// Decodes the primary sequence of this record into a newly allocated buffer. 90 | /// 91 | /// Not advised to use this function as it allocates a new buffer every time. 92 | fn decode_s_alloc(&self) -> Result> { 93 | let mut buf = Vec::with_capacity(self.slen() as usize); 94 | self.decode_s(&mut buf)?; 95 | Ok(buf) 96 | } 97 | 98 | /// Decodes the extended sequence of this record into a newly allocated buffer. 99 | /// 100 | /// Not advised to use this function as it allocates a new buffer every time. 101 | fn decode_x_alloc(&self) -> Result> { 102 | let mut buf = Vec::with_capacity(self.xlen() as usize); 103 | self.decode_x(&mut buf)?; 104 | Ok(buf) 105 | } 106 | 107 | /// A convenience function to check if the record is paired. 
108 | fn is_paired(&self) -> bool { 109 | self.xlen() > 0 110 | } 111 | 112 | /// A convenience function to check if record has associated quality scores 113 | fn has_quality(&self) -> bool { 114 | !self.squal().is_empty() 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /examples/parallel_processing.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::BufWriter, 4 | sync::{ 5 | atomic::{AtomicUsize, Ordering}, 6 | Arc, 7 | }, 8 | }; 9 | 10 | use anyhow::{bail, Result}; 11 | use binseq::{ 12 | bq::{self, BinseqHeaderBuilder}, 13 | context::SeqCtx, 14 | prelude::*, 15 | }; 16 | use nucgen::Sequence; 17 | 18 | #[derive(Clone, Default)] 19 | pub struct MyProcessor { 20 | local_counter: usize, 21 | counter: Arc, 22 | ctx: SeqCtx, 23 | } 24 | impl MyProcessor { 25 | #[must_use] 26 | pub fn counter(&self) -> usize { 27 | self.counter.load(Ordering::Relaxed) 28 | } 29 | } 30 | impl ParallelProcessor for MyProcessor { 31 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 32 | self.ctx.fill_sequences(&record)?; 33 | self.local_counter += 1; 34 | Ok(()) 35 | } 36 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 37 | self.counter 38 | .fetch_add(self.local_counter, Ordering::Relaxed); 39 | self.local_counter = 0; 40 | Ok(()) 41 | } 42 | } 43 | 44 | fn mmap_processing(binseq_path: &str, n_threads: usize) -> Result<()> { 45 | let reader = BinseqReader::new(binseq_path)?; 46 | let proc = MyProcessor::default(); 47 | reader.process_parallel(proc.clone(), n_threads)?; 48 | Ok(()) 49 | } 50 | 51 | pub fn main() -> Result<()> { 52 | let binseq_path_single = "./data/test.bq"; 53 | let binseq_path_paired = "./data/test_paired.bq"; 54 | let r1_size = 150; 55 | let r2_size = 300; 56 | let num_seq = 1_000_000; 57 | 58 | time_it( 59 | || { 60 | write_single(binseq_path_single, num_seq, r1_size)?; 61 | Ok(()) 62 | }, 63 | "write_single", 64 | ); 
65 | 66 | time_it( 67 | || { 68 | write_paired(binseq_path_paired, num_seq, r1_size, r2_size)?; 69 | Ok(()) 70 | }, 71 | "write_paired", 72 | ); 73 | 74 | for n_threads in 1..=16 { 75 | if n_threads > 1 && n_threads % 2 != 0 { 76 | continue; 77 | } 78 | time_it( 79 | || { 80 | mmap_processing(binseq_path_single, n_threads)?; 81 | Ok(()) 82 | }, 83 | &format!("single - mmap_parallel_processing ({n_threads})"), 84 | ); 85 | } 86 | for n_threads in 1..=16 { 87 | if n_threads > 1 && n_threads % 2 != 0 { 88 | continue; 89 | } 90 | time_it( 91 | || { 92 | mmap_processing(binseq_path_paired, n_threads)?; 93 | Ok(()) 94 | }, 95 | &format!("paired - mmap_parallel_processing ({n_threads})"), 96 | ); 97 | } 98 | 99 | Ok(()) 100 | } 101 | 102 | fn time_it(f: F, name: &str) 103 | where 104 | F: Fn() -> Result<()>, 105 | { 106 | let now = std::time::Instant::now(); 107 | f().unwrap(); 108 | let elapsed = now.elapsed(); 109 | eprintln!("Elapsed time ({name}): {elapsed:?}"); 110 | } 111 | 112 | fn write_single(binseq_path: &str, num_seq: usize, seq_size: usize) -> Result<()> { 113 | // Open the output file 114 | let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; 115 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 116 | let mut writer = bq::BinseqWriterBuilder::default() 117 | .header(header) 118 | .build(out_handle)?; 119 | 120 | // Write the binary sequence 121 | let mut sequence = Sequence::new(); 122 | let mut rng = rand::rng(); 123 | for _ in 0..num_seq { 124 | sequence.fill_buffer(&mut rng, seq_size); 125 | if !writer.write_record(Some(0), sequence.bytes())? 
{ 126 | bail!("Error writing nucleotides") 127 | } 128 | } 129 | writer.flush()?; 130 | eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); 131 | Ok(()) 132 | } 133 | 134 | fn write_paired(binseq_path: &str, num_seq: usize, r1_size: usize, r2_size: usize) -> Result<()> { 135 | // Open the output file 136 | let header = bq::BinseqHeaderBuilder::new() 137 | .slen(r1_size as u32) 138 | .xlen(r2_size as u32) 139 | .build()?; 140 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 141 | let mut writer = bq::BinseqWriterBuilder::default() 142 | .header(header) 143 | .build(out_handle)?; 144 | 145 | // Write the binary sequence 146 | let mut r1 = Sequence::new(); 147 | let mut r2 = Sequence::new(); 148 | let mut rng = rand::rng(); 149 | for _ in 0..num_seq { 150 | r1.fill_buffer(&mut rng, r1_size); 151 | r2.fill_buffer(&mut rng, r2_size); 152 | 153 | if !writer.write_paired_record(Some(0), r1.bytes(), r2.bytes())? { 154 | bail!("Error writing nucleotides") 155 | } 156 | } 157 | writer.flush()?; 158 | eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); 159 | Ok(()) 160 | } 161 | -------------------------------------------------------------------------------- /src/context/structs.rs: -------------------------------------------------------------------------------- 1 | use super::traits::{Context, HeaderContext, QualityContext, SequenceContext}; 2 | use crate::{BinseqRecord, Result}; 3 | 4 | /// Context for sequence data 5 | /// 6 | /// Has all the necessary fields for storing sequence data. 
7 | #[derive(Clone, Default)] 8 | pub struct Ctx { 9 | sbuf: Vec, 10 | xbuf: Vec, 11 | sheader: Vec, 12 | xheader: Vec, 13 | squal: Vec, 14 | xqual: Vec, 15 | } 16 | impl SequenceContext for Ctx { 17 | #[inline] 18 | fn sbuf(&self) -> &[u8] { 19 | &self.sbuf 20 | } 21 | #[inline] 22 | fn xbuf(&self) -> &[u8] { 23 | &self.xbuf 24 | } 25 | #[inline] 26 | fn sbuf_mut(&mut self) -> &mut Vec { 27 | &mut self.sbuf 28 | } 29 | #[inline] 30 | fn xbuf_mut(&mut self) -> &mut Vec { 31 | &mut self.xbuf 32 | } 33 | } 34 | impl QualityContext for Ctx { 35 | #[inline] 36 | fn squal(&self) -> &[u8] { 37 | &self.squal 38 | } 39 | #[inline] 40 | fn xqual(&self) -> &[u8] { 41 | &self.xqual 42 | } 43 | #[inline] 44 | fn squal_mut(&mut self) -> &mut Vec { 45 | &mut self.squal 46 | } 47 | #[inline] 48 | fn xqual_mut(&mut self) -> &mut Vec { 49 | &mut self.xqual 50 | } 51 | } 52 | impl HeaderContext for Ctx { 53 | #[inline] 54 | fn sheader(&self) -> &[u8] { 55 | &self.sheader 56 | } 57 | #[inline] 58 | fn xheader(&self) -> &[u8] { 59 | &self.xheader 60 | } 61 | #[inline] 62 | fn sheader_mut(&mut self) -> &mut Vec { 63 | &mut self.sheader 64 | } 65 | #[inline] 66 | fn xheader_mut(&mut self) -> &mut Vec { 67 | &mut self.xheader 68 | } 69 | } 70 | impl Context for Ctx { 71 | #[inline] 72 | fn fill(&mut self, record: &R) -> Result<()> { 73 | self.fill_sequences(record)?; 74 | self.fill_qualities(record)?; 75 | self.fill_headers(record); 76 | Ok(()) 77 | } 78 | } 79 | 80 | /// Context for just sequence data 81 | /// 82 | /// Only stores nucleotide sequence data. 
83 | #[derive(Clone, Default)] 84 | pub struct SeqCtx { 85 | sbuf: Vec, 86 | xbuf: Vec, 87 | } 88 | impl SequenceContext for SeqCtx { 89 | #[inline] 90 | fn sbuf(&self) -> &[u8] { 91 | &self.sbuf 92 | } 93 | #[inline] 94 | fn xbuf(&self) -> &[u8] { 95 | &self.xbuf 96 | } 97 | #[inline] 98 | fn sbuf_mut(&mut self) -> &mut Vec { 99 | &mut self.sbuf 100 | } 101 | #[inline] 102 | fn xbuf_mut(&mut self) -> &mut Vec { 103 | &mut self.xbuf 104 | } 105 | } 106 | impl Context for SeqCtx { 107 | #[inline] 108 | fn fill(&mut self, record: &R) -> Result<()> { 109 | self.fill_sequences(record) 110 | } 111 | } 112 | 113 | /// Context for sequence data and headers 114 | /// 115 | /// Does not store quality data. 116 | #[derive(Clone, Default)] 117 | pub struct SeqHeaderCtx { 118 | sbuf: Vec, 119 | xbuf: Vec, 120 | sheader: Vec, 121 | xheader: Vec, 122 | } 123 | impl SequenceContext for SeqHeaderCtx { 124 | #[inline] 125 | fn sbuf(&self) -> &[u8] { 126 | &self.sbuf 127 | } 128 | #[inline] 129 | fn xbuf(&self) -> &[u8] { 130 | &self.xbuf 131 | } 132 | #[inline] 133 | fn sbuf_mut(&mut self) -> &mut Vec { 134 | &mut self.sbuf 135 | } 136 | #[inline] 137 | fn xbuf_mut(&mut self) -> &mut Vec { 138 | &mut self.xbuf 139 | } 140 | } 141 | impl HeaderContext for SeqHeaderCtx { 142 | #[inline] 143 | fn sheader(&self) -> &[u8] { 144 | &self.sheader 145 | } 146 | #[inline] 147 | fn xheader(&self) -> &[u8] { 148 | &self.xheader 149 | } 150 | #[inline] 151 | fn sheader_mut(&mut self) -> &mut Vec { 152 | &mut self.sheader 153 | } 154 | #[inline] 155 | fn xheader_mut(&mut self) -> &mut Vec { 156 | &mut self.xheader 157 | } 158 | } 159 | impl Context for SeqHeaderCtx { 160 | #[inline] 161 | fn fill(&mut self, record: &R) -> Result<()> { 162 | self.fill_sequences(record)?; 163 | self.fill_headers(record); 164 | Ok(()) 165 | } 166 | } 167 | 168 | /// Context for sequence data and quality data 169 | /// 170 | /// Does not store header data. 
171 | #[derive(Clone, Default)] 172 | pub struct SeqQualCtx { 173 | sbuf: Vec, 174 | xbuf: Vec, 175 | squal: Vec, 176 | xqual: Vec, 177 | } 178 | impl SequenceContext for SeqQualCtx { 179 | #[inline] 180 | fn sbuf(&self) -> &[u8] { 181 | &self.sbuf 182 | } 183 | #[inline] 184 | fn xbuf(&self) -> &[u8] { 185 | &self.xbuf 186 | } 187 | #[inline] 188 | fn sbuf_mut(&mut self) -> &mut Vec { 189 | &mut self.sbuf 190 | } 191 | #[inline] 192 | fn xbuf_mut(&mut self) -> &mut Vec { 193 | &mut self.xbuf 194 | } 195 | } 196 | impl QualityContext for SeqQualCtx { 197 | #[inline] 198 | fn squal(&self) -> &[u8] { 199 | &self.squal 200 | } 201 | #[inline] 202 | fn xqual(&self) -> &[u8] { 203 | &self.xqual 204 | } 205 | #[inline] 206 | fn squal_mut(&mut self) -> &mut Vec { 207 | &mut self.squal 208 | } 209 | #[inline] 210 | fn xqual_mut(&mut self) -> &mut Vec { 211 | &mut self.xqual 212 | } 213 | } 214 | impl Context for SeqQualCtx { 215 | #[inline] 216 | fn fill(&mut self, record: &R) -> Result<()> { 217 | self.fill_sequences(record)?; 218 | self.fill_qualities(record)?; 219 | Ok(()) 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/vbq/mod.rs: -------------------------------------------------------------------------------- 1 | //! # VBINSEQ Format 2 | //! 3 | //! VBINSEQ is a high-performance binary format for variable-length nucleotide sequences 4 | //! that optimizes both storage efficiency and parallel processing capabilities. 5 | //! 6 | //! For more information on the format, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 7 | //! 8 | //! ## Overview 9 | //! 10 | //! VBINSEQ extends the core principles of BINSEQ to accommodate: 11 | //! 12 | //! * **Variable-length sequences**: Unlike BINSEQ which requires fixed-length reads, VBINSEQ can store 13 | //! sequences of any length, making it suitable for technologies like PacBio and Oxford Nanopore. 14 | //! 15 | //! 
* **Quality scores**: Optional storage of quality scores alongside nucleotide data when needed. 16 | //! 17 | //! * **Sequence headers**: Optional storage of sequence identifiers/headers with each record. 18 | //! 19 | //! * **Block-based organization**: Data is organized into fixed-size independent record blocks 20 | //! for efficient parallel processing. 21 | //! 22 | //! * **Compression**: Optional ZSTD compression of individual blocks balances storage 23 | //! efficiency with processing speed. 24 | //! 25 | //! * **Paired-end support**: Native support for paired sequences without needing multiple files. 26 | //! 27 | //! * **Multi-bit encoding**: Support for 2-bit and 4-bit nucleotide encodings. 28 | //! 29 | //! * **Embedded index**: Self-contained files with embedded index data for efficient random access. 30 | //! 31 | //! ## File Structure 32 | //! 33 | //! A VBINSEQ file consists of a 32-byte header followed by record blocks and an embedded index: 34 | //! 35 | //! ```text 36 | //! ┌───────────────────┐ 37 | //! │ File Header │ 32 bytes 38 | //! ├───────────────────┤ 39 | //! │ Block Header │ 32 bytes 40 | //! ├───────────────────┤ 41 | //! │ │ 42 | //! │ Block Records │ Variable size 43 | //! │ │ 44 | //! ├───────────────────┤ 45 | //! │ ... │ More blocks 46 | //! ├───────────────────┤ 47 | //! │ Compressed Index │ Variable size 48 | //! ├───────────────────┤ 49 | //! │ Index Size │ 8 bytes (u64) 50 | //! ├───────────────────┤ 51 | //! │ Index End Magic │ 8 bytes 52 | //! └───────────────────┘ 53 | //! ``` 54 | //! 55 | //! ## Record Format 56 | //! 57 | //! Each record contains the following fields in order: 58 | //! 59 | //! * Flag field (8 bytes) 60 | //! * Primary sequence length (8 bytes) 61 | //! * Extended sequence length (8 bytes, 0 if not paired) 62 | //! * Primary sequence data (2-bit or 4-bit encoded) 63 | //! * Extended sequence data (optional, for paired-end) 64 | //! * Primary quality scores (optional, if `qual` flag set) 65 | //! 
* Extended quality scores (optional, if paired and `qual` flag set) 66 | //! * Primary header length (8 bytes, if `headers` flag set) 67 | //! * Primary header data (UTF-8 string, if `headers` flag set) 68 | //! * Extended header length (8 bytes, if paired and `headers` flag set) 69 | //! * Extended header data (UTF-8 string, if paired and `headers` flag set) 70 | //! 71 | //! ## Recent Format Changes (v0.7.0+) 72 | //! 73 | //! * **Embedded Index**: Index data is now stored within the VBQ file itself, eliminating 74 | //! separate `.vqi` files and improving portability. 75 | //! * **Headers Support**: Optional sequence identifiers can be stored with each record. 76 | //! * **Extended Capacity**: u64 indexing supports files with more than 4 billion records. 77 | //! * **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. 78 | //! 79 | //! ## Performance Characteristics 80 | //! 81 | //! VBINSEQ is designed for high-throughput parallel processing: 82 | //! 83 | //! * Independent blocks enable true parallel processing without synchronization 84 | //! * Memory-mapped access provides efficient I/O 85 | //! * Embedded index enables fast random access without auxiliary files 86 | //! * Multi-bit encoding (2-bit/4-bit) optimizes storage for different use cases 87 | //! * Optional ZSTD compression reduces file size with minimal performance impact 88 | //! 89 | //! ## Usage Example 90 | //! 91 | //! ``` 92 | //! use std::fs::File; 93 | //! use std::io::BufWriter; 94 | //! use binseq::vbq::{VBinseqHeaderBuilder, VBinseqWriterBuilder, MmapReader}; 95 | //! use binseq::BinseqRecord; 96 | //! 97 | //! /* 98 | //! WRITING 99 | //! */ 100 | //! 101 | //! // Create a header for sequences with quality scores and headers 102 | //! let header = VBinseqHeaderBuilder::new() 103 | //! .qual(true) 104 | //! .compressed(true) 105 | //! .headers(true) 106 | //! .build(); 107 | //! 108 | //! // Create a writer 109 | //! 
let file = File::create("example.vbq").unwrap(); 110 | //! let mut writer = VBinseqWriterBuilder::default() 111 | //! .header(header) 112 | //! .build(BufWriter::new(file)) 113 | //! .unwrap(); 114 | //! 115 | //! // Write a sequence with quality scores and header 116 | //! let sequence = b"ACGTACGT"; 117 | //! let quality = b"IIIIFFFF"; 118 | //! let header_str = b"sequence_001"; 119 | //! writer.write_record(None, Some(header_str), sequence, Some(quality)).unwrap(); 120 | //! writer.finish().unwrap(); 121 | //! 122 | //! /* 123 | //! READING 124 | //! */ 125 | //! 126 | //! // Read the sequences back 127 | //! let mut reader = MmapReader::new("example.vbq").unwrap(); 128 | //! let mut block = reader.new_block(); 129 | //! 130 | //! // Process blocks one at a time 131 | //! let mut seq_buffer = Vec::new(); 132 | //! while reader.read_block_into(&mut block).unwrap() { 133 | //! for record in block.iter() { 134 | //! record.decode_s(&mut seq_buffer).unwrap(); 135 | //! let header = record.sheader(); 136 | //! println!("Header: {}", std::str::from_utf8(header).unwrap()); 137 | //! println!("Sequence: {}", std::str::from_utf8(&seq_buffer).unwrap()); 138 | //! println!("Quality: {}", std::str::from_utf8(record.squal()).unwrap()); 139 | //! seq_buffer.clear(); 140 | //! } 141 | //! } 142 | //! # std::fs::remove_file("example.vbq").unwrap_or(()); 143 | //! 
``` 144 | 145 | mod header; 146 | mod index; 147 | mod reader; 148 | mod writer; 149 | 150 | pub use header::{BlockHeader, VBinseqHeader, VBinseqHeaderBuilder}; 151 | pub use index::{BlockIndex, BlockRange}; 152 | pub use reader::{MmapReader, RecordBlock, RecordBlockIter, RefRecord}; 153 | pub use writer::{VBinseqWriter, VBinseqWriterBuilder}; 154 | -------------------------------------------------------------------------------- /examples/parallel_range.rs: -------------------------------------------------------------------------------- 1 | use binseq::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader, Result}; 2 | use std::sync::atomic::{AtomicUsize, Ordering}; 3 | use std::sync::Arc; 4 | 5 | #[derive(Clone)] 6 | struct RangeProcessor { 7 | counter: Arc, 8 | tid: Option, 9 | range_start: usize, 10 | range_end: usize, 11 | } 12 | 13 | impl RangeProcessor { 14 | fn new(range_start: usize, range_end: usize) -> Self { 15 | Self { 16 | counter: Arc::new(AtomicUsize::new(0)), 17 | tid: None, 18 | range_start, 19 | range_end, 20 | } 21 | } 22 | 23 | fn count(&self) -> usize { 24 | self.counter.load(Ordering::Relaxed) 25 | } 26 | } 27 | 28 | impl ParallelProcessor for RangeProcessor { 29 | fn process_record(&mut self, record: R) -> Result<()> { 30 | let count = self.counter.fetch_add(1, Ordering::Relaxed); 31 | 32 | // Print progress every 10,000 records 33 | if count % 10_000 == 0 { 34 | if let Some(tid) = self.tid { 35 | println!( 36 | "Thread {}: Processed {} records (Range: {}-{}, Index: {}, Len: {})", 37 | tid, 38 | count + 1, 39 | self.range_start, 40 | self.range_end, 41 | record.index(), 42 | record.sseq().len(), 43 | ); 44 | } 45 | } 46 | 47 | Ok(()) 48 | } 49 | 50 | fn set_tid(&mut self, tid: usize) { 51 | self.tid = Some(tid); 52 | } 53 | 54 | fn get_tid(&self) -> Option { 55 | self.tid 56 | } 57 | 58 | fn on_batch_complete(&mut self) -> Result<()> { 59 | if let Some(tid) = self.tid { 60 | println!("Thread {tid} completed batch processing"); 61 | } 
62 | Ok(()) 63 | } 64 | } 65 | 66 | fn main() -> Result<()> { 67 | let args: Vec = std::env::args().collect(); 68 | if args.len() < 2 { 69 | eprintln!( 70 | "Usage: {} [num_threads] [start] [end]", 71 | args[0] 72 | ); 73 | eprintln!("Example: {} data/subset.bq 4 1000 5000", args[0]); 74 | std::process::exit(1); 75 | } 76 | 77 | let file_path = &args[1]; 78 | let num_threads = args 79 | .get(2) 80 | .unwrap_or(&"4".to_string()) 81 | .parse::() 82 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))?; 83 | 84 | // Create reader to get total record count 85 | let reader = BinseqReader::new(file_path)?; 86 | let total_records = reader.num_records()?; 87 | 88 | println!("File: {file_path}"); 89 | println!("Total records in file: {total_records}"); 90 | 91 | // Parse range arguments or use defaults 92 | let start = args 93 | .get(3) 94 | .map(|s| s.parse::()) 95 | .transpose() 96 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))? 97 | .unwrap_or(0); 98 | let end = args 99 | .get(4) 100 | .map(|s| s.parse::()) 101 | .transpose() 102 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))? 
103 | .unwrap_or(total_records.min(10_000)); // Default to first 10k records 104 | 105 | // Validate range 106 | if start >= total_records { 107 | eprintln!("Error: Start index {start} is >= total records {total_records}"); 108 | std::process::exit(1); 109 | } 110 | if end > total_records { 111 | eprintln!( 112 | "Warning: End index {end} is > total records {total_records}, clamping to {total_records}" 113 | ); 114 | } 115 | let end = end.min(total_records); 116 | 117 | if start >= end { 118 | eprintln!("Error: Start index {start} must be < end index {end}"); 119 | std::process::exit(1); 120 | } 121 | 122 | println!( 123 | "Processing range: {} to {} ({} records)", 124 | start, 125 | end, 126 | end - start 127 | ); 128 | println!("Using {num_threads} threads"); 129 | println!(); 130 | 131 | // Demonstrate processing the full file 132 | println!("=== Processing full file ==="); 133 | let reader_full = BinseqReader::new(file_path)?; 134 | let processor_full = RangeProcessor::new(0, total_records); 135 | let start_time = std::time::Instant::now(); 136 | 137 | reader_full.process_parallel(processor_full.clone(), num_threads)?; 138 | 139 | let elapsed_full = start_time.elapsed(); 140 | println!("Full file processing completed!"); 141 | println!("Records processed: {}", processor_full.count()); 142 | println!("Time taken: {elapsed_full:.2?}"); 143 | println!(); 144 | 145 | // Demonstrate processing a specific range 146 | println!("=== Processing specific range ==="); 147 | let reader_range = BinseqReader::new(file_path)?; 148 | let processor_range = RangeProcessor::new(start, end); 149 | let start_time = std::time::Instant::now(); 150 | 151 | reader_range.process_parallel_range(processor_range.clone(), num_threads, start..end)?; 152 | 153 | let elapsed_range = start_time.elapsed(); 154 | println!("Range processing completed!"); 155 | println!("Records processed: {}", processor_range.count()); 156 | println!("Expected records: {}", end - start); 157 | println!("Time 
taken: {elapsed_range:.2?}"); 158 | 159 | // Compare performance 160 | if processor_range.count() > 0 && processor_full.count() > 0 { 161 | let full_rate = processor_full.count() as f64 / elapsed_full.as_secs_f64(); 162 | let range_rate = processor_range.count() as f64 / elapsed_range.as_secs_f64(); 163 | println!(); 164 | println!("=== Performance Comparison ==="); 165 | println!("Full file rate: {full_rate:.0} records/sec"); 166 | println!("Range rate: {range_rate:.0} records/sec"); 167 | 168 | if range_rate > full_rate { 169 | println!( 170 | "Range processing was {:.1}x faster per record", 171 | range_rate / full_rate 172 | ); 173 | } else { 174 | println!( 175 | "Full file processing was {:.1}x faster per record", 176 | full_rate / range_rate 177 | ); 178 | } 179 | } 180 | 181 | Ok(()) 182 | } 183 | -------------------------------------------------------------------------------- /examples/read_write.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::{BufReader, BufWriter}, 4 | }; 5 | 6 | use anyhow::{bail, Result}; 7 | use binseq::{ 8 | bq::{BinseqHeaderBuilder, BinseqWriterBuilder, MmapReader}, 9 | BinseqRecord, 10 | }; 11 | use seq_io::fastq::{Reader, Record}; 12 | 13 | fn read_write_single(fastq_path: &str, binseq_path: &str, seq_size: usize) -> Result<()> { 14 | // Open the input FASTQ file 15 | let (in_handle, _comp) = niffler::from_path(fastq_path)?; 16 | 17 | // Open the output file 18 | let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; 19 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 20 | let mut writer = BinseqWriterBuilder::default() 21 | .header(header) 22 | .build(out_handle)?; 23 | 24 | let mut all_sequences = Vec::new(); 25 | 26 | // Write the binary sequence 27 | let mut reader = Reader::new(in_handle); 28 | let mut num_records_write = 0; 29 | let mut skipped_records = 0; 30 | while let Some(record) = reader.next() { 31 | let record 
= record?;
        let seq = record.seq();
        if writer.write_record(Some(0), seq)? {
            num_records_write += 1;
            all_sequences.push(seq.to_vec());
        } else {
            // Record was skipped by the writer's invalid-nucleotide policy.
            skipped_records += 1;
        }
    }
    writer.flush()?;
    eprintln!("Finished writing {num_records_write} records to path: {binseq_path}");
    eprintln!("Skipped {skipped_records} records");

    // Read the binary sequence
    let reader = MmapReader::new(binseq_path)?;
    let mut num_records_read = 0;
    let mut sbuf = Vec::new();
    for idx in 0..reader.num_records() {
        let record = reader.get(idx)?;
        record.decode_s(&mut sbuf)?;

        // Check if the decoded sequence matches the original
        let buf_str = std::str::from_utf8(&sbuf)?;
        let seq_str = std::str::from_utf8(&all_sequences[num_records_read])?;
        assert_eq!(buf_str, seq_str);

        num_records_read += 1;
        sbuf.clear();
    }
    eprintln!("Finished reading {num_records_read} records (mmap)");
    eprintln!(
        "Difference in total records: {}",
        num_records_write - num_records_read
    );
    eprintln!("Number of records in vec: {}", all_sequences.len());

    Ok(())
}

/// Round-trip a paired-end FASTQ pair through the BQ format, verifying that
/// both mates of every decoded record match the original inputs.
fn read_write_paired(
    fastq_path_r1: &str,
    fastq_path_r2: &str,
    binseq_path: &str,
    seq_size_r1: usize,
    seq_size_r2: usize,
) -> Result<()> {
    // Open the input FASTQ files

    let in_buf_r1 = File::open(fastq_path_r1).map(BufReader::new)?;
    let in_buf_r2 = File::open(fastq_path_r2).map(BufReader::new)?;

    let (in_handle_r1, _comp) = niffler::get_reader(Box::new(in_buf_r1))?;
    let (in_handle_r2, _comp) = niffler::get_reader(Box::new(in_buf_r2))?;

    // Create the header
    let header = BinseqHeaderBuilder::new()
        .slen(seq_size_r1 as u32)
        .xlen(seq_size_r2 as u32)
        .build()?;

    // Open the output handle
    let out_handle = File::create(binseq_path).map(BufWriter::new)?;

    // Create the writer
    let mut writer = BinseqWriterBuilder::default()
        .header(header)
        .build(out_handle)?;

    // Open the FASTQ readers
    let mut reader_r1 = Reader::new(in_handle_r1);
    let mut reader_r2 = Reader::new(in_handle_r2);

    // Write the binary sequence
    let mut num_records = 0;
    let mut num_skipped = 0;

    let mut r1_storage = Vec::new();
    let mut r2_storage = Vec::new();

    loop {
        // Both readers must advance in lock-step; a one-sided EOF is an error.
        let (record_r1, record_r2) = match (reader_r1.next(), reader_r2.next()) {
            (Some(r1), Some(r2)) => (r1?, r2?),
            (None, None) => break,
            _ => bail!("Mismatched number of records in R1 and R2"),
        };

        let seq_r1 = record_r1.seq();
        let seq_r2 = record_r2.seq();

        if writer.write_paired_record(Some(0), seq_r1, seq_r2)? {
            num_records += 1;
            r1_storage.push(seq_r1.to_vec());
            r2_storage.push(seq_r2.to_vec());
        } else {
            num_skipped += 1;
        }
    }
    writer.flush()?;
    eprintln!("Finished writing {num_records} records");
    eprintln!("Skipped {num_skipped} records");

    // Read the binary sequence with mmap
    let reader = MmapReader::new(binseq_path)?;

    let mut n_processed = 0;
    let mut sbuf = Vec::new();
    let mut xbuf = Vec::new();

    for idx in 0..reader.num_records() {
        let record = reader.get(idx)?;

        record.decode_s(&mut sbuf)?;
        record.decode_x(&mut xbuf)?;

        // Check if the decoded sequence matches the original
        let s_str = std::str::from_utf8(&sbuf)?;
        let x_str = std::str::from_utf8(&xbuf)?;

        let s_exp = std::str::from_utf8(&r1_storage[n_processed])?;
        let x_exp = std::str::from_utf8(&r2_storage[n_processed])?;

        assert_eq!(s_str, s_exp);
        assert_eq!(x_str, x_exp);

        n_processed += 1;
        sbuf.clear();
        xbuf.clear();
    }
    eprintln!("Finished reading {n_processed} records");

    Ok(())
}

fn main() -> Result<()> {
    // INPUT ARGUMENTS
    let fastq_path_r1 = "./data/subset_R1.fastq.gz"; // exists
    let fastq_path_r2 = "./data/subset_R2.fastq.gz"; // exists
    let binseq_path_r1 = "./data/subset_R1.bq"; // created
    let binseq_path_r2 = "./data/subset_R2.bq"; // created
    let binseq_path = "./data/subset.bq"; // created
    let seq_size_r1 = 28; // a priori known
    let seq_size_r2 = 90; // a priori known

    read_write_single(fastq_path_r1, binseq_path_r1, seq_size_r1)?;
    read_write_single(fastq_path_r2, binseq_path_r2, seq_size_r2)?;
    read_write_paired(
        fastq_path_r1,
        fastq_path_r2,
        binseq_path,
        seq_size_r1,
        seq_size_r2,
    )?;

    Ok(())
}
-------------------------------------------------------------------------------- /src/parallel.rs: --------------------------------------------------------------------------------
use std::ops::Range;
use std::path::Path;

use crate::{bq, error::ExtensionError, vbq, BinseqRecord, Result};

/// An enum abstraction for BINSEQ readers that can process records in parallel
///
/// This is a convenience enum that can be used for general workflows where the
/// distinction between BQ and VBQ readers is not important.
///
/// For more specialized workflows see [`bq::MmapReader`] and [`vbq::MmapReader`].
12 | pub enum BinseqReader { 13 | Bq(bq::MmapReader), 14 | Vbq(vbq::MmapReader), 15 | } 16 | impl BinseqReader { 17 | pub fn new(path: &str) -> Result { 18 | let pathbuf = Path::new(path); 19 | match pathbuf.extension() { 20 | Some(ext) => match ext.to_str() { 21 | Some("bq") => Ok(Self::Bq(bq::MmapReader::new(path)?)), 22 | Some("vbq") => Ok(Self::Vbq(vbq::MmapReader::new(path)?)), 23 | _ => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), 24 | }, 25 | None => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), 26 | } 27 | } 28 | 29 | /// Set whether to decode sequences at once in each block 30 | /// 31 | /// Note: This setting applies to VBQ readers only. 32 | pub fn set_decode_block(&mut self, decode_block: bool) { 33 | match self { 34 | Self::Bq(_) => { 35 | // no-op 36 | } 37 | Self::Vbq(reader) => reader.set_decode_block(decode_block), 38 | } 39 | } 40 | 41 | #[must_use] 42 | pub fn is_paired(&self) -> bool { 43 | match self { 44 | Self::Bq(reader) => reader.is_paired(), 45 | Self::Vbq(reader) => reader.is_paired(), 46 | } 47 | } 48 | 49 | pub fn num_records(&self) -> Result { 50 | match self { 51 | Self::Bq(reader) => Ok(reader.num_records()), 52 | Self::Vbq(reader) => reader.num_records(), 53 | } 54 | } 55 | 56 | /// Process records in parallel within a specified range 57 | /// 58 | /// This method allows parallel processing of a subset of records within the file, 59 | /// defined by a start and end index. The range is distributed across the specified 60 | /// number of threads. 
61 | /// 62 | /// # Arguments 63 | /// 64 | /// * `processor` - The processor to use for each record 65 | /// * `num_threads` - The number of threads to spawn 66 | /// * `start` - The starting record index (inclusive) 67 | /// * `end` - The ending record index (exclusive) 68 | /// 69 | /// # Returns 70 | /// 71 | /// * `Ok(())` - If all records were processed successfully 72 | /// * `Err(Error)` - If an error occurred during processing 73 | pub fn process_parallel_range( 74 | self, 75 | processor: P, 76 | num_threads: usize, 77 | range: Range, 78 | ) -> Result<()> { 79 | match self { 80 | Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, range), 81 | Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), 82 | } 83 | } 84 | } 85 | impl ParallelReader for BinseqReader { 86 | fn process_parallel( 87 | self, 88 | processor: P, 89 | num_threads: usize, 90 | ) -> Result<()> { 91 | let num_records = self.num_records()?; 92 | self.process_parallel_range(processor, num_threads, 0..num_records) 93 | } 94 | 95 | fn process_parallel_range( 96 | self, 97 | processor: P, 98 | num_threads: usize, 99 | range: Range, 100 | ) -> Result<()> { 101 | match self { 102 | Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, range), 103 | Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), 104 | } 105 | } 106 | } 107 | 108 | /// Trait for BINSEQ readers that can process records in parallel 109 | /// 110 | /// This is implemented by the **reader** not by the **processor**. 111 | /// For the **processor**, see the [`ParallelProcessor`] trait. 112 | pub trait ParallelReader { 113 | fn process_parallel( 114 | self, 115 | processor: P, 116 | num_threads: usize, 117 | ) -> Result<()>; 118 | 119 | /// Process records in parallel within a specified range 120 | /// 121 | /// This method allows parallel processing of a subset of records within the file, 122 | /// defined by a start and end index. 
The range is distributed across the specified 123 | /// number of threads. 124 | /// 125 | /// # Arguments 126 | /// 127 | /// * `processor` - The processor to use for each record 128 | /// * `num_threads` - The number of threads to spawn 129 | /// * `range` - The range of record indices to process 130 | /// 131 | /// # Returns 132 | /// 133 | /// * `Ok(())` - If all records were processed successfully 134 | /// * `Err(Error)` - If an error occurred during processing 135 | fn process_parallel_range( 136 | self, 137 | processor: P, 138 | num_threads: usize, 139 | range: Range, 140 | ) -> Result<()>; 141 | } 142 | 143 | /// Trait for types that can process records in parallel. 144 | /// 145 | /// This is implemented by the **processor** not by the **reader**. 146 | /// For the **reader**, see the [`ParallelReader`] trait. 147 | pub trait ParallelProcessor: Send + Clone { 148 | /// Process a single record 149 | fn process_record(&mut self, record: R) -> Result<()>; 150 | 151 | /// Called when a thread finishes processing its batch 152 | /// Default implementation does nothing 153 | #[allow(unused_variables)] 154 | fn on_batch_complete(&mut self) -> Result<()> { 155 | Ok(()) 156 | } 157 | 158 | /// Set the thread ID for this processor 159 | /// 160 | /// Each thread should call this method with its own unique ID. 161 | #[allow(unused_variables)] 162 | fn set_tid(&mut self, _tid: usize) { 163 | // Default implementation does nothing 164 | } 165 | 166 | /// Get the thread ID for this processor 167 | fn get_tid(&self) -> Option { 168 | None 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/policy.rs: -------------------------------------------------------------------------------- 1 | //! Nucleotide sequence validation and correction policies 2 | //! 3 | //! This module provides policies for handling invalid nucleotides in sequences 4 | //! during encoding operations. 
Different policies allow for ignoring, rejecting, 5 | //! or correcting sequences with invalid nucleotides. 6 | 7 | use rand::Rng; 8 | 9 | use crate::error::{Result, WriteError}; 10 | 11 | /// A global seed for the random number generator used in randomized policies 12 | /// 13 | /// This seed ensures reproducible behavior when using the `RandomDraw` policy 14 | /// across different runs of the program. 15 | pub const RNG_SEED: u64 = 42; 16 | 17 | /// Policy for handling invalid nucleotide sequences during encoding 18 | /// 19 | /// When encoding sequences into binary format, non-standard nucleotides (anything 20 | /// other than A, C, G, or T) may be encountered. This enum defines different 21 | /// strategies for handling such invalid nucleotides. 22 | /// 23 | /// The default policy is `IgnoreSequence`, which skips sequences containing 24 | /// invalid nucleotides. 25 | #[derive(Debug, Clone, Copy, Default)] 26 | pub enum Policy { 27 | /// Skip sequences containing invalid nucleotides (default policy) 28 | #[default] 29 | IgnoreSequence, 30 | 31 | /// Fail with an error when invalid nucleotides are encountered 32 | BreakOnInvalid, 33 | 34 | /// Replace invalid nucleotides with randomly chosen valid nucleotides (A, C, G, or T) 35 | RandomDraw, 36 | 37 | /// Replace all invalid nucleotides with 'A' 38 | SetToA, 39 | 40 | /// Replace all invalid nucleotides with 'C' 41 | SetToC, 42 | 43 | /// Replace all invalid nucleotides with 'G' 44 | SetToG, 45 | 46 | /// Replace all invalid nucleotides with 'T' 47 | SetToT, 48 | } 49 | impl Policy { 50 | /// Helper method to replace invalid nucleotides with a specific nucleotide 51 | /// 52 | /// This internal method processes a sequence and replaces any non-standard 53 | /// nucleotides (anything other than A, C, G, or T) with the specified value. 
54 | /// 55 | /// # Arguments 56 | /// 57 | /// * `sequence` - The input sequence to process 58 | /// * `val` - The replacement nucleotide (should be one of A, C, G, or T) 59 | /// * `ibuf` - The output buffer to store the processed sequence 60 | fn fill_with_known(sequence: &[u8], val: u8, ibuf: &mut Vec) { 61 | for &n in sequence { 62 | ibuf.push(match n { 63 | b'A' | b'C' | b'G' | b'T' => n, 64 | _ => val, 65 | }); 66 | } 67 | } 68 | 69 | /// Helper method to replace invalid nucleotides with random valid nucleotides 70 | /// 71 | /// This internal method processes a sequence and replaces any non-standard 72 | /// nucleotides with randomly chosen valid nucleotides (A, C, G, or T). 73 | /// 74 | /// # Arguments 75 | /// 76 | /// * `sequence` - The input sequence to process 77 | /// * `rng` - The random number generator to use for selecting replacement nucleotides 78 | /// * `ibuf` - The output buffer to store the processed sequence 79 | /// 80 | /// # Type Parameters 81 | /// 82 | /// * `R` - A type that implements the `Rng` trait from the `rand` crate 83 | fn fill_with_random(sequence: &[u8], rng: &mut R, ibuf: &mut Vec) { 84 | for &n in sequence { 85 | ibuf.push(match n { 86 | b'A' | b'C' | b'G' | b'T' => n, 87 | _ => match rng.random_range(0..4) { 88 | 0 => b'A', 89 | 1 => b'C', 90 | 2 => b'G', 91 | 3 => b'T', 92 | _ => unreachable!(), 93 | }, 94 | }); 95 | } 96 | } 97 | 98 | /// Process a sequence according to the selected policy for handling invalid nucleotides 99 | /// 100 | /// This method applies the policy to the given sequence, handling any invalid nucleotides 101 | /// according to the policy's rules. It first clears the input buffer to ensure that it is empty, 102 | /// then processes the sequence accordingly. 
103 | /// 104 | /// # Arguments 105 | /// 106 | /// * `sequence` - The nucleotide sequence to be processed 107 | /// * `ibuf` - The buffer to store the processed sequence (will be cleared first) 108 | /// * `rng` - The random number generator (used only with the `RandomDraw` policy) 109 | /// 110 | /// # Returns 111 | /// 112 | /// * `Ok(true)` - If the sequence was processed and should be encoded 113 | /// * `Ok(false)` - If the sequence should be skipped (for `IgnoreSequence` policy) 114 | /// * `Err(Error)` - If an error occurred (for `BreakOnInvalid` policy when invalid nucleotides are found) 115 | /// 116 | /// # Type Parameters 117 | /// 118 | /// * `R` - A type that implements the `Rng` trait from the `rand` crate 119 | /// 120 | /// # Examples 121 | /// 122 | /// ``` 123 | /// # use binseq::{Policy, Result}; 124 | /// # use rand::thread_rng; 125 | /// # fn main() -> Result<()> { 126 | /// let policy = Policy::SetToA; 127 | /// let sequence = b"ACGTNX"; 128 | /// let mut output = Vec::new(); 129 | /// let mut rng = thread_rng(); 130 | /// 131 | /// let should_process = policy.handle(sequence, &mut output, &mut rng)?; 132 | /// 133 | /// assert!(should_process); 134 | /// assert_eq!(output, b"ACGTAA"); 135 | /// # Ok(()) 136 | /// # } 137 | /// ``` 138 | pub fn handle(&self, sequence: &[u8], ibuf: &mut Vec, rng: &mut R) -> Result { 139 | // First clears the input buffer to ensure that it is empty. 140 | ibuf.clear(); 141 | 142 | // Returns a boolean indicating whether the sequence should be processed further. 
143 | match self { 144 | Self::IgnoreSequence => Ok(false), 145 | Self::BreakOnInvalid => { 146 | let seq_str = std::str::from_utf8(sequence)?.to_string(); 147 | Err(WriteError::InvalidNucleotideSequence(seq_str).into()) 148 | } 149 | Self::RandomDraw => { 150 | Self::fill_with_random(sequence, rng, ibuf); 151 | Ok(true) 152 | } 153 | Self::SetToA => { 154 | Self::fill_with_known(sequence, b'A', ibuf); 155 | Ok(true) 156 | } 157 | Self::SetToC => { 158 | Self::fill_with_known(sequence, b'C', ibuf); 159 | Ok(true) 160 | } 161 | Self::SetToG => { 162 | Self::fill_with_known(sequence, b'G', ibuf); 163 | Ok(true) 164 | } 165 | Self::SetToT => { 166 | Self::fill_with_known(sequence, b'T', ibuf); 167 | Ok(true) 168 | } 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/bq/mod.rs: -------------------------------------------------------------------------------- 1 | //! # bq 2 | //! 3 | //! *.bq files are BINSEQ variants for **fixed-length** records and **does not support quality scores**. 4 | //! 5 | //! For variable-length records and optional quality scores use the [`vbq`](crate::vbq) module. 6 | //! 7 | //! This module contains the utilities for reading, writing, and interacting with BINSEQ files. 8 | //! 9 | //! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 10 | //! 11 | //! ## Usage 12 | //! 13 | //! ### Reading 14 | //! ```rust 15 | //! use binseq::{bq, BinseqRecord}; 16 | //! use rand::{thread_rng, Rng}; 17 | //! 18 | //! let path = "./data/subset.bq"; 19 | //! let reader = bq::MmapReader::new(path).unwrap(); 20 | //! 21 | //! // We can easily determine the number of records in the file 22 | //! let num_records = reader.num_records(); 23 | //! 24 | //! // We have random access to any record within the range 25 | //! let random_index = thread_rng().gen_range(0..num_records); 26 | //! 
let record = reader.get(random_index).unwrap(); 27 | //! 28 | //! // We can easily decode the (2bit)encoded sequence back to a sequence of bytes 29 | //! let mut sbuf = Vec::new(); 30 | //! let mut xbuf = Vec::new(); 31 | //! 32 | //! record.decode_s(&mut sbuf); 33 | //! if record.is_paired() { 34 | //! record.decode_x(&mut xbuf); 35 | //! } 36 | //! ``` 37 | //! 38 | //! ### Writing 39 | //! 40 | //! #### Writing unpaired sequences 41 | //! 42 | //! ```rust 43 | //! use binseq::bq; 44 | //! use std::fs::File; 45 | //! 46 | //! // Define a path for the output file 47 | //! let path = "./data/some_output.bq"; 48 | //! 49 | //! // Create the file handle 50 | //! let output_handle = File::create(path).unwrap(); 51 | //! 52 | //! // Initialize our BINSEQ header (64 bp, only primary) 53 | //! let header = bq::BinseqHeaderBuilder::new().slen(64).build().unwrap(); 54 | //! 55 | //! // Initialize our BINSEQ writer 56 | //! let mut writer = bq::BinseqWriterBuilder::default() 57 | //! .header(header) 58 | //! .build(output_handle) 59 | //! .unwrap(); 60 | //! 61 | //! // Generate a random sequence 62 | //! let seq = [b'A'; 64]; 63 | //! let flag = 0; 64 | //! 65 | //! // Write the sequence to the file 66 | //! writer.write_record(Some(flag), &seq).unwrap(); 67 | //! 68 | //! // Close the file 69 | //! writer.flush().unwrap(); 70 | //! 71 | //! // Remove the file created 72 | //! std::fs::remove_file(path).unwrap(); 73 | //! ``` 74 | //! 75 | //! #### Writing paired sequences 76 | //! 77 | //! ```rust 78 | //! use binseq::bq; 79 | //! use std::fs::File; 80 | //! 81 | //! // Define a path for the output file 82 | //! let path = "./data/some_output.bq"; 83 | //! 84 | //! // Create the file handle 85 | //! let output_handle = File::create(path).unwrap(); 86 | //! 87 | //! // Initialize our BINSEQ header (64 bp and 128bp) 88 | //! let header = bq::BinseqHeaderBuilder::new().slen(64).xlen(128).build().unwrap(); 89 | //! 90 | //! // Initialize our BINSEQ writer 91 | //! 
let mut writer = bq::BinseqWriterBuilder::default() 92 | //! .header(header) 93 | //! .build(output_handle) 94 | //! .unwrap(); 95 | //! 96 | //! // Generate a random sequence 97 | //! let primary = [b'A'; 64]; 98 | //! let secondary = [b'C'; 128]; 99 | //! let flag = 0; 100 | //! 101 | //! // Write the sequence to the file 102 | //! writer.write_paired_record(Some(flag), &primary, &secondary).unwrap(); 103 | //! 104 | //! // Close the file 105 | //! writer.flush().unwrap(); 106 | //! 107 | //! // Remove the file created 108 | //! std::fs::remove_file(path).unwrap(); 109 | //! ``` 110 | //! 111 | //! # Example: Streaming Access 112 | //! 113 | //! ``` 114 | //! use binseq::{Policy, Result, BinseqRecord}; 115 | //! use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 116 | //! use std::io::{BufReader, Cursor}; 117 | //! 118 | //! fn main() -> Result<()> { 119 | //! // Create a header for sequences of length 100 120 | //! let header = BinseqHeaderBuilder::new().slen(100).build()?; 121 | //! 122 | //! // Create a stream writer 123 | //! let mut writer = StreamWriterBuilder::default() 124 | //! .header(header) 125 | //! .buffer_capacity(8192) 126 | //! .build(Cursor::new(Vec::new()))?; 127 | //! 128 | //! // Write sequences 129 | //! let sequence = b"ACGT".repeat(25); // 100 nucleotides 130 | //! writer.write_record(Some(0), &sequence)?; 131 | //! 132 | //! // Get the inner buffer 133 | //! let buffer = writer.into_inner()?; 134 | //! let data = buffer.into_inner(); 135 | //! 136 | //! // Create a stream reader 137 | //! let mut reader = StreamReader::new(BufReader::new(Cursor::new(data))); 138 | //! 139 | //! // Process records as they arrive 140 | //! while let Some(record) = reader.next_record() { 141 | //! // Process each record 142 | //! let record = record?; 143 | //! let flag = record.flag(); 144 | //! } 145 | //! 146 | //! Ok(()) 147 | //! } 148 | //! ``` 149 | //! 150 | //! ## BQ file format 151 | //! 152 | //! 
A BINSEQ file consists of two sections: 153 | //! 154 | //! 1. Fixed-size header (32 bytes) 155 | //! 2. Record data section 156 | //! 157 | //! ### Header Format (32 bytes total) 158 | //! 159 | //! | Offset | Size (bytes) | Name | Description | Type | 160 | //! | ------ | ------------ | -------- | ---------------------------- | ------ | 161 | //! | 0 | 4 | magic | Magic number (0x42534551) | uint32 | 162 | //! | 4 | 1 | format | Format version (currently 2) | uint8 | 163 | //! | 5 | 4 | slen | Sequence length (primary) | uint32 | 164 | //! | 9 | 4 | xlen | Sequence length (secondary) | uint32 | 165 | //! | 13 | 19 | reserved | Reserved for future use | bytes | 166 | //! 167 | //! ### Record Format 168 | //! 169 | //! Each record consists of a: 170 | //! 171 | //! 1. Flag field (8 bytes, uint64) 172 | //! 2. Sequence data (ceil(N/32) \* 8 bytes, where N is sequence length) 173 | //! 174 | //! The flag field is implementation-defined and can be used for filtering, metadata, or other purposes. The placement of the flag field at the start of each record enables efficient filtering without reading sequence data. 175 | //! 176 | //! Total record size = 8 + (ceil(N/32) \* 8) bytes, where N is sequence length 177 | //! 178 | //! ## Encoding 179 | //! 180 | //! - Each nucleotide is encoded using 2 bits: 181 | //! - A = 00 182 | //! - C = 01 183 | //! - G = 10 184 | //! - T = 11 185 | //! - Non-ATCG characters are **unsupported**. 186 | //! - Sequences are stored in Little-Endian order 187 | //! - The final u64 of sequence data is padded with zeros if the sequence length is not divisible by 32 188 | //! 189 | //! See [`bitnuc`] for 2bit implementation details. 190 | //! 191 | //! ## bq implementation Notes 192 | //! 193 | //! - Sequences are stored in u64 chunks, each holding up to 32 bases 194 | //! - Random access to any record can be calculated as: 195 | //! - record_size = 8 + (ceil(sequence_length/32) \* 8) 196 | //! 
- record_start = 16 + (record_index \* record_size) 197 | //! - Total number of records can be calculated as: (file_size - 16) / record_size 198 | //! - Flag field placement allows for efficient filtering strategies: 199 | //! - Records can be skipped based on flag values without reading sequence data 200 | //! - Flag checks can be vectorized for parallel processing 201 | //! - Memory access patterns are predictable for better cache utilization 202 | //! 203 | //! ## Example Storage Requirements 204 | //! 205 | //! Common sequence lengths: 206 | //! 207 | //! - 32bp reads: 208 | //! - Sequence: 1 \* 8 = 8 bytes (fits in one u64) 209 | //! - Flag: 8 bytes 210 | //! - Total per record: 16 bytes 211 | //! - 100bp reads: 212 | //! - Sequence: 4 \* 8 = 32 bytes (requires four u64s) 213 | //! - Flag: 8 bytes 214 | //! - Total per record: 40 bytes 215 | //! - 150bp reads: 216 | //! - Sequence: 5 \* 8 = 40 bytes (requires five u64s) 217 | //! - Flag: 8 bytes 218 | //! - Total per record: 48 bytes 219 | //! 220 | //! ## Validation 221 | //! 222 | //! Implementations should verify: 223 | //! 224 | //! 1. Correct magic number 225 | //! 2. Compatible version number 226 | //! 3. Sequence length is greater than 0 227 | //! 4. File size minus header (32 bytes) is divisible by the record size 228 | //! 229 | //! ## Future Considerations 230 | //! 231 | //! - The 19 reserved bytes in the header allow for future format extensions 232 | //! - The 64-bit flag field provides space for implementation-specific features such as: 233 | //! - Quality score summaries 234 | //! - Filtering flags 235 | //! - Read group identifiers 236 | //! - Processing state 237 | //! 
- Count data 238 | 239 | mod header; 240 | mod reader; 241 | mod writer; 242 | 243 | pub use header::{BinseqHeader, BinseqHeaderBuilder, SIZE_HEADER}; 244 | pub use reader::{MmapReader, RefRecord, StreamReader}; 245 | pub use writer::{BinseqWriter, BinseqWriterBuilder, Encoder, StreamWriter, StreamWriterBuilder}; 246 | -------------------------------------------------------------------------------- /src/bq/header.rs: -------------------------------------------------------------------------------- 1 | //! Header module for the binseq library 2 | //! 3 | //! This module provides the header structure and functionality for binary sequence files. 4 | //! The header contains metadata about the binary sequence data, including format version, 5 | //! sequence length, and other information necessary for proper interpretation of the data. 6 | 7 | use bitnuc::BitSize; 8 | use byteorder::{ByteOrder, LittleEndian}; 9 | use std::io::{Read, Write}; 10 | 11 | use crate::error::{BuilderError, HeaderError, Result}; 12 | 13 | /// Current magic number: "BSEQ" in ASCII (in little-endian byte order) 14 | /// 15 | /// This is used to identify binary sequence files and verify file integrity. 16 | #[allow(clippy::unreadable_literal)] 17 | const MAGIC: u32 = 0x51455342; 18 | 19 | /// Current format version of the binary sequence file format 20 | /// 21 | /// This version number allows for future format changes while maintaining backward compatibility. 22 | const FORMAT: u8 = 1; 23 | 24 | /// Size of the header in bytes 25 | /// 26 | /// The header has a fixed size to ensure consistent reading and writing of binary sequence files. 27 | pub const SIZE_HEADER: usize = 32; 28 | 29 | /// Reserved bytes in the header 30 | /// 31 | /// These bytes are reserved for future use and should be set to a consistent value. 
32 | pub const RESERVED: [u8; 17] = [42; 17]; 33 | 34 | #[derive(Debug, Clone, Copy)] 35 | pub struct BinseqHeaderBuilder { 36 | slen: Option, 37 | xlen: Option, 38 | bitsize: Option, 39 | flags: Option, 40 | } 41 | impl Default for BinseqHeaderBuilder { 42 | fn default() -> Self { 43 | Self::new() 44 | } 45 | } 46 | 47 | impl BinseqHeaderBuilder { 48 | #[must_use] 49 | pub fn new() -> Self { 50 | BinseqHeaderBuilder { 51 | slen: None, 52 | xlen: None, 53 | bitsize: None, 54 | flags: None, 55 | } 56 | } 57 | #[must_use] 58 | pub fn slen(mut self, slen: u32) -> Self { 59 | self.slen = Some(slen); 60 | self 61 | } 62 | #[must_use] 63 | pub fn xlen(mut self, xlen: u32) -> Self { 64 | self.xlen = Some(xlen); 65 | self 66 | } 67 | #[must_use] 68 | pub fn bitsize(mut self, bitsize: BitSize) -> Self { 69 | self.bitsize = Some(bitsize); 70 | self 71 | } 72 | #[must_use] 73 | pub fn flags(mut self, flags: bool) -> Self { 74 | self.flags = Some(flags); 75 | self 76 | } 77 | pub fn build(self) -> Result { 78 | Ok(BinseqHeader { 79 | magic: MAGIC, 80 | format: FORMAT, 81 | slen: if let Some(slen) = self.slen { 82 | slen 83 | } else { 84 | return Err(BuilderError::MissingSlen.into()); 85 | }, 86 | xlen: self.xlen.unwrap_or(0), 87 | bits: self.bitsize.unwrap_or_default(), 88 | flags: self.flags.unwrap_or(false), 89 | reserved: RESERVED, 90 | }) 91 | } 92 | } 93 | 94 | /// Header structure for binary sequence files 95 | /// 96 | /// The `BinseqHeader` contains metadata about the binary sequence data stored in a file, 97 | /// including format information, sequence lengths, and space for future extensions. 98 | /// 99 | /// The total size of this structure is 32 bytes, with a fixed layout to ensure 100 | /// consistent reading and writing across different platforms. 
101 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 102 | pub struct BinseqHeader { 103 | /// Magic number to identify the file format 104 | /// 105 | /// 4 bytes 106 | pub magic: u32, 107 | 108 | /// Version of the file format 109 | /// 110 | /// 1 byte 111 | pub format: u8, 112 | 113 | /// Length of all sequences in the file 114 | /// 115 | /// 4 bytes 116 | pub slen: u32, 117 | 118 | /// Length of secondary sequences in the file 119 | /// 120 | /// 4 bytes 121 | pub xlen: u32, 122 | 123 | /// Number of bits per nucleotide (currently 2 or 4) 124 | /// 125 | /// 1 byte 126 | pub bits: BitSize, 127 | 128 | /// All records have a flag attribute 129 | /// 130 | /// 1 byte 131 | pub flags: bool, 132 | 133 | /// Reserve remaining bytes for future use 134 | /// 135 | /// 17 bytes 136 | pub reserved: [u8; 17], 137 | } 138 | impl BinseqHeader { 139 | /// Creates a new header with the specified sequence length 140 | /// 141 | /// This constructor initializes a standard header with the given sequence length, 142 | /// setting the magic number and format version to their default values. 143 | /// The extended sequence length (xlen) is set to 0. 144 | /// 145 | /// # Arguments 146 | /// 147 | /// * `bits` - The number of bits per nucleotide (currently 2 or 4) 148 | /// * `slen` - The length of sequences in the file 149 | /// * `flags` - The flags for the header 150 | /// 151 | /// # Returns 152 | /// 153 | /// A new `BinseqHeader` instance 154 | #[must_use] 155 | pub fn new(bits: BitSize, slen: u32, flags: bool) -> Self { 156 | Self { 157 | magic: MAGIC, 158 | format: FORMAT, 159 | slen, 160 | xlen: 0, 161 | bits, 162 | flags, 163 | reserved: RESERVED, 164 | } 165 | } 166 | 167 | /// Creates a new header with both primary and extended sequence lengths 168 | /// 169 | /// This constructor initializes a header for files that contain both primary 170 | /// and secondary sequence data, such as quality scores or annotations. 
171 | /// 172 | /// # Arguments 173 | /// 174 | /// * `bits` - The number of bits per nucleotide (currently 2 or 4) 175 | /// * `slen` - The length of primary sequences in the file 176 | /// * `xlen` - The length of secondary/extended sequences in the file 177 | /// * `flags` - The flags for the header 178 | /// 179 | /// # Returns 180 | /// 181 | /// A new `BinseqHeader` instance with extended sequence information 182 | #[must_use] 183 | pub fn new_extended(bits: BitSize, slen: u32, xlen: u32, flags: bool) -> Self { 184 | Self { 185 | magic: MAGIC, 186 | format: FORMAT, 187 | slen, 188 | xlen, 189 | bits, 190 | flags, 191 | reserved: RESERVED, 192 | } 193 | } 194 | 195 | /// Sets the bitsize of the header 196 | pub fn set_bitsize(&mut self, bits: BitSize) { 197 | self.bits = bits; 198 | } 199 | 200 | /// Checks if the file is paired 201 | #[must_use] 202 | pub fn is_paired(&self) -> bool { 203 | self.xlen > 0 204 | } 205 | 206 | /// Parses a header from a fixed-size byte array 207 | /// 208 | /// This method validates the magic number and format version before constructing 209 | /// a header instance. If validation fails, appropriate errors are returned. 
210 | /// 211 | /// # Arguments 212 | /// 213 | /// * `buffer` - A byte array of exactly `SIZE_HEADER` bytes containing the header data 214 | /// 215 | /// # Returns 216 | /// 217 | /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer 218 | /// * `Err(Error)` - If the buffer contains invalid header data 219 | /// 220 | /// # Errors 221 | /// 222 | /// Returns an error if: 223 | /// * The magic number is incorrect 224 | /// * The format version is unsupported 225 | /// * The reserved bytes are invalid 226 | pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result { 227 | let magic = LittleEndian::read_u32(&buffer[0..4]); 228 | if magic != MAGIC { 229 | return Err(HeaderError::InvalidMagicNumber(magic).into()); 230 | } 231 | let format = buffer[4]; 232 | if format != FORMAT { 233 | return Err(HeaderError::InvalidFormatVersion(format).into()); 234 | } 235 | let slen = LittleEndian::read_u32(&buffer[5..9]); 236 | let xlen = LittleEndian::read_u32(&buffer[9..13]); 237 | let bits = match buffer[13] { 238 | 0 | 2 | 42 => BitSize::Two, 239 | 4 => BitSize::Four, 240 | x => return Err(HeaderError::InvalidBitSize(x).into()), 241 | }; 242 | let flags = buffer[14] != 0; 243 | let Ok(reserved) = buffer[15..32].try_into() else { 244 | return Err(HeaderError::InvalidReservedBytes.into()); 245 | }; 246 | Ok(Self { 247 | magic, 248 | format, 249 | slen, 250 | xlen, 251 | bits, 252 | flags, 253 | reserved, 254 | }) 255 | } 256 | 257 | /// Parses a header from an arbitrarily sized buffer 258 | /// 259 | /// This method extracts the header from the beginning of a buffer that may be larger 260 | /// than the header size. It checks that the buffer is at least as large as the header 261 | /// before attempting to parse it. 
262 | /// 263 | /// # Arguments 264 | /// 265 | /// * `buffer` - A byte slice containing at least `SIZE_HEADER` bytes 266 | /// 267 | /// # Returns 268 | /// 269 | /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer 270 | /// * `Err(Error)` - If the buffer is too small or contains invalid header data 271 | /// 272 | /// # Errors 273 | /// 274 | /// Returns an error if: 275 | /// * The buffer is smaller than `SIZE_HEADER` 276 | /// * The header data is invalid (see `from_bytes` for validation details) 277 | pub fn from_buffer(buffer: &[u8]) -> Result { 278 | let mut bytes = [0u8; SIZE_HEADER]; 279 | if buffer.len() < SIZE_HEADER { 280 | return Err(HeaderError::InvalidSize(buffer.len(), SIZE_HEADER).into()); 281 | } 282 | bytes.copy_from_slice(&buffer[..SIZE_HEADER]); 283 | Self::from_bytes(&bytes) 284 | } 285 | 286 | /// Writes the header to a writer 287 | /// 288 | /// This method serializes the header to its binary representation and writes it 289 | /// to the provided writer. 290 | /// 291 | /// # Arguments 292 | /// 293 | /// * `writer` - Any type that implements the `Write` trait 294 | /// 295 | /// # Returns 296 | /// 297 | /// * `Ok(())` - If the header was successfully written 298 | /// * `Err(Error)` - If writing to the writer failed 299 | /// 300 | /// # Errors 301 | /// 302 | /// Returns an error if writing to the writer fails (typically an I/O error). 
303 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 304 | let mut buffer = [0u8; SIZE_HEADER]; 305 | LittleEndian::write_u32(&mut buffer[0..4], self.magic); 306 | buffer[4] = self.format; 307 | LittleEndian::write_u32(&mut buffer[5..9], self.slen); 308 | LittleEndian::write_u32(&mut buffer[9..13], self.xlen); 309 | buffer[13] = self.bits.into(); 310 | buffer[14] = self.flags.into(); 311 | buffer[15..32].copy_from_slice(&self.reserved); 312 | writer.write_all(&buffer)?; 313 | Ok(()) 314 | } 315 | 316 | /// Reads a header from a reader 317 | /// 318 | /// This method reads exactly `SIZE_HEADER` bytes from the provided reader and 319 | /// parses them into a header structure. 320 | /// 321 | /// # Arguments 322 | /// 323 | /// * `reader` - Any type that implements the `Read` trait 324 | /// 325 | /// # Returns 326 | /// 327 | /// * `Ok(BinseqHeader)` - A valid header read from the reader 328 | /// * `Err(Error)` - If reading from the reader failed or the header data is invalid 329 | /// 330 | /// # Errors 331 | /// 332 | /// Returns an error if: 333 | /// * Reading from the reader fails (typically an I/O error) 334 | /// * The header data is invalid (see `from_bytes` for validation details) 335 | pub fn from_reader(reader: &mut R) -> Result { 336 | let mut buffer = [0u8; SIZE_HEADER]; 337 | reader.read_exact(&mut buffer)?; 338 | Self::from_bytes(&buffer) 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error as StdError; 2 | 3 | /// Custom Result type for binseq operations, wrapping the custom [`Error`] type 4 | pub type Result = std::result::Result; 5 | 6 | /// The main error type for the binseq library, encompassing all possible error cases 7 | /// that can occur during binary sequence operations. 
8 | #[derive(thiserror::Error, Debug)] 9 | #[error(transparent)] 10 | pub enum Error { 11 | /// Errors related to file and block headers 12 | #[error("Error processing header: {0}")] 13 | HeaderError(#[from] HeaderError), 14 | 15 | /// Errors that occur during write operations 16 | #[error("Error writing file: {0}")] 17 | WriteError(#[from] WriteError), 18 | 19 | /// Errors that occur during read operations 20 | #[error("Error reading file: {0}")] 21 | ReadError(#[from] ReadError), 22 | 23 | /// Errors that occur during build operations 24 | #[error("Error building file: {0}")] 25 | BuilderError(#[from] BuilderError), 26 | 27 | /// Errors related to file indexing 28 | #[error("Error processing Index: {0}")] 29 | IndexError(#[from] IndexError), 30 | 31 | /// Standard I/O errors 32 | #[error("Error with IO: {0}")] 33 | IoError(#[from] std::io::Error), 34 | 35 | /// UTF-8 conversion errors 36 | #[error("Error with UTF8: {0}")] 37 | Utf8Error(#[from] std::str::Utf8Error), 38 | 39 | /// Errors related to missing extensions 40 | ExtensionError(#[from] ExtensionError), 41 | 42 | /// Errors from the bitnuc dependency for nucleotide encoding/decoding 43 | #[error("Bitnuc error: {0}")] 44 | BitnucError(#[from] bitnuc::Error), 45 | 46 | /// Conversion errors from anyhow errors 47 | #[error("Generic error: {0}")] 48 | AnyhowError(#[from] anyhow::Error), 49 | 50 | /// Generic errors for other unexpected situations 51 | #[error("Generic error: {0}")] 52 | GenericError(#[from] Box), 53 | } 54 | impl Error { 55 | /// Checks if the error is an index mismatch error 56 | /// 57 | /// This is useful for determining if a file's index is out of sync with its content, 58 | /// which might require rebuilding the index. 
59 | /// 60 | /// # Returns 61 | /// 62 | /// * `true` if the error is an `IndexError::ByteSizeMismatch` 63 | /// * `false` for all other error types 64 | #[must_use] 65 | pub fn is_index_mismatch(&self) -> bool { 66 | match self { 67 | Self::IndexError(err) => err.is_mismatch(), 68 | _ => false, 69 | } 70 | } 71 | } 72 | 73 | /// Errors specific to processing and validating binary sequence headers 74 | #[derive(thiserror::Error, Debug)] 75 | pub enum HeaderError { 76 | /// The magic number in the header does not match the expected value 77 | /// 78 | /// # Arguments 79 | /// * `u32` - The invalid magic number that was found 80 | #[error("Invalid magic number: {0}")] 81 | InvalidMagicNumber(u32), 82 | 83 | /// The format version in the header is not supported 84 | /// 85 | /// # Arguments 86 | /// * `u8` - The unsupported version number that was found 87 | #[error("Invalid format version: {0}")] 88 | InvalidFormatVersion(u8), 89 | 90 | /// The reserved bytes in the header contain unexpected values 91 | #[error("Invalid reserved bytes")] 92 | InvalidReservedBytes, 93 | 94 | /// The bits in the header contain unexpected values 95 | #[error("Invalid bit size found in header: {0} - expecting [2,4]")] 96 | InvalidBitSize(u8), 97 | 98 | /// The size of the data does not match what was specified in the header 99 | /// 100 | /// # Arguments 101 | /// * First `usize` - The actual number of bytes provided 102 | /// * Second `usize` - The expected number of bytes according to the header 103 | #[error("Invalid number of bytes provided: {0}. 
Expected: {1}")]
    InvalidSize(usize, usize),
}

/// Errors that can occur while reading binary sequence data
#[derive(thiserror::Error, Debug)]
pub enum ReadError {
    /// The file being read is not a regular file (e.g., it might be a directory or special file)
    #[error("File is not regular")]
    IncompatibleFile,

    /// The file appears to be truncated or corrupted
    ///
    /// # Arguments
    /// * `usize` - The byte position where the truncation was detected
    #[error(
        "Number of bytes in file does not match expectation - possibly truncated at byte pos {0}"
    )]
    FileTruncation(usize),

    /// Attempted to access a record index that is beyond the available range
    ///
    /// # Arguments
    /// * First `usize` - The requested record index
    /// * Second `usize` - The maximum available record index
    #[error("Requested record index ({0}) is out of record range ({1})")]
    OutOfRange(usize, usize),

    /// End of stream was reached while reading
    #[error("End of stream reached")]
    EndOfStream,

    /// A partial record was encountered at the end of a stream
    ///
    /// # Arguments
    /// * `usize` - The number of bytes read in the partial record
    #[error("Partial record at end of stream ({0} bytes)")]
    PartialRecord(usize),

    /// When a block header contains an invalid magic number
    ///
    /// The first parameter is the invalid magic number, the second is the position in the file
    #[error("Unexpected Block Magic Number found: {0} at position {1}")]
    InvalidBlockMagicNumber(u64, usize),

    /// When trying to read a block but reaching the end of the file unexpectedly
    ///
    /// The parameter is the position in the file where the read was attempted
    #[error("Unable to find an expected full block at position {0}")]
    UnexpectedEndOfFile(usize),

    /// When the file metadata doesn't match the expected VBINSEQ format
    #[error("Unexpected file metadata")]
    InvalidFileType,

    /// Missing the index end magic number
    #[error("Missing index end magic number")]
    MissingIndexEndMagic,
}

/// Errors that can occur while building BINSEQ components
#[derive(thiserror::Error, Debug)]
pub enum BuilderError {
    /// The mandatory sequence length (`slen`) was never supplied to the builder
    #[error("Missing sequence length")]
    MissingSlen,
}

/// Errors that can occur while writing binary sequence data
#[derive(thiserror::Error, Debug)]
pub enum WriteError {
    /// The length of the sequence being written does not match what was specified in the header
    ///
    /// # Fields
    /// * `expected` - The sequence length specified in the header
    /// * `got` - The actual length of the sequence being written
    #[error("Sequence length ({got}) does not match the header ({expected})")]
    UnexpectedSequenceLength { expected: u32, got: usize },

    /// The sequence contains invalid nucleotide characters
    ///
    /// # Arguments
    /// * `String` - Description of the invalid nucleotides found
    #[error("Invalid nucleotides found in sequence: {0}")]
    InvalidNucleotideSequence(String),

    /// Attempted to write data without first setting up the header
    #[error("Missing header in writer builder")]
    MissingHeader,

    /// When trying to write data without quality scores but the header specifies they should be present
    #[error("Quality flag is set in header but trying to write without quality scores.")]
    QualityFlagSet,

    /// When trying to write data without a pair but the header specifies paired records
    #[error("Paired flag is set in header but trying to write without record pair.")]
    PairedFlagSet,

    /// When trying to write quality scores but the header specifies they are not present
    #[error("Quality flag not set in header but trying to write quality scores.")]
    QualityFlagNotSet,

    /// When trying to write paired data but the header doesn't specify paired records
    #[error("Paired flag not set in header but trying to write with record pair.")]
    PairedFlagNotSet,

    /// When trying to write data without headers but the header specifies they should be present
    #[error("Header flag is set in header but trying to write without headers.")]
    HeaderFlagSet,

    /// When a record is too large to fit in a block of the configured size
    ///
    /// The first parameter is the record size, the second is the maximum block size
    #[error("Encountered a record with embedded size {0} but the maximum block size is {1}. Rerun with increased block size.")]
    RecordSizeExceedsMaximumBlockSize(usize, usize),

    /// When trying to ingest blocks with different sizes than expected
    ///
    /// The first parameter is the expected size, the second is the found size
    #[error(
        "Incompatible block sizes encountered in BlockWriter Ingest. Found ({1}) Expected ({0})"
    )]
    IncompatibleBlockSizes(usize, usize),

    /// When trying to ingest data with an incompatible header
    ///
    /// The first parameter is the expected header, the second is the found header
    #[error("Incompatible headers found in VBinseqWriter::ingest. Found ({1:?}) Expected ({0:?})")]
    IncompatibleHeaders(crate::vbq::VBinseqHeader, crate::vbq::VBinseqHeader),
}

/// Errors related to VBINSEQ file indexing
///
/// These errors occur when there are issues with the index of a VBINSEQ file,
/// such as corruption or mismatches with the underlying file.
236 | #[derive(thiserror::Error, Debug)] 237 | pub enum IndexError { 238 | /// When the magic number in the index doesn't match the expected value 239 | /// 240 | /// The parameter is the invalid magic number that was found 241 | #[error("Invalid magic number: {0}")] 242 | InvalidMagicNumber(u64), 243 | 244 | /// When the index references a file that doesn't exist 245 | /// 246 | /// The parameter is the missing file path 247 | #[error("Index missing upstream file path: {0}")] 248 | MissingUpstreamFile(String), 249 | 250 | /// When the size of the file doesn't match what the index expects 251 | /// 252 | /// The first parameter is the actual file size, the second is the expected size 253 | #[error("Mismatch in size between upstream size: {0} and expected index size {1}")] 254 | ByteSizeMismatch(u64, u64), 255 | 256 | /// Invalid reserved bytes in the index header 257 | #[error("Invalid reserved bytes in index header")] 258 | InvalidReservedBytes, 259 | } 260 | impl IndexError { 261 | /// Checks if this error indicates a mismatch between the index and file 262 | /// 263 | /// This is useful to determine if the index needs to be rebuilt. 
264 | /// 265 | /// # Returns 266 | /// 267 | /// * `true` for `ByteSizeMismatch` errors 268 | /// * `true` for any other error type (this behavior is likely a bug and should be fixed) 269 | #[must_use] 270 | pub fn is_mismatch(&self) -> bool { 271 | matches!(self, Self::ByteSizeMismatch(_, _) | _) // Note: this appears to always return true regardless of error type 272 | } 273 | } 274 | 275 | #[derive(thiserror::Error, Debug)] 276 | pub enum ExtensionError { 277 | /// When the extension is not supported 278 | #[error("Unsupported extension in path: {0}")] 279 | UnsupportedExtension(String), 280 | } 281 | 282 | /// Trait for converting arbitrary errors into `Error` 283 | pub trait IntoBinseqError { 284 | fn into_binseq_error(self) -> Error; 285 | } 286 | 287 | // Implement conversion for Box 288 | impl IntoBinseqError for E 289 | where 290 | E: StdError + Send + Sync + 'static, 291 | { 292 | fn into_binseq_error(self) -> Error { 293 | Error::GenericError(Box::new(self)) 294 | } 295 | } 296 | 297 | mod testing { 298 | #[allow(unused)] 299 | use super::*; 300 | use thiserror::Error; 301 | 302 | #[allow(unused)] 303 | #[derive(Error, Debug)] 304 | pub enum MyError { 305 | #[error("Custom error: {0}")] 306 | CustomError(String), 307 | } 308 | 309 | #[test] 310 | fn test_into_binseq_error() { 311 | let my_error = MyError::CustomError(String::from("some error")); 312 | let binseq_error = my_error.into_binseq_error(); 313 | assert!(matches!(binseq_error, Error::GenericError(_))); 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /src/vbq/header.rs: -------------------------------------------------------------------------------- 1 | //! # File and Block Header Definitions 2 | //! 3 | //! This module defines the header structures used in the VBINSEQ file format. 4 | //! 5 | //! The VBINSEQ format consists of two primary header types: 6 | //! 7 | //! 1. 
`VBinseqHeader` - The file header that appears at the beginning of a VBINSEQ file,
//!    containing information about the overall file format and configuration.
//!
//! 2. `BlockHeader` - Headers that appear before each block of records, containing
//!    information specific to that block like its size and number of records.
//!
//! Both headers are fixed-size and include magic numbers to validate file integrity.

use std::io::{Read, Write};

use bitnuc::BitSize;
use byteorder::{ByteOrder, LittleEndian};

use crate::error::{HeaderError, ReadError, Result};

/// Magic number for file identification: "VSEQ" in ASCII (0x51455356)
///
/// This constant is used in the file header to identify VBINSEQ formatted files.
/// (The ASCII bytes `V` `S` `E` `Q` read as a little-endian `u32`.)
#[allow(clippy::unreadable_literal)]
const MAGIC: u32 = 0x51455356;

/// Magic number for block identification: "BLOCKSEQ" in ASCII (0x5145534B434F4C42)
///
/// This constant is used in block headers to validate block integrity.
/// (The ASCII bytes of "BLOCKSEQ" read as a little-endian `u64`.)
#[allow(clippy::unreadable_literal)]
const BLOCK_MAGIC: u64 = 0x5145534B434F4C42;

/// Current format version number
///
/// This should be incremented when making backwards-incompatible changes to the format.
const FORMAT: u8 = 1;

/// Size of the file header in bytes (32 bytes)
///
/// The file header has a fixed size to simplify parsing.
pub const SIZE_HEADER: usize = 32;

/// Size of the block header in bytes (32 bytes)
///
/// Each block header has a fixed size to simplify block navigation.
pub const SIZE_BLOCK_HEADER: usize = 32;

/// Default block size in bytes: 128KB
///
/// This defines the default virtual size of each record block.
/// A larger block size can improve compression ratio but reduces random access granularity.
pub const BLOCK_SIZE: u64 = 128 * 1024;

/// Reserved bytes for future use in the file header (13 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES: [u8; 13] = [42; 13];

/// Reserved bytes for future use in block headers (12 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES_BLOCK: [u8; 12] = [42; 12];

/// Builder for [`VBinseqHeader`]; any option left unset falls back to a
/// default in [`VBinseqHeaderBuilder::build`].
#[derive(Default, Debug, Clone, Copy)]
pub struct VBinseqHeaderBuilder {
    // Each field mirrors a `VBinseqHeader` option; `None` means "use default".
    qual: Option<bool>,
    block: Option<u64>,
    compressed: Option<bool>,
    paired: Option<bool>,
    bitsize: Option<BitSize>,
    headers: Option<bool>,
    flags: Option<bool>,
}
impl VBinseqHeaderBuilder {
    /// Creates a builder with no options set.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
    /// Whether quality scores are stored with each record.
    #[must_use]
    pub fn qual(mut self, qual: bool) -> Self {
        self.qual = Some(qual);
        self
    }
    /// Virtual (uncompressed) block size in bytes.
    #[must_use]
    pub fn block(mut self, block: u64) -> Self {
        self.block = Some(block);
        self
    }
    /// Whether blocks are ZSTD compressed.
    #[must_use]
    pub fn compressed(mut self, compressed: bool) -> Self {
        self.compressed = Some(compressed);
        self
    }
    /// Whether records contain paired sequences.
    #[must_use]
    pub fn paired(mut self, paired: bool) -> Self {
        self.paired = Some(paired);
        self
    }
    /// Number of bits per nucleotide (2-bit or 4-bit encoding).
    #[must_use]
    pub fn bitsize(mut self, bitsize: BitSize) -> Self {
        self.bitsize = Some(bitsize);
        self
    }
    /// Whether sequence headers are stored with each record.
    #[must_use]
    pub fn headers(mut self, headers: bool) -> Self {
        self.headers = Some(headers);
        self
    }
    /// Whether per-record flags are stored with each record.
    #[must_use]
    pub fn flags(mut self, flags: bool) -> Self {
        self.flags = Some(flags);
        self
    }
    /// Builds the header, substituting defaults for any unset option
    /// (default block size, all boolean features `false`, default bitsize).
    #[must_use]
    pub fn build(self) -> VBinseqHeader {
        VBinseqHeader::with_capacity(
            self.block.unwrap_or(BLOCK_SIZE),
            self.qual.unwrap_or(false),
            self.compressed.unwrap_or(false),
            self.paired.unwrap_or(false),
            self.bitsize.unwrap_or_default(),
            self.headers.unwrap_or(false),
            self.flags.unwrap_or(false),
        )
    }
}

/// File header for VBINSEQ files
///
/// This structure represents the 32-byte header that appears at the beginning of every
/// VBINSEQ file. It contains configuration information about the file format, including
/// whether quality scores are included, whether blocks are compressed, and whether
/// records contain paired sequences.
///
/// # Fields
///
/// * `magic` - Magic number to validate file format ("VSEQ", 4 bytes)
/// * `format` - Version number of the file format (1 byte)
/// * `block` - Size of each block in bytes (8 bytes)
/// * `qual` - Whether quality scores are included (1 byte boolean)
/// * `compressed` - Whether blocks are ZSTD compressed (1 byte boolean)
/// * `paired` - Whether records contain paired sequences (1 byte boolean)
/// * `bits` - Bits per nucleotide (1 byte)
/// * `headers` - Whether sequence headers are included (1 byte boolean)
/// * `flags` - Whether per-record flags are included (1 byte boolean)
/// * `reserved` - Reserved bytes for future extensions (13 bytes)
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct VBinseqHeader {
    /// Magic number to identify the file format ("VSEQ")
    ///
    /// Always set to 0x51455356 (4 bytes)
    pub magic: u32,

    /// Version of the file format
    ///
    /// Currently set to 1 (1 byte)
    pub format: u8,

    /// Block size in bytes
    ///
    /// This is the virtual (uncompressed) size of each record block (8 bytes)
    pub block: u64,

    /// Whether quality scores are included with sequences
    ///
    /// If true, quality scores are stored for each nucleotide (1 byte)
    pub qual: bool,

    /// Whether internal blocks are compressed with ZSTD
    ///
    /// If true, blocks are compressed individually (1 byte)
    pub compressed: bool,

    /// Whether records contain paired sequences
    ///
    /// If true, each record has both primary and extended sequences (1 byte)
    pub paired: bool,

    /// The bitsize of the sequence data (1 byte)
    ///
    /// Specifies the number of bits per nucleotide:
    /// - 2-bit: Standard encoding (A=00, C=01, G=10, T=11)
    /// - 4-bit: Extended encoding supporting ambiguous nucleotides
    pub bits: BitSize,

    /// Whether sequence headers are included with sequences (1 byte)
    ///
    /// When true, each record includes length-prefixed UTF-8 header strings
    /// for both primary and extended (paired) sequences
    pub headers: bool,

    /// Whether flags are included with sequences (1 byte)
    ///
    /// When true, each record includes length-prefixed UTF-8 flag strings
    /// for both primary and extended (paired) sequences
    pub flags: bool,

    /// Reserved bytes for future format extensions
    ///
    /// Currently filled with placeholder values (13 bytes)
    pub reserved: [u8; 13],
}
impl Default for VBinseqHeader {
    /// Creates a default header with default block size and all features disabled
    ///
    /// The default header:
    /// - Uses the default block size (128KB)
    /// - Does not include quality scores
    /// - Does not use compression
    /// - Does not support paired sequences
    /// - Does not include sequence headers
    /// - Does not include per-record flags
    /// - Uses 2-bit nucleotide encoding
    fn default() -> Self {
        Self::with_capacity(
            BLOCK_SIZE,
            false,
            false,
            false,
            BitSize::default(),
            false,
            false,
        )
    }
}
impl VBinseqHeader {
    /// Creates a new VBINSEQ header with the default block size
    ///
    /// # Parameters
    ///
    /// * `qual` - Whether to include quality scores with sequences
    /// * `compressed` - Whether to use ZSTD compression for blocks
    /// * `paired` - Whether records contain paired sequences
    /// * `bitsize` - Number of bits per nucleotide (2 or 4)
    /// * `headers` - Whether to include sequence headers with records
    /// * `flags` - Whether to include per-record flags with records
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::VBinseqHeaderBuilder;
    ///
    /// // Create header with quality scores and compression, without paired sequences
    /// let header = VBinseqHeaderBuilder::new()
    ///     .qual(true)
    ///     .compressed(true)
    ///     .build();
    /// ```
    #[must_use]
    pub fn new(
        qual: bool,
        compressed: bool,
        paired: bool,
        bitsize: BitSize,
        headers: bool,
        flags: bool,
    ) -> Self {
        Self::with_capacity(
            BLOCK_SIZE, qual, compressed, paired, bitsize, headers, flags,
        )
    }

    /// Creates a new VBINSEQ header with a custom block size
    ///
    /// # Parameters
    ///
    /// * `block` - Custom block size in bytes (virtual/uncompressed size)
    /// * `qual` - Whether to include quality scores with sequences
    /// * `compressed` - Whether to use ZSTD compression for blocks
    /// * `paired` - Whether records contain paired sequences
    /// * `bitsize` - Number of bits per nucleotide (2 or 4)
    /// * `headers` - Whether to include sequence headers with records
    /// * `flags` - Whether to include per-record flags with records
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::VBinseqHeaderBuilder;
    ///
    /// // Create header with a 256KB block size, with quality scores and compression
    /// let header = VBinseqHeaderBuilder::new()
    ///     .block(256 * 1024)
    ///     .qual(true)
    ///     .compressed(true)
    ///     .build();
    /// ```
    #[must_use]
    pub fn with_capacity(
        block: u64,
        qual: bool,
        compressed: bool,
        paired: bool,
        bitsize: BitSize,
        headers: bool,
        flags: bool,
    ) -> Self {
        Self {
            magic: MAGIC,
            format: FORMAT,
            block,
            qual,
            compressed,
            paired,
            headers,
            flags,
            bits: bitsize,
            reserved: RESERVED_BYTES,
        }
    }

    /// Sets the encoding bitsize for the header.
305 | pub fn set_bitsize(&mut self, bits: BitSize) { 306 | self.bits = bits; 307 | } 308 | 309 | /// Creates a header from a 32-byte buffer 310 | /// 311 | /// This function parses a raw byte buffer into a `VBinseqHeader` structure, 312 | /// validating the magic number and format version. 313 | /// 314 | /// # Parameters 315 | /// 316 | /// * `buffer` - A 32-byte array containing the header data 317 | /// 318 | /// # Returns 319 | /// 320 | /// * `Result` - A valid header if parsing was successful 321 | /// 322 | /// # Errors 323 | /// 324 | /// * `HeaderError::InvalidMagicNumber` - If the magic number doesn't match "VSEQ" 325 | /// * `HeaderError::InvalidFormatVersion` - If the format version is unsupported 326 | /// * `HeaderError::InvalidReservedBytes` - If the reserved bytes section is invalid 327 | pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result { 328 | let magic = LittleEndian::read_u32(&buffer[0..4]); 329 | if magic != MAGIC { 330 | return Err(HeaderError::InvalidMagicNumber(magic).into()); 331 | } 332 | let format = buffer[4]; 333 | if format != FORMAT { 334 | return Err(HeaderError::InvalidFormatVersion(format).into()); 335 | } 336 | let block = LittleEndian::read_u64(&buffer[5..13]); 337 | let qual = buffer[13] != 0; 338 | let compressed = buffer[14] != 0; 339 | let paired = buffer[15] != 0; 340 | let bits = match buffer[16] { 341 | 0 | 2 | 42 => BitSize::Two, 342 | 4 => BitSize::Four, 343 | x => return Err(HeaderError::InvalidBitSize(x).into()), 344 | }; 345 | let headers = match buffer[17] { 346 | 0 | 42 => false, // backwards compatibility 347 | _ => true, 348 | }; 349 | let flags = buffer[18] != 0; 350 | let Ok(reserved) = buffer[19..32].try_into() else { 351 | return Err(HeaderError::InvalidReservedBytes.into()); 352 | }; 353 | Ok(Self { 354 | magic, 355 | format, 356 | block, 357 | qual, 358 | compressed, 359 | paired, 360 | bits, 361 | headers, 362 | flags, 363 | reserved, 364 | }) 365 | } 366 | 367 | /// Writes the header to a writer 368 
| /// 369 | /// This function serializes the header structure into a 32-byte buffer and writes 370 | /// it to the provided writer. 371 | /// 372 | /// # Parameters 373 | /// 374 | /// * `writer` - Any type that implements the `Write` trait 375 | /// 376 | /// # Returns 377 | /// 378 | /// * `Result<()>` - Success if the header was written 379 | /// 380 | /// # Errors 381 | /// 382 | /// * IO errors if writing to the writer fails 383 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 384 | let mut buffer = [0u8; SIZE_HEADER]; 385 | LittleEndian::write_u32(&mut buffer[0..4], self.magic); 386 | buffer[4] = self.format; 387 | LittleEndian::write_u64(&mut buffer[5..13], self.block); 388 | buffer[13] = self.qual.into(); 389 | buffer[14] = self.compressed.into(); 390 | buffer[15] = self.paired.into(); 391 | buffer[16] = self.bits.into(); 392 | buffer[17] = self.headers.into(); 393 | buffer[18] = self.flags.into(); 394 | buffer[19..32].copy_from_slice(&self.reserved); 395 | writer.write_all(&buffer)?; 396 | Ok(()) 397 | } 398 | 399 | /// Reads a header from a reader 400 | /// 401 | /// This function reads 32 bytes from the provided reader and parses them into 402 | /// a `VBinseqHeader` structure. 
    ///
    /// # Parameters
    ///
    /// * `reader` - Any type that implements the `Read` trait
    ///
    /// # Returns
    ///
    /// * `Result<Self>` - A valid header if reading and parsing was successful
    ///
    /// # Errors
    ///
    /// * IO errors if reading from the reader fails
    /// * Header validation errors from `from_bytes()`
    pub fn from_reader<R: Read>(reader: &mut R) -> Result<Self> {
        let mut buffer = [0u8; SIZE_HEADER];
        reader.read_exact(&mut buffer)?;
        Self::from_bytes(&buffer)
    }

    /// Returns `true` when records in this file contain paired sequences.
    #[must_use]
    pub fn is_paired(&self) -> bool {
        self.paired
    }
}

/// Block header for VBINSEQ block data
///
/// Each block in a VBINSEQ file is preceded by a 32-byte block header that contains
/// information about the block including its size and the number of records it contains.
///
/// # Fields
///
/// * `magic` - Magic number to validate block integrity ("BLOCKSEQ", 8 bytes)
/// * `size` - Actual size of the block in bytes (8 bytes)
/// * `records` - Number of records in the block (4 bytes)
/// * `reserved` - Reserved bytes for future extensions (12 bytes)
#[derive(Clone, Copy, Debug)]
pub struct BlockHeader {
    /// Magic number to identify the block ("BLOCKSEQ")
    ///
    /// Always set to 0x5145534B434F4C42 (8 bytes)
    pub magic: u64,

    /// Actual size of the block in bytes
    ///
    /// This can differ from the virtual block size in the file header
    /// when compression is enabled (8 bytes)
    pub size: u64,

    /// Number of records stored in this block
    ///
    /// Used to iterate through records efficiently (4 bytes)
    pub records: u32,

    /// Reserved bytes for future extensions
    ///
    /// Currently filled with placeholder values (12 bytes)
    pub reserved: [u8; 12],
}
impl BlockHeader {
    /// Creates a new block header
    ///
    /// # Parameters
    ///
    /// * `size` - The actual size of the block in bytes (can be compressed size)
    /// * `records` - The number of records contained in the block
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::BlockHeader;
    ///
    /// // Create a block header for a block with 1024 bytes and 100 records
    /// let header = BlockHeader::new(1024, 100);
    /// ```
    #[must_use]
    pub fn new(size: u64, records: u32) -> Self {
        Self {
            magic: BLOCK_MAGIC,
            size,
            records,
            reserved: RESERVED_BYTES_BLOCK,
        }
    }

    /// Creates an empty block header (zero payload bytes, zero records).
    #[must_use]
    pub fn empty() -> Self {
        Self {
            magic: BLOCK_MAGIC,
            size: 0,
            records: 0,
            reserved: RESERVED_BYTES_BLOCK,
        }
    }

    /// Returns `true` when the block holds no payload and no records.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.size == 0 && self.records == 0
    }

    /// Writes the block header to a writer
    ///
    /// This function serializes the block header structure into a 32-byte buffer and writes
    /// it to the provided writer.
    ///
    /// # Parameters
    ///
    /// * `writer` - Any type that implements the `Write` trait
    ///
    /// # Returns
    ///
    /// * `Result<()>` - Success if the header was written
    ///
    /// # Errors
    ///
    /// * IO errors if writing to the writer fails
    pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
        let mut buffer = [0u8; SIZE_BLOCK_HEADER];
        // All multi-byte fields are stored little-endian.
        LittleEndian::write_u64(&mut buffer[0..8], self.magic);
        LittleEndian::write_u64(&mut buffer[8..16], self.size);
        LittleEndian::write_u32(&mut buffer[16..20], self.records);
        buffer[20..].copy_from_slice(&self.reserved);
        writer.write_all(&buffer)?;
        Ok(())
    }

    /// Creates a block header from a 32-byte buffer
    ///
    /// This function parses a raw byte buffer into a `BlockHeader` structure,
    /// validating the magic number.
    ///
    /// # Parameters
    ///
    /// * `buffer` - A 32-byte array containing the block header data
    ///
    /// # Returns
    ///
    /// * `Result<Self>` - A valid block header if parsing was successful
    ///
    /// # Errors
    ///
    /// * `ReadError::InvalidBlockMagicNumber` - If the magic number doesn't match "BLOCKSEQ"
    pub fn from_bytes(buffer: &[u8; SIZE_BLOCK_HEADER]) -> Result<Self> {
        let magic = LittleEndian::read_u64(&buffer[0..8]);
        if magic != BLOCK_MAGIC {
            // NOTE(review): the file offset is unknown at this level, so `0` is
            // reported as the position — confirm that callers remap this to the
            // real block offset when surfacing the error.
            return Err(ReadError::InvalidBlockMagicNumber(magic, 0).into());
        }
        let size = LittleEndian::read_u64(&buffer[8..16]);
        let records = LittleEndian::read_u32(&buffer[16..20]);
        Ok(Self::new(size, records))
    }

    /// Total on-disk footprint of this block: payload size plus the 32-byte block header.
    #[must_use]
    pub fn size_with_header(&self) -> usize {
        self.size as usize + SIZE_BLOCK_HEADER
    }
}
--------------------------------------------------------------------------------
/src/bq/writer.rs:
--------------------------------------------------------------------------------
//!
Binary sequence writer module
//!
//! This module provides functionality for writing nucleotide sequences to binary files
//! in a compact 2-bit format. It includes support for:
//! - Single and paired sequence writing
//! - Invalid nucleotide handling with configurable policies
//! - Efficient buffering and encoding
//! - Headless mode for parallel writing

use std::io::{BufWriter, Write};

use byteorder::{LittleEndian, WriteBytesExt};
use rand::{rngs::SmallRng, SeedableRng};

use super::BinseqHeader;
use crate::{
    error::{Result, WriteError},
    Policy, RNG_SEED,
};

/// Writes a single flag value to a writer in little-endian format
///
/// # Arguments
///
/// * `writer` - Any type that implements the `Write` trait
/// * `flag` - The 64-bit flag value to write
///
/// # Returns
///
/// * `Ok(())` - If the flag was successfully written
/// * `Err(Error)` - If writing to the writer failed
pub fn write_flag<W: Write>(writer: &mut W, flag: u64) -> Result<()> {
    writer.write_u64::<LittleEndian>(flag)?;
    Ok(())
}

/// Writes a buffer of u64 values to a writer in little-endian format
///
/// This function is used to write encoded sequence data to the output.
/// Each u64 in the buffer contains up to 32 nucleotides in 2-bit format.
///
/// # Arguments
///
/// * `writer` - Any type that implements the `Write` trait
/// * `ebuf` - The buffer of u64 values to write
///
/// # Returns
///
/// * `Ok(())` - If the buffer was successfully written
/// * `Err(Error)` - If writing to the writer failed
pub fn write_buffer<W: Write>(writer: &mut W, ebuf: &[u64]) -> Result<()> {
    ebuf.iter()
        .try_for_each(|&x| writer.write_u64::<LittleEndian>(x))?;
    Ok(())
}

/// Encodes nucleotide sequences into a compact 2-bit binary format
///
/// The `Encoder` handles the conversion of nucleotide sequences (A, C, G, T)
/// into a compact binary representation where each nucleotide is stored using
/// 2 bits. It also handles invalid nucleotides according to a configurable policy.
///
/// The encoder maintains internal buffers to avoid repeated allocations during
/// encoding operations. These buffers are reused across multiple encode calls
/// and are cleared automatically when needed.
#[derive(Clone)]
pub struct Encoder {
    /// Header containing sequence length and format information
    header: BinseqHeader,

    /// Buffers for storing encoded nucleotides in 2-bit format
    /// Each u64 can store 32 nucleotides (64 bits / 2 bits per nucleotide)
    sbuffer: Vec<u64>, // Primary sequence buffer
    xbuffer: Vec<u64>, // Extended sequence buffer

    /// Temporary buffers for handling invalid nucleotides
    /// These store the processed sequences after policy application
    s_ibuf: Vec<u8>, // Primary sequence invalid buffer
    x_ibuf: Vec<u8>, // Extended sequence invalid buffer

    /// Policy for handling invalid nucleotides during encoding
    policy: Policy,

    /// Random number generator for the `RandomDraw` policy
    /// Seeded with `RNG_SEED` for reproducibility
    rng: SmallRng,
}
impl Encoder {
    /// Creates a new encoder with default invalid nucleotide policy
    ///
    /// # Arguments
    ///
    /// * `header` - The header defining sequence lengths and format
    ///
    /// # Examples
    ///
    /// ```
    /// # use binseq::bq::{BinseqHeaderBuilder, Encoder};
    /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap();
    /// let encoder = Encoder::new(header);
    /// ```
    #[must_use]
    pub fn new(header: BinseqHeader) -> Self {
        Self::with_policy(header, Policy::default())
    }

    /// Creates a new encoder with a specific invalid nucleotide policy
    ///
    /// # Arguments
    ///
    /// * `header` - The header defining sequence lengths and format
    /// * `policy` - The policy for handling invalid nucleotides
    ///
    /// # Examples
    ///
    /// ```
    /// # use binseq::bq::{BinseqHeaderBuilder, Encoder};
    /// # use binseq::Policy;
    /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap();
    /// let encoder = Encoder::with_policy(header, Policy::SetToA);
    /// ```
    #[must_use]
    pub fn with_policy(header: BinseqHeader, policy: Policy) -> Self {
        Self {
            header,
            policy,
            sbuffer: Vec::default(),
            xbuffer: Vec::default(),
            s_ibuf: Vec::default(),
            x_ibuf: Vec::default(),
            rng: SmallRng::seed_from_u64(RNG_SEED),
        }
    }

    /// Encodes a single sequence using the header's configured bitsize.
    ///
    /// Will return `None` if the sequence is invalid and the policy does not allow correction.
    pub fn encode_single(&mut self, primary: &[u8]) -> Result<Option<&[u64]>> {
        // Reject sequences that do not match the fixed length declared in the header.
        if primary.len() != self.header.slen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.slen,
                got: primary.len(),
            }
            .into());
        }

        // Fill the buffer with the packed representation of the nucleotides
        self.clear();
        if self.header.bits.encode(primary, &mut self.sbuffer).is_err() {
            // Encoding failed (invalid nucleotides): apply the policy to build a
            // corrected copy in `s_ibuf` and retry; `false` means "drop record".
            self.clear();
            if self
                .policy
                .handle(primary, &mut self.s_ibuf, &mut self.rng)?
            {
                self.header.bits.encode(&self.s_ibuf, &mut self.sbuffer)?;
            } else {
                return Ok(None);
            }
        }

        Ok(Some(&self.sbuffer))
    }

    /// Encodes a pair of sequences using the header's configured bitsize.
    ///
    /// Will return `None` if either sequence is invalid and the policy does not allow correction.
    pub fn encode_paired(
        &mut self,
        primary: &[u8],
        extended: &[u8],
    ) -> Result<Option<(&[u64], &[u64])>> {
        // Both sequences must match their respective fixed lengths from the header.
        if primary.len() != self.header.slen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.slen,
                got: primary.len(),
            }
            .into());
        }
        if extended.len() != self.header.xlen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.xlen,
                got: extended.len(),
            }
            .into());
        }

        self.clear();
        if self.header.bits.encode(primary, &mut self.sbuffer).is_err()
            || self
                .header
                .bits
                .encode(extended, &mut self.xbuffer)
                .is_err()
        {
            // If either side fails to encode, both are re-derived through the
            // policy so the pair stays consistent; a `false` from either side
            // drops the whole record.
            self.clear();
            if self
                .policy
                .handle(primary, &mut self.s_ibuf, &mut self.rng)?
                && self
                    .policy
                    .handle(extended, &mut self.x_ibuf, &mut self.rng)?
            {
                self.header.bits.encode(&self.s_ibuf, &mut self.sbuffer)?;
                self.header.bits.encode(&self.x_ibuf, &mut self.xbuffer)?;
            } else {
                return Ok(None);
            }
        }

        Ok(Some((&self.sbuffer, &self.xbuffer)))
    }

    /// Clear all buffers and reset the encoder.
    pub fn clear(&mut self) {
        self.sbuffer.clear();
        self.xbuffer.clear();
        self.s_ibuf.clear();
        self.x_ibuf.clear();
    }
}

/// Builder for creating configured `BinseqWriter` instances
///
/// This builder provides a flexible way to create writers with various
/// configurations. It follows the builder pattern, allowing for optional
/// settings to be specified in any order.
227 | /// 228 | /// # Examples 229 | /// 230 | /// ``` 231 | /// # use binseq::{Policy, Result}; 232 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; 233 | /// # fn main() -> Result<()> { 234 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 235 | /// let writer = BinseqWriterBuilder::default() 236 | /// .header(header) 237 | /// .policy(Policy::SetToA) 238 | /// .headless(false) 239 | /// .build(Vec::new())?; 240 | /// # Ok(()) 241 | /// # } 242 | /// ``` 243 | #[derive(Default)] 244 | pub struct BinseqWriterBuilder { 245 | /// Required header defining sequence lengths and format 246 | header: Option, 247 | /// Optional policy for handling invalid nucleotides 248 | policy: Option, 249 | /// Optional headless mode for parallel writing scenarios 250 | headless: Option, 251 | } 252 | impl BinseqWriterBuilder { 253 | #[must_use] 254 | pub fn header(mut self, header: BinseqHeader) -> Self { 255 | self.header = Some(header); 256 | self 257 | } 258 | 259 | #[must_use] 260 | pub fn policy(mut self, policy: Policy) -> Self { 261 | self.policy = Some(policy); 262 | self 263 | } 264 | 265 | #[must_use] 266 | pub fn headless(mut self, headless: bool) -> Self { 267 | self.headless = Some(headless); 268 | self 269 | } 270 | 271 | pub fn build(self, inner: W) -> Result> { 272 | let Some(header) = self.header else { 273 | return Err(WriteError::MissingHeader.into()); 274 | }; 275 | BinseqWriter::new( 276 | inner, 277 | header, 278 | self.policy.unwrap_or_default(), 279 | self.headless.unwrap_or(false), 280 | ) 281 | } 282 | } 283 | 284 | /// High-level writer for binary sequence files 285 | /// 286 | /// This writer provides a convenient interface for writing nucleotide sequences 287 | /// to binary files in a compact format. It handles sequence encoding, invalid 288 | /// nucleotide processing, and file format compliance. 
289 | /// 290 | /// The writer can operate in two modes: 291 | /// - Normal mode: Writes the header followed by records 292 | /// - Headless mode: Writes only records (useful for parallel writing) 293 | /// 294 | /// # Type Parameters 295 | /// 296 | /// * `W` - The underlying writer type that implements `Write` 297 | #[derive(Clone)] 298 | pub struct BinseqWriter { 299 | /// The underlying writer for output 300 | inner: W, 301 | 302 | /// Encoder for converting sequences to binary format 303 | encoder: Encoder, 304 | 305 | /// Whether this writer is in headless mode 306 | /// When true, the header is not written to the output 307 | headless: bool, 308 | } 309 | impl BinseqWriter { 310 | /// Creates a new `BinseqWriter` instance with specified configuration 311 | /// 312 | /// This is a low-level constructor. For a more convenient way to create a 313 | /// `BinseqWriter`, use the `BinseqWriterBuilder` struct. 314 | /// 315 | /// # Arguments 316 | /// 317 | /// * `inner` - The underlying writer to write to 318 | /// * `header` - The header defining sequence lengths and format 319 | /// * `policy` - The policy for handling invalid nucleotides 320 | /// * `headless` - Whether to skip writing the header (for parallel writing) 321 | /// 322 | /// # Returns 323 | /// 324 | /// * `Ok(BinseqWriter)` - A new writer instance 325 | /// * `Err(Error)` - If writing the header fails 326 | /// 327 | /// # Examples 328 | /// 329 | /// ``` 330 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriter}; 331 | /// # use binseq::{Result, Policy}; 332 | /// # fn main() -> Result<()> { 333 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 334 | /// let writer = BinseqWriter::new( 335 | /// Vec::new(), 336 | /// header, 337 | /// Policy::default(), 338 | /// false 339 | /// )?; 340 | /// # Ok(()) 341 | /// # } 342 | /// ``` 343 | pub fn new(mut inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { 344 | if !headless { 345 | header.write_bytes(&mut 
inner)?; 346 | } 347 | Ok(Self { 348 | inner, 349 | encoder: Encoder::with_policy(header, policy), 350 | headless, 351 | }) 352 | } 353 | 354 | /// Writes a single record to the output 355 | /// 356 | /// This method encodes and writes a primary sequence along with an associated flag. 357 | /// 358 | /// # Arguments 359 | /// 360 | /// * `flag` - A 64-bit flag value associated with the sequence 361 | /// * `primary` - The nucleotide sequence to write 362 | /// 363 | /// # Returns 364 | /// 365 | /// * `Ok(true)` if the record was written successfully 366 | /// * `Ok(false)` if the record was not written because it was empty 367 | /// * `Err(WriteError::FlagSet)` if the flag is set but no flag value is provided 368 | pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { 369 | let has_flag = self.encoder.header.flags; 370 | if let Some(sbuffer) = self.encoder.encode_single(primary)? { 371 | if has_flag { 372 | write_flag(&mut self.inner, flag.unwrap_or(0))?; 373 | } 374 | write_buffer(&mut self.inner, sbuffer)?; 375 | Ok(true) 376 | } else { 377 | Ok(false) 378 | } 379 | } 380 | 381 | /// Writes a paired record to the output 382 | /// 383 | /// This method writes a paired record to the output. It takes a flag, primary sequence, and extended sequence as input. 384 | /// If the flag is set but no flag value is provided, it returns an error. 385 | /// Otherwise, it writes the encoded single and extended sequences to the output and returns true. 
386 | /// 387 | /// # Arguments 388 | /// * `flag` - The flag value to write to the output 389 | /// * `primary` - The primary sequence to encode and write to the output 390 | /// * `extended` - The extended sequence to encode and write to the output 391 | /// 392 | /// # Returns 393 | /// * `Result` - A result indicating whether the write was successful or not 394 | pub fn write_paired_record( 395 | &mut self, 396 | flag: Option, 397 | primary: &[u8], 398 | extended: &[u8], 399 | ) -> Result { 400 | let has_flag = self.encoder.header.flags; 401 | if let Some((sbuffer, xbuffer)) = self.encoder.encode_paired(primary, extended)? { 402 | if has_flag { 403 | write_flag(&mut self.inner, flag.unwrap_or(0))?; 404 | } 405 | write_buffer(&mut self.inner, sbuffer)?; 406 | write_buffer(&mut self.inner, xbuffer)?; 407 | Ok(true) 408 | } else { 409 | Ok(false) 410 | } 411 | } 412 | 413 | /// Consumes the writer and returns the underlying writer 414 | /// 415 | /// This is useful when you need to access the underlying writer after 416 | /// writing is complete, for example to get the contents of a `Vec`. 417 | /// 418 | /// # Examples 419 | /// 420 | /// ``` 421 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; 422 | /// # use binseq::Result; 423 | /// # fn main() -> Result<()> { 424 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 425 | /// let writer = BinseqWriterBuilder::default() 426 | /// .header(header) 427 | /// .build(Vec::new())?; 428 | /// 429 | /// // After writing sequences... 430 | /// let bytes = writer.into_inner(); 431 | /// # Ok(()) 432 | /// # } 433 | /// ``` 434 | pub fn into_inner(self) -> W { 435 | self.inner 436 | } 437 | 438 | /// Gets a mutable reference to the underlying writer 439 | /// 440 | /// This allows direct access to the underlying writer while retaining 441 | /// ownership of the `BinseqWriter`. 
442 | pub fn by_ref(&mut self) -> &mut W { 443 | &mut self.inner 444 | } 445 | 446 | /// Flushes any buffered data to the underlying writer 447 | /// 448 | /// # Returns 449 | /// 450 | /// * `Ok(())` - If the flush was successful 451 | /// * `Err(Error)` - If flushing failed 452 | pub fn flush(&mut self) -> Result<()> { 453 | self.inner.flush()?; 454 | Ok(()) 455 | } 456 | 457 | /// Creates a new encoder with the same configuration as this writer 458 | /// 459 | /// This is useful when you need a separate encoder instance for parallel 460 | /// processing or other scenarios where you need independent encoding. 461 | /// The new encoder is initialized with a cleared state. 462 | /// 463 | /// # Returns 464 | /// 465 | /// A new `Encoder` instance with the same configuration but cleared buffers 466 | pub fn new_encoder(&self) -> Encoder { 467 | let mut encoder = self.encoder.clone(); 468 | encoder.clear(); 469 | encoder 470 | } 471 | 472 | /// Checks if this writer is in headless mode 473 | /// 474 | /// In headless mode, the writer does not write the header to the output. 475 | /// This is useful for parallel writing scenarios where only one writer 476 | /// should write the header. 477 | /// 478 | /// # Returns 479 | /// 480 | /// `true` if the writer is in headless mode, `false` otherwise 481 | pub fn is_headless(&self) -> bool { 482 | self.headless 483 | } 484 | 485 | /// Ingests the contents of another writer's buffer 486 | /// 487 | /// This method is used in parallel writing scenarios to combine the output 488 | /// of multiple writers. It takes the contents of another writer's buffer 489 | /// and writes them to this writer's output. 
490 | /// 491 | /// # Arguments 492 | /// 493 | /// * `other` - Another writer whose underlying writer is a `Vec` 494 | /// 495 | /// # Returns 496 | /// 497 | /// * `Ok(())` - If the contents were successfully ingested 498 | /// * `Err(Error)` - If writing the contents failed 499 | pub fn ingest(&mut self, other: &mut BinseqWriter>) -> Result<()> { 500 | let other_inner = other.by_ref(); 501 | self.inner.write_all(other_inner)?; 502 | other_inner.clear(); 503 | Ok(()) 504 | } 505 | } 506 | 507 | /// A streaming writer for binary sequence data 508 | /// 509 | /// This writer buffers data before writing it to the underlying writer, 510 | /// providing efficient streaming capabilities suitable for: 511 | /// - Writing to network connections 512 | /// - Processing very large datasets 513 | /// - Pipeline processing 514 | /// 515 | /// The `StreamWriter` is a specialized version of `BinseqWriter` that 516 | /// adds internal buffering and is optimized for streaming scenarios. 517 | pub struct StreamWriter { 518 | /// The underlying writer for processing sequences 519 | writer: BinseqWriter>, 520 | } 521 | 522 | impl StreamWriter { 523 | /// Creates a new `StreamWriter` with the default buffer size 524 | /// 525 | /// This constructor initializes a `StreamWriter` with an 8K buffer 526 | /// for efficient writing to the underlying writer. 
527 | /// 528 | /// # Arguments 529 | /// 530 | /// * `inner` - The writer to write binary sequence data to 531 | /// * `header` - The header defining sequence lengths and format 532 | /// * `policy` - The policy for handling invalid nucleotides 533 | /// * `headless` - Whether to skip writing the header 534 | /// 535 | /// # Returns 536 | /// 537 | /// * `Ok(StreamWriter)` - A new streaming writer 538 | /// * `Err(Error)` - If initialization fails 539 | pub fn new(inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { 540 | Self::with_capacity(inner, 8192, header, policy, headless) 541 | } 542 | 543 | /// Creates a new `StreamWriter` with a specified buffer capacity 544 | /// 545 | /// This constructor allows customizing the buffer size based on 546 | /// expected usage patterns and performance requirements. 547 | /// 548 | /// # Arguments 549 | /// 550 | /// * `inner` - The writer to write binary sequence data to 551 | /// * `capacity` - The size of the internal buffer in bytes 552 | /// * `header` - The header defining sequence lengths and format 553 | /// * `policy` - The policy for handling invalid nucleotides 554 | /// * `headless` - Whether to skip writing the header 555 | /// 556 | /// # Returns 557 | /// 558 | /// * `Ok(StreamWriter)` - A new streaming writer with the specified buffer capacity 559 | /// * `Err(Error)` - If initialization fails 560 | pub fn with_capacity( 561 | inner: W, 562 | capacity: usize, 563 | header: BinseqHeader, 564 | policy: Policy, 565 | headless: bool, 566 | ) -> Result { 567 | let buffered = BufWriter::with_capacity(capacity, inner); 568 | let writer = BinseqWriter::new(buffered, header, policy, headless)?; 569 | 570 | Ok(Self { writer }) 571 | } 572 | 573 | pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { 574 | self.writer.write_record(flag, primary) 575 | } 576 | 577 | pub fn write_paired_record( 578 | &mut self, 579 | flag: Option, 580 | primary: &[u8], 581 | extended: &[u8], 582 
| ) -> Result { 583 | self.writer.write_paired_record(flag, primary, extended) 584 | } 585 | 586 | /// Flushes any buffered data to the underlying writer 587 | /// 588 | /// # Returns 589 | /// 590 | /// * `Ok(())` - If the flush was successful 591 | /// * `Err(Error)` - If flushing failed 592 | pub fn flush(&mut self) -> Result<()> { 593 | self.writer.flush() 594 | } 595 | 596 | /// Consumes the streaming writer and returns the inner writer after flushing 597 | /// 598 | /// This method is useful when you need access to the underlying writer 599 | /// after all writing is complete. 600 | /// 601 | /// # Returns 602 | /// 603 | /// * `Ok(W)` - The inner writer after flushing all data 604 | /// * `Err(Error)` - If flushing failed 605 | pub fn into_inner(self) -> Result { 606 | // First unwrap the writer inner (BufWriter) 607 | let bufw = self.writer.into_inner(); 608 | // Now unwrap the BufWriter to get W 609 | match bufw.into_inner() { 610 | Ok(inner) => Ok(inner), 611 | Err(e) => Err(std::io::Error::from(e).into()), 612 | } 613 | } 614 | } 615 | 616 | /// Builder for `StreamWriter` instances 617 | /// 618 | /// This builder provides a convenient way to create and configure `StreamWriter` 619 | /// instances with custom buffer sizes and other settings. 
620 | #[derive(Default)] 621 | pub struct StreamWriterBuilder { 622 | /// Required header defining sequence lengths and format 623 | header: Option, 624 | /// Optional policy for handling invalid nucleotides 625 | policy: Option, 626 | /// Optional headless mode for parallel writing scenarios 627 | headless: Option, 628 | /// Optional buffer capacity setting 629 | buffer_capacity: Option, 630 | } 631 | 632 | impl StreamWriterBuilder { 633 | /// Sets the header for the writer 634 | #[must_use] 635 | pub fn header(mut self, header: BinseqHeader) -> Self { 636 | self.header = Some(header); 637 | self 638 | } 639 | 640 | /// Sets the policy for handling invalid nucleotides 641 | #[must_use] 642 | pub fn policy(mut self, policy: Policy) -> Self { 643 | self.policy = Some(policy); 644 | self 645 | } 646 | 647 | /// Sets headless mode (whether to skip writing the header) 648 | #[must_use] 649 | pub fn headless(mut self, headless: bool) -> Self { 650 | self.headless = Some(headless); 651 | self 652 | } 653 | 654 | /// Sets the buffer capacity for the writer 655 | #[must_use] 656 | pub fn buffer_capacity(mut self, capacity: usize) -> Self { 657 | self.buffer_capacity = Some(capacity); 658 | self 659 | } 660 | 661 | /// Builds a `StreamWriter` with the configured settings 662 | /// 663 | /// # Arguments 664 | /// 665 | /// * `inner` - The writer to write binary sequence data to 666 | /// 667 | /// # Returns 668 | /// 669 | /// * `Ok(StreamWriter)` - A new streaming writer with the specified configuration 670 | /// * `Err(Error)` - If building the writer fails 671 | pub fn build(self, inner: W) -> Result> { 672 | let Some(header) = self.header else { 673 | return Err(WriteError::MissingHeader.into()); 674 | }; 675 | 676 | let capacity = self.buffer_capacity.unwrap_or(8192); 677 | StreamWriter::with_capacity( 678 | inner, 679 | capacity, 680 | header, 681 | self.policy.unwrap_or_default(), 682 | self.headless.unwrap_or(false), 683 | ) 684 | } 685 | } 686 | 687 | #[cfg(test)] 
688 | mod testing { 689 | 690 | use std::{fs::File, io::BufWriter}; 691 | 692 | use super::*; 693 | use crate::bq::{BinseqHeaderBuilder, SIZE_HEADER}; 694 | 695 | #[test] 696 | fn test_headless() -> Result<()> { 697 | let inner = Vec::new(); 698 | let mut writer = BinseqWriterBuilder::default() 699 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 700 | .headless(true) 701 | .build(inner)?; 702 | assert!(writer.is_headless()); 703 | let inner = writer.by_ref(); 704 | assert!(inner.is_empty()); 705 | Ok(()) 706 | } 707 | 708 | #[test] 709 | fn test_not_headless() -> Result<()> { 710 | let inner = Vec::new(); 711 | let mut writer = BinseqWriterBuilder::default() 712 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 713 | .build(inner)?; 714 | assert!(!writer.is_headless()); 715 | let inner = writer.by_ref(); 716 | assert_eq!(inner.len(), SIZE_HEADER); 717 | Ok(()) 718 | } 719 | 720 | #[test] 721 | fn test_stdout() -> Result<()> { 722 | let writer = BinseqWriterBuilder::default() 723 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 724 | .build(std::io::stdout())?; 725 | assert!(!writer.is_headless()); 726 | Ok(()) 727 | } 728 | 729 | #[test] 730 | fn test_to_path() -> Result<()> { 731 | let path = "test_to_path.file"; 732 | let inner = File::create(path).map(BufWriter::new)?; 733 | let mut writer = BinseqWriterBuilder::default() 734 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 735 | .build(inner)?; 736 | assert!(!writer.is_headless()); 737 | let inner = writer.by_ref(); 738 | inner.flush()?; 739 | 740 | // delete file 741 | std::fs::remove_file(path)?; 742 | 743 | Ok(()) 744 | } 745 | 746 | #[test] 747 | fn test_stream_writer() -> Result<()> { 748 | let inner = Vec::new(); 749 | let writer = StreamWriterBuilder::default() 750 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 
751 | .buffer_capacity(16384) 752 | .build(inner)?; 753 | 754 | // Convert back to Vec to verify it works 755 | let inner = writer.into_inner()?; 756 | assert_eq!(inner.len(), SIZE_HEADER); 757 | Ok(()) 758 | } 759 | } 760 | -------------------------------------------------------------------------------- /src/vbq/index.rs: -------------------------------------------------------------------------------- 1 | //! # VBQ Index Format 2 | //! 3 | //! This module implements the embedded index format for VBQ files. 4 | //! 5 | //! ## Format Changes (v0.7.0+) 6 | //! 7 | //! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files instead of 8 | //! being stored in separate `.vqi` files. This improves portability and eliminates the 9 | //! need to manage auxiliary files. 10 | //! 11 | //! ## Embedded Index Structure 12 | //! 13 | //! The index is located at the end of the VBQ file with this layout: 14 | //! 15 | //! ```text 16 | //! [VBQ Data Blocks][Compressed Index][Index Size (u64)][INDEX_END_MAGIC (u64)] 17 | //! ``` 18 | //! 19 | //! Where: 20 | //! - **Compressed Index**: ZSTD-compressed index data (`IndexHeader` + `BlockRanges`) 21 | //! - **Index Size**: 8 bytes indicating size of compressed index data 22 | //! - **`INDEX_END_MAGIC`**: 8 bytes (`0x444E455845444E49` = "INDEXEND") 23 | //! 24 | //! ## Index Contents 25 | //! 26 | //! The compressed index contains: 27 | //! 1. **`IndexHeader`** (32 bytes): Metadata about the indexed file 28 | //! 2. **`BlockRange` entries** (32 bytes each): One per data block 29 | //! 30 | //! ## Key Changes from v0.6.x 31 | //! 32 | //! - Index moved from separate `.vqi` files into VBQ files 33 | //! - Cumulative record counts changed from `u32` to `u64` 34 | //! 
- Support for files with more than 4 billion records 35 | 36 | use std::{ 37 | fs::File, 38 | io::{BufReader, BufWriter, Cursor, Read, Write}, 39 | path::Path, 40 | }; 41 | 42 | use byteorder::{ByteOrder, LittleEndian}; 43 | use zstd::{Decoder, Encoder}; 44 | 45 | use super::{ 46 | header::{SIZE_BLOCK_HEADER, SIZE_HEADER}, 47 | BlockHeader, VBinseqHeader, 48 | }; 49 | use crate::error::{IndexError, Result}; 50 | 51 | /// Size of `BlockRange` in bytes 52 | pub const SIZE_BLOCK_RANGE: usize = 32; 53 | /// Size of `IndexHeader` in bytes 54 | pub const INDEX_HEADER_SIZE: usize = 32; 55 | /// Magic number to designate index (VBQINDEX) 56 | #[allow(clippy::unreadable_literal)] 57 | pub const INDEX_MAGIC: u64 = 0x5845444e49514256; 58 | /// Magic number to designate end of index (INDEXEND) 59 | #[allow(clippy::unreadable_literal)] 60 | pub const INDEX_END_MAGIC: u64 = 0x444E455845444E49; 61 | /// Index Block Reservation 62 | pub const INDEX_RESERVATION: [u8; 4] = [42; 4]; 63 | 64 | /// Descriptor of the dimensions of a block in a VBINSEQ file 65 | /// 66 | /// A `BlockRange` contains metadata about a single block within a VBINSEQ file, 67 | /// including its position, size, and record count. This information enables 68 | /// efficient random access to blocks without scanning the entire file. 69 | /// 70 | /// Block ranges are stored in a `BlockIndex` to form a complete index of a VBINSEQ file. 71 | /// Each range is serialized to a fixed-size 32-byte structure when stored in the embedded index. 
72 | /// 73 | /// ## Format Changes (v0.7.0+) 74 | /// 75 | /// - `cumulative_records` field changed from `u32` to `u64` 76 | /// - Supports files with more than 4 billion records 77 | /// - Reserved bytes reduced from 8 to 4 bytes 78 | /// 79 | /// # Examples 80 | /// 81 | /// ```rust 82 | /// use binseq::vbq::BlockRange; 83 | /// 84 | /// // Create a new block range 85 | /// let range = BlockRange::new( 86 | /// 1024, // Starting offset in the file (bytes) 87 | /// 8192, // Length of the block (bytes) 88 | /// 1000, // Number of records in this block 89 | /// 5000 // Cumulative number of records up to this block (now u64) 90 | /// ); 91 | /// 92 | /// // Use the range information 93 | /// println!("Block starts at byte {}", range.start_offset); 94 | /// println!("Block contains {} records", range.block_records); 95 | /// ``` 96 | #[derive(Debug, Clone, Copy)] 97 | pub struct BlockRange { 98 | /// File offset where the block starts (in bytes, including headers) 99 | /// 100 | /// This is the absolute byte position in the file where this block begins, 101 | /// including the file header and block header. 102 | /// 103 | /// (8 bytes in serialized form) 104 | pub start_offset: u64, 105 | 106 | /// Length of the block data in bytes 107 | /// 108 | /// This is the size of the block data, not including the block header. 109 | /// For compressed blocks, this is the compressed size. 110 | /// 111 | /// (8 bytes in serialized form) 112 | pub len: u64, 113 | 114 | /// Number of records contained in this block 115 | /// 116 | /// (4 bytes in serialized form) 117 | pub block_records: u32, 118 | 119 | /// Cumulative number of records up to this block 120 | /// 121 | /// This allows efficient determination of which block contains a specific record 122 | /// by index without scanning through all previous blocks. 123 | /// 124 | /// **BREAKING CHANGE (v0.7.0+)**: Changed from u32 to u64 to support files 125 | /// with more than 4 billion records. 
126 | /// 127 | /// (8 bytes in serialized form) 128 | pub cumulative_records: u64, 129 | 130 | /// Reserved bytes for future extensions 131 | pub reservation: [u8; 4], 132 | } 133 | impl BlockRange { 134 | /// Creates a new `BlockRange` with the specified parameters 135 | /// 136 | /// # Parameters 137 | /// 138 | /// * `start_offset` - The byte offset in the file where this block starts 139 | /// * `len` - The length of the block data in bytes 140 | /// * `block_records` - The number of records contained in this block 141 | /// * `cumulative_records` - The total number of records up to and including this block 142 | /// 143 | /// # Returns 144 | /// 145 | /// A new `BlockRange` instance with the specified parameters 146 | /// 147 | /// # Examples 148 | /// 149 | /// ```rust 150 | /// use binseq::vbq::BlockRange; 151 | /// 152 | /// // Create a new block range for a block starting at byte 1024 153 | /// let range = BlockRange::new(1024, 8192, 1000, 5000); 154 | /// ``` 155 | #[must_use] 156 | pub fn new(start_offset: u64, len: u64, block_records: u32, cumulative_records: u64) -> Self { 157 | Self { 158 | start_offset, 159 | len, 160 | block_records, 161 | cumulative_records, 162 | reservation: INDEX_RESERVATION, 163 | } 164 | } 165 | 166 | /// Serializes the block range to a binary format and writes it to the provided writer 167 | /// 168 | /// This method serializes the `BlockRange` to a fixed-size 32-byte structure and 169 | /// writes it to the provided writer. 
The serialized format is: 170 | /// - Bytes 0-7: `start_offset` (u64, little endian) 171 | /// - Bytes 8-15: len (u64, little endian) 172 | /// - Bytes 16-19: `block_records` (u32, little endian) 173 | /// - Bytes 20-23: `cumulative_records` (u32, little endian) 174 | /// - Bytes 24-31: reservation (8 bytes) 175 | /// 176 | /// # Parameters 177 | /// 178 | /// * `writer` - The destination to write the serialized block range to 179 | /// 180 | /// # Returns 181 | /// 182 | /// * `Ok(())` - If the block range was successfully written 183 | /// * `Err(_)` - If an error occurred during writing 184 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 185 | let mut buf = [0; SIZE_BLOCK_RANGE]; 186 | LittleEndian::write_u64(&mut buf[0..8], self.start_offset); 187 | LittleEndian::write_u64(&mut buf[8..16], self.len); 188 | LittleEndian::write_u32(&mut buf[16..20], self.block_records); 189 | LittleEndian::write_u64(&mut buf[20..28], self.cumulative_records); 190 | buf[28..].copy_from_slice(&self.reservation); 191 | writer.write_all(&buf)?; 192 | Ok(()) 193 | } 194 | 195 | /// Deserializes a `BlockRange` from a fixed-size buffer 196 | /// 197 | /// This method deserializes a `BlockRange` from a 32-byte buffer in the format 198 | /// used by `write_bytes`. It's typically used when reading an index file. 
199 | /// 200 | /// # Parameters 201 | /// 202 | /// * `buffer` - A fixed-size buffer containing a serialized `BlockRange` 203 | /// 204 | /// # Returns 205 | /// 206 | /// A new `BlockRange` with the values read from the buffer 207 | /// 208 | /// # Format 209 | /// 210 | /// The buffer is expected to contain: 211 | /// - Bytes 0-7: `start_offset` (u64, little endian) 212 | /// - Bytes 8-15: len (u64, little endian) 213 | /// - Bytes 16-19: `block_records` (u32, little endian) 214 | /// - Bytes 20-27: `cumulative_records` (u64, little endian) 215 | /// - Bytes 28-31: reservation (ignored, default value used) 216 | #[must_use] 217 | pub fn from_exact(buffer: &[u8; SIZE_BLOCK_RANGE]) -> Self { 218 | Self { 219 | start_offset: LittleEndian::read_u64(&buffer[0..8]), 220 | len: LittleEndian::read_u64(&buffer[8..16]), 221 | block_records: LittleEndian::read_u32(&buffer[16..20]), 222 | cumulative_records: LittleEndian::read_u64(&buffer[20..28]), 223 | reservation: INDEX_RESERVATION, 224 | } 225 | } 226 | 227 | /// Deserializes a `BlockRange` from a slice of bytes 228 | /// 229 | /// This is a convenience method that copies the first 32 bytes from the provided slice 230 | /// into a fixed-size buffer and then calls `from_exact`. It's useful when reading from 231 | /// a larger buffer that contains multiple serialized `BlockRange` instances. 232 | /// 233 | /// # Parameters 234 | /// 235 | /// * `buffer` - A slice containing at least 32 bytes with a serialized `BlockRange` 236 | /// 237 | /// # Returns 238 | /// 239 | /// A new `BlockRange` with the values read from the buffer 240 | /// 241 | /// # Panics 242 | /// 243 | /// This method will panic if the buffer is less than 32 bytes long. 
244 | #[must_use] 245 | pub fn from_bytes(buffer: &[u8]) -> Self { 246 | let mut buf = [0; SIZE_BLOCK_RANGE]; 247 | buf.copy_from_slice(buffer); 248 | Self::from_exact(&buf) 249 | } 250 | } 251 | 252 | /// Header for a VBINSEQ index file 253 | /// 254 | /// The `IndexHeader` contains metadata about an index file, including a magic number 255 | /// for validation and the size of the indexed file. This allows verifying that an index 256 | /// file matches its corresponding VBINSEQ file. 257 | /// 258 | /// The header has a fixed size of 32 bytes to ensure compatibility across versions. 259 | #[derive(Debug, Clone, Copy)] 260 | pub struct IndexHeader { 261 | /// Magic number to designate the index file ("VBQINDEX" in ASCII) 262 | /// 263 | /// This is used to verify that a file is indeed a VBINSEQ index file. 264 | /// (8 bytes in serialized form) 265 | magic: u64, 266 | 267 | /// Total size of the indexed VBINSEQ file in bytes 268 | /// 269 | /// This is used to verify that the index matches the file it references. 270 | /// (8 bytes in serialized form) 271 | bytes: u64, 272 | 273 | /// Reserved bytes for future extensions 274 | /// 275 | /// (16 bytes in serialized form) 276 | reserved: [u8; INDEX_HEADER_SIZE - 16], 277 | } 278 | impl IndexHeader { 279 | /// Creates a new index header for a VBINSEQ file of the specified size 280 | /// 281 | /// # Parameters 282 | /// 283 | /// * `bytes` - The total size of the VBINSEQ file being indexed, in bytes 284 | /// 285 | /// # Returns 286 | /// 287 | /// A new `IndexHeader` instance with the appropriate magic number and size 288 | pub fn new(bytes: u64) -> Self { 289 | Self { 290 | magic: INDEX_MAGIC, 291 | bytes, 292 | reserved: [42; INDEX_HEADER_SIZE - 16], 293 | } 294 | } 295 | /// Reads an index header from the provided reader 296 | /// 297 | /// This method reads 32 bytes from the provided reader and deserializes them 298 | /// into an `IndexHeader`. 
It validates the magic number to ensure that the file 299 | /// is indeed a VBINSEQ index file. 300 | /// 301 | /// # Parameters 302 | /// 303 | /// * `reader` - The source from which to read the header 304 | /// 305 | /// # Returns 306 | /// 307 | /// * `Ok(Self)` - If the header was successfully read and has a valid magic number 308 | /// * `Err(_)` - If an error occurred during reading or the magic number is invalid 309 | /// 310 | /// # Format 311 | /// 312 | /// The header is expected to be 32 bytes with the following structure: 313 | /// - Bytes 0-7: magic number (u64, little endian, must be `INDEX_MAGIC`) 314 | /// - Bytes 8-15: file size in bytes (u64, little endian) 315 | /// - Bytes 16-31: reserved for future extensions 316 | pub fn from_reader(reader: &mut R) -> Result { 317 | let mut buffer = [0; INDEX_HEADER_SIZE]; 318 | reader.read_exact(&mut buffer)?; 319 | let magic = LittleEndian::read_u64(&buffer[0..8]); 320 | let bytes = LittleEndian::read_u64(&buffer[8..16]); 321 | let Ok(reserved) = buffer[16..INDEX_HEADER_SIZE].try_into() else { 322 | return Err(IndexError::InvalidReservedBytes.into()); 323 | }; 324 | if magic != INDEX_MAGIC { 325 | return Err(IndexError::InvalidMagicNumber(magic).into()); 326 | } 327 | Ok(Self { 328 | magic, 329 | bytes, 330 | reserved, 331 | }) 332 | } 333 | 334 | pub fn from_bytes(bytes: &[u8]) -> Result { 335 | let mut buffer = [0; INDEX_HEADER_SIZE]; 336 | buffer.copy_from_slice(&bytes[..INDEX_HEADER_SIZE]); 337 | Self::from_reader(&mut Cursor::new(buffer)) 338 | } 339 | 340 | /// Serializes the index header to a binary format and writes it to the provided writer 341 | /// 342 | /// This method serializes the `IndexHeader` to a fixed-size 32-byte structure and 343 | /// writes it to the provided writer. This is typically used when saving an index to a file. 
344 | /// 345 | /// # Parameters 346 | /// 347 | /// * `writer` - The destination to write the serialized header to 348 | /// 349 | /// # Returns 350 | /// 351 | /// * `Ok(())` - If the header was successfully written 352 | /// * `Err(_)` - If an error occurred during writing 353 | /// 354 | /// # Format 355 | /// 356 | /// The header is serialized as: 357 | /// - Bytes 0-7: magic number (u64, little endian) 358 | /// - Bytes 8-15: file size in bytes (u64, little endian) 359 | /// - Bytes 16-31: reserved for future extensions 360 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 361 | let mut buffer = [0; INDEX_HEADER_SIZE]; 362 | LittleEndian::write_u64(&mut buffer[0..8], self.magic); 363 | LittleEndian::write_u64(&mut buffer[8..16], self.bytes); 364 | buffer[16..].copy_from_slice(&self.reserved); 365 | writer.write_all(&buffer)?; 366 | Ok(()) 367 | } 368 | } 369 | 370 | /// Complete index for a VBINSEQ file 371 | /// 372 | /// A `BlockIndex` contains metadata about a VBINSEQ file and all of its blocks, 373 | /// enabling efficient random access and parallel processing. It consists of an 374 | /// `IndexHeader` and a collection of `BlockRange` entries, one for each block in 375 | /// the file. 376 | /// 377 | /// The index can be created by scanning a VBINSEQ file or loaded from a previously 378 | /// created index file. Once loaded, it provides information about block locations, 379 | /// sizes, and record counts. 
380 | /// 381 | /// # Examples 382 | /// 383 | /// ```rust,no_run 384 | /// use binseq::vbq::{BlockIndex, MmapReader}; 385 | /// use std::path::Path; 386 | /// 387 | /// // Create an index from a VBINSEQ file 388 | /// let vbq_path = Path::new("example.vbq"); 389 | /// let index = BlockIndex::from_vbq(vbq_path).unwrap(); 390 | /// 391 | /// // Save the index for future use 392 | /// let index_path = Path::new("example.vbq.vqi"); 393 | /// index.save_to_path(index_path).unwrap(); 394 | /// 395 | /// // Use the index with a reader for parallel processing 396 | /// let reader = MmapReader::new(vbq_path).unwrap(); 397 | /// println!("File contains {} blocks", index.n_blocks()); 398 | /// ``` 399 | #[derive(Debug, Clone)] 400 | pub struct BlockIndex { 401 | /// Header containing metadata about the indexed file 402 | pub(crate) header: IndexHeader, 403 | 404 | /// Collection of block ranges, one for each block in the file 405 | pub(crate) ranges: Vec, 406 | } 407 | impl BlockIndex { 408 | /// Creates a new empty block index with the specified header 409 | /// 410 | /// # Parameters 411 | /// 412 | /// * `header` - The index header containing metadata about the indexed file 413 | /// 414 | /// # Returns 415 | /// 416 | /// A new empty `BlockIndex` instance 417 | #[must_use] 418 | pub fn new(header: IndexHeader) -> Self { 419 | Self { 420 | header, 421 | ranges: Vec::default(), 422 | } 423 | } 424 | /// Returns the number of blocks in the indexed file 425 | /// 426 | /// # Returns 427 | /// 428 | /// The number of blocks in the VBINSEQ file described by this index 429 | /// 430 | /// # Examples 431 | /// 432 | /// ```rust,no_run 433 | /// use binseq::vbq::BlockIndex; 434 | /// use std::path::Path; 435 | /// 436 | /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); 437 | /// println!("The file contains {} blocks", index.n_blocks()); 438 | /// ``` 439 | #[must_use] 440 | pub fn n_blocks(&self) -> usize { 441 | self.ranges.len() 442 | } 443 | 444 | 
/// Writes the collection of `BlockRange` to a file 445 | /// Saves the index to a file 446 | /// 447 | /// This writes the index header and all block ranges to a file, which can be loaded 448 | /// later to avoid rescanning the VBINSEQ file. The index is compressed to reduce 449 | /// storage space. 450 | /// 451 | /// # Parameters 452 | /// 453 | /// * `path` - The path where the index file should be saved 454 | /// 455 | /// # Returns 456 | /// 457 | /// * `Ok(())` - If the index was successfully saved 458 | /// * `Err(_)` - If an error occurred during saving 459 | /// 460 | /// # Examples 461 | /// 462 | /// ```rust,no_run 463 | /// use binseq::vbq::BlockIndex; 464 | /// use std::path::Path; 465 | /// 466 | /// // Create an index from a VBINSEQ file 467 | /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); 468 | /// 469 | /// // Save it for future use 470 | /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); 471 | /// ``` 472 | pub fn save_to_path>(&self, path: P) -> Result<()> { 473 | let mut writer = File::create(path).map(BufWriter::new)?; 474 | self.header.write_bytes(&mut writer)?; 475 | let mut writer = Encoder::new(writer, 3)?.auto_finish(); 476 | self.write_range(&mut writer)?; 477 | writer.flush()?; 478 | Ok(()) 479 | } 480 | 481 | /// Write the index to an output buffer 482 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 483 | self.header.write_bytes(writer)?; 484 | let mut writer = Encoder::new(writer, 3)?.auto_finish(); 485 | self.write_range(&mut writer)?; 486 | writer.flush()?; 487 | Ok(()) 488 | } 489 | 490 | /// Write the collection of `BlockRange` to an output handle 491 | /// Writes all block ranges to the provided writer 492 | /// 493 | /// This method is used internally by `save_to_path` to write the block ranges 494 | /// to an index file. It can also be used to serialize an index to any destination 495 | /// that implements `Write`. 
496 | /// 497 | /// # Parameters 498 | /// 499 | /// * `writer` - The destination to write the block ranges to 500 | /// 501 | /// # Returns 502 | /// 503 | /// * `Ok(())` - If all block ranges were successfully written 504 | /// * `Err(_)` - If an error occurred during writing 505 | pub fn write_range(&self, writer: &mut W) -> Result<()> { 506 | self.ranges 507 | .iter() 508 | .filter(|range| range.block_records > 0) 509 | .try_for_each(|range| -> Result<()> { range.write_bytes(writer) }) 510 | } 511 | 512 | /// Adds a block range to the index 513 | /// 514 | /// This method is used internally during index creation to add information 515 | /// about each block in the file. Blocks are typically added in order. 516 | /// 517 | /// # Parameters 518 | /// 519 | /// * `range` - The block range to add to the index 520 | fn add_range(&mut self, range: BlockRange) { 521 | self.ranges.push(range); 522 | } 523 | 524 | /// Creates a new index by scanning a VBINSEQ file 525 | /// 526 | /// This method memory-maps the specified VBINSEQ file and scans it block by block 527 | /// to create an index. The index can then be saved to a file for future use, enabling 528 | /// efficient random access without rescanning the file. 
529 | /// 530 | /// # Parameters 531 | /// 532 | /// * `path` - Path to the VBINSEQ file to index 533 | /// 534 | /// # Returns 535 | /// 536 | /// * `Ok(Self)` - A new `BlockIndex` containing information about all blocks in the file 537 | /// * `Err(_)` - If an error occurred during file opening, validation, or scanning 538 | /// 539 | /// # Examples 540 | /// 541 | /// ```rust,no_run 542 | /// use binseq::vbq::BlockIndex; 543 | /// use std::path::Path; 544 | /// 545 | /// // Create an index from a VBINSEQ file 546 | /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); 547 | /// 548 | /// // Save the index for future use 549 | /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); 550 | /// 551 | /// // Get statistics about the file 552 | /// println!("File contains {} blocks", index.n_blocks()); 553 | /// 554 | /// // Analyze the record distribution 555 | /// if let Some(last_range) = index.ranges().last() { 556 | /// println!("Total records: {}", last_range.cumulative_records); 557 | /// println!("Average records per block: {}", 558 | /// last_range.cumulative_records as f64 / index.n_blocks() as f64); 559 | /// } 560 | /// ``` 561 | /// 562 | /// # Notes 563 | /// 564 | /// This method uses memory mapping for efficiency, which allows the operating system 565 | /// to load only the needed portions of the file into memory as they are accessed. 566 | pub fn from_vbq>(path: P) -> Result { 567 | let file = File::open(path)?; 568 | let mmap = unsafe { memmap2::Mmap::map(&file)? }; 569 | let file_size = mmap.len(); 570 | 571 | // Read header from mapped memory (unused but checks for validity) 572 | let _header = { 573 | let mut header_bytes = [0u8; SIZE_HEADER]; 574 | header_bytes.copy_from_slice(&mmap[..SIZE_HEADER]); 575 | VBinseqHeader::from_bytes(&header_bytes)? 
576 | }; 577 | 578 | // Initialize position after the header 579 | let mut pos = SIZE_HEADER; 580 | 581 | // Initialize the collection 582 | let index_header = IndexHeader::new(file_size as u64); 583 | let mut index = BlockIndex::new(index_header); 584 | 585 | // Find all block headers 586 | let mut record_total = 0; 587 | while pos < mmap.len() { 588 | let block_header = { 589 | let mut header_bytes = [0u8; SIZE_BLOCK_HEADER]; 590 | header_bytes.copy_from_slice(&mmap[pos..pos + SIZE_BLOCK_HEADER]); 591 | BlockHeader::from_bytes(&header_bytes)? 592 | }; 593 | index.add_range(BlockRange::new( 594 | pos as u64, 595 | block_header.size, 596 | block_header.records, 597 | record_total, 598 | )); 599 | pos += SIZE_BLOCK_HEADER + block_header.size as usize; 600 | record_total += u64::from(block_header.records); 601 | } 602 | 603 | Ok(index) 604 | } 605 | 606 | /// Reads an index from a path 607 | /// 608 | /// # Panics 609 | /// Panics if the path is not a valid UTF-8 string. 610 | pub fn from_path>(path: P) -> Result { 611 | let Some(upstream_file) = path.as_ref().to_str().unwrap().strip_suffix(".vqi") else { 612 | return Err(IndexError::MissingUpstreamFile( 613 | path.as_ref().to_string_lossy().to_string(), 614 | ) 615 | .into()); 616 | }; 617 | let upstream_handle = File::open(upstream_file)?; 618 | let mmap = unsafe { memmap2::Mmap::map(&upstream_handle)? 
}; 619 | let file_size = mmap.len() as u64; 620 | 621 | let mut file_handle = File::open(path).map(BufReader::new)?; 622 | let index_header = IndexHeader::from_reader(&mut file_handle)?; 623 | if index_header.bytes != file_size { 624 | return Err(IndexError::ByteSizeMismatch(file_size, index_header.bytes).into()); 625 | } 626 | let buffer = { 627 | let mut buffer = Vec::new(); 628 | let mut decoder = Decoder::new(file_handle)?; 629 | decoder.read_to_end(&mut buffer)?; 630 | buffer 631 | }; 632 | 633 | let mut ranges = Self::new(index_header); 634 | let mut pos = 0; 635 | while pos < buffer.len() { 636 | let bound = pos + SIZE_BLOCK_RANGE; 637 | let range = BlockRange::from_bytes(&buffer[pos..bound]); 638 | ranges.add_range(range); 639 | pos += SIZE_BLOCK_RANGE; 640 | } 641 | 642 | Ok(ranges) 643 | } 644 | 645 | pub fn from_bytes(bytes: &[u8]) -> Result { 646 | let index_header = IndexHeader::from_bytes(bytes)?; 647 | let buffer = { 648 | let mut buffer = Vec::new(); 649 | let mut decoder = Decoder::new(Cursor::new(&bytes[INDEX_HEADER_SIZE..]))?; 650 | decoder.read_to_end(&mut buffer)?; 651 | buffer 652 | }; 653 | 654 | let mut ranges = Self::new(index_header); 655 | let mut pos = 0; 656 | while pos < buffer.len() { 657 | let bound = pos + SIZE_BLOCK_RANGE; 658 | let range = BlockRange::from_bytes(&buffer[pos..bound]); 659 | ranges.add_range(range); 660 | pos += SIZE_BLOCK_RANGE; 661 | } 662 | 663 | Ok(ranges) 664 | } 665 | 666 | /// Get a reference to the internal ranges 667 | /// Returns a reference to the collection of block ranges 668 | /// 669 | /// This provides access to the metadata for all blocks in the indexed file, 670 | /// which can be used for operations like parallel processing or random access. 
671 | /// 672 | /// # Returns 673 | /// 674 | /// A slice containing all `BlockRange` entries in this index 675 | /// 676 | /// # Examples 677 | /// 678 | /// ```rust,no_run 679 | /// use binseq::vbq::BlockIndex; 680 | /// use std::path::Path; 681 | /// 682 | /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); 683 | /// 684 | /// // Examine the ranges to determine which blocks to process 685 | /// for (i, range) in index.ranges().iter().enumerate() { 686 | /// println!("Block {}: {} records at offset {}", 687 | /// i, range.block_records, range.start_offset); 688 | /// } 689 | /// ``` 690 | #[must_use] 691 | pub fn ranges(&self) -> &[BlockRange] { 692 | &self.ranges 693 | } 694 | 695 | pub fn pprint(&self) { 696 | self.ranges.iter().for_each(|range| { 697 | println!( 698 | "{}\t{}\t{}\t{}", 699 | range.start_offset, range.len, range.block_records, range.cumulative_records 700 | ); 701 | }); 702 | } 703 | 704 | /// Returns the total number of records in the dataset 705 | #[must_use] 706 | pub fn num_records(&self) -> usize { 707 | self.ranges 708 | .iter() 709 | .next_back() 710 | .map(|r| (r.cumulative_records + u64::from(r.block_records)) as usize) 711 | .unwrap_or_default() 712 | } 713 | } 714 | -------------------------------------------------------------------------------- /src/bq/reader.rs: -------------------------------------------------------------------------------- 1 | //! Binary sequence reader module 2 | //! 3 | //! This module provides functionality for reading binary sequence files using either: 4 | //! 1. Memory mapping for efficient access to entire files 5 | //! 2. Streaming for processing data as it arrives 6 | //! 7 | //! It supports both sequential and parallel processing of records, 8 | //! with configurable record layouts for different sequence types. 
9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::ops::Range; 13 | use std::path::Path; 14 | use std::sync::Arc; 15 | 16 | use bitnuc::BitSize; 17 | use bytemuck::cast_slice; 18 | use memmap2::Mmap; 19 | 20 | use super::header::{BinseqHeader, SIZE_HEADER}; 21 | use crate::{ 22 | error::{ReadError, Result}, 23 | BinseqRecord, Error, ParallelProcessor, ParallelReader, 24 | }; 25 | 26 | /// A reference to a binary sequence record in a memory-mapped file 27 | /// 28 | /// This struct provides a view into a single record within a binary sequence file, 29 | /// allowing access to the record's components (sequence data, flags, etc.) without 30 | /// copying the data from the memory-mapped file. 31 | /// 32 | /// The record's data is stored in a compact binary format where: 33 | /// - The first u64 contains flags 34 | /// - Subsequent u64s contain the primary sequence data 35 | /// - If present, final u64s contain the extended sequence data 36 | #[derive(Clone, Copy)] 37 | pub struct RefRecord<'a> { 38 | /// The position (index) of this record in the file (0-based record index, not byte offset) 39 | id: u64, 40 | /// The underlying u64 buffer representing the record's binary data 41 | buffer: &'a [u64], 42 | /// The configuration that defines the layout and size of record components 43 | config: RecordConfig, 44 | /// Cached index string for the sequence header 45 | header_buf: [u8; 20], 46 | /// Length of the header in bytes 47 | header_len: usize, 48 | } 49 | impl<'a> RefRecord<'a> { 50 | /// Creates a new record reference 51 | /// 52 | /// # Arguments 53 | /// 54 | /// * `id` - The record's position in the file (0-based record index, not byte offset) 55 | /// * `buffer` - The u64 slice containing the record's binary data 56 | /// * `config` - Configuration defining the record's layout 57 | /// 58 | /// # Panics 59 | /// 60 | /// Panics if the buffer length doesn't match the expected size from the config 61 | #[must_use] 62 | pub fn new(id: u64, buffer: 
&'a [u64], config: RecordConfig) -> Self { 63 | assert_eq!(buffer.len(), config.record_size_u64()); 64 | Self { 65 | id, 66 | buffer, 67 | config, 68 | header_buf: [0; 20], 69 | header_len: 0, 70 | } 71 | } 72 | /// Returns the record's configuration 73 | /// 74 | /// The configuration defines the layout and size of the record's components. 75 | #[must_use] 76 | pub fn config(&self) -> RecordConfig { 77 | self.config 78 | } 79 | 80 | pub fn set_id(&mut self, id: &[u8]) { 81 | self.header_len = id.len(); 82 | self.header_buf[..self.header_len].copy_from_slice(id); 83 | } 84 | } 85 | 86 | impl BinseqRecord for RefRecord<'_> { 87 | fn bitsize(&self) -> BitSize { 88 | self.config.bitsize 89 | } 90 | fn index(&self) -> u64 { 91 | self.id 92 | } 93 | /// Clear the buffer and fill it with the sequence header 94 | fn sheader(&self) -> &[u8] { 95 | &self.header_buf[..self.header_len] 96 | } 97 | 98 | /// Clear the buffer and fill it with the extended header 99 | fn xheader(&self) -> &[u8] { 100 | self.sheader() 101 | } 102 | 103 | fn flag(&self) -> Option { 104 | if self.config.flags { 105 | Some(self.buffer[0]) 106 | } else { 107 | None 108 | } 109 | } 110 | fn slen(&self) -> u64 { 111 | self.config.slen 112 | } 113 | fn xlen(&self) -> u64 { 114 | self.config.xlen 115 | } 116 | fn sbuf(&self) -> &[u64] { 117 | if self.config.flags { 118 | &self.buffer[1..=(self.config.schunk as usize)] 119 | } else { 120 | &self.buffer[..(self.config.schunk as usize)] 121 | } 122 | } 123 | fn xbuf(&self) -> &[u64] { 124 | if self.config.flags { 125 | &self.buffer[1 + self.config.schunk as usize..] 126 | } else { 127 | &self.buffer[self.config.schunk as usize..] 
128 | } 129 | } 130 | } 131 | 132 | /// A reference to a record in the map with a precomputed decoded buffer slice 133 | pub struct BatchRecord<'a> { 134 | /// Unprocessed buffer slice (with flags) 135 | buffer: &'a [u64], 136 | /// Decoded buffer slice 137 | dbuf: &'a [u8], 138 | /// Record ID 139 | id: u64, 140 | /// The configuration that defines the layout and size of record components 141 | config: RecordConfig, 142 | /// Cached index string for the sequence header 143 | header_buf: [u8; 20], 144 | /// Length of the header in bytes 145 | header_len: usize, 146 | } 147 | impl BinseqRecord for BatchRecord<'_> { 148 | fn bitsize(&self) -> BitSize { 149 | self.config.bitsize 150 | } 151 | fn index(&self) -> u64 { 152 | self.id 153 | } 154 | /// Clear the buffer and fill it with the sequence header 155 | fn sheader(&self) -> &[u8] { 156 | &self.header_buf[..self.header_len] 157 | } 158 | 159 | /// Clear the buffer and fill it with the extended header 160 | fn xheader(&self) -> &[u8] { 161 | self.sheader() 162 | } 163 | 164 | fn flag(&self) -> Option { 165 | if self.config.flags { 166 | Some(self.buffer[0]) 167 | } else { 168 | None 169 | } 170 | } 171 | fn slen(&self) -> u64 { 172 | self.config.slen 173 | } 174 | fn xlen(&self) -> u64 { 175 | self.config.xlen 176 | } 177 | fn sbuf(&self) -> &[u64] { 178 | if self.config.flags { 179 | &self.buffer[1..=(self.config.schunk as usize)] 180 | } else { 181 | &self.buffer[..(self.config.schunk as usize)] 182 | } 183 | } 184 | fn xbuf(&self) -> &[u64] { 185 | if self.config.flags { 186 | &self.buffer[1 + self.config.schunk as usize..] 187 | } else { 188 | &self.buffer[self.config.schunk as usize..] 
189 | } 190 | } 191 | fn decode_s(&self, dbuf: &mut Vec) -> Result<()> { 192 | dbuf.extend_from_slice(self.sseq()); 193 | Ok(()) 194 | } 195 | fn decode_x(&self, dbuf: &mut Vec) -> Result<()> { 196 | dbuf.extend_from_slice(self.xseq()); 197 | Ok(()) 198 | } 199 | /// Override this method since we can make use of block information 200 | fn sseq(&self) -> &[u8] { 201 | let scalar = self.config.scalar(); 202 | let mut lbound = 0; 203 | let mut rbound = self.config.slen(); 204 | if self.config.flags { 205 | lbound += scalar; 206 | rbound += scalar; 207 | } 208 | &self.dbuf[lbound..rbound] 209 | } 210 | /// Override this method since we can make use of block information 211 | fn xseq(&self) -> &[u8] { 212 | let scalar = self.config.scalar(); 213 | let mut lbound = scalar * self.config.schunk(); 214 | let mut rbound = lbound + self.config.xlen(); 215 | if self.config.flags { 216 | lbound += scalar; 217 | rbound += scalar; 218 | } 219 | &self.dbuf[lbound..rbound] 220 | } 221 | } 222 | 223 | /// Configuration for binary sequence record layout 224 | /// 225 | /// This struct defines the size and layout of binary sequence records, 226 | /// including both primary sequence data and optional extended data. 227 | /// It handles the translation between sequence lengths in base pairs 228 | /// and the number of u64 chunks needed to store the compressed data. 
229 | #[derive(Clone, Copy)] 230 | pub struct RecordConfig { 231 | /// The primary sequence length in base pairs 232 | slen: u64, 233 | /// The extended sequence length in base pairs 234 | xlen: u64, 235 | /// The number of u64 chunks needed to store the primary sequence 236 | /// (each u64 stores 32 nucleotides) 237 | schunk: u64, 238 | /// The number of u64 chunks needed to store the extended sequence 239 | /// (each u64 stores 32 values) 240 | xchunk: u64, 241 | /// The bitsize of the record 242 | bitsize: BitSize, 243 | /// Whether flags are present 244 | flags: bool, 245 | } 246 | impl RecordConfig { 247 | /// Creates a new record configuration 248 | /// 249 | /// This constructor initializes a configuration for a binary sequence record 250 | /// with specified primary and extended sequence lengths. 251 | /// 252 | /// # Arguments 253 | /// 254 | /// * `slen` - The length of primary sequences in the file 255 | /// * `xlen` - The length of secondary/extended sequences in the file 256 | /// * `bitsize` - The bitsize of the record 257 | /// * `flags` - Whether flags are present 258 | /// 259 | /// # Returns 260 | /// 261 | /// A new `RecordConfig` instance with the specified sequence lengths 262 | pub fn new(slen: usize, xlen: usize, bitsize: BitSize, flags: bool) -> Self { 263 | let (schunk, xchunk) = match bitsize { 264 | BitSize::Two => (slen.div_ceil(32), xlen.div_ceil(32)), 265 | BitSize::Four => (slen.div_ceil(16), xlen.div_ceil(16)), 266 | }; 267 | Self { 268 | slen: slen as u64, 269 | xlen: xlen as u64, 270 | schunk: schunk as u64, 271 | xchunk: xchunk as u64, 272 | bitsize, 273 | flags, 274 | } 275 | } 276 | 277 | /// Creates a new record configuration from a header 278 | /// 279 | /// This constructor initializes a configuration based on a header that contains 280 | /// the sequence lengths for primary and extended sequences. 
281 | /// 282 | /// # Arguments 283 | /// 284 | /// * `header` - A reference to a `BinseqHeader` containing sequence lengths 285 | /// 286 | /// # Returns 287 | /// 288 | /// A new `RecordConfig` instance with the sequence lengths from the header 289 | pub fn from_header(header: &BinseqHeader) -> Self { 290 | Self::new( 291 | header.slen as usize, 292 | header.xlen as usize, 293 | header.bits, 294 | header.flags, 295 | ) 296 | } 297 | 298 | /// Returns whether this record contains extended sequence data 299 | /// 300 | /// A record is considered paired if it has a non-zero extended sequence length. 301 | pub fn paired(&self) -> bool { 302 | self.xlen > 0 303 | } 304 | 305 | /// Returns the primary sequence length in base pairs 306 | /// 307 | /// This method returns the length of the primary sequence in base pairs. 308 | pub fn slen(&self) -> usize { 309 | self.slen as usize 310 | } 311 | 312 | /// Returns the extended sequence length in base pairs 313 | /// 314 | /// This method returns the length of the extended sequence in base pairs. 315 | pub fn xlen(&self) -> usize { 316 | self.xlen as usize 317 | } 318 | 319 | /// Returns the number of u64 chunks needed to store the primary sequence 320 | /// 321 | /// This method returns the number of u64 chunks required to store the primary 322 | /// sequence, where each u64 stores 32 nucleotides. 323 | pub fn schunk(&self) -> usize { 324 | self.schunk as usize 325 | } 326 | 327 | /// Returns the number of u64 chunks needed to store the extended sequence 328 | /// 329 | /// This method returns the number of u64 chunks required to store the extended 330 | /// sequence, where each u64 stores 32 values. 
331 | pub fn xchunk(&self) -> usize { 332 | self.xchunk as usize 333 | } 334 | 335 | /// Returns the full record size in bytes (u8): 336 | /// 8 * (schunk + xchunk + 1 (flag)) 337 | pub fn record_size_bytes(&self) -> usize { 338 | 8 * self.record_size_u64() 339 | } 340 | 341 | /// Returns the full record size in u64 342 | /// schunk + xchunk + 1 (flag) 343 | pub fn record_size_u64(&self) -> usize { 344 | if self.flags { 345 | (self.schunk + self.xchunk + 1) as usize 346 | } else { 347 | (self.schunk + self.xchunk) as usize 348 | } 349 | } 350 | 351 | /// The number of nucleotides per word 352 | pub fn scalar(&self) -> usize { 353 | match self.bitsize { 354 | BitSize::Two => 32, 355 | BitSize::Four => 16, 356 | } 357 | } 358 | } 359 | 360 | /// A memory-mapped reader for binary sequence files 361 | /// 362 | /// This reader provides efficient access to binary sequence files by memory-mapping 363 | /// them instead of performing traditional I/O operations. It supports both 364 | /// sequential access to individual records and parallel processing of records 365 | /// across multiple threads. 366 | /// 367 | /// The reader ensures thread-safety through the use of `Arc` for sharing the 368 | /// memory-mapped data between threads. 369 | /// 370 | /// Records are returned as [`RefRecord`] which implement the [`BinseqRecord`] trait. 
371 | /// 372 | /// # Examples 373 | /// 374 | /// ``` 375 | /// use binseq::bq::MmapReader; 376 | /// use binseq::Result; 377 | /// 378 | /// fn main() -> Result<()> { 379 | /// let path = "./data/subset.bq"; 380 | /// let reader = MmapReader::new(path)?; 381 | /// 382 | /// // Calculate the number of records in the file 383 | /// let num_records = reader.num_records(); 384 | /// println!("Number of records: {}", num_records); 385 | /// 386 | /// // Get the record at index 20 (0-indexed) 387 | /// let record = reader.get(20)?; 388 | /// 389 | /// Ok(()) 390 | /// } 391 | /// ``` 392 | pub struct MmapReader { 393 | /// Memory mapped file contents, wrapped in Arc for thread-safe sharing 394 | mmap: Arc, 395 | 396 | /// Binary sequence file header containing format information 397 | header: BinseqHeader, 398 | 399 | /// Configuration defining the layout of records in the file 400 | config: RecordConfig, 401 | } 402 | 403 | impl MmapReader { 404 | /// Creates a new memory-mapped reader for a binary sequence file 405 | /// 406 | /// This method opens the file, memory-maps its contents, and validates 407 | /// the file structure to ensure it contains valid binary sequence data. 
408 | /// 409 | /// # Arguments 410 | /// 411 | /// * `path` - Path to the binary sequence file 412 | /// 413 | /// # Returns 414 | /// 415 | /// * `Ok(MmapReader)` - A new reader if the file is valid 416 | /// * `Err(Error)` - If the file is invalid or cannot be opened 417 | /// 418 | /// # Errors 419 | /// 420 | /// Returns an error if: 421 | /// * The file cannot be opened 422 | /// * The file is not a regular file 423 | /// * The file header is invalid 424 | /// * The file size doesn't match the expected size based on the header 425 | pub fn new>(path: P) -> Result { 426 | // Verify input file is a file before attempting to map 427 | let file = File::open(path)?; 428 | if !file.metadata()?.is_file() { 429 | return Err(ReadError::IncompatibleFile.into()); 430 | } 431 | 432 | // Safety: the file is open and won't be modified while mapped 433 | let mmap = unsafe { Mmap::map(&file)? }; 434 | 435 | // Read header from mapped memory 436 | let header = BinseqHeader::from_buffer(&mmap)?; 437 | 438 | // Record configuraration 439 | let config = RecordConfig::from_header(&header); 440 | 441 | // Immediately validate the size of the file against the expected byte size of records 442 | if !(mmap.len() - SIZE_HEADER).is_multiple_of(config.record_size_bytes()) { 443 | return Err(ReadError::FileTruncation(mmap.len()).into()); 444 | } 445 | 446 | Ok(Self { 447 | mmap: Arc::new(mmap), 448 | header, 449 | config, 450 | }) 451 | } 452 | 453 | /// Returns the total number of records in the file 454 | /// 455 | /// This is calculated by subtracting the header size from the total file size 456 | /// and dividing by the size of each record. 457 | #[must_use] 458 | pub fn num_records(&self) -> usize { 459 | (self.mmap.len() - SIZE_HEADER) / self.config.record_size_bytes() 460 | } 461 | 462 | /// Returns a copy of the binary sequence file header 463 | /// 464 | /// The header contains format information and sequence length specifications. 
465 | #[must_use] 466 | pub fn header(&self) -> BinseqHeader { 467 | self.header 468 | } 469 | 470 | /// Checks if the file has paired-records 471 | #[must_use] 472 | pub fn is_paired(&self) -> bool { 473 | self.header.is_paired() 474 | } 475 | 476 | /// Returns a reference to a specific record 477 | /// 478 | /// # Arguments 479 | /// 480 | /// * `idx` - The index of the record to retrieve (0-based) 481 | /// 482 | /// # Returns 483 | /// 484 | /// * `Ok(RefRecord)` - A reference to the requested record 485 | /// * `Err(Error)` - If the index is out of bounds 486 | /// 487 | /// # Errors 488 | /// 489 | /// Returns an error if the requested index is beyond the number of records in the file 490 | pub fn get(&self, idx: usize) -> Result> { 491 | if idx > self.num_records() { 492 | return Err(ReadError::OutOfRange(idx, self.num_records()).into()); 493 | } 494 | let rsize = self.config.record_size_bytes(); 495 | let lbound = SIZE_HEADER + (idx * rsize); 496 | let rbound = lbound + rsize; 497 | let bytes = &self.mmap[lbound..rbound]; 498 | let buffer = cast_slice(bytes); 499 | Ok(RefRecord::new(idx as u64, buffer, self.config)) 500 | } 501 | 502 | /// Returns a slice of the buffer containing the underlying u64 for that range 503 | /// of records. 
504 | /// 505 | /// Note: range 10..40 will return all u64s in the mmap between the record index 10 and 40 506 | pub fn get_buffer_slice(&self, range: Range) -> Result<&[u64]> { 507 | if range.end > self.num_records() { 508 | return Err(ReadError::OutOfRange(range.end, self.num_records()).into()); 509 | } 510 | let rsize = self.config.record_size_bytes(); 511 | let total_records = range.end - range.start; 512 | let lbound = SIZE_HEADER + (range.start * rsize); 513 | let rbound = lbound + (total_records * rsize); 514 | let bytes = &self.mmap[lbound..rbound]; 515 | let buffer = cast_slice(bytes); 516 | Ok(buffer) 517 | } 518 | } 519 | 520 | /// A reader for streaming binary sequence data from any source that implements Read 521 | /// 522 | /// Unlike `MmapReader` which requires the entire file to be accessible at once, 523 | /// `StreamReader` processes data as it becomes available, making it suitable for: 524 | /// - Processing data as it arrives over a network 525 | /// - Handling very large files that exceed available memory 526 | /// - Pipeline processing where data is flowing continuously 527 | /// 528 | /// The reader maintains an internal buffer and can handle partial record reconstruction 529 | /// across chunk boundaries. 
pub struct StreamReader<R: Read> {
    /// The source reader for binary sequence data
    reader: R,

    /// Binary sequence file header containing format information
    /// (`None` until `read_header` has succeeded)
    header: Option<BinseqHeader>,

    /// Configuration defining the layout of records in the file
    /// (populated alongside `header`)
    config: Option<RecordConfig>,

    /// Buffer for storing incoming data
    buffer: Vec<u8>,

    /// Current position in the buffer
    /// Invariant (maintained by `fill_buffer`): `buffer_pos <= buffer_len <= buffer.len()`
    buffer_pos: usize,

    /// Length of valid data in the buffer
    buffer_len: usize,
}

impl<R: Read> StreamReader<R> {
    /// Creates a new `StreamReader` with the default buffer size
    ///
    /// This constructor initializes a `StreamReader` that will read from the provided
    /// source, using an 8K default buffer size.
    ///
    /// # Arguments
    ///
    /// * `reader` - The source to read binary sequence data from
    ///
    /// # Returns
    ///
    /// A new `StreamReader` instance
    pub fn new(reader: R) -> Self {
        Self::with_capacity(reader, 8192)
    }

    /// Creates a new `StreamReader` with a specified buffer capacity
    ///
    /// This constructor initializes a `StreamReader` with a custom buffer size,
    /// which can be tuned based on the expected usage pattern.
    ///
    /// # Arguments
    ///
    /// * `reader` - The source to read binary sequence data from
    /// * `capacity` - The size of the internal buffer in bytes
    ///
    /// # Returns
    ///
    /// A new `StreamReader` instance with the specified buffer capacity
    pub fn with_capacity(reader: R, capacity: usize) -> Self {
        Self {
            reader,
            header: None,
            config: None,
            buffer: vec![0; capacity],
            buffer_pos: 0,
            buffer_len: 0,
        }
    }

    /// Reads and validates the header from the underlying reader
    ///
    /// This method reads the binary sequence file header and validates it.
595 | /// It caches the header internally for future use. 596 | /// 597 | /// # Returns 598 | /// 599 | /// * `Ok(&BinseqHeader)` - A reference to the validated header 600 | /// * `Err(Error)` - If reading or validating the header fails 601 | /// 602 | /// # Panics 603 | /// 604 | /// Panics if the header is missing when expected in the stream. 605 | /// 606 | /// # Errors 607 | /// 608 | /// Returns an error if: 609 | /// * There is an I/O error when reading from the source 610 | /// * The header data is invalid 611 | /// * End of stream is reached before the full header can be read 612 | pub fn read_header(&mut self) -> Result<&BinseqHeader> { 613 | if self.header.is_some() { 614 | return Ok(self 615 | .header 616 | .as_ref() 617 | .expect("Missing header when expected in stream")); 618 | } 619 | 620 | // Ensure we have enough data for the header 621 | while self.buffer_len - self.buffer_pos < SIZE_HEADER { 622 | self.fill_buffer()?; 623 | } 624 | 625 | // Parse header 626 | let header_slice = &self.buffer[self.buffer_pos..self.buffer_pos + SIZE_HEADER]; 627 | let header = BinseqHeader::from_buffer(header_slice)?; 628 | 629 | self.header = Some(header); 630 | self.config = Some(RecordConfig::from_header(&header)); 631 | self.buffer_pos += SIZE_HEADER; 632 | 633 | Ok(self.header.as_ref().unwrap()) 634 | } 635 | 636 | /// Fills the internal buffer with more data from the reader 637 | /// 638 | /// This method reads more data from the underlying reader, handling 639 | /// the case where some unprocessed data remains in the buffer. 
640 | /// 641 | /// # Returns 642 | /// 643 | /// * `Ok(())` - If the buffer was successfully filled with new data 644 | /// * `Err(Error)` - If reading from the source fails 645 | /// 646 | /// # Errors 647 | /// 648 | /// Returns an error if: 649 | /// * There is an I/O error when reading from the source 650 | /// * End of stream is reached (no more data available) 651 | fn fill_buffer(&mut self) -> Result<()> { 652 | // Move remaining data to beginning of buffer if needed 653 | if self.buffer_pos > 0 && self.buffer_pos < self.buffer_len { 654 | self.buffer.copy_within(self.buffer_pos..self.buffer_len, 0); 655 | self.buffer_len -= self.buffer_pos; 656 | self.buffer_pos = 0; 657 | } else if self.buffer_pos == self.buffer_len { 658 | self.buffer_len = 0; 659 | self.buffer_pos = 0; 660 | } 661 | 662 | // Read more data 663 | let bytes_read = self.reader.read(&mut self.buffer[self.buffer_len..])?; 664 | if bytes_read == 0 { 665 | return Err(ReadError::EndOfStream.into()); 666 | } 667 | 668 | self.buffer_len += bytes_read; 669 | Ok(()) 670 | } 671 | 672 | /// Retrieves the next record from the stream 673 | /// 674 | /// This method reads and processes the next complete record from the stream. 675 | /// It handles the case where a record spans multiple buffer fills. 676 | /// 677 | /// # Returns 678 | /// 679 | /// * `Ok(Some(RefRecord))` - The next record was successfully read 680 | /// * `Ok(None)` - End of stream was reached (no more records) 681 | /// * `Err(Error)` - If an error occurred during reading 682 | /// 683 | /// # Panics 684 | /// 685 | /// Panics if the configuration is missing when expected in the stream. 
686 | /// 687 | /// # Errors 688 | /// 689 | /// Returns an error if: 690 | /// * There is an I/O error when reading from the source 691 | /// * The header has not been read yet 692 | /// * The data format is invalid 693 | pub fn next_record(&mut self) -> Option>> { 694 | // Ensure header is read 695 | if self.header.is_none() { 696 | if let Some(e) = self.read_header().err() { 697 | return Some(Err(e)); 698 | } 699 | } 700 | 701 | let config = self 702 | .config 703 | .expect("Missing configuration when expected in stream"); 704 | let record_size = config.record_size_bytes(); 705 | 706 | // Ensure we have enough data for a complete record 707 | while self.buffer_len - self.buffer_pos < record_size { 708 | match self.fill_buffer() { 709 | Ok(()) => {} 710 | Err(Error::ReadError(ReadError::EndOfStream)) => { 711 | // End of stream reached - if we have any partial data, it's an error 712 | if self.buffer_len - self.buffer_pos > 0 { 713 | return Some(Err(ReadError::PartialRecord( 714 | self.buffer_len - self.buffer_pos, 715 | ) 716 | .into())); 717 | } 718 | return None; 719 | } 720 | Err(e) => return Some(Err(e)), 721 | } 722 | } 723 | 724 | // Process record 725 | let record_start = self.buffer_pos; 726 | self.buffer_pos += record_size; 727 | 728 | let record_bytes = &self.buffer[record_start..record_start + record_size]; 729 | let record_u64s = cast_slice(record_bytes); 730 | 731 | // Create record with incremental ID (based on read position) 732 | let id = (record_start - SIZE_HEADER) / record_size; 733 | Some(Ok(RefRecord::new(id as u64, record_u64s, config))) 734 | } 735 | 736 | /// Consumes the stream reader and returns the inner reader 737 | /// 738 | /// This method is useful when you need access to the underlying reader 739 | /// after processing is complete. 
740 | /// 741 | /// # Returns 742 | /// 743 | /// The inner reader that was used by this `StreamReader` 744 | pub fn into_inner(self) -> R { 745 | self.reader 746 | } 747 | } 748 | 749 | /// Default batch size for parallel processing 750 | /// 751 | /// This constant defines how many records each thread processes at a time 752 | /// during parallel processing operations. 753 | pub const BATCH_SIZE: usize = 1024; 754 | 755 | /// Parallel processing implementation for memory-mapped readers 756 | impl ParallelReader for MmapReader { 757 | /// Processes all records in parallel using multiple threads 758 | /// 759 | /// This method distributes the records across the specified number of threads 760 | /// and processes them using the provided processor. Each thread receives its 761 | /// own clone of the processor and processes a contiguous chunk of records. 762 | /// 763 | /// # Arguments 764 | /// 765 | /// * `processor` - The processor to use for handling records 766 | /// * `num_threads` - The number of threads to use for processing 767 | /// 768 | /// # Type Parameters 769 | /// 770 | /// * `P` - A type that implements `ParallelProcessor` and can be cloned 771 | /// 772 | /// # Returns 773 | /// 774 | /// * `Ok(())` - If all records were processed successfully 775 | /// * `Err(Error)` - If an error occurred during processing 776 | fn process_parallel( 777 | self, 778 | processor: P, 779 | num_threads: usize, 780 | ) -> Result<()> { 781 | let num_records = self.num_records(); 782 | self.process_parallel_range(processor, num_threads, 0..num_records) 783 | } 784 | 785 | /// Process records in parallel within a specified range 786 | /// 787 | /// This method allows parallel processing of a subset of records within the file, 788 | /// defined by a start and end index. The range is distributed across the specified 789 | /// number of threads. 
790 | /// 791 | /// # Arguments 792 | /// 793 | /// * `processor` - The processor to use for each record 794 | /// * `num_threads` - The number of threads to spawn 795 | /// * `range` - The range of record indices to process 796 | /// 797 | /// # Type Parameters 798 | /// 799 | /// * `P` - A type that implements `ParallelProcessor` and can be cloned 800 | /// 801 | /// # Returns 802 | /// 803 | /// * `Ok(())` - If all records were processed successfully 804 | /// * `Err(Error)` - If an error occurred during processing 805 | fn process_parallel_range( 806 | self, 807 | processor: P, 808 | num_threads: usize, 809 | range: Range, 810 | ) -> Result<()> { 811 | // Calculate the number of threads to use 812 | let num_threads = if num_threads == 0 { 813 | num_cpus::get() 814 | } else { 815 | num_threads.min(num_cpus::get()) 816 | }; 817 | 818 | // Validate range 819 | let num_records = self.num_records(); 820 | if range.start >= num_records || range.end > num_records || range.start >= range.end { 821 | return Ok(()); // Nothing to process or invalid range 822 | } 823 | 824 | // Calculate number of records for each thread within the range 825 | let range_size = range.end - range.start; 826 | let records_per_thread = range_size.div_ceil(num_threads); 827 | 828 | // Arc self 829 | let reader = Arc::new(self); 830 | 831 | // Build thread handles 832 | let mut handles = Vec::new(); 833 | for tid in 0..num_threads { 834 | let mut processor = processor.clone(); 835 | let reader = reader.clone(); 836 | processor.set_tid(tid); 837 | 838 | let handle = std::thread::spawn(move || -> Result<()> { 839 | let start_idx = range.start + tid * records_per_thread; 840 | let end_idx = (start_idx + records_per_thread).min(range.end); 841 | 842 | if start_idx >= end_idx { 843 | return Ok(()); // No records for this thread 844 | } 845 | 846 | // create a reusable buffer for translating record IDs 847 | let mut translater = itoa::Buffer::new(); 848 | 849 | // initialize a decoding buffer 850 | 
let mut dbuf = Vec::new(); 851 | 852 | // calculate the size of a record in the cast u64 slice 853 | let rsize_u64 = reader.config.record_size_bytes() / 8; 854 | 855 | // determine the required scalar size 856 | let scalar = reader.config.scalar(); 857 | 858 | // calculate the size of a record in the batch decoded buffer 859 | let mut dbuf_rsize = { (reader.config.schunk() + reader.config.xchunk()) * scalar }; 860 | if reader.config.flags { 861 | dbuf_rsize += scalar; 862 | } 863 | 864 | // iterate over the range of indices 865 | for range_start in (start_idx..end_idx).step_by(BATCH_SIZE) { 866 | let range_end = (range_start + BATCH_SIZE).min(end_idx); 867 | 868 | // clear the decoded buffer 869 | dbuf.clear(); 870 | 871 | // get the encoded buffer slice 872 | let ebuf = reader.get_buffer_slice(range_start..range_end)?; 873 | 874 | // decode the entire buffer at once (with flags and extra bases) 875 | reader 876 | .config 877 | .bitsize 878 | .decode(ebuf, ebuf.len() * scalar, &mut dbuf)?; 879 | 880 | // iterate over each index in the range 881 | for (inner_idx, idx) in (range_start..range_end).enumerate() { 882 | // translate the index 883 | let id_str = translater.format(idx); 884 | 885 | // create the index buffer 886 | let mut header_buf = [0; 20]; 887 | let header_len = id_str.len(); 888 | header_buf[..header_len].copy_from_slice(id_str.as_bytes()); 889 | 890 | // find the buffer starts 891 | let ebuf_start = inner_idx * rsize_u64; 892 | let dbuf_start = inner_idx * dbuf_rsize; 893 | 894 | // initialize the record 895 | let record = BatchRecord { 896 | buffer: &ebuf[ebuf_start..(ebuf_start + rsize_u64)], 897 | dbuf: &dbuf[dbuf_start..(dbuf_start + dbuf_rsize)], 898 | id: idx as u64, 899 | config: reader.config, 900 | header_buf, 901 | header_len, 902 | }; 903 | 904 | // process the record 905 | processor.process_record(record)?; 906 | } 907 | 908 | // process the batch 909 | processor.on_batch_complete()?; 910 | } 911 | 912 | Ok(()) 913 | }); 914 | 915 | 
handles.push(handle); 916 | } 917 | 918 | for handle in handles { 919 | handle 920 | .join() 921 | .expect("Error joining handle (1)") 922 | .expect("Error joining handle (2)"); 923 | } 924 | 925 | Ok(()) 926 | } 927 | } 928 | --------------------------------------------------------------------------------