├── .gitignore ├── .cargo └── config.toml ├── data ├── subset.bq ├── subset.vbq ├── subset_R1.bq ├── subset_R2.bq ├── subset_R1.fastq.gz └── subset_R2.fastq.gz ├── src ├── prelude.rs ├── context │ ├── mod.rs │ ├── traits.rs │ └── structs.rs ├── lib.rs ├── record.rs ├── vbq │ ├── mod.rs │ ├── header.rs │ └── index.rs ├── parallel.rs ├── policy.rs ├── bq │ ├── mod.rs │ ├── header.rs │ ├── writer.rs │ └── reader.rs └── error.rs ├── Cargo.toml ├── README.md ├── examples ├── streaming.rs ├── grep.rs ├── network_streaming.rs ├── example.rs ├── parallel_processing.rs ├── parallel_range.rs └── read_write.rs └── .github └── workflows └── ci.yml /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | Cargo.lock 4 | data/test* 5 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /data/subset.bq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset.bq -------------------------------------------------------------------------------- /data/subset.vbq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset.vbq -------------------------------------------------------------------------------- /data/subset_R1.bq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R1.bq -------------------------------------------------------------------------------- /data/subset_R2.bq: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R2.bq -------------------------------------------------------------------------------- /data/subset_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R1.fastq.gz -------------------------------------------------------------------------------- /data/subset_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArcInstitute/binseq/HEAD/data/subset_R2.fastq.gz -------------------------------------------------------------------------------- /src/prelude.rs: -------------------------------------------------------------------------------- 1 | pub use super::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader}; 2 | 3 | pub use crate::context::{ 4 | Context, Ctx, HeaderContext, QualityContext, SeqCtx, SeqHeaderCtx, SeqQualCtx, SequenceContext, 5 | }; 6 | -------------------------------------------------------------------------------- /src/context/mod.rs: -------------------------------------------------------------------------------- 1 | /// Instances of common contexts 2 | mod structs; 3 | 4 | /// Traits for different context behaviors 5 | mod traits; 6 | 7 | pub use structs::{Ctx, SeqCtx, SeqHeaderCtx, SeqQualCtx}; 8 | pub use traits::{Context, HeaderContext, QualityContext, SequenceContext}; 9 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "binseq" 3 | version = "0.8.1" 4 | edition = "2021" 5 | description = "A high efficiency binary format for sequencing data" 6 | license = "MIT" 7 | authors = ["Noam Teyssier "] 8 | repository = "https://github.com/arcinstitute/binseq" 9 | documentation = "https://docs.rs/binseq" 10 | categories = 
["science::bioinformatics", "encoding", "data-structures"] 11 | keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"] 12 | 13 | [dependencies] 14 | anyhow = "1.0.100" 15 | auto_impl = "1.3.0" 16 | bitnuc = "0.3.0" 17 | bytemuck = "1.24.0" 18 | byteorder = "1.5.0" 19 | itoa = "1.0.15" 20 | memmap2 = "0.9.9" 21 | num_cpus = "1.17.0" 22 | rand = { version = "0.9.2", features = ["small_rng"] } 23 | thiserror = "2.0.17" 24 | zstd = { version = "0.13.3", features = ["zstdmt"] } 25 | 26 | [dev-dependencies] 27 | nucgen = "0.2.0" 28 | niffler = "3.0.0" 29 | seq_io = "0.3.4" 30 | parking_lot = "0.12.5" 31 | itoa = "1.0.15" 32 | memchr = "2.7.6" 33 | 34 | [lints.clippy] 35 | pedantic = { level = "warn", priority = -1 } 36 | cast_possible_truncation = "allow" 37 | missing_errors_doc = "allow" 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BINSEQ Format Specification 2 | 3 | [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE.md) 4 | ![actions status](https://github.com/arcinstitute/binseq/workflows/CI/badge.svg) 5 | [![Crates.io](https://img.shields.io/crates/d/binseq?color=orange&label=crates.io)](https://crates.io/crates/binseq) 6 | [![docs.rs](https://img.shields.io/docsrs/binseq?color=green&label=docs.rs)](https://docs.rs/binseq/latest/binseq/) 7 | 8 | ## Overview 9 | 10 | BINSEQ is a binary file format family designed for efficient storage and processing of DNA sequences. 11 | They make use of two-bit encoding for nucleotides and are optimized for high-performance parallel processing. 12 | 13 | BINSEQ currently has two flavors: 14 | 15 | 1. **BQ**: (`*.bq`) files are for _fixed-length_ records **without** quality scores. 16 | 2. **VBQ**: (`*.vbq`) files are for _variable-length_ records **with optional** quality scores and headers. 
17 | 18 | Both flavors support both single and paired sequences. 19 | 20 | ## Getting Started 21 | 22 | This is a **library** for reading and writing BINSEQ files, for a **command-line interface** see [bqtools](https://github.com/arcinstitute/bqtools). 23 | 24 | To get started please refer to our [documentation](https://docs.rs/binseq/latest/binseq/). 25 | For example programs which make use of the library check out our [examples directory](https://github.com/arcinstitute/binseq/tree/main/examples). 26 | 27 | For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 28 | -------------------------------------------------------------------------------- /examples/streaming.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, Cursor}; 2 | 3 | use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 4 | use binseq::{BinseqRecord, Policy, Result}; 5 | 6 | fn main() -> Result<()> { 7 | // Create a header for sequences of length 100 8 | let header = BinseqHeaderBuilder::new().slen(100).build()?; 9 | 10 | // Create some example sequence data 11 | let sequence = b"ACGT".repeat(25); // 100 nucleotides 12 | 13 | // Create a stream writer with a memory buffer as destination 14 | let mut writer = StreamWriterBuilder::default() 15 | .header(header) 16 | .policy(Policy::RandomDraw) // Use random nucleotides for invalid bases 17 | .buffer_capacity(4096) // Use 4K buffer 18 | .build(Cursor::new(Vec::new()))?; 19 | 20 | // Write the sequence with flag 0 21 | writer.write_record(Some(0), &sequence)?; 22 | 23 | // Write the sequence with flag 1 24 | writer.write_record(Some(1), &sequence)?; 25 | 26 | // Flush and get the buffer 27 | let buffer = writer.into_inner()?; 28 | let buffer_inner = buffer.into_inner(); 29 | 30 | println!("Wrote {} bytes to buffer", buffer_inner.len()); 31 | 32 | // Now read from the buffer using the 
streaming reader 33 | let cursor = Cursor::new(buffer_inner); 34 | let buf_reader = BufReader::new(cursor); 35 | 36 | // Create a stream reader 37 | let mut reader = StreamReader::new(buf_reader); 38 | 39 | // Read and display the header 40 | let header = reader.read_header()?; 41 | println!("Read header: sequence length = {}", header.slen); 42 | 43 | // Read records one by one 44 | let mut count = 0; 45 | while let Some(record) = reader.next_record() { 46 | let record = record?; 47 | println!("Record {}: flag = {:?}", count, record.flag()); 48 | count += 1; 49 | } 50 | 51 | println!("Read {count} records"); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | env: 6 | CARGO_TERM_COLOR: always 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Build 14 | run: cargo build --verbose 15 | - name: Run tests 16 | run: cargo test --verbose 17 | - name: Run tests (release) 18 | run: cargo test --verbose --release 19 | 20 | fmt_lint: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Formatting 25 | run: cargo fmt --check 26 | - name: Linting 27 | run: cargo clippy --verbose 28 | 29 | example_read_write: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v3 33 | - name: run example 34 | run: cargo run --release --example read_write 35 | 36 | example_parallel: 37 | runs-on: ubuntu-latest 38 | steps: 39 | - uses: actions/checkout@v3 40 | - name: run example 41 | run: cargo run --release --example parallel_processing 42 | 43 | example_example: 44 | runs-on: ubuntu-latest 45 | steps: 46 | - uses: actions/checkout@v3 47 | - name: run example 48 | run: cargo run --release --example example 49 | 50 | example_grep: 51 | runs-on: ubuntu-latest 52 | steps: 
53 | - uses: actions/checkout@v3 54 | - name: run example bq 55 | run: cargo run --release --example grep ./data/subset.bq 56 | - name: run example vbq 57 | run: cargo run --release --example grep ./data/subset.vbq 58 | 59 | example_range: 60 | runs-on: ubuntu-latest 61 | steps: 62 | - uses: actions/checkout@v3 63 | - name: run example (bq) 64 | run: cargo run --release --example parallel_range -- ./data/subset.bq 4 30 200 65 | - name: run example (vbq) 66 | run: cargo run --release --example parallel_range -- ./data/subset.vbq 4 30 200 67 | -------------------------------------------------------------------------------- /examples/grep.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use anyhow::Result; 4 | use binseq::{context::SeqCtx, prelude::*}; 5 | use memchr::memmem::Finder; 6 | use parking_lot::Mutex; 7 | 8 | #[derive(Clone)] 9 | pub struct GrepCounter { 10 | // (thread) local variables 11 | ctx: SeqCtx, 12 | local_count: usize, 13 | 14 | // search pattern (using memchr::memmem::Finder for fast searching) 15 | pattern: Finder<'static>, 16 | 17 | // global variables 18 | count: Arc>, 19 | } 20 | impl GrepCounter { 21 | #[must_use] 22 | pub fn new(pattern: &[u8]) -> Self { 23 | Self { 24 | ctx: SeqCtx::default(), 25 | pattern: Finder::new(pattern).into_owned(), 26 | local_count: 0, 27 | count: Arc::new(Mutex::new(0)), 28 | } 29 | } 30 | 31 | fn match_sequence(&self, seq: &[u8]) -> bool { 32 | self.pattern.find(seq).is_some() 33 | } 34 | 35 | fn pprint(&self) { 36 | println!("Matching records: {}", self.count.lock()); 37 | } 38 | } 39 | impl ParallelProcessor for GrepCounter { 40 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 41 | self.ctx.fill(&record)?; 42 | 43 | if self.match_sequence(&self.ctx.sbuf()) || self.match_sequence(&self.ctx.xbuf()) { 44 | self.local_count += 1; 45 | } 46 | 47 | Ok(()) 48 | } 49 | 50 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 51 | 
*self.count.lock() += self.local_count; 52 | self.local_count = 0; 53 | Ok(()) 54 | } 55 | } 56 | 57 | fn main() -> Result<()> { 58 | let path = std::env::args() 59 | .nth(1) 60 | .unwrap_or("./data/subset.bq".to_string()); 61 | let pattern = std::env::args() 62 | .nth(2) 63 | .unwrap_or("ACGT".to_string()) 64 | .as_bytes() 65 | .to_vec(); 66 | let n_threads = std::env::args().nth(3).unwrap_or("1".to_string()).parse()?; 67 | 68 | let reader = BinseqReader::new(&path)?; 69 | let counter = GrepCounter::new(&pattern); 70 | reader.process_parallel(counter.clone(), n_threads)?; 71 | counter.pprint(); 72 | 73 | Ok(()) 74 | } 75 | -------------------------------------------------------------------------------- /examples/network_streaming.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufReader, BufWriter}; 2 | use std::net::{TcpListener, TcpStream}; 3 | use std::thread; 4 | 5 | use binseq::bq::{BinseqHeader, BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 6 | use binseq::{BinseqRecord, Policy, Result}; 7 | 8 | fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> { 9 | // Create a listener on localhost:3000 10 | let listener = TcpListener::bind("127.0.0.1:3000").expect("Failed to bind to address"); 11 | println!("Server listening on 127.0.0.1:3000"); 12 | 13 | // Accept one connection 14 | let (stream, _) = listener.accept().expect("Failed to accept connection"); 15 | println!("Client connected"); 16 | 17 | let stream = BufWriter::new(stream); 18 | 19 | // Create a stream writer with the network stream as destination 20 | let mut writer = StreamWriterBuilder::default() 21 | .header(header) 22 | .policy(Policy::RandomDraw) 23 | .buffer_capacity(16384) // Larger buffer for network I/O 24 | .build(stream)?; 25 | 26 | // Write sequences in a loop 27 | for i in 0..10 { 28 | writer.write_record(Some(i), sequence)?; 29 | println!("Server: Sent record {i}"); 30 | 31 | // Simulate delay between records 32 | 
thread::sleep(std::time::Duration::from_millis(100)); 33 | } 34 | 35 | // Ensure flush on drop 36 | writer.flush()?; 37 | println!("Server: All records sent"); 38 | 39 | Ok(()) 40 | } 41 | 42 | fn client() -> Result<()> { 43 | // Wait a moment for the server to start 44 | thread::sleep(std::time::Duration::from_millis(500)); 45 | 46 | // Connect to the server 47 | let stream = TcpStream::connect("127.0.0.1:3000").expect("Failed to connect to server"); 48 | println!("Connected to server"); 49 | 50 | // Create a buffered reader for the stream 51 | let reader = BufReader::new(stream); 52 | 53 | // Create a streaming reader 54 | let mut reader = StreamReader::new(reader); 55 | 56 | // Read the header 57 | let header = reader.read_header()?; 58 | println!( 59 | "Client: Received header with sequence length = {}", 60 | header.slen 61 | ); 62 | 63 | // Read records as they arrive 64 | let mut count = 0; 65 | while let Some(record) = reader.next_record() { 66 | let record = record?; 67 | println!( 68 | "Client: Received record {} with flag = {:?}", 69 | count, 70 | record.flag() 71 | ); 72 | count += 1; 73 | } 74 | 75 | println!("Client: Received {count} records total"); 76 | 77 | Ok(()) 78 | } 79 | 80 | fn main() -> Result<()> { 81 | // Create a header for sequences of length 100 82 | let header = BinseqHeaderBuilder::new().slen(100).build()?; 83 | 84 | // Create some example sequence data 85 | let sequence = b"ACGT".repeat(25); // 100 nucleotides 86 | 87 | // Spawn the server in a separate thread 88 | let server_thread = thread::spawn(move || { 89 | if let Err(e) = server(header, &sequence) { 90 | eprintln!("Server error: {e:?}"); 91 | } 92 | }); 93 | 94 | // Run the client in the main thread 95 | if let Err(e) = client() { 96 | eprintln!("Client error: {e:?}"); 97 | } 98 | 99 | // Wait for the server to finish 100 | server_thread.join().unwrap(); 101 | 102 | Ok(()) 103 | } 104 | -------------------------------------------------------------------------------- 
/examples/example.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{stdout, BufWriter, Write}; 3 | use std::sync::Arc; 4 | 5 | use anyhow::Result; 6 | use binseq::prelude::*; 7 | 8 | use parking_lot::Mutex; 9 | 10 | /// A struct for decoding BINSEQ data back to FASTQ format. 11 | #[derive(Clone)] 12 | pub struct Decoder { 13 | /// Reusable context 14 | ctx: Ctx, 15 | 16 | /// local output buffer 17 | local_writer: Vec, 18 | 19 | /// global output buffer 20 | global_writer: Arc>>, 21 | 22 | /// Local count of records 23 | local_count: usize, 24 | 25 | /// global count of records 26 | global_count: Arc>, 27 | } 28 | 29 | impl Decoder { 30 | #[must_use] 31 | pub fn new(writer: Box) -> Self { 32 | let global_writer = Arc::new(Mutex::new(writer)); 33 | Decoder { 34 | local_writer: Vec::new(), 35 | ctx: Ctx::default(), 36 | local_count: 0, 37 | global_writer, 38 | global_count: Arc::new(Mutex::new(0)), 39 | } 40 | } 41 | 42 | #[must_use] 43 | pub fn num_records(&self) -> usize { 44 | *self.global_count.lock() 45 | } 46 | } 47 | impl ParallelProcessor for Decoder { 48 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 49 | self.ctx.fill(&record)?; 50 | write_fastq_parts( 51 | &mut self.local_writer, 52 | self.ctx.sheader(), 53 | self.ctx.sbuf(), 54 | self.ctx.squal(), 55 | )?; 56 | 57 | // write extended fastq to local buffer 58 | if record.is_paired() { 59 | write_fastq_parts( 60 | &mut self.local_writer, 61 | self.ctx.xheader(), 62 | &self.ctx.xbuf(), 63 | self.ctx.xqual(), 64 | )?; 65 | } 66 | 67 | self.local_count += 1; 68 | Ok(()) 69 | } 70 | 71 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 72 | // Lock the mutex to write to the global buffer 73 | { 74 | let mut lock = self.global_writer.lock(); 75 | lock.write_all(&self.local_writer)?; 76 | lock.flush()?; 77 | } 78 | // Lock the mutex to update the number of records 79 | { 80 | let mut global_count = 
self.global_count.lock(); 81 | *global_count += self.local_count; 82 | } 83 | 84 | // Clear the local buffer and reset the local record count 85 | self.local_writer.clear(); 86 | self.local_count = 0; 87 | Ok(()) 88 | } 89 | } 90 | 91 | #[allow(clippy::missing_errors_doc)] 92 | pub fn write_fastq_parts( 93 | writer: &mut W, 94 | index: &[u8], 95 | sequence: &[u8], 96 | quality: &[u8], 97 | ) -> Result<(), std::io::Error> { 98 | writer.write_all(b"@seq.")?; 99 | writer.write_all(index)?; 100 | writer.write_all(b"\n")?; 101 | writer.write_all(sequence)?; 102 | writer.write_all(b"\n+\n")?; 103 | writer.write_all(quality)?; 104 | writer.write_all(b"\n")?; 105 | Ok(()) 106 | } 107 | 108 | fn match_output(path: Option<&str>) -> Result> { 109 | if let Some(path) = path { 110 | let writer = File::create(path).map(BufWriter::new)?; 111 | Ok(Box::new(writer)) 112 | } else { 113 | let stdout = stdout(); 114 | Ok(Box::new(BufWriter::new(stdout))) 115 | } 116 | } 117 | 118 | fn main() -> Result<()> { 119 | let file = std::env::args() 120 | .nth(1) 121 | .unwrap_or("./data/subset.bq".to_string()); 122 | let n_threads = std::env::args().nth(2).unwrap_or("1".to_string()).parse()?; 123 | 124 | let reader = BinseqReader::new(&file)?; 125 | let writer = match_output(None)?; 126 | let proc = Decoder::new(writer); 127 | 128 | reader.process_parallel(proc.clone(), n_threads)?; 129 | eprintln!("Read {} records", proc.num_records()); 130 | 131 | Ok(()) 132 | } 133 | -------------------------------------------------------------------------------- /src/context/traits.rs: -------------------------------------------------------------------------------- 1 | use crate::{BinseqRecord, Result}; 2 | 3 | pub const DEFAULT_QUALITY: u8 = b'?'; 4 | 5 | /// Trait for handling reusable buffers in decoding BINSEQ records. 6 | pub trait Context: Clone + Default { 7 | /// Replaces the contents of the context with the contents of the given record. 
8 | /// 9 | /// This will clear all existing data and fill the context with the contents of the record. 10 | fn fill(&mut self, record: &R) -> Result<()>; 11 | } 12 | 13 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on nucleotide sequences. 14 | pub trait SequenceContext { 15 | fn sbuf(&self) -> &[u8]; 16 | fn xbuf(&self) -> &[u8]; 17 | fn sbuf_mut(&mut self) -> &mut Vec; 18 | fn xbuf_mut(&mut self) -> &mut Vec; 19 | #[inline] 20 | fn clear_sequences(&mut self) { 21 | self.sbuf_mut().clear(); 22 | self.xbuf_mut().clear(); 23 | } 24 | #[inline] 25 | #[allow(deprecated)] 26 | fn fill_sequences(&mut self, record: &R) -> Result<()> { 27 | self.clear_sequences(); 28 | record.decode_s(self.sbuf_mut())?; 29 | if record.is_paired() { 30 | record.decode_x(self.xbuf_mut())?; 31 | } 32 | Ok(()) 33 | } 34 | } 35 | 36 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on quality data. 37 | pub trait QualityContext { 38 | fn squal(&self) -> &[u8]; 39 | fn xqual(&self) -> &[u8]; 40 | fn squal_mut(&mut self) -> &mut Vec; 41 | fn xqual_mut(&mut self) -> &mut Vec; 42 | #[inline] 43 | fn clear_qualities(&mut self) { 44 | self.squal_mut().clear(); 45 | self.xqual_mut().clear(); 46 | } 47 | #[inline] 48 | fn fill_qualities(&mut self, record: &R) -> Result<()> { 49 | if record.has_quality() { 50 | let slen = record.slen() as usize; 51 | let squal = self.squal_mut(); 52 | if squal.len() != slen { 53 | squal.resize(slen, DEFAULT_QUALITY); 54 | } 55 | squal.copy_from_slice(record.squal()); 56 | 57 | if record.is_paired() { 58 | let xlen = record.xlen() as usize; 59 | let xqual = self.xqual_mut(); 60 | if xqual.len() != xlen { 61 | xqual.resize(xlen, DEFAULT_QUALITY); 62 | } 63 | xqual.copy_from_slice(record.xqual()); 64 | } 65 | } else { 66 | self.ensure_quality_capacity(record); 67 | } 68 | Ok(()) 69 | } 70 | #[inline] 71 | fn ensure_quality_capacity(&mut self, record: &R) { 72 | let slen = record.slen() as usize; 73 | let xlen = 
record.xlen() as usize; 74 | 75 | // only resize if its not the right size 76 | let squal = self.squal_mut(); 77 | if squal.len() != slen { 78 | squal.resize(slen, DEFAULT_QUALITY); 79 | } 80 | 81 | // Only resize if there's an extended sequence and it's not already the right size 82 | let xqual = self.xqual_mut(); 83 | if xqual.len() != xlen { 84 | xqual.resize(xlen, DEFAULT_QUALITY); 85 | } 86 | } 87 | } 88 | 89 | /// Trait for handling reusable buffers in decoding BINSEQ records focused on header data. 90 | pub trait HeaderContext { 91 | fn sheader(&self) -> &[u8]; 92 | fn sheader_mut(&mut self) -> &mut Vec; 93 | fn xheader(&self) -> &[u8]; 94 | fn xheader_mut(&mut self) -> &mut Vec; 95 | #[inline] 96 | fn clear_headers(&mut self) { 97 | self.sheader_mut().clear(); 98 | self.xheader_mut().clear(); 99 | } 100 | 101 | #[inline] 102 | fn fill_headers(&mut self, record: &R) { 103 | self.clear_headers(); 104 | self.sheader_mut().extend_from_slice(record.sheader()); 105 | if record.is_paired() { 106 | self.xheader_mut().extend_from_slice(record.xheader()); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | //! 3 | //! # BINSEQ 4 | //! 5 | //! The `binseq` library provides efficient APIs for working with the [BINSEQ](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1) file format family. 6 | //! 7 | //! It offers methods to read and write BINSEQ files, providing: 8 | //! 9 | //! - Compact multi-bit encoding and decoding of nucleotide sequences through [`bitnuc`](https://docs.rs/bitnuc/latest/bitnuc/) 10 | //! - Memory-mapped file access for efficient reading ([`bq::MmapReader`] and [`vbq::MmapReader`]) 11 | //! - Parallel processing capabilities for arbitrary tasks through the [`ParallelProcessor`] trait. 12 | //! 
- Configurable [`Policy`] for handling invalid nucleotides 13 | //! - Support for both single and paired-end sequences 14 | //! - Optional sequence headers/identifiers (VBQ format) 15 | //! - Abstract [`BinseqRecord`] trait for representing records from both `.bq` and `.vbq` files. 16 | //! - Abstract [`BinseqReader`] enum for processing records from both `.bq` and `.vbq` files. 17 | //! 18 | //! ## Recent VBQ Format Changes (v0.7.0+) 19 | //! 20 | //! The VBQ format has undergone significant improvements: 21 | //! 22 | //! - **Embedded Index**: VBQ files now contain their index data embedded at the end of the file, 23 | //! eliminating separate `.vqi` index files and improving portability. 24 | //! - **Headers Support**: Optional sequence identifiers/headers can be stored with each record. 25 | //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records. 26 | //! - **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. 27 | //! 28 | //! Legacy VBQ files are automatically migrated to the new format when accessed. 29 | //! 30 | //! ## Crate Organization 31 | //! 32 | //! This library is split into 3 major parts. 33 | //! 34 | //! There are the [`bq`] and [`vbq`] modules, which provide tools for reading and writing `BQ` and `VBQ` files respectively. 35 | //! Then there are traits and utilities that are ubiquitous across the library which are available at the top-level of the crate. 36 | //! 37 | //! # Example: Memory-mapped Access 38 | //! 39 | //! ``` 40 | //! use binseq::Result; 41 | //! use binseq::prelude::*; 42 | //! 43 | //! #[derive(Clone, Default)] 44 | //! pub struct Processor { 45 | //! // Define fields here 46 | //! } 47 | //! 48 | //! impl ParallelProcessor for Processor { 49 | //! fn process_record(&mut self, record: B) -> Result<()> { 50 | //! // Implement per-record logic here 51 | //! Ok(()) 52 | //! } 53 | //! 54 | //! fn on_batch_complete(&mut self) -> Result<()> { 55 | //! 
// Implement per-batch logic here 56 | //! Ok(()) 57 | //! } 58 | //! } 59 | //! 60 | //! fn main() -> Result<()> { 61 | //! // provide an input path (*.bq or *.vbq) 62 | //! let path = "./data/subset.bq"; 63 | //! 64 | //! // open a reader 65 | //! let reader = BinseqReader::new(path)?; 66 | //! 67 | //! // initialize a processor 68 | //! let processor = Processor::default(); 69 | //! 70 | //! // process the records in parallel with 8 threads 71 | //! reader.process_parallel(processor, 8)?; 72 | //! Ok(()) 73 | //! } 74 | //! ``` 75 | 76 | #![allow(clippy::module_inception)] 77 | 78 | /// BQ - fixed length records, no quality scores 79 | pub mod bq; 80 | 81 | /// Error definitions 82 | pub mod error; 83 | 84 | /// Parallel processing 85 | mod parallel; 86 | 87 | /// Invalid nucleotide policy 88 | mod policy; 89 | 90 | /// Record trait shared between BINSEQ variants 91 | mod record; 92 | 93 | /// VBQ - Variable length records, optional quality scores, compressed blocks 94 | pub mod vbq; 95 | 96 | /// Prelude - Commonly used types and traits 97 | pub mod prelude; 98 | 99 | /// Context - Reusable state for parallel processing 100 | pub mod context; 101 | 102 | pub use error::{Error, IntoBinseqError, Result}; 103 | pub use parallel::{BinseqReader, ParallelProcessor, ParallelReader}; 104 | pub use policy::{Policy, RNG_SEED}; 105 | pub use record::BinseqRecord; 106 | 107 | /// Re-export `bitnuc::BitSize` 108 | pub use bitnuc::BitSize; 109 | -------------------------------------------------------------------------------- /src/record.rs: -------------------------------------------------------------------------------- 1 | use auto_impl::auto_impl; 2 | use bitnuc::BitSize; 3 | 4 | use super::Result; 5 | 6 | /// Record trait shared between BINSEQ variants. 7 | /// 8 | /// Exposes public methods for accessing internal data. 9 | /// Interfaces with the [`bitnuc`] crate for decoding sequences. 
10 | /// 11 | /// Implemented by [`bq::RefRecord`](crate::bq::RefRecord) and [`vbq::RefRecord`](crate::vbq::RefRecord). 12 | /// 13 | /// Used to interact with [`ParallelProcessor`](crate::ParallelProcessor) for easy parallel processing. 14 | #[auto_impl(&, &mut)] 15 | pub trait BinseqRecord { 16 | /// Returns the bitsize of the record (number of bits per nucleotide) 17 | fn bitsize(&self) -> BitSize; 18 | 19 | /// Returns the global index of the record. 20 | fn index(&self) -> u64; 21 | 22 | /// Returns the flag value of this record 23 | fn flag(&self) -> Option; 24 | 25 | /// Returns the header of this record 26 | fn sheader(&self) -> &[u8]; 27 | 28 | /// Returns the header of the extended/paired sequence (empty if not paired) 29 | fn xheader(&self) -> &[u8]; 30 | 31 | /// Returns the length of the primary sequence of this record 32 | fn slen(&self) -> u64; 33 | 34 | /// Returns the length of the extended sequence of this record 35 | fn xlen(&self) -> u64; 36 | 37 | /// Returns a reference to the **encoded** primary sequence of this record 38 | fn sbuf(&self) -> &[u64]; 39 | 40 | /// Returns a reference to the **encoded** extended sequence of this record. 41 | /// 42 | /// Empty if no extended sequence is present. 43 | fn xbuf(&self) -> &[u64]; 44 | 45 | /// Returns a reference to the quality scores of the primary sequence of this record. 46 | /// 47 | /// Empty if no quality scores are present. 48 | fn squal(&self) -> &[u8] { 49 | &[] 50 | } 51 | 52 | /// Returns a reference to the quality scores of the extended sequence of this record. 53 | /// 54 | /// Empty if no quality scores are present. 55 | fn xqual(&self) -> &[u8] { 56 | &[] 57 | } 58 | 59 | /// Decodes the primary sequence of this record into the provided buffer. 60 | fn decode_s(&self, buf: &mut Vec) -> Result<()> { 61 | self.bitsize() 62 | .decode(self.sbuf(), self.slen() as usize, buf)?; 63 | Ok(()) 64 | } 65 | 66 | /// Decodes the extended sequence of this record into the provided buffer. 
67 | fn decode_x(&self, buf: &mut Vec) -> Result<()> { 68 | self.bitsize() 69 | .decode(self.xbuf(), self.xlen() as usize, buf)?; 70 | Ok(()) 71 | } 72 | 73 | /// Returns a reference to the primary decoded sequence of this record. 74 | /// 75 | /// This is not available on all types that implement the `Record` trait. 76 | /// It should be available on types that implement it in this library however. 77 | fn sseq(&self) -> &[u8] { 78 | unimplemented!("This record does not implement direct sequence access"); 79 | } 80 | 81 | /// Returns a reference to the extended decoded sequence of this record. 82 | /// 83 | /// This may not be available on all types that implement the `Record` trait. 84 | /// It should be available on types that implement it in this library however. 85 | fn xseq(&self) -> &[u8] { 86 | unimplemented!("This record does not implement direct sequence access"); 87 | } 88 | 89 | /// Decodes the primary sequence of this record into a newly allocated buffer. 90 | /// 91 | /// Not advised to use this function as it allocates a new buffer every time. 92 | fn decode_s_alloc(&self) -> Result> { 93 | let mut buf = Vec::with_capacity(self.slen() as usize); 94 | self.decode_s(&mut buf)?; 95 | Ok(buf) 96 | } 97 | 98 | /// Decodes the extended sequence of this record into a newly allocated buffer. 99 | /// 100 | /// Not advised to use this function as it allocates a new buffer every time. 101 | fn decode_x_alloc(&self) -> Result> { 102 | let mut buf = Vec::with_capacity(self.xlen() as usize); 103 | self.decode_x(&mut buf)?; 104 | Ok(buf) 105 | } 106 | 107 | /// A convenience function to check if the record is paired. 
108 | fn is_paired(&self) -> bool { 109 | self.xlen() > 0 110 | } 111 | 112 | /// A convenience function to check if record has associated quality scores 113 | fn has_quality(&self) -> bool { 114 | !self.squal().is_empty() 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /examples/parallel_processing.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::BufWriter, 4 | sync::{ 5 | atomic::{AtomicUsize, Ordering}, 6 | Arc, 7 | }, 8 | }; 9 | 10 | use anyhow::{bail, Result}; 11 | use binseq::{ 12 | bq::{self, BinseqHeaderBuilder}, 13 | context::SeqCtx, 14 | prelude::*, 15 | }; 16 | use nucgen::Sequence; 17 | 18 | #[derive(Clone, Default)] 19 | pub struct MyProcessor { 20 | local_counter: usize, 21 | counter: Arc, 22 | ctx: SeqCtx, 23 | } 24 | impl MyProcessor { 25 | #[must_use] 26 | pub fn counter(&self) -> usize { 27 | self.counter.load(Ordering::Relaxed) 28 | } 29 | } 30 | impl ParallelProcessor for MyProcessor { 31 | fn process_record(&mut self, record: R) -> binseq::Result<()> { 32 | self.ctx.fill_sequences(&record)?; 33 | self.local_counter += 1; 34 | Ok(()) 35 | } 36 | fn on_batch_complete(&mut self) -> binseq::Result<()> { 37 | self.counter 38 | .fetch_add(self.local_counter, Ordering::Relaxed); 39 | self.local_counter = 0; 40 | Ok(()) 41 | } 42 | } 43 | 44 | fn mmap_processing(binseq_path: &str, n_threads: usize) -> Result<()> { 45 | let reader = BinseqReader::new(binseq_path)?; 46 | let proc = MyProcessor::default(); 47 | reader.process_parallel(proc.clone(), n_threads)?; 48 | Ok(()) 49 | } 50 | 51 | pub fn main() -> Result<()> { 52 | let binseq_path_single = "./data/test.bq"; 53 | let binseq_path_paired = "./data/test_paired.bq"; 54 | let r1_size = 150; 55 | let r2_size = 300; 56 | let num_seq = 1_000_000; 57 | 58 | time_it( 59 | || { 60 | write_single(binseq_path_single, num_seq, r1_size)?; 61 | Ok(()) 62 | }, 63 | "write_single", 64 | ); 
65 | 66 | time_it( 67 | || { 68 | write_paired(binseq_path_paired, num_seq, r1_size, r2_size)?; 69 | Ok(()) 70 | }, 71 | "write_paired", 72 | ); 73 | 74 | for n_threads in 1..=16 { 75 | if n_threads > 1 && n_threads % 2 != 0 { 76 | continue; 77 | } 78 | time_it( 79 | || { 80 | mmap_processing(binseq_path_single, n_threads)?; 81 | Ok(()) 82 | }, 83 | &format!("single - mmap_parallel_processing ({n_threads})"), 84 | ); 85 | } 86 | for n_threads in 1..=16 { 87 | if n_threads > 1 && n_threads % 2 != 0 { 88 | continue; 89 | } 90 | time_it( 91 | || { 92 | mmap_processing(binseq_path_paired, n_threads)?; 93 | Ok(()) 94 | }, 95 | &format!("paired - mmap_parallel_processing ({n_threads})"), 96 | ); 97 | } 98 | 99 | Ok(()) 100 | } 101 | 102 | fn time_it(f: F, name: &str) 103 | where 104 | F: Fn() -> Result<()>, 105 | { 106 | let now = std::time::Instant::now(); 107 | f().unwrap(); 108 | let elapsed = now.elapsed(); 109 | eprintln!("Elapsed time ({name}): {elapsed:?}"); 110 | } 111 | 112 | fn write_single(binseq_path: &str, num_seq: usize, seq_size: usize) -> Result<()> { 113 | // Open the output file 114 | let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; 115 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 116 | let mut writer = bq::BinseqWriterBuilder::default() 117 | .header(header) 118 | .build(out_handle)?; 119 | 120 | // Write the binary sequence 121 | let mut sequence = Sequence::new(); 122 | let mut rng = rand::rng(); 123 | for _ in 0..num_seq { 124 | sequence.fill_buffer(&mut rng, seq_size); 125 | if !writer.write_record(Some(0), sequence.bytes())? 
{ 126 | bail!("Error writing nucleotides") 127 | } 128 | } 129 | writer.flush()?; 130 | eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); 131 | Ok(()) 132 | } 133 | 134 | fn write_paired(binseq_path: &str, num_seq: usize, r1_size: usize, r2_size: usize) -> Result<()> { 135 | // Open the output file 136 | let header = bq::BinseqHeaderBuilder::new() 137 | .slen(r1_size as u32) 138 | .xlen(r2_size as u32) 139 | .build()?; 140 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 141 | let mut writer = bq::BinseqWriterBuilder::default() 142 | .header(header) 143 | .build(out_handle)?; 144 | 145 | // Write the binary sequence 146 | let mut r1 = Sequence::new(); 147 | let mut r2 = Sequence::new(); 148 | let mut rng = rand::rng(); 149 | for _ in 0..num_seq { 150 | r1.fill_buffer(&mut rng, r1_size); 151 | r2.fill_buffer(&mut rng, r2_size); 152 | 153 | if !writer.write_paired_record(Some(0), r1.bytes(), r2.bytes())? { 154 | bail!("Error writing nucleotides") 155 | } 156 | } 157 | writer.flush()?; 158 | eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); 159 | Ok(()) 160 | } 161 | -------------------------------------------------------------------------------- /src/context/structs.rs: -------------------------------------------------------------------------------- 1 | use super::traits::{Context, HeaderContext, QualityContext, SequenceContext}; 2 | use crate::{BinseqRecord, Result}; 3 | 4 | /// Context for sequence data 5 | /// 6 | /// Has all the necessary fields for storing sequence data. 
7 | #[derive(Clone, Default)] 8 | pub struct Ctx { 9 | sbuf: Vec, 10 | xbuf: Vec, 11 | sheader: Vec, 12 | xheader: Vec, 13 | squal: Vec, 14 | xqual: Vec, 15 | } 16 | impl SequenceContext for Ctx { 17 | #[inline] 18 | fn sbuf(&self) -> &[u8] { 19 | &self.sbuf 20 | } 21 | #[inline] 22 | fn xbuf(&self) -> &[u8] { 23 | &self.xbuf 24 | } 25 | #[inline] 26 | fn sbuf_mut(&mut self) -> &mut Vec { 27 | &mut self.sbuf 28 | } 29 | #[inline] 30 | fn xbuf_mut(&mut self) -> &mut Vec { 31 | &mut self.xbuf 32 | } 33 | } 34 | impl QualityContext for Ctx { 35 | #[inline] 36 | fn squal(&self) -> &[u8] { 37 | &self.squal 38 | } 39 | #[inline] 40 | fn xqual(&self) -> &[u8] { 41 | &self.xqual 42 | } 43 | #[inline] 44 | fn squal_mut(&mut self) -> &mut Vec { 45 | &mut self.squal 46 | } 47 | #[inline] 48 | fn xqual_mut(&mut self) -> &mut Vec { 49 | &mut self.xqual 50 | } 51 | } 52 | impl HeaderContext for Ctx { 53 | #[inline] 54 | fn sheader(&self) -> &[u8] { 55 | &self.sheader 56 | } 57 | #[inline] 58 | fn xheader(&self) -> &[u8] { 59 | &self.xheader 60 | } 61 | #[inline] 62 | fn sheader_mut(&mut self) -> &mut Vec { 63 | &mut self.sheader 64 | } 65 | #[inline] 66 | fn xheader_mut(&mut self) -> &mut Vec { 67 | &mut self.xheader 68 | } 69 | } 70 | impl Context for Ctx { 71 | #[inline] 72 | fn fill(&mut self, record: &R) -> Result<()> { 73 | self.fill_sequences(record)?; 74 | self.fill_qualities(record)?; 75 | self.fill_headers(record); 76 | Ok(()) 77 | } 78 | } 79 | 80 | /// Context for just sequence data 81 | /// 82 | /// Only stores nucleotide sequence data. 
83 | #[derive(Clone, Default)] 84 | pub struct SeqCtx { 85 | sbuf: Vec, 86 | xbuf: Vec, 87 | } 88 | impl SequenceContext for SeqCtx { 89 | #[inline] 90 | fn sbuf(&self) -> &[u8] { 91 | &self.sbuf 92 | } 93 | #[inline] 94 | fn xbuf(&self) -> &[u8] { 95 | &self.xbuf 96 | } 97 | #[inline] 98 | fn sbuf_mut(&mut self) -> &mut Vec { 99 | &mut self.sbuf 100 | } 101 | #[inline] 102 | fn xbuf_mut(&mut self) -> &mut Vec { 103 | &mut self.xbuf 104 | } 105 | } 106 | impl Context for SeqCtx { 107 | #[inline] 108 | fn fill(&mut self, record: &R) -> Result<()> { 109 | self.fill_sequences(record) 110 | } 111 | } 112 | 113 | /// Context for sequence data and headers 114 | /// 115 | /// Does not store quality data. 116 | #[derive(Clone, Default)] 117 | pub struct SeqHeaderCtx { 118 | sbuf: Vec, 119 | xbuf: Vec, 120 | sheader: Vec, 121 | xheader: Vec, 122 | } 123 | impl SequenceContext for SeqHeaderCtx { 124 | #[inline] 125 | fn sbuf(&self) -> &[u8] { 126 | &self.sbuf 127 | } 128 | #[inline] 129 | fn xbuf(&self) -> &[u8] { 130 | &self.xbuf 131 | } 132 | #[inline] 133 | fn sbuf_mut(&mut self) -> &mut Vec { 134 | &mut self.sbuf 135 | } 136 | #[inline] 137 | fn xbuf_mut(&mut self) -> &mut Vec { 138 | &mut self.xbuf 139 | } 140 | } 141 | impl HeaderContext for SeqHeaderCtx { 142 | #[inline] 143 | fn sheader(&self) -> &[u8] { 144 | &self.sheader 145 | } 146 | #[inline] 147 | fn xheader(&self) -> &[u8] { 148 | &self.xheader 149 | } 150 | #[inline] 151 | fn sheader_mut(&mut self) -> &mut Vec { 152 | &mut self.sheader 153 | } 154 | #[inline] 155 | fn xheader_mut(&mut self) -> &mut Vec { 156 | &mut self.xheader 157 | } 158 | } 159 | impl Context for SeqHeaderCtx { 160 | #[inline] 161 | fn fill(&mut self, record: &R) -> Result<()> { 162 | self.fill_sequences(record)?; 163 | self.fill_headers(record); 164 | Ok(()) 165 | } 166 | } 167 | 168 | /// Context for sequence data and quality data 169 | /// 170 | /// Does not store header data. 
171 | #[derive(Clone, Default)] 172 | pub struct SeqQualCtx { 173 | sbuf: Vec, 174 | xbuf: Vec, 175 | squal: Vec, 176 | xqual: Vec, 177 | } 178 | impl SequenceContext for SeqQualCtx { 179 | #[inline] 180 | fn sbuf(&self) -> &[u8] { 181 | &self.sbuf 182 | } 183 | #[inline] 184 | fn xbuf(&self) -> &[u8] { 185 | &self.xbuf 186 | } 187 | #[inline] 188 | fn sbuf_mut(&mut self) -> &mut Vec { 189 | &mut self.sbuf 190 | } 191 | #[inline] 192 | fn xbuf_mut(&mut self) -> &mut Vec { 193 | &mut self.xbuf 194 | } 195 | } 196 | impl QualityContext for SeqQualCtx { 197 | #[inline] 198 | fn squal(&self) -> &[u8] { 199 | &self.squal 200 | } 201 | #[inline] 202 | fn xqual(&self) -> &[u8] { 203 | &self.xqual 204 | } 205 | #[inline] 206 | fn squal_mut(&mut self) -> &mut Vec { 207 | &mut self.squal 208 | } 209 | #[inline] 210 | fn xqual_mut(&mut self) -> &mut Vec { 211 | &mut self.xqual 212 | } 213 | } 214 | impl Context for SeqQualCtx { 215 | #[inline] 216 | fn fill(&mut self, record: &R) -> Result<()> { 217 | self.fill_sequences(record)?; 218 | self.fill_qualities(record)?; 219 | Ok(()) 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/vbq/mod.rs: -------------------------------------------------------------------------------- 1 | //! # VBINSEQ Format 2 | //! 3 | //! VBINSEQ is a high-performance binary format for variable-length nucleotide sequences 4 | //! that optimizes both storage efficiency and parallel processing capabilities. 5 | //! 6 | //! For more information on the format, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 7 | //! 8 | //! ## Overview 9 | //! 10 | //! VBINSEQ extends the core principles of BINSEQ to accommodate: 11 | //! 12 | //! * **Variable-length sequences**: Unlike BINSEQ which requires fixed-length reads, VBINSEQ can store 13 | //! sequences of any length, making it suitable for technologies like PacBio and Oxford Nanopore. 14 | //! 15 | //! 
* **Quality scores**: Optional storage of quality scores alongside nucleotide data when needed. 16 | //! 17 | //! * **Sequence headers**: Optional storage of sequence identifiers/headers with each record. 18 | //! 19 | //! * **Block-based organization**: Data is organized into fixed-size independent record blocks 20 | //! for efficient parallel processing. 21 | //! 22 | //! * **Compression**: Optional ZSTD compression of individual blocks balances storage 23 | //! efficiency with processing speed. 24 | //! 25 | //! * **Paired-end support**: Native support for paired sequences without needing multiple files. 26 | //! 27 | //! * **Multi-bit encoding**: Support for 2-bit and 4-bit nucleotide encodings. 28 | //! 29 | //! * **Embedded index**: Self-contained files with embedded index data for efficient random access. 30 | //! 31 | //! ## File Structure 32 | //! 33 | //! A VBINSEQ file consists of a 32-byte header followed by record blocks and an embedded index: 34 | //! 35 | //! ```text 36 | //! ┌───────────────────┐ 37 | //! │ File Header │ 32 bytes 38 | //! ├───────────────────┤ 39 | //! │ Block Header │ 32 bytes 40 | //! ├───────────────────┤ 41 | //! │ │ 42 | //! │ Block Records │ Variable size 43 | //! │ │ 44 | //! ├───────────────────┤ 45 | //! │ ... │ More blocks 46 | //! ├───────────────────┤ 47 | //! │ Compressed Index │ Variable size 48 | //! ├───────────────────┤ 49 | //! │ Index Size │ 8 bytes (u64) 50 | //! ├───────────────────┤ 51 | //! │ Index End Magic │ 8 bytes 52 | //! └───────────────────┘ 53 | //! ``` 54 | //! 55 | //! ## Record Format 56 | //! 57 | //! Each record contains the following fields in order: 58 | //! 59 | //! * Flag field (8 bytes) 60 | //! * Primary sequence length (8 bytes) 61 | //! * Extended sequence length (8 bytes, 0 if not paired) 62 | //! * Primary sequence data (2-bit or 4-bit encoded) 63 | //! * Extended sequence data (optional, for paired-end) 64 | //! * Primary quality scores (optional, if `qual` flag set) 65 | //! 
* Extended quality scores (optional, if paired and `qual` flag set) 66 | //! * Primary header length (8 bytes, if `headers` flag set) 67 | //! * Primary header data (UTF-8 string, if `headers` flag set) 68 | //! * Extended header length (8 bytes, if paired and `headers` flag set) 69 | //! * Extended header data (UTF-8 string, if paired and `headers` flag set) 70 | //! 71 | //! ## Recent Format Changes (v0.7.0+) 72 | //! 73 | //! * **Embedded Index**: Index data is now stored within the VBQ file itself, eliminating 74 | //! separate `.vqi` files and improving portability. 75 | //! * **Headers Support**: Optional sequence identifiers can be stored with each record. 76 | //! * **Extended Capacity**: u64 indexing supports files with more than 4 billion records. 77 | //! * **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. 78 | //! 79 | //! ## Performance Characteristics 80 | //! 81 | //! VBINSEQ is designed for high-throughput parallel processing: 82 | //! 83 | //! * Independent blocks enable true parallel processing without synchronization 84 | //! * Memory-mapped access provides efficient I/O 85 | //! * Embedded index enables fast random access without auxiliary files 86 | //! * Multi-bit encoding (2-bit/4-bit) optimizes storage for different use cases 87 | //! * Optional ZSTD compression reduces file size with minimal performance impact 88 | //! 89 | //! ## Usage Example 90 | //! 91 | //! ``` 92 | //! use std::fs::File; 93 | //! use std::io::BufWriter; 94 | //! use binseq::vbq::{VBinseqHeaderBuilder, VBinseqWriterBuilder, MmapReader}; 95 | //! use binseq::BinseqRecord; 96 | //! 97 | //! /* 98 | //! WRITING 99 | //! */ 100 | //! 101 | //! // Create a header for sequences with quality scores and headers 102 | //! let header = VBinseqHeaderBuilder::new() 103 | //! .qual(true) 104 | //! .compressed(true) 105 | //! .headers(true) 106 | //! .build(); 107 | //! 108 | //! // Create a writer 109 | //! 
let file = File::create("example.vbq").unwrap(); 110 | //! let mut writer = VBinseqWriterBuilder::default() 111 | //! .header(header) 112 | //! .build(BufWriter::new(file)) 113 | //! .unwrap(); 114 | //! 115 | //! // Write a sequence with quality scores and header 116 | //! let sequence = b"ACGTACGT"; 117 | //! let quality = b"IIIIFFFF"; 118 | //! let header_str = b"sequence_001"; 119 | //! writer.write_record(None, Some(header_str), sequence, Some(quality)).unwrap(); 120 | //! writer.finish().unwrap(); 121 | //! 122 | //! /* 123 | //! READING 124 | //! */ 125 | //! 126 | //! // Read the sequences back 127 | //! let mut reader = MmapReader::new("example.vbq").unwrap(); 128 | //! let mut block = reader.new_block(); 129 | //! 130 | //! // Process blocks one at a time 131 | //! let mut seq_buffer = Vec::new(); 132 | //! while reader.read_block_into(&mut block).unwrap() { 133 | //! for record in block.iter() { 134 | //! record.decode_s(&mut seq_buffer).unwrap(); 135 | //! let header = record.sheader(); 136 | //! println!("Header: {}", std::str::from_utf8(header).unwrap()); 137 | //! println!("Sequence: {}", std::str::from_utf8(&seq_buffer).unwrap()); 138 | //! println!("Quality: {}", std::str::from_utf8(record.squal()).unwrap()); 139 | //! seq_buffer.clear(); 140 | //! } 141 | //! } 142 | //! # std::fs::remove_file("example.vbq").unwrap_or(()); 143 | //! 
``` 144 | 145 | mod header; 146 | mod index; 147 | mod reader; 148 | mod writer; 149 | 150 | pub use header::{BlockHeader, VBinseqHeader, VBinseqHeaderBuilder}; 151 | pub use index::{BlockIndex, BlockRange}; 152 | pub use reader::{MmapReader, RecordBlock, RecordBlockIter, RefRecord}; 153 | pub use writer::{VBinseqWriter, VBinseqWriterBuilder}; 154 | -------------------------------------------------------------------------------- /examples/parallel_range.rs: -------------------------------------------------------------------------------- 1 | use binseq::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader, Result}; 2 | use std::sync::atomic::{AtomicUsize, Ordering}; 3 | use std::sync::Arc; 4 | 5 | #[derive(Clone)] 6 | struct RangeProcessor { 7 | counter: Arc, 8 | tid: Option, 9 | range_start: usize, 10 | range_end: usize, 11 | } 12 | 13 | impl RangeProcessor { 14 | fn new(range_start: usize, range_end: usize) -> Self { 15 | Self { 16 | counter: Arc::new(AtomicUsize::new(0)), 17 | tid: None, 18 | range_start, 19 | range_end, 20 | } 21 | } 22 | 23 | fn count(&self) -> usize { 24 | self.counter.load(Ordering::Relaxed) 25 | } 26 | } 27 | 28 | impl ParallelProcessor for RangeProcessor { 29 | fn process_record(&mut self, record: R) -> Result<()> { 30 | let count = self.counter.fetch_add(1, Ordering::Relaxed); 31 | 32 | // Print progress every 10,000 records 33 | if count % 10_000 == 0 { 34 | if let Some(tid) = self.tid { 35 | println!( 36 | "Thread {}: Processed {} records (Range: {}-{}, Index: {}, Len: {})", 37 | tid, 38 | count + 1, 39 | self.range_start, 40 | self.range_end, 41 | record.index(), 42 | record.sseq().len(), 43 | ); 44 | } 45 | } 46 | 47 | Ok(()) 48 | } 49 | 50 | fn set_tid(&mut self, tid: usize) { 51 | self.tid = Some(tid); 52 | } 53 | 54 | fn get_tid(&self) -> Option { 55 | self.tid 56 | } 57 | 58 | fn on_batch_complete(&mut self) -> Result<()> { 59 | if let Some(tid) = self.tid { 60 | println!("Thread {tid} completed batch processing"); 61 | } 
62 | Ok(()) 63 | } 64 | } 65 | 66 | fn main() -> Result<()> { 67 | let args: Vec = std::env::args().collect(); 68 | if args.len() < 2 { 69 | eprintln!( 70 | "Usage: {} [num_threads] [start] [end]", 71 | args[0] 72 | ); 73 | eprintln!("Example: {} data/subset.bq 4 1000 5000", args[0]); 74 | std::process::exit(1); 75 | } 76 | 77 | let file_path = &args[1]; 78 | let num_threads = args 79 | .get(2) 80 | .unwrap_or(&"4".to_string()) 81 | .parse::() 82 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))?; 83 | 84 | // Create reader to get total record count 85 | let reader = BinseqReader::new(file_path)?; 86 | let total_records = reader.num_records()?; 87 | 88 | println!("File: {file_path}"); 89 | println!("Total records in file: {total_records}"); 90 | 91 | // Parse range arguments or use defaults 92 | let start = args 93 | .get(3) 94 | .map(|s| s.parse::()) 95 | .transpose() 96 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))? 97 | .unwrap_or(0); 98 | let end = args 99 | .get(4) 100 | .map(|s| s.parse::()) 101 | .transpose() 102 | .map_err(|e| binseq::Error::from(anyhow::Error::from(e)))? 
103 | .unwrap_or(total_records.min(10_000)); // Default to first 10k records 104 | 105 | // Validate range 106 | if start >= total_records { 107 | eprintln!("Error: Start index {start} is >= total records {total_records}"); 108 | std::process::exit(1); 109 | } 110 | if end > total_records { 111 | eprintln!( 112 | "Warning: End index {end} is > total records {total_records}, clamping to {total_records}" 113 | ); 114 | } 115 | let end = end.min(total_records); 116 | 117 | if start >= end { 118 | eprintln!("Error: Start index {start} must be < end index {end}"); 119 | std::process::exit(1); 120 | } 121 | 122 | println!( 123 | "Processing range: {} to {} ({} records)", 124 | start, 125 | end, 126 | end - start 127 | ); 128 | println!("Using {num_threads} threads"); 129 | println!(); 130 | 131 | // Demonstrate processing the full file 132 | println!("=== Processing full file ==="); 133 | let reader_full = BinseqReader::new(file_path)?; 134 | let processor_full = RangeProcessor::new(0, total_records); 135 | let start_time = std::time::Instant::now(); 136 | 137 | reader_full.process_parallel(processor_full.clone(), num_threads)?; 138 | 139 | let elapsed_full = start_time.elapsed(); 140 | println!("Full file processing completed!"); 141 | println!("Records processed: {}", processor_full.count()); 142 | println!("Time taken: {elapsed_full:.2?}"); 143 | println!(); 144 | 145 | // Demonstrate processing a specific range 146 | println!("=== Processing specific range ==="); 147 | let reader_range = BinseqReader::new(file_path)?; 148 | let processor_range = RangeProcessor::new(start, end); 149 | let start_time = std::time::Instant::now(); 150 | 151 | reader_range.process_parallel_range(processor_range.clone(), num_threads, start..end)?; 152 | 153 | let elapsed_range = start_time.elapsed(); 154 | println!("Range processing completed!"); 155 | println!("Records processed: {}", processor_range.count()); 156 | println!("Expected records: {}", end - start); 157 | println!("Time 
taken: {elapsed_range:.2?}"); 158 | 159 | // Compare performance 160 | if processor_range.count() > 0 && processor_full.count() > 0 { 161 | let full_rate = processor_full.count() as f64 / elapsed_full.as_secs_f64(); 162 | let range_rate = processor_range.count() as f64 / elapsed_range.as_secs_f64(); 163 | println!(); 164 | println!("=== Performance Comparison ==="); 165 | println!("Full file rate: {full_rate:.0} records/sec"); 166 | println!("Range rate: {range_rate:.0} records/sec"); 167 | 168 | if range_rate > full_rate { 169 | println!( 170 | "Range processing was {:.1}x faster per record", 171 | range_rate / full_rate 172 | ); 173 | } else { 174 | println!( 175 | "Full file processing was {:.1}x faster per record", 176 | full_rate / range_rate 177 | ); 178 | } 179 | } 180 | 181 | Ok(()) 182 | } 183 | -------------------------------------------------------------------------------- /examples/read_write.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::{BufReader, BufWriter}, 4 | }; 5 | 6 | use anyhow::{bail, Result}; 7 | use binseq::{ 8 | bq::{BinseqHeaderBuilder, BinseqWriterBuilder, MmapReader}, 9 | BinseqRecord, 10 | }; 11 | use seq_io::fastq::{Reader, Record}; 12 | 13 | fn read_write_single(fastq_path: &str, binseq_path: &str, seq_size: usize) -> Result<()> { 14 | // Open the input FASTQ file 15 | let (in_handle, _comp) = niffler::from_path(fastq_path)?; 16 | 17 | // Open the output file 18 | let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; 19 | let out_handle = File::create(binseq_path).map(BufWriter::new)?; 20 | let mut writer = BinseqWriterBuilder::default() 21 | .header(header) 22 | .build(out_handle)?; 23 | 24 | let mut all_sequences = Vec::new(); 25 | 26 | // Write the binary sequence 27 | let mut reader = Reader::new(in_handle); 28 | let mut num_records_write = 0; 29 | let mut skipped_records = 0; 30 | while let Some(record) = reader.next() { 31 | let record 
= record?;
        let seq = record.seq();
        if writer.write_record(Some(0), seq)? {
            num_records_write += 1;
            all_sequences.push(seq.to_vec());
        } else {
            // Record was skipped by the writer's invalid-nucleotide policy.
            skipped_records += 1;
        }
    }
    writer.flush()?;
    eprintln!("Finished writing {num_records_write} records to path: {binseq_path}");
    eprintln!("Skipped {skipped_records} records");

    // Read the binary sequence
    let reader = MmapReader::new(binseq_path)?;
    let mut num_records_read = 0;
    let mut sbuf = Vec::new();
    for idx in 0..reader.num_records() {
        let record = reader.get(idx)?;
        record.decode_s(&mut sbuf)?;

        // Check if the decoded sequence matches the original
        let buf_str = std::str::from_utf8(&sbuf)?;
        let seq_str = std::str::from_utf8(&all_sequences[num_records_read])?;
        assert_eq!(buf_str, seq_str);

        num_records_read += 1;
        sbuf.clear();
    }
    eprintln!("Finished reading {num_records_read} records (mmap)");
    eprintln!(
        "Difference in total records: {}",
        num_records_write - num_records_read
    );
    eprintln!("Number of records in vec: {}", all_sequences.len());

    Ok(())
}

/// Round-trip a paired-end FASTQ pair through the BQ format, verifying that
/// both mates of every decoded record match the original inputs.
fn read_write_paired(
    fastq_path_r1: &str,
    fastq_path_r2: &str,
    binseq_path: &str,
    seq_size_r1: usize,
    seq_size_r2: usize,
) -> Result<()> {
    // Open the input FASTQ files

    let in_buf_r1 = File::open(fastq_path_r1).map(BufReader::new)?;
    let in_buf_r2 = File::open(fastq_path_r2).map(BufReader::new)?;

    let (in_handle_r1, _comp) = niffler::get_reader(Box::new(in_buf_r1))?;
    let (in_handle_r2, _comp) = niffler::get_reader(Box::new(in_buf_r2))?;

    // Create the header
    let header = BinseqHeaderBuilder::new()
        .slen(seq_size_r1 as u32)
        .xlen(seq_size_r2 as u32)
        .build()?;

    // Open the output handle
    let out_handle = File::create(binseq_path).map(BufWriter::new)?;

    // Create the writer
    let mut writer = BinseqWriterBuilder::default()
        .header(header)
        .build(out_handle)?;

    // Open the FASTQ readers
    let mut reader_r1 = Reader::new(in_handle_r1);
    let mut reader_r2 = Reader::new(in_handle_r2);

    // Write the binary sequence
    let mut num_records = 0;
    let mut num_skipped = 0;

    let mut r1_storage = Vec::new();
    let mut r2_storage = Vec::new();

    loop {
        // Both readers must advance in lock-step; a one-sided EOF is an error.
        let (record_r1, record_r2) = match (reader_r1.next(), reader_r2.next()) {
            (Some(r1), Some(r2)) => (r1?, r2?),
            (None, None) => break,
            _ => bail!("Mismatched number of records in R1 and R2"),
        };

        let seq_r1 = record_r1.seq();
        let seq_r2 = record_r2.seq();

        if writer.write_paired_record(Some(0), seq_r1, seq_r2)? {
            num_records += 1;
            r1_storage.push(seq_r1.to_vec());
            r2_storage.push(seq_r2.to_vec());
        } else {
            num_skipped += 1;
        }
    }
    writer.flush()?;
    eprintln!("Finished writing {num_records} records");
    eprintln!("Skipped {num_skipped} records");

    // Read the binary sequence with mmap
    let reader = MmapReader::new(binseq_path)?;

    let mut n_processed = 0;
    let mut sbuf = Vec::new();
    let mut xbuf = Vec::new();

    for idx in 0..reader.num_records() {
        let record = reader.get(idx)?;

        record.decode_s(&mut sbuf)?;
        record.decode_x(&mut xbuf)?;

        // Check if the decoded sequence matches the original
        let s_str = std::str::from_utf8(&sbuf)?;
        let x_str = std::str::from_utf8(&xbuf)?;

        let s_exp = std::str::from_utf8(&r1_storage[n_processed])?;
        let x_exp = std::str::from_utf8(&r2_storage[n_processed])?;

        assert_eq!(s_str, s_exp);
        assert_eq!(x_str, x_exp);

        n_processed += 1;
        sbuf.clear();
        xbuf.clear();
    }
    eprintln!("Finished reading {n_processed} records");

    Ok(())
}

fn main() -> Result<()> {
    // INPUT ARGUMENTS
    let fastq_path_r1 = "./data/subset_R1.fastq.gz"; // exists
    let fastq_path_r2 = "./data/subset_R2.fastq.gz"; // exists
    let binseq_path_r1 = "./data/subset_R1.bq"; // created
    let binseq_path_r2 = "./data/subset_R2.bq"; // created
    let binseq_path = "./data/subset.bq"; // created
    let seq_size_r1 = 28; // a priori known
    let seq_size_r2 = 90; // a priori known

    read_write_single(fastq_path_r1, binseq_path_r1, seq_size_r1)?;
    read_write_single(fastq_path_r2, binseq_path_r2, seq_size_r2)?;
    read_write_paired(
        fastq_path_r1,
        fastq_path_r2,
        binseq_path,
        seq_size_r1,
        seq_size_r2,
    )?;

    Ok(())
}
-------------------------------------------------------------------------------- /src/parallel.rs: --------------------------------------------------------------------------------
use std::ops::Range;
use std::path::Path;

use crate::{bq, error::ExtensionError, vbq, BinseqRecord, Result};

/// An enum abstraction for BINSEQ readers that can process records in parallel
///
/// This is a convenience enum that can be used for general workflows where the
/// distinction between BQ and VBQ readers is not important.
///
/// For more specialized workflows see [`bq::MmapReader`] and [`vbq::MmapReader`].
12 | pub enum BinseqReader { 13 | Bq(bq::MmapReader), 14 | Vbq(vbq::MmapReader), 15 | } 16 | impl BinseqReader { 17 | pub fn new(path: &str) -> Result { 18 | let pathbuf = Path::new(path); 19 | match pathbuf.extension() { 20 | Some(ext) => match ext.to_str() { 21 | Some("bq") => Ok(Self::Bq(bq::MmapReader::new(path)?)), 22 | Some("vbq") => Ok(Self::Vbq(vbq::MmapReader::new(path)?)), 23 | _ => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), 24 | }, 25 | None => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), 26 | } 27 | } 28 | 29 | /// Set whether to decode sequences at once in each block 30 | /// 31 | /// Note: This setting applies to VBQ readers only. 32 | pub fn set_decode_block(&mut self, decode_block: bool) { 33 | match self { 34 | Self::Bq(_) => { 35 | // no-op 36 | } 37 | Self::Vbq(reader) => reader.set_decode_block(decode_block), 38 | } 39 | } 40 | 41 | #[must_use] 42 | pub fn is_paired(&self) -> bool { 43 | match self { 44 | Self::Bq(reader) => reader.is_paired(), 45 | Self::Vbq(reader) => reader.is_paired(), 46 | } 47 | } 48 | 49 | pub fn num_records(&self) -> Result { 50 | match self { 51 | Self::Bq(reader) => Ok(reader.num_records()), 52 | Self::Vbq(reader) => reader.num_records(), 53 | } 54 | } 55 | 56 | /// Process records in parallel within a specified range 57 | /// 58 | /// This method allows parallel processing of a subset of records within the file, 59 | /// defined by a start and end index. The range is distributed across the specified 60 | /// number of threads. 
61 | /// 62 | /// # Arguments 63 | /// 64 | /// * `processor` - The processor to use for each record 65 | /// * `num_threads` - The number of threads to spawn 66 | /// * `start` - The starting record index (inclusive) 67 | /// * `end` - The ending record index (exclusive) 68 | /// 69 | /// # Returns 70 | /// 71 | /// * `Ok(())` - If all records were processed successfully 72 | /// * `Err(Error)` - If an error occurred during processing 73 | pub fn process_parallel_range( 74 | self, 75 | processor: P, 76 | num_threads: usize, 77 | range: Range, 78 | ) -> Result<()> { 79 | match self { 80 | Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, range), 81 | Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), 82 | } 83 | } 84 | } 85 | impl ParallelReader for BinseqReader { 86 | fn process_parallel( 87 | self, 88 | processor: P, 89 | num_threads: usize, 90 | ) -> Result<()> { 91 | let num_records = self.num_records()?; 92 | self.process_parallel_range(processor, num_threads, 0..num_records) 93 | } 94 | 95 | fn process_parallel_range( 96 | self, 97 | processor: P, 98 | num_threads: usize, 99 | range: Range, 100 | ) -> Result<()> { 101 | match self { 102 | Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, range), 103 | Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), 104 | } 105 | } 106 | } 107 | 108 | /// Trait for BINSEQ readers that can process records in parallel 109 | /// 110 | /// This is implemented by the **reader** not by the **processor**. 111 | /// For the **processor**, see the [`ParallelProcessor`] trait. 112 | pub trait ParallelReader { 113 | fn process_parallel( 114 | self, 115 | processor: P, 116 | num_threads: usize, 117 | ) -> Result<()>; 118 | 119 | /// Process records in parallel within a specified range 120 | /// 121 | /// This method allows parallel processing of a subset of records within the file, 122 | /// defined by a start and end index. 
The range is distributed across the specified 123 | /// number of threads. 124 | /// 125 | /// # Arguments 126 | /// 127 | /// * `processor` - The processor to use for each record 128 | /// * `num_threads` - The number of threads to spawn 129 | /// * `range` - The range of record indices to process 130 | /// 131 | /// # Returns 132 | /// 133 | /// * `Ok(())` - If all records were processed successfully 134 | /// * `Err(Error)` - If an error occurred during processing 135 | fn process_parallel_range( 136 | self, 137 | processor: P, 138 | num_threads: usize, 139 | range: Range, 140 | ) -> Result<()>; 141 | } 142 | 143 | /// Trait for types that can process records in parallel. 144 | /// 145 | /// This is implemented by the **processor** not by the **reader**. 146 | /// For the **reader**, see the [`ParallelReader`] trait. 147 | pub trait ParallelProcessor: Send + Clone { 148 | /// Process a single record 149 | fn process_record(&mut self, record: R) -> Result<()>; 150 | 151 | /// Called when a thread finishes processing its batch 152 | /// Default implementation does nothing 153 | #[allow(unused_variables)] 154 | fn on_batch_complete(&mut self) -> Result<()> { 155 | Ok(()) 156 | } 157 | 158 | /// Set the thread ID for this processor 159 | /// 160 | /// Each thread should call this method with its own unique ID. 161 | #[allow(unused_variables)] 162 | fn set_tid(&mut self, _tid: usize) { 163 | // Default implementation does nothing 164 | } 165 | 166 | /// Get the thread ID for this processor 167 | fn get_tid(&self) -> Option { 168 | None 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/policy.rs: -------------------------------------------------------------------------------- 1 | //! Nucleotide sequence validation and correction policies 2 | //! 3 | //! This module provides policies for handling invalid nucleotides in sequences 4 | //! during encoding operations. 
Different policies allow for ignoring, rejecting, 5 | //! or correcting sequences with invalid nucleotides. 6 | 7 | use rand::Rng; 8 | 9 | use crate::error::{Result, WriteError}; 10 | 11 | /// A global seed for the random number generator used in randomized policies 12 | /// 13 | /// This seed ensures reproducible behavior when using the `RandomDraw` policy 14 | /// across different runs of the program. 15 | pub const RNG_SEED: u64 = 42; 16 | 17 | /// Policy for handling invalid nucleotide sequences during encoding 18 | /// 19 | /// When encoding sequences into binary format, non-standard nucleotides (anything 20 | /// other than A, C, G, or T) may be encountered. This enum defines different 21 | /// strategies for handling such invalid nucleotides. 22 | /// 23 | /// The default policy is `IgnoreSequence`, which skips sequences containing 24 | /// invalid nucleotides. 25 | #[derive(Debug, Clone, Copy, Default)] 26 | pub enum Policy { 27 | /// Skip sequences containing invalid nucleotides (default policy) 28 | #[default] 29 | IgnoreSequence, 30 | 31 | /// Fail with an error when invalid nucleotides are encountered 32 | BreakOnInvalid, 33 | 34 | /// Replace invalid nucleotides with randomly chosen valid nucleotides (A, C, G, or T) 35 | RandomDraw, 36 | 37 | /// Replace all invalid nucleotides with 'A' 38 | SetToA, 39 | 40 | /// Replace all invalid nucleotides with 'C' 41 | SetToC, 42 | 43 | /// Replace all invalid nucleotides with 'G' 44 | SetToG, 45 | 46 | /// Replace all invalid nucleotides with 'T' 47 | SetToT, 48 | } 49 | impl Policy { 50 | /// Helper method to replace invalid nucleotides with a specific nucleotide 51 | /// 52 | /// This internal method processes a sequence and replaces any non-standard 53 | /// nucleotides (anything other than A, C, G, or T) with the specified value. 
54 | /// 55 | /// # Arguments 56 | /// 57 | /// * `sequence` - The input sequence to process 58 | /// * `val` - The replacement nucleotide (should be one of A, C, G, or T) 59 | /// * `ibuf` - The output buffer to store the processed sequence 60 | fn fill_with_known(sequence: &[u8], val: u8, ibuf: &mut Vec) { 61 | for &n in sequence { 62 | ibuf.push(match n { 63 | b'A' | b'C' | b'G' | b'T' => n, 64 | _ => val, 65 | }); 66 | } 67 | } 68 | 69 | /// Helper method to replace invalid nucleotides with random valid nucleotides 70 | /// 71 | /// This internal method processes a sequence and replaces any non-standard 72 | /// nucleotides with randomly chosen valid nucleotides (A, C, G, or T). 73 | /// 74 | /// # Arguments 75 | /// 76 | /// * `sequence` - The input sequence to process 77 | /// * `rng` - The random number generator to use for selecting replacement nucleotides 78 | /// * `ibuf` - The output buffer to store the processed sequence 79 | /// 80 | /// # Type Parameters 81 | /// 82 | /// * `R` - A type that implements the `Rng` trait from the `rand` crate 83 | fn fill_with_random(sequence: &[u8], rng: &mut R, ibuf: &mut Vec) { 84 | for &n in sequence { 85 | ibuf.push(match n { 86 | b'A' | b'C' | b'G' | b'T' => n, 87 | _ => match rng.random_range(0..4) { 88 | 0 => b'A', 89 | 1 => b'C', 90 | 2 => b'G', 91 | 3 => b'T', 92 | _ => unreachable!(), 93 | }, 94 | }); 95 | } 96 | } 97 | 98 | /// Process a sequence according to the selected policy for handling invalid nucleotides 99 | /// 100 | /// This method applies the policy to the given sequence, handling any invalid nucleotides 101 | /// according to the policy's rules. It first clears the input buffer to ensure that it is empty, 102 | /// then processes the sequence accordingly. 
103 | /// 104 | /// # Arguments 105 | /// 106 | /// * `sequence` - The nucleotide sequence to be processed 107 | /// * `ibuf` - The buffer to store the processed sequence (will be cleared first) 108 | /// * `rng` - The random number generator (used only with the `RandomDraw` policy) 109 | /// 110 | /// # Returns 111 | /// 112 | /// * `Ok(true)` - If the sequence was processed and should be encoded 113 | /// * `Ok(false)` - If the sequence should be skipped (for `IgnoreSequence` policy) 114 | /// * `Err(Error)` - If an error occurred (for `BreakOnInvalid` policy when invalid nucleotides are found) 115 | /// 116 | /// # Type Parameters 117 | /// 118 | /// * `R` - A type that implements the `Rng` trait from the `rand` crate 119 | /// 120 | /// # Examples 121 | /// 122 | /// ``` 123 | /// # use binseq::{Policy, Result}; 124 | /// # use rand::thread_rng; 125 | /// # fn main() -> Result<()> { 126 | /// let policy = Policy::SetToA; 127 | /// let sequence = b"ACGTNX"; 128 | /// let mut output = Vec::new(); 129 | /// let mut rng = thread_rng(); 130 | /// 131 | /// let should_process = policy.handle(sequence, &mut output, &mut rng)?; 132 | /// 133 | /// assert!(should_process); 134 | /// assert_eq!(output, b"ACGTAA"); 135 | /// # Ok(()) 136 | /// # } 137 | /// ``` 138 | pub fn handle(&self, sequence: &[u8], ibuf: &mut Vec, rng: &mut R) -> Result { 139 | // First clears the input buffer to ensure that it is empty. 140 | ibuf.clear(); 141 | 142 | // Returns a boolean indicating whether the sequence should be processed further. 
143 | match self { 144 | Self::IgnoreSequence => Ok(false), 145 | Self::BreakOnInvalid => { 146 | let seq_str = std::str::from_utf8(sequence)?.to_string(); 147 | Err(WriteError::InvalidNucleotideSequence(seq_str).into()) 148 | } 149 | Self::RandomDraw => { 150 | Self::fill_with_random(sequence, rng, ibuf); 151 | Ok(true) 152 | } 153 | Self::SetToA => { 154 | Self::fill_with_known(sequence, b'A', ibuf); 155 | Ok(true) 156 | } 157 | Self::SetToC => { 158 | Self::fill_with_known(sequence, b'C', ibuf); 159 | Ok(true) 160 | } 161 | Self::SetToG => { 162 | Self::fill_with_known(sequence, b'G', ibuf); 163 | Ok(true) 164 | } 165 | Self::SetToT => { 166 | Self::fill_with_known(sequence, b'T', ibuf); 167 | Ok(true) 168 | } 169 | } 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/bq/mod.rs: -------------------------------------------------------------------------------- 1 | //! # bq 2 | //! 3 | //! *.bq files are BINSEQ variants for **fixed-length** records and **does not support quality scores**. 4 | //! 5 | //! For variable-length records and optional quality scores use the [`vbq`](crate::vbq) module. 6 | //! 7 | //! This module contains the utilities for reading, writing, and interacting with BINSEQ files. 8 | //! 9 | //! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). 10 | //! 11 | //! ## Usage 12 | //! 13 | //! ### Reading 14 | //! ```rust 15 | //! use binseq::{bq, BinseqRecord}; 16 | //! use rand::{thread_rng, Rng}; 17 | //! 18 | //! let path = "./data/subset.bq"; 19 | //! let reader = bq::MmapReader::new(path).unwrap(); 20 | //! 21 | //! // We can easily determine the number of records in the file 22 | //! let num_records = reader.num_records(); 23 | //! 24 | //! // We have random access to any record within the range 25 | //! let random_index = thread_rng().gen_range(0..num_records); 26 | //! 
let record = reader.get(random_index).unwrap(); 27 | //! 28 | //! // We can easily decode the (2bit)encoded sequence back to a sequence of bytes 29 | //! let mut sbuf = Vec::new(); 30 | //! let mut xbuf = Vec::new(); 31 | //! 32 | //! record.decode_s(&mut sbuf); 33 | //! if record.is_paired() { 34 | //! record.decode_x(&mut xbuf); 35 | //! } 36 | //! ``` 37 | //! 38 | //! ### Writing 39 | //! 40 | //! #### Writing unpaired sequences 41 | //! 42 | //! ```rust 43 | //! use binseq::bq; 44 | //! use std::fs::File; 45 | //! 46 | //! // Define a path for the output file 47 | //! let path = "./data/some_output.bq"; 48 | //! 49 | //! // Create the file handle 50 | //! let output_handle = File::create(path).unwrap(); 51 | //! 52 | //! // Initialize our BINSEQ header (64 bp, only primary) 53 | //! let header = bq::BinseqHeaderBuilder::new().slen(64).build().unwrap(); 54 | //! 55 | //! // Initialize our BINSEQ writer 56 | //! let mut writer = bq::BinseqWriterBuilder::default() 57 | //! .header(header) 58 | //! .build(output_handle) 59 | //! .unwrap(); 60 | //! 61 | //! // Generate a random sequence 62 | //! let seq = [b'A'; 64]; 63 | //! let flag = 0; 64 | //! 65 | //! // Write the sequence to the file 66 | //! writer.write_record(Some(flag), &seq).unwrap(); 67 | //! 68 | //! // Close the file 69 | //! writer.flush().unwrap(); 70 | //! 71 | //! // Remove the file created 72 | //! std::fs::remove_file(path).unwrap(); 73 | //! ``` 74 | //! 75 | //! #### Writing paired sequences 76 | //! 77 | //! ```rust 78 | //! use binseq::bq; 79 | //! use std::fs::File; 80 | //! 81 | //! // Define a path for the output file 82 | //! let path = "./data/some_output.bq"; 83 | //! 84 | //! // Create the file handle 85 | //! let output_handle = File::create(path).unwrap(); 86 | //! 87 | //! // Initialize our BINSEQ header (64 bp and 128bp) 88 | //! let header = bq::BinseqHeaderBuilder::new().slen(64).xlen(128).build().unwrap(); 89 | //! 90 | //! // Initialize our BINSEQ writer 91 | //! 
let mut writer = bq::BinseqWriterBuilder::default() 92 | //! .header(header) 93 | //! .build(output_handle) 94 | //! .unwrap(); 95 | //! 96 | //! // Generate a random sequence 97 | //! let primary = [b'A'; 64]; 98 | //! let secondary = [b'C'; 128]; 99 | //! let flag = 0; 100 | //! 101 | //! // Write the sequence to the file 102 | //! writer.write_paired_record(Some(flag), &primary, &secondary).unwrap(); 103 | //! 104 | //! // Close the file 105 | //! writer.flush().unwrap(); 106 | //! 107 | //! // Remove the file created 108 | //! std::fs::remove_file(path).unwrap(); 109 | //! ``` 110 | //! 111 | //! # Example: Streaming Access 112 | //! 113 | //! ``` 114 | //! use binseq::{Policy, Result, BinseqRecord}; 115 | //! use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; 116 | //! use std::io::{BufReader, Cursor}; 117 | //! 118 | //! fn main() -> Result<()> { 119 | //! // Create a header for sequences of length 100 120 | //! let header = BinseqHeaderBuilder::new().slen(100).build()?; 121 | //! 122 | //! // Create a stream writer 123 | //! let mut writer = StreamWriterBuilder::default() 124 | //! .header(header) 125 | //! .buffer_capacity(8192) 126 | //! .build(Cursor::new(Vec::new()))?; 127 | //! 128 | //! // Write sequences 129 | //! let sequence = b"ACGT".repeat(25); // 100 nucleotides 130 | //! writer.write_record(Some(0), &sequence)?; 131 | //! 132 | //! // Get the inner buffer 133 | //! let buffer = writer.into_inner()?; 134 | //! let data = buffer.into_inner(); 135 | //! 136 | //! // Create a stream reader 137 | //! let mut reader = StreamReader::new(BufReader::new(Cursor::new(data))); 138 | //! 139 | //! // Process records as they arrive 140 | //! while let Some(record) = reader.next_record() { 141 | //! // Process each record 142 | //! let record = record?; 143 | //! let flag = record.flag(); 144 | //! } 145 | //! 146 | //! Ok(()) 147 | //! } 148 | //! ``` 149 | //! 150 | //! ## BQ file format 151 | //! 152 | //! 
A BINSEQ file consists of two sections: 153 | //! 154 | //! 1. Fixed-size header (32 bytes) 155 | //! 2. Record data section 156 | //! 157 | //! ### Header Format (32 bytes total) 158 | //! 159 | //! | Offset | Size (bytes) | Name | Description | Type | 160 | //! | ------ | ------------ | -------- | ---------------------------- | ------ | 161 | //! | 0 | 4 | magic | Magic number (0x42534551) | uint32 | 162 | //! | 4 | 1 | format | Format version (currently 2) | uint8 | 163 | //! | 5 | 4 | slen | Sequence length (primary) | uint32 | 164 | //! | 9 | 4 | xlen | Sequence length (secondary) | uint32 | 165 | //! | 13 | 19 | reserved | Reserved for future use | bytes | 166 | //! 167 | //! ### Record Format 168 | //! 169 | //! Each record consists of a: 170 | //! 171 | //! 1. Flag field (8 bytes, uint64) 172 | //! 2. Sequence data (ceil(N/32) \* 8 bytes, where N is sequence length) 173 | //! 174 | //! The flag field is implementation-defined and can be used for filtering, metadata, or other purposes. The placement of the flag field at the start of each record enables efficient filtering without reading sequence data. 175 | //! 176 | //! Total record size = 8 + (ceil(N/32) \* 8) bytes, where N is sequence length 177 | //! 178 | //! ## Encoding 179 | //! 180 | //! - Each nucleotide is encoded using 2 bits: 181 | //! - A = 00 182 | //! - C = 01 183 | //! - G = 10 184 | //! - T = 11 185 | //! - Non-ATCG characters are **unsupported**. 186 | //! - Sequences are stored in Little-Endian order 187 | //! - The final u64 of sequence data is padded with zeros if the sequence length is not divisible by 32 188 | //! 189 | //! See [`bitnuc`] for 2bit implementation details. 190 | //! 191 | //! ## bq implementation Notes 192 | //! 193 | //! - Sequences are stored in u64 chunks, each holding up to 32 bases 194 | //! - Random access to any record can be calculated as: 195 | //! - record_size = 8 + (ceil(sequence_length/32) \* 8) 196 | //! 
- record_start = 16 + (record_index \* record_size) 197 | //! - Total number of records can be calculated as: (file_size - 16) / record_size 198 | //! - Flag field placement allows for efficient filtering strategies: 199 | //! - Records can be skipped based on flag values without reading sequence data 200 | //! - Flag checks can be vectorized for parallel processing 201 | //! - Memory access patterns are predictable for better cache utilization 202 | //! 203 | //! ## Example Storage Requirements 204 | //! 205 | //! Common sequence lengths: 206 | //! 207 | //! - 32bp reads: 208 | //! - Sequence: 1 \* 8 = 8 bytes (fits in one u64) 209 | //! - Flag: 8 bytes 210 | //! - Total per record: 16 bytes 211 | //! - 100bp reads: 212 | //! - Sequence: 4 \* 8 = 32 bytes (requires four u64s) 213 | //! - Flag: 8 bytes 214 | //! - Total per record: 40 bytes 215 | //! - 150bp reads: 216 | //! - Sequence: 5 \* 8 = 40 bytes (requires five u64s) 217 | //! - Flag: 8 bytes 218 | //! - Total per record: 48 bytes 219 | //! 220 | //! ## Validation 221 | //! 222 | //! Implementations should verify: 223 | //! 224 | //! 1. Correct magic number 225 | //! 2. Compatible version number 226 | //! 3. Sequence length is greater than 0 227 | //! 4. File size minus header (32 bytes) is divisible by the record size 228 | //! 229 | //! ## Future Considerations 230 | //! 231 | //! - The 19 reserved bytes in the header allow for future format extensions 232 | //! - The 64-bit flag field provides space for implementation-specific features such as: 233 | //! - Quality score summaries 234 | //! - Filtering flags 235 | //! - Read group identifiers 236 | //! - Processing state 237 | //! 
- Count data 238 | 239 | mod header; 240 | mod reader; 241 | mod writer; 242 | 243 | pub use header::{BinseqHeader, BinseqHeaderBuilder, SIZE_HEADER}; 244 | pub use reader::{MmapReader, RefRecord, StreamReader}; 245 | pub use writer::{BinseqWriter, BinseqWriterBuilder, Encoder, StreamWriter, StreamWriterBuilder}; 246 | -------------------------------------------------------------------------------- /src/bq/header.rs: -------------------------------------------------------------------------------- 1 | //! Header module for the binseq library 2 | //! 3 | //! This module provides the header structure and functionality for binary sequence files. 4 | //! The header contains metadata about the binary sequence data, including format version, 5 | //! sequence length, and other information necessary for proper interpretation of the data. 6 | 7 | use bitnuc::BitSize; 8 | use byteorder::{ByteOrder, LittleEndian}; 9 | use std::io::{Read, Write}; 10 | 11 | use crate::error::{BuilderError, HeaderError, Result}; 12 | 13 | /// Current magic number: "BSEQ" in ASCII (in little-endian byte order) 14 | /// 15 | /// This is used to identify binary sequence files and verify file integrity. 16 | #[allow(clippy::unreadable_literal)] 17 | const MAGIC: u32 = 0x51455342; 18 | 19 | /// Current format version of the binary sequence file format 20 | /// 21 | /// This version number allows for future format changes while maintaining backward compatibility. 22 | const FORMAT: u8 = 1; 23 | 24 | /// Size of the header in bytes 25 | /// 26 | /// The header has a fixed size to ensure consistent reading and writing of binary sequence files. 27 | pub const SIZE_HEADER: usize = 32; 28 | 29 | /// Reserved bytes in the header 30 | /// 31 | /// These bytes are reserved for future use and should be set to a consistent value. 
32 | pub const RESERVED: [u8; 17] = [42; 17]; 33 | 34 | #[derive(Debug, Clone, Copy)] 35 | pub struct BinseqHeaderBuilder { 36 | slen: Option, 37 | xlen: Option, 38 | bitsize: Option, 39 | flags: Option, 40 | } 41 | impl Default for BinseqHeaderBuilder { 42 | fn default() -> Self { 43 | Self::new() 44 | } 45 | } 46 | 47 | impl BinseqHeaderBuilder { 48 | #[must_use] 49 | pub fn new() -> Self { 50 | BinseqHeaderBuilder { 51 | slen: None, 52 | xlen: None, 53 | bitsize: None, 54 | flags: None, 55 | } 56 | } 57 | #[must_use] 58 | pub fn slen(mut self, slen: u32) -> Self { 59 | self.slen = Some(slen); 60 | self 61 | } 62 | #[must_use] 63 | pub fn xlen(mut self, xlen: u32) -> Self { 64 | self.xlen = Some(xlen); 65 | self 66 | } 67 | #[must_use] 68 | pub fn bitsize(mut self, bitsize: BitSize) -> Self { 69 | self.bitsize = Some(bitsize); 70 | self 71 | } 72 | #[must_use] 73 | pub fn flags(mut self, flags: bool) -> Self { 74 | self.flags = Some(flags); 75 | self 76 | } 77 | pub fn build(self) -> Result { 78 | Ok(BinseqHeader { 79 | magic: MAGIC, 80 | format: FORMAT, 81 | slen: if let Some(slen) = self.slen { 82 | slen 83 | } else { 84 | return Err(BuilderError::MissingSlen.into()); 85 | }, 86 | xlen: self.xlen.unwrap_or(0), 87 | bits: self.bitsize.unwrap_or_default(), 88 | flags: self.flags.unwrap_or(false), 89 | reserved: RESERVED, 90 | }) 91 | } 92 | } 93 | 94 | /// Header structure for binary sequence files 95 | /// 96 | /// The `BinseqHeader` contains metadata about the binary sequence data stored in a file, 97 | /// including format information, sequence lengths, and space for future extensions. 98 | /// 99 | /// The total size of this structure is 32 bytes, with a fixed layout to ensure 100 | /// consistent reading and writing across different platforms. 
101 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 102 | pub struct BinseqHeader { 103 | /// Magic number to identify the file format 104 | /// 105 | /// 4 bytes 106 | pub magic: u32, 107 | 108 | /// Version of the file format 109 | /// 110 | /// 1 byte 111 | pub format: u8, 112 | 113 | /// Length of all sequences in the file 114 | /// 115 | /// 4 bytes 116 | pub slen: u32, 117 | 118 | /// Length of secondary sequences in the file 119 | /// 120 | /// 4 bytes 121 | pub xlen: u32, 122 | 123 | /// Number of bits per nucleotide (currently 2 or 4) 124 | /// 125 | /// 1 byte 126 | pub bits: BitSize, 127 | 128 | /// All records have a flag attribute 129 | /// 130 | /// 1 byte 131 | pub flags: bool, 132 | 133 | /// Reserve remaining bytes for future use 134 | /// 135 | /// 17 bytes 136 | pub reserved: [u8; 17], 137 | } 138 | impl BinseqHeader { 139 | /// Creates a new header with the specified sequence length 140 | /// 141 | /// This constructor initializes a standard header with the given sequence length, 142 | /// setting the magic number and format version to their default values. 143 | /// The extended sequence length (xlen) is set to 0. 144 | /// 145 | /// # Arguments 146 | /// 147 | /// * `bits` - The number of bits per nucleotide (currently 2 or 4) 148 | /// * `slen` - The length of sequences in the file 149 | /// * `flags` - The flags for the header 150 | /// 151 | /// # Returns 152 | /// 153 | /// A new `BinseqHeader` instance 154 | #[must_use] 155 | pub fn new(bits: BitSize, slen: u32, flags: bool) -> Self { 156 | Self { 157 | magic: MAGIC, 158 | format: FORMAT, 159 | slen, 160 | xlen: 0, 161 | bits, 162 | flags, 163 | reserved: RESERVED, 164 | } 165 | } 166 | 167 | /// Creates a new header with both primary and extended sequence lengths 168 | /// 169 | /// This constructor initializes a header for files that contain both primary 170 | /// and secondary sequence data, such as quality scores or annotations. 
171 | /// 172 | /// # Arguments 173 | /// 174 | /// * `bits` - The number of bits per nucleotide (currently 2 or 4) 175 | /// * `slen` - The length of primary sequences in the file 176 | /// * `xlen` - The length of secondary/extended sequences in the file 177 | /// * `flags` - The flags for the header 178 | /// 179 | /// # Returns 180 | /// 181 | /// A new `BinseqHeader` instance with extended sequence information 182 | #[must_use] 183 | pub fn new_extended(bits: BitSize, slen: u32, xlen: u32, flags: bool) -> Self { 184 | Self { 185 | magic: MAGIC, 186 | format: FORMAT, 187 | slen, 188 | xlen, 189 | bits, 190 | flags, 191 | reserved: RESERVED, 192 | } 193 | } 194 | 195 | /// Sets the bitsize of the header 196 | pub fn set_bitsize(&mut self, bits: BitSize) { 197 | self.bits = bits; 198 | } 199 | 200 | /// Checks if the file is paired 201 | #[must_use] 202 | pub fn is_paired(&self) -> bool { 203 | self.xlen > 0 204 | } 205 | 206 | /// Parses a header from a fixed-size byte array 207 | /// 208 | /// This method validates the magic number and format version before constructing 209 | /// a header instance. If validation fails, appropriate errors are returned. 
210 | /// 211 | /// # Arguments 212 | /// 213 | /// * `buffer` - A byte array of exactly `SIZE_HEADER` bytes containing the header data 214 | /// 215 | /// # Returns 216 | /// 217 | /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer 218 | /// * `Err(Error)` - If the buffer contains invalid header data 219 | /// 220 | /// # Errors 221 | /// 222 | /// Returns an error if: 223 | /// * The magic number is incorrect 224 | /// * The format version is unsupported 225 | /// * The reserved bytes are invalid 226 | pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result { 227 | let magic = LittleEndian::read_u32(&buffer[0..4]); 228 | if magic != MAGIC { 229 | return Err(HeaderError::InvalidMagicNumber(magic).into()); 230 | } 231 | let format = buffer[4]; 232 | if format != FORMAT { 233 | return Err(HeaderError::InvalidFormatVersion(format).into()); 234 | } 235 | let slen = LittleEndian::read_u32(&buffer[5..9]); 236 | let xlen = LittleEndian::read_u32(&buffer[9..13]); 237 | let bits = match buffer[13] { 238 | 0 | 2 | 42 => BitSize::Two, 239 | 4 => BitSize::Four, 240 | x => return Err(HeaderError::InvalidBitSize(x).into()), 241 | }; 242 | let flags = buffer[14] != 0; 243 | let Ok(reserved) = buffer[15..32].try_into() else { 244 | return Err(HeaderError::InvalidReservedBytes.into()); 245 | }; 246 | Ok(Self { 247 | magic, 248 | format, 249 | slen, 250 | xlen, 251 | bits, 252 | flags, 253 | reserved, 254 | }) 255 | } 256 | 257 | /// Parses a header from an arbitrarily sized buffer 258 | /// 259 | /// This method extracts the header from the beginning of a buffer that may be larger 260 | /// than the header size. It checks that the buffer is at least as large as the header 261 | /// before attempting to parse it. 
262 | /// 263 | /// # Arguments 264 | /// 265 | /// * `buffer` - A byte slice containing at least `SIZE_HEADER` bytes 266 | /// 267 | /// # Returns 268 | /// 269 | /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer 270 | /// * `Err(Error)` - If the buffer is too small or contains invalid header data 271 | /// 272 | /// # Errors 273 | /// 274 | /// Returns an error if: 275 | /// * The buffer is smaller than `SIZE_HEADER` 276 | /// * The header data is invalid (see `from_bytes` for validation details) 277 | pub fn from_buffer(buffer: &[u8]) -> Result { 278 | let mut bytes = [0u8; SIZE_HEADER]; 279 | if buffer.len() < SIZE_HEADER { 280 | return Err(HeaderError::InvalidSize(buffer.len(), SIZE_HEADER).into()); 281 | } 282 | bytes.copy_from_slice(&buffer[..SIZE_HEADER]); 283 | Self::from_bytes(&bytes) 284 | } 285 | 286 | /// Writes the header to a writer 287 | /// 288 | /// This method serializes the header to its binary representation and writes it 289 | /// to the provided writer. 290 | /// 291 | /// # Arguments 292 | /// 293 | /// * `writer` - Any type that implements the `Write` trait 294 | /// 295 | /// # Returns 296 | /// 297 | /// * `Ok(())` - If the header was successfully written 298 | /// * `Err(Error)` - If writing to the writer failed 299 | /// 300 | /// # Errors 301 | /// 302 | /// Returns an error if writing to the writer fails (typically an I/O error). 
303 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 304 | let mut buffer = [0u8; SIZE_HEADER]; 305 | LittleEndian::write_u32(&mut buffer[0..4], self.magic); 306 | buffer[4] = self.format; 307 | LittleEndian::write_u32(&mut buffer[5..9], self.slen); 308 | LittleEndian::write_u32(&mut buffer[9..13], self.xlen); 309 | buffer[13] = self.bits.into(); 310 | buffer[14] = self.flags.into(); 311 | buffer[15..32].copy_from_slice(&self.reserved); 312 | writer.write_all(&buffer)?; 313 | Ok(()) 314 | } 315 | 316 | /// Reads a header from a reader 317 | /// 318 | /// This method reads exactly `SIZE_HEADER` bytes from the provided reader and 319 | /// parses them into a header structure. 320 | /// 321 | /// # Arguments 322 | /// 323 | /// * `reader` - Any type that implements the `Read` trait 324 | /// 325 | /// # Returns 326 | /// 327 | /// * `Ok(BinseqHeader)` - A valid header read from the reader 328 | /// * `Err(Error)` - If reading from the reader failed or the header data is invalid 329 | /// 330 | /// # Errors 331 | /// 332 | /// Returns an error if: 333 | /// * Reading from the reader fails (typically an I/O error) 334 | /// * The header data is invalid (see `from_bytes` for validation details) 335 | pub fn from_reader(reader: &mut R) -> Result { 336 | let mut buffer = [0u8; SIZE_HEADER]; 337 | reader.read_exact(&mut buffer)?; 338 | Self::from_bytes(&buffer) 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error as StdError; 2 | 3 | /// Custom Result type for binseq operations, wrapping the custom [`Error`] type 4 | pub type Result = std::result::Result; 5 | 6 | /// The main error type for the binseq library, encompassing all possible error cases 7 | /// that can occur during binary sequence operations. 
8 | #[derive(thiserror::Error, Debug)] 9 | #[error(transparent)] 10 | pub enum Error { 11 | /// Errors related to file and block headers 12 | #[error("Error processing header: {0}")] 13 | HeaderError(#[from] HeaderError), 14 | 15 | /// Errors that occur during write operations 16 | #[error("Error writing file: {0}")] 17 | WriteError(#[from] WriteError), 18 | 19 | /// Errors that occur during read operations 20 | #[error("Error reading file: {0}")] 21 | ReadError(#[from] ReadError), 22 | 23 | /// Errors that occur during build operations 24 | #[error("Error building file: {0}")] 25 | BuilderError(#[from] BuilderError), 26 | 27 | /// Errors related to file indexing 28 | #[error("Error processing Index: {0}")] 29 | IndexError(#[from] IndexError), 30 | 31 | /// Standard I/O errors 32 | #[error("Error with IO: {0}")] 33 | IoError(#[from] std::io::Error), 34 | 35 | /// UTF-8 conversion errors 36 | #[error("Error with UTF8: {0}")] 37 | Utf8Error(#[from] std::str::Utf8Error), 38 | 39 | /// Errors related to missing extensions 40 | ExtensionError(#[from] ExtensionError), 41 | 42 | /// Errors from the bitnuc dependency for nucleotide encoding/decoding 43 | #[error("Bitnuc error: {0}")] 44 | BitnucError(#[from] bitnuc::Error), 45 | 46 | /// Conversion errors from anyhow errors 47 | #[error("Generic error: {0}")] 48 | AnyhowError(#[from] anyhow::Error), 49 | 50 | /// Generic errors for other unexpected situations 51 | #[error("Generic error: {0}")] 52 | GenericError(#[from] Box), 53 | } 54 | impl Error { 55 | /// Checks if the error is an index mismatch error 56 | /// 57 | /// This is useful for determining if a file's index is out of sync with its content, 58 | /// which might require rebuilding the index. 
59 | /// 60 | /// # Returns 61 | /// 62 | /// * `true` if the error is an `IndexError::ByteSizeMismatch` 63 | /// * `false` for all other error types 64 | #[must_use] 65 | pub fn is_index_mismatch(&self) -> bool { 66 | match self { 67 | Self::IndexError(err) => err.is_mismatch(), 68 | _ => false, 69 | } 70 | } 71 | } 72 | 73 | /// Errors specific to processing and validating binary sequence headers 74 | #[derive(thiserror::Error, Debug)] 75 | pub enum HeaderError { 76 | /// The magic number in the header does not match the expected value 77 | /// 78 | /// # Arguments 79 | /// * `u32` - The invalid magic number that was found 80 | #[error("Invalid magic number: {0}")] 81 | InvalidMagicNumber(u32), 82 | 83 | /// The format version in the header is not supported 84 | /// 85 | /// # Arguments 86 | /// * `u8` - The unsupported version number that was found 87 | #[error("Invalid format version: {0}")] 88 | InvalidFormatVersion(u8), 89 | 90 | /// The reserved bytes in the header contain unexpected values 91 | #[error("Invalid reserved bytes")] 92 | InvalidReservedBytes, 93 | 94 | /// The bits in the header contain unexpected values 95 | #[error("Invalid bit size found in header: {0} - expecting [2,4]")] 96 | InvalidBitSize(u8), 97 | 98 | /// The size of the data does not match what was specified in the header 99 | /// 100 | /// # Arguments 101 | /// * First `usize` - The actual number of bytes provided 102 | /// * Second `usize` - The expected number of bytes according to the header 103 | #[error("Invalid number of bytes provided: {0}. 
Expected: {1}")]
    InvalidSize(usize, usize),
}

/// Errors that can occur while reading binary sequence data
#[derive(thiserror::Error, Debug)]
pub enum ReadError {
    /// The file being read is not a regular file (e.g., it might be a directory or special file)
    #[error("File is not regular")]
    IncompatibleFile,

    /// The file appears to be truncated or corrupted
    ///
    /// # Arguments
    /// * `usize` - The byte position where the truncation was detected
    #[error(
        "Number of bytes in file does not match expectation - possibly truncated at byte pos {0}"
    )]
    FileTruncation(usize),

    /// Attempted to access a record index that is beyond the available range
    ///
    /// # Arguments
    /// * First `usize` - The requested record index
    /// * Second `usize` - The maximum available record index
    #[error("Requested record index ({0}) is out of record range ({1})")]
    OutOfRange(usize, usize),

    /// End of stream was reached while reading
    #[error("End of stream reached")]
    EndOfStream,

    /// A partial record was encountered at the end of a stream
    ///
    /// # Arguments
    /// * `usize` - The number of bytes read in the partial record
    #[error("Partial record at end of stream ({0} bytes)")]
    PartialRecord(usize),

    /// When a block header contains an invalid magic number
    ///
    /// The first parameter is the invalid magic number, the second is the position in the file
    #[error("Unexpected Block Magic Number found: {0} at position {1}")]
    InvalidBlockMagicNumber(u64, usize),

    /// When trying to read a block but reaching the end of the file unexpectedly
    ///
    /// The parameter is the position in the file where the read was attempted
    #[error("Unable to find an expected full block at position {0}")]
    UnexpectedEndOfFile(usize),

    /// When the file metadata doesn't match the expected VBINSEQ format
    #[error("Unexpected file metadata")]
    InvalidFileType,

    /// Missing the index end magic number
    #[error("Missing index end magic number")]
    MissingIndexEndMagic,
}

/// Errors that can occur while building BINSEQ components
#[derive(thiserror::Error, Debug)]
pub enum BuilderError {
    /// The mandatory sequence length (`slen`) was never supplied to the builder
    #[error("Missing sequence length")]
    MissingSlen,
}

/// Errors that can occur while writing binary sequence data
#[derive(thiserror::Error, Debug)]
pub enum WriteError {
    /// The length of the sequence being written does not match what was specified in the header
    ///
    /// # Fields
    /// * `expected` - The sequence length specified in the header
    /// * `got` - The actual length of the sequence being written
    #[error("Sequence length ({got}) does not match the header ({expected})")]
    UnexpectedSequenceLength { expected: u32, got: usize },

    /// The sequence contains invalid nucleotide characters
    ///
    /// # Arguments
    /// * `String` - Description of the invalid nucleotides found
    #[error("Invalid nucleotides found in sequence: {0}")]
    InvalidNucleotideSequence(String),

    /// Attempted to write data without first setting up the header
    #[error("Missing header in writer builder")]
    MissingHeader,

    /// When trying to write data without quality scores but the header specifies they should be present
    #[error("Quality flag is set in header but trying to write without quality scores.")]
    QualityFlagSet,

    /// When trying to write data without a pair but the header specifies paired records
    #[error("Paired flag is set in header but trying to write without record pair.")]
    PairedFlagSet,

    /// When trying to write quality scores but the header specifies they are not present
    #[error("Quality flag not set in header but trying to write quality scores.")]
    QualityFlagNotSet,

    /// When trying to write paired data but the header doesn't specify paired records
    #[error("Paired flag not set in header but trying to write with record pair.")]
    PairedFlagNotSet,

    /// When trying to write data without headers but the header specifies they should be present
    #[error("Header flag is set in header but trying to write without headers.")]
    HeaderFlagSet,

    /// When a record is too large to fit in a block of the configured size
    ///
    /// The first parameter is the record size, the second is the maximum block size
    #[error("Encountered a record with embedded size {0} but the maximum block size is {1}. Rerun with increased block size.")]
    RecordSizeExceedsMaximumBlockSize(usize, usize),

    /// When trying to ingest blocks with different sizes than expected
    ///
    /// The first parameter is the expected size, the second is the found size
    #[error(
        "Incompatible block sizes encountered in BlockWriter Ingest. Found ({1}) Expected ({0})"
    )]
    IncompatibleBlockSizes(usize, usize),

    /// When trying to ingest data with an incompatible header
    ///
    /// The first parameter is the expected header, the second is the found header
    #[error("Incompatible headers found in VBinseqWriter::ingest. Found ({1:?}) Expected ({0:?})")]
    IncompatibleHeaders(crate::vbq::VBinseqHeader, crate::vbq::VBinseqHeader),
}

/// Errors related to VBINSEQ file indexing
///
/// These errors occur when there are issues with the index of a VBINSEQ file,
/// such as corruption or mismatches with the underlying file.
236 | #[derive(thiserror::Error, Debug)] 237 | pub enum IndexError { 238 | /// When the magic number in the index doesn't match the expected value 239 | /// 240 | /// The parameter is the invalid magic number that was found 241 | #[error("Invalid magic number: {0}")] 242 | InvalidMagicNumber(u64), 243 | 244 | /// When the index references a file that doesn't exist 245 | /// 246 | /// The parameter is the missing file path 247 | #[error("Index missing upstream file path: {0}")] 248 | MissingUpstreamFile(String), 249 | 250 | /// When the size of the file doesn't match what the index expects 251 | /// 252 | /// The first parameter is the actual file size, the second is the expected size 253 | #[error("Mismatch in size between upstream size: {0} and expected index size {1}")] 254 | ByteSizeMismatch(u64, u64), 255 | 256 | /// Invalid reserved bytes in the index header 257 | #[error("Invalid reserved bytes in index header")] 258 | InvalidReservedBytes, 259 | } 260 | impl IndexError { 261 | /// Checks if this error indicates a mismatch between the index and file 262 | /// 263 | /// This is useful to determine if the index needs to be rebuilt. 
264 | /// 265 | /// # Returns 266 | /// 267 | /// * `true` for `ByteSizeMismatch` errors 268 | /// * `true` for any other error type (this behavior is likely a bug and should be fixed) 269 | #[must_use] 270 | pub fn is_mismatch(&self) -> bool { 271 | matches!(self, Self::ByteSizeMismatch(_, _) | _) // Note: this appears to always return true regardless of error type 272 | } 273 | } 274 | 275 | #[derive(thiserror::Error, Debug)] 276 | pub enum ExtensionError { 277 | /// When the extension is not supported 278 | #[error("Unsupported extension in path: {0}")] 279 | UnsupportedExtension(String), 280 | } 281 | 282 | /// Trait for converting arbitrary errors into `Error` 283 | pub trait IntoBinseqError { 284 | fn into_binseq_error(self) -> Error; 285 | } 286 | 287 | // Implement conversion for Box 288 | impl IntoBinseqError for E 289 | where 290 | E: StdError + Send + Sync + 'static, 291 | { 292 | fn into_binseq_error(self) -> Error { 293 | Error::GenericError(Box::new(self)) 294 | } 295 | } 296 | 297 | mod testing { 298 | #[allow(unused)] 299 | use super::*; 300 | use thiserror::Error; 301 | 302 | #[allow(unused)] 303 | #[derive(Error, Debug)] 304 | pub enum MyError { 305 | #[error("Custom error: {0}")] 306 | CustomError(String), 307 | } 308 | 309 | #[test] 310 | fn test_into_binseq_error() { 311 | let my_error = MyError::CustomError(String::from("some error")); 312 | let binseq_error = my_error.into_binseq_error(); 313 | assert!(matches!(binseq_error, Error::GenericError(_))); 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /src/vbq/header.rs: -------------------------------------------------------------------------------- 1 | //! # File and Block Header Definitions 2 | //! 3 | //! This module defines the header structures used in the VBINSEQ file format. 4 | //! 5 | //! The VBINSEQ format consists of two primary header types: 6 | //! 7 | //! 1. 
`VBinseqHeader` - The file header that appears at the beginning of a VBINSEQ file,
//!    containing information about the overall file format and configuration.
//!
//! 2. `BlockHeader` - Headers that appear before each block of records, containing
//!    information specific to that block like its size and number of records.
//!
//! Both headers are fixed-size and include magic numbers to validate file integrity.

use std::io::{Read, Write};

use bitnuc::BitSize;
use byteorder::{ByteOrder, LittleEndian};

use crate::error::{HeaderError, ReadError, Result};

/// Magic number for file identification: "VSEQ" in ASCII (0x51455356)
///
/// This constant is used in the file header to identify VBINSEQ formatted files.
/// (The ASCII bytes `V` `S` `E` `Q` read as a little-endian `u32`.)
#[allow(clippy::unreadable_literal)]
const MAGIC: u32 = 0x51455356;

/// Magic number for block identification: "BLOCKSEQ" in ASCII (0x5145534B434F4C42)
///
/// This constant is used in block headers to validate block integrity.
/// (The ASCII bytes of "BLOCKSEQ" read as a little-endian `u64`.)
#[allow(clippy::unreadable_literal)]
const BLOCK_MAGIC: u64 = 0x5145534B434F4C42;

/// Current format version number
///
/// This should be incremented when making backwards-incompatible changes to the format.
const FORMAT: u8 = 1;

/// Size of the file header in bytes (32 bytes)
///
/// The file header has a fixed size to simplify parsing.
pub const SIZE_HEADER: usize = 32;

/// Size of the block header in bytes (32 bytes)
///
/// Each block header has a fixed size to simplify block navigation.
pub const SIZE_BLOCK_HEADER: usize = 32;

/// Default block size in bytes: 128KB
///
/// This defines the default virtual size of each record block.
/// A larger block size can improve compression ratio but reduces random access granularity.
pub const BLOCK_SIZE: u64 = 128 * 1024;

/// Reserved bytes for future use in the file header (13 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES: [u8; 13] = [42; 13];

/// Reserved bytes for future use in block headers (12 bytes)
///
/// These bytes are set to a placeholder value (42) and reserved for future extensions.
pub const RESERVED_BYTES_BLOCK: [u8; 12] = [42; 12];

/// Builder for [`VBinseqHeader`]; any option left unset falls back to a
/// default in [`VBinseqHeaderBuilder::build`].
#[derive(Default, Debug, Clone, Copy)]
pub struct VBinseqHeaderBuilder {
    // Each field mirrors a `VBinseqHeader` option; `None` means "use default".
    qual: Option<bool>,
    block: Option<u64>,
    compressed: Option<bool>,
    paired: Option<bool>,
    bitsize: Option<BitSize>,
    headers: Option<bool>,
    flags: Option<bool>,
}
impl VBinseqHeaderBuilder {
    /// Creates a builder with no options set.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }
    /// Whether quality scores are stored with each record.
    #[must_use]
    pub fn qual(mut self, qual: bool) -> Self {
        self.qual = Some(qual);
        self
    }
    /// Virtual (uncompressed) block size in bytes.
    #[must_use]
    pub fn block(mut self, block: u64) -> Self {
        self.block = Some(block);
        self
    }
    /// Whether blocks are ZSTD compressed.
    #[must_use]
    pub fn compressed(mut self, compressed: bool) -> Self {
        self.compressed = Some(compressed);
        self
    }
    /// Whether records contain paired sequences.
    #[must_use]
    pub fn paired(mut self, paired: bool) -> Self {
        self.paired = Some(paired);
        self
    }
    /// Number of bits per nucleotide (2-bit or 4-bit encoding).
    #[must_use]
    pub fn bitsize(mut self, bitsize: BitSize) -> Self {
        self.bitsize = Some(bitsize);
        self
    }
    /// Whether sequence headers are stored with each record.
    #[must_use]
    pub fn headers(mut self, headers: bool) -> Self {
        self.headers = Some(headers);
        self
    }
    /// Whether per-record flags are stored with each record.
    #[must_use]
    pub fn flags(mut self, flags: bool) -> Self {
        self.flags = Some(flags);
        self
    }
    /// Builds the header, substituting defaults for any unset option
    /// (default block size, all boolean features `false`, default bitsize).
    #[must_use]
    pub fn build(self) -> VBinseqHeader {
        VBinseqHeader::with_capacity(
            self.block.unwrap_or(BLOCK_SIZE),
            self.qual.unwrap_or(false),
            self.compressed.unwrap_or(false),
            self.paired.unwrap_or(false),
            self.bitsize.unwrap_or_default(),
            self.headers.unwrap_or(false),
            self.flags.unwrap_or(false),
        )
    }
}

/// File header for VBINSEQ files
///
/// This structure represents the 32-byte header that appears at the beginning of every
/// VBINSEQ file. It contains configuration information about the file format, including
/// whether quality scores are included, whether blocks are compressed, and whether
/// records contain paired sequences.
///
/// # Fields
///
/// * `magic` - Magic number to validate file format ("VSEQ", 4 bytes)
/// * `format` - Version number of the file format (1 byte)
/// * `block` - Size of each block in bytes (8 bytes)
/// * `qual` - Whether quality scores are included (1 byte boolean)
/// * `compressed` - Whether blocks are ZSTD compressed (1 byte boolean)
/// * `paired` - Whether records contain paired sequences (1 byte boolean)
/// * `bits` - Bits per nucleotide (1 byte)
/// * `headers` - Whether sequence headers are included (1 byte boolean)
/// * `flags` - Whether per-record flags are included (1 byte boolean)
/// * `reserved` - Reserved bytes for future extensions (13 bytes)
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct VBinseqHeader {
    /// Magic number to identify the file format ("VSEQ")
    ///
    /// Always set to 0x51455356 (4 bytes)
    pub magic: u32,

    /// Version of the file format
    ///
    /// Currently set to 1 (1 byte)
    pub format: u8,

    /// Block size in bytes
    ///
    /// This is the virtual (uncompressed) size of each record block (8 bytes)
    pub block: u64,

    /// Whether quality scores are included with sequences
    ///
    /// If true, quality scores are stored for each nucleotide (1 byte)
    pub qual: bool,

    /// Whether internal blocks are compressed with ZSTD
    ///
    /// If true, blocks are compressed individually (1 byte)
    pub compressed: bool,

    /// Whether records contain paired sequences
    ///
    /// If true, each record has both primary and extended sequences (1 byte)
    pub paired: bool,

    /// The bitsize of the sequence data (1 byte)
    ///
    /// Specifies the number of bits per nucleotide:
    /// - 2-bit: Standard encoding (A=00, C=01, G=10, T=11)
    /// - 4-bit: Extended encoding supporting ambiguous nucleotides
    pub bits: BitSize,

    /// Whether sequence headers are included with sequences (1 byte)
    ///
    /// When true, each record includes length-prefixed UTF-8 header strings
    /// for both primary and extended (paired) sequences
    pub headers: bool,

    /// Whether flags are included with sequences (1 byte)
    ///
    /// When true, each record includes length-prefixed UTF-8 flag strings
    /// for both primary and extended (paired) sequences
    pub flags: bool,

    /// Reserved bytes for future format extensions
    ///
    /// Currently filled with placeholder values (13 bytes)
    pub reserved: [u8; 13],
}
impl Default for VBinseqHeader {
    /// Creates a default header with default block size and all features disabled
    ///
    /// The default header:
    /// - Uses the default block size (128KB)
    /// - Does not include quality scores
    /// - Does not use compression
    /// - Does not support paired sequences
    /// - Does not include sequence headers
    /// - Does not include per-record flags
    /// - Uses 2-bit nucleotide encoding
    fn default() -> Self {
        Self::with_capacity(
            BLOCK_SIZE,
            false,
            false,
            false,
            BitSize::default(),
            false,
            false,
        )
    }
}
impl VBinseqHeader {
    /// Creates a new VBINSEQ header with the default block size
    ///
    /// # Parameters
    ///
    /// * `qual` - Whether to include quality scores with sequences
    /// * `compressed` - Whether to use ZSTD compression for blocks
    /// * `paired` - Whether records contain paired sequences
    /// * `bitsize` - Number of bits per nucleotide (2 or 4)
    /// * `headers` - Whether to include sequence headers with records
    /// * `flags` - Whether to include per-record flags with records
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::VBinseqHeaderBuilder;
    ///
    /// // Create header with quality scores and compression, without paired sequences
    /// let header = VBinseqHeaderBuilder::new()
    ///     .qual(true)
    ///     .compressed(true)
    ///     .build();
    /// ```
    #[must_use]
    pub fn new(
        qual: bool,
        compressed: bool,
        paired: bool,
        bitsize: BitSize,
        headers: bool,
        flags: bool,
    ) -> Self {
        Self::with_capacity(
            BLOCK_SIZE, qual, compressed, paired, bitsize, headers, flags,
        )
    }

    /// Creates a new VBINSEQ header with a custom block size
    ///
    /// # Parameters
    ///
    /// * `block` - Custom block size in bytes (virtual/uncompressed size)
    /// * `qual` - Whether to include quality scores with sequences
    /// * `compressed` - Whether to use ZSTD compression for blocks
    /// * `paired` - Whether records contain paired sequences
    /// * `bitsize` - Number of bits per nucleotide (2 or 4)
    /// * `headers` - Whether to include sequence headers with records
    /// * `flags` - Whether to include per-record flags with records
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::VBinseqHeaderBuilder;
    ///
    /// // Create header with a 256KB block size, with quality scores and compression
    /// let header = VBinseqHeaderBuilder::new()
    ///     .block(256 * 1024)
    ///     .qual(true)
    ///     .compressed(true)
    ///     .build();
    /// ```
    #[must_use]
    pub fn with_capacity(
        block: u64,
        qual: bool,
        compressed: bool,
        paired: bool,
        bitsize: BitSize,
        headers: bool,
        flags: bool,
    ) -> Self {
        Self {
            magic: MAGIC,
            format: FORMAT,
            block,
            qual,
            compressed,
            paired,
            headers,
            flags,
            bits: bitsize,
            reserved: RESERVED_BYTES,
        }
    }

    /// Sets the encoding bitsize for the header.
305 | pub fn set_bitsize(&mut self, bits: BitSize) { 306 | self.bits = bits; 307 | } 308 | 309 | /// Creates a header from a 32-byte buffer 310 | /// 311 | /// This function parses a raw byte buffer into a `VBinseqHeader` structure, 312 | /// validating the magic number and format version. 313 | /// 314 | /// # Parameters 315 | /// 316 | /// * `buffer` - A 32-byte array containing the header data 317 | /// 318 | /// # Returns 319 | /// 320 | /// * `Result` - A valid header if parsing was successful 321 | /// 322 | /// # Errors 323 | /// 324 | /// * `HeaderError::InvalidMagicNumber` - If the magic number doesn't match "VSEQ" 325 | /// * `HeaderError::InvalidFormatVersion` - If the format version is unsupported 326 | /// * `HeaderError::InvalidReservedBytes` - If the reserved bytes section is invalid 327 | pub fn from_bytes(buffer: &[u8; SIZE_HEADER]) -> Result { 328 | let magic = LittleEndian::read_u32(&buffer[0..4]); 329 | if magic != MAGIC { 330 | return Err(HeaderError::InvalidMagicNumber(magic).into()); 331 | } 332 | let format = buffer[4]; 333 | if format != FORMAT { 334 | return Err(HeaderError::InvalidFormatVersion(format).into()); 335 | } 336 | let block = LittleEndian::read_u64(&buffer[5..13]); 337 | let qual = buffer[13] != 0; 338 | let compressed = buffer[14] != 0; 339 | let paired = buffer[15] != 0; 340 | let bits = match buffer[16] { 341 | 0 | 2 | 42 => BitSize::Two, 342 | 4 => BitSize::Four, 343 | x => return Err(HeaderError::InvalidBitSize(x).into()), 344 | }; 345 | let headers = match buffer[17] { 346 | 0 | 42 => false, // backwards compatibility 347 | _ => true, 348 | }; 349 | let flags = buffer[18] != 0; 350 | let Ok(reserved) = buffer[19..32].try_into() else { 351 | return Err(HeaderError::InvalidReservedBytes.into()); 352 | }; 353 | Ok(Self { 354 | magic, 355 | format, 356 | block, 357 | qual, 358 | compressed, 359 | paired, 360 | bits, 361 | headers, 362 | flags, 363 | reserved, 364 | }) 365 | } 366 | 367 | /// Writes the header to a writer 368 
| /// 369 | /// This function serializes the header structure into a 32-byte buffer and writes 370 | /// it to the provided writer. 371 | /// 372 | /// # Parameters 373 | /// 374 | /// * `writer` - Any type that implements the `Write` trait 375 | /// 376 | /// # Returns 377 | /// 378 | /// * `Result<()>` - Success if the header was written 379 | /// 380 | /// # Errors 381 | /// 382 | /// * IO errors if writing to the writer fails 383 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 384 | let mut buffer = [0u8; SIZE_HEADER]; 385 | LittleEndian::write_u32(&mut buffer[0..4], self.magic); 386 | buffer[4] = self.format; 387 | LittleEndian::write_u64(&mut buffer[5..13], self.block); 388 | buffer[13] = self.qual.into(); 389 | buffer[14] = self.compressed.into(); 390 | buffer[15] = self.paired.into(); 391 | buffer[16] = self.bits.into(); 392 | buffer[17] = self.headers.into(); 393 | buffer[18] = self.flags.into(); 394 | buffer[19..32].copy_from_slice(&self.reserved); 395 | writer.write_all(&buffer)?; 396 | Ok(()) 397 | } 398 | 399 | /// Reads a header from a reader 400 | /// 401 | /// This function reads 32 bytes from the provided reader and parses them into 402 | /// a `VBinseqHeader` structure. 
    ///
    /// # Parameters
    ///
    /// * `reader` - Any type that implements the `Read` trait
    ///
    /// # Returns
    ///
    /// * `Result<Self>` - A valid header if reading and parsing was successful
    ///
    /// # Errors
    ///
    /// * IO errors if reading from the reader fails
    /// * Header validation errors from `from_bytes()`
    pub fn from_reader<R: Read>(reader: &mut R) -> Result<Self> {
        let mut buffer = [0u8; SIZE_HEADER];
        reader.read_exact(&mut buffer)?;
        Self::from_bytes(&buffer)
    }

    /// Returns `true` when records in this file contain paired sequences.
    #[must_use]
    pub fn is_paired(&self) -> bool {
        self.paired
    }
}

/// Block header for VBINSEQ block data
///
/// Each block in a VBINSEQ file is preceded by a 32-byte block header that contains
/// information about the block including its size and the number of records it contains.
///
/// # Fields
///
/// * `magic` - Magic number to validate block integrity ("BLOCKSEQ", 8 bytes)
/// * `size` - Actual size of the block in bytes (8 bytes)
/// * `records` - Number of records in the block (4 bytes)
/// * `reserved` - Reserved bytes for future extensions (12 bytes)
#[derive(Clone, Copy, Debug)]
pub struct BlockHeader {
    /// Magic number to identify the block ("BLOCKSEQ")
    ///
    /// Always set to 0x5145534B434F4C42 (8 bytes)
    pub magic: u64,

    /// Actual size of the block in bytes
    ///
    /// This can differ from the virtual block size in the file header
    /// when compression is enabled (8 bytes)
    pub size: u64,

    /// Number of records stored in this block
    ///
    /// Used to iterate through records efficiently (4 bytes)
    pub records: u32,

    /// Reserved bytes for future extensions
    ///
    /// Currently filled with placeholder values (12 bytes)
    pub reserved: [u8; 12],
}
impl BlockHeader {
    /// Creates a new block header
    ///
    /// # Parameters
    ///
    /// * `size` - The actual size of the block in bytes (can be compressed size)
    /// * `records` - The number of records contained in the block
    ///
    /// # Example
    ///
    /// ```rust
    /// use binseq::vbq::BlockHeader;
    ///
    /// // Create a block header for a block with 1024 bytes and 100 records
    /// let header = BlockHeader::new(1024, 100);
    /// ```
    #[must_use]
    pub fn new(size: u64, records: u32) -> Self {
        Self {
            magic: BLOCK_MAGIC,
            size,
            records,
            reserved: RESERVED_BYTES_BLOCK,
        }
    }

    /// Creates an empty block header (zero payload bytes, zero records).
    #[must_use]
    pub fn empty() -> Self {
        Self {
            magic: BLOCK_MAGIC,
            size: 0,
            records: 0,
            reserved: RESERVED_BYTES_BLOCK,
        }
    }

    /// Returns `true` when the block holds no payload and no records.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.size == 0 && self.records == 0
    }

    /// Writes the block header to a writer
    ///
    /// This function serializes the block header structure into a 32-byte buffer and writes
    /// it to the provided writer.
    ///
    /// # Parameters
    ///
    /// * `writer` - Any type that implements the `Write` trait
    ///
    /// # Returns
    ///
    /// * `Result<()>` - Success if the header was written
    ///
    /// # Errors
    ///
    /// * IO errors if writing to the writer fails
    pub fn write_bytes<W: Write>(&self, writer: &mut W) -> Result<()> {
        let mut buffer = [0u8; SIZE_BLOCK_HEADER];
        // All multi-byte fields are stored little-endian.
        LittleEndian::write_u64(&mut buffer[0..8], self.magic);
        LittleEndian::write_u64(&mut buffer[8..16], self.size);
        LittleEndian::write_u32(&mut buffer[16..20], self.records);
        buffer[20..].copy_from_slice(&self.reserved);
        writer.write_all(&buffer)?;
        Ok(())
    }

    /// Creates a block header from a 32-byte buffer
    ///
    /// This function parses a raw byte buffer into a `BlockHeader` structure,
    /// validating the magic number.
    ///
    /// # Parameters
    ///
    /// * `buffer` - A 32-byte array containing the block header data
    ///
    /// # Returns
    ///
    /// * `Result<Self>` - A valid block header if parsing was successful
    ///
    /// # Errors
    ///
    /// * `ReadError::InvalidBlockMagicNumber` - If the magic number doesn't match "BLOCKSEQ"
    pub fn from_bytes(buffer: &[u8; SIZE_BLOCK_HEADER]) -> Result<Self> {
        let magic = LittleEndian::read_u64(&buffer[0..8]);
        if magic != BLOCK_MAGIC {
            // NOTE(review): the file offset is unknown at this level, so `0` is
            // reported as the position — confirm that callers remap this to the
            // real block offset when surfacing the error.
            return Err(ReadError::InvalidBlockMagicNumber(magic, 0).into());
        }
        let size = LittleEndian::read_u64(&buffer[8..16]);
        let records = LittleEndian::read_u32(&buffer[16..20]);
        Ok(Self::new(size, records))
    }

    /// Total on-disk footprint of this block: payload size plus the 32-byte block header.
    #[must_use]
    pub fn size_with_header(&self) -> usize {
        self.size as usize + SIZE_BLOCK_HEADER
    }
}
--------------------------------------------------------------------------------
/src/bq/writer.rs:
--------------------------------------------------------------------------------
//!
Binary sequence writer module
//!
//! This module provides functionality for writing nucleotide sequences to binary files
//! in a compact 2-bit format. It includes support for:
//! - Single and paired sequence writing
//! - Invalid nucleotide handling with configurable policies
//! - Efficient buffering and encoding
//! - Headless mode for parallel writing

use std::io::{BufWriter, Write};

use byteorder::{LittleEndian, WriteBytesExt};
use rand::{rngs::SmallRng, SeedableRng};

use super::BinseqHeader;
use crate::{
    error::{Result, WriteError},
    Policy, RNG_SEED,
};

/// Writes a single flag value to a writer in little-endian format
///
/// # Arguments
///
/// * `writer` - Any type that implements the `Write` trait
/// * `flag` - The 64-bit flag value to write
///
/// # Returns
///
/// * `Ok(())` - If the flag was successfully written
/// * `Err(Error)` - If writing to the writer failed
pub fn write_flag<W: Write>(writer: &mut W, flag: u64) -> Result<()> {
    writer.write_u64::<LittleEndian>(flag)?;
    Ok(())
}

/// Writes a buffer of u64 values to a writer in little-endian format
///
/// This function is used to write encoded sequence data to the output.
/// Each u64 in the buffer contains up to 32 nucleotides in 2-bit format.
///
/// # Arguments
///
/// * `writer` - Any type that implements the `Write` trait
/// * `ebuf` - The buffer of u64 values to write
///
/// # Returns
///
/// * `Ok(())` - If the buffer was successfully written
/// * `Err(Error)` - If writing to the writer failed
pub fn write_buffer<W: Write>(writer: &mut W, ebuf: &[u64]) -> Result<()> {
    ebuf.iter()
        .try_for_each(|&x| writer.write_u64::<LittleEndian>(x))?;
    Ok(())
}

/// Encodes nucleotide sequences into a compact 2-bit binary format
///
/// The `Encoder` handles the conversion of nucleotide sequences (A, C, G, T)
/// into a compact binary representation where each nucleotide is stored using
/// 2 bits. It also handles invalid nucleotides according to a configurable policy.
///
/// The encoder maintains internal buffers to avoid repeated allocations during
/// encoding operations. These buffers are reused across multiple encode calls
/// and are cleared automatically when needed.
#[derive(Clone)]
pub struct Encoder {
    /// Header containing sequence length and format information
    header: BinseqHeader,

    /// Buffers for storing encoded nucleotides in 2-bit format
    /// Each u64 can store 32 nucleotides (64 bits / 2 bits per nucleotide)
    sbuffer: Vec<u64>, // Primary sequence buffer
    xbuffer: Vec<u64>, // Extended sequence buffer

    /// Temporary buffers for handling invalid nucleotides
    /// These store the processed sequences after policy application
    s_ibuf: Vec<u8>, // Primary sequence invalid buffer
    x_ibuf: Vec<u8>, // Extended sequence invalid buffer

    /// Policy for handling invalid nucleotides during encoding
    policy: Policy,

    /// Random number generator for the `RandomDraw` policy
    /// Seeded with `RNG_SEED` for reproducibility
    rng: SmallRng,
}
impl Encoder {
    /// Creates a new encoder with default invalid nucleotide policy
    ///
    /// # Arguments
    ///
    /// * `header` - The header defining sequence lengths and format
    ///
    /// # Examples
    ///
    /// ```
    /// # use binseq::bq::{BinseqHeaderBuilder, Encoder};
    /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap();
    /// let encoder = Encoder::new(header);
    /// ```
    #[must_use]
    pub fn new(header: BinseqHeader) -> Self {
        Self::with_policy(header, Policy::default())
    }

    /// Creates a new encoder with a specific invalid nucleotide policy
    ///
    /// # Arguments
    ///
    /// * `header` - The header defining sequence lengths and format
    /// * `policy` - The policy for handling invalid nucleotides
    ///
    /// # Examples
    ///
    /// ```
    /// # use binseq::bq::{BinseqHeaderBuilder, Encoder};
    /// # use binseq::Policy;
    /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap();
    /// let encoder = Encoder::with_policy(header, Policy::SetToA);
    /// ```
    #[must_use]
    pub fn with_policy(header: BinseqHeader, policy: Policy) -> Self {
        Self {
            header,
            policy,
            sbuffer: Vec::default(),
            xbuffer: Vec::default(),
            s_ibuf: Vec::default(),
            x_ibuf: Vec::default(),
            rng: SmallRng::seed_from_u64(RNG_SEED),
        }
    }

    /// Encodes a single sequence using the header's configured bitsize.
    ///
    /// Will return `None` if the sequence is invalid and the policy does not allow correction.
    pub fn encode_single(&mut self, primary: &[u8]) -> Result<Option<&[u64]>> {
        // Reject sequences that do not match the fixed length declared in the header.
        if primary.len() != self.header.slen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.slen,
                got: primary.len(),
            }
            .into());
        }

        // Fill the buffer with the packed representation of the nucleotides
        self.clear();
        if self.header.bits.encode(primary, &mut self.sbuffer).is_err() {
            // Encoding failed (invalid nucleotides): apply the policy to build a
            // corrected copy in `s_ibuf` and retry; `false` means "drop record".
            self.clear();
            if self
                .policy
                .handle(primary, &mut self.s_ibuf, &mut self.rng)?
            {
                self.header.bits.encode(&self.s_ibuf, &mut self.sbuffer)?;
            } else {
                return Ok(None);
            }
        }

        Ok(Some(&self.sbuffer))
    }

    /// Encodes a pair of sequences using the header's configured bitsize.
    ///
    /// Will return `None` if either sequence is invalid and the policy does not allow correction.
    pub fn encode_paired(
        &mut self,
        primary: &[u8],
        extended: &[u8],
    ) -> Result<Option<(&[u64], &[u64])>> {
        // Both sequences must match their respective fixed lengths from the header.
        if primary.len() != self.header.slen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.slen,
                got: primary.len(),
            }
            .into());
        }
        if extended.len() != self.header.xlen as usize {
            return Err(WriteError::UnexpectedSequenceLength {
                expected: self.header.xlen,
                got: extended.len(),
            }
            .into());
        }

        self.clear();
        if self.header.bits.encode(primary, &mut self.sbuffer).is_err()
            || self
                .header
                .bits
                .encode(extended, &mut self.xbuffer)
                .is_err()
        {
            // If either side fails to encode, both are re-derived through the
            // policy so the pair stays consistent; a `false` from either side
            // drops the whole record.
            self.clear();
            if self
                .policy
                .handle(primary, &mut self.s_ibuf, &mut self.rng)?
                && self
                    .policy
                    .handle(extended, &mut self.x_ibuf, &mut self.rng)?
            {
                self.header.bits.encode(&self.s_ibuf, &mut self.sbuffer)?;
                self.header.bits.encode(&self.x_ibuf, &mut self.xbuffer)?;
            } else {
                return Ok(None);
            }
        }

        Ok(Some((&self.sbuffer, &self.xbuffer)))
    }

    /// Clear all buffers and reset the encoder.
    pub fn clear(&mut self) {
        self.sbuffer.clear();
        self.xbuffer.clear();
        self.s_ibuf.clear();
        self.x_ibuf.clear();
    }
}

/// Builder for creating configured `BinseqWriter` instances
///
/// This builder provides a flexible way to create writers with various
/// configurations. It follows the builder pattern, allowing for optional
/// settings to be specified in any order.
227 | /// 228 | /// # Examples 229 | /// 230 | /// ``` 231 | /// # use binseq::{Policy, Result}; 232 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; 233 | /// # fn main() -> Result<()> { 234 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 235 | /// let writer = BinseqWriterBuilder::default() 236 | /// .header(header) 237 | /// .policy(Policy::SetToA) 238 | /// .headless(false) 239 | /// .build(Vec::new())?; 240 | /// # Ok(()) 241 | /// # } 242 | /// ``` 243 | #[derive(Default)] 244 | pub struct BinseqWriterBuilder { 245 | /// Required header defining sequence lengths and format 246 | header: Option, 247 | /// Optional policy for handling invalid nucleotides 248 | policy: Option, 249 | /// Optional headless mode for parallel writing scenarios 250 | headless: Option, 251 | } 252 | impl BinseqWriterBuilder { 253 | #[must_use] 254 | pub fn header(mut self, header: BinseqHeader) -> Self { 255 | self.header = Some(header); 256 | self 257 | } 258 | 259 | #[must_use] 260 | pub fn policy(mut self, policy: Policy) -> Self { 261 | self.policy = Some(policy); 262 | self 263 | } 264 | 265 | #[must_use] 266 | pub fn headless(mut self, headless: bool) -> Self { 267 | self.headless = Some(headless); 268 | self 269 | } 270 | 271 | pub fn build(self, inner: W) -> Result> { 272 | let Some(header) = self.header else { 273 | return Err(WriteError::MissingHeader.into()); 274 | }; 275 | BinseqWriter::new( 276 | inner, 277 | header, 278 | self.policy.unwrap_or_default(), 279 | self.headless.unwrap_or(false), 280 | ) 281 | } 282 | } 283 | 284 | /// High-level writer for binary sequence files 285 | /// 286 | /// This writer provides a convenient interface for writing nucleotide sequences 287 | /// to binary files in a compact format. It handles sequence encoding, invalid 288 | /// nucleotide processing, and file format compliance. 
289 | /// 290 | /// The writer can operate in two modes: 291 | /// - Normal mode: Writes the header followed by records 292 | /// - Headless mode: Writes only records (useful for parallel writing) 293 | /// 294 | /// # Type Parameters 295 | /// 296 | /// * `W` - The underlying writer type that implements `Write` 297 | #[derive(Clone)] 298 | pub struct BinseqWriter { 299 | /// The underlying writer for output 300 | inner: W, 301 | 302 | /// Encoder for converting sequences to binary format 303 | encoder: Encoder, 304 | 305 | /// Whether this writer is in headless mode 306 | /// When true, the header is not written to the output 307 | headless: bool, 308 | } 309 | impl BinseqWriter { 310 | /// Creates a new `BinseqWriter` instance with specified configuration 311 | /// 312 | /// This is a low-level constructor. For a more convenient way to create a 313 | /// `BinseqWriter`, use the `BinseqWriterBuilder` struct. 314 | /// 315 | /// # Arguments 316 | /// 317 | /// * `inner` - The underlying writer to write to 318 | /// * `header` - The header defining sequence lengths and format 319 | /// * `policy` - The policy for handling invalid nucleotides 320 | /// * `headless` - Whether to skip writing the header (for parallel writing) 321 | /// 322 | /// # Returns 323 | /// 324 | /// * `Ok(BinseqWriter)` - A new writer instance 325 | /// * `Err(Error)` - If writing the header fails 326 | /// 327 | /// # Examples 328 | /// 329 | /// ``` 330 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriter}; 331 | /// # use binseq::{Result, Policy}; 332 | /// # fn main() -> Result<()> { 333 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 334 | /// let writer = BinseqWriter::new( 335 | /// Vec::new(), 336 | /// header, 337 | /// Policy::default(), 338 | /// false 339 | /// )?; 340 | /// # Ok(()) 341 | /// # } 342 | /// ``` 343 | pub fn new(mut inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { 344 | if !headless { 345 | header.write_bytes(&mut 
inner)?; 346 | } 347 | Ok(Self { 348 | inner, 349 | encoder: Encoder::with_policy(header, policy), 350 | headless, 351 | }) 352 | } 353 | 354 | /// Writes a single record to the output 355 | /// 356 | /// This method encodes and writes a primary sequence along with an associated flag. 357 | /// 358 | /// # Arguments 359 | /// 360 | /// * `flag` - A 64-bit flag value associated with the sequence 361 | /// * `primary` - The nucleotide sequence to write 362 | /// 363 | /// # Returns 364 | /// 365 | /// * `Ok(true)` if the record was written successfully 366 | /// * `Ok(false)` if the record was not written because it was empty 367 | /// * `Err(WriteError::FlagSet)` if the flag is set but no flag value is provided 368 | pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { 369 | let has_flag = self.encoder.header.flags; 370 | if let Some(sbuffer) = self.encoder.encode_single(primary)? { 371 | if has_flag { 372 | write_flag(&mut self.inner, flag.unwrap_or(0))?; 373 | } 374 | write_buffer(&mut self.inner, sbuffer)?; 375 | Ok(true) 376 | } else { 377 | Ok(false) 378 | } 379 | } 380 | 381 | /// Writes a paired record to the output 382 | /// 383 | /// This method writes a paired record to the output. It takes a flag, primary sequence, and extended sequence as input. 384 | /// If the flag is set but no flag value is provided, it returns an error. 385 | /// Otherwise, it writes the encoded single and extended sequences to the output and returns true. 
386 | /// 387 | /// # Arguments 388 | /// * `flag` - The flag value to write to the output 389 | /// * `primary` - The primary sequence to encode and write to the output 390 | /// * `extended` - The extended sequence to encode and write to the output 391 | /// 392 | /// # Returns 393 | /// * `Result` - A result indicating whether the write was successful or not 394 | pub fn write_paired_record( 395 | &mut self, 396 | flag: Option, 397 | primary: &[u8], 398 | extended: &[u8], 399 | ) -> Result { 400 | let has_flag = self.encoder.header.flags; 401 | if let Some((sbuffer, xbuffer)) = self.encoder.encode_paired(primary, extended)? { 402 | if has_flag { 403 | write_flag(&mut self.inner, flag.unwrap_or(0))?; 404 | } 405 | write_buffer(&mut self.inner, sbuffer)?; 406 | write_buffer(&mut self.inner, xbuffer)?; 407 | Ok(true) 408 | } else { 409 | Ok(false) 410 | } 411 | } 412 | 413 | /// Consumes the writer and returns the underlying writer 414 | /// 415 | /// This is useful when you need to access the underlying writer after 416 | /// writing is complete, for example to get the contents of a `Vec`. 417 | /// 418 | /// # Examples 419 | /// 420 | /// ``` 421 | /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; 422 | /// # use binseq::Result; 423 | /// # fn main() -> Result<()> { 424 | /// let header = BinseqHeaderBuilder::new().slen(100).build()?; 425 | /// let writer = BinseqWriterBuilder::default() 426 | /// .header(header) 427 | /// .build(Vec::new())?; 428 | /// 429 | /// // After writing sequences... 430 | /// let bytes = writer.into_inner(); 431 | /// # Ok(()) 432 | /// # } 433 | /// ``` 434 | pub fn into_inner(self) -> W { 435 | self.inner 436 | } 437 | 438 | /// Gets a mutable reference to the underlying writer 439 | /// 440 | /// This allows direct access to the underlying writer while retaining 441 | /// ownership of the `BinseqWriter`. 
442 | pub fn by_ref(&mut self) -> &mut W { 443 | &mut self.inner 444 | } 445 | 446 | /// Flushes any buffered data to the underlying writer 447 | /// 448 | /// # Returns 449 | /// 450 | /// * `Ok(())` - If the flush was successful 451 | /// * `Err(Error)` - If flushing failed 452 | pub fn flush(&mut self) -> Result<()> { 453 | self.inner.flush()?; 454 | Ok(()) 455 | } 456 | 457 | /// Creates a new encoder with the same configuration as this writer 458 | /// 459 | /// This is useful when you need a separate encoder instance for parallel 460 | /// processing or other scenarios where you need independent encoding. 461 | /// The new encoder is initialized with a cleared state. 462 | /// 463 | /// # Returns 464 | /// 465 | /// A new `Encoder` instance with the same configuration but cleared buffers 466 | pub fn new_encoder(&self) -> Encoder { 467 | let mut encoder = self.encoder.clone(); 468 | encoder.clear(); 469 | encoder 470 | } 471 | 472 | /// Checks if this writer is in headless mode 473 | /// 474 | /// In headless mode, the writer does not write the header to the output. 475 | /// This is useful for parallel writing scenarios where only one writer 476 | /// should write the header. 477 | /// 478 | /// # Returns 479 | /// 480 | /// `true` if the writer is in headless mode, `false` otherwise 481 | pub fn is_headless(&self) -> bool { 482 | self.headless 483 | } 484 | 485 | /// Ingests the contents of another writer's buffer 486 | /// 487 | /// This method is used in parallel writing scenarios to combine the output 488 | /// of multiple writers. It takes the contents of another writer's buffer 489 | /// and writes them to this writer's output. 
490 | /// 491 | /// # Arguments 492 | /// 493 | /// * `other` - Another writer whose underlying writer is a `Vec` 494 | /// 495 | /// # Returns 496 | /// 497 | /// * `Ok(())` - If the contents were successfully ingested 498 | /// * `Err(Error)` - If writing the contents failed 499 | pub fn ingest(&mut self, other: &mut BinseqWriter>) -> Result<()> { 500 | let other_inner = other.by_ref(); 501 | self.inner.write_all(other_inner)?; 502 | other_inner.clear(); 503 | Ok(()) 504 | } 505 | } 506 | 507 | /// A streaming writer for binary sequence data 508 | /// 509 | /// This writer buffers data before writing it to the underlying writer, 510 | /// providing efficient streaming capabilities suitable for: 511 | /// - Writing to network connections 512 | /// - Processing very large datasets 513 | /// - Pipeline processing 514 | /// 515 | /// The `StreamWriter` is a specialized version of `BinseqWriter` that 516 | /// adds internal buffering and is optimized for streaming scenarios. 517 | pub struct StreamWriter { 518 | /// The underlying writer for processing sequences 519 | writer: BinseqWriter>, 520 | } 521 | 522 | impl StreamWriter { 523 | /// Creates a new `StreamWriter` with the default buffer size 524 | /// 525 | /// This constructor initializes a `StreamWriter` with an 8K buffer 526 | /// for efficient writing to the underlying writer. 
527 | /// 528 | /// # Arguments 529 | /// 530 | /// * `inner` - The writer to write binary sequence data to 531 | /// * `header` - The header defining sequence lengths and format 532 | /// * `policy` - The policy for handling invalid nucleotides 533 | /// * `headless` - Whether to skip writing the header 534 | /// 535 | /// # Returns 536 | /// 537 | /// * `Ok(StreamWriter)` - A new streaming writer 538 | /// * `Err(Error)` - If initialization fails 539 | pub fn new(inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { 540 | Self::with_capacity(inner, 8192, header, policy, headless) 541 | } 542 | 543 | /// Creates a new `StreamWriter` with a specified buffer capacity 544 | /// 545 | /// This constructor allows customizing the buffer size based on 546 | /// expected usage patterns and performance requirements. 547 | /// 548 | /// # Arguments 549 | /// 550 | /// * `inner` - The writer to write binary sequence data to 551 | /// * `capacity` - The size of the internal buffer in bytes 552 | /// * `header` - The header defining sequence lengths and format 553 | /// * `policy` - The policy for handling invalid nucleotides 554 | /// * `headless` - Whether to skip writing the header 555 | /// 556 | /// # Returns 557 | /// 558 | /// * `Ok(StreamWriter)` - A new streaming writer with the specified buffer capacity 559 | /// * `Err(Error)` - If initialization fails 560 | pub fn with_capacity( 561 | inner: W, 562 | capacity: usize, 563 | header: BinseqHeader, 564 | policy: Policy, 565 | headless: bool, 566 | ) -> Result { 567 | let buffered = BufWriter::with_capacity(capacity, inner); 568 | let writer = BinseqWriter::new(buffered, header, policy, headless)?; 569 | 570 | Ok(Self { writer }) 571 | } 572 | 573 | pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { 574 | self.writer.write_record(flag, primary) 575 | } 576 | 577 | pub fn write_paired_record( 578 | &mut self, 579 | flag: Option, 580 | primary: &[u8], 581 | extended: &[u8], 582 
| ) -> Result { 583 | self.writer.write_paired_record(flag, primary, extended) 584 | } 585 | 586 | /// Flushes any buffered data to the underlying writer 587 | /// 588 | /// # Returns 589 | /// 590 | /// * `Ok(())` - If the flush was successful 591 | /// * `Err(Error)` - If flushing failed 592 | pub fn flush(&mut self) -> Result<()> { 593 | self.writer.flush() 594 | } 595 | 596 | /// Consumes the streaming writer and returns the inner writer after flushing 597 | /// 598 | /// This method is useful when you need access to the underlying writer 599 | /// after all writing is complete. 600 | /// 601 | /// # Returns 602 | /// 603 | /// * `Ok(W)` - The inner writer after flushing all data 604 | /// * `Err(Error)` - If flushing failed 605 | pub fn into_inner(self) -> Result { 606 | // First unwrap the writer inner (BufWriter) 607 | let bufw = self.writer.into_inner(); 608 | // Now unwrap the BufWriter to get W 609 | match bufw.into_inner() { 610 | Ok(inner) => Ok(inner), 611 | Err(e) => Err(std::io::Error::from(e).into()), 612 | } 613 | } 614 | } 615 | 616 | /// Builder for `StreamWriter` instances 617 | /// 618 | /// This builder provides a convenient way to create and configure `StreamWriter` 619 | /// instances with custom buffer sizes and other settings. 
620 | #[derive(Default)] 621 | pub struct StreamWriterBuilder { 622 | /// Required header defining sequence lengths and format 623 | header: Option, 624 | /// Optional policy for handling invalid nucleotides 625 | policy: Option, 626 | /// Optional headless mode for parallel writing scenarios 627 | headless: Option, 628 | /// Optional buffer capacity setting 629 | buffer_capacity: Option, 630 | } 631 | 632 | impl StreamWriterBuilder { 633 | /// Sets the header for the writer 634 | #[must_use] 635 | pub fn header(mut self, header: BinseqHeader) -> Self { 636 | self.header = Some(header); 637 | self 638 | } 639 | 640 | /// Sets the policy for handling invalid nucleotides 641 | #[must_use] 642 | pub fn policy(mut self, policy: Policy) -> Self { 643 | self.policy = Some(policy); 644 | self 645 | } 646 | 647 | /// Sets headless mode (whether to skip writing the header) 648 | #[must_use] 649 | pub fn headless(mut self, headless: bool) -> Self { 650 | self.headless = Some(headless); 651 | self 652 | } 653 | 654 | /// Sets the buffer capacity for the writer 655 | #[must_use] 656 | pub fn buffer_capacity(mut self, capacity: usize) -> Self { 657 | self.buffer_capacity = Some(capacity); 658 | self 659 | } 660 | 661 | /// Builds a `StreamWriter` with the configured settings 662 | /// 663 | /// # Arguments 664 | /// 665 | /// * `inner` - The writer to write binary sequence data to 666 | /// 667 | /// # Returns 668 | /// 669 | /// * `Ok(StreamWriter)` - A new streaming writer with the specified configuration 670 | /// * `Err(Error)` - If building the writer fails 671 | pub fn build(self, inner: W) -> Result> { 672 | let Some(header) = self.header else { 673 | return Err(WriteError::MissingHeader.into()); 674 | }; 675 | 676 | let capacity = self.buffer_capacity.unwrap_or(8192); 677 | StreamWriter::with_capacity( 678 | inner, 679 | capacity, 680 | header, 681 | self.policy.unwrap_or_default(), 682 | self.headless.unwrap_or(false), 683 | ) 684 | } 685 | } 686 | 687 | #[cfg(test)] 
688 | mod testing { 689 | 690 | use std::{fs::File, io::BufWriter}; 691 | 692 | use super::*; 693 | use crate::bq::{BinseqHeaderBuilder, SIZE_HEADER}; 694 | 695 | #[test] 696 | fn test_headless() -> Result<()> { 697 | let inner = Vec::new(); 698 | let mut writer = BinseqWriterBuilder::default() 699 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 700 | .headless(true) 701 | .build(inner)?; 702 | assert!(writer.is_headless()); 703 | let inner = writer.by_ref(); 704 | assert!(inner.is_empty()); 705 | Ok(()) 706 | } 707 | 708 | #[test] 709 | fn test_not_headless() -> Result<()> { 710 | let inner = Vec::new(); 711 | let mut writer = BinseqWriterBuilder::default() 712 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 713 | .build(inner)?; 714 | assert!(!writer.is_headless()); 715 | let inner = writer.by_ref(); 716 | assert_eq!(inner.len(), SIZE_HEADER); 717 | Ok(()) 718 | } 719 | 720 | #[test] 721 | fn test_stdout() -> Result<()> { 722 | let writer = BinseqWriterBuilder::default() 723 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 724 | .build(std::io::stdout())?; 725 | assert!(!writer.is_headless()); 726 | Ok(()) 727 | } 728 | 729 | #[test] 730 | fn test_to_path() -> Result<()> { 731 | let path = "test_to_path.file"; 732 | let inner = File::create(path).map(BufWriter::new)?; 733 | let mut writer = BinseqWriterBuilder::default() 734 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 735 | .build(inner)?; 736 | assert!(!writer.is_headless()); 737 | let inner = writer.by_ref(); 738 | inner.flush()?; 739 | 740 | // delete file 741 | std::fs::remove_file(path)?; 742 | 743 | Ok(()) 744 | } 745 | 746 | #[test] 747 | fn test_stream_writer() -> Result<()> { 748 | let inner = Vec::new(); 749 | let writer = StreamWriterBuilder::default() 750 | .header(BinseqHeaderBuilder::new().slen(32).build()?) 
751 | .buffer_capacity(16384) 752 | .build(inner)?; 753 | 754 | // Convert back to Vec to verify it works 755 | let inner = writer.into_inner()?; 756 | assert_eq!(inner.len(), SIZE_HEADER); 757 | Ok(()) 758 | } 759 | } 760 | -------------------------------------------------------------------------------- /src/vbq/index.rs: -------------------------------------------------------------------------------- 1 | //! # VBQ Index Format 2 | //! 3 | //! This module implements the embedded index format for VBQ files. 4 | //! 5 | //! ## Format Changes (v0.7.0+) 6 | //! 7 | //! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files instead of 8 | //! being stored in separate `.vqi` files. This improves portability and eliminates the 9 | //! need to manage auxiliary files. 10 | //! 11 | //! ## Embedded Index Structure 12 | //! 13 | //! The index is located at the end of the VBQ file with this layout: 14 | //! 15 | //! ```text 16 | //! [VBQ Data Blocks][Compressed Index][Index Size (u64)][INDEX_END_MAGIC (u64)] 17 | //! ``` 18 | //! 19 | //! Where: 20 | //! - **Compressed Index**: ZSTD-compressed index data (`IndexHeader` + `BlockRanges`) 21 | //! - **Index Size**: 8 bytes indicating size of compressed index data 22 | //! - **`INDEX_END_MAGIC`**: 8 bytes (`0x444E455845444E49` = "INDEXEND") 23 | //! 24 | //! ## Index Contents 25 | //! 26 | //! The compressed index contains: 27 | //! 1. **`IndexHeader`** (32 bytes): Metadata about the indexed file 28 | //! 2. **`BlockRange` entries** (32 bytes each): One per data block 29 | //! 30 | //! ## Key Changes from v0.6.x 31 | //! 32 | //! - Index moved from separate `.vqi` files into VBQ files 33 | //! - Cumulative record counts changed from `u32` to `u64` 34 | //! 
- Support for files with more than 4 billion records 35 | 36 | use std::{ 37 | fs::File, 38 | io::{BufReader, BufWriter, Cursor, Read, Write}, 39 | path::Path, 40 | }; 41 | 42 | use byteorder::{ByteOrder, LittleEndian}; 43 | use zstd::{Decoder, Encoder}; 44 | 45 | use super::{ 46 | header::{SIZE_BLOCK_HEADER, SIZE_HEADER}, 47 | BlockHeader, VBinseqHeader, 48 | }; 49 | use crate::error::{IndexError, Result}; 50 | 51 | /// Size of `BlockRange` in bytes 52 | pub const SIZE_BLOCK_RANGE: usize = 32; 53 | /// Size of `IndexHeader` in bytes 54 | pub const INDEX_HEADER_SIZE: usize = 32; 55 | /// Magic number to designate index (VBQINDEX) 56 | #[allow(clippy::unreadable_literal)] 57 | pub const INDEX_MAGIC: u64 = 0x5845444e49514256; 58 | /// Magic number to designate end of index (INDEXEND) 59 | #[allow(clippy::unreadable_literal)] 60 | pub const INDEX_END_MAGIC: u64 = 0x444E455845444E49; 61 | /// Index Block Reservation 62 | pub const INDEX_RESERVATION: [u8; 4] = [42; 4]; 63 | 64 | /// Descriptor of the dimensions of a block in a VBINSEQ file 65 | /// 66 | /// A `BlockRange` contains metadata about a single block within a VBINSEQ file, 67 | /// including its position, size, and record count. This information enables 68 | /// efficient random access to blocks without scanning the entire file. 69 | /// 70 | /// Block ranges are stored in a `BlockIndex` to form a complete index of a VBINSEQ file. 71 | /// Each range is serialized to a fixed-size 32-byte structure when stored in the embedded index. 
72 | /// 73 | /// ## Format Changes (v0.7.0+) 74 | /// 75 | /// - `cumulative_records` field changed from `u32` to `u64` 76 | /// - Supports files with more than 4 billion records 77 | /// - Reserved bytes reduced from 8 to 4 bytes 78 | /// 79 | /// # Examples 80 | /// 81 | /// ```rust 82 | /// use binseq::vbq::BlockRange; 83 | /// 84 | /// // Create a new block range 85 | /// let range = BlockRange::new( 86 | /// 1024, // Starting offset in the file (bytes) 87 | /// 8192, // Length of the block (bytes) 88 | /// 1000, // Number of records in this block 89 | /// 5000 // Cumulative number of records up to this block (now u64) 90 | /// ); 91 | /// 92 | /// // Use the range information 93 | /// println!("Block starts at byte {}", range.start_offset); 94 | /// println!("Block contains {} records", range.block_records); 95 | /// ``` 96 | #[derive(Debug, Clone, Copy)] 97 | pub struct BlockRange { 98 | /// File offset where the block starts (in bytes, including headers) 99 | /// 100 | /// This is the absolute byte position in the file where this block begins, 101 | /// including the file header and block header. 102 | /// 103 | /// (8 bytes in serialized form) 104 | pub start_offset: u64, 105 | 106 | /// Length of the block data in bytes 107 | /// 108 | /// This is the size of the block data, not including the block header. 109 | /// For compressed blocks, this is the compressed size. 110 | /// 111 | /// (8 bytes in serialized form) 112 | pub len: u64, 113 | 114 | /// Number of records contained in this block 115 | /// 116 | /// (4 bytes in serialized form) 117 | pub block_records: u32, 118 | 119 | /// Cumulative number of records up to this block 120 | /// 121 | /// This allows efficient determination of which block contains a specific record 122 | /// by index without scanning through all previous blocks. 123 | /// 124 | /// **BREAKING CHANGE (v0.7.0+)**: Changed from u32 to u64 to support files 125 | /// with more than 4 billion records. 
126 | /// 127 | /// (8 bytes in serialized form) 128 | pub cumulative_records: u64, 129 | 130 | /// Reserved bytes for future extensions 131 | pub reservation: [u8; 4], 132 | } 133 | impl BlockRange { 134 | /// Creates a new `BlockRange` with the specified parameters 135 | /// 136 | /// # Parameters 137 | /// 138 | /// * `start_offset` - The byte offset in the file where this block starts 139 | /// * `len` - The length of the block data in bytes 140 | /// * `block_records` - The number of records contained in this block 141 | /// * `cumulative_records` - The total number of records up to and including this block 142 | /// 143 | /// # Returns 144 | /// 145 | /// A new `BlockRange` instance with the specified parameters 146 | /// 147 | /// # Examples 148 | /// 149 | /// ```rust 150 | /// use binseq::vbq::BlockRange; 151 | /// 152 | /// // Create a new block range for a block starting at byte 1024 153 | /// let range = BlockRange::new(1024, 8192, 1000, 5000); 154 | /// ``` 155 | #[must_use] 156 | pub fn new(start_offset: u64, len: u64, block_records: u32, cumulative_records: u64) -> Self { 157 | Self { 158 | start_offset, 159 | len, 160 | block_records, 161 | cumulative_records, 162 | reservation: INDEX_RESERVATION, 163 | } 164 | } 165 | 166 | /// Serializes the block range to a binary format and writes it to the provided writer 167 | /// 168 | /// This method serializes the `BlockRange` to a fixed-size 32-byte structure and 169 | /// writes it to the provided writer. 
The serialized format is: 170 | /// - Bytes 0-7: `start_offset` (u64, little endian) 171 | /// - Bytes 8-15: len (u64, little endian) 172 | /// - Bytes 16-19: `block_records` (u32, little endian) 173 | /// - Bytes 20-23: `cumulative_records` (u32, little endian) 174 | /// - Bytes 24-31: reservation (8 bytes) 175 | /// 176 | /// # Parameters 177 | /// 178 | /// * `writer` - The destination to write the serialized block range to 179 | /// 180 | /// # Returns 181 | /// 182 | /// * `Ok(())` - If the block range was successfully written 183 | /// * `Err(_)` - If an error occurred during writing 184 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 185 | let mut buf = [0; SIZE_BLOCK_RANGE]; 186 | LittleEndian::write_u64(&mut buf[0..8], self.start_offset); 187 | LittleEndian::write_u64(&mut buf[8..16], self.len); 188 | LittleEndian::write_u32(&mut buf[16..20], self.block_records); 189 | LittleEndian::write_u64(&mut buf[20..28], self.cumulative_records); 190 | buf[28..].copy_from_slice(&self.reservation); 191 | writer.write_all(&buf)?; 192 | Ok(()) 193 | } 194 | 195 | /// Deserializes a `BlockRange` from a fixed-size buffer 196 | /// 197 | /// This method deserializes a `BlockRange` from a 32-byte buffer in the format 198 | /// used by `write_bytes`. It's typically used when reading an index file. 
199 | /// 200 | /// # Parameters 201 | /// 202 | /// * `buffer` - A fixed-size buffer containing a serialized `BlockRange` 203 | /// 204 | /// # Returns 205 | /// 206 | /// A new `BlockRange` with the values read from the buffer 207 | /// 208 | /// # Format 209 | /// 210 | /// The buffer is expected to contain: 211 | /// - Bytes 0-7: `start_offset` (u64, little endian) 212 | /// - Bytes 8-15: len (u64, little endian) 213 | /// - Bytes 16-19: `block_records` (u32, little endian) 214 | /// - Bytes 20-27: `cumulative_records` (u64, little endian) 215 | /// - Bytes 28-31: reservation (ignored, default value used) 216 | #[must_use] 217 | pub fn from_exact(buffer: &[u8; SIZE_BLOCK_RANGE]) -> Self { 218 | Self { 219 | start_offset: LittleEndian::read_u64(&buffer[0..8]), 220 | len: LittleEndian::read_u64(&buffer[8..16]), 221 | block_records: LittleEndian::read_u32(&buffer[16..20]), 222 | cumulative_records: LittleEndian::read_u64(&buffer[20..28]), 223 | reservation: INDEX_RESERVATION, 224 | } 225 | } 226 | 227 | /// Deserializes a `BlockRange` from a slice of bytes 228 | /// 229 | /// This is a convenience method that copies the first 32 bytes from the provided slice 230 | /// into a fixed-size buffer and then calls `from_exact`. It's useful when reading from 231 | /// a larger buffer that contains multiple serialized `BlockRange` instances. 232 | /// 233 | /// # Parameters 234 | /// 235 | /// * `buffer` - A slice containing at least 32 bytes with a serialized `BlockRange` 236 | /// 237 | /// # Returns 238 | /// 239 | /// A new `BlockRange` with the values read from the buffer 240 | /// 241 | /// # Panics 242 | /// 243 | /// This method will panic if the buffer is less than 32 bytes long. 
244 | #[must_use] 245 | pub fn from_bytes(buffer: &[u8]) -> Self { 246 | let mut buf = [0; SIZE_BLOCK_RANGE]; 247 | buf.copy_from_slice(buffer); 248 | Self::from_exact(&buf) 249 | } 250 | } 251 | 252 | /// Header for a VBINSEQ index file 253 | /// 254 | /// The `IndexHeader` contains metadata about an index file, including a magic number 255 | /// for validation and the size of the indexed file. This allows verifying that an index 256 | /// file matches its corresponding VBINSEQ file. 257 | /// 258 | /// The header has a fixed size of 32 bytes to ensure compatibility across versions. 259 | #[derive(Debug, Clone, Copy)] 260 | pub struct IndexHeader { 261 | /// Magic number to designate the index file ("VBQINDEX" in ASCII) 262 | /// 263 | /// This is used to verify that a file is indeed a VBINSEQ index file. 264 | /// (8 bytes in serialized form) 265 | magic: u64, 266 | 267 | /// Total size of the indexed VBINSEQ file in bytes 268 | /// 269 | /// This is used to verify that the index matches the file it references. 270 | /// (8 bytes in serialized form) 271 | bytes: u64, 272 | 273 | /// Reserved bytes for future extensions 274 | /// 275 | /// (16 bytes in serialized form) 276 | reserved: [u8; INDEX_HEADER_SIZE - 16], 277 | } 278 | impl IndexHeader { 279 | /// Creates a new index header for a VBINSEQ file of the specified size 280 | /// 281 | /// # Parameters 282 | /// 283 | /// * `bytes` - The total size of the VBINSEQ file being indexed, in bytes 284 | /// 285 | /// # Returns 286 | /// 287 | /// A new `IndexHeader` instance with the appropriate magic number and size 288 | pub fn new(bytes: u64) -> Self { 289 | Self { 290 | magic: INDEX_MAGIC, 291 | bytes, 292 | reserved: [42; INDEX_HEADER_SIZE - 16], 293 | } 294 | } 295 | /// Reads an index header from the provided reader 296 | /// 297 | /// This method reads 32 bytes from the provided reader and deserializes them 298 | /// into an `IndexHeader`. 
It validates the magic number to ensure that the file 299 | /// is indeed a VBINSEQ index file. 300 | /// 301 | /// # Parameters 302 | /// 303 | /// * `reader` - The source from which to read the header 304 | /// 305 | /// # Returns 306 | /// 307 | /// * `Ok(Self)` - If the header was successfully read and has a valid magic number 308 | /// * `Err(_)` - If an error occurred during reading or the magic number is invalid 309 | /// 310 | /// # Format 311 | /// 312 | /// The header is expected to be 32 bytes with the following structure: 313 | /// - Bytes 0-7: magic number (u64, little endian, must be `INDEX_MAGIC`) 314 | /// - Bytes 8-15: file size in bytes (u64, little endian) 315 | /// - Bytes 16-31: reserved for future extensions 316 | pub fn from_reader(reader: &mut R) -> Result { 317 | let mut buffer = [0; INDEX_HEADER_SIZE]; 318 | reader.read_exact(&mut buffer)?; 319 | let magic = LittleEndian::read_u64(&buffer[0..8]); 320 | let bytes = LittleEndian::read_u64(&buffer[8..16]); 321 | let Ok(reserved) = buffer[16..INDEX_HEADER_SIZE].try_into() else { 322 | return Err(IndexError::InvalidReservedBytes.into()); 323 | }; 324 | if magic != INDEX_MAGIC { 325 | return Err(IndexError::InvalidMagicNumber(magic).into()); 326 | } 327 | Ok(Self { 328 | magic, 329 | bytes, 330 | reserved, 331 | }) 332 | } 333 | 334 | pub fn from_bytes(bytes: &[u8]) -> Result { 335 | let mut buffer = [0; INDEX_HEADER_SIZE]; 336 | buffer.copy_from_slice(&bytes[..INDEX_HEADER_SIZE]); 337 | Self::from_reader(&mut Cursor::new(buffer)) 338 | } 339 | 340 | /// Serializes the index header to a binary format and writes it to the provided writer 341 | /// 342 | /// This method serializes the `IndexHeader` to a fixed-size 32-byte structure and 343 | /// writes it to the provided writer. This is typically used when saving an index to a file. 
344 | /// 345 | /// # Parameters 346 | /// 347 | /// * `writer` - The destination to write the serialized header to 348 | /// 349 | /// # Returns 350 | /// 351 | /// * `Ok(())` - If the header was successfully written 352 | /// * `Err(_)` - If an error occurred during writing 353 | /// 354 | /// # Format 355 | /// 356 | /// The header is serialized as: 357 | /// - Bytes 0-7: magic number (u64, little endian) 358 | /// - Bytes 8-15: file size in bytes (u64, little endian) 359 | /// - Bytes 16-31: reserved for future extensions 360 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 361 | let mut buffer = [0; INDEX_HEADER_SIZE]; 362 | LittleEndian::write_u64(&mut buffer[0..8], self.magic); 363 | LittleEndian::write_u64(&mut buffer[8..16], self.bytes); 364 | buffer[16..].copy_from_slice(&self.reserved); 365 | writer.write_all(&buffer)?; 366 | Ok(()) 367 | } 368 | } 369 | 370 | /// Complete index for a VBINSEQ file 371 | /// 372 | /// A `BlockIndex` contains metadata about a VBINSEQ file and all of its blocks, 373 | /// enabling efficient random access and parallel processing. It consists of an 374 | /// `IndexHeader` and a collection of `BlockRange` entries, one for each block in 375 | /// the file. 376 | /// 377 | /// The index can be created by scanning a VBINSEQ file or loaded from a previously 378 | /// created index file. Once loaded, it provides information about block locations, 379 | /// sizes, and record counts. 
380 | /// 381 | /// # Examples 382 | /// 383 | /// ```rust,no_run 384 | /// use binseq::vbq::{BlockIndex, MmapReader}; 385 | /// use std::path::Path; 386 | /// 387 | /// // Create an index from a VBINSEQ file 388 | /// let vbq_path = Path::new("example.vbq"); 389 | /// let index = BlockIndex::from_vbq(vbq_path).unwrap(); 390 | /// 391 | /// // Save the index for future use 392 | /// let index_path = Path::new("example.vbq.vqi"); 393 | /// index.save_to_path(index_path).unwrap(); 394 | /// 395 | /// // Use the index with a reader for parallel processing 396 | /// let reader = MmapReader::new(vbq_path).unwrap(); 397 | /// println!("File contains {} blocks", index.n_blocks()); 398 | /// ``` 399 | #[derive(Debug, Clone)] 400 | pub struct BlockIndex { 401 | /// Header containing metadata about the indexed file 402 | pub(crate) header: IndexHeader, 403 | 404 | /// Collection of block ranges, one for each block in the file 405 | pub(crate) ranges: Vec, 406 | } 407 | impl BlockIndex { 408 | /// Creates a new empty block index with the specified header 409 | /// 410 | /// # Parameters 411 | /// 412 | /// * `header` - The index header containing metadata about the indexed file 413 | /// 414 | /// # Returns 415 | /// 416 | /// A new empty `BlockIndex` instance 417 | #[must_use] 418 | pub fn new(header: IndexHeader) -> Self { 419 | Self { 420 | header, 421 | ranges: Vec::default(), 422 | } 423 | } 424 | /// Returns the number of blocks in the indexed file 425 | /// 426 | /// # Returns 427 | /// 428 | /// The number of blocks in the VBINSEQ file described by this index 429 | /// 430 | /// # Examples 431 | /// 432 | /// ```rust,no_run 433 | /// use binseq::vbq::BlockIndex; 434 | /// use std::path::Path; 435 | /// 436 | /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); 437 | /// println!("The file contains {} blocks", index.n_blocks()); 438 | /// ``` 439 | #[must_use] 440 | pub fn n_blocks(&self) -> usize { 441 | self.ranges.len() 442 | } 443 | 444 | 
/// Writes the collection of `BlockRange` to a file 445 | /// Saves the index to a file 446 | /// 447 | /// This writes the index header and all block ranges to a file, which can be loaded 448 | /// later to avoid rescanning the VBINSEQ file. The index is compressed to reduce 449 | /// storage space. 450 | /// 451 | /// # Parameters 452 | /// 453 | /// * `path` - The path where the index file should be saved 454 | /// 455 | /// # Returns 456 | /// 457 | /// * `Ok(())` - If the index was successfully saved 458 | /// * `Err(_)` - If an error occurred during saving 459 | /// 460 | /// # Examples 461 | /// 462 | /// ```rust,no_run 463 | /// use binseq::vbq::BlockIndex; 464 | /// use std::path::Path; 465 | /// 466 | /// // Create an index from a VBINSEQ file 467 | /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); 468 | /// 469 | /// // Save it for future use 470 | /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); 471 | /// ``` 472 | pub fn save_to_path>(&self, path: P) -> Result<()> { 473 | let mut writer = File::create(path).map(BufWriter::new)?; 474 | self.header.write_bytes(&mut writer)?; 475 | let mut writer = Encoder::new(writer, 3)?.auto_finish(); 476 | self.write_range(&mut writer)?; 477 | writer.flush()?; 478 | Ok(()) 479 | } 480 | 481 | /// Write the index to an output buffer 482 | pub fn write_bytes(&self, writer: &mut W) -> Result<()> { 483 | self.header.write_bytes(writer)?; 484 | let mut writer = Encoder::new(writer, 3)?.auto_finish(); 485 | self.write_range(&mut writer)?; 486 | writer.flush()?; 487 | Ok(()) 488 | } 489 | 490 | /// Write the collection of `BlockRange` to an output handle 491 | /// Writes all block ranges to the provided writer 492 | /// 493 | /// This method is used internally by `save_to_path` to write the block ranges 494 | /// to an index file. It can also be used to serialize an index to any destination 495 | /// that implements `Write`. 
496 | /// 497 | /// # Parameters 498 | /// 499 | /// * `writer` - The destination to write the block ranges to 500 | /// 501 | /// # Returns 502 | /// 503 | /// * `Ok(())` - If all block ranges were successfully written 504 | /// * `Err(_)` - If an error occurred during writing 505 | pub fn write_range(&self, writer: &mut W) -> Result<()> { 506 | self.ranges 507 | .iter() 508 | .filter(|range| range.block_records > 0) 509 | .try_for_each(|range| -> Result<()> { range.write_bytes(writer) }) 510 | } 511 | 512 | /// Adds a block range to the index 513 | /// 514 | /// This method is used internally during index creation to add information 515 | /// about each block in the file. Blocks are typically added in order. 516 | /// 517 | /// # Parameters 518 | /// 519 | /// * `range` - The block range to add to the index 520 | fn add_range(&mut self, range: BlockRange) { 521 | self.ranges.push(range); 522 | } 523 | 524 | /// Creates a new index by scanning a VBINSEQ file 525 | /// 526 | /// This method memory-maps the specified VBINSEQ file and scans it block by block 527 | /// to create an index. The index can then be saved to a file for future use, enabling 528 | /// efficient random access without rescanning the file. 
529 | /// 530 | /// # Parameters 531 | /// 532 | /// * `path` - Path to the VBINSEQ file to index 533 | /// 534 | /// # Returns 535 | /// 536 | /// * `Ok(Self)` - A new `BlockIndex` containing information about all blocks in the file 537 | /// * `Err(_)` - If an error occurred during file opening, validation, or scanning 538 | /// 539 | /// # Examples 540 | /// 541 | /// ```rust,no_run 542 | /// use binseq::vbq::BlockIndex; 543 | /// use std::path::Path; 544 | /// 545 | /// // Create an index from a VBINSEQ file 546 | /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); 547 | /// 548 | /// // Save the index for future use 549 | /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); 550 | /// 551 | /// // Get statistics about the file 552 | /// println!("File contains {} blocks", index.n_blocks()); 553 | /// 554 | /// // Analyze the record distribution 555 | /// if let Some(last_range) = index.ranges().last() { 556 | /// println!("Total records: {}", last_range.cumulative_records); 557 | /// println!("Average records per block: {}", 558 | /// last_range.cumulative_records as f64 / index.n_blocks() as f64); 559 | /// } 560 | /// ``` 561 | /// 562 | /// # Notes 563 | /// 564 | /// This method uses memory mapping for efficiency, which allows the operating system 565 | /// to load only the needed portions of the file into memory as they are accessed. 566 | pub fn from_vbq>(path: P) -> Result { 567 | let file = File::open(path)?; 568 | let mmap = unsafe { memmap2::Mmap::map(&file)? }; 569 | let file_size = mmap.len(); 570 | 571 | // Read header from mapped memory (unused but checks for validity) 572 | let _header = { 573 | let mut header_bytes = [0u8; SIZE_HEADER]; 574 | header_bytes.copy_from_slice(&mmap[..SIZE_HEADER]); 575 | VBinseqHeader::from_bytes(&header_bytes)? 
576 | }; 577 | 578 | // Initialize position after the header 579 | let mut pos = SIZE_HEADER; 580 | 581 | // Initialize the collection 582 | let index_header = IndexHeader::new(file_size as u64); 583 | let mut index = BlockIndex::new(index_header); 584 | 585 | // Find all block headers 586 | let mut record_total = 0; 587 | while pos < mmap.len() { 588 | let block_header = { 589 | let mut header_bytes = [0u8; SIZE_BLOCK_HEADER]; 590 | header_bytes.copy_from_slice(&mmap[pos..pos + SIZE_BLOCK_HEADER]); 591 | BlockHeader::from_bytes(&header_bytes)? 592 | }; 593 | index.add_range(BlockRange::new( 594 | pos as u64, 595 | block_header.size, 596 | block_header.records, 597 | record_total, 598 | )); 599 | pos += SIZE_BLOCK_HEADER + block_header.size as usize; 600 | record_total += u64::from(block_header.records); 601 | } 602 | 603 | Ok(index) 604 | } 605 | 606 | /// Reads an index from a path 607 | /// 608 | /// # Panics 609 | /// Panics if the path is not a valid UTF-8 string. 610 | pub fn from_path>(path: P) -> Result { 611 | let Some(upstream_file) = path.as_ref().to_str().unwrap().strip_suffix(".vqi") else { 612 | return Err(IndexError::MissingUpstreamFile( 613 | path.as_ref().to_string_lossy().to_string(), 614 | ) 615 | .into()); 616 | }; 617 | let upstream_handle = File::open(upstream_file)?; 618 | let mmap = unsafe { memmap2::Mmap::map(&upstream_handle)? 
}; 619 | let file_size = mmap.len() as u64; 620 | 621 | let mut file_handle = File::open(path).map(BufReader::new)?; 622 | let index_header = IndexHeader::from_reader(&mut file_handle)?; 623 | if index_header.bytes != file_size { 624 | return Err(IndexError::ByteSizeMismatch(file_size, index_header.bytes).into()); 625 | } 626 | let buffer = { 627 | let mut buffer = Vec::new(); 628 | let mut decoder = Decoder::new(file_handle)?; 629 | decoder.read_to_end(&mut buffer)?; 630 | buffer 631 | }; 632 | 633 | let mut ranges = Self::new(index_header); 634 | let mut pos = 0; 635 | while pos < buffer.len() { 636 | let bound = pos + SIZE_BLOCK_RANGE; 637 | let range = BlockRange::from_bytes(&buffer[pos..bound]); 638 | ranges.add_range(range); 639 | pos += SIZE_BLOCK_RANGE; 640 | } 641 | 642 | Ok(ranges) 643 | } 644 | 645 | pub fn from_bytes(bytes: &[u8]) -> Result { 646 | let index_header = IndexHeader::from_bytes(bytes)?; 647 | let buffer = { 648 | let mut buffer = Vec::new(); 649 | let mut decoder = Decoder::new(Cursor::new(&bytes[INDEX_HEADER_SIZE..]))?; 650 | decoder.read_to_end(&mut buffer)?; 651 | buffer 652 | }; 653 | 654 | let mut ranges = Self::new(index_header); 655 | let mut pos = 0; 656 | while pos < buffer.len() { 657 | let bound = pos + SIZE_BLOCK_RANGE; 658 | let range = BlockRange::from_bytes(&buffer[pos..bound]); 659 | ranges.add_range(range); 660 | pos += SIZE_BLOCK_RANGE; 661 | } 662 | 663 | Ok(ranges) 664 | } 665 | 666 | /// Get a reference to the internal ranges 667 | /// Returns a reference to the collection of block ranges 668 | /// 669 | /// This provides access to the metadata for all blocks in the indexed file, 670 | /// which can be used for operations like parallel processing or random access. 
671 | /// 672 | /// # Returns 673 | /// 674 | /// A slice containing all `BlockRange` entries in this index 675 | /// 676 | /// # Examples 677 | /// 678 | /// ```rust,no_run 679 | /// use binseq::vbq::BlockIndex; 680 | /// use std::path::Path; 681 | /// 682 | /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); 683 | /// 684 | /// // Examine the ranges to determine which blocks to process 685 | /// for (i, range) in index.ranges().iter().enumerate() { 686 | /// println!("Block {}: {} records at offset {}", 687 | /// i, range.block_records, range.start_offset); 688 | /// } 689 | /// ``` 690 | #[must_use] 691 | pub fn ranges(&self) -> &[BlockRange] { 692 | &self.ranges 693 | } 694 | 695 | pub fn pprint(&self) { 696 | self.ranges.iter().for_each(|range| { 697 | println!( 698 | "{}\t{}\t{}\t{}", 699 | range.start_offset, range.len, range.block_records, range.cumulative_records 700 | ); 701 | }); 702 | } 703 | 704 | /// Returns the total number of records in the dataset 705 | #[must_use] 706 | pub fn num_records(&self) -> usize { 707 | self.ranges 708 | .iter() 709 | .next_back() 710 | .map(|r| (r.cumulative_records + u64::from(r.block_records)) as usize) 711 | .unwrap_or_default() 712 | } 713 | } 714 | -------------------------------------------------------------------------------- /src/bq/reader.rs: -------------------------------------------------------------------------------- 1 | //! Binary sequence reader module 2 | //! 3 | //! This module provides functionality for reading binary sequence files using either: 4 | //! 1. Memory mapping for efficient access to entire files 5 | //! 2. Streaming for processing data as it arrives 6 | //! 7 | //! It supports both sequential and parallel processing of records, 8 | //! with configurable record layouts for different sequence types. 
9 | 10 | use std::fs::File; 11 | use std::io::Read; 12 | use std::ops::Range; 13 | use std::path::Path; 14 | use std::sync::Arc; 15 | 16 | use bitnuc::BitSize; 17 | use bytemuck::cast_slice; 18 | use memmap2::Mmap; 19 | 20 | use super::header::{BinseqHeader, SIZE_HEADER}; 21 | use crate::{ 22 | error::{ReadError, Result}, 23 | BinseqRecord, Error, ParallelProcessor, ParallelReader, 24 | }; 25 | 26 | /// A reference to a binary sequence record in a memory-mapped file 27 | /// 28 | /// This struct provides a view into a single record within a binary sequence file, 29 | /// allowing access to the record's components (sequence data, flags, etc.) without 30 | /// copying the data from the memory-mapped file. 31 | /// 32 | /// The record's data is stored in a compact binary format where: 33 | /// - The first u64 contains flags 34 | /// - Subsequent u64s contain the primary sequence data 35 | /// - If present, final u64s contain the extended sequence data 36 | #[derive(Clone, Copy)] 37 | pub struct RefRecord<'a> { 38 | /// The position (index) of this record in the file (0-based record index, not byte offset) 39 | id: u64, 40 | /// The underlying u64 buffer representing the record's binary data 41 | buffer: &'a [u64], 42 | /// The configuration that defines the layout and size of record components 43 | config: RecordConfig, 44 | /// Cached index string for the sequence header 45 | header_buf: [u8; 20], 46 | /// Length of the header in bytes 47 | header_len: usize, 48 | } 49 | impl<'a> RefRecord<'a> { 50 | /// Creates a new record reference 51 | /// 52 | /// # Arguments 53 | /// 54 | /// * `id` - The record's position in the file (0-based record index, not byte offset) 55 | /// * `buffer` - The u64 slice containing the record's binary data 56 | /// * `config` - Configuration defining the record's layout 57 | /// 58 | /// # Panics 59 | /// 60 | /// Panics if the buffer length doesn't match the expected size from the config 61 | #[must_use] 62 | pub fn new(id: u64, buffer: 
&'a [u64], config: RecordConfig) -> Self { 63 | assert_eq!(buffer.len(), config.record_size_u64()); 64 | Self { 65 | id, 66 | buffer, 67 | config, 68 | header_buf: [0; 20], 69 | header_len: 0, 70 | } 71 | } 72 | /// Returns the record's configuration 73 | /// 74 | /// The configuration defines the layout and size of the record's components. 75 | #[must_use] 76 | pub fn config(&self) -> RecordConfig { 77 | self.config 78 | } 79 | 80 | pub fn set_id(&mut self, id: &[u8]) { 81 | self.header_len = id.len(); 82 | self.header_buf[..self.header_len].copy_from_slice(id); 83 | } 84 | } 85 | 86 | impl BinseqRecord for RefRecord<'_> { 87 | fn bitsize(&self) -> BitSize { 88 | self.config.bitsize 89 | } 90 | fn index(&self) -> u64 { 91 | self.id 92 | } 93 | /// Clear the buffer and fill it with the sequence header 94 | fn sheader(&self) -> &[u8] { 95 | &self.header_buf[..self.header_len] 96 | } 97 | 98 | /// Clear the buffer and fill it with the extended header 99 | fn xheader(&self) -> &[u8] { 100 | self.sheader() 101 | } 102 | 103 | fn flag(&self) -> Option { 104 | if self.config.flags { 105 | Some(self.buffer[0]) 106 | } else { 107 | None 108 | } 109 | } 110 | fn slen(&self) -> u64 { 111 | self.config.slen 112 | } 113 | fn xlen(&self) -> u64 { 114 | self.config.xlen 115 | } 116 | fn sbuf(&self) -> &[u64] { 117 | if self.config.flags { 118 | &self.buffer[1..=(self.config.schunk as usize)] 119 | } else { 120 | &self.buffer[..(self.config.schunk as usize)] 121 | } 122 | } 123 | fn xbuf(&self) -> &[u64] { 124 | if self.config.flags { 125 | &self.buffer[1 + self.config.schunk as usize..] 126 | } else { 127 | &self.buffer[self.config.schunk as usize..] 
128 | } 129 | } 130 | } 131 | 132 | /// A reference to a record in the map with a precomputed decoded buffer slice 133 | pub struct BatchRecord<'a> { 134 | /// Unprocessed buffer slice (with flags) 135 | buffer: &'a [u64], 136 | /// Decoded buffer slice 137 | dbuf: &'a [u8], 138 | /// Record ID 139 | id: u64, 140 | /// The configuration that defines the layout and size of record components 141 | config: RecordConfig, 142 | /// Cached index string for the sequence header 143 | header_buf: [u8; 20], 144 | /// Length of the header in bytes 145 | header_len: usize, 146 | } 147 | impl BinseqRecord for BatchRecord<'_> { 148 | fn bitsize(&self) -> BitSize { 149 | self.config.bitsize 150 | } 151 | fn index(&self) -> u64 { 152 | self.id 153 | } 154 | /// Clear the buffer and fill it with the sequence header 155 | fn sheader(&self) -> &[u8] { 156 | &self.header_buf[..self.header_len] 157 | } 158 | 159 | /// Clear the buffer and fill it with the extended header 160 | fn xheader(&self) -> &[u8] { 161 | self.sheader() 162 | } 163 | 164 | fn flag(&self) -> Option { 165 | if self.config.flags { 166 | Some(self.buffer[0]) 167 | } else { 168 | None 169 | } 170 | } 171 | fn slen(&self) -> u64 { 172 | self.config.slen 173 | } 174 | fn xlen(&self) -> u64 { 175 | self.config.xlen 176 | } 177 | fn sbuf(&self) -> &[u64] { 178 | if self.config.flags { 179 | &self.buffer[1..=(self.config.schunk as usize)] 180 | } else { 181 | &self.buffer[..(self.config.schunk as usize)] 182 | } 183 | } 184 | fn xbuf(&self) -> &[u64] { 185 | if self.config.flags { 186 | &self.buffer[1 + self.config.schunk as usize..] 187 | } else { 188 | &self.buffer[self.config.schunk as usize..] 
189 | } 190 | } 191 | fn decode_s(&self, dbuf: &mut Vec) -> Result<()> { 192 | dbuf.extend_from_slice(self.sseq()); 193 | Ok(()) 194 | } 195 | fn decode_x(&self, dbuf: &mut Vec) -> Result<()> { 196 | dbuf.extend_from_slice(self.xseq()); 197 | Ok(()) 198 | } 199 | /// Override this method since we can make use of block information 200 | fn sseq(&self) -> &[u8] { 201 | let scalar = self.config.scalar(); 202 | let mut lbound = 0; 203 | let mut rbound = self.config.slen(); 204 | if self.config.flags { 205 | lbound += scalar; 206 | rbound += scalar; 207 | } 208 | &self.dbuf[lbound..rbound] 209 | } 210 | /// Override this method since we can make use of block information 211 | fn xseq(&self) -> &[u8] { 212 | let scalar = self.config.scalar(); 213 | let mut lbound = scalar * self.config.schunk(); 214 | let mut rbound = lbound + self.config.xlen(); 215 | if self.config.flags { 216 | lbound += scalar; 217 | rbound += scalar; 218 | } 219 | &self.dbuf[lbound..rbound] 220 | } 221 | } 222 | 223 | /// Configuration for binary sequence record layout 224 | /// 225 | /// This struct defines the size and layout of binary sequence records, 226 | /// including both primary sequence data and optional extended data. 227 | /// It handles the translation between sequence lengths in base pairs 228 | /// and the number of u64 chunks needed to store the compressed data. 
229 | #[derive(Clone, Copy)] 230 | pub struct RecordConfig { 231 | /// The primary sequence length in base pairs 232 | slen: u64, 233 | /// The extended sequence length in base pairs 234 | xlen: u64, 235 | /// The number of u64 chunks needed to store the primary sequence 236 | /// (each u64 stores 32 nucleotides) 237 | schunk: u64, 238 | /// The number of u64 chunks needed to store the extended sequence 239 | /// (each u64 stores 32 values) 240 | xchunk: u64, 241 | /// The bitsize of the record 242 | bitsize: BitSize, 243 | /// Whether flags are present 244 | flags: bool, 245 | } 246 | impl RecordConfig { 247 | /// Creates a new record configuration 248 | /// 249 | /// This constructor initializes a configuration for a binary sequence record 250 | /// with specified primary and extended sequence lengths. 251 | /// 252 | /// # Arguments 253 | /// 254 | /// * `slen` - The length of primary sequences in the file 255 | /// * `xlen` - The length of secondary/extended sequences in the file 256 | /// * `bitsize` - The bitsize of the record 257 | /// * `flags` - Whether flags are present 258 | /// 259 | /// # Returns 260 | /// 261 | /// A new `RecordConfig` instance with the specified sequence lengths 262 | pub fn new(slen: usize, xlen: usize, bitsize: BitSize, flags: bool) -> Self { 263 | let (schunk, xchunk) = match bitsize { 264 | BitSize::Two => (slen.div_ceil(32), xlen.div_ceil(32)), 265 | BitSize::Four => (slen.div_ceil(16), xlen.div_ceil(16)), 266 | }; 267 | Self { 268 | slen: slen as u64, 269 | xlen: xlen as u64, 270 | schunk: schunk as u64, 271 | xchunk: xchunk as u64, 272 | bitsize, 273 | flags, 274 | } 275 | } 276 | 277 | /// Creates a new record configuration from a header 278 | /// 279 | /// This constructor initializes a configuration based on a header that contains 280 | /// the sequence lengths for primary and extended sequences. 
281 | /// 282 | /// # Arguments 283 | /// 284 | /// * `header` - A reference to a `BinseqHeader` containing sequence lengths 285 | /// 286 | /// # Returns 287 | /// 288 | /// A new `RecordConfig` instance with the sequence lengths from the header 289 | pub fn from_header(header: &BinseqHeader) -> Self { 290 | Self::new( 291 | header.slen as usize, 292 | header.xlen as usize, 293 | header.bits, 294 | header.flags, 295 | ) 296 | } 297 | 298 | /// Returns whether this record contains extended sequence data 299 | /// 300 | /// A record is considered paired if it has a non-zero extended sequence length. 301 | pub fn paired(&self) -> bool { 302 | self.xlen > 0 303 | } 304 | 305 | /// Returns the primary sequence length in base pairs 306 | /// 307 | /// This method returns the length of the primary sequence in base pairs. 308 | pub fn slen(&self) -> usize { 309 | self.slen as usize 310 | } 311 | 312 | /// Returns the extended sequence length in base pairs 313 | /// 314 | /// This method returns the length of the extended sequence in base pairs. 315 | pub fn xlen(&self) -> usize { 316 | self.xlen as usize 317 | } 318 | 319 | /// Returns the number of u64 chunks needed to store the primary sequence 320 | /// 321 | /// This method returns the number of u64 chunks required to store the primary 322 | /// sequence, where each u64 stores 32 nucleotides. 323 | pub fn schunk(&self) -> usize { 324 | self.schunk as usize 325 | } 326 | 327 | /// Returns the number of u64 chunks needed to store the extended sequence 328 | /// 329 | /// This method returns the number of u64 chunks required to store the extended 330 | /// sequence, where each u64 stores 32 values. 
331 | pub fn xchunk(&self) -> usize { 332 | self.xchunk as usize 333 | } 334 | 335 | /// Returns the full record size in bytes (u8): 336 | /// 8 * (schunk + xchunk + 1 (flag)) 337 | pub fn record_size_bytes(&self) -> usize { 338 | 8 * self.record_size_u64() 339 | } 340 | 341 | /// Returns the full record size in u64 342 | /// schunk + xchunk + 1 (flag) 343 | pub fn record_size_u64(&self) -> usize { 344 | if self.flags { 345 | (self.schunk + self.xchunk + 1) as usize 346 | } else { 347 | (self.schunk + self.xchunk) as usize 348 | } 349 | } 350 | 351 | /// The number of nucleotides per word 352 | pub fn scalar(&self) -> usize { 353 | match self.bitsize { 354 | BitSize::Two => 32, 355 | BitSize::Four => 16, 356 | } 357 | } 358 | } 359 | 360 | /// A memory-mapped reader for binary sequence files 361 | /// 362 | /// This reader provides efficient access to binary sequence files by memory-mapping 363 | /// them instead of performing traditional I/O operations. It supports both 364 | /// sequential access to individual records and parallel processing of records 365 | /// across multiple threads. 366 | /// 367 | /// The reader ensures thread-safety through the use of `Arc` for sharing the 368 | /// memory-mapped data between threads. 369 | /// 370 | /// Records are returned as [`RefRecord`] which implement the [`BinseqRecord`] trait. 
371 | /// 372 | /// # Examples 373 | /// 374 | /// ``` 375 | /// use binseq::bq::MmapReader; 376 | /// use binseq::Result; 377 | /// 378 | /// fn main() -> Result<()> { 379 | /// let path = "./data/subset.bq"; 380 | /// let reader = MmapReader::new(path)?; 381 | /// 382 | /// // Calculate the number of records in the file 383 | /// let num_records = reader.num_records(); 384 | /// println!("Number of records: {}", num_records); 385 | /// 386 | /// // Get the record at index 20 (0-indexed) 387 | /// let record = reader.get(20)?; 388 | /// 389 | /// Ok(()) 390 | /// } 391 | /// ``` 392 | pub struct MmapReader { 393 | /// Memory mapped file contents, wrapped in Arc for thread-safe sharing 394 | mmap: Arc, 395 | 396 | /// Binary sequence file header containing format information 397 | header: BinseqHeader, 398 | 399 | /// Configuration defining the layout of records in the file 400 | config: RecordConfig, 401 | } 402 | 403 | impl MmapReader { 404 | /// Creates a new memory-mapped reader for a binary sequence file 405 | /// 406 | /// This method opens the file, memory-maps its contents, and validates 407 | /// the file structure to ensure it contains valid binary sequence data. 
408 | /// 409 | /// # Arguments 410 | /// 411 | /// * `path` - Path to the binary sequence file 412 | /// 413 | /// # Returns 414 | /// 415 | /// * `Ok(MmapReader)` - A new reader if the file is valid 416 | /// * `Err(Error)` - If the file is invalid or cannot be opened 417 | /// 418 | /// # Errors 419 | /// 420 | /// Returns an error if: 421 | /// * The file cannot be opened 422 | /// * The file is not a regular file 423 | /// * The file header is invalid 424 | /// * The file size doesn't match the expected size based on the header 425 | pub fn new>(path: P) -> Result { 426 | // Verify input file is a file before attempting to map 427 | let file = File::open(path)?; 428 | if !file.metadata()?.is_file() { 429 | return Err(ReadError::IncompatibleFile.into()); 430 | } 431 | 432 | // Safety: the file is open and won't be modified while mapped 433 | let mmap = unsafe { Mmap::map(&file)? }; 434 | 435 | // Read header from mapped memory 436 | let header = BinseqHeader::from_buffer(&mmap)?; 437 | 438 | // Record configuraration 439 | let config = RecordConfig::from_header(&header); 440 | 441 | // Immediately validate the size of the file against the expected byte size of records 442 | if !(mmap.len() - SIZE_HEADER).is_multiple_of(config.record_size_bytes()) { 443 | return Err(ReadError::FileTruncation(mmap.len()).into()); 444 | } 445 | 446 | Ok(Self { 447 | mmap: Arc::new(mmap), 448 | header, 449 | config, 450 | }) 451 | } 452 | 453 | /// Returns the total number of records in the file 454 | /// 455 | /// This is calculated by subtracting the header size from the total file size 456 | /// and dividing by the size of each record. 457 | #[must_use] 458 | pub fn num_records(&self) -> usize { 459 | (self.mmap.len() - SIZE_HEADER) / self.config.record_size_bytes() 460 | } 461 | 462 | /// Returns a copy of the binary sequence file header 463 | /// 464 | /// The header contains format information and sequence length specifications. 
465 | #[must_use] 466 | pub fn header(&self) -> BinseqHeader { 467 | self.header 468 | } 469 | 470 | /// Checks if the file has paired-records 471 | #[must_use] 472 | pub fn is_paired(&self) -> bool { 473 | self.header.is_paired() 474 | } 475 | 476 | /// Returns a reference to a specific record 477 | /// 478 | /// # Arguments 479 | /// 480 | /// * `idx` - The index of the record to retrieve (0-based) 481 | /// 482 | /// # Returns 483 | /// 484 | /// * `Ok(RefRecord)` - A reference to the requested record 485 | /// * `Err(Error)` - If the index is out of bounds 486 | /// 487 | /// # Errors 488 | /// 489 | /// Returns an error if the requested index is beyond the number of records in the file 490 | pub fn get(&self, idx: usize) -> Result> { 491 | if idx > self.num_records() { 492 | return Err(ReadError::OutOfRange(idx, self.num_records()).into()); 493 | } 494 | let rsize = self.config.record_size_bytes(); 495 | let lbound = SIZE_HEADER + (idx * rsize); 496 | let rbound = lbound + rsize; 497 | let bytes = &self.mmap[lbound..rbound]; 498 | let buffer = cast_slice(bytes); 499 | Ok(RefRecord::new(idx as u64, buffer, self.config)) 500 | } 501 | 502 | /// Returns a slice of the buffer containing the underlying u64 for that range 503 | /// of records. 
504 | /// 505 | /// Note: range 10..40 will return all u64s in the mmap between the record index 10 and 40 506 | pub fn get_buffer_slice(&self, range: Range) -> Result<&[u64]> { 507 | if range.end > self.num_records() { 508 | return Err(ReadError::OutOfRange(range.end, self.num_records()).into()); 509 | } 510 | let rsize = self.config.record_size_bytes(); 511 | let total_records = range.end - range.start; 512 | let lbound = SIZE_HEADER + (range.start * rsize); 513 | let rbound = lbound + (total_records * rsize); 514 | let bytes = &self.mmap[lbound..rbound]; 515 | let buffer = cast_slice(bytes); 516 | Ok(buffer) 517 | } 518 | } 519 | 520 | /// A reader for streaming binary sequence data from any source that implements Read 521 | /// 522 | /// Unlike `MmapReader` which requires the entire file to be accessible at once, 523 | /// `StreamReader` processes data as it becomes available, making it suitable for: 524 | /// - Processing data as it arrives over a network 525 | /// - Handling very large files that exceed available memory 526 | /// - Pipeline processing where data is flowing continuously 527 | /// 528 | /// The reader maintains an internal buffer and can handle partial record reconstruction 529 | /// across chunk boundaries. 
pub struct StreamReader<R: Read> {
    /// The source reader for binary sequence data
    reader: R,

    /// Binary sequence file header containing format information
    /// (`None` until `read_header` has succeeded)
    header: Option<BinseqHeader>,

    /// Configuration defining the layout of records in the file
    /// (populated alongside `header`)
    config: Option<RecordConfig>,

    /// Buffer for storing incoming data
    buffer: Vec<u8>,

    /// Current position in the buffer
    /// Invariant (maintained by `fill_buffer`): `buffer_pos <= buffer_len <= buffer.len()`
    buffer_pos: usize,

    /// Length of valid data in the buffer
    buffer_len: usize,
}

impl<R: Read> StreamReader<R> {
    /// Creates a new `StreamReader` with the default buffer size
    ///
    /// This constructor initializes a `StreamReader` that will read from the provided
    /// source, using an 8K default buffer size.
    ///
    /// # Arguments
    ///
    /// * `reader` - The source to read binary sequence data from
    ///
    /// # Returns
    ///
    /// A new `StreamReader` instance
    pub fn new(reader: R) -> Self {
        Self::with_capacity(reader, 8192)
    }

    /// Creates a new `StreamReader` with a specified buffer capacity
    ///
    /// This constructor initializes a `StreamReader` with a custom buffer size,
    /// which can be tuned based on the expected usage pattern.
    ///
    /// # Arguments
    ///
    /// * `reader` - The source to read binary sequence data from
    /// * `capacity` - The size of the internal buffer in bytes
    ///
    /// # Returns
    ///
    /// A new `StreamReader` instance with the specified buffer capacity
    pub fn with_capacity(reader: R, capacity: usize) -> Self {
        Self {
            reader,
            header: None,
            config: None,
            buffer: vec![0; capacity],
            buffer_pos: 0,
            buffer_len: 0,
        }
    }

    /// Reads and validates the header from the underlying reader
    ///
    /// This method reads the binary sequence file header and validates it.
595 | /// It caches the header internally for future use. 596 | /// 597 | /// # Returns 598 | /// 599 | /// * `Ok(&BinseqHeader)` - A reference to the validated header 600 | /// * `Err(Error)` - If reading or validating the header fails 601 | /// 602 | /// # Panics 603 | /// 604 | /// Panics if the header is missing when expected in the stream. 605 | /// 606 | /// # Errors 607 | /// 608 | /// Returns an error if: 609 | /// * There is an I/O error when reading from the source 610 | /// * The header data is invalid 611 | /// * End of stream is reached before the full header can be read 612 | pub fn read_header(&mut self) -> Result<&BinseqHeader> { 613 | if self.header.is_some() { 614 | return Ok(self 615 | .header 616 | .as_ref() 617 | .expect("Missing header when expected in stream")); 618 | } 619 | 620 | // Ensure we have enough data for the header 621 | while self.buffer_len - self.buffer_pos < SIZE_HEADER { 622 | self.fill_buffer()?; 623 | } 624 | 625 | // Parse header 626 | let header_slice = &self.buffer[self.buffer_pos..self.buffer_pos + SIZE_HEADER]; 627 | let header = BinseqHeader::from_buffer(header_slice)?; 628 | 629 | self.header = Some(header); 630 | self.config = Some(RecordConfig::from_header(&header)); 631 | self.buffer_pos += SIZE_HEADER; 632 | 633 | Ok(self.header.as_ref().unwrap()) 634 | } 635 | 636 | /// Fills the internal buffer with more data from the reader 637 | /// 638 | /// This method reads more data from the underlying reader, handling 639 | /// the case where some unprocessed data remains in the buffer. 
640 | /// 641 | /// # Returns 642 | /// 643 | /// * `Ok(())` - If the buffer was successfully filled with new data 644 | /// * `Err(Error)` - If reading from the source fails 645 | /// 646 | /// # Errors 647 | /// 648 | /// Returns an error if: 649 | /// * There is an I/O error when reading from the source 650 | /// * End of stream is reached (no more data available) 651 | fn fill_buffer(&mut self) -> Result<()> { 652 | // Move remaining data to beginning of buffer if needed 653 | if self.buffer_pos > 0 && self.buffer_pos < self.buffer_len { 654 | self.buffer.copy_within(self.buffer_pos..self.buffer_len, 0); 655 | self.buffer_len -= self.buffer_pos; 656 | self.buffer_pos = 0; 657 | } else if self.buffer_pos == self.buffer_len { 658 | self.buffer_len = 0; 659 | self.buffer_pos = 0; 660 | } 661 | 662 | // Read more data 663 | let bytes_read = self.reader.read(&mut self.buffer[self.buffer_len..])?; 664 | if bytes_read == 0 { 665 | return Err(ReadError::EndOfStream.into()); 666 | } 667 | 668 | self.buffer_len += bytes_read; 669 | Ok(()) 670 | } 671 | 672 | /// Retrieves the next record from the stream 673 | /// 674 | /// This method reads and processes the next complete record from the stream. 675 | /// It handles the case where a record spans multiple buffer fills. 676 | /// 677 | /// # Returns 678 | /// 679 | /// * `Ok(Some(RefRecord))` - The next record was successfully read 680 | /// * `Ok(None)` - End of stream was reached (no more records) 681 | /// * `Err(Error)` - If an error occurred during reading 682 | /// 683 | /// # Panics 684 | /// 685 | /// Panics if the configuration is missing when expected in the stream. 
686 | /// 687 | /// # Errors 688 | /// 689 | /// Returns an error if: 690 | /// * There is an I/O error when reading from the source 691 | /// * The header has not been read yet 692 | /// * The data format is invalid 693 | pub fn next_record(&mut self) -> Option>> { 694 | // Ensure header is read 695 | if self.header.is_none() { 696 | if let Some(e) = self.read_header().err() { 697 | return Some(Err(e)); 698 | } 699 | } 700 | 701 | let config = self 702 | .config 703 | .expect("Missing configuration when expected in stream"); 704 | let record_size = config.record_size_bytes(); 705 | 706 | // Ensure we have enough data for a complete record 707 | while self.buffer_len - self.buffer_pos < record_size { 708 | match self.fill_buffer() { 709 | Ok(()) => {} 710 | Err(Error::ReadError(ReadError::EndOfStream)) => { 711 | // End of stream reached - if we have any partial data, it's an error 712 | if self.buffer_len - self.buffer_pos > 0 { 713 | return Some(Err(ReadError::PartialRecord( 714 | self.buffer_len - self.buffer_pos, 715 | ) 716 | .into())); 717 | } 718 | return None; 719 | } 720 | Err(e) => return Some(Err(e)), 721 | } 722 | } 723 | 724 | // Process record 725 | let record_start = self.buffer_pos; 726 | self.buffer_pos += record_size; 727 | 728 | let record_bytes = &self.buffer[record_start..record_start + record_size]; 729 | let record_u64s = cast_slice(record_bytes); 730 | 731 | // Create record with incremental ID (based on read position) 732 | let id = (record_start - SIZE_HEADER) / record_size; 733 | Some(Ok(RefRecord::new(id as u64, record_u64s, config))) 734 | } 735 | 736 | /// Consumes the stream reader and returns the inner reader 737 | /// 738 | /// This method is useful when you need access to the underlying reader 739 | /// after processing is complete. 
740 | /// 741 | /// # Returns 742 | /// 743 | /// The inner reader that was used by this `StreamReader` 744 | pub fn into_inner(self) -> R { 745 | self.reader 746 | } 747 | } 748 | 749 | /// Default batch size for parallel processing 750 | /// 751 | /// This constant defines how many records each thread processes at a time 752 | /// during parallel processing operations. 753 | pub const BATCH_SIZE: usize = 1024; 754 | 755 | /// Parallel processing implementation for memory-mapped readers 756 | impl ParallelReader for MmapReader { 757 | /// Processes all records in parallel using multiple threads 758 | /// 759 | /// This method distributes the records across the specified number of threads 760 | /// and processes them using the provided processor. Each thread receives its 761 | /// own clone of the processor and processes a contiguous chunk of records. 762 | /// 763 | /// # Arguments 764 | /// 765 | /// * `processor` - The processor to use for handling records 766 | /// * `num_threads` - The number of threads to use for processing 767 | /// 768 | /// # Type Parameters 769 | /// 770 | /// * `P` - A type that implements `ParallelProcessor` and can be cloned 771 | /// 772 | /// # Returns 773 | /// 774 | /// * `Ok(())` - If all records were processed successfully 775 | /// * `Err(Error)` - If an error occurred during processing 776 | fn process_parallel( 777 | self, 778 | processor: P, 779 | num_threads: usize, 780 | ) -> Result<()> { 781 | let num_records = self.num_records(); 782 | self.process_parallel_range(processor, num_threads, 0..num_records) 783 | } 784 | 785 | /// Process records in parallel within a specified range 786 | /// 787 | /// This method allows parallel processing of a subset of records within the file, 788 | /// defined by a start and end index. The range is distributed across the specified 789 | /// number of threads. 
790 | /// 791 | /// # Arguments 792 | /// 793 | /// * `processor` - The processor to use for each record 794 | /// * `num_threads` - The number of threads to spawn 795 | /// * `range` - The range of record indices to process 796 | /// 797 | /// # Type Parameters 798 | /// 799 | /// * `P` - A type that implements `ParallelProcessor` and can be cloned 800 | /// 801 | /// # Returns 802 | /// 803 | /// * `Ok(())` - If all records were processed successfully 804 | /// * `Err(Error)` - If an error occurred during processing 805 | fn process_parallel_range( 806 | self, 807 | processor: P, 808 | num_threads: usize, 809 | range: Range, 810 | ) -> Result<()> { 811 | // Calculate the number of threads to use 812 | let num_threads = if num_threads == 0 { 813 | num_cpus::get() 814 | } else { 815 | num_threads.min(num_cpus::get()) 816 | }; 817 | 818 | // Validate range 819 | let num_records = self.num_records(); 820 | if range.start >= num_records || range.end > num_records || range.start >= range.end { 821 | return Ok(()); // Nothing to process or invalid range 822 | } 823 | 824 | // Calculate number of records for each thread within the range 825 | let range_size = range.end - range.start; 826 | let records_per_thread = range_size.div_ceil(num_threads); 827 | 828 | // Arc self 829 | let reader = Arc::new(self); 830 | 831 | // Build thread handles 832 | let mut handles = Vec::new(); 833 | for tid in 0..num_threads { 834 | let mut processor = processor.clone(); 835 | let reader = reader.clone(); 836 | processor.set_tid(tid); 837 | 838 | let handle = std::thread::spawn(move || -> Result<()> { 839 | let start_idx = range.start + tid * records_per_thread; 840 | let end_idx = (start_idx + records_per_thread).min(range.end); 841 | 842 | if start_idx >= end_idx { 843 | return Ok(()); // No records for this thread 844 | } 845 | 846 | // create a reusable buffer for translating record IDs 847 | let mut translater = itoa::Buffer::new(); 848 | 849 | // initialize a decoding buffer 850 | 
let mut dbuf = Vec::new(); 851 | 852 | // calculate the size of a record in the cast u64 slice 853 | let rsize_u64 = reader.config.record_size_bytes() / 8; 854 | 855 | // determine the required scalar size 856 | let scalar = reader.config.scalar(); 857 | 858 | // calculate the size of a record in the batch decoded buffer 859 | let mut dbuf_rsize = { (reader.config.schunk() + reader.config.xchunk()) * scalar }; 860 | if reader.config.flags { 861 | dbuf_rsize += scalar; 862 | } 863 | 864 | // iterate over the range of indices 865 | for range_start in (start_idx..end_idx).step_by(BATCH_SIZE) { 866 | let range_end = (range_start + BATCH_SIZE).min(end_idx); 867 | 868 | // clear the decoded buffer 869 | dbuf.clear(); 870 | 871 | // get the encoded buffer slice 872 | let ebuf = reader.get_buffer_slice(range_start..range_end)?; 873 | 874 | // decode the entire buffer at once (with flags and extra bases) 875 | reader 876 | .config 877 | .bitsize 878 | .decode(ebuf, ebuf.len() * scalar, &mut dbuf)?; 879 | 880 | // iterate over each index in the range 881 | for (inner_idx, idx) in (range_start..range_end).enumerate() { 882 | // translate the index 883 | let id_str = translater.format(idx); 884 | 885 | // create the index buffer 886 | let mut header_buf = [0; 20]; 887 | let header_len = id_str.len(); 888 | header_buf[..header_len].copy_from_slice(id_str.as_bytes()); 889 | 890 | // find the buffer starts 891 | let ebuf_start = inner_idx * rsize_u64; 892 | let dbuf_start = inner_idx * dbuf_rsize; 893 | 894 | // initialize the record 895 | let record = BatchRecord { 896 | buffer: &ebuf[ebuf_start..(ebuf_start + rsize_u64)], 897 | dbuf: &dbuf[dbuf_start..(dbuf_start + dbuf_rsize)], 898 | id: idx as u64, 899 | config: reader.config, 900 | header_buf, 901 | header_len, 902 | }; 903 | 904 | // process the record 905 | processor.process_record(record)?; 906 | } 907 | 908 | // process the batch 909 | processor.on_batch_complete()?; 910 | } 911 | 912 | Ok(()) 913 | }); 914 | 915 | 
handles.push(handle); 916 | } 917 | 918 | for handle in handles { 919 | handle 920 | .join() 921 | .expect("Error joining handle (1)") 922 | .expect("Error joining handle (2)"); 923 | } 924 | 925 | Ok(()) 926 | } 927 | } 928 | --------------------------------------------------------------------------------