├── .cargo └── config.toml ├── .gitignore ├── Cargo.toml ├── README.md └── src ├── align.rs ├── cli ├── command.rs ├── idxopts.rs ├── ioopts.rs ├── mapopts.rs ├── mod.rs ├── preset.rs └── runopts.rs ├── index.rs ├── io.rs ├── main.rs └── stats.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | /data 4 | Cargo.lock 5 | profile.json 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mmr" 3 | version = "0.1.5" 4 | edition = "2021" 5 | license = "MIT" 6 | authors = ["Noam Teyssier "] 7 | description = "Minimap2 CLI written in rust using bindings with BINSEQ and VBINSEQ support." 
8 | repository = "https://github.com/arcinstitute/mmr" 9 | categories = ["command-line-utilities", "science::bioinformatics"] 10 | keywords = ["long-read", "minimap2", "binseq", "vbinseq", "alignment"] 11 | 12 | [features] 13 | default = [] 14 | sse2only = ["minimap2/sse2only"] 15 | simde = ["minimap2/simde"] 16 | 17 | [dependencies] 18 | anyhow = "1.0.95" 19 | binseq = "0.6.2" 20 | clap = { version = "4.5.26", features = ["derive"] } 21 | csv = "1.3.1" 22 | indicatif = "0.17.11" 23 | minimap2 = { version = "0.1.23+minimap2.2.28", default-features = false } 24 | niffler = "3.0.0" 25 | num_cpus = "1.16.0" 26 | paraseq = "0.1.2" 27 | parking_lot = "0.12.3" 28 | serde_json = "1.0.138" 29 | serde = { version = "1.0.217", features = ["derive", "rc"] } 30 | 31 | [profile.release] 32 | lto = true 33 | codegen-units = 1 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mmr 2 | 3 | [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE.md) 4 | [![Crates.io](https://img.shields.io/crates/d/mmr?color=orange&label=crates.io)](https://crates.io/crates/mmr) 5 | 6 | A [minimap2](https://github.com/lh3/minimap2)-based aligner with [BINSEQ](https://github.com/arcinstitute/binseq) file format support (`*.bq` and `*.vbq`). 7 | For converting FASTQ to BINSEQ formats see [bqtools](https://github.com/arcinstitute/bqtools). 8 | 9 | This uses the [minimap2-rs](https://github.com/jguhlin/minimap2-rs) library which facilitates raw FFI bindings to the `minimap2` C library. 10 | 11 | ## Installation 12 | 13 | `mmr` is written in rust and deployed with [`cargo`](https://rustup.rs/). 
```bash
# install binary from cargo
cargo install mmr

# validate installation
mmr --version
```

## Usage

`mmr` follows the same (or similar) CLI as the original [`minimap2`](https://github.com/lh3/minimap2) binary.
The file to index is given first, followed by the query file.

```bash
# map a *.bq file
mmr -x map-pb <reference.fa> <query.bq>

# map a *.vbq file
mmr -x map-pb <reference.fa> <query.vbq>

# map a *.fq file (supports compressed FASTQ as well)
mmr -x map-pb <reference.fa> <query.fq.gz>
```
Arc::new(Mutex::new(())), 55 | local_n_processed: 0, 56 | global_n_processed: Arc::new(Mutex::new(0)), 57 | output_path, 58 | start_time: Instant::now(), 59 | tid: 0, 60 | pbar: Arc::new(Mutex::new(pbar)), 61 | }) 62 | } 63 | pub fn initialize_output(output_path: Option<&String>) -> Result<()> { 64 | if let Some(path) = output_path { 65 | File::create(path)?; 66 | Ok(()) 67 | } else { 68 | Ok(()) 69 | } 70 | } 71 | pub fn initialize_pbar() -> ProgressBar { 72 | let pbar = ProgressBar::new_spinner(); 73 | pbar.set_style( 74 | ProgressStyle::default_spinner() 75 | .template("{spinner:.cyan} [{elapsed_precise}] {msg}") 76 | .unwrap(), 77 | ); 78 | pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(10)); 79 | pbar 80 | } 81 | 82 | fn decode_record(&mut self, record: B) -> Result<(), binseq::Error> { 83 | self.dbuf.clear(); 84 | record.decode_s(&mut self.dbuf)?; 85 | Ok(()) 86 | } 87 | 88 | fn reopen_handle(&self) -> Result> { 89 | if let Some(path) = &self.output_path { 90 | let file = OpenOptions::new().append(true).open(path)?; 91 | let buffer = BufWriter::new(file); 92 | Ok(Box::new(buffer)) 93 | } else { 94 | let file = std::io::stdout(); 95 | let buffer = BufWriter::new(file); 96 | Ok(Box::new(buffer)) 97 | } 98 | } 99 | fn write_local(&mut self, mapping: Vec) -> Result<()> { 100 | let mut wtr = csv::WriterBuilder::new() 101 | .has_headers(false) 102 | .delimiter(b'\t') 103 | .from_writer(&mut self.wbuf); 104 | 105 | for alignment in mapping { 106 | let mapping: MappingNutype = alignment.into(); 107 | wtr.serialize(mapping)?; 108 | } 109 | wtr.flush()?; 110 | Ok(()) 111 | } 112 | fn write_record_set(&mut self) -> Result<()> { 113 | // Open a thread-safe stdout writer 114 | // 115 | // Drops lock when it goes out of scope 116 | { 117 | let _lock = self.io_lock.lock(); 118 | let mut handle = self.reopen_handle()?; 119 | handle.write_all(&self.wbuf)?; 120 | handle.flush()?; 121 | } 122 | 123 | // Clear the write buffer 124 | self.wbuf.clear(); 125 | 126 | Ok(()) 
127 | } 128 | fn calculate_throughput(&self) -> f64 { 129 | let elapsed = self.start_time.elapsed().as_secs_f64(); 130 | *self.global_n_processed.lock() as f64 / elapsed 131 | } 132 | fn update_statistics(&mut self) { 133 | *self.global_n_processed.lock() += self.local_n_processed; 134 | self.local_n_processed = 0; 135 | } 136 | fn update_pbar(&self) { 137 | // only update progress bar on the main thread 138 | if self.tid == 0 { 139 | let pbar = self.pbar.lock(); 140 | let elapsed = self.start_time.elapsed().as_secs_f64(); 141 | let throughput = self.calculate_throughput(); 142 | let msg = format!("Elapsed: {elapsed:.2}s, Throughput: {throughput:.2} reads/s",); 143 | pbar.set_message(msg); 144 | } 145 | } 146 | pub fn finish_pbar(&self) { 147 | let pbar = self.pbar.lock(); 148 | let elapsed = self.start_time.elapsed().as_secs_f64(); 149 | let throughput = self.calculate_throughput(); 150 | let msg = format!("Elapsed: {elapsed:.2}s, Throughput: {throughput:.2} reads/s",); 151 | pbar.finish_with_message(msg); 152 | } 153 | pub fn start_time(&self) -> Instant { 154 | self.start_time 155 | } 156 | pub fn num_records(&self) -> usize { 157 | *self.global_n_processed.lock() 158 | } 159 | } 160 | impl binseq::ParallelProcessor for ParallelAlignment { 161 | fn process_record(&mut self, record: B) -> binseq::Result<()> { 162 | let query_name = format!("bq.{}", record.index()); 163 | self.decode_record(record)?; 164 | let mapping = match self.aligner.map( 165 | &self.dbuf, 166 | false, 167 | false, 168 | None, 169 | None, 170 | Some(query_name.as_bytes()), 171 | ) { 172 | Ok(mapping) => mapping, 173 | Err(err) => return Err(anyhow!("Error mapping record: {}", err).into()), 174 | }; 175 | self.local_n_processed += 1; 176 | self.write_local(mapping)?; 177 | Ok(()) 178 | } 179 | 180 | fn on_batch_complete(&mut self) -> Result<(), binseq::Error> { 181 | self.write_record_set()?; 182 | self.update_statistics(); 183 | self.update_pbar(); 184 | Ok(()) 185 | } 186 | 187 | fn 
set_tid(&mut self, tid: usize) { 188 | self.tid = tid; 189 | } 190 | } 191 | impl paraseq::parallel::ParallelProcessor for ParallelAlignment { 192 | fn process_record(&mut self, record: Rf) -> paraseq::parallel::Result<()> { 193 | let mapping = 194 | match self 195 | .aligner 196 | .map(record.seq(), false, false, None, None, Some(record.id())) 197 | { 198 | Ok(mapping) => mapping, 199 | Err(err) => { 200 | return Err(ProcessError::from(anyhow!("Error mapping record: {}", err))); 201 | } 202 | }; 203 | self.local_n_processed += 1; 204 | self.write_local(mapping)?; 205 | Ok(()) 206 | } 207 | 208 | fn on_batch_complete(&mut self) -> paraseq::parallel::Result<()> { 209 | self.write_record_set()?; 210 | self.update_statistics(); 211 | self.update_pbar(); 212 | Ok(()) 213 | } 214 | 215 | fn set_thread_id(&mut self, thread_id: usize) { 216 | self.tid = thread_id; 217 | } 218 | } 219 | 220 | #[derive(Debug, Clone, Serialize)] 221 | pub struct MappingNutype { 222 | pub query_name: Arc, 223 | pub query_len: Option, 224 | pub query_start: i32, 225 | pub query_end: i32, 226 | pub strand: char, 227 | pub target_name: Option>, 228 | pub target_len: i32, 229 | pub target_start: i32, 230 | pub target_end: i32, 231 | pub match_len: i32, 232 | pub block_len: i32, 233 | pub mapq: u32, 234 | // pub is_primary: bool, 235 | // pub is_supplementary: bool, 236 | // pub alignment: &'static str, 237 | } 238 | impl From for MappingNutype { 239 | fn from(mapping: Mapping) -> Self { 240 | MappingNutype { 241 | query_name: mapping 242 | .query_name 243 | .unwrap_or_else(|| Arc::new("*".to_string())), 244 | query_len: mapping.query_len, 245 | query_start: mapping.query_start, 246 | query_end: mapping.query_end, 247 | strand: match mapping.strand { 248 | Strand::Forward => '+', 249 | Strand::Reverse => '-', 250 | }, 251 | target_name: mapping.target_name, 252 | target_len: mapping.target_len, 253 | target_start: mapping.target_start, 254 | target_end: mapping.target_end, 255 | match_len: 
mapping.match_len, 256 | block_len: mapping.block_len, 257 | mapq: mapping.mapq, 258 | } 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/cli/command.rs: -------------------------------------------------------------------------------- 1 | use clap::{ 2 | builder::{ 3 | styling::{AnsiColor, Effects}, 4 | Styles, 5 | }, 6 | Parser, 7 | }; 8 | 9 | // Configures Clap v3-style help menu colors 10 | const STYLES: Styles = Styles::styled() 11 | .header(AnsiColor::Green.on_default().effects(Effects::BOLD)) 12 | .usage(AnsiColor::Green.on_default().effects(Effects::BOLD)) 13 | .literal(AnsiColor::Cyan.on_default().effects(Effects::BOLD)) 14 | .placeholder(AnsiColor::Yellow.on_default()); 15 | 16 | use super::{IndexOptions, IoOptions, MappingOptions, RunOptions}; 17 | 18 | #[derive(Parser)] 19 | #[command(styles = STYLES, version)] 20 | pub struct Cli { 21 | #[clap(flatten)] 22 | pub io_options: IoOptions, 23 | 24 | #[clap(flatten)] 25 | pub run_options: RunOptions, 26 | 27 | #[clap(flatten)] 28 | pub index_options: IndexOptions, 29 | 30 | #[clap(flatten)] 31 | pub mapping_options: MappingOptions, 32 | } 33 | -------------------------------------------------------------------------------- /src/cli/idxopts.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use super::PresetWrapper; 4 | 5 | #[derive(Parser, Clone, Copy)] 6 | #[clap(next_help_heading = "INDEX OPTIONS")] 7 | pub struct IndexOptions { 8 | #[clap(short, long, help = "k-mer size (no larger than 28) [default: 15]")] 9 | pub kmer_size: Option, 10 | 11 | #[clap( 12 | short, 13 | long, 14 | default_value = "10", 15 | help = "minimizer window size [default: 10]" 16 | )] 17 | pub window_size: Option, 18 | 19 | /// Preset to use when aligning reads 20 | #[clap(short = 'x', long)] 21 | pub preset: PresetWrapper, 22 | } 23 | 
-------------------------------------------------------------------------------- /src/cli/ioopts.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser)] 4 | #[clap(next_help_heading = "INPUT FILE OPTIONS")] 5 | pub struct IoOptions { 6 | #[clap(help = "Path to the file to index")] 7 | pub index_path: String, 8 | #[clap(help = "Path to the binseq file to query")] 9 | pub query_path: String, 10 | #[clap(short, long, help = "Path to the output file [default: stdout]")] 11 | pub output_path: Option, 12 | } 13 | -------------------------------------------------------------------------------- /src/cli/mapopts.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, ValueEnum}; 2 | use minimap2::MapOpt; 3 | 4 | #[derive(Parser, Clone, Copy)] 5 | #[clap(next_help_heading = "MAPPING OPTIONS")] 6 | pub struct MappingOptions { 7 | #[clap( 8 | short = 'f', 9 | long, 10 | help = "Filter out top FLOAT fraction of repetitive minimizers [default = 0.0002]" 11 | )] 12 | pub mask_level: Option, 13 | 14 | #[clap( 15 | short = 'g', 16 | long, 17 | help = "Stop chain elongation if there are no minimizers in INT-bp [default = 10000]" 18 | )] 19 | pub max_gap: Option, 20 | 21 | #[clap( 22 | short = 'G', 23 | long, 24 | help = "Max intron length (effective with -xsplice) [default = 200000]" 25 | )] 26 | pub max_gap_ref: Option, 27 | 28 | #[clap( 29 | short = 'F', 30 | long, 31 | help = "Max fragment length (effective with -xsr or in the fragment mode) [default = 800]" 32 | )] 33 | pub max_frag_len: Option, 34 | 35 | #[clap(short = 'r', long, value_parser = parse_integer_tuple, help = "Chaining/alignment bandwidth and long-join bandwidth [default = 500,20000]")] 36 | pub bandwidth: Option<(i32, i32)>, 37 | 38 | #[clap( 39 | short = 'n', 40 | long, 41 | help = "Minimal number of minimizers on a chain [default = 3]" 42 | )] 43 | pub min_cnt: Option, 44 | 45 | 
#[clap( 46 | short = 'm', 47 | long, 48 | help = "Minimal chaining score (matching bases minus log gap penalty) [default = 40]" 49 | )] 50 | pub min_chain_score: Option, 51 | 52 | #[clap( 53 | short = 'p', 54 | long, 55 | help = "Min secondary-to-primary score ratio [default = 0.8]" 56 | )] 57 | pub pri_ratio: Option, 58 | 59 | #[clap( 60 | short = 'N', 61 | long, 62 | help = "Retain at most INT secondary alignments [default = 5]" 63 | )] 64 | pub best_n: Option, 65 | 66 | // Alignment scoring parameters 67 | #[clap(short = 'A', long, help = "Matching score [default = 2]")] 68 | pub a: Option, 69 | 70 | #[clap( 71 | short = 'B', 72 | long, 73 | help = "Mismatch penalty (larger value for lower divergence) [default = 4]" 74 | )] 75 | pub b: Option, 76 | 77 | #[clap(short = 'O', long, value_parser = parse_integer_tuple, help = "Gap open penalties. Format: INT,INT [default = 4,24]")] 78 | pub gap_open: Option<(i32, i32)>, 79 | 80 | #[clap(short = 'E', long, value_parser = parse_integer_tuple, help = "Gap extension penalties. 
/// Parses a command-line value of the form `INT,INT` into a pair of `i32`s.
///
/// The input must consist of exactly two comma-separated fields; otherwise
/// the error is `"Expected format: INT,INT"`. A field that is not a valid
/// `i32` yields the integer parser's own error message.
fn parse_integer_tuple(s: &str) -> Result<(i32, i32), String> {
    let mut fields = s.split(',');
    let (Some(first), Some(second), None) = (fields.next(), fields.next(), fields.next()) else {
        return Err("Expected format: INT,INT".to_string());
    };
    let first = first.parse::<i32>().map_err(|e| e.to_string())?;
    let second = second.parse::<i32>().map_err(|e| e.to_string())?;
    Ok((first, second))
}
-------------------------------------------------------------------------------- /src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod command; 2 | mod idxopts; 3 | mod ioopts; 4 | mod mapopts; 5 | mod preset; 6 | mod runopts; 7 | 8 | pub use command::Cli; 9 | pub use idxopts::IndexOptions; 10 | pub use ioopts::IoOptions; 11 | pub use mapopts::MappingOptions; 12 | pub use preset::PresetWrapper; 13 | pub use runopts::RunOptions; 14 | -------------------------------------------------------------------------------- /src/cli/preset.rs: -------------------------------------------------------------------------------- 1 | use clap::ValueEnum; 2 | use minimap2::Preset; 3 | 4 | #[derive(Debug, Clone, Copy, PartialEq, ValueEnum)] 5 | pub enum PresetWrapper { 6 | LrHqae, 7 | LrHq, 8 | Splice, 9 | SpliceHq, 10 | Asm, 11 | Asm5, 12 | Asm10, 13 | Asm20, 14 | Sr, 15 | MapPb, 16 | MapHifi, 17 | MapOnt, 18 | AvaPb, 19 | AvaOnt, 20 | Short, 21 | Map10k, 22 | Cdna, 23 | } 24 | impl From for Preset { 25 | fn from(value: PresetWrapper) -> Self { 26 | match value { 27 | PresetWrapper::LrHqae => Preset::LrHqae, 28 | PresetWrapper::LrHq => Preset::LrHq, 29 | PresetWrapper::Splice => Preset::Splice, 30 | PresetWrapper::SpliceHq => Preset::SpliceHq, 31 | PresetWrapper::Asm => Preset::Asm, 32 | PresetWrapper::Asm5 => Preset::Asm5, 33 | PresetWrapper::Asm10 => Preset::Asm10, 34 | PresetWrapper::Asm20 => Preset::Asm20, 35 | PresetWrapper::Sr => Preset::Sr, 36 | PresetWrapper::MapPb => Preset::MapPb, 37 | PresetWrapper::MapHifi => Preset::MapHifi, 38 | PresetWrapper::MapOnt => Preset::MapOnt, 39 | PresetWrapper::AvaPb => Preset::AvaPb, 40 | PresetWrapper::AvaOnt => Preset::AvaOnt, 41 | PresetWrapper::Short => Preset::Short, 42 | PresetWrapper::Map10k => Preset::Map10k, 43 | PresetWrapper::Cdna => Preset::Cdna, 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/cli/runopts.rs: 
-------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Clone)] 4 | #[clap(next_help_heading = "RUN OPTIONS")] 5 | pub struct RunOptions { 6 | #[clap(short = 'T', long, default_value = "1")] 7 | n_threads: usize, 8 | #[clap(short = 'L', long)] 9 | pub log_path: Option, 10 | /// Write the option configuration to stderr 11 | #[clap(long)] 12 | pub show_options: bool, 13 | } 14 | impl RunOptions { 15 | pub fn n_threads(&self) -> usize { 16 | if self.n_threads == 0 { 17 | num_cpus::get() 18 | } else { 19 | self.n_threads.min(num_cpus::get()) 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/index.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use anyhow::{bail, Result}; 4 | use minimap2::{Aligner, Built, IdxOpt, MapOpt}; 5 | 6 | use crate::cli::{IndexOptions, MappingOptions}; 7 | 8 | pub fn build_index( 9 | path: &str, 10 | map_options: MappingOptions, 11 | idx_options: IndexOptions, 12 | n_threads: usize, 13 | log_options: bool, 14 | ) -> Result> { 15 | eprintln!("Building index..."); 16 | let start = std::time::Instant::now(); 17 | let aligner = Aligner::builder() 18 | .preset(idx_options.preset.into()) 19 | .with_index_threads(n_threads) 20 | .with_index(path, None); 21 | let duration = start.elapsed(); 22 | eprintln!("Index built in {:?}", duration); 23 | 24 | match aligner { 25 | Ok(mut aligner) => { 26 | update_map_options(&mut aligner, map_options); 27 | update_index_options(&mut aligner, idx_options); 28 | 29 | if log_options { 30 | pprint_index(&mut std::io::stderr(), aligner.idxopt)?; 31 | pprint_map(&mut std::io::stderr(), aligner.mapopt)?; 32 | } 33 | 34 | Ok(aligner) 35 | } 36 | Err(err) => bail!("Error building index: {}", err), 37 | } 38 | } 39 | 40 | fn update_map_options(aligner: &mut Aligner, map_options: MappingOptions) { 41 | if let Some(mask_level) 
= map_options.mask_level { 42 | aligner.mapopt.mask_level = mask_level; 43 | } 44 | if let Some(max_gap) = map_options.max_gap { 45 | aligner.mapopt.max_gap = max_gap; 46 | } 47 | if let Some(max_gap_ref) = map_options.max_gap_ref { 48 | aligner.mapopt.max_gap_ref = max_gap_ref; 49 | } 50 | if let Some(max_frag_len) = map_options.max_frag_len { 51 | aligner.mapopt.max_frag_len = max_frag_len; 52 | } 53 | if let Some(bandwidth) = map_options.bandwidth { 54 | aligner.mapopt.bw = bandwidth.0; 55 | aligner.mapopt.bw_long = bandwidth.1; 56 | } 57 | if let Some(min_cnt) = map_options.min_cnt { 58 | aligner.mapopt.min_cnt = min_cnt; 59 | } 60 | if let Some(min_chain_score) = map_options.min_chain_score { 61 | aligner.mapopt.min_chain_score = min_chain_score; 62 | } 63 | if let Some(pri_ratio) = map_options.pri_ratio { 64 | aligner.mapopt.pri_ratio = pri_ratio; 65 | } 66 | if let Some(best_n) = map_options.best_n { 67 | aligner.mapopt.best_n = best_n; 68 | } 69 | if let Some(a) = map_options.a { 70 | aligner.mapopt.a = a; 71 | } 72 | if let Some(b) = map_options.b { 73 | aligner.mapopt.b = b; 74 | } 75 | if let Some(gap_open) = map_options.gap_open { 76 | aligner.mapopt.q = gap_open.0; 77 | aligner.mapopt.q2 = gap_open.1; 78 | } 79 | if let Some(gap_ext) = map_options.gap_ext { 80 | aligner.mapopt.e = gap_ext.0; 81 | aligner.mapopt.e2 = gap_ext.1; 82 | } 83 | if let Some(zdrop) = map_options.zdrop { 84 | aligner.mapopt.zdrop = zdrop.0; 85 | aligner.mapopt.zdrop_inv = zdrop.1; 86 | } 87 | if let Some(splice_mode) = map_options.splice_mode { 88 | splice_mode.update_mapopt(&mut aligner.mapopt); 89 | } 90 | } 91 | 92 | fn update_index_options(aligner: &mut Aligner, idx_options: IndexOptions) { 93 | if let Some(k) = idx_options.kmer_size { 94 | aligner.idxopt.k = k; 95 | } 96 | if let Some(w) = idx_options.window_size { 97 | aligner.idxopt.w = w; 98 | } 99 | } 100 | 101 | fn pprint_index(writer: &mut W, opt: IdxOpt) -> Result<()> { 102 | writeln!(writer, "== Index Options 
==")?; 103 | writeln!(writer, " k: {}", opt.k)?; 104 | writeln!(writer, " w: {}", opt.w)?; 105 | writeln!(writer, " flag: {}", opt.flag)?; 106 | writeln!(writer, " bucket_bits: {}", opt.bucket_bits)?; 107 | writeln!(writer, " mini_batch_size: {}", opt.mini_batch_size)?; 108 | writeln!(writer, " batch_size: {}", opt.batch_size)?; 109 | Ok(()) 110 | } 111 | 112 | fn pprint_map(writer: &mut W, opt: MapOpt) -> Result<()> { 113 | writeln!(writer, "== Mapping Options ==")?; 114 | writeln!(writer, " flag: {}", opt.flag)?; 115 | writeln!(writer, " seed: {}", opt.seed)?; 116 | writeln!(writer, " sdust_thres: {}", opt.sdust_thres)?; 117 | writeln!(writer, " max_qlen: {}", opt.max_qlen)?; 118 | writeln!(writer, " bw: {}", opt.bw)?; 119 | writeln!(writer, " bw_long: {}", opt.bw_long)?; 120 | writeln!(writer, " max_gap: {}", opt.max_gap)?; 121 | writeln!(writer, " max_gap_ref: {}", opt.max_gap_ref)?; 122 | writeln!(writer, " max_frag_len: {}", opt.max_frag_len)?; 123 | writeln!(writer, " max_chain_skip: {}", opt.max_chain_skip)?; 124 | writeln!(writer, " max_chain_iter: {}", opt.max_chain_iter)?; 125 | writeln!(writer, " min_cnt: {}", opt.min_cnt)?; 126 | writeln!(writer, " min_chain_score: {}", opt.min_chain_score)?; 127 | writeln!(writer, " chain_gap_scale: {}", opt.chain_gap_scale)?; 128 | writeln!(writer, " chain_skip_scale: {}", opt.chain_skip_scale)?; 129 | writeln!(writer, " rmq_size_cap: {}", opt.rmq_size_cap)?; 130 | writeln!(writer, " rmq_inner_dist: {}", opt.rmq_inner_dist)?; 131 | writeln!(writer, " rmq_rescue_size: {}", opt.rmq_rescue_size)?; 132 | writeln!(writer, " rmq_rescue_ratio: {}", opt.rmq_rescue_ratio)?; 133 | writeln!(writer, " mask_level: {}", opt.mask_level)?; 134 | writeln!(writer, " mask_len: {}", opt.mask_len)?; 135 | writeln!(writer, " pri_ratio: {}", opt.pri_ratio)?; 136 | writeln!(writer, " best_n: {}", opt.best_n)?; 137 | writeln!(writer, " alt_drop: {}", opt.alt_drop)?; 138 | writeln!(writer, " a: {}", opt.a)?; 139 | writeln!(writer, " b: {}", 
opt.b)?; 140 | writeln!(writer, " q: {}", opt.q)?; 141 | writeln!(writer, " e: {}", opt.e)?; 142 | writeln!(writer, " q2: {}", opt.q2)?; 143 | writeln!(writer, " e2: {}", opt.e2)?; 144 | writeln!(writer, " transition: {}", opt.transition)?; 145 | writeln!(writer, " sc_ambi: {}", opt.sc_ambi)?; 146 | writeln!(writer, " noncan: {}", opt.noncan)?; 147 | writeln!(writer, " junc_bonus: {}", opt.junc_bonus)?; 148 | writeln!(writer, " zdrop: {}", opt.zdrop)?; 149 | writeln!(writer, " zdrop_inv: {}", opt.zdrop_inv)?; 150 | writeln!(writer, " end_bonus: {}", opt.end_bonus)?; 151 | writeln!(writer, " min_dp_max: {}", opt.min_dp_max)?; 152 | writeln!(writer, " min_ksw_len: {}", opt.min_ksw_len)?; 153 | writeln!(writer, " anchor_ext_len: {}", opt.anchor_ext_len)?; 154 | writeln!(writer, " anchor_ext_shift: {}", opt.anchor_ext_shift)?; 155 | writeln!(writer, " max_clip_ratio: {}", opt.max_clip_ratio)?; 156 | writeln!(writer, " rank_min_len: {}", opt.rank_min_len)?; 157 | writeln!(writer, " rank_frac: {}", opt.rank_frac)?; 158 | writeln!(writer, " pe_ori: {}", opt.pe_ori)?; 159 | writeln!(writer, " pe_bonus: {}", opt.pe_bonus)?; 160 | writeln!(writer, " mid_occ_frac: {}", opt.mid_occ_frac)?; 161 | writeln!(writer, " q_occ_frac: {}", opt.q_occ_frac)?; 162 | writeln!(writer, " min_mid_occ: {}", opt.min_mid_occ)?; 163 | writeln!(writer, " max_mid_occ: {}", opt.max_mid_occ)?; 164 | writeln!(writer, " mid_occ: {}", opt.mid_occ)?; 165 | writeln!(writer, " max_occ: {}", opt.max_occ)?; 166 | writeln!(writer, " max_max_occ: {}", opt.max_max_occ)?; 167 | writeln!(writer, " occ_dist: {}", opt.occ_dist)?; 168 | writeln!(writer, " mini_batch_size: {}", opt.mini_batch_size)?; 169 | writeln!(writer, " max_sw_mat: {}", opt.max_sw_mat)?; 170 | writeln!(writer, " cap_kalloc: {}", opt.cap_kalloc)?; 171 | 172 | // For the pointer field, we need to handle it carefully 173 | // This is a C-style string pointer, so we should print it safely 174 | if !opt.split_prefix.is_null() { 175 | // Note: This is 
unsafe and assumes the string is valid UTF-8 176 | // In a real implementation, you might want more robust handling 177 | unsafe { 178 | let c_str = std::ffi::CStr::from_ptr(opt.split_prefix); 179 | if let Ok(str_slice) = c_str.to_str() { 180 | writeln!(writer, " split_prefix: {}", str_slice)?; 181 | } else { 182 | writeln!(writer, " split_prefix: ")?; 183 | } 184 | } 185 | } else { 186 | writeln!(writer, " split_prefix: ")?; 187 | } 188 | 189 | Ok(()) 190 | } 191 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{Read, Write}; 3 | 4 | use anyhow::Result; 5 | 6 | pub fn transparent_reader(input: &str) -> Result> { 7 | let (stream, _comp) = niffler::send::from_path(input)?; 8 | Ok(stream) 9 | } 10 | 11 | pub fn transparent_writer(output: Option<&str>) -> Result> { 12 | if let Some(path) = output { 13 | let stream = File::create(path)?; 14 | Ok(Box::new(stream)) 15 | } else { 16 | Ok(Box::new(std::io::stderr())) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use std::time::Instant; 3 | 4 | use anyhow::Result; 5 | use binseq::{BinseqReader, ParallelReader}; 6 | use clap::Parser; 7 | 8 | mod align; 9 | mod cli; 10 | mod index; 11 | mod io; 12 | mod stats; 13 | 14 | use align::ParallelAlignment; 15 | use cli::Cli; 16 | use index::build_index; 17 | use io::{transparent_reader, transparent_writer}; 18 | use paraseq::{fastq, parallel::ParallelReader as FastqParallelReader}; 19 | use stats::Runtime; 20 | 21 | fn report_runtime( 22 | program_start: Instant, 23 | map_start: Instant, 24 | num_records: usize, 25 | path: Option<&str>, 26 | ) -> Result<()> { 27 | let stats = Runtime::new(program_start, map_start, num_records); 28 | let mut wtr 
= transparent_writer(path)?; 29 | serde_json::to_writer_pretty(&mut wtr, &stats)?; 30 | wtr.flush()?; 31 | Ok(()) 32 | } 33 | 34 | fn process_fastq( 35 | aligner: ParallelAlignment, 36 | query_path: &str, 37 | n_threads: usize, 38 | start_time: Instant, 39 | log_path: Option<&str>, 40 | ) -> Result<()> { 41 | let stream = transparent_reader(query_path)?; 42 | let reader = fastq::Reader::new(stream); 43 | reader.process_parallel(aligner.clone(), n_threads)?; 44 | aligner.finish_pbar(); 45 | report_runtime( 46 | start_time, 47 | aligner.start_time(), 48 | aligner.num_records(), 49 | log_path, 50 | ) 51 | } 52 | 53 | fn process_binseq( 54 | aligner: ParallelAlignment, 55 | query_path: &str, 56 | n_threads: usize, 57 | start_time: Instant, 58 | log_path: Option<&str>, 59 | ) -> Result<()> { 60 | let reader = BinseqReader::new(query_path)?; 61 | reader.process_parallel(aligner.clone(), n_threads)?; 62 | aligner.finish_pbar(); 63 | report_runtime( 64 | start_time, 65 | aligner.start_time(), 66 | aligner.num_records(), 67 | log_path, 68 | ) 69 | } 70 | 71 | fn main() -> Result<()> { 72 | let args = Cli::parse(); 73 | 74 | let start_time = Instant::now(); 75 | let index = build_index( 76 | &args.io_options.index_path, 77 | args.mapping_options, 78 | args.index_options, 79 | args.run_options.n_threads(), 80 | args.run_options.show_options, 81 | )?; 82 | let aligner = ParallelAlignment::new(index, args.io_options.output_path)?; 83 | 84 | let query_path = &args.io_options.query_path; 85 | if query_path.ends_with(".bq") || query_path.ends_with(".vbq") { 86 | process_binseq( 87 | aligner, 88 | query_path, 89 | args.run_options.n_threads(), 90 | start_time, 91 | args.run_options.log_path.as_deref(), 92 | ) 93 | } else { 94 | process_fastq( 95 | aligner, 96 | query_path, 97 | args.run_options.n_threads(), 98 | start_time, 99 | args.run_options.log_path.as_deref(), 100 | ) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- 
/src/stats.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use serde::Serialize; 4 | 5 | #[derive(Serialize)] 6 | pub struct Runtime { 7 | #[serde(rename = "elapsed_total_sec")] 8 | pub e_total: f64, 9 | #[serde(rename = "elapsed_init_sec")] 10 | pub e_init: f64, 11 | #[serde(rename = "elapsed_map_sec")] 12 | pub e_map: f64, 13 | #[serde(rename = "total_records")] 14 | pub n_records: usize, 15 | #[serde(rename = "throughput_records_per_sec")] 16 | pub throughput: f64, 17 | } 18 | impl Runtime { 19 | pub fn new(t_init: Instant, t_map: Instant, n_records: usize) -> Self { 20 | let e_total = t_init.elapsed().as_secs_f64(); 21 | let e_init = (t_map - t_init).as_secs_f64(); 22 | let e_map = t_map.elapsed().as_secs_f64(); 23 | let throughput = n_records as f64 / e_map; 24 | Self { 25 | e_total, 26 | e_init, 27 | e_map, 28 | n_records, 29 | throughput, 30 | } 31 | } 32 | } 33 | --------------------------------------------------------------------------------