├── .gitignore ├── .cargo └── config.toml ├── src ├── cli │ ├── mod.rs │ ├── ioopts.rs │ ├── idxopts.rs │ ├── runopts.rs │ ├── command.rs │ ├── preset.rs │ └── mapopts.rs ├── io.rs ├── stats.rs ├── main.rs ├── index.rs └── align.rs ├── Cargo.toml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | /data 4 | Cargo.lock 5 | profile.json 6 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod command; 2 | mod idxopts; 3 | mod ioopts; 4 | mod mapopts; 5 | mod preset; 6 | mod runopts; 7 | 8 | pub use command::Cli; 9 | pub use idxopts::IndexOptions; 10 | pub use ioopts::IoOptions; 11 | pub use mapopts::MappingOptions; 12 | pub use preset::PresetWrapper; 13 | pub use runopts::RunOptions; 14 | -------------------------------------------------------------------------------- /src/cli/ioopts.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser)] 4 | #[clap(next_help_heading = "INPUT FILE OPTIONS")] 5 | pub struct IoOptions { 6 | #[clap(help = "Path to the file to index")] 7 | pub index_path: String, 8 | #[clap(help = "Path to the binseq file to query")] 9 | pub query_path: String, 10 | #[clap(short, long, help = "Path to the output file [default: stdout]")] 11 | pub output_path: Option, 12 | } 13 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{Read, Write}; 3 | 4 | use anyhow::Result; 5 | 
6 | pub fn transparent_reader(input: &str) -> Result> { 7 | let (stream, _comp) = niffler::send::from_path(input)?; 8 | Ok(stream) 9 | } 10 | 11 | pub fn transparent_writer(output: Option<&str>) -> Result> { 12 | if let Some(path) = output { 13 | let stream = File::create(path)?; 14 | Ok(Box::new(stream)) 15 | } else { 16 | Ok(Box::new(std::io::stderr())) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/cli/idxopts.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use super::PresetWrapper; 4 | 5 | #[derive(Parser, Clone, Copy)] 6 | #[clap(next_help_heading = "INDEX OPTIONS")] 7 | pub struct IndexOptions { 8 | #[clap(short, long, help = "k-mer size (no larger than 28) [default: 15]")] 9 | pub kmer_size: Option, 10 | 11 | #[clap( 12 | short, 13 | long, 14 | default_value = "10", 15 | help = "minimizer window size [default: 10]" 16 | )] 17 | pub window_size: Option, 18 | 19 | /// Preset to use when aligning reads 20 | #[clap(short = 'x', long)] 21 | pub preset: PresetWrapper, 22 | } 23 | -------------------------------------------------------------------------------- /src/cli/runopts.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Clone)] 4 | #[clap(next_help_heading = "RUN OPTIONS")] 5 | pub struct RunOptions { 6 | #[clap(short = 'T', long, default_value = "1")] 7 | n_threads: usize, 8 | #[clap(short = 'L', long)] 9 | pub log_path: Option, 10 | /// Write the option configuration to stderr 11 | #[clap(long)] 12 | pub show_options: bool, 13 | } 14 | impl RunOptions { 15 | pub fn n_threads(&self) -> usize { 16 | if self.n_threads == 0 { 17 | num_cpus::get() 18 | } else { 19 | self.n_threads.min(num_cpus::get()) 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/cli/command.rs: 
--------------------------------------------------------------------------------
use clap::{
    builder::{
        styling::{AnsiColor, Effects},
        Styles,
    },
    Parser,
};

// Configures Clap v3-style help menu colors
const STYLES: Styles = Styles::styled()
    .header(AnsiColor::Green.on_default().effects(Effects::BOLD))
    .usage(AnsiColor::Green.on_default().effects(Effects::BOLD))
    .literal(AnsiColor::Cyan.on_default().effects(Effects::BOLD))
    .placeholder(AnsiColor::Yellow.on_default());

use super::{IndexOptions, IoOptions, MappingOptions, RunOptions};

/// Top-level command line: the four option groups flattened into one parser.
#[derive(Parser)]
#[command(styles = STYLES, version)]
pub struct Cli {
    #[clap(flatten)]
    pub io_options: IoOptions,

    #[clap(flatten)]
    pub run_options: RunOptions,

    #[clap(flatten)]
    pub index_options: IndexOptions,

    #[clap(flatten)]
    pub mapping_options: MappingOptions,
}

--------------------------------------------------------------------------------
/src/stats.rs:
--------------------------------------------------------------------------------
use std::time::Instant;

use serde::Serialize;

/// Timing summary of a run, serialized under the names given by the
/// `serde(rename)` attributes.
#[derive(Serialize)]
pub struct Runtime {
    #[serde(rename = "elapsed_total_sec")]
    pub e_total: f64,
    #[serde(rename = "elapsed_init_sec")]
    pub e_init: f64,
    #[serde(rename = "elapsed_map_sec")]
    pub e_map: f64,
    #[serde(rename = "total_records")]
    pub n_records: usize,
    #[serde(rename = "throughput_records_per_sec")]
    pub throughput: f64,
}
impl Runtime {
    /// Builds the summary from two timestamps and a record count.
    ///
    /// * `t_init` - instant the program started.
    /// * `t_map` - instant mapping started.
    /// * `n_records` - number of records processed.
    ///
    /// `e_total` and `e_map` are measured from "now"; `e_init` is the span
    /// between the two instants. `throughput` is `n_records / e_map`, or `0.0`
    /// when no mapping time has elapsed.
    pub fn new(t_init: Instant, t_map: Instant, n_records: usize) -> Self {
        let e_total = t_init.elapsed().as_secs_f64();
        let e_init = t_map.saturating_duration_since(t_init).as_secs_f64();
        let e_map = t_map.elapsed().as_secs_f64();
        // Guard the division so the report never carries an infinite or NaN rate.
        let throughput = if e_map > 0.0 {
            n_records as f64 / e_map
        } else {
            0.0
        };
        Self {
            e_total,
            e_init,
            e_map,
            n_records,
            throughput,
        }
    }
}
| -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mmr" 3 | version = "0.1.7" 4 | edition = "2021" 5 | license = "MIT" 6 | authors = ["Noam Teyssier "] 7 | description = "Minimap2 CLI written in rust using bindings with BINSEQ and VBINSEQ support." 8 | repository = "https://github.com/arcinstitute/mmr" 9 | categories = ["command-line-utilities", "science::bioinformatics"] 10 | keywords = ["long-read", "minimap2", "binseq", "vbinseq", "alignment"] 11 | 12 | [features] 13 | default = [] 14 | sse2only = ["minimap2/sse2only"] 15 | simde = ["minimap2/simde"] 16 | 17 | [dependencies] 18 | anyhow = "1.0.100" 19 | binseq = "0.8.1" 20 | clap = { version = "4.5.53", features = ["derive"] } 21 | csv = "1.4.0" 22 | indicatif = "0.18.3" 23 | minimap2 = { version = "0.1.28", default-features = false } 24 | niffler = "3.0.0" 25 | num_cpus = "1.17.0" 26 | paraseq = "0.4.3" 27 | parking_lot = "0.12.5" 28 | serde_json = "1.0.145" 29 | serde = { version = "1.0.228", features = ["derive", "rc"] } 30 | 31 | [profile.release] 32 | lto = true 33 | codegen-units = 1 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mmr 2 | 3 | [![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE.md) 4 | [![Crates.io](https://img.shields.io/crates/d/mmr?color=orange&label=crates.io)](https://crates.io/crates/mmr) 5 | 6 | A [minimap2](https://github.com/lh3/minimap2)-based aligner with [BINSEQ](https://github.com/arcinstitute/binseq) file format support (`*.bq` and `*.vbq`). 7 | For converting FASTQ to BINSEQ formats see [bqtools](https://github.com/arcinstitute/bqtools). 
8 | 9 | This uses the [minimap2-rs](https://github.com/jguhlin/minimap2-rs) library which facilitates raw FFI bindings to the `minimap2` C library. 10 | 11 | ## Installation 12 | 13 | `mmr` is written in rust and deployed with [`cargo`](https://rustup.rs/). 14 | 15 | ```rust 16 | # install binary from cargo 17 | cargo install mmr 18 | 19 | # validate installation 20 | mmr --version 21 | ``` 22 | 23 | ## Usage 24 | 25 | `mmr` follows the same (or similar) CLI as the original [`minimap2`](https://github.com/lh3/minimap2) binary. 26 | 27 | ```bash 28 | # map a *.bq file 29 | mmr -x map-pb 30 | 31 | # map a *.vbq file 32 | mmr -x map-pb 33 | 34 | # map a *.fq file (supports compressed FASTQ as well) 35 | mmr -x map-pb 36 | ``` 37 | -------------------------------------------------------------------------------- /src/cli/preset.rs: -------------------------------------------------------------------------------- 1 | use clap::ValueEnum; 2 | use minimap2::Preset; 3 | 4 | #[derive(Debug, Clone, Copy, PartialEq, ValueEnum)] 5 | pub enum PresetWrapper { 6 | LrHqae, 7 | LrHq, 8 | Splice, 9 | SpliceHq, 10 | Asm, 11 | Asm5, 12 | Asm10, 13 | Asm20, 14 | Sr, 15 | MapPb, 16 | MapHifi, 17 | MapOnt, 18 | AvaPb, 19 | AvaOnt, 20 | Short, 21 | Map10k, 22 | Cdna, 23 | } 24 | impl From for Preset { 25 | fn from(value: PresetWrapper) -> Self { 26 | match value { 27 | PresetWrapper::LrHqae => Preset::LrHqae, 28 | PresetWrapper::LrHq => Preset::LrHq, 29 | PresetWrapper::Splice => Preset::Splice, 30 | PresetWrapper::SpliceHq => Preset::SpliceHq, 31 | PresetWrapper::Asm => Preset::Asm, 32 | PresetWrapper::Asm5 => Preset::Asm5, 33 | PresetWrapper::Asm10 => Preset::Asm10, 34 | PresetWrapper::Asm20 => Preset::Asm20, 35 | PresetWrapper::Sr => Preset::Sr, 36 | PresetWrapper::MapPb => Preset::MapPb, 37 | PresetWrapper::MapHifi => Preset::MapHifi, 38 | PresetWrapper::MapOnt => Preset::MapOnt, 39 | PresetWrapper::AvaPb => Preset::AvaPb, 40 | PresetWrapper::AvaOnt => Preset::AvaOnt, 41 | 
PresetWrapper::Short => Preset::Short, 42 | PresetWrapper::Map10k => Preset::Map10k, 43 | PresetWrapper::Cdna => Preset::Cdna, 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use std::time::Instant; 3 | 4 | use anyhow::Result; 5 | use binseq::{BinseqReader, ParallelReader}; 6 | use clap::Parser; 7 | 8 | mod align; 9 | mod cli; 10 | mod index; 11 | mod io; 12 | mod stats; 13 | 14 | use align::ParallelAlignment; 15 | use cli::Cli; 16 | use index::build_index; 17 | use io::{transparent_reader, transparent_writer}; 18 | use paraseq::{fastq, parallel::ParallelReader as FastqParallelReader}; 19 | use stats::Runtime; 20 | 21 | fn report_runtime( 22 | program_start: Instant, 23 | map_start: Instant, 24 | num_records: usize, 25 | path: Option<&str>, 26 | ) -> Result<()> { 27 | let stats = Runtime::new(program_start, map_start, num_records); 28 | let mut wtr = transparent_writer(path)?; 29 | serde_json::to_writer_pretty(&mut wtr, &stats)?; 30 | wtr.flush()?; 31 | Ok(()) 32 | } 33 | 34 | fn process_fastq( 35 | mut aligner: ParallelAlignment, 36 | query_path: &str, 37 | n_threads: usize, 38 | start_time: Instant, 39 | log_path: Option<&str>, 40 | ) -> Result<()> { 41 | let stream = transparent_reader(query_path)?; 42 | let reader = fastq::Reader::new(stream); 43 | reader.process_parallel(&mut aligner, n_threads)?; 44 | aligner.finish_pbar(); 45 | report_runtime( 46 | start_time, 47 | aligner.start_time(), 48 | aligner.num_records(), 49 | log_path, 50 | ) 51 | } 52 | 53 | fn process_binseq( 54 | aligner: ParallelAlignment, 55 | query_path: &str, 56 | n_threads: usize, 57 | start_time: Instant, 58 | log_path: Option<&str>, 59 | ) -> Result<()> { 60 | let reader = BinseqReader::new(query_path)?; 61 | reader.process_parallel(aligner.clone(), n_threads)?; 62 | aligner.finish_pbar(); 63 | report_runtime( 
64 | start_time, 65 | aligner.start_time(), 66 | aligner.num_records(), 67 | log_path, 68 | ) 69 | } 70 | 71 | fn main() -> Result<()> { 72 | let args = Cli::parse(); 73 | 74 | let start_time = Instant::now(); 75 | let index = build_index( 76 | &args.io_options.index_path, 77 | args.mapping_options, 78 | args.index_options, 79 | args.run_options.n_threads(), 80 | args.run_options.show_options, 81 | )?; 82 | let aligner = ParallelAlignment::new( 83 | index, 84 | args.io_options.output_path, 85 | args.mapping_options.cigar, 86 | )?; 87 | 88 | let query_path = &args.io_options.query_path; 89 | if query_path.ends_with(".bq") || query_path.ends_with(".vbq") { 90 | process_binseq( 91 | aligner, 92 | query_path, 93 | args.run_options.n_threads(), 94 | start_time, 95 | args.run_options.log_path.as_deref(), 96 | ) 97 | } else { 98 | process_fastq( 99 | aligner, 100 | query_path, 101 | args.run_options.n_threads(), 102 | start_time, 103 | args.run_options.log_path.as_deref(), 104 | ) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/cli/mapopts.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, ValueEnum}; 2 | use minimap2::MapOpt; 3 | 4 | #[derive(Parser, Clone, Copy)] 5 | #[clap(next_help_heading = "MAPPING OPTIONS")] 6 | pub struct MappingOptions { 7 | #[clap(short = 'c', long, help = "Output CIGAR string")] 8 | pub cigar: bool, 9 | 10 | #[clap( 11 | short = 'f', 12 | long, 13 | help = "Filter out top FLOAT fraction of repetitive minimizers [default = 0.0002]" 14 | )] 15 | pub mask_level: Option, 16 | 17 | #[clap( 18 | short = 'g', 19 | long, 20 | help = "Stop chain elongation if there are no minimizers in INT-bp [default = 10000]" 21 | )] 22 | pub max_gap: Option, 23 | 24 | #[clap( 25 | short = 'G', 26 | long, 27 | help = "Max intron length (effective with -xsplice) [default = 200000]" 28 | )] 29 | pub max_gap_ref: Option, 30 | 31 | #[clap( 32 | short = 'F', 
33 | long, 34 | help = "Max fragment length (effective with -xsr or in the fragment mode) [default = 800]" 35 | )] 36 | pub max_frag_len: Option, 37 | 38 | #[clap(short = 'r', long, value_parser = parse_integer_tuple, help = "Chaining/alignment bandwidth and long-join bandwidth [default = 500,20000]")] 39 | pub bandwidth: Option<(i32, i32)>, 40 | 41 | #[clap( 42 | short = 'n', 43 | long, 44 | help = "Minimal number of minimizers on a chain [default = 3]" 45 | )] 46 | pub min_cnt: Option, 47 | 48 | #[clap( 49 | short = 'm', 50 | long, 51 | help = "Minimal chaining score (matching bases minus log gap penalty) [default = 40]" 52 | )] 53 | pub min_chain_score: Option, 54 | 55 | #[clap( 56 | short = 'p', 57 | long, 58 | help = "Min secondary-to-primary score ratio [default = 0.8]" 59 | )] 60 | pub pri_ratio: Option, 61 | 62 | #[clap( 63 | short = 'N', 64 | long, 65 | help = "Retain at most INT secondary alignments [default = 5]" 66 | )] 67 | pub best_n: Option, 68 | 69 | // Alignment scoring parameters 70 | #[clap(short = 'A', long, help = "Matching score [default = 2]")] 71 | pub a: Option, 72 | 73 | #[clap( 74 | short = 'B', 75 | long, 76 | help = "Mismatch penalty (larger value for lower divergence) [default = 4]" 77 | )] 78 | pub b: Option, 79 | 80 | #[clap(short = 'O', long, value_parser = parse_integer_tuple, help = "Gap open penalties. Format: INT,INT [default = 4,24]")] 81 | pub gap_open: Option<(i32, i32)>, 82 | 83 | #[clap(short = 'E', long, value_parser = parse_integer_tuple, help = "Gap extension penalties. 
Format: INT,INT [default = 2,1]")] 84 | pub gap_ext: Option<(i32, i32)>, 85 | 86 | #[clap(short = 'z', long, value_parser = parse_integer_tuple, help = "Z-drop score and inversion Z-drop score [default = 400,200]")] 87 | pub zdrop: Option<(i32, i32)>, 88 | 89 | #[clap( 90 | short = 'u', 91 | long = "splice-mode", 92 | help = "How to find canonical splicing sites GT-AG - f:transcript strand; b:both strands; r:reverse strand; n:don't match GT-AG [default = n]" 93 | )] 94 | pub splice_mode: Option, 95 | } 96 | fn parse_integer_tuple(s: &str) -> Result<(i32, i32), String> { 97 | let parts: Vec<&str> = s.split(',').collect(); 98 | if parts.len() != 2 { 99 | return Err("Expected format: INT,INT".to_string()); 100 | } 101 | Ok(( 102 | parts[0].parse::().map_err(|e| e.to_string())?, 103 | parts[1].parse::().map_err(|e| e.to_string())?, 104 | )) 105 | } 106 | 107 | /// How to find canonical splicing sites GT-AG 108 | #[derive(Clone, Copy, Debug, PartialEq, ValueEnum, Default)] 109 | pub enum SpliceSiteMode { 110 | /// Don't attempt to match GT-AG (default) 111 | #[clap(name = "n")] 112 | #[default] 113 | None, 114 | 115 | /// Match GT-AG on the forward/transcript strand only 116 | #[clap(name = "f")] 117 | Forward, 118 | 119 | /// Match GT-AG on both strands 120 | #[clap(name = "b")] 121 | Both, 122 | 123 | /// Match CT-AC on the reverse strand (reverse complement of GT-AG) 124 | #[clap(name = "r")] 125 | Reverse, 126 | } 127 | impl SpliceSiteMode { 128 | pub fn update_mapopt(&self, mapopt: &mut MapOpt) { 129 | match self { 130 | Self::None => { 131 | mapopt.unset_splice_for(); 132 | mapopt.unset_splice_rev(); 133 | } 134 | Self::Forward => { 135 | mapopt.set_splice_for(); 136 | mapopt.unset_splice_rev(); 137 | } 138 | Self::Both => { 139 | mapopt.set_splice_for(); 140 | mapopt.set_splice_rev(); 141 | } 142 | Self::Reverse => { 143 | mapopt.unset_splice_for(); 144 | mapopt.set_splice_rev(); 145 | } 146 | } 147 | } 148 | } 149 | 
-------------------------------------------------------------------------------- /src/index.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use anyhow::{bail, Result}; 4 | use minimap2::{Aligner, Built, IdxOpt, MapOpt}; 5 | 6 | use crate::cli::{IndexOptions, MappingOptions}; 7 | 8 | pub fn build_index( 9 | path: &str, 10 | map_options: MappingOptions, 11 | idx_options: IndexOptions, 12 | n_threads: usize, 13 | log_options: bool, 14 | ) -> Result> { 15 | eprintln!("Building index..."); 16 | let start = std::time::Instant::now(); 17 | let aligner = Aligner::builder() 18 | .preset(idx_options.preset.into()) 19 | .with_index_threads(n_threads) 20 | .with_index(path, None); 21 | let duration = start.elapsed(); 22 | eprintln!("Index built in {:?}", duration); 23 | 24 | match aligner { 25 | Ok(mut aligner) => { 26 | update_map_options(&mut aligner, map_options); 27 | update_index_options(&mut aligner, idx_options); 28 | 29 | if log_options { 30 | pprint_index(&mut std::io::stderr(), aligner.idxopt)?; 31 | pprint_map(&mut std::io::stderr(), aligner.mapopt)?; 32 | } 33 | 34 | Ok(aligner) 35 | } 36 | Err(err) => bail!("Error building index: {}", err), 37 | } 38 | } 39 | 40 | fn update_map_options(aligner: &mut Aligner, map_options: MappingOptions) { 41 | if map_options.cigar { 42 | aligner.mapopt.set_cigar(); 43 | } 44 | if let Some(mask_level) = map_options.mask_level { 45 | aligner.mapopt.mask_level = mask_level; 46 | } 47 | if let Some(max_gap) = map_options.max_gap { 48 | aligner.mapopt.max_gap = max_gap; 49 | } 50 | if let Some(max_gap_ref) = map_options.max_gap_ref { 51 | aligner.mapopt.max_gap_ref = max_gap_ref; 52 | } 53 | if let Some(max_frag_len) = map_options.max_frag_len { 54 | aligner.mapopt.max_frag_len = max_frag_len; 55 | } 56 | if let Some(bandwidth) = map_options.bandwidth { 57 | aligner.mapopt.bw = bandwidth.0; 58 | aligner.mapopt.bw_long = bandwidth.1; 59 | } 60 | if let Some(min_cnt) = 
map_options.min_cnt { 61 | aligner.mapopt.min_cnt = min_cnt; 62 | } 63 | if let Some(min_chain_score) = map_options.min_chain_score { 64 | aligner.mapopt.min_chain_score = min_chain_score; 65 | } 66 | if let Some(pri_ratio) = map_options.pri_ratio { 67 | aligner.mapopt.pri_ratio = pri_ratio; 68 | } 69 | if let Some(best_n) = map_options.best_n { 70 | aligner.mapopt.best_n = best_n; 71 | } 72 | if let Some(a) = map_options.a { 73 | aligner.mapopt.a = a; 74 | } 75 | if let Some(b) = map_options.b { 76 | aligner.mapopt.b = b; 77 | } 78 | if let Some(gap_open) = map_options.gap_open { 79 | aligner.mapopt.q = gap_open.0; 80 | aligner.mapopt.q2 = gap_open.1; 81 | } 82 | if let Some(gap_ext) = map_options.gap_ext { 83 | aligner.mapopt.e = gap_ext.0; 84 | aligner.mapopt.e2 = gap_ext.1; 85 | } 86 | if let Some(zdrop) = map_options.zdrop { 87 | aligner.mapopt.zdrop = zdrop.0; 88 | aligner.mapopt.zdrop_inv = zdrop.1; 89 | } 90 | if let Some(splice_mode) = map_options.splice_mode { 91 | splice_mode.update_mapopt(&mut aligner.mapopt); 92 | } 93 | } 94 | 95 | fn update_index_options(aligner: &mut Aligner, idx_options: IndexOptions) { 96 | if let Some(k) = idx_options.kmer_size { 97 | aligner.idxopt.k = k; 98 | } 99 | if let Some(w) = idx_options.window_size { 100 | aligner.idxopt.w = w; 101 | } 102 | } 103 | 104 | fn pprint_index(writer: &mut W, opt: IdxOpt) -> Result<()> { 105 | writeln!(writer, "== Index Options ==")?; 106 | writeln!(writer, " k: {}", opt.k)?; 107 | writeln!(writer, " w: {}", opt.w)?; 108 | writeln!(writer, " flag: {}", opt.flag)?; 109 | writeln!(writer, " bucket_bits: {}", opt.bucket_bits)?; 110 | writeln!(writer, " mini_batch_size: {}", opt.mini_batch_size)?; 111 | writeln!(writer, " batch_size: {}", opt.batch_size)?; 112 | Ok(()) 113 | } 114 | 115 | fn pprint_map(writer: &mut W, opt: MapOpt) -> Result<()> { 116 | writeln!(writer, "== Mapping Options ==")?; 117 | writeln!(writer, " flag: {}", opt.flag)?; 118 | writeln!(writer, " seed: {}", opt.seed)?; 119 | 
writeln!(writer, " sdust_thres: {}", opt.sdust_thres)?; 120 | writeln!(writer, " max_qlen: {}", opt.max_qlen)?; 121 | writeln!(writer, " bw: {}", opt.bw)?; 122 | writeln!(writer, " bw_long: {}", opt.bw_long)?; 123 | writeln!(writer, " max_gap: {}", opt.max_gap)?; 124 | writeln!(writer, " max_gap_ref: {}", opt.max_gap_ref)?; 125 | writeln!(writer, " max_frag_len: {}", opt.max_frag_len)?; 126 | writeln!(writer, " max_chain_skip: {}", opt.max_chain_skip)?; 127 | writeln!(writer, " max_chain_iter: {}", opt.max_chain_iter)?; 128 | writeln!(writer, " min_cnt: {}", opt.min_cnt)?; 129 | writeln!(writer, " min_chain_score: {}", opt.min_chain_score)?; 130 | writeln!(writer, " chain_gap_scale: {}", opt.chain_gap_scale)?; 131 | writeln!(writer, " chain_skip_scale: {}", opt.chain_skip_scale)?; 132 | writeln!(writer, " rmq_size_cap: {}", opt.rmq_size_cap)?; 133 | writeln!(writer, " rmq_inner_dist: {}", opt.rmq_inner_dist)?; 134 | writeln!(writer, " rmq_rescue_size: {}", opt.rmq_rescue_size)?; 135 | writeln!(writer, " rmq_rescue_ratio: {}", opt.rmq_rescue_ratio)?; 136 | writeln!(writer, " mask_level: {}", opt.mask_level)?; 137 | writeln!(writer, " mask_len: {}", opt.mask_len)?; 138 | writeln!(writer, " pri_ratio: {}", opt.pri_ratio)?; 139 | writeln!(writer, " best_n: {}", opt.best_n)?; 140 | writeln!(writer, " alt_drop: {}", opt.alt_drop)?; 141 | writeln!(writer, " a: {}", opt.a)?; 142 | writeln!(writer, " b: {}", opt.b)?; 143 | writeln!(writer, " q: {}", opt.q)?; 144 | writeln!(writer, " e: {}", opt.e)?; 145 | writeln!(writer, " q2: {}", opt.q2)?; 146 | writeln!(writer, " e2: {}", opt.e2)?; 147 | writeln!(writer, " transition: {}", opt.transition)?; 148 | writeln!(writer, " sc_ambi: {}", opt.sc_ambi)?; 149 | writeln!(writer, " noncan: {}", opt.noncan)?; 150 | writeln!(writer, " junc_bonus: {}", opt.junc_bonus)?; 151 | writeln!(writer, " zdrop: {}", opt.zdrop)?; 152 | writeln!(writer, " zdrop_inv: {}", opt.zdrop_inv)?; 153 | writeln!(writer, " end_bonus: {}", opt.end_bonus)?; 154 
| writeln!(writer, " min_dp_max: {}", opt.min_dp_max)?; 155 | writeln!(writer, " min_ksw_len: {}", opt.min_ksw_len)?; 156 | writeln!(writer, " anchor_ext_len: {}", opt.anchor_ext_len)?; 157 | writeln!(writer, " anchor_ext_shift: {}", opt.anchor_ext_shift)?; 158 | writeln!(writer, " max_clip_ratio: {}", opt.max_clip_ratio)?; 159 | writeln!(writer, " rank_min_len: {}", opt.rank_min_len)?; 160 | writeln!(writer, " rank_frac: {}", opt.rank_frac)?; 161 | writeln!(writer, " pe_ori: {}", opt.pe_ori)?; 162 | writeln!(writer, " pe_bonus: {}", opt.pe_bonus)?; 163 | writeln!(writer, " mid_occ_frac: {}", opt.mid_occ_frac)?; 164 | writeln!(writer, " q_occ_frac: {}", opt.q_occ_frac)?; 165 | writeln!(writer, " min_mid_occ: {}", opt.min_mid_occ)?; 166 | writeln!(writer, " max_mid_occ: {}", opt.max_mid_occ)?; 167 | writeln!(writer, " mid_occ: {}", opt.mid_occ)?; 168 | writeln!(writer, " max_occ: {}", opt.max_occ)?; 169 | writeln!(writer, " max_max_occ: {}", opt.max_max_occ)?; 170 | writeln!(writer, " occ_dist: {}", opt.occ_dist)?; 171 | writeln!(writer, " mini_batch_size: {}", opt.mini_batch_size)?; 172 | writeln!(writer, " max_sw_mat: {}", opt.max_sw_mat)?; 173 | writeln!(writer, " cap_kalloc: {}", opt.cap_kalloc)?; 174 | 175 | // For the pointer field, we need to handle it carefully 176 | // This is a C-style string pointer, so we should print it safely 177 | if !opt.split_prefix.is_null() { 178 | // Note: This is unsafe and assumes the string is valid UTF-8 179 | // In a real implementation, you might want more robust handling 180 | unsafe { 181 | let c_str = std::ffi::CStr::from_ptr(opt.split_prefix); 182 | if let Ok(str_slice) = c_str.to_str() { 183 | writeln!(writer, " split_prefix: {}", str_slice)?; 184 | } else { 185 | writeln!(writer, " split_prefix: ")?; 186 | } 187 | } 188 | } else { 189 | writeln!(writer, " split_prefix: ")?; 190 | } 191 | 192 | Ok(()) 193 | } 194 | -------------------------------------------------------------------------------- /src/align.rs: 
-------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::{File, OpenOptions}, 3 | io::{BufWriter, Write}, 4 | num::NonZeroI32, 5 | sync::Arc, 6 | time::Instant, 7 | }; 8 | 9 | use anyhow::{anyhow, Result}; 10 | use binseq::BinseqRecord; 11 | use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; 12 | use minimap2::{Aligner, Built, Mapping, Strand}; 13 | use paraseq::parallel::ProcessError; 14 | use parking_lot::Mutex; 15 | use serde::Serialize; 16 | 17 | #[derive(Clone)] 18 | pub struct ParallelAlignment { 19 | aligner: Arc>, 20 | 21 | /// Local write buffer for PAF records 22 | wbuf: Vec, 23 | 24 | /// Path name for the output file 25 | output_path: Option, 26 | 27 | /// IO lock 28 | io_lock: Arc>, 29 | 30 | /// Cigar option 31 | with_cigar: bool, 32 | 33 | /// Number of records processed (local/global) 34 | local_n_processed: usize, 35 | global_n_processed: Arc>, 36 | 37 | /// Start time 38 | start_time: Instant, 39 | 40 | /// Thread id (local) 41 | tid: usize, 42 | 43 | /// Progress bar 44 | pbar: Arc>, 45 | } 46 | impl ParallelAlignment { 47 | pub fn new( 48 | aligner: Aligner, 49 | output_path: Option, 50 | with_cigar: bool, 51 | ) -> Result { 52 | Self::initialize_output(output_path.as_ref())?; 53 | let pbar = Self::initialize_pbar(); 54 | Ok(Self { 55 | aligner: Arc::new(aligner), 56 | wbuf: Vec::new(), 57 | io_lock: Arc::new(Mutex::new(())), 58 | local_n_processed: 0, 59 | global_n_processed: Arc::new(Mutex::new(0)), 60 | output_path, 61 | start_time: Instant::now(), 62 | tid: 0, 63 | pbar: Arc::new(Mutex::new(pbar)), 64 | with_cigar, 65 | }) 66 | } 67 | pub fn initialize_output(output_path: Option<&String>) -> Result<()> { 68 | if let Some(path) = output_path { 69 | File::create(path)?; 70 | Ok(()) 71 | } else { 72 | Ok(()) 73 | } 74 | } 75 | pub fn initialize_pbar() -> ProgressBar { 76 | let pbar = ProgressBar::new_spinner(); 77 | pbar.set_style( 78 | ProgressStyle::default_spinner() 79 | 
.template("{spinner:.cyan} [{elapsed_precise}] {msg}") 80 | .unwrap(), 81 | ); 82 | pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(10)); 83 | pbar 84 | } 85 | 86 | fn reopen_handle(&self) -> Result> { 87 | if let Some(path) = &self.output_path { 88 | let file = OpenOptions::new().append(true).open(path)?; 89 | let buffer = BufWriter::new(file); 90 | Ok(Box::new(buffer)) 91 | } else { 92 | let file = std::io::stdout(); 93 | let buffer = BufWriter::new(file); 94 | Ok(Box::new(buffer)) 95 | } 96 | } 97 | fn write_local(&mut self, mapping: Vec) -> Result<()> { 98 | let mut wtr = csv::WriterBuilder::new() 99 | .has_headers(false) 100 | .delimiter(b'\t') 101 | .from_writer(&mut self.wbuf); 102 | 103 | for alignment in mapping { 104 | let mapping = MappingNutype::new(alignment, self.with_cigar); 105 | wtr.serialize(mapping)?; 106 | } 107 | wtr.flush()?; 108 | Ok(()) 109 | } 110 | fn write_record_set(&mut self) -> Result<()> { 111 | // Open a thread-safe stdout writer 112 | // 113 | // Drops lock when it goes out of scope 114 | { 115 | let _lock = self.io_lock.lock(); 116 | let mut handle = self.reopen_handle()?; 117 | handle.write_all(&self.wbuf)?; 118 | handle.flush()?; 119 | } 120 | 121 | // Clear the write buffer 122 | self.wbuf.clear(); 123 | 124 | Ok(()) 125 | } 126 | fn calculate_throughput(&self) -> f64 { 127 | let elapsed = self.start_time.elapsed().as_secs_f64(); 128 | *self.global_n_processed.lock() as f64 / elapsed 129 | } 130 | fn update_statistics(&mut self) { 131 | *self.global_n_processed.lock() += self.local_n_processed; 132 | self.local_n_processed = 0; 133 | } 134 | fn update_pbar(&self) { 135 | // only update progress bar on the main thread 136 | if self.tid == 0 { 137 | let pbar = self.pbar.lock(); 138 | let elapsed = self.start_time.elapsed().as_secs_f64(); 139 | let throughput = self.calculate_throughput(); 140 | let msg = format!("Elapsed: {elapsed:.2}s, Throughput: {throughput:.2} reads/s",); 141 | pbar.set_message(msg); 142 | } 143 | } 
144 | pub fn finish_pbar(&self) { 145 | let pbar = self.pbar.lock(); 146 | let elapsed = self.start_time.elapsed().as_secs_f64(); 147 | let throughput = self.calculate_throughput(); 148 | let msg = format!("Elapsed: {elapsed:.2}s, Throughput: {throughput:.2} reads/s",); 149 | pbar.finish_with_message(msg); 150 | } 151 | pub fn start_time(&self) -> Instant { 152 | self.start_time 153 | } 154 | pub fn num_records(&self) -> usize { 155 | *self.global_n_processed.lock() 156 | } 157 | } 158 | impl binseq::ParallelProcessor for ParallelAlignment { 159 | fn process_record(&mut self, record: B) -> binseq::Result<()> { 160 | let mapping = match self.aligner.map( 161 | record.sseq(), 162 | self.with_cigar, 163 | false, 164 | None, 165 | None, 166 | Some(record.sheader()), 167 | ) { 168 | Ok(mapping) => mapping, 169 | Err(err) => return Err(anyhow!("Error mapping record: {}", err).into()), 170 | }; 171 | self.local_n_processed += 1; 172 | self.write_local(mapping)?; 173 | Ok(()) 174 | } 175 | 176 | fn on_batch_complete(&mut self) -> Result<(), binseq::Error> { 177 | self.write_record_set()?; 178 | self.update_statistics(); 179 | self.update_pbar(); 180 | Ok(()) 181 | } 182 | 183 | fn set_tid(&mut self, tid: usize) { 184 | self.tid = tid; 185 | } 186 | } 187 | impl paraseq::parallel::ParallelProcessor for ParallelAlignment { 188 | fn process_record(&mut self, record: Rf) -> paraseq::Result<()> { 189 | let mapping = 190 | match self 191 | .aligner 192 | .map(&record.seq(), false, false, None, None, Some(record.id())) 193 | { 194 | Ok(mapping) => mapping, 195 | Err(err) => { 196 | return Err(ProcessError::from(anyhow!("Error mapping record: {}", err))); 197 | } 198 | }; 199 | self.local_n_processed += 1; 200 | self.write_local(mapping)?; 201 | Ok(()) 202 | } 203 | 204 | fn on_batch_complete(&mut self) -> paraseq::Result<()> { 205 | self.write_record_set()?; 206 | self.update_statistics(); 207 | self.update_pbar(); 208 | Ok(()) 209 | } 210 | 211 | fn set_thread_id(&mut self, 
thread_id: usize) { 212 | self.tid = thread_id; 213 | } 214 | } 215 | 216 | #[derive(Debug, Clone, Serialize)] 217 | pub struct MappingNutype { 218 | pub query_name: Arc, 219 | pub query_len: Option, 220 | pub query_start: i32, 221 | pub query_end: i32, 222 | pub strand: char, 223 | pub target_name: Option>, 224 | pub target_len: i32, 225 | pub target_start: i32, 226 | pub target_end: i32, 227 | pub match_len: i32, 228 | pub block_len: i32, 229 | pub mapq: u32, 230 | #[serde(skip_serializing_if = "Option::is_none")] 231 | pub cigar: Option, 232 | } 233 | impl MappingNutype { 234 | fn new(mapping: Mapping, with_cigar: bool) -> Self { 235 | Self { 236 | query_name: mapping 237 | .query_name 238 | .unwrap_or_else(|| Arc::new("*".to_string())), 239 | query_len: mapping.query_len, 240 | query_start: mapping.query_start, 241 | query_end: mapping.query_end, 242 | strand: match mapping.strand { 243 | Strand::Forward => '+', 244 | Strand::Reverse => '-', 245 | }, 246 | target_name: mapping.target_name, 247 | target_len: mapping.target_len, 248 | target_start: mapping.target_start, 249 | target_end: mapping.target_end, 250 | match_len: mapping.match_len, 251 | block_len: mapping.block_len, 252 | mapq: mapping.mapq, 253 | cigar: if with_cigar { 254 | if let Some(alignment) = mapping.alignment { 255 | alignment.cigar_str.map(|cigar| format!("cg:Z:{}M", cigar)) 256 | } else { 257 | Some(format!("cg:Z:{:?}M", mapping.query_len)) 258 | } 259 | } else { 260 | None 261 | }, 262 | } 263 | } 264 | } 265 | --------------------------------------------------------------------------------