├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── benchmark.rs └── src ├── algorithm.rs ├── builder.rs ├── iterator ├── minimizer.rs ├── mod.rs └── mod_sampling.rs └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | .DS_Store 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "minimizer-iter" 3 | version = "1.2.1" 4 | authors = ["Igor Martayan "] 5 | description = "Iterate over minimizers of a DNA sequence" 6 | readme = "README.md" 7 | exclude = [".github/**", ".gitignore", "benches/**"] 8 | repository = "https://github.com/rust-seq/minimizer-iter" 9 | homepage = "https://crates.io/crates/minimizer-iter" 10 | documentation = "https://docs.rs/minimizer-iter" 11 | license = "MIT" 12 | keywords = ["minimizer", "iterator", "bioinformatics", "dna"] 13 | categories = ["data-structures"] 14 | edition = "2021" 15 | 16 | 17 | [dependencies] 18 | minimizer-queue = "1" 19 | num-traits = "0.2" 20 | strength_reduce = "0.2" 21 | 22 | # Only needed by the benchmarks; declared here because Cargo does not support optional dev-dependencies 23 | minimizers = { git = "https://github.com/RagnarGrootKoerkamp/minimizers.git", optional = true } 24 | 25 | 26 | [dev-dependencies] 27 | biotest = { version = "0.2", features = ["sequence"] } 28 | cocktail = { git = "https://github.com/natir/cocktail.git" } 29 | criterion = "0.5" 30 
| nohash-hasher = "0.2" 31 | rand = "0.8" 32 | 33 | 34 | [features] 35 | default = [] 36 | nightly = ["dep:minimizers"] 37 | 38 | 39 | [[bench]] 40 | name = "benchmark" 41 | harness = false 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Igor Martayan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minimizer-iter 2 | 3 | [![crates.io](https://img.shields.io/crates/v/minimizer-iter)](https://crates.io/crates/minimizer-iter) 4 | [![docs](https://img.shields.io/docsrs/minimizer-iter)](https://docs.rs/minimizer-iter) 5 | 6 | Iterate over minimizers of a DNA sequence. 
7 | 8 | ## Features 9 | 10 | - iterates over minimizers in a single pass 11 | - yields bitpacked minimizers with their position 12 | - supports [mod-minimizers](https://doi.org/10.1101/2024.05.25.595898), introduced by Groot Koerkamp & Pibiri 13 | - supports canonical minimizers 14 | - supports custom bit encoding of the nucleotides 15 | - supports custom [hasher](https://doc.rust-lang.org/stable/core/hash/trait.BuildHasher.html), using [wyhash](https://github.com/JackThomson2/wyhash2) by default 16 | - can be seeded to produce a different ordering 17 | 18 | If you'd like to use the underlying data structure manually, have a look at the [minimizer-queue](https://github.com/rust-seq/minimizer-queue) crate. 19 | 20 | ## Example usage 21 | 22 | ```rust 23 | use minimizer_iter::MinimizerBuilder; 24 | 25 | // Build an iterator over minimizers 26 | // of size 21 with a window of size 11 27 | // for the sequence "TGATTGCACAATC" 28 | let min_iter = MinimizerBuilder::<u64>::new() 29 | .minimizer_size(21) 30 | .width(11) 31 | .iter(b"TGATTGCACAATC"); 32 | 33 | for (minimizer, position) in min_iter { 34 | // ... 35 | } 36 | ``` 37 | 38 | If you'd like to use mod-minimizers instead, just change `new()` to `new_mod()`: 39 | ```rust 40 | use minimizer_iter::MinimizerBuilder; 41 | 42 | // Build an iterator over mod-minimizers 43 | // of size 21 with a window of size 11 44 | // for the sequence "TGATTGCACAATC" 45 | let min_iter = MinimizerBuilder::<u64, _>::new_mod() 46 | .minimizer_size(21) 47 | .width(11) 48 | .iter(b"TGATTGCACAATC"); 49 | 50 | for (minimizer, position) in min_iter { 51 | // ... 52 | } 53 | ``` 54 | 55 | Additionally, the iterator can produce canonical minimizers so that a sequence and its reverse complement will select the same minimizers. 56 | To do so, just add `.canonical()` to the builder: 57 | ```rust 58 | MinimizerBuilder::<u64>::new() 59 | .canonical() 60 | .minimizer_size(...) 61 | .width(...) 62 | .iter(...) 
63 | ``` 64 | 65 | If you need longer minimizers (> 32 bases), you can specify a bigger integer type such as `u128`: 66 | ```rust 67 | MinimizerBuilder::<u128>::new() 68 | .minimizer_size(...) 69 | .width(...) 70 | .iter(...) 71 | ``` 72 | 73 | See the [documentation](https://docs.rs/minimizer-iter) for more details. 74 | 75 | ## Benchmarks 76 | 77 | To run benchmarks against other implementations of minimizers, clone this repository and run: 78 | ```sh 79 | cargo bench 80 | ``` 81 | 82 | ## Contributors 83 | 84 | - [Igor Martayan](https://github.com/imartayan) (main developer) 85 | - [Pierre Marijon](https://github.com/natir) 86 | -------------------------------------------------------------------------------- /benches/benchmark.rs: -------------------------------------------------------------------------------- 1 | use biotest::Format; 2 | use cocktail::tokenizer::minimizer::{method::Random, Forward}; 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | use minimizer_iter::MinimizerBuilder; 5 | #[cfg(feature = "nightly")] 6 | use minimizers::{order::RandomOrder, Minimizer, ModSampling, SamplingScheme}; 7 | use nohash_hasher::BuildNoHashHasher; 8 | 9 | fn gen_seq(len: usize) -> Vec { 10 | let mut rng = biotest::rand(); 11 | let mut seq = Vec::with_capacity(len); 12 | let generator = biotest::Sequence::builder() 13 | .sequence_len(len) 14 | .build() 15 | .unwrap(); 16 | generator.record(&mut seq, &mut rng).unwrap(); 17 | seq 18 | } 19 | 20 | fn minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 21 | let id = format!("minimizer m={m} w={w}"); 22 | c.bench_function(id.as_str(), |b| { 23 | b.iter(|| { 24 | for x in MinimizerBuilder::::new() 25 | .minimizer_size(m) 26 | .width(w) 27 | .iter_pos(seq) 28 | { 29 | black_box(x); 30 | } 31 | }) 32 | }); 33 | } 34 | 35 | fn mod_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 36 | let id = format!("mod-minimizer m={m} w={w}"); 37 | c.bench_function(id.as_str(), |b| { 38 | b.iter(|| 
{ 39 | for x in MinimizerBuilder::::new_mod() 40 | .minimizer_size(m) 41 | .width(w) 42 | .iter_pos(seq) 43 | { 44 | black_box(x); 45 | } 46 | }) 47 | }); 48 | } 49 | 50 | fn lex_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 51 | let id = format!("lex minimizer m={m} w={w}"); 52 | c.bench_function(id.as_str(), |b| { 53 | b.iter(|| { 54 | for x in MinimizerBuilder::::new() 55 | .minimizer_size(m) 56 | .width(w) 57 | .hasher(BuildNoHashHasher::::default()) 58 | .iter_pos(seq) 59 | { 60 | black_box(x); 61 | } 62 | }) 63 | }); 64 | } 65 | 66 | fn lex_mod_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 67 | let id = format!("lex mod-minimizer m={m} w={w}"); 68 | c.bench_function(id.as_str(), |b| { 69 | b.iter(|| { 70 | for x in MinimizerBuilder::::new_mod() 71 | .minimizer_size(m) 72 | .width(w) 73 | .hasher(BuildNoHashHasher::::default()) 74 | .iter_pos(seq) 75 | { 76 | black_box(x); 77 | } 78 | }) 79 | }); 80 | } 81 | 82 | fn canon_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 83 | let id = format!("canon minimizer m={m} w={w}"); 84 | c.bench_function(id.as_str(), |b| { 85 | b.iter(|| { 86 | for x in MinimizerBuilder::::new() 87 | .canonical() 88 | .minimizer_size(m) 89 | .width(w) 90 | .iter_pos(seq) 91 | { 92 | black_box(x); 93 | } 94 | }) 95 | }); 96 | } 97 | 98 | fn canon_mod_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: u16) { 99 | let id = format!("canon mod-minimizer m={m} w={w}"); 100 | c.bench_function(id.as_str(), |b| { 101 | b.iter(|| { 102 | for x in MinimizerBuilder::::new_mod() 103 | .canonical() 104 | .minimizer_size(m) 105 | .width(w) 106 | .iter_pos(seq) 107 | { 108 | black_box(x); 109 | } 110 | }) 111 | }); 112 | } 113 | 114 | #[cfg(feature = "nightly")] 115 | fn ragnar_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: usize) { 116 | let id = format!("ragnar's random minimizer m={m} w={w}"); 117 | c.bench_function(id.as_str(), |b| { 118 | b.iter(|| { 119 | for x in Minimizer::new(m, w, 
RandomOrder).stream(seq) { 120 | black_box(x); 121 | } 122 | }) 123 | }); 124 | } 125 | 126 | #[cfg(feature = "nightly")] 127 | fn ragnar_mod_minimizer(c: &mut Criterion, seq: &[u8], m: usize, w: usize) { 128 | let id = format!("ragnar's mod-minimizer m={m} w={w}"); 129 | c.bench_function(id.as_str(), |b| { 130 | b.iter(|| { 131 | for x in ModSampling::mod_minimizer(m, w).stream(seq) { 132 | black_box(x); 133 | } 134 | }) 135 | }); 136 | } 137 | 138 | fn cocktail_minimizer_forward(c: &mut Criterion, seq: &[u8], k: u8, m: u8) { 139 | let id = format!("cocktail minimizer forward m={m} w={}", k - m + 1); 140 | c.bench_function(id.as_str(), |b| { 141 | b.iter(|| { 142 | for x in Forward::::new(seq, k, m) { 143 | black_box(x); 144 | } 145 | }) 146 | }); 147 | } 148 | 149 | fn all_benches(c: &mut Criterion) { 150 | let seq = gen_seq(1_000_000); 151 | let ks = [31, 63]; 152 | let ms = [21, 31]; 153 | for (k, m) in ks.iter().copied().zip(ms.iter().copied()) { 154 | let w = (k - m + 1) as u16; 155 | minimizer(c, &seq, m, w); 156 | lex_minimizer(c, &seq, m, w); 157 | #[cfg(feature = "nightly")] 158 | ragnar_minimizer(c, &seq, m, w as usize); 159 | if k <= 32 { 160 | cocktail_minimizer_forward(c, &seq, k as u8, m as u8); 161 | } 162 | canon_minimizer(c, &seq, m, w); 163 | mod_minimizer(c, &seq, m, w); 164 | lex_mod_minimizer(c, &seq, m, w); 165 | #[cfg(feature = "nightly")] 166 | ragnar_mod_minimizer(c, &seq, m, w as usize); 167 | canon_mod_minimizer(c, &seq, m, w); 168 | } 169 | } 170 | 171 | criterion_group!(benches, all_benches); 172 | criterion_main!(benches); 173 | -------------------------------------------------------------------------------- /src/algorithm.rs: -------------------------------------------------------------------------------- 1 | //! Algorithms to compute minimizers. 2 | 3 | pub trait MinimizerAlgorithm {} 4 | 5 | /// "Classic" minimizers. 
6 | pub struct Minimizer {} 7 | impl MinimizerAlgorithm for Minimizer {} 8 | 9 | /// Mod-minimizers, introduced in [The mod-minimizer: a simple and efficient sampling algorithm for long k-mers (Groot Koerkamp & Pibiri '24)](https://doi.org/10.1101/2024.05.25.595898). 10 | pub struct ModMinimizer {} 11 | impl MinimizerAlgorithm for ModMinimizer {} 12 | -------------------------------------------------------------------------------- /src/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::algorithm::{Minimizer, MinimizerAlgorithm, ModMinimizer}; 2 | use crate::iterator::*; 3 | use core::hash::{BuildHasher, Hash}; 4 | use core::marker::PhantomData; 5 | use minimizer_queue::DefaultHashBuilder; 6 | use num_traits::PrimInt; 7 | 8 | /// A builder for iterators over minimizers. 9 | /// 10 | /// # Examples 11 | /// 12 | /// ``` 13 | /// use minimizer_iter::MinimizerBuilder; 14 | /// 15 | /// // Build an iterator over minimizers 16 | /// // of size 3 with a window of size 4 17 | /// // for the sequence "TGATTGCACAATC" 18 | /// let min_iter = MinimizerBuilder::::new() 19 | /// .minimizer_size(3) 20 | /// .width(4) 21 | /// .iter(b"TGATTGCACAATC"); 22 | /// 23 | /// for (minimizer, position) in min_iter { 24 | /// // ... 
25 | /// } 26 | /// ``` 27 | #[derive(Clone, Debug, Eq, PartialEq)] 28 | pub struct MinimizerBuilder< 29 | T: PrimInt = u64, 30 | A: MinimizerAlgorithm = Minimizer, 31 | S: BuildHasher = DefaultHashBuilder, 32 | const CANONICAL: bool = false, 33 | > { 34 | minimizer_size: usize, 35 | width: u16, 36 | hasher: S, 37 | encoding: [u8; 256], 38 | _marker: PhantomData<(T, A)>, 39 | } 40 | 41 | impl MinimizerBuilder { 42 | /// Sets up the `MinimizerBuilder` with default values: 43 | /// - minimizer_size = 21 44 | /// - width = 11 (31 - 21 + 1) 45 | /// - hasher = [`DefaultHashBuilder`] 46 | /// - encoding: A = `00`, C = `01`, G = `10`, T = `11` 47 | #[inline] 48 | pub fn new() -> Self { 49 | Self::_new() 50 | } 51 | } 52 | 53 | impl Default for MinimizerBuilder { 54 | #[inline] 55 | fn default() -> Self { 56 | Self::_new() 57 | } 58 | } 59 | 60 | impl MinimizerBuilder { 61 | /// Builds an iterator over the minimizers and their positions in the given sequence. 62 | #[inline] 63 | pub fn iter(self, seq: &[u8]) -> MinimizerIterator { 64 | MinimizerIterator::new( 65 | seq, 66 | self.minimizer_size, 67 | self.width, 68 | self.hasher, 69 | self.encoding, 70 | ) 71 | } 72 | 73 | /// Builds an iterator over the positions of the minimizers in the given sequence. 74 | #[inline] 75 | pub fn iter_pos(self, seq: &[u8]) -> MinimizerPosIterator { 76 | MinimizerPosIterator::new( 77 | seq, 78 | self.minimizer_size, 79 | self.width, 80 | self.hasher, 81 | self.encoding, 82 | ) 83 | } 84 | } 85 | 86 | impl MinimizerBuilder { 87 | /// Builds an iterator over the canonical minimizers and their positions in the given sequence with a boolean indicating a reverse complement. 88 | /// It requires an odd width to break ties between multiple minimizers. 
89 | #[inline] 90 | pub fn iter(self, seq: &[u8]) -> CanonicalMinimizerIterator { 91 | assert_eq!( 92 | self.width % 2, 93 | 1, 94 | "width must be odd to break ties between multiple minimizers" 95 | ); 96 | CanonicalMinimizerIterator::new( 97 | seq, 98 | self.minimizer_size, 99 | self.width, 100 | self.hasher, 101 | self.encoding, 102 | ) 103 | } 104 | 105 | /// Builds an iterator over the positions of the canonical minimizers in the given sequence with a boolean indicating a reverse complement. 106 | /// It requires an odd width to break ties between multiple minimizers. 107 | #[inline] 108 | pub fn iter_pos(self, seq: &[u8]) -> CanonicalMinimizerPosIterator { 109 | assert_eq!( 110 | self.width % 2, 111 | 1, 112 | "width must be odd to break ties between multiple minimizers" 113 | ); 114 | CanonicalMinimizerPosIterator::new( 115 | seq, 116 | self.minimizer_size, 117 | self.width, 118 | self.hasher, 119 | self.encoding, 120 | ) 121 | } 122 | } 123 | 124 | const R: usize = 4; 125 | 126 | impl MinimizerBuilder { 127 | /// Sets up the `MinimizerBuilder` for mod-minimizers with default values: 128 | /// - minimizer_size = 21 129 | /// - width = 11 (31 - 21 + 1) 130 | /// - hasher = [`DefaultHashBuilder`] 131 | /// - encoding: A = `00`, C = `01`, G = `10`, T = `11` 132 | #[inline] 133 | pub fn new_mod() -> Self { 134 | Self::_new() 135 | } 136 | } 137 | 138 | impl MinimizerBuilder { 139 | /// Builds an iterator over the mod-minimizers and their positions in the given sequence. 140 | #[inline] 141 | pub fn iter(self, seq: &[u8]) -> ModSamplingIterator { 142 | assert!( 143 | self.minimizer_size >= R, 144 | "mod-minimizers require minimizer_size ≥ r={R}" 145 | ); 146 | ModSamplingIterator::new( 147 | seq, 148 | self.minimizer_size, 149 | self.width, 150 | R + ((self.minimizer_size - R) % self.width as usize), 151 | self.hasher, 152 | self.encoding, 153 | ) 154 | } 155 | 156 | /// Builds an iterator over the positions of the mod-minimizers in the given sequence. 
157 | #[inline] 158 | pub fn iter_pos(self, seq: &[u8]) -> ModSamplingPosIterator { 159 | assert!( 160 | self.minimizer_size >= R, 161 | "mod-minimizers require minimizer_size ≥ r={R}" 162 | ); 163 | ModSamplingPosIterator::new( 164 | seq, 165 | self.minimizer_size, 166 | self.width, 167 | R + ((self.minimizer_size - R) % self.width as usize), 168 | self.hasher, 169 | self.encoding, 170 | ) 171 | } 172 | } 173 | 174 | impl MinimizerBuilder { 175 | /// Builds an iterator over the canonical mod-minimizers and their positions in the given sequence with a boolean indicating a reverse complement. 176 | /// It requires an odd width to break ties between multiple minimizers. 177 | #[inline] 178 | pub fn iter(self, seq: &[u8]) -> CanonicalModSamplingIterator { 179 | assert!( 180 | self.minimizer_size >= R, 181 | "mod-minimizers require minimizer_size ≥ r={R}" 182 | ); 183 | assert_eq!( 184 | self.width % 2, 185 | 1, 186 | "width must be odd to break ties between multiple minimizers" 187 | ); 188 | CanonicalModSamplingIterator::new( 189 | seq, 190 | self.minimizer_size, 191 | self.width, 192 | R + ((self.minimizer_size - R) % self.width as usize), 193 | self.hasher, 194 | self.encoding, 195 | ) 196 | } 197 | 198 | /// Builds an iterator over the positions of the canonical mod-minimizers in the given sequence with a boolean indicating a reverse complement. 199 | /// It requires an odd width to break ties between multiple minimizers. 
200 | #[inline] 201 | pub fn iter_pos(self, seq: &[u8]) -> CanonicalModSamplingPosIterator { 202 | assert!( 203 | self.minimizer_size >= R, 204 | "mod-minimizers require minimizer_size ≥ r={R}" 205 | ); 206 | assert_eq!( 207 | self.width % 2, 208 | 1, 209 | "width must be odd to break ties between multiple minimizers" 210 | ); 211 | CanonicalModSamplingPosIterator::new( 212 | seq, 213 | self.minimizer_size, 214 | self.width, 215 | R + ((self.minimizer_size - R) % self.width as usize), 216 | self.hasher, 217 | self.encoding, 218 | ) 219 | } 220 | } 221 | 222 | impl MinimizerBuilder { 223 | fn _new() -> Self { 224 | let mut encoding = [0u8; 256]; 225 | encoding[b'A' as usize] = 0b00; 226 | encoding[b'a' as usize] = 0b00; 227 | encoding[b'C' as usize] = 0b01; 228 | encoding[b'c' as usize] = 0b01; 229 | encoding[b'G' as usize] = 0b10; 230 | encoding[b'g' as usize] = 0b10; 231 | encoding[b'T' as usize] = 0b11; 232 | encoding[b't' as usize] = 0b11; 233 | Self { 234 | minimizer_size: 21, 235 | width: 31 - 21 + 1, 236 | hasher: DefaultHashBuilder::default(), 237 | encoding, 238 | _marker: PhantomData, 239 | } 240 | } 241 | 242 | /// Sets the seed of the default hasher. 243 | pub fn seed(mut self, seed: u64) -> Self { 244 | self.hasher = DefaultHashBuilder::with_seed(seed); 245 | self 246 | } 247 | } 248 | 249 | impl 250 | MinimizerBuilder 251 | { 252 | /// Sets the size of the minimizers. 253 | pub fn minimizer_size(mut self, minimizer_size: usize) -> Self { 254 | let max_size = (T::zero().count_zeros() / 2) as usize; 255 | assert!( 256 | minimizer_size <= max_size, 257 | "With this integer type, minimizer_size must be ≤ {max_size}. Please select a smaller size or a larger type." 258 | ); 259 | self.minimizer_size = minimizer_size; 260 | self 261 | } 262 | 263 | /// Sets the width of the window. 264 | pub const fn width(mut self, width: u16) -> Self { 265 | self.width = width; 266 | self 267 | } 268 | 269 | /// Sets the hasher used to compute minimizers. 
270 | pub fn hasher(self, hasher: H) -> MinimizerBuilder { 271 | MinimizerBuilder:: { 272 | minimizer_size: self.minimizer_size, 273 | width: self.width, 274 | hasher, 275 | encoding: self.encoding, 276 | _marker: self._marker, 277 | } 278 | } 279 | 280 | /// Sets the binary encoding of the bases. 281 | pub fn encoding(mut self, a: u8, c: u8, g: u8, t: u8) -> Self { 282 | self.encoding[b'A' as usize] = a; 283 | self.encoding[b'a' as usize] = a; 284 | self.encoding[b'C' as usize] = c; 285 | self.encoding[b'c' as usize] = c; 286 | self.encoding[b'G' as usize] = g; 287 | self.encoding[b'g' as usize] = g; 288 | self.encoding[b'T' as usize] = t; 289 | self.encoding[b't' as usize] = t; 290 | self 291 | } 292 | 293 | /// Compute canonical minimizers. 294 | pub fn canonical(self) -> MinimizerBuilder { 295 | MinimizerBuilder:: { 296 | minimizer_size: self.minimizer_size, 297 | width: self.width, 298 | hasher: self.hasher, 299 | encoding: self.encoding, 300 | _marker: self._marker, 301 | } 302 | } 303 | 304 | /// Compute non-canonical minimizers. 305 | pub fn non_canonical(self) -> MinimizerBuilder { 306 | MinimizerBuilder:: { 307 | minimizer_size: self.minimizer_size, 308 | width: self.width, 309 | hasher: self.hasher, 310 | encoding: self.encoding, 311 | _marker: self._marker, 312 | } 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /src/iterator/minimizer.rs: -------------------------------------------------------------------------------- 1 | use core::cmp::min; 2 | use core::hash::{BuildHasher, Hash}; 3 | use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue, MinimizerQueue}; 4 | use num_traits::{AsPrimitive, PrimInt}; 5 | use std::collections::VecDeque; 6 | 7 | /// An iterator over the positions of the minimizers of a sequence. 
8 | pub struct MinimizerPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> { 9 | pub(crate) seq: &'a [u8], 10 | pub(crate) queue: ImplicitMinimizerQueue, 11 | pub(crate) width: usize, 12 | pub(crate) mmer: T, 13 | pub(crate) mmer_mask: T, 14 | pub(crate) encoding: [u8; 256], 15 | pub(crate) base_width: usize, 16 | pub(crate) min_pos: usize, 17 | pub(crate) end: usize, 18 | } 19 | 20 | impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerPosIterator<'a, T, S> { 21 | pub fn new( 22 | seq: &'a [u8], 23 | minimizer_size: usize, 24 | width: u16, 25 | hasher: S, 26 | encoding: [u8; 256], 27 | ) -> Self { 28 | let queue = ImplicitMinimizerQueue::with_hasher(width, hasher); 29 | let width = width as usize; 30 | Self { 31 | seq, 32 | queue, 33 | width, 34 | mmer: T::zero(), 35 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 36 | encoding, 37 | base_width: width + minimizer_size - 1, 38 | end: width + minimizer_size - 1, 39 | min_pos: 0, 40 | } 41 | } 42 | } 43 | 44 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerPosIterator<'a, T, S> 45 | where 46 | u8: AsPrimitive, 47 | { 48 | type Item = usize; 49 | 50 | fn next(&mut self) -> Option { 51 | if self.queue.is_empty() { 52 | if self.base_width > self.seq.len() { 53 | return None; 54 | } 55 | for i in 0..(self.base_width - self.width) { 56 | self.mmer = (self.mmer << 2) 57 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 58 | } 59 | for i in (self.base_width - self.width)..self.base_width { 60 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 61 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 62 | self.queue.insert(&self.mmer); 63 | } 64 | self.min_pos = self.queue.get_min_pos(); 65 | } else { 66 | let mut min_pos = self.min_pos; 67 | while self.end < self.seq.len() && min_pos == self.min_pos { 68 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 69 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] 
as usize) }.as_()); 70 | self.queue.insert(&self.mmer); 71 | self.end += 1; 72 | min_pos = self.end - self.base_width + self.queue.get_min_pos(); 73 | } 74 | if min_pos == self.min_pos { 75 | return None; 76 | } 77 | self.min_pos = min_pos; 78 | } 79 | Some(self.min_pos) 80 | } 81 | } 82 | 83 | /// An iterator over the minimizers of a sequence and their positions. 84 | pub struct MinimizerIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> { 85 | pub(crate) seq: &'a [u8], 86 | pub(crate) queue: MinimizerQueue, 87 | pub(crate) width: usize, 88 | pub(crate) mmer: T, 89 | pub(crate) mmer_mask: T, 90 | pub(crate) encoding: [u8; 256], 91 | pub(crate) base_width: usize, 92 | pub(crate) min_pos: (T, usize), 93 | pub(crate) end: usize, 94 | } 95 | 96 | impl<'a, T: PrimInt + Hash, S: BuildHasher> MinimizerIterator<'a, T, S> { 97 | pub fn new( 98 | seq: &'a [u8], 99 | minimizer_size: usize, 100 | width: u16, 101 | hasher: S, 102 | encoding: [u8; 256], 103 | ) -> Self { 104 | let queue = MinimizerQueue::with_hasher(width, hasher); 105 | let width = width as usize; 106 | Self { 107 | seq, 108 | queue, 109 | width, 110 | mmer: T::zero(), 111 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 112 | encoding, 113 | base_width: width + minimizer_size - 1, 114 | end: width + minimizer_size - 1, 115 | min_pos: (T::zero(), 0), 116 | } 117 | } 118 | } 119 | 120 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for MinimizerIterator<'a, T, S> 121 | where 122 | u8: AsPrimitive, 123 | { 124 | type Item = (T, usize); 125 | 126 | fn next(&mut self) -> Option { 127 | if self.queue.is_empty() { 128 | if self.base_width > self.seq.len() { 129 | return None; 130 | } 131 | for i in 0..(self.base_width - self.width) { 132 | self.mmer = (self.mmer << 2) 133 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 134 | } 135 | for i in (self.base_width - self.width)..self.base_width { 136 | self.mmer = ((self.mmer << 2) & 
self.mmer_mask) 137 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 138 | self.queue.insert(self.mmer); 139 | } 140 | self.min_pos = self.queue.get_min_pos(); 141 | } else { 142 | let mut min_pos = self.min_pos; 143 | while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 { 144 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 145 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 146 | self.queue.insert(self.mmer); 147 | self.end += 1; 148 | let _min_pos = self.queue.get_min_pos(); 149 | min_pos = (_min_pos.0, self.end - self.base_width + _min_pos.1); 150 | } 151 | if min_pos.1 == self.min_pos.1 { 152 | return None; 153 | } 154 | self.min_pos = min_pos; 155 | } 156 | Some(self.min_pos) 157 | } 158 | } 159 | 160 | /// An iterator over the positions of the canonical minimizers of a sequence with a boolean indicating a reverse complement. 161 | /// It requires an odd width to break ties between multiple minimizers. 162 | pub struct CanonicalMinimizerPosIterator< 163 | 'a, 164 | T: PrimInt + Hash = u64, 165 | S: BuildHasher = DefaultHashBuilder, 166 | > { 167 | pub(crate) seq: &'a [u8], 168 | pub(crate) queue: ImplicitMinimizerQueue, 169 | pub(crate) width: usize, 170 | pub(crate) mmer: T, 171 | pub(crate) rc_mmer: T, 172 | pub(crate) mmer_mask: T, 173 | pub(crate) rc_mmer_shift: usize, 174 | pub(crate) is_rc: VecDeque, 175 | pub(crate) encoding: [u8; 256], 176 | pub(crate) rc_encoding: [u8; 256], 177 | pub(crate) base_width: usize, 178 | pub(crate) min_pos: (usize, bool), 179 | pub(crate) end: usize, 180 | } 181 | 182 | impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerPosIterator<'a, T, S> { 183 | pub fn new( 184 | seq: &'a [u8], 185 | minimizer_size: usize, 186 | width: u16, 187 | hasher: S, 188 | encoding: [u8; 256], 189 | ) -> Self { 190 | let queue = ImplicitMinimizerQueue::with_hasher(width, hasher); 191 | let width = width as usize; 192 | assert_eq!( 193 | width % 2, 194 | 1, 195 | 
"width must be odd to break ties between multiple minimizers" 196 | ); 197 | let mut rc_encoding = encoding; 198 | rc_encoding.swap(b'A' as usize, b'T' as usize); 199 | rc_encoding.swap(b'a' as usize, b't' as usize); 200 | rc_encoding.swap(b'C' as usize, b'G' as usize); 201 | rc_encoding.swap(b'c' as usize, b'g' as usize); 202 | Self { 203 | seq, 204 | queue, 205 | width, 206 | mmer: T::zero(), 207 | rc_mmer: T::zero(), 208 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 209 | rc_mmer_shift: 2 * (minimizer_size - 1), 210 | is_rc: VecDeque::with_capacity(width), 211 | encoding, 212 | rc_encoding, 213 | base_width: width + minimizer_size - 1, 214 | end: width + minimizer_size - 1, 215 | min_pos: (0, false), 216 | } 217 | } 218 | 219 | #[inline] 220 | fn window_not_canonical(&self) -> bool { 221 | self.is_rc[self.width / 2] 222 | } 223 | } 224 | 225 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator 226 | for CanonicalMinimizerPosIterator<'a, T, S> 227 | where 228 | u8: AsPrimitive, 229 | { 230 | type Item = (usize, bool); 231 | 232 | fn next(&mut self) -> Option { 233 | if self.queue.is_empty() { 234 | if self.base_width > self.seq.len() { 235 | return None; 236 | } 237 | for i in 0..(self.base_width - self.width) { 238 | self.mmer = (self.mmer << 2) 239 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 240 | self.rc_mmer = (self.rc_mmer >> 2) 241 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 242 | << self.rc_mmer_shift); 243 | } 244 | for i in (self.base_width - self.width)..self.base_width { 245 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 246 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 247 | self.rc_mmer = (self.rc_mmer >> 2) 248 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 249 | << self.rc_mmer_shift); 250 | let canonical_mmer = min(self.mmer, self.rc_mmer); 251 | self.queue.insert(&canonical_mmer); 252 | 
self.is_rc.push_back(canonical_mmer == self.rc_mmer); 253 | } 254 | let pos = if self.queue.multiple_mins() { 255 | let (pos, tie) = self.queue.get_inner_min_pos(); 256 | tie.map_or(pos, |alt| { 257 | if self.window_not_canonical() { 258 | alt 259 | } else { 260 | pos 261 | } 262 | }) 263 | } else { 264 | self.queue.get_min_pos() 265 | }; 266 | self.min_pos = (pos, self.is_rc[pos]) 267 | } else { 268 | let mut min_pos = self.min_pos; 269 | while self.end < self.seq.len() && min_pos == self.min_pos { 270 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 271 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 272 | self.rc_mmer = (self.rc_mmer >> 2) 273 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) } 274 | .as_() 275 | << self.rc_mmer_shift); 276 | let canonical_mmer = min(self.mmer, self.rc_mmer); 277 | self.queue.insert(&canonical_mmer); 278 | self.is_rc.pop_front(); 279 | self.is_rc.push_back(canonical_mmer == self.rc_mmer); 280 | self.end += 1; 281 | let pos = if self.queue.multiple_mins() { 282 | let (pos, tie) = self.queue.get_inner_min_pos(); 283 | tie.map_or(pos, |alt| { 284 | if self.window_not_canonical() { 285 | alt 286 | } else { 287 | pos 288 | } 289 | }) 290 | } else { 291 | self.queue.get_min_pos() 292 | }; 293 | min_pos = (self.end - self.base_width + pos, self.is_rc[pos]); 294 | } 295 | if min_pos == self.min_pos { 296 | return None; 297 | } 298 | self.min_pos = min_pos; 299 | } 300 | Some(self.min_pos) 301 | } 302 | } 303 | 304 | /// An iterator over the canonical minimizers of a sequence and their positions with a boolean indicating a reverse complement. 305 | /// It requires an odd width to break ties between multiple minimizers. 
306 | pub struct CanonicalMinimizerIterator< 307 | 'a, 308 | T: PrimInt + Hash = u64, 309 | S: BuildHasher = DefaultHashBuilder, 310 | > { 311 | pub(crate) seq: &'a [u8], 312 | pub(crate) queue: MinimizerQueue, 313 | pub(crate) width: usize, 314 | pub(crate) mmer: T, 315 | pub(crate) rc_mmer: T, 316 | pub(crate) mmer_mask: T, 317 | pub(crate) rc_mmer_shift: usize, 318 | pub(crate) is_rc: VecDeque, 319 | pub(crate) encoding: [u8; 256], 320 | pub(crate) rc_encoding: [u8; 256], 321 | pub(crate) base_width: usize, 322 | pub(crate) min_pos: (T, usize, bool), 323 | pub(crate) end: usize, 324 | } 325 | 326 | impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalMinimizerIterator<'a, T, S> { 327 | pub fn new( 328 | seq: &'a [u8], 329 | minimizer_size: usize, 330 | width: u16, 331 | hasher: S, 332 | encoding: [u8; 256], 333 | ) -> Self { 334 | let queue = MinimizerQueue::with_hasher(width, hasher); 335 | let width = width as usize; 336 | assert_eq!( 337 | width % 2, 338 | 1, 339 | "width must be odd to break ties between multiple minimizers" 340 | ); 341 | let mut rc_encoding = encoding; 342 | rc_encoding.swap(b'A' as usize, b'T' as usize); 343 | rc_encoding.swap(b'a' as usize, b't' as usize); 344 | rc_encoding.swap(b'C' as usize, b'G' as usize); 345 | rc_encoding.swap(b'c' as usize, b'g' as usize); 346 | Self { 347 | seq, 348 | queue, 349 | width, 350 | mmer: T::zero(), 351 | rc_mmer: T::zero(), 352 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 353 | rc_mmer_shift: 2 * (minimizer_size - 1), 354 | is_rc: VecDeque::with_capacity(width), 355 | encoding, 356 | rc_encoding, 357 | base_width: width + minimizer_size - 1, 358 | end: width + minimizer_size - 1, 359 | min_pos: (T::zero(), 0, false), 360 | } 361 | } 362 | 363 | #[inline] 364 | fn window_not_canonical(&self) -> bool { 365 | self.is_rc[self.width / 2] 366 | } 367 | } 368 | 369 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator 370 | for CanonicalMinimizerIterator<'a, T, S> 371 | where 372 | 
u8: AsPrimitive, 373 | { 374 | type Item = (T, usize, bool); 375 | 376 | fn next(&mut self) -> Option { 377 | if self.queue.is_empty() { 378 | if self.base_width > self.seq.len() { 379 | return None; 380 | } 381 | for i in 0..(self.base_width - self.width) { 382 | self.mmer = (self.mmer << 2) 383 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 384 | self.rc_mmer = (self.rc_mmer >> 2) 385 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 386 | << self.rc_mmer_shift); 387 | } 388 | for i in (self.base_width - self.width)..self.base_width { 389 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 390 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 391 | self.rc_mmer = (self.rc_mmer >> 2) 392 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 393 | << self.rc_mmer_shift); 394 | let canonical_mmer = min(self.mmer, self.rc_mmer); 395 | self.queue.insert(canonical_mmer); 396 | self.is_rc.push_back(canonical_mmer == self.rc_mmer); 397 | } 398 | let _min_pos = if self.queue.multiple_mins() { 399 | let (x, pos, tie) = self.queue.get_inner_min_pos(); 400 | tie.map_or((x, pos), |alt| { 401 | if self.window_not_canonical() { 402 | alt 403 | } else { 404 | (x, pos) 405 | } 406 | }) 407 | } else { 408 | self.queue.get_min_pos() 409 | }; 410 | self.min_pos = (_min_pos.0, _min_pos.1, self.is_rc[_min_pos.1]); 411 | } else { 412 | let mut min_pos = self.min_pos; 413 | while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 { 414 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 415 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 416 | self.rc_mmer = (self.rc_mmer >> 2) 417 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) } 418 | .as_() 419 | << self.rc_mmer_shift); 420 | let canonical_mmer = min(self.mmer, self.rc_mmer); 421 | self.queue.insert(canonical_mmer); 422 | self.is_rc.pop_front(); 423 | 
self.is_rc.push_back(canonical_mmer == self.rc_mmer); 424 | self.end += 1; 425 | let _min_pos = if self.queue.multiple_mins() { 426 | let (x, pos, tie) = self.queue.get_inner_min_pos(); 427 | tie.map_or((x, pos), |alt| { 428 | if self.window_not_canonical() { 429 | alt 430 | } else { 431 | (x, pos) 432 | } 433 | }) 434 | } else { 435 | self.queue.get_min_pos() 436 | }; 437 | min_pos = ( 438 | _min_pos.0, 439 | self.end - self.base_width + _min_pos.1, 440 | self.is_rc[_min_pos.1], 441 | ); 442 | } 443 | if min_pos.1 == self.min_pos.1 { 444 | return None; 445 | } 446 | self.min_pos = min_pos; 447 | } 448 | Some(self.min_pos) 449 | } 450 | } 451 | -------------------------------------------------------------------------------- /src/iterator/mod.rs: -------------------------------------------------------------------------------- 1 | //! Iterators over minimizers. 2 | 3 | mod minimizer; 4 | mod mod_sampling; 5 | 6 | pub use minimizer::*; 7 | pub use mod_sampling::*; 8 | -------------------------------------------------------------------------------- /src/iterator/mod_sampling.rs: -------------------------------------------------------------------------------- 1 | use core::cmp::min; 2 | use core::hash::{BuildHasher, Hash}; 3 | use minimizer_queue::{DefaultHashBuilder, ImplicitMinimizerQueue}; 4 | use num_traits::{AsPrimitive, PrimInt}; 5 | use std::collections::VecDeque; 6 | use strength_reduce::StrengthReducedU16; 7 | 8 | /// An iterator over the positions of the mod-sampling minimizers of a sequence. 
9 | pub struct ModSamplingPosIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> 10 | { 11 | pub(crate) seq: &'a [u8], 12 | pub(crate) queue: ImplicitMinimizerQueue, 13 | pub(crate) width_m: StrengthReducedU16, 14 | pub(crate) width_t: usize, 15 | pub(crate) tmer: T, 16 | pub(crate) tmer_mask: T, 17 | pub(crate) encoding: [u8; 256], 18 | pub(crate) base_width: usize, 19 | pub(crate) min_pos: usize, 20 | pub(crate) end: usize, 21 | } 22 | 23 | impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingPosIterator<'a, T, S> { 24 | pub fn new( 25 | seq: &'a [u8], 26 | minimizer_size: usize, 27 | width: u16, 28 | t: usize, 29 | hasher: S, 30 | encoding: [u8; 256], 31 | ) -> Self { 32 | let width_m = StrengthReducedU16::new(width); 33 | let width_t = width + (minimizer_size - t) as u16; 34 | let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher); 35 | let width_t = width_t as usize; 36 | Self { 37 | seq, 38 | queue, 39 | width_m, 40 | width_t, 41 | tmer: T::zero(), 42 | tmer_mask: (T::one() << (2 * t)) - T::one(), 43 | encoding, 44 | base_width: width_t + t - 1, 45 | end: width_t + t - 1, 46 | min_pos: 0, 47 | } 48 | } 49 | } 50 | 51 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingPosIterator<'a, T, S> 52 | where 53 | u8: AsPrimitive, 54 | { 55 | type Item = usize; 56 | 57 | fn next(&mut self) -> Option { 58 | if self.queue.is_empty() { 59 | if self.base_width > self.seq.len() { 60 | return None; 61 | } 62 | for i in 0..(self.base_width - self.width_t) { 63 | self.tmer = (self.tmer << 2) 64 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 65 | } 66 | for i in (self.base_width - self.width_t)..self.base_width { 67 | self.tmer = ((self.tmer << 2) & self.tmer_mask) 68 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 69 | self.queue.insert(&self.tmer); 70 | } 71 | self.min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize; 72 | } else { 73 | let mut 
min_pos = self.min_pos; 74 | while self.end < self.seq.len() && min_pos == self.min_pos { 75 | self.tmer = ((self.tmer << 2) & self.tmer_mask) 76 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 77 | self.queue.insert(&self.tmer); 78 | self.end += 1; 79 | min_pos = self.end - self.base_width 80 | + (self.queue.get_min_pos() as u16 % self.width_m) as usize; 81 | } 82 | if min_pos == self.min_pos { 83 | return None; 84 | } 85 | self.min_pos = min_pos; 86 | } 87 | Some(self.min_pos) 88 | } 89 | } 90 | 91 | /// An iterator over the mod-sampling minimizers of a sequence and their positions. 92 | pub struct ModSamplingIterator<'a, T: PrimInt + Hash = u64, S: BuildHasher = DefaultHashBuilder> { 93 | pub(crate) seq: &'a [u8], 94 | pub(crate) queue: ImplicitMinimizerQueue, 95 | pub(crate) width_m: StrengthReducedU16, 96 | pub(crate) width_t: usize, 97 | pub(crate) mmer: T, 98 | pub(crate) mmer_mask: T, 99 | pub(crate) tmer_mask: T, 100 | pub(crate) canon_mmers: VecDeque, 101 | pub(crate) encoding: [u8; 256], 102 | pub(crate) base_width: usize, 103 | pub(crate) min_pos: (T, usize), 104 | pub(crate) end: usize, 105 | } 106 | 107 | impl<'a, T: PrimInt + Hash, S: BuildHasher> ModSamplingIterator<'a, T, S> { 108 | pub fn new( 109 | seq: &'a [u8], 110 | minimizer_size: usize, 111 | width: u16, 112 | t: usize, 113 | hasher: S, 114 | encoding: [u8; 256], 115 | ) -> Self { 116 | let width_m = StrengthReducedU16::new(width); 117 | let width_t = width + (minimizer_size - t) as u16; 118 | let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher); 119 | let width_t = width_t as usize; 120 | Self { 121 | seq, 122 | queue, 123 | width_m, 124 | width_t, 125 | mmer: T::zero(), 126 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 127 | tmer_mask: (T::one() << (2 * t)) - T::one(), 128 | canon_mmers: VecDeque::with_capacity(width as usize), 129 | encoding, 130 | base_width: width_t + t - 1, 131 | end: width_t + t - 1, 132 | min_pos: (T::zero(), 
0), 133 | } 134 | } 135 | } 136 | 137 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator for ModSamplingIterator<'a, T, S> 138 | where 139 | u8: AsPrimitive, 140 | { 141 | type Item = (T, usize); 142 | 143 | fn next(&mut self) -> Option { 144 | if self.queue.is_empty() { 145 | if self.base_width > self.seq.len() { 146 | return None; 147 | } 148 | let width_m = self.width_m.get() as usize; 149 | for i in 0..(self.base_width - self.width_t) { 150 | self.mmer = (self.mmer << 2) 151 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 152 | } 153 | for i in (self.base_width - self.width_t)..(self.base_width - width_m) { 154 | self.mmer = (self.mmer << 2) 155 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 156 | self.queue.insert(&(self.mmer & self.tmer_mask)); 157 | } 158 | for i in (self.base_width - width_m)..self.base_width { 159 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 160 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 161 | self.queue.insert(&(self.mmer & self.tmer_mask)); 162 | self.canon_mmers.push_back(self.mmer); 163 | } 164 | let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize; 165 | self.min_pos = (self.canon_mmers[_min_pos], _min_pos); 166 | } else { 167 | let mut min_pos = self.min_pos; 168 | while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 { 169 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 170 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 171 | self.queue.insert(&(self.mmer & self.tmer_mask)); 172 | self.canon_mmers.pop_front(); 173 | self.canon_mmers.push_back(self.mmer); 174 | self.end += 1; 175 | let _min_pos = (self.queue.get_min_pos() as u16 % self.width_m) as usize; 176 | min_pos = ( 177 | self.canon_mmers[_min_pos], 178 | self.end - self.base_width + _min_pos, 179 | ); 180 | } 181 | if min_pos.1 == self.min_pos.1 { 182 | return None; 183 | } 184 | self.min_pos = min_pos; 185 | } 
186 | Some(self.min_pos) 187 | } 188 | } 189 | 190 | /// An iterator over the positions of the canonical mod-sampling minimizers of a sequence with a boolean indicating a reverse complement. 191 | /// It requires an odd width to break ties between multiple minimizers. 192 | pub struct CanonicalModSamplingPosIterator< 193 | 'a, 194 | T: PrimInt + Hash = u64, 195 | S: BuildHasher = DefaultHashBuilder, 196 | > { 197 | pub(crate) seq: &'a [u8], 198 | pub(crate) queue: ImplicitMinimizerQueue, 199 | pub(crate) width_m: StrengthReducedU16, 200 | pub(crate) width_t: usize, 201 | pub(crate) mmer: T, 202 | pub(crate) rc_mmer: T, 203 | pub(crate) mmer_mask: T, 204 | pub(crate) tmer_mask: T, 205 | pub(crate) rc_mmer_shift: usize, 206 | pub(crate) rc_tmer_shift: usize, 207 | pub(crate) is_rc_m: VecDeque, 208 | pub(crate) encoding: [u8; 256], 209 | pub(crate) rc_encoding: [u8; 256], 210 | pub(crate) base_width: usize, 211 | pub(crate) min_pos: (usize, bool), 212 | pub(crate) end: usize, 213 | } 214 | 215 | impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingPosIterator<'a, T, S> { 216 | pub fn new( 217 | seq: &'a [u8], 218 | minimizer_size: usize, 219 | width: u16, 220 | t: usize, 221 | hasher: S, 222 | encoding: [u8; 256], 223 | ) -> Self { 224 | let width_m = StrengthReducedU16::new(width); 225 | let width_t = width + (minimizer_size - t) as u16; 226 | assert_eq!( 227 | width_t % width_m, 228 | 0, 229 | "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers" 230 | ); 231 | assert_eq!( 232 | width % 2, 233 | 1, 234 | "width must be odd to break ties between multiple minimizers" 235 | ); 236 | let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher); 237 | let width_t = width_t as usize; 238 | let mut rc_encoding = encoding; 239 | rc_encoding.swap(b'A' as usize, b'T' as usize); 240 | rc_encoding.swap(b'a' as usize, b't' as usize); 241 | rc_encoding.swap(b'C' as usize, b'G' as usize); 242 | rc_encoding.swap(b'c' as usize, b'g' 
as usize); 243 | Self { 244 | seq, 245 | queue, 246 | width_m, 247 | width_t, 248 | mmer: T::zero(), 249 | rc_mmer: T::zero(), 250 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 251 | tmer_mask: (T::one() << (2 * t)) - T::one(), 252 | rc_mmer_shift: 2 * (minimizer_size - 1), 253 | rc_tmer_shift: 2 * (minimizer_size - t), 254 | is_rc_m: VecDeque::with_capacity(width as usize), 255 | encoding, 256 | rc_encoding, 257 | base_width: width_t + t - 1, 258 | end: width_t + t - 1, 259 | min_pos: (0, false), 260 | } 261 | } 262 | 263 | #[inline] 264 | fn window_not_canonical(&self) -> bool { 265 | let mid = self.is_rc_m.len() / 2; 266 | self.is_rc_m[mid] 267 | } 268 | } 269 | 270 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator 271 | for CanonicalModSamplingPosIterator<'a, T, S> 272 | where 273 | u8: AsPrimitive, 274 | { 275 | type Item = (usize, bool); 276 | 277 | fn next(&mut self) -> Option { 278 | if self.queue.is_empty() { 279 | if self.base_width > self.seq.len() { 280 | return None; 281 | } 282 | let width_m = self.width_m.get() as usize; 283 | for i in 0..(self.base_width - self.width_t) { 284 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 285 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 286 | self.rc_mmer = (self.rc_mmer >> 2) 287 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 288 | << self.rc_mmer_shift); 289 | } 290 | for i in (self.base_width - self.width_t)..(self.base_width - width_m) { 291 | self.mmer = (self.mmer << 2) 292 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 293 | self.rc_mmer = (self.rc_mmer >> 2) 294 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 295 | << self.rc_mmer_shift); 296 | let tmer = self.mmer & self.tmer_mask; 297 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 298 | let canonical_tmer = min(tmer, rc_tmer); 299 | self.queue.insert(&canonical_tmer); 300 | } 301 | for i in (self.base_width - 
width_m)..self.base_width { 302 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 303 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 304 | self.rc_mmer = (self.rc_mmer >> 2) 305 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 306 | << self.rc_mmer_shift); 307 | let tmer = self.mmer & self.tmer_mask; 308 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 309 | let canonical_tmer = min(tmer, rc_tmer); 310 | self.queue.insert(&canonical_tmer); 311 | self.is_rc_m.push_back(self.rc_mmer <= self.mmer); 312 | } 313 | let pos = if self.queue.multiple_mins() { 314 | let (pos, tie) = self.queue.get_inner_min_pos(); 315 | tie.map_or(pos, |alt| { 316 | if self.window_not_canonical() { 317 | alt 318 | } else { 319 | pos 320 | } 321 | }) 322 | } else { 323 | self.queue.get_min_pos() 324 | }; 325 | let pos = (pos as u16 % self.width_m) as usize; 326 | self.min_pos = (pos, self.is_rc_m[pos]); 327 | } else { 328 | let mut min_pos = self.min_pos; 329 | while self.end < self.seq.len() && min_pos.0 == self.min_pos.0 { 330 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 331 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 332 | self.rc_mmer = (self.rc_mmer >> 2) 333 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) } 334 | .as_() 335 | << self.rc_mmer_shift); 336 | let tmer = self.mmer & self.tmer_mask; 337 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 338 | let canonical_tmer = min(tmer, rc_tmer); 339 | self.queue.insert(&canonical_tmer); 340 | self.is_rc_m.pop_front(); 341 | self.is_rc_m.push_back(self.rc_mmer <= self.mmer); 342 | self.end += 1; 343 | let pos = if self.queue.multiple_mins() { 344 | let (pos, tie) = self.queue.get_inner_min_pos(); 345 | tie.map_or(pos, |alt| { 346 | if self.window_not_canonical() { 347 | alt 348 | } else { 349 | pos 350 | } 351 | }) 352 | } else { 353 | self.queue.get_min_pos() 354 | }; 355 | let pos = (pos as u16 % self.width_m) as 
usize; 356 | min_pos = (self.end - self.base_width + pos, self.is_rc_m[pos]); 357 | } 358 | if min_pos.0 == self.min_pos.0 { 359 | return None; 360 | } 361 | self.min_pos = min_pos; 362 | } 363 | Some(self.min_pos) 364 | } 365 | } 366 | 367 | /// An iterator over the canonical mod-sampling minimizers of a sequence and their positions with a boolean indicating a reverse complement. 368 | /// It requires an odd width to break ties between multiple minimizers. 369 | pub struct CanonicalModSamplingIterator< 370 | 'a, 371 | T: PrimInt + Hash = u64, 372 | S: BuildHasher = DefaultHashBuilder, 373 | > { 374 | pub(crate) seq: &'a [u8], 375 | pub(crate) queue: ImplicitMinimizerQueue, 376 | pub(crate) width_m: StrengthReducedU16, 377 | pub(crate) width_t: usize, 378 | pub(crate) mmer: T, 379 | pub(crate) rc_mmer: T, 380 | pub(crate) mmer_mask: T, 381 | pub(crate) tmer_mask: T, 382 | pub(crate) rc_mmer_shift: usize, 383 | pub(crate) rc_tmer_shift: usize, 384 | pub(crate) canon_mmers: VecDeque<(T, bool)>, 385 | pub(crate) encoding: [u8; 256], 386 | pub(crate) rc_encoding: [u8; 256], 387 | pub(crate) base_width: usize, 388 | pub(crate) min_pos: (T, usize, bool), 389 | pub(crate) end: usize, 390 | } 391 | 392 | impl<'a, T: PrimInt + Hash, S: BuildHasher> CanonicalModSamplingIterator<'a, T, S> { 393 | pub fn new( 394 | seq: &'a [u8], 395 | minimizer_size: usize, 396 | width: u16, 397 | t: usize, 398 | hasher: S, 399 | encoding: [u8; 256], 400 | ) -> Self { 401 | let width_m = StrengthReducedU16::new(width); 402 | let width_t = width + (minimizer_size - t) as u16; 403 | assert_eq!( 404 | width_t % width_m, 405 | 0, 406 | "(minimizer_size - t) must be a multiple of the width to preserve canonical minimizers" 407 | ); 408 | assert_eq!( 409 | width % 2, 410 | 1, 411 | "width must be odd to break ties between multiple minimizers" 412 | ); 413 | let queue = ImplicitMinimizerQueue::with_hasher(width_t, hasher); 414 | let width_t = width_t as usize; 415 | let mut rc_encoding = encoding; 
416 | rc_encoding.swap(b'A' as usize, b'T' as usize); 417 | rc_encoding.swap(b'a' as usize, b't' as usize); 418 | rc_encoding.swap(b'C' as usize, b'G' as usize); 419 | rc_encoding.swap(b'c' as usize, b'g' as usize); 420 | Self { 421 | seq, 422 | queue, 423 | width_m, 424 | width_t, 425 | mmer: T::zero(), 426 | rc_mmer: T::zero(), 427 | mmer_mask: (T::one() << (2 * minimizer_size)) - T::one(), 428 | tmer_mask: (T::one() << (2 * t)) - T::one(), 429 | rc_mmer_shift: 2 * (minimizer_size - 1), 430 | rc_tmer_shift: 2 * (minimizer_size - t), 431 | canon_mmers: VecDeque::with_capacity(width as usize), 432 | encoding, 433 | rc_encoding, 434 | base_width: width_t + t - 1, 435 | end: width_t + t - 1, 436 | min_pos: (T::zero(), 0, false), 437 | } 438 | } 439 | 440 | #[inline] 441 | fn window_not_canonical(&self) -> bool { 442 | let mid = self.canon_mmers.len() / 2; 443 | self.canon_mmers[mid].1 444 | } 445 | } 446 | 447 | impl<'a, T: PrimInt + Hash + 'static, S: BuildHasher> Iterator 448 | for CanonicalModSamplingIterator<'a, T, S> 449 | where 450 | u8: AsPrimitive, 451 | { 452 | type Item = (T, usize, bool); 453 | 454 | fn next(&mut self) -> Option { 455 | if self.queue.is_empty() { 456 | if self.base_width > self.seq.len() { 457 | return None; 458 | } 459 | let width_m = self.width_m.get() as usize; 460 | for i in 0..(self.base_width - self.width_t) { 461 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 462 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 463 | self.rc_mmer = (self.rc_mmer >> 2) 464 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 465 | << self.rc_mmer_shift); 466 | } 467 | for i in (self.base_width - self.width_t)..(self.base_width - width_m) { 468 | self.mmer = (self.mmer << 2) 469 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 470 | self.rc_mmer = (self.rc_mmer >> 2) 471 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 472 | << self.rc_mmer_shift); 473 
| let tmer = self.mmer & self.tmer_mask; 474 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 475 | let canonical_tmer = min(tmer, rc_tmer); 476 | self.queue.insert(&canonical_tmer); 477 | } 478 | for i in (self.base_width - width_m)..self.base_width { 479 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 480 | | (unsafe { self.encoding.get_unchecked(self.seq[i] as usize) }.as_()); 481 | self.rc_mmer = (self.rc_mmer >> 2) 482 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[i] as usize) }.as_() 483 | << self.rc_mmer_shift); 484 | let tmer = self.mmer & self.tmer_mask; 485 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 486 | let canonical_tmer = min(tmer, rc_tmer); 487 | self.queue.insert(&canonical_tmer); 488 | let canonical_mmer = min(self.mmer, self.rc_mmer); 489 | self.canon_mmers 490 | .push_back((canonical_mmer, canonical_mmer == self.rc_mmer)); 491 | } 492 | let pos = if self.queue.multiple_mins() { 493 | let (pos, tie) = self.queue.get_inner_min_pos(); 494 | tie.map_or(pos, |alt| { 495 | if self.window_not_canonical() { 496 | alt 497 | } else { 498 | pos 499 | } 500 | }) 501 | } else { 502 | self.queue.get_min_pos() 503 | }; 504 | let pos = (pos as u16 % self.width_m) as usize; 505 | let (mmer, rc) = self.canon_mmers[pos]; 506 | self.min_pos = (mmer, pos, rc); 507 | } else { 508 | let mut min_pos = self.min_pos; 509 | while self.end < self.seq.len() && min_pos.1 == self.min_pos.1 { 510 | self.mmer = ((self.mmer << 2) & self.mmer_mask) 511 | | (unsafe { self.encoding.get_unchecked(self.seq[self.end] as usize) }.as_()); 512 | self.rc_mmer = (self.rc_mmer >> 2) 513 | | (unsafe { self.rc_encoding.get_unchecked(self.seq[self.end] as usize) } 514 | .as_() 515 | << self.rc_mmer_shift); 516 | let tmer = self.mmer & self.tmer_mask; 517 | let rc_tmer = self.rc_mmer >> self.rc_tmer_shift; 518 | let canonical_tmer = min(tmer, rc_tmer); 519 | self.queue.insert(&canonical_tmer); 520 | let canonical_mmer = min(self.mmer, self.rc_mmer); 521 | 
self.canon_mmers.pop_front(); 522 | self.canon_mmers 523 | .push_back((canonical_mmer, canonical_mmer == self.rc_mmer)); 524 | self.end += 1; 525 | let pos = if self.queue.multiple_mins() { 526 | let (pos, tie) = self.queue.get_inner_min_pos(); 527 | tie.map_or(pos, |alt| { 528 | if self.window_not_canonical() { 529 | alt 530 | } else { 531 | pos 532 | } 533 | }) 534 | } else { 535 | self.queue.get_min_pos() 536 | }; 537 | let pos = (pos as u16 % self.width_m) as usize; 538 | let (mmer, rc) = self.canon_mmers[pos]; 539 | min_pos = (mmer, self.end - self.base_width + pos, rc); 540 | } 541 | if min_pos.1 == self.min_pos.1 { 542 | return None; 543 | } 544 | self.min_pos = min_pos; 545 | } 546 | Some(self.min_pos) 547 | } 548 | } 549 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod algorithm; 2 | mod builder; 3 | pub mod iterator; 4 | 5 | pub use builder::MinimizerBuilder; 6 | pub use minimizer_queue::DefaultHashBuilder; 7 | 8 | #[cfg(test)] 9 | mod tests { 10 | use super::*; 11 | use biotest::Format; 12 | use nohash_hasher::BuildNoHashHasher; 13 | 14 | #[test] 15 | fn test_minimizer_iter() { 16 | let seq = b"TGATTGCACAATC"; 17 | let minimizer_size = 3; 18 | let width = 4; 19 | let hasher = BuildNoHashHasher::::default(); 20 | let mut min_iter = MinimizerBuilder::new() 21 | .minimizer_size(minimizer_size) 22 | .width(width) 23 | .hasher(hasher) 24 | .iter(seq); 25 | 26 | assert_eq!(min_iter.next(), Some((0b001111, 2))); // ATT 27 | assert_eq!(min_iter.next(), Some((0b010001, 6))); // CAC 28 | assert_eq!(min_iter.next(), Some((0b000100, 7))); // ACA 29 | assert_eq!(min_iter.next(), Some((0b000011, 9))); // AAT 30 | assert_eq!(min_iter.next(), None); 31 | } 32 | 33 | #[test] 34 | fn test_minimizer_iter_pos() { 35 | let seq = b"TGATTGCACAATC"; 36 | let minimizer_size = 3; 37 | let width = 4; 38 | let hasher = 
BuildNoHashHasher::::default(); 39 | let mut min_iter = MinimizerBuilder::::new() 40 | .minimizer_size(minimizer_size) 41 | .width(width) 42 | .hasher(hasher) 43 | .iter_pos(seq); 44 | 45 | assert_eq!(min_iter.next(), Some(2)); // ATT 46 | assert_eq!(min_iter.next(), Some(6)); // CAC 47 | assert_eq!(min_iter.next(), Some(7)); // ACA 48 | assert_eq!(min_iter.next(), Some(9)); // AAT 49 | assert_eq!(min_iter.next(), None); 50 | } 51 | 52 | #[test] 53 | fn test_mod_minimizer_iter() { 54 | let seq = b"TGATTGCACAATC"; 55 | let minimizer_size = 4; 56 | let width = 4; 57 | let hasher = BuildNoHashHasher::::default(); 58 | let mut min_iter = MinimizerBuilder::new_mod() 59 | .minimizer_size(minimizer_size) 60 | .width(width) 61 | .hasher(hasher) 62 | .iter(seq); 63 | 64 | assert_eq!(min_iter.next(), Some((0b00111110, 2))); // ATTG 65 | assert_eq!(min_iter.next(), Some((0b01000100, 6))); // CACA 66 | assert_eq!(min_iter.next(), Some((0b00010000, 7))); // ACAA 67 | assert_eq!(min_iter.next(), Some((0b00001101, 9))); // AATC 68 | assert_eq!(min_iter.next(), None); 69 | } 70 | 71 | #[test] 72 | fn test_mod_minimizer_iter_pos() { 73 | let seq = b"TGATTGCACAATC"; 74 | let minimizer_size = 4; 75 | let width = 4; 76 | let hasher = BuildNoHashHasher::::default(); 77 | let mut min_iter = MinimizerBuilder::::new_mod() 78 | .minimizer_size(minimizer_size) 79 | .width(width) 80 | .hasher(hasher) 81 | .iter_pos(seq); 82 | 83 | assert_eq!(min_iter.next(), Some(2)); // ATTG 84 | assert_eq!(min_iter.next(), Some(6)); // CACA 85 | assert_eq!(min_iter.next(), Some(7)); // ACAA 86 | assert_eq!(min_iter.next(), Some(9)); // AATC 87 | assert_eq!(min_iter.next(), None); 88 | } 89 | 90 | fn gen_seq(len: usize) -> Vec { 91 | let mut rng = biotest::rand(); 92 | let mut seq = Vec::with_capacity(len); 93 | let generator = biotest::Sequence::builder() 94 | .sequence_len(len) 95 | .build() 96 | .unwrap(); 97 | generator.record(&mut seq, &mut rng).unwrap(); 98 | seq 99 | } 100 | 101 | fn rc(seq: &[u8]) 
-> Vec { 102 | seq.iter() 103 | .rev() 104 | .map(|&b| match b { 105 | b'A' => b'T', 106 | b'a' => b't', 107 | b'T' => b'A', 108 | b't' => b'a', 109 | b'C' => b'G', 110 | b'c' => b'g', 111 | b'G' => b'C', 112 | b'g' => b'c', 113 | b => b, 114 | }) 115 | .collect() 116 | } 117 | 118 | #[test] 119 | fn test_canonical_minimizer_iter() { 120 | let seq_len = 1_000_000; 121 | let seq = &gen_seq(seq_len); 122 | let seq_rc = &rc(seq); 123 | let minimizer_size = 21; 124 | let width = 11; 125 | 126 | let mins: Vec = MinimizerBuilder::new() 127 | .canonical() 128 | .minimizer_size(minimizer_size) 129 | .width(width) 130 | .iter(seq) 131 | .map(|(min, _, _)| min) 132 | .collect(); 133 | let mut mins_rc: Vec = MinimizerBuilder::new() 134 | .canonical() 135 | .minimizer_size(minimizer_size) 136 | .width(width) 137 | .iter(seq_rc) 138 | .map(|(min, _, _)| min) 139 | .collect(); 140 | mins_rc.reverse(); 141 | 142 | assert_eq!(mins, mins_rc); 143 | } 144 | 145 | #[test] 146 | fn test_canonical_minimizer_iter_pos() { 147 | let seq_len = 1_000_000; 148 | let seq = &gen_seq(seq_len); 149 | let seq_rc = &rc(seq); 150 | let minimizer_size = 21; 151 | let width = 11; 152 | 153 | let mins: Vec<_> = MinimizerBuilder::::new() 154 | .canonical() 155 | .minimizer_size(minimizer_size) 156 | .width(width) 157 | .iter_pos(seq) 158 | .map(|(pos, _)| pos) 159 | .collect(); 160 | let mut mins_rc: Vec<_> = MinimizerBuilder::::new() 161 | .canonical() 162 | .minimizer_size(minimizer_size) 163 | .width(width) 164 | .iter_pos(seq_rc) 165 | .map(|(pos, _)| seq_len - pos - minimizer_size) 166 | .collect(); 167 | mins_rc.reverse(); 168 | 169 | assert_eq!(mins, mins_rc); 170 | } 171 | 172 | #[test] 173 | fn test_canonical_mod_minimizer_iter() { 174 | let seq_len = 1_000_000; 175 | let seq = &gen_seq(seq_len); 176 | let seq_rc = &rc(seq); 177 | let minimizer_size = 21; 178 | let width = 11; 179 | 180 | let mins: Vec<_> = MinimizerBuilder::::new_mod() 181 | .canonical() 182 | .minimizer_size(minimizer_size) 
183 | .width(width) 184 | .iter(seq) 185 | .map(|(min, _, _)| min) 186 | .collect(); 187 | let mut mins_rc: Vec<_> = MinimizerBuilder::::new_mod() 188 | .canonical() 189 | .minimizer_size(minimizer_size) 190 | .width(width) 191 | .iter(seq_rc) 192 | .map(|(min, _, _)| min) 193 | .collect(); 194 | mins_rc.reverse(); 195 | 196 | assert_eq!(mins, mins_rc); 197 | } 198 | 199 | #[test] 200 | fn test_canonical_mod_minimizer_iter_pos() { 201 | let seq_len = 1_000_000; 202 | let seq = &gen_seq(seq_len); 203 | let seq_rc = &rc(seq); 204 | let minimizer_size = 21; 205 | let width = 11; 206 | let mins: Vec<_> = MinimizerBuilder::::new_mod() 207 | .canonical() 208 | .minimizer_size(minimizer_size) 209 | .width(width) 210 | .iter_pos(seq) 211 | .map(|(pos, _)| pos) 212 | .collect(); 213 | let mut mins_rc: Vec<_> = MinimizerBuilder::::new_mod() 214 | .canonical() 215 | .minimizer_size(minimizer_size) 216 | .width(width) 217 | .iter_pos(seq_rc) 218 | .map(|(pos, _)| seq_len - pos - minimizer_size) 219 | .collect(); 220 | mins_rc.reverse(); 221 | 222 | assert_eq!(mins, mins_rc); 223 | } 224 | 225 | #[test] 226 | fn test_repetitive_minimizer_iter_pos() { 227 | const SEQ_LEN: usize = 100; 228 | let seq = &[b'A'; SEQ_LEN]; 229 | let seq_rc = &rc(seq); 230 | let minimizer_size = 21; 231 | let width = 11; 232 | 233 | let mins: Vec<_> = MinimizerBuilder::::new() 234 | .canonical() 235 | .minimizer_size(minimizer_size) 236 | .width(width) 237 | .iter_pos(seq) 238 | .map(|(pos, _)| pos) 239 | .collect(); 240 | let mut mins_rc: Vec<_> = MinimizerBuilder::::new() 241 | .canonical() 242 | .minimizer_size(minimizer_size) 243 | .width(width) 244 | .iter_pos(seq_rc) 245 | .map(|(pos, _)| SEQ_LEN - pos - minimizer_size) 246 | .collect(); 247 | mins_rc.reverse(); 248 | 249 | assert_eq!(mins, mins_rc); 250 | } 251 | 252 | #[test] 253 | fn test_repetitive_2_minimizer_iter_pos() { 254 | const SEQ_LEN: usize = 100; 255 | let seq = &mut [b'A'; SEQ_LEN]; 256 | for i in (1..SEQ_LEN).step_by(2) { 257 | 
seq[i] = b'G'; 258 | } 259 | let seq_rc = &rc(seq); 260 | let minimizer_size = 21; 261 | let width = 11; 262 | 263 | let mins: Vec<_> = MinimizerBuilder::::new() 264 | .canonical() 265 | .minimizer_size(minimizer_size) 266 | .width(width) 267 | .iter_pos(seq) 268 | .map(|(pos, _)| pos) 269 | .collect(); 270 | let mut mins_rc: Vec<_> = MinimizerBuilder::::new() 271 | .canonical() 272 | .minimizer_size(minimizer_size) 273 | .width(width) 274 | .iter_pos(seq_rc) 275 | .map(|(pos, _)| SEQ_LEN - pos - minimizer_size) 276 | .collect(); 277 | mins_rc.reverse(); 278 | 279 | assert_eq!(mins, mins_rc); 280 | } 281 | } 282 | --------------------------------------------------------------------------------