├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── src ├── aligner.rs ├── cli.rs ├── db_file.rs ├── db_file │ ├── native.rs │ └── xml.rs ├── dotbracket.rs ├── fasta.rs ├── gapped_data.rs ├── gapped_reactivity.rs ├── gapped_sequence.rs ├── handle_query_entry.rs ├── iter.rs ├── main.rs ├── mass.rs ├── norm_dist.rs ├── null_model.rs ├── query_aligner.rs ├── query_file.rs ├── query_result.rs ├── stockholm.rs └── viennarna.rs ├── test_data ├── query.txt ├── query_align.txt ├── query_empty_sequence.txt ├── query_invalid_base.txt ├── query_invalid_lengths.txt ├── query_invalid_reactivity.txt ├── query_truncated_reactivities.txt ├── query_truncated_sequence.txt ├── test.db ├── test_db.xml └── valid_query.txt └── viennarna-mfe-sys ├── .gitignore ├── Cargo.toml ├── build.rs ├── src └── lib.rs └── wrapper.h /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "SHAPEwarp" 3 | version = "2.2.0" 4 | edition = "2021" 5 | license = "GPL-3.0-or-later" 6 | 7 | [dependencies] 8 | anyhow = "1.0.58" 9 | bitflags = "2.3.1" 10 | clap = { version = "4.3.0", features = ["derive"] } 11 | csv = "1.1.6" 12 | fftw = { version = "0.8.0", default-features = false, features = ["system"] } 13 | fnv = "1.0.7" 14 | itertools = "0.10.3" 15 | ndarray = "0.15.4" 16 | num-complex = "0.4.3" 17 | num-traits = "0.2.14" 18 | once_cell = "1.17.1" 19 | quick-xml = "0.31.0" 20 | rand = "0.8.5" 21 | rayon = "1.5.3" 22 | serde = { version = "1.0.139", features = ["derive", "rc"] } 23 | serde_json = "1.0.85" 24 | smallvec = "1.8.0" 25 | statrs = "0.16.0" 26 | tabled = "0.17.0" 27 | toml_edit = { version = "0.19.10", features = ["serde"] } 28 | viennarna-mfe-sys = { version = "0.1.0", path = "viennarna-mfe-sys" } 29 | 30 | 
[dev-dependencies] 31 | approx = { version = "0.5.1", features = ["num-complex"] } 32 | rand = { version = "0.8.5", features = ["small_rng"] } 33 | tempfile = "3.5.0" 34 | 35 | [profile.release-opt] 36 | inherits = "release" 37 | lto = true 38 | codegen-units = 1 39 | 40 | [build-dependencies] 41 | pkg-config = "0.3.27" 42 | semver = "1.0.18" 43 | 44 | [lints.rust] 45 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(vrna24)', 'cfg(vrna25)', 'cfg(vrna251)', 'cfg(vrna26)'] } 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![SHAPEwarp logo](http://www.incarnatolab.com/images/software/SHAPEwarp.png) 2 |
3 | ## Introduction 4 | 5 | The model-guided search for structurally-homologous RNAs is a non-trivial task, as it largely depends on the quality of the inferred structure model. When it comes to inferring RNA structures from chemical probing data, the challenges are numerous. Use of different chemical probes, or of different approaches for incorporating experimental reactivities as pseudo-free energy contributions can significantly affect the reliability of the inferred RNA structure model. 6 | 7 | __SHAPEwarp__ is a sequence-agnostic method for the identification of structurally-similar RNA elements in a database of chemical probing-derived reactivity profiles. The approach used by SHAPEwarp is inspired by the BLAST algorithm and builds on top of two widely used methods for similarity search in time series data: Mueen's Algorithm for Similarity Search ([MASS](https://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html)) and dynamic time warping (DTW). 8 | 9 | For support requests, please post your questions to: 10 | 11 | For a complete documentation, please refer to: 12 | 13 | 14 | ## Author(s) 15 | 16 | Edoardo Morandi (emorandi[at]rnaframework.com)
17 | Danny Incarnato (dincarnato[at]rnaframework.com)
18 | 19 | 20 | ## References 21 | 22 | Morandi *et al*., 2022. SHAPE-guided RNA structure homology search and motif discovery. Nature Communications (PMID: [35361788](https://pubmed.ncbi.nlm.nih.gov/35361788/)) 23 | 24 | Scholten *et al*., 2024. SHAPEwarp-web: sequence-agnostic search for structurally homologous RNA regions across databases of chemical probing data. Nucleic Acids Research (PMID: [38709889](https://pubmed.ncbi.nlm.nih.gov/38709889/)) 25 | 26 | 27 | ## License 28 | 29 | This program is free software, and can be redistributed and/or modified under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or any later version. 30 | 31 | Please see <http://www.gnu.org/licenses/> for more information. 32 | 33 | 34 | ## Prerequisites 35 | 36 | - Linux system 37 | - Rust and Cargo (Installation instructions: <https://www.rust-lang.org/tools/install>) 38 | - [FFTW](http://fftw.org/) 3.x library. 39 | 40 | ## Installation of FFTW 41 | 42 | This library is generally provided by package managers; keep in mind that some distros split the `-dev` package (which is needed to compile projects depending on the library) from the main one. 43 | 44 | ### Debian based distros (i.e. Debian, Ubuntu) 45 | 46 | ```bash 47 | sudo apt install libfftw3-dev 48 | ``` 49 | 50 | ### Red-Hat based distros (i.e. Fedora, CentOS, Alma Linux) 51 | 52 | ```bash 53 | sudo dnf install fftw-devel 54 | ``` 55 | 56 | ### Arch based distros (i.e. Arch, Manjaro) 57 | 58 | ```bash 59 | sudo pacman -S fftw 60 | ``` 61 | 62 | ## Installation 63 | 64 | ```bash 65 | $ git clone https://github.com/dincarnato/SHAPEwarp 66 | $ cd SHAPEwarp 67 | 68 | # Add to PKG_CONFIG_PATH the path to the directory containing RNAlib2.pc from the ViennaRNA package 69 | $ export PKG_CONFIG_PATH=/path/to/dir/containing/RNAlib2.pc 70 | 71 | $ export RUSTFLAGS=-Ctarget-cpu=native 72 | $ cargo build --release 73 | ``` 74 | 75 | The SHAPEwarp executable will be located under ``target/release/``.
76 | 77 | 78 | ### Note for Mac OS X users: 79 | To compile SHAPEwarp on Mac OS X, after having installed the ViennaRNA package, open the RNAlib2.pc file in a text editor and replace the ``-lstdc++`` flag with ``-lc++``.
80 | 81 | 82 | ## Testing the SHAPEwarp installation 83 | 84 | To test SHAPEwarp on a small test dataset, issue the following command from within the SHAPEwarp install directory: 85 | 86 | ```bash 87 | target/release/SHAPEwarp --query test_data/query.txt --database test_data/test.db --output test_out --ow 88 | ``` 89 | The search will take less than 10 seconds, and the expected output should look like the following: 90 | 91 | ```bash 92 | query db_entry query_start query_end db_start db_end query_seed db_seed score pvalue evalue status 93 | 16S_750 16S_Bsubtilis 0 99 758 857 15-79 773-837 109.103 5.665e-8 1.003e-5 ! 94 | ``` 95 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use semver::Version; 2 | 3 | fn main() { 4 | let vrna = pkg_config::Config::new() 5 | .range_version("2.4.18".."2.7") 6 | .cargo_metadata(false) 7 | .env_metadata(false) 8 | .print_system_libs(false) 9 | .print_system_cflags(false) 10 | .probe("RNAlib2") 11 | .unwrap(); 12 | 13 | println!("cargo:rerun-if-changed=build.rs"); 14 | 15 | let version: Version = vrna 16 | .version 17 | .parse() 18 | .expect("unable to parse ViennaRNA version"); 19 | 20 | let version_cfg = format!("vrna{}{}", version.major, version.minor); 21 | println!("cargo:rustc-cfg={version_cfg}"); 22 | 23 | if version.major == 2 && version.minor == 5 { 24 | let version_cfg = format!("vrna{}{}{}", version.major, version.minor, version.patch); 25 | println!("cargo:rustc-cfg={version_cfg}"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | // We are defining CLI structs 2 | #![allow(clippy::struct_excessive_bools)] 3 | 4 | use clap::{Args, Parser, ValueEnum}; 5 | use serde::Serialize; 6 | use std::{fmt, ops::Range, path::PathBuf, str::FromStr}; 7 | 8 | use 
crate::{Distance, Reactivity}; 9 | 10 | #[derive(Debug, Parser, Serialize)] 11 | #[clap(author, version, about, allow_negative_numbers = true)] 12 | #[serde(rename_all = "kebab-case")] 13 | /// SHAPE-guided RNA structural homology search 14 | pub struct Cli { 15 | /// Path to a database file, or to a (directory of) XML file(s) 16 | #[clap(long, visible_alias = "db")] 17 | #[serde(skip)] 18 | pub database: PathBuf, 19 | 20 | /// Path to a shuffled database file 21 | /// 22 | /// Uses a file containing the shuffled database instead of generating one on the fly. 23 | /// A shuffled database can be dumped to file using `--dump-shuffled-db`. 24 | #[clap( 25 | long, 26 | conflicts_with_all = &[ 27 | "dump_shuffled_db", 28 | "db_shuffles", 29 | "db_block_size", 30 | "db_in_block_shuffle", 31 | ], 32 | )] 33 | #[serde(skip)] 34 | pub shuffled_db: Option, 35 | 36 | /// Dumps the database to the specified file. 37 | /// 38 | /// Input is a (directory of) XML file(s). 39 | #[clap(long)] 40 | #[serde(skip)] 41 | pub dump_db: Option, 42 | 43 | /// Dumps the shuffled database to the specified file. 44 | /// 45 | /// Shuffled databases can be imported using the `--shuffled-db` parameter. 
46 | #[clap(long)] 47 | #[serde(skip)] 48 | pub dump_shuffled_db: Option, 49 | 50 | /// Path to the query file 51 | /// 52 | /// Note: each entry should contain (one per row) the sequence id, the nucleotide sequence and 53 | /// a comma-separated list of SHAPE reactivities 54 | #[clap(short, long)] 55 | #[serde(skip)] 56 | pub query: PathBuf, 57 | 58 | /// Output directory 59 | #[clap(short, long, default_value = "sw_out/")] 60 | pub output: PathBuf, 61 | 62 | /// Overwrites the output directory (if the specified path already exists) 63 | #[clap(long, visible_alias = "ow")] 64 | pub overwrite: bool, 65 | 66 | /// Number of processors to use 67 | /// 68 | /// Uses all available processors if not specified 69 | #[clap(long)] 70 | pub threads: Option, 71 | 72 | /// Number of shuffles to perform for each sequence in db 73 | /// 74 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of 75 | /// the sequences in the database. 76 | /// 77 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of 78 | /// shuffles is calculated as `max(1, 500000 / L)`. 
79 | #[clap(long, alias = "dbShuffles")] 80 | pub db_shuffles: Option, 81 | 82 | /// Size (in nt) of the blocks for shuffling the sequences in db 83 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)] 84 | pub db_block_size: u16, 85 | 86 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well 87 | #[clap(long, alias = "dbInBlockShuffle")] 88 | pub db_in_block_shuffle: bool, 89 | 90 | /// Maximum value to which reactivities will be capped 91 | #[clap(long, default_value_t = 1., alias = "maxReactivity")] 92 | pub max_reactivity: Reactivity, 93 | 94 | /// If two significant alignments overlap by more than this value, the least significant one 95 | /// (the one with the lowest alignment score) will be discarded 96 | #[clap(long, default_value_t = 0.5, alias = "maxAlignOverlap")] 97 | pub max_align_overlap: f32, 98 | 99 | /// Number of HSGs in the shuffled database to be extended to build the null model 100 | #[clap(long, default_value_t = 10_000, alias = "nullHSGs")] 101 | pub null_hsgs: u32, 102 | 103 | /// E-value threshold to consider an alignment significant 104 | #[clap(long, default_value_t = 0.01, aliases = &["inclusionEvalue", "incE"], visible_alias = "inc-e")] 105 | pub inclusion_evalue: f64, 106 | 107 | /// E-value threshold to report a match 108 | #[clap(long, default_value_t = 0.1, aliases = &["reportEvalue", "repE"], visible_alias = "rep-e")] 109 | pub report_evalue: f64, 110 | 111 | /// Reports sequence alignments in the specified format 112 | /// 113 | /// Note: alignments are reported only for matches below the inclusion E-value cutoff 114 | #[clap(long, alias = "reportAln", value_enum)] 115 | pub report_alignment: Option, 116 | 117 | /// Reports the aligned reactivities for significant matches in the "reactivities/" subfolder of the output 118 | /// directory, in JSON format 119 | #[clap(long)] 120 | pub report_reactivity: bool, 121 | 122 | #[clap(flatten, next_help_heading = "Kmer lookup options")] 123 | 
#[serde(flatten)] 124 | pub kmer_lookup_args: KmerLookupArgs, 125 | 126 | #[clap(flatten, next_help_heading = "Alignment options")] 127 | #[serde(flatten)] 128 | pub alignment_args: AlignmentArgs, 129 | 130 | #[clap(flatten, next_help_heading = r#"Alignment folding evaluation options"#)] 131 | #[serde(flatten)] 132 | pub alignment_folding_eval_args: AlignmentFoldingEvaluationArgs, 133 | } 134 | 135 | #[derive(Debug, Args, Serialize)] 136 | #[serde(rename_all = "kebab-case")] 137 | pub struct KmerLookupArgs { 138 | /// Minimum number of kmers required to form a High Scoring Group (HSG) 139 | #[clap(long, default_value_t = 2, alias = "minKmers")] 140 | pub min_kmers: u16, 141 | 142 | /// Maximum distance between two kmers to be merged in a HSG 143 | #[clap(long, default_value_t = 30, alias = "maxKmerDist")] 144 | pub max_kmer_dist: u16, 145 | 146 | /// Length (in nt) of the kmers 147 | #[clap(long, default_value_t = 15, alias = "kmerLen")] 148 | pub kmer_len: u16, 149 | 150 | /// Sliding offset for extracting candidate kmers from the query 151 | #[clap(long, default_value_t = 1, alias = "kmerOffset")] 152 | pub kmer_offset: u16, 153 | 154 | /// The sequence of a query kmer and the corresponding database match must have GC% contents 155 | /// differing no more than --kmer-max-gc-diff 156 | #[clap(long, alias = "matchKmerGCcontent")] 157 | pub match_kmer_gc_content: bool, 158 | 159 | /// Maximum allowed GC% difference to retain a kmer match 160 | /// 161 | /// Note: the default value is automatically determined based on the chosen kmer length 162 | #[clap(long, requires = "match_kmer_gc_content", alias = "kmerMaxGCdiff")] 163 | pub kmer_max_gc_diff: Option, 164 | 165 | /// The sequence of a query kmer and the corresponding database match must differ no more than 166 | /// --kmer-max-seq-dist 167 | #[clap(long, alias = "matchKmerSeq")] 168 | pub match_kmer_seq: bool, 169 | 170 | /// Maximum allowed sequence distance to retain a kmer match 171 | /// 172 | /// Note: when 
>= 1, this is interpreted as the absolute number of bases that are allowed to 173 | /// differ between the kmer and the matching region. When < 1, this is interpreted as a 174 | /// fraction of the kmer's length 175 | #[clap(long, requires = "match_kmer_seq", alias = "kmerMaxSeqDist")] 176 | pub kmer_max_seq_dist: Option>, 177 | 178 | /// Minimum complexity (measured as Gini coefficient) of candidate kmers 179 | #[clap(long, default_value_t = 0.3, alias = "kmerMinComplexity")] 180 | pub kmer_min_complexity: f32, 181 | 182 | /// A kmer is allowed to match a database entry on average every this many nt 183 | #[clap(long, default_value_t = 200, alias = "kmerMaxMatchEveryNt")] 184 | pub kmer_max_match_every_nt: u32, 185 | } 186 | 187 | #[derive(Debug, Args, Serialize)] 188 | #[serde(rename_all = "kebab-case")] 189 | #[allow(clippy::struct_field_names)] 190 | pub struct AlignmentArgs { 191 | /// Minimum and maximum score reactivity differences below 0.5 will be mapped to 192 | #[clap(long, default_value_t = MinMax (-0.5..2.), alias = "alignMatchScore", allow_hyphen_values = true)] 193 | pub align_match_score: MinMax, 194 | 195 | /// Minimum and maximum score reactivity differences above 0.5 will be mapped to 196 | #[clap(long, default_value_t = MinMax (-6.0..-0.5), alias = "alignMismatchScore", allow_hyphen_values = true)] 197 | pub align_mismatch_score: MinMax, 198 | 199 | /// Gap open penalty 200 | #[clap(long, default_value_t = -14., alias = "alignGapOpenPenal")] 201 | pub align_gap_open_penalty: f32, 202 | 203 | /// Gap extension penalty 204 | #[clap(long, default_value_t = -5., alias = "alignGapExtPenal")] 205 | pub align_gap_ext_penalty: f32, 206 | 207 | /// An alignment is allowed to drop by maximum this fraction of the best score encountered so 208 | /// far, before extension is interrupted 209 | #[clap(long, default_value_t = 0.8, alias = "alignMaxDropOffRate")] 210 | pub align_max_drop_off_rate: f32, 211 | 212 | /// An alignment is allowed to drop below the 
best score encountered so far * 213 | /// --align-max-drop-off-rate by this number of bases, before extension is interrupted 214 | #[clap(long, default_value_t = 8, alias = "alignMaxDropOffBases")] 215 | pub align_max_drop_off_bases: u16, 216 | 217 | /// The maximum allowed tolerated length difference between the query and db sequences to look 218 | /// for the ideal alignment along the diagonal (measured as a fraction of the length of the 219 | /// shortest sequence among db and query) 220 | #[clap(long, default_value_t = 0.1, alias = "alignLenTolerance")] 221 | pub align_len_tolerance: f32, 222 | 223 | /// Sequence matches are rewarded during the alignment 224 | #[clap(long, alias = "alignScoreSeq")] 225 | pub align_score_seq: bool, 226 | 227 | /// Score reward for matching bases 228 | #[clap( 229 | long, 230 | default_value_t = 0.5, 231 | requires = "align_score_seq", 232 | alias = "alignSeqMatchScore" 233 | )] 234 | pub align_seq_match_score: f32, 235 | 236 | /// Score penalty for mismatching bases 237 | #[clap( 238 | long, 239 | default_value_t = -2., 240 | requires = "align_score_seq", 241 | alias = "alignSeqMismatchScore" 242 | )] 243 | pub align_seq_mismatch_score: f32, 244 | } 245 | 246 | #[derive(Debug, Args, Serialize)] 247 | #[serde(rename_all = "kebab-case")] 248 | pub struct AlignmentFoldingEvaluationArgs { 249 | /// Alignments passing the --inclusion-evalue threshold, are further evaluated for the presence 250 | /// or a conserved RNA structure by using `RNAalifold` 251 | #[clap(long, alias = "evalAlignFold")] 252 | pub eval_align_fold: bool, 253 | 254 | /// Number of shuffles to perform for each alignment during folding evaluation 255 | #[clap(long, default_value_t = 100)] 256 | pub shuffles: u16, 257 | 258 | /// Size (in nt) of the blocks for shuffling the alignment during folding evaluation 259 | #[clap(long, alias = "blockSize", default_value_t = 3)] 260 | pub block_size: u16, 261 | 262 | /// Besides shuffling blocks, residues within each block 
will be shuffled as well during 263 | /// folding evaluation 264 | #[clap(long, alias = "inBlockShuffle")] 265 | pub in_block_shuffle: bool, 266 | 267 | /// Minimum fraction of base-pairs of the RNAalifold-inferred structure that should be 268 | /// supported by both query and db sequence to retain a match 269 | #[clap(long, default_value_t = 0.75, alias = "minBpSupport")] 270 | pub min_bp_support: f32, 271 | 272 | /// Use RIBOSUM scoring matrix 273 | #[clap(long, alias = "ribosumScoring")] 274 | pub ribosum_scoring: bool, 275 | 276 | /// Slope for SHAPE reactivities conversion into pseudo-free energy contributions 277 | #[clap(long, default_value_t = 1.8, requires = "eval_align_fold")] 278 | pub slope: Reactivity, 279 | 280 | /// Intercept for SHAPE reactivities conversion into pseudo-free energy contributions 281 | #[clap(long, default_value_t = -0.6, requires = "eval_align_fold")] 282 | pub intercept: Reactivity, 283 | 284 | /// Maximum allowed base-pairing distance 285 | #[clap( 286 | long, 287 | default_value_t = 600, 288 | alias = "maxBPspan", 289 | requires = "eval_align_fold" 290 | )] 291 | pub max_bp_span: u32, 292 | 293 | /// Disallows lonely pairs (helices of 1 bp) 294 | #[clap(long, alias = "noLonelyPairs", requires = "eval_align_fold")] 295 | pub no_lonely_pairs: bool, 296 | 297 | /// Disallows G:U wobbles at the end of helices 298 | #[clap(long, alias = "noClosingGU", requires = "eval_align_fold")] 299 | pub no_closing_gu: bool, 300 | 301 | /// Folding temperature 302 | #[clap(long, default_value_t = 37., requires = "eval_align_fold")] 303 | pub temperature: f32, 304 | } 305 | 306 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 307 | pub struct MinMax(pub Range); 308 | 309 | impl fmt::Display for MinMax 310 | where 311 | T: fmt::Display, 312 | { 313 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 314 | write!(f, "{},{}", self.0.start, self.0.end) 315 | } 316 | } 317 | 318 | impl Serialize for MinMax 319 | where 320 | T: fmt::Display, 321 | 
{ 322 | fn serialize(&self, serializer: S) -> Result 323 | where 324 | S: serde::Serializer, 325 | { 326 | serializer.collect_str(self) 327 | } 328 | } 329 | 330 | #[derive(Debug, Clone, PartialEq, Eq)] 331 | pub enum ParseMinMaxError { 332 | InvalidFormat, 333 | InnerError { index: u8, error: T }, 334 | } 335 | 336 | impl FromStr for MinMax 337 | where 338 | T: FromStr, 339 | { 340 | type Err = ParseMinMaxError; 341 | 342 | fn from_str(s: &str) -> Result { 343 | let (start, end) = s.split_once(',').ok_or(ParseMinMaxError::InvalidFormat)?; 344 | 345 | let start = start 346 | .parse() 347 | .map_err(|error| ParseMinMaxError::InnerError { index: 0, error })?; 348 | 349 | let end = end 350 | .parse() 351 | .map_err(|error| ParseMinMaxError::InnerError { index: 1, error })?; 352 | 353 | Ok(Self(start..end)) 354 | } 355 | } 356 | 357 | impl fmt::Display for ParseMinMaxError 358 | where 359 | T: fmt::Display, 360 | { 361 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 362 | match self { 363 | Self::InvalidFormat => { 364 | f.write_str("invalid min-max format, two comma-separated values expected") 365 | } 366 | Self::InnerError { index, error } => { 367 | let part = match index { 368 | 0 => "min", 369 | 1 => "max", 370 | _ => unreachable!(), 371 | }; 372 | write!(f, "{part} part of min-max format is invalid: {error}") 373 | } 374 | } 375 | } 376 | } 377 | 378 | impl std::error::Error for ParseMinMaxError where T: std::error::Error {} 379 | 380 | #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize)] 381 | pub enum ReportAlignment { 382 | #[clap(alias = "f")] 383 | Fasta, 384 | 385 | #[clap(alias = "s")] 386 | Stockholm, 387 | } 388 | 389 | impl Cli { 390 | #[cfg(test)] 391 | pub(crate) fn dummy() -> Self { 392 | Self::parse_from(["test", "--database", "test", "--query", "test"]) 393 | } 394 | } 395 | 396 | /// Hidden dumper for XML files. 397 | /// 398 | /// Read an XML or the XML files from a directory and dump the content to a native DB format. 
399 | #[derive(Debug, Parser)] 400 | pub struct Alternative { 401 | /// Path to a database file, or to a (directory of) XML file(s) 402 | #[clap(long, visible_alias = "db")] 403 | pub database: PathBuf, 404 | 405 | /// Dumps the database to the specified file. 406 | /// 407 | /// Input is a (directory of) XML file(s). 408 | #[clap(long)] 409 | pub dump_db: PathBuf, 410 | 411 | /// Dumps the shuffled database to the specified file. 412 | /// 413 | /// Shuffled databases can be imported using the `--shuffled-db` parameter. 414 | #[clap(long)] 415 | pub dump_shuffled_db: Option, 416 | 417 | /// Number of processors to use 418 | /// 419 | /// Uses all available processors if not specified 420 | #[clap(long)] 421 | pub threads: Option, 422 | 423 | /// Number of shuffles to perform for each sequence in db 424 | /// 425 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of 426 | /// the sequences in the database. 427 | /// 428 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of 429 | /// shuffles is calculated as `max(1, 500000 / L)`. 
430 | #[clap(long, alias = "dbShuffles")] 431 | pub db_shuffles: Option, 432 | 433 | /// Size (in nt) of the blocks for shuffling the sequences in db 434 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)] 435 | pub db_block_size: u16, 436 | 437 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well 438 | #[clap(long, alias = "dbInBlockShuffle")] 439 | pub db_in_block_shuffle: bool, 440 | } 441 | -------------------------------------------------------------------------------- /src/db_file.rs: -------------------------------------------------------------------------------- 1 | pub mod native; 2 | mod xml; 3 | 4 | use std::{ 5 | convert::TryInto, 6 | error::Error as StdError, 7 | ffi::OsString, 8 | fmt::{self, Display}, 9 | io, 10 | path::Path, 11 | ptr, 12 | string::FromUtf8Error, 13 | }; 14 | 15 | use serde::{Serialize, Serializer}; 16 | 17 | use crate::{Base, Molecule, Reactivity, SequenceEntry}; 18 | 19 | #[derive(Debug, Clone, PartialEq)] 20 | pub struct Entry { 21 | pub id: String, 22 | pub(crate) sequence: Vec, 23 | pub reactivity: Vec, 24 | } 25 | 26 | const NAN_PLACEHOLDER: Reactivity = -999.; 27 | 28 | #[derive(Debug, Clone, Copy)] 29 | #[repr(transparent)] 30 | pub struct ReactivityWithPlaceholder(Reactivity); 31 | 32 | impl ReactivityWithPlaceholder { 33 | pub fn is_nan(self) -> bool { 34 | self.0.is_nan() | (self.0 == NAN_PLACEHOLDER) 35 | } 36 | 37 | pub fn get_non_nan(self) -> Option { 38 | if self.is_nan() { 39 | None 40 | } else { 41 | Some(self.0) 42 | } 43 | } 44 | 45 | pub fn to_maybe_placeholder(self) -> Reactivity { 46 | if self.0.is_nan() { 47 | NAN_PLACEHOLDER 48 | } else { 49 | self.0 50 | } 51 | } 52 | 53 | pub fn as_inner_slice(this: &[ReactivityWithPlaceholder]) -> &[Reactivity] { 54 | // Safety: 55 | // - `ReactivityWithPlaceholder` is transparent and it contains only a `Reactivity` 56 | // - lifetime is maintained 57 | unsafe { &*(ptr::from_ref(this) as *const [Reactivity]) } 58 | } 59 | 60 | 
pub fn inner(self) -> Reactivity { 61 | self.0 62 | } 63 | 64 | #[inline] 65 | #[must_use] 66 | pub const fn nan_placeholder() -> Self { 67 | Self(NAN_PLACEHOLDER) 68 | } 69 | } 70 | 71 | impl PartialEq for ReactivityWithPlaceholder { 72 | fn eq(&self, other: &Self) -> bool { 73 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) { 74 | false 75 | } else { 76 | self.0 == other.0 77 | } 78 | } 79 | } 80 | 81 | impl PartialEq for ReactivityWithPlaceholder { 82 | fn eq(&self, other: &Reactivity) -> bool { 83 | if self.0 == NAN_PLACEHOLDER { 84 | false 85 | } else { 86 | self.0 == *other 87 | } 88 | } 89 | } 90 | 91 | impl PartialOrd for ReactivityWithPlaceholder { 92 | fn partial_cmp(&self, other: &Self) -> Option { 93 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) { 94 | None 95 | } else { 96 | self.0.partial_cmp(&other.0) 97 | } 98 | } 99 | } 100 | 101 | impl PartialOrd for ReactivityWithPlaceholder { 102 | fn partial_cmp(&self, other: &Reactivity) -> Option { 103 | if self.0 == NAN_PLACEHOLDER { 104 | None 105 | } else { 106 | self.0.partial_cmp(other) 107 | } 108 | } 109 | } 110 | 111 | impl From for ReactivityWithPlaceholder { 112 | fn from(reactivity: Reactivity) -> Self { 113 | Self(reactivity) 114 | } 115 | } 116 | 117 | impl Serialize for ReactivityWithPlaceholder { 118 | #[inline] 119 | fn serialize(&self, serializer: S) -> Result 120 | where 121 | S: Serializer, 122 | { 123 | self.get_non_nan() 124 | .unwrap_or(Reactivity::NAN) 125 | .serialize(serializer) 126 | } 127 | } 128 | 129 | pub trait ReactivityLike: Copy + PartialOrd + PartialEq { 130 | fn is_nan(self) -> bool; 131 | fn value(self) -> Reactivity; 132 | } 133 | 134 | impl ReactivityLike for Reactivity { 135 | #[inline] 136 | fn is_nan(self) -> bool { 137 | Reactivity::is_nan(self) 138 | } 139 | 140 | #[inline] 141 | fn value(self) -> Reactivity { 142 | self 143 | } 144 | } 145 | 146 | impl ReactivityLike for ReactivityWithPlaceholder { 147 | #[inline] 148 | fn 
is_nan(self) -> bool { 149 | ReactivityWithPlaceholder::is_nan(self) 150 | } 151 | 152 | #[inline] 153 | fn value(self) -> Reactivity { 154 | self.to_maybe_placeholder() 155 | } 156 | } 157 | 158 | impl Entry { 159 | pub fn cap_reactivities(&mut self, max_reactivity: Reactivity) { 160 | self.reactivity.iter_mut().for_each(|reactivity| { 161 | if let Some(r) = reactivity.get_non_nan() { 162 | *reactivity = r.min(max_reactivity).into(); 163 | } 164 | }); 165 | } 166 | } 167 | 168 | impl SequenceEntry for Entry { 169 | type Reactivity = ReactivityWithPlaceholder; 170 | 171 | fn name(&self) -> &str { 172 | &self.id 173 | } 174 | 175 | fn sequence(&self) -> &[Base] { 176 | &self.sequence 177 | } 178 | 179 | fn reactivity(&self) -> &[Self::Reactivity] { 180 | &self.reactivity 181 | } 182 | 183 | fn molecule(&self) -> crate::Molecule { 184 | Molecule::Dna 185 | } 186 | } 187 | 188 | #[derive(Debug)] 189 | pub enum ReaderError { 190 | TooSmall, 191 | InvalidMarker, 192 | } 193 | 194 | impl Display for ReaderError { 195 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 196 | let s = match self { 197 | ReaderError::TooSmall => "DB file is too small", 198 | ReaderError::InvalidMarker => "DB file contains and invalid EOF marker", 199 | }; 200 | 201 | f.write_str(s) 202 | } 203 | } 204 | 205 | impl StdError for ReaderError {} 206 | 207 | #[derive(Debug)] 208 | pub enum EntryError { 209 | InvalidSequenceId(FromUtf8Error), 210 | InvalidBase, 211 | UnexpectedEof, 212 | SurpassedEofMarker, 213 | } 214 | 215 | impl Display for EntryError { 216 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 217 | let s = match self { 218 | EntryError::InvalidSequenceId(_) => "Invalid sequence ID string", 219 | EntryError::InvalidBase => "Invalid encoded nucleobase", 220 | EntryError::UnexpectedEof => "Unexpected end of file", 221 | EntryError::SurpassedEofMarker => "End of file marked has been surpassed", 222 | }; 223 | 224 | f.write_str(s) 225 | } 226 | } 227 | 228 | impl 
StdError for EntryError { 229 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 230 | match self { 231 | EntryError::InvalidSequenceId(source) => Some(source), 232 | EntryError::InvalidBase 233 | | EntryError::UnexpectedEof 234 | | EntryError::SurpassedEofMarker => None, 235 | } 236 | } 237 | } 238 | 239 | pub fn read_db(path: &Path) -> Result, Error> { 240 | if path.is_dir() { 241 | xml::read_directory(path).map_err(Error::Directory) 242 | } else { 243 | let extension = path.extension().ok_or(Error::NoExtension)?; 244 | if extension.eq_ignore_ascii_case("db") { 245 | native::read_file(path).map_err(Error::Native) 246 | } else if extension.eq_ignore_ascii_case("xml") { 247 | let entry = xml::read_file(path).map_err(Error::Xml)?; 248 | Ok(vec![entry]) 249 | } else { 250 | Err(Error::InvalidExtension(extension.to_os_string())) 251 | } 252 | } 253 | } 254 | 255 | #[derive(Debug)] 256 | pub enum Error { 257 | NoExtension, 258 | InvalidExtension(OsString), 259 | Native(native::Error), 260 | Xml(xml::ReadFileError), 261 | Directory(xml::ReadDirectoryError), 262 | } 263 | 264 | impl Display for Error { 265 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 266 | match self { 267 | Error::NoExtension => f.write_str("db file does not have an extension"), 268 | Error::InvalidExtension(extension) => { 269 | write!( 270 | f, 271 | "extension \"{}\" is not valid for a db", 272 | extension.to_string_lossy() 273 | ) 274 | } 275 | Error::Native(_) => f.write_str("cannot read native db file"), 276 | Error::Xml(_) => f.write_str("cannot read xml db file"), 277 | Error::Directory(_) => f.write_str("cannot read xml entries from a directory"), 278 | } 279 | } 280 | } 281 | 282 | impl StdError for Error { 283 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 284 | match self { 285 | Error::NoExtension | Error::InvalidExtension(_) => None, 286 | Error::Native(source) => Some(source), 287 | Error::Xml(source) => Some(source), 288 | Error::Directory(source) => 
Some(source), 289 | } 290 | } 291 | } 292 | 293 | pub fn write_entries(entries: &[Entry], mut writer: W) -> io::Result<()> { 294 | entries.iter().try_for_each(|entry| { 295 | let name = entry.name(); 296 | let sequence = entry.sequence(); 297 | let name_len_buf = u32::try_from(name.len().checked_add(1).unwrap()) 298 | .unwrap() 299 | .to_le_bytes(); 300 | let seq_len_buf = u32::try_from(sequence.len()).unwrap().to_le_bytes(); 301 | 302 | writer.write_all(name_len_buf.as_slice())?; 303 | writer.write_all(name.as_bytes())?; 304 | writer.write_all(&[0])?; 305 | writer.write_all(seq_len_buf.as_slice())?; 306 | sequence.chunks_exact(2).try_for_each(|pair| { 307 | writer.write_all(&[Base::pair_to_nibble(pair.try_into().unwrap())]) 308 | })?; 309 | if let Some(base) = sequence.chunks_exact(2).remainder().first().copied() { 310 | writer.write_all(&[Base::pair_to_nibble([base, Base::A])])?; 311 | } 312 | 313 | entry.reactivity().iter().try_for_each(|reactivity| { 314 | let reactivity = f64::from(reactivity.inner()).to_le_bytes(); 315 | writer.write_all(reactivity.as_slice()) 316 | })?; 317 | 318 | Ok::<_, io::Error>(()) 319 | })?; 320 | 321 | let n_entries = u64::try_from(entries.len()).unwrap().to_le_bytes(); 322 | writer.write_all(n_entries.as_slice())?; 323 | writer.write_all(native::VERSION.to_le_bytes().as_slice())?; 324 | writer.write_all(native::END_MARKER)?; 325 | writer.flush()?; 326 | 327 | Ok(()) 328 | } 329 | -------------------------------------------------------------------------------- /src/db_file/native.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | convert::TryInto, 3 | error::Error as StdError, 4 | fmt::{self, Display}, 5 | fs::File, 6 | io::{self, BufReader, Read, Seek, SeekFrom}, 7 | path::Path, 8 | string::FromUtf8Error, 9 | }; 10 | 11 | use itertools::Itertools; 12 | 13 | use crate::{db_file::ReactivityWithPlaceholder, Base, InvalidBasePair, Reactivity}; 14 | 15 | use super::Entry; 16 | 17 | 
/// Size in bytes of the trailing metadata block: 8-byte db length +
/// 2-byte version + 7-byte end marker.
pub(super) const END_SIZE: u8 = 17;
pub(super) const END_MARKER: &[u8] = b"[eofdb]";
pub(super) const VERSION: u16 = 1;

/// Reader over a native binary db stream.
#[derive(Debug)]
pub struct Reader<R> {
    inner: R,
    _db_len: u64,
    _version: u16,
    end_offset: u64,
}

impl<R> Reader<R>
where
    R: Read + Seek,
{
    /// Validates the trailing metadata block and builds a reader.
    ///
    /// # Errors
    /// Fails when seeking to or reading the metadata fails, or when the end
    /// marker is not `[eofdb]`.
    pub fn new(mut reader: R) -> Result<Self, NewReaderError> {
        use NewReaderError as E;

        let end_offset = reader
            .seek(SeekFrom::End(-i64::from(END_SIZE)))
            .map_err(E::SeekToMetadata)?;
        let mut end_buf = [0; END_SIZE as usize];
        reader.read_exact(&mut end_buf).map_err(E::ReadMetadata)?;

        if &end_buf[10..17] != END_MARKER {
            return Err(E::InvalidMarker);
        }

        let db_len = u64::from_le_bytes(end_buf[0..8].try_into().unwrap());
        let version = u16::from_le_bytes(end_buf[8..10].try_into().unwrap());
        Ok(Self {
            inner: reader,
            _db_len: db_len,
            _version: version,
            end_offset,
        })
    }

    /// Returns an iterator over the entries stored in the db.
    pub fn entries(&mut self) -> EntryIter<'_, R> {
        let &mut Self {
            ref mut inner,
            end_offset,
            ..
        } = self;

        EntryIter {
            reader: inner,
            end_offset,
            offset: 0,
        }
    }
}

/// Errors produced while constructing a [`Reader`].
#[derive(Debug)]
pub enum NewReaderError {
    SeekToMetadata(io::Error),
    ReadMetadata(io::Error),
    InvalidMarker,
}

impl Display for NewReaderError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let s = match self {
            NewReaderError::SeekToMetadata(_) => "unable to seek to metadata",
            NewReaderError::ReadMetadata(_) => "unable to read metadata",
            NewReaderError::InvalidMarker => "invalid metadata marker",
        };

        f.write_str(s)
    }
}

impl StdError for NewReaderError {
    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        match self {
            NewReaderError::SeekToMetadata(source) | NewReaderError::ReadMetadata(source) => {
                Some(source)
            }
            NewReaderError::InvalidMarker => None,
        }
    }
}

/// Iterator over the entries of a native db stream; stops once the reader
/// reaches the offset of the trailing metadata block.
#[derive(Debug)]
pub struct EntryIter<'a, R> {
    reader: &'a mut R,
    end_offset: u64,
    offset: u64,
}

impl<R> Iterator for EntryIter<'_, R>
where
    R: Seek + Read,
{
    type Item = Result<Entry, NextEntryError>;

    fn next(&mut self) -> Option<Self::Item> {
        (self.offset != self.end_offset).then(|| self.next_entry())
    }
}

impl<R> EntryIter<'_, R>
where
    R: Seek + Read,
{
    /// Decodes the next entry starting at the current stream offset.
    fn next_entry(&mut self) -> Result<Entry, NextEntryError> {
        use NextEntryError as E;

        if self.offset == 0 {
            self.reader.seek(SeekFrom::Start(0)).map_err(E::SeekStart)?;
        }

        let mut id_len_with_nul_buf = [0; 4];
        self.reader
            .read_exact(&mut id_len_with_nul_buf)
            .map_err(E::ReadIdLen)?;
        let id_len_with_nul: usize = u32::from_le_bytes(id_len_with_nul_buf)
            .try_into()
            .expect("cannot represent id length as usize for the current architecture");
        let mut sequence_id = vec![0; id_len_with_nul];
        self.reader
            .read_exact(&mut sequence_id)
            .map_err(E::ReadSequenceId)?;
        if sequence_id.pop().filter(|&b| b == 0).is_none() {
            return Err(E::MissingSequenceIdNul);
        }
        let sequence_id =
            String::from_utf8(sequence_id).map_err(NextEntryError::InvalidSequenceId)?;
        let mut sequence_len_buf = [0; 4];
        self.reader
            .read_exact(&mut sequence_len_buf)
            .map_err(E::ReadSequenceLen)?;
        let sequence_len: usize = u32::from_le_bytes(sequence_len_buf)
            .try_into()
            .expect("cannot represent sequence length as usize for the current architecture");

        // Bases are packed two per byte, so an odd-length sequence still
        // occupies a final, half-used byte.
        let sequence_bytes = sequence_len / 2 + sequence_len % 2;
        let mut sequence = self
            .reader
            .bytes()
            .take(sequence_bytes)
            .map(|result| {
                result.map_err(E::ReadSequence).and_then(|byte| {
                    Base::try_pair_from_byte(byte).map_err(E::InvalidEncodedBase)
                })
            })
            .flatten_ok()
            .collect::<Result<Vec<_>, _>>()?;

        // Drop the padding base appended for odd-length sequences.
        if sequence_len > 0 && sequence_len % 2 == 1 {
            sequence.pop().unwrap();
        }

        if sequence.len() != sequence_len {
            return Err(E::UnexpectedEof);
        }

        let reactivity = (0..sequence_len)
            .map(|_| {
                let mut reactivity_buffer = [0; 8];
                self.reader
                    .read_exact(&mut reactivity_buffer)
                    .map(|()| reactivity_buffer)
                    .map_err(E::ReadReactivity)
            })
            // Reactivity is an alias to either f32 or f64
            .map_ok(|bytes| {
                // We internally use a fixed type that can be f32, there is no need to necessarily
                // have 64 bits of precision
                #[allow(clippy::cast_possible_truncation)]
                let reactivity = f64::from_le_bytes(bytes) as Reactivity;
                ReactivityWithPlaceholder::from(reactivity)
            })
            .collect::<Result<Vec<_>, _>>()?;

        if reactivity.len() != sequence_len {
            return Err(E::UnexpectedEof);
        }

        let offset = self.reader.stream_position().map_err(E::StreamPosition)?;
        if offset > self.end_offset {
            return Err(E::SurpassedEofMarker);
        }
        self.offset = offset;

        Ok(Entry {
            id: sequence_id,
            sequence,
            reactivity,
        })
    }
}

/// Errors produced while decoding a single db entry.
#[derive(Debug)]
pub enum NextEntryError {
    SeekStart(io::Error),
    ReadIdLen(io::Error),
    ReadSequenceId(io::Error),
    MissingSequenceIdNul,
    InvalidSequenceId(FromUtf8Error),
    ReadSequenceLen(io::Error),
    ReadSequence(io::Error),
    InvalidEncodedBase(InvalidBasePair),
    ReadReactivity(io::Error),
    UnexpectedEof,
    SurpassedEofMarker,
    StreamPosition(io::Error),
}

impl Display for NextEntryError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let s = match self {
            NextEntryError::SeekStart(_) => "unable to seek to the start of the file",
            NextEntryError::ReadIdLen(_) => "unable to read the length of the sequence id",
            NextEntryError::ReadSequenceId(_) => "unable to read sequence id",
            NextEntryError::MissingSequenceIdNul => {
                "sequence id does not have a nul termination character"
            }
            NextEntryError::InvalidSequenceId(_) => "sequence id is not valid",
            NextEntryError::ReadSequenceLen(_) => "unable to read sequence length",
            NextEntryError::ReadSequence(_) => "unable to read sequence content",
            NextEntryError::InvalidEncodedBase(_) => "invalid encoded base",
            NextEntryError::ReadReactivity(_) => "unable to read reactivity",
            NextEntryError::UnexpectedEof => "unexpected end of file",
            NextEntryError::SurpassedEofMarker => "end of file marker is being surpassed",
            NextEntryError::StreamPosition(_) => "unable to get stream position",
        };

        f.write_str(s)
    }
}

impl StdError for NextEntryError {
    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        match self {
            NextEntryError::SeekStart(source)
            | NextEntryError::ReadIdLen(source)
            | NextEntryError::ReadSequenceId(source)
            | NextEntryError::ReadSequenceLen(source)
            | NextEntryError::ReadSequence(source)
            | NextEntryError::ReadReactivity(source)
            | NextEntryError::StreamPosition(source) => Some(source),
            NextEntryError::MissingSequenceIdNul
            | NextEntryError::UnexpectedEof
            | NextEntryError::SurpassedEofMarker => None,
            NextEntryError::InvalidSequenceId(source) => Some(source),
            NextEntryError::InvalidEncodedBase(source) => Some(source),
        }
    }
}

/// Top-level errors for [`read_file`].
#[derive(Debug)]
pub enum Error {
    OpenFile(io::Error),
    NewReader(NewReaderError),
    Entry(NextEntryError),
}

impl Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let s = match self {
            Error::OpenFile(_) => "unable to open file",
            Error::NewReader(_) => "unable to create new reader",
            Error::Entry(_) => "unable to get the next entry",
        };

        f.write_str(s)
    }
}

impl StdError for Error {
    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        match self {
            Error::OpenFile(source) => Some(source),
            Error::NewReader(source) => Some(source),
            Error::Entry(source) => Some(source),
        }
    }
}

/// Reads all entries from a native binary db file.
///
/// # Errors
/// Fails when the file cannot be opened, its metadata is invalid, or any
/// entry cannot be decoded.
pub fn read_file(path: &Path) -> Result<Vec<Entry>, Error> {
    let file = File::open(path).map_err(Error::OpenFile)?;
    let mut reader = Reader::new(BufReader::new(file)).map_err(Error::NewReader)?;
    let entries = reader
        .entries()
        .collect::<Result<Vec<_>, _>>()
        .map_err(Error::Entry)?;
    Ok(entries)
}

#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;

    const TEST_DB: &[u8] = include_bytes!("../../test_data/test.db");

    #[test]
    fn valid_reader() {
        let reader =
Reader::new(Cursor::new(TEST_DB)).unwrap(); 321 | #[allow(clippy::used_underscore_binding)] 322 | let len = reader._db_len; 323 | 324 | #[allow(clippy::used_underscore_binding)] 325 | let version = reader._version; 326 | 327 | assert_eq!(len, 0x1181); 328 | assert_eq!(version, 1); 329 | } 330 | 331 | #[test] 332 | fn read_all_db() { 333 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap(); 334 | let db_len = reader 335 | .entries() 336 | .map_ok(|entry| entry.sequence.len()) 337 | .try_fold(0, |acc, seq_len| seq_len.map(|seq_len| acc + seq_len)) 338 | .unwrap(); 339 | 340 | #[allow(clippy::used_underscore_binding)] 341 | let reader_len = usize::try_from(reader._db_len).unwrap(); 342 | assert_eq!(db_len, reader_len); 343 | } 344 | 345 | #[test] 346 | fn transform_pseudo_nans() { 347 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap(); 348 | let entry = reader.entries().next().unwrap().unwrap(); 349 | 350 | // The first 13 reactivities are -999 in the file 351 | assert!(entry.reactivity[..13] 352 | .iter() 353 | .copied() 354 | .all(ReactivityWithPlaceholder::is_nan)); 355 | } 356 | } 357 | -------------------------------------------------------------------------------- /src/db_file/xml.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | borrow::Cow, 3 | error::Error as StdError, 4 | fmt::{self, Display}, 5 | fs::File, 6 | io::{self, BufReader}, 7 | num::ParseFloatError, 8 | ops::Not, 9 | path::Path, 10 | str::Utf8Error, 11 | }; 12 | 13 | use quick_xml::{ 14 | events::{BytesEnd, BytesStart, BytesText}, 15 | Reader, 16 | }; 17 | use rayon::iter::{ParallelBridge, ParallelIterator}; 18 | 19 | use crate::{Base, InvalidBase, Reactivity}; 20 | 21 | use super::{Entry, ReactivityWithPlaceholder}; 22 | 23 | pub fn read_file(path: &Path) -> Result { 24 | use quick_xml::events::Event; 25 | use ReadFileError as E; 26 | 27 | let mut reader = Reader::from_file(path).map_err(E::ReaderFromFile)?; 28 | let mut 
buffer = Vec::new(); 29 | let mut state = XmlState::default(); 30 | 31 | let mut id = None; 32 | let mut sequence = None; 33 | let mut reactivity = None; 34 | 35 | loop { 36 | let event = reader 37 | .read_event_into(&mut buffer) 38 | .map_err(|source| E::ReadEvent { 39 | buffer_position: reader.buffer_position(), 40 | source, 41 | })?; 42 | 43 | match event { 44 | Event::Start(start) => { 45 | state = handle_start_event(&start, state, &mut id)?; 46 | } 47 | 48 | Event::End(end) => { 49 | state = handle_end_event(&end, state)?; 50 | } 51 | 52 | Event::Empty(tag) => return Err(E::UnexpectedEmptyTag(tag.name().as_ref().to_owned())), 53 | Event::Text(text) => { 54 | handle_text_event(&text, &state, &mut sequence, &mut reactivity, &reader)?; 55 | } 56 | 57 | Event::CData(_) 58 | | Event::Comment(_) 59 | | Event::Decl(_) 60 | | Event::PI(_) 61 | | Event::DocType(_) => {} 62 | 63 | Event::Eof => break, 64 | } 65 | } 66 | 67 | let id = id.ok_or(E::MissingTranscript)?; 68 | let sequence = sequence.ok_or(E::MissingSequence)?; 69 | let reactivity = reactivity.ok_or(E::MissingReactivity)?; 70 | 71 | if sequence.len() != reactivity.len() { 72 | return Err(E::InconsistentLength { 73 | sequence: sequence.len(), 74 | reactivity: reactivity.len(), 75 | }); 76 | } 77 | 78 | Ok(Entry { 79 | id, 80 | sequence, 81 | reactivity, 82 | }) 83 | } 84 | 85 | fn handle_start_event( 86 | start: &BytesStart<'_>, 87 | state: XmlState, 88 | id: &mut Option, 89 | ) -> Result { 90 | use ReadFileError as E; 91 | 92 | match (start.name().as_ref(), state) { 93 | (b"data", XmlState::Start) => Ok(XmlState::Data), 94 | (b"meta-data", XmlState::Data) => Ok(XmlState::MetaData), 95 | (b"organism", XmlState::MetaData) => Ok(XmlState::Organism), 96 | (b"probe", XmlState::MetaData) => Ok(XmlState::Probe), 97 | (b"source", XmlState::MetaData) => Ok(XmlState::Source), 98 | (b"citation", XmlState::Source) => Ok(XmlState::Citation), 99 | (b"pmid", XmlState::Source) => Ok(XmlState::Pmid), 100 | (b"replicate", 
XmlState::MetaData) => Ok(XmlState::Replicate), 101 | (b"condition", XmlState::MetaData) => Ok(XmlState::Condition), 102 | (b"transcript", XmlState::Data) => { 103 | if id.is_some() { 104 | return Err(E::MultipleTranscripts); 105 | } 106 | 107 | let id_attr = start 108 | .try_get_attribute("id") 109 | .map_err(E::MalformedTranscriptTag)? 110 | .ok_or(E::MissingId)?; 111 | 112 | let id_string = match id_attr.value { 113 | Cow::Borrowed(id) => std::str::from_utf8(id) 114 | .map(str::to_owned) 115 | .map_err(E::InvalidId)?, 116 | Cow::Owned(id) => { 117 | String::from_utf8(id).map_err(|err| E::InvalidId(err.utf8_error()))? 118 | } 119 | }; 120 | *id = Some(id_string); 121 | 122 | Ok(XmlState::Transcript) 123 | } 124 | (b"sequence", XmlState::Transcript) => Ok(XmlState::Sequence), 125 | (b"reactivity", XmlState::Transcript) => Ok(XmlState::Reactivity), 126 | _ => Err(E::UnexpectedOpenTag(start.name().as_ref().to_owned())), 127 | } 128 | } 129 | 130 | fn handle_end_event(end: &BytesEnd<'_>, state: XmlState) -> Result { 131 | use ReadFileError as E; 132 | 133 | match (end.name().as_ref(), state) { 134 | (b"data", XmlState::Data) => Ok(XmlState::End), 135 | 136 | (b"meta-data", XmlState::MetaData) | (b"transcript", XmlState::Transcript) => { 137 | Ok(XmlState::Data) 138 | } 139 | 140 | (b"organism", XmlState::Organism) 141 | | (b"probe", XmlState::Probe) 142 | | (b"source", XmlState::Source) 143 | | (b"replicate", XmlState::Replicate) 144 | | (b"condition", XmlState::Condition) => Ok(XmlState::MetaData), 145 | 146 | (b"citation", XmlState::Citation) | (b"pmid", XmlState::Pmid) => Ok(XmlState::Source), 147 | 148 | (b"sequence", XmlState::Sequence) | (b"reactivity", XmlState::Reactivity) => { 149 | Ok(XmlState::Transcript) 150 | } 151 | 152 | _ => Err(E::UnexpectedCloseTag(end.name().as_ref().to_owned())), 153 | } 154 | } 155 | 156 | fn handle_text_event( 157 | text: &BytesText<'_>, 158 | state: &XmlState, 159 | sequence: &mut Option>, 160 | reactivity: &mut Option>, 161 | 
reader: &Reader>, 162 | ) -> Result<(), ReadFileError> { 163 | use ReadFileError as E; 164 | 165 | if text.iter().all(u8::is_ascii_whitespace) { 166 | return Ok(()); 167 | } 168 | 169 | match state { 170 | XmlState::Start 171 | | XmlState::Data 172 | | XmlState::MetaData 173 | | XmlState::Source 174 | | XmlState::Transcript 175 | | XmlState::End => return Err(E::UnexpectedText(reader.buffer_position())), 176 | 177 | XmlState::Organism 178 | | XmlState::Probe 179 | | XmlState::Citation 180 | | XmlState::Pmid 181 | | XmlState::Replicate 182 | | XmlState::Condition => {} 183 | 184 | XmlState::Sequence => { 185 | if sequence.is_some() { 186 | return Err(E::MultipleSequences); 187 | } 188 | *sequence = Some(parse_sequence(text).map_err(E::InvalidSequence)?); 189 | } 190 | XmlState::Reactivity => { 191 | if reactivity.is_some() { 192 | return Err(E::MultipleReactivities); 193 | } 194 | 195 | *reactivity = Some(parse_reactivity(text).map_err(E::InvalidReactivity)?); 196 | } 197 | } 198 | 199 | Ok(()) 200 | } 201 | 202 | #[derive(Debug, Default)] 203 | enum XmlState { 204 | #[default] 205 | Start, 206 | Data, 207 | MetaData, 208 | Organism, 209 | Probe, 210 | Source, 211 | Citation, 212 | Pmid, 213 | Replicate, 214 | Condition, 215 | Transcript, 216 | Sequence, 217 | Reactivity, 218 | End, 219 | } 220 | 221 | #[derive(Debug)] 222 | pub enum ReadFileError { 223 | ReaderFromFile(quick_xml::Error), 224 | ReadEvent { 225 | buffer_position: usize, 226 | source: quick_xml::Error, 227 | }, 228 | UnexpectedOpenTag(Vec), 229 | UnexpectedCloseTag(Vec), 230 | UnexpectedEmptyTag(Vec), 231 | UnexpectedText(usize), 232 | MultipleTranscripts, 233 | MalformedTranscriptTag(quick_xml::Error), 234 | MissingId, 235 | InvalidId(Utf8Error), 236 | MultipleSequences, 237 | InvalidSequence(InvalidBase), 238 | MultipleReactivities, 239 | InvalidReactivity(InvalidReactivity), 240 | MissingTranscript, 241 | MissingSequence, 242 | MissingReactivity, 243 | InconsistentLength { 244 | sequence: usize, 
245 | reactivity: usize, 246 | }, 247 | } 248 | 249 | impl Display for ReadFileError { 250 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 251 | match self { 252 | ReadFileError::ReaderFromFile(_) => { 253 | f.write_str("unable to create XML reader from file") 254 | } 255 | ReadFileError::ReadEvent { 256 | buffer_position, 257 | source: _, 258 | } => write!(f, "unable to read XML event at position {buffer_position}"), 259 | ReadFileError::UnexpectedOpenTag(tag) => write!( 260 | f, 261 | r#"unexpected opening tag "{}""#, 262 | String::from_utf8_lossy(tag) 263 | ), 264 | ReadFileError::UnexpectedCloseTag(tag) => write!( 265 | f, 266 | r#"unexpected closing tag "{}""#, 267 | String::from_utf8_lossy(tag), 268 | ), 269 | ReadFileError::UnexpectedEmptyTag(tag) => write!( 270 | f, 271 | r#"unexpected empty tag "{}""#, 272 | String::from_utf8_lossy(tag), 273 | ), 274 | ReadFileError::UnexpectedText(position) => { 275 | write!(f, "unexpected text content at position {position}") 276 | } 277 | ReadFileError::MultipleTranscripts => f.write_str("more than one transcript tag found"), 278 | ReadFileError::MalformedTranscriptTag(_) => { 279 | f.write_str("transcript tag has invalid or duplicated attributes") 280 | } 281 | ReadFileError::MissingId => { 282 | f.write_str(r#""id" attribute is missing from transcript tag"#) 283 | } 284 | ReadFileError::InvalidId(_) => f.write_str("transcript id is not a valid UTF-8 string"), 285 | ReadFileError::MultipleSequences => f.write_str("more than one sequence tag found"), 286 | ReadFileError::InvalidSequence(_) => f.write_str("sequence is invalid"), 287 | ReadFileError::MultipleReactivities => { 288 | f.write_str("more than one reactivity tag found") 289 | } 290 | ReadFileError::InvalidReactivity(_) => f.write_str("reactivity data is invalid"), 291 | ReadFileError::MissingTranscript => f.write_str("transcript tag is missing"), 292 | ReadFileError::MissingSequence => f.write_str("sequence tag is missing"), 293 | 
ReadFileError::MissingReactivity => f.write_str("reactivity tag is missing"), 294 | ReadFileError::InconsistentLength { 295 | sequence, 296 | reactivity, 297 | } => write!( 298 | f, 299 | "sequence length ({sequence}) is different from reactivity sequence {reactivity}" 300 | ), 301 | } 302 | } 303 | } 304 | 305 | impl StdError for ReadFileError { 306 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 307 | match self { 308 | ReadFileError::ReaderFromFile(source) | ReadFileError::ReadEvent { source, .. } => { 309 | Some(source) 310 | } 311 | 312 | ReadFileError::UnexpectedOpenTag(_) 313 | | ReadFileError::UnexpectedCloseTag(_) 314 | | ReadFileError::UnexpectedEmptyTag(_) 315 | | ReadFileError::UnexpectedText(_) 316 | | ReadFileError::MultipleTranscripts 317 | | ReadFileError::MissingId 318 | | ReadFileError::MultipleSequences 319 | | ReadFileError::MultipleReactivities 320 | | ReadFileError::MissingTranscript 321 | | ReadFileError::MissingSequence 322 | | ReadFileError::MissingReactivity 323 | | ReadFileError::InconsistentLength { .. 
} => None, 324 | 325 | ReadFileError::MalformedTranscriptTag(source) => Some(source), 326 | ReadFileError::InvalidId(source) => Some(source), 327 | ReadFileError::InvalidSequence(source) => Some(source), 328 | ReadFileError::InvalidReactivity(source) => Some(source), 329 | } 330 | } 331 | } 332 | 333 | fn parse_sequence(raw: &[u8]) -> Result, InvalidBase> { 334 | raw.iter() 335 | .filter(|c| c.is_ascii_whitespace().not()) 336 | .copied() 337 | .map(Base::try_from) 338 | .collect() 339 | } 340 | 341 | fn parse_reactivity(raw: &[u8]) -> Result, InvalidReactivity> { 342 | use InvalidReactivity as E; 343 | 344 | raw.split(|&c| c == b',') 345 | .map(|raw| { 346 | let raw = std::str::from_utf8(raw).map_err(E::Utf8)?.trim(); 347 | 348 | if raw == "NaN" { 349 | Ok(ReactivityWithPlaceholder::nan_placeholder()) 350 | } else { 351 | raw.parse::() 352 | .map(ReactivityWithPlaceholder::from) 353 | .map_err(InvalidReactivity::Value) 354 | } 355 | }) 356 | .collect() 357 | } 358 | 359 | #[derive(Debug)] 360 | pub enum InvalidReactivity { 361 | Utf8(Utf8Error), 362 | Value(ParseFloatError), 363 | } 364 | 365 | impl Display for InvalidReactivity { 366 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 367 | let s = match self { 368 | InvalidReactivity::Utf8(_) => "rectivity is not a valid UTF-8 string", 369 | InvalidReactivity::Value(_) => "unable to parse reactivity value", 370 | }; 371 | 372 | f.write_str(s) 373 | } 374 | } 375 | 376 | impl StdError for InvalidReactivity { 377 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 378 | match self { 379 | InvalidReactivity::Utf8(source) => Some(source), 380 | InvalidReactivity::Value(source) => Some(source), 381 | } 382 | } 383 | } 384 | 385 | pub fn read_directory(path: &Path) -> Result, ReadDirectoryError> { 386 | use ReadDirectoryError as E; 387 | 388 | path.read_dir() 389 | .map_err(E::Dir)? 
390 | .filter_map(|entry| { 391 | entry 392 | .map(|entry| { 393 | let path = entry.path(); 394 | let extension = path.extension()?; 395 | extension.eq_ignore_ascii_case("xml").then_some(path) 396 | }) 397 | .transpose() 398 | }) 399 | .par_bridge() 400 | .filter_map(|path| { 401 | let path = match path { 402 | Ok(path) => path, 403 | Err(err) => return Some(Err(E::DirEntry(err))), 404 | }; 405 | match read_file(&path) { 406 | Ok(entry) => Some(Ok(entry)), 407 | Err(err) => { 408 | eprintln!( 409 | "WARNING: unable to read XML path {}: {:#}", 410 | path.display(), 411 | anyhow::Error::from(err) 412 | ); 413 | None 414 | } 415 | } 416 | }) 417 | .collect() 418 | } 419 | 420 | #[derive(Debug)] 421 | pub enum ReadDirectoryError { 422 | Dir(io::Error), 423 | DirEntry(io::Error), 424 | } 425 | 426 | impl Display for ReadDirectoryError { 427 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 428 | match self { 429 | ReadDirectoryError::Dir(_) => f.write_str("unable to read directory"), 430 | ReadDirectoryError::DirEntry(_) => f.write_str("unable to read directory entry"), 431 | } 432 | } 433 | } 434 | 435 | impl StdError for ReadDirectoryError { 436 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 437 | match self { 438 | ReadDirectoryError::Dir(source) | ReadDirectoryError::DirEntry(source) => Some(source), 439 | } 440 | } 441 | } 442 | 443 | #[cfg(test)] 444 | mod tests { 445 | use std::{ 446 | fs, 447 | path::{Path, PathBuf}, 448 | sync::OnceLock, 449 | }; 450 | 451 | use tempfile::tempdir; 452 | 453 | use crate::{db_file::ReactivityWithPlaceholder, Base}; 454 | 455 | use super::{read_directory, read_file}; 456 | 457 | fn raw_xml_db_path() -> &'static Path { 458 | static RAW_XML_DB_PATH: OnceLock = OnceLock::new(); 459 | 460 | RAW_XML_DB_PATH.get_or_init(|| { 461 | let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); 462 | manifest_dir.join("test_data/test_db.xml") 463 | }) 464 | } 465 | 466 | #[test] 467 | fn read_valid_xml() { 468 | let entry 
= read_file(raw_xml_db_path()).unwrap(); 469 | assert_eq!(entry.id, "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S"); 470 | assert_eq!(entry.sequence.len(), 1800); 471 | assert_eq!( 472 | entry.sequence[..5], 473 | [Base::T, Base::A, Base::T, Base::C, Base::T] 474 | ); 475 | assert!(entry.reactivity[..37] 476 | .iter() 477 | .copied() 478 | .all(ReactivityWithPlaceholder::is_nan)); 479 | assert!((entry.reactivity[37].get_non_nan().unwrap() - 0.389).abs() < 0.001); 480 | } 481 | 482 | #[test] 483 | fn read_directory_ignores_non_xml_files() { 484 | let tempdir = tempdir().unwrap(); 485 | let temp_path = tempdir.path(); 486 | fs::write(temp_path.join("test.txt"), "hello world").unwrap(); 487 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap(); 488 | let entries = read_directory(temp_path).unwrap(); 489 | assert_eq!(entries.len(), 1); 490 | assert_eq!( 491 | entries[0].id, 492 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S", 493 | ); 494 | } 495 | 496 | #[test] 497 | fn read_directory_ignores_invalid_xml_files() { 498 | let tempdir = tempdir().unwrap(); 499 | let temp_path = tempdir.path(); 500 | let xml_file_path = temp_path.join("test.xml"); 501 | fs::write(xml_file_path, "invalid xml").unwrap(); 502 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap(); 503 | let entries = read_directory(temp_path).unwrap(); 504 | assert_eq!(entries.len(), 1); 505 | assert_eq!( 506 | entries[0].id, 507 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S", 508 | ); 509 | } 510 | } 511 | -------------------------------------------------------------------------------- /src/dotbracket.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cmp::Ordering, 3 | fmt::{self, Display}, 4 | ops::{Not, Range}, 5 | str::FromStr, 6 | }; 7 | 8 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 9 | pub struct DotBracket { 10 | paired_blocks: C, 11 | len: usize, 12 | } 13 | 14 | impl DotBracket 15 | where 16 | C: AsMut>, 17 | { 
18 | #[inline] 19 | pub fn from_str(dot_bracket: &str, paired_blocks_buffer: C) -> Result { 20 | Self::from_str_with_buffer(dot_bracket, paired_blocks_buffer, &mut Vec::new()) 21 | } 22 | 23 | #[inline] 24 | pub fn from_str_with_buffer( 25 | dot_bracket: &str, 26 | paired_blocks_buffer: C, 27 | working_buffer: &mut Vec, 28 | ) -> Result { 29 | Self::from_bytes_with_buffer(dot_bracket.as_bytes(), paired_blocks_buffer, working_buffer) 30 | } 31 | 32 | pub fn from_bytes_with_buffer( 33 | dot_bracket: &[u8], 34 | mut paired_blocks_buffer: C, 35 | working_buffer: &mut Vec, 36 | ) -> Result { 37 | let len = dot_bracket.len(); 38 | 39 | let paired_blocks_buffer_ref = paired_blocks_buffer.as_mut(); 40 | paired_blocks_buffer_ref.clear(); 41 | working_buffer.clear(); 42 | let state = dot_bracket 43 | .iter() 44 | .enumerate() 45 | .try_fold(None, |partial, (index, &c)| { 46 | try_fold_from_bytes(partial, index, c, paired_blocks_buffer_ref, working_buffer) 47 | })?; 48 | 49 | if working_buffer.is_empty().not() { 50 | return Err(InvalidDotBracket); 51 | } 52 | 53 | if let Some(state) = state { 54 | let PartialPairedBlockUnstored { 55 | left_start, 56 | other: 57 | Some(PartialPairedBlockOther { 58 | left_end, 59 | right_start, 60 | }), 61 | } = state 62 | else { 63 | return Err(InvalidDotBracket); 64 | }; 65 | 66 | let left = left_start..left_end; 67 | let right = right_start..dot_bracket.len(); 68 | if left.len() != right.len() { 69 | return Err(InvalidDotBracket); 70 | } 71 | 72 | paired_blocks_buffer_ref.push(PairedBlock { left, right }); 73 | } 74 | 75 | Ok(DotBracket { 76 | paired_blocks: paired_blocks_buffer, 77 | len, 78 | }) 79 | } 80 | 81 | #[inline] 82 | pub fn into_sorted(self) -> DotBracket { 83 | let Self { 84 | mut paired_blocks, 85 | len, 86 | } = self; 87 | paired_blocks 88 | .as_mut() 89 | .sort_unstable_by_key(|block| block.left.start); 90 | 91 | DotBracket { paired_blocks, len } 92 | } 93 | } 94 | 95 | fn try_fold_from_bytes( 96 | partial: Option, 97 | 
index: usize, 98 | c: u8, 99 | paired_blocks_buffer: &mut Vec, 100 | working_buffer: &mut Vec, 101 | ) -> Result, InvalidDotBracket> { 102 | match c { 103 | b'(' => Ok(Some(handle_opening_bracket( 104 | partial, 105 | index, 106 | paired_blocks_buffer, 107 | working_buffer, 108 | ))), 109 | 110 | b'.' => Ok(handle_dot( 111 | partial, 112 | index, 113 | paired_blocks_buffer, 114 | working_buffer, 115 | )), 116 | 117 | b')' => handle_closing_bracket( 118 | partial.as_ref(), 119 | index, 120 | paired_blocks_buffer, 121 | working_buffer, 122 | ), 123 | 124 | _ => Err(InvalidDotBracket), 125 | } 126 | } 127 | 128 | fn handle_opening_bracket( 129 | partial: Option, 130 | index: usize, 131 | paired_blocks_buffer: &mut Vec, 132 | working_buffer: &mut Vec, 133 | ) -> PartialPairedBlockUnstored { 134 | partial.map_or( 135 | PartialPairedBlockUnstored { 136 | left_start: index, 137 | other: None, 138 | }, 139 | |partial| match partial { 140 | partial @ PartialPairedBlockUnstored { 141 | left_start: _, 142 | other: None, 143 | } => partial, 144 | 145 | PartialPairedBlockUnstored { 146 | left_start, 147 | other: 148 | Some(PartialPairedBlockOther { 149 | left_end, 150 | right_start, 151 | }), 152 | } => { 153 | paired_blocks_buffer.push(handle_lr_paired_block( 154 | index, 155 | right_start, 156 | left_start, 157 | left_end, 158 | working_buffer, 159 | )); 160 | 161 | PartialPairedBlockUnstored { 162 | left_start: index, 163 | other: None, 164 | } 165 | } 166 | }, 167 | ) 168 | } 169 | 170 | fn handle_dot( 171 | partial: Option, 172 | index: usize, 173 | paired_blocks_buffer: &mut Vec, 174 | working_buffer: &mut Vec, 175 | ) -> Option { 176 | if let Some(partial) = partial { 177 | let PartialPairedBlockUnstored { left_start, other } = partial; 178 | match other { 179 | Some(PartialPairedBlockOther { 180 | left_end, 181 | right_start, 182 | }) => paired_blocks_buffer.push(handle_lr_paired_block( 183 | index, 184 | right_start, 185 | left_start, 186 | left_end, 187 | 
working_buffer, 188 | )), 189 | None => working_buffer.push(PartialPairedBlock { 190 | left: left_start..index, 191 | }), 192 | } 193 | } 194 | 195 | None 196 | } 197 | 198 | fn handle_closing_bracket( 199 | partial: Option<&PartialPairedBlockUnstored>, 200 | index: usize, 201 | paired_blocks_buffer: &mut Vec, 202 | working_buffer: &mut Vec, 203 | ) -> Result, InvalidDotBracket> { 204 | match partial { 205 | None => { 206 | let PartialPairedBlock { left } = working_buffer.pop().ok_or(InvalidDotBracket)?; 207 | if left.end - left.start == 1 { 208 | // Cannot use InclusiveRange here 209 | #[allow(clippy::range_plus_one)] 210 | let right = index..(index + 1); 211 | paired_blocks_buffer.push(PairedBlock { left, right }); 212 | Ok(None) 213 | } else { 214 | Ok(Some(PartialPairedBlockUnstored { 215 | left_start: left.start, 216 | other: Some(PartialPairedBlockOther { 217 | left_end: left.end, 218 | right_start: index, 219 | }), 220 | })) 221 | } 222 | } 223 | 224 | Some(&PartialPairedBlockUnstored { 225 | left_start, 226 | other: 227 | Some(PartialPairedBlockOther { 228 | left_end, 229 | right_start, 230 | }), 231 | }) => match (left_end - left_start).cmp(&(index + 1 - right_start)) { 232 | Ordering::Greater => Ok(Some(PartialPairedBlockUnstored { 233 | left_start, 234 | other: Some(PartialPairedBlockOther { 235 | left_end, 236 | right_start, 237 | }), 238 | })), 239 | 240 | Ordering::Equal => { 241 | let left = left_start..left_end; 242 | // Cannot use InclusiveRange here 243 | #[allow(clippy::range_plus_one)] 244 | let right = right_start..(index + 1); 245 | paired_blocks_buffer.push(PairedBlock { left, right }); 246 | 247 | Ok(None) 248 | } 249 | 250 | Ordering::Less => { 251 | panic!("invalid partial paired blocks status") 252 | } 253 | }, 254 | 255 | Some(&PartialPairedBlockUnstored { 256 | left_start, 257 | other: None, 258 | }) => Ok(Some(PartialPairedBlockUnstored { 259 | left_start, 260 | other: Some(PartialPairedBlockOther { 261 | left_end: index, 262 | 
right_start: index, 263 | }), 264 | })), 265 | } 266 | } 267 | 268 | fn handle_lr_paired_block( 269 | index: usize, 270 | right_start: usize, 271 | left_start: usize, 272 | left_end: usize, 273 | working_buffer: &mut Vec, 274 | ) -> PairedBlock { 275 | let right = right_start..index; 276 | 277 | let left_len = left_end - left_start; 278 | let right_len = index - right_start; 279 | let left = match left_len.cmp(&right_len) { 280 | Ordering::Greater => { 281 | let new_left_start = left_end - right_len; 282 | working_buffer.push(PartialPairedBlock { 283 | left: left_start..new_left_start, 284 | }); 285 | 286 | new_left_start..left_end 287 | } 288 | Ordering::Equal => left_start..left_end, 289 | Ordering::Less => unreachable!("invalid paired blocks"), 290 | }; 291 | 292 | PairedBlock { left, right } 293 | } 294 | 295 | impl DotBracket 296 | where 297 | C: AsRef<[PairedBlock]>, 298 | { 299 | #[inline] 300 | pub fn paired_blocks(&self) -> &[PairedBlock] { 301 | self.paired_blocks.as_ref() 302 | } 303 | 304 | pub fn to_owned(&self) -> DotBracket, SORTED> { 305 | let &Self { 306 | ref paired_blocks, 307 | len, 308 | } = self; 309 | let paired_blocks = paired_blocks.as_ref().to_owned(); 310 | DotBracket { paired_blocks, len } 311 | } 312 | } 313 | 314 | impl Display for DotBracket 315 | where 316 | C: AsRef<[PairedBlock]>, 317 | { 318 | // TODO: find a better implementation 319 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 320 | let &Self { 321 | ref paired_blocks, 322 | len, 323 | } = self; 324 | 325 | let mut buf = vec![b'.'; len]; 326 | for block in paired_blocks.as_ref() { 327 | buf[block.left().clone()].fill(b'('); 328 | buf[block.right().clone()].fill(b')'); 329 | } 330 | 331 | f.write_str(std::str::from_utf8(&buf).unwrap()) 332 | } 333 | } 334 | 335 | pub type DotBracketOwned = DotBracket, false>; 336 | pub type DotBracketOwnedSorted = DotBracket, true>; 337 | 338 | pub type DotBracketBuffered<'a> = DotBracket<&'a mut Vec, false>; 339 | 340 | impl 
FromStr for DotBracketOwned { 341 | type Err = InvalidDotBracket; 342 | 343 | #[inline] 344 | fn from_str(s: &str) -> Result { 345 | DotBracket::from_str(s, Vec::new()) 346 | } 347 | } 348 | 349 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 350 | pub struct InvalidDotBracket; 351 | 352 | impl Display for InvalidDotBracket { 353 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 354 | f.write_str("invalid dot-bracket notation string") 355 | } 356 | } 357 | 358 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 359 | pub struct PairedBlock { 360 | left: Range, 361 | right: Range, 362 | } 363 | 364 | #[derive(Debug)] 365 | #[doc(hidden)] 366 | pub struct PartialPairedBlock { 367 | left: Range, 368 | } 369 | 370 | #[derive(Debug)] 371 | struct PartialPairedBlockUnstored { 372 | left_start: usize, 373 | other: Option, 374 | } 375 | 376 | #[derive(Debug)] 377 | struct PartialPairedBlockOther { 378 | left_end: usize, 379 | right_start: usize, 380 | } 381 | 382 | impl PairedBlock { 383 | #[inline] 384 | pub fn left(&self) -> &Range { 385 | &self.left 386 | } 387 | 388 | #[inline] 389 | pub fn right(&self) -> &Range { 390 | &self.right 391 | } 392 | } 393 | 394 | #[cfg(test)] 395 | mod tests { 396 | use super::*; 397 | 398 | const STEM_LOOP_DB: &str = "...(((((((((....)))))))))"; 399 | fn test_stem_loop(db: &DotBracket) 400 | where 401 | C: AsRef<[PairedBlock]> + fmt::Debug, 402 | { 403 | assert_eq!(db.len, 25); 404 | assert_eq!( 405 | db.paired_blocks.as_ref(), 406 | [PairedBlock { 407 | left: 3..12, 408 | right: 16..25, 409 | }], 410 | ); 411 | } 412 | 413 | #[test] 414 | fn simple_stem_loop_owned() { 415 | let db: DotBracketOwned = STEM_LOOP_DB.parse().unwrap(); 416 | test_stem_loop(&db); 417 | } 418 | 419 | #[test] 420 | fn simple_stem_loop_buffered() { 421 | let mut buffer = vec![]; 422 | let db = DotBracketBuffered::from_str(STEM_LOOP_DB, &mut buffer).unwrap(); 423 | test_stem_loop(&db); 424 | } 425 | 426 | #[test] 427 | fn multiple_stem_loop() { 428 | 
let db: DotBracketOwned = "...((((....))))..(((....))).....((....)).." 429 | .parse() 430 | .unwrap(); 431 | assert_eq!(db.len, 42); 432 | assert_eq!( 433 | db.paired_blocks, 434 | [ 435 | PairedBlock { 436 | left: 3..7, 437 | right: 11..15, 438 | }, 439 | PairedBlock { 440 | left: 17..20, 441 | right: 24..27, 442 | }, 443 | PairedBlock { 444 | left: 32..34, 445 | right: 38..40, 446 | }, 447 | ], 448 | ); 449 | } 450 | 451 | #[test] 452 | fn tight_loop() { 453 | let db: DotBracketOwned = "(((())))".parse().unwrap(); 454 | assert_eq!(db.len, 8); 455 | assert_eq!( 456 | db.paired_blocks, 457 | [PairedBlock { 458 | left: 0..4, 459 | right: 4..8, 460 | }], 461 | ); 462 | } 463 | 464 | #[test] 465 | fn simple_nested_loop_of_one_bp() { 466 | let db: DotBracketOwned = "(.(..))".parse().unwrap(); 467 | assert_eq!(db.len, 7); 468 | assert_eq!( 469 | db.paired_blocks, 470 | [ 471 | PairedBlock { 472 | left: 2..3, 473 | right: 5..6, 474 | }, 475 | PairedBlock { 476 | left: 0..1, 477 | right: 6..7, 478 | }, 479 | ], 480 | ); 481 | } 482 | 483 | #[test] 484 | fn nested_stem_loop_left() { 485 | let db: DotBracketOwned = "((((.(((((...)))..))...))))..".parse().unwrap(); 486 | assert_eq!(db.len, 29); 487 | assert_eq!( 488 | db.paired_blocks, 489 | [ 490 | PairedBlock { 491 | left: 7..10, 492 | right: 13..16, 493 | }, 494 | PairedBlock { 495 | left: 5..7, 496 | right: 18..20, 497 | }, 498 | PairedBlock { 499 | left: 0..4, 500 | right: 23..27, 501 | }, 502 | ], 503 | ); 504 | 505 | assert_eq!( 506 | db.into_sorted(), 507 | DotBracket::<_, true> { 508 | len: 29, 509 | paired_blocks: vec![ 510 | PairedBlock { 511 | left: 0..4, 512 | right: 23..27, 513 | }, 514 | PairedBlock { 515 | left: 5..7, 516 | right: 18..20, 517 | }, 518 | PairedBlock { 519 | left: 7..10, 520 | right: 13..16, 521 | }, 522 | ], 523 | } 524 | ); 525 | } 526 | 527 | #[test] 528 | fn nested_stem_loop_right() { 529 | let db: DotBracketOwned = "((((.((..(((...)))))...))))..".parse().unwrap(); 530 | assert_eq!(db.len, 
29); 531 | assert_eq!( 532 | db.paired_blocks, 533 | [ 534 | PairedBlock { 535 | left: 9..12, 536 | right: 15..18, 537 | }, 538 | PairedBlock { 539 | left: 5..7, 540 | right: 18..20, 541 | }, 542 | PairedBlock { 543 | left: 0..4, 544 | right: 23..27, 545 | }, 546 | ], 547 | ); 548 | 549 | assert_eq!( 550 | db.into_sorted(), 551 | DotBracket::<_, true> { 552 | len: 29, 553 | paired_blocks: vec![ 554 | PairedBlock { 555 | left: 0..4, 556 | right: 23..27, 557 | }, 558 | PairedBlock { 559 | left: 5..7, 560 | right: 18..20, 561 | }, 562 | PairedBlock { 563 | left: 9..12, 564 | right: 15..18, 565 | }, 566 | ], 567 | }, 568 | ); 569 | } 570 | 571 | #[test] 572 | fn ending_with_state() { 573 | let db: DotBracketOwned = "(.((..)))".parse().unwrap(); 574 | assert_eq!(db.len, 9); 575 | assert_eq!( 576 | db.paired_blocks, 577 | [ 578 | PairedBlock { 579 | left: 2..4, 580 | right: 6..8, 581 | }, 582 | PairedBlock { 583 | left: 0..1, 584 | right: 8..9, 585 | }, 586 | ], 587 | ); 588 | 589 | assert_eq!( 590 | db.into_sorted(), 591 | DotBracket::<_, true> { 592 | len: 9, 593 | paired_blocks: vec![ 594 | PairedBlock { 595 | left: 0..1, 596 | right: 8..9, 597 | }, 598 | PairedBlock { 599 | left: 2..4, 600 | right: 6..8, 601 | }, 602 | ], 603 | }, 604 | ); 605 | } 606 | 607 | #[test] 608 | fn new_loop_after_unstored_block() { 609 | let db: DotBracketOwned = "((.((..)))(((..))))".parse().unwrap(); 610 | assert_eq!(db.len, 19); 611 | assert_eq!( 612 | db.paired_blocks, 613 | [ 614 | PairedBlock { 615 | left: 3..5, 616 | right: 7..9, 617 | }, 618 | PairedBlock { 619 | left: 1..2, 620 | right: 9..10, 621 | }, 622 | PairedBlock { 623 | left: 10..13, 624 | right: 15..18, 625 | }, 626 | PairedBlock { 627 | left: 0..1, 628 | right: 18..19, 629 | }, 630 | ], 631 | ); 632 | 633 | assert_eq!( 634 | db.into_sorted(), 635 | DotBracket::<_, true> { 636 | len: 19, 637 | paired_blocks: vec![ 638 | PairedBlock { 639 | left: 0..1, 640 | right: 18..19, 641 | }, 642 | PairedBlock { 643 | left: 1..2, 644 
| right: 9..10, 645 | }, 646 | PairedBlock { 647 | left: 3..5, 648 | right: 7..9, 649 | }, 650 | PairedBlock { 651 | left: 10..13, 652 | right: 15..18, 653 | }, 654 | ], 655 | }, 656 | ); 657 | } 658 | } 659 | -------------------------------------------------------------------------------- /src/fasta.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fmt, 3 | fs::File, 4 | io::{self, BufWriter}, 5 | path::Path, 6 | }; 7 | 8 | use anyhow::Context; 9 | 10 | use crate::{ 11 | aligner::{AlignedSequence, BaseOrGap}, 12 | db_file, query_file, QueryResult, ResultFileFormat, Sequence, SequenceEntry, 13 | }; 14 | 15 | pub(crate) struct Entry<'a> { 16 | pub(crate) description: &'a str, 17 | pub(crate) sequence: Sequence<'a>, 18 | pub(crate) alignment: Option<&'a AlignedSequence>, 19 | } 20 | 21 | impl fmt::Display for Entry<'_> { 22 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 23 | writeln!(f, ">{}", self.description)?; 24 | match self.alignment { 25 | Some(alignment) => { 26 | let mut sequence = self.sequence.bases.iter(); 27 | for base_or_gap in &alignment.0 { 28 | match base_or_gap { 29 | BaseOrGap::Base => match sequence.next() { 30 | Some(base) => write!(f, "{}", base.display(self.sequence.molecule))?, 31 | None => break, 32 | }, 33 | BaseOrGap::Gap => f.write_str("-")?, 34 | } 35 | } 36 | sequence.try_for_each(|base| write!(f, "{}", base.display(self.sequence.molecule))) 37 | } 38 | None => write!(f, "{}", self.sequence), 39 | } 40 | } 41 | } 42 | 43 | pub(crate) fn write_result( 44 | result: &QueryResult, 45 | db_entries: &[db_file::Entry], 46 | query_entries: &[query_file::Entry], 47 | alignments_path: &Path, 48 | ) -> Result<(), anyhow::Error> { 49 | let fasta_path = alignments_path.join(result_filename(result)); 50 | let file = File::create(fasta_path).context("Unable to create FASTA file")?; 51 | let writer = BufWriter::new(file); 52 | 53 | write_result_to_writer(result, db_entries, 
query_entries, writer) 54 | } 55 | 56 | fn result_filename(result: &QueryResult) -> String { 57 | format!("{}.fasta", ResultFileFormat::from(result)) 58 | } 59 | 60 | #[inline] 61 | fn write_result_to_writer( 62 | result: &QueryResult, 63 | db_entries: &[db_file::Entry], 64 | query_entries: &[query_file::Entry], 65 | mut writer: W, 66 | ) -> Result<(), anyhow::Error> { 67 | let &QueryResult { 68 | ref query, 69 | ref db_entry, 70 | query_start, 71 | query_end, 72 | db_start, 73 | db_end, 74 | ref alignment, 75 | .. 76 | } = result; 77 | 78 | let db_entry = db_entries 79 | .iter() 80 | .find(|entry| entry.name() == db_entry) 81 | .expect("db entry should be available"); 82 | let query_entry = query_entries 83 | .iter() 84 | .find(|entry| entry.name() == &**query) 85 | .expect("query entry should be available"); 86 | 87 | let db_sequence = Sequence { 88 | bases: &db_entry.sequence()[db_start..=db_end], 89 | molecule: db_entry.molecule(), 90 | }; 91 | let query_sequence = Sequence { 92 | bases: &query_entry.sequence()[query_start..=query_end], 93 | molecule: query_entry.molecule(), 94 | }; 95 | 96 | writeln!( 97 | writer, 98 | "{}\n{}", 99 | Entry { 100 | description: db_entry.name(), 101 | sequence: db_sequence, 102 | alignment: Some(&alignment.target), 103 | }, 104 | Entry { 105 | description: query_entry.name(), 106 | sequence: query_sequence, 107 | alignment: Some(&alignment.query), 108 | } 109 | ) 110 | .context("Unable to write to FASTA file")?; 111 | 112 | Ok::<_, anyhow::Error>(()) 113 | } 114 | 115 | #[cfg(test)] 116 | mod test { 117 | use std::sync::Arc; 118 | 119 | use crate::{aligner::BaseOrGap, query_result, Molecule}; 120 | 121 | use super::*; 122 | 123 | #[test] 124 | fn write_result() { 125 | let query = query_file::read_file(Path::new("./test_data/query.txt")).unwrap(); 126 | let db = db_file::read_db(Path::new("./test_data/test.db")).unwrap(); 127 | let query_name = query[0].name().into(); 128 | let db_name = db[0].name().to_owned(); 129 | 130 | let 
query_result = QueryResult { 131 | query: query_name, 132 | db_entry: db_name, 133 | query_start: 5, 134 | query_end: 10, 135 | db_start: 15, 136 | db_end: 20, 137 | query_seed: query_result::Range(0..=10), 138 | db_seed: query_result::Range(0..=10), 139 | score: 0., 140 | pvalue: 0., 141 | evalue: 0., 142 | target_bp_support: Option::default(), 143 | query_bp_support: Option::default(), 144 | mfe_pvalue: Option::default(), 145 | status: query_result::Status::PassInclusionEvalue, 146 | alignment: Arc::default(), 147 | dotbracket: Option::default(), 148 | }; 149 | 150 | let mut writer = vec![]; 151 | write_result_to_writer(&query_result, &db, &query, &mut writer).unwrap(); 152 | let written = String::from_utf8(writer).unwrap(); 153 | assert_eq!(written, ">16S_Bsubtilis\nGATCCT\n>16S_750\nCTCAGG\n"); 154 | } 155 | 156 | #[test] 157 | fn write_result_rna() { 158 | let mut query = query_file::read_file(Path::new("./test_data/query.txt")).unwrap(); 159 | query[0].molecule = Molecule::Rna; 160 | let db = db_file::read_db(Path::new("./test_data/test.db")).unwrap(); 161 | let query_name = query[0].name().into(); 162 | let db_name = db[0].name().to_owned(); 163 | 164 | let query_result = QueryResult { 165 | query: query_name, 166 | db_entry: db_name, 167 | query_start: 5, 168 | query_end: 10, 169 | db_start: 15, 170 | db_end: 20, 171 | query_seed: query_result::Range(0..=10), 172 | db_seed: query_result::Range(0..=10), 173 | score: 0., 174 | pvalue: 0., 175 | evalue: 0., 176 | target_bp_support: Option::default(), 177 | query_bp_support: Option::default(), 178 | mfe_pvalue: Option::default(), 179 | status: query_result::Status::PassInclusionEvalue, 180 | alignment: Arc::default(), 181 | dotbracket: Option::default(), 182 | }; 183 | 184 | let mut writer = vec![]; 185 | write_result_to_writer(&query_result, &db, &query, &mut writer).unwrap(); 186 | let written = String::from_utf8(writer).unwrap(); 187 | assert_eq!(written, ">16S_Bsubtilis\nGATCCT\n>16S_750\nCUCAGG\n"); 188 | 
} 189 | 190 | #[test] 191 | fn result_filename() { 192 | let query = query_file::read_file(Path::new("./test_data/query.txt")).unwrap(); 193 | let db = db_file::read_db(Path::new("./test_data/test.db")).unwrap(); 194 | let query_name = query[0].name().into(); 195 | let db_name = db[0].name().to_owned(); 196 | 197 | let query_result = QueryResult { 198 | query: query_name, 199 | db_entry: db_name, 200 | query_start: 5, 201 | query_end: 10, 202 | db_start: 15, 203 | db_end: 20, 204 | query_seed: query_result::Range(0..=10), 205 | db_seed: query_result::Range(0..=10), 206 | score: 0., 207 | pvalue: 0., 208 | evalue: 0., 209 | target_bp_support: Option::default(), 210 | query_bp_support: Option::default(), 211 | mfe_pvalue: Option::default(), 212 | status: query_result::Status::PassInclusionEvalue, 213 | alignment: Arc::default(), 214 | dotbracket: Option::default(), 215 | }; 216 | 217 | assert_eq!( 218 | super::result_filename(&query_result), 219 | "16S_Bsubtilis_15-20_16S_750_5-10.fasta" 220 | ); 221 | } 222 | 223 | #[test] 224 | fn display_aligned_entry() { 225 | use crate::Base::*; 226 | use BaseOrGap::*; 227 | 228 | let alignment = AlignedSequence(vec![Base, Base, Gap, Base, Gap, Gap, Base]); 229 | let entry = Entry { 230 | description: "test", 231 | sequence: Sequence { 232 | bases: &[A, C, T, G, A, A], 233 | molecule: crate::Molecule::Dna, 234 | }, 235 | alignment: Some(&alignment), 236 | }; 237 | 238 | assert_eq!(entry.to_string(), ">test\nAC-T--GAA"); 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /src/gapped_reactivity.rs: -------------------------------------------------------------------------------- 1 | use std::{iter, ops::Not, slice}; 2 | 3 | use serde::{ser::SerializeSeq, Serialize, Serializer}; 4 | 5 | use crate::{ 6 | aligner::{AlignedSequenceRef, BaseOrGap}, 7 | db_file::ReactivityLike, 8 | }; 9 | 10 | #[derive(Clone, Copy, Debug, PartialEq)] 11 | pub(crate) struct GappedReactivity<'a, T> { 12 | 
pub(crate) reactivity: &'a [T], 13 | pub(crate) alignment: AlignedSequenceRef<'a>, 14 | } 15 | 16 | pub(crate) trait GappedReactivityLike { 17 | type AlignmentIter<'a>: Iterator + 'a 18 | where 19 | Self: 'a; 20 | 21 | type ReactivityIter<'a>: Iterator + 'a 22 | where 23 | Self: 'a; 24 | 25 | fn alignment(&self) -> Self::AlignmentIter<'_>; 26 | fn reactivity(&self) -> Self::ReactivityIter<'_>; 27 | } 28 | 29 | impl Serialize for GappedReactivity<'_, T> 30 | where 31 | T: ReactivityLike + Serialize, 32 | { 33 | fn serialize(&self, serializer: S) -> Result 34 | where 35 | S: Serializer, 36 | { 37 | let mut usable_alignment = 0; 38 | let gaps = self 39 | .alignment 40 | .0 41 | .iter() 42 | .scan(0, |bases, base| { 43 | if *bases == self.reactivity.len() { 44 | None 45 | } else { 46 | usable_alignment += 1; 47 | if let BaseOrGap::Base = base { 48 | *bases += 1; 49 | } 50 | Some(base) 51 | } 52 | }) 53 | .filter(|base| matches!(base, BaseOrGap::Gap)) 54 | .count(); 55 | let usable_alignment = usable_alignment; 56 | let len = self.reactivity.len() + gaps; 57 | let mut seq = serializer.serialize_seq(Some(len))?; 58 | 59 | let mut reactivities = self.reactivity.iter(); 60 | for base_or_gap in &self.alignment.0[..usable_alignment] { 61 | match base_or_gap { 62 | BaseOrGap::Base => match reactivities.next() { 63 | Some(reactivity) if reactivity.is_nan() => seq.serialize_element("NaN")?, 64 | Some(reactivity) => seq.serialize_element(reactivity)?, 65 | None => break, 66 | }, 67 | BaseOrGap::Gap => seq.serialize_element(&f32::NAN)?, 68 | } 69 | } 70 | 71 | reactivities.try_for_each(|reactivity| seq.serialize_element(reactivity))?; 72 | seq.end() 73 | } 74 | } 75 | 76 | impl GappedReactivityLike for GappedReactivity<'_, T> 77 | where 78 | T: Copy, 79 | { 80 | type AlignmentIter<'a> = iter::Copied> 81 | where 82 | Self: 'a; 83 | 84 | type ReactivityIter<'a> = iter::Copied> 85 | where 86 | Self: 'a; 87 | 88 | #[inline] 89 | fn alignment(&self) -> Self::AlignmentIter<'_> { 90 | 
self.alignment.0.iter().copied() 91 | } 92 | 93 | #[inline] 94 | fn reactivity(&self) -> Self::ReactivityIter<'_> { 95 | self.reactivity.iter().copied() 96 | } 97 | } 98 | 99 | impl<'a, T> IntoIterator for &'a GappedReactivity<'a, T> 100 | where 101 | T: Copy, 102 | { 103 | type Item = GappedReactivityValue; 104 | type IntoIter = GappedReactivityIter<'a, T>; 105 | 106 | #[inline] 107 | fn into_iter(self) -> Self::IntoIter { 108 | let reactivity = self.reactivity.iter(); 109 | let alignment = self.alignment.0.iter(); 110 | GappedReactivityIter { 111 | reactivity, 112 | alignment, 113 | } 114 | } 115 | } 116 | 117 | impl<'a, T> IntoIterator for GappedReactivity<'a, T> 118 | where 119 | T: Copy, 120 | { 121 | type Item = GappedReactivityValue; 122 | type IntoIter = GappedReactivityIter<'a, T>; 123 | 124 | #[inline] 125 | fn into_iter(self) -> Self::IntoIter { 126 | let reactivity = self.reactivity.iter(); 127 | let alignment = self.alignment.0.iter(); 128 | GappedReactivityIter { 129 | reactivity, 130 | alignment, 131 | } 132 | } 133 | } 134 | 135 | pub(crate) struct GappedReactivityIter<'a, T> { 136 | reactivity: slice::Iter<'a, T>, 137 | alignment: slice::Iter<'a, BaseOrGap>, 138 | } 139 | 140 | #[derive(Clone, Copy, Debug, PartialEq)] 141 | pub(crate) enum GappedReactivityValue { 142 | Reactivity(T), 143 | Gap, 144 | } 145 | 146 | impl Iterator for GappedReactivityIter<'_, T> 147 | where 148 | T: Copy, 149 | { 150 | type Item = GappedReactivityValue; 151 | 152 | #[inline] 153 | fn next(&mut self) -> Option { 154 | match self.alignment.next()? 
{ 155 | BaseOrGap::Base => self 156 | .reactivity 157 | .next() 158 | .copied() 159 | .map(GappedReactivityValue::Reactivity), 160 | 161 | BaseOrGap::Gap => self 162 | .reactivity 163 | .as_slice() 164 | .is_empty() 165 | .not() 166 | .then_some(GappedReactivityValue::Gap), 167 | } 168 | } 169 | 170 | fn size_hint(&self) -> (usize, Option) { 171 | let reactivity = self.reactivity.size_hint(); 172 | let alignment = self.alignment.size_hint(); 173 | 174 | ( 175 | reactivity.0.min(alignment.0), 176 | reactivity 177 | .1 178 | .map(|bases| alignment.1.map_or(bases, |alignment| bases.max(alignment))) 179 | .or(alignment.1), 180 | ) 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/gapped_sequence.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | ffi::CString, 3 | fmt::{self, Display}, 4 | ops::{Not, Range}, 5 | slice, 6 | }; 7 | 8 | use crate::{ 9 | aligner::{AlignedSequence, AlignedSequenceRef, BaseOrGap}, 10 | Base, Molecule, Sequence, 11 | }; 12 | 13 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 14 | pub(crate) struct GappedSequence<'a> { 15 | pub(crate) sequence: Sequence<'a>, 16 | pub(crate) alignment: AlignedSequenceRef<'a>, 17 | } 18 | 19 | pub(crate) trait GappedSequenceLike { 20 | fn to_cstring(&self, molecule: Option) -> CString; 21 | } 22 | 23 | impl<'a> GappedSequence<'a> { 24 | pub(crate) fn new(sequence: Sequence<'a>, alignment: &'a AlignedSequence) -> Self { 25 | let alignment = alignment.to_ref(); 26 | Self { 27 | sequence, 28 | alignment, 29 | } 30 | } 31 | 32 | #[inline] 33 | pub(crate) fn iter(&self) -> GappedSequenceIter<'_> { 34 | IntoIterator::into_iter(self) 35 | } 36 | 37 | pub(crate) fn get(&self, index: Range) -> Option> { 38 | let start = index.start; 39 | self.alignment.0.get(index).map(|alignment| { 40 | let bases_before = self.alignment.0[..start] 41 | .iter() 42 | .filter(|base_or_gap| base_or_gap.is_base()) 43 | .count(); 
44 | let bases = alignment 45 | .iter() 46 | .filter(|base_or_gap| base_or_gap.is_base()) 47 | .count(); 48 | 49 | let bases = &self.sequence.bases[bases_before..(bases_before + bases)]; 50 | let sequence = Sequence { 51 | bases, 52 | molecule: self.sequence.molecule, 53 | }; 54 | let alignment = AlignedSequenceRef(alignment); 55 | 56 | GappedSequence { 57 | sequence, 58 | alignment, 59 | } 60 | }) 61 | } 62 | } 63 | 64 | impl GappedSequenceLike for GappedSequence<'_> { 65 | #[inline] 66 | fn to_cstring(&self, molecule: Option) -> CString { 67 | // Rough estimation 68 | let estimated_len = self.alignment.0.len().max(self.sequence.bases.len()) + 1; 69 | let molecule = molecule.unwrap_or(self.sequence.molecule); 70 | sequence_cstring_from_iter( 71 | self.sequence.bases.iter().copied(), 72 | self.alignment.0.iter().copied(), 73 | molecule, 74 | estimated_len, 75 | ) 76 | } 77 | } 78 | 79 | pub(crate) fn sequence_cstring_from_iter( 80 | sequence: S, 81 | base_or_gap: B, 82 | molecule: Molecule, 83 | estimated_len: usize, 84 | ) -> CString 85 | where 86 | S: Iterator, 87 | B: Iterator, 88 | { 89 | let mut chars = Vec::with_capacity(estimated_len); 90 | let mut sequence = sequence.map(|base| base.to_byte(molecule)); 91 | let iter = base_or_gap.filter_map(|alignment| match alignment { 92 | BaseOrGap::Base => sequence.next(), 93 | BaseOrGap::Gap => Some(b'-'), 94 | }); 95 | chars.extend(iter); 96 | chars.extend(sequence); 97 | chars.push(b'\0'); 98 | 99 | CString::from_vec_with_nul(chars).unwrap() 100 | } 101 | 102 | impl<'a> IntoIterator for &'a GappedSequence<'a> { 103 | type Item = StatefulBaseOrGap; 104 | type IntoIter = GappedSequenceIter<'a>; 105 | 106 | #[inline] 107 | fn into_iter(self) -> Self::IntoIter { 108 | let GappedSequence { 109 | sequence, 110 | alignment, 111 | } = self; 112 | 113 | let bases = sequence.bases.iter(); 114 | let alignment = alignment.0.iter(); 115 | GappedSequenceIter { bases, alignment } 116 | } 117 | } 118 | 119 | impl<'a> IntoIterator 
for GappedSequence<'a> { 120 | type Item = StatefulBaseOrGap; 121 | type IntoIter = GappedSequenceIter<'a>; 122 | 123 | #[inline] 124 | fn into_iter(self) -> Self::IntoIter { 125 | let GappedSequence { 126 | sequence, 127 | alignment, 128 | } = self; 129 | 130 | let bases = sequence.bases.iter(); 131 | let alignment = alignment.0.iter(); 132 | GappedSequenceIter { bases, alignment } 133 | } 134 | } 135 | 136 | impl Display for GappedSequence<'_> { 137 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 138 | use fmt::Write; 139 | 140 | self.iter().try_for_each(|b| match b { 141 | StatefulBaseOrGap::Base(b) => write!(f, "{}", b.display(self.sequence.molecule)), 142 | StatefulBaseOrGap::Gap => f.write_char('-'), 143 | }) 144 | } 145 | } 146 | 147 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 148 | pub(crate) enum StatefulBaseOrGap { 149 | Base(Base), 150 | Gap, 151 | } 152 | 153 | impl StatefulBaseOrGap { 154 | #[inline] 155 | pub(crate) fn to_base(self) -> Option { 156 | match self { 157 | Self::Base(base) => Some(base), 158 | Self::Gap => None, 159 | } 160 | } 161 | } 162 | 163 | pub(crate) struct GappedSequenceIter<'a> { 164 | bases: slice::Iter<'a, Base>, 165 | alignment: slice::Iter<'a, BaseOrGap>, 166 | } 167 | 168 | impl Iterator for GappedSequenceIter<'_> { 169 | type Item = StatefulBaseOrGap; 170 | 171 | #[inline] 172 | fn next(&mut self) -> Option { 173 | match self.alignment.next()? 
{ 174 | BaseOrGap::Base => self.bases.next().copied().map(StatefulBaseOrGap::Base), 175 | BaseOrGap::Gap => self 176 | .bases 177 | .as_slice() 178 | .is_empty() 179 | .not() 180 | .then_some(StatefulBaseOrGap::Gap), 181 | } 182 | } 183 | 184 | fn size_hint(&self) -> (usize, Option) { 185 | let bases = self.bases.size_hint(); 186 | let alignment = self.alignment.size_hint(); 187 | 188 | ( 189 | bases.0.min(alignment.0), 190 | bases 191 | .1 192 | .map(|bases| alignment.1.map_or(bases, |alignment| bases.max(alignment))) 193 | .or(alignment.1), 194 | ) 195 | } 196 | } 197 | 198 | impl DoubleEndedIterator for GappedSequenceIter<'_> { 199 | fn next_back(&mut self) -> Option { 200 | match self.alignment.next_back()? { 201 | BaseOrGap::Base => self.bases.next_back().copied().map(StatefulBaseOrGap::Base), 202 | BaseOrGap::Gap => self 203 | .bases 204 | .as_slice() 205 | .is_empty() 206 | .not() 207 | .then_some(StatefulBaseOrGap::Gap), 208 | } 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/handle_query_entry.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cmp::Reverse, 3 | ops::{Not, RangeInclusive}, 4 | sync::Arc, 5 | }; 6 | 7 | use rand::{rngs::ThreadRng, thread_rng}; 8 | use statrs::distribution::{self, ContinuousCDF}; 9 | 10 | use crate::{ 11 | alifold_mfe, alifold_on_result, 12 | aligner::{AlignedSequence, BacktrackBehavior, NoOpBehavior}, 13 | cli::Cli, 14 | dotbracket::{self, DotBracketOwnedSorted}, 15 | iter::IterWithRestExt, 16 | norm_dist::NormDist, 17 | null_model::ExtremeDistribution, 18 | query_aligner::{align_query_to_target_db, QueryAlignResult}, 19 | query_file, query_result, reuse_vec, AlifoldOnResult, HandlerData, MutableHandlerData, 20 | QueryResult, SequenceEntry, SharedHandlerData, 21 | }; 22 | 23 | pub(super) fn handle_query_entry<'a>( 24 | query_entry: &'a query_file::Entry, 25 | query_entry_orig: &'a query_file::Entry, 26 | 
handler_data: HandlerData<'a>, 27 | ) -> anyhow::Result> { 28 | let HandlerData { 29 | shared: 30 | SharedHandlerData { 31 | cli, 32 | db_entries, 33 | db_entries_orig, 34 | db_entries_shuffled, 35 | }, 36 | mutable: 37 | MutableHandlerData { 38 | mut aligner, 39 | mut null_all_scores, 40 | mut null_scores, 41 | mut query_all_results, 42 | mut reusable_query_results, 43 | mut index_to_remove, 44 | mut results, 45 | }, 46 | } = handler_data; 47 | 48 | let null_aligner = align_query_to_target_db::( 49 | query_entry, 50 | db_entries_shuffled, 51 | db_entries_shuffled, 52 | &mut null_all_scores, 53 | cli, 54 | )?; 55 | null_scores.clear(); 56 | null_scores.extend( 57 | null_aligner 58 | .into_iter(&null_all_scores, &mut aligner) 59 | .take(cli.null_hsgs.try_into().unwrap_or(usize::MAX)) 60 | .map(|query_align_result| query_align_result.score), 61 | ); 62 | let null_distribution = ExtremeDistribution::from_sample(&null_scores); 63 | 64 | let mut query_results = reuse_vec(reusable_query_results); 65 | query_results.extend( 66 | align_query_to_target_db::( 67 | query_entry, 68 | db_entries, 69 | db_entries_orig, 70 | &mut query_all_results, 71 | cli, 72 | )? 73 | .into_iter(&query_all_results, &mut aligner) 74 | .map(|result| { 75 | assert_eq!( 76 | result.alignment.query.0.len(), 77 | result.alignment.target.0.len() 78 | ); 79 | 80 | assert_eq!( 81 | result.db.end() - result.db.start() 82 | + result 83 | .alignment 84 | .target 85 | .0 86 | .iter() 87 | .filter(|bog| bog.is_base().not()) 88 | .count(), 89 | result.query.end() - result.query.start() 90 | + result 91 | .alignment 92 | .query 93 | .0 94 | .iter() 95 | .filter(|bog| bog.is_base().not()) 96 | .count(), 97 | ); 98 | 99 | let p_value = null_distribution.p_value(result.score); 100 | 101 | // FIXME: we need to avoid this clone 102 | (result.clone(), p_value, 0.) 
103 | }), 104 | ); 105 | 106 | // In case of precision loss, it is still ok to evaluate the e_value 107 | #[allow(clippy::cast_precision_loss)] 108 | let results_len = query_results.len() as f64; 109 | let report_evalue = cli.report_evalue.max(cli.inclusion_evalue); 110 | query_results.retain_mut(|(_, p_value, e_value)| { 111 | *e_value = *p_value * results_len; 112 | *e_value <= report_evalue 113 | }); 114 | remove_overlapping_results(&mut query_results, &mut index_to_remove, cli); 115 | 116 | let mut query_results_handler = QueryResultHandler::new( 117 | Arc::clone(&query_entry.name), 118 | query_entry, 119 | query_entry_orig, 120 | cli, 121 | ); 122 | results.extend( 123 | query_results 124 | .iter() 125 | .map(|&(ref result, pvalue, evalue)| query_results_handler.run(result, pvalue, evalue)), 126 | ); 127 | 128 | reusable_query_results = reuse_vec(query_results); 129 | Ok(MutableHandlerData { 130 | aligner, 131 | null_all_scores, 132 | null_scores, 133 | query_all_results, 134 | reusable_query_results, 135 | index_to_remove, 136 | results, 137 | }) 138 | } 139 | 140 | struct QueryResultHandler<'a> { 141 | query: Arc, 142 | query_entry: &'a query_file::Entry, 143 | query_entry_orig: &'a query_file::Entry, 144 | cli: &'a Cli, 145 | dotbracket_results_buffer: Vec, 146 | dotbracket_temp_buffer: Vec, 147 | null_model_energies: Vec, 148 | rng: ThreadRng, 149 | } 150 | 151 | impl<'a> QueryResultHandler<'a> { 152 | fn new( 153 | query: Arc, 154 | query_entry: &'a query_file::Entry, 155 | query_entry_orig: &'a query_file::Entry, 156 | cli: &'a Cli, 157 | ) -> Self { 158 | Self { 159 | query, 160 | query_entry, 161 | query_entry_orig, 162 | cli, 163 | dotbracket_results_buffer: Vec::new(), 164 | dotbracket_temp_buffer: Vec::new(), 165 | null_model_energies: Vec::new(), 166 | rng: thread_rng(), 167 | } 168 | } 169 | 170 | fn run( 171 | &mut self, 172 | result: &QueryAlignResult, 173 | pvalue: f64, 174 | exp_value: f64, 175 | ) -> QueryResult { 176 | let mut status = if 
exp_value > self.cli.report_evalue { 177 | query_result::Status::NotPass 178 | } else if exp_value > self.cli.inclusion_evalue { 179 | query_result::Status::PassReportEvalue 180 | } else { 181 | query_result::Status::PassInclusionEvalue 182 | }; 183 | 184 | let (mfe_pvalue_dotbracket, bp_support) = self.get_mfe_data(result, &mut status); 185 | 186 | let &QueryAlignResult { 187 | db_entry, 188 | ref db_match, 189 | score, 190 | db: ref db_range, 191 | query: ref query_range, 192 | ref alignment, 193 | .. 194 | } = result; 195 | 196 | let db_entry = db_entry.name().to_owned(); 197 | let query = Arc::clone(&self.query); 198 | let alignment = Arc::clone(alignment); 199 | 200 | let query_seed = query_result::Range(db_match.query.clone()); 201 | let db_seed = query_result::Range(db_match.db.clone()); 202 | let query_start = *query_range.start(); 203 | let query_end = *query_range.end(); 204 | let db_start = *db_range.start(); 205 | let db_end = *db_range.end(); 206 | 207 | let (target_bp_support, query_bp_support) = bp_support 208 | .map(|BpSupport { target, query }| (target, query)) 209 | .unzip(); 210 | let (mfe_pvalue, dotbracket) = match mfe_pvalue_dotbracket { 211 | MfeResult::Evaluated { pvalue, dotbracket } => { 212 | (Some(pvalue.unwrap_or(1.)), Some(dotbracket)) 213 | } 214 | MfeResult::Unevaluated => (None, None), 215 | }; 216 | 217 | QueryResult { 218 | query, 219 | db_entry, 220 | query_start, 221 | query_end, 222 | db_start, 223 | db_end, 224 | query_seed, 225 | db_seed, 226 | score, 227 | pvalue, 228 | evalue: exp_value, 229 | target_bp_support, 230 | query_bp_support, 231 | mfe_pvalue, 232 | status, 233 | alignment, 234 | dotbracket, 235 | } 236 | } 237 | 238 | fn get_mfe_data( 239 | &mut self, 240 | result: &QueryAlignResult, 241 | status: &mut query_result::Status, 242 | ) -> (MfeResult, Option) { 243 | let &mut Self { 244 | ref mut query_entry, 245 | query_entry_orig, 246 | cli, 247 | ref mut dotbracket_results_buffer, 248 | ref mut 
dotbracket_temp_buffer, 249 | ref mut null_model_energies, 250 | ref mut rng, 251 | .. 252 | } = self; 253 | 254 | if cli.alignment_folding_eval_args.eval_align_fold.not() 255 | || matches!(status, query_result::Status::PassInclusionEvalue).not() 256 | { 257 | return (MfeResult::Unevaluated, None); 258 | } 259 | 260 | let AlifoldOnResult { 261 | dotbracket, 262 | ignore, 263 | mfe, 264 | gapped_data, 265 | target_bp_support, 266 | query_bp_support, 267 | } = alifold_on_result( 268 | result, 269 | query_entry, 270 | query_entry_orig, 271 | cli, 272 | dotbracket_results_buffer, 273 | dotbracket_temp_buffer, 274 | ); 275 | 276 | let bp_support = BpSupport { 277 | target: target_bp_support, 278 | query: query_bp_support, 279 | }; 280 | 281 | if ignore { 282 | *status = query_result::Status::PassReportEvalue; 283 | return (MfeResult::Unevaluated, Some(bp_support)); 284 | } 285 | 286 | let mut indices_buffer = Vec::new(); 287 | let mut block_indices_buffer = cli 288 | .alignment_folding_eval_args 289 | .in_block_shuffle 290 | .then_some(Vec::new()); 291 | 292 | let block_size = cli.alignment_folding_eval_args.block_size; 293 | null_model_energies.clear(); 294 | null_model_energies.extend((0..cli.alignment_folding_eval_args.shuffles).map(move |_| { 295 | if let Some(block_indices_buffer) = &mut block_indices_buffer { 296 | let gapped_data = gapped_data.clone().shuffled_in_blocks( 297 | block_size, 298 | &mut indices_buffer, 299 | block_indices_buffer, 300 | rng, 301 | ); 302 | 303 | let sequences = [gapped_data.target(), gapped_data.query()]; 304 | let (_, mfe) = alifold_mfe(&sequences, &sequences, cli); 305 | 306 | mfe 307 | } else { 308 | let gapped_data = gapped_data.clone().shuffled( 309 | cli.alignment_folding_eval_args.block_size, 310 | &mut indices_buffer, 311 | rng, 312 | ); 313 | 314 | let sequences = [gapped_data.target(), gapped_data.query()]; 315 | let (_, mfe) = alifold_mfe(&sequences, &sequences, cli); 316 | 317 | mfe 318 | } 319 | })); 320 | let dist = 
NormDist::from_sample(null_model_energies.as_slice());
        let z_score = dist.z_score(mfe);

        // A p-value is only meaningful when the observed MFE lies below the
        // null-model mean (negative z-score); otherwise leave it unset.
        let mfe_pvalue = (z_score < 0.).then(|| {
            distribution::Normal::new(dist.mean(), dist.stddev())
                .expect("stddev is expected to be greater than 0")
                .cdf(mfe.into())
        });

        let mfe_result = MfeResult::Evaluated {
            pvalue: mfe_pvalue,
            dotbracket: dotbracket.unwrap().into_sorted().to_owned(),
        };

        (mfe_result, Some(bp_support))
    }
}

/// Base-pair support values for the two sides of an alignment.
#[derive(Debug)]
struct BpSupport {
    query: f32,
    target: f32,
}

/// Outcome of the (optional) MFE-based evaluation of an alignment.
#[derive(Debug)]
enum MfeResult {
    Unevaluated,
    Evaluated {
        pvalue: Option<f64>,
        dotbracket: DotBracketOwnedSorted,
    },
}

/// Removes, in place, the lower-scoring member of every pair of results that
/// overlap — beyond `cli.max_align_overlap` — both on the query range and on
/// the db range of the same database entry. `indices_buffer` is reused
/// scratch space; its previous content is discarded.
fn remove_overlapping_results(
    results: &mut Vec<(QueryAlignResult<'_, AlignedSequence>, f64, f64)>,
    indices_buffer: &mut Vec<usize>,
    cli: &Cli,
) {
    let max_align_overlap: f64 = cli.max_align_overlap.into();
    indices_buffer.clear();

    // Group by db entry id, then order by query start (ascending) and query
    // end (descending) so the overlap scan below can stop early.
    results.sort_unstable_by(|(a, _, _), (b, _, _)| {
        a.db_entry
            .id
            .cmp(&b.db_entry.id)
            .then(a.query.start().cmp(b.query.start()))
            .then(a.query.end().cmp(b.query.end()).reverse())
    });

    // TODO: check if pre-calculating `a_len` does change anything
    let overlapping_pairs = results
        .iter_with_rest()
        .enumerate()
        .flat_map(|(a_index, (a, rest))| {
            let same_db_index = rest.partition_point(|b| a.0.db_entry.id == b.0.db_entry.id);
            rest[..same_db_index]
                .iter()
                .enumerate()
                .take_while(|(_, b)| are_overlapping(&a.0.query, &b.0.query, max_align_overlap))
                .filter(|(_, b)| are_overlapping(&a.0.db, &b.0.db, max_align_overlap))
                .map(move |(b_offset, b)| (a_index, a_index + b_offset + 1, a.0.score, b.0.score))
        });

    for (a_index, b_index, a_score, b_score) in overlapping_pairs {
        // On a score tie the earlier result wins.
        let loser = if a_score >= b_score { b_index } else { a_index };
        indices_buffer.push(loser);
    }

    // Remove from the largest index downwards so the smaller pending indices
    // remain valid; consecutive duplicates are skipped.
    indices_buffer.sort_unstable_by_key(|&index| Reverse(index));

    if let Some(&first_index) = indices_buffer.first() {
        results.swap_remove(first_index);
        for win in indices_buffer.windows(2) {
            if win[0] != win[1] {
                results.swap_remove(win[1]);
            }
        }
    }
}

/// Returns `true` when the two inclusive ranges share more than
/// `max_align_overlap` of the shorter range's length.
#[inline]
fn are_overlapping(
    a: &RangeInclusive<usize>,
    b: &RangeInclusive<usize>,
    max_align_overlap: f64,
) -> bool {
    if b.start() >= a.end() {
        return false;
    }

    let overlap = overlapping_range(a, b);
    // If we are losing precision, ranges are so big that we are probably
    // going to crash elsewhere anyway.
    #[allow(clippy::cast_precision_loss)]
    let a_len = (a.end() + 1 - a.start()) as f64;
    #[allow(clippy::cast_precision_loss)]
    let b_len = (b.end() + 1 - b.start()) as f64;

    #[allow(clippy::cast_precision_loss)]
    let overlap_len = (overlap.end() + 1).saturating_sub(*overlap.start()) as f64;

    overlap_len > a_len.min(b_len) * max_align_overlap
}

/// Intersection of two inclusive ranges (empty iff `start > end`).
#[inline]
fn overlapping_range<T>(a: &RangeInclusive<T>, b: &RangeInclusive<T>) -> RangeInclusive<T>
where
    T: Ord + Clone,
{
    let start = a.start().max(b.start()).clone();
    let end = a.end().min(b.end()).clone();
    start..=end
}

#[cfg(test)]
mod tests {
    use crate::{
        aligner::{AlignmentResult, BaseOrGap},
        db_file::{self, ReactivityWithPlaceholder},
        tests::dummy_cli,
        Base, MatchRanges,
    };

    use super::*;

    #[test]
    fn remove_overlapping_results_empty() {
        remove_overlapping_results(&mut Vec::new(), &mut vec![1, 2, 3], &dummy_cli());
    }

    static EMPTY_ENTRY: db_file::Entry = db_file::Entry {
        id: String::new(),
        sequence: Vec::new(),
        reactivity: Vec::new(),
    };
| macro_rules! query_result { 455 | ($query:expr, $score:expr) => { 456 | ( 457 | QueryAlignResult { 458 | db_entry: &EMPTY_ENTRY, 459 | db_entry_orig: &EMPTY_ENTRY, 460 | db_match: MatchRanges { 461 | db: 0..=0, 462 | query: 0..=0, 463 | }, 464 | score: $score, 465 | db: 0..=10, 466 | query: $query, 467 | alignment: Default::default(), 468 | }, 469 | 0.0, 470 | 0.0, 471 | ) 472 | }; 473 | 474 | ($query:expr) => { 475 | query_result!($query, 0.) 476 | }; 477 | } 478 | 479 | #[test] 480 | fn remove_overlapping_results_non_overlapping() { 481 | let mut results = vec![ 482 | query_result!(0..=4), 483 | query_result!(4..=10), 484 | query_result!(8..=16), 485 | ]; 486 | let initial_results = results.clone(); 487 | remove_overlapping_results(&mut results, &mut vec![0, 1, 2, 3, 4], &dummy_cli()); 488 | 489 | assert_eq!(results, initial_results); 490 | } 491 | 492 | #[test] 493 | fn remove_overlapping_results_simple_overlap() { 494 | let mut results = vec![query_result!(0..=10, 0.5), query_result!(4..=14, 0.2)]; 495 | remove_overlapping_results(&mut results, &mut vec![0, 1, 2, 3, 4], &dummy_cli()); 496 | 497 | assert_eq!(results, vec![query_result!(0..=10, 0.5)]); 498 | 499 | results = vec![query_result!(0..=10, 0.2), query_result!(4..=14, 0.5)]; 500 | remove_overlapping_results(&mut results, &mut vec![0, 1, 2, 3, 4], &dummy_cli()); 501 | 502 | assert_eq!(results, vec![query_result!(4..=14, 0.5)]); 503 | } 504 | 505 | #[test] 506 | fn remove_overlapping_results_nested_overlap() { 507 | let mut results = vec![query_result!(0..=10, 0.5), query_result!(2..=6, 0.2)]; 508 | remove_overlapping_results(&mut results, &mut vec![], &dummy_cli()); 509 | 510 | assert_eq!(results, vec![query_result!(0..=10, 0.5)]); 511 | 512 | results = vec![query_result!(0..=10, 0.2), query_result!(2..=6, 0.5)]; 513 | remove_overlapping_results(&mut results, &mut vec![], &dummy_cli()); 514 | 515 | assert_eq!(results, vec![query_result!(2..=6, 0.5)]); 516 | } 517 | 518 | #[test] 519 | fn 
remove_overlapping_results_chained_overlap() { 520 | let mut results = vec![ 521 | query_result!(0..=5, 0.5), 522 | query_result!(2..=8, 0.5), 523 | query_result!(4..=10, 0.7), 524 | query_result!(6..=12, 0.5), 525 | query_result!(8..=14, 0.5), 526 | query_result!(10..=16, 0.5), 527 | ]; 528 | remove_overlapping_results(&mut results, &mut vec![], &dummy_cli()); 529 | 530 | // The reason for this result is, using the current algorithm: 531 | // - the second is removed because this is the current behavior for overlapping sequences 532 | // with the same score; 533 | // - the third is kept (it removes the second, again) because it has a higher score; 534 | // - all the others are removed "by chaining" (the first has a higher priority in case of 535 | // equal score, but it's been removed by another comparison). 536 | assert_eq!( 537 | results, 538 | vec![query_result!(0..=5, 0.5), query_result!(4..=10, 0.7)] 539 | ); 540 | } 541 | 542 | #[test] 543 | fn keep_targets_with_overlapping_results() { 544 | let cli = Cli::dummy(); 545 | 546 | let targets: Vec<_> = (0..5) 547 | .map(|index| db_file::Entry { 548 | id: format!("db_{index}"), 549 | sequence: vec![Base::A; 100], 550 | reactivity: vec![ReactivityWithPlaceholder::from(0.5); 100], 551 | }) 552 | .collect(); 553 | 554 | let alignment_result = Arc::new(AlignmentResult { 555 | query: AlignedSequence(vec![BaseOrGap::Base; 4]), 556 | target: AlignedSequence(vec![BaseOrGap::Base; 4]), 557 | }); 558 | let alignment_result = &alignment_result; 559 | let mut results = targets 560 | .iter() 561 | .enumerate() 562 | .flat_map(|(outer_index, db_entry)| { 563 | (0..3).map(move |inner_index| { 564 | let result = QueryAlignResult { 565 | db_entry, 566 | db_entry_orig: db_entry, 567 | db_match: MatchRanges { 568 | db: 13..=16, 569 | query: 13..=16, 570 | }, 571 | score: 15. + f32::from(u16::try_from(outer_index).unwrap()) * 3. 
572 | - (f32::from(i16::try_from(inner_index).unwrap()) * 2.), 573 | db: 10..=20, 574 | query: 10..=20, 575 | alignment: Arc::clone(alignment_result), 576 | }; 577 | 578 | let p_value = f64::from(u32::try_from(outer_index).unwrap()) / 1000. + 0.01 579 | - (f64::from(inner_index + 1) / 10000.); 580 | 581 | (result, p_value, p_value) 582 | }) 583 | }) 584 | .collect(); 585 | 586 | let mut indices = vec![]; 587 | remove_overlapping_results(&mut results, &mut indices, &cli); 588 | 589 | assert_eq!(results.len(), targets.len()); 590 | let mut scores: Vec<_> = results 591 | .into_iter() 592 | .map(|(result, _, _)| result.score) 593 | .collect(); 594 | scores.sort_unstable_by(f32::total_cmp); 595 | assert_eq!( 596 | scores, 597 | (0..5) 598 | .map(|index| 15. + f32::from(i16::try_from(index).unwrap()) * 3.) 599 | .collect::>() 600 | ); 601 | } 602 | 603 | #[test] 604 | fn keep_targets_with_overlapping_results_different_target() { 605 | let mut results = vec![ 606 | query_result!(0..=4), 607 | ( 608 | QueryAlignResult { 609 | db: 15..=20, 610 | ..(query_result!(0..=4).0) 611 | }, 612 | 0., 613 | 0., 614 | ), 615 | ]; 616 | let initial_results = results.clone(); 617 | remove_overlapping_results(&mut results, &mut vec![0, 1, 2, 3, 4], &dummy_cli()); 618 | 619 | assert_eq!(results, initial_results); 620 | } 621 | } 622 | -------------------------------------------------------------------------------- /src/iter.rs: -------------------------------------------------------------------------------- 1 | use std::iter::FusedIterator; 2 | 3 | pub(crate) trait IterWithRestExt { 4 | fn iter_with_rest(&self) -> IterWithRest<'_, T>; 5 | } 6 | 7 | pub(crate) struct IterWithRest<'a, T>(&'a [T]); 8 | 9 | impl<'a, T> Iterator for IterWithRest<'a, T> { 10 | type Item = (&'a T, &'a [T]); 11 | 12 | #[inline] 13 | fn next(&mut self) -> Option { 14 | let (first, rest) = self.0.split_first()?; 15 | self.0 = rest; 16 | Some((first, rest)) 17 | } 18 | 19 | #[inline] 20 | fn size_hint(&self) -> 
(usize, Option) { 21 | (self.0.len(), Some(self.0.len())) 22 | } 23 | } 24 | 25 | impl ExactSizeIterator for IterWithRest<'_, T> { 26 | #[inline] 27 | fn len(&self) -> usize { 28 | self.0.len() 29 | } 30 | } 31 | 32 | impl FusedIterator for IterWithRest<'_, T> {} 33 | 34 | impl DoubleEndedIterator for IterWithRest<'_, T> { 35 | #[inline] 36 | fn next_back(&mut self) -> Option { 37 | let (last, rest) = self.0.split_last()?; 38 | self.0 = rest; 39 | Some((last, rest)) 40 | } 41 | } 42 | 43 | impl IterWithRestExt for [T] { 44 | #[inline] 45 | fn iter_with_rest(&self) -> IterWithRest<'_, T> { 46 | IterWithRest(self) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/mass.rs: -------------------------------------------------------------------------------- 1 | use fftw::{ 2 | array::AlignedVec, 3 | plan::C2CPlan, 4 | types::{Flag, Sign}, 5 | }; 6 | use num_complex::Complex; 7 | use num_traits::{float::FloatCore, Float}; 8 | 9 | use crate::{db_file::ReactivityWithPlaceholder, mean_stddev, C2CPlanExt, Reactivity}; 10 | 11 | pub(crate) struct Mass { 12 | forward_plan: ::Plan, 13 | backward_plan: ::Plan, 14 | aligned_query: AlignedVec>, 15 | query_transform: AlignedVec>, 16 | product: AlignedVec>, 17 | product_inverse: AlignedVec>, 18 | } 19 | 20 | impl Mass { 21 | pub(crate) fn new(size: usize) -> Result { 22 | let forward_plan: ::Plan = 23 | C2CPlan::aligned(&[size], Sign::Forward, Flag::ESTIMATE)?; 24 | let backward_plan: ::Plan = 25 | C2CPlan::aligned(&[size], Sign::Backward, Flag::ESTIMATE)?; 26 | let aligned_query = AlignedVec::new(size); 27 | let query_transform = aligned_query.clone(); 28 | let product = aligned_query.clone(); 29 | let product_inverse = aligned_query.clone(); 30 | 31 | Ok(Self { 32 | forward_plan, 33 | backward_plan, 34 | aligned_query, 35 | query_transform, 36 | product, 37 | product_inverse, 38 | }) 39 | } 40 | 41 | pub(crate) fn run( 42 | &mut self, 43 | db: &[ReactivityWithPlaceholder], 44 | 
db_transform: &AlignedVec>, 45 | query: &[Reactivity], 46 | ) -> Result>, fftw::error::Error> { 47 | let ts_len = db_transform.len(); 48 | let query_len = query.len(); 49 | 50 | query 51 | .iter() 52 | .rev() 53 | .copied() 54 | .zip(self.aligned_query.iter_mut()) 55 | .for_each(|(q, y)| y.re = q); 56 | self.forward_plan 57 | .c2c(&mut self.aligned_query, &mut self.query_transform)?; 58 | 59 | self.product 60 | .iter_mut() 61 | .zip(&**db_transform) 62 | .zip(&*self.query_transform) 63 | .for_each(|((z, x), y)| *z = x * y); 64 | 65 | self.backward_plan 66 | .c2c(&mut self.product, &mut self.product_inverse)?; 67 | 68 | // Normalize results 69 | #[allow(clippy::cast_precision_loss)] 70 | let scale_factor = 1. / (ts_len as Reactivity); 71 | for z in &mut *self.product_inverse { 72 | *z *= scale_factor; 73 | } 74 | 75 | let mean_sigma_x = db 76 | .windows(query_len) 77 | .map(|window| mean_stddev(window.iter().map(|r| r.to_maybe_placeholder()), 0)); 78 | let (mean_y, sigma_y) = mean_stddev(query.iter().copied(), 0); 79 | 80 | // We are using this with the z value, it is ok to lose precision 81 | #[allow(clippy::cast_precision_loss)] 82 | let query_len_float = query_len as Reactivity; 83 | Ok(self 84 | .product_inverse 85 | .iter() 86 | .skip(query_len - 1) 87 | .take(ts_len.saturating_sub(query_len - 1)) 88 | .zip(mean_sigma_x) 89 | .map(|(z, (mean_x, sigma_x))| { 90 | let squared = 2. 
91 | * (query_len_float 92 | - (z - query_len_float * mean_x * mean_y) / (sigma_x * sigma_y)); 93 | squared.sqrt() 94 | }) 95 | .collect()) 96 | } 97 | } 98 | 99 | pub(crate) trait ComplexExt { 100 | fn sqrt(&self) -> Self; 101 | fn powi(&self, n: i32) -> Self; 102 | fn is_finite(&self) -> bool; 103 | } 104 | 105 | impl ComplexExt for f64 { 106 | fn sqrt(&self) -> Self { 107 | f64::sqrt(*self) 108 | } 109 | 110 | fn powi(&self, n: i32) -> Self { 111 | f64::powi(*self, n) 112 | } 113 | 114 | fn is_finite(&self) -> bool { 115 | f64::is_finite(*self) 116 | } 117 | } 118 | 119 | impl ComplexExt for f32 { 120 | fn sqrt(&self) -> Self { 121 | f32::sqrt(*self) 122 | } 123 | 124 | fn powi(&self, n: i32) -> Self { 125 | f32::powi(*self, n) 126 | } 127 | 128 | fn is_finite(&self) -> bool { 129 | f32::is_finite(*self) 130 | } 131 | } 132 | 133 | impl ComplexExt for Complex 134 | where 135 | T: Float + FloatCore, 136 | { 137 | fn sqrt(&self) -> Self { 138 | >::sqrt(*self) 139 | } 140 | 141 | fn powi(&self, n: i32) -> Self { 142 | Complex::powi(self, n) 143 | } 144 | 145 | fn is_finite(&self) -> bool { 146 | Complex::is_finite(*self) 147 | } 148 | } 149 | 150 | #[cfg(test)] 151 | mod tests { 152 | use approx::assert_abs_diff_eq; 153 | 154 | use super::*; 155 | use crate::transform_db; 156 | 157 | #[test] 158 | fn test_mass() { 159 | const EXPECTED: [Complex; 5] = [ 160 | Complex::new(0.676_408_23, 2.349_848_8e-7), 161 | Complex::new(3.430_923_5, 0.), 162 | Complex::new(3.430_923_5, 4.632_738_3e-8), 163 | Complex::new(0.000_690_533_95, 0.), 164 | Complex::new(1.851_136_1, -2.082_504_7e-8), 165 | ]; 166 | 167 | let ts = [1., 1., 1., 2., 1., 1., 4., 5.].map(ReactivityWithPlaceholder::from); 168 | let ts_t = transform_db(&ts).unwrap(); 169 | let query = [2., 1., 1., 4.]; 170 | let result = Mass::new(ts.len()) 171 | .unwrap() 172 | .run(ts.as_ref(), &ts_t, query.as_ref()) 173 | .unwrap(); 174 | 175 | assert_abs_diff_eq!(&*result, EXPECTED.as_ref(), epsilon = 1e-6); 176 | } 177 | } 
178 | -------------------------------------------------------------------------------- /src/norm_dist.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Deref; 2 | 3 | use num_traits::AsPrimitive; 4 | use once_cell::unsync::OnceCell; 5 | 6 | pub struct NormDist { 7 | data: D, 8 | mean: OnceCell, 9 | stddev: OnceCell, 10 | } 11 | 12 | impl NormDist { 13 | #[inline] 14 | pub fn from_sample(data: D) -> Self { 15 | Self { 16 | data, 17 | mean: OnceCell::new(), 18 | stddev: OnceCell::new(), 19 | } 20 | } 21 | } 22 | 23 | impl NormDist 24 | where 25 | D: Deref, 26 | T: AsPrimitive, 27 | { 28 | pub fn z_score(&self, value: T) -> f64 { 29 | let mean = self.mean(); 30 | let stddev = self.stddev(); 31 | 32 | (value.as_() - mean) / stddev 33 | } 34 | 35 | pub fn mean(&self) -> f64 { 36 | *self.mean.get_or_init(|| { 37 | let len = self.data.len(); 38 | if len == 0 { 39 | 0. 40 | } else { 41 | // It is fine to evaluate the mean 42 | #[allow(clippy::cast_precision_loss)] 43 | let len_recip = (len as f64).recip(); 44 | self.data.iter().map(|x| x.as_() * len_recip).sum() 45 | } 46 | }) 47 | } 48 | 49 | pub fn stddev(&self) -> f64 { 50 | *self.stddev.get_or_init(|| { 51 | self.data.len().checked_sub(1).map_or(0., |adj_len| { 52 | // It is fine to evaluate the variance 53 | #[allow(clippy::cast_precision_loss)] 54 | let denominator = (adj_len as f64).recip(); 55 | 56 | let mean = self.mean(); 57 | let variance: f64 = self 58 | .data 59 | .iter() 60 | .map(|x| (x.as_() - mean).powi(2) * denominator) 61 | .sum(); 62 | 63 | variance.sqrt() 64 | }) 65 | }) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/null_model.rs: -------------------------------------------------------------------------------- 1 | use std::{iter, mem}; 2 | 3 | use num_traits::cast; 4 | use rand::prelude::*; 5 | 6 | use crate::db_file::Entry; 7 | 8 | #[inline] 9 | pub fn make_shuffled_db( 10 | db: &[Entry], 11 
| block_size: usize, 12 | shuffle_iterations: usize, 13 | in_block_shuffle: bool, 14 | ) -> Vec { 15 | make_shuffled_db_inner( 16 | db, 17 | block_size, 18 | shuffle_iterations, 19 | in_block_shuffle, 20 | rand::thread_rng(), 21 | ) 22 | } 23 | 24 | fn make_shuffled_db_inner( 25 | db: &[Entry], 26 | block_size: usize, 27 | shuffle_iterations: usize, 28 | in_block_shuffle: bool, 29 | mut rng: R, 30 | ) -> Vec { 31 | let mut chunk_indices = Vec::new(); 32 | 33 | let sequences = db.len() * shuffle_iterations; 34 | iter::from_fn(move || { 35 | let entry = db.choose(&mut rng)?; 36 | let (offset, chunks) = 37 | get_random_offset_and_chunks(entry.sequence.len(), block_size, &mut rng); 38 | 39 | resize_indices(&mut chunk_indices, chunks); 40 | chunk_indices.shuffle(&mut rng); 41 | 42 | Some(get_shuffled_entry( 43 | &chunk_indices, 44 | entry, 45 | offset, 46 | block_size, 47 | in_block_shuffle, 48 | &mut rng, 49 | )) 50 | }) 51 | .take(sequences) 52 | .collect() 53 | } 54 | 55 | #[inline] 56 | fn get_shuffled_entry( 57 | chunk_indices: &[usize], 58 | entry: &Entry, 59 | offset: usize, 60 | block_size: usize, 61 | in_block_shuffle: bool, 62 | mut rng: R, 63 | ) -> Entry 64 | where 65 | R: Rng, 66 | { 67 | let mut sequence = Vec::with_capacity(entry.sequence.len()); 68 | let mut reactivity = Vec::with_capacity(entry.reactivity.len()); 69 | 70 | match offset { 71 | 0 => { 72 | for &chunk_index in chunk_indices { 73 | extend_with_shuffle( 74 | &mut sequence, 75 | get_chunk_without_offset(chunk_index, block_size, &entry.sequence), 76 | in_block_shuffle, 77 | &mut rng, 78 | ); 79 | extend_with_shuffle( 80 | &mut reactivity, 81 | get_chunk_without_offset(chunk_index, block_size, &entry.reactivity), 82 | in_block_shuffle, 83 | &mut rng, 84 | ); 85 | } 86 | } 87 | _ => { 88 | for &chunk_index in chunk_indices { 89 | extend_with_shuffle( 90 | &mut sequence, 91 | get_chunk_with_offset(chunk_index, offset, block_size, &entry.sequence), 92 | in_block_shuffle, 93 | &mut rng, 94 | ); 95 
| extend_with_shuffle( 96 | &mut reactivity, 97 | get_chunk_with_offset(chunk_index, offset, block_size, &entry.reactivity), 98 | in_block_shuffle, 99 | &mut rng, 100 | ); 101 | } 102 | } 103 | } 104 | 105 | Entry { 106 | id: entry.id.clone(), 107 | sequence, 108 | reactivity, 109 | } 110 | } 111 | 112 | fn resize_indices(indices: &mut Vec, new_size: usize) { 113 | let old_len = indices.len(); 114 | 115 | let mut index = old_len; 116 | indices.resize_with(new_size, move || { 117 | let new_index = index + 1; 118 | mem::replace(&mut index, new_index) 119 | }); 120 | 121 | let mut index = 0; 122 | indices[..new_size.min(old_len)].fill_with(move || { 123 | let new_index = index + 1; 124 | mem::replace(&mut index, new_index) 125 | }); 126 | } 127 | 128 | fn get_random_offset_and_chunks( 129 | len: usize, 130 | block_size: usize, 131 | mut rng: R, 132 | ) -> (usize, usize) { 133 | let block_remainder = len % block_size; 134 | let offset = (block_remainder > 0) 135 | .then(|| rng.gen_range(0..block_remainder)) 136 | .unwrap_or(0); 137 | 138 | let len_without_offset = len - offset; 139 | let aux_chunks = match (offset, len_without_offset % block_size) { 140 | (0, 0) => 0, 141 | (_, 0) | (0, _) => 1, 142 | (_, _) => 2, 143 | }; 144 | let chunks = len_without_offset / block_size + aux_chunks; 145 | 146 | (offset, chunks) 147 | } 148 | 149 | #[derive(Debug)] 150 | pub(crate) struct ExtremeDistribution { 151 | pub(crate) location: f64, 152 | pub(crate) scale: f64, 153 | } 154 | 155 | const EULER_MASCHERONI: f64 = 0.577_215_664_901_532_9; 156 | 157 | impl ExtremeDistribution { 158 | pub(crate) fn from_sample(sample: &[T]) -> Self 159 | where 160 | T: num_traits::NumCast + Copy, 161 | { 162 | let len = sample.len(); 163 | match len { 164 | 0 => { 165 | return Self { 166 | location: 0., 167 | scale: 0., 168 | }; 169 | } 170 | 1 => { 171 | return Self { 172 | location: cast(sample[0]).unwrap(), 173 | scale: 0., 174 | }; 175 | } 176 | _ => {} 177 | } 178 | 179 | // It is fine to 
evaluate mean and variance\ 180 | #[allow(clippy::cast_precision_loss)] 181 | let len = len as f64; 182 | let len_inv = 1. / len; 183 | let mean: f64 = sample 184 | .iter() 185 | .copied() 186 | .map(|x| cast::<_, f64>(x).unwrap() * len_inv) 187 | .sum(); 188 | 189 | let variance = sample 190 | .iter() 191 | .copied() 192 | .map(|x| (cast::<_, f64>(x).unwrap() - mean).powi(2)) 193 | .sum::() 194 | / (len - 1.); 195 | 196 | Self::from_mean_and_variance(mean, variance) 197 | } 198 | 199 | fn from_mean_and_variance(mean: f64, variance: f64) -> Self { 200 | use std::f64::consts::PI; 201 | 202 | let scale = (variance * 6. / PI.powi(2)).sqrt(); 203 | let location = mean - scale * EULER_MASCHERONI; 204 | 205 | Self { location, scale } 206 | } 207 | 208 | pub(crate) fn cdf(&self, value: T) -> f64 209 | where 210 | T: num_traits::NumCast, 211 | { 212 | let z = (cast::<_, f64>(value).unwrap() - self.location) / self.scale; 213 | f64::exp(-f64::exp(-z)) 214 | } 215 | 216 | #[inline] 217 | pub(crate) fn p_value(&self, value: T) -> f64 218 | where 219 | T: num_traits::NumCast, 220 | { 221 | 1. - self.cdf(value) 222 | } 223 | } 224 | 225 | #[inline] 226 | fn get_chunk_with_offset(index: usize, offset: usize, block_size: usize, data: &[T]) -> &[T] { 227 | match index.checked_sub(1) { 228 | Some(index) => data 229 | .get(offset..) 
230 | .map(|data| { 231 | data.chunks(block_size) 232 | .nth(index) 233 | .expect("chunk index out of bound") 234 | }) 235 | .unwrap_or_default(), 236 | 237 | None => data.get(..offset).unwrap_or_default(), 238 | } 239 | } 240 | 241 | #[inline] 242 | fn get_chunk_without_offset(index: usize, block_size: usize, data: &[T]) -> &[T] { 243 | data.chunks(block_size) 244 | .nth(index) 245 | .expect("chunk index out of bound") 246 | } 247 | 248 | fn extend_with_shuffle(data: &mut Vec, to_append: &[T], in_block_shuffle: bool, mut rng: R) 249 | where 250 | T: Copy, 251 | R: Rng, 252 | { 253 | if in_block_shuffle { 254 | let len_before = data.len(); 255 | data.extend(to_append); 256 | data[len_before..].shuffle(&mut rng); 257 | } else { 258 | data.extend(to_append); 259 | } 260 | } 261 | 262 | #[cfg(test)] 263 | mod tests { 264 | use std::fs::File; 265 | 266 | use rand::rngs::{mock::StepRng, SmallRng}; 267 | 268 | use crate::{db_file, SequenceEntry}; 269 | 270 | use super::*; 271 | 272 | #[test] 273 | fn shuffled_entry_with_offset() { 274 | const SEQUENCE_LEN: usize = 1553; 275 | const BLOCK_SIZE: usize = 13; 276 | const OFFSET: usize = 3; 277 | // 1 chunk for the remainder 278 | const EXPECTED_CHUNKS: usize = 279 | 1 + (1553 - OFFSET) / BLOCK_SIZE + ((SEQUENCE_LEN - OFFSET) % BLOCK_SIZE != 0) as usize; 280 | const SHUFFLED_INDICES: [usize; EXPECTED_CHUNKS] = [ 281 | 72, 38, 19, 42, 26, 13, 69, 51, 5, 97, 79, 62, 102, 28, 3, 120, 44, 103, 32, 81, 85, 282 | 27, 93, 113, 106, 15, 65, 36, 59, 98, 99, 77, 84, 29, 39, 61, 16, 30, 80, 1, 68, 14, 283 | 78, 90, 95, 118, 41, 10, 116, 91, 37, 70, 0, 92, 46, 18, 9, 2, 63, 57, 110, 40, 66, 49, 284 | 108, 56, 119, 7, 50, 107, 55, 54, 4, 104, 115, 23, 53, 111, 82, 24, 35, 71, 12, 43, 76, 285 | 48, 52, 25, 87, 100, 17, 74, 20, 94, 114, 109, 86, 34, 58, 21, 105, 64, 88, 60, 117, 286 | 22, 31, 89, 73, 11, 101, 75, 112, 96, 83, 47, 67, 8, 45, 33, 6, 287 | ]; 288 | 289 | let mut db = 290 | 
db_file::native::Reader::new(File::open("test_data/test.db").unwrap()).unwrap(); 291 | let entry = db.entries().next().unwrap().unwrap(); 292 | 293 | assert_eq!(entry.sequence.len(), SEQUENCE_LEN); 294 | 295 | let split_sequence: Vec<_> = iter::once(&entry.sequence[..OFFSET]) 296 | .chain(entry.sequence[OFFSET..].chunks(BLOCK_SIZE)) 297 | .collect(); 298 | let expected_sequence: Vec<_> = SHUFFLED_INDICES 299 | .into_iter() 300 | .flat_map(|index| split_sequence[index]) 301 | .copied() 302 | .collect(); 303 | 304 | let split_reactivities: Vec<_> = iter::once(&entry.reactivity[..OFFSET]) 305 | .chain(entry.reactivity[OFFSET..].chunks(BLOCK_SIZE)) 306 | .collect(); 307 | let expected_reactivity: Vec<_> = SHUFFLED_INDICES 308 | .into_iter() 309 | .flat_map(|index| split_reactivities[index]) 310 | .copied() 311 | .collect(); 312 | 313 | let shuffled_entry = get_shuffled_entry( 314 | &SHUFFLED_INDICES, 315 | &entry, 316 | OFFSET, 317 | BLOCK_SIZE, 318 | false, 319 | thread_rng(), 320 | ); 321 | 322 | assert_eq!(shuffled_entry.id, entry.id); 323 | assert_eq!(shuffled_entry.sequence, expected_sequence); 324 | assert!(shuffled_entry 325 | .reactivity() 326 | .iter() 327 | .copied() 328 | .zip(expected_reactivity) 329 | .all(|(a, b)| (a.is_nan() && b.is_nan()) || a == b)); 330 | } 331 | 332 | #[test] 333 | fn shuffled_entry_without_offset() { 334 | const SEQUENCE_LEN: usize = 1553; 335 | const BLOCK_SIZE: usize = 13; 336 | const EXPECTED_CHUNKS: usize = 337 | SEQUENCE_LEN / BLOCK_SIZE + (SEQUENCE_LEN % BLOCK_SIZE != 0) as usize; 338 | const SHUFFLED_INDICES: [usize; EXPECTED_CHUNKS] = [ 339 | 72, 38, 19, 42, 26, 13, 69, 51, 5, 97, 79, 62, 102, 28, 3, 44, 103, 32, 81, 85, 27, 93, 340 | 113, 106, 15, 65, 36, 59, 98, 99, 77, 84, 29, 39, 61, 16, 30, 80, 1, 68, 14, 78, 90, 341 | 95, 118, 41, 10, 116, 91, 37, 70, 0, 92, 46, 18, 9, 2, 63, 57, 110, 40, 66, 49, 108, 342 | 56, 119, 7, 50, 107, 55, 54, 4, 104, 115, 23, 53, 111, 82, 24, 35, 71, 12, 43, 76, 48, 343 | 52, 25, 87, 100, 17, 
74, 20, 94, 114, 109, 86, 34, 58, 21, 105, 64, 88, 60, 117, 22, 344 | 31, 89, 73, 11, 101, 75, 112, 96, 83, 47, 67, 8, 45, 33, 6, 345 | ]; 346 | 347 | let mut db = 348 | db_file::native::Reader::new(File::open("test_data/test.db").unwrap()).unwrap(); 349 | let entry = db.entries().next().unwrap().unwrap(); 350 | 351 | assert_eq!(entry.sequence.len(), SEQUENCE_LEN); 352 | 353 | let split_sequence: Vec<_> = entry.sequence.chunks(BLOCK_SIZE).collect(); 354 | let expected_sequence: Vec<_> = SHUFFLED_INDICES 355 | .into_iter() 356 | .flat_map(|index| split_sequence[index]) 357 | .copied() 358 | .collect(); 359 | 360 | let split_reactivities: Vec<_> = entry.reactivity.chunks(BLOCK_SIZE).collect(); 361 | let expected_reactivity: Vec<_> = SHUFFLED_INDICES 362 | .into_iter() 363 | .flat_map(|index| split_reactivities[index]) 364 | .copied() 365 | .collect(); 366 | 367 | let shuffled_entry = get_shuffled_entry( 368 | &SHUFFLED_INDICES, 369 | &entry, 370 | 0, 371 | BLOCK_SIZE, 372 | false, 373 | thread_rng(), 374 | ); 375 | 376 | assert_eq!(shuffled_entry.id, entry.id); 377 | assert_eq!(shuffled_entry.sequence, expected_sequence); 378 | assert!(shuffled_entry 379 | .reactivity() 380 | .iter() 381 | .copied() 382 | .zip(expected_reactivity) 383 | .all(|(a, b)| (a.is_nan() && b.is_nan()) || a == b)); 384 | } 385 | 386 | #[test] 387 | fn chunks_with_zero_offset_no_remainder() { 388 | assert_eq!( 389 | get_random_offset_and_chunks(30, 5, StepRng::new(0, 0)), 390 | (0, 6) 391 | ); 392 | } 393 | 394 | #[test] 395 | fn chunks_with_zero_offset_with_remainder() { 396 | let rng = StepRng::new(0, 0); 397 | assert_eq!(rng.clone().gen_range(0..3), 0); 398 | assert_eq!(get_random_offset_and_chunks(33, 5, rng), (0, 7)); 399 | } 400 | 401 | #[test] 402 | fn chunks_with_offset_with_remainder() { 403 | let rng = SmallRng::seed_from_u64(0); 404 | assert_eq!(rng.clone().gen_range(0..3), 1); 405 | assert_eq!(get_random_offset_and_chunks(33, 5, rng), (1, 8)); 406 | } 407 | 408 | #[test] 409 | fn 
resize_indices() { 410 | let mut indices = Vec::new(); 411 | super::resize_indices(&mut indices, 6); 412 | assert_eq!(indices.len(), 6); 413 | assert!(indices.iter().copied().enumerate().all(|(a, b)| a == b)); 414 | 415 | indices.fill(9999); 416 | super::resize_indices(&mut indices, 24); 417 | assert_eq!(indices.len(), 24); 418 | assert!(indices.iter().copied().enumerate().all(|(a, b)| a == b)); 419 | 420 | indices.fill(9999); 421 | super::resize_indices(&mut indices, 8); 422 | assert_eq!(indices.len(), 8); 423 | assert!(indices.iter().copied().enumerate().all(|(a, b)| a == b)); 424 | } 425 | 426 | #[test] 427 | fn chunk_with_offset() { 428 | let data: [u32; 13] = std::array::from_fn(|index| u32::try_from(index).unwrap()); 429 | assert_eq!(get_chunk_with_offset(0, 3, 5, &data), [0, 1, 2]); 430 | assert_eq!(get_chunk_with_offset(1, 3, 5, &data), [3, 4, 5, 6, 7]); 431 | assert_eq!(get_chunk_with_offset(2, 3, 5, &data), [8, 9, 10, 11, 12]); 432 | 433 | assert_eq!(get_chunk_with_offset(0, 15, 5, &data), [] as [u32; 0]); 434 | assert_eq!(get_chunk_with_offset(1, 15, 5, &data), [] as [u32; 0]); 435 | } 436 | 437 | #[test] 438 | fn chunk_without_offset() { 439 | let data: [u32; 9] = std::array::from_fn(|index| u32::try_from(index).unwrap()); 440 | assert_eq!(get_chunk_without_offset(0, 3, &data), [0, 1, 2]); 441 | assert_eq!(get_chunk_without_offset(1, 3, &data), [3, 4, 5]); 442 | assert_eq!(get_chunk_without_offset(2, 3, &data), [6, 7, 8]); 443 | } 444 | 445 | #[test] 446 | fn extreme_distribution_from_mean_and_variance() { 447 | let dist = ExtremeDistribution::from_mean_and_variance( 448 | 1.508_101_930_862_146, 449 | 3.224_070_771_022_524, 450 | ); 451 | assert!((dist.location - 0.7).abs() < 0.00001); 452 | assert!((dist.scale - 1.4).abs() < 0.00001); 453 | } 454 | 455 | #[test] 456 | fn extreme_distribution_from_sample() { 457 | const DATA: [f64; 12] = [1., 2., 3., 4., 4.5, 5., 5.5, 6., 7., 8., 9., 10.]; 458 | const MEAN: f64 = 5.416_666_666_666_667; 459 | const 
VARIANCE: f64 = 7.583_333_333_333_334; 460 | 461 | let dist = ExtremeDistribution::from_sample(&DATA); 462 | let expected_dist = ExtremeDistribution::from_mean_and_variance(MEAN, VARIANCE); 463 | assert!((dist.location - expected_dist.location).abs() < 0.000_001); 464 | assert!((dist.scale - expected_dist.scale).abs() < 0.000_001); 465 | } 466 | 467 | #[test] 468 | fn extreme_distribution_cdf() { 469 | let dist = ExtremeDistribution { 470 | location: 0.7, 471 | scale: 1.4, 472 | }; 473 | 474 | assert!((dist.cdf(4.5f64) - 0.935_894_746_496_076_2).abs() < 0.000_000_01); 475 | } 476 | 477 | #[test] 478 | fn extreme_distribution_p_value() { 479 | let dist = ExtremeDistribution { 480 | location: 0.7, 481 | scale: 1.4, 482 | }; 483 | 484 | assert!((dist.p_value(8f64) - 0.005_423_555_727_838_702_5).abs() < 0.000_000_01); 485 | } 486 | } 487 | -------------------------------------------------------------------------------- /src/query_aligner.rs: -------------------------------------------------------------------------------- 1 | use core::slice; 2 | use std::{ 3 | marker::PhantomData, 4 | mem, 5 | num::NonZeroUsize, 6 | ops::{self, Not, Range, RangeInclusive}, 7 | sync::Arc, 8 | }; 9 | 10 | use crate::{ 11 | aligner::{ 12 | calc_seed_align_tolerance, trimmed_range, AlignBehavior, AlignParams, AlignTolerance, 13 | Aligner, AlignmentResult, Direction, 14 | }, 15 | calc_seed_alignment_score, 16 | cli::Cli, 17 | db_file, get_matching_kmers, group_matching_kmers, query_file, DbData, DbEntryMatches, 18 | MatchRanges, Reactivity, SequenceEntry, 19 | }; 20 | 21 | pub(crate) fn align_query_to_target_db<'a, 'cli, Behavior>( 22 | query_entry: &'a query_file::Entry, 23 | db_entries: &'a [db_file::Entry], 24 | db_entries_orig: &'a [db_file::Entry], 25 | query_results: &mut Vec>, 26 | cli: &'cli Cli, 27 | ) -> anyhow::Result> { 28 | query_results.clear(); 29 | db_entries 30 | .iter() 31 | .zip(db_entries_orig) 32 | .try_for_each(|(db_entry, db_entry_orig)| { 33 | let db_file::Entry { 34 
| sequence, 35 | reactivity, 36 | .. 37 | } = db_entry; 38 | 39 | let db_data = DbData::new(sequence, reactivity)?; 40 | let matching_kmers = get_matching_kmers( 41 | query_entry.reactivity(), 42 | query_entry.sequence(), 43 | &db_data, 44 | cli, 45 | )?; 46 | let grouped = group_matching_kmers(&matching_kmers, cli); 47 | 48 | if grouped.is_empty().not() { 49 | query_results.push(DbEntryMatches { 50 | db_entry, 51 | db_entry_orig, 52 | matches: grouped, 53 | }); 54 | } 55 | Ok::<_, anyhow::Error>(()) 56 | })?; 57 | 58 | Ok(QueryAligner { 59 | query_entry, 60 | cli, 61 | _marker: PhantomData, 62 | }) 63 | } 64 | 65 | pub(crate) struct QueryAligner<'a, 'cli, Behavior> { 66 | query_entry: &'a query_file::Entry, 67 | cli: &'cli Cli, 68 | _marker: PhantomData, 69 | } 70 | 71 | impl<'a, 'cli, Behavior> QueryAligner<'a, 'cli, Behavior> { 72 | pub(crate) fn into_iter<'aln>( 73 | self, 74 | query_results: &'a [DbEntryMatches<'a>], 75 | aligner: &'aln mut Aligner<'cli>, 76 | ) -> QueryAlignIterator<'a, 'cli, 'aln, Behavior> { 77 | let Self { 78 | query_entry, 79 | cli, 80 | _marker, 81 | } = self; 82 | 83 | let query_results = query_results.iter(); 84 | QueryAlignIterator(QueryAlignIteratorEnum::Empty { 85 | query_results, 86 | query_entry, 87 | cli, 88 | aligner, 89 | }) 90 | } 91 | } 92 | 93 | pub(crate) struct QueryAlignIterator<'a, 'cli, 'aln, Behavior>( 94 | QueryAlignIteratorEnum<'a, 'cli, 'aln, Behavior>, 95 | ); 96 | 97 | enum QueryAlignIteratorEnum<'a, 'cli, 'aln, Behavior> { 98 | Empty { 99 | query_results: slice::Iter<'a, DbEntryMatches<'a>>, 100 | query_entry: &'a query_file::Entry, 101 | cli: &'cli Cli, 102 | aligner: &'aln mut Aligner<'cli>, 103 | }, 104 | Full { 105 | query_results: slice::Iter<'a, DbEntryMatches<'a>>, 106 | iter: QueryAlignIteratorInner<'a, 'cli, 'aln, Behavior>, 107 | query_entry: &'a query_file::Entry, 108 | cli: &'cli Cli, 109 | }, 110 | Finished, 111 | } 112 | 113 | impl<'a, 'cli, 'aln, Behavior> QueryAlignIterator<'a, 'cli, 'aln, 
Behavior> { 114 | fn make_new_iter(&mut self) -> Option<&mut QueryAlignIteratorInner<'a, 'cli, 'aln, Behavior>> { 115 | match mem::replace(&mut self.0, QueryAlignIteratorEnum::Finished) { 116 | QueryAlignIteratorEnum::Empty { 117 | query_results, 118 | query_entry, 119 | cli, 120 | aligner, 121 | } => self.create_new_state(query_results, query_entry, cli, aligner), 122 | QueryAlignIteratorEnum::Full { 123 | query_results, 124 | iter, 125 | query_entry, 126 | cli, 127 | } => { 128 | let aligner = iter.aligner; 129 | self.create_new_state(query_results, query_entry, cli, aligner) 130 | } 131 | QueryAlignIteratorEnum::Finished => None, 132 | } 133 | } 134 | 135 | fn create_new_state( 136 | &mut self, 137 | mut query_results: slice::Iter<'a, DbEntryMatches<'a>>, 138 | query_entry: &'a query_file::Entry, 139 | cli: &'cli Cli, 140 | aligner: &'aln mut Aligner<'cli>, 141 | ) -> Option<&mut QueryAlignIteratorInner<'a, 'cli, 'aln, Behavior>> { 142 | query_results.next().map(|query_result| { 143 | let &DbEntryMatches { 144 | db_entry, 145 | db_entry_orig, 146 | matches: ref db, 147 | } = query_result; 148 | 149 | let iter = QueryAlignIteratorInner { 150 | aligner, 151 | db_iter: db.iter(), 152 | query_entry, 153 | db_entry, 154 | db_entry_orig, 155 | cli, 156 | _marker: PhantomData, 157 | }; 158 | 159 | self.0 = QueryAlignIteratorEnum::Full { 160 | query_results, 161 | iter, 162 | query_entry, 163 | cli, 164 | }; 165 | 166 | match &mut self.0 { 167 | QueryAlignIteratorEnum::Full { iter, .. 
} => iter, 168 | _ => unreachable!(), 169 | } 170 | }) 171 | } 172 | 173 | #[inline] 174 | fn get_next_from_new_iter(&mut self) -> Option> 175 | where 176 | Behavior: AlignBehavior, 177 | ::Alignment: std::fmt::Debug, 178 | { 179 | loop { 180 | let next = self.make_new_iter()?.next(); 181 | if next.is_some() { 182 | break next; 183 | } 184 | } 185 | } 186 | } 187 | 188 | impl<'a, Behavior> Iterator for QueryAlignIterator<'a, '_, '_, Behavior> 189 | where 190 | Behavior: AlignBehavior, 191 | ::Alignment: std::fmt::Debug, 192 | { 193 | type Item = QueryAlignResult<'a, Behavior::Alignment>; 194 | 195 | fn next(&mut self) -> Option { 196 | match &mut self.0 { 197 | QueryAlignIteratorEnum::Empty { .. } => self.get_next_from_new_iter(), 198 | QueryAlignIteratorEnum::Full { iter, .. } => match iter.next() { 199 | Some(item) => Some(item), 200 | None => self.get_next_from_new_iter(), 201 | }, 202 | QueryAlignIteratorEnum::Finished => None, 203 | } 204 | } 205 | } 206 | 207 | struct QueryAlignIteratorInner<'a, 'cli, 'aln, Behavior> { 208 | aligner: &'aln mut Aligner<'cli>, 209 | db_iter: slice::Iter<'a, MatchRanges>, 210 | query_entry: &'a query_file::Entry, 211 | db_entry: &'a db_file::Entry, 212 | db_entry_orig: &'a db_file::Entry, 213 | cli: &'cli Cli, 214 | _marker: PhantomData, 215 | } 216 | 217 | #[derive(Debug, Clone, PartialEq)] 218 | pub(crate) struct QueryAlignResult<'a, Alignment> { 219 | pub(crate) db_entry: &'a db_file::Entry, 220 | pub(crate) db_entry_orig: &'a db_file::Entry, 221 | pub(crate) db_match: MatchRanges, 222 | pub(crate) score: Reactivity, 223 | pub(crate) db: ops::RangeInclusive, 224 | pub(crate) query: ops::RangeInclusive, 225 | pub(crate) alignment: Arc>, 226 | } 227 | 228 | impl<'a, Behavior> Iterator for QueryAlignIteratorInner<'a, '_, '_, Behavior> 229 | where 230 | Behavior: AlignBehavior, 231 | ::Alignment: std::fmt::Debug, 232 | { 233 | type Item = QueryAlignResult<'a, Behavior::Alignment>; 234 | 235 | fn next(&mut self) -> Option { 236 | 
let &mut Self { 237 | ref mut aligner, 238 | ref mut db_iter, 239 | query_entry, 240 | db_entry, 241 | db_entry_orig, 242 | cli, 243 | _marker, 244 | } = self; 245 | 246 | loop { 247 | let db_match = db_iter.next()?; 248 | let seed_score = calc_seed_alignment_score( 249 | query_entry, 250 | db_entry, 251 | db_match.query.clone(), 252 | db_match.db.clone(), 253 | cli, 254 | ); 255 | 256 | if seed_score <= 0. { 257 | continue; 258 | } 259 | 260 | let result = handle_match::( 261 | db_match.clone(), 262 | query_entry, 263 | db_entry, 264 | db_entry_orig, 265 | seed_score, 266 | aligner, 267 | cli, 268 | ); 269 | break Some(result); 270 | } 271 | } 272 | } 273 | 274 | fn handle_match<'db, 'cli, Behavior>( 275 | db_match: MatchRanges, 276 | query_entry: &query_file::Entry, 277 | db_entry: &'db db_file::Entry, 278 | db_entry_orig: &'db db_file::Entry, 279 | seed_score: f32, 280 | aligner: &mut Aligner<'cli>, 281 | cli: &'cli Cli, 282 | ) -> QueryAlignResult<'db, Behavior::Alignment> 283 | where 284 | Behavior: AlignBehavior, 285 | ::Alignment: std::fmt::Debug, 286 | { 287 | let MatchRanges { db, query } = db_match.clone(); 288 | 289 | let AlignToleranceData { 290 | trimmed_query_range, 291 | query, 292 | db, 293 | seed_length, 294 | align_tolerance, 295 | } = get_align_tolerance_data(query_entry, db_entry, query, db, cli); 296 | 297 | let query_len = trimmed_query_range.len(); 298 | let align_tolerance = &align_tolerance; 299 | 300 | let upstream_result = aligner.align::(AlignParams { 301 | query: query_entry, 302 | target: db_entry, 303 | query_range: query.clone(), 304 | target_range: db.clone(), 305 | seed_score, 306 | align_tolerance, 307 | direction: Direction::Upstream, 308 | }); 309 | 310 | let downstream_result = aligner.align::(AlignParams { 311 | query: query_entry, 312 | target: db_entry, 313 | query_range: query, 314 | target_range: db, 315 | seed_score: upstream_result.score, 316 | align_tolerance, 317 | direction: Direction::Downstream, 318 | }); 319 | 320 
| debug_assert!(upstream_result.query_index >= trimmed_query_range.start); 321 | debug_assert!(downstream_result.query_index < trimmed_query_range.end); 322 | debug_assert!(downstream_result.target_index < db_entry.reactivity().len()); 323 | 324 | let query = upstream_result.query_index..=downstream_result.query_index; 325 | let db = upstream_result.target_index..=downstream_result.target_index; 326 | let aligned_query_len = downstream_result.query_index + 1 - upstream_result.query_index; 327 | debug_assert!(aligned_query_len >= seed_length.get()); 328 | 329 | // It is ok to lose precision to evaluate the score 330 | #[allow(clippy::cast_precision_loss, clippy::cast_lossless)] 331 | let score = downstream_result.score as f64 332 | * ((aligned_query_len as f64).ln() / (query_len as f64).ln()); 333 | 334 | // We don't need so much precision on score after the calculation 335 | #[allow(clippy::cast_possible_truncation)] 336 | let score = score as Reactivity; 337 | 338 | let query_alignment = Behavior::merge_upstream_downstream( 339 | upstream_result.query_alignment, 340 | downstream_result.query_alignment, 341 | seed_length, 342 | ); 343 | let target_alignment = Behavior::merge_upstream_downstream( 344 | upstream_result.target_alignment, 345 | downstream_result.target_alignment, 346 | seed_length, 347 | ); 348 | let alignment = Arc::new(AlignmentResult { 349 | query: query_alignment, 350 | target: target_alignment, 351 | }); 352 | 353 | QueryAlignResult { 354 | db_entry, 355 | db_entry_orig, 356 | db_match, 357 | score, 358 | db, 359 | query, 360 | alignment, 361 | } 362 | } 363 | 364 | #[inline] 365 | fn get_align_tolerance_data( 366 | query_entry: &query_file::Entry, 367 | db_entry: &db_file::Entry, 368 | query: RangeInclusive, 369 | db: RangeInclusive, 370 | cli: &Cli, 371 | ) -> AlignToleranceData { 372 | let trimmed_query_range = trimmed_range(query_entry.reactivity()); 373 | let query = intersect_range(query, trimmed_query_range.clone()); 374 | 375 | let 
seed_length = db.end() - db.start() + 1; 376 | debug_assert_eq!(seed_length, query.end() - query.start() + 1); 377 | let seed_length = NonZeroUsize::new(seed_length) 378 | .expect("seed must have a length greater than zero (and more)"); 379 | 380 | let align_tolerance = calc_seed_align_tolerance( 381 | query.clone(), 382 | db.clone(), 383 | trimmed_query_range.clone(), 384 | db_entry.reactivity().len(), 385 | cli.alignment_args.align_len_tolerance, 386 | ); 387 | 388 | AlignToleranceData { 389 | trimmed_query_range, 390 | query, 391 | db, 392 | seed_length, 393 | align_tolerance, 394 | } 395 | } 396 | 397 | struct AlignToleranceData { 398 | trimmed_query_range: Range, 399 | query: RangeInclusive, 400 | db: RangeInclusive, 401 | seed_length: NonZeroUsize, 402 | align_tolerance: AlignTolerance, 403 | } 404 | 405 | #[inline] 406 | fn intersect_range(a: RangeInclusive, b: Range) -> RangeInclusive { 407 | let start = *a.start().max(&b.start); 408 | let end = *a.end().min(&b.end.saturating_sub(1)); 409 | start..=end 410 | } 411 | 412 | #[cfg(test)] 413 | mod tests { 414 | use std::io::Cursor; 415 | 416 | use crate::aligner::{AlignResult, AlignedSequence, BacktrackBehavior, BaseOrGap}; 417 | 418 | use super::*; 419 | 420 | const TEST_DB: &[u8] = include_bytes!("../test_data/test.db"); 421 | const QUERY: &[u8] = include_bytes!("../test_data/query_align.txt"); 422 | const QUERY_RANGE: RangeInclusive = 0..=138; 423 | const DB_RANGE: RangeInclusive = 1350..=1488; 424 | const QUERY_SEED_RANGE: RangeInclusive = 0..=70; 425 | const DB_SEED_RANGE: RangeInclusive = 1350..=1420; 426 | const SEED_SCORE: f32 = 96.34601; 427 | 428 | struct TestEntries { 429 | db_entry_orig: db_file::Entry, 430 | db_entry: db_file::Entry, 431 | query_entry: query_file::Entry, 432 | cli: Cli, 433 | } 434 | 435 | fn get_test_entries() -> TestEntries { 436 | let cli = Cli::dummy(); 437 | 438 | let mut test_db = db_file::native::Reader::new(Cursor::new(TEST_DB)).unwrap(); 439 | let db_entry_orig = test_db 
440 | .entries() 441 | .map(Result::unwrap) 442 | .find(|entry| entry.id == "16S_Bsubtilis") 443 | .unwrap(); 444 | let mut db_entry = db_entry_orig.clone(); 445 | db_entry.cap_reactivities(cli.max_reactivity); 446 | 447 | let mut queries = query_file::read_file_content(Cursor::new(QUERY)) 448 | .unwrap() 449 | .into_iter(); 450 | let mut query_entry = queries.next().unwrap(); 451 | assert!(queries.next().is_none()); 452 | query_entry.cap_reactivities(cli.max_reactivity); 453 | 454 | TestEntries { 455 | db_entry_orig, 456 | db_entry, 457 | query_entry, 458 | cli, 459 | } 460 | } 461 | 462 | #[test] 463 | fn simple_alignment() { 464 | let TestEntries { 465 | db_entry_orig, 466 | db_entry, 467 | query_entry, 468 | cli, 469 | } = get_test_entries(); 470 | 471 | let match_range = MatchRanges { 472 | db: DB_SEED_RANGE, 473 | query: QUERY_SEED_RANGE, 474 | }; 475 | 476 | let seed_score = calc_seed_alignment_score( 477 | &query_entry, 478 | &db_entry, 479 | match_range.query.clone(), 480 | match_range.db.clone(), 481 | &cli, 482 | ); 483 | assert!(f32::abs(seed_score - SEED_SCORE) < 0.0001); 484 | 485 | let mut aligner = Aligner::new(&cli); 486 | 487 | let matched = handle_match::( 488 | match_range, 489 | &query_entry, 490 | &db_entry, 491 | &db_entry_orig, 492 | seed_score, 493 | &mut aligner, 494 | &cli, 495 | ); 496 | assert_eq!(matched.db, DB_RANGE); 497 | assert_eq!(matched.query, QUERY_RANGE); 498 | 499 | assert_eq!(matched.alignment.target.0.len(), 139); 500 | assert_eq!(matched.alignment.query.0.len(), 139); 501 | 502 | assert!(matched 503 | .alignment 504 | .target 505 | .0 506 | .iter() 507 | .all(|base_or_gap| base_or_gap.is_base())); 508 | assert!(matched 509 | .alignment 510 | .query 511 | .0 512 | .iter() 513 | .all(|base_or_gap| base_or_gap.is_base())); 514 | } 515 | 516 | #[test] 517 | fn empty_upstream_alignment() { 518 | let TestEntries { 519 | db_entry, 520 | query_entry, 521 | cli, 522 | .. 
523 | } = get_test_entries(); 524 | 525 | let AlignToleranceData { 526 | query, 527 | db, 528 | align_tolerance, 529 | .. 530 | } = get_align_tolerance_data( 531 | &query_entry, 532 | &db_entry, 533 | QUERY_SEED_RANGE, 534 | DB_SEED_RANGE, 535 | &cli, 536 | ); 537 | let align_tolerance = &align_tolerance; 538 | 539 | let mut aligner = Aligner::new(&cli); 540 | 541 | let upstream_result = aligner.align::(AlignParams { 542 | query: &query_entry, 543 | target: &db_entry, 544 | query_range: query, 545 | target_range: db, 546 | seed_score: SEED_SCORE, 547 | align_tolerance, 548 | direction: Direction::Upstream, 549 | }); 550 | 551 | let AlignResult { 552 | query_index, 553 | query_alignment, 554 | target_index, 555 | target_alignment, 556 | score, 557 | } = upstream_result; 558 | 559 | assert_eq!(query_index, *QUERY_RANGE.start()); 560 | assert_eq!(query_alignment, AlignedSequence(vec![BaseOrGap::Base])); 561 | assert_eq!(target_index, *DB_RANGE.start()); 562 | assert_eq!(target_alignment, AlignedSequence(vec![BaseOrGap::Base])); 563 | assert!((score - SEED_SCORE).abs() < 0.0001); 564 | } 565 | 566 | #[test] 567 | fn downstream_alignment() { 568 | let TestEntries { 569 | db_entry, 570 | query_entry, 571 | cli, 572 | .. 573 | } = get_test_entries(); 574 | 575 | let AlignToleranceData { 576 | query, 577 | db, 578 | align_tolerance, 579 | .. 
580 | } = get_align_tolerance_data( 581 | &query_entry, 582 | &db_entry, 583 | QUERY_SEED_RANGE, 584 | DB_SEED_RANGE, 585 | &cli, 586 | ); 587 | let align_tolerance = &align_tolerance; 588 | 589 | let mut aligner = Aligner::new(&cli); 590 | 591 | let downstream_result = aligner.align::(AlignParams { 592 | query: &query_entry, 593 | target: &db_entry, 594 | query_range: query, 595 | target_range: db, 596 | seed_score: SEED_SCORE, 597 | align_tolerance, 598 | direction: Direction::Downstream, 599 | }); 600 | 601 | let AlignResult { 602 | query_index, 603 | query_alignment, 604 | target_index, 605 | target_alignment, 606 | score, 607 | } = downstream_result; 608 | 609 | assert_eq!(query_index, *QUERY_RANGE.end()); 610 | assert_eq!( 611 | query_alignment.0.len(), 612 | QUERY_RANGE.end() + 1 - QUERY_SEED_RANGE.end() 613 | ); 614 | assert!(query_alignment 615 | .0 616 | .iter() 617 | .all(|base_or_gap| base_or_gap.is_base())); 618 | assert_eq!(target_index, *DB_RANGE.end()); 619 | assert_eq!( 620 | target_alignment.0.len(), 621 | DB_RANGE.end() + 1 - DB_SEED_RANGE.end() 622 | ); 623 | assert!(target_alignment 624 | .0 625 | .iter() 626 | .all(|base_or_gap| base_or_gap.is_base())); 627 | assert!((score - 129.59003).abs() < 0.00001); 628 | } 629 | } 630 | -------------------------------------------------------------------------------- /src/query_file.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | error::Error as StdError, 3 | fmt::{self, Display}, 4 | fs::File, 5 | io::{self, BufRead, BufReader}, 6 | ops::Not, 7 | path::Path, 8 | sync::Arc, 9 | }; 10 | 11 | use crate::{db_file::ReactivityWithPlaceholder, Base, Molecule, Reactivity, SequenceEntry}; 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct Entry { 15 | pub name: Arc, 16 | sequence: Vec, 17 | reactivities: Vec, 18 | pub(crate) molecule: Molecule, 19 | } 20 | 21 | impl Entry { 22 | #[cfg(test)] 23 | pub(crate) fn new_unchecked( 24 | name: impl Into>, 25 | 
sequence: Vec, 26 | reactivities: Vec, 27 | molecule: Molecule, 28 | ) -> Self { 29 | let name = name.into(); 30 | Self { 31 | name, 32 | sequence, 33 | reactivities, 34 | molecule, 35 | } 36 | } 37 | 38 | pub fn cap_reactivities(&mut self, max_reactivity: Reactivity) { 39 | self.reactivities.iter_mut().for_each(|reactivity| { 40 | if let Some(x) = reactivity.get_non_nan() { 41 | *reactivity = x.min(max_reactivity).into(); 42 | } 43 | }); 44 | } 45 | } 46 | 47 | impl SequenceEntry for Entry { 48 | type Reactivity = ReactivityWithPlaceholder; 49 | 50 | fn name(&self) -> &str { 51 | &self.name 52 | } 53 | 54 | fn sequence(&self) -> &[Base] { 55 | &self.sequence 56 | } 57 | 58 | fn reactivity(&self) -> &[Self::Reactivity] { 59 | &self.reactivities 60 | } 61 | 62 | fn molecule(&self) -> Molecule { 63 | self.molecule 64 | } 65 | } 66 | 67 | #[derive(Debug)] 68 | pub enum Error { 69 | TruncatedExpectedSequence, 70 | TruncatedExpectedReactivities, 71 | InvalidSequenceBase(RowColumn), 72 | InvalidReactivity(RowColumn), 73 | EmptySequence(usize), 74 | UnmatchedLengths(UnmatchedLengths), 75 | OpenFile(io::Error), 76 | ReadNameLine(io::Error), 77 | ReadSequenceLine(io::Error), 78 | ReadReactivityLine(io::Error), 79 | } 80 | 81 | impl Display for Error { 82 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 83 | match self { 84 | Error::TruncatedExpectedSequence => f.write_str("file truncated, expected sequence"), 85 | Error::TruncatedExpectedReactivities => { 86 | f.write_str("file truncated, expected reactivities") 87 | } 88 | Error::InvalidSequenceBase(row_column) => { 89 | write!( 90 | f, 91 | "invalid sequence base at line {} and column {}", 92 | row_column.row, row_column.column, 93 | ) 94 | } 95 | Error::InvalidReactivity(row_column) => write!( 96 | f, 97 | "invalid reactivity at line {} and column {}", 98 | row_column.row, row_column.column, 99 | ), 100 | Error::EmptySequence(row) => write!(f, "unexpected empty sequence at line {row}"), 101 | 
Error::UnmatchedLengths(lengths) => { 102 | write!( 103 | f, 104 | "unmatching lengths between sequence ({}) and reactivities ({}) for query \ 105 | starting at line {}", 106 | lengths.sequence, lengths.reactivities, lengths.line, 107 | ) 108 | } 109 | Error::OpenFile(_) => f.write_str("cannot open file"), 110 | Error::ReadNameLine(_) => f.write_str("cannot read line containing sequence name"), 111 | Error::ReadSequenceLine(_) => f.write_str("cannot read line containing sequence data"), 112 | Error::ReadReactivityLine(_) => { 113 | f.write_str("cannot read line containing sequence reactivity") 114 | } 115 | } 116 | } 117 | } 118 | 119 | impl StdError for Error { 120 | fn source(&self) -> Option<&(dyn StdError + 'static)> { 121 | match self { 122 | Error::TruncatedExpectedSequence 123 | | Error::TruncatedExpectedReactivities 124 | | Error::InvalidSequenceBase(_) 125 | | Error::InvalidReactivity(_) 126 | | Error::EmptySequence(_) 127 | | Error::UnmatchedLengths(_) => None, 128 | 129 | Error::OpenFile(source) 130 | | Error::ReadNameLine(source) 131 | | Error::ReadSequenceLine(source) 132 | | Error::ReadReactivityLine(source) => Some(source), 133 | } 134 | } 135 | } 136 | 137 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 138 | pub struct RowColumn { 139 | pub row: usize, 140 | pub column: usize, 141 | } 142 | 143 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 144 | pub struct UnmatchedLengths { 145 | sequence: usize, 146 | reactivities: usize, 147 | line: usize, 148 | } 149 | 150 | #[inline] 151 | pub fn read_file(path: &Path) -> Result, Error> { 152 | let reader = BufReader::new(File::open(path).map_err(Error::OpenFile)?); 153 | read_file_content(reader) 154 | } 155 | 156 | pub fn read_file_content(mut reader: R) -> Result, Error> 157 | where 158 | R: BufRead, 159 | { 160 | let mut line = String::new(); 161 | let mut entries = Vec::new(); 162 | 163 | let mut file_row = 0; 164 | loop { 165 | line.clear(); 166 | file_row += 1; 167 | if reader.read_line(&mut 
line).map_err(Error::ReadNameLine)? == 0 { 168 | break; 169 | } 170 | 171 | if line.as_bytes().iter().all(u8::is_ascii_whitespace) { 172 | continue; 173 | } 174 | 175 | let name = Arc::from(line.trim().to_string()); 176 | 177 | file_row += 1; 178 | line.clear(); 179 | if reader 180 | .read_line(&mut line) 181 | .map_err(Error::ReadSequenceLine)? 182 | == 0 183 | { 184 | return Err(Error::TruncatedExpectedSequence); 185 | } 186 | 187 | let (sequence, molecule) = parse_sequence(&line, file_row)?; 188 | 189 | if sequence.is_empty() { 190 | return Err(Error::EmptySequence(file_row)); 191 | } 192 | 193 | file_row += 1; 194 | line.clear(); 195 | if reader 196 | .read_line(&mut line) 197 | .map_err(Error::ReadReactivityLine)? 198 | == 0 199 | { 200 | return Err(Error::TruncatedExpectedReactivities); 201 | } 202 | 203 | let mut column = 1; 204 | let reactivities = line 205 | .trim_end() 206 | .split(',') 207 | .map(|raw_reactivity| { 208 | if column != 1 { 209 | column += 1; 210 | } 211 | 212 | let reactivity = if raw_reactivity.eq_ignore_ascii_case("NaN") { 213 | ReactivityWithPlaceholder::from(Reactivity::NAN) 214 | } else { 215 | raw_reactivity 216 | .parse::() 217 | .map(ReactivityWithPlaceholder::from) 218 | .map_err(|_| { 219 | Error::InvalidReactivity(RowColumn { 220 | row: file_row, 221 | column, 222 | }) 223 | })? 
224 | }; 225 | 226 | column += raw_reactivity.len(); 227 | Ok::<_, Error>(reactivity) 228 | }) 229 | .collect::, _>>()?; 230 | 231 | if sequence.len() != reactivities.len() { 232 | return Err(Error::UnmatchedLengths(UnmatchedLengths { 233 | sequence: sequence.len(), 234 | reactivities: reactivities.len(), 235 | line: file_row - 2, 236 | })); 237 | } 238 | 239 | entries.push(Entry { 240 | name, 241 | sequence, 242 | reactivities, 243 | molecule, 244 | }); 245 | } 246 | 247 | Ok(entries) 248 | } 249 | 250 | fn parse_sequence(raw_line: &str, row: usize) -> Result<(Vec, Molecule), Error> { 251 | let mut molecule = Molecule::default(); 252 | raw_line 253 | .as_bytes() 254 | .iter() 255 | .copied() 256 | .enumerate() 257 | .skip_while(|(_, c)| c.is_ascii_whitespace()) 258 | .take_while(|(_, c)| c.is_ascii_whitespace().not()) 259 | .map(|(index, c)| { 260 | match (c, molecule) { 261 | (b'T', Molecule::Unknown) => molecule = Molecule::Dna, 262 | (b'U', Molecule::Unknown) => molecule = Molecule::Rna, 263 | (b'T', Molecule::Rna) | (b'U', Molecule::Dna) => { 264 | return Err(Error::InvalidSequenceBase(RowColumn { 265 | row, 266 | column: index + 1, 267 | })); 268 | } 269 | _ => {} 270 | } 271 | 272 | Base::try_from(c).map_err(|_| { 273 | Error::InvalidSequenceBase(RowColumn { 274 | row, 275 | column: index + 1, 276 | }) 277 | }) 278 | }) 279 | .collect::>() 280 | .map(|sequence| (sequence, molecule)) 281 | } 282 | 283 | #[cfg(test)] 284 | mod tests { 285 | use std::io::Cursor; 286 | 287 | use super::*; 288 | 289 | macro_rules! base { 290 | (A) => { 291 | crate::Base::A 292 | }; 293 | 294 | (C) => { 295 | crate::Base::C 296 | }; 297 | 298 | (G) => { 299 | crate::Base::G 300 | }; 301 | 302 | (T) => { 303 | crate::Base::T 304 | }; 305 | 306 | (N) => { 307 | crate::Base::N 308 | }; 309 | } 310 | 311 | macro_rules! seq { 312 | ([$($bases:expr),*] $(,)?) => { 313 | &[$($bases),*] 314 | }; 315 | 316 | ([$($bases:expr),* $(,)?] 
$base:ident $($rest:ident)*) => { 317 | seq!([$($bases,)* base!($base)] $($rest)*) 318 | }; 319 | 320 | ($($bases:ident)*) => { 321 | seq!([] $($bases)*) 322 | }; 323 | } 324 | 325 | fn reactivities_eq(a: I1, b: I2) -> bool 326 | where 327 | I1: IntoIterator, 328 | I2: IntoIterator, 329 | { 330 | a.into_iter().zip(b).all(|(a, b)| { 331 | if b.is_nan() { 332 | a.is_nan() 333 | } else { 334 | (a.to_maybe_placeholder() - b).abs() < 10e-5 335 | } 336 | }) 337 | } 338 | 339 | #[test] 340 | fn read_valid_file() { 341 | const CONTENT: &str = include_str!("../test_data/valid_query.txt"); 342 | let entries = read_file_content(Cursor::new(CONTENT)).unwrap(); 343 | 344 | assert_eq!(entries.len(), 2); 345 | assert_eq!(&*entries[0].name, "test1"); 346 | assert_eq!(entries[0].sequence, seq!(A C G T N)); 347 | assert!(reactivities_eq( 348 | entries[0].reactivities.iter().copied(), 349 | [0.123, 0.456, 0.789, 1.234, Reactivity::NAN] 350 | )); 351 | 352 | assert_eq!(&*entries[1].name, "test2"); 353 | assert_eq!(entries[1].sequence, seq!(N A C G T)); 354 | assert!(reactivities_eq( 355 | entries[1].reactivities.iter().copied(), 356 | [Reactivity::NAN, 12., 0.456, 0.789, 0.012] 357 | )); 358 | } 359 | 360 | #[test] 361 | fn empty_sequence() { 362 | const CONTENT: &str = include_str!("../test_data/query_empty_sequence.txt"); 363 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 364 | 365 | assert!(matches!(err, Error::EmptySequence(6))); 366 | } 367 | 368 | #[test] 369 | fn truncated_sequence() { 370 | const CONTENT: &str = include_str!("../test_data/query_truncated_sequence.txt"); 371 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 372 | 373 | assert!(matches!(err, Error::TruncatedExpectedSequence)); 374 | } 375 | 376 | #[test] 377 | fn truncated_reactivities() { 378 | const CONTENT: &str = include_str!("../test_data/query_truncated_reactivities.txt"); 379 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 380 | 381 | 
assert!(matches!(err, Error::TruncatedExpectedReactivities)); 382 | } 383 | 384 | #[test] 385 | fn invalid_sequence_base() { 386 | const CONTENT: &str = include_str!("../test_data/query_invalid_base.txt"); 387 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 388 | 389 | if let Error::InvalidSequenceBase(err) = err { 390 | assert_eq!(err, RowColumn { row: 6, column: 3 }); 391 | } else { 392 | panic!() 393 | } 394 | } 395 | 396 | #[test] 397 | fn invalid_sequence_reactivity() { 398 | const CONTENT: &str = include_str!("../test_data/query_invalid_reactivity.txt"); 399 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 400 | 401 | if let Error::InvalidReactivity(err) = err { 402 | assert_eq!(err, RowColumn { row: 7, column: 11 }); 403 | } else { 404 | panic!() 405 | } 406 | } 407 | 408 | #[test] 409 | fn invalid_lengths() { 410 | const CONTENT: &str = include_str!("../test_data/query_invalid_lengths.txt"); 411 | let err = read_file_content(Cursor::new(CONTENT)).unwrap_err(); 412 | 413 | if let Error::UnmatchedLengths(err) = err { 414 | assert_eq!( 415 | err, 416 | UnmatchedLengths { 417 | sequence: 6, 418 | reactivities: 5, 419 | line: 5, 420 | } 421 | ); 422 | } else { 423 | panic!() 424 | } 425 | } 426 | 427 | #[test] 428 | fn cap_reactivities() { 429 | const CONTENT: &str = include_str!("../test_data/valid_query.txt"); 430 | let mut entries = read_file_content(Cursor::new(CONTENT)).unwrap(); 431 | entries 432 | .iter_mut() 433 | .for_each(|entry| entry.cap_reactivities(1.)); 434 | 435 | assert!(reactivities_eq( 436 | entries[0].reactivities.iter().copied(), 437 | [0.123, 0.456, 0.789, 1., Reactivity::NAN] 438 | )); 439 | 440 | assert!(reactivities_eq( 441 | entries[1].reactivities.iter().copied(), 442 | [Reactivity::NAN, 1., 0.456, 0.789, 0.012] 443 | )); 444 | } 445 | } 446 | -------------------------------------------------------------------------------- /src/query_result.rs: 
-------------------------------------------------------------------------------- 1 | use std::{ 2 | fmt::{self, Display}, 3 | ops, 4 | sync::Arc, 5 | }; 6 | 7 | use num_traits::{Float, FromPrimitive}; 8 | use serde::{ser::SerializeStruct, Deserialize, Deserializer, Serialize, Serializer}; 9 | use tabled::Tabled; 10 | 11 | use crate::{ 12 | aligner::{AlignedSequence, AlignmentResult}, 13 | dotbracket::DotBracketOwnedSorted, 14 | }; 15 | 16 | #[derive(Debug, Deserialize, Serialize, Tabled)] 17 | pub struct QueryResult { 18 | #[serde(rename = "Query")] 19 | pub query: Arc, 20 | 21 | #[serde(rename = "DB entry")] 22 | pub db_entry: String, 23 | 24 | #[serde(rename = "Qstart")] 25 | pub query_start: usize, 26 | 27 | #[serde(rename = "Qend")] 28 | pub query_end: usize, 29 | 30 | #[serde(rename = "Dstart")] 31 | pub db_start: usize, 32 | 33 | #[serde(rename = "Dend")] 34 | pub db_end: usize, 35 | 36 | #[serde(rename = "Qseed")] 37 | pub query_seed: Range, 38 | 39 | #[serde(rename = "Dseed")] 40 | pub db_seed: Range, 41 | 42 | #[serde(rename = "Score")] 43 | pub score: f32, 44 | 45 | #[serde(rename = "P-value")] 46 | #[tabled(display_with = "display_scientific")] 47 | pub pvalue: f64, 48 | 49 | #[serde(rename = "E-value")] 50 | #[tabled(display_with = "display_scientific")] 51 | pub evalue: f64, 52 | 53 | #[serde(rename = "TargetBpSupport")] 54 | #[tabled(display_with = "display_scientific_opt")] 55 | pub target_bp_support: Option, 56 | 57 | #[serde(rename = "QueryBpSupport")] 58 | #[tabled(display_with = "display_scientific_opt")] 59 | pub query_bp_support: Option, 60 | 61 | #[serde(rename = "MfePvalue")] 62 | #[tabled(display_with = "display_scientific_opt")] 63 | pub mfe_pvalue: Option, 64 | 65 | #[serde(rename = "")] 66 | pub status: Status, 67 | 68 | #[serde(skip)] 69 | #[tabled(skip)] 70 | pub alignment: Arc>, 71 | 72 | #[serde(skip)] 73 | #[tabled(skip)] 74 | pub dotbracket: Option, 75 | } 76 | 77 | impl QueryResult { 78 | pub fn new(query: impl Into>) -> Self { 79 
| let query = query.into(); 80 | Self { 81 | query, 82 | db_entry: String::default(), 83 | query_start: Default::default(), 84 | query_end: Default::default(), 85 | db_start: Default::default(), 86 | db_end: Default::default(), 87 | query_seed: Range::default(), 88 | db_seed: Range::default(), 89 | score: Default::default(), 90 | pvalue: Default::default(), 91 | evalue: Default::default(), 92 | status: Status::default(), 93 | target_bp_support: Option::default(), 94 | query_bp_support: Option::default(), 95 | mfe_pvalue: Option::default(), 96 | alignment: Arc::default(), 97 | dotbracket: Option::default(), 98 | } 99 | } 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct Serializeable<'a> { 104 | pub query_result: &'a QueryResult, 105 | pub eval_align_fold: bool, 106 | } 107 | 108 | impl serde::Serialize for Serializeable<'_> { 109 | fn serialize(&self, serializer: S) -> Result 110 | where 111 | S: Serializer, 112 | { 113 | let fields_count = if self.eval_align_fold { 15 } else { 12 }; 114 | let mut struc = serializer.serialize_struct("QueryResult", fields_count)?; 115 | 116 | let query_result = self.query_result; 117 | struc.serialize_field("Query", &query_result.query)?; 118 | struc.serialize_field("DB field", &query_result.db_entry)?; 119 | struc.serialize_field("Qstart", &query_result.query_start)?; 120 | struc.serialize_field("Qend", &query_result.query_end)?; 121 | struc.serialize_field("Dstart", &query_result.db_start)?; 122 | struc.serialize_field("Dend", &query_result.db_end)?; 123 | struc.serialize_field("Qseed", &query_result.query_seed)?; 124 | struc.serialize_field("Dseed", &query_result.db_seed)?; 125 | struc.serialize_field("Score", &query_result.score)?; 126 | struc.serialize_field("P-value", &display_scientific(&query_result.pvalue))?; 127 | struc.serialize_field("E-value", &display_scientific(&query_result.evalue))?; 128 | 129 | if self.eval_align_fold { 130 | struc.serialize_field( 131 | "TargetBpSupport", 132 | 
&display_scientific_opt(&query_result.target_bp_support), 133 | )?; 134 | struc.serialize_field( 135 | "QueryBpSupport", 136 | &display_scientific_opt(&query_result.query_bp_support), 137 | )?; 138 | struc.serialize_field( 139 | "MfePvalue", 140 | &display_scientific_opt(&query_result.mfe_pvalue), 141 | )?; 142 | } 143 | 144 | struc.serialize_field("", &query_result.status)?; 145 | struc.end() 146 | } 147 | } 148 | 149 | #[derive(Debug)] 150 | pub struct Range(pub ops::RangeInclusive); 151 | 152 | impl Default for Range { 153 | fn default() -> Self { 154 | Self(0..=0) 155 | } 156 | } 157 | 158 | impl Serialize for Range { 159 | fn serialize(&self, serializer: S) -> Result 160 | where 161 | S: serde::Serializer, 162 | { 163 | serializer.collect_str(self) 164 | } 165 | } 166 | 167 | impl<'de> Deserialize<'de> for Range { 168 | fn deserialize(deserializer: D) -> Result 169 | where 170 | D: Deserializer<'de>, 171 | { 172 | use serde::de::Error; 173 | 174 | let raw = <&str>::deserialize(deserializer)?; 175 | let mut split = raw.split('-').map(str::parse); 176 | let start = split 177 | .next() 178 | .ok_or_else(|| Error::custom("missing start in range"))? 179 | .map_err(|_| Error::custom("invalid start in range"))?; 180 | 181 | let end = split 182 | .next() 183 | .ok_or_else(|| Error::custom("missing end in range"))? 
184 | .map_err(|_| Error::custom("invalid end in range"))?; 185 | 186 | if split.next().is_some() { 187 | return Err(Error::custom("invalid range format")); 188 | } 189 | 190 | Ok(Self(start..=end)) 191 | } 192 | } 193 | 194 | impl fmt::Display for Range { 195 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 196 | write!(f, "{}-{}", self.0.start(), self.0.end()) 197 | } 198 | } 199 | 200 | #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 201 | pub enum Status { 202 | #[serde(rename = "!")] 203 | PassInclusionEvalue, 204 | 205 | #[serde(rename = "?")] 206 | PassReportEvalue, 207 | 208 | #[default] 209 | #[serde(rename = "")] 210 | NotPass, 211 | } 212 | 213 | impl fmt::Display for Status { 214 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 215 | match self { 216 | Self::PassInclusionEvalue => f.write_str("!"), 217 | Self::PassReportEvalue => f.write_str("?"), 218 | Self::NotPass => f.write_str(""), 219 | } 220 | } 221 | } 222 | 223 | fn display_scientific(x: &T) -> String 224 | where 225 | T: Float + FromPrimitive + Display + fmt::LowerExp, 226 | { 227 | if *x >= T::from_f32(0.1).unwrap() { 228 | format!("{x:.3}") 229 | } else { 230 | format!("{x:.3e}") 231 | } 232 | } 233 | 234 | #[allow(clippy::ref_option)] // The signature is because of serde 235 | fn display_scientific_opt(x: &Option) -> String 236 | where 237 | T: Float + FromPrimitive + Display + fmt::LowerExp, 238 | { 239 | x.as_ref().map(display_scientific).unwrap_or_default() 240 | } 241 | -------------------------------------------------------------------------------- /src/stockholm.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fmt::{self, Display}, 3 | fs::File, 4 | io::{self, BufWriter}, 5 | path::Path, 6 | rc::Rc, 7 | }; 8 | 9 | use anyhow::Context; 10 | 11 | use crate::{ 12 | db_file, gapped_sequence::GappedSequence, query_file, QueryResult, ResultFileFormat, 13 | SequenceEntry, 14 | }; 15 | 16 
| pub(crate) fn write_result( 17 | result: &QueryResult, 18 | db_entries: &[db_file::Entry], 19 | query_entries: &[query_file::Entry], 20 | alignments_path: &Path, 21 | ) -> Result<(), anyhow::Error> { 22 | let stockholm_path = alignments_path.join(format!("{}.sto", ResultFileFormat::from(result))); 23 | let file = File::create(stockholm_path).context("Unable to create stockholm file")?; 24 | let writer = BufWriter::new(file); 25 | 26 | write_result_to_writer(result, db_entries, query_entries, writer) 27 | } 28 | 29 | #[inline] 30 | fn write_result_to_writer( 31 | result: &QueryResult, 32 | db_entries: &[db_file::Entry], 33 | query_entries: &[query_file::Entry], 34 | writer: W, 35 | ) -> Result<(), anyhow::Error> { 36 | let &QueryResult { 37 | ref query, 38 | db_entry: ref db, 39 | query_start, 40 | query_end, 41 | db_start, 42 | db_end, 43 | ref alignment, 44 | ref dotbracket, 45 | .. 46 | } = result; 47 | 48 | let db_entry = db_entries 49 | .iter() 50 | .find(|entry| entry.name() == db) 51 | .expect("db entry should be available"); 52 | let query_entry = query_entries 53 | .iter() 54 | .find(|entry| entry.name() == &**query) 55 | .expect("query entry should be available"); 56 | 57 | let db_sequence = GappedSequence { 58 | sequence: crate::Sequence { 59 | bases: &db_entry.sequence()[db_start..=db_end], 60 | molecule: db_entry.molecule(), 61 | }, 62 | alignment: alignment.target.to_ref(), 63 | }; 64 | 65 | let query_sequence = GappedSequence { 66 | sequence: crate::Sequence { 67 | bases: &query_entry.sequence()[query_start..=query_end], 68 | molecule: query_entry.molecule(), 69 | }, 70 | alignment: alignment.query.to_ref(), 71 | }; 72 | 73 | let seq_label_align = db.len().max(query.len()).max("#=GC SS_cons".len()) + 1; 74 | 75 | let mut stockholm = Stockholm::default() 76 | .with_identification(ResultFileFormat::from(result)) 77 | .with_author(format!("SHAPEwarp {}", env!("CARGO_PKG_VERSION"))) 78 | .with_empty_line() 79 | 
.with_sequence(format!("{db:seq_label_align$}"), db_sequence) 80 | .with_sequence(format!("{query:seq_label_align$}"), query_sequence); 81 | 82 | if let Some(dotbracket) = dotbracket { 83 | stockholm = stockholm.with_column_annotation( 84 | format!("{:1$}", "SS_cons", seq_label_align - "#=GC ".len()), 85 | dotbracket, 86 | ); 87 | } 88 | 89 | stockholm.write(writer)?; 90 | 91 | Ok::<_, anyhow::Error>(()) 92 | } 93 | 94 | #[derive(Debug, Default, Clone, PartialEq)] 95 | pub struct Stockholm(Vec); 96 | 97 | impl Stockholm { 98 | pub fn write(&self, mut writer: W) -> io::Result<()> 99 | where 100 | W: io::Write, 101 | { 102 | writeln!(writer, "# STOCKHOLM 1.0")?; 103 | self.0 104 | .iter() 105 | .try_for_each(|entry| entry.write(&mut writer))?; 106 | writeln!(writer, "//")?; 107 | Ok(()) 108 | } 109 | 110 | pub fn with_identification(mut self, id: impl Display) -> Self { 111 | self.0 112 | .push(Entry::FeatureAnnotation(FeatureAnnotation::Identification( 113 | id.to_string(), 114 | ))); 115 | self 116 | } 117 | 118 | pub fn with_author(mut self, author: impl Display) -> Self { 119 | self.0 120 | .push(Entry::FeatureAnnotation(FeatureAnnotation::Author( 121 | author.to_string(), 122 | ))); 123 | self 124 | } 125 | 126 | pub fn with_sequence(mut self, name: impl Into>, aligned: impl Display) -> Self { 127 | let name = name.into(); 128 | let aligned = aligned.to_string(); 129 | 130 | self.0.push(Entry::Sequence(Sequence { name, aligned })); 131 | self 132 | } 133 | 134 | pub fn with_column_annotation( 135 | mut self, 136 | feature: impl Display, 137 | annotation: impl Display, 138 | ) -> Self { 139 | let feature = feature.to_string(); 140 | let annotation = annotation.to_string(); 141 | 142 | self.0.push(Entry::ColumnAnnotation { 143 | feature, 144 | annotation, 145 | }); 146 | self 147 | } 148 | 149 | pub fn with_empty_line(mut self) -> Self { 150 | self.0.push(Entry::Empty); 151 | self 152 | } 153 | } 154 | 155 | #[allow(unused)] 156 | #[derive(Debug, Clone, 
/// A single line of a Stockholm document.
#[allow(unused)]
#[derive(Debug, Clone, PartialEq)]
pub enum Entry {
    /// An aligned sequence line, written as `name aligned`.
    Sequence(Sequence),
    /// A `#=GF` per-file feature annotation.
    FeatureAnnotation(FeatureAnnotation),
    /// A `#=GC` per-column annotation (e.g. consensus structure).
    ColumnAnnotation {
        feature: String,
        annotation: String,
    },
    /// A `#=GS` per-sequence annotation.
    SequenceAnnotation {
        sequence: Rc<str>,
        feature: String,
        annotation: String,
    },
    /// A `#=GR` per-residue annotation.
    ResidueAnnotation {
        sequence: Rc<str>,
        feature: String,
        annotation: String,
    },
    /// An empty separator line.
    Empty,
}

impl Entry {
    /// Writes this entry as one Stockholm line, with the tag appropriate
    /// for its variant (`#=GC`, `#=GS`, `#=GR`, or none).
    pub fn write<W>(&self, mut writer: W) -> io::Result<()>
    where
        W: io::Write,
    {
        match self {
            Entry::Sequence(sequence) => {
                writeln!(writer, "{} {}", sequence.name, sequence.aligned)
            }
            Entry::FeatureAnnotation(ann) => ann.write(writer),
            Entry::ColumnAnnotation {
                feature,
                annotation,
            } => writeln!(writer, "#=GC {feature} {annotation}"),
            Entry::SequenceAnnotation {
                sequence,
                feature,
                annotation,
            } => writeln!(writer, "#=GS {sequence} {feature} {annotation}"),
            Entry::ResidueAnnotation {
                sequence,
                feature,
                annotation,
            } => writeln!(writer, "#=GR {sequence} {feature} {annotation}"),
            Entry::Empty => writeln!(writer),
        }
    }
}

/// A named, already-gapped (aligned) sequence.
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Sequence {
    // Shared so per-sequence/per-residue annotations can reference the
    // same name without cloning the string.
    pub name: Rc<str>,
    pub aligned: String,
}

/// A `#=GF` feature annotation, covering the tags defined by the
/// Stockholm/Pfam/Rfam conventions.
#[allow(unused)]
#[derive(Debug, Clone, PartialEq)]
pub enum FeatureAnnotation {
    /// Accession number in form `PFxxxxx` (Pfam) or `RFxxxxx` (Rfam).
    AccessionNumber(String),

    /// One word name for family.
    Identification(String),

    /// Short description of family.
    Definition(String),

    /// Authors of the entry.
    Author(String),

    /// The source suggesting the seed members belong to one family.
    SourceOfSeed(String),

    /// The source (prediction or publication) of the consensus RNA secondary structure used by Rfam.
    SourceOfStructure(String),

    /// Command line used to generate the model
    BuildMethod(String),

    /// Command line used to perform the search
    SearchMethod(String),

    /// Search threshold to build the full alignment.
    GatheringThreshold(f32),

    /// Lowest sequence score (and domain score for Pfam) of match in the full alignment.
    TrustedCutoff(f32),

    /// Highest sequence score (and domain score for Pfam) of match not in full alignment.
    NoiseCutoff(f32),

    /// Type of family.
    Type(FamilyType),

    /// Number of sequences in alignment.
    Sequence(u8),

    /// Comment about database reference.
    DatabaseComment(String),

    /// Reference to external database.
    DatabaseReference(String),

    /// Comment about literature reference.
    ReferenceComment(String),

    /// Reference Number.
    ReferenceNumber(String),

    /// Eight digit medline UI number.
    ReferenceMedline(u32),

    /// Reference Title.
    ReferenceTitle(String),

    /// Reference Author
    ReferenceAuthor(String),

    /// Journal location.
    ReferenceLocation(String),

    /// Record of all previous ID lines.
    PreviousIdentifier(String),

    /// Keywords.
    Keywords(Vec<String>),

    /// Comments.
    Comment(String),

    /// Indicates a nested domain.
    PfamAccession(String),

    /// Location of nested domains - sequence ID, start and end of insert.
    Location(String),

    /// Wikipedia page
    WikipediaLink(String),

    /// Clan accession
    Clan(String),

    /// Used for listing Clan membership
    Membership(String),

    /// A method used to set the bit score threshold based on the ratio of expected false positives
    /// to true positives. Floating point number between 0 and 1.
    FalseDiscoveryRate(f32),

    /// Command line used to calibrate the model (Rfam only, release 12.0 and later)
    CalibrationMethod(String),
}

impl FeatureAnnotation {
    /// Writes this annotation as a `#=GF <TAG> <value>` line.
    ///
    /// The macro below builds a single exhaustive `match`: each
    /// `Variant => "XX"` arm expands to a `writeln!` with that two-letter
    /// tag, while `Variant(pat) => expr` arms keep a custom handler
    /// (used by `Keywords`, which joins its values with commas).
    pub fn write<W>(&self, mut writer: W) -> io::Result<()>
    where
        W: io::Write,
    {
        macro_rules! match_features {
            // Terminal rule: all arms accumulated, emit the match.
            (@inner $($pat:pat => $expr:expr,)* ; $(,)? ) => {
                match self {
                    $($pat => $expr,)*
                }
            };

            // `Variant => "TAG",` — standard one-value annotation.
            (@inner $($pat:pat => $expr:expr,)* ; $feature:ident => $repr:literal, $($rest:tt)*) => {
                match_features!(
                    @inner
                    $($pat => $expr,)*
                    FeatureAnnotation::$feature(ann) => writeln!(writer, "#=GF {} {ann}", $repr),
                    ; $($rest)*
                )
            };

            // `Variant(pat) => expr,` — custom formatting logic.
            (@inner $($pat:pat => $expr:expr,)* ; $feature:ident($feat_pat:pat) => $handle_ann:expr, $($rest:tt)*) => {
                match_features!(
                    @inner
                    $($pat => $expr,)*
                    FeatureAnnotation::$feature($feat_pat) => $handle_ann,
                    ; $($rest)*
                )
            };

            ($($tt:tt)*) => {
                match_features!(@inner ; $($tt)*)
            };
        }

        match_features!(
            AccessionNumber => "AC",
            Identification => "ID",
            Definition => "DE",
            Author => "AU",
            SourceOfSeed => "SE",
            SourceOfStructure => "SS",
            BuildMethod => "BM",
            SearchMethod => "SM",
            GatheringThreshold => "GA",
            TrustedCutoff => "TC",
            NoiseCutoff => "NC",
            Type => "TP",
            Sequence => "SQ",
            DatabaseComment => "DC",
            DatabaseReference => "DR",
            ReferenceComment => "RC",
            ReferenceNumber => "RN",
            ReferenceMedline => "RM",
            ReferenceTitle => "RT",
            ReferenceAuthor => "RA",
            ReferenceLocation => "RL",
            PreviousIdentifier => "PI",
            Keywords(keywords) => {
                writer.write_all(b"#=GF KW")?;
                let mut keywords = keywords.iter();
                if let Some(keyword) = keywords.next() {
                    write!(writer, " {keyword}")?;
                    keywords.try_for_each(|keyword| {
                        write!(writer, ",{keyword}")
                    })?;
                }
                writeln!(writer)
            },
            Comment => "CC",
            PfamAccession => "NE",
            Location => "NL",
            WikipediaLink => "WK",
            Clan => "CL",
            Membership => "MB",
            FalseDiscoveryRate => "FR",
            CalibrationMethod => "CB",
        )
    }
}

/// The `#=GF TP` family type.
#[allow(unused)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FamilyType {
    Family,
    Domain,
    Motif,
    Repeat,
}

impl Display for FamilyType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let s = match self {
            FamilyType::Family => "Family",
            FamilyType::Domain => "Domain",
            FamilyType::Motif => "Motif",
            FamilyType::Repeat => "Repeat",
        };

        f.write_str(s)
    }
}
GCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTA 3 | 0.136,0.751,0.839,0.223,0.136,0.112,0.054,0.022,0.019,0.011,0.008,0.021,0.000,0.053,0.126,0.803,0.790,0.503,0.603,0.437,0.401,1.932,0.073,0.001,0.019,0.029,0.024,0.000,0.063,0.034,0.534,0.058,0.081,0.444,0.347,1.495,0.635,0.461,0.911,0.146,0.167,0.173,0.054,0.040,0.341,0.114,0.324,1.437,2.262,2.565,1.827,0.334,0.738,0.102,0.249,0.941,0.791,0.821,0.709,0.178,0.135,0.086,0.095,0.150,0.281,0.497,0.084,0.566,0.562,0.647,1.142,1.238,0.517,0.522,0.581,0.448,0.130,0.130,0.050,0.117,0.195,0.042,0.052,0.040,0.317,0.039,0.150,0.510,0.705,0.653,0.587,0.302,0.056,0.118,0.042,0.260,0.348,1.595,2.895,0.955,0.265,1.437,1.949,1.877,2.014,0.045,0.096,0.436,0.784,2.203,0.813,0.162,0.133,0.430,0.159,0.126,0.180,0.042,0.033,0.042,0.017,0.042,0.025,0.063,0.143,0.423,0.203,0.000,0.019,0.006,0.056,0.036,0.000,0.044,0.004,0.048,0.022,0.031,0.046,0.046,0.000,0.021,0.130,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN 4 | -------------------------------------------------------------------------------- /test_data/query_empty_sequence.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | 7 | NaN,0.123,0.456,0.789,0.012 8 | -------------------------------------------------------------------------------- /test_data/query_invalid_base.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | NAFGT 7 | NaN,0.123,0.456,0.789,0.012 8 | -------------------------------------------------------------------------------- 
/test_data/query_invalid_lengths.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | NACGTA 7 | NaN,0.123,0.456,0.789,0.012 8 | -------------------------------------------------------------------------------- /test_data/query_invalid_reactivity.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | NACGT 7 | NaN,0.123,asd,0.789,0.012 8 | -------------------------------------------------------------------------------- /test_data/query_truncated_reactivities.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | NACGT 7 | -------------------------------------------------------------------------------- /test_data/query_truncated_sequence.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 0.123,0.456,0.789,0.012,NaN 4 | 5 | test2 6 | -------------------------------------------------------------------------------- /test_data/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dincarnato/SHAPEwarp/30fe71a82f078d7ec6807ffeee31af0d338d6da1/test_data/test.db -------------------------------------------------------------------------------- /test_data/test_db.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Saccharomyces cerevisiae 5 | 2A3 6 | 7 | Marinus et al., 2020 8 | 33398343 9 | 10 | 1 11 | in vivo 12 | 13 | 14 | 15 | TATCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTAAGCCATGCATGTCT 16 | AAGTATAAGCAATTTATACAGTGAAACTGCGAATGGCTCATTAAATCAGTTATCGTTTAT 17 | TTGATAGTTCCTTTACTACATGGTATAACTGTGGTAATTCTAGAGCTAATACATGCTTAA 18 | AATCTCGACCCTTTGGAAGAGATGTATTTATTAGATAAAAAATCAATGTCTTCGGACTCT 19 | 
TTGATGATTCATAATAACTTTTCGAATCGCATGGCCTTGTGCTGGCGATGGTTCATTCAA 20 | ATTTCTGCCCTATCAACTTTCGATGGTAGGATAGTGGCCTACCATGGTTTCAACGGGTAA 21 | CGGGGAATAAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGA 22 | AGGCAGCAGGCGCGCAAATTACCCAATCCTAATTCAGGGAGGTAGTGACAATAAATAACG 23 | ATACAGGGCCCATTCGGGTCTTGTAATTGGAATGAGTACAATGTAAATACCTTAACGAGG 24 | AACAATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTAT 25 | ATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACTTTGGGCCCGGTTGGCCGGTCCG 26 | ATTTTTTCGTGTACTGGATTTCCAACGGGGCCTTTCCTTCTGGCTAACCTTGAGTCCTTG 27 | TGGCTCTTGGCGAACCAGGACTTTTACTTTGAAAAAATTAGAGTGTTCAAAGCAGGCGTA 28 | TTGCTCGAATATATTAGCATGGAATAATAGAATAGGACGTTTGGTTCTATTTTGTTGGTT 29 | TCTAGGACCATCGTAATGATTAATAGGGACGGTCGGGGGCATCAGTATTCAATTGTCAGA 30 | GGTGAAATTCTTGGATTTATTGAAGACTAACTACTGCGAAAGCATTTGCCAAGGACGTTT 31 | TCATTAATCAAGAACGAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGTAGTCTTAA 32 | CCATAAACTATGCCGACTAGGGATCGGGTGGTGTTTTTTTAATGACCCACTCGGCACCTT 33 | ACGAGAAATCAAAGTCTTTGGGTTCTGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAG 34 | GAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGG 35 | GAAACTCACCAGGTCCAGACACAATAAGGATTGACAGATTGAGAGCTCTTTCTTGATTTT 36 | GTGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGTGATTTGTCTGCTTAATTGCGAT 37 | AACGAACGAGACCTTAACCTACTAAATAGTGGTGCTAGCATTTGCTGGTTATCCACTTCT 38 | TAGAGGGACTATCGGTTTCAAGCCGATGGAAGTTTGAGGCAATAACAGGTCTGTGATGCC 39 | CTTAGACGTTCTGGGCCGCACGCGCGCTACACTGACGGAGCCAGCGAGTCTAACCTTGGC 40 | CGAGAGGTCTTGGTAATCTTGTGAAACTCCGTCGTGCTGGGGATAGAGCATTGTAATTAT 41 | TGCTCTTCAACGAGGAATTCCTAGTAAGCGCAAGTCATCAGCTTGCGTTGATTACGTCCC 42 | TGCCCTTTGTACACACCGCCCGTCGCTAGTACCGATTGAATGGCTTAGTGAGGCCTCAGG 43 | ATCTGCTTAGAGAAGGGGGCAACTCCATCTCAGAGCGGAGAATTTGGACAAACTTGGTCA 44 | TTTAGAGGAACTAAAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTA 45 | 46 | 47 | 48 | 
NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,0.389,0.000,0.000,1.771,0.535,0.368,0.395,0.831,0.315,0.000,0.941,0.515,0.019,0.000,0.000,0.160,0.158,0.182,0.766,0.000,0.000,0.105,0.000, 49 | 0.231,0.000,0.000,0.000,0.428,0.000,1.140,2.903,0.747,0.000,0.773,6.292,2.504,16.392,26.687,18.395,7.285,3.472,1.969,1.531,0.015,0.000,2.028,0.277,0.000,0.695,0.060,0.000,0.000,0.000,0.000,0.387,0.278,0.320,0.000,0.000,0.000,0.000,0.000,1.249,0.511,0.565,13.736,11.059,2.451,0.329,0.019,0.632,0.000,0.424,0.031,0.000,0.353,0.258,1.792,0.000,0.000,1.444,0.066,0.000, 50 | 0.000,0.000,0.530,0.376,0.008,0.626,3.301,0.760,3.483,2.574,5.758,20.583,16.959,43.598,16.526,4.905,1.273,0.951,19.970,8.753,0.220,0.000,5.367,22.728,0.000,0.000,1.439,0.747,0.180,0.150,0.000,1.130,0.000,0.000,0.430,0.000,2.278,3.390,3.277,0.707,0.000,0.000,0.000,0.000,0.870,0.157,0.000,0.000,0.000,0.000,0.655,0.000,0.000,0.000,0.000,0.573,4.084,8.101,0.967,0.567, 51 | 0.000,4.739,0.680,0.000,0.720,0.019,0.490,0.053,0.000,0.463,5.995,9.746,10.837,44.167,1.073,0.444,0.487,0.000,0.000,0.065,0.000,0.284,4.298,0.000,0.166,0.113,0.430,0.280,0.727,1.256,1.131,0.000,0.870,5.313,4.611,NaN,11.272,3.660,0.000,0.000,0.000,1.090,0.524,0.000,0.000,1.089,1.186,0.879,0.248,0.738,1.538,7.189,25.524,10.602,1.722,0.469,5.201,5.769,20.508,23.514, 52 | 7.896,24.506,0.197,0.097,0.423,0.000,0.071,5.256,11.918,0.268,0.296,0.144,0.000,0.000,0.443,0.077,0.647,0.224,18.633,13.787,2.001,44.475,0.733,0.887,0.000,0.327,0.351,0.095,0.000,1.561,5.880,19.362,0.705,1.301,0.677,2.157,16.048,44.133,8.957,16.729,3.410,0.013,0.000,0.557,0.000,0.120,0.229,0.134,0.182,0.000,0.065,0.587,0.091,0.000,0.000,0.000,0.000,0.092,0.000,0.080, 53 | 
0.000,0.093,0.496,0.647,0.060,2.088,26.681,0.459,0.064,2.308,12.659,18.487,143.025,1.242,0.365,0.049,0.166,0.378,8.188,11.029,3.117,0.342,0.000,0.132,0.106,0.022,0.000,0.000,0.000,0.077,0.000,0.080,0.000,0.820,0.000,0.477,0.133,0.000,0.000,0.368,0.000,0.000,0.589,2.386,23.441,0.000,0.000,0.000,0.102,0.388,0.000,0.000,0.065,0.180,0.000,0.159,0.234,0.356,0.156,0.693, 54 | 0.346,0.043,0.000,0.172,0.595,0.148,0.290,0.608,0.494,1.092,0.728,0.237,0.007,0.000,0.383,0.335,0.031,0.000,0.000,0.237,0.130,0.000,0.210,0.088,0.000,0.074,0.000,0.109,0.000,0.137,0.000,0.000,0.239,0.067,0.053,0.150,0.041,0.000,0.043,0.341,0.209,0.109,0.170,0.021,0.031,0.000,0.059,0.000,0.035,0.000,0.073,0.840,0.122,0.232,0.538,0.200,1.191,0.479,0.127,0.150, 55 | 1.024,0.000,0.191,0.000,0.306,0.438,0.405,0.233,0.139,0.000,0.061,0.128,0.013,0.170,0.141,0.080,0.000,0.526,0.000,0.246,0.000,0.022,0.000,0.812,0.000,0.199,0.974,0.168,0.735,0.436,0.000,0.770,0.694,1.493,0.586,0.420,0.328,0.214,0.064,0.473,0.055,0.297,0.611,0.139,0.248,0.319,0.000,0.099,0.109,0.027,0.000,0.261,0.260,0.053,0.078,0.403,0.084,0.207,0.589,0.945, 56 | 0.138,0.668,0.517,0.328,0.602,0.349,0.425,4.947,0.000,0.199,1.579,6.105,19.322,46.609,26.706,0.447,0.412,0.219,0.235,0.643,0.000,0.110,0.073,3.196,0.536,3.455,0.685,0.808,0.247,0.327,0.272,0.192,0.457,0.153,0.035,0.642,0.754,0.295,0.573,0.882,0.181,0.251,0.187,0.312,0.023,0.065,0.516,0.179,0.110,0.140,0.103,0.262,0.317,0.757,0.809,0.334,0.099,0.177,0.623,1.061, 57 | 0.073,0.182,0.000,0.040,0.346,0.462,0.868,0.609,0.968,0.114,0.466,0.224,0.002,0.124,0.541,0.102,0.668,3.227,0.248,0.341,0.121,0.000,0.259,0.715,0.092,0.000,0.047,0.292,0.390,0.208,1.087,0.145,1.312,0.810,0.107,0.659,0.288,0.780,0.367,2.016,0.758,0.064,0.000,0.263,0.000,0.432,0.000,0.000,0.000,0.205,0.066,0.259,0.430,0.045,0.609,0.343,0.000,0.000,0.000,0.382, 58 | 
0.000,0.232,0.538,0.000,0.000,0.000,0.550,0.000,0.000,0.555,0.000,0.194,0.000,0.028,0.051,0.279,0.342,0.324,0.000,0.000,0.042,0.428,0.000,0.000,0.000,0.124,0.378,1.154,1.072,0.304,0.000,0.000,0.000,0.000,0.000,0.147,0.000,2.004,2.918,6.195,0.954,0.656,1.703,0.430,0.000,0.075,8.902,0.411,0.749,0.287,0.790,1.172,0.184,0.482,0.418,0.215,0.404,0.476,1.074,0.940, 59 | 17.668,2.394,1.781,0.969,0.732,3.651,32.772,8.051,12.658,9.111,3.327,18.257,0.653,0.278,0.631,1.287,6.529,11.216,4.910,4.150,23.076,0.214,0.173,0.000,0.175,0.777,0.747,0.166,0.102,0.153,0.000,0.000,0.148,1.160,2.285,1.114,1.905,0.715,1.933,0.057,0.589,0.136,0.693,0.604,0.770,0.286,0.663,0.075,0.000,0.089,0.380,0.177,0.000,0.387,1.321,1.762,2.934,14.663,35.554,9.828, 60 | 32.815,3.665,0.000,0.142,0.000,0.161,0.000,0.545,0.130,0.265,0.057,0.052,0.143,0.421,0.076,0.087,0.213,0.213,0.221,0.000,1.071,8.491,9.912,1.553,5.278,0.032,0.281,0.337,0.658,1.499,NaN,0.188,0.110,0.000,0.142,0.063,0.857,0.350,0.343,0.240,0.017,0.000,0.425,0.307,0.489,0.220,0.380,0.223,0.278,0.000,0.175,0.000,0.000,0.149,0.285,0.376,2.513,5.107,16.830,13.964, 61 | 8.064,59.968,0.373,0.167,0.310,0.251,0.641,0.336,0.227,1.486,0.200,0.673,6.099,2.678,11.366,0.414,0.148,0.290,0.071,0.369,0.394,0.418,1.108,0.280,0.453,0.560,0.440,1.441,1.939,6.911,2.394,4.441,4.936,8.365,5.098,15.768,0.210,0.531,3.564,0.851,1.007,3.763,0.605,0.000,0.000,0.219,0.000,0.000,0.916,0.000,0.000,0.132,0.567,0.570,0.000,0.189,0.473,1.177,0.000,0.007, 62 | 0.006,1.161,0.098,0.124,0.164,0.550,0.097,0.184,0.354,0.155,0.242,0.000,0.110,1.167,0.371,2.310,3.507,3.366,3.191,0.101,0.000,0.000,0.213,0.088,0.564,0.201,0.314,0.443,0.377,0.421,0.559,0.950,1.834,1.042,0.952,0.668,0.605,0.000,0.674,0.374,0.363,0.206,0.392,0.477,0.101,0.000,0.421,0.093,0.000,0.000,0.000,0.656,0.452,0.775,0.532,0.286,0.706,0.786,0.362,0.445, 63 | 
0.840,1.335,2.877,5.233,0.418,0.336,2.239,1.596,0.466,1.183,16.256,56.125,2.014,0.463,0.056,0.000,0.000,2.308,0.446,0.000,0.692,0.000,0.000,0.110,0.208,0.083,0.102,0.003,0.414,0.558,1.040,2.186,0.491,1.036,0.307,0.054,0.156,0.420,0.073,0.052,0.470,0.238,0.082,0.364,0.010,0.000,0.546,0.156,0.057,0.077,0.000,0.201,0.029,0.099,0.027,0.051,0.055,0.000,0.024,0.024, 64 | 0.152,0.000,0.000,0.108,0.000,0.077,0.043,0.000,0.865,0.840,0.305,0.316,0.037,0.239,0.271,0.333,0.394,0.165,1.063,0.156,0.633,0.987,0.658,0.387,0.333,0.337,1.092,0.328,0.576,0.298,0.328,0.447,0.434,1.199,1.740,1.443,0.074,0.100,0.301,0.852,0.589,0.697,1.018,2.615,1.028,0.181,0.547,1.031,0.618,0.746,0.378,0.555,0.268,0.442,0.204,0.156,0.790,1.100,0.645,1.127, 65 | 0.221,0.230,0.963,0.760,1.184,0.327,0.674,0.586,0.907,0.357,2.046,0.223,0.154,0.088,0.100,0.193,0.058,0.220,0.143,0.306,0.131,0.896,0.626,0.124,0.246,0.340,2.437,0.401,0.697,0.821,4.319,21.614,NaN,13.351,1.630,0.705,0.653,0.310,2.389,29.364,11.525,11.107,13.900,0.696,0.077,0.000,0.033,0.043,0.297,0.000,0.011,0.000,0.000,0.000,0.046,0.550,0.075,0.186,0.318,0.499, 66 | 1.340,3.238,0.000,0.000,0.143,0.156,0.098,0.000,0.000,0.295,0.123,0.000,0.165,0.089,0.000,0.444,0.205,0.000,0.088,0.115,0.000,0.244,0.176,0.551,0.389,1.726,0.904,1.634,0.928,0.565,0.262,0.402,0.025,0.221,0.000,0.409,1.584,0.017,0.338,0.164,0.037,0.329,0.427,0.000,0.282,0.078,0.000,0.135,0.211,1.656,1.627,0.248,0.653,0.255,0.000,0.542,0.198,0.000,0.362,0.165, 67 | 0.234,0.093,0.225,0.297,1.103,0.372,0.387,0.858,1.700,1.624,0.212,0.488,0.366,0.122,0.018,0.144,0.000,0.202,0.122,0.066,0.012,0.000,0.038,0.036,0.046,0.078,0.028,0.144,0.000,0.240,0.027,0.036,0.000,0.045,0.000,0.032,0.000,0.144,0.108,0.312,0.000,0.091,0.225,0.460,0.073,0.284,0.553,1.641,1.980,4.315,NaN,NaN,0.183,0.036,0.059,0.000,0.137,0.243,0.133,0.060, 68 | 
0.101,0.036,0.165,0.000,0.108,0.048,0.054,0.000,0.000,0.069,0.094,0.000,0.286,0.274,0.057,0.000,0.148,0.068,0.203,0.000,0.000,0.070,0.057,0.162,0.359,0.148,0.144,0.029,0.073,0.000,0.183,0.127,0.000,0.078,0.216,1.345,17.537,14.039,1.948,10.131,35.650,56.798,3.340,0.296,9.479,0.078,0.000,0.000,0.000,0.000,0.180,0.000,0.013,0.159,0.109,0.226,0.000,0.070,0.119,0.265, 69 | 0.216,0.574,0.300,0.210,0.470,0.222,0.011,0.036,0.247,0.043,0.000,0.015,0.155,0.050,0.000,0.140,0.000,0.187,0.160,0.000,0.084,0.047,0.198,0.000,0.466,0.406,0.457,1.294,0.903,2.806,31.280,1.625,0.663,0.221,0.175,0.335,0.000,0.551,0.358,0.163,0.000,0.000,0.241,1.005,0.188,2.120,5.791,0.000,0.000,0.188,0.148,0.041,0.396,0.436,0.311,0.000,0.092,0.425,0.730,0.250, 70 | 0.308,0.000,0.364,0.222,0.000,0.000,0.117,0.000,0.000,0.318,0.000,0.311,0.000,0.041,0.236,0.000,0.000,0.000,2.747,25.746,2.650,0.958,0.852,1.150,5.138,36.958,0.439,0.483,0.095,0.000,0.180,0.455,0.158,0.517,0.000,0.031,0.013,0.000,0.470,13.140,14.697,6.355,118.942,0.107,0.812,0.069,0.172,1.754,3.259,14.553,5.980,8.906,0.138,0.000,0.000,0.000,0.166,0.114,0.030,0.045, 71 | 0.071,0.117,0.340,0.551,0.250,0.184,0.176,0.030,0.133,0.030,0.140,0.267,0.265,0.634,6.958,1.857,8.962,26.787,22.133,0.222,0.060,0.000,0.000,0.000,0.046,0.082,0.152,0.000,0.000,0.059,0.065,0.000,0.000,0.161,0.432,0.032,0.095,0.000,0.399,0.003,0.071,0.228,0.269,0.151,0.000,0.118,0.000,0.164,0.000,0.000,0.063,0.000,0.228,0.000,0.110,0.000,0.225,0.888,0.110,0.465, 72 | 0.365,0.177,0.368,2.584,3.048,5.617,0.226,0.430,0.000,0.416,0.392,1.164,0.000,0.448,0.530,0.000,0.000,0.175,0.575,9.425,1.399,0.272,0.411,0.000,0.103,0.000,0.238,0.112,0.182,0.005,0.238,0.487,0.601,0.265,0.000,1.591,0.404,0.298,0.237,0.000,0.357,0.000,0.589,0.000,0.000,0.552,0.237,0.414,1.823,13.542,13.007,9.973,18.799,0.276,0.000,0.235,0.361,0.209,0.156,0.000, 73 | 
0.042,0.067,0.000,0.145,0.194,0.080,0.000,0.000,0.000,0.800,0.000,0.012,0.003,0.113,0.152,0.270,0.166,0.061,0.000,0.000,0.000,0.081,0.485,0.173,0.000,0.000,0.000,0.118,0.331,0.286,0.031,0.215,0.082,0.000,0.286,0.978,3.695,0.000,0.224,0.119,0.068,0.249,0.230,0.254,0.218,0.024,0.000,0.231,0.000,0.000,0.042,0.000,0.000,0.000,0.122,0.193,0.080,0.478,0.048,0.030, 74 | 0.000,0.000,0.062,0.000,0.000,0.057,0.144,0.049,0.000,0.000,0.000,0.460,1.059,0.243,0.695,1.985,0.660,0.104,0.417,0.000,0.000,0.054,0.149,0.000,0.000,0.069,0.000,0.000,0.000,0.000,0.000,0.084,0.000,0.063,0.000,0.000,0.198,0.000,0.109,0.168,0.000,0.000,0.091,0.000,0.257,0.351,0.000,0.000,0.090,0.000,0.119,0.000,0.065,0.178,0.224,0.000,0.000,0.182,0.000,0.000, 75 | 0.325,0.696,0.000,0.197,0.000,0.000,0.183,0.058,0.991,1.003,0.765,0.000,0.214,0.687,1.089,1.719,1.727,2.076,0.000,0.375,1.015,0.834,0.416,0.448,0.563,0.813,0.110,0.000,0.086,0.924,0.944,0.022,0.572,1.132,1.610,0.614,5.694,0.529,0.000,0.108,0.358,0.000,0.003,0.372,0.141,0.000,0.026,0.225,0.977,0.584,0.000,0.457,0.104,0.000,0.000,0.074,0.100,0.407,0.000,0.210, 76 | 0.000,0.271,0.000,0.197,0.809,0.111,0.141,1.140,0.902,0.041,0.158,0.454,0.826,3.797,0.211,0.549,0.000,0.122,0.669,0.000,0.875,3.657,0.312,0.717,0.181,0.280,2.284,0.051,0.000,0.000,0.467,2.511,1.125,0.000,0.166,0.581,0.433,0.084,0.636,0.517,0.189,0.000,0.307,0.272,0.564,0.500,0.048,0.124,0.000,0.125,0.000,0.050,0.143,0.549,1.297,0.358,0.390,0.793,0.064,0.000, 77 | 0.230,0.000,0.474,0.933,0.710,0.818,0.106,0.068,0.000,0.000,0.000,0.279,0.000,0.239,0.272,0.670,1.665,1.701,0.481,1.297,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /test_data/valid_query.txt: -------------------------------------------------------------------------------- 1 | test1 2 | ACGTN 3 | 
0.123,0.456,0.789,1.234,NaN 4 | 5 | test2 6 | NACGT 7 | NaN,12,0.456,0.789,0.012 8 | -------------------------------------------------------------------------------- /viennarna-mfe-sys/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /viennarna-mfe-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "viennarna-mfe-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | license = "GPL-3.0-or-later" 6 | 7 | [build-dependencies] 8 | bindgen = "0.65.1" 9 | pkg-config = "0.3.25" 10 | -------------------------------------------------------------------------------- /viennarna-mfe-sys/build.rs: -------------------------------------------------------------------------------- 1 | use std::{env, path::PathBuf}; 2 | 3 | fn main() { 4 | let vrna = pkg_config::Config::new() 5 | .range_version("2.4.18".."2.7") 6 | .probe("RNAlib2") 7 | .unwrap(); 8 | 9 | println!("cargo:rerun-if-changed=wrapper.h"); 10 | 11 | let bindings = bindgen::Builder::default() 12 | .header("wrapper.h") 13 | .clang_args( 14 | vrna.include_paths 15 | .into_iter() 16 | .map(|path| format!("-I{}", path.display())), 17 | ) 18 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 19 | .allowlist_function("vrna_mfe") 20 | .allowlist_function("vrna_mfe_dimer") 21 | .allowlist_function("vrna_fold") 22 | .allowlist_function("vrna_circfold") 23 | .allowlist_function("vrna_alifold") 24 | .allowlist_function("vrna_circalifold") 25 | .allowlist_function("vrna_cofold") 26 | .allowlist_function("vrna_fold_compound_.*") 27 | .allowlist_function("vrna_md_set_default") 28 | .allowlist_function("vrna_sc_init") 29 | .allowlist_function("vrna_sc_set_stack_comparative") 30 | .allowlist_function("vrna_sc_add_SHAPE_deigan_ali") 31 | .allowlist_function("vrna_mfe_window_cb") 32 | .allowlist_var("VRNA_OPTION.*") 
33 | .allowlist_type("vrna_sc_s") 34 | .allowlist_type("vrna_sc_bp_storage_t") 35 | .generate() 36 | .expect("Unable to generate bindings"); 37 | 38 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 39 | bindings 40 | .write_to_file(out_path.join("bindings.rs")) 41 | .expect("Couldn't write bindings!"); 42 | } 43 | -------------------------------------------------------------------------------- /viennarna-mfe-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | #![allow(rustdoc::broken_intra_doc_links)] 5 | 6 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 7 | -------------------------------------------------------------------------------- /viennarna-mfe-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | struct vrna_hc_depot_s { 8 | unsigned int strands; 9 | size_t *up_size; 10 | struct hc_nuc **up; 11 | size_t *bp_size; 12 | struct hc_basepair **bp; 13 | }; 14 | 15 | struct hc_nuc { 16 | int direction; 17 | unsigned char context; 18 | unsigned char nonspec; 19 | }; 20 | 21 | struct hc_basepair { 22 | size_t list_size; 23 | size_t list_mem; 24 | unsigned int *j; 25 | unsigned int *strand_j; 26 | unsigned char *context; 27 | }; 28 | --------------------------------------------------------------------------------