├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── build.rs
├── src
│   ├── aligner.rs
│   ├── cli.rs
│   ├── db_file.rs
│   ├── db_file
│   │   ├── native.rs
│   │   └── xml.rs
│   ├── dotbracket.rs
│   ├── fasta.rs
│   ├── gapped_data.rs
│   ├── gapped_reactivity.rs
│   ├── gapped_sequence.rs
│   ├── handle_query_entry.rs
│   ├── iter.rs
│   ├── main.rs
│   ├── mass.rs
│   ├── norm_dist.rs
│   ├── null_model.rs
│   ├── query_aligner.rs
│   ├── query_file.rs
│   ├── query_result.rs
│   ├── stockholm.rs
│   └── viennarna.rs
├── test_data
│   ├── query.txt
│   ├── query_align.txt
│   ├── query_empty_sequence.txt
│   ├── query_invalid_base.txt
│   ├── query_invalid_lengths.txt
│   ├── query_invalid_reactivity.txt
│   ├── query_truncated_reactivities.txt
│   ├── query_truncated_sequence.txt
│   ├── test.db
│   ├── test_db.xml
│   └── valid_query.txt
└── viennarna-mfe-sys
    ├── .gitignore
    ├── Cargo.toml
    ├── build.rs
    ├── src
    │   └── lib.rs
    └── wrapper.h
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "SHAPEwarp"
3 | version = "2.2.0"
4 | edition = "2021"
5 | license = "GPL-3.0-or-later"
6 |
7 | [dependencies]
8 | anyhow = "1.0.58"
9 | bitflags = "2.3.1"
10 | clap = { version = "4.3.0", features = ["derive"] }
11 | csv = "1.1.6"
12 | fftw = { version = "0.8.0", default-features = false, features = ["system"] }
13 | fnv = "1.0.7"
14 | itertools = "0.10.3"
15 | ndarray = "0.15.4"
16 | num-complex = "0.4.3"
17 | num-traits = "0.2.14"
18 | once_cell = "1.17.1"
19 | quick-xml = "0.31.0"
20 | rand = "0.8.5"
21 | rayon = "1.5.3"
22 | serde = { version = "1.0.139", features = ["derive", "rc"] }
23 | serde_json = "1.0.85"
24 | smallvec = "1.8.0"
25 | statrs = "0.16.0"
26 | tabled = "0.17.0"
27 | toml_edit = { version = "0.19.10", features = ["serde"] }
28 | viennarna-mfe-sys = { version = "0.1.0", path = "viennarna-mfe-sys" }
29 |
30 | [dev-dependencies]
31 | approx = { version = "0.5.1", features = ["num-complex"] }
32 | rand = { version = "0.8.5", features = ["small_rng"] }
33 | tempfile = "3.5.0"
34 |
35 | [profile.release-opt]
36 | inherits = "release"
37 | lto = true
38 | codegen-units = 1
39 |
40 | [build-dependencies]
41 | pkg-config = "0.3.27"
42 | semver = "1.0.18"
43 |
44 | [lints.rust]
45 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(vrna24)', 'cfg(vrna25)', 'cfg(vrna251)', 'cfg(vrna26)'] }
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | ## Introduction
4 |
5 | The model-guided search for structurally-homologous RNAs is a non-trivial task, as it largely depends on the quality of the inferred structure model. When it comes to inferring RNA structures from chemical probing data, the challenges are numerous. The use of different chemical probes, or of different approaches for incorporating experimental reactivities as pseudo-free energy contributions, can significantly affect the reliability of the inferred RNA structure model.
6 |
7 | __SHAPEwarp__ is a sequence-agnostic method for the identification of structurally-similar RNA elements in a database of chemical probing-derived reactivity profiles. The approach used by SHAPEwarp is inspired by the BLAST algorithm and builds on top of two widely used methods for similarity search in time series data: Mueen's Algorithm for Similarity Search ([MASS](https://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html)) and dynamic time warping (DTW).
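
To give a rough intuition for the time-series similarity step, the sketch below (illustrative only, not the actual SHAPEwarp implementation; the function name is made up) slides a query reactivity profile over a database profile and records the Euclidean distance of every window. MASS computes the z-normalized analogue of such a distance profile in `O(n log n)` via FFT-based correlations instead of this naive `O(n·m)` loop:

```rust
/// Naive distance profile: the Euclidean distance between `query` and every
/// same-length window of `db`. Purely illustrative; MASS obtains the
/// z-normalized version of this profile efficiently via FFT.
fn distance_profile(query: &[f64], db: &[f64]) -> Vec<f64> {
    db.windows(query.len())
        .map(|window| {
            window
                .iter()
                .zip(query)
                .map(|(d, q)| (d - q).powi(2))
                .sum::<f64>()
                .sqrt()
        })
        .collect()
}
```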
8 |
9 | For support requests, please post your questions to:
10 |
11 | For complete documentation, please refer to:
12 |
13 |
14 | ## Author(s)
15 |
16 | Edoardo Morandi (emorandi[at]rnaframework.com)
17 | Danny Incarnato (dincarnato[at]rnaframework.com)
18 |
19 |
20 | ## References
21 |
22 | Morandi *et al*., 2022. SHAPE-guided RNA structure homology search and motif discovery. Nature Communications (PMID: [35361788](https://pubmed.ncbi.nlm.nih.gov/35361788/))
23 |
24 | Scholten *et al*., 2024. SHAPEwarp-web: sequence-agnostic search for structurally homologous RNA regions across databases of chemical probing data. Nucleic Acids Research (PMID: [38709889](https://pubmed.ncbi.nlm.nih.gov/38709889/))
25 |
26 |
27 | ## License
28 |
29 | This program is free software, and can be redistributed and/or modified under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or any later version.
30 |
31 | Please see http://www.gnu.org/licenses/ for more information.
32 |
33 |
34 | ## Prerequisites
35 |
36 | - Linux system
37 | - Rust and Cargo (Installation instructions: )
38 | - [FFTW](http://fftw.org/) 3.x library.
- [ViennaRNA](https://www.tbi.univie.ac.at/RNA/) package (version 2.4.18 up to, but not including, 2.7), providing `RNAlib2`.
39 |
40 | ## Installation of FFTW
41 |
42 | This library is generally provided by package managers. Keep in mind that some distros split off the `-dev` package (which is needed to compile projects depending on the library) from the main one.
43 |
44 | ### Debian-based distros (e.g. Debian, Ubuntu)
45 |
46 | ```bash
47 | sudo apt install libfftw3-dev
48 | ```
49 |
50 | ### Red Hat-based distros (e.g. Fedora, CentOS, Alma Linux)
51 |
52 | ```bash
53 | sudo dnf install fftw-devel
54 | ```
55 |
56 | ### Arch-based distros (e.g. Arch, Manjaro)
57 |
58 | ```bash
59 | sudo pacman -S fftw
60 | ```
61 |
62 | ## Installation
63 |
64 | ```bash
65 | $ git clone https://github.com/dincarnato/SHAPEwarp
66 | $ cd SHAPEwarp
67 |
68 | # Add to PKG_CONFIG_PATH the path to the directory containing RNAlib2.pc from the ViennaRNA package
69 | $ export PKG_CONFIG_PATH=/path/to/dir/containing/RNAlib2.pc
70 |
71 | $ export RUSTFLAGS=-Ctarget-cpu=native
72 | $ cargo build --release
73 | ```
74 |
75 | The SHAPEwarp executable will be located under ``target/release/``.
76 |
77 |
78 | ### Note for Mac OS X users:
79 | To compile SHAPEwarp on Mac OS X, after having installed the ViennaRNA package, open the RNAlib2.pc file in a text editor and replace the ``-lstdc++`` flag with ``-lc++``.
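
For example, this replacement can be done in a single step (adjust the path to wherever your `RNAlib2.pc` lives):

```bash
sed -i '' 's/-lstdc++/-lc++/' /path/to/RNAlib2.pc
```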
80 |
81 |
82 | ## Testing the SHAPEwarp installation
83 |
84 | To test SHAPEwarp on a small test dataset, issue the following command from within the SHAPEwarp install directory:
85 |
86 | ```bash
87 | target/release/SHAPEwarp --query test_data/query.txt --database test_data/test.db --output test_out --ow
88 | ```
89 | The search will take less than 10 seconds, and the expected output should look like the following:
90 |
91 | ```bash
92 | query db_entry query_start query_end db_start db_end query_seed db_seed score pvalue evalue status
93 | 16S_750 16S_Bsubtilis 0 99 758 857 15-79 773-837 109.103 5.665e-8 1.003e-5 !
94 | ```
95 |
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | use semver::Version;
2 |
3 | fn main() {
4 | let vrna = pkg_config::Config::new()
5 | .range_version("2.4.18".."2.7")
6 | .cargo_metadata(false)
7 | .env_metadata(false)
8 | .print_system_libs(false)
9 | .print_system_cflags(false)
10 | .probe("RNAlib2")
11 | .unwrap();
12 |
13 | println!("cargo:rerun-if-changed=build.rs");
14 |
15 | let version: Version = vrna
16 | .version
17 | .parse()
18 | .expect("unable to parse ViennaRNA version");
19 |
20 | let version_cfg = format!("vrna{}{}", version.major, version.minor);
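// For example, linking against ViennaRNA 2.6.x emits `cargo:rustc-cfg=vrna26`, letting the
// crate gate version-specific code behind `#[cfg(vrna26)]`; the accepted cfg names are
// declared in Cargo.toml via `check-cfg` under `[lints.rust]`.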
21 | println!("cargo:rustc-cfg={version_cfg}");
22 |
23 | if version.major == 2 && version.minor == 5 {
24 | let version_cfg = format!("vrna{}{}{}", version.major, version.minor, version.patch);
25 | println!("cargo:rustc-cfg={version_cfg}");
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
1 | // We are defining CLI structs
2 | #![allow(clippy::struct_excessive_bools)]
3 |
4 | use clap::{Args, Parser, ValueEnum};
5 | use serde::Serialize;
6 | use std::{fmt, ops::Range, path::PathBuf, str::FromStr};
7 |
8 | use crate::{Distance, Reactivity};
9 |
10 | #[derive(Debug, Parser, Serialize)]
11 | #[clap(author, version, about, allow_negative_numbers = true)]
12 | #[serde(rename_all = "kebab-case")]
13 | /// SHAPE-guided RNA structural homology search
14 | pub struct Cli {
15 | /// Path to a database file, or to a (directory of) XML file(s)
16 | #[clap(long, visible_alias = "db")]
17 | #[serde(skip)]
18 | pub database: PathBuf,
19 |
20 | /// Path to a shuffled database file
21 | ///
22 | /// Uses a file containing the shuffled database instead of generating one on the fly.
23 | /// A shuffled database can be dumped to file using `--dump-shuffled-db`.
24 | #[clap(
25 | long,
26 | conflicts_with_all = &[
27 | "dump_shuffled_db",
28 | "db_shuffles",
29 | "db_block_size",
30 | "db_in_block_shuffle",
31 | ],
32 | )]
33 | #[serde(skip)]
34 | pub shuffled_db: Option<PathBuf>,
35 |
36 | /// Dumps the database to the specified file.
37 | ///
38 | /// Input is a (directory of) XML file(s).
39 | #[clap(long)]
40 | #[serde(skip)]
41 | pub dump_db: Option<PathBuf>,
42 |
43 | /// Dumps the shuffled database to the specified file.
44 | ///
45 | /// Shuffled databases can be imported using the `--shuffled-db` parameter.
46 | #[clap(long)]
47 | #[serde(skip)]
48 | pub dump_shuffled_db: Option<PathBuf>,
49 |
50 | /// Path to the query file
51 | ///
52 | /// Note: each entry should contain (one per row) the sequence id, the nucleotide sequence and
53 | /// a comma-separated list of SHAPE reactivities
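///
/// For example, a single entry (illustrative, made-up values):
///
/// ```text
/// query_01
/// TCGATCGTAG
/// 0.12,0.43,0.05,1.37,0.88,0.21,0.54,0.02,0.76,0.33
/// ```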
54 | #[clap(short, long)]
55 | #[serde(skip)]
56 | pub query: PathBuf,
57 |
58 | /// Output directory
59 | #[clap(short, long, default_value = "sw_out/")]
60 | pub output: PathBuf,
61 |
62 | /// Overwrites the output directory (if the specified path already exists)
63 | #[clap(long, visible_alias = "ow")]
64 | pub overwrite: bool,
65 |
66 | /// Number of processors to use
67 | ///
68 | /// Uses all available processors if not specified
69 | #[clap(long)]
70 | pub threads: Option,
71 |
72 | /// Number of shuffles to perform for each sequence in db
73 | ///
74 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of
75 | /// the sequences in the database.
76 | ///
77 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of
78 | /// shuffles is calculated as `max(1, 500000 / L)`.
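///
/// For example, a database totalling 50,000 nt defaults to `max(1, 500000 / 50000) = 10`
/// shuffles, while one totalling 1,000,000 nt defaults to a single shuffle.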
79 | #[clap(long, alias = "dbShuffles")]
80 | pub db_shuffles: Option,
81 |
82 | /// Size (in nt) of the blocks for shuffling the sequences in db
83 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)]
84 | pub db_block_size: u16,
85 |
86 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well
87 | #[clap(long, alias = "dbInBlockShuffle")]
88 | pub db_in_block_shuffle: bool,
89 |
90 | /// Maximum value to which reactivities will be capped
91 | #[clap(long, default_value_t = 1., alias = "maxReactivity")]
92 | pub max_reactivity: Reactivity,
93 |
94 | /// If two significant alignments overlap by more than this value, the least significant one
95 | /// (the one with the lowest alignment score) will be discarded
96 | #[clap(long, default_value_t = 0.5, alias = "maxAlignOverlap")]
97 | pub max_align_overlap: f32,
98 |
99 | /// Number of HSGs in the shuffled database to be extended to build the null model
100 | #[clap(long, default_value_t = 10_000, alias = "nullHSGs")]
101 | pub null_hsgs: u32,
102 |
103 | /// E-value threshold to consider an alignment significant
104 | #[clap(long, default_value_t = 0.01, aliases = &["inclusionEvalue", "incE"], visible_alias = "inc-e")]
105 | pub inclusion_evalue: f64,
106 |
107 | /// E-value threshold to report a match
108 | #[clap(long, default_value_t = 0.1, aliases = &["reportEvalue", "repE"], visible_alias = "rep-e")]
109 | pub report_evalue: f64,
110 |
111 | /// Reports sequence alignments in the specified format
112 | ///
113 | /// Note: alignments are reported only for matches below the inclusion E-value cutoff
114 | #[clap(long, alias = "reportAln", value_enum)]
115 | pub report_alignment: Option<ReportAlignment>,
116 |
117 | /// Reports the aligned reactivities for significant matches in the "reactivities/" subfolder of the output
118 | /// directory, in JSON format
119 | #[clap(long)]
120 | pub report_reactivity: bool,
121 |
122 | #[clap(flatten, next_help_heading = "Kmer lookup options")]
123 | #[serde(flatten)]
124 | pub kmer_lookup_args: KmerLookupArgs,
125 |
126 | #[clap(flatten, next_help_heading = "Alignment options")]
127 | #[serde(flatten)]
128 | pub alignment_args: AlignmentArgs,
129 |
130 | #[clap(flatten, next_help_heading = r#"Alignment folding evaluation options"#)]
131 | #[serde(flatten)]
132 | pub alignment_folding_eval_args: AlignmentFoldingEvaluationArgs,
133 | }
134 |
135 | #[derive(Debug, Args, Serialize)]
136 | #[serde(rename_all = "kebab-case")]
137 | pub struct KmerLookupArgs {
138 | /// Minimum number of kmers required to form a High Scoring Group (HSG)
139 | #[clap(long, default_value_t = 2, alias = "minKmers")]
140 | pub min_kmers: u16,
141 |
142 | /// Maximum distance between two kmers to be merged in a HSG
143 | #[clap(long, default_value_t = 30, alias = "maxKmerDist")]
144 | pub max_kmer_dist: u16,
145 |
146 | /// Length (in nt) of the kmers
147 | #[clap(long, default_value_t = 15, alias = "kmerLen")]
148 | pub kmer_len: u16,
149 |
150 | /// Sliding offset for extracting candidate kmers from the query
151 | #[clap(long, default_value_t = 1, alias = "kmerOffset")]
152 | pub kmer_offset: u16,
153 |
154 | /// The sequence of a query kmer and the corresponding database match must have GC% contents
155 | /// differing no more than --kmer-max-gc-diff
156 | #[clap(long, alias = "matchKmerGCcontent")]
157 | pub match_kmer_gc_content: bool,
158 |
159 | /// Maximum allowed GC% difference to retain a kmer match
160 | ///
161 | /// Note: the default value is automatically determined based on the chosen kmer length
162 | #[clap(long, requires = "match_kmer_gc_content", alias = "kmerMaxGCdiff")]
163 | pub kmer_max_gc_diff: Option<f32>,
164 |
165 | /// The sequence of a query kmer and the corresponding database match must differ no more than
166 | /// --kmer-max-seq-dist
167 | #[clap(long, alias = "matchKmerSeq")]
168 | pub match_kmer_seq: bool,
169 |
170 | /// Maximum allowed sequence distance to retain a kmer match
171 | ///
172 | /// Note: when >= 1, this is interpreted as the absolute number of bases that are allowed to
173 | /// differ between the kmer and the matching region. When < 1, this is interpreted as a
174 | /// fraction of the kmer's length
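///
/// For example, with the default kmer length of 15, a value of `3` allows up to 3 mismatching
/// bases, while `0.2` allows up to `15 * 0.2 = 3` mismatching bases.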
175 | #[clap(long, requires = "match_kmer_seq", alias = "kmerMaxSeqDist")]
176 | pub kmer_max_seq_dist: Option<Distance<f32>>,
177 |
178 | /// Minimum complexity (measured as Gini coefficient) of candidate kmers
179 | #[clap(long, default_value_t = 0.3, alias = "kmerMinComplexity")]
180 | pub kmer_min_complexity: f32,
181 |
182 | /// A kmer is allowed to match a database entry on average every this many nt
183 | #[clap(long, default_value_t = 200, alias = "kmerMaxMatchEveryNt")]
184 | pub kmer_max_match_every_nt: u32,
185 | }
186 |
187 | #[derive(Debug, Args, Serialize)]
188 | #[serde(rename_all = "kebab-case")]
189 | #[allow(clippy::struct_field_names)]
190 | pub struct AlignmentArgs {
191 | /// Minimum and maximum score reactivity differences below 0.5 will be mapped to
192 | #[clap(long, default_value_t = MinMax (-0.5..2.), alias = "alignMatchScore", allow_hyphen_values = true)]
193 | pub align_match_score: MinMax<f32>,
194 |
195 | /// Minimum and maximum score reactivity differences above 0.5 will be mapped to
196 | #[clap(long, default_value_t = MinMax (-6.0..-0.5), alias = "alignMismatchScore", allow_hyphen_values = true)]
197 | pub align_mismatch_score: MinMax<f32>,
198 |
199 | /// Gap open penalty
200 | #[clap(long, default_value_t = -14., alias = "alignGapOpenPenal")]
201 | pub align_gap_open_penalty: f32,
202 |
203 | /// Gap extension penalty
204 | #[clap(long, default_value_t = -5., alias = "alignGapExtPenal")]
205 | pub align_gap_ext_penalty: f32,
206 |
207 | /// An alignment is allowed to drop by maximum this fraction of the best score encountered so
208 | /// far, before extension is interrupted
209 | #[clap(long, default_value_t = 0.8, alias = "alignMaxDropOffRate")]
210 | pub align_max_drop_off_rate: f32,
211 |
212 | /// An alignment is allowed to drop below the best score encountered so far *
213 | /// --align-max-drop-off-rate by this number of bases, before extension is interrupted
214 | #[clap(long, default_value_t = 8, alias = "alignMaxDropOffBases")]
215 | pub align_max_drop_off_bases: u16,
216 |
217 | /// The maximum allowed tolerated length difference between the query and db sequences to look
218 | /// for the ideal alignment along the diagonal (measured as a fraction of the length of the
219 | /// shortest sequence among db and query)
220 | #[clap(long, default_value_t = 0.1, alias = "alignLenTolerance")]
221 | pub align_len_tolerance: f32,
222 |
223 | /// Sequence matches are rewarded during the alignment
224 | #[clap(long, alias = "alignScoreSeq")]
225 | pub align_score_seq: bool,
226 |
227 | /// Score reward for matching bases
228 | #[clap(
229 | long,
230 | default_value_t = 0.5,
231 | requires = "align_score_seq",
232 | alias = "alignSeqMatchScore"
233 | )]
234 | pub align_seq_match_score: f32,
235 |
236 | /// Score penalty for mismatching bases
237 | #[clap(
238 | long,
239 | default_value_t = -2.,
240 | requires = "align_score_seq",
241 | alias = "alignSeqMismatchScore"
242 | )]
243 | pub align_seq_mismatch_score: f32,
244 | }
245 |
246 | #[derive(Debug, Args, Serialize)]
247 | #[serde(rename_all = "kebab-case")]
248 | pub struct AlignmentFoldingEvaluationArgs {
249 | /// Alignments passing the --inclusion-evalue threshold are further evaluated for the presence
250 | /// of a conserved RNA structure by using `RNAalifold`
251 | #[clap(long, alias = "evalAlignFold")]
252 | pub eval_align_fold: bool,
253 |
254 | /// Number of shuffles to perform for each alignment during folding evaluation
255 | #[clap(long, default_value_t = 100)]
256 | pub shuffles: u16,
257 |
258 | /// Size (in nt) of the blocks for shuffling the alignment during folding evaluation
259 | #[clap(long, alias = "blockSize", default_value_t = 3)]
260 | pub block_size: u16,
261 |
262 | /// Besides shuffling blocks, residues within each block will be shuffled as well during
263 | /// folding evaluation
264 | #[clap(long, alias = "inBlockShuffle")]
265 | pub in_block_shuffle: bool,
266 |
267 | /// Minimum fraction of base-pairs of the RNAalifold-inferred structure that should be
268 | /// supported by both query and db sequence to retain a match
269 | #[clap(long, default_value_t = 0.75, alias = "minBpSupport")]
270 | pub min_bp_support: f32,
271 |
272 | /// Use RIBOSUM scoring matrix
273 | #[clap(long, alias = "ribosumScoring")]
274 | pub ribosum_scoring: bool,
275 |
276 | /// Slope for SHAPE reactivities conversion into pseudo-free energy contributions
277 | #[clap(long, default_value_t = 1.8, requires = "eval_align_fold")]
278 | pub slope: Reactivity,
279 |
280 | /// Intercept for SHAPE reactivities conversion into pseudo-free energy contributions
281 | #[clap(long, default_value_t = -0.6, requires = "eval_align_fold")]
282 | pub intercept: Reactivity,
283 |
284 | /// Maximum allowed base-pairing distance
285 | #[clap(
286 | long,
287 | default_value_t = 600,
288 | alias = "maxBPspan",
289 | requires = "eval_align_fold"
290 | )]
291 | pub max_bp_span: u32,
292 |
293 | /// Disallows lonely pairs (helices of 1 bp)
294 | #[clap(long, alias = "noLonelyPairs", requires = "eval_align_fold")]
295 | pub no_lonely_pairs: bool,
296 |
297 | /// Disallows G:U wobbles at the end of helices
298 | #[clap(long, alias = "noClosingGU", requires = "eval_align_fold")]
299 | pub no_closing_gu: bool,
300 |
301 | /// Folding temperature
302 | #[clap(long, default_value_t = 37., requires = "eval_align_fold")]
303 | pub temperature: f32,
304 | }
305 |
306 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
307 | pub struct MinMax<T>(pub Range<T>);
308 |
309 | impl<T> fmt::Display for MinMax<T>
310 | where
311 | T: fmt::Display,
312 | {
313 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
314 | write!(f, "{},{}", self.0.start, self.0.end)
315 | }
316 | }
317 |
318 | impl<T> Serialize for MinMax<T>
319 | where
320 | T: fmt::Display,
321 | {
322 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
323 | where
324 | S: serde::Serializer,
325 | {
326 | serializer.collect_str(self)
327 | }
328 | }
329 |
330 | #[derive(Debug, Clone, PartialEq, Eq)]
331 | pub enum ParseMinMaxError<T> {
332 | InvalidFormat,
333 | InnerError { index: u8, error: T },
334 | }
335 |
336 | impl<T> FromStr for MinMax<T>
337 | where
338 | T: FromStr,
339 | {
340 | type Err = ParseMinMaxError<T::Err>;
341 |
342 | fn from_str(s: &str) -> Result<Self, Self::Err> {
343 | let (start, end) = s.split_once(',').ok_or(ParseMinMaxError::InvalidFormat)?;
344 |
345 | let start = start
346 | .parse()
347 | .map_err(|error| ParseMinMaxError::InnerError { index: 0, error })?;
348 |
349 | let end = end
350 | .parse()
351 | .map_err(|error| ParseMinMaxError::InnerError { index: 1, error })?;
352 |
353 | Ok(Self(start..end))
354 | }
355 | }
356 |
357 | impl<T> fmt::Display for ParseMinMaxError<T>
358 | where
359 | T: fmt::Display,
360 | {
361 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
362 | match self {
363 | Self::InvalidFormat => {
364 | f.write_str("invalid min-max format, two comma-separated values expected")
365 | }
366 | Self::InnerError { index, error } => {
367 | let part = match index {
368 | 0 => "min",
369 | 1 => "max",
370 | _ => unreachable!(),
371 | };
372 | write!(f, "{part} part of min-max format is invalid: {error}")
373 | }
374 | }
375 | }
376 | }
377 |
378 | impl<T> std::error::Error for ParseMinMaxError<T> where T: std::error::Error {}
379 |
380 | #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize)]
381 | pub enum ReportAlignment {
382 | #[clap(alias = "f")]
383 | Fasta,
384 |
385 | #[clap(alias = "s")]
386 | Stockholm,
387 | }
388 |
389 | impl Cli {
390 | #[cfg(test)]
391 | pub(crate) fn dummy() -> Self {
392 | Self::parse_from(["test", "--database", "test", "--query", "test"])
393 | }
394 | }
395 |
396 | /// Hidden dumper for XML files.
397 | ///
398 | /// Reads an XML file, or the XML files from a directory, and dumps the content to the native DB format.
399 | #[derive(Debug, Parser)]
400 | pub struct Alternative {
401 | /// Path to a database file, or to a (directory of) XML file(s)
402 | #[clap(long, visible_alias = "db")]
403 | pub database: PathBuf,
404 |
405 | /// Dumps the database to the specified file.
406 | ///
407 | /// Input is a (directory of) XML file(s).
408 | #[clap(long)]
409 | pub dump_db: PathBuf,
410 |
411 | /// Dumps the shuffled database to the specified file.
412 | ///
413 | /// Shuffled databases can be imported using the `--shuffled-db` parameter.
414 | #[clap(long)]
415 | pub dump_shuffled_db: Option<PathBuf>,
416 |
417 | /// Number of processors to use
418 | ///
419 | /// Uses all available processors if not specified
420 | #[clap(long)]
421 | pub threads: Option,
422 |
423 | /// Number of shuffles to perform for each sequence in db
424 | ///
425 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of
426 | /// the sequences in the database.
427 | ///
428 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of
429 | /// shuffles is calculated as `max(1, 500000 / L)`.
430 | #[clap(long, alias = "dbShuffles")]
431 | pub db_shuffles: Option,
432 |
433 | /// Size (in nt) of the blocks for shuffling the sequences in db
434 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)]
435 | pub db_block_size: u16,
436 |
437 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well
438 | #[clap(long, alias = "dbInBlockShuffle")]
439 | pub db_in_block_shuffle: bool,
440 | }
441 |
--------------------------------------------------------------------------------
/src/db_file.rs:
--------------------------------------------------------------------------------
1 | pub mod native;
2 | mod xml;
3 |
4 | use std::{
5 | convert::TryInto,
6 | error::Error as StdError,
7 | ffi::OsString,
8 | fmt::{self, Display},
9 | io,
10 | path::Path,
11 | ptr,
12 | string::FromUtf8Error,
13 | };
14 |
15 | use serde::{Serialize, Serializer};
16 |
17 | use crate::{Base, Molecule, Reactivity, SequenceEntry};
18 |
19 | #[derive(Debug, Clone, PartialEq)]
20 | pub struct Entry {
21 | pub id: String,
22 | pub(crate) sequence: Vec<Base>,
23 | pub reactivity: Vec<ReactivityWithPlaceholder>,
24 | }
25 |
26 | const NAN_PLACEHOLDER: Reactivity = -999.;
27 |
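// Reactivities stored as `-999.` in db files act as placeholders for missing values; the
// wrapper below treats that placeholder (as well as an actual NaN) as NaN.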
28 | #[derive(Debug, Clone, Copy)]
29 | #[repr(transparent)]
30 | pub struct ReactivityWithPlaceholder(Reactivity);
31 |
32 | impl ReactivityWithPlaceholder {
33 | pub fn is_nan(self) -> bool {
34 | self.0.is_nan() | (self.0 == NAN_PLACEHOLDER)
35 | }
36 |
37 | pub fn get_non_nan(self) -> Option<Reactivity> {
38 | if self.is_nan() {
39 | None
40 | } else {
41 | Some(self.0)
42 | }
43 | }
44 |
45 | pub fn to_maybe_placeholder(self) -> Reactivity {
46 | if self.0.is_nan() {
47 | NAN_PLACEHOLDER
48 | } else {
49 | self.0
50 | }
51 | }
52 |
53 | pub fn as_inner_slice(this: &[ReactivityWithPlaceholder]) -> &[Reactivity] {
54 | // Safety:
55 | // - `ReactivityWithPlaceholder` is transparent and it contains only a `Reactivity`
56 | // - lifetime is maintained
57 | unsafe { &*(ptr::from_ref(this) as *const [Reactivity]) }
58 | }
59 |
60 | pub fn inner(self) -> Reactivity {
61 | self.0
62 | }
63 |
64 | #[inline]
65 | #[must_use]
66 | pub const fn nan_placeholder() -> Self {
67 | Self(NAN_PLACEHOLDER)
68 | }
69 | }
70 |
71 | impl PartialEq for ReactivityWithPlaceholder {
72 | fn eq(&self, other: &Self) -> bool {
73 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) {
74 | false
75 | } else {
76 | self.0 == other.0
77 | }
78 | }
79 | }
80 |
81 | impl PartialEq<Reactivity> for ReactivityWithPlaceholder {
82 | fn eq(&self, other: &Reactivity) -> bool {
83 | if self.0 == NAN_PLACEHOLDER {
84 | false
85 | } else {
86 | self.0 == *other
87 | }
88 | }
89 | }
90 |
91 | impl PartialOrd for ReactivityWithPlaceholder {
92 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
93 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) {
94 | None
95 | } else {
96 | self.0.partial_cmp(&other.0)
97 | }
98 | }
99 | }
100 |
101 | impl PartialOrd<Reactivity> for ReactivityWithPlaceholder {
102 | fn partial_cmp(&self, other: &Reactivity) -> Option<std::cmp::Ordering> {
103 | if self.0 == NAN_PLACEHOLDER {
104 | None
105 | } else {
106 | self.0.partial_cmp(other)
107 | }
108 | }
109 | }
110 |
111 | impl From<Reactivity> for ReactivityWithPlaceholder {
112 | fn from(reactivity: Reactivity) -> Self {
113 | Self(reactivity)
114 | }
115 | }
116 |
117 | impl Serialize for ReactivityWithPlaceholder {
118 | #[inline]
119 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
120 | where
121 | S: Serializer,
122 | {
123 | self.get_non_nan()
124 | .unwrap_or(Reactivity::NAN)
125 | .serialize(serializer)
126 | }
127 | }
128 |
129 | pub trait ReactivityLike: Copy + PartialOrd + PartialEq {
130 | fn is_nan(self) -> bool;
131 | fn value(self) -> Reactivity;
132 | }
133 |
134 | impl ReactivityLike for Reactivity {
135 | #[inline]
136 | fn is_nan(self) -> bool {
137 | Reactivity::is_nan(self)
138 | }
139 |
140 | #[inline]
141 | fn value(self) -> Reactivity {
142 | self
143 | }
144 | }
145 |
146 | impl ReactivityLike for ReactivityWithPlaceholder {
147 | #[inline]
148 | fn is_nan(self) -> bool {
149 | ReactivityWithPlaceholder::is_nan(self)
150 | }
151 |
152 | #[inline]
153 | fn value(self) -> Reactivity {
154 | self.to_maybe_placeholder()
155 | }
156 | }
157 |
158 | impl Entry {
159 | pub fn cap_reactivities(&mut self, max_reactivity: Reactivity) {
160 | self.reactivity.iter_mut().for_each(|reactivity| {
161 | if let Some(r) = reactivity.get_non_nan() {
162 | *reactivity = r.min(max_reactivity).into();
163 | }
164 | });
165 | }
166 | }
167 |
168 | impl SequenceEntry for Entry {
169 | type Reactivity = ReactivityWithPlaceholder;
170 |
171 | fn name(&self) -> &str {
172 | &self.id
173 | }
174 |
175 | fn sequence(&self) -> &[Base] {
176 | &self.sequence
177 | }
178 |
179 | fn reactivity(&self) -> &[Self::Reactivity] {
180 | &self.reactivity
181 | }
182 |
183 | fn molecule(&self) -> crate::Molecule {
184 | Molecule::Dna
185 | }
186 | }
187 |
188 | #[derive(Debug)]
189 | pub enum ReaderError {
190 | TooSmall,
191 | InvalidMarker,
192 | }
193 |
194 | impl Display for ReaderError {
195 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196 | let s = match self {
197 | ReaderError::TooSmall => "DB file is too small",
198 | ReaderError::InvalidMarker => "DB file contains an invalid EOF marker",
199 | };
200 |
201 | f.write_str(s)
202 | }
203 | }
204 |
205 | impl StdError for ReaderError {}
206 |
207 | #[derive(Debug)]
208 | pub enum EntryError {
209 | InvalidSequenceId(FromUtf8Error),
210 | InvalidBase,
211 | UnexpectedEof,
212 | SurpassedEofMarker,
213 | }
214 |
215 | impl Display for EntryError {
216 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
217 | let s = match self {
218 | EntryError::InvalidSequenceId(_) => "Invalid sequence ID string",
219 | EntryError::InvalidBase => "Invalid encoded nucleobase",
220 | EntryError::UnexpectedEof => "Unexpected end of file",
221 | EntryError::SurpassedEofMarker => "End of file marker has been surpassed",
222 | };
223 |
224 | f.write_str(s)
225 | }
226 | }
227 |
228 | impl StdError for EntryError {
229 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
230 | match self {
231 | EntryError::InvalidSequenceId(source) => Some(source),
232 | EntryError::InvalidBase
233 | | EntryError::UnexpectedEof
234 | | EntryError::SurpassedEofMarker => None,
235 | }
236 | }
237 | }
238 |
239 | pub fn read_db(path: &Path) -> Result<Vec<Entry>, Error> {
240 | if path.is_dir() {
241 | xml::read_directory(path).map_err(Error::Directory)
242 | } else {
243 | let extension = path.extension().ok_or(Error::NoExtension)?;
244 | if extension.eq_ignore_ascii_case("db") {
245 | native::read_file(path).map_err(Error::Native)
246 | } else if extension.eq_ignore_ascii_case("xml") {
247 | let entry = xml::read_file(path).map_err(Error::Xml)?;
248 | Ok(vec![entry])
249 | } else {
250 | Err(Error::InvalidExtension(extension.to_os_string()))
251 | }
252 | }
253 | }
254 |
255 | #[derive(Debug)]
256 | pub enum Error {
257 | NoExtension,
258 | InvalidExtension(OsString),
259 | Native(native::Error),
260 | Xml(xml::ReadFileError),
261 | Directory(xml::ReadDirectoryError),
262 | }
263 |
264 | impl Display for Error {
265 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
266 | match self {
267 | Error::NoExtension => f.write_str("db file does not have an extension"),
268 | Error::InvalidExtension(extension) => {
269 | write!(
270 | f,
271 | "extension \"{}\" is not valid for a db",
272 | extension.to_string_lossy()
273 | )
274 | }
275 | Error::Native(_) => f.write_str("cannot read native db file"),
276 | Error::Xml(_) => f.write_str("cannot read xml db file"),
277 | Error::Directory(_) => f.write_str("cannot read xml entries from a directory"),
278 | }
279 | }
280 | }
281 |
282 | impl StdError for Error {
283 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
284 | match self {
285 | Error::NoExtension | Error::InvalidExtension(_) => None,
286 | Error::Native(source) => Some(source),
287 | Error::Xml(source) => Some(source),
288 | Error::Directory(source) => Some(source),
289 | }
290 | }
291 | }
292 |
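// Summary of the native db layout produced below (all integers little-endian). Per entry:
// a u32 id length (including the trailing NUL), the id bytes plus a 0x00 byte, a u32 sequence
// length in bases, the bases packed two per byte via `Base::pair_to_nibble` (odd lengths are
// padded with an extra `A` nibble), and one f64 per reactivity. After all entries comes a
// trailer: a u64 length field, the u16 `native::VERSION`, and `native::END_MARKER`.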
293 | pub fn write_entries<W: io::Write>(entries: &[Entry], mut writer: W) -> io::Result<()> {
294 | entries.iter().try_for_each(|entry| {
295 | let name = entry.name();
296 | let sequence = entry.sequence();
297 | let name_len_buf = u32::try_from(name.len().checked_add(1).unwrap())
298 | .unwrap()
299 | .to_le_bytes();
300 | let seq_len_buf = u32::try_from(sequence.len()).unwrap().to_le_bytes();
301 |
302 | writer.write_all(name_len_buf.as_slice())?;
303 | writer.write_all(name.as_bytes())?;
304 | writer.write_all(&[0])?;
305 | writer.write_all(seq_len_buf.as_slice())?;
306 | sequence.chunks_exact(2).try_for_each(|pair| {
307 | writer.write_all(&[Base::pair_to_nibble(pair.try_into().unwrap())])
308 | })?;
309 | if let Some(base) = sequence.chunks_exact(2).remainder().first().copied() {
310 | writer.write_all(&[Base::pair_to_nibble([base, Base::A])])?;
311 | }
312 |
313 | entry.reactivity().iter().try_for_each(|reactivity| {
314 | let reactivity = f64::from(reactivity.inner()).to_le_bytes();
315 | writer.write_all(reactivity.as_slice())
316 | })?;
317 |
318 | Ok::<_, io::Error>(())
319 | })?;
320 |
321 | let n_entries = u64::try_from(entries.len()).unwrap().to_le_bytes();
322 | writer.write_all(n_entries.as_slice())?;
323 | writer.write_all(native::VERSION.to_le_bytes().as_slice())?;
324 | writer.write_all(native::END_MARKER)?;
325 | writer.flush()?;
326 |
327 | Ok(())
328 | }
329 |
--------------------------------------------------------------------------------
/src/db_file/native.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | convert::TryInto,
3 | error::Error as StdError,
4 | fmt::{self, Display},
5 | fs::File,
6 | io::{self, BufReader, Read, Seek, SeekFrom},
7 | path::Path,
8 | string::FromUtf8Error,
9 | };
10 |
11 | use itertools::Itertools;
12 |
13 | use crate::{db_file::ReactivityWithPlaceholder, Base, InvalidBasePair, Reactivity};
14 |
15 | use super::Entry;
16 |
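// The final END_SIZE (17) bytes of a native db form a fixed trailer: bytes 0..8 hold a u64
// (little-endian) length field, bytes 8..10 a u16 (little-endian) format version, and
// bytes 10..17 the 7-byte END_MARKER `[eofdb]`.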
17 | pub(super) const END_SIZE: u8 = 17;
18 | pub(super) const END_MARKER: &[u8] = b"[eofdb]";
19 | pub(super) const VERSION: u16 = 1;
20 |
21 | #[derive(Debug)]
22 | pub struct Reader<R> {
23 | inner: R,
24 | _db_len: u64,
25 | _version: u16,
26 | end_offset: u64,
27 | }
28 |
29 | impl<R> Reader<R>
30 | where
31 | R: Read + Seek,
32 | {
33 | pub fn new(mut reader: R) -> Result<Self, NewReaderError> {
34 | use NewReaderError as E;
35 |
36 | let end_offset = reader
37 | .seek(SeekFrom::End(-i64::from(END_SIZE)))
38 | .map_err(E::SeekToMetadata)?;
39 | let mut end_buf = [0; END_SIZE as usize];
40 | reader.read_exact(&mut end_buf).map_err(E::ReadMetadata)?;
41 |
42 | if &end_buf[10..17] != END_MARKER {
43 | return Err(E::InvalidMarker);
44 | }
45 |
46 | let db_len = u64::from_le_bytes(end_buf[0..8].try_into().unwrap());
47 | let version = u16::from_le_bytes(end_buf[8..10].try_into().unwrap());
48 | Ok(Self {
49 | inner: reader,
50 | _db_len: db_len,
51 | _version: version,
52 | end_offset,
53 | })
54 | }
55 |
56 | pub fn entries(&mut self) -> EntryIter<R> {
57 | let &mut Self {
58 | ref mut inner,
59 | end_offset,
60 | ..
61 | } = self;
62 |
63 | EntryIter {
64 | reader: inner,
65 | end_offset,
66 | offset: 0,
67 | }
68 | }
69 | }
70 |
71 | #[derive(Debug)]
72 | pub enum NewReaderError {
73 | SeekToMetadata(io::Error),
74 | ReadMetadata(io::Error),
75 | InvalidMarker,
76 | }
77 |
78 | impl Display for NewReaderError {
79 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 | let s = match self {
81 | NewReaderError::SeekToMetadata(_) => "unable to seek to metadata",
82 | NewReaderError::ReadMetadata(_) => "unable to read metadata",
83 | NewReaderError::InvalidMarker => "invalid metadata marker",
84 | };
85 |
86 | f.write_str(s)
87 | }
88 | }
89 |
90 | impl StdError for NewReaderError {
91 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
92 | match self {
93 | NewReaderError::SeekToMetadata(source) | NewReaderError::ReadMetadata(source) => {
94 | Some(source)
95 | }
96 | NewReaderError::InvalidMarker => None,
97 | }
98 | }
99 | }
100 |
101 | #[derive(Debug)]
102 | pub struct EntryIter<'a, R> {
103 | reader: &'a mut R,
104 | end_offset: u64,
105 | offset: u64,
106 | }
107 |
108 | impl<R> Iterator for EntryIter<'_, R>
109 | where
110 | R: Seek + Read,
111 | {
112 | type Item = Result<Entry, NextEntryError>;
113 |
114 | fn next(&mut self) -> Option<Self::Item> {
115 | (self.offset != self.end_offset).then(|| self.next_entry())
116 | }
117 | }
118 |
119 | impl<R> EntryIter<'_, R>
120 | where
121 | R: Seek + Read,
122 | {
123 | fn next_entry(&mut self) -> Result<Entry, NextEntryError> {
124 | use NextEntryError as E;
125 |
126 | if self.offset == 0 {
127 | self.reader.seek(SeekFrom::Start(0)).map_err(E::SeekStart)?;
128 | }
129 |
130 | let mut id_len_with_nul_buf = [0; 4];
131 | self.reader
132 | .read_exact(&mut id_len_with_nul_buf)
133 | .map_err(E::ReadIdLen)?;
134 | let id_len_with_nul: usize = u32::from_le_bytes(id_len_with_nul_buf)
135 | .try_into()
136 | .expect("cannot represent id length as usize for the current architecture");
137 | let mut sequence_id = vec![0; id_len_with_nul];
138 | self.reader
139 | .read_exact(&mut sequence_id)
140 | .map_err(E::ReadSequenceId)?;
141 | if sequence_id.pop().filter(|&b| b == 0).is_none() {
142 | return Err(E::MissingSequenceIdNul);
143 | }
144 | let sequence_id =
145 | String::from_utf8(sequence_id).map_err(NextEntryError::InvalidSequenceId)?;
146 | let mut sequence_len_buf = [0; 4];
147 | self.reader
148 | .read_exact(&mut sequence_len_buf)
149 | .map_err(E::ReadSequenceLen)?;
150 | let sequence_len: usize = u32::from_le_bytes(sequence_len_buf)
151 | .try_into()
152 | .expect("cannot represent sequence length as usize for the current architecture");
153 |
154 | let sequence_bytes = sequence_len / 2 + sequence_len % 2;
155 | let mut sequence = self
156 | .reader
157 | .bytes()
158 | .take(sequence_bytes)
159 | .map(|result| {
160 | result.map_err(E::ReadSequence).and_then(|byte| {
161 | Base::try_pair_from_byte(byte)
162 | .map(|[first, second]| [first, second])
163 | .map_err(E::InvalidEncodedBase)
164 | })
165 | })
166 | .flatten_ok()
167 | .collect::<Result<Vec<_>, _>>()?;
168 |
169 | if sequence_len > 0 && sequence_len % 2 == 1 {
170 | sequence.pop().unwrap();
171 | }
172 |
173 | if sequence.len() != sequence_len {
174 | return Err(E::UnexpectedEof);
175 | }
176 |
177 | let reactivity = (0..sequence_len)
178 | .map(|_| {
179 | let mut reactivity_buffer = [0; 8];
180 | self.reader
181 | .read_exact(&mut reactivity_buffer)
182 | .map(|()| reactivity_buffer)
183 | .map_err(E::ReadReactivity)
184 | })
185 | // Reactivity is an alias to either f32 or f64
186 | .map_ok(|bytes| {
187 | // We internally use a fixed type that can be f32, there is no need to necessarily
188 | // have 64 bits of precision
189 | #[allow(clippy::cast_possible_truncation)]
190 | let reactivity = f64::from_le_bytes(bytes) as Reactivity;
191 | ReactivityWithPlaceholder::from(reactivity)
192 | })
193 | .collect::<Result<Vec<_>, _>>()?;
194 |
195 | if reactivity.len() != sequence_len {
196 | return Err(E::UnexpectedEof);
197 | }
198 |
199 | let offset = self.reader.stream_position().map_err(E::StreamPosition)?;
200 | if offset > self.end_offset {
201 | return Err(E::SurpassedEofMarker);
202 | }
203 | self.offset = offset;
204 |
205 | Ok(Entry {
206 | id: sequence_id,
207 | sequence,
208 | reactivity,
209 | })
210 | }
211 | }
212 |
213 | #[derive(Debug)]
214 | pub enum NextEntryError {
215 | SeekStart(io::Error),
216 | ReadIdLen(io::Error),
217 | ReadSequenceId(io::Error),
218 | MissingSequenceIdNul,
219 | InvalidSequenceId(FromUtf8Error),
220 | ReadSequenceLen(io::Error),
221 | ReadSequence(io::Error),
222 | InvalidEncodedBase(InvalidBasePair),
223 | ReadReactivity(io::Error),
224 | UnexpectedEof,
225 | SurpassedEofMarker,
226 | StreamPosition(io::Error),
227 | }
228 |
229 | impl Display for NextEntryError {
230 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
231 | let s = match self {
232 | NextEntryError::SeekStart(_) => "unable to seek to the start of the file",
233 | NextEntryError::ReadIdLen(_) => "unable to read the length of the sequence id",
234 | NextEntryError::ReadSequenceId(_) => "unable to read sequence id",
235 | NextEntryError::MissingSequenceIdNul => {
236 | "sequence id does not have a nul termination character"
237 | }
238 | NextEntryError::InvalidSequenceId(_) => "sequence id is not valid",
239 | NextEntryError::ReadSequenceLen(_) => "unable to read sequence length",
240 | NextEntryError::ReadSequence(_) => "unable to read sequence content",
241 | NextEntryError::InvalidEncodedBase(_) => "invalid encoded base",
242 | NextEntryError::ReadReactivity(_) => "unable to read rectivity",
243 | NextEntryError::UnexpectedEof => "unexpected end of file",
244 | NextEntryError::SurpassedEofMarker => "end of file marker is being surpassed",
245 | NextEntryError::StreamPosition(_) => "unable to get stream position",
246 | };
247 |
248 | f.write_str(s)
249 | }
250 | }
251 |
252 | impl StdError for NextEntryError {
253 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
254 | match self {
255 | NextEntryError::SeekStart(source)
256 | | NextEntryError::ReadIdLen(source)
257 | | NextEntryError::ReadSequenceId(source)
258 | | NextEntryError::ReadSequenceLen(source)
259 | | NextEntryError::ReadSequence(source)
260 | | NextEntryError::ReadReactivity(source)
261 | | NextEntryError::StreamPosition(source) => Some(source),
262 | NextEntryError::MissingSequenceIdNul
263 | | NextEntryError::UnexpectedEof
264 | | NextEntryError::SurpassedEofMarker => None,
265 | NextEntryError::InvalidSequenceId(source) => Some(source),
266 | NextEntryError::InvalidEncodedBase(source) => Some(source),
267 | }
268 | }
269 | }
270 |
271 | #[derive(Debug)]
272 | pub enum Error {
273 | OpenFile(io::Error),
274 | NewReader(NewReaderError),
275 | Entry(NextEntryError),
276 | }
277 |
278 | impl Display for Error {
279 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280 | let s = match self {
281 | Error::OpenFile(_) => "unable to open file",
282 | Error::NewReader(_) => "unable to create new reader",
283 | Error::Entry(_) => "unable to get the next entry",
284 | };
285 |
286 | f.write_str(s)
287 | }
288 | }
289 |
290 | impl StdError for Error {
291 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
292 | match self {
293 | Error::OpenFile(source) => Some(source),
294 | Error::NewReader(source) => Some(source),
295 | Error::Entry(source) => Some(source),
296 | }
297 | }
298 | }
299 |
300 | pub fn read_file(path: &Path) -> Result<Vec<Entry>, Error> {
301 | let file = File::open(path).map_err(Error::OpenFile)?;
302 | let mut reader = Reader::new(BufReader::new(file)).map_err(Error::NewReader)?;
303 | let entries = reader
304 | .entries()
305 | .collect::<Result<_, _>>()
306 | .map_err(Error::Entry)?;
307 | Ok(entries)
308 | }
309 |
310 | #[cfg(test)]
311 | mod tests {
312 | use std::io::Cursor;
313 |
314 | use super::*;
315 |
316 | const TEST_DB: &[u8] = include_bytes!("../../test_data/test.db");
317 |
318 | #[test]
319 | fn valid_reader() {
320 | let reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
321 | #[allow(clippy::used_underscore_binding)]
322 | let len = reader._db_len;
323 |
324 | #[allow(clippy::used_underscore_binding)]
325 | let version = reader._version;
326 |
327 | assert_eq!(len, 0x1181);
328 | assert_eq!(version, 1);
329 | }
330 |
331 | #[test]
332 | fn read_all_db() {
333 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
334 | let db_len = reader
335 | .entries()
336 | .map_ok(|entry| entry.sequence.len())
337 | .try_fold(0, |acc, seq_len| seq_len.map(|seq_len| acc + seq_len))
338 | .unwrap();
339 |
340 | #[allow(clippy::used_underscore_binding)]
341 | let reader_len = usize::try_from(reader._db_len).unwrap();
342 | assert_eq!(db_len, reader_len);
343 | }
344 |
345 | #[test]
346 | fn transform_pseudo_nans() {
347 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
348 | let entry = reader.entries().next().unwrap().unwrap();
349 |
350 | // The first 13 reactivities are -999 in the file
351 | assert!(entry.reactivity[..13]
352 | .iter()
353 | .copied()
354 | .all(ReactivityWithPlaceholder::is_nan));
355 | }
356 | }
357 |
--------------------------------------------------------------------------------
/src/db_file/xml.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | borrow::Cow,
3 | error::Error as StdError,
4 | fmt::{self, Display},
5 | fs::File,
6 | io::{self, BufReader},
7 | num::ParseFloatError,
8 | ops::Not,
9 | path::Path,
10 | str::Utf8Error,
11 | };
12 |
13 | use quick_xml::{
14 | events::{BytesEnd, BytesStart, BytesText},
15 | Reader,
16 | };
17 | use rayon::iter::{ParallelBridge, ParallelIterator};
18 |
19 | use crate::{Base, InvalidBase, Reactivity};
20 |
21 | use super::{Entry, ReactivityWithPlaceholder};
22 |
23 | pub fn read_file(path: &Path) -> Result<Entry, ReadFileError> {
24 | use quick_xml::events::Event;
25 | use ReadFileError as E;
26 |
27 | let mut reader = Reader::from_file(path).map_err(E::ReaderFromFile)?;
28 | let mut buffer = Vec::new();
29 | let mut state = XmlState::default();
30 |
31 | let mut id = None;
32 | let mut sequence = None;
33 | let mut reactivity = None;
34 |
35 | loop {
36 | let event = reader
37 | .read_event_into(&mut buffer)
38 | .map_err(|source| E::ReadEvent {
39 | buffer_position: reader.buffer_position(),
40 | source,
41 | })?;
42 |
43 | match event {
44 | Event::Start(start) => {
45 | state = handle_start_event(&start, state, &mut id)?;
46 | }
47 |
48 | Event::End(end) => {
49 | state = handle_end_event(&end, state)?;
50 | }
51 |
52 | Event::Empty(tag) => return Err(E::UnexpectedEmptyTag(tag.name().as_ref().to_owned())),
53 | Event::Text(text) => {
54 | handle_text_event(&text, &state, &mut sequence, &mut reactivity, &reader)?;
55 | }
56 |
57 | Event::CData(_)
58 | | Event::Comment(_)
59 | | Event::Decl(_)
60 | | Event::PI(_)
61 | | Event::DocType(_) => {}
62 |
63 | Event::Eof => break,
64 | }
65 | }
66 |
67 | let id = id.ok_or(E::MissingTranscript)?;
68 | let sequence = sequence.ok_or(E::MissingSequence)?;
69 | let reactivity = reactivity.ok_or(E::MissingReactivity)?;
70 |
71 | if sequence.len() != reactivity.len() {
72 | return Err(E::InconsistentLength {
73 | sequence: sequence.len(),
74 | reactivity: reactivity.len(),
75 | });
76 | }
77 |
78 | Ok(Entry {
79 | id,
80 | sequence,
81 | reactivity,
82 | })
83 | }
84 |
85 | fn handle_start_event(
86 | start: &BytesStart<'_>,
87 | state: XmlState,
88 | id: &mut Option<String>,
89 | ) -> Result<XmlState, ReadFileError> {
90 | use ReadFileError as E;
91 |
92 | match (start.name().as_ref(), state) {
93 | (b"data", XmlState::Start) => Ok(XmlState::Data),
94 | (b"meta-data", XmlState::Data) => Ok(XmlState::MetaData),
95 | (b"organism", XmlState::MetaData) => Ok(XmlState::Organism),
96 | (b"probe", XmlState::MetaData) => Ok(XmlState::Probe),
97 | (b"source", XmlState::MetaData) => Ok(XmlState::Source),
98 | (b"citation", XmlState::Source) => Ok(XmlState::Citation),
99 | (b"pmid", XmlState::Source) => Ok(XmlState::Pmid),
100 | (b"replicate", XmlState::MetaData) => Ok(XmlState::Replicate),
101 | (b"condition", XmlState::MetaData) => Ok(XmlState::Condition),
102 | (b"transcript", XmlState::Data) => {
103 | if id.is_some() {
104 | return Err(E::MultipleTranscripts);
105 | }
106 |
107 | let id_attr = start
108 | .try_get_attribute("id")
109 | .map_err(E::MalformedTranscriptTag)?
110 | .ok_or(E::MissingId)?;
111 |
112 | let id_string = match id_attr.value {
113 | Cow::Borrowed(id) => std::str::from_utf8(id)
114 | .map(str::to_owned)
115 | .map_err(E::InvalidId)?,
116 | Cow::Owned(id) => {
117 | String::from_utf8(id).map_err(|err| E::InvalidId(err.utf8_error()))?
118 | }
119 | };
120 | *id = Some(id_string);
121 |
122 | Ok(XmlState::Transcript)
123 | }
124 | (b"sequence", XmlState::Transcript) => Ok(XmlState::Sequence),
125 | (b"reactivity", XmlState::Transcript) => Ok(XmlState::Reactivity),
126 | _ => Err(E::UnexpectedOpenTag(start.name().as_ref().to_owned())),
127 | }
128 | }
129 |
130 | fn handle_end_event(end: &BytesEnd<'_>, state: XmlState) -> Result<XmlState, ReadFileError> {
131 | use ReadFileError as E;
132 |
133 | match (end.name().as_ref(), state) {
134 | (b"data", XmlState::Data) => Ok(XmlState::End),
135 |
136 | (b"meta-data", XmlState::MetaData) | (b"transcript", XmlState::Transcript) => {
137 | Ok(XmlState::Data)
138 | }
139 |
140 | (b"organism", XmlState::Organism)
141 | | (b"probe", XmlState::Probe)
142 | | (b"source", XmlState::Source)
143 | | (b"replicate", XmlState::Replicate)
144 | | (b"condition", XmlState::Condition) => Ok(XmlState::MetaData),
145 |
146 | (b"citation", XmlState::Citation) | (b"pmid", XmlState::Pmid) => Ok(XmlState::Source),
147 |
148 | (b"sequence", XmlState::Sequence) | (b"reactivity", XmlState::Reactivity) => {
149 | Ok(XmlState::Transcript)
150 | }
151 |
152 | _ => Err(E::UnexpectedCloseTag(end.name().as_ref().to_owned())),
153 | }
154 | }
155 |
156 | fn handle_text_event(
157 | text: &BytesText<'_>,
158 | state: &XmlState,
159 | sequence: &mut Option<Vec<Base>>,
160 | reactivity: &mut Option<Vec<ReactivityWithPlaceholder>>,
161 | reader: &Reader<BufReader<File>>,
162 | ) -> Result<(), ReadFileError> {
163 | use ReadFileError as E;
164 |
165 | if text.iter().all(u8::is_ascii_whitespace) {
166 | return Ok(());
167 | }
168 |
169 | match state {
170 | XmlState::Start
171 | | XmlState::Data
172 | | XmlState::MetaData
173 | | XmlState::Source
174 | | XmlState::Transcript
175 | | XmlState::End => return Err(E::UnexpectedText(reader.buffer_position())),
176 |
177 | XmlState::Organism
178 | | XmlState::Probe
179 | | XmlState::Citation
180 | | XmlState::Pmid
181 | | XmlState::Replicate
182 | | XmlState::Condition => {}
183 |
184 | XmlState::Sequence => {
185 | if sequence.is_some() {
186 | return Err(E::MultipleSequences);
187 | }
188 | *sequence = Some(parse_sequence(text).map_err(E::InvalidSequence)?);
189 | }
190 | XmlState::Reactivity => {
191 | if reactivity.is_some() {
192 | return Err(E::MultipleReactivities);
193 | }
194 |
195 | *reactivity = Some(parse_reactivity(text).map_err(E::InvalidReactivity)?);
196 | }
197 | }
198 |
199 | Ok(())
200 | }
201 |
202 | #[derive(Debug, Default)]
203 | enum XmlState {
204 | #[default]
205 | Start,
206 | Data,
207 | MetaData,
208 | Organism,
209 | Probe,
210 | Source,
211 | Citation,
212 | Pmid,
213 | Replicate,
214 | Condition,
215 | Transcript,
216 | Sequence,
217 | Reactivity,
218 | End,
219 | }
220 |
221 | #[derive(Debug)]
222 | pub enum ReadFileError {
223 | ReaderFromFile(quick_xml::Error),
224 | ReadEvent {
225 | buffer_position: usize,
226 | source: quick_xml::Error,
227 | },
228 | UnexpectedOpenTag(Vec<u8>),
229 | UnexpectedCloseTag(Vec<u8>),
230 | UnexpectedEmptyTag(Vec<u8>),
231 | UnexpectedText(usize),
232 | MultipleTranscripts,
233 | MalformedTranscriptTag(quick_xml::Error),
234 | MissingId,
235 | InvalidId(Utf8Error),
236 | MultipleSequences,
237 | InvalidSequence(InvalidBase),
238 | MultipleReactivities,
239 | InvalidReactivity(InvalidReactivity),
240 | MissingTranscript,
241 | MissingSequence,
242 | MissingReactivity,
243 | InconsistentLength {
244 | sequence: usize,
245 | reactivity: usize,
246 | },
247 | }
248 |
249 | impl Display for ReadFileError {
250 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
251 | match self {
252 | ReadFileError::ReaderFromFile(_) => {
253 | f.write_str("unable to create XML reader from file")
254 | }
255 | ReadFileError::ReadEvent {
256 | buffer_position,
257 | source: _,
258 | } => write!(f, "unable to read XML event at position {buffer_position}"),
259 | ReadFileError::UnexpectedOpenTag(tag) => write!(
260 | f,
261 | r#"unexpected opening tag "{}""#,
262 | String::from_utf8_lossy(tag)
263 | ),
264 | ReadFileError::UnexpectedCloseTag(tag) => write!(
265 | f,
266 | r#"unexpected closing tag "{}""#,
267 | String::from_utf8_lossy(tag),
268 | ),
269 | ReadFileError::UnexpectedEmptyTag(tag) => write!(
270 | f,
271 | r#"unexpected empty tag "{}""#,
272 | String::from_utf8_lossy(tag),
273 | ),
274 | ReadFileError::UnexpectedText(position) => {
275 | write!(f, "unexpected text content at position {position}")
276 | }
277 | ReadFileError::MultipleTranscripts => f.write_str("more than one transcript tag found"),
278 | ReadFileError::MalformedTranscriptTag(_) => {
279 | f.write_str("transcript tag has invalid or duplicated attributes")
280 | }
281 | ReadFileError::MissingId => {
282 | f.write_str(r#""id" attribute is missing from transcript tag"#)
283 | }
284 | ReadFileError::InvalidId(_) => f.write_str("transcript id is not a valid UTF-8 string"),
285 | ReadFileError::MultipleSequences => f.write_str("more than one sequence tag found"),
286 | ReadFileError::InvalidSequence(_) => f.write_str("sequence is invalid"),
287 | ReadFileError::MultipleReactivities => {
288 | f.write_str("more than one reactivity tag found")
289 | }
290 | ReadFileError::InvalidReactivity(_) => f.write_str("reactivity data is invalid"),
291 | ReadFileError::MissingTranscript => f.write_str("transcript tag is missing"),
292 | ReadFileError::MissingSequence => f.write_str("sequence tag is missing"),
293 | ReadFileError::MissingReactivity => f.write_str("reactivity tag is missing"),
294 | ReadFileError::InconsistentLength {
295 | sequence,
296 | reactivity,
297 | } => write!(
298 | f,
299 | "sequence length ({sequence}) is different from reactivity sequence {reactivity}"
300 | ),
301 | }
302 | }
303 | }
304 |
305 | impl StdError for ReadFileError {
306 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
307 | match self {
308 | ReadFileError::ReaderFromFile(source) | ReadFileError::ReadEvent { source, .. } => {
309 | Some(source)
310 | }
311 |
312 | ReadFileError::UnexpectedOpenTag(_)
313 | | ReadFileError::UnexpectedCloseTag(_)
314 | | ReadFileError::UnexpectedEmptyTag(_)
315 | | ReadFileError::UnexpectedText(_)
316 | | ReadFileError::MultipleTranscripts
317 | | ReadFileError::MissingId
318 | | ReadFileError::MultipleSequences
319 | | ReadFileError::MultipleReactivities
320 | | ReadFileError::MissingTranscript
321 | | ReadFileError::MissingSequence
322 | | ReadFileError::MissingReactivity
323 | | ReadFileError::InconsistentLength { .. } => None,
324 |
325 | ReadFileError::MalformedTranscriptTag(source) => Some(source),
326 | ReadFileError::InvalidId(source) => Some(source),
327 | ReadFileError::InvalidSequence(source) => Some(source),
328 | ReadFileError::InvalidReactivity(source) => Some(source),
329 | }
330 | }
331 | }
332 |
333 | fn parse_sequence(raw: &[u8]) -> Result<Vec<Base>, InvalidBase> {
334 | raw.iter()
335 | .filter(|c| c.is_ascii_whitespace().not())
336 | .copied()
337 | .map(Base::try_from)
338 | .collect()
339 | }
340 |
341 | fn parse_reactivity(raw: &[u8]) -> Result<Vec<ReactivityWithPlaceholder>, InvalidReactivity> {
342 | use InvalidReactivity as E;
343 |
344 | raw.split(|&c| c == b',')
345 | .map(|raw| {
346 | let raw = std::str::from_utf8(raw).map_err(E::Utf8)?.trim();
347 |
348 | if raw == "NaN" {
349 | Ok(ReactivityWithPlaceholder::nan_placeholder())
350 | } else {
351 | raw.parse::<Reactivity>()
352 | .map(ReactivityWithPlaceholder::from)
353 | .map_err(InvalidReactivity::Value)
354 | }
355 | })
356 | .collect()
357 | }
358 |
359 | #[derive(Debug)]
360 | pub enum InvalidReactivity {
361 | Utf8(Utf8Error),
362 | Value(ParseFloatError),
363 | }
364 |
365 | impl Display for InvalidReactivity {
366 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
367 | let s = match self {
368 | InvalidReactivity::Utf8(_) => "rectivity is not a valid UTF-8 string",
369 | InvalidReactivity::Value(_) => "unable to parse reactivity value",
370 | };
371 |
372 | f.write_str(s)
373 | }
374 | }
375 |
376 | impl StdError for InvalidReactivity {
377 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
378 | match self {
379 | InvalidReactivity::Utf8(source) => Some(source),
380 | InvalidReactivity::Value(source) => Some(source),
381 | }
382 | }
383 | }
384 |
385 | pub fn read_directory(path: &Path) -> Result<Vec<Entry>, ReadDirectoryError> {
386 | use ReadDirectoryError as E;
387 |
388 | path.read_dir()
389 | .map_err(E::Dir)?
390 | .filter_map(|entry| {
391 | entry
392 | .map(|entry| {
393 | let path = entry.path();
394 | let extension = path.extension()?;
395 | extension.eq_ignore_ascii_case("xml").then_some(path)
396 | })
397 | .transpose()
398 | })
399 | .par_bridge()
400 | .filter_map(|path| {
401 | let path = match path {
402 | Ok(path) => path,
403 | Err(err) => return Some(Err(E::DirEntry(err))),
404 | };
405 | match read_file(&path) {
406 | Ok(entry) => Some(Ok(entry)),
407 | Err(err) => {
408 | eprintln!(
409 | "WARNING: unable to read XML path {}: {:#}",
410 | path.display(),
411 | anyhow::Error::from(err)
412 | );
413 | None
414 | }
415 | }
416 | })
417 | .collect()
418 | }
419 |
420 | #[derive(Debug)]
421 | pub enum ReadDirectoryError {
422 | Dir(io::Error),
423 | DirEntry(io::Error),
424 | }
425 |
426 | impl Display for ReadDirectoryError {
427 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
428 | match self {
429 | ReadDirectoryError::Dir(_) => f.write_str("unable to read directory"),
430 | ReadDirectoryError::DirEntry(_) => f.write_str("unable to read directory entry"),
431 | }
432 | }
433 | }
434 |
435 | impl StdError for ReadDirectoryError {
436 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
437 | match self {
438 | ReadDirectoryError::Dir(source) | ReadDirectoryError::DirEntry(source) => Some(source),
439 | }
440 | }
441 | }
442 |
443 | #[cfg(test)]
444 | mod tests {
445 | use std::{
446 | fs,
447 | path::{Path, PathBuf},
448 | sync::OnceLock,
449 | };
450 |
451 | use tempfile::tempdir;
452 |
453 | use crate::{db_file::ReactivityWithPlaceholder, Base};
454 |
455 | use super::{read_directory, read_file};
456 |
457 | fn raw_xml_db_path() -> &'static Path {
458 | static RAW_XML_DB_PATH: OnceLock<PathBuf> = OnceLock::new();
459 |
460 | RAW_XML_DB_PATH.get_or_init(|| {
461 | let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
462 | manifest_dir.join("test_data/test_db.xml")
463 | })
464 | }
465 |
466 | #[test]
467 | fn read_valid_xml() {
468 | let entry = read_file(raw_xml_db_path()).unwrap();
469 | assert_eq!(entry.id, "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S");
470 | assert_eq!(entry.sequence.len(), 1800);
471 | assert_eq!(
472 | entry.sequence[..5],
473 | [Base::T, Base::A, Base::T, Base::C, Base::T]
474 | );
475 | assert!(entry.reactivity[..37]
476 | .iter()
477 | .copied()
478 | .all(ReactivityWithPlaceholder::is_nan));
479 | assert!((entry.reactivity[37].get_non_nan().unwrap() - 0.389).abs() < 0.001);
480 | }
481 |
482 | #[test]
483 | fn read_directory_ignores_non_xml_files() {
484 | let tempdir = tempdir().unwrap();
485 | let temp_path = tempdir.path();
486 | fs::write(temp_path.join("test.txt"), "hello world").unwrap();
487 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap();
488 | let entries = read_directory(temp_path).unwrap();
489 | assert_eq!(entries.len(), 1);
490 | assert_eq!(
491 | entries[0].id,
492 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S",
493 | );
494 | }
495 |
496 | #[test]
497 | fn read_directory_ignores_invalid_xml_files() {
498 | let tempdir = tempdir().unwrap();
499 | let temp_path = tempdir.path();
500 | let xml_file_path = temp_path.join("test.xml");
501 | fs::write(xml_file_path, "invalid xml").unwrap();
502 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap();
503 | let entries = read_directory(temp_path).unwrap();
504 | assert_eq!(entries.len(), 1);
505 | assert_eq!(
506 | entries[0].id,
507 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S",
508 | );
509 | }
510 | }
511 |
--------------------------------------------------------------------------------
/src/dotbracket.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | cmp::Ordering,
3 | fmt::{self, Display},
4 | ops::{Not, Range},
5 | str::FromStr,
6 | };
7 |
8 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
9 | pub struct DotBracket<C> {
10 | paired_blocks: C,
11 | len: usize,
12 | }
13 |
14 | impl<C> DotBracket<C>
15 | where
16 | C: AsMut<Vec<PairedBlock>>,
17 | {
18 | #[inline]
19 | pub fn from_str(dot_bracket: &str, paired_blocks_buffer: C) -> Result<Self, InvalidDotBracket> {
20 | Self::from_str_with_buffer(dot_bracket, paired_blocks_buffer, &mut Vec::new())
21 | }
22 |
23 | #[inline]
24 | pub fn from_str_with_buffer(
25 | dot_bracket: &str,
26 | paired_blocks_buffer: C,
27 | working_buffer: &mut Vec,
28 | ) -> Result<Self, InvalidDotBracket> {
29 | Self::from_bytes_with_buffer(dot_bracket.as_bytes(), paired_blocks_buffer, working_buffer)
30 | }
31 |
32 | pub fn from_bytes_with_buffer(
33 | dot_bracket: &[u8],
34 | mut paired_blocks_buffer: C,
35 | working_buffer: &mut Vec,
36 | ) -> Result<Self, InvalidDotBracket> {
37 | let len = dot_bracket.len();
38 |
39 | let paired_blocks_buffer_ref = paired_blocks_buffer.as_mut();
40 | paired_blocks_buffer_ref.clear();
41 | working_buffer.clear();
42 | let state = dot_bracket
43 | .iter()
44 | .enumerate()
45 | .try_fold(None, |partial, (index, &c)| {
46 | try_fold_from_bytes(partial, index, c, paired_blocks_buffer_ref, working_buffer)
47 | })?;
48 |
49 | if working_buffer.is_empty().not() {
50 | return Err(InvalidDotBracket);
51 | }
52 |
53 | if let Some(state) = state {
54 | let PartialPairedBlockUnstored {
55 | left_start,
56 | other:
57 | Some(PartialPairedBlockOther {
58 | left_end,
59 | right_start,
60 | }),
61 | } = state
62 | else {
63 | return Err(InvalidDotBracket);
64 | };
65 |
66 | let left = left_start..left_end;
67 | let right = right_start..dot_bracket.len();
68 | if left.len() != right.len() {
69 | return Err(InvalidDotBracket);
70 | }
71 |
72 | paired_blocks_buffer_ref.push(PairedBlock { left, right });
73 | }
74 |
75 | Ok(DotBracket {
76 | paired_blocks: paired_blocks_buffer,
77 | len,
78 | })
79 | }
80 |
81 | #[inline]
82 | pub fn into_sorted(self) -> DotBracket<C> {
83 | let Self {
84 | mut paired_blocks,
85 | len,
86 | } = self;
87 | paired_blocks
88 | .as_mut()
89 | .sort_unstable_by_key(|block| block.left.start);
90 |
91 | DotBracket { paired_blocks, len }
92 | }
93 | }
94 |
95 | fn try_fold_from_bytes(
96 | partial: Option<PartialPairedBlockUnstored>,
97 | index: usize,
98 | c: u8,
99 | paired_blocks_buffer: &mut Vec<PairedBlock>,
100 | working_buffer: &mut Vec,
101 | ) -> Result