├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── build.rs
├── src
│   ├── aligner.rs
│   ├── cli.rs
│   ├── db_file.rs
│   ├── db_file
│   │   ├── native.rs
│   │   └── xml.rs
│   ├── dotbracket.rs
│   ├── fasta.rs
│   ├── gapped_data.rs
│   ├── gapped_reactivity.rs
│   ├── gapped_sequence.rs
│   ├── handle_query_entry.rs
│   ├── iter.rs
│   ├── main.rs
│   ├── mass.rs
│   ├── norm_dist.rs
│   ├── null_model.rs
│   ├── query_aligner.rs
│   ├── query_file.rs
│   ├── query_result.rs
│   ├── stockholm.rs
│   └── viennarna.rs
├── test_data
│   ├── query.txt
│   ├── query_align.txt
│   ├── query_empty_sequence.txt
│   ├── query_invalid_base.txt
│   ├── query_invalid_lengths.txt
│   ├── query_invalid_reactivity.txt
│   ├── query_truncated_reactivities.txt
│   ├── query_truncated_sequence.txt
│   ├── test.db
│   ├── test_db.xml
│   └── valid_query.txt
└── viennarna-mfe-sys
    ├── .gitignore
    ├── Cargo.toml
    ├── build.rs
    ├── src
    │   └── lib.rs
    └── wrapper.h
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "SHAPEwarp"
3 | version = "2.2.0"
4 | edition = "2021"
5 | license = "GPL-3.0-or-later"
6 |
7 | [dependencies]
8 | anyhow = "1.0.58"
9 | bitflags = "2.3.1"
10 | clap = { version = "4.3.0", features = ["derive"] }
11 | csv = "1.1.6"
12 | fftw = { version = "0.8.0", default-features = false, features = ["system"] }
13 | fnv = "1.0.7"
14 | itertools = "0.10.3"
15 | ndarray = "0.15.4"
16 | num-complex = "0.4.3"
17 | num-traits = "0.2.14"
18 | once_cell = "1.17.1"
19 | quick-xml = "0.31.0"
20 | rand = "0.8.5"
21 | rayon = "1.5.3"
22 | serde = { version = "1.0.139", features = ["derive", "rc"] }
23 | serde_json = "1.0.85"
24 | smallvec = "1.8.0"
25 | statrs = "0.16.0"
26 | tabled = "0.17.0"
27 | toml_edit = { version = "0.19.10", features = ["serde"] }
28 | viennarna-mfe-sys = { version = "0.1.0", path = "viennarna-mfe-sys" }
29 |
30 | [dev-dependencies]
31 | approx = { version = "0.5.1", features = ["num-complex"] }
32 | rand = { version = "0.8.5", features = ["small_rng"] }
33 | tempfile = "3.5.0"
34 |
35 | [profile.release-opt]
36 | inherits = "release"
37 | lto = true
38 | codegen-units = 1
39 |
40 | [build-dependencies]
41 | pkg-config = "0.3.27"
42 | semver = "1.0.18"
43 |
44 | [lints.rust]
45 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(vrna24)', 'cfg(vrna25)', 'cfg(vrna251)', 'cfg(vrna26)'] }
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | ## Introduction
4 |
5 | The model-guided search for structurally-homologous RNAs is a non-trivial task, as it largely depends on the quality of the inferred structure model. When it comes to inferring RNA structures from chemical probing data, the challenges are numerous. The use of different chemical probes, or of different approaches for incorporating experimental reactivities as pseudo-free energy contributions, can significantly affect the reliability of the inferred RNA structure model.
6 |
7 | __SHAPEwarp__ is a sequence-agnostic method for the identification of structurally-similar RNA elements in a database of chemical probing-derived reactivity profiles. The approach used by SHAPEwarp is inspired by the BLAST algorithm and builds on top of two widely used methods for similarity search in time series data: Mueen's Algorithm for Similarity Search ([MASS](https://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html)) and dynamic time warping (DTW).
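
To give a rough intuition for the time-series similarity step, the sketch below (illustrative only, not the actual SHAPEwarp implementation; the function name is made up) slides a query reactivity profile over a database profile and records the Euclidean distance of every window. MASS computes the z-normalized analogue of such a distance profile in `O(n log n)` via FFT-based correlations instead of this naive `O(n·m)` loop:

```rust
/// Naive distance profile: the Euclidean distance between `query` and every
/// same-length window of `db`. Purely illustrative; MASS obtains the
/// z-normalized version of this profile efficiently via FFT.
fn distance_profile(query: &[f64], db: &[f64]) -> Vec<f64> {
    db.windows(query.len())
        .map(|window| {
            window
                .iter()
                .zip(query)
                .map(|(d, q)| (d - q).powi(2))
                .sum::<f64>()
                .sqrt()
        })
        .collect()
}
```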
8 |
9 | For support requests, please post your questions to:
10 |
11 | For complete documentation, please refer to:
12 |
13 |
14 | ## Author(s)
15 |
16 | Edoardo Morandi (emorandi[at]rnaframework.com)
17 | Danny Incarnato (dincarnato[at]rnaframework.com)
18 |
19 |
20 | ## References
21 |
22 | Morandi *et al*., 2022. SHAPE-guided RNA structure homology search and motif discovery. Nature Communications (PMID: [35361788](https://pubmed.ncbi.nlm.nih.gov/35361788/))
23 |
24 | Scholten *et al*., 2024. SHAPEwarp-web: sequence-agnostic search for structurally homologous RNA regions across databases of chemical probing data. Nucleic Acids Research (PMID: [38709889](https://pubmed.ncbi.nlm.nih.gov/38709889/))
25 |
26 |
27 | ## License
28 |
29 | This program is free software, and can be redistributed and/or modified under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or any later version.
30 |
31 | Please see http://www.gnu.org/licenses/ for more information.
32 |
33 |
34 | ## Prerequisites
35 |
36 | - Linux system
37 | - Rust and Cargo (Installation instructions: )
38 | - [FFTW](http://fftw.org/) 3.x library.
- [ViennaRNA](https://www.tbi.univie.ac.at/RNA/) package (version 2.4.18 up to, but not including, 2.7), providing `RNAlib2`.
39 |
40 | ## Installation of FFTW
41 |
42 | This library is generally provided by package managers. Keep in mind that some distros split off the `-dev` package (which is needed to compile projects depending on the library) from the main one.
43 |
44 | ### Debian-based distros (e.g. Debian, Ubuntu)
45 |
46 | ```bash
47 | sudo apt install libfftw3-dev
48 | ```
49 |
50 | ### Red Hat-based distros (e.g. Fedora, CentOS, Alma Linux)
51 |
52 | ```bash
53 | sudo dnf install fftw-devel
54 | ```
55 |
56 | ### Arch-based distros (e.g. Arch, Manjaro)
57 |
58 | ```bash
59 | sudo pacman -S fftw
60 | ```
61 |
62 | ## Installation
63 |
64 | ```bash
65 | $ git clone https://github.com/dincarnato/SHAPEwarp
66 | $ cd SHAPEwarp
67 |
68 | # Add to PKG_CONFIG_PATH the path to the directory containing RNAlib2.pc from the ViennaRNA package
69 | $ export PKG_CONFIG_PATH=/path/to/dir/containing/RNAlib2.pc
70 |
71 | $ export RUSTFLAGS=-Ctarget-cpu=native
72 | $ cargo build --release
73 | ```
74 |
75 | The SHAPEwarp executable will be located under ``target/release/``.
76 |
77 |
78 | ### Note for Mac OS X users:
79 | To compile SHAPEwarp on Mac OS X, after having installed the ViennaRNA package, open the RNAlib2.pc file in a text editor and replace the ``-lstdc++`` flag with ``-lc++``.
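
For example, this replacement can be done in a single step (adjust the path to wherever your `RNAlib2.pc` lives):

```bash
sed -i '' 's/-lstdc++/-lc++/' /path/to/RNAlib2.pc
```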
80 |
81 |
82 | ## Testing the SHAPEwarp installation
83 |
84 | To test SHAPEwarp on a small test dataset, issue the following command from within the SHAPEwarp install directory:
85 |
86 | ```bash
87 | target/release/SHAPEwarp --query test_data/query.txt --database test_data/test.db --output test_out --ow
88 | ```
89 | The search will take less than 10 seconds, and the expected output should look like the following:
90 |
91 | ```bash
92 | query db_entry query_start query_end db_start db_end query_seed db_seed score pvalue evalue status
93 | 16S_750 16S_Bsubtilis 0 99 758 857 15-79 773-837 109.103 5.665e-8 1.003e-5 !
94 | ```
95 |
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | use semver::Version;
2 |
3 | fn main() {
4 | let vrna = pkg_config::Config::new()
5 | .range_version("2.4.18".."2.7")
6 | .cargo_metadata(false)
7 | .env_metadata(false)
8 | .print_system_libs(false)
9 | .print_system_cflags(false)
10 | .probe("RNAlib2")
11 | .unwrap();
12 |
13 | println!("cargo:rerun-if-changed=build.rs");
14 |
15 | let version: Version = vrna
16 | .version
17 | .parse()
18 | .expect("unable to parse ViennaRNA version");
19 |
20 | let version_cfg = format!("vrna{}{}", version.major, version.minor);
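// For example, linking against ViennaRNA 2.6.x emits `cargo:rustc-cfg=vrna26`, letting the
// crate gate version-specific code behind `#[cfg(vrna26)]`; the accepted cfg names are
// declared in Cargo.toml via `check-cfg` under `[lints.rust]`.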
21 | println!("cargo:rustc-cfg={version_cfg}");
22 |
23 | if version.major == 2 && version.minor == 5 {
24 | let version_cfg = format!("vrna{}{}{}", version.major, version.minor, version.patch);
25 | println!("cargo:rustc-cfg={version_cfg}");
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
1 | // We are defining CLI structs
2 | #![allow(clippy::struct_excessive_bools)]
3 |
4 | use clap::{Args, Parser, ValueEnum};
5 | use serde::Serialize;
6 | use std::{fmt, ops::Range, path::PathBuf, str::FromStr};
7 |
8 | use crate::{Distance, Reactivity};
9 |
10 | #[derive(Debug, Parser, Serialize)]
11 | #[clap(author, version, about, allow_negative_numbers = true)]
12 | #[serde(rename_all = "kebab-case")]
13 | /// SHAPE-guided RNA structural homology search
14 | pub struct Cli {
15 | /// Path to a database file, or to a (directory of) XML file(s)
16 | #[clap(long, visible_alias = "db")]
17 | #[serde(skip)]
18 | pub database: PathBuf,
19 |
20 | /// Path to a shuffled database file
21 | ///
22 | /// Uses a file containing the shuffled database instead of generating one on the fly.
23 | /// A shuffled database can be dumped to file using `--dump-shuffled-db`.
24 | #[clap(
25 | long,
26 | conflicts_with_all = &[
27 | "dump_shuffled_db",
28 | "db_shuffles",
29 | "db_block_size",
30 | "db_in_block_shuffle",
31 | ],
32 | )]
33 | #[serde(skip)]
34 | pub shuffled_db: Option<PathBuf>,
35 |
36 | /// Dumps the database to the specified file.
37 | ///
38 | /// Input is a (directory of) XML file(s).
39 | #[clap(long)]
40 | #[serde(skip)]
41 | pub dump_db: Option<PathBuf>,
42 |
43 | /// Dumps the shuffled database to the specified file.
44 | ///
45 | /// Shuffled databases can be imported using the `--shuffled-db` parameter.
46 | #[clap(long)]
47 | #[serde(skip)]
48 | pub dump_shuffled_db: Option<PathBuf>,
49 |
50 | /// Path to the query file
51 | ///
52 | /// Note: each entry should contain (one per row) the sequence id, the nucleotide sequence and
53 | /// a comma-separated list of SHAPE reactivities
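///
/// For example, a single entry (illustrative, made-up values):
///
/// ```text
/// query_01
/// TCGATCGTAG
/// 0.12,0.43,0.05,1.37,0.88,0.21,0.54,0.02,0.76,0.33
/// ```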
54 | #[clap(short, long)]
55 | #[serde(skip)]
56 | pub query: PathBuf,
57 |
58 | /// Output directory
59 | #[clap(short, long, default_value = "sw_out/")]
60 | pub output: PathBuf,
61 |
62 | /// Overwrites the output directory (if the specified path already exists)
63 | #[clap(long, visible_alias = "ow")]
64 | pub overwrite: bool,
65 |
66 | /// Number of processors to use
67 | ///
68 | /// Uses all available processors if not specified
69 | #[clap(long)]
70 | pub threads: Option,
71 |
72 | /// Number of shuffles to perform for each sequence in db
73 | ///
74 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of
75 | /// the sequences in the database.
76 | ///
77 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of
78 | /// shuffles is calculated as `max(1, 500000 / L)`.
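///
/// For example, a database totalling 50,000 nt defaults to `max(1, 500000 / 50000) = 10`
/// shuffles, while one totalling 1,000,000 nt defaults to a single shuffle.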
79 | #[clap(long, alias = "dbShuffles")]
80 | pub db_shuffles: Option,
81 |
82 | /// Size (in nt) of the blocks for shuffling the sequences in db
83 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)]
84 | pub db_block_size: u16,
85 |
86 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well
87 | #[clap(long, alias = "dbInBlockShuffle")]
88 | pub db_in_block_shuffle: bool,
89 |
90 | /// Maximum value to which reactivities will be capped
91 | #[clap(long, default_value_t = 1., alias = "maxReactivity")]
92 | pub max_reactivity: Reactivity,
93 |
94 | /// If two significant alignments overlap by more than this value, the least significant one
95 | /// (the one with the lowest alignment score) will be discarded
96 | #[clap(long, default_value_t = 0.5, alias = "maxAlignOverlap")]
97 | pub max_align_overlap: f32,
98 |
99 | /// Number of HSGs in the shuffled database to be extended to build the null model
100 | #[clap(long, default_value_t = 10_000, alias = "nullHSGs")]
101 | pub null_hsgs: u32,
102 |
103 | /// E-value threshold to consider an alignment significant
104 | #[clap(long, default_value_t = 0.01, aliases = &["inclusionEvalue", "incE"], visible_alias = "inc-e")]
105 | pub inclusion_evalue: f64,
106 |
107 | /// E-value threshold to report a match
108 | #[clap(long, default_value_t = 0.1, aliases = &["reportEvalue", "repE"], visible_alias = "rep-e")]
109 | pub report_evalue: f64,
110 |
111 | /// Reports sequence alignments in the specified format
112 | ///
113 | /// Note: alignments are reported only for matches below the inclusion E-value cutoff
114 | #[clap(long, alias = "reportAln", value_enum)]
115 | pub report_alignment: Option<ReportAlignment>,
116 |
117 | /// Reports the aligned reactivities for significant matches in the "reactivities/" subfolder of the output
118 | /// directory, in JSON format
119 | #[clap(long)]
120 | pub report_reactivity: bool,
121 |
122 | #[clap(flatten, next_help_heading = "Kmer lookup options")]
123 | #[serde(flatten)]
124 | pub kmer_lookup_args: KmerLookupArgs,
125 |
126 | #[clap(flatten, next_help_heading = "Alignment options")]
127 | #[serde(flatten)]
128 | pub alignment_args: AlignmentArgs,
129 |
130 | #[clap(flatten, next_help_heading = r#"Alignment folding evaluation options"#)]
131 | #[serde(flatten)]
132 | pub alignment_folding_eval_args: AlignmentFoldingEvaluationArgs,
133 | }
134 |
135 | #[derive(Debug, Args, Serialize)]
136 | #[serde(rename_all = "kebab-case")]
137 | pub struct KmerLookupArgs {
138 | /// Minimum number of kmers required to form a High Scoring Group (HSG)
139 | #[clap(long, default_value_t = 2, alias = "minKmers")]
140 | pub min_kmers: u16,
141 |
142 | /// Maximum distance between two kmers to be merged in a HSG
143 | #[clap(long, default_value_t = 30, alias = "maxKmerDist")]
144 | pub max_kmer_dist: u16,
145 |
146 | /// Length (in nt) of the kmers
147 | #[clap(long, default_value_t = 15, alias = "kmerLen")]
148 | pub kmer_len: u16,
149 |
150 | /// Sliding offset for extracting candidate kmers from the query
151 | #[clap(long, default_value_t = 1, alias = "kmerOffset")]
152 | pub kmer_offset: u16,
153 |
154 | /// The sequence of a query kmer and the corresponding database match must have GC% contents
155 | /// differing no more than --kmer-max-gc-diff
156 | #[clap(long, alias = "matchKmerGCcontent")]
157 | pub match_kmer_gc_content: bool,
158 |
159 | /// Maximum allowed GC% difference to retain a kmer match
160 | ///
161 | /// Note: the default value is automatically determined based on the chosen kmer length
162 | #[clap(long, requires = "match_kmer_gc_content", alias = "kmerMaxGCdiff")]
163 | pub kmer_max_gc_diff: Option<f32>,
164 |
165 | /// The sequence of a query kmer and the corresponding database match must differ no more than
166 | /// --kmer-max-seq-dist
167 | #[clap(long, alias = "matchKmerSeq")]
168 | pub match_kmer_seq: bool,
169 |
170 | /// Maximum allowed sequence distance to retain a kmer match
171 | ///
172 | /// Note: when >= 1, this is interpreted as the absolute number of bases that are allowed to
173 | /// differ between the kmer and the matching region. When < 1, this is interpreted as a
174 | /// fraction of the kmer's length
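///
/// For example, with the default kmer length of 15, a value of `3` allows up to 3 mismatching
/// bases, while `0.2` allows up to `15 * 0.2 = 3` mismatching bases.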
175 | #[clap(long, requires = "match_kmer_seq", alias = "kmerMaxSeqDist")]
176 | pub kmer_max_seq_dist: Option<Distance<f32>>,
177 |
178 | /// Minimum complexity (measured as Gini coefficient) of candidate kmers
179 | #[clap(long, default_value_t = 0.3, alias = "kmerMinComplexity")]
180 | pub kmer_min_complexity: f32,
181 |
182 | /// A kmer is allowed to match a database entry on average every this many nt
183 | #[clap(long, default_value_t = 200, alias = "kmerMaxMatchEveryNt")]
184 | pub kmer_max_match_every_nt: u32,
185 | }
186 |
187 | #[derive(Debug, Args, Serialize)]
188 | #[serde(rename_all = "kebab-case")]
189 | #[allow(clippy::struct_field_names)]
190 | pub struct AlignmentArgs {
191 | /// Minimum and maximum score reactivity differences below 0.5 will be mapped to
192 | #[clap(long, default_value_t = MinMax (-0.5..2.), alias = "alignMatchScore", allow_hyphen_values = true)]
193 | pub align_match_score: MinMax<f32>,
194 |
195 | /// Minimum and maximum score reactivity differences above 0.5 will be mapped to
196 | #[clap(long, default_value_t = MinMax (-6.0..-0.5), alias = "alignMismatchScore", allow_hyphen_values = true)]
197 | pub align_mismatch_score: MinMax<f32>,
198 |
199 | /// Gap open penalty
200 | #[clap(long, default_value_t = -14., alias = "alignGapOpenPenal")]
201 | pub align_gap_open_penalty: f32,
202 |
203 | /// Gap extension penalty
204 | #[clap(long, default_value_t = -5., alias = "alignGapExtPenal")]
205 | pub align_gap_ext_penalty: f32,
206 |
207 | /// An alignment is allowed to drop by maximum this fraction of the best score encountered so
208 | /// far, before extension is interrupted
209 | #[clap(long, default_value_t = 0.8, alias = "alignMaxDropOffRate")]
210 | pub align_max_drop_off_rate: f32,
211 |
212 | /// An alignment is allowed to drop below the best score encountered so far *
213 | /// --align-max-drop-off-rate by this number of bases, before extension is interrupted
214 | #[clap(long, default_value_t = 8, alias = "alignMaxDropOffBases")]
215 | pub align_max_drop_off_bases: u16,
216 |
217 | /// The maximum allowed tolerated length difference between the query and db sequences to look
218 | /// for the ideal alignment along the diagonal (measured as a fraction of the length of the
219 | /// shortest sequence among db and query)
220 | #[clap(long, default_value_t = 0.1, alias = "alignLenTolerance")]
221 | pub align_len_tolerance: f32,
222 |
223 | /// Sequence matches are rewarded during the alignment
224 | #[clap(long, alias = "alignScoreSeq")]
225 | pub align_score_seq: bool,
226 |
227 | /// Score reward for matching bases
228 | #[clap(
229 | long,
230 | default_value_t = 0.5,
231 | requires = "align_score_seq",
232 | alias = "alignSeqMatchScore"
233 | )]
234 | pub align_seq_match_score: f32,
235 |
236 | /// Score penalty for mismatching bases
237 | #[clap(
238 | long,
239 | default_value_t = -2.,
240 | requires = "align_score_seq",
241 | alias = "alignSeqMismatchScore"
242 | )]
243 | pub align_seq_mismatch_score: f32,
244 | }
245 |
246 | #[derive(Debug, Args, Serialize)]
247 | #[serde(rename_all = "kebab-case")]
248 | pub struct AlignmentFoldingEvaluationArgs {
249 | /// Alignments passing the --inclusion-evalue threshold are further evaluated for the presence
250 | /// of a conserved RNA structure by using `RNAalifold`
251 | #[clap(long, alias = "evalAlignFold")]
252 | pub eval_align_fold: bool,
253 |
254 | /// Number of shuffles to perform for each alignment during folding evaluation
255 | #[clap(long, default_value_t = 100)]
256 | pub shuffles: u16,
257 |
258 | /// Size (in nt) of the blocks for shuffling the alignment during folding evaluation
259 | #[clap(long, alias = "blockSize", default_value_t = 3)]
260 | pub block_size: u16,
261 |
262 | /// Besides shuffling blocks, residues within each block will be shuffled as well during
263 | /// folding evaluation
264 | #[clap(long, alias = "inBlockShuffle")]
265 | pub in_block_shuffle: bool,
266 |
267 | /// Minimum fraction of base-pairs of the RNAalifold-inferred structure that should be
268 | /// supported by both query and db sequence to retain a match
269 | #[clap(long, default_value_t = 0.75, alias = "minBpSupport")]
270 | pub min_bp_support: f32,
271 |
272 | /// Use RIBOSUM scoring matrix
273 | #[clap(long, alias = "ribosumScoring")]
274 | pub ribosum_scoring: bool,
275 |
276 | /// Slope for SHAPE reactivities conversion into pseudo-free energy contributions
277 | #[clap(long, default_value_t = 1.8, requires = "eval_align_fold")]
278 | pub slope: Reactivity,
279 |
280 | /// Intercept for SHAPE reactivities conversion into pseudo-free energy contributions
281 | #[clap(long, default_value_t = -0.6, requires = "eval_align_fold")]
282 | pub intercept: Reactivity,
283 |
284 | /// Maximum allowed base-pairing distance
285 | #[clap(
286 | long,
287 | default_value_t = 600,
288 | alias = "maxBPspan",
289 | requires = "eval_align_fold"
290 | )]
291 | pub max_bp_span: u32,
292 |
293 | /// Disallows lonely pairs (helices of 1 bp)
294 | #[clap(long, alias = "noLonelyPairs", requires = "eval_align_fold")]
295 | pub no_lonely_pairs: bool,
296 |
297 | /// Disallows G:U wobbles at the end of helices
298 | #[clap(long, alias = "noClosingGU", requires = "eval_align_fold")]
299 | pub no_closing_gu: bool,
300 |
301 | /// Folding temperature
302 | #[clap(long, default_value_t = 37., requires = "eval_align_fold")]
303 | pub temperature: f32,
304 | }
305 |
306 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
307 | pub struct MinMax<T>(pub Range<T>);
308 |
309 | impl<T> fmt::Display for MinMax<T>
310 | where
311 | T: fmt::Display,
312 | {
313 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
314 | write!(f, "{},{}", self.0.start, self.0.end)
315 | }
316 | }
317 |
318 | impl<T> Serialize for MinMax<T>
319 | where
320 | T: fmt::Display,
321 | {
322 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
323 | where
324 | S: serde::Serializer,
325 | {
326 | serializer.collect_str(self)
327 | }
328 | }
329 |
330 | #[derive(Debug, Clone, PartialEq, Eq)]
331 | pub enum ParseMinMaxError<T> {
332 | InvalidFormat,
333 | InnerError { index: u8, error: T },
334 | }
335 |
336 | impl<T> FromStr for MinMax<T>
337 | where
338 | T: FromStr,
339 | {
340 | type Err = ParseMinMaxError<T::Err>;
341 |
342 | fn from_str(s: &str) -> Result<Self, Self::Err> {
343 | let (start, end) = s.split_once(',').ok_or(ParseMinMaxError::InvalidFormat)?;
344 |
345 | let start = start
346 | .parse()
347 | .map_err(|error| ParseMinMaxError::InnerError { index: 0, error })?;
348 |
349 | let end = end
350 | .parse()
351 | .map_err(|error| ParseMinMaxError::InnerError { index: 1, error })?;
352 |
353 | Ok(Self(start..end))
354 | }
355 | }
356 |
357 | impl<T> fmt::Display for ParseMinMaxError<T>
358 | where
359 | T: fmt::Display,
360 | {
361 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
362 | match self {
363 | Self::InvalidFormat => {
364 | f.write_str("invalid min-max format, two comma-separated values expected")
365 | }
366 | Self::InnerError { index, error } => {
367 | let part = match index {
368 | 0 => "min",
369 | 1 => "max",
370 | _ => unreachable!(),
371 | };
372 | write!(f, "{part} part of min-max format is invalid: {error}")
373 | }
374 | }
375 | }
376 | }
377 |
378 | impl<T> std::error::Error for ParseMinMaxError<T> where T: std::error::Error {}
379 |
380 | #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize)]
381 | pub enum ReportAlignment {
382 | #[clap(alias = "f")]
383 | Fasta,
384 |
385 | #[clap(alias = "s")]
386 | Stockholm,
387 | }
388 |
389 | impl Cli {
390 | #[cfg(test)]
391 | pub(crate) fn dummy() -> Self {
392 | Self::parse_from(["test", "--database", "test", "--query", "test"])
393 | }
394 | }
395 |
396 | /// Hidden dumper for XML files.
397 | ///
398 | /// Reads an XML file, or the XML files from a directory, and dumps the content to the native DB format.
399 | #[derive(Debug, Parser)]
400 | pub struct Alternative {
401 | /// Path to a database file, or to a (directory of) XML file(s)
402 | #[clap(long, visible_alias = "db")]
403 | pub database: PathBuf,
404 |
405 | /// Dumps the database to the specified file.
406 | ///
407 | /// Input is a (directory of) XML file(s).
408 | #[clap(long)]
409 | pub dump_db: PathBuf,
410 |
411 | /// Dumps the shuffled database to the specified file.
412 | ///
413 | /// Shuffled databases can be imported using the `--shuffled-db` parameter.
414 | #[clap(long)]
415 | pub dump_shuffled_db: Option<PathBuf>,
416 |
417 | /// Number of processors to use
418 | ///
419 | /// Uses all available processors if not specified
420 | #[clap(long)]
421 | pub threads: Option,
422 |
423 | /// Number of shuffles to perform for each sequence in db
424 | ///
425 | /// In case the parameter is unspecified, it is automatically evaluated based on the length of
426 | /// the sequences in the database.
427 | ///
428 | /// Given `L` as the sum of the lengths of each sequence in the database, the number of
429 | /// shuffles is calculated as `max(1, 500000 / L)`.
430 | #[clap(long, alias = "dbShuffles")]
431 | pub db_shuffles: Option,
432 |
433 | /// Size (in nt) of the blocks for shuffling the sequences in db
434 | #[clap(long, alias = "dbBlockSize", default_value_t = 10)]
435 | pub db_block_size: u16,
436 |
437 | /// Besides shuffling blocks, residues within each block in db will be shuffled as well
438 | #[clap(long, alias = "dbInBlockShuffle")]
439 | pub db_in_block_shuffle: bool,
440 | }
441 |
--------------------------------------------------------------------------------
/src/db_file.rs:
--------------------------------------------------------------------------------
1 | pub mod native;
2 | mod xml;
3 |
4 | use std::{
5 | convert::TryInto,
6 | error::Error as StdError,
7 | ffi::OsString,
8 | fmt::{self, Display},
9 | io,
10 | path::Path,
11 | ptr,
12 | string::FromUtf8Error,
13 | };
14 |
15 | use serde::{Serialize, Serializer};
16 |
17 | use crate::{Base, Molecule, Reactivity, SequenceEntry};
18 |
19 | #[derive(Debug, Clone, PartialEq)]
20 | pub struct Entry {
21 | pub id: String,
22 | pub(crate) sequence: Vec<Base>,
23 | pub reactivity: Vec<ReactivityWithPlaceholder>,
24 | }
25 |
26 | const NAN_PLACEHOLDER: Reactivity = -999.;
27 |
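// Reactivities stored as `-999.` in db files act as placeholders for missing values; the
// wrapper below treats that placeholder (as well as an actual NaN) as NaN.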
28 | #[derive(Debug, Clone, Copy)]
29 | #[repr(transparent)]
30 | pub struct ReactivityWithPlaceholder(Reactivity);
31 |
32 | impl ReactivityWithPlaceholder {
33 | pub fn is_nan(self) -> bool {
34 | self.0.is_nan() | (self.0 == NAN_PLACEHOLDER)
35 | }
36 |
37 | pub fn get_non_nan(self) -> Option<Reactivity> {
38 | if self.is_nan() {
39 | None
40 | } else {
41 | Some(self.0)
42 | }
43 | }
44 |
45 | pub fn to_maybe_placeholder(self) -> Reactivity {
46 | if self.0.is_nan() {
47 | NAN_PLACEHOLDER
48 | } else {
49 | self.0
50 | }
51 | }
52 |
53 | pub fn as_inner_slice(this: &[ReactivityWithPlaceholder]) -> &[Reactivity] {
54 | // Safety:
55 | // - `ReactivityWithPlaceholder` is transparent and it contains only a `Reactivity`
56 | // - lifetime is maintained
57 | unsafe { &*(ptr::from_ref(this) as *const [Reactivity]) }
58 | }
59 |
60 | pub fn inner(self) -> Reactivity {
61 | self.0
62 | }
63 |
64 | #[inline]
65 | #[must_use]
66 | pub const fn nan_placeholder() -> Self {
67 | Self(NAN_PLACEHOLDER)
68 | }
69 | }
70 |
71 | impl PartialEq for ReactivityWithPlaceholder {
72 | fn eq(&self, other: &Self) -> bool {
73 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) {
74 | false
75 | } else {
76 | self.0 == other.0
77 | }
78 | }
79 | }
80 |
81 | impl PartialEq<Reactivity> for ReactivityWithPlaceholder {
82 | fn eq(&self, other: &Reactivity) -> bool {
83 | if self.0 == NAN_PLACEHOLDER {
84 | false
85 | } else {
86 | self.0 == *other
87 | }
88 | }
89 | }
90 |
91 | impl PartialOrd for ReactivityWithPlaceholder {
92 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
93 | if (self.0 == NAN_PLACEHOLDER) | (other.0 == NAN_PLACEHOLDER) {
94 | None
95 | } else {
96 | self.0.partial_cmp(&other.0)
97 | }
98 | }
99 | }
100 |
101 | impl PartialOrd<Reactivity> for ReactivityWithPlaceholder {
102 | fn partial_cmp(&self, other: &Reactivity) -> Option<std::cmp::Ordering> {
103 | if self.0 == NAN_PLACEHOLDER {
104 | None
105 | } else {
106 | self.0.partial_cmp(other)
107 | }
108 | }
109 | }
110 |
111 | impl From<Reactivity> for ReactivityWithPlaceholder {
112 | fn from(reactivity: Reactivity) -> Self {
113 | Self(reactivity)
114 | }
115 | }
116 |
117 | impl Serialize for ReactivityWithPlaceholder {
118 | #[inline]
119 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
120 | where
121 | S: Serializer,
122 | {
123 | self.get_non_nan()
124 | .unwrap_or(Reactivity::NAN)
125 | .serialize(serializer)
126 | }
127 | }
128 |
129 | pub trait ReactivityLike: Copy + PartialOrd + PartialEq {
130 | fn is_nan(self) -> bool;
131 | fn value(self) -> Reactivity;
132 | }
133 |
134 | impl ReactivityLike for Reactivity {
135 | #[inline]
136 | fn is_nan(self) -> bool {
137 | Reactivity::is_nan(self)
138 | }
139 |
140 | #[inline]
141 | fn value(self) -> Reactivity {
142 | self
143 | }
144 | }
145 |
146 | impl ReactivityLike for ReactivityWithPlaceholder {
147 | #[inline]
148 | fn is_nan(self) -> bool {
149 | ReactivityWithPlaceholder::is_nan(self)
150 | }
151 |
152 | #[inline]
153 | fn value(self) -> Reactivity {
154 | self.to_maybe_placeholder()
155 | }
156 | }
157 |
158 | impl Entry {
159 | pub fn cap_reactivities(&mut self, max_reactivity: Reactivity) {
160 | self.reactivity.iter_mut().for_each(|reactivity| {
161 | if let Some(r) = reactivity.get_non_nan() {
162 | *reactivity = r.min(max_reactivity).into();
163 | }
164 | });
165 | }
166 | }
167 |
168 | impl SequenceEntry for Entry {
169 | type Reactivity = ReactivityWithPlaceholder;
170 |
171 | fn name(&self) -> &str {
172 | &self.id
173 | }
174 |
175 | fn sequence(&self) -> &[Base] {
176 | &self.sequence
177 | }
178 |
179 | fn reactivity(&self) -> &[Self::Reactivity] {
180 | &self.reactivity
181 | }
182 |
183 | fn molecule(&self) -> crate::Molecule {
184 | Molecule::Dna
185 | }
186 | }
187 |
188 | #[derive(Debug)]
189 | pub enum ReaderError {
190 | TooSmall,
191 | InvalidMarker,
192 | }
193 |
194 | impl Display for ReaderError {
195 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196 | let s = match self {
197 | ReaderError::TooSmall => "DB file is too small",
198 | ReaderError::InvalidMarker => "DB file contains an invalid EOF marker",
199 | };
200 |
201 | f.write_str(s)
202 | }
203 | }
204 |
205 | impl StdError for ReaderError {}
206 |
207 | #[derive(Debug)]
208 | pub enum EntryError {
209 | InvalidSequenceId(FromUtf8Error),
210 | InvalidBase,
211 | UnexpectedEof,
212 | SurpassedEofMarker,
213 | }
214 |
215 | impl Display for EntryError {
216 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
217 | let s = match self {
218 | EntryError::InvalidSequenceId(_) => "Invalid sequence ID string",
219 | EntryError::InvalidBase => "Invalid encoded nucleobase",
220 | EntryError::UnexpectedEof => "Unexpected end of file",
221 | EntryError::SurpassedEofMarker => "End of file marker has been surpassed",
222 | };
223 |
224 | f.write_str(s)
225 | }
226 | }
227 |
228 | impl StdError for EntryError {
229 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
230 | match self {
231 | EntryError::InvalidSequenceId(source) => Some(source),
232 | EntryError::InvalidBase
233 | | EntryError::UnexpectedEof
234 | | EntryError::SurpassedEofMarker => None,
235 | }
236 | }
237 | }
238 |
239 | pub fn read_db(path: &Path) -> Result<Vec<Entry>, Error> {
240 | if path.is_dir() {
241 | xml::read_directory(path).map_err(Error::Directory)
242 | } else {
243 | let extension = path.extension().ok_or(Error::NoExtension)?;
244 | if extension.eq_ignore_ascii_case("db") {
245 | native::read_file(path).map_err(Error::Native)
246 | } else if extension.eq_ignore_ascii_case("xml") {
247 | let entry = xml::read_file(path).map_err(Error::Xml)?;
248 | Ok(vec![entry])
249 | } else {
250 | Err(Error::InvalidExtension(extension.to_os_string()))
251 | }
252 | }
253 | }
254 |
255 | #[derive(Debug)]
256 | pub enum Error {
257 | NoExtension,
258 | InvalidExtension(OsString),
259 | Native(native::Error),
260 | Xml(xml::ReadFileError),
261 | Directory(xml::ReadDirectoryError),
262 | }
263 |
264 | impl Display for Error {
265 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
266 | match self {
267 | Error::NoExtension => f.write_str("db file does not have an extension"),
268 | Error::InvalidExtension(extension) => {
269 | write!(
270 | f,
271 | "extension \"{}\" is not valid for a db",
272 | extension.to_string_lossy()
273 | )
274 | }
275 | Error::Native(_) => f.write_str("cannot read native db file"),
276 | Error::Xml(_) => f.write_str("cannot read xml db file"),
277 | Error::Directory(_) => f.write_str("cannot read xml entries from a directory"),
278 | }
279 | }
280 | }
281 |
282 | impl StdError for Error {
283 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
284 | match self {
285 | Error::NoExtension | Error::InvalidExtension(_) => None,
286 | Error::Native(source) => Some(source),
287 | Error::Xml(source) => Some(source),
288 | Error::Directory(source) => Some(source),
289 | }
290 | }
291 | }
292 |
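// Summary of the native db layout produced below (all integers little-endian). Per entry:
// a u32 id length (including the trailing NUL), the id bytes plus a 0x00 byte, a u32 sequence
// length in bases, the bases packed two per byte via `Base::pair_to_nibble` (odd lengths are
// padded with an extra `A` nibble), and one f64 per reactivity. After all entries comes a
// trailer: a u64 length field, the u16 `native::VERSION`, and `native::END_MARKER`.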
293 | pub fn write_entries<W: io::Write>(entries: &[Entry], mut writer: W) -> io::Result<()> {
294 | entries.iter().try_for_each(|entry| {
295 | let name = entry.name();
296 | let sequence = entry.sequence();
297 | let name_len_buf = u32::try_from(name.len().checked_add(1).unwrap())
298 | .unwrap()
299 | .to_le_bytes();
300 | let seq_len_buf = u32::try_from(sequence.len()).unwrap().to_le_bytes();
301 |
302 | writer.write_all(name_len_buf.as_slice())?;
303 | writer.write_all(name.as_bytes())?;
304 | writer.write_all(&[0])?;
305 | writer.write_all(seq_len_buf.as_slice())?;
306 | sequence.chunks_exact(2).try_for_each(|pair| {
307 | writer.write_all(&[Base::pair_to_nibble(pair.try_into().unwrap())])
308 | })?;
309 | if let Some(base) = sequence.chunks_exact(2).remainder().first().copied() {
310 | writer.write_all(&[Base::pair_to_nibble([base, Base::A])])?;
311 | }
312 |
313 | entry.reactivity().iter().try_for_each(|reactivity| {
314 | let reactivity = f64::from(reactivity.inner()).to_le_bytes();
315 | writer.write_all(reactivity.as_slice())
316 | })?;
317 |
318 | Ok::<_, io::Error>(())
319 | })?;
320 |
321 | let n_entries = u64::try_from(entries.len()).unwrap().to_le_bytes();
322 | writer.write_all(n_entries.as_slice())?;
323 | writer.write_all(native::VERSION.to_le_bytes().as_slice())?;
324 | writer.write_all(native::END_MARKER)?;
325 | writer.flush()?;
326 |
327 | Ok(())
328 | }
329 |
--------------------------------------------------------------------------------
/src/db_file/native.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | convert::TryInto,
3 | error::Error as StdError,
4 | fmt::{self, Display},
5 | fs::File,
6 | io::{self, BufReader, Read, Seek, SeekFrom},
7 | path::Path,
8 | string::FromUtf8Error,
9 | };
10 |
11 | use itertools::Itertools;
12 |
13 | use crate::{db_file::ReactivityWithPlaceholder, Base, InvalidBasePair, Reactivity};
14 |
15 | use super::Entry;
16 |
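// The final END_SIZE (17) bytes of a native db form a fixed trailer: bytes 0..8 hold a u64
// (little-endian) length field, bytes 8..10 a u16 (little-endian) format version, and
// bytes 10..17 the 7-byte END_MARKER `[eofdb]`.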
17 | pub(super) const END_SIZE: u8 = 17;
18 | pub(super) const END_MARKER: &[u8] = b"[eofdb]";
19 | pub(super) const VERSION: u16 = 1;
20 |
21 | #[derive(Debug)]
22 | pub struct Reader<R> {
23 | inner: R,
24 | _db_len: u64,
25 | _version: u16,
26 | end_offset: u64,
27 | }
28 |
29 | impl<R> Reader<R>
30 | where
31 | R: Read + Seek,
32 | {
33 | pub fn new(mut reader: R) -> Result<Self, NewReaderError> {
34 | use NewReaderError as E;
35 |
36 | let end_offset = reader
37 | .seek(SeekFrom::End(-i64::from(END_SIZE)))
38 | .map_err(E::SeekToMetadata)?;
39 | let mut end_buf = [0; END_SIZE as usize];
40 | reader.read_exact(&mut end_buf).map_err(E::ReadMetadata)?;
41 |
42 | if &end_buf[10..17] != END_MARKER {
43 | return Err(E::InvalidMarker);
44 | }
45 |
46 | let db_len = u64::from_le_bytes(end_buf[0..8].try_into().unwrap());
47 | let version = u16::from_le_bytes(end_buf[8..10].try_into().unwrap());
48 | Ok(Self {
49 | inner: reader,
50 | _db_len: db_len,
51 | _version: version,
52 | end_offset,
53 | })
54 | }
55 |
56 | pub fn entries(&mut self) -> EntryIter<R> {
57 | let &mut Self {
58 | ref mut inner,
59 | end_offset,
60 | ..
61 | } = self;
62 |
63 | EntryIter {
64 | reader: inner,
65 | end_offset,
66 | offset: 0,
67 | }
68 | }
69 | }
70 |
71 | #[derive(Debug)]
72 | pub enum NewReaderError {
73 | SeekToMetadata(io::Error),
74 | ReadMetadata(io::Error),
75 | InvalidMarker,
76 | }
77 |
78 | impl Display for NewReaderError {
79 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 | let s = match self {
81 | NewReaderError::SeekToMetadata(_) => "unable to seek to metadata",
82 | NewReaderError::ReadMetadata(_) => "unable to read metadata",
83 | NewReaderError::InvalidMarker => "invalid metadata marker",
84 | };
85 |
86 | f.write_str(s)
87 | }
88 | }
89 |
90 | impl StdError for NewReaderError {
91 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
92 | match self {
93 | NewReaderError::SeekToMetadata(source) | NewReaderError::ReadMetadata(source) => {
94 | Some(source)
95 | }
96 | NewReaderError::InvalidMarker => None,
97 | }
98 | }
99 | }
100 |
101 | #[derive(Debug)]
102 | pub struct EntryIter<'a, R> {
103 | reader: &'a mut R,
104 | end_offset: u64,
105 | offset: u64,
106 | }
107 |
108 | impl<R> Iterator for EntryIter<'_, R>
109 | where
110 | R: Seek + Read,
111 | {
112 | type Item = Result<Entry, NextEntryError>;
113 |
114 | fn next(&mut self) -> Option<Self::Item> {
115 | (self.offset != self.end_offset).then(|| self.next_entry())
116 | }
117 | }
118 |
119 | impl<R> EntryIter<'_, R>
120 | where
121 | R: Seek + Read,
122 | {
123 | fn next_entry(&mut self) -> Result<Entry, NextEntryError> {
124 | use NextEntryError as E;
125 |
126 | if self.offset == 0 {
127 | self.reader.seek(SeekFrom::Start(0)).map_err(E::SeekStart)?;
128 | }
129 |
130 | let mut id_len_with_nul_buf = [0; 4];
131 | self.reader
132 | .read_exact(&mut id_len_with_nul_buf)
133 | .map_err(E::ReadIdLen)?;
134 | let id_len_with_nul: usize = u32::from_le_bytes(id_len_with_nul_buf)
135 | .try_into()
136 | .expect("cannot represent id length as usize for the current architecture");
137 | let mut sequence_id = vec![0; id_len_with_nul];
138 | self.reader
139 | .read_exact(&mut sequence_id)
140 | .map_err(E::ReadSequenceId)?;
141 | if sequence_id.pop().filter(|&b| b == 0).is_none() {
142 | return Err(E::MissingSequenceIdNul);
143 | }
144 | let sequence_id =
145 | String::from_utf8(sequence_id).map_err(NextEntryError::InvalidSequenceId)?;
146 | let mut sequence_len_buf = [0; 4];
147 | self.reader
148 | .read_exact(&mut sequence_len_buf)
149 | .map_err(E::ReadSequenceLen)?;
150 | let sequence_len: usize = u32::from_le_bytes(sequence_len_buf)
151 | .try_into()
152 | .expect("cannot represent sequence length as usize for the current architecture");
153 |
154 | let sequence_bytes = sequence_len / 2 + sequence_len % 2;
155 | let mut sequence = self
156 | .reader
157 | .bytes()
158 | .take(sequence_bytes)
159 | .map(|result| {
160 | result.map_err(E::ReadSequence).and_then(|byte| {
161 | Base::try_pair_from_byte(byte)
162 | .map(|[first, second]| [first, second])
163 | .map_err(E::InvalidEncodedBase)
164 | })
165 | })
166 | .flatten_ok()
167 | .collect::<Result<Vec<_>, _>>()?;
168 |
169 | if sequence_len > 0 && sequence_len % 2 == 1 {
170 | sequence.pop().unwrap();
171 | }
172 |
173 | if sequence.len() != sequence_len {
174 | return Err(E::UnexpectedEof);
175 | }
176 |
177 | let reactivity = (0..sequence_len)
178 | .map(|_| {
179 | let mut reactivity_buffer = [0; 8];
180 | self.reader
181 | .read_exact(&mut reactivity_buffer)
182 | .map(|()| reactivity_buffer)
183 | .map_err(E::ReadReactivity)
184 | })
185 | // Reactivity is an alias to either f32 or f64
186 | .map_ok(|bytes| {
187 | // We internally use a fixed type that can be f32, there is no need to necessarily
188 | // have 64 bits of precision
189 | #[allow(clippy::cast_possible_truncation)]
190 | let reactivity = f64::from_le_bytes(bytes) as Reactivity;
191 | ReactivityWithPlaceholder::from(reactivity)
192 | })
193 | .collect::<Result<Vec<_>, _>>()?;
194 |
195 | if reactivity.len() != sequence_len {
196 | return Err(E::UnexpectedEof);
197 | }
198 |
199 | let offset = self.reader.stream_position().map_err(E::StreamPosition)?;
200 | if offset > self.end_offset {
201 | return Err(E::SurpassedEofMarker);
202 | }
203 | self.offset = offset;
204 |
205 | Ok(Entry {
206 | id: sequence_id,
207 | sequence,
208 | reactivity,
209 | })
210 | }
211 | }
212 |
213 | #[derive(Debug)]
214 | pub enum NextEntryError {
215 | SeekStart(io::Error),
216 | ReadIdLen(io::Error),
217 | ReadSequenceId(io::Error),
218 | MissingSequenceIdNul,
219 | InvalidSequenceId(FromUtf8Error),
220 | ReadSequenceLen(io::Error),
221 | ReadSequence(io::Error),
222 | InvalidEncodedBase(InvalidBasePair),
223 | ReadReactivity(io::Error),
224 | UnexpectedEof,
225 | SurpassedEofMarker,
226 | StreamPosition(io::Error),
227 | }
228 |
229 | impl Display for NextEntryError {
230 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
231 | let s = match self {
232 | NextEntryError::SeekStart(_) => "unable to seek to the start of the file",
233 | NextEntryError::ReadIdLen(_) => "unable to read the length of the sequence id",
234 | NextEntryError::ReadSequenceId(_) => "unable to read sequence id",
235 | NextEntryError::MissingSequenceIdNul => {
236 | "sequence id does not have a nul termination character"
237 | }
238 | NextEntryError::InvalidSequenceId(_) => "sequence id is not valid",
239 | NextEntryError::ReadSequenceLen(_) => "unable to read sequence length",
240 | NextEntryError::ReadSequence(_) => "unable to read sequence content",
241 | NextEntryError::InvalidEncodedBase(_) => "invalid encoded base",
242 | NextEntryError::ReadReactivity(_) => "unable to read rectivity",
243 | NextEntryError::UnexpectedEof => "unexpected end of file",
244 | NextEntryError::SurpassedEofMarker => "end of file marker is being surpassed",
245 | NextEntryError::StreamPosition(_) => "unable to get stream position",
246 | };
247 |
248 | f.write_str(s)
249 | }
250 | }
251 |
252 | impl StdError for NextEntryError {
253 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
254 | match self {
255 | NextEntryError::SeekStart(source)
256 | | NextEntryError::ReadIdLen(source)
257 | | NextEntryError::ReadSequenceId(source)
258 | | NextEntryError::ReadSequenceLen(source)
259 | | NextEntryError::ReadSequence(source)
260 | | NextEntryError::ReadReactivity(source)
261 | | NextEntryError::StreamPosition(source) => Some(source),
262 | NextEntryError::MissingSequenceIdNul
263 | | NextEntryError::UnexpectedEof
264 | | NextEntryError::SurpassedEofMarker => None,
265 | NextEntryError::InvalidSequenceId(source) => Some(source),
266 | NextEntryError::InvalidEncodedBase(source) => Some(source),
267 | }
268 | }
269 | }
270 |
271 | #[derive(Debug)]
272 | pub enum Error {
273 | OpenFile(io::Error),
274 | NewReader(NewReaderError),
275 | Entry(NextEntryError),
276 | }
277 |
278 | impl Display for Error {
279 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280 | let s = match self {
281 | Error::OpenFile(_) => "unable to open file",
282 | Error::NewReader(_) => "unable to create new reader",
283 | Error::Entry(_) => "unable to get the next entry",
284 | };
285 |
286 | f.write_str(s)
287 | }
288 | }
289 |
290 | impl StdError for Error {
291 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
292 | match self {
293 | Error::OpenFile(source) => Some(source),
294 | Error::NewReader(source) => Some(source),
295 | Error::Entry(source) => Some(source),
296 | }
297 | }
298 | }
299 |
300 | pub fn read_file(path: &Path) -> Result<Vec<Entry>, Error> {
301 | let file = File::open(path).map_err(Error::OpenFile)?;
302 | let mut reader = Reader::new(BufReader::new(file)).map_err(Error::NewReader)?;
303 | let entries = reader
304 | .entries()
305 | .collect::<Result<_, _>>()
306 | .map_err(Error::Entry)?;
307 | Ok(entries)
308 | }
309 |
310 | #[cfg(test)]
311 | mod tests {
312 | use std::io::Cursor;
313 |
314 | use super::*;
315 |
316 | const TEST_DB: &[u8] = include_bytes!("../../test_data/test.db");
317 |
318 | #[test]
319 | fn valid_reader() {
320 | let reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
321 | #[allow(clippy::used_underscore_binding)]
322 | let len = reader._db_len;
323 |
324 | #[allow(clippy::used_underscore_binding)]
325 | let version = reader._version;
326 |
327 | assert_eq!(len, 0x1181);
328 | assert_eq!(version, 1);
329 | }
330 |
331 | #[test]
332 | fn read_all_db() {
333 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
334 | let db_len = reader
335 | .entries()
336 | .map_ok(|entry| entry.sequence.len())
337 | .try_fold(0, |acc, seq_len| seq_len.map(|seq_len| acc + seq_len))
338 | .unwrap();
339 |
340 | #[allow(clippy::used_underscore_binding)]
341 | let reader_len = usize::try_from(reader._db_len).unwrap();
342 | assert_eq!(db_len, reader_len);
343 | }
344 |
345 | #[test]
346 | fn transform_pseudo_nans() {
347 | let mut reader = Reader::new(Cursor::new(TEST_DB)).unwrap();
348 | let entry = reader.entries().next().unwrap().unwrap();
349 |
350 | // The first 13 reactivities are -999 in the file
351 | assert!(entry.reactivity[..13]
352 | .iter()
353 | .copied()
354 | .all(ReactivityWithPlaceholder::is_nan));
355 | }
356 | }
357 |
--------------------------------------------------------------------------------
/src/db_file/xml.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | borrow::Cow,
3 | error::Error as StdError,
4 | fmt::{self, Display},
5 | fs::File,
6 | io::{self, BufReader},
7 | num::ParseFloatError,
8 | ops::Not,
9 | path::Path,
10 | str::Utf8Error,
11 | };
12 |
13 | use quick_xml::{
14 | events::{BytesEnd, BytesStart, BytesText},
15 | Reader,
16 | };
17 | use rayon::iter::{ParallelBridge, ParallelIterator};
18 |
19 | use crate::{Base, InvalidBase, Reactivity};
20 |
21 | use super::{Entry, ReactivityWithPlaceholder};
22 |
23 | pub fn read_file(path: &Path) -> Result<Entry, ReadFileError> {
24 | use quick_xml::events::Event;
25 | use ReadFileError as E;
26 |
27 | let mut reader = Reader::from_file(path).map_err(E::ReaderFromFile)?;
28 | let mut buffer = Vec::new();
29 | let mut state = XmlState::default();
30 |
31 | let mut id = None;
32 | let mut sequence = None;
33 | let mut reactivity = None;
34 |
35 | loop {
36 | let event = reader
37 | .read_event_into(&mut buffer)
38 | .map_err(|source| E::ReadEvent {
39 | buffer_position: reader.buffer_position(),
40 | source,
41 | })?;
42 |
43 | match event {
44 | Event::Start(start) => {
45 | state = handle_start_event(&start, state, &mut id)?;
46 | }
47 |
48 | Event::End(end) => {
49 | state = handle_end_event(&end, state)?;
50 | }
51 |
52 | Event::Empty(tag) => return Err(E::UnexpectedEmptyTag(tag.name().as_ref().to_owned())),
53 | Event::Text(text) => {
54 | handle_text_event(&text, &state, &mut sequence, &mut reactivity, &reader)?;
55 | }
56 |
57 | Event::CData(_)
58 | | Event::Comment(_)
59 | | Event::Decl(_)
60 | | Event::PI(_)
61 | | Event::DocType(_) => {}
62 |
63 | Event::Eof => break,
64 | }
65 | }
66 |
67 | let id = id.ok_or(E::MissingTranscript)?;
68 | let sequence = sequence.ok_or(E::MissingSequence)?;
69 | let reactivity = reactivity.ok_or(E::MissingReactivity)?;
70 |
71 | if sequence.len() != reactivity.len() {
72 | return Err(E::InconsistentLength {
73 | sequence: sequence.len(),
74 | reactivity: reactivity.len(),
75 | });
76 | }
77 |
78 | Ok(Entry {
79 | id,
80 | sequence,
81 | reactivity,
82 | })
83 | }
84 |
85 | fn handle_start_event(
86 | start: &BytesStart<'_>,
87 | state: XmlState,
88 | id: &mut Option<String>,
89 | ) -> Result<XmlState, ReadFileError> {
90 | use ReadFileError as E;
91 |
92 | match (start.name().as_ref(), state) {
93 | (b"data", XmlState::Start) => Ok(XmlState::Data),
94 | (b"meta-data", XmlState::Data) => Ok(XmlState::MetaData),
95 | (b"organism", XmlState::MetaData) => Ok(XmlState::Organism),
96 | (b"probe", XmlState::MetaData) => Ok(XmlState::Probe),
97 | (b"source", XmlState::MetaData) => Ok(XmlState::Source),
98 | (b"citation", XmlState::Source) => Ok(XmlState::Citation),
99 | (b"pmid", XmlState::Source) => Ok(XmlState::Pmid),
100 | (b"replicate", XmlState::MetaData) => Ok(XmlState::Replicate),
101 | (b"condition", XmlState::MetaData) => Ok(XmlState::Condition),
102 | (b"transcript", XmlState::Data) => {
103 | if id.is_some() {
104 | return Err(E::MultipleTranscripts);
105 | }
106 |
107 | let id_attr = start
108 | .try_get_attribute("id")
109 | .map_err(E::MalformedTranscriptTag)?
110 | .ok_or(E::MissingId)?;
111 |
112 | let id_string = match id_attr.value {
113 | Cow::Borrowed(id) => std::str::from_utf8(id)
114 | .map(str::to_owned)
115 | .map_err(E::InvalidId)?,
116 | Cow::Owned(id) => {
117 | String::from_utf8(id).map_err(|err| E::InvalidId(err.utf8_error()))?
118 | }
119 | };
120 | *id = Some(id_string);
121 |
122 | Ok(XmlState::Transcript)
123 | }
124 | (b"sequence", XmlState::Transcript) => Ok(XmlState::Sequence),
125 | (b"reactivity", XmlState::Transcript) => Ok(XmlState::Reactivity),
126 | _ => Err(E::UnexpectedOpenTag(start.name().as_ref().to_owned())),
127 | }
128 | }
129 |
130 | fn handle_end_event(end: &BytesEnd<'_>, state: XmlState) -> Result<XmlState, ReadFileError> {
131 | use ReadFileError as E;
132 |
133 | match (end.name().as_ref(), state) {
134 | (b"data", XmlState::Data) => Ok(XmlState::End),
135 |
136 | (b"meta-data", XmlState::MetaData) | (b"transcript", XmlState::Transcript) => {
137 | Ok(XmlState::Data)
138 | }
139 |
140 | (b"organism", XmlState::Organism)
141 | | (b"probe", XmlState::Probe)
142 | | (b"source", XmlState::Source)
143 | | (b"replicate", XmlState::Replicate)
144 | | (b"condition", XmlState::Condition) => Ok(XmlState::MetaData),
145 |
146 | (b"citation", XmlState::Citation) | (b"pmid", XmlState::Pmid) => Ok(XmlState::Source),
147 |
148 | (b"sequence", XmlState::Sequence) | (b"reactivity", XmlState::Reactivity) => {
149 | Ok(XmlState::Transcript)
150 | }
151 |
152 | _ => Err(E::UnexpectedCloseTag(end.name().as_ref().to_owned())),
153 | }
154 | }
155 |
156 | fn handle_text_event(
157 | text: &BytesText<'_>,
158 | state: &XmlState,
159 | sequence: &mut Option<Vec<Base>>,
160 | reactivity: &mut Option<Vec<ReactivityWithPlaceholder>>,
161 | reader: &Reader<BufReader<File>>,
162 | ) -> Result<(), ReadFileError> {
163 | use ReadFileError as E;
164 |
165 | if text.iter().all(u8::is_ascii_whitespace) {
166 | return Ok(());
167 | }
168 |
169 | match state {
170 | XmlState::Start
171 | | XmlState::Data
172 | | XmlState::MetaData
173 | | XmlState::Source
174 | | XmlState::Transcript
175 | | XmlState::End => return Err(E::UnexpectedText(reader.buffer_position())),
176 |
177 | XmlState::Organism
178 | | XmlState::Probe
179 | | XmlState::Citation
180 | | XmlState::Pmid
181 | | XmlState::Replicate
182 | | XmlState::Condition => {}
183 |
184 | XmlState::Sequence => {
185 | if sequence.is_some() {
186 | return Err(E::MultipleSequences);
187 | }
188 | *sequence = Some(parse_sequence(text).map_err(E::InvalidSequence)?);
189 | }
190 | XmlState::Reactivity => {
191 | if reactivity.is_some() {
192 | return Err(E::MultipleReactivities);
193 | }
194 |
195 | *reactivity = Some(parse_reactivity(text).map_err(E::InvalidReactivity)?);
196 | }
197 | }
198 |
199 | Ok(())
200 | }
201 |
202 | #[derive(Debug, Default)]
203 | enum XmlState {
204 | #[default]
205 | Start,
206 | Data,
207 | MetaData,
208 | Organism,
209 | Probe,
210 | Source,
211 | Citation,
212 | Pmid,
213 | Replicate,
214 | Condition,
215 | Transcript,
216 | Sequence,
217 | Reactivity,
218 | End,
219 | }
220 |
221 | #[derive(Debug)]
222 | pub enum ReadFileError {
223 | ReaderFromFile(quick_xml::Error),
224 | ReadEvent {
225 | buffer_position: usize,
226 | source: quick_xml::Error,
227 | },
228 | UnexpectedOpenTag(Vec<u8>),
229 | UnexpectedCloseTag(Vec<u8>),
230 | UnexpectedEmptyTag(Vec<u8>),
231 | UnexpectedText(usize),
232 | MultipleTranscripts,
233 | MalformedTranscriptTag(quick_xml::Error),
234 | MissingId,
235 | InvalidId(Utf8Error),
236 | MultipleSequences,
237 | InvalidSequence(InvalidBase),
238 | MultipleReactivities,
239 | InvalidReactivity(InvalidReactivity),
240 | MissingTranscript,
241 | MissingSequence,
242 | MissingReactivity,
243 | InconsistentLength {
244 | sequence: usize,
245 | reactivity: usize,
246 | },
247 | }
248 |
249 | impl Display for ReadFileError {
250 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
251 | match self {
252 | ReadFileError::ReaderFromFile(_) => {
253 | f.write_str("unable to create XML reader from file")
254 | }
255 | ReadFileError::ReadEvent {
256 | buffer_position,
257 | source: _,
258 | } => write!(f, "unable to read XML event at position {buffer_position}"),
259 | ReadFileError::UnexpectedOpenTag(tag) => write!(
260 | f,
261 | r#"unexpected opening tag "{}""#,
262 | String::from_utf8_lossy(tag)
263 | ),
264 | ReadFileError::UnexpectedCloseTag(tag) => write!(
265 | f,
266 | r#"unexpected closing tag "{}""#,
267 | String::from_utf8_lossy(tag),
268 | ),
269 | ReadFileError::UnexpectedEmptyTag(tag) => write!(
270 | f,
271 | r#"unexpected empty tag "{}""#,
272 | String::from_utf8_lossy(tag),
273 | ),
274 | ReadFileError::UnexpectedText(position) => {
275 | write!(f, "unexpected text content at position {position}")
276 | }
277 | ReadFileError::MultipleTranscripts => f.write_str("more than one transcript tag found"),
278 | ReadFileError::MalformedTranscriptTag(_) => {
279 | f.write_str("transcript tag has invalid or duplicated attributes")
280 | }
281 | ReadFileError::MissingId => {
282 | f.write_str(r#""id" attribute is missing from transcript tag"#)
283 | }
284 | ReadFileError::InvalidId(_) => f.write_str("transcript id is not a valid UTF-8 string"),
285 | ReadFileError::MultipleSequences => f.write_str("more than one sequence tag found"),
286 | ReadFileError::InvalidSequence(_) => f.write_str("sequence is invalid"),
287 | ReadFileError::MultipleReactivities => {
288 | f.write_str("more than one reactivity tag found")
289 | }
290 | ReadFileError::InvalidReactivity(_) => f.write_str("reactivity data is invalid"),
291 | ReadFileError::MissingTranscript => f.write_str("transcript tag is missing"),
292 | ReadFileError::MissingSequence => f.write_str("sequence tag is missing"),
293 | ReadFileError::MissingReactivity => f.write_str("reactivity tag is missing"),
294 | ReadFileError::InconsistentLength {
295 | sequence,
296 | reactivity,
297 | } => write!(
298 | f,
299 | "sequence length ({sequence}) is different from reactivity sequence {reactivity}"
300 | ),
301 | }
302 | }
303 | }
304 |
305 | impl StdError for ReadFileError {
306 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
307 | match self {
308 | ReadFileError::ReaderFromFile(source) | ReadFileError::ReadEvent { source, .. } => {
309 | Some(source)
310 | }
311 |
312 | ReadFileError::UnexpectedOpenTag(_)
313 | | ReadFileError::UnexpectedCloseTag(_)
314 | | ReadFileError::UnexpectedEmptyTag(_)
315 | | ReadFileError::UnexpectedText(_)
316 | | ReadFileError::MultipleTranscripts
317 | | ReadFileError::MissingId
318 | | ReadFileError::MultipleSequences
319 | | ReadFileError::MultipleReactivities
320 | | ReadFileError::MissingTranscript
321 | | ReadFileError::MissingSequence
322 | | ReadFileError::MissingReactivity
323 | | ReadFileError::InconsistentLength { .. } => None,
324 |
325 | ReadFileError::MalformedTranscriptTag(source) => Some(source),
326 | ReadFileError::InvalidId(source) => Some(source),
327 | ReadFileError::InvalidSequence(source) => Some(source),
328 | ReadFileError::InvalidReactivity(source) => Some(source),
329 | }
330 | }
331 | }
332 |
333 | fn parse_sequence(raw: &[u8]) -> Result<Vec<Base>, InvalidBase> {
334 | raw.iter()
335 | .filter(|c| c.is_ascii_whitespace().not())
336 | .copied()
337 | .map(Base::try_from)
338 | .collect()
339 | }
340 |
341 | fn parse_reactivity(raw: &[u8]) -> Result<Vec<ReactivityWithPlaceholder>, InvalidReactivity> {
342 | use InvalidReactivity as E;
343 |
344 | raw.split(|&c| c == b',')
345 | .map(|raw| {
346 | let raw = std::str::from_utf8(raw).map_err(E::Utf8)?.trim();
347 |
348 | if raw == "NaN" {
349 | Ok(ReactivityWithPlaceholder::nan_placeholder())
350 | } else {
351 | raw.parse::<Reactivity>()
352 | .map(ReactivityWithPlaceholder::from)
353 | .map_err(InvalidReactivity::Value)
354 | }
355 | })
356 | .collect()
357 | }
358 |
359 | #[derive(Debug)]
360 | pub enum InvalidReactivity {
361 | Utf8(Utf8Error),
362 | Value(ParseFloatError),
363 | }
364 |
365 | impl Display for InvalidReactivity {
366 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
367 | let s = match self {
368 | InvalidReactivity::Utf8(_) => "rectivity is not a valid UTF-8 string",
369 | InvalidReactivity::Value(_) => "unable to parse reactivity value",
370 | };
371 |
372 | f.write_str(s)
373 | }
374 | }
375 |
376 | impl StdError for InvalidReactivity {
377 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
378 | match self {
379 | InvalidReactivity::Utf8(source) => Some(source),
380 | InvalidReactivity::Value(source) => Some(source),
381 | }
382 | }
383 | }
384 |
385 | pub fn read_directory(path: &Path) -> Result<Vec<Entry>, ReadDirectoryError> {
386 | use ReadDirectoryError as E;
387 |
388 | path.read_dir()
389 | .map_err(E::Dir)?
390 | .filter_map(|entry| {
391 | entry
392 | .map(|entry| {
393 | let path = entry.path();
394 | let extension = path.extension()?;
395 | extension.eq_ignore_ascii_case("xml").then_some(path)
396 | })
397 | .transpose()
398 | })
399 | .par_bridge()
400 | .filter_map(|path| {
401 | let path = match path {
402 | Ok(path) => path,
403 | Err(err) => return Some(Err(E::DirEntry(err))),
404 | };
405 | match read_file(&path) {
406 | Ok(entry) => Some(Ok(entry)),
407 | Err(err) => {
408 | eprintln!(
409 | "WARNING: unable to read XML path {}: {:#}",
410 | path.display(),
411 | anyhow::Error::from(err)
412 | );
413 | None
414 | }
415 | }
416 | })
417 | .collect()
418 | }
419 |
420 | #[derive(Debug)]
421 | pub enum ReadDirectoryError {
422 | Dir(io::Error),
423 | DirEntry(io::Error),
424 | }
425 |
426 | impl Display for ReadDirectoryError {
427 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
428 | match self {
429 | ReadDirectoryError::Dir(_) => f.write_str("unable to read directory"),
430 | ReadDirectoryError::DirEntry(_) => f.write_str("unable to read directory entry"),
431 | }
432 | }
433 | }
434 |
435 | impl StdError for ReadDirectoryError {
436 | fn source(&self) -> Option<&(dyn StdError + 'static)> {
437 | match self {
438 | ReadDirectoryError::Dir(source) | ReadDirectoryError::DirEntry(source) => Some(source),
439 | }
440 | }
441 | }
442 |
443 | #[cfg(test)]
444 | mod tests {
445 | use std::{
446 | fs,
447 | path::{Path, PathBuf},
448 | sync::OnceLock,
449 | };
450 |
451 | use tempfile::tempdir;
452 |
453 | use crate::{db_file::ReactivityWithPlaceholder, Base};
454 |
455 | use super::{read_directory, read_file};
456 |
457 | fn raw_xml_db_path() -> &'static Path {
458 | static RAW_XML_DB_PATH: OnceLock<PathBuf> = OnceLock::new();
459 |
460 | RAW_XML_DB_PATH.get_or_init(|| {
461 | let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
462 | manifest_dir.join("test_data/test_db.xml")
463 | })
464 | }
465 |
466 | #[test]
467 | fn read_valid_xml() {
468 | let entry = read_file(raw_xml_db_path()).unwrap();
469 | assert_eq!(entry.id, "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S");
470 | assert_eq!(entry.sequence.len(), 1800);
471 | assert_eq!(
472 | entry.sequence[..5],
473 | [Base::T, Base::A, Base::T, Base::C, Base::T]
474 | );
475 | assert!(entry.reactivity[..37]
476 | .iter()
477 | .copied()
478 | .all(ReactivityWithPlaceholder::is_nan));
479 | assert!((entry.reactivity[37].get_non_nan().unwrap() - 0.389).abs() < 0.001);
480 | }
481 |
482 | #[test]
483 | fn read_directory_ignores_non_xml_files() {
484 | let tempdir = tempdir().unwrap();
485 | let temp_path = tempdir.path();
486 | fs::write(temp_path.join("test.txt"), "hello world").unwrap();
487 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap();
488 | let entries = read_directory(temp_path).unwrap();
489 | assert_eq!(entries.len(), 1);
490 | assert_eq!(
491 | entries[0].id,
492 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S",
493 | );
494 | }
495 |
496 | #[test]
497 | fn read_directory_ignores_invalid_xml_files() {
498 | let tempdir = tempdir().unwrap();
499 | let temp_path = tempdir.path();
500 | let xml_file_path = temp_path.join("test.xml");
501 | fs::write(xml_file_path, "invalid xml").unwrap();
502 | fs::copy(raw_xml_db_path(), temp_path.join("valid.xml")).unwrap();
503 | let entries = read_directory(temp_path).unwrap();
504 | assert_eq!(entries.len(), 1);
505 | assert_eq!(
506 | entries[0].id,
507 | "Saccharomyces.cerevisiae_rc:URS00005F2C2D_18S",
508 | );
509 | }
510 | }
511 |
--------------------------------------------------------------------------------
/src/dotbracket.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 | cmp::Ordering,
3 | fmt::{self, Display},
4 | ops::{Not, Range},
5 | str::FromStr,
6 | };
7 |
8 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
9 | pub struct DotBracket<C> {
10 | paired_blocks: C,
11 | len: usize,
12 | }
13 |
14 | impl<C> DotBracket<C>
15 | where
16 | C: AsMut<Vec<PairedBlock>>,
17 | {
18 | #[inline]
19 | pub fn from_str(dot_bracket: &str, paired_blocks_buffer: C) -> Result<Self, InvalidDotBracket> {
20 | Self::from_str_with_buffer(dot_bracket, paired_blocks_buffer, &mut Vec::new())
21 | }
22 |
23 | #[inline]
24 | pub fn from_str_with_buffer(
25 | dot_bracket: &str,
26 | paired_blocks_buffer: C,
27 | working_buffer: &mut Vec,
28 | ) -> Result<Self, InvalidDotBracket> {
29 | Self::from_bytes_with_buffer(dot_bracket.as_bytes(), paired_blocks_buffer, working_buffer)
30 | }
31 |
32 | pub fn from_bytes_with_buffer(
33 | dot_bracket: &[u8],
34 | mut paired_blocks_buffer: C,
35 | working_buffer: &mut Vec,
36 | ) -> Result<Self, InvalidDotBracket> {
37 | let len = dot_bracket.len();
38 |
39 | let paired_blocks_buffer_ref = paired_blocks_buffer.as_mut();
40 | paired_blocks_buffer_ref.clear();
41 | working_buffer.clear();
42 | let state = dot_bracket
43 | .iter()
44 | .enumerate()
45 | .try_fold(None, |partial, (index, &c)| {
46 | try_fold_from_bytes(partial, index, c, paired_blocks_buffer_ref, working_buffer)
47 | })?;
48 |
49 | if working_buffer.is_empty().not() {
50 | return Err(InvalidDotBracket);
51 | }
52 |
53 | if let Some(state) = state {
54 | let PartialPairedBlockUnstored {
55 | left_start,
56 | other:
57 | Some(PartialPairedBlockOther {
58 | left_end,
59 | right_start,
60 | }),
61 | } = state
62 | else {
63 | return Err(InvalidDotBracket);
64 | };
65 |
66 | let left = left_start..left_end;
67 | let right = right_start..dot_bracket.len();
68 | if left.len() != right.len() {
69 | return Err(InvalidDotBracket);
70 | }
71 |
72 | paired_blocks_buffer_ref.push(PairedBlock { left, right });
73 | }
74 |
75 | Ok(DotBracket {
76 | paired_blocks: paired_blocks_buffer,
77 | len,
78 | })
79 | }
80 |
81 | #[inline]
82 | pub fn into_sorted(self) -> DotBracket<C> {
83 | let Self {
84 | mut paired_blocks,
85 | len,
86 | } = self;
87 | paired_blocks
88 | .as_mut()
89 | .sort_unstable_by_key(|block| block.left.start);
90 |
91 | DotBracket { paired_blocks, len }
92 | }
93 | }
94 |
95 | fn try_fold_from_bytes(
96 | partial: Option<PartialPairedBlockUnstored>,
97 | index: usize,
98 | c: u8,
99 | paired_blocks_buffer: &mut Vec<PairedBlock>,
100 | working_buffer: &mut Vec,
101 | ) -> Result