├── .gitignore
├── src
    ├── test
    │   ├── head.rs
    │   ├── lower.rs
    │   ├── upper.rs
    │   ├── tail.rs
    │   ├── set.rs
    │   ├── interleave.rs
    │   ├── slice.rs
    │   ├── del.rs
    │   ├── mask.rs
    │   ├── revcomp.rs
    │   ├── replace.rs
    │   ├── stat.rs
    │   ├── pass.rs
    │   ├── compress.rs
    │   ├── concat.rs
    │   ├── filter.rs
    │   ├── split.rs
    │   ├── count.rs
    │   ├── trim.rs
    │   ├── sample.rs
    │   └── cmp.rs
    ├── helpers
    │   ├── any.rs
    │   ├── thread_local.rs
    │   ├── slice.rs
    │   ├── macros.rs
    │   ├── mod.rs
    │   ├── complement.rs
    │   ├── write_list.rs
    │   ├── vec_buf.rs
    │   ├── replace.rs
    │   ├── bytesize.rs
    │   ├── heap_merge.rs
    │   ├── number.rs
    │   ├── value.rs
    │   └── seqtype.rs
    ├── cmd
    │   ├── shared
    │   │   ├── mod.rs
    │   │   └── key.rs
    │   ├── pass.rs
    │   ├── head.rs
    │   ├── lower.rs
    │   ├── upper.rs
    │   ├── interleave.rs
    │   ├── find
    │   │   ├── ambig.rs
    │   │   └── matcher
    │   │   │   ├── exact.rs
    │   │   │   └── mod.rs
    │   ├── stat.rs
    │   ├── del.rs
    │   ├── tail.rs
    │   ├── sort
    │   │   ├── vars.rs
    │   │   ├── mem.rs
    │   │   ├── file.rs
    │   │   ├── cli.rs
    │   │   └── mod.rs
    │   ├── mod.rs
    │   ├── set.rs
    │   ├── slice.rs
    │   ├── filter.rs
    │   ├── view
    │   │   ├── color.rs
    │   │   └── mod.rs
    │   ├── cmp
    │   │   ├── vars.rs
    │   │   └── mod.rs
    │   ├── unique
    │   │   ├── map.rs
    │   │   └── cli.rs
    │   ├── revcomp.rs
    │   ├── mask.rs
    │   ├── concat.rs
    │   └── replace.rs
    ├── main.rs
    ├── var
    │   └── modules
    │   │   ├── expr
    │   │       ├── js
    │   │       │   └── mod.rs
    │   │       ├── mod.rs
    │   │       ├── expressions.rs
    │   │       └── var_provider.rs
    │   │   └── mod.rs
    ├── io
    │   ├── input
    │   │   ├── reader.rs
    │   │   ├── fastx.rs
    │   │   ├── fastq.rs
    │   │   ├── fasta.rs
    │   │   └── fa_qual.rs
    │   ├── output
    │   │   ├── fastx.rs
    │   │   ├── writer.rs
    │   │   ├── fasta.rs
    │   │   ├── csv.rs
    │   │   ├── fastq.rs
    │   │   └── fa_qual.rs
    │   ├── mod.rs
    │   └── format.rs
    └── error.rs
├── var_provider
    ├── Cargo.toml
    ├── variable_enum_macro
    │   └── Cargo.toml
    └── src
    │   ├── lib.rs
    │   ├── usage.rs
    │   └── func.rs
├── js
    └── include.js
├── scripts
    ├── gen_html_help.sh
    ├── gen_ambig_map.py
    ├── validate_js_parser.sh
    ├── parse_varhelp.py
    ├── validate_features.sh
    ├── gen_help.sh
    ├── summarize_comparison.py
    └── time.sh
├── dist-workspace.toml
├── LICENSE-MIT
├── profile
    ├── fastq_urls.txt
    └── README.md
├── .github
    └── workflows
    │   └── ci.yaml
└── Cargo.toml


/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | Cargo.lock
3 | **/*.rs.bk
4 | .Rhistory
5 | .DS_Store
6 | _*


--------------------------------------------------------------------------------
/src/test/head.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | 
/// `head -n 3` must output exactly the first three records (`SEQS[..3]`)
/// of the shared `FASTA` fixture.
#[test]
fn head() {
    cmp(&["head", "-n", "3"], &*FASTA, &SEQS[..3].concat());
}
7 | 


--------------------------------------------------------------------------------
/src/test/lower.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | 
/// `lower` must convert a mixed-case sequence to lowercase while leaving the
/// header line untouched.
#[test]
fn lower() {
    let fa = ">seq\naTgC\n";
    cmp(&["lower"], fa, ">seq\natgc\n");
}
8 | 


--------------------------------------------------------------------------------
/src/test/upper.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | 
/// `upper` must convert a mixed-case sequence to uppercase while leaving the
/// header line untouched.
#[test]
fn upper() {
    let fa = ">seq\naTgC\n";
    cmp(&["upper"], fa, ">seq\nATGC\n");
}
8 | 


--------------------------------------------------------------------------------
/src/test/tail.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | 
/// `tail` must refuse input that does not come from a file and must return the
/// requested number of records from the end of a file.
#[test]
fn tail() {
    // input handed over directly is rejected with a STDIN-related error message
    fails(&["tail", "-n", "3"], &*FASTA, "Cannot use STDIN as input");
    // the same data written to a temporary file is accepted;
    // `records!(2, 3)` presumably selects the last two fixture records (verify against the macro)
    let input = tmp_file("st_tail_", ".fasta", &FASTA);
    cmp(&["tail", "-n", "2"], input, records!(2, 3));
}
9 | 


--------------------------------------------------------------------------------
/var_provider/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "var_provider"
 3 | version = "0.4.0-beta.4"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | 
 7 | [dependencies]
 8 | itertools = "0.14"
 9 | strum = "0.27"
10 | strum_macros = "0.27"
11 | crossterm = "0.29"
12 | textwrap = "0.16"
13 | 


--------------------------------------------------------------------------------
/src/test/set.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// `set` must be able to replace the ID, the description and the sequence of a
/// record independently of each other.
#[test]
fn set() {
    let fasta = ">seq\nATGC\n";

    // -i: new record ID
    cmp(&["set", "-i", "seq2"], fasta, ">seq2\nATGC\n");
    // -d: new description (appended after the ID, separated by a space)
    cmp(&["set", "-d", "desc"], fasta, ">seq desc\nATGC\n");
    // -s: new sequence
    cmp(&["set", "-s", "NNNN"], fasta, ">seq\nNNNN\n");
}
11 | 


--------------------------------------------------------------------------------
/js/include.js:
--------------------------------------------------------------------------------
 1 | 'use strict';
 2 | 
 3 | function num(x) {
 4 |     let f = parseFloat(x);
 5 |     if (isNaN(f)) {
 6 |         if (x === undefined) return undefined;
 7 |         if (x === null) return null;
 8 |         throw `Could not convert '${x}' to a decimal number`;
 9 |     }
10 |     return f;
11 | }
12 | 


--------------------------------------------------------------------------------
/src/test/interleave.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// Interleaving two copies of the `FASTA` fixture must yield every record
/// twice, in input order (record 0 of both files, then record 1 of both, ...).
#[test]
fn interleave() {
    with_tmpdir("st_interleave_", |td| {
        cmp(
            &["interleave"],
            td.multi_file(".fasta", vec![&&*FASTA, &&*FASTA]),
            records!(0, 0, 1, 1, 2, 2, 3, 3),
        );
    });
}
13 | 


--------------------------------------------------------------------------------
/src/test/slice.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// `slice` selects records by range. As the expectations below show, the range
/// is 1-based and inclusive on both ends, and open bounds extend to the
/// start/end of the input.
#[test]
fn slice() {
    // fully open or starting at 1: everything is returned
    cmp(&["slice", ":"], &*FASTA, &FASTA);
    cmp(&["slice", "1:"], &*FASTA, &FASTA);
    // upper bound 2: the first two records
    cmp(&["slice", ":2"], &*FASTA, &SEQS[..2].concat());
    cmp(&["slice", "1:2"], &*FASTA, &SEQS[..2].concat());
    // records 2 and 3
    cmp(&["slice", "2:3"], &*FASTA, &SEQS[1..3].concat());
}
11 | 


--------------------------------------------------------------------------------
/var_provider/variable_enum_macro/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "variable_enum_macro"
 3 | version = "0.4.0-beta.4"
 4 | edition = "2021"
 5 | license = "MIT OR Apache-2.0"
 6 | 
 7 | [dependencies]
 8 | var_provider = { path=".." }
 9 | proc-macro2 = "1.0.79"
10 | quote = "1.0.36"
11 | itertools = "0.14"
12 | syn = { version = "2.0.58" }
13 | strum = "0.27"
14 | strum_macros = "0.27"
15 | 
16 | [lib]
17 | proc-macro = true
18 | 


--------------------------------------------------------------------------------
/src/test/del.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// `del` must be able to remove the description and individual header
/// attributes, also when a non-default attribute format is given.
#[test]
fn del() {
    let fasta = ">seq;p=0 a=1 b=2\nATGC\n";

    // -d: drop the whole description (everything after the first space)
    cmp(&["del", "-d"], fasta, ">seq;p=0\nATGC\n");
    // TODO: the extra space should be removed
    cmp(&["del", "--attrs", "a,b"], fasta, ">seq;p=0 \nATGC\n");
    // with the `;key=value` format, the attribute `p` attached to the ID is removed
    cmp(
        &["del", "--attrs", "p", "--attr-fmt", ";key=value"],
        fasta,
        ">seq a=1 b=2\nATGC\n",
    );
}
16 | 


--------------------------------------------------------------------------------
/src/helpers/any.rs:
--------------------------------------------------------------------------------
 1 | use std::any::Any;
 2 | 
 3 | // pub trait AsAny {
 4 | //     fn as_any(&self) -> &dyn Any;
 5 | // }
 6 | 
 7 | // impl<T: Any> AsAny for T {
 8 | //     fn as_any(&self) -> &dyn Any {
 9 | //         self
10 | //     }
11 | // }
12 | 
/// Allows viewing a value as a mutable [`Any`] trait object, e.g. in order to
/// downcast it to its concrete type with `downcast_mut`.
pub trait AsAnyMut {
    /// Returns `self` as `&mut dyn Any`.
    fn as_any_mut(&mut self) -> &mut dyn Any;
}

/// Blanket implementation for every type that implements [`Any`].
impl<T> AsAnyMut for T
where
    T: Any,
{
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self as &mut dyn Any
    }
}
22 | 


--------------------------------------------------------------------------------
/src/cmd/shared/mod.rs:
--------------------------------------------------------------------------------
 1 | //! This module contains code shared between at least two commands, which
 2 | //! relies on some external crates and therefore needs feature flags.
 3 | 
 4 | #[cfg(any(
 5 |     feature = "all-commands",
 6 |     feature = "cmp",
 7 |     feature = "count",
 8 |     feature = "sort",
 9 |     feature = "unique"
10 | ))]
11 | pub mod key;
12 | 
13 | #[cfg(any(feature = "all-commands", feature = "sort", feature = "unique"))]
14 | pub mod tmp_store;
15 | 


--------------------------------------------------------------------------------
/src/test/mask.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// `mask` must soft-mask (lowercase), hard-mask (replace by a character) or
/// unmask (uppercase) the given ranges. The multi-line input sequence is
/// written as a single line in all expected outputs.
#[test]
fn mask() {
    let fa = ">seq\nATGCa\ntgc\n";
    // whole sequence
    cmp(&["mask", ":"], fa, ">seq\natgcatgc\n");
    // several ranges: the first two and the last two positions
    cmp(&["mask", ":2,-2:"], fa, ">seq\natGCatgc\n");
    // from position 4 to the end
    cmp(&["mask", "4:"], fa, ">seq\nATGcatgc\n");
    // hard masking replaces the range with the given character
    cmp(&["mask", "--hard", "N", "4:"], fa, ">seq\nATGNNNNN\n");
    // unmasking converts the range back to uppercase
    cmp(
        &["mask", "--unmask", "4:"],
        ">seq\nATGcatgc\n",
        ">seq\nATGCATGC\n",
    );
}
16 | 


--------------------------------------------------------------------------------
/var_provider/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use strum_macros::{Display, EnumString};
 2 | 
 3 | mod func;
 4 | mod usage;
 5 | mod var_provider;
 6 | 
 7 | pub use self::func::*;
 8 | pub use self::usage::*;
 9 | pub use self::var_provider::*;
10 | 
/// Provides information about the expected variable/function output type
///
/// The `strum` derives generate string conversions (`EnumString`, `Display`)
/// that use the snake_case form of the variant names, as requested by
/// `serialize_all = "snake_case"`.
#[derive(Debug, Clone, EnumString, Display)]
#[strum(serialize_all = "snake_case")]
pub enum VarType {
    /// Textual output
    Text,
    /// Numeric output
    Number,
    /// Boolean (true/false) output
    Boolean,
}
19 | 


--------------------------------------------------------------------------------
/scripts/gen_html_help.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | wiki=../seqtool.wiki
 4 | html=../seqtool-doc
 5 | 
 6 | # generate local HTML docs
 7 | 
 8 | cnv_links() {
 9 |   sed -E 's_<a href="(wiki\/[^"#]+|[^"\/#]+)(#[^"])?"_<a href="\1.html\2"_g' $1
10 | }
11 | 
# Create the output directories, then convert the README and every wiki page
# to a standalone HTML file whose links point to the local HTML files.
mkdir -p "$html" "$html/wiki"
pandoc --self-contained -s -c doc/pandoc.css README.md | cnv_links > "$html/README.html"
for f in "$wiki"/*.md; do
  # page name = file name without directory and extension
  name="$(basename "${f%.*}")"
  pandoc --self-contained -s -c doc/pandoc.css "$f" | cnv_links > "$html/wiki/$name.html"
done
18 | 


--------------------------------------------------------------------------------
/scripts/gen_ambig_map.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | bases = ['A', 'C', 'G', 'T', 'U']
 4 | 
 5 | mapping = [
 6 |     ('M', 'AC'),
 7 |     ('R', 'AG'),
 8 |     ('W', 'AT'),
 9 |     ('S', 'CG'),
10 |     ('Y', 'CT'),
11 |     ('K', 'GT'),
12 |     ('V', 'ACG'),
13 |     ('H', 'ACT'),
14 |     ('D', 'AGT'),
15 |     ('B', 'CGT'),
16 |     ('N', 'ACGT'),
17 | ]
18 | 
19 | for b, bases in mapping:
20 |     other = [a for a, v in mapping if all(b in bases for b in v)]
21 |     print("b'{}' => b\"{}\".to_vec(),".format(b, ''.join(list(bases) + other)))
22 | 


--------------------------------------------------------------------------------
/src/helpers/thread_local.rs:
--------------------------------------------------------------------------------
 1 | use std::cell::RefCell;
 2 | use std::thread::LocalKey;
 3 | 
 4 | pub fn with_mut_thread_local<I, F, T, R>(
 5 |     lkey: &'static LocalKey<RefCell<Option<T>>>,
 6 |     init: I,
 7 |     f: F,
 8 | ) -> R
 9 | where
10 |     F: FnOnce(&mut T) -> R,
11 |     I: FnOnce() -> T,
12 | {
13 |     lkey.with(|d| {
14 |         let mut d = d.borrow_mut();
15 |         let data = if let Some(ref mut data) = *d {
16 |             data
17 |         } else {
18 |             *d = Some(init());
19 |             d.as_mut().unwrap()
20 |         };
21 |         f(data)
22 |     })
23 | }
24 | 


--------------------------------------------------------------------------------
/src/cmd/pass.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | 
// Arguments of the `pass` command: it has no options of its own, only the
// flattened common arguments.
// (Plain `//` comment on purpose: clap's derive turns `///` comments into help text.)
#[derive(Parser, Clone, Debug)]
pub struct PassCommand {
    #[command(flatten)]
    pub common: CommonArgs,
}
12 | 
/// Runs the `pass` command: every record that is read is handed to the
/// format writer without modification.
///
/// Returns an error if the format writer cannot be set up or if reading or
/// writing fails.
pub fn run(mut cfg: Config, _args: PassCommand) -> CliResult<()> {
    let mut format_writer = cfg.get_format_writer()?;
    cfg.with_io_writer(|io_writer, mut cfg| {
        cfg.read(|record, ctx| {
            format_writer.write(&record, io_writer, ctx)?;
            // `true` = continue with the next record
            Ok(true)
        })
    })
}
22 | 


--------------------------------------------------------------------------------
/src/helpers/slice.rs:
--------------------------------------------------------------------------------
 1 | use std::mem::replace;
 2 | 
 3 | pub fn split_text(text: &'_ [u8], sep: u8) -> SplitIter<'_> {
 4 |     SplitIter { sep, text }
 5 | }
 6 | 
 7 | pub struct SplitIter<'a> {
 8 |     sep: u8,
 9 |     text: &'a [u8],
10 | }
11 | 
12 | impl<'a> Iterator for SplitIter<'a> {
13 |     type Item = &'a [u8];
14 | 
15 |     fn next(&mut self) -> Option<Self::Item> {
16 |         if let Some(pos) = memchr::memchr(self.sep, self.text) {
17 |             let (t, rest) = self.text.split_at(pos);
18 |             self.text = &rest[1..];
19 |             return Some(t);
20 |         }
21 |         if self.text.is_empty() {
22 |             return None;
23 |         }
24 |         Some(replace(&mut self.text, b""))
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/helpers/macros.rs:
--------------------------------------------------------------------------------
 1 | macro_rules! fail {
 2 |     ($e:expr) => {
 3 |         Err($e.into())
 4 |     };
 5 |     ($e:expr, $($args:tt)*) => {
 6 |         Err(format!($e, $($args)*).into())
 7 |     };
 8 | }
 9 | 
/// Like `?`, but for functions returning `Option<Result<T, E>>` (such as
/// `Iterator::next` of a fallible iterator): unwraps an `Ok` value, or returns
/// `Some(Err(..))` from the enclosing function, converting the error with `From`.
macro_rules! try_opt {
    ($result:expr) => {
        match $result {
            Err(err) => return Some(Err(std::convert::From::from(err))),
            Ok(value) => value,
        }
    };
}
18 | 
/// Prints a message to standard error, but only if the first argument
/// (the verbosity flag) is `true`. The remaining arguments are passed to
/// `eprintln!` and are not evaluated otherwise.
macro_rules! report {
    ($verbose:expr, $($message:tt)+) => {
        if $verbose {
            eprintln!($($message)+)
        }
    };
}
31 | 


--------------------------------------------------------------------------------
/dist-workspace.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = ["cargo:."]
 3 | 
 4 | # Config for 'dist'
 5 | [dist]
 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax)
 7 | cargo-dist-version = "0.30.0"
 8 | # CI backends to support
 9 | ci = "github"
10 | # The installers to generate for each app
11 | installers = ["shell", "powershell", "homebrew", "msi"]
12 | # Target platforms to build apps for (Rust target-triple syntax)
13 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl", "x86_64-pc-windows-msvc"]
14 | # Path that installers should place binaries in
15 | install-path = "CARGO_HOME"
16 | # Whether to install an updater program
17 | install-updater = false
18 | 
19 | [target.x86_64-pc-windows-msvc]
20 | rustflags = [
21 |     "-C debuginfo=0",
22 |     "-C link-arg=/DEBUG:NONE"
23 | ]
24 | 


--------------------------------------------------------------------------------
/src/helpers/mod.rs:
--------------------------------------------------------------------------------
 1 | //! Utilities used by many commands,
 2 | //! which do not use optional crates that depend on feature flags.
 3 | 
 4 | use std::collections::{HashMap, HashSet};
 5 | 
 6 | // The default hash map to use
 7 | use ahash::RandomState;
 8 | 
 9 | pub type DefaultHashMap<K, V> = HashMap<K, V, RandomState>;
10 | pub type DefaultHashSet<V> = HashSet<V, RandomState>;
11 | pub type DefaultBuildHasher = ahash::RandomState; // BuildHasherDefault<ahash::AHasher>;
12 | 
13 | // missing data string
14 | pub const NA: &str = "undefined";
15 | 
16 | #[macro_use]
17 | pub mod macros;
18 | pub mod any;
19 | pub mod bytesize;
20 | pub mod complement;
21 | pub mod heap_merge;
22 | pub mod number;
23 | pub mod replace;
24 | pub mod rng;
25 | pub mod seqtype;
26 | pub mod slice;
27 | pub mod thread_local;
28 | pub mod value;
29 | pub mod var_range;
30 | pub mod vec_buf;
31 | pub mod write_list;
32 | 


--------------------------------------------------------------------------------
/src/test/revcomp.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
/// `revcomp` must reverse-complement DNA and RNA (including ambiguity codes),
/// reject other sequence types, and honor an explicitly given `--seqtype`.
/// The multi-line input sequences are written as a single line in the
/// expected outputs.
#[test]
fn revcomp() {
    // DNA with ambiguities
    cmp(
        &["revcomp"],
        ">id\nAGCT\nYRWS\nKMDV\nHBN\n",
        ">id\nNVDBHKMSWYRAGCT\n",
    );
    // RNA
    cmp(
        &["revcomp"],
        ">id\nAGCU\nYRWS\nKMDV\nHBN\n",
        ">id\nNVDBHKMSWYRAGCU\n",
    );
    // mixed / protein
    fails(
        &["revcomp"],
        ">id\nTX\n",
        "Only DNA/RNA sequences can be reverse-complemented",
    );
    // with explicitly set sequence type, invalid letters are left untouched
    cmp(&["revcomp", "--seqtype", "dna"], ">id\nUA\n", ">id\nTU\n");
}
26 | 
/// With FASTQ input, the quality string must be reversed together with the
/// sequence (`1234` becomes `4321`).
#[test]
fn revcomp_qual() {
    let fq = "@seq\nANCT\n+\n1234\n";
    let rc = "@seq\nAGNT\n+\n4321\n";
    cmp(&["revcomp", "--fq"], fq, rc);
}
33 | 


--------------------------------------------------------------------------------
/src/helpers/complement.rs:
--------------------------------------------------------------------------------
 1 | use super::seqtype::SeqType;
 2 | 
 3 | /// Reverse complements a set of sequence chunks belonging to the same sequence
 4 | /// writes the contiguous reverse-complement to output
 5 | pub fn reverse_complement<'a, S>(
 6 |     seq_iter: S,
 7 |     out: &mut Vec<u8>,
 8 |     seqtype: SeqType,
 9 | ) -> Result<(), String>
10 | where
11 |     S: Iterator<Item = &'a [u8]> + DoubleEndedIterator,
12 | {
13 |     let complement = match seqtype {
14 |         SeqType::DNA => bio::alphabets::dna::complement,
15 |         SeqType::RNA => bio::alphabets::rna::complement,
16 |         _ => {
17 |             return Err(format!(
18 |                 "Only DNA/RNA sequences can be reverse-complemented, but the sequence type \
19 |                 is '{seqtype}'. Wrongly recognized sequence types can be adjusted with `--seqtype`."
20 |             ))
21 |         }
22 |     };
23 |     out.clear();
24 |     for s in seq_iter.rev() {
25 |         out.extend(s.iter().rev().cloned().map(complement));
26 |     }
27 |     Ok(())
28 | }
29 | 


--------------------------------------------------------------------------------
/src/cmd/head.rs:
--------------------------------------------------------------------------------
 1 | use clap::{value_parser, Parser};
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | 
// Arguments of the `head` command.
// (Plain `//` comments on purpose: clap's derive turns `///` comments into help text.)
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Head' command options")]
pub struct HeadCommand {
    /// Number of sequences to return
    // defaults to 10; the value parser only accepts numbers >= 1
    #[arg(short, long, value_name = "N", default_value_t = 10, value_parser = value_parser!(u64).range(1..))]
    num_seqs: u64,

    #[command(flatten)]
    pub common: CommonArgs,
}
17 | 
18 | pub fn run(mut cfg: Config, args: HeadCommand) -> CliResult<()> {
19 |     let n = args.num_seqs;
20 | 
21 |     let mut format_writer = cfg.get_format_writer()?;
22 |     cfg.with_io_writer(|io_writer, mut cfg| {
23 |         let mut i = 0;
24 | 
25 |         cfg.read(|record, ctx| {
26 |             if i >= n {
27 |                 return Ok(false);
28 |             }
29 |             format_writer.write(&record, io_writer, ctx)?;
30 |             i += 1;
31 |             Ok(true)
32 |         })
33 |     })
34 | }
35 | 


--------------------------------------------------------------------------------
/src/test/replace.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
 3 | static INPUT: &str = ">id_123 some desc\nA\nT\nGC\n";
 4 | 
/// Exact (non-regex) replacement in the sequence and, with `-d`, in the
/// description.
#[test]
fn exact() {
    cmp(&["replace", "T", "U"], INPUT, ">id_123 some desc\nAUGC\n");
    // several records with multi-line sequences
    cmp(
        &["replace", "T", "U"],
        ">a\nT\nT\n>b\nT\nT\n>c\nT\nT\n",
        ">a\nUU\n>b\nUU\n>c\nUU\n",
    );
    // the pattern `ATG` spans several sequence lines of the input
    cmp(
        &["replace", "ATG", "TGA"],
        INPUT,
        ">id_123 some desc\nTGAC\n",
    );
    // -d: replace in the description instead of the sequence
    cmp(
        &["replace", "-d", "e", "a"],
        INPUT,
        ">id_123 soma dasc\nATGC\n",
    );
}
24 | 
/// Regex replacement (`-r`) in the sequence and, combined with `-i`, in the
/// ID, including a capture group reference (`$1`) in the replacement.
#[test]
fn regex() {
    cmp(
        &["replace", "-r", "[AT]", "?"],
        INPUT,
        ">id_123 some desc\n??GC\n",
    );
    // -ir: regex replacement in the ID
    cmp(
        &["replace", "-ir", r"_\d{3}", ".."],
        INPUT,
        ">id.. some desc\nATGC\n",
    );
    // `$1` inserts the text matched by the first capture group
    cmp(
        &["replace", "-ir", r"_(\d{3})", "..$1"],
        INPUT,
        ">id..123 some desc\nATGC\n",
    );
}
43 | 


--------------------------------------------------------------------------------
/scripts/validate_js_parser.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # This script validates the JS parser using ECMA-262 parser tests
 3 | # All syntax should be accepted, except for:
 4 | # - regex literals
 5 | # - \u unicode escapes in identifiers
 6 | # - characters that are not char::is_alphabetic() (there seem to be some)
 7 | 
 8 | set -euo pipefail
 9 | 
10 | if [ ! -e test262-parser-tests-master ]; then
11 |     wget https://github.com/tc39/test262-parser-tests/archive/refs/heads/master.zip
12 |     unzip master.zip
13 |     rm master.zip
14 | fi
15 | 
16 | cargo build
17 | st=target/debug/st
18 | 
19 | echo "" > _input.fa
20 | js=test262-parser-tests-master
21 | for f in $js/pass/*.js $js/pass-explicit/*.js $js/early/*.js; do
22 |     echo $f
23 |     out=$(($st . --to-tsv "{{file:$f}}" _input.fa || true) 2>&1)
24 |     # recognize errors, but exclude strings containing unsupported character escaped
25 |     if [[ "$out" == *"Failed to parse"* && ! "$out" =~ (\\0|\\u[0-9]{4}|\\u\{[a-zA-Z0-9]{1,6}\}|\\x[a-zA-Z0-9]{2}) ]]; then
26 |         printf "$out"
27 |     fi
28 | done
29 | rm -R _input.fa $js
30 | 


--------------------------------------------------------------------------------
/src/cmd/lower.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::io::SeqQualRecord;
 7 | 
// Arguments of the `lower` command: it has no options of its own, only the
// flattened common arguments.
// (Plain `//` comment on purpose: clap's derive turns `///` comments into help text.)
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Lower' command options")]
pub struct LowerCommand {
    #[command(flatten)]
    pub common: CommonArgs,
}
14 | 
15 | pub fn run(mut cfg: Config, _args: LowerCommand) -> CliResult<()> {
16 |     let mut format_writer = cfg.get_format_writer()?;
17 |     cfg.with_io_writer(|io_writer, mut cfg| {
18 |         let mut seq = vec![];
19 |         cfg.read(|record, ctx| {
20 |             seq.clear();
21 |             for s in record.seq_segments() {
22 |                 seq.extend(s.iter().cloned().map(|ref mut b| {
23 |                     b.make_ascii_lowercase();
24 |                     *b
25 |                 }));
26 |             }
27 |             let ucase_rec = SeqQualRecord::new(&record, &seq, None);
28 |             format_writer.write(&ucase_rec, io_writer, ctx)?;
29 |             Ok(true)
30 |         })
31 |     })
32 | }
33 | 


--------------------------------------------------------------------------------
/src/cmd/upper.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::io::SeqQualRecord;
 7 | 
// Arguments of the `upper` command: it has no options of its own, only the
// flattened common arguments.
// (Plain `//` comment on purpose: clap's derive turns `///` comments into help text.)
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Upper' command options")]
pub struct UpperCommand {
    #[command(flatten)]
    pub common: CommonArgs,
}
14 | 
15 | pub fn run(mut cfg: Config, _args: UpperCommand) -> CliResult<()> {
16 |     let mut format_writer = cfg.get_format_writer()?;
17 |     cfg.with_io_writer(|io_writer, mut cfg| {
18 |         let mut seq = vec![];
19 |         cfg.read(|record, ctx| {
20 |             seq.clear();
21 |             for s in record.seq_segments() {
22 |                 seq.extend(s.iter().cloned().map(|ref mut b| {
23 |                     b.make_ascii_uppercase();
24 |                     *b
25 |                 }));
26 |             }
27 |             let ucase_rec = SeqQualRecord::new(&record, &seq, None);
28 |             format_writer.write(&ucase_rec, io_writer, ctx)?;
29 |             Ok(true)
30 |         })
31 |     })
32 | }
33 | 


--------------------------------------------------------------------------------
/src/test/stat.rs:
--------------------------------------------------------------------------------
 1 | use std::str;
 2 | 
 3 | use super::*;
 4 | 
/// Sequence statistics variables must yield the same values when used with
/// `stat` and (if the `pass` command is compiled in) with `. --to-tsv`.
#[test]
fn stats() {
    let seq = ">seq\nATGC-NYA\n";
    // expected tab-separated line: ID followed by the values of `vars` in order
    let retval = "seq\t8\t7\t40\t0.4\t2\t3\n";
    let vars = "seqlen,ungapped_seqlen,gc_percent,gc,charcount(A),charcount(AT)";
    #[cfg(any(feature = "all-commands", feature = "pass"))]
    cmp(&[".", "--to-tsv", &format!("id,{vars}")], seq, retval);
    cmp(&["stat", vars], seq, retval);
}
14 | 
/// The `exp_err` variable must be computed from FASTQ quality scores with the
/// offset of the given format, and must fail for invalid or missing qualities.
#[test]
fn qualstat() {
    // quality bytes 33, 43, 53 with `--fq`; expected value 1.11
    // (1 + 0.1 + 0.01 if these are Phred scores 0, 10 and 20)
    cmp(
        &["stat", "--fq", "exp_err"],
        format!("@id\nAAA\n+\n{}\n", str::from_utf8(&[33, 43, 53]).unwrap()),
        "id\t1.11\n",
    );
    // same expected value with quality bytes 64, 74, 84 and `--fq-illumina`
    cmp(
        &["stat", "--fq-illumina", "exp_err"],
        format!("@id\nAAA\n+\n{}\n", str::from_utf8(&[64, 74, 84]).unwrap()),
        "id\t1.11\n",
    );
    // quality byte 32 is rejected with `--fq`
    fails(
        &["stat", "--fq", "exp_err"],
        format!("@id\nA\n+\n{}\n", str::from_utf8(&[32]).unwrap()),
        "Invalid quality",
    );
    // FASTA input has no qualities at all
    fails(&["stat", "exp_err"], ">seq\nAA", "No quality scores");
}
34 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  Fast and flexible tool for reading, modifying and writing biological sequences
 3 | */
 4 | 
 5 | // suppress warnings unless most features are used
 6 | #![cfg_attr(not(feature = "default"), allow(warnings, unused))]
 7 | 
 8 | #[macro_use]
 9 | extern crate seq_io;
10 | 
11 | use crate::cli::Cli;
12 | use crate::config::Config;
13 | 
14 | use self::error::*;
15 | use std::process;
16 | 
17 | #[macro_use]
18 | mod helpers;
19 | #[macro_use]
20 | mod error;
21 | mod cli;
22 | mod cmd;
23 | mod config;
24 | mod context;
25 | mod io;
26 | mod var;
27 | 
28 | #[cfg(test)]
29 | mod test;
30 | 
31 | fn main() {
32 |     let res = Cli::new().and_then(|cli| cli.run());
33 |     match res {
34 |         // normal exit
35 |         Ok(()) => {}
36 |         Err(CliError::Io(e)) => {
37 |             if e.kind() != std::io::ErrorKind::BrokenPipe {
38 |                 exit(&format!("{e}"), 1)
39 |             }
40 |         }
41 |         Err(e) => exit(&format!("{e}"), 1),
42 |     }
43 | }
44 | 
/// Prints `msg` to standard error and terminates the process with the given
/// exit code. This function never returns to the caller.
fn exit(msg: &str, code: i32) {
    eprintln!("{}", msg);
    process::exit(code)
}
49 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Markus Schlegel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/cmd/interleave.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::{CommonArgs, WORDY_HELP};
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | 
 7 | pub const DESC: &str = "\
 8 | The records are returned in the same order as in the input files.";
 9 | 
// Arguments of the `interleave` command. The text shown before the help
// comes from `DESC` (see `before_help` below).
// (Plain `//` comment on purpose: clap's derive turns `///` comments into help text.)
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Interleave' command options")]
#[clap(before_help=DESC, help_template=WORDY_HELP)]
pub struct InterleaveCommand {
    /// Don't check if the IDs of the files match
    #[arg(short, long)]
    no_id_check: bool,

    #[command(flatten)]
    pub common: CommonArgs,
}
21 | 
/// Runs the `interleave` command: the input files are read alongside each
/// other and every record is written to the output.
///
/// The IDs of the records are checked unless `--no-id-check` was given
/// (the flag is forwarded to `read_alongside`).
///
/// Returns an error if the format writer cannot be set up or if reading or
/// writing fails.
pub fn run(mut cfg: Config, args: InterleaveCommand) -> CliResult<()> {
    let id_check = !args.no_id_check;

    let mut format_writer = cfg.get_format_writer()?;
    cfg.with_io_writer(|io_writer, mut cfg| {
        // the first closure argument is not needed here
        cfg.read_alongside(id_check, |_, rec, ctx| {
            // handle variables (read_alongside requires this to be done manually)
            ctx.set_record(&rec, 0)?;
            format_writer.write(rec, io_writer, ctx)?;
            Ok(true)
        })
    })
}
35 | 


--------------------------------------------------------------------------------
/profile/fastq_urls.txt:
--------------------------------------------------------------------------------
 1 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/079/ERR12573579/ERR12573579_1.fastq.gz
 2 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/096/ERR12551596/ERR12551596_1.fastq.gz
 3 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/003/ERR12551603/ERR12551603_1.fastq.gz
 4 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/005/ERR12551605/ERR12551605_1.fastq.gz
 5 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/009/ERR12551609/ERR12551609_1.fastq.gz
 6 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/091/ERR12551691/ERR12551691_1.fastq.gz
 7 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/094/ERR12551694/ERR12551694_1.fastq.gz
 8 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/095/ERR12551695/ERR12551695_1.fastq.gz
 9 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/097/ERR12551697/ERR12551697_1.fastq.gz
10 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/099/ERR12551699/ERR12551699_1.fastq.gz
11 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/002/ERR12551702/ERR12551702_1.fastq.gz
12 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/004/ERR12551704/ERR12551704_1.fastq.gz
13 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/007/ERR12551707/ERR12551707_1.fastq.gz
14 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/011/ERR12551711/ERR12551711_1.fastq.gz
15 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/063/ERR12551763/ERR12551763_1.fastq.gz
16 | 


--------------------------------------------------------------------------------
/src/helpers/write_list.rs:
--------------------------------------------------------------------------------
 1 | //use std::ops::Deref;
 2 | use std::{convert::AsRef, io};
 3 | 
 4 | /// Writes an iterator of of text slices as delimited list to the output.
 5 | /// Returns true if the list is not empty
 6 | pub fn write_list<L, I, W>(list: L, sep: &[u8], out: &mut W) -> io::Result<bool>
 7 | where
 8 |     L: IntoIterator<Item = I>,
 9 |     I: AsRef<[u8]>,
10 |     W: io::Write + ?Sized,
11 | {
12 |     write_list_with(list, sep, out, |item, o| o.write_all(item.as_ref()))
13 | }
14 | 
/// Writes all items of `list` to `out` using the custom function `write_fn`,
/// with `sep` written between consecutive items.
///
/// Returns `Ok(true)` if the list contained at least one item, and `Ok(false)`
/// if it was empty. The first error from `write_fn` or from writing the
/// separator is returned immediately.
#[inline]
pub fn write_list_with<L, I, W, F>(
    list: L,
    sep: &[u8],
    out: &mut W,
    mut write_fn: F,
) -> io::Result<bool>
where
    L: IntoIterator<Item = I>,
    W: io::Write + ?Sized,
    F: FnMut(I, &mut W) -> io::Result<()>,
{
    let mut items = list.into_iter();
    match items.next() {
        None => Ok(false),
        Some(head) => {
            // the first item is written without a preceding separator
            write_fn(head, out)?;
            for item in items {
                out.write_all(sep)?;
                write_fn(item, out)?;
            }
            Ok(true)
        }
    }
}
41 | 


--------------------------------------------------------------------------------
/src/cmd/find/ambig.rs:
--------------------------------------------------------------------------------
// DNA ambiguity codes according to IUPAC
// (https://iubmb.qmul.ac.uk/misc/naseq.html#500, Table 1).
// Each entry is (ambiguity code, symbols matched by this code).
// Ambiguity codes that are completely contained in another code are included
// in the list of that code as well
// (e.g. V matches M, R and S in addition to A, C and G).
pub static AMBIG_DNA: &[(u8, &[u8])] = &[
    (b'M', b"AC"),
    (b'R', b"AG"),
    (b'W', b"AT"),
    (b'S', b"CG"),
    (b'Y', b"CT"),
    (b'K', b"GT"),
    (b'V', b"ACGMRS"),
    (b'H', b"ACTMWY"),
    (b'D', b"AGTRWK"),
    (b'B', b"CGTSYK"),
    (b'N', b"ACGTMRWSYKVHDB"),
];
17 | 
// RNA ambiguity codes: same table as for DNA, with T replaced by U.
// Each entry is (ambiguity code, symbols matched by this code).
pub static AMBIG_RNA: &[(u8, &[u8])] = &[
    (b'M', b"AC"),
    (b'R', b"AG"),
    (b'W', b"AU"),
    (b'S', b"CG"),
    (b'Y', b"CU"),
    (b'K', b"GU"),
    (b'V', b"ACGMRS"),
    (b'H', b"ACUMWY"),
    (b'D', b"AGURWK"),
    (b'B', b"CGUSYK"),
    (b'N', b"ACGUMRWSYKVHDB"),
];
32 | 
// Protein ambiguity codes according to IUPAC
// (https://iupac.qmul.ac.uk/AminoAcid/A2021.html#AA212).
// Each entry is (ambiguity code, symbols matched by this code).
pub static AMBIG_PROTEIN: &[(u8, &[u8])] = &[
    (b'B', b"DN"),
    // note: B and Z are matched by X as well
    (b'X', b"ARNDCEQGHILKMFPSTWYVBZ"),
    (b'Z', b"EQ"),
];
40 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: Run tests
 2 | 
 3 | on:
 4 |   pull_request:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | # Make sure CI fails on all warnings, including Clippy lints
10 | # env:
11 | #   RUSTFLAGS: "-Dwarnings"
12 | 
13 | jobs:
14 |   test:
15 |     name: test
16 |     runs-on: ${{ matrix.os }}
17 |     strategy:
18 |       matrix:
19 |         build: [linux, osx, win-msvc, win-gnu]
20 |         include:
21 |         - build: linux
22 |           os: ubuntu-latest
23 |           rust: stable
24 |         - build: osx
25 |           os: macos-latest
26 |           rust: stable
27 |         - build: win-msvc
28 |           os: windows-latest
29 |           rust: stable
30 |         - build: win-gnu
31 |           os: windows-latest
32 |           rust: stable-x86_64-gnu
33 |     env:
34 |       RUSTFLAGS: ${{ matrix.build == 'win-msvc' && '-C debuginfo=0 -C link-args=/DEBUG:NONE' || '' }}
35 |   
36 |     steps:
37 |     - name: Checkout repository
38 |       uses: actions/checkout@v5
39 |     - name: Install Rust
40 |       uses: dtolnay/rust-toolchain@master
41 |       with:
42 |         toolchain: ${{ matrix.rust }}
43 |     - run: cargo build --verbose --jobs 1
44 |     - run: cargo test --verbose --jobs 1
45 | 


--------------------------------------------------------------------------------
/src/test/pass.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
// Runs the `pass` command and `.` (presumably its short alias; confirm in the
// CLI definition) on the same FASTA input, with the unchanged input given as
// last argument to `cmp` (presumably the expected output; see test helpers).
#[test]
fn pass() {
    cmp(&["pass"], &*FASTA, &FASTA);
    cmp(&["."], &*FASTA, &FASTA);
}
 8 | 
 9 | #[test]
10 | fn append() {
11 |     with_tmpdir("st_pass_append_", |td| {
12 |         let fa = ">seq\nATGC\n";
13 |         let out = td.path("pass_append_out.fasta");
14 |         succeeds(&["pass", "--append", "-o", &out], fa);
15 |         assert_eq!(&out.content(), fa);
16 |         succeeds(&["pass", "--append", "-o", &out], fa);
17 |         assert_eq!(&out.content(), &(fa.to_string() + fa));
18 |         succeeds(&["pass", "--append", "-o", &out], fa);
19 |         assert_eq!(&out.content(), &(fa.to_string() + fa + fa));
20 |     });
21 | }
22 | 
// FASTA reading/writing with and without line wrapping of the sequence.
#[test]
fn fasta_io() {
    let single_line = ">seq\nATGC\n";
    let wrapped_at_2 = ">seq\nAT\nGC\n";
    let wrapped_at_3 = ">seq\nATG\nC\n";

    // without `--wrap`, the sequence is written on a single line,
    // regardless of how the input was wrapped
    cmp(&["."], single_line, single_line);
    cmp(&["."], wrapped_at_2, single_line);
    // `--wrap N` gives sequence lines of at most N characters
    cmp(&[".", "--wrap", "2"], single_line, wrapped_at_2);
    cmp(&[".", "--wrap", "3"], wrapped_at_2, wrapped_at_3);
}
34 | 
// Two `.` commands combined with `cmp_pipe` (presumably the output of the
// first is piped into the second; confirm in the test helpers), using the same
// FASTA data for both the second and the fourth argument.
#[test]
fn pass_pipe() {
    cmp_pipe(&["."], &FASTA, &["."], &FASTA);
}
39 | 
// Same as the `pass` test, but with the `-T` and `--write-thread` flags
// (presumably reading/writing in background threads; see the CLI options).
#[test]
fn thread_io() {
    cmp(&[".", "-T", "--write-thread"], &*FASTA, &FASTA);
}
44 | 


--------------------------------------------------------------------------------
/src/var/modules/expr/js/mod.rs:
--------------------------------------------------------------------------------
 1 | use crate::helpers::DefaultHashMap as HashMap;
 2 | use crate::var::VarBuilder;
 3 | 
 4 | use self::parser::SimpleAst;
 5 | 
 6 | use super::{ExprContext, Expression, Var};
 7 | 
 8 | mod expr;
 9 | pub mod parser;
10 | 
11 | pub use self::expr::*;
12 | 
13 | pub fn replace_register_vars(
14 |     ast: &SimpleAst,
15 |     b: &mut VarBuilder,
16 | ) -> Result<(String, Vec<Var>), String> {
17 |     let mut vars = HashMap::default();
18 |     let new_code = ast.rewrite(|func| {
19 |         b.register_var(func.name, func.args()).map(|res| {
20 |             res.map(|(symbol_id, _)| {
21 |                 // get unique placeholder variable name (function arguments are hashed)
22 |                 let js_varname = if func.args().is_empty() {
23 |                     func.name.to_string()
24 |                 } else {
25 |                     format!("{}_{}", func.name, symbol_id)
26 |                 };
27 |                 vars.insert(symbol_id, js_varname.clone());
28 |                 js_varname
29 |             })
30 |         })
31 |     })?;
32 |     // dbg!(ast, &new_code);
33 |     Ok((
34 |         new_code,
35 |         vars.into_iter()
36 |             .map(|(symbol_id, name)| Var { symbol_id, name })
37 |             .collect(),
38 |     ))
39 | }
40 | 


--------------------------------------------------------------------------------
/src/cmd/find/matcher/exact.rs:
--------------------------------------------------------------------------------
 1 | use memchr::memmem::Finder;
 2 | 
 3 | use super::{Hit, Match, Matcher};
 4 | 
 5 | #[derive(Debug, Clone)]
 6 | pub struct ExactMatcher {
 7 |     finder: Finder<'static>,
 8 |     pattern_len: usize,
 9 | }
10 | 
11 | impl ExactMatcher {
12 |     pub fn new(pattern: &[u8]) -> Self {
13 |         Self {
14 |             finder: Finder::new(pattern).into_owned(),
15 |             pattern_len: pattern.len(),
16 |         }
17 |     }
18 | }
19 | 
20 | impl Matcher for ExactMatcher {
21 |     fn has_matches(&self, text: &[u8]) -> Result<bool, String> {
22 |         Ok(self.finder.find_iter(text).next().is_some())
23 |     }
24 | 
25 |     fn do_search(
26 |         &mut self,
27 |         text: &[u8],
28 |         func: &mut dyn FnMut(&dyn Hit) -> Result<bool, String>,
29 |     ) -> Result<(), String> {
30 |         for start in self.finder.find_iter(text) {
31 |             if !func(&(start, start + self.pattern_len))? {
32 |                 break;
33 |             }
34 |         }
35 |         Ok(())
36 |     }
37 | }
38 | 
39 | impl Hit for (usize, usize) {
40 |     fn get_group(&self, group: usize, out: &mut Match) -> Result<(), String> {
41 |         debug_assert!(group == 0);
42 |         out.start = self.0;
43 |         out.end = self.1;
44 |         Ok(())
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/scripts/parse_varhelp.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Reads text from STDIN and prints a Markdown version of it to STDOUT
# (judging from the file name, the input is the variable help of the tool;
# confirm with the caller of this script):
# - a line followed by a '---' underline becomes a '###' heading
# - the lines after such a heading are converted to a 'variable | description'
#   table if the first of them starts with a lowercase letter
# - 'Example...' lines become '####' headings
# - lines starting with '> ' become bash code blocks

import sys
import re

# The output lags one line behind the input: `prev_line` holds the text that is
# printed next, which allows modifying it based on the current line.
prev_line = ''
in_table = False
for line in sys.stdin:
    if line.startswith('---'):
        # underline: the previous line is a heading; everything following
        # 'Usage: ' in that line (if present) is wrapped in backticks
        prev_line = '### {}\n'.format(re.sub('(.+?Usage: )(.*)', '\\1`\\2`', prev_line))
        # the underline itself is dropped, continue with the line after it
        # NOTE(review): raises StopIteration if the underline is the last line
        line = next(sys.stdin)
        if line[0].islower():
            # table header; the closing '|' + newline is added with the first row
            prev_line = prev_line + '\n| variable | description |\n| - | - '
            in_table = True

    if in_table:
        # NOTE(review): lines obtained by iterating over sys.stdin keep their
        # trailing newline, so `len(line) == 0` does not appear to be true for
        # blank lines; they are handled by the `else` branch instead (which
        # closes the current table row). Confirm before changing this.
        if line.startswith('Example') or len(line) == 0:
            in_table = False
        else:
            #line = line[1:]
            if line[0] != ' ':
                # not indented: finish the previous row and start a new one,
                # the first word is the variable name, the rest the description
                prev_line += '|\n'
                s = line.strip().split(' ', 1)
                if len(s) == 2:
                    name, desc = s
                    line = '| {} | {} '.format(name, desc)
            else:
                # indented: continuation of the description of the current row
                prev_line += ' ' + line.strip()
                continue

    if line.startswith('Example'):
        line = '#### {}\n\n'.format(line.strip())

    elif line.startswith('> '):
        # command line example: strip the '> ' prefix
        line = '\n```bash\n{}```\n\n'.format(line[2:])

    print(prev_line, end='')
    prev_line = line

# flush the last pending line
print(prev_line, end='')
41 | 


--------------------------------------------------------------------------------
/src/cmd/stat.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::{CommonArgs, WORDY_HELP};
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::io::output::OutFormat;
 7 | 
 8 | use super::pass::{self, PassCommand};
 9 | 
// Text shown by clap before the generated help (see `before_help` below).
pub const DESC: &str = "\
Sequence statistics variables (seqlen, exp_err, charcount(...), etc.)
are supplied as comma-delimited list, e.g. `id,seqlen,exp_err`.
The stat command is equivalent to `st pass --to-tsv 'id,var1,var2,...' input`

See `st stat -V/--help-vars` for a list of all possible variables.";

// Command-line arguments of the 'stat' command.
// NOTE: the `///` comments on the fields double as the help text that clap
// shows to the user, so they are part of the program's output.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Stat' command options")]
#[clap(before_help=DESC, help_template=WORDY_HELP)]
pub struct StatCommand {
    /// Comma delimited list of statistics variables.
    #[arg(value_name = "VAR")]
    vars: String,

    // Arguments common to the commands (flattened into this parser).
    #[command(flatten)]
    pub common: CommonArgs,
}
28 | 
29 | pub fn run(mut cfg: Config, args: StatCommand) -> CliResult<()> {
30 |     let cmd = PassCommand {
31 |         common: args.common,
32 |     };
33 |     let fields = "id,".to_string() + &args.vars;
34 |     cfg.output_config.format = OutFormat::DelimitedText {
35 |         fields,
36 |         delim: b'\t',
37 |     };
38 |     pass::run(cfg, cmd)
39 | }
40 | 


--------------------------------------------------------------------------------
/src/test/compress.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
// For every compression format enabled as Cargo feature: the first command
// writes compressed FASTA (`--to fasta.<ext>`, compression level 9) and the
// second one reads it with the matching `--fmt fasta.<ext>`.
// (`cmp_pipe` presumably pipes the output of the first command into the second
// and compares the final output with the last argument; see test helpers.)
#[test]
fn compress_pipe() {
    #[cfg(feature = "gz")]
    cmp_pipe(
        &[".", "--to", "fasta.gz", "--compr-level", "9"],
        &FASTA,
        &[".", "--fmt", "fasta.gz"],
        &FASTA,
    );

    #[cfg(feature = "bz2")]
    cmp_pipe(
        &[".", "--to", "fasta.bz2", "--compr-level", "9"],
        &FASTA,
        &[".", "--fmt", "fasta.bz2"],
        &FASTA,
    );

    #[cfg(feature = "lz4")]
    cmp_pipe(
        &[".", "--to", "fasta.lz4", "--compr-level", "9"],
        &FASTA,
        &[".", "--fmt", "fasta.lz4"],
        &FASTA,
    );

    #[cfg(feature = "zstd")]
    cmp_pipe(
        &[".", "--to", "fasta.zst", "--compr-level", "9"],
        &FASTA,
        &[".", "--fmt", "fasta.zst"],
        &FASTA,
    );
}
37 | 
// Writes FASTA to a file with `.fa.gz` extension (without specifying a format)
// and reads it back in different ways. Only compiled with the `gz` feature.
#[test]
#[cfg(feature = "gz")]
fn compress_file() {
    with_tmpdir("st_compress_", |td| {
        let f = td.path("compr_out.fa.gz");
        succeeds(&[".", "-o", &f], &*FASTA);
        // reading as plain FASTA is expected to fail with a parse error
        // (presumably because the file content is compressed)
        fails(&[".", "--fmt", "fasta"], &f, "FASTA parse error");
        // no format specified, and the explicitly specified `fasta.gz` format
        cmp(&["."], &f, &FASTA);
        cmp(&[".", "--fmt", "fasta.gz"], &f, &FASTA);
    });
}
49 | 


--------------------------------------------------------------------------------
/src/io/input/reader.rs:
--------------------------------------------------------------------------------
 1 | use crate::error::CliResult;
 2 | use crate::io::Record;
 3 | 
 4 | /// Trait for reading sequence records
 5 | pub trait SeqReader {
 6 |     /// Reads the next record and provides it in a closure.
 7 |     /// The closure may return `false` to indicate that reading should stop.
 8 |     /// Returns `Some(Ok(do_stop))` if a record was found, otherwise `None`
 9 |     fn read_next_conditional(
10 |         &mut self,
11 |         func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
12 |     ) -> Option<CliResult<bool>>;
13 | 
14 |     /// Reads the next record and returns `true` if it was found.
15 |     /// There is no way the closure can signal back that the reading should stop.
16 |     fn read_next(&mut self, func: &mut dyn FnMut(&dyn Record) -> CliResult<()>) -> CliResult<bool> {
17 |         self.read_next_conditional(&mut |rec| func(rec).map(|_| true))
18 |             .unwrap_or(Ok(false))
19 |     }
20 | }
21 | 
22 | impl<'a> SeqReader for Box<dyn SeqReader + 'a> {
23 |     fn read_next_conditional(
24 |         &mut self,
25 |         func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
26 |     ) -> Option<CliResult<bool>> {
27 |         (**self).read_next_conditional(func)
28 |     }
29 | 
30 |     fn read_next(&mut self, func: &mut dyn FnMut(&dyn Record) -> CliResult<()>) -> CliResult<bool> {
31 |         (**self).read_next(func)
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/src/io/output/fastx.rs:
--------------------------------------------------------------------------------
 1 | use std::str::FromStr;
 2 | 
 3 | use crate::error::CliResult;
 4 | use crate::var::{attr::AttrWriteAction, varstring::VarString, VarBuilder};
 5 | 
 6 | #[derive(Debug, Clone, Eq, PartialEq)]
 7 | pub struct Attribute {
 8 |     pub name: String,
 9 |     pub value: String,
10 | }
11 | 
12 | impl FromStr for Attribute {
13 |     type Err = String;
14 | 
15 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
16 |         let mut parts = s.splitn(2, '=');
17 |         let name = parts.next().unwrap().to_string();
18 |         let value = match parts.next() {
19 |             Some(p) => p.to_string(),
20 |             None => {
21 |                 return Err(format!(
22 |                     "Invalid attribute: '{name}'. Attributes need to be in the format: name=value"
23 |                 ))
24 |             }
25 |         };
26 |         Ok(Attribute { name, value })
27 |     }
28 | }
29 | 
30 | pub fn register_attributes(attrs: &[(Attribute, bool)], builder: &mut VarBuilder) -> CliResult<()> {
31 |     for (attr, replace_existing) in attrs {
32 |         let (vs, _) = VarString::parse_register(&attr.value, builder, false)?;
33 |         let action = if *replace_existing {
34 |             AttrWriteAction::Edit(vs)
35 |         } else {
36 |             AttrWriteAction::Append(vs)
37 |         };
38 |         builder.register_attr(&attr.name, Some(action))?;
39 |     }
40 |     Ok(())
41 | }
42 | 


--------------------------------------------------------------------------------
/src/helpers/vec_buf.rs:
--------------------------------------------------------------------------------
 1 | /// A "factory" of vectors that determines the best initial capacity in a quite
 2 | /// simple (not very sophisticated) way.
 3 | /// These vectors are intended to be used as buffers, to which many rounds of writing
 4 | /// done (using `io::Write::write_all()`).
 5 | /// In each writing round, `write_fn` may issue many repeated `write_all()`, so the capacity
 6 | /// is not easy to manage.
 7 | /// The needed capacity is recalculated in regular intervals to make sure
 8 | /// that the vectors do not use too much memory.
 9 | #[derive(Debug, Default)]
10 | pub struct VecFactory {
11 |     minlen: usize,
12 |     maxlen: usize,
13 |     counter: u16,
14 | }
15 | 
16 | impl VecFactory {
17 |     pub fn new() -> VecFactory {
18 |         Self::default()
19 |     }
20 | 
21 |     pub fn get<F, E>(&mut self, mut write_fn: F) -> Result<Vec<u8>, E>
22 |     where
23 |         F: FnMut(&mut Vec<u8>) -> Result<(), E>,
24 |     {
25 |         if self.counter >= 1000 {
26 |             self.maxlen = self.minlen;
27 |             self.counter = 0;
28 |         }
29 |         let mut v = Vec::with_capacity(self.maxlen);
30 |         write_fn(&mut v)?;
31 |         v.shrink_to_fit();
32 |         if v.len() > self.maxlen {
33 |             self.maxlen = v.len();
34 |             self.counter += 1;
35 |         } else if v.len() < self.minlen {
36 |             self.minlen = v.len();
37 |         }
38 |         Ok(v)
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/scripts/validate_features.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script runs the compilation and unit tests for each individual feature
 3 | 
 4 | set -euo pipefail
 5 | 
 6 | features=( \
 7 |     pass,gz \
 8 |     pass,lz4 \
 9 |     pass,zstd \
10 |     pass,bz2 \
11 |     expr \
12 |     all-commands \
13 |     all-commands,expr \
14 |     pass \
15 |     pass,regex-fast \
16 |     view \
17 |     count \
18 |     stat \
19 |     head \
20 |     tail \
21 |     slice \
22 |     sample \
23 |     sort \
24 |     unique \
25 |     filter,expr \
26 |     split \
27 |     cmp \
28 |     interleave \
29 |     find \
30 |     find,regex-fast \
31 |     replace \
32 |     replace,regex-fast \
33 |     del \
34 |     set \
35 |     trim \
36 |     mask \
37 |     upper \
38 |     lower \
39 |     revcomp \
40 |     concat \
41 | )
42 | 
43 | cores=8
44 | 
45 | echo "===== NO features ======================"
46 | echo -n "build... "
47 | cargo build -q -j $cores --no-default-features
48 | echo "test..."
49 | cargo test -q -j $cores --no-default-features
50 | 
51 | echo "===== Default features ======================"
52 | echo -n "build... "
53 | cargo build -q -j $cores
54 | echo "test..."
55 | cargo test -q -j $cores
56 | 
57 | # single feature
58 | for feature in ${features[@]}; do
59 |     echo "===== Feature(s) '$feature' ======================"
60 |     echo -n "build... "
61 |     cargo build -q -j $cores --no-default-features --features=$feature
62 |     echo "test..."
63 |     cargo test -q -j $cores --no-default-features --features=$feature
64 | done
65 | 


--------------------------------------------------------------------------------
/src/io/output/writer.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use crate::context::{RecordMeta, SeqContext};
 4 | use crate::error::CliResult;
 5 | use crate::io::{QualConverter, Record};
 6 | 
 7 | pub trait SeqFormatter {
 8 |     /// Write a formatted record to `out`, given the metadata in `ctx`.
 9 |     /// This is a convenience wrapper around `write_with`, which allows directly
10 |     /// providing `SeqContext`.
11 |     fn write(
12 |         &mut self,
13 |         record: &dyn Record,
14 |         out: &mut dyn io::Write,
15 |         ctx: &mut SeqContext,
16 |     ) -> CliResult<()> {
17 |         self.write_with(record, &ctx.meta[0], out, &mut ctx.qual_converter)
18 |     }
19 | 
20 |     /// Write a formatted record to `out`, given all necessary metadata.
21 |     fn write_with(
22 |         &mut self,
23 |         record: &dyn Record,
24 |         data: &RecordMeta,
25 |         out: &mut dyn io::Write,
26 |         qc: &mut QualConverter,
27 |     ) -> CliResult<()>;
28 | }
29 | 
30 | impl<W: SeqFormatter + ?Sized> SeqFormatter for Box<W> {
31 |     fn write(
32 |         &mut self,
33 |         record: &dyn Record,
34 |         out: &mut dyn io::Write,
35 |         ctx: &mut SeqContext,
36 |     ) -> CliResult<()> {
37 |         (**self).write(record, out, ctx)
38 |     }
39 | 
40 |     fn write_with(
41 |         &mut self,
42 |         record: &dyn Record,
43 |         data: &RecordMeta,
44 |         out: &mut dyn io::Write,
45 |         qc: &mut QualConverter,
46 |     ) -> CliResult<()> {
47 |         (**self).write_with(record, data, out, qc)
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/io/output/fasta.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use seq_io::fasta;
 4 | 
 5 | use crate::context::RecordMeta;
 6 | use crate::error::CliResult;
 7 | use crate::io::QualConverter;
 8 | use crate::var::VarBuilder;
 9 | 
10 | use crate::io::{
11 |     output::{fastx::register_attributes, SeqFormatter},
12 |     Record,
13 | };
14 | 
15 | use super::fastx::Attribute;
16 | 
17 | pub struct FastaWriter {
18 |     wrap: Option<usize>,
19 | }
20 | 
21 | impl FastaWriter {
22 |     pub fn new(
23 |         wrap: Option<usize>,
24 |         attrs: &[(Attribute, bool)],
25 |         builder: &mut VarBuilder,
26 |     ) -> CliResult<Self> {
27 |         register_attributes(attrs, builder)?;
28 |         Ok(Self { wrap })
29 |     }
30 | }
31 | 
32 | impl SeqFormatter for FastaWriter {
33 |     fn write_with(
34 |         &mut self,
35 |         record: &dyn Record,
36 |         data: &RecordMeta,
37 |         out: &mut dyn io::Write,
38 |         _qc: &mut QualConverter,
39 |     ) -> CliResult<()> {
40 |         write_fasta(record, data, out, self.wrap)
41 |     }
42 | }
43 | 
44 | fn write_fasta<W: io::Write>(
45 |     record: &dyn Record,
46 |     data: &RecordMeta,
47 |     mut out: W,
48 |     wrap: Option<usize>,
49 | ) -> CliResult<()> {
50 |     out.write_all(b">")?;
51 |     data.attrs.write_head(record, &mut out, &data.symbols)?;
52 |     out.write_all(b"\n")?;
53 |     if let Some(w) = wrap {
54 |         fasta::write_wrap_seq_iter(&mut out, record.seq_segments(), w)?;
55 |     } else {
56 |         fasta::write_seq_iter(&mut out, record.seq_segments())?;
57 |     }
58 |     Ok(())
59 | }
60 | 


--------------------------------------------------------------------------------
/src/cmd/del.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::io::HeaderRecord;
 7 | use crate::var::attr;
 8 | 
// Command-line arguments of the 'del' command.
// NOTE: the `///` comments on the fields double as the help text that clap
// shows to the user, so they are part of the program's output.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Del' command options")]
pub struct DelCommand {
    /// Delete description fields
    #[arg(short, long)]
    desc: bool,

    /// Delete attributes
    // (several names can be supplied as comma-delimited list)
    #[arg(long, value_delimiter = ',')]
    attrs: Option<Vec<String>>,

    // Arguments common to the commands (flattened into this parser).
    #[command(flatten)]
    pub common: CommonArgs,
}
23 | 
24 | pub fn run(mut cfg: Config, args: DelCommand) -> CliResult<()> {
25 |     let del_desc = args.desc;
26 |     let del_attrs = args.attrs.as_deref();
27 | 
28 |     let mut format_writer = cfg.get_format_writer()?;
29 |     cfg.with_io_writer(|io_writer, mut cfg| {
30 |         if let Some(attrs) = del_attrs {
31 |             cfg.build_vars(|b| {
32 |                 for attr in attrs {
33 |                     b.register_attr(attr, Some(attr::AttrWriteAction::Delete))?;
34 |                 }
35 |                 Ok::<_, String>(())
36 |             })?;
37 |         }
38 | 
39 |         cfg.read(|record, ctx| {
40 |             if del_desc {
41 |                 let id = record.id();
42 |                 let record = HeaderRecord::new(&record, id, None);
43 |                 format_writer.write(&record, io_writer, ctx)?;
44 |             } else {
45 |                 format_writer.write(&record, io_writer, ctx)?;
46 |             }
47 |             Ok(true)
48 |         })
49 |     })
50 | }
51 | 


--------------------------------------------------------------------------------
/src/io/output/csv.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use super::{Record, SeqFormatter};
 4 | use crate::context::RecordMeta;
 5 | use crate::io::QualConverter;
 6 | use crate::var::{varstring, VarBuilder};
 7 | use crate::{error::CliResult, var::varstring::register_var_list};
 8 | 
/// Comma-delimited list of fields: ID, description and sequence
/// (presumably used if the user does not specify the output fields; see callers).
pub const DEFAULT_OUTFIELDS: &str = "id,desc,seq";
10 | 
11 | pub struct CsvWriter {
12 |     delim: u8,
13 |     fields: Vec<varstring::VarString>,
14 | }
15 | 
16 | impl CsvWriter {
17 |     pub fn new(field_list: &str, delim: u8, builder: &mut VarBuilder) -> CliResult<CsvWriter> {
18 |         let mut out = Self {
19 |             delim,
20 |             fields: vec![],
21 |         };
22 | 
23 |         // progressively parse fields; this is necessary because there can be
24 |         // commas in functions as well
25 |         register_var_list(field_list, builder, &mut out.fields, None, true, true)?;
26 |         Ok(out)
27 |     }
28 | }
29 | 
30 | impl SeqFormatter for CsvWriter {
31 |     // #[inline]
32 |     // fn has_vars(&self) -> bool {
33 |     //     !self.fields.is_empty()
34 |     // }
35 | 
36 |     fn write_with(
37 |         &mut self,
38 |         record: &dyn Record,
39 |         data: &RecordMeta,
40 |         out: &mut dyn io::Write,
41 |         _qc: &mut QualConverter,
42 |     ) -> CliResult<()> {
43 |         let mut is_first = true;
44 |         for expr in &self.fields {
45 |             if !is_first {
46 |                 write!(out, "{}", self.delim as char)?;
47 |             }
48 |             is_first = false;
49 |             expr.compose(out, &data.symbols, record)?;
50 |         }
51 |         writeln!(out)?;
52 |         Ok(())
53 |     }
54 | }
55 | 


--------------------------------------------------------------------------------
/src/io/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::convert::Infallible;
 2 | use std::fmt;
 3 | use std::path::{Path, PathBuf};
 4 | use std::str::FromStr;
 5 | 
 6 | pub use self::format::*;
 7 | pub use self::qual_format::*;
 8 | pub use self::record::*;
 9 | 
10 | mod format;
11 | pub mod input;
12 | pub mod output;
13 | mod qual_format;
14 | mod record;
15 | 
/// The default format variant (FASTA)
pub const DEFAULT_FORMAT: FormatVariant = FormatVariant::Fasta;

/// Default size of the buffer of I/O readers: `1 << 22` = 4194304
/// (presumably bytes, i.e. 4 MiB; confirm where the constant is used)
pub const DEFAULT_IO_READER_BUFSIZE: usize = 1 << 22;
/// Default size of the buffer of I/O writers: `1 << 22` = 4194304
/// (presumably bytes, i.e. 4 MiB; confirm where the constant is used)
pub const DEFAULT_IO_WRITER_BUFSIZE: usize = 1 << 22;
20 | 
/// Source or destination of data: either standard input/output (`-`) or a file
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum IoKind {
    Stdio,
    File(PathBuf),
}

impl IoKind {
    /// Creates an `IoKind` from a path, with `-` being interpreted as
    /// standard input/output.
    /// Fails for paths that are not valid UTF-8.
    pub fn from_path<P: AsRef<Path>>(p: P) -> Result<Self, String> {
        let path = p.as_ref();
        match path.to_str() {
            Some(s) => Ok(Self::from(s)),
            None => Err(format!("Invalid path: '{}'", path.to_string_lossy())),
        }
    }
}

impl FromStr for IoKind {
    type Err = Infallible;

    /// `-` is parsed as `Stdio`, everything else is taken as file path.
    /// This conversion never fails.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "-" => Self::Stdio,
            path => Self::File(PathBuf::from(path)),
        })
    }
}

impl<S> From<S> for IoKind
where
    S: AsRef<str>,
{
    /// Same as `from_str`
    fn from(s: S) -> Self {
        match s.as_ref().parse::<Self>() {
            Ok(kind) => kind,
            // the error type has no values, parsing cannot fail
            Err(never) => match never {},
        }
    }
}

impl fmt::Display for IoKind {
    /// Writes `-` for standard input/output, otherwise the path
    /// (lossily converted to a string).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::Stdio => f.write_str("-"),
            Self::File(path) => f.write_str(&path.to_string_lossy()),
        }
    }
}
67 | 


--------------------------------------------------------------------------------
/src/cmd/tail.rs:
--------------------------------------------------------------------------------
 1 | use std::cmp::max;
 2 | 
 3 | use clap::{value_parser, Parser};
 4 | 
 5 | use crate::cli::{CommonArgs, WORDY_HELP};
 6 | use crate::config::Config;
 7 | use crate::error::CliResult;
 8 | 
// Text shown by clap before the generated help (see `before_help` below).
pub const DESC: &str = "\
This only works for files (not STDIN), since records are counted in a first
step, and only returned after reading a second time.";

// Command-line arguments of the 'tail' command.
// NOTE: the `///` comments on the fields double as the help text that clap
// shows to the user, so they are part of the program's output.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Tail' command options")]
#[clap(before_help=DESC, help_template=WORDY_HELP)]
pub struct TailCommand {
    /// Number of sequences to return
    // (defaults to 10; the value parser only accepts numbers >= 1)
    #[arg(short, long, value_name = "N", default_value_t = 10, value_parser = value_parser!(u64).range(1..))]
    num_seqs: u64,

    // Arguments common to the commands (flattened into this parser).
    #[command(flatten)]
    pub common: CommonArgs,
}
24 | 
25 | pub fn run(mut cfg: Config, args: TailCommand) -> CliResult<()> {
26 |     let n_select = args.num_seqs;
27 | 
28 |     if cfg.has_stdin() {
29 |         return fail!("Cannot use STDIN as input, since we need to count all sequences before");
30 |     }
31 | 
32 |     let mut format_writer = cfg.get_format_writer()?;
33 |     cfg.with_io_writer(|io_writer, mut cfg| {
34 |         // first count the sequences
35 |         // TODO: use .fai files once supported?
36 |         let mut n = 0;
37 | 
38 |         cfg.read(|_, _| {
39 |             n += 1;
40 |             Ok(true)
41 |         })?;
42 | 
43 |         let mut i = 0;
44 |         let select_from = max(n, n_select) - n_select;
45 | 
46 |         cfg.read(|record, ctx| {
47 |             i += 1;
48 |             if i > select_from {
49 |                 format_writer.write(&record, io_writer, ctx)?;
50 |             }
51 |             Ok(true)
52 |         })
53 |     })
54 | }
55 | 


--------------------------------------------------------------------------------
/var_provider/src/usage.rs:
--------------------------------------------------------------------------------
 1 | //! Types and functions providing/handling variable/function usage information
 2 | 
 3 | use itertools::Itertools;
 4 | 
 5 | use crate::VarType;
 6 | 
 7 | #[cold]
 8 | pub(crate) fn usage_list(info: &FuncUsage) -> Vec<String> {
 9 |     let n_args = info.args.len();
10 |     let n_required = info
11 |         .args
12 |         .iter()
13 |         .position(|arg| arg.default_value.is_some())
14 |         .unwrap_or(info.args.len());
15 |     let mut out = Vec::with_capacity(1 + n_args - n_required);
16 |     if n_required == 0 {
17 |         out.push(info.name.to_string());
18 |     }
19 |     if n_args > 0 {
20 |         for i in n_required.clamp(1, n_args)..n_args + 1 {
21 |             out.push(format!(
22 |                 "{}({})",
23 |                 info.name,
24 |                 info.args[..i].iter().map(|u| u.name).join(", ")
25 |             ));
26 |         }
27 |     }
28 |     // dbg!(info, n_args, n_required, &out);
29 |     out
30 | }
31 | 
/// Static usage information about a single variable/function.
#[derive(Debug)]
pub struct FuncUsage {
    /// Name of the variable/function
    pub name: &'static str,
    // multiple argument collections possible
    // (different usage patterns)
    // NOTE(review): the field holds a single argument list; `usage_list` derives the
    // different usage patterns from it by treating the arguments from the first one
    // with a default value onwards as optional.
    pub args: &'static [ArgUsage],
    /// Description text (presumably shown on the help page — confirm at the call sites)
    pub description: &'static str,
    /// Type of the returned value, if specified
    pub output_type: Option<VarType>,
    /// Hidden flag (presumably excludes the entry from the help output — confirm)
    pub hidden: bool,
}
42 | 
/// Usage information about a single function argument.
#[derive(Debug)]
pub struct ArgUsage {
    /// Name of the argument
    pub name: &'static str,
    // the default value is always specified as &str in the usage string (for the help page), even
    // though from_func() will parse it further
    /// Default value as text; `usage_list` treats arguments with `Some(..)` as optional
    pub default_value: Option<&'static str>,
}
50 | 
/// A usage example: a description, a command and optionally its output.
#[derive(Debug)]
pub struct UsageExample {
    /// Text describing the example
    pub description: &'static str,
    /// The example command
    pub command: &'static str,
    /// Output of the command, if any is given
    pub output: Option<&'static str>,
}
57 | 


--------------------------------------------------------------------------------
/src/io/input/fastx.rs:
--------------------------------------------------------------------------------
 1 | use std::cell::Cell;
 2 | 
 3 | use memchr::memchr;
 4 | use seq_io::policy::BufPolicy;
 5 | 
/// Splits FASTA/FASTQ header lines into ID and description, caching the
/// position of the delimiting space.
#[derive(Default, Clone, Debug)]
pub struct FastxHeaderParser {
    // Outer `None`: the header has not been searched yet.
    // `Some(None)`: searched, no space found.
    // `Some(Some(i))`: the first space is at byte offset `i`.
    delim_pos: Cell<Option<Option<usize>>>,
}
10 | 
11 | impl FastxHeaderParser {
12 |     // #[inline(always)]
13 |     pub fn id_desc<'a>(&self, head: &'a [u8]) -> (&'a [u8], Option<&'a [u8]>) {
14 |         if self.delim_pos.get().is_none() {
15 |             self.delim_pos.set(Some(memchr(b' ', head)));
16 |         }
17 |         Self::_split_header(head, self.delim_pos.get().unwrap())
18 |     }
19 | 
20 |     fn _split_header(head: &[u8], delim: Option<usize>) -> (&[u8], Option<&[u8]>) {
21 |         if let Some(d) = delim {
22 |             let (id, desc) = head.split_at(d);
23 |             (id, Some(&desc[1..]))
24 |         } else {
25 |             (head, None)
26 |         }
27 |     }
28 | 
29 |     pub fn parsed_id_desc<'a>(&self, head: &'a [u8]) -> Option<(&'a [u8], Option<&'a [u8]>)> {
30 |         self.delim_pos.get().map(|d| Self::_split_header(head, d))
31 |     }
32 | 
33 |     pub fn delim_pos(&self) -> Option<Option<usize>> {
34 |         self.delim_pos.get()
35 |     }
36 | 
37 |     pub fn set_delim_pos(&self, delim_pos: Option<Option<usize>>) {
38 |         self.delim_pos.set(delim_pos);
39 |     }
40 | }
41 | 
/// Buffer growth policy: doubling for small buffers, fixed-size steps for
/// larger ones, no growth once the limit is reached.
#[derive(Clone)]
pub struct LimitedBuffer {
    /// Buffers smaller than this size are doubled; it is also the size of the
    /// growth step used for larger buffers.
    pub double_until: usize,
    /// Buffers of at least this size (and of at least `double_until`) are not
    /// grown further.
    pub limit: usize,
}
47 | 
48 | impl BufPolicy for LimitedBuffer {
49 |     fn grow_to(&mut self, current_size: usize) -> Option<usize> {
50 |         if current_size < self.double_until {
51 |             Some(current_size * 2)
52 |         } else if current_size < self.limit {
53 |             Some(current_size + self.double_until)
54 |         } else {
55 |             None
56 |         }
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/var/modules/expr/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::borrow::Cow;
 2 | use std::fmt::Debug;
 3 | use std::fs::read_to_string;
 4 | 
 5 | use crate::io::Record;
 6 | use crate::var::symbols::{OptValue, SymbolTable};
 7 | 
 8 | mod expressions;
 9 | pub mod js;
10 | mod var_provider;
11 | 
12 | pub use self::var_provider::*;
13 | 
/// A variable that is made known to an expression context
/// (see `ExprContext::register`).
#[derive(Debug, Default, Clone)]
pub struct Var {
    /// ID of the associated symbol (presumably an index into the `SymbolTable` — confirm)
    pub symbol_id: usize,
    /// Name of the variable
    pub name: String,
}
19 | 
/// General trait used for registering/evaluating expressions, which
/// can be implemented for different expression engines
pub trait Expression: Default + Debug {
    /// The engine-specific context type that is passed to `register` and `eval`
    type Context: ExprContext;

    /// Registers the expression source `expr` under the given `expr_id`.
    /// Errors are reported as message strings.
    fn register(
        &mut self,
        expr_id: usize,
        expr: &str,
        ctx: &mut Self::Context,
    ) -> Result<(), String>;

    /// Evaluates the expression, writing the result to `out`.
    /// Errors are reported as message strings.
    fn eval(&mut self, out: &mut OptValue, ctx: &mut Self::Context) -> Result<(), String>;
}
34 | 
/// Context in which expressions are registered and evaluated
/// (the `Context` type of an [`Expression`]).
pub trait ExprContext: Default {
    /// Initializes the context, optionally with some initialization code.
    fn init(&mut self, init_code: Option<&str>) -> Result<(), String>;

    /// Hands the next record together with the symbol table to the context.
    /// NOTE(review): the `usize` in the error tuple is presumably the ID of the
    /// offending symbol/expression — confirm with the implementors.
    fn next_record(
        &mut self,
        symbols: &SymbolTable,
        record: &dyn Record,
    ) -> Result<(), (usize, String)>;

    /// Registers a variable with the context; the default implementation
    /// does nothing and reports success.
    fn register(&mut self, _var: &Var) -> Result<(), String> {
        Ok(())
    }
}
50 | 
/// Returns the source code designated by `expr`.
///
/// After trimming surrounding whitespace, `expr` is interpreted in one of two
/// ways:
///
/// * `file:<path>`: the content of the file at `<path>` is read and returned
///   (whitespace between `file:` and the path is ignored);
/// * anything else: the trimmed text itself is returned without copying.
///
/// # Errors
///
/// Returns a message containing the path and the I/O error if the file cannot
/// be read.
pub fn code_or_file(expr: &'_ str) -> Result<Cow<'_, str>, String> {
    let expr = expr.trim();
    match expr.strip_prefix("file:") {
        Some(rest) => {
            let path = rest.trim_start();
            read_to_string(path)
                .map(Cow::Owned)
                .map_err(|e| format!("Unable to read script file '{path}': {e}"))
        }
        None => Ok(Cow::Borrowed(expr)),
    }
}
63 | 


--------------------------------------------------------------------------------
/src/test/concat.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
 3 | #[test]
 4 | fn concat() {
 5 |     with_tmpdir("st_concat_", |td| {
 6 |         let input = td.multi_file(
 7 |             ".fastq",
 8 |             [
 9 |                 "@id1\nAAA\n+\nAAA\n@id2\nAAA\n+\nAAA\n",
10 |                 "@id1\nBBB\n+\nBBB\n@id2\nBBB\n+\nBBB\n",
11 |                 "@id1\nCCC\n+\nCCC\n@id2\nCCC\n+\nCCC\n",
12 |             ],
13 |         );
14 | 
15 |         cmp(
16 |             &["concat"],
17 |             &input,
18 |             "@id1\nAAABBBCCC\n+\nAAABBBCCC\n@id2\nAAABBBCCC\n+\nAAABBBCCC\n",
19 |         );
20 |         cmp(
21 |             &["concat", "-s2"],
22 |             &input,
23 |             "@id1\nAAANNBBBNNCCC\n+\nAAAJJBBBJJCCC\n@id2\nAAANNBBBNNCCC\n+\nAAAJJBBBJJCCC\n",
24 |         );
25 |         cmp(
26 |             &["concat", "-s2", "-c", "-", "--q-char", "~"],
27 |             &input,
28 |             "@id1\nAAA--BBB--CCC\n+\nAAA~~BBB~~CCC\n@id2\nAAA--BBB--CCC\n+\nAAA~~BBB~~CCC\n",
29 |         );
30 | 
31 |         // id mismatch
32 |         fails(
33 |             &["concat"],
34 |             td.multi_file(".fasta", [">id1\nATG", ">id\nATG"]),
35 |             "ID of record #2 (id) does not match the ID of the first one (id1)",
36 |         );
37 | 
38 |         // too few records in second input
39 |         fails(
40 |             &["concat"],
41 |             td.multi_file(".fasta", [">id1\nATG\n>id2\nA", ">id1\nATG"]),
42 |             "The number of records in input #2 does not match the number of records in input #1",
43 |         );
44 | 
45 |         // too many records in second input
46 |         fails(
47 |             &["concat"],
48 |             td.multi_file(".fasta", [">id1\nATG", ">id1\nATG\n>id2\nA"]),
49 |             "The number of records in input #2 does not match the number of records in input #1",
50 |         );
51 |     });
52 | }
53 | 


--------------------------------------------------------------------------------
/var_provider/src/func.rs:
--------------------------------------------------------------------------------
 1 | //! Functions related to constructing a variable/function enum type with its arguments.
 2 | 
 3 | use std::fmt::Display;
 4 | 
/// Conversion of a function argument of type `A` into a value of the
/// implementing type.
pub trait FromArg<A>: Sized {
    /// Converts `arg`. `func_name` and `arg_name` identify the function and the
    /// argument; the implementations in this file use them for the error message.
    fn from_arg(func_name: &str, arg_name: &str, arg: A) -> Result<Self, String>;
}
 8 | 
 9 | impl<'a> FromArg<&'a str> for &'a str {
10 |     fn from_arg(_: &str, _: &str, value: &'a str) -> Result<Self, String> {
11 |         Ok(value)
12 |     }
13 | }
14 | 
// Implements `FromArg<&str>` for the type `$ty` by applying the conversion `$cnv` to
// the argument text. Any conversion error is discarded and replaced by the
// message produced by `invalid_value`.
//
// NOTE(review): `$what` (a human-readable description of the expected value) is
// accepted but never used in the expansion, so it does not appear in any error message.
macro_rules! impl_from_arg {
    ($ty:ty, $cnv:expr, $what:expr) => {
        impl FromArg<&str> for $ty {
            fn from_arg(func_name: &str, arg_name: &str, value: &str) -> Result<Self, String> {
                $cnv(value).map_err(|_| invalid_value(func_name, arg_name, value))
            }
        }
    };
}

impl_from_arg!(usize, |s: &str| s.parse(), "an integer number");
impl_from_arg!(f64, |s: &str| s.parse(), "a decimal number");
impl_from_arg!(bool, |s: &str| s.parse(), "a boolean (true/false)");
// copying a string cannot fail; the `Ok` only satisfies the `map_err` call in the macro
impl_from_arg!(String, |s: &str| Ok::<_, String>(s.to_string()), "a string");
29 | 
/// Builds the error message for an argument value that could not be converted.
///
/// `var_name` is the function name, `arg_name` the argument name and `value`
/// the offending value.
#[inline(never)]
pub fn invalid_value<V: Display>(var_name: &str, arg_name: &str, value: V) -> String {
    format!("Invalid value for argument '{arg_name}' of function '{var_name}': '{value}'")
}
34 | 
/// Builds the error message for a function (`var_name`) that lacks the
/// argument `arg_name`.
#[inline(never)]
pub fn missing_argument(var_name: &str, arg_name: &str) -> String {
    format!("The function '{var_name}' is missing the argument '{arg_name}'")
}
39 | 
/// Builds the error message for a function (`var_name`) that received the
/// surplus argument `arg`, although it accepts at most `max_args` arguments.
#[inline(never)]
pub fn too_many_args<V: Display>(var_name: &str, max_args: usize, arg: V) -> String {
    // "1 argument", but "0 arguments" / "2 arguments"
    let plural_suffix = match max_args {
        1 => "",
        _ => "s",
    };
    format!(
        "The function '{}' got an unexpected argument '{}', expecting only {} argument{}",
        var_name, arg, max_args, plural_suffix
    )
}
50 | 


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
 1 | use std::borrow::ToOwned;
 2 | use std::convert::From;
 3 | use std::error;
 4 | use std::fmt;
 5 | use std::io;
 6 | use std::num::{ParseFloatError, ParseIntError};
 7 | use std::str::Utf8Error;
 8 | use std::string::FromUtf8Error;
 9 | 
/// Result type with [`CliError`] as error type
pub type CliResult<T> = Result<T, CliError>;

/// Error type of the command-line tool
#[derive(Debug)]
pub enum CliError {
    /// An I/O error
    Io(io::Error),
    /// Any other error, stored as message text (the other `From`
    /// implementations in this file convert the source error to its
    /// `Display` text)
    Other(String),
}
17 | 
18 | impl fmt::Display for CliError {
19 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
20 |         match *self {
21 |             CliError::Io(ref e) => e.fmt(f),
22 |             CliError::Other(ref s) => f.write_str(s),
23 |         }
24 |     }
25 | }
26 | 
// NOTE(review): `Error::description` is deprecated in the standard library in
// favour of the `Display` implementation; the override only supplies a fixed,
// generic text.
impl error::Error for CliError {
    fn description(&self) -> &str {
        "seqtool commandline error"
    }
}
32 | 
33 | impl From<io::Error> for CliError {
34 |     fn from(err: io::Error) -> CliError {
35 |         CliError::Io(err)
36 |     }
37 | }
38 | 
39 | impl<'a> From<&'a str> for CliError {
40 |     fn from(err: &'a str) -> CliError {
41 |         CliError::Other(err.to_owned())
42 |     }
43 | }
44 | 
45 | impl<W> From<csv::IntoInnerError<W>> for CliError {
46 |     fn from(err: csv::IntoInnerError<W>) -> CliError {
47 |         CliError::Other(format!("{err}"))
48 |     }
49 | }
50 | 
51 | macro_rules! from_err(($e:ty) => (
52 |     impl From<$e> for CliError {
53 |         fn from(err: $e) -> CliError {
54 |             CliError::Other(format!("{err}"))
55 |         }
56 |     }
57 | ));
58 | 
59 | from_err!(String);
60 | from_err!(fmt::Error);
61 | from_err!(seq_io::fasta::Error);
62 | from_err!(seq_io::fastq::Error);
63 | #[cfg(any(feature = "all-commands", feature = "find", feature = "replace"))]
64 | from_err!(regex_lite::Error);
65 | #[cfg(feature = "regex-fast")]
66 | from_err!(regex::Error);
67 | from_err!(Utf8Error);
68 | from_err!(FromUtf8Error);
69 | from_err!(ParseIntError);
70 | from_err!(ParseFloatError);
71 | from_err!(csv::Error);
72 | 


--------------------------------------------------------------------------------
/src/cmd/sort/vars.rs:
--------------------------------------------------------------------------------
 1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType};
 2 | use variable_enum_macro::variable_enum;
 3 | 
 4 | use crate::cmd::shared::key::Key;
 5 | use crate::var::{modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder};
 6 | 
// Declares the `SortVar` enum with its single `Key` variable through the
// `variable_enum!` macro. The doc comments inside the invocation are input to the
// macro (presumably used to generate the variable help — confirm in
// `variable_enum_macro`), so their text should be edited with the same care as code.
variable_enum! {
    /// # Variables provided by the 'sort' command
    ///
    /// # Examples
    ///
    /// Sort by part of the sequence ID, which is obtained using
    /// a JavaScript expression.
    /// We additionally keep this substring by writing the sort key to a header
    /// attribute:
    ///
    /// `st sort -n '{ id.slice(2, 5) }' -a id_num='{num(key)}' input.fasta`
    ///
    /// >id001 id_num=1
    /// SEQ
    /// >id002 id_num=2
    /// SEQ
    /// (...)
    SortVar {
        /// The value of the key used for sorting
        Key(?),
    }
}
29 | 
/// Variable provider of the 'sort' command, offering the `key` variable.
#[derive(Debug, Default)]
pub struct SortVars {
    // ID of the symbol holding the sort key; `None` as long as the `key`
    // variable has not been registered
    key_id: Option<usize>,
}
34 | 
35 | impl SortVars {
36 |     pub fn set(&mut self, key: &Key, symbols: &mut SymbolTable) {
37 |         if let Some(var_id) = self.key_id {
38 |             key.write_to_symbol(symbols.get_mut(var_id));
39 |         }
40 |     }
41 | }
42 | 
43 | impl VarProvider for SortVars {
44 |     fn info(&self) -> &dyn DynVarProviderInfo {
45 |         &dyn_var_provider!(SortVar)
46 |     }
47 | 
48 |     fn register(
49 |         &mut self,
50 |         name: &str,
51 |         args: &[Arg],
52 |         builder: &mut VarBuilder,
53 |     ) -> Result<Option<(usize, Option<VarType>)>, String> {
54 |         Ok(SortVar::from_func(name, args)?.map(|(var, out_type)| {
55 |             let SortVar::Key = var;
56 |             let symbol_id = self.key_id.get_or_insert_with(|| builder.increment());
57 |             (*symbol_id, out_type)
58 |         }))
59 |     }
60 | 
61 |     fn has_vars(&self) -> bool {
62 |         self.key_id.is_some()
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/test/filter.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | 
 3 | #[test]
 4 | fn filter() {
 5 |     let fa = ">id\nSEQ\n>id2 a=20\nSEQ\n>id3 a=\nSEQ";
 6 | 
 7 |     cmp(
 8 |         &["filter", "seqlen > ungapped_seqlen && attr('p') >= 10"],
 9 |         &*FASTA,
10 |         &SEQS[2..].concat(),
11 |     );
12 |     cmp(&["filter", "id == 'seq0'"], &*FASTA, SEQS[1]);
13 |     cmp(&["filter", "id == undefined"], &*FASTA, "");
14 |     // note: comparison with undefined in Javascript returns false, thus only sequences
15 |     // with defined attributes are kept
16 |     cmp(
17 |         &[
18 |             "filter",
19 |             "opt_attr('a') != undefined && opt_attr('a') >= 20",
20 |             "--to-tsv",
21 |             "id",
22 |         ],
23 |         fa,
24 |         "id2\n",
25 |     );
26 |     cmp(
27 |         &["filter", "opt_attr('a') >= 20", "--to-tsv", "id"],
28 |         fa,
29 |         "id2\n",
30 |     );
31 |     // Javascript Regex:
32 |     // currently /regex/ syntax with strings matching any variable/function
33 |     // cannot be handled
34 |     // cmp(
35 |     //     &["filter", r"(/id\d+/).test(id)", "--to-tsv", "id"],
36 |     //     fa,
37 |     //     "id2\nid3\n",
38 |     // );
39 |     cmp(
40 |         &[
41 |             "filter",
42 |             r"(new RegExp('id\\d+')).test(id)",
43 |             "--to-tsv",
44 |             "id",
45 |         ],
46 |         fa,
47 |         "id2\nid3\n",
48 |     );
49 | }
50 | 
#[test]
fn drop_file() {
    with_tmpdir("st_drop_file_", |td| {
        // records not passing the filter go to this file
        let dropped = td.path(".csv");
        let input = "@id1\nSEQ\n+\nJJJ\n@id2\nOTHER\n+\nJJJJJ\n";

        let kept_records = "id2,2,OTHER\n";
        let dropped_records = "id1,1,SEQ\n";

        let cmd = &[
            "filter",
            "seq != 'SEQ'",
            "--fq",
            "--to-csv",
            "id,seq_num,seq",
            "--dropped",
            &dropped,
        ];
        cmp(cmd, input, kept_records);
        assert_eq!(&dropped.content(), dropped_records);
    });
}
69 | 


--------------------------------------------------------------------------------
/src/cmd/mod.rs:
--------------------------------------------------------------------------------
// Command implementations, one module per command.
//
// Every module except `shared` is compiled only if the `all-commands` feature or
// the feature named after the command is enabled.
pub mod shared;

// `pass` is also compiled if only the `stat` feature is enabled.
#[cfg(any(feature = "all-commands", feature = "pass", feature = "stat"))]
pub mod pass;
#[cfg(any(feature = "all-commands", feature = "view"))]
pub mod view;

#[cfg(any(feature = "all-commands", feature = "count"))]
pub mod count;
#[cfg(any(feature = "all-commands", feature = "stat"))]
pub mod stat;

#[cfg(any(feature = "all-commands", feature = "cmp"))]
pub mod cmp;
// `filter` additionally requires the `expr` feature in either configuration.
#[cfg(any(
    all(feature = "expr", feature = "all-commands"),
    all(feature = "expr", feature = "filter")
))]
pub mod filter;
#[cfg(any(feature = "all-commands", feature = "head"))]
pub mod head;
#[cfg(any(feature = "all-commands", feature = "interleave"))]
pub mod interleave;
#[cfg(any(feature = "all-commands", feature = "sample"))]
pub mod sample;
#[cfg(any(feature = "all-commands", feature = "slice"))]
pub mod slice;
#[cfg(any(feature = "all-commands", feature = "sort"))]
pub mod sort;
#[cfg(any(feature = "all-commands", feature = "split"))]
pub mod split;
#[cfg(any(feature = "all-commands", feature = "tail"))]
pub mod tail;
#[cfg(any(feature = "all-commands", feature = "unique"))]
pub mod unique;

#[cfg(any(feature = "all-commands", feature = "concat"))]
pub mod concat;
#[cfg(any(feature = "all-commands", feature = "del"))]
pub mod del;
#[cfg(any(feature = "all-commands", feature = "find"))]
pub mod find;
#[cfg(any(feature = "all-commands", feature = "lower"))]
pub mod lower;
#[cfg(any(feature = "all-commands", feature = "mask"))]
pub mod mask;
#[cfg(any(feature = "all-commands", feature = "replace"))]
pub mod replace;
#[cfg(any(feature = "all-commands", feature = "revcomp"))]
pub mod revcomp;
#[cfg(any(feature = "all-commands", feature = "set"))]
pub mod set;
#[cfg(any(feature = "all-commands", feature = "trim"))]
pub mod trim;
#[cfg(any(feature = "all-commands", feature = "upper"))]
pub mod upper;
57 | 


--------------------------------------------------------------------------------
/src/cmd/set.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::io::{RecordAttr, RecordEditor};
 7 | use crate::var::varstring::VarString;
 8 | 
// Command-line options of the 'set' command. Each given option replaces the
// corresponding part of every record (see `run`).
//
// Plain `//` comments are used for these notes on purpose: clap's derive macro turns
// `///` doc comments on the struct and its fields into help text.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Set' command options")]
pub struct SetCommand {
    /// New ID (variables allowed)
    #[arg(short, long)]
    id: Option<String>,

    /// New description (variables allowed)
    #[arg(short, long)]
    desc: Option<String>,

    /// New sequence (variables allowed)
    #[arg(short, long)]
    seq: Option<String>,

    // Options shared between commands, flattened into this argument list.
    #[command(flatten)]
    pub common: CommonArgs,
}
27 | 
28 | pub fn run(mut cfg: Config, args: SetCommand) -> CliResult<()> {
29 |     let mut replacements = vec![];
30 |     if let Some(string) = args.id.as_ref() {
31 |         replacements.push((string, RecordAttr::Id));
32 |     }
33 |     if let Some(string) = args.desc.as_ref() {
34 |         replacements.push((string, RecordAttr::Desc));
35 |     }
36 |     if let Some(string) = args.seq.as_ref() {
37 |         replacements.push((string, RecordAttr::Seq));
38 |     }
39 | 
40 |     let mut format_writer = cfg.get_format_writer()?;
41 |     cfg.with_io_writer(|io_writer, mut cfg| {
42 |         // get String -> VarString
43 |         let replacements: Vec<_> = replacements
44 |             .iter()
45 |             .map(|&(e, attr)| {
46 |                 let (e, _) = cfg.build_vars(|b| VarString::parse_register(e, b, false))?;
47 |                 Ok((e, attr))
48 |             })
49 |             .collect::<CliResult<_>>()?;
50 | 
51 |         let mut editor = RecordEditor::new();
52 | 
53 |         cfg.read(|record, ctx| {
54 |             for &(ref expr, attr) in &replacements {
55 |                 let val = editor.edit(attr);
56 |                 expr.compose(val, ctx.symbols(), record)?;
57 |             }
58 | 
59 |             format_writer.write(&editor.record(&record), io_writer, ctx)?;
60 |             Ok(true)
61 |         })
62 |     })
63 | }
64 | 


--------------------------------------------------------------------------------
/src/helpers/replace.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | /// Helper function for replacing parts of a given text
 4 | /// with a new text and writing the result to an io::Write instance.
 5 | /// Requires an iterator over (start, end) positions.
 6 | #[inline(always)]
 7 | pub fn replace_iter<M, W>(
 8 |     text: &[u8],
 9 |     replacement: &[u8],
10 |     matches: M,
11 |     out: &mut W,
12 | ) -> io::Result<()>
13 | where
14 |     M: Iterator<Item = (usize, usize)>,
15 |     W: io::Write + ?Sized,
16 | {
17 |     replace_iter_custom(text, matches, out, |out, _, _| out.write_all(replacement))
18 | }
19 | 
/// Variant of `replace_iter` with a custom replacement function.
///
/// For every `(start, end)` range from `matches`, the text since the previous
/// range is copied to `out`, and `write_replacement` is then called with the
/// output, the matched text (`text[start..end]`) and all text following the
/// match (`text[end..]`); it may write anything to the output. The text after
/// the last range is copied at the end.
///
/// # Panics
///
/// Panics if a range is not within `text`, has `start > end`, or starts
/// before the end of the preceding range.
#[inline(always)]
pub fn replace_iter_custom<R, M, W>(
    text: &[u8],
    matches: M,
    out: &mut W,
    mut write_replacement: R,
) -> io::Result<()>
where
    R: FnMut(&mut W, &[u8], &[u8]) -> io::Result<()>,
    M: Iterator<Item = (usize, usize)>,
    W: io::Write + ?Sized,
{
    let mut matches = matches;
    // the accumulator is the end of the previous match
    let tail_start = matches.try_fold(0, |prev_end, (start, end)| -> io::Result<usize> {
        out.write_all(&text[prev_end..start])?;
        write_replacement(out, &text[start..end], &text[end..])?;
        Ok(end)
    })?;
    out.write_all(&text[tail_start..])
}
44 | 
#[cfg(test)]
mod tests {
    use std::io::Write;

    // (start, end) ranges of the parts of TEXT to replace
    const MATCHES: [(usize, usize); 3] = [(1, 2), (4, 6), (7, 8)];
    const TEXT: &[u8] = b"012345678";

    /// Constant replacement text
    #[test]
    fn replace_iter() {
        let mut out = vec![];
        super::replace_iter(TEXT, b"x", MATCHES.iter().cloned(), &mut out).unwrap();
        assert_eq!(out, b"0x23x6x8");
    }

    /// Custom replacement function that makes use of the matched text
    #[test]
    fn replace_iter_custom() {
        let mut out = vec![];
        super::replace_iter_custom(TEXT, MATCHES.iter().cloned(), &mut out, |out, _, _| {
            out.write_all(b"x")
        })
        .unwrap();
        assert_eq!(out, b"0x23x6x8");

        let mut out = vec![];
        super::replace_iter_custom(
            TEXT,
            MATCHES.iter().cloned(),
            &mut out,
            |out, matched, _| {
                out.write_all(b"[")?;
                out.write_all(matched)?;
                out.write_all(b"]")
            },
        )
        .unwrap();
        assert_eq!(out, b"0[1]23[45]6[7]8");
    }
}
67 | 


--------------------------------------------------------------------------------
/src/io/output/fastq.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use crate::context::RecordMeta;
 4 | use crate::error::CliResult;
 5 | use crate::io::{QualConverter, QualFormat};
 6 | use crate::var::VarBuilder;
 7 | 
 8 | use crate::io::{
 9 |     output::{fastx::register_attributes, SeqFormatter},
10 |     Record,
11 | };
12 | 
13 | use super::fastx::Attribute;
14 | 
/// Formatter writing records in FASTQ format
pub struct FastqWriter {
    // quality format that the scores are converted to before writing
    format: QualFormat,
}
18 | 
19 | impl FastqWriter {
20 |     pub fn new(
21 |         format: QualFormat,
22 |         attrs: &[(Attribute, bool)],
23 |         builder: &mut VarBuilder,
24 |     ) -> CliResult<Self> {
25 |         register_attributes(attrs, builder)?;
26 |         Ok(Self { format })
27 |     }
28 | }
29 | 
30 | impl SeqFormatter for FastqWriter {
31 |     fn write_with(
32 |         &mut self,
33 |         record: &dyn Record,
34 |         data: &RecordMeta,
35 |         out: &mut dyn io::Write,
36 |         qc: &mut QualConverter,
37 |     ) -> CliResult<()> {
38 |         write_fastq(record, data, out, qc, self.format)
39 |     }
40 | }
41 | 
42 | fn write_fastq<W: io::Write>(
43 |     record: &dyn Record,
44 |     data: &RecordMeta,
45 |     mut out: W,
46 |     qual_converter: &mut QualConverter,
47 |     format: QualFormat,
48 | ) -> CliResult<()> {
49 |     // TODO: could use seq_io::fastq::write_to / write_parts, but the sequence is an iterator of segments
50 |     let qual = record.qual().ok_or("No quality scores found in input.")?;
51 | 
52 |     // header
53 |     out.write_all(b"@")?;
54 |     data.attrs.write_head(record, &mut out, &data.symbols)?;
55 |     out.write_all(b"\n")?;
56 | 
57 |     // sequence
58 |     for seq in record.seq_segments() {
59 |         out.write_all(seq)?;
60 |     }
61 |     out.write_all(b"\n+\n")?;
62 | 
63 |     // quality scores
64 |     let qual = qual_converter.convert_to(qual, format).map_err(|e| {
65 |         format!(
66 |             "Error writing record '{}'. {}",
67 |             String::from_utf8_lossy(record.id()),
68 |             e
69 |         )
70 |     })?;
71 |     out.write_all(qual)?;
72 |     out.write_all(b"\n")?;
73 | 
74 |     Ok(())
75 | }
76 | 


--------------------------------------------------------------------------------
/src/test/split.rs:
--------------------------------------------------------------------------------
 1 | use super::*;
 2 | use itertools::Itertools;
 3 | 
 4 | use std::str;
 5 | 
 6 | #[test]
 7 | fn chunks() {
 8 |     with_tmpdir("st_split_chunks_", |td| {
 9 |         for size in 1..5 {
10 |             let key = td.persistent_path("f_{chunk}.{default_ext}");
11 |             succeeds(&["split", "-n", &format!("{size}"), "-po", &key], &*FASTA);
12 | 
13 |             for (i, seqs) in SEQS.iter().chunks(size).into_iter().enumerate() {
14 |                 let f = td.path(&format!("f_{}.fasta", i + 1));
15 |                 assert_eq!(f.content(), seqs.into_iter().join(""));
16 |             }
17 |         }
18 |     });
19 | }
20 | 
#[test]
fn key() {
    with_tmpdir("st_split_key_", |td| {
        let out_path = td.persistent_path("{id}_{attr(p)}.fasta");
        succeeds(&["split", "-po", &out_path], &*FASTA);

        // expected file names (without extension), in the order of the input records
        let expected = &["seq1_2", "seq0_1", "seq3_10", "seq2_11"];

        for (name, seq) in expected.iter().zip(SEQS) {
            let file_name = format!("{name}.fasta");
            let f = td.path(&file_name);
            assert_eq!(f.content(), seq);
        }
    });
}
35 | 
#[test]
fn seqlen_count() {
    with_tmpdir("st_split_sl_", |td| {
        let key = td.persistent_path("{seqlen}.fasta");
        succeeds(&["split", "-o", &key], &*FASTA);

        // a single output file holding all four records is expected
        let out = td.path("25.fasta");
        let expected_counts = format!("{}\t4\n", out.as_str());
        cmp(
            &["split", "-po", &key, "-c", "-"],
            &*FASTA,
            &expected_counts,
        );
        assert_eq!(out.content(), &*FASTA as &str);
    });
}
51 | 
#[cfg(feature = "gz")]
#[test]
fn compression() {
    with_tmpdir("st_split_compr_", |td| {
        let key = td.persistent_path("{id}_{attr(p)}.fasta.gz");
        succeeds(&["split", "-po", &key], &*FASTA);

        // expected file names (without extension), in the order of the input records
        let expected = &["seq1_2", "seq0_1", "seq3_10", "seq2_11"];

        for (name, seq) in expected.iter().zip(SEQS) {
            let file_name = format!("{name}.fasta.gz");
            let f = td.path(&file_name);
            assert_eq!(f.gz_content(), seq);
        }
    });
}
67 | 


--------------------------------------------------------------------------------
/src/helpers/bytesize.rs:
--------------------------------------------------------------------------------
 1 | //! Small function that parses memory sizes, accepting
 2 | //! different units (K, M, G, T). They are interpreted as powers of 2
 3 | //! (kibibytes, etc.).
//! Decimal numbers are rounded to the nearest integer.
 5 | pub fn parse_bytesize(size: &str) -> Result<usize, String> {
 6 |     let size = size.trim();
 7 |     if size.is_empty() {
 8 |         return Err("Empty size string.".to_string());
 9 |     }
10 |     let number = size.parse::<f64>();
11 | 
12 |     match number {
13 |         Ok(n) => Ok(n.round() as usize),
14 |         Err(_) => {
15 |             let (unit_size, unit) = size.split_at(size.len() - 1);
16 |             let suffixes = [b'B', b'K', b'M', b'G', b'T']; //, "P", "E"]
17 |             let unit_byte = unit.to_ascii_uppercase().as_bytes()[0];
18 |             if let Some(pow) = suffixes.iter().position(|s| *s == unit_byte) {
19 |                 if let Ok(s) = unit_size.trim().parse::<f64>() {
20 |                     Ok((s * (1024_f64).powi(pow as i32)).round() as usize)
21 |                 } else {
22 |                     Err(format!("Invalid size string: '{size}'"))
23 |                 }
24 |             } else {
25 |                 Err(format!("Unknown size unit: '{unit}'"))
26 |             }
27 |         }
28 |     }
29 | }
30 | 
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bytesize() {
        // plain numbers and the byte unit
        assert_eq!(parse_bytesize("1.").unwrap(), 1);
        assert_eq!(parse_bytesize(" 1 B").unwrap(), 1);
        // units are powers of 1024, case-insensitive, whitespace is ignored
        assert_eq!(parse_bytesize(" 100K ").unwrap(), 100 * 1024);
        assert_eq!(parse_bytesize("1k").unwrap(), 1024);
        assert_eq!(parse_bytesize("0.5K").unwrap(), 512);
        assert_eq!(
            parse_bytesize("2.3M").unwrap(),
            (2.3_f64 * 1024. * 1024.).round() as usize
        );
        assert_eq!(parse_bytesize("9 g").unwrap(), 9 * 1024 * 1024 * 1024);
        assert_eq!(parse_bytesize("1T").unwrap(), 1024 * 1024 * 1024 * 1024);
    }

    #[test]
    fn test_bytesize_errors() {
        assert!(parse_bytesize("").is_err());
        assert!(parse_bytesize("   ").is_err());
        assert!(parse_bytesize("x").is_err());
        assert!(parse_bytesize("1x").is_err());
        assert!(parse_bytesize("K").is_err());
    }
}
54 | 


--------------------------------------------------------------------------------
/src/cmd/slice.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::{CommonArgs, WORDY_HELP};
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::helpers::rng::Range;
 7 | 
/// Introductory help text of the 'slice' command; passed to clap as
/// `before_help` of `SliceCommand`.
// NOTE(review): the text advertises a negative start (`-10:`). `run` only rejects a
// start of zero, subtracts one and compares the result with a record counter that
// starts at 0 — confirm that `Range` handles or rejects negative values.
pub const DESC: &str = "\
The range is specified as `start:end`, whereby start and end
are the sequence numbers (starting from 1). Open ranges are
possible, in the form `start:` or `:end`.

The following is equivalent with `st head input.fasta`:
`st slice ':10' input.fasta`

The following is equivalent with `st tail input.fasta`:
 `st slice '-10:' input.fasta`

The 'slice' command does not extract subsequences; see the
'trim' command for that.";
// Command-line arguments of the 'slice' command.
// The `///` comments below are consumed by clap's derive macro as help text,
// which is why explanatory notes in this struct use plain `//` comments.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Slice' command options")]
#[clap(before_help=DESC, help_template=WORDY_HELP)]
pub struct SliceCommand {
    /// Range in form 'start:end' or ':end' or 'start:'
    #[arg(value_name = "FROM:TO")]
    range: Range,

    // presumably the options shared between commands; flattened into this parser
    #[command(flatten)]
    pub common: CommonArgs,
}
32 | 
33 | pub fn run(mut cfg: Config, args: SliceCommand) -> CliResult<()> {
34 |     let range = args.range;
35 | 
36 |     let mut format_writer = cfg.get_format_writer()?;
37 |     cfg.with_io_writer(|io_writer, mut cfg| {
38 |         // convert from 1-based to 0-based coordinates
39 |         let mut start = range.start.unwrap_or(1);
40 |         if start == 0 {
41 |             return fail!("Select ranges are 1-based, zero is not a valid start value");
42 |         }
43 |         start -= 1;
44 |         let end = range.end;
45 | 
46 |         let mut i = 0;
47 | 
48 |         cfg.read(|record, ctx| {
49 |             // if a start value was specified, skip records
50 |             // was thinking about using Itertools::dropping(), but have to check for errors...
51 |             if i >= start {
52 |                 if let Some(e) = end {
53 |                     if i >= e {
54 |                         return Ok(false);
55 |                     }
56 |                 }
57 |                 format_writer.write(&record, io_writer, ctx)?;
58 |             }
59 |             i += 1;
60 |             Ok(true)
61 |         })
62 |     })
63 | }
64 | 


--------------------------------------------------------------------------------
/src/cmd/filter.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::CommonArgs;
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::var::{modules::expr::js::parser::Expression, symbols::Value};
 7 | 
// Command-line arguments of the 'filter' command.
// The `///` comments below are consumed by clap's derive macro as help text,
// which is why explanatory notes in this struct use plain `//` comments.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Filter' command options")]
pub struct FilterCommand {
    /// Filter expression
    expression: String,
    /// Output file for sequences that were removed by filtering.
    /// The format is auto-recognized from the extension.
    #[arg(short, long, value_name = "FILE")]
    dropped: Option<String>,

    // presumably the options shared between commands; flattened into this parser
    #[command(flatten)]
    pub common: CommonArgs,
}
21 | 
22 | pub fn run(mut cfg: Config, args: FilterCommand) -> CliResult<()> {
23 |     let expr = args.expression.trim();
24 |     if expr.starts_with('{') && expr.ends_with('}') {
25 |         eprintln!(
26 |             "Warning: found filter expression in the form {{ expression }}. \
27 |             The surrounding brackets are unnecessary and should be removed for the \
28 |             expression to work properly."
29 |         )
30 |     }
31 | 
32 |     let parsed_expr = Expression::parse(expr)?;
33 |     let (symbol_id, _) = cfg.build_vars(move |b| b.register_expr(&parsed_expr))?;
34 | 
35 |     let mut dropped_out = args
36 |         .dropped
37 |         .as_ref()
38 |         .map(|f| cfg.new_output(f))
39 |         .transpose()?;
40 | 
41 |     let mut format_writer = cfg.get_format_writer()?;
42 |     cfg.with_io_writer(|io_writer, mut cfg| {
43 |         cfg.read(|record, ctx| {
44 |             let v = ctx.symbols().get(symbol_id);
45 |             let result = match v.inner() {
46 |                 Some(Value::Bool(b)) => *b.get(),
47 |                 _ => {
48 |                     return fail!(
49 |                         "Filter expression did not return a boolean (true/false), \
50 |                         found '{}' instead",
51 |                         v
52 |                     )
53 |                 }
54 |             };
55 | 
56 |             if result {
57 |                 format_writer.write(&record, io_writer, ctx)?;
58 |             } else if let Some((d_writer, d_format_writer)) = dropped_out.as_mut() {
59 |                 d_format_writer.write(&record, d_writer, ctx)?;
60 |             }
61 |             Ok(true)
62 |         })?;
63 |         if let Some((w, _)) = dropped_out {
64 |             w.finish()?;
65 |         }
66 |         Ok(())
67 |     })
68 | }
69 | 


--------------------------------------------------------------------------------
/src/cmd/view/color.rs:
--------------------------------------------------------------------------------
 1 | use std::str::FromStr;
 2 | 
 3 | use palette::{named, white_point::D65, Srgb};
 4 | 
/// Selects what a coloring refers to.
// NOTE(review): presumably `Seq` = sequence symbols and `Qual` = quality
// scores; the use sites are outside this file, please confirm.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ColorSource {
    Seq,
    Qual,
}
10 | 
/// A color stored in two representations, so that it can be handed to the
/// terminal either as 24-bit RGB or as an indexed ANSI color.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct Color {
    // (red, green, blue) components
    rgb: (u8, u8, u8),
    // indexed color derived from `rgb` (see `Color::from_rgb`)
    ansi: AnsiColor,
}
16 | 
17 | impl Color {
18 |     pub fn from_rgb(c: Srgb<u8>) -> Self {
19 |         Self {
20 |             rgb: (c.red, c.green, c.blue),
21 |             ansi: c.into(),
22 |         }
23 |     }
24 | 
25 |     pub fn from_str(s: &str) -> Result<Self, String> {
26 |         parse_color(s).map(Self::from_rgb)
27 |     }
28 | 
29 |     pub fn to_ratatui(self, rgb: bool) -> ratatui::style::Color {
30 |         if rgb {
31 |             ratatui::style::Color::Rgb(self.rgb.0, self.rgb.1, self.rgb.2)
32 |         } else {
33 |             ratatui::style::Color::Indexed(self.ansi.0)
34 |         }
35 |     }
36 | 
37 |     pub fn to_crossterm(self, rgb: bool) -> crossterm::style::Color {
38 |         if rgb {
39 |             crossterm::style::Color::Rgb {
40 |                 r: self.rgb.0,
41 |                 g: self.rgb.1,
42 |                 b: self.rgb.2,
43 |             }
44 |         } else {
45 |             crossterm::style::Color::AnsiValue(self.ansi.0)
46 |         }
47 |     }
48 | }
49 | 
50 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
51 | struct AnsiColor(u8);
52 | 
53 | impl From<Srgb<u8>> for AnsiColor {
54 |     fn from(c: Srgb<u8>) -> Self {
55 |         // Simple conversion adapted from the colorsys.rs crate, not using the grayscale ramp
56 |         let to_ansi = |c| if c < 75 { 0 } else { (c - 35) / 40 };
57 |         Self(to_ansi(c.red) * 6 * 6 + to_ansi(c.green) * 6 + to_ansi(c.blue) + 16)
58 |     }
59 | }
60 | 
61 | pub fn parse_color(s: &str) -> Result<Srgb<u8>, String> {
62 |     named::from_str(s).or_else(|| Srgb::from_str(s).ok())
63 |         .ok_or_else(|| format!("Invalid color code: '{s}'. The colors must be in Hex format (rrggbb) or a name (e.g. 'cyan')"))
64 | }
65 | 
66 | /// chooses the optimal text color based on the brightness/darkness of the background color
67 | pub fn choose_fg(fg_dark: &Color, fg_bright: &Color, bg_col: &Color) -> Color {
68 |     let dark_l = palette::Lab::<D65, _>::from(fg_dark.rgb).l as f32;
69 |     let bright_l = palette::Lab::<D65, _>::from(fg_bright.rgb).l as f32;
70 |     let bg = palette::Lab::<D65, _>::from(bg_col.rgb).l as f32;
71 |     if (bright_l - bg) / (bright_l - dark_l) < 0.3 {
72 |         *fg_dark
73 |     } else {
74 |         *fg_bright
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/src/cmd/sort/mem.rs:
--------------------------------------------------------------------------------
 1 | use std::io::{self, Write};
 2 | use std::mem;
 3 | use std::path::PathBuf;
 4 | 
 5 | use deepsize::DeepSizeOf;
 6 | 
 7 | use crate::cmd::shared::tmp_store::{Item, TmpWriter};
 8 | use crate::error::CliResult;
 9 | 
10 | use super::{FileSorter, SortHandle};
11 | 
/// Collects records in memory and sorts them by key, while keeping a running
/// estimate of the memory used so that callers can react to a limit.
#[derive(Debug, Clone)]
pub struct MemSorter {
    /// the collected items (key + formatted record)
    records: Vec<Item<Box<[u8]>>>,
    /// sort in descending instead of ascending key order
    reverse: bool,
    /// running estimate of the memory used (sum of `deep_size_of()` values)
    mem: usize,
    /// memory limit; `add()` returns `false` once `mem` has reached it
    max_mem: usize,
}
19 | 
20 | impl MemSorter {
21 |     pub fn new(reverse: bool, max_mem: usize) -> Self {
22 |         // we cannot know the exact length of the input, we just initialize
23 |         // with capacity that should at least hold some records, while still
24 |         // not using too much memory
25 |         let records = Vec::with_capacity((max_mem / 400).clamp(1, 10000));
26 |         Self {
27 |             mem: records.deep_size_of(),
28 |             records,
29 |             reverse,
30 |             max_mem,
31 |         }
32 |     }
33 | 
34 |     pub fn add(&mut self, item: Item<Box<[u8]>>) -> bool {
35 |         self.mem += item.deep_size_of();
36 |         self.records.push(item);
37 |         self.mem < self.max_mem
38 |     }
39 | 
40 |     fn sort(&mut self) {
41 |         if !self.reverse {
42 |             self.records.sort_by(|i1, i2| i1.key.cmp(&i2.key));
43 |         } else {
44 |             self.records.sort_by(|i1, i2| i2.key.cmp(&i1.key));
45 |         }
46 |     }
47 | 
48 |     pub fn len(&self) -> usize {
49 |         self.records.len()
50 |     }
51 | 
52 |     pub fn reverse(&self) -> bool {
53 |         self.reverse
54 |     }
55 | 
56 |     pub fn write_sorted(&mut self, io_writer: &mut dyn Write) -> CliResult<()> {
57 |         self.sort();
58 |         for item in &self.records {
59 |             io_writer.write_all(&item.record)?;
60 |         }
61 |         Ok(())
62 |     }
63 | 
64 |     pub fn get_file_sorter(
65 |         &mut self,
66 |         tmp_dir: PathBuf,
67 |         file_limit: usize,
68 |     ) -> io::Result<FileSorter> {
69 |         let mut other = MemSorter::new(self.reverse, self.max_mem);
70 |         other.records = mem::take(&mut self.records);
71 |         FileSorter::from_mem(other, tmp_dir, file_limit)
72 |     }
73 | 
74 |     pub fn serialize_sorted(
75 |         &mut self,
76 |         mut writer: TmpWriter<Item<Box<[u8]>>>,
77 |     ) -> io::Result<(usize, SortHandle)> {
78 |         self.sort();
79 |         for item in &self.records {
80 |             writer.write(item)?;
81 |         }
82 |         let n = self.records.len();
83 |         self.records.clear();
84 |         self.mem = 0;
85 |         writer.done().map(|h| (n, h))
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/cmd/sort/file.rs:
--------------------------------------------------------------------------------
 1 | use std::io::{self, Write};
 2 | use std::path::PathBuf;
 3 | 
 4 | use crate::cmd::shared::tmp_store::{Item, TmpHandle, TmpStore};
 5 | use crate::error::CliResult;
 6 | use crate::helpers::heap_merge::HeapMerge;
 7 | 
 8 | use super::MemSorter;
 9 | 
/// Handle to one temporary file holding a chunk of serialized sort items.
pub type SortHandle = TmpHandle<Item<Box<[u8]>>>;
11 | 
/// Sorter that spills sorted chunks of records to temporary files whenever
/// the in-memory sorter reports that its memory limit is reached, and merges
/// the chunks when writing the final output.
#[derive(Debug)]
pub struct FileSorter {
    /// collects and sorts the records of the current chunk
    mem_sorter: MemSorter,
    /// creates the temporary files
    tmp_store: TmpStore,
    /// one handle per chunk written so far
    handles: Vec<SortHandle>,
    /// total number of records written to temporary files
    n_written: usize,
}
19 | 
20 | impl FileSorter {
21 |     pub fn from_mem(
22 |         mem_sorter: MemSorter,
23 |         tmp_dir: PathBuf,
24 |         file_limit: usize,
25 |     ) -> io::Result<Self> {
26 |         Ok(Self {
27 |             mem_sorter,
28 |             handles: Vec::new(),
29 |             tmp_store: TmpStore::new(tmp_dir, "st_sort_", file_limit)?,
30 |             n_written: 0,
31 |         })
32 |     }
33 | 
34 |     pub fn add(&mut self, item: Item<Box<[u8]>>, quiet: bool) -> CliResult<bool> {
35 |         if !self.mem_sorter.add(item) {
36 |             self.write_to_file(quiet)?;
37 |         }
38 |         Ok(true)
39 |     }
40 | 
41 |     pub fn write_to_file(&mut self, quiet: bool) -> CliResult<()> {
42 |         let writer = self.tmp_store.writer(quiet)?;
43 |         let (n, handle) = self.mem_sorter.serialize_sorted(writer)?;
44 |         self.n_written += n;
45 |         self.handles.push(handle);
46 |         Ok(())
47 |     }
48 | 
49 |     pub fn write_records(
50 |         &mut self,
51 |         io_writer: &mut dyn Write,
52 |         quiet: bool,
53 |         verbose: bool,
54 |     ) -> CliResult<()> {
55 |         // write last chunk of records
56 |         self.write_to_file(quiet)?;
57 | 
58 |         if verbose {
59 |             eprintln!(
60 |                 "Sorted {} records using {} temporary files ({:.1} records per file).",
61 |                 self.n_written,
62 |                 self.handles.len(),
63 |                 self.n_written as f64 / self.handles.len() as f64
64 |             );
65 |         }
66 | 
67 |         // readers for all sorted file chunks
68 |         let mut readers = self
69 |             .handles
70 |             .iter_mut()
71 |             .map(|handle| handle.reader())
72 |             .collect::<Result<Vec<_>, _>>()?;
73 |         // use k-way merging of sorted chunks with a min-heap to obtain
74 |         // the final sorted output
75 |         let kmerge = HeapMerge::new(&mut readers, self.mem_sorter.reverse())?;
76 |         for item in kmerge {
77 |             io_writer.write_all(&item?.record)?;
78 |         }
79 |         // clean up
80 |         for rdr in readers {
81 |             rdr.done()?;
82 |         }
83 |         Ok(())
84 |     }
85 | }
86 | 


--------------------------------------------------------------------------------
/src/cmd/cmp/vars.rs:
--------------------------------------------------------------------------------
 1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType};
 2 | use variable_enum_macro::variable_enum;
 3 | 
 4 | use crate::cmd::shared::key::Key;
 5 | use crate::var::{modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder, VarStore};
 6 | 
 7 | use super::Category;
 8 | 
// Declares the `CmpVar` enum through the project's `variable_enum!` macro.
// NOTE(review): the `///` comments inside the invocation are presumably turned
// into the variable help pages by the macro, so they are left untouched here.
variable_enum! {
    /// # Variables/functions provided by the 'cmp' command
    ///
    /// # Examples
    ///
    /// Compare two files by ID and sequence hash and store all commonly found
    /// records in a new file (some statistics is printed to STDERR):
    ///
    /// `st cmp input1.fasta input2.fasta --common1 common.fasta`
    ///
    /// common  942
    /// unique1  51
    /// unique2  18
    CmpVar {
        /// Record category:
        /// 'common' (record present in both files based on comparison of keys),
        /// 'unique1' (record only in first file),
        /// or 'unique2' (record only in second file).
        Category(Text),
        /// Short category code: 'c' for common, 'u1' for unique1, 'u2' for unique2
        CategoryShort(Text),
        /// The value of the compared key
        Key(?),
    }
}
34 | 
/// Variable provider of the 'cmp' command: keeps track of the registered
/// `CmpVar` variables and the symbols they are written to.
#[derive(Debug, Default)]
pub struct CmpVars {
    /// registered variables, iterated as (symbol ID, variable) pairs
    vars: VarStore<CmpVar>,
}
39 | 
40 | impl CmpVars {
41 |     pub fn set(&mut self, key: &Key, cat: Category, symbols: &mut SymbolTable) {
42 |         for (symbol_id, var) in self.vars.iter() {
43 |             match var {
44 |                 CmpVar::Key => key.write_to_symbol(symbols.get_mut(*symbol_id)),
45 |                 CmpVar::Category => symbols
46 |                     .get_mut(*symbol_id)
47 |                     .inner_mut()
48 |                     .set_text(cat.long_text().as_bytes()),
49 |                 CmpVar::CategoryShort => symbols
50 |                     .get_mut(*symbol_id)
51 |                     .inner_mut()
52 |                     .set_text(cat.short_text().as_bytes()),
53 |             }
54 |         }
55 |     }
56 | }
57 | 
58 | impl VarProvider for CmpVars {
59 |     fn info(&self) -> &dyn DynVarProviderInfo {
60 |         &dyn_var_provider!(CmpVar)
61 |     }
62 | 
63 |     fn register(
64 |         &mut self,
65 |         name: &str,
66 |         args: &[Arg],
67 |         builder: &mut VarBuilder,
68 |     ) -> Result<Option<(usize, Option<VarType>)>, String> {
69 |         Ok(CmpVar::from_func(name, args)?.map(|(var, out_type)| {
70 |             let symbol_id = builder.store_register(var, &mut self.vars);
71 |             (symbol_id, out_type)
72 |         }))
73 |     }
74 | 
75 |     fn has_vars(&self) -> bool {
76 |         !self.vars.is_empty()
77 |     }
78 | }
79 | 


--------------------------------------------------------------------------------
/src/io/input/fastq.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use seq_io::fastq::{self, Reader, Record as _};
 4 | use seq_io::policy::BufPolicy;
 5 | 
 6 | use crate::error::CliResult;
 7 | use crate::io::{MaybeModified, Record, RecordHeader};
 8 | 
 9 | use super::fastx::FastxHeaderParser;
10 | use super::SeqReader;
11 | 
/// FASTQ input reader: a thin wrapper around the `seq_io` FASTQ `Reader`
/// that implements `SeqReader`.
pub struct FastqReader<R: io::Read, P: BufPolicy>(pub Reader<R, P>);
13 | 
14 | impl<R, P> FastqReader<R, P>
15 | where
16 |     R: io::Read,
17 |     P: BufPolicy,
18 | {
19 |     pub fn new(rdr: R, cap: usize, policy: P) -> Self {
20 |         FastqReader(Reader::with_capacity(rdr, cap).set_policy(policy))
21 |     }
22 | }
23 | 
24 | impl<R, P> SeqReader for FastqReader<R, P>
25 | where
26 |     R: io::Read,
27 |     P: BufPolicy,
28 | {
29 |     fn read_next_conditional(
30 |         &mut self,
31 |         func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
32 |     ) -> Option<CliResult<bool>> {
33 |         self.0.next().map(|r| {
34 |             let r = FastqRecord::new(r?);
35 |             func(&r)
36 |         })
37 |     }
38 | }
39 | 
/// Wrapper for a FASTQ record, implementing the crate's `Record` trait.

pub struct FastqRecord<'a> {
    /// the borrowed `seq_io` record
    rec: fastq::RefRecord<'a>,
    /// splits the header line into ID and description
    /// (`header_delim_pos`/`set_header_delim_pos` expose its delimiter position)
    header_parser: FastxHeaderParser,
}
46 | 
47 | impl<'a> FastqRecord<'a> {
48 |     #[inline(always)]
49 |     pub fn new(inner: fastq::RefRecord<'a>) -> FastqRecord<'a> {
50 |         FastqRecord {
51 |             rec: inner,
52 |             header_parser: Default::default(),
53 |         }
54 |     }
55 | }
56 | 
57 | impl Record for FastqRecord<'_> {
58 |     fn id(&self) -> &[u8] {
59 |         self.header_parser.id_desc(self.rec.head()).0
60 |     }
61 | 
62 |     fn desc(&self) -> Option<&[u8]> {
63 |         self.header_parser.id_desc(self.rec.head()).1
64 |     }
65 | 
66 |     fn id_desc(&self) -> (&[u8], Option<&[u8]>) {
67 |         self.header_parser.id_desc(self.rec.head())
68 |     }
69 | 
70 |     fn current_header(&'_ self) -> RecordHeader<'_> {
71 |         if let Some((id, desc)) = self.header_parser.parsed_id_desc(self.rec.head()) {
72 |             RecordHeader::IdDesc(
73 |                 MaybeModified::new(id, false),
74 |                 MaybeModified::new(desc, false),
75 |             )
76 |         } else {
77 |             RecordHeader::Full(self.rec.head())
78 |         }
79 |     }
80 | 
81 |     fn raw_seq(&self) -> &[u8] {
82 |         self.rec.seq()
83 |     }
84 | 
85 |     fn qual(&self) -> Option<&[u8]> {
86 |         Some(<fastq::RefRecord as fastq::Record>::qual(&self.rec))
87 |     }
88 | 
89 |     fn header_delim_pos(&self) -> Option<Option<usize>> {
90 |         self.header_parser.delim_pos()
91 |     }
92 | 
93 |     fn set_header_delim_pos(&self, delim: Option<usize>) {
94 |         self.header_parser.set_delim_pos(Some(delim))
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/var/modules/expr/expressions.rs:
--------------------------------------------------------------------------------
 1 | //! `Expressions` evaluates a list of expressions repeatedly using a single
 2 | //! engine.
 3 | //!
 4 | //! Currently, this is more of an unnecessary wrapper, but the reason for this
 5 | //! being a separate module is that a more complicated evaluator featuring
 6 | //! two different engines (a fast and simple, and a slower JavaScript engine)
 7 | //! may be added in the future.
 8 | 
 9 | use var_provider::VarType;
10 | 
11 | use crate::io::Record;
12 | use crate::var::{symbols::SymbolTable, VarBuilder};
13 | 
14 | use super::js::{parser::SimpleAst, replace_register_vars};
15 | use super::{ExprContext, Expression};
16 | 
/// A list of registered expressions, all evaluated with one shared
/// engine context.
#[derive(Debug)]
pub struct Expressions<E: Expression> {
    /// (output symbol ID, code as registered with the engine, expression)
    expressions: Vec<(usize, String, E)>,
    // NOTE: context must come *after* expressions, since rquickjs expressions contain
    // Persistent<Atom>, which should not live longer than the context
    // TODO: always ok?
    context: E::Context,
}
25 | 
26 | impl<E: Expression> Expressions<E> {
27 |     pub fn new(init_code: Option<&str>) -> Result<Self, String> {
28 |         let mut context = E::Context::default();
29 |         context.init(init_code)?;
30 |         Ok(Self {
31 |             expressions: vec![],
32 |             context,
33 |         })
34 |     }
35 | 
36 |     fn lookup(&self, script: &str) -> Option<usize> {
37 |         self.expressions
38 |             .iter()
39 |             .find_map(|(id, code, _expr)| if script == code { Some(*id) } else { None })
40 |     }
41 | 
42 |     pub fn register_expr(
43 |         &mut self,
44 |         ast: &SimpleAst,
45 |         builder: &mut VarBuilder,
46 |     ) -> Result<(usize, Option<VarType>), String> {
47 |         if let Some(symbol_id) = self.lookup(ast.script) {
48 |             Ok((symbol_id, None))
49 |         } else {
50 |             let mut expr = E::default();
51 |             let (code, vars) = replace_register_vars(ast, builder)?;
52 |             let symbol_id = builder.increment();
53 |             expr.register(symbol_id, &code, &mut self.context)?;
54 |             self.expressions.push((symbol_id, code, expr));
55 |             for var in vars {
56 |                 self.context.register(&var)?;
57 |             }
58 |             Ok((symbol_id, None))
59 |         }
60 |     }
61 | 
62 |     pub fn num_exprs(&self) -> usize {
63 |         self.expressions.len()
64 |     }
65 | 
66 |     pub fn eval(&mut self, symbols: &mut SymbolTable, record: &dyn Record) -> Result<(), String> {
67 |         self.context
68 |             .next_record(symbols, record)
69 |             .map_err(|(_, msg)| msg)?;
70 |         for (out_id, _, expr) in &mut self.expressions {
71 |             expr.eval(symbols.get_mut(*out_id), &mut self.context)?;
72 |         }
73 |         Ok(())
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/src/cmd/sort/cli.rs:
--------------------------------------------------------------------------------
 1 | use std::cell::LazyCell;
 2 | use std::path::PathBuf;
 3 | 
 4 | use clap::Parser;
 5 | 
 6 | use crate::cli::{CommonArgs, WORDY_HELP};
 7 | use crate::helpers::bytesize::parse_bytesize;
 8 | 
/// Long description of the 'sort' command; handed to clap as the
/// `before_help` text of `SortCommand`.
pub const DESC: &str = "\
The sort key can be 'seq', 'id', or any variable/function, expression, or
text containing them (see <KEY> help and `st sort -V/--help-vars`).

Records with identical keys are kept in input order.

The actual value of the key is available through the 'key' variable. It can
be written to a header attribute or TSV field.
";
18 | 
/// Usage example shown after the 'sort' help (`after_help = &*EXAMPLES` on
/// `SortCommand`), formatted through `color_print::cformat!`.
// NOTE(review): this is a `const`, not a `static`, so every use site gets its
// own `LazyCell` and the formatted string is not shared between uses.
pub const EXAMPLES: LazyCell<String> = LazyCell::new(|| {
    color_print::cformat!(
        "\
 <c>st sort seqlen input.fasta</c><r>
>>id10
SEQ
>>id3
SEQUE
>>id1
SEQUENCE
</r>
"
    )
});
33 | 
34 | #[derive(Parser, Clone, Debug)]
35 | #[clap(next_help_heading = "'Sort' command options")]
36 | #[clap(before_help=DESC, after_help=&*EXAMPLES, help_template=WORDY_HELP)]
37 | pub struct SortCommand {
38 |     /// The key used to sort the records. It can be a single variable/function
39 |     /// such as 'seq', 'id', a composed string, e.g. '{id}_{desc}',
40 |     /// or a comma-delimited list of multiple variables/functions to sort by,
41 |     /// e.g. 'seq,attr(a)'. In this case, the records are first sorted by sequence,
42 |     /// but in case of identical sequences, records are sorted by the header
43 |     /// attribute 'a'.
44 |     ///
45 |     /// To sort by a numeric FASTA attribute in the form '>id;size=123':
46 |     /// `st sort 'num(attr(size))' --attr-fmt ';key=value' input.fasta`.
47 |     ///
48 |     /// Regarding formulas returning mixed text/numbers, the sorted records with
49 |     /// text keys will be returned first and the sorted number records after them.
50 |     /// Furthermore, NaN and missing values (null/undefined in JS expressions,
51 |     /// missing `opt_attr()` values or missing entries in associated metadata)
52 |     /// will appear last.
53 |     pub key: String,
54 | 
55 |     /// Sort in reverse order
56 |     #[arg(short, long)]
57 |     pub reverse: bool,
58 | 
59 |     /// Maximum amount of memory (approximate) to use for sorting.
60 |     /// Either a plain number (bytes) a number with unit (K, M, G, T)
61 |     /// based on powers of 2.
62 |     #[arg(short = 'M', long, value_name = "SIZE", value_parser = parse_bytesize, default_value = "5G")]
63 |     pub max_mem: usize,
64 | 
65 |     /// Path to temporary directory (only if memory limit is exceeded)
66 |     #[arg(long, value_name = "PATH")]
67 |     pub temp_dir: Option<PathBuf>,
68 | 
69 |     /// Maximum number of temporary files allowed
70 |     #[arg(long, value_name = "N", default_value_t = 1000)]
71 |     pub temp_file_limit: usize,
72 | 
73 |     #[command(flatten)]
74 |     pub common: CommonArgs,
75 | }
76 | 


--------------------------------------------------------------------------------
/src/var/modules/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::any::{Any, TypeId};
 2 | use std::fmt::Debug;
 3 | 
 4 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType};
 5 | 
 6 | use crate::helpers::any::AsAnyMut;
 7 | use crate::io::{input::InputConfig, output::OutputConfig, QualConverter, Record};
 8 | 
 9 | use super::attr::Attributes;
10 | use super::parser::Arg;
11 | use super::symbols::SymbolTable;
12 | use super::VarBuilder;
13 | 
14 | pub mod attr;
15 | pub mod cnv;
16 | #[cfg(feature = "expr")]
17 | pub mod expr;
18 | pub mod general;
19 | pub mod meta;
20 | pub mod stats;
21 | 
/// List of all variable/function provider modules,
/// used to generate the help pages
/// (independently of the variable provider modules themselves).
///
/// The entry for the expression module is only present if the crate is built
/// with the `expr` feature.
pub const MODULE_INFO: &[&dyn DynVarProviderInfo] = &[
    &dyn_var_provider!(general::GeneralVar),
    &dyn_var_provider!(stats::StatVar),
    &dyn_var_provider!(attr::AttrVar),
    &dyn_var_provider!(meta::MetaVar),
    #[cfg(feature = "expr")]
    &dyn_var_provider!(expr::ExprVar),
    &dyn_var_provider!(cnv::CnvVar),
];
34 | 
/// *The* trait for variable/function provider modules.
///
/// Only `info`, `register` and `has_vars` have to be implemented; the
/// remaining methods have default implementations that do nothing
/// (`set_record`, `init_output`, `init_input`) or need no customization
/// (`get_type_id`).
pub trait VarProvider: Debug + AsAnyMut {
    /// Returns the description of this provider (the same kind of object that
    /// is listed in `MODULE_INFO`).
    fn info(&self) -> &dyn DynVarProviderInfo;

    /// Tries registering a variable / function with a module.
    ///
    /// On success, returns `Some((symbol_id, var_type))`, where `var_type` is
    /// `Some(VarType)` or `None` if the type is unknown beforehand.
    // NOTE(review): the outer `None` presumably means that `name` does not
    // belong to this provider; confirm with the caller.
    fn register(
        &mut self,
        name: &str,
        args: &[Arg],
        builder: &mut VarBuilder,
    ) -> Result<Option<(usize, Option<VarType>)>, String>;

    /// Tells whether the provider has any variables to supply.
    // NOTE(review): presumably "at least one variable was registered"
    // (this is what `CmpVars` reports); confirm for the other providers.
    fn has_vars(&self) -> bool;

    /// Supplies a new record to the variable provider and expects it to
    /// update the symbol table with the variable values.
    ///
    /// The default implementation does nothing.
    fn set_record(
        &mut self,
        _rec: &dyn Record,
        _sym: &mut SymbolTable,
        _attr: &Attributes,
        _qc: &mut QualConverter,
    ) -> Result<(), String> {
        Ok(())
    }

    /// Called on every new output stream (STDOUT or file).
    /// Some variable providers may need the information.
    /// Additional output files created using `Config::new_output()` are
    /// *not* provided here.
    ///
    /// The default implementation does nothing.
    fn init_output(&mut self, _: &OutputConfig) -> Result<(), String> {
        Ok(())
    }

    /// Called on every new input (STDIN or file).
    /// Some variable providers may need the information.
    ///
    /// The default implementation does nothing.
    fn init_input(&mut self, _: &InputConfig) -> Result<(), String> {
        Ok(())
    }

    /// Returns the type ID of the given concrete type
    /// (used for identifying custom variable providers in `Ctx`)
    fn get_type_id(&self) -> TypeId
    where
        Self: 'static,
    {
        self.type_id()
    }
}
85 | 


--------------------------------------------------------------------------------
/src/cmd/unique/map.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | 
 3 | use clap::ValueEnum;
 4 | 
 5 | use crate::cmd::shared::tmp_store::Key;
 6 | use crate::helpers::write_list::{write_list, write_list_with};
 7 | 
 8 | use super::DuplicateInfo;
 9 | 
// Output formats of the duplicate map written by `MapWriter`.
// The `///` comments on the variants are consumed by clap's `ValueEnum`
// derive as help text, which is why this note uses a plain `//` comment.
#[derive(ValueEnum, Debug, Clone, Copy, PartialEq)]
pub enum MapFormat {
    /// Sequence ID, reference record ID
    Long,
    /// Like `long`, but sets the reference record ID to `*` for the reference
    /// record itself instead of repeating the same ID twice.
    LongStar,
    /// Tab-delimited list of all duplicates, with the reference record ID first
    /// (e.g. corresponds to Swarm output format).
    Wide,
    /// Reference ID, comma-delimited list of duplicates including the reference ID
    ///  (corresponds to mothur `.names` file).
    WideComma,
    /// Like `wide`, but with the unique key in the first column and all duplicate
    /// IDs in the following columns.
    WideKey,
}
27 | 
/// Writes the map of duplicate record IDs in one of the `MapFormat`s.
pub struct MapWriter<W: io::Write> {
    /// destination of the map
    inner: W,
    /// layout of the lines written by `write()`
    format: MapFormat,
}
32 | 
33 | impl<W: io::Write> MapWriter<W> {
34 |     pub fn new(inner: W, format: MapFormat) -> Self {
35 |         Self { inner, format }
36 |     }
37 | 
38 |     pub fn into_inner(self) -> W {
39 |         self.inner
40 |     }
41 | 
42 |     pub fn write(&mut self, key: &Key, duplicates: &DuplicateInfo) -> io::Result<()> {
43 |         let ids = match duplicates {
44 |             DuplicateInfo::Ids(ids) => ids,
45 |             _ => panic!(),
46 |         };
47 |         match self.format {
48 |             MapFormat::Long | MapFormat::LongStar => {
49 |                 let mut first = true;
50 |                 for id in ids {
51 |                     self.inner.write_all(id)?;
52 |                     write!(self.inner, "\t")?;
53 |                     if self.format == MapFormat::LongStar && first {
54 |                         first = false;
55 |                         self.inner.write_all(b"*")?;
56 |                     } else {
57 |                         self.inner.write_all(&ids[0])?;
58 |                     }
59 |                     writeln!(self.inner)?;
60 |                 }
61 |             }
62 |             MapFormat::Wide | MapFormat::WideKey => {
63 |                 if self.format == MapFormat::WideKey {
64 |                     write_list_with(key.as_slice(), b"\t", &mut self.inner, |v, o| v.write(o))?;
65 |                     write!(self.inner, "\t")?;
66 |                 }
67 |                 write_list(ids, b"\t", &mut self.inner)?;
68 |                 writeln!(self.inner)?;
69 |             }
70 |             MapFormat::WideComma => {
71 |                 self.inner.write_all(&ids[0])?;
72 |                 write!(self.inner, "\t")?;
73 |                 write_list(ids, b",", &mut self.inner)?;
74 |                 writeln!(self.inner)?;
75 |             }
76 |         }
77 |         Ok(())
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/helpers/heap_merge.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::{Ordering, Reverse};
  2 | use std::collections::BinaryHeap;
  3 | use std::fmt::Debug;
  4 | 
  5 | /// Heap entry: a value plus the sort direction used when comparing it.
  6 | #[derive(Debug)]
  7 | struct Item<T: Ord + Debug> {
  8 |     inner: T,
  9 |     // if true, the ordering of `inner` is inverted
 10 |     reverse: bool,
 11 | }
 12 | 
 13 | impl<T: Ord + Debug> Item<T> {
 14 |     fn new(inner: T, reverse: bool) -> Self {
 15 |         Item { inner, reverse }
 16 |     }
 17 | }
 18 | 
 19 | impl<T: Ord + Debug> Ord for Item<T> {
 20 |     fn cmp(&self, other: &Self) -> Ordering {
 21 |         let natural = self.inner.cmp(&other.inner);
 22 |         if self.reverse { natural.reverse() } else { natural }
 23 |     }
 24 | }
 25 | 
 26 | impl<T: Ord + Debug> PartialOrd for Item<T> {
 27 |     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
 28 |         Some(Ord::cmp(self, other))
 29 |     }
 30 | }
 31 | 
 32 | // Equality only looks at the wrapped value, not at the sort direction.
 33 | impl<T: Ord + Debug> PartialEq for Item<T> {
 34 |     fn eq(&self, rhs: &Self) -> bool {
 35 |         self.inner.eq(&rhs.inner)
 36 |     }
 37 | }
 38 | 
 39 | impl<T: Ord + Debug> Eq for Item<T> {}
 40 | 
 41 | /// Merges several sorted streams into a single one using a binary heap.
 42 | /// In case of ties, items are emitted in the order of the input streams in which
 43 | /// they occur (the stream index is part of the heap key).
 44 | #[derive(Debug)]
 45 | pub struct HeapMerge<T, I, E>
 46 | where
 47 |     T: Ord + Debug,
 48 |     I: Iterator<Item = Result<T, E>>,
 49 |     E: Debug,
 50 | {
 51 |     streams: Box<[I]>, // the input streams, addressed by their index
 52 |     heap: BinaryHeap<Reverse<(Item<T>, usize)>>, // (item, stream index) pairs
 53 |     reverse: bool, // sort direction handed to every new `Item`
 54 | }
 55 | 
 56 | impl<T, I, E> HeapMerge<T, I, E>
 57 | where
 58 |     T: Ord + Debug,
 59 |     I: Iterator<Item = Result<T, E>>,
 60 |     E: Debug,
 61 | {
 62 |     /// Collects the streams and seeds the heap with the first item of each
 63 |     /// non-empty stream. Fails with the first error that a stream yields.
 64 |     pub fn new<S>(streams: S, reverse: bool) -> Result<Self, E>
 65 |     where
 66 |         S: IntoIterator<Item = I>,
 67 |     {
 68 |         let mut streams: Box<[I]> = streams.into_iter().collect();
 69 |         let mut heap = BinaryHeap::with_capacity(streams.len());
 70 |         for (idx, stream) in streams.iter_mut().enumerate() {
 71 |             let Some(first) = stream.next() else {
 72 |                 continue;
 73 |             };
 74 |             heap.push(Reverse((Item::new(first?, reverse), idx)));
 75 |         }
 76 |         let merged = Self { streams, heap, reverse };
 77 |         Ok(merged)
 78 |     }
 79 | }
 80 | 
 81 | impl<T, I, E> Iterator for HeapMerge<T, I, E>
 82 | where
 83 |     T: Ord + Debug,
 84 |     I: Iterator<Item = Result<T, E>>,
 85 |     E: Debug,
 86 | {
 87 |     type Item = Result<T, E>;
 88 | 
 89 |     fn next(&mut self) -> Option<Self::Item> {
 90 |         let Reverse((top_item, i)) = self.heap.pop()?;
 91 |         // refill the heap from the stream that the popped item came from
 92 |         match self.streams[i].next() {
 93 |             Some(Ok(v)) => self.heap.push(Reverse((Item::new(v, self.reverse), i))),
 94 |             Some(Err(e)) => return Some(Err(e)),
 95 |             None => {}
 96 |         }
 97 |         Some(Ok(top_item.inner))
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/cmd/view/mod.rs:
--------------------------------------------------------------------------------
 1 | use crate::config::Config;
 2 | use crate::error::CliResult;
 3 | 
 4 | mod cli;
 5 | mod color;
 6 | mod fmt;
 7 | mod pager;
 8 | mod pal;
 9 | 
10 | pub use self::cli::*;
11 | use self::color::*;
12 | use self::fmt::*;
13 | use self::pager::*;
14 | use self::pal::*;
15 | 
16 | /// Runs the `view` command: formats the records and displays them in a pager.
17 | ///
18 | /// If `args.color.list_pal` is set, only the palettes are printed.
19 | /// Otherwise, the terminal is initialized with `ratatui::init()` and always
20 | /// restored with `ratatui::restore()` before returning, also in case of errors.
21 | pub fn run(mut cfg: Config, args: ViewCommand) -> CliResult<()> {
22 |     if args.color.list_pal {
23 |         print_palettes(
24 |             &args.color.textcols,
25 |             args.color.truecolor.unwrap_or_else(has_truecolor),
26 |         )?;
27 |         return Ok(());
28 |     }
29 | 
30 |     // setup colors
31 |     use ColorSource::*;
32 |     let use_qual = cfg.input_config[0].format.format.has_qual() && !args.general.no_qual;
33 |     let (bg, fg, bold) = if use_qual {
34 |         if args.general.foreground {
35 |             (Some(Qual), Some(Seq), true)
36 |         } else {
37 |             (Some(Qual), None, false)
38 |         }
39 |     } else if args.general.foreground {
40 |         (None, Some(Seq), false)
41 |     } else {
42 |         (Some(Seq), None, false)
43 |     };
44 |     let mut formatter = Formatter::new(args.general.id_len, args.general.show_desc)
45 |         .capabilities(
46 |             args.color.truecolor.unwrap_or_else(has_truecolor),
47 |             has_utf8(),
48 |         )
49 |         .textcols(args.color.textcols.0, args.color.textcols.1)?
50 |         .color_config(bg, fg)
51 |         .bold(args.general.bold || bold);
52 | 
53 |     let palettes = args.color.palettes();
54 |     let mut pager = GrowingPager::new();
55 |     let mut terminal = ratatui::init();
56 | 
57 |     #[derive(Debug, Clone, Copy, PartialEq, Eq)]
58 |     enum Progress {
59 |         New,
60 |         Running,
61 |         Done,
62 |     }
63 | 
64 |     let mut progress = Progress::New;
65 |     let read_res = cfg.read(|record, ctx| {
66 |         loop {
67 |             match pager.check_draw(&mut terminal, false)? {
68 |                 Status::Ok => {}
69 |                 Status::MissingLines => {
70 |                     let (line, len) =
71 |                         formatter.format(record, &mut ctx.qual_converter, &palettes)?;
72 |                     progress = Progress::Running;
73 |                     if use_qual {
74 |                         pager.set_color_scale(formatter.format_scale(0, (2..47).step_by(2)));
75 |                     }
76 |                     pager.add(line, len);
77 |                     break;
78 |                 }
79 |                 Status::Quit => {
80 |                     progress = Progress::Done;
81 |                     return Ok(false);
82 |                 }
83 |             }
84 |         }
85 |         Ok(true)
86 |     });
87 |     // All input was read without the user quitting: keep drawing until quit.
88 |     // An error of `check_draw` is collected in `res` (not returned early with
89 |     // `?`), because the terminal has to be restored first.
90 |     let res = read_res.and_then(|_| {
91 |         if progress == Progress::Running {
92 |             while !matches!(pager.check_draw(&mut terminal, true)?, Status::Quit) {}
93 |         }
94 |         Ok(())
95 |     });
96 |     ratatui::restore();
97 |     res
98 | }
89 | 


--------------------------------------------------------------------------------
/src/io/output/fa_qual.rs:
--------------------------------------------------------------------------------
 1 | use std::fs::File;
 2 | use std::io;
 3 | use std::path::Path;
 4 | 
 5 | use crate::context::RecordMeta;
 6 | use crate::error::CliResult;
 7 | use crate::io::QualConverter;
 8 | use crate::var::VarBuilder;
 9 | 
10 | use super::{Attribute, Record, SeqFormatter};
11 | 
12 | pub struct FaQualWriter {
13 |     fa_writer: super::fasta::FastaWriter,
14 |     // This is a bit awkward: the FASTA output is not part of this struct
15 |     // (it is supplied to write_with()), while the .qual output is.
16 |     // However, this is a special case and not a problem.
17 |     qual_out: io::BufWriter<File>,
18 |     wrap: usize, // chunk size used for splitting the scores into lines
19 | }
20 | 
21 | impl FaQualWriter {
22 |     /// Creates the `.qual` file at `qual_path` and sets up the FASTA writer.
23 |     pub fn new<Q>(
24 |         wrap: Option<usize>,
25 |         qual_path: Q,
26 |         attrs: &[(Attribute, bool)],
27 |         builder: &mut VarBuilder,
28 |     ) -> CliResult<Self>
29 |     where
30 |         Q: AsRef<Path>,
31 |     {
32 |         let path = qual_path.as_ref();
33 |         let q_handle = File::create(path).map_err(|e| {
34 |             let msg = format!("Error creating '{}': {}", path.to_string_lossy(), e);
35 |             io::Error::other(msg)
36 |         })?;
37 |         let fa_writer = super::fasta::FastaWriter::new(wrap, attrs, builder)?;
38 | 
39 |         Ok(Self {
40 |             fa_writer,
41 |             qual_out: io::BufWriter::new(q_handle),
42 |             wrap: wrap.unwrap_or(usize::MAX),
43 |         })
44 |     }
45 | }
46 | 
47 | impl SeqFormatter for FaQualWriter {
48 |     fn write_with(
49 |         &mut self,
50 |         record: &dyn Record,
51 |         data: &RecordMeta,
52 |         out: &mut dyn io::Write,
53 |         qc: &mut QualConverter,
54 |     ) -> CliResult<()> {
55 |         // the FASTA record goes to `out`, the scores to the .qual file
56 |         self.fa_writer.write_with(record, data, out, qc)?;
57 |         let qual_out = &mut self.qual_out;
58 |         write_qscores(record, qual_out, data, qc, self.wrap)
59 |     }
60 | }
59 | 
60 | /// Writes one record to the `.qual` output: a FASTA-like header line followed
61 | /// by the space-delimited Phred scores, with at most `wrap` scores per line.
62 | ///
63 | /// A `wrap` of 0 is treated as "no wrapping" (`slice::chunks` would panic
64 | /// with a chunk size of 0).
65 | ///
66 | /// Fails if the record has no quality scores, if they cannot be converted
67 | /// to Phred scores, or if writing fails.
68 | fn write_qscores<W: io::Write>(
69 |     record: &dyn Record,
70 |     mut out: W,
71 |     data: &RecordMeta,
72 |     qual_converter: &mut QualConverter,
73 |     wrap: usize,
74 | ) -> CliResult<()> {
75 |     let qual = record.qual().ok_or("No quality scores found in input.")?;
76 |     // header
77 |     out.write_all(b">")?;
78 |     data.attrs.write_head(record, &mut out, &data.symbols)?;
79 |     out.write_all(b"\n")?;
80 | 
81 |     // Phred scores (`chunks` never yields empty slices)
82 |     let line_len = if wrap == 0 { usize::MAX } else { wrap };
83 |     for qline in qual.chunks(line_len) {
84 |         let phred_qual = qual_converter.phred_scores(qline).map_err(|e| {
85 |             format!(
86 |                 "Error writing record '{}'. {}",
87 |                 String::from_utf8_lossy(record.id()),
88 |                 e
89 |             )
90 |         })?;
91 |         // space-delimited scores; no `unwrap()` on the last score needed
92 |         for (i, q) in phred_qual.scores().iter().take(qline.len()).enumerate() {
93 |             if i > 0 {
94 |                 out.write_all(b" ")?;
95 |             }
96 |             write!(&mut out, "{}", *q)?;
97 |         }
98 |         out.write_all(b"\n")?;
99 |     }
100 |     Ok(())
101 | }
92 | 


--------------------------------------------------------------------------------
/src/io/input/fasta.rs:
--------------------------------------------------------------------------------
  1 | use std::io;
  2 | 
  3 | use seq_io::fasta::{self, Reader, Record as _};
  4 | use seq_io::policy::BufPolicy;
  5 | 
  6 | use crate::error::CliResult;
  7 | use crate::io::{MaybeModified, Record, RecordHeader, SeqLineIter};
  8 | 
  9 | use super::fastx::FastxHeaderParser;
 10 | use super::SeqReader;
 11 | 
 12 | /// Thin wrapper around the `seq_io` FASTA reader.
 13 | pub struct FastaReader<R: io::Read, P: BufPolicy>(pub Reader<R, P>);
 14 | 
 15 | impl<R: io::Read, P: BufPolicy> FastaReader<R, P> {
 16 |     /// Creates a reader from `rdr`, handing `cap` to `Reader::with_capacity`
 17 |     /// and `policy` to `set_policy`.
 18 |     pub fn new(rdr: R, cap: usize, policy: P) -> Self {
 19 |         let inner = Reader::with_capacity(rdr, cap).set_policy(policy);
 20 |         Self(inner)
 21 |     }
 22 | }
 23 | 
 24 | impl<R: io::Read, P: BufPolicy> SeqReader for FastaReader<R, P> {
 25 |     // Reads the next record and hands it to `func`.
 26 |     // Returns `None` if the underlying reader yields no further record;
 27 |     // read errors are converted to the crate's error type.
 28 |     fn read_next_conditional(
 29 |         &mut self,
 30 |         func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
 31 |     ) -> Option<CliResult<bool>> {
 32 |         let outcome = match self.0.next()? {
 33 |             Ok(rec) => func(&FastaRecord::new(rec)),
 34 |             Err(e) => Err(e.into()),
 35 |         };
 36 |         Some(outcome)
 37 |     }
 38 | }
 39 | 
 40 | /// Wraps a borrowed `seq_io` FASTA record together with a header parser,
 41 | /// which is used for splitting the header into ID and description.
 42 | pub struct FastaRecord<'a> {
 43 |     rec: fasta::RefRecord<'a>,
 44 |     header_parser: FastxHeaderParser,
 45 | }
 46 | 
 47 | impl<'a> FastaRecord<'a> {
 48 |     #[inline(always)]
 49 |     pub fn new(inner: fasta::RefRecord<'a>) -> Self {
 50 |         let header_parser = FastxHeaderParser::default();
 51 |         Self {
 52 |             rec: inner,
 53 |             header_parser,
 54 |         }
 55 |     }
 56 | }
 56 | 
 57 | impl Record for FastaRecord<'_> {
 58 |     fn id(&self) -> &[u8] {
 59 |         let (id, _) = self.id_desc();
 60 |         id
 61 |     }
 62 | 
 63 |     fn desc(&self) -> Option<&[u8]> {
 64 |         let (_, desc) = self.id_desc();
 65 |         desc
 66 |     }
 67 | 
 68 |     // ID and description, as split by the header parser
 69 |     fn id_desc(&self) -> (&[u8], Option<&[u8]>) {
 70 |         let head = self.rec.head();
 71 |         self.header_parser.id_desc(head)
 72 |     }
 73 | 
 74 |     fn current_header(&'_ self) -> RecordHeader<'_> {
 75 |         let head = self.rec.head();
 76 |         match self.header_parser.parsed_id_desc(head) {
 77 |             Some((id, desc)) => {
 78 |                 let id = MaybeModified::new(id, false);
 79 |                 let desc = MaybeModified::new(desc, false);
 80 |                 RecordHeader::IdDesc(id, desc)
 81 |             }
 82 |             // no parsed ID/description available: hand out the full header
 83 |             None => RecordHeader::Full(head),
 84 |         }
 85 |     }
 86 | 
 87 |     fn raw_seq(&self) -> &[u8] {
 88 |         self.rec.seq()
 89 |     }
 90 | 
 91 |     // this record type never provides quality scores
 92 |     fn qual(&self) -> Option<&[u8]> {
 93 |         None
 94 |     }
 95 | 
 96 |     fn header_delim_pos(&self) -> Option<Option<usize>> {
 97 |         self.header_parser.delim_pos()
 98 |     }
 99 | 
100 |     fn set_header_delim_pos(&self, delim: Option<usize>) {
101 |         self.header_parser.set_delim_pos(Some(delim))
102 |     }
103 | 
104 |     fn has_seq_lines(&self) -> bool {
105 |         self.rec.num_seq_lines() > 1
106 |     }
107 | 
108 |     fn seq_segments(&'_ self) -> SeqLineIter<'_> {
109 |         SeqLineIter::Fasta(self.rec.seq_lines())
110 |     }
111 | }
105 | 


--------------------------------------------------------------------------------
/src/cmd/unique/cli.rs:
--------------------------------------------------------------------------------
 1 | use std::path::PathBuf;
 2 | 
 3 | use clap::Parser;
 4 | 
 5 | use super::MapFormat;
 6 | use crate::cli::{CommonArgs, WORDY_HELP};
 7 | use crate::helpers::bytesize::parse_bytesize;
 8 | 
 9 | // Description shown before the option list of `st unique` (`before_help`).
10 | pub const DESC: &str = "\
11 | The unique key can be 'seq' or any variable/function, expression, or
12 | text containing them (see <KEY> help and `st unique -V/--help-vars`).
13 | 
14 | The order of the records is the same as in the input unless the memory limit
15 | is exceeded, in which case temporary files are used and all remaining records
16 | are sorted by the unique key. Use `-s/--sort` to always sort the output
17 | by key.";
17 | 
18 | #[derive(Parser, Clone, Debug)]
19 | #[clap(next_help_heading = "'Unique' command options")]
20 | #[clap(before_help=DESC, help_template=WORDY_HELP)]
21 | pub struct UniqueCommand {
22 |     /// The key used to determine which records are unique.
23 |     /// The key can be a single variable/function such as 'seq',
24 |     /// a composed string such as '{attr(a)}_{attr(b)}',
25 |     /// or a comma-delimited list of multiple variables/functions, whose
26 |     /// values are all taken into account, e.g. 'seq,num(attr(a))'. In case of
27 |     /// identical sequences, records are still de-replicated by the header
28 |     /// attribute 'a'.
29 |     /// The 'num()' function turns text values into numbers, which can
30 |     /// speed up the de-replication.
31 |     /// For each key, the *first* encountered record is returned, and all
32 |     /// remaining ones with the same key are discarded.
33 |     pub key: String,
34 | 
35 |     /// Sort the output by key.
36 |     /// Without this option, the records are in input order if the memory limit
37 |     /// is *not* exceeded, but are sorted by key otherwise.
38 |     #[arg(short, long)]
39 |     pub sort: bool,
40 | 
41 |     /// Write a map of all duplicate sequence IDs to the given file (or '-' for stdout).
42 |     /// The (optional) compression format is auto-recognized from the extension.
43 |     /// By default, a two-column mapping of sequence ID -> unique reference record ID
44 |     /// is written (`long` format).
45 |     /// More formats can be selected with `--map-fmt`.
46 |     #[arg(long)]
47 |     pub map_out: Option<PathBuf>,
48 | 
49 |     /// Column format for the duplicate map `--map-out` (use `--help` for details).
50 |     #[arg(long, value_enum, default_value = "long")]
51 |     pub map_fmt: MapFormat,
52 | 
53 |     /// Maximum amount of memory (approximate) to use for de-duplicating.
54 |     /// Either a plain number (bytes) or a number with unit (K, M, G, T)
55 |     /// based on powers of 2.
56 |     #[arg(short = 'M', long, value_name = "SIZE", value_parser = parse_bytesize, default_value = "5G")]
57 |     pub max_mem: usize,
58 | 
59 |     /// Path to temporary directory (only if memory limit is exceeded)
60 |     #[arg(long, value_name = "PATH")]
61 |     pub temp_dir: Option<PathBuf>,
62 | 
63 |     /// Maximum number of temporary files allowed
64 |     #[arg(long, value_name = "N", default_value_t = 1000)]
65 |     pub temp_file_limit: usize,
66 | 
67 |     #[command(flatten)]
68 |     pub common: CommonArgs,
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/count.rs:
--------------------------------------------------------------------------------
  1 | use crate::helpers::NA;
  2 | 
  3 | use super::*;
  4 | 
  5 | // Plain record count, and counts per value of the `p` attribute
  6 | #[test]
  7 | fn simple() {
  8 |     let total = "4\n";
  9 |     cmp(&["count"], &*FASTA, total);
 10 | 
 11 |     let per_attr = "1\t1\n10\t1\n11\t1\n2\t1\n";
 12 |     cmp(&["count", "-k", "attr(p)"], &*FASTA, per_attr);
 13 | }
 14 | 
 15 | // Keys with a fixed value place all records into a single category
 16 | #[test]
 17 | fn fixed() {
 18 |     let total = "4\n";
 19 |     cmp(&["count"], &*FASTA, total);
 20 |     cmp(&["count"], &*FASTA, total);
 21 |     cmp(&["count", "-k", "num('2.3')"], &*FASTA, "2.3\t4\n");
 22 |     cmp(&["count"], &*FASTA, total);
 23 |     cmp(&["count", "-k", "bin('2.3', 1)"], &*FASTA, "(2, 3]\t4\n");
 24 | 
 25 |     let undefined = format!("{NA}\t4\n");
 26 |     cmp(&["count", "-k", "opt_attr(non_existent)"], &*FASTA, &undefined);
 27 | }
 28 | 
 29 | // Several keys: a comma-delimited list and repeated `-k` give the same output
 30 | #[test]
 31 | fn multi() {
 32 |     let expected = "25\t23\t1\n25\t24\t2\n25\t25\t1\n";
 33 | 
 34 |     // comma-delimited key list
 35 |     cmp(&["count", "-k", "seqlen,ungapped_seqlen"], &*FASTA, expected);
 36 |     // repeated `-k`
 37 |     let repeated = ["count", "-k", "seqlen", "-k", "ungapped_seqlen"];
 38 |     cmp(&repeated, &*FASTA, expected);
 39 | }
 40 | 
 41 | // Values of the `p` attribute are grouped with `bin(..., 10)`
 42 | #[test]
 43 | fn discrete_bins() {
 44 |     let key = "{bin(attr(p), 10)}";
 45 |     let expected = "(0, 10]\t2\n(10, 20]\t2\n";
 46 | 
 47 |     cmp(&["count", "-k", key], &*FASTA, expected);
 48 | }
 49 | 
 50 | // Input with floating-point `a` attributes, used by `float()`
 51 | const FLOAT_FASTA: &str = "\
 52 | >s1 a=1.10000000000002 =1.1
 53 | SEQ
 54 | >s2 a=0.00000000000001 =1e-14
 55 | SEQ
 56 | >s3 a=1.10000000000001 =1.1
 57 | SEQ
 58 | >s4 a=1.1000001 =1.1 with <=6 significant digits
 59 | SEQ
 60 | >s5 a=0.000000000000011 =1.1e-14
 61 | SEQ
 62 | >s6 a=11013452400000000001 =1.101345e19
 63 | SEQ
 64 | >s7 a=1.10000000000002 =1.1 (same as s1)
 65 | SEQ
 66 | ";
 67 | 
 68 | #[test]
 69 | fn float() {
 70 |     // text keys: every distinct text value is a category of its own
 71 |     let by_text = "0.00000000000001\t1\n0.000000000000011\t1\n1.10000000000001\t1\n1.10000000000002\t2\n1.1000001\t1\n11013452400000000001\t1\n";
 72 |     cmp(&["count", "-k", "attr(a)"], FLOAT_FASTA, by_text);
 73 | 
 74 |     // numeric keys
 75 |     let by_num = "1e-14\t1\n1.1e-14\t1\n1.10000\t4\n1.10135e19\t1\n";
 76 |     cmp(&["count", "-k", "num(attr(a))"], FLOAT_FASTA, by_num);
 77 | 
 78 |     // bins of width 1
 79 |     let by_bin = "(0, 1]\t2\n(1, 2]\t4\n(1.10135e19, 1.10135e19]\t1\n";
 80 |     cmp(&["count", "-k", "bin(attr(a), 1)"], FLOAT_FASTA, by_bin);
 81 | }
 85 | 
 86 | // Missing attributes: `opt_attr` yields NA, `attr` fails
 87 | #[test]
 88 | fn missing() {
 89 |     let na_out = format!("{NA}\t4\n");
 90 |     cmp(&["count", "-k", "{opt_attr(missing)}"], &*FASTA, &na_out);
 91 |     cmp(&["count", "-k", "{num(opt_attr(missing))}"], &*FASTA, &na_out);
 92 | 
 93 |     let err_msg = "'missing' not found in record";
 94 |     fails(&["count", "-k", "{attr(missing)}"], &*FASTA, err_msg);
 95 | 
 96 |     #[cfg(feature = "expr")]
 97 |     {
 98 |         let nan_out = "NaN\t4\n";
 99 |         cmp(&["count", "-k", "{opt_attr('missing') + 1}"], &*FASTA, nan_out);
100 |         cmp(&["count", "-k", "{num(opt_attr('missing')) + 1}"], &*FASTA, nan_out);
101 |     }
102 | }
118 | 


--------------------------------------------------------------------------------
/src/cmd/find/matcher/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::fmt::Debug;
 2 | 
 3 | use bio::alignment::AlignmentOperation;
 4 | 
 5 | use crate::error::CliResult;
 6 | 
 7 | use super::opts::{Algorithm, PatternConfig, SearchConfig, SearchOpts};
 8 | 
 9 | pub mod approx;
10 | pub mod exact;
11 | pub mod regex;
12 | 
13 | /// A pattern matcher that searches a text (byte slice) for hits.
14 | /// Implementors must also provide `MatcherBoxClone::clone_box`.
15 | pub trait Matcher: Debug + MatcherBoxClone {
16 |     /// NOTE(review): presumably returns `Ok(true)` if `text` contains at least
17 |     /// one hit; the implementations are not visible here, so please confirm.
18 |     fn has_matches(&self, text: &[u8]) -> Result<bool, String>;
19 | 
20 |     /// This method iterates over all hits and provides these to the
21 |     /// given closure. The exact hit type may vary depending on the
22 |     /// implementation.
23 |     /// The looping should be interrupted if the closure returns false.
24 |     fn do_search(
25 |         &mut self,
26 |         text: &[u8],
27 |         func: &mut dyn FnMut(&dyn Hit) -> Result<bool, String>,
28 |     ) -> Result<(), String>;
29 | }
26 | 
27 | /// Helper trait that allows cloning boxed `Matcher` trait objects.
28 | pub trait MatcherBoxClone {
29 |     fn clone_box(&self) -> Box<dyn Matcher + Send + Sync>;
30 | }
31 | 
32 | // Blanket implementation for all cloneable matchers that are `Send + Sync`
33 | impl<T: 'static + Matcher + Clone + Send + Sync> MatcherBoxClone for T {
34 |     fn clone_box(&self) -> Box<dyn Matcher + Send + Sync> {
35 |         let copy: T = self.clone();
36 |         Box::new(copy)
37 |     }
38 | }
39 | 
40 | impl Clone for Box<dyn Matcher + Send + Sync> {
41 |     fn clone(&self) -> Self {
42 |         (**self).clone_box()
43 |     }
44 | }
45 | 
46 | /// A single hit, as handed to the closure of `Matcher::do_search`.
47 | pub trait Hit {
48 |     /// NOTE(review): presumably writes the coordinates of match group `group`
49 |     /// into `out`; the implementations are not visible here, so please confirm.
50 |     fn get_group(&self, group: usize, out: &mut Match) -> Result<(), String>;
51 | }
49 | 
50 | /// contains 0-based coordinates and distance
51 | #[derive(Debug, Clone, Default, Eq, PartialEq)]
52 | pub struct Match {
53 |     pub start: usize,
54 |     pub end: usize,
55 |     pub dist: usize,
56 |     pub alignment_path: Vec<AlignmentOperation>,
57 | }
58 | 
59 | impl Match {
60 |     /// Start coordinate relative to the sequence end: `start - seq_len`
61 |     pub fn neg_start1(&self, seq_len: usize) -> i64 {
62 |         let (start, len) = (self.start as i64, seq_len as i64);
63 |         start - len
64 |     }
65 | 
66 |     /// End coordinate relative to the sequence end: `end - seq_len - 1`
67 |     pub fn neg_end1(&self, seq_len: usize) -> i64 {
68 |         let (end, len) = (self.end as i64, seq_len as i64);
69 |         end - len - 1
70 |     }
71 | }
68 | 
69 | /// Builds the matcher for a single pattern according to `cfg.algorithm`.
70 | /// Fails if regex match groups are requested for a non-regex search.
71 | pub fn get_matcher(
72 |     cfg: &PatternConfig,
73 |     opts: &SearchOpts,
74 | ) -> CliResult<Box<dyn Matcher + Send + Sync>> {
75 |     let is_regex = cfg.algorithm == Algorithm::Regex;
76 |     if opts.has_regex_groups && !is_regex {
77 |         return fail!(
78 |             "Match groups > 0 can only be used with regular expression searches (-r/--regex or --regex-unicode)."
79 |         );
80 |     }
81 |     let pattern = &cfg.pattern.seq;
82 |     let matcher: Box<dyn Matcher + Send + Sync> = match cfg.algorithm {
83 |         Algorithm::Exact => Box::new(exact::ExactMatcher::new(pattern.as_bytes())),
84 |         Algorithm::Regex => {
85 |             let at_most_one_hit = opts.hit_limit <= 1;
86 |             regex::get_matcher(
87 |                 pattern,
88 |                 at_most_one_hit,
89 |                 opts.has_regex_groups,
90 |                 opts.case_insensitive,
91 |             )?
92 |         }
93 |         Algorithm::Myers => approx::get_matcher(pattern, cfg.max_dist, cfg.has_ambigs, opts)?,
94 |     };
95 |     Ok(matcher)
96 | }
91 | 
92 | pub fn get_matchers(cfg: &SearchConfig) -> CliResult<Vec<Box<dyn Matcher + Send + Sync>>> {
93 |     let opts = cfg.get_opts();
94 |     cfg.patterns()
95 |         .iter()
96 |         .map(|p| get_matcher(p, opts))
97 |         .collect::<CliResult<Vec<_>>>()
98 | }
99 | 


--------------------------------------------------------------------------------
/profile/README.md:
--------------------------------------------------------------------------------
  1 | # Measuring time and memory / comparison to other tools
  2 | 
  3 | The following should work for Ubuntu Linux.
  4 | 
  5 | ```bash
  6 | outdir=target/st_benchmark
  7 | fq=$outdir/reads.fq
  8 | mkdir -p $outdir
  9 | ```
 10 | 
 11 | ## Build the binary
 12 | 
 13 | ```bash
 14 | cargo build --release
 15 | st=target/release/st
 16 | ```
 17 | 
 18 | <!-- 
 19 | 
 20 | OR PGO
 21 | 
 22 | ```bash
 23 | cores=4
 24 | target=x86_64-unknown-linux-gnu
 25 | st=target/$target/release/st
 26 | cargo pgo build -- -j $cores --target=$target
 27 | scripts/compare_tools.py -b $st -k main -d $outdir -o bench.json -t $tmp $fq scripts/perf_commands.yaml 
 28 | 
 29 | cargo pgo optimize
 30 | ``` -->
 31 | 
 32 | ## Download sequencing reads
 33 | 
 34 | <!-- ```bash
 35 | fq=big.fq
 36 | 
 37 | if [ ! -f $fq ]; then
 38 |     wget https://github.com/caporaso-lab/mockrobiota/archive/refs/heads/master.zip
 39 |     unzip master.zip
 40 |     echo -n > $fq
 41 |     for i in 11; do
 42 |         meta=mockrobiota-master/data/mock-$i/dataset-metadata.tsv
 43 |         cat $meta
 44 |         grep 'raw-data-url-forward-read' $meta |
 45 |             cut -f 2 -d $'\t' |
 46 |             xargs wget -O - |
 47 |             zcat |
 48 |             s sample -p 0.4 --fq \
 49 |             >> $fq
 50 |     done
 51 |     rm -Rf mockrobiota-master master.zip
 52 | fi
 53 | ``` -->
 54 | 
 55 | ```bash
 56 | wget -qi profile/fastq_urls.txt -O - | zcat > $fq
 57 | ls -lh $fq
 58 | ```
 59 | 
 60 | ## Create temporary storage
 61 | 
 62 | We rely on *tmpfs* to store output (and some input) files in memory,
 63 | avoiding disk IO latency as much as possible.
 64 | 
 65 | ```bash
 66 | rm -Rf $outdir/workdir && mkdir $outdir/workdir
 67 | chmod 777 $outdir/workdir
 68 | sudo mount -t tmpfs -o size=10G none $outdir/workdir
 69 | mkdir -p $outdir/workdir/tmp
 70 | ```
 71 | 
 72 | Prepare the forward primer for searching:
 73 | 
 74 | ```bash
 75 | # >ITS_S2F
 76 | # CGATACTTGGTGTGAAT
 77 | # >ITS3
 78 | # TCGATGAAGAACGCAGC
 79 | cat > $outdir/workdir/primers.fasta <<- EOM
 80 | >ITS4
 81 | GTCCTCCGCTTATTGATATGC
 82 | EOM
 83 | ```
 84 | 
 85 | ## Run the benchmarks
 86 | 
 87 | Before running, disable frequency boost:
 88 | 
 89 | ```bash
 90 | # requires cpufrequtils installed
 91 | echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
 92 | ```
 93 | 
 94 | On Ubuntu, disable the indexer for full-text search in the working directory:
 95 | 
 96 | ```bash
 97 | echo -n > $outdir/workdir/.trackerignore
 98 | ```
 99 | 
100 | Run the comparison. The `compare_tools.py` script not only compares runtimes and memory usage,
101 | but in some cases also validates that the output is the same.
102 | See `profile/comparison_commands.yaml`.
103 | 
104 | ```bash
105 | export SEQKIT_THREADS=1
106 | $st count $fq  # cache the file in memory
107 | scripts/compare_tools.py \
108 |     -b $st -d $outdir/workdir -o profile/comparison.json -t $outdir/workdir/tmp \
109 |     -k main \
110 |     $fq profile/comparison_commands.yaml 
111 |  # -k main,other
112 | scripts/summarize_comparison.py profile/comparison.json - > profile/comparison.md
113 | ```
114 | 
115 | ## Clean up
116 | 
117 | ```bash
118 | rm -Rf $outdir/workdir
119 | ```
120 | 


--------------------------------------------------------------------------------
/src/cmd/revcomp.rs:
--------------------------------------------------------------------------------
 1 | use std::sync::OnceLock;
 2 | 
 3 | use clap::Parser;
 4 | 
 5 | use crate::cli::{CommonArgs, WORDY_HELP};
 6 | use crate::config::Config;
 7 | use crate::error::CliResult;
 8 | use crate::helpers::{
 9 |     complement::reverse_complement,
10 |     seqtype::{SeqType, SeqtypeHelper},
11 | };
12 | use crate::io::SeqQualRecord;
13 | 
14 | // NOTE(review): "not reversed" below probably means "not complemented"
15 | // (the implementation of `reverse_complement` is not visible here) - confirm.
16 | pub const DESC: &str = "\
17 | The sequence type is automatically detected based on the first record,
18 | unless the `--seqtype` option is used.
19 | 
20 | *Note*: Unknown letters are not reversed, but left unchanged.
21 | 
22 | If quality scores are present, their order is just reversed";
21 | 
22 | #[derive(Parser, Clone, Debug)]
23 | #[clap(next_help_heading = "'Revcomp' command options")]
24 | #[clap(before_help=DESC, help_template=WORDY_HELP)]
25 | pub struct RevcompCommand {
26 |     /// Number of threads to use
27 |     #[arg(short, long, default_value_t = 1)]
28 |     threads: u32, // `run` hands this value minus one to `read_parallel_init`
29 | 
30 |     #[command(flatten)]
31 |     pub common: CommonArgs,
32 | }
33 | 
34 | // Buffers filled per record by the first closure in `run` and read by the second one
35 | #[derive(Default, Clone, Debug)]
36 | struct RevCompRecord {
37 |     seq: Vec<u8>, // output buffer of `reverse_complement`
38 |     qual: Option<Vec<u8>>, // quality scores in reversed order, set if a record has them
39 |     seqtype: Option<SeqType>, // sequence type, set when the first record is processed
40 | }
40 | 
41 | // Process-wide cache of the sequence type (or the error message) obtained by
42 | // the first `get_or_init` call in `run`.
43 | // TODO: wait for https://doc.rust-lang.org/std/sync/struct.OnceLock.html#method.get_or_try_init stabilization
44 | static SEQTYPE: OnceLock<Result<SeqType, String>> = OnceLock::new();
43 | 
44 | /// Runs the `revcomp` command: reverse-complements every record (reversing
45 | /// the order of the quality scores, if present) and writes the result.
46 | ///
47 | /// The sequence type is obtained once via `SeqtypeHelper::get_or_guess` and
48 | /// cached in `SEQTYPE`. Fails if records end up with different sequence types.
49 | pub fn run(mut cfg: Config, args: RevcompCommand) -> CliResult<()> {
50 |     // `--threads 0` is treated like `--threads 1`; a plain `threads - 1` would
51 |     // underflow (panic in debug builds, wrap around in release builds)
52 |     let num_workers = args.threads.saturating_sub(1);
53 | 
54 |     let mut format_writer = cfg.get_format_writer()?;
55 |     let typehint = cfg.input_config[0].format.seqtype;
56 |     let mut final_seqtype = None;
57 |     cfg.with_io_writer(|io_writer, mut cfg| {
58 |         cfg.read_parallel_init(
59 |             num_workers,
60 |             Default::default,
61 |             |record, out: &mut Box<RevCompRecord>| {
62 |                 if out.seqtype.is_none() {
63 |                     let seqtype = SEQTYPE.get_or_init(|| {
64 |                         SeqtypeHelper::new(typehint).get_or_guess(record)
65 |                     }).clone()?;
66 |                     out.seqtype = Some(seqtype);
67 |                 }
68 |                 reverse_complement(record.seq_segments(), &mut out.seq, out.seqtype.unwrap())?;
69 |                 if let Some(q) = record.qual() {
70 |                     let qual = out.qual.get_or_insert_with(|| Vec::with_capacity(q.len()));
71 |                     qual.clear();
72 |                     qual.extend(q.iter().rev());
73 |                 }
74 |                 Ok(())
75 |             },
76 |             |record, revcomp_record, ctx| {
77 |                 if final_seqtype.is_none() {
78 |                     final_seqtype = revcomp_record.seqtype;
79 |                 } else if revcomp_record.seqtype != final_seqtype {
80 |                     // fail if there is a mismatch in sequence types guessed in different threads
81 |                     return fail!("Could not reliably guess the sequence type. Please specify with `--seqtype`");
82 |                 }
83 |                 let rc_rec = SeqQualRecord::new(
84 |                     &record,
85 |                     &revcomp_record.seq,
86 |                     revcomp_record.qual.as_deref(),
87 |                 );
88 |                 format_writer.write(&rc_rec, io_writer, ctx)?;
89 |                 Ok(true)
90 |             },
91 |         )
92 |     })?;
93 |     Ok(())
94 | }
88 | 


--------------------------------------------------------------------------------
/src/helpers/number.rs:
--------------------------------------------------------------------------------
  1 | //! Helpers for number handling
  2 | 
  3 | use std::{
  4 |     fmt,
  5 |     ops::{Deref, DerefMut},
  6 | };
  7 | 
  8 | use ordered_float::OrderedFloat;
  9 | 
 10 | /// Discretizes a floating-point number into bins of width 'interval'.
 11 | /// The bin starts at `floor(num / interval) * interval`.
 12 | pub fn bin(num: f64, interval: f64) -> Interval {
 13 |     let lower = (num / interval).floor() * interval;
 14 |     let upper = lower + interval;
 15 |     Interval::new(lower, upper)
 16 | }
 15 | 
 16 | // TODO: consider replacing rust-lexical https://github.com/rustsec/advisory-db/issues/1757
 17 | /// Parses a decimal number from text, returning an error message on failure.
 18 | pub fn parse_float(text: &[u8]) -> Result<f64, String> {
 19 |     match lexical::parse(text) {
 20 |         Ok(value) => Ok(value),
 21 |         Err(_) => Err(format!(
 22 |             "Could not convert '{}' to a decimal number.",
 23 |             String::from_utf8_lossy(text)
 24 |         )),
 25 |     }
 26 | }
 25 | 
 26 | pub fn parse_int(text: &[u8]) -> Result<i64, String> {
 27 |     atoi::atoi(text).ok_or_else(|| {
 28 |         format!(
 29 |             "Could not convert '{}' to an integer number.",
 30 |             String::from_utf8_lossy(text)
 31 |         )
 32 |     })
 33 | }
 34 | 
 35 | /// Float wrapper used across this crate.
 36 | /// Being based on `OrderedFloat`, it can be sorted and hashed. Its `Display`
 37 | /// implementation prints the number in a human-readable way.
 38 | #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
 39 | #[cfg_attr(
 40 |     any(feature = "all-commands", feature = "sort", feature = "unique"),
 41 |     derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize),
 42 |     archive(compare(PartialEq), check_bytes)
 43 | )]
 44 | pub struct Float(OrderedFloat<f64>);
 45 | 
 46 | impl Float {
 47 |     pub fn new(f: f64) -> Self {
 48 |         Float(OrderedFloat(f))
 49 |     }
 50 | 
 51 |     /// Returns the wrapped `f64` value
 52 |     pub fn inner(&self) -> f64 {
 53 |         let OrderedFloat(value) = self.0;
 54 |         value
 55 |     }
 56 | }
 57 | 
 58 | impl Deref for Float {
 59 |     type Target = f64;
 60 |     fn deref(&self) -> &f64 {
 61 |         let OrderedFloat(value) = &self.0;
 62 |         value
 63 |     }
 64 | }
 65 | 
 66 | impl DerefMut for Float {
 67 |     fn deref_mut(&mut self) -> &mut f64 {
 68 |         let OrderedFloat(value) = &mut self.0;
 69 |         value
 70 |     }
 71 | }
 68 | 
 69 | impl fmt::Display for Float {
 70 |     // Formats the number with `lexical`, using `trim_floats` and at most
 71 |     // 6 significant digits.
 72 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 73 |         // TODO: consider replacing rust-lexical https://github.com/rustsec/advisory-db/issues/1757
 74 |         use lexical::{format::STANDARD, WriteFloatOptions};
 75 | 
 76 |         let max_digits = std::num::NonZeroUsize::new(6);
 77 |         let opts = WriteFloatOptions::builder()
 78 |             .trim_floats(true)
 79 |             .max_significant_digits(max_digits)
 80 |             // matching JS formatting
 81 |             .nan_string(Some(b"NaN"))
 82 |             .inf_string(Some(b"Infinity"))
 83 |             .build()
 84 |             .unwrap();
 85 |         let formatted = lexical::to_string_with_options::<_, { STANDARD }>(self.inner(), &opts);
 86 |         f.write_str(&formatted)
 87 |     }
 88 | }
 87 | 
/// A pair of `Float` values (start, end) describing an interval.
///
/// If any of the `all-commands`, `sort` or `unique` features is enabled,
/// the `rkyv` archive/(de)serialization traits are derived as well.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
#[cfg_attr(
    any(feature = "all-commands", feature = "sort", feature = "unique"),
    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize),
    archive(compare(PartialEq), check_bytes)
)]
pub struct Interval(pub Float, pub Float);
 95 | 
 96 | impl Interval {
 97 |     pub fn new(start: f64, end: f64) -> Self {
 98 |         Self(Float::new(start), Float::new(end))
 99 |     }
100 | }
101 | 
/// Writes the interval as `(start, end]`, using the `Display` output of
/// both bounds.
impl fmt::Display for Interval {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Interval(start, end) = self;
        write!(f, "({start}, {end}]")
    }
}
107 | 


--------------------------------------------------------------------------------
/src/cmd/mask.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use crate::cli::{CommonArgs, WORDY_HELP};
 4 | use crate::config::Config;
 5 | use crate::error::CliResult;
 6 | use crate::helpers::var_range::VarRanges;
 7 | use crate::io::SeqQualRecord;
 8 | 
/// Description of the 'mask' command, passed to clap as `before_help`
/// of `MaskCommand`.
pub const DESC: &str = "\
Masks the sequence within a given range or comma delimited list of ranges
by converting to lowercase (soft mask) or replacing with a character (hard
masking). Reverting soft masking is also possible.";
13 | 
/// Command line arguments of the 'mask' command.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Mask' command options")]
#[clap(before_help=DESC, help_template=WORDY_HELP)]
pub struct MaskCommand {
    /// Range in the form 'start:end' or 'start:' or ':end',
    /// The range start/end may be defined by variables/functions,
    /// or the variable/function may contain a whole range.
    ranges: String,

    /// Do hard masking instead of soft masking, replacing
    /// everything in the range(s) with the given character
    #[arg(long, value_name = "CHAR")]
    hard: Option<char>,

    /// Unmask (convert to uppercase instead of lowercase)
    #[arg(long)]
    unmask: bool,

    /// Exclusive range: excludes start and end positions
    /// from the masked sequence.
    /// In the case of unbounded ranges (`start:` or `:end`), the range still
    /// extends to the complete end or the start of the sequence.
    #[arg(short, long)]
    exclusive: bool,

    /// Interpret range as 0-based, with the end not included.
    #[arg(short('0'), long)]
    zero_based: bool,

    #[command(flatten)]
    pub common: CommonArgs,
}
46 | 
/// Runs the 'mask' command.
///
/// For every record, the sequence segments are joined into one buffer, the
/// configured ranges are resolved and the corresponding parts of the
/// sequence are either replaced by the `--hard` character, converted to
/// uppercase (`--unmask`) or converted to lowercase (default). The modified
/// record is then written to the output.
///
/// # Errors
///
/// Fails if the `--hard` character is not an ASCII character, and forwards
/// any error from range parsing/resolution, reading or writing.
pub fn run(mut cfg: Config, args: MaskCommand) -> CliResult<()> {
    let ranges = &args.ranges;
    let rng0 = args.zero_based;
    let exclusive = args.exclusive;
    let unmask = args.unmask;

    // Sequences are handled as raw bytes, so the masking character has to fit
    // into a single byte. A plain `as u8` cast would silently truncate any
    // non-ASCII character to an unrelated byte.
    let hard_mask = match args.hard {
        Some(c) if !c.is_ascii() => {
            return fail!(format!(
                "The hard masking character must be an ASCII character, found '{}'",
                c
            ));
        }
        other => other.map(|c| c as u8),
    };

    let mut format_writer = cfg.get_format_writer()?;
    cfg.with_io_writer(|io_writer, mut cfg| {
        let mut ranges = cfg.build_vars(|b| VarRanges::from_str(ranges, b))?;
        let mut seq = Vec::new();
        let mut num_buf = Vec::new();

        cfg.read(|record, ctx| {
            // obtain full sequence
            seq.clear();
            for s in record.seq_segments() {
                seq.extend_from_slice(s);
            }
            let seqlen = seq.len();

            let calc_ranges = ranges.resolve(ctx.symbols(), record, &mut num_buf)?;

            for rng in calc_ranges {
                let (start, end) = rng.adjust(rng0, exclusive)?.resolve(seqlen);
                let target = &mut seq[start..end];
                match hard_mask {
                    Some(byte) => target.fill(byte),
                    None if unmask => target.make_ascii_uppercase(),
                    None => target.make_ascii_lowercase(),
                }
            }

            let rec = SeqQualRecord::new(&record, &seq, None);
            format_writer.write(&rec, io_writer, ctx)?;

            Ok(true)
        })
    })
}
98 | 


--------------------------------------------------------------------------------
/src/test/trim.rs:
--------------------------------------------------------------------------------
  1 | use crate::helpers::NA;
  2 | 
  3 | use super::*;
  4 | 
  5 | #[test]
  6 | fn trim() {
  7 |     let seq = "ATGC";
  8 |     let fasta = fasta_record(seq);
  9 | 
 10 |     cmp(&["trim", ":"], &fasta, &fasta);
 11 |     cmp(&["trim", "1:"], &fasta, &fasta);
 12 |     cmp(&["trim", ":1"], &fasta, &fasta_record(&seq[..1]));
 13 |     cmp(&["trim", "2:-2"], &fasta, &fasta_record(&seq[1..3]));
 14 |     // exclusive
 15 |     cmp(&["trim", "-e", "1:3"], &fasta, &fasta_record(&seq[1..2]));
 16 |     cmp(&["trim", "-e", "2:3"], &fasta, &fasta_record(&seq[2..2]));
 17 |     cmp(&["trim", "-e", "2:4"], &fasta, &fasta_record(&seq[2..3]));
 18 |     // exclusive + unbounded
 19 |     cmp(&["trim", "-e", ":3"], &fasta, &fasta_record(&seq[..2]));
 20 |     cmp(&["trim", "-e", "2:"], &fasta, &fasta_record(&seq[2..]));
 21 |     // empty seq
 22 |     cmp(&["trim", "2:1"], &fasta, &fasta_record(""));
 23 | }
 24 | 
 25 | #[test]
 26 | fn trim0() {
 27 |     let seq = "ATGC";
 28 |     let fasta = fasta_record(seq);
 29 | 
 30 |     cmp(&["trim", "-0", "1:3"], &fasta, &fasta_record(&seq[1..3]));
 31 |     cmp(&["trim", "-0", ":3"], &fasta, &fasta_record(&seq[..3]));
 32 |     cmp(&["trim", "-0", "2:"], &fasta, &fasta_record(&seq[2..]));
 33 | }
 34 | 
 35 | #[test]
 36 | fn trim_qual() {
 37 |     // quality trimming
 38 |     let fq = "@id\nATGC\n+\n1234\n";
 39 | 
 40 |     cmp(&["trim", "--fq", ":2"], fq, "@id\nAT\n+\n12\n");
 41 |     cmp(&["trim", "--fq", "2:3"], fq, "@id\nTG\n+\n23\n");
 42 | }
 43 | 
 44 | #[test]
 45 | fn trim_vars() {
 46 |     let id = "id start=2 end=3 range=2:3";
 47 |     let fa = format!(">{id}\nATGC\n");
 48 |     let trimmed = format!(">{id}\nTG\n");
 49 | 
 50 |     cmp(&["trim", "{attr(start)}:{attr(end)}"], &fa, &trimmed);
 51 |     cmp(&["trim", "{attr(range)}"], &fa, &trimmed);
 52 |     // multiple ranges
 53 |     // TODO: space not deleted
 54 |     cmp(
 55 |         &["trim", "{attr_del(r)}"],
 56 |         ">id r=1:2,4:4\nATGC\n",
 57 |         ">id \nATC\n",
 58 |     );
 59 | }
 60 | 
 61 | #[test]
 62 | fn trim_multiline() {
 63 |     let fa = ">id\nAB\nCDE\nFGHI\nJ";
 64 |     let seq = "ABCDEFGHIJ";
 65 | 
 66 |     cmp(&["trim", ":"], fa, &format!(">id\n{seq}\n"));
 67 | 
 68 |     for start in 0..seq.len() - 1 {
 69 |         for end in start..seq.len() {
 70 |             cmp(
 71 |                 &["trim", "-0", &format!("{start}:{end}")],
 72 |                 fa,
 73 |                 &format!(">id\n{}\n", &seq[start..end]),
 74 |             );
 75 |         }
 76 |     }
 77 | }
 78 | 
 79 | #[test]
 80 | fn trim_multiline_multirange() {
 81 |     let fa = ">id\nAB\nC\nDE\nFGHI\nJ";
 82 | 
 83 |     cmp(&["trim", "2:4,6:7"], fa, ">id\nBCDFG\n");
 84 |     cmp(&["trim", "-4:-3,-1:"], fa, ">id\nGHJ\n");
 85 | }
 86 | 
/// Handling of missing values (`NA`) and invalid values as range bounds.
#[test]
fn trim_na() {
    // a missing start (and end) bound leaves the sequence unchanged
    cmp(&["trim", &format!("{NA}:")], ">id\nABCDE\n", ">id\nABCDE\n");
    cmp(
        &["trim", &format!("{NA}:{NA}")],
        ">id\nABCDE\n",
        ">id\nABCDE\n",
    );
    // `opt_attr` accepts the missing value as start bound
    cmp(
        &["trim", "{opt_attr(s)}:{attr(e)}"],
        format!(">id s={NA} e=3\nABCDE\n"),
        &format!(">id s={NA} e=3\nABC\n"),
    );
    // `attr` fails if the attribute holds the missing value
    fails(
        &["trim", "{attr(s)}:{attr(e)}"],
        format!(">id s={NA} e=3\nABCDE\n"),
        "reserved for missing values",
    );
    // the attribute `e` is absent: the range is open-ended
    cmp(
        &["trim", "{opt_attr(s)}:{opt_attr(e)}"],
        ">id s=3\nABCDE\n",
        ">id s=3\nCDE\n",
    );
    // non-numeric attribute values are reported as an error
    fails(
        &["trim", "{opt_attr(s)}:{opt_attr(e)}"],
        ">id s=something\nABCDE\n",
        "Could not convert 'something' to an integer number",
    );
}
116 | 


--------------------------------------------------------------------------------
/scripts/gen_help.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # cargo build
  4 | 
# binary used for generating the help output
seqtool=target/debug/st
# output directory of the generated documentation
outdir=../seqtool-docs
# main page, to which the command overview is written
main=../seqtool-docs/index.md #_README.md
# navigation file (YAML), receiving one entry per command
nav=../seqtool-docs/nav.yml
  9 | 
 10 | # # prepend table of contents if there are H3 headings
 11 | # prepend_toc() {
 12 | #   contents="$1"
 13 | #   level="$2"
 14 | #   toc_level="$3"
 15 | #   toc=`grep "^$level " "$contents" |
 16 | #     sed -E "s/^$level (.*)/* [\1]   #\1/g" |
 17 | #     awk -F'   ' '{ gsub(" ", "-", $2); rep=gsub("[()]", "", $2); print sprintf("%s(%s)", $1, tolower($2)) }'`
 18 | 
 19 | #   if [ `printf "$toc" | wc -l` -gt 1 ]; then
 20 | #     printf "$toc_level Contents\n\n$toc\n\n" | cat - "$contents" > tmp_out
 21 | #     mv tmp_out "$contents"
 22 | #   fi
 23 | # }
 24 | 
 25 | 
 26 | # echo -e "---\npermalink: /\ntitle: title\nwide: true\nsidebar:\n  \nnav: docs\n---\n" > tmp_out
 27 | 
# start the navigation file with the 'Commands' section
echo -e "docs:\n  - title: Commands\n    children:\n" > $nav

# the main page starts with the static header
cat doc/_head.md > $main

# generate command files

printf "\n## Commands" >> $main

# categories (entries starting with '>'), each followed by its commands
cmd=(
  ">Basic conversion / editing" pass
  ">Information about sequences" view count stat
  ">Subsetting/shuffling sequences" sort unique filter split sample slice head tail interleave
  ">Searching and replacing" find replace
  ">Modifying commands" del set trim mask upper lower revcomp concat
)
 43 | 
 44 | # create one MD file per command
 45 | 
 46 | for c in "${cmd[@]}"; do
 47 |   echo "$c"
 48 | 
 49 |   if [[ "$c" = ">"* ]]; then
 50 |     # category name
 51 |     c=$(echo "$c" | cut -c2-)
 52 |     printf "\n### $c\n" >> $main
 53 |     continue
 54 |   fi
 55 | 
 56 |   out=$outdir/$c.md
 57 |   echo -n > $out
 58 | 
 59 |   opts=$(stty cols 80 && $seqtool "$c" -h 2>&1 | sed -n '/General options/q;p')
 60 |   desc=$(echo "$opts" | sed -n '/^ *$/q;p')
 61 | 
 62 |   # add command to overview
 63 |   echo "* **[$c](https://markschl.github.io/seqtool/$c)**: $desc" >> $main
 64 | 
 65 |   # add custom help content if file exists in doc dir
 66 |   desc_f=doc/$c.md
 67 |   if [ -f $desc_f ]; then
 68 |     echo "## Details" >> $out
 69 |     cat $desc_f >> $out
 70 |   fi
 71 | 
 72 |   # add variable help if present
 73 |   vars=$($seqtool $c  --help-vars-md --help-cmd-vars 2>&1 || true)
 74 |   if [ ! -z "$vars" -a "$vars" != " "  ]; then
 75 |     echo -e "$vars" | sed 's/^ *#/##/g' >> $out
 76 |   fi
 77 | 
 78 |   # prepend_toc $out '###' '##'
 79 | 
 80 |   # TODO: why prepend?
 81 |   # prepend usage info
 82 |   # echo -e "---\npermalink: /$c/\ntitle: $c\ntoc: true\nsidebar:\n  nav: docs\n---\n" > tmp_out
 83 |   usage=$(echo "$opts" | sed '/Usage:/,$!d')
 84 |   printf "$desc\n\n\`\`\`\n$usage\n\`\`\`\n\n[See this page](opts) for the options common to all commands.\n\n" |
 85 |     cat - $out >> tmp_out
 86 |     mv tmp_out $out
 87 | 
 88 |   echo -e "      - title: $c\n        url: /$c" >> $nav
 89 | 
 90 | done
 91 | 
 92 | 
 93 | echo >> $main
 94 | cat doc/_desc.md >> $main
 95 | 
 96 | # variables/functions
 97 | out=$outdir/variables.md
 98 | cp doc/variables.md $outdir/variables.md
 99 | # full variables reference
100 | out=$outdir/var_reference.md
101 | $seqtool . --help-vars-md 2>&1 > $out
102 | prepend_toc $out '##' '##'
103 | mv $out tmp_out
104 | echo -e "\n# Variables/functions: full reference\n" > $out
105 | cat tmp_out >> $out
106 | rm tmp_out
107 | 
108 | # args common to all commands
109 | out=$outdir/opts.md
110 | printf "\n\n### Options recognized by all commands\n\n" > $out
111 | echo "\`\`\`" >> $out
112 | stty cols 80 && $seqtool pass -h 2>1 | sed '/General options/,$!d' >> $out
113 | echo "\`\`\`" >> $out
114 | 
115 | # other files
116 | 
117 | # TODO: doc/expressions.md
118 | cp doc/meta.md doc/ranges.md doc/attributes.md $outdir
119 | 


--------------------------------------------------------------------------------
/src/cmd/concat.rs:
--------------------------------------------------------------------------------
  1 | use std::iter::repeat_n;
  2 | 
  3 | use clap::Parser;
  4 | 
  5 | use crate::cli::{CommonArgs, WORDY_HELP};
  6 | use crate::config::Config;
  7 | use crate::error::CliResult;
  8 | use crate::io::OwnedRecord;
  9 | 
/// Description of the 'concat' command, passed to clap as `before_help`
/// of `ConcatCommand`.
pub const DESC: &str = "\
The sequence IDs must be in the same order in all files;
Fails if the IDs don't match.";
 13 | 
 14 | #[derive(Parser, Clone, Debug)]
 15 | #[clap(next_help_heading = "'Concat' command options")]
 16 | #[clap(before_help=DESC, help_template=WORDY_HELP)]
 17 | pub struct ConcatCommand {
 18 |     /// Don't check if the IDs of the records from
 19 |     /// the different files match
 20 |     #[arg(short, long, short)]
 21 |     no_id_check: bool,
 22 | 
 23 |     /// Add a spacer of <N> characters inbetween the concatenated
 24 |     /// sequences.
 25 |     #[arg(short, long, short)]
 26 |     spacer: Option<usize>,
 27 | 
 28 |     /// Character to use as spacer for sequences
 29 |     #[arg(short('c'), long, default_value = "N")]
 30 |     s_char: char,
 31 | 
 32 |     /// Character to use as spacer for qualities.
 33 |     /// Defaults to a phred score of 41 (Illumina 1.8+/Phred+33 encoding, which
 34 |     /// is the default assumed encoding).
 35 |     #[arg(short = 'Q', long, default_value = "J")]
 36 |     q_char: char,
 37 | 
 38 |     #[command(flatten)]
 39 |     pub common: CommonArgs,
 40 | }
 41 | 
 42 | pub fn run(mut cfg: Config, args: ConcatCommand) -> CliResult<()> {
 43 |     let id_check = !args.no_id_check;
 44 |     let spacer_n = args.spacer;
 45 |     let s_char = args.s_char as u8;
 46 |     let q_char = args.q_char as u8;
 47 | 
 48 |     let mut format_writer = cfg.get_format_writer()?;
 49 |     cfg.with_io_writer(|io_writer, mut cfg| {
 50 |         let mut record = OwnedRecord::default();
 51 |         let num_readers = cfg.num_readers();
 52 |         if num_readers == 0 {
 53 |             return fail!("Nothing to concatenate!");
 54 |         }
 55 |         let max_idx = num_readers - 1;
 56 | 
 57 |         cfg.read_alongside(false, |i, rec, ctx| {
 58 |             if i == 0 {
 59 |                 // initialize record
 60 |                 record.update_header_from(rec);
 61 |                 record.seq.clear();
 62 |             } else if id_check && rec.id() != record.id.as_slice() {
 63 |                 return fail!(format!(
 64 |                     "ID of record #{} ({}) does not match the ID of the first one ({})",
 65 |                     i + 1,
 66 |                     String::from_utf8_lossy(rec.id()),
 67 |                     String::from_utf8_lossy(&record.id)
 68 |                 ));
 69 |             }
 70 | 
 71 |             // extend seq
 72 |             for s in rec.seq_segments() {
 73 |                 record.seq.extend(s);
 74 |             }
 75 | 
 76 |             // handle qual
 77 |             if let Some(q) = rec.qual() {
 78 |                 let qual = record.qual.get_or_insert_with(Vec::new);
 79 |                 if i == 0 {
 80 |                     qual.clear();
 81 |                 }
 82 |                 qual.extend(q);
 83 |             }
 84 | 
 85 |             // spacer
 86 |             if let Some(n) = spacer_n {
 87 |                 if i < max_idx {
 88 |                     record.seq.extend(repeat_n(s_char, n));
 89 |                     if let Some(q) = record.qual.as_mut() {
 90 |                         q.extend(repeat_n(q_char, n));
 91 |                     }
 92 |                 }
 93 |             }
 94 | 
 95 |             // write at last
 96 |             if i == max_idx {
 97 |                 // handle variables (read_alongside requires this to be done manually)
 98 |                 ctx.set_record(&record, 0)?;
 99 |                 format_writer.write(&record, io_writer, ctx)?;
100 |             }
101 |             Ok(true)
102 |         })
103 |     })
104 | }
105 | 


--------------------------------------------------------------------------------
/src/test/sample.rs:
--------------------------------------------------------------------------------
  1 | extern crate rand;
  2 | use rand::{distr::Uniform, prelude::*, seq::IteratorRandom};
  3 | 
  4 | use crate::cmd::sample::DefaultRng;
  5 | 
  6 | use super::*;
  7 | 
  8 | #[test]
  9 | fn simple() {
 10 |     let input = tmp_file("sample_simple__", ".fasta", &FASTA);
 11 |     // very simple tests
 12 |     cmp(&["sample", "-n", "4"], &input, &FASTA);
 13 |     fails(
 14 |         &["sample", "-p", "2"],
 15 |         &input,
 16 |         "Fractions should be between 0 and 1",
 17 |     );
 18 |     fails(
 19 |         &["sample", "-p", "1"],
 20 |         &input,
 21 |         "Fractions should be between 0 and 1",
 22 |     );
 23 | }
 24 | 
/// Compares the output of the 'sample' command for 1000 records with the
/// expectation obtained from the `rand` crate, using an integer and a string
/// seed, different sample sizes / probabilities and different memory limits.
#[test]
fn large() {
    with_tmpdir("st_sample_large_", |td| {
        // RNGs and seeds
        // test with integer seed
        let seed1 = 602993;
        // string seed
        let seed2 = "ABCDEFGHIJKLMNOP";
        // the string seed is copied into a zero-initialized 32-byte array
        let mut seed2_array = [0; 32];
        (&mut seed2_array[..]).write_all(seed2.as_bytes()).unwrap();
        // pairs of (seed argument for the command, constructor of an equally seeded RNG)
        let rngs: Vec<(String, Box<dyn Fn() -> DefaultRng>)> = vec![
            (
                format!("{seed1}"),
                Box::new(|| DefaultRng::seed_from_u64(seed1)),
            ),
            (
                seed2.to_string(),
                Box::new(|| DefaultRng::from_seed(seed2_array)),
            ),
        ];

        // input

        let n_records = 1000;
        let seqs: Vec<_> = (0..n_records).map(|i| format!(">{i}\nSEQ\n")).collect();
        let fasta = seqs.join("");

        let input = td.file(".fasta", &fasta);

        for (seed, get_rng) in &rngs {
            // test fixed number (-n);
            for n in [1, 10, 100, 500, 998, 1000] {
                // also test different memory limits to ensure that switching
                // from sampling whole records to indices only works
                for rec_limit in [1, 5, 10, 100, 200, 500, 800, 1000, 10000] {
                    for two_pass in [false, true] {
                        // expected output:
                        // we use reservoir sampling implemented in the rand crate,
                        // which is a way of validating our own reimplementation.
                        // (a fresh RNG is created for every run)
                        let mut rng = get_rng();
                        let mut indices = (0..n_records).choose_multiple(&mut rng, n);
                        indices.sort(); // results always in input order
                        let expected = indices.into_iter().map(|i| seqs[i].clone()).join("");
                        // run sample command
                        let mem_limit = rec_limit * n * 12;
                        let n = format!("{n}");
                        let mem = format!("{mem_limit}");
                        let mut args = vec!["sample", "-n", &n, "-s", seed, "--max-mem", &mem];
                        if two_pass {
                            args.push("-2");
                        }
                        cmp(&args, &input, &expected);
                    }
                }
            }

            // test probability sampling (-p);
            // expected output: one uniform random number in [0, 1) is drawn per
            // record, the record is kept if the number is smaller than `p`
            let distr = Uniform::new(0f32, 1.).unwrap();
            for &p in &[0., 0.1, 0.3, 0.5, 0.7, 0.95] {
                let mut rng = get_rng();
                let expected = seqs
                    .iter()
                    .filter(|&_| distr.sample(&mut rng) < p)
                    .cloned()
                    .join("");

                cmp(
                    &["sample", "-p", &format!("{p}"), "-s", seed],
                    &input,
                    &expected,
                );
            }
        }
    });
}
100 | 


--------------------------------------------------------------------------------
/src/cmd/cmp/mod.rs:
--------------------------------------------------------------------------------
  1 | use crate::config::Config;
  2 | use crate::error::CliResult;
  3 | use crate::var::{modules::VarProvider, varstring::register_var_list};
  4 | 
  5 | mod cli;
  6 | mod complete;
  7 | mod in_order;
  8 | mod output;
  9 | mod vars;
 10 | 
 11 | pub use self::cli::*;
 12 | pub use self::output::*;
 13 | pub use self::vars::*;
 14 | 
 15 | #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 16 | pub enum Category {
 17 |     Common,
 18 |     Unique1,
 19 |     Unique2,
 20 | }
 21 | 
 22 | use self::Category::*;
 23 | 
 24 | impl Category {
 25 |     fn long_text(self) -> &'static str {
 26 |         match self {
 27 |             Common => "common",
 28 |             Unique1 => "unique1",
 29 |             Unique2 => "unique2",
 30 |         }
 31 |     }
 32 | 
 33 |     fn short_text(self) -> &'static str {
 34 |         match self {
 35 |             Common => "c",
 36 |             Unique1 => "u1",
 37 |             Unique2 => "u2",
 38 |         }
 39 |     }
 40 | }
 41 | 
/// Record counts per category; `run` prints them to STDERR (unless `quiet`
/// is set) and uses the `unique1`/`unique2` counts for the `check` option.
#[derive(Debug, Default, Copy, Clone)]
struct CmpStats {
    pub common: u64,
    pub unique1: u64,
    pub unique2: u64,
}
 48 | 
/// Factor for adjusting the calculated memory usage (based on size of items)
/// to obtain the approximately correct total memory usage.
/// It corrects for the extra memory that may not be in the calculation otherwise.
/// The user-supplied memory limit is divided by this factor in `run`.
static MEM_OVERHEAD: f32 = 1.1;
 53 | 
/// Runs the 'cmp' command.
///
/// Registers the key and (optional) diff field variables, sets up the output
/// and then delegates to either the in-order or the complete comparison.
/// Unless `quiet` is set, the counts per category are printed to STDERR.
///
/// # Errors
///
/// Fails if combined output is requested without any variable that would
/// distinguish the records, if `check` is set and there are records unique
/// to one of the inputs, or if any of the called functions fails.
pub fn run(mut cfg: Config, mut args: CmpCommand) -> CliResult<()> {
    let quiet = args.common.general.quiet;
    let two_pass = args.two_pass;
    // reduce the memory limit to account for memory not included in the calculation
    let max_mem = (args.max_mem as f32 / MEM_OVERHEAD) as usize;

    // register variables/functions:
    // tuples of (varstring, text buffer)
    cfg.set_custom_varmodule(Box::<CmpVars>::default())?;

    // key(s) used for comparing the records
    let mut var_key = Vec::with_capacity(1);
    cfg.build_vars(|b| {
        for key in &args.key {
            register_var_list(key.as_ref(), b, &mut var_key, None, true, true)?;
        }
        Ok::<_, String>(())
    })?;

    // optional fields for the diff output
    let diff_fields = args
        .diff
        .as_ref()
        .map(|fields| {
            let mut vs = Vec::with_capacity(1);
            cfg.build_vars(|b| {
                for f in fields {
                    register_var_list(f, b, &mut vs, None, true, true)?;
                }
                Ok::<_, String>(())
            })?;
            Ok::<_, String>(vs)
        })
        .transpose()?;
    let diff_writer = diff_fields.map(|fields| DiffWriter::new(fields, args.diff_width));

    let mut out = Output::from_args(&mut args, &mut cfg)?;

    // combined output only makes sense if the records can be told apart
    cfg.with_custom_varmod(|v: &mut CmpVars| {
        if out.has_combined_output() && !v.has_vars() {
            return fail!(
                "Specified mixed output in 'cmp' command ' -o/--output/--output2', \
                but no variables are used to distinguish records. Please specify \
                one of `category`, `category_short` or `key`, or specify unique \
                output instead (--unique1/--unique2)."
            );
        }
        Ok::<_, String>(())
    })?;

    let stats = if args.in_order {
        in_order::cmp_in_order(&mut cfg, &var_key, &mut out, diff_writer, max_mem)?
    } else {
        complete::cmp_complete(
            &mut cfg,
            var_key,
            &mut out,
            diff_writer,
            max_mem,
            two_pass,
            quiet,
        )?
    };
    if !quiet {
        eprintln!(
            "common\t{}\nunique1\t{}\nunique2\t{}",
            stats.common, stats.unique1, stats.unique2
        );
    }
    // with `check`, any record unique to one input is an error
    if args.check && (stats.unique1 > 0 || stats.unique2 > 0) {
        return fail!("Not an exact match");
    }
    Ok(())
}
125 | 


--------------------------------------------------------------------------------
/scripts/summarize_comparison.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import json
 4 | import re
 5 | 
 6 | 
 7 | def gen_summary(json_input, md_out):
 8 |     from html import escape
 9 |     # def cnv_breaks(s):
10 |     #     return re.sub(r'[\r\n]', '<br/>', s, re.DOTALL)
11 | 
12 |     def fmt_bench(d):
13 |         return '{}{}<br/>{}'.format(
14 |             format_time(d),
15 |             ' {:.0f}% CPU'.format(d['cpu']) if abs(100 - d['cpu']) > 5 else '',
16 |             format_mem(d),
17 |         )
18 | 
19 |     def find_best(d):
20 |         runs = [d['st']] + list(d.get('other', {}).values())
21 |         if len(runs) > 1:
22 |             times = sorted((r['elapsed'], i) for i, r in enumerate(runs))
23 |             if times[0][0] > 0:
24 |                 runs[times[0][1]]['fastest'] = times[1][0] / times[0][0]
25 |             mem = sorted((r['max_mib'], i) for i, r in enumerate(runs))
26 |             if mem[0][0] > 0:
27 |                 runs[mem[0][1]]['lowest_mem'] = mem[1][0] / mem[0][0]
28 | 
29 |     def format_time(d):
30 |         f = d.get('fastest')
31 |         if f is not None:
32 |             return '🕓 <b>{:.1f} s</b> 🏆 ({:.1f}x)'.format(d['elapsed'], f)
33 |         return '🕓 {:.1f} s'.format(d['elapsed'])
34 | 
35 |     def format_mem(d):
36 |         f = d.get('lowest_mem')
37 |         if f is not None:
38 |             return '📈 <b>{:.1f} MiB</b> 🏆 ({:.2f}x)'.format(d['max_mib'], f)
39 |         return '📈 {:.1f} MiB'.format(d['max_mib'])
40 | 
41 |     def fmt_output(d):
42 |         strip_newlines = lambda msg: re.sub(r'(?ms:[\r\n\s]+$)', '', msg)
43 |         out = ''
44 |         if d['stdout']:
45 |             # TODO: get this to work: https://github.com/squidfunk/mkdocs-material/issues/4964
46 |             out += '<details><summary>🟦 output</summary>\n\n```\n{}\n```\n\n</details>\n'.format(strip_newlines(d['stdout']))
47 |         if d['stderr']:
48 |             out += '<details><summary> messages</summary>\n\n```\n{}\n```\n\n</details>\n'.format(strip_newlines(d['stderr']))
49 |         return out
50 | 
51 |     for command, comparisons in json.load(json_input).items():
52 |         md_out.write('## {}\n'.format(command))
53 |         md_out.write('<table markdown class="cmd">\n\n')
54 |         for comparison, d in comparisons.items():
55 |             find_best(d)
56 |             st = d['st']
57 |             md_out.write('<tr markdown>\n<td markdown>\n\n{}\n\n</td>\n\n'.format(escape(d.get('description', comparison))))
58 |             md_out.write('<td markdown>\n\n```bash\n{}\n```\n\n{}\n'.format(st['cmd'], fmt_output(st)))
59 |             if 'other' in d and d['other']:
60 |                 md_out.write('<details markdown><summary>{}</summary>\n\n<table markdown class="cmd">\n'.format(
61 |                     "  ❙  ".join('<b>{}</b> {}'.format(
62 |                         escape(tool), format_time(o),
63 |                     )
64 |                     for tool, o in d['other'].items())
65 |                 )
66 |                 )
67 |                 for tool, o in d['other'].items():
68 |                     code = '<td markdown>\n\n```bash\n{}\n```\n\n{}</td>'.format(
69 |                         o['cmd'].replace('\n', ' '),
70 |                         fmt_output(o)
71 |                     )
72 |                     md_out.write('\n<tr markdown>\n<td markdown>{}</td>\n\n{}\n\n<td markdown>{}</td>\n\n</tr>\n\n'.format(
73 |                         escape(tool),
74 |                         code,
75 |                         fmt_bench(o)
76 |                     ))
77 |                 md_out.write('</table>\n\n</details>\n\n')
78 |             md_out.write('</td>\n\n<td>{}</td>\n\n</tr>\n\n'.format(fmt_bench(st)))
79 |         md_out.write('</table>\n\n')
80 | 
if __name__ == '__main__':
    # command line interface: <json_input> <md_out>
    from argparse import ArgumentParser, FileType

    cli = ArgumentParser()
    cli.add_argument('json_input', type=FileType('r'))
    cli.add_argument('md_out', type=FileType('w'))
    # cli.add_argument('-m', '--main-only', action='store_true')
    opts = cli.parse_args()

    gen_summary(json_input=opts.json_input, md_out=opts.md_out)
91 | 


--------------------------------------------------------------------------------
/src/var/modules/expr/var_provider.rs:
--------------------------------------------------------------------------------
  1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType};
  2 | use variable_enum_macro::variable_enum;
  3 | 
  4 | use crate::io::{QualConverter, Record};
  5 | use crate::var::{
  6 |     attr::Attributes, modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder,
  7 | };
  8 | 
  9 | use super::code_or_file;
 10 | use super::js::{parser::Expression, JsExpr};
 11 | 
 12 | type Expressions = super::expressions::Expressions<JsExpr>;
 13 | 
 14 | variable_enum! {
 15 |     /// # Expressions (JavaScript)
 16 |     ///
 17 |     /// Expressions with variables, from simple mathematical operations to
 18 |     /// arbitrarily complex JavaScript code.
 19 |     ///
 20 |     /// Expressions are always enclosed in { curly brackets }. These brackets
 21 |     /// are optional for simple variables/functions in some cases,
 22 |     /// but mandatory for expressions.
 23 |     /// In addition, the 'filter' command takes an expression (without { brackets }).
 24 |     ///
 25 |     ///
 26 |     /// Instead of JavaScript code, it is possible to refer to a source file
 27 |     /// using 'file:path.js'.
 28 |     ///
 29 |     ///
 30 |     /// *Returned value*: For simple one-liner expressions, the value is
 31 |     /// directly used.
 32 |     /// More complex scripts with multiple statements (if/else, loops, etc.)
 33 |     /// explicitly require a `return` statement to return the value.
 34 |     ///
 35 |     ///
 36 |     /// # Examples
 37 |     ///
 38 |     /// Calculate the number of ambiguous bases in a set of DNA sequences and
 39 |     /// add the result as an attribute (ambig=...) to the header
 40 |     ///
 41 |     /// `st pass -a ambig='{seqlen - charcount("ACGT")}' seqs.fasta`
 42 |     ///
 43 |     /// >id1 ambig=3
 44 |     /// TCNTTAWTAACCTGATTAN
 45 |     /// >id2 ambig=0
 46 |     /// GGAGGATCCGAGCG
 47 |     /// (...)
 48 |     ///
 49 |     ///
 50 |     /// Discard sequences with >1% ambiguous bases or sequences shorter than 100bp
 51 |     ///
 52 |     /// `st filter 'charcount("ACGT") / seqlen >= 0.99 && seqlen >= 100' seqs.fasta`
 53 |     ///
 54 |     ///
 55 |     /// Distribute sequences into different files by a slightly complicated condition.
 56 |     /// Note the 'return' statements are necessary here, since this is not a simple expression.
 57 |     /// With even longer code, consider using an extra script and supplying
 58 |     /// -o "outdir/{file:code.js}.fasta" instead
 59 |     ///
 60 |     /// `st split -po "outdir/{ if (id.startsWith('some_prefix_')) { return 'file_1' } return 'file_2' }.fasta" input.fasta`
 61 |     ///
 62 |     /// There should be two files now (`ls file_*.fasta`):
 63 |     /// file_1.fasta
 64 |     /// file_2.fasta
 65 |     ExprVar<'a> {
 66 |         #[hidden]
 67 |         ____Expr(?) { expr: Expression<'a> },
 68 |     }
 69 | }
 70 | 
 71 | #[derive(Debug)]
 72 | pub struct ExprVars(Expressions);
 73 | 
 74 | impl ExprVars {
 75 |     /// Builds the expression provider. If `init_code` is given, it is first
 76 |     /// resolved with `code_or_file` and then handed to `Expressions::new`.
 77 |     pub fn new(init_code: Option<&str>) -> Result<Self, String> {
 78 |         let resolved = match init_code {
 79 |             Some(code) => Some(code_or_file(code)?.to_string()),
 80 |             None => None,
 81 |         };
 82 |         let expressions = Expressions::new(resolved.as_deref())?;
 83 |         Ok(Self(expressions))
 84 |     }
 85 | }
 82 | 
 83 | impl VarProvider for ExprVars {
 84 |     /// Returns the provider description that `dyn_var_provider!` builds for `ExprVar`.
 85 |     fn info(&self) -> &dyn DynVarProviderInfo {
 86 |         &dyn_var_provider!(ExprVar)
 87 |     }
 88 | 
 89 |     /// Registers an expression if `name`/`args` are recognized by `ExprVar::from_func`
 90 |     /// and returns what `register_expr` reports; returns `Ok(None)` otherwise.
 91 |     fn register(
 92 |         &mut self,
 93 |         name: &str,
 94 |         args: &[Arg],
 95 |         builder: &mut VarBuilder,
 96 |     ) -> Result<Option<(usize, Option<VarType>)>, String> {
 97 |         match ExprVar::from_func(name, args)? {
 98 |             None => Ok(None),
 99 |             // `with_tree` wraps the closure result in a `Result` of its own,
100 |             // the two layers are flattened by `and_then`
101 |             Some((ExprVar::____Expr { expr }, _)) => expr
102 |                 .with_tree(|ast| {
103 |                     let registered = self.0.register_expr(ast, builder)?;
104 |                     Ok(Some(registered))
105 |                 })
106 |                 .and_then(|inner| inner),
107 |         }
108 |     }
109 | 
110 |     /// Reports whether at least one expression has been registered.
111 |     fn has_vars(&self) -> bool {
112 |         let n_registered = self.0.num_exprs();
113 |         n_registered > 0
114 |     }
115 | 
116 |     /// Evaluates the registered expressions for `record`; the attributes and the
117 |     /// quality converter are not used by this provider.
118 |     fn set_record(
119 |         &mut self,
120 |         record: &dyn Record,
121 |         symbols: &mut SymbolTable,
122 |         _: &Attributes,
123 |         _: &mut QualConverter,
124 |     ) -> Result<(), String> {
125 |         let expressions = &mut self.0;
126 |         expressions.eval(symbols, record)
127 |     }
128 | }
117 | 


--------------------------------------------------------------------------------
/src/helpers/value.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | use std::io;
  3 | use std::mem;
  4 | 
  5 | use deepsize::{Context, DeepSizeOf};
  6 | 
  7 | use crate::helpers::NA;
  8 | use crate::io::Record;
  9 | use crate::var::symbols::{OptValue, Value};
 10 | 
 11 | use super::number::{Float, Interval};
 12 | 
 13 | /// A simple value type that can be either text, numeric, boolean, interval or undefined/none.
 14 | /// Can also be serialized using rkyv (only enabled for sort and unique commands).
 15 | ///
 16 | /// This type is simpler than the Value type in the symbol table, which often has
 17 | /// additional information stored/allocated.
 18 | /// Another difference: SimpleValue does not have an integer type, any number will
 19 | /// thus behave the same (as float). This is important when sorting/hashing.
 20 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
 21 | #[cfg_attr(
 22 |     any(feature = "all-commands", feature = "sort", feature = "unique"),
 23 |     derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize),
 24 |     archive(compare(PartialEq), check_bytes)
 25 | )]
 26 | pub enum SimpleValue {
 27 |     Text(Box<[u8]>),
 28 |     Number(Float),
 29 |     Boolean(bool),
 30 |     Interval(Interval),
 31 |     None,
 32 | }
 33 | 
 34 | impl SimpleValue {
 35 |     /// Writes the value to `writer`: text bytes as they are; numbers, booleans and
 36 |     /// intervals through their `Display` output; an undefined value as `NA`.
 37 |     #[inline]
 38 |     pub fn write<W: io::Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
 39 |         match self {
 40 |             Self::Text(bytes) => writer.write_all(bytes),
 41 |             Self::Number(number) => write!(writer, "{number}"),
 42 |             Self::Boolean(flag) => write!(writer, "{flag}"),
 43 |             Self::Interval(interval) => write!(writer, "{interval}"),
 44 |             Self::None => writer.write_all(NA.as_bytes()),
 45 |         }
 46 |     }
 47 | 
 48 |     /// Stores the value in the given symbol, using the setter matching the variant
 49 |     /// (`set_none` for an undefined value).
 50 |     #[inline]
 51 |     pub fn to_symbol(&self, sym: &mut OptValue) {
 52 |         match self {
 53 |             Self::None => sym.set_none(),
 54 |             Self::Text(bytes) => sym.inner_mut().set_text(bytes),
 55 |             Self::Number(number) => sym.inner_mut().set_float(number.inner()),
 56 |             Self::Boolean(flag) => sym.inner_mut().set_bool(*flag),
 57 |             Self::Interval(interval) => sym.inner_mut().set_interval(*interval),
 58 |         }
 59 |     }
 60 | 
 61 |     /// Replaces `self` with the value currently held by `sym`.
 62 |     ///
 63 |     /// `text_buf` is scratch space that allows text allocations to be recycled:
 64 |     /// the buffer of a previous `Text` value is moved into it before the new
 65 |     /// text (if any) is copied and moved back into `self`.
 66 |     #[inline]
 67 |     pub fn replace_from_symbol(
 68 |         &mut self,
 69 |         sym: &OptValue,
 70 |         rec: &dyn Record,
 71 |         text_buf: &mut Vec<u8>,
 72 |     ) {
 73 |         // Reclaim the allocation of a previous text value. An allocation that
 74 |         // `text_buf` may already hold is dropped here, but the buffer is assumed
 75 |         // to be owned either by SimpleValue::Text() or by `text_buf`, never by
 76 |         // both.
 77 |         if let Self::Text(previous) = self {
 78 |             *text_buf = mem::take(previous).into_vec();
 79 |         }
 80 |         let updated = match sym.inner() {
 81 |             None => Self::None,
 82 |             Some(value) => match value {
 83 |                 Value::Bool(b) => Self::Boolean(*b.get()),
 84 |                 Value::Interval(i) => Self::Interval(*i.get()),
 85 |                 Value::Float(f) => Self::Number(Float::new(*f.get())),
 86 |                 Value::Int(n) => Self::Number(Float::new(*n.get() as f64)),
 87 |                 Value::Text(_) | Value::Attr(_) => {
 88 |                     value
 89 |                         .as_text(rec, |text| {
 90 |                             text_buf.clear();
 91 |                             text_buf.extend_from_slice(text);
 92 |                             Ok::<(), ()>(())
 93 |                         })
 94 |                         .unwrap();
 95 |                     Self::Text(mem::take(text_buf).into_boxed_slice())
 96 |                 }
 97 |             },
 98 |         };
 99 |         *self = updated;
100 |     }
101 | }
 95 | 
 96 | // Dispaly implementation only for error messages
 97 | impl fmt::Display for SimpleValue {
 98 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 99 |         let mut buf = Vec::new();
100 |         self.write(&mut buf).unwrap();
101 |         f.write_str(&String::from_utf8_lossy(&buf))
102 |     }
103 | }
104 | 
105 | impl DeepSizeOf for SimpleValue {
106 |     /// Counts the length of the text payload; all other variants report 0.
107 |     fn deep_size_of_children(&self, _: &mut Context) -> usize {
108 |         match self {
109 |             SimpleValue::Text(bytes) => bytes.len(),
110 |             SimpleValue::Number(_)
111 |             | SimpleValue::Boolean(_)
112 |             | SimpleValue::Interval(_)
113 |             | SimpleValue::None => 0,
114 |         }
115 |     }
116 | }
113 | 


--------------------------------------------------------------------------------
/src/cmd/sort/mod.rs:
--------------------------------------------------------------------------------
  1 | use std::env::temp_dir;
  2 | use std::io::Write;
  3 | use std::path::Path;
  4 | 
  5 | use crate::config::Config;
  6 | use crate::error::CliResult;
  7 | use crate::helpers::vec_buf::VecFactory;
  8 | use crate::var::varstring::register_var_list;
  9 | 
 10 | use super::shared::tmp_store::{Item, Key};
 11 | 
 12 | pub mod cli;
 13 | pub mod file;
 14 | pub mod mem;
 15 | pub mod vars;
 16 | 
 17 | pub use self::cli::*;
 18 | pub use self::file::*;
 19 | pub use self::mem::*;
 20 | pub use self::vars::*;
 21 | 
 22 | /// Factor for adjusting the calculated memory usage (based on size of items)
 23 | /// to obtain the approximately correct total memory usage.
 24 | /// It corrects for the extra memory used by Vec::sort() and other allocations
 25 | /// that may not be in the calculation otherwise.
 26 | /// (factor found by memory profiling on Linux)
 27 | static MEM_OVERHEAD: f32 = 1.1;
 28 | /// Runs the 'sort' command: computes a key per record, buffers the formatted records and writes them sorted.
 29 | pub fn run(mut cfg: Config, args: SortCommand) -> CliResult<()> {
 30 |     let verbose = args.common.general.verbose;
 31 |     let quiet = args.common.general.quiet;
 32 |     let max_mem = (args.max_mem as f32 / MEM_OVERHEAD) as usize; // limit scaled down by MEM_OVERHEAD
 33 |     // TODO: not activated, since we use a low limit for testing
 34 |     // if args.max_mem < 1 << 22 {
 35 |     //     return fail!("The memory limit should be at least 2MiB");
 36 |     // }
 37 |     let mut record_buf_factory = VecFactory::new();
 38 |     let tmp_path = args.temp_dir.clone().unwrap_or_else(temp_dir); // falls back to std::env::temp_dir()
 39 |     let mut sorter = Sorter::new(args.reverse, max_mem);
 40 | 
 41 |     cfg.set_custom_varmodule(Box::<SortVars>::default())?; // SortVars receives the key of each record (below)
 42 | 
 43 |     let mut format_writer = cfg.get_format_writer()?;
 44 | 
 45 |     cfg.with_io_writer(|io_writer, mut cfg| {
 46 |         // register the variables of the key and set up the reusable key/text buffers
 47 |         let mut varstring_keys = Vec::with_capacity(1);
 48 |         cfg.build_vars(|b| register_var_list(&args.key, b, &mut varstring_keys, None, true, true))?;
 49 |         let mut key_values = Key::with_size(varstring_keys.len());
 50 |         let mut text_buf = vec![Vec::new(); varstring_keys.len()]; // one text buffer per key value
 51 | 
 52 |         cfg.read(|record, ctx| {
 53 |             // assemble the key of the current record
 54 |             key_values.compose_from(&varstring_keys, &mut text_buf, ctx.symbols(), record)?;
 55 |             ctx.with_custom_varmod(0, |m: &mut SortVars, sym| m.set(&key_values, sym));
 56 |             // write formatted record to a buffer
 57 |             let record_out =
 58 |                 record_buf_factory.get(|out| format_writer.write(&record, out, ctx))?;
 59 |             // add both (a copy of the key and the record) to the object that handles the sorting
 60 |             sorter.add(
 61 |                 Item::new(key_values.clone(), record_out.into_boxed_slice()),
 62 |                 &tmp_path,
 63 |                 args.temp_file_limit,
 64 |                 quiet,
 65 |             )?;
 66 |             Ok(true)
 67 |         })?;
 68 |         // write sorted output
 69 |         sorter.write(io_writer, quiet, verbose)
 70 |     })
 71 | }
 72 | 
 73 | #[derive(Debug)]
 74 | enum Sorter {
 75 |     Mem(MemSorter),
 76 |     File(FileSorter),
 77 | }
 78 | 
 79 | impl Sorter {
 80 |     /// Creates a sorter that starts out in memory, limited to `max_mem`.
 81 |     fn new(reverse: bool, max_mem: usize) -> Self {
 82 |         let in_memory = MemSorter::new(reverse, max_mem);
 83 |         Self::Mem(in_memory)
 84 |     }
 85 | 
 86 |     /// Adds an item. As soon as `MemSorter::add` returns `false`, a `FileSorter`
 87 |     /// is obtained from the in-memory sorter, written to file and used from then on.
 88 |     fn add(
 89 |         &mut self,
 90 |         item: Item<Box<[u8]>>,
 91 |         tmp_path: &Path,
 92 |         file_limit: usize,
 93 |         quiet: bool,
 94 |     ) -> CliResult<()> {
 95 |         let mem_sorter = match self {
 96 |             Self::File(file_sorter) => {
 97 |                 file_sorter.add(item, quiet)?;
 98 |                 return Ok(());
 99 |             }
100 |             Self::Mem(mem_sorter) => mem_sorter,
101 |         };
102 |         if mem_sorter.add(item) {
103 |             return Ok(());
104 |         }
105 |         if !quiet {
106 |             eprintln!(
107 |                 "Memory limit reached after {} records, writing to temporary file(s). \
108 |                 Consider raising the limit (-M/--max-mem) to speed up sorting. \
109 |                 Use -q/--quiet to silence this message.",
110 |                 mem_sorter.len()
111 |             );
112 |         }
113 |         let mut file_sorter = mem_sorter.get_file_sorter(tmp_path.to_owned(), file_limit)?;
114 |         file_sorter.write_to_file(quiet)?;
115 |         *self = Self::File(file_sorter);
116 |         Ok(())
117 |     }
118 | 
119 |     /// Writes the sorted records to `io_writer`, delegating to the active sorter.
120 |     fn write(&mut self, io_writer: &mut dyn Write, quiet: bool, verbose: bool) -> CliResult<()> {
121 |         match self {
122 |             Self::File(file_sorter) => file_sorter.write_records(io_writer, quiet, verbose),
123 |             Self::Mem(mem_sorter) => mem_sorter.write_sorted(io_writer),
124 |         }
125 |     }
126 | }
121 | 


--------------------------------------------------------------------------------
/src/test/cmp.rs:
--------------------------------------------------------------------------------
  1 | use super::*;
  2 | 
  3 | const FA1: &str = "\
  4 | 1,AAA
  5 | 2,AAA
  6 | 3,CCC
  7 | 5,CCC
  8 | 7,TTT
  9 | 8,ATG
 10 | 9,TGA
 11 | ";
 12 | 
 13 | const FA2: &str = "\
 14 | 1,AAA
 15 | 3,CCC
 16 | 4,CCC
 17 | 5,CCC
 18 | 6,TTT
 19 | 8,GGG
 20 | 10,GAT
 21 | ";
 22 | 
 23 | const STATS: &str = "\
 24 | common\t3
 25 | unique1\t4
 26 | unique2\t4
 27 | ";
 28 | 
 29 | const STATS_ID: &str = "\
 30 | common\t4
 31 | unique1\t3
 32 | unique2\t3
 33 | ";
 34 | 
 35 | const CATEGORY1: &str = "\
 36 | 1,AAA,common
 37 | 2,AAA,unique1
 38 | 3,CCC,common
 39 | 5,CCC,common
 40 | 7,TTT,unique1
 41 | 8,ATG,unique1
 42 | 9,TGA,unique1
 43 | ";
 44 | 
 45 | const CATEGORY2: &str = "\
 46 | 1,AAA,common
 47 | 3,CCC,common
 48 | 4,CCC,unique2
 49 | 5,CCC,common
 50 | 6,TTT,unique2
 51 | 8,GGG,unique2
 52 | 10,GAT,unique2
 53 | ";
 54 | 
 55 | const CATEGORY_ID1: &str = "\
 56 | 1,AAA,common
 57 | 2,AAA,unique1
 58 | 3,CCC,common
 59 | 5,CCC,common
 60 | 7,TTT,unique1
 61 | 8,ATG,common
 62 | 9,TGA,unique1
 63 | ";
 64 | 
 65 | const CATEGORY_ID2: &str = "\
 66 | 1,AAA,common
 67 | 3,CCC,common
 68 | 4,CCC,unique2
 69 | 5,CCC,common
 70 | 6,TTT,unique2
 71 | 8,GGG,common
 72 | 10,GAT,unique2
 73 | ";
 74 | 
 75 | const COMMON: &str = "\
 76 | 1,AAA
 77 | 3,CCC
 78 | 5,CCC
 79 | ";
 80 | 
 81 | const COMMON_ID1: &str = "\
 82 | 1,AAA
 83 | 3,CCC
 84 | 5,CCC
 85 | 8,ATG
 86 | ";
 87 | 
 88 | const COMMON_ID2: &str = "\
 89 | 1,AAA
 90 | 3,CCC
 91 | 5,CCC
 92 | 8,GGG
 93 | ";
 94 | 
 95 | const UNIQUE1: &str = "\
 96 | 2,AAA
 97 | 7,TTT
 98 | 8,ATG
 99 | 9,TGA
100 | ";
101 | 
102 | const UNIQUE_ID1: &str = "\
103 | 2,AAA
104 | 7,TTT
105 | 9,TGA
106 | ";
107 | 
108 | const UNIQUE2: &str = "\
109 | 4,CCC
110 | 6,TTT
111 | 8,GGG
112 | 10,GAT
113 | ";
114 | 
115 | const UNIQUE_ID2: &str = "\
116 | 4,CCC
117 | 6,TTT
118 | 10,GAT
119 | ";
120 | // Checks the files written with --common1/2 and --unique1/2 and the statistics printed to stderr.
121 | #[test]
122 | fn cmp_() {
123 |     with_tmpdir("st_cmp_", |td| {
124 |         let common1 = td.path("cmp_common1.csv");
125 |         let common2 = td.path("cmp_common2.csv");
126 |         let uniq1 = td.path("cmp_unique1.csv");
127 |         let uniq2 = td.path("cmp_unique2.csv");
128 | 
129 |         let input = td.multi_file(".csv", [FA1, FA2]); // the two inputs that are compared
130 | 
131 |         // compare by ID and sequence (no -k option supplied)
132 |         let cli = &[
133 |             "cmp",
134 |             "--csv",
135 |             "id,seq",
136 |             "--common1",
137 |             &common1,
138 |             "--common2",
139 |             &common2,
140 |             "--unique1",
141 |             &uniq1,
142 |             "--unique2",
143 |             &uniq2,
144 |         ];
145 |         cmd(cli, &input).stderr(STATS); // category counts are expected on stderr
146 |         assert_eq!(common1.content(), COMMON);
147 |         assert_eq!(common2.content(), COMMON);
148 |         assert_eq!(uniq1.content(), UNIQUE1);
149 |         assert_eq!(uniq2.content(), UNIQUE2);
150 | 
151 |         // compare by ID only (same output files as above are written again)
152 |         let cli = &[
153 |             "cmp",
154 |             "-k",
155 |             "id",
156 |             "--csv",
157 |             "id,seq",
158 |             "--common1",
159 |             &common1,
160 |             "--common2",
161 |             &common2,
162 |             "--unique1",
163 |             &uniq1,
164 |             "--unique2",
165 |             &uniq2,
166 |         ];
167 |         cmd(cli, &input).stderr(STATS_ID);
168 |         assert_eq!(common1.content(), COMMON_ID1); // record 8 is common by ID, sequences differ
169 |         assert_eq!(common2.content(), COMMON_ID2);
170 |         assert_eq!(uniq1.content(), UNIQUE_ID1);
171 |         assert_eq!(uniq2.content(), UNIQUE_ID2);
172 |     });
173 | }
174 | // Checks the 'category' variable written to the two outputs given with -o and --output2.
175 | #[test]
176 | fn cmp_category() {
177 |     with_tmpdir("st_cmp_category_", |td| {
178 |         let cat1 = td.path("cmp_cat1.csv");
179 |         let cat2 = td.path("cmp_cat2.csv");
180 | 
181 |         let input = td.multi_file(".csv", [FA1, FA2]); // the two inputs that are compared
182 | 
183 |         // compare by ID and sequence (no -k option supplied)
184 |         let cli = &[
185 |             "cmp",
186 |             "--csv",
187 |             "id,seq",
188 |             "--to-csv",
189 |             "id,seq,category",
190 |             "-o",
191 |             &cat1,
192 |             "--output2",
193 |             &cat2,
194 |         ];
195 |         cmd(cli, &input).stderr(STATS); // category counts are expected on stderr
196 |         assert_eq!(cat1.content(), CATEGORY1);
197 |         assert_eq!(cat2.content(), CATEGORY2);
198 | 
199 |         // compare by ID only (same output files as above are written again)
200 |         let cli = &[
201 |             "cmp",
202 |             "-k",
203 |             "id",
204 |             "--csv",
205 |             "id,seq",
206 |             "--to-csv",
207 |             "id,seq,category",
208 |             "-o",
209 |             &cat1,
210 |             "--output2",
211 |             &cat2,
212 |         ];
213 |         cmd(cli, &input).stderr(STATS_ID);
214 |         assert_eq!(cat1.content(), CATEGORY_ID1);
215 |         assert_eq!(cat2.content(), CATEGORY_ID2);
216 |     });
217 | }
218 | 


--------------------------------------------------------------------------------
/src/cmd/shared/key.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | use std::io;
  3 | use std::ops::{Deref, DerefMut};
  4 | 
  5 | use deepsize::DeepSizeOf;
  6 | 
  7 | use crate::helpers::{value::SimpleValue, write_list::write_list_with};
  8 | use crate::io::Record;
  9 | use crate::var::{
 10 |     symbols::{OptValue, SymbolTable},
 11 |     varstring::VarString,
 12 | };
 13 | 
 14 | #[derive(DeepSizeOf, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
 15 | #[cfg_attr(
 16 |     any(feature = "all-commands", feature = "sort", feature = "unique"),
 17 |     derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize),
 18 |     archive(compare(PartialEq), check_bytes)
 19 | )]
 20 | pub enum Key {
 21 |     Single(SimpleValue),
 22 |     // This saves time with two values per key (but appears to make >2 values slower)
 23 |     // TODO: activating this comes with the tradeoff of increased memory usage
 24 |     // Two([SimpleValue; 2]),
 25 |     Multiple(Box<[SimpleValue]>),
 26 | }
 27 | 
 28 | impl Key {
 29 |     /// Creates a key holding `key_size` undefined values.
 30 |     ///
 31 |     /// # Panics
 32 |     ///
 33 |     /// Panics if `key_size` is 0.
 34 |     pub fn with_size(key_size: usize) -> Self {
 35 |         match key_size {
 36 |             0 => panic!("a key must consist of at least one value"),
 37 |             1 => Self::Single(SimpleValue::None),
 38 |             // 2 => Self::Two([SimpleValue::None, SimpleValue::None]),
 39 |             _ => Self::Multiple(vec![SimpleValue::None; key_size].into_boxed_slice()),
 40 |         }
 41 |     }
 42 | 
 43 |     /// Returns the values of the key as a slice (of length 1 for `Single`).
 44 |     pub fn as_slice(&self) -> &[SimpleValue] {
 45 |         match self {
 46 |             Self::Single(v) => std::slice::from_ref(v),
 47 |             // Self::Two(v) => v,
 48 |             Self::Multiple(v) => v,
 49 |         }
 50 |     }
 51 | 
 52 |     /// Fills the key values from `varstrings`, evaluated for the given record.
 53 |     ///
 54 |     /// `varstrings` and `key_buf` are expected to have one entry per key value
 55 |     /// (only checked in debug builds). Errors returned by
 56 |     /// `VarString::simple_value` are passed on.
 57 |     pub fn compose_from(
 58 |         &mut self,
 59 |         varstrings: &[VarString],
 60 |         key_buf: &mut [Vec<u8>],
 61 |         symbols: &SymbolTable,
 62 |         record: &dyn Record,
 63 |     ) -> Result<(), String> {
 64 |         match self {
 65 |             Key::Single(v) => {
 66 |                 debug_assert!(varstrings.len() == 1 && key_buf.len() == 1);
 67 |                 varstrings[0].simple_value(v, &mut key_buf[0], symbols, record)?
 68 |             }
 69 |             // Key::Two(v) => {
 70 |             //     debug_assert!(varstrings.len() == 2 && key_buf.len() == 2);
 71 |             //     for i in 0..2 {
 72 |             //         varstrings[i].into_simple(
 73 |             //             &mut v[i],
 74 |             //             &mut key_buf[i],
 75 |             //             symbols,
 76 |             //             record,
 77 |             //             force_numeric,
 78 |             //         )?;
 79 |             //     }
 80 |             // }
 81 |             Key::Multiple(values) => {
 82 |                 debug_assert!(varstrings.len() == values.len() && key_buf.len() == values.len());
 83 |                 for ((vs, key_buf), val) in varstrings
 84 |                     .iter()
 85 |                     .zip(key_buf.iter_mut())
 86 |                     .zip(values.iter_mut())
 87 |                 {
 88 |                     vs.simple_value(val, key_buf, symbols, record)?;
 89 |                 }
 90 |             }
 91 |         }
 92 |         Ok(())
 93 |     }
 94 | 
 95 |     /// Writes all values to `writer`, separated by `sep`.
 96 |     #[inline]
 97 |     pub fn write_delimited<W: io::Write + ?Sized>(
 98 |         &self,
 99 |         writer: &mut W,
100 |         sep: &[u8],
101 |     ) -> io::Result<()> {
102 |         write_list_with(self.iter(), sep, writer, |v, o| v.write(o))?;
103 |         Ok(())
104 |     }
105 | 
106 |     /// Stores the key in a symbol: a single value keeps its type, multiple values
107 |     /// are written comma-delimited into the text of the symbol.
108 |     pub fn write_to_symbol(&self, sym: &mut OptValue) {
109 |         match self {
110 |             Key::Single(v) => v.to_symbol(sym),
111 |             // Key::Two(v) => {
112 |             //     let text = sym.inner_mut().mut_text();
113 |             //     write_list_with(v, b",", text, |v, o| v.write(o)).unwrap();
114 |             // }
115 |             Key::Multiple(values) => {
116 |                 let text = sym.inner_mut().mut_text();
117 |                 write_list_with(values.iter(), b",", text, |v, o| v.write(o)).unwrap();
118 |             }
119 |         }
120 |     }
121 | }
108 | 
109 | // for error messages
110 | impl fmt::Display for Key {
111 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
112 |         for (i, k) in self.as_slice().iter().enumerate() {
113 |             if i > 0 {
114 |                 write!(f, ",")?;
115 |             }
116 |             write!(f, "{k}")?;
117 |         }
118 |         Ok(())
119 |     }
120 | }
121 | 
122 | impl Deref for Key {
123 |     type Target = [SimpleValue];
124 | 
125 |     /// Gives read access to the key values as a slice.
126 |     fn deref(&self) -> &[SimpleValue] {
127 |         self.as_slice()
128 |     }
129 | }
131 | 
132 | impl DerefMut for Key {
133 |     /// Gives mutable access to the key values as a slice.
134 |     fn deref_mut(&mut self) -> &mut [SimpleValue] {
135 |         match self {
136 |             Self::Multiple(values) => &mut values[..],
137 |             Self::Single(value) => std::slice::from_mut(value),
138 |         }
139 |     }
140 | }
140 | 


--------------------------------------------------------------------------------
/src/helpers/seqtype.rs:
--------------------------------------------------------------------------------
  1 | use crate::io::Record;
  2 | 
  3 | use bio::alphabets::{dna, protein, rna};
  4 | use clap::ValueEnum;
  5 | use strum_macros::{Display, EnumString};
  6 | 
  7 | use SeqType::*;
  8 | 
  9 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Display, EnumString, ValueEnum)]
 10 | pub enum SeqType {
 11 |     #[allow(clippy::upper_case_acronyms)]
 12 |     DNA,
 13 |     #[allow(clippy::upper_case_acronyms)]
 14 |     RNA,
 15 |     Protein,
 16 |     Other,
 17 | }
 18 | 
 19 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
 20 | pub struct SeqTypeInfo {
 21 |     pub seqtype: SeqType,
 22 |     /// has DNA/RNA or protein wildcards (N/X)
 23 |     pub has_wildcard: bool,
 24 |     /// has IUPAC ambiguities
 25 |     pub has_ambiguities: bool,
 26 | }
 27 | 
 28 | impl SeqTypeInfo {
 29 |     /// Bundles a sequence type with its wildcard and ambiguity flags.
 30 |     pub fn new(ty: SeqType, has_wildcard: bool, has_ambiguities: bool) -> Self {
 31 |         Self {
 32 |             seqtype: ty,
 33 |             has_wildcard,
 34 |             has_ambiguities,
 35 |         }
 36 |     }
 37 | }
 37 | 
 38 | // Iterates over the text while skipping '-', '.', '?' and ' ', which are
 39 | // excluded when running recognition
 40 | fn filter_iter(text: &[u8]) -> impl Iterator<Item = &u8> {
 41 |     const IGNORED: [u8; 4] = [b'-', b'.', b'?', b' '];
 42 |     text.iter().filter(|byte| !IGNORED.contains(*byte))
 43 | }
 43 | 
 44 | /// Determines the sequence type along with wildcard/ambiguity information.
 45 | /// With a type hint, the sequence is only validated against that type
 46 | /// (`Other` is always accepted) and `Err(hint)` is returned on a mismatch.
 47 | /// Without a hint, DNA, RNA and protein are tried in this order, falling back to `Other`.
 48 | pub fn guess_seqtype(text: &[u8], hint: Option<SeqType>) -> Result<SeqTypeInfo, SeqType> {
 49 |     let ty = match hint {
 50 |         Some(ty) => ty,
 51 |         None => {
 52 |             let guessed = guess_dna(text)
 53 |                 .or_else(|| guess_rna(text))
 54 |                 .or_else(|| guess_protein(text));
 55 |             return Ok(guessed.unwrap_or(SeqTypeInfo::new(Other, false, false)));
 56 |         }
 57 |     };
 58 |     let validated = match ty {
 59 |         DNA => guess_dna(text),
 60 |         RNA => guess_rna(text),
 61 |         Protein => guess_protein(text),
 62 |         Other => return Ok(SeqTypeInfo::new(Other, false, false)),
 63 |     };
 64 |     validated.ok_or(ty)
 65 | }
 59 | 
 60 | pub fn guess_seqtype_or_fail(
 61 |     text: &[u8],
 62 |     hint: Option<SeqType>,
 63 |     allow_other: bool,
 64 | ) -> Result<SeqTypeInfo, String> {
 65 |     let info = guess_seqtype(text, hint).map_err(|hint| {
 66 |         format!(
 67 |             "The sequence type '{hint}' provided with `--seqtype` does not appear to be valid \
 68 |             for the given sequence. Please make sure that only valid characters are used and \
 69 |             note that only standard ambiguities according to IUPAC are recognized \
 70 |             (e.g. see https://bioinformatics.org/sms/iupac.html)."
 71 |         )
 72 |     })?;
 73 |     if !allow_other && info.seqtype == Other {
 74 |         return Err("Could not guess sequence type, please provide with `--seqtype`".to_string());
 75 |     }
 76 |     Ok(info)
 77 | }
 78 | 
 79 | /// Checks the text (minus the characters skipped by `filter_iter`) against the
 80 | /// plain, the N-containing and the IUPAC DNA alphabet, in this order.
 81 | pub fn guess_dna(text: &[u8]) -> Option<SeqTypeInfo> {
 82 |     let (has_wildcard, has_ambiguities) = if dna::alphabet().is_word(filter_iter(text)) {
 83 |         (false, false)
 84 |     } else if dna::n_alphabet().is_word(filter_iter(text)) {
 85 |         (true, false)
 86 |     } else if dna::iupac_alphabet().is_word(filter_iter(text)) {
 87 |         (true, true)
 88 |     } else {
 89 |         return None;
 90 |     };
 91 |     Some(SeqTypeInfo::new(DNA, has_wildcard, has_ambiguities))
 92 | }
 90 | 
 91 | /// Checks the text (minus the characters skipped by `filter_iter`) against the
 92 | /// plain, the N-containing and the IUPAC RNA alphabet, in this order.
 93 | pub fn guess_rna(text: &[u8]) -> Option<SeqTypeInfo> {
 94 |     let (has_wildcard, has_ambiguities) = if rna::alphabet().is_word(filter_iter(text)) {
 95 |         (false, false)
 96 |     } else if rna::n_alphabet().is_word(filter_iter(text)) {
 97 |         (true, false)
 98 |     } else if rna::iupac_alphabet().is_word(filter_iter(text)) {
 99 |         (true, true)
100 |     } else {
101 |         return None;
102 |     };
103 |     Some(SeqTypeInfo::new(RNA, has_wildcard, has_ambiguities))
104 | }
102 | 
103 | /// Checks the text (minus the characters skipped by `filter_iter`) against the
104 | /// plain and then the IUPAC protein alphabet.
105 | pub fn guess_protein(text: &[u8]) -> Option<SeqTypeInfo> {
106 |     let with_ambiguities = if protein::alphabet().is_word(filter_iter(text)) {
107 |         false
108 |     } else if protein::iupac_alphabet().is_word(filter_iter(text)) {
109 |         true
110 |     } else {
111 |         return None;
112 |     };
113 |     // both flags are set together for the IUPAC alphabet
114 |     Some(SeqTypeInfo::new(Protein, with_ambiguities, with_ambiguities))
115 | }
112 | 
113 | #[derive(Debug, Clone, Default)]
114 | pub struct SeqtypeHelper {
115 |     seqtype: Option<SeqType>,
116 | }
117 | 
118 | impl SeqtypeHelper {
119 |     /// Creates a helper, optionally with an already known sequence type.
120 |     pub fn new(typehint: Option<SeqType>) -> Self {
121 |         Self { seqtype: typehint }
122 |     }
123 | 
124 |     /// Returns the known sequence type or guesses it from `record`. A guessed
125 |     /// type is remembered and returned by all later calls; `Other` is not accepted.
126 |     pub fn get_or_guess(&mut self, record: &dyn Record) -> Result<SeqType, String> {
127 |         if let Some(known) = self.seqtype {
128 |             return Ok(known);
129 |         }
130 |         let mut buf = Vec::new();
131 |         let seq = record.full_seq(&mut buf);
132 |         // no type is known at this point, so there is no hint to pass on
133 |         let guessed = guess_seqtype_or_fail(&seq, None, false)?.seqtype;
134 |         self.seqtype = Some(guessed);
135 |         Ok(guessed)
136 |     }
137 | }
135 | 


--------------------------------------------------------------------------------
/scripts/time.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # FASTQ file
  4 | f=$1
  5 | # primer seqs. for searching
  6 | seq1=$2
  7 | seq2=$3
  8 | 
  9 | 
 10 | alias s=target/release/st
 11 | 
 12 | # prepare
 13 | # s . -a gc={s:gc} $f > $f.with_gc.fq
 14 | # s . --qual-out $f.qual --to-fa $f > /dev/null
 15 | # s . --to-fa $f > $f.fa
 16 | # gzip -k $f
 17 | # lz4 -k $f
 18 | # bzip2 -k $f
 19 | # zstd -k $f
 20 | 
 21 | # load files into memory
 22 | s count $f $f.* -k filename
 23 | 
 24 | logfile=timing.txt
 25 | exec > $logfile 2>&1
 26 | set -x
 27 | 
 28 | # conversion
 29 | time s . --to-fa $f > /dev/null
 30 | time s . --to fastq-illumina $f > /dev/null
 31 | time s . --qual $f.qual $f.fa > /dev/null
 32 | time s . --to-fq --qual $f.qual $f.fa > /dev/null
 33 | time read_fastq -i $f -e base_33 | write_fasta -x > /dev/null
 34 | time cat $f | fastq_to_fasta -Q33 > /dev/null
 35 | time fastq_to_fasta -Q33 -i $f > /dev/null
 36 | time seqtk seq -A $f > /dev/null
 37 | time seqkit fq2fa $f > /dev/null
 38 | time seqkit convert --from 'Sanger' --to 'Illumina-1.3+' $f > /dev/null
 39 | 
 40 | # random subsampling
 41 | time s sample -f 0.1 $f > /dev/null
 42 | time seqtk sample $f 0.1 > /dev/null
 43 | time seqkit sample -p 0.1 $f > /dev/null
 44 | 
 45 | # counting
 46 | time s count $f
 47 | time read_fastq -i $f -e base_33 | count_records -x
 48 | time wc -l $f
 49 | 
 50 | # reverse complement (note, qualities are only reversed by seqtool)
 51 | time fastx_reverse_complement -i $f -Q33 > /dev/null
 52 | time read_fastq -i $f -e base_33 | reverse_seq | complement_seq | write_fastq -x > /dev/null
 53 | time s revcomp $f > /dev/null
 54 | time s revcomp -t4 $f > /dev/null
 55 | time seqtk seq -r $f > /dev/null
 56 | time seqkit seq -rp $f > /dev/null
 57 | 
 58 | # compress
 59 | time s . $f > /dev/null
 60 | time s . $f --to fastq.lz4 > /dev/null
 61 | time s . $f | lz4 -c > /dev/null
 62 | time s . $f --to fastq.gz > /dev/null
 63 | time s . $f | gzip -c > /dev/null
 64 | 
 65 | # decompress (either done by the tool itself, or reading decompressed data from a pipe)
 66 | time s . $f.lz4 > /dev/null
 67 | time lz4 -dc $f.lz4 | s . --fq > /dev/null
 68 | time s . $f.gz > /dev/null
 69 | time gzip -dc $f.gz | s . --fq > /dev/null
 70 | time seqtk seq $f.gz > /dev/null
 71 | # '-' makes seqtk read the piped data instead of decompressing the file itself
 72 | time gzip -dc $f.gz | seqtk seq - > /dev/null
 72 | 
 73 | # DNA -> RNA
 74 | time s replace T U $f > /dev/null
 75 | time s replace T U $f -t4 > /dev/null
 76 | time s find T --rep U $f  > /dev/null
 77 | time s find T --rep U $f -t4 > /dev/null
 78 | time seqkit seq --dna2rna $f > /dev/null
 79 | time read_fastq -i $f -e base_33 | transliterate_vals -k SEQ -s T -r U | write_fastq -x > /dev/null
 80 | time fasta_nucleotide_changer -i $f -Q33 -r > /dev/null
 81 | 
 82 | # GC content "histogram"
 83 | time s count -k n:10:{s:gc} $f
 84 | 
 85 | # from variable
 86 | time s count -k n:10:{a:gc} $f.with_gc.fq
 87 | 
 88 | # with expression
 89 | time s count -k n:.1:{{s:gc/100}} $f
 90 | 
 91 | # filter by length
 92 | time s filter 's:seqlen >= 100' $f > /dev/null
 93 | time seqtk seq -L 100 $f > /dev/null
 94 | time seqkit seq -m 100 $f > /dev/null
 95 | time read_fasta -i $f | grab -e 'SEQ_LEN >= 100' | write_fasta -x > /dev/null
 96 | 
 97 | # filter by quality
 98 | time s filter 's:exp_err < 1' $f --to-fa > /dev/null
 99 | time usearch -fastq_filter $f -fastq_maxee 1 -fastaout $f.filter.fa
100 | time vsearch -fastq_filter $f -fastq_maxee 1 -fastaout $f.filter.fa
101 | rm $f.filter.fa
102 | 
103 | # primer finding
104 | 
105 | printf ">primer1\n$seq1\n>primer2\n$seq2\n" > _primer_file.fa
106 | fp=_primer_file.fa
107 | printf "$seq1\n$seq2\n" | tr 'YR' 'N' > _primer_list.txt
108 | sed 's/R/[AG]/g' _primer_file.fa > _primer_file_ambig.fa
109 | 
# Times `s find` with the patterns from the file given as first argument,
# once with default threading and once with 4 threads (-t4).
# All further arguments are passed on to `s find` unchanged.
run_find() {
    local pattern_file=$1
    shift
    time s find -v file:$pattern_file $f -a primer={f:name} -a rng={f:range} "$@" > /dev/null
    time s find -v file:$pattern_file $f -a primer={f:name} -a rng={f:range} -t4 "$@" > /dev/null
}
114 | 
115 | run_find $fp --algo myers
116 | run_find $fp --algo myers -d1
117 | run_find $fp --algo myers -d4
118 | run_find $fp --algo myers -d8
119 | run_find $fp --algo myers -d4 --in-order
120 | run_find $fp --algo myers -d4 --rng ..25
121 | time s find -v file:$fp $f -a d={f:dist} > /dev/null
122 | time s find -v file:$fp $f -a d={f:dist} -t4 > /dev/null
123 | run_find $fp --algo exact
124 | run_find _primer_file_ambig.fa -r --seqtype other
125 | 
# Times AdapterRemoval with the primer list, first single-threaded (the
# program's default), then with 4 threads, mirroring the single- vs.
# multi-threaded comparison done for the other tools.
# All arguments are passed on to AdapterRemoval (e.g. --mm N).
adapter_removal() {
    # single-threaded run (no --threads option)
    time AdapterRemoval --file1 $f --adapter-list _primer_list.txt --shift 8 \
     --output1 /dev/null --discarded /dev/stdout --settings /dev/null "$@" > /dev/null
    # same command with 4 threads
    time AdapterRemoval --file1 $f --adapter-list _primer_list.txt --shift 8 --threads 4 \
     --output1 /dev/null --discarded /dev/stdout --settings /dev/null "$@" > /dev/null
}
132 | 
133 | adapter_removal --mm 1
134 | adapter_removal --mm 4
135 | adapter_removal --mm 8
136 | 
137 | time cutadapt -a primer1=$seq1$ -a primer2=$seq2$ $f -e 0.23 -y ' primer={name}' > /dev/null
138 | time cutadapt -a primer1=$seq1$ -a primer2=$seq2$ $f -e 0.23 -y ' primer={name}' -j4 > /dev/null
139 | 
140 | # trim
141 | 
142 | time s find -f file:$fp $f -a primer={f:name} -a end={f:end} -t5 > $f.find.fq
143 | time s trim -e {a:end}.. $f.find.fq > /dev/null
144 | 


--------------------------------------------------------------------------------
/src/io/format.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | use std::io;
  3 | use std::path::Path;
  4 | use std::str::FromStr;
  5 | 
  6 | use itertools::Itertools;
  7 | 
  8 | use super::{QualFormat, DEFAULT_IO_READER_BUFSIZE, DEFAULT_IO_WRITER_BUFSIZE};
  9 | 
/// Input/output formats that can be selected by name
/// (see `FormatVariant::str_match` for the accepted names).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub enum FormatVariant {
    /// Selected by `fasta`, `fa` or `fna`
    Fasta,
    /// FASTQ together with its quality score format
    Fastq(QualFormat),
    /// Selected by `csv`
    Csv,
    /// Selected by `tsv` or `txt`
    Tsv,
}
 17 | 
 18 | impl FormatVariant {
 19 |     pub fn str_match(s: &str) -> Option<FormatVariant> {
 20 |         match s.to_ascii_lowercase().as_str() {
 21 |             "fasta" | "fa" | "fna" => Some(FormatVariant::Fasta),
 22 |             "fastq" | "fq" => Some(FormatVariant::Fastq(QualFormat::Sanger)),
 23 |             "fastq-illumina" | "fq-illumina" => Some(FormatVariant::Fastq(QualFormat::Illumina)),
 24 |             "fastq-solexa" | "fq-solexa" => Some(FormatVariant::Fastq(QualFormat::Solexa)),
 25 |             "csv" => Some(FormatVariant::Csv),
 26 |             "tsv" | "txt" => Some(FormatVariant::Tsv),
 27 |             _ => None,
 28 |         }
 29 |     }
 30 | }
 31 | 
 32 | impl fmt::Display for FormatVariant {
 33 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 34 |         match *self {
 35 |             FormatVariant::Fasta => write!(f, "fasta"),
 36 |             FormatVariant::Fastq(fmt) => match fmt {
 37 |                 QualFormat::Sanger | QualFormat::Phred => write!(f, "fastq"),
 38 |                 QualFormat::Illumina => write!(f, "fastq-illumina"),
 39 |                 QualFormat::Solexa => write!(f, "fastq-solexa"),
 40 |             },
 41 |             FormatVariant::Csv => write!(f, "csv"),
 42 |             FormatVariant::Tsv => write!(f, "tsv"),
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | impl FromStr for FormatVariant {
 48 |     type Err = String;
 49 | 
 50 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
 51 |         FormatVariant::str_match(s).ok_or_else(|| format!("Unknown format: {s}"))
 52 |     }
 53 | }
 54 | 
/// Compression formats known to this build.
/// Every variant only exists if the Cargo feature named in its `cfg`
/// attribute is enabled.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub enum CompressionFormat {
    #[cfg(feature = "gz")]
    Gzip,
    #[cfg(feature = "bz2")]
    Bzip2,
    #[cfg(feature = "lz4")]
    Lz4,
    #[cfg(feature = "zstd")]
    Zstd,
}
 66 | 
 67 | impl CompressionFormat {
 68 |     const FORMAT_MAP: &[(&[&str], CompressionFormat)] = &[
 69 |         #[cfg(feature = "gz")]
 70 |         (&["gz", "gzip"], CompressionFormat::Gzip),
 71 |         #[cfg(feature = "bz2")]
 72 |         (&["bz2", "bzip2"], CompressionFormat::Bzip2),
 73 |         #[cfg(feature = "lz4")]
 74 |         (&["lz4"], CompressionFormat::Lz4),
 75 |         #[cfg(feature = "zstd")]
 76 |         (&["zst", "zstd", "zstandard"], CompressionFormat::Zstd),
 77 |     ];
 78 | 
 79 |     pub fn str_match(s: &str) -> Option<CompressionFormat> {
 80 |         let s = s.to_ascii_lowercase();
 81 |         for (names, format) in Self::FORMAT_MAP {
 82 |             if names.contains(&s.as_str()) {
 83 |                 return Some(*format);
 84 |             }
 85 |         }
 86 |         None
 87 |     }
 88 | 
 89 |     pub fn recommended_read_bufsize(self) -> usize {
 90 |         match self {
 91 |             #[cfg(feature = "zstd")]
 92 |             CompressionFormat::Zstd => zstd::Decoder::<io::Empty>::recommended_output_size(),
 93 |             _ => DEFAULT_IO_READER_BUFSIZE,
 94 |         }
 95 |     }
 96 | 
 97 |     pub fn recommended_write_bufsize(self) -> usize {
 98 |         match self {
 99 |             #[cfg(feature = "zstd")]
100 |             CompressionFormat::Zstd => zstd::Encoder::<io::Sink>::recommended_input_size(),
101 |             _ => DEFAULT_IO_WRITER_BUFSIZE,
102 |         }
103 |     }
104 | }
105 | 
106 | impl FromStr for CompressionFormat {
107 |     type Err = String;
108 | 
109 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
110 |         if let Some(format) = CompressionFormat::str_match(s) {
111 |             Ok(format)
112 |         } else {
113 |             let fmt_list = CompressionFormat::FORMAT_MAP
114 |                 .iter()
115 |                 .map(|(names, _)| names.join("/"))
116 |                 .join(", ");
117 |             Err(format!(
118 |                 "Unknown compression format: {s}. Valid formats are: {fmt_list}."
119 |             ))
120 |         }
121 |     }
122 | }
123 | 
124 | /// Parses a single or double extension from a path:
125 | /// If the extension is recognized as a compression format,
126 | /// it is returned along with the inner extension.
127 | /// Otherwise, only the outer extension is returned (no compression assumed).
128 | pub fn parse_compr_ext<P: AsRef<Path> + ?Sized>(
129 |     path: &P,
130 | ) -> (Option<CompressionFormat>, Option<&str>) {
131 |     let path = path.as_ref();
132 |     let mut fmt = None;
133 |     let mut ext = None;
134 |     if let Some(e) = path.extension().and_then(|e| e.to_str()) {
135 |         if let Some(f) = CompressionFormat::str_match(e) {
136 |             fmt = Some(f);
137 |             if let Some(e) = Path::new(path.file_stem().unwrap()).extension() {
138 |                 ext = e.to_str();
139 |             }
140 |         } else {
141 |             ext = Some(e);
142 |         }
143 |     }
144 |     (fmt, ext)
145 | }
146 | 


--------------------------------------------------------------------------------
/src/cmd/replace.rs:
--------------------------------------------------------------------------------
  1 | use std::borrow::ToOwned;
  2 | use std::str;
  3 | 
  4 | use clap::{value_parser, Parser};
  5 | use memchr::memmem::find_iter;
  6 | 
  7 | use crate::cli::CommonArgs;
  8 | use crate::error::CliResult;
  9 | use crate::helpers::replace::replace_iter;
 10 | use crate::io::{RecordAttr, RecordEditor};
 11 | use crate::Config;
 12 | 
// Command line arguments of the 'replace' command.
// With clap's derive API, the `///` comments on the fields are used as help
// text, so editing them changes user-facing output.
#[derive(Parser, Clone, Debug)]
#[clap(next_help_heading = "'Replace' command options")]
pub struct ReplaceCommand {
    /// Search pattern
    pattern: String,

    /// Replacement string, cannot contain variables.
    replacement: String,

    // `run` checks `id` before `desc`: if both flags are given, IDs are edited
    /// Replace in IDs instead of sequences
    #[arg(short, long)]
    id: bool,

    /// Replace in descriptions
    #[arg(short, long)]
    desc: bool,

    /// Interpret pattern as a regular expression.
    /// Unicode characters are supported when searching in IDs/descriptions,
    /// but not for sequence searches.
    #[arg(short, long)]
    regex: bool,

    // at least 1 (enforced by the value parser); `run` relies on this when
    // computing `threads - 1`
    /// Number of threads
    #[arg(short, long, value_name = "N", default_value_t = 1, value_parser = value_parser!(u32).range(1..))]
    threads: u32,

    #[command(flatten)]
    pub common: CommonArgs,
}
 43 | 
 44 | pub fn run(mut cfg: Config, args: ReplaceCommand) -> CliResult<()> {
 45 |     // what should be replaced?
 46 |     let attr = if args.id {
 47 |         RecordAttr::Id
 48 |     } else if args.desc {
 49 |         RecordAttr::Desc
 50 |     } else {
 51 |         RecordAttr::Seq
 52 |     };
 53 |     let pattern = &args.pattern;
 54 |     let replacement = args.replacement.as_bytes();
 55 |     let has_backrefs = replacement.contains(&b'$');
 56 |     let regex = args.regex;
 57 |     let num_threads = args.threads;
 58 | 
 59 |     let replacer = get_replacer(pattern, regex, has_backrefs)?;
 60 | 
 61 |     let mut format_writer = cfg.get_format_writer()?;
 62 | 
 63 |     cfg.with_io_writer(|io_writer, mut cfg| {
 64 |         cfg.read_parallel(
 65 |             num_threads - 1,
 66 |             |record, editor: &mut RecordEditor| {
 67 |                 editor.edit_with_val(attr, &record, false, |text, out| {
 68 |                     replacer.replace(text, replacement, out)
 69 |                 })
 70 |             },
 71 |             |record, editor, ctx| {
 72 |                 format_writer.write(&editor.record(&record), io_writer, ctx)?;
 73 |                 Ok(true)
 74 |             },
 75 |         )
 76 |     })?;
 77 |     Ok(())
 78 | }
 79 | 
/// Common interface of the pattern replacement strategies
/// (exact byte search or regular expression).
trait Replacer {
    /// Searches `text` for the pattern held by the implementor and writes the
    /// result of replacing the matches by `replacement` to `out`.
    fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec<u8>) -> CliResult<()>;
}
 83 | 
 84 | struct BytesReplacer(Vec<u8>);
 85 | 
 86 | impl Replacer for BytesReplacer {
 87 |     fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec<u8>) -> CliResult<()> {
 88 |         let matches = find_iter(text, &self.0).map(|start| (start, start + self.0.len()));
 89 |         replace_iter(text, replacement, matches, out).unwrap();
 90 |         Ok(())
 91 |     }
 92 | }
 93 | 
// Generates a regex-based `Replacer` implementation.
//
// - `$name`: name of the generated struct
// - `$regex`: regex type; must provide `new`, `find_iter` and `replace_all`
// - `$to_string`: fallible conversion from `&[u8]` to the text type that
//   `$regex` searches in (applied to the searched text and the replacement)
// - `$to_bytes`: converts the output of `replace_all` back to `&[u8]`
macro_rules! regex_replacer_impl {
    ($name:ident, $regex:ty, $to_string:expr, $to_bytes:expr) => {
        struct $name {
            re: $regex,
            // whether the replacement has to be interpreted by the regex
            // engine (the caller sets this if it contains a '$')
            has_backrefs: bool,
        }

        impl $name {
            // Compiles `pattern`; an invalid pattern is returned as error.
            fn new(pattern: &str, has_backrefs: bool) -> CliResult<Self> {
                Ok(Self {
                    re: <$regex>::new(pattern)?,
                    has_backrefs,
                })
            }
        }

        impl Replacer for $name {
            // `$to_string` / `$to_bytes` may be closures that are called in place
            #[allow(clippy::redundant_closure_call)]
            fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec<u8>) -> CliResult<()> {
                let search_text = $to_string(text)?;
                if !self.has_backrefs {
                    // only the match positions are needed; the replacement
                    // bytes are passed on to `replace_iter` as they are
                    let matches = self.re.find_iter(search_text).map(|m| (m.start(), m.end()));
                    replace_iter(text, replacement, matches, out).unwrap();
                } else {
                    // requires allocations
                    let repl_text = $to_string(replacement)?;
                    let replaced = self.re.replace_all(search_text, repl_text);
                    out.extend_from_slice($to_bytes(replaced.as_ref()));
                }
                Ok(())
            }
        }
    };
}
128 | 
// Defines `BytesRegexReplacer` with the regex engine selected at compile time.
cfg_if::cfg_if! {
    if #[cfg(feature = "regex-fast")] {
        // `regex::bytes::Regex` searches byte slices directly: the text is
        // passed through unchanged (wrapped in `Ok`) and the output is already bytes
        regex_replacer_impl!(BytesRegexReplacer, regex::bytes::Regex, Ok::<_, crate::error::CliError>, |s| s);
    } else {
        // TODO: no way to operate on byte slices (although it might be added according to regex_lite docs)
        // `regex_lite::Regex` searches `&str`: the text has to be valid UTF-8
        // (otherwise an error is returned), the output is converted back to bytes
        regex_replacer_impl!(BytesRegexReplacer, regex_lite::Regex, |t| std::str::from_utf8(t), str::as_bytes);
    }
}
137 | 
138 | fn get_replacer(
139 |     pattern: &str,
140 |     regex: bool,
141 |     has_backrefs: bool,
142 | ) -> CliResult<Box<dyn Replacer + Sync>> {
143 |     if regex {
144 |         Ok(Box::new(BytesRegexReplacer::new(pattern, has_backrefs)?))
145 |     } else {
146 |         Ok(Box::new(BytesReplacer(pattern.as_bytes().to_owned())))
147 |     }
148 | }
149 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
  1 | [package]
  2 | name = "seqtool"
  3 | version = "0.4.0-beta.4"
  4 | edition = "2021"
  5 | authors = ["Markus Schlegel <markschl19@gmail.com>"]
  6 | description = "General-purpose tool for reading, modifying and writing biological sequences."
  7 | license = "MIT OR Apache-2.0"
  8 | repository = "https://github.com/markschl/seqtool"
  9 | homepage = "https://github.com/markschl/seqtool"
 10 | readme = "README.md"
 11 | build = "build.rs"
 12 | 
 13 | [workspace]
 14 | members = ["var_provider", "var_provider/variable_enum_macro"]
 15 | 
 16 | [dependencies]
 17 | ahash = "0.8"
 18 | xxhash-rust = { version = "0.8", features = ["xxh3"] }
 19 | memchr = "2.7"
 20 | winnow = { version = "0.7", features = ["simd"] }
 21 | vec_map = "0.8"
 22 | deepsize = "0.2"
 23 | itertools = "0.14"
 24 | bytecount = "0.6"
 25 | strum = "0.27"
 26 | strum_macros = "0.27"
 27 | lexical = { version = "7.0", default-features = false, features = ["parse-floats", "parse-integers", "parse", "write-floats"] }
 28 | atoi = "2.0"
 29 | ordered-float = { version = "5.0", default-features = false, features = ["std", "rkyv", "rkyv_ck"] }
 30 | cfg-if = "1.0"
 31 | # CLI
 32 | clap = { version = "4.5", features = ["derive", "help", "wrap_help", "env"] }
 33 | textwrap = { version = "0.16", default-features = false }
 34 | color-print = { version = "0.3" }
 35 | # I/O
 36 | seq_io = "0.3.4"
 37 | thread_io = "0.3"
 38 | csv = "1.3"
 39 | # compression formats (behind feature flags)
 40 | bzip2 = { version = "0.6", optional = true }
 41 | lz4 = { version = "1.28", optional = true }
 42 | zstd = { version = "0.13", default-features = false, features = ["zdict_builder"], optional = true }
 43 | # variables / functions
 44 | var_provider = { path = "var_provider" }
 45 | variable_enum_macro = { path = "var_provider/variable_enum_macro" }
 46 | # JS expressions
 47 | rquickjs = { version = "0.9", features=["classes", "properties", "rust-alloc", "macro"], optional=true }
 48 | phf = { version = "0.13", features = ["macros"], optional = true }
 49 | # find, replace, revcomp, view commands
 50 | bio = { version = "3.0", default-features = false }
 51 | regex-lite = { version = "0.1", optional = true }
 52 | regex = { version = "1.11", optional = true }
 53 | # view
 54 | ratatui = { version = "0.29", optional = true, default-features = false, features = ["crossterm"] }
 55 | # view, cmp
 56 | crossterm ={ version = "0.29", optional = true }
 57 | # view
 58 | palette = { version = "0.7", default-features = false, features = ["std", "named_from_str"], optional = true }
 59 | enterpolation = { version = "0.3", default-features = false, features = ["std", "linear"], optional = true }
 60 | # sample
 61 | rand = { version = "0.9", optional = true }
 62 | rand_xoshiro = { version = "0.7.0", optional = true }
 63 | # sort / unique / cmp commands
 64 | indexmap = { version = "2.10", optional = true }
 65 | # cmp
 66 | ringmap = { version = "0.1", optional = true }
 67 | # TODO: v0.8 update blocked by https://github.com/reem/rust-ordered-float/issues/163
 68 | rkyv = { version = "0.7", optional = true}
 69 | byteorder = { version = "1.5", optional = true }
 70 | tempfile = { version = "3.20", optional = true }
 71 | 
 72 | [target.'cfg(not(target_os = "windows"))'.dependencies.flate2]
 73 | version = "1.1"
 74 | default-features = false
 75 | features = ["zlib-ng"]
 76 | optional = true
 77 | 
 78 | [target.'cfg(target_os = "windows")'.dependencies.flate2]
 79 | version = "1.1"
 80 | optional = true
 81 | 
 82 | [build-dependencies]
 83 | regex-lite = "0.1"
 84 | 
 85 | [dev-dependencies]
 86 | assert_cmd = "2.0"
 87 | predicates = "3.1"
 88 | approx = "0.5"
 89 | rand = "0.9"
 90 | rand_xoshiro = "0.7"
 91 | tempfile = "3.10"
 92 | 
[features]
# The default build contains all commands, the full regex engine,
# JavaScript expressions and all compression formats
default = ["all-commands", "regex-fast", "expr", "gz", "bz2", "lz4", "zstd"]
# JavaScript expressions
expr = ["rquickjs", "phf"]
# Compression formats
gz = ["flate2"]
lz4 = ["dep:lz4"]
zstd = ["dep:zstd"]
bz2 = ["bzip2"]
# Regex searching in find/replace
regex-fast = ["regex"]  # adds ~1.4 MiB to binary (Linux)
# Commands
# 'all-commands' enables the optional dependencies of every command;
# the per-command features below enable only what a single command lists
all-commands = [
    "palette", "enterpolation", # view
    "ratatui", "crossterm", # view, cmp
    "rand", "rand_xoshiro",  # sample
    "indexmap", "ringmap", "rkyv", "byteorder", "tempfile",  # sort, unique, cmp
    "regex-lite",  # find, replace
]
pass = []
# NOTE(review): "ratatui" is marked as a 'view' dependency in [dependencies]
# and in 'all-commands' above, but is not listed here; confirm that a build
# with only the 'view' feature works
view = ["palette", "enterpolation", "crossterm"]
count = ["rkyv"]
stat = []
head = []
tail = []
slice = []
sample = ["rand", "rand_xoshiro"]
sort = ["indexmap", "rkyv", "byteorder", "tempfile"]
unique = ["indexmap", "rkyv", "byteorder", "tempfile"]
filter = []
split = []
cmp = ["ringmap", "indexmap"]
interleave = []
find = ["regex-lite"]
replace = ["regex-lite"]
del = []
set = []
trim = []
mask = []
upper = []
lower = []
revcomp = []
concat = []
136 | 
137 | [[bin]]
138 | path = "src/main.rs"
139 | name = "st"
140 | 
141 | [profile.release]
142 | lto = "thin"
143 | codegen-units = 1
144 | panic = "abort"
145 | strip = true
146 | #debug = true
147 | 
148 | # The profile that 'cargo dist' will build with
149 | [profile.dist]
150 | inherits = "release"
151 | 
152 | [package.metadata.wix]
153 | upgrade-guid = "41883C8F-F72D-46C8-A526-F415D0511C8F"
154 | path-guid = "12F3E865-70E4-4FA2-BA01-D728DF2B14E1"
155 | license = false
156 | eula = false
157 | 


--------------------------------------------------------------------------------
/src/io/input/fa_qual.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::min;
  2 | use std::fs::File;
  3 | use std::io;
  4 | use std::path::Path;
  5 | 
  6 | use seq_io::{
  7 |     fasta::{self, Record as FR},
  8 |     policy::BufPolicy,
  9 | };
 10 | 
 11 | use crate::error::CliResult;
 12 | use crate::io::{Record, RecordHeader, SeqLineIter};
 13 | 
 14 | use super::SeqReader;
 15 | 
 16 | // Reader
 17 | 
/// Reader that combines the records of a FASTA input with the records of a
/// QUAL file, which is read with a second FASTA reader.
pub struct FaQualReader<R: io::Read, P: BufPolicy> {
    /// Reader for the sequence records
    fa_rdr: fasta::Reader<R, P>,
    /// Reader for the QUAL file
    qual_rdr: fasta::Reader<File, P>,
    /// Buffer holding the parsed quality scores of the current record
    /// (cleared and refilled for every record)
    quals: Vec<u8>,
}
 23 | 
 24 | impl<R, P> FaQualReader<R, P>
 25 | where
 26 |     R: io::Read,
 27 |     P: BufPolicy + Clone,
 28 | {
 29 |     pub fn new<Q>(rdr: R, cap: usize, policy: P, qfile: Q) -> CliResult<Self>
 30 |     where
 31 |         Q: AsRef<Path>,
 32 |     {
 33 |         let qhandle = File::open(&qfile).map_err(|e| {
 34 |             format!(
 35 |                 "Error opening '{}': {}",
 36 |                 qfile.as_ref().to_string_lossy(),
 37 |                 e
 38 |             )
 39 |         })?;
 40 | 
 41 |         Ok(FaQualReader {
 42 |             fa_rdr: fasta::Reader::with_capacity(rdr, cap).set_policy(policy.clone()),
 43 |             qual_rdr: fasta::Reader::with_capacity(qhandle, cap).set_policy(policy),
 44 |             quals: vec![],
 45 |         })
 46 |     }
 47 | }
 48 | 
 49 | impl<R, P> SeqReader for FaQualReader<R, P>
 50 | where
 51 |     R: io::Read,
 52 |     P: BufPolicy,
 53 | {
 54 |     fn read_next_conditional(
 55 |         &mut self,
 56 |         func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
 57 |     ) -> Option<CliResult<bool>> {
 58 |         let quals = &mut self.quals;
 59 |         let qual_rdr = &mut self.qual_rdr;
 60 | 
 61 |         self.fa_rdr.next().map(|rec| {
 62 |             let rec = rec?;
 63 | 
 64 |             // quality info
 65 |             quals.clear();
 66 |             let qrec = qual_rdr.next().ok_or_else(|| {
 67 |                 format!(
 68 |                     "Quality scores in QUAL file missing for record '{}'",
 69 |                     String::from_utf8_lossy(rec.id_bytes())
 70 |                 )
 71 |             })??;
 72 | 
 73 |             if qrec.id() != rec.id() {
 74 |                 return fail!(format!(
 75 |                     "ID mismatch with QUAL file: '{}' != '{}'",
 76 |                     String::from_utf8_lossy(rec.id_bytes()),
 77 |                     String::from_utf8_lossy(qrec.id_bytes()),
 78 |                 ));
 79 |             }
 80 | 
 81 |             for seq in qrec.seq_lines() {
 82 |                 parse_quals(seq, quals)?;
 83 |             }
 84 | 
 85 |             // check sequence length
 86 |             // this may have a performance impact
 87 |             let seqlen = rec.seq_lines().fold(0, |l, seq| l + seq.len());
 88 | 
 89 |             if seqlen != quals.len() {
 90 |                 return fail!(format!(
 91 |                     "The number of quality scores ({}) is not equal to sequence length ({}) in record '{}'",
 92 |                     quals.len(), seqlen,
 93 |                     String::from_utf8_lossy(rec.id_bytes()),
 94 |                 ));
 95 |             }
 96 | 
 97 |             let r = FaQualRecord {
 98 |                 fa_rec: super::fasta::FastaRecord::new(rec),
 99 |                 qual: quals,
100 |             };
101 |             func(&r)
102 |         })
103 |     }
104 | }
105 | 
/// Parses one line of space-separated decimal quality scores and appends
/// them to `out`. Scores larger than 255 are stored as 255.
///
/// Returns an error message if a score is empty (e.g. because of consecutive
/// spaces) or contains a character other than an ASCII digit. Scores parsed
/// before the invalid one remain in `out`.
fn parse_quals(line: &[u8], out: &mut Vec<u8>) -> Result<(), String> {
    for qual in line.split(|c| *c == b' ') {
        let q = parse_int(qual).map_err(|_| {
            format!(
                "Invalid quality score found: '{}'",
                String::from_utf8_lossy(qual)
            )
        })?;
        // clamp first, then narrow: casting to u8 first would wrap around
        // (e.g. 256 -> 0) and make the upper limit ineffective
        out.push(min(q, 255) as u8);
    }
    Ok(())
}

/// Parses a non-empty sequence of ASCII digits as unsigned integer.
/// Values too large for `usize` saturate at `usize::MAX` instead of
/// overflowing. Empty input or any other character results in `Err(())`.
fn parse_int(bytes: &[u8]) -> Result<usize, ()> {
    if bytes.is_empty() {
        return Err(());
    }
    bytes.iter().try_fold(0usize, |value, &b| {
        if b.is_ascii_digit() {
            Ok(value
                .saturating_mul(10)
                .saturating_add(usize::from(b - b'0')))
        } else {
            Err(())
        }
    })
}
132 | 
133 | // Wrapper for FASTA record
134 | 
/// A FASTA record combined with the quality scores parsed from the QUAL file.
pub struct FaQualRecord<'a> {
    /// The wrapped FASTA record
    fa_rec: super::fasta::FastaRecord<'a>,
    /// Quality scores, borrowed from the buffer of the reader
    qual: &'a [u8],
}
139 | 
// All methods delegate to the wrapped FASTA record, except for `qual`,
// which returns the scores from the QUAL file.
impl Record for FaQualRecord<'_> {
    fn id(&self) -> &[u8] {
        self.fa_rec.id()
    }

    fn desc(&self) -> Option<&[u8]> {
        self.fa_rec.desc()
    }

    fn id_desc(&self) -> (&[u8], Option<&[u8]>) {
        self.fa_rec.id_desc()
    }

    fn current_header(&'_ self) -> RecordHeader<'_> {
        self.fa_rec.current_header()
    }

    fn raw_seq(&self) -> &[u8] {
        self.fa_rec.raw_seq()
    }

    // always `Some`: the reader only creates records after the scores were parsed
    fn qual(&self) -> Option<&[u8]> {
        Some(self.qual)
    }

    fn header_delim_pos(&self) -> Option<Option<usize>> {
        self.fa_rec.header_delim_pos()
    }

    fn set_header_delim_pos(&self, delim: Option<usize>) {
        self.fa_rec.set_header_delim_pos(delim)
    }

    fn has_seq_lines(&self) -> bool {
        self.fa_rec.has_seq_lines()
    }

    fn seq_segments(&'_ self) -> SeqLineIter<'_> {
        self.fa_rec.seq_segments()
    }
}
181 | 


--------------------------------------------------------------------------------