├── .gitignore ├── src ├── test │ ├── head.rs │ ├── lower.rs │ ├── upper.rs │ ├── tail.rs │ ├── set.rs │ ├── interleave.rs │ ├── slice.rs │ ├── del.rs │ ├── mask.rs │ ├── revcomp.rs │ ├── replace.rs │ ├── stat.rs │ ├── pass.rs │ ├── compress.rs │ ├── concat.rs │ ├── filter.rs │ ├── split.rs │ ├── count.rs │ ├── trim.rs │ ├── sample.rs │ └── cmp.rs ├── helpers │ ├── any.rs │ ├── thread_local.rs │ ├── slice.rs │ ├── macros.rs │ ├── mod.rs │ ├── complement.rs │ ├── write_list.rs │ ├── vec_buf.rs │ ├── replace.rs │ ├── bytesize.rs │ ├── heap_merge.rs │ ├── number.rs │ ├── value.rs │ └── seqtype.rs ├── cmd │ ├── shared │ │ ├── mod.rs │ │ └── key.rs │ ├── pass.rs │ ├── head.rs │ ├── lower.rs │ ├── upper.rs │ ├── interleave.rs │ ├── find │ │ ├── ambig.rs │ │ └── matcher │ │ │ ├── exact.rs │ │ │ └── mod.rs │ ├── stat.rs │ ├── del.rs │ ├── tail.rs │ ├── sort │ │ ├── vars.rs │ │ ├── mem.rs │ │ ├── file.rs │ │ ├── cli.rs │ │ └── mod.rs │ ├── mod.rs │ ├── set.rs │ ├── slice.rs │ ├── filter.rs │ ├── view │ │ ├── color.rs │ │ └── mod.rs │ ├── cmp │ │ ├── vars.rs │ │ └── mod.rs │ ├── unique │ │ ├── map.rs │ │ └── cli.rs │ ├── revcomp.rs │ ├── mask.rs │ ├── concat.rs │ └── replace.rs ├── main.rs ├── var │ └── modules │ │ ├── expr │ │ ├── js │ │ │ └── mod.rs │ │ ├── mod.rs │ │ ├── expressions.rs │ │ └── var_provider.rs │ │ └── mod.rs ├── io │ ├── input │ │ ├── reader.rs │ │ ├── fastx.rs │ │ ├── fastq.rs │ │ ├── fasta.rs │ │ └── fa_qual.rs │ ├── output │ │ ├── fastx.rs │ │ ├── writer.rs │ │ ├── fasta.rs │ │ ├── csv.rs │ │ ├── fastq.rs │ │ └── fa_qual.rs │ ├── mod.rs │ └── format.rs └── error.rs ├── var_provider ├── Cargo.toml ├── variable_enum_macro │ └── Cargo.toml └── src │ ├── lib.rs │ ├── usage.rs │ └── func.rs ├── js └── include.js ├── scripts ├── gen_html_help.sh ├── gen_ambig_map.py ├── validate_js_parser.sh ├── parse_varhelp.py ├── validate_features.sh ├── gen_help.sh ├── summarize_comparison.py └── time.sh ├── dist-workspace.toml ├── LICENSE-MIT ├── profile ├── 
fastq_urls.txt └── README.md ├── .github └── workflows │ └── ci.yaml └── Cargo.toml /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | Cargo.lock 3 | **/*.rs.bk 4 | .Rhistory 5 | .DS_Store 6 | _* -------------------------------------------------------------------------------- /src/test/head.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn head() { 5 | cmp(&["head", "-n", "3"], &*FASTA, &SEQS[..3].concat()); 6 | } 7 | -------------------------------------------------------------------------------- /src/test/lower.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn lower() { 5 | let fa = ">seq\naTgC\n"; 6 | cmp(&["lower"], fa, ">seq\natgc\n"); 7 | } 8 | -------------------------------------------------------------------------------- /src/test/upper.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn upper() { 5 | let fa = ">seq\naTgC\n"; 6 | cmp(&["upper"], fa, ">seq\nATGC\n"); 7 | } 8 | -------------------------------------------------------------------------------- /src/test/tail.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn tail() { 5 | fails(&["tail", "-n", "3"], &*FASTA, "Cannot use STDIN as input"); 6 | let input = tmp_file("st_tail_", ".fasta", &FASTA); 7 | cmp(&["tail", "-n", "2"], input, records!(2, 3)); 8 | } 9 | -------------------------------------------------------------------------------- /var_provider/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "var_provider" 3 | version = "0.4.0-beta.4" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | itertools = "0.14" 9 | strum = "0.27" 10 | strum_macros 
= "0.27" 11 | crossterm = "0.29" 12 | textwrap = "0.16" 13 | -------------------------------------------------------------------------------- /src/test/set.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn set() { 5 | let fasta = ">seq\nATGC\n"; 6 | 7 | cmp(&["set", "-i", "seq2"], fasta, ">seq2\nATGC\n"); 8 | cmp(&["set", "-d", "desc"], fasta, ">seq desc\nATGC\n"); 9 | cmp(&["set", "-s", "NNNN"], fasta, ">seq\nNNNN\n"); 10 | } 11 | -------------------------------------------------------------------------------- /js/include.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | function num(x) { 4 | let f = parseFloat(x); 5 | if (isNaN(f)) { 6 | if (x === undefined) return undefined; 7 | if (x === null) return null; 8 | throw `Could not convert '${x}' to a decimal number`; 9 | } 10 | return f; 11 | } 12 | -------------------------------------------------------------------------------- /src/test/interleave.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn interleave() { 5 | with_tmpdir("st_interleave_", |td| { 6 | cmp( 7 | &["interleave"], 8 | td.multi_file(".fasta", vec![&&*FASTA, &&*FASTA]), 9 | records!(0, 0, 1, 1, 2, 2, 3, 3), 10 | ); 11 | }); 12 | } 13 | -------------------------------------------------------------------------------- /src/test/slice.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn slice() { 5 | cmp(&["slice", ":"], &*FASTA, &FASTA); 6 | cmp(&["slice", "1:"], &*FASTA, &FASTA); 7 | cmp(&["slice", ":2"], &*FASTA, &SEQS[..2].concat()); 8 | cmp(&["slice", "1:2"], &*FASTA, &SEQS[..2].concat()); 9 | cmp(&["slice", "2:3"], &*FASTA, &SEQS[1..3].concat()); 10 | } 11 | -------------------------------------------------------------------------------- 
/var_provider/variable_enum_macro/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "variable_enum_macro" 3 | version = "0.4.0-beta.4" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | var_provider = { path=".." } 9 | proc-macro2 = "1.0.79" 10 | quote = "1.0.36" 11 | itertools = "0.14" 12 | syn = { version = "2.0.58" } 13 | strum = "0.27" 14 | strum_macros = "0.27" 15 | 16 | [lib] 17 | proc-macro = true 18 | -------------------------------------------------------------------------------- /src/test/del.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn del() { 5 | let fasta = ">seq;p=0 a=1 b=2\nATGC\n"; 6 | 7 | cmp(&["del", "-d"], fasta, ">seq;p=0\nATGC\n"); 8 | // TODO: the extra space should be removed 9 | cmp(&["del", "--attrs", "a,b"], fasta, ">seq;p=0 \nATGC\n"); 10 | cmp( 11 | &["del", "--attrs", "p", "--attr-fmt", ";key=value"], 12 | fasta, 13 | ">seq a=1 b=2\nATGC\n", 14 | ); 15 | } 16 | -------------------------------------------------------------------------------- /src/helpers/any.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | 3 | // pub trait AsAny { 4 | // fn as_any(&self) -> &dyn Any; 5 | // } 6 | 7 | // impl AsAny for T { 8 | // fn as_any(&self) -> &dyn Any { 9 | // self 10 | // } 11 | // } 12 | 13 | pub trait AsAnyMut { 14 | fn as_any_mut(&mut self) -> &mut dyn Any; 15 | } 16 | 17 | impl AsAnyMut for T { 18 | fn as_any_mut(&mut self) -> &mut dyn Any { 19 | self 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/cmd/shared/mod.rs: -------------------------------------------------------------------------------- 1 | //! This module contains code shared between at least two commands, which 2 | //! relies on some external crates and therefore needs feature flags. 
3 | 4 | #[cfg(any( 5 | feature = "all-commands", 6 | feature = "cmp", 7 | feature = "count", 8 | feature = "sort", 9 | feature = "unique" 10 | ))] 11 | pub mod key; 12 | 13 | #[cfg(any(feature = "all-commands", feature = "sort", feature = "unique"))] 14 | pub mod tmp_store; 15 | -------------------------------------------------------------------------------- /src/test/mask.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn mask() { 5 | let fa = ">seq\nATGCa\ntgc\n"; 6 | cmp(&["mask", ":"], fa, ">seq\natgcatgc\n"); 7 | cmp(&["mask", ":2,-2:"], fa, ">seq\natGCatgc\n"); 8 | cmp(&["mask", "4:"], fa, ">seq\nATGcatgc\n"); 9 | cmp(&["mask", "--hard", "N", "4:"], fa, ">seq\nATGNNNNN\n"); 10 | cmp( 11 | &["mask", "--unmask", "4:"], 12 | ">seq\nATGcatgc\n", 13 | ">seq\nATGCATGC\n", 14 | ); 15 | } 16 | -------------------------------------------------------------------------------- /var_provider/src/lib.rs: -------------------------------------------------------------------------------- 1 | use strum_macros::{Display, EnumString}; 2 | 3 | mod func; 4 | mod usage; 5 | mod var_provider; 6 | 7 | pub use self::func::*; 8 | pub use self::usage::*; 9 | pub use self::var_provider::*; 10 | 11 | /// Provides information about the expected variable/function output type 12 | #[derive(Debug, Clone, EnumString, Display)] 13 | #[strum(serialize_all = "snake_case")] 14 | pub enum VarType { 15 | Text, 16 | Number, 17 | Boolean, 18 | } 19 | -------------------------------------------------------------------------------- /scripts/gen_html_help.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | wiki=../seqtool.wiki 4 | html=../seqtool-doc 5 | 6 | # generate local HTML docs 7 | 8 | cnv_links() { 9 | sed -E 's_ $html/README.html 14 | for f in $wiki/*.md; do 15 | name="$(basename ${f%.*})" 16 | pandoc --self-contained -s -c doc/pandoc.css $f| cnv_links > 
$html/wiki/$name.html 17 | done 18 | -------------------------------------------------------------------------------- /scripts/gen_ambig_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | bases = ['A', 'C', 'G', 'T', 'U'] 4 | 5 | mapping = [ 6 | ('M', 'AC'), 7 | ('R', 'AG'), 8 | ('W', 'AT'), 9 | ('S', 'CG'), 10 | ('Y', 'CT'), 11 | ('K', 'GT'), 12 | ('V', 'ACG'), 13 | ('H', 'ACT'), 14 | ('D', 'AGT'), 15 | ('B', 'CGT'), 16 | ('N', 'ACGT'), 17 | ] 18 | 19 | for b, bases in mapping: 20 | other = [a for a, v in mapping if all(b in bases for b in v)] 21 | print("b'{}' => b\"{}\".to_vec(),".format(b, ''.join(list(bases) + other))) 22 | -------------------------------------------------------------------------------- /src/helpers/thread_local.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::thread::LocalKey; 3 | 4 | pub fn with_mut_thread_local( 5 | lkey: &'static LocalKey>>, 6 | init: I, 7 | f: F, 8 | ) -> R 9 | where 10 | F: FnOnce(&mut T) -> R, 11 | I: FnOnce() -> T, 12 | { 13 | lkey.with(|d| { 14 | let mut d = d.borrow_mut(); 15 | let data = if let Some(ref mut data) = *d { 16 | data 17 | } else { 18 | *d = Some(init()); 19 | d.as_mut().unwrap() 20 | }; 21 | f(data) 22 | }) 23 | } 24 | -------------------------------------------------------------------------------- /src/cmd/pass.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | 7 | #[derive(Parser, Clone, Debug)] 8 | pub struct PassCommand { 9 | #[command(flatten)] 10 | pub common: CommonArgs, 11 | } 12 | 13 | pub fn run(mut cfg: Config, _args: PassCommand) -> CliResult<()> { 14 | let mut format_writer = cfg.get_format_writer()?; 15 | cfg.with_io_writer(|io_writer, mut cfg| { 16 | cfg.read(|record, ctx| { 17 | 
format_writer.write(&record, io_writer, ctx)?; 18 | Ok(true) 19 | }) 20 | }) 21 | } 22 | -------------------------------------------------------------------------------- /src/helpers/slice.rs: -------------------------------------------------------------------------------- 1 | use std::mem::replace; 2 | 3 | pub fn split_text(text: &'_ [u8], sep: u8) -> SplitIter<'_> { 4 | SplitIter { sep, text } 5 | } 6 | 7 | pub struct SplitIter<'a> { 8 | sep: u8, 9 | text: &'a [u8], 10 | } 11 | 12 | impl<'a> Iterator for SplitIter<'a> { 13 | type Item = &'a [u8]; 14 | 15 | fn next(&mut self) -> Option { 16 | if let Some(pos) = memchr::memchr(self.sep, self.text) { 17 | let (t, rest) = self.text.split_at(pos); 18 | self.text = &rest[1..]; 19 | return Some(t); 20 | } 21 | if self.text.is_empty() { 22 | return None; 23 | } 24 | Some(replace(&mut self.text, b"")) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/helpers/macros.rs: -------------------------------------------------------------------------------- 1 | macro_rules! fail { 2 | ($e:expr) => { 3 | Err($e.into()) 4 | }; 5 | ($e:expr, $($args:tt)*) => { 6 | Err(format!($e, $($args)*).into()) 7 | }; 8 | } 9 | 10 | macro_rules! try_opt { 11 | ($expr:expr) => { 12 | match $expr { 13 | Ok(item) => item, 14 | Err(e) => return Some(Err(std::convert::From::from(e))), 15 | } 16 | }; 17 | } 18 | 19 | macro_rules! 
report { 20 | ($verbose:expr, $fmt:expr) => ( 21 | if $verbose { 22 | eprintln!($fmt) 23 | } 24 | ); 25 | ($verbose:expr, $fmt:expr, $($arg:tt)*) => ( 26 | if $verbose { 27 | eprintln!($fmt, $($arg)*) 28 | } 29 | ); 30 | } 31 | -------------------------------------------------------------------------------- /dist-workspace.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["cargo:."] 3 | 4 | # Config for 'dist' 5 | [dist] 6 | # The preferred dist version to use in CI (Cargo.toml SemVer syntax) 7 | cargo-dist-version = "0.30.0" 8 | # CI backends to support 9 | ci = "github" 10 | # The installers to generate for each app 11 | installers = ["shell", "powershell", "homebrew", "msi"] 12 | # Target platforms to build apps for (Rust target-triple syntax) 13 | targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl", "x86_64-pc-windows-msvc"] 14 | # Path that installers should place binaries in 15 | install-path = "CARGO_HOME" 16 | # Whether to install an updater program 17 | install-updater = false 18 | 19 | [target.x86_64-pc-windows-msvc] 20 | rustflags = [ 21 | "-C debuginfo=0", 22 | "-C link-arg=/DEBUG:NONE" 23 | ] 24 | -------------------------------------------------------------------------------- /src/helpers/mod.rs: -------------------------------------------------------------------------------- 1 | //! Utilities used by many commands, 2 | //! which do not use optional crates that depend on feature flags. 
3 | 4 | use std::collections::{HashMap, HashSet}; 5 | 6 | // The default hash map to use 7 | use ahash::RandomState; 8 | 9 | pub type DefaultHashMap = HashMap; 10 | pub type DefaultHashSet = HashSet; 11 | pub type DefaultBuildHasher = ahash::RandomState; // BuildHasherDefault; 12 | 13 | // missing data string 14 | pub const NA: &str = "undefined"; 15 | 16 | #[macro_use] 17 | pub mod macros; 18 | pub mod any; 19 | pub mod bytesize; 20 | pub mod complement; 21 | pub mod heap_merge; 22 | pub mod number; 23 | pub mod replace; 24 | pub mod rng; 25 | pub mod seqtype; 26 | pub mod slice; 27 | pub mod thread_local; 28 | pub mod value; 29 | pub mod var_range; 30 | pub mod vec_buf; 31 | pub mod write_list; 32 | -------------------------------------------------------------------------------- /src/test/revcomp.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn revcomp() { 5 | // DNA with ambiguities 6 | cmp( 7 | &["revcomp"], 8 | ">id\nAGCT\nYRWS\nKMDV\nHBN\n", 9 | ">id\nNVDBHKMSWYRAGCT\n", 10 | ); 11 | // RNA 12 | cmp( 13 | &["revcomp"], 14 | ">id\nAGCU\nYRWS\nKMDV\nHBN\n", 15 | ">id\nNVDBHKMSWYRAGCU\n", 16 | ); 17 | // mixed / protein 18 | fails( 19 | &["revcomp"], 20 | ">id\nTX\n", 21 | "Only DNA/RNA sequences can be reverse-complemented", 22 | ); 23 | // with explicitly set sequence type, invalid letters are left untouched 24 | cmp(&["revcomp", "--seqtype", "dna"], ">id\nUA\n", ">id\nTU\n"); 25 | } 26 | 27 | #[test] 28 | fn revcomp_qual() { 29 | let fq = "@seq\nANCT\n+\n1234\n"; 30 | let rc = "@seq\nAGNT\n+\n4321\n"; 31 | cmp(&["revcomp", "--fq"], fq, rc); 32 | } 33 | -------------------------------------------------------------------------------- /src/helpers/complement.rs: -------------------------------------------------------------------------------- 1 | use super::seqtype::SeqType; 2 | 3 | /// Reverse complements a set of sequence chunks belonging to the same sequence 4 | /// writes the 
contiguous reverse-complement to output 5 | pub fn reverse_complement<'a, S>( 6 | seq_iter: S, 7 | out: &mut Vec, 8 | seqtype: SeqType, 9 | ) -> Result<(), String> 10 | where 11 | S: Iterator + DoubleEndedIterator, 12 | { 13 | let complement = match seqtype { 14 | SeqType::DNA => bio::alphabets::dna::complement, 15 | SeqType::RNA => bio::alphabets::rna::complement, 16 | _ => { 17 | return Err(format!( 18 | "Only DNA/RNA sequences can be reverse-complemented, but the sequence type \ 19 | is '{seqtype}'. Wrongly recognized sequence types can be adjusted with `--seqtype`." 20 | )) 21 | } 22 | }; 23 | out.clear(); 24 | for s in seq_iter.rev() { 25 | out.extend(s.iter().rev().cloned().map(complement)); 26 | } 27 | Ok(()) 28 | } 29 | -------------------------------------------------------------------------------- /src/cmd/head.rs: -------------------------------------------------------------------------------- 1 | use clap::{value_parser, Parser}; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | 7 | #[derive(Parser, Clone, Debug)] 8 | #[clap(next_help_heading = "'Head' command options")] 9 | pub struct HeadCommand { 10 | /// Number of sequences to return 11 | #[arg(short, long, value_name = "N", default_value_t = 10, value_parser = value_parser!(u64).range(1..))] 12 | num_seqs: u64, 13 | 14 | #[command(flatten)] 15 | pub common: CommonArgs, 16 | } 17 | 18 | pub fn run(mut cfg: Config, args: HeadCommand) -> CliResult<()> { 19 | let n = args.num_seqs; 20 | 21 | let mut format_writer = cfg.get_format_writer()?; 22 | cfg.with_io_writer(|io_writer, mut cfg| { 23 | let mut i = 0; 24 | 25 | cfg.read(|record, ctx| { 26 | if i >= n { 27 | return Ok(false); 28 | } 29 | format_writer.write(&record, io_writer, ctx)?; 30 | i += 1; 31 | Ok(true) 32 | }) 33 | }) 34 | } 35 | -------------------------------------------------------------------------------- /src/test/replace.rs: 
-------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | static INPUT: &str = ">id_123 some desc\nA\nT\nGC\n"; 4 | 5 | #[test] 6 | fn exact() { 7 | cmp(&["replace", "T", "U"], INPUT, ">id_123 some desc\nAUGC\n"); 8 | cmp( 9 | &["replace", "T", "U"], 10 | ">a\nT\nT\n>b\nT\nT\n>c\nT\nT\n", 11 | ">a\nUU\n>b\nUU\n>c\nUU\n", 12 | ); 13 | cmp( 14 | &["replace", "ATG", "TGA"], 15 | INPUT, 16 | ">id_123 some desc\nTGAC\n", 17 | ); 18 | cmp( 19 | &["replace", "-d", "e", "a"], 20 | INPUT, 21 | ">id_123 soma dasc\nATGC\n", 22 | ); 23 | } 24 | 25 | #[test] 26 | fn regex() { 27 | cmp( 28 | &["replace", "-r", "[AT]", "?"], 29 | INPUT, 30 | ">id_123 some desc\n??GC\n", 31 | ); 32 | cmp( 33 | &["replace", "-ir", r"_\d{3}", ".."], 34 | INPUT, 35 | ">id.. some desc\nATGC\n", 36 | ); 37 | cmp( 38 | &["replace", "-ir", r"_(\d{3})", "..$1"], 39 | INPUT, 40 | ">id..123 some desc\nATGC\n", 41 | ); 42 | } 43 | -------------------------------------------------------------------------------- /scripts/validate_js_parser.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script validates the JS parser using ECMA-262 parser tests 3 | # All syntax should be accepted, except for: 4 | # - regex literals 5 | # - \u unicode escapes in identifiers 6 | # - characters that are not char::is_alphabetic() (there seem to be some) 7 | 8 | set -euo pipefail 9 | 10 | if [ ! -e test262-parser-tests-master ]; then 11 | wget https://github.com/tc39/test262-parser-tests/archive/refs/heads/master.zip 12 | unzip master.zip 13 | rm master.zip 14 | fi 15 | 16 | cargo build 17 | st=target/debug/st 18 | 19 | echo "" > _input.fa 20 | js=test262-parser-tests-master 21 | for f in $js/pass/*.js $js/pass-explicit/*.js $js/early/*.js; do 22 | echo $f 23 | out=$(($st . 
--to-tsv "{{file:$f}}" _input.fa || true) 2>&1) 24 | # recognize errors, but exclude strings containing unsupported character escapes 25 | if [[ "$out" == *"Failed to parse"* && ! "$out" =~ (\\0|\\u[0-9]{4}|\\u\{[a-zA-Z0-9]{1,6}\}|\\x[a-zA-Z0-9]{2}) ]]; then 26 | printf "$out" 27 | fi 28 | done 29 | rm -R _input.fa $js 30 | -------------------------------------------------------------------------------- /src/cmd/lower.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::io::SeqQualRecord; 7 | 8 | #[derive(Parser, Clone, Debug)] 9 | #[clap(next_help_heading = "'Lower' command options")] 10 | pub struct LowerCommand { 11 | #[command(flatten)] 12 | pub common: CommonArgs, 13 | } 14 | 15 | pub fn run(mut cfg: Config, _args: LowerCommand) -> CliResult<()> { 16 | let mut format_writer = cfg.get_format_writer()?; 17 | cfg.with_io_writer(|io_writer, mut cfg| { 18 | let mut seq = vec![]; 19 | cfg.read(|record, ctx| { 20 | seq.clear(); 21 | for s in record.seq_segments() { 22 | seq.extend(s.iter().cloned().map(|ref mut b| { 23 | b.make_ascii_lowercase(); 24 | *b 25 | })); 26 | } 27 | let ucase_rec = SeqQualRecord::new(&record, &seq, None); 28 | format_writer.write(&ucase_rec, io_writer, ctx)?; 29 | Ok(true) 30 | }) 31 | }) 32 | } 33 | -------------------------------------------------------------------------------- /src/cmd/upper.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::io::SeqQualRecord; 7 | 8 | #[derive(Parser, Clone, Debug)] 9 | #[clap(next_help_heading = "'Upper' command options")] 10 | pub struct UpperCommand { 11 | #[command(flatten)] 12 | pub common: CommonArgs, 13 | } 14 | 15 | pub fn run(mut cfg: Config, _args: 
UpperCommand) -> CliResult<()> { 16 | let mut format_writer = cfg.get_format_writer()?; 17 | cfg.with_io_writer(|io_writer, mut cfg| { 18 | let mut seq = vec![]; 19 | cfg.read(|record, ctx| { 20 | seq.clear(); 21 | for s in record.seq_segments() { 22 | seq.extend(s.iter().cloned().map(|ref mut b| { 23 | b.make_ascii_uppercase(); 24 | *b 25 | })); 26 | } 27 | let ucase_rec = SeqQualRecord::new(&record, &seq, None); 28 | format_writer.write(&ucase_rec, io_writer, ctx)?; 29 | Ok(true) 30 | }) 31 | }) 32 | } 33 | -------------------------------------------------------------------------------- /src/test/stat.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | 3 | use super::*; 4 | 5 | #[test] 6 | fn stats() { 7 | let seq = ">seq\nATGC-NYA\n"; 8 | let retval = "seq\t8\t7\t40\t0.4\t2\t3\n"; 9 | let vars = "seqlen,ungapped_seqlen,gc_percent,gc,charcount(A),charcount(AT)"; 10 | #[cfg(any(feature = "all-commands", feature = "pass"))] 11 | cmp(&[".", "--to-tsv", &format!("id,{vars}")], seq, retval); 12 | cmp(&["stat", vars], seq, retval); 13 | } 14 | 15 | #[test] 16 | fn qualstat() { 17 | cmp( 18 | &["stat", "--fq", "exp_err"], 19 | format!("@id\nAAA\n+\n{}\n", str::from_utf8(&[33, 43, 53]).unwrap()), 20 | "id\t1.11\n", 21 | ); 22 | cmp( 23 | &["stat", "--fq-illumina", "exp_err"], 24 | format!("@id\nAAA\n+\n{}\n", str::from_utf8(&[64, 74, 84]).unwrap()), 25 | "id\t1.11\n", 26 | ); 27 | fails( 28 | &["stat", "--fq", "exp_err"], 29 | format!("@id\nA\n+\n{}\n", str::from_utf8(&[32]).unwrap()), 30 | "Invalid quality", 31 | ); 32 | fails(&["stat", "exp_err"], ">seq\nAA", "No quality scores"); 33 | } 34 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | /* 2 | Fast and flexible tool for reading, modifying and writing biological sequences 3 | */ 4 | 5 | // suppress warnings unless most features are 
used 6 | #![cfg_attr(not(feature = "default"), allow(warnings, unused))] 7 | 8 | #[macro_use] 9 | extern crate seq_io; 10 | 11 | use crate::cli::Cli; 12 | use crate::config::Config; 13 | 14 | use self::error::*; 15 | use std::process; 16 | 17 | #[macro_use] 18 | mod helpers; 19 | #[macro_use] 20 | mod error; 21 | mod cli; 22 | mod cmd; 23 | mod config; 24 | mod context; 25 | mod io; 26 | mod var; 27 | 28 | #[cfg(test)] 29 | mod test; 30 | 31 | fn main() { 32 | let res = Cli::new().and_then(|cli| cli.run()); 33 | match res { 34 | // normal exit 35 | Ok(()) => {} 36 | Err(CliError::Io(e)) => { 37 | if e.kind() != std::io::ErrorKind::BrokenPipe { 38 | exit(&format!("{e}"), 1) 39 | } 40 | } 41 | Err(e) => exit(&format!("{e}"), 1), 42 | } 43 | } 44 | 45 | fn exit(msg: &str, code: i32) { 46 | eprintln!("{msg}"); 47 | process::exit(code); 48 | } 49 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Markus Schlegel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/cmd/interleave.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::{CommonArgs, WORDY_HELP}; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | 7 | pub const DESC: &str = "\ 8 | The records are returned in the same order as in the input files."; 9 | 10 | #[derive(Parser, Clone, Debug)] 11 | #[clap(next_help_heading = "'Interleave' command options")] 12 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 13 | pub struct InterleaveCommand { 14 | /// Don't check if the IDs of the files match 15 | #[arg(short, long)] 16 | no_id_check: bool, 17 | 18 | #[command(flatten)] 19 | pub common: CommonArgs, 20 | } 21 | 22 | pub fn run(mut cfg: Config, args: InterleaveCommand) -> CliResult<()> { 23 | let id_check = !args.no_id_check; 24 | 25 | let mut format_writer = cfg.get_format_writer()?; 26 | cfg.with_io_writer(|io_writer, mut cfg| { 27 | cfg.read_alongside(id_check, |_, rec, ctx| { 28 | // handle variables (read_alongside requires this to be done manually) 29 | ctx.set_record(&rec, 0)?; 30 | format_writer.write(rec, io_writer, ctx)?; 31 | Ok(true) 32 | }) 33 | }) 34 | } 35 | -------------------------------------------------------------------------------- /profile/fastq_urls.txt: -------------------------------------------------------------------------------- 1 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/079/ERR12573579/ERR12573579_1.fastq.gz 2 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/096/ERR12551596/ERR12551596_1.fastq.gz 3 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/003/ERR12551603/ERR12551603_1.fastq.gz 4 | 
ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/005/ERR12551605/ERR12551605_1.fastq.gz 5 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/009/ERR12551609/ERR12551609_1.fastq.gz 6 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/091/ERR12551691/ERR12551691_1.fastq.gz 7 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/094/ERR12551694/ERR12551694_1.fastq.gz 8 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/095/ERR12551695/ERR12551695_1.fastq.gz 9 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/097/ERR12551697/ERR12551697_1.fastq.gz 10 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/099/ERR12551699/ERR12551699_1.fastq.gz 11 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/002/ERR12551702/ERR12551702_1.fastq.gz 12 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/004/ERR12551704/ERR12551704_1.fastq.gz 13 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/007/ERR12551707/ERR12551707_1.fastq.gz 14 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/011/ERR12551711/ERR12551711_1.fastq.gz 15 | ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/063/ERR12551763/ERR12551763_1.fastq.gz 16 | -------------------------------------------------------------------------------- /src/helpers/write_list.rs: -------------------------------------------------------------------------------- 1 | //use std::ops::Deref; 2 | use std::{convert::AsRef, io}; 3 | 4 | /// Writes an iterator of text slices as delimited list to the output. 5 | /// Returns true if the list is not empty 6 | pub fn write_list(list: L, sep: &[u8], out: &mut W) -> io::Result 7 | where 8 | L: IntoIterator, 9 | I: AsRef<[u8]>, 10 | W: io::Write + ?Sized, 11 | { 12 | write_list_with(list, sep, out, |item, o| o.write_all(item.as_ref())) 13 | } 14 | 15 | /// Writes an iterator of values as delimited list to the output 16 | /// using a custom writing function. 
17 | /// Returns true if the list is not empty 18 | #[inline] 19 | pub fn write_list_with( 20 | list: L, 21 | sep: &[u8], 22 | out: &mut W, 23 | mut write_fn: F, 24 | ) -> io::Result 25 | where 26 | L: IntoIterator, 27 | W: io::Write + ?Sized, 28 | F: FnMut(I, &mut W) -> io::Result<()>, 29 | { 30 | let mut first = true; 31 | for item in list { 32 | if first { 33 | first = false; 34 | } else { 35 | out.write_all(sep)?; 36 | } 37 | write_fn(item, out)?; 38 | } 39 | Ok(!first) 40 | } 41 | -------------------------------------------------------------------------------- /src/cmd/find/ambig.rs: -------------------------------------------------------------------------------- 1 | // according to IUPAC https://iubmb.qmul.ac.uk/misc/naseq.html#500, Table 1 2 | // whereby ambiguity codes completely contained in another are also included 3 | // (e.g. V matches M, R and S in addition to A, C and G) 4 | pub static AMBIG_DNA: &[(u8, &[u8])] = &[ 5 | (b'M', b"AC"), 6 | (b'R', b"AG"), 7 | (b'W', b"AT"), 8 | (b'S', b"CG"), 9 | (b'Y', b"CT"), 10 | (b'K', b"GT"), 11 | (b'V', b"ACGMRS"), 12 | (b'H', b"ACTMWY"), 13 | (b'D', b"AGTRWK"), 14 | (b'B', b"CGTSYK"), 15 | (b'N', b"ACGTMRWSYKVHDB"), 16 | ]; 17 | 18 | // same as DNA, T -> U 19 | pub static AMBIG_RNA: &[(u8, &[u8])] = &[ 20 | (b'M', b"AC"), 21 | (b'R', b"AG"), 22 | (b'W', b"AU"), 23 | (b'S', b"CG"), 24 | (b'Y', b"CU"), 25 | (b'K', b"GU"), 26 | (b'V', b"ACGMRS"), 27 | (b'H', b"ACUMWY"), 28 | (b'D', b"AGURWK"), 29 | (b'B', b"CGUSYK"), 30 | (b'N', b"ACGUMRWSYKVHDB"), 31 | ]; 32 | 33 | // according to IUPAC, https://iupac.qmul.ac.uk/AminoAcid/A2021.html#AA212 34 | pub static AMBIG_PROTEIN: &[(u8, &[u8])] = &[ 35 | (b'B', b"DN"), 36 | // note: B and Z are matched by X as well 37 | (b'X', b"ARNDCEQGHILKMFPSTWYVBZ"), 38 | (b'Z', b"EQ"), 39 | ]; 40 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 
/// `pass --append` must add to an existing output file instead of
/// truncating it: after the n-th run the file holds n copies of the input.
#[test]
fn append() {
    with_tmpdir("st_pass_append_", |td| {
        let fa = ">seq\nATGC\n";
        let out = td.path("pass_append_out.fasta");
        for n_runs in 1..=3 {
            succeeds(&["pass", "--append", "-o", &out], fa);
            assert_eq!(&out.content(), &fa.repeat(n_runs));
        }
    });
}
fa_wrap, fa); 31 | cmp(&[".", "--wrap", "2"], fa, fa_wrap); 32 | cmp(&[".", "--wrap", "3"], fa_wrap, fa_wrap3); 33 | } 34 | 35 | #[test] 36 | fn pass_pipe() { 37 | cmp_pipe(&["."], &FASTA, &["."], &FASTA); 38 | } 39 | 40 | #[test] 41 | fn thread_io() { 42 | cmp(&[".", "-T", "--write-thread"], &*FASTA, &FASTA); 43 | } 44 | -------------------------------------------------------------------------------- /src/var/modules/expr/js/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::DefaultHashMap as HashMap; 2 | use crate::var::VarBuilder; 3 | 4 | use self::parser::SimpleAst; 5 | 6 | use super::{ExprContext, Expression, Var}; 7 | 8 | mod expr; 9 | pub mod parser; 10 | 11 | pub use self::expr::*; 12 | 13 | pub fn replace_register_vars( 14 | ast: &SimpleAst, 15 | b: &mut VarBuilder, 16 | ) -> Result<(String, Vec), String> { 17 | let mut vars = HashMap::default(); 18 | let new_code = ast.rewrite(|func| { 19 | b.register_var(func.name, func.args()).map(|res| { 20 | res.map(|(symbol_id, _)| { 21 | // get unique placeholder variable name (function arguments are hashed) 22 | let js_varname = if func.args().is_empty() { 23 | func.name.to_string() 24 | } else { 25 | format!("{}_{}", func.name, symbol_id) 26 | }; 27 | vars.insert(symbol_id, js_varname.clone()); 28 | js_varname 29 | }) 30 | }) 31 | })?; 32 | // dbg!(ast, &new_code); 33 | Ok(( 34 | new_code, 35 | vars.into_iter() 36 | .map(|(symbol_id, name)| Var { symbol_id, name }) 37 | .collect(), 38 | )) 39 | } 40 | -------------------------------------------------------------------------------- /src/cmd/find/matcher/exact.rs: -------------------------------------------------------------------------------- 1 | use memchr::memmem::Finder; 2 | 3 | use super::{Hit, Match, Matcher}; 4 | 5 | #[derive(Debug, Clone)] 6 | pub struct ExactMatcher { 7 | finder: Finder<'static>, 8 | pattern_len: usize, 9 | } 10 | 11 | impl ExactMatcher { 12 | pub fn new(pattern: &[u8]) -> Self { 
13 | Self { 14 | finder: Finder::new(pattern).into_owned(), 15 | pattern_len: pattern.len(), 16 | } 17 | } 18 | } 19 | 20 | impl Matcher for ExactMatcher { 21 | fn has_matches(&self, text: &[u8]) -> Result { 22 | Ok(self.finder.find_iter(text).next().is_some()) 23 | } 24 | 25 | fn do_search( 26 | &mut self, 27 | text: &[u8], 28 | func: &mut dyn FnMut(&dyn Hit) -> Result, 29 | ) -> Result<(), String> { 30 | for start in self.finder.find_iter(text) { 31 | if !func(&(start, start + self.pattern_len))? { 32 | break; 33 | } 34 | } 35 | Ok(()) 36 | } 37 | } 38 | 39 | impl Hit for (usize, usize) { 40 | fn get_group(&self, group: usize, out: &mut Match) -> Result<(), String> { 41 | debug_assert!(group == 0); 42 | out.start = self.0; 43 | out.end = self.1; 44 | Ok(()) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /scripts/parse_varhelp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import re 5 | 6 | prev_line = '' 7 | in_table = False 8 | for line in sys.stdin: 9 | if line.startswith('---'): 10 | prev_line = '### {}\n'.format(re.sub('(.+?Usage: )(.*)', '\\1`\\2`', prev_line)) 11 | line = next(sys.stdin) 12 | if line[0].islower(): 13 | prev_line = prev_line + '\n| variable | description |\n| - | - ' 14 | in_table = True 15 | 16 | if in_table: 17 | if line.startswith('Example') or len(line) == 0: 18 | in_table = False 19 | else: 20 | #line = line[1:] 21 | if line[0] != ' ': 22 | prev_line += '|\n' 23 | s = line.strip().split(' ', 1) 24 | if len(s) == 2: 25 | name, desc = s 26 | line = '| {} | {} '.format(name, desc) 27 | else: 28 | prev_line += ' ' + line.strip() 29 | continue 30 | 31 | if line.startswith('Example'): 32 | line = '#### {}\n\n'.format(line.strip()) 33 | 34 | elif line.startswith('> '): 35 | line = '\n```bash\n{}```\n\n'.format(line[2:]) 36 | 37 | print(prev_line, end='') 38 | prev_line = line 39 | 40 | print(prev_line, 
end='') 41 | -------------------------------------------------------------------------------- /src/cmd/stat.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::{CommonArgs, WORDY_HELP}; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::io::output::OutFormat; 7 | 8 | use super::pass::{self, PassCommand}; 9 | 10 | pub const DESC: &str = "\ 11 | Sequence statistics variables (seqlen, exp_err, charcount(...), etc.) 12 | are supplied as comma-delimited list, e.g. `id,seqlen,exp_err`. 13 | The stat command is equivalent to `st pass --to-tsv 'id,var1,var2,...' input` 14 | 15 | See `st stat -V/--help-vars` for a list of all possible variables."; 16 | 17 | #[derive(Parser, Clone, Debug)] 18 | #[clap(next_help_heading = "'Stat' command options")] 19 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 20 | pub struct StatCommand { 21 | /// Comma delimited list of statistics variables. 22 | #[arg(value_name = "VAR")] 23 | vars: String, 24 | 25 | #[command(flatten)] 26 | pub common: CommonArgs, 27 | } 28 | 29 | pub fn run(mut cfg: Config, args: StatCommand) -> CliResult<()> { 30 | let cmd = PassCommand { 31 | common: args.common, 32 | }; 33 | let fields = "id,".to_string() + &args.vars; 34 | cfg.output_config.format = OutFormat::DelimitedText { 35 | fields, 36 | delim: b'\t', 37 | }; 38 | pass::run(cfg, cmd) 39 | } 40 | -------------------------------------------------------------------------------- /src/test/compress.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn compress_pipe() { 5 | #[cfg(feature = "gz")] 6 | cmp_pipe( 7 | &[".", "--to", "fasta.gz", "--compr-level", "9"], 8 | &FASTA, 9 | &[".", "--fmt", "fasta.gz"], 10 | &FASTA, 11 | ); 12 | 13 | #[cfg(feature = "bz2")] 14 | cmp_pipe( 15 | &[".", "--to", "fasta.bz2", "--compr-level", "9"], 16 | &FASTA, 17 | &[".", "--fmt", "fasta.bz2"], 18 
| &FASTA, 19 | ); 20 | 21 | #[cfg(feature = "lz4")] 22 | cmp_pipe( 23 | &[".", "--to", "fasta.lz4", "--compr-level", "9"], 24 | &FASTA, 25 | &[".", "--fmt", "fasta.lz4"], 26 | &FASTA, 27 | ); 28 | 29 | #[cfg(feature = "zstd")] 30 | cmp_pipe( 31 | &[".", "--to", "fasta.zst", "--compr-level", "9"], 32 | &FASTA, 33 | &[".", "--fmt", "fasta.zst"], 34 | &FASTA, 35 | ); 36 | } 37 | 38 | #[test] 39 | #[cfg(feature = "gz")] 40 | fn compress_file() { 41 | with_tmpdir("st_compress_", |td| { 42 | let f = td.path("compr_out.fa.gz"); 43 | succeeds(&[".", "-o", &f], &*FASTA); 44 | fails(&[".", "--fmt", "fasta"], &f, "FASTA parse error"); 45 | cmp(&["."], &f, &FASTA); 46 | cmp(&[".", "--fmt", "fasta.gz"], &f, &FASTA); 47 | }); 48 | } 49 | -------------------------------------------------------------------------------- /src/io/input/reader.rs: -------------------------------------------------------------------------------- 1 | use crate::error::CliResult; 2 | use crate::io::Record; 3 | 4 | /// Trait for reading sequence records 5 | pub trait SeqReader { 6 | /// Reads the next record and provides it in a closure. 7 | /// The closure may return `false` to indicate that reading should stop. 8 | /// Returns `Some(Ok(do_stop))` if a record was found, otherwise `None` 9 | fn read_next_conditional( 10 | &mut self, 11 | func: &mut dyn FnMut(&dyn Record) -> CliResult, 12 | ) -> Option>; 13 | 14 | /// Reads the next record and returns `true` if it was found. 15 | /// There is no way the closure can signal back that the reading should stop. 
/// A header attribute given as `name=value`.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Attribute {
    /// Everything before the first `=`
    pub name: String,
    /// Everything after the first `=` (may itself contain `=`)
    pub value: String,
}

impl FromStr for Attribute {
    type Err = String;

    /// Parses `name=value`, splitting at the *first* `=` only.
    ///
    /// # Errors
    ///
    /// Returns a message if `s` contains no `=`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.split_once('=') {
            Some((name, value)) => Ok(Attribute {
                name: name.to_string(),
                value: value.to_string(),
            }),
            None => Err(format!(
                "Invalid attribute: '{s}'. Attributes need to be in the format: name=value"
            )),
        }
    }
}
#[derive(Debug, Default)]
pub struct VecFactory {
    /// Initial capacity given to the next buffer
    capacity: usize,
    /// Largest buffer length observed in the current interval
    interval_max: usize,
    /// Number of buffers handed out in the current interval
    counter: u16,
}

impl VecFactory {
    /// Number of `get()` calls after which the capacity is recalculated
    const RECALC_INTERVAL: u16 = 1000;

    /// Creates a factory whose first buffer starts with zero capacity.
    pub fn new() -> VecFactory {
        Self::default()
    }

    /// Returns a new vector filled by `write_fn`.
    ///
    /// The vector is pre-allocated with the current capacity estimate and
    /// shrunk to its final length before being returned. The estimate grows
    /// immediately whenever a longer buffer is seen. Every
    /// `RECALC_INTERVAL` calls it is reset to the largest length observed
    /// within that interval, so a single unusually large buffer does not
    /// inflate all future allocations forever.
    ///
    /// # Errors
    ///
    /// An error returned by `write_fn` is passed on unchanged; the capacity
    /// statistics are not updated in that case.
    pub fn get<F, E>(&mut self, mut write_fn: F) -> Result<Vec<u8>, E>
    where
        F: FnMut(&mut Vec<u8>) -> Result<(), E>,
    {
        let mut v = Vec::with_capacity(self.capacity);
        write_fn(&mut v)?;
        v.shrink_to_fit();

        let len = v.len();
        self.interval_max = self.interval_max.max(len);
        self.capacity = self.capacity.max(len);

        // every call counts towards the interval, not only the ones that
        // caused the capacity to grow
        self.counter += 1;
        if self.counter >= Self::RECALC_INTERVAL {
            self.capacity = self.interval_max;
            self.interval_max = 0;
            self.counter = 0;
        }
        Ok(v)
    }
}
55 | cargo test -q -j $cores 56 | 57 | # single feature 58 | for feature in ${features[@]}; do 59 | echo "===== Feature(s) '$feature' ======================" 60 | echo -n "build... " 61 | cargo build -q -j $cores --no-default-features --features=$feature 62 | echo "test..." 63 | cargo test -q -j $cores --no-default-features --features=$feature 64 | done 65 | -------------------------------------------------------------------------------- /src/io/output/writer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use crate::context::{RecordMeta, SeqContext}; 4 | use crate::error::CliResult; 5 | use crate::io::{QualConverter, Record}; 6 | 7 | pub trait SeqFormatter { 8 | /// Write a formatted record to `out`, given the metadata in `ctx`. 9 | /// This is a convenience wrapper around `write_with`, which allows directly 10 | /// providing `SeqContext`. 11 | fn write( 12 | &mut self, 13 | record: &dyn Record, 14 | out: &mut dyn io::Write, 15 | ctx: &mut SeqContext, 16 | ) -> CliResult<()> { 17 | self.write_with(record, &ctx.meta[0], out, &mut ctx.qual_converter) 18 | } 19 | 20 | /// Write a formatted record to `out`, given all necessary metadata. 
21 | fn write_with( 22 | &mut self, 23 | record: &dyn Record, 24 | data: &RecordMeta, 25 | out: &mut dyn io::Write, 26 | qc: &mut QualConverter, 27 | ) -> CliResult<()>; 28 | } 29 | 30 | impl SeqFormatter for Box { 31 | fn write( 32 | &mut self, 33 | record: &dyn Record, 34 | out: &mut dyn io::Write, 35 | ctx: &mut SeqContext, 36 | ) -> CliResult<()> { 37 | (**self).write(record, out, ctx) 38 | } 39 | 40 | fn write_with( 41 | &mut self, 42 | record: &dyn Record, 43 | data: &RecordMeta, 44 | out: &mut dyn io::Write, 45 | qc: &mut QualConverter, 46 | ) -> CliResult<()> { 47 | (**self).write_with(record, data, out, qc) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/io/output/fasta.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use seq_io::fasta; 4 | 5 | use crate::context::RecordMeta; 6 | use crate::error::CliResult; 7 | use crate::io::QualConverter; 8 | use crate::var::VarBuilder; 9 | 10 | use crate::io::{ 11 | output::{fastx::register_attributes, SeqFormatter}, 12 | Record, 13 | }; 14 | 15 | use super::fastx::Attribute; 16 | 17 | pub struct FastaWriter { 18 | wrap: Option, 19 | } 20 | 21 | impl FastaWriter { 22 | pub fn new( 23 | wrap: Option, 24 | attrs: &[(Attribute, bool)], 25 | builder: &mut VarBuilder, 26 | ) -> CliResult { 27 | register_attributes(attrs, builder)?; 28 | Ok(Self { wrap }) 29 | } 30 | } 31 | 32 | impl SeqFormatter for FastaWriter { 33 | fn write_with( 34 | &mut self, 35 | record: &dyn Record, 36 | data: &RecordMeta, 37 | out: &mut dyn io::Write, 38 | _qc: &mut QualConverter, 39 | ) -> CliResult<()> { 40 | write_fasta(record, data, out, self.wrap) 41 | } 42 | } 43 | 44 | fn write_fasta( 45 | record: &dyn Record, 46 | data: &RecordMeta, 47 | mut out: W, 48 | wrap: Option, 49 | ) -> CliResult<()> { 50 | out.write_all(b">")?; 51 | data.attrs.write_head(record, &mut out, &data.symbols)?; 52 | out.write_all(b"\n")?; 53 | if let 
Some(w) = wrap { 54 | fasta::write_wrap_seq_iter(&mut out, record.seq_segments(), w)?; 55 | } else { 56 | fasta::write_seq_iter(&mut out, record.seq_segments())?; 57 | } 58 | Ok(()) 59 | } 60 | -------------------------------------------------------------------------------- /src/cmd/del.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::io::HeaderRecord; 7 | use crate::var::attr; 8 | 9 | #[derive(Parser, Clone, Debug)] 10 | #[clap(next_help_heading = "'Del' command options")] 11 | pub struct DelCommand { 12 | /// Delete description fields 13 | #[arg(short, long)] 14 | desc: bool, 15 | 16 | /// Delete attributes 17 | #[arg(long, value_delimiter = ',')] 18 | attrs: Option>, 19 | 20 | #[command(flatten)] 21 | pub common: CommonArgs, 22 | } 23 | 24 | pub fn run(mut cfg: Config, args: DelCommand) -> CliResult<()> { 25 | let del_desc = args.desc; 26 | let del_attrs = args.attrs.as_deref(); 27 | 28 | let mut format_writer = cfg.get_format_writer()?; 29 | cfg.with_io_writer(|io_writer, mut cfg| { 30 | if let Some(attrs) = del_attrs { 31 | cfg.build_vars(|b| { 32 | for attr in attrs { 33 | b.register_attr(attr, Some(attr::AttrWriteAction::Delete))?; 34 | } 35 | Ok::<_, String>(()) 36 | })?; 37 | } 38 | 39 | cfg.read(|record, ctx| { 40 | if del_desc { 41 | let id = record.id(); 42 | let record = HeaderRecord::new(&record, id, None); 43 | format_writer.write(&record, io_writer, ctx)?; 44 | } else { 45 | format_writer.write(&record, io_writer, ctx)?; 46 | } 47 | Ok(true) 48 | }) 49 | }) 50 | } 51 | -------------------------------------------------------------------------------- /src/io/output/csv.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use super::{Record, SeqFormatter}; 4 | use crate::context::RecordMeta; 5 | use crate::io::QualConverter; 6 
/// Source or destination of sequence data: standard input/output or a file.
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum IoKind {
    /// STDIN/STDOUT, written as `-`
    Stdio,
    /// A file at the given path
    File(PathBuf),
}

impl IoKind {
    /// Creates an `IoKind` from a path; the path `-` is interpreted as
    /// [`IoKind::Stdio`].
    ///
    /// # Errors
    ///
    /// Returns a message if the path is not valid UTF-8.
    pub fn from_path<P: AsRef<Path>>(p: P) -> Result<Self, String> {
        let p = p.as_ref();
        match p.to_str() {
            Some(s) => Ok(Self::from(s)),
            None => Err(format!("Invalid path: '{}'", p.to_string_lossy())),
        }
    }
}

impl FromStr for IoKind {
    type Err = Infallible;

    /// `-` becomes [`IoKind::Stdio`], everything else a file path.
    /// This conversion never fails.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "-" {
            Ok(Self::Stdio)
        } else {
            Ok(Self::File(s.into()))
        }
    }
}

impl<S> From<S> for IoKind
where
    S: AsRef<str>,
{
    /// Same interpretation as [`FromStr`]: `-` is STDIN/STDOUT.
    fn from(s: S) -> Self {
        match s.as_ref() {
            "-" => Self::Stdio,
            path => Self::File(path.into()),
        }
    }
}

impl fmt::Display for IoKind {
    /// Writes `-` for [`IoKind::Stdio`] and the (lossily converted) path
    /// for files.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::Stdio => write!(f, "-"),
            Self::File(p) => write!(f, "{}", p.as_path().to_string_lossy()),
        }
    }
}
cfg.has_stdin() { 29 | return fail!("Cannot use STDIN as input, since we need to count all sequences before"); 30 | } 31 | 32 | let mut format_writer = cfg.get_format_writer()?; 33 | cfg.with_io_writer(|io_writer, mut cfg| { 34 | // first count the sequences 35 | // TODO: use .fai files once supported? 36 | let mut n = 0; 37 | 38 | cfg.read(|_, _| { 39 | n += 1; 40 | Ok(true) 41 | })?; 42 | 43 | let mut i = 0; 44 | let select_from = max(n, n_select) - n_select; 45 | 46 | cfg.read(|record, ctx| { 47 | i += 1; 48 | if i > select_from { 49 | format_writer.write(&record, io_writer, ctx)?; 50 | } 51 | Ok(true) 52 | }) 53 | }) 54 | } 55 | -------------------------------------------------------------------------------- /var_provider/src/usage.rs: -------------------------------------------------------------------------------- 1 | //! Types and functions providing/handling variable/function usage information 2 | 3 | use itertools::Itertools; 4 | 5 | use crate::VarType; 6 | 7 | #[cold] 8 | pub(crate) fn usage_list(info: &FuncUsage) -> Vec { 9 | let n_args = info.args.len(); 10 | let n_required = info 11 | .args 12 | .iter() 13 | .position(|arg| arg.default_value.is_some()) 14 | .unwrap_or(info.args.len()); 15 | let mut out = Vec::with_capacity(1 + n_args - n_required); 16 | if n_required == 0 { 17 | out.push(info.name.to_string()); 18 | } 19 | if n_args > 0 { 20 | for i in n_required.clamp(1, n_args)..n_args + 1 { 21 | out.push(format!( 22 | "{}({})", 23 | info.name, 24 | info.args[..i].iter().map(|u| u.name).join(", ") 25 | )); 26 | } 27 | } 28 | // dbg!(info, n_args, n_required, &out); 29 | out 30 | } 31 | 32 | #[derive(Debug)] 33 | pub struct FuncUsage { 34 | pub name: &'static str, 35 | // multiple argument collections possible 36 | // (different usage patterns) 37 | pub args: &'static [ArgUsage], 38 | pub description: &'static str, 39 | pub output_type: Option, 40 | pub hidden: bool, 41 | } 42 | 43 | #[derive(Debug)] 44 | pub struct ArgUsage { 45 | pub name: &'static 
str, 46 | // the default value is always specified as &str in the usage string (for the help page), even 47 | // though from_func() will parse it further 48 | pub default_value: Option<&'static str>, 49 | } 50 | 51 | #[derive(Debug)] 52 | pub struct UsageExample { 53 | pub description: &'static str, 54 | pub command: &'static str, 55 | pub output: Option<&'static str>, 56 | } 57 | -------------------------------------------------------------------------------- /src/io/input/fastx.rs: -------------------------------------------------------------------------------- 1 | use std::cell::Cell; 2 | 3 | use memchr::memchr; 4 | use seq_io::policy::BufPolicy; 5 | 6 | #[derive(Default, Clone, Debug)] 7 | pub struct FastxHeaderParser { 8 | delim_pos: Cell>>, 9 | } 10 | 11 | impl FastxHeaderParser { 12 | // #[inline(always)] 13 | pub fn id_desc<'a>(&self, head: &'a [u8]) -> (&'a [u8], Option<&'a [u8]>) { 14 | if self.delim_pos.get().is_none() { 15 | self.delim_pos.set(Some(memchr(b' ', head))); 16 | } 17 | Self::_split_header(head, self.delim_pos.get().unwrap()) 18 | } 19 | 20 | fn _split_header(head: &[u8], delim: Option) -> (&[u8], Option<&[u8]>) { 21 | if let Some(d) = delim { 22 | let (id, desc) = head.split_at(d); 23 | (id, Some(&desc[1..])) 24 | } else { 25 | (head, None) 26 | } 27 | } 28 | 29 | pub fn parsed_id_desc<'a>(&self, head: &'a [u8]) -> Option<(&'a [u8], Option<&'a [u8]>)> { 30 | self.delim_pos.get().map(|d| Self::_split_header(head, d)) 31 | } 32 | 33 | pub fn delim_pos(&self) -> Option> { 34 | self.delim_pos.get() 35 | } 36 | 37 | pub fn set_delim_pos(&self, delim_pos: Option>) { 38 | self.delim_pos.set(delim_pos); 39 | } 40 | } 41 | 42 | #[derive(Clone)] 43 | pub struct LimitedBuffer { 44 | pub double_until: usize, 45 | pub limit: usize, 46 | } 47 | 48 | impl BufPolicy for LimitedBuffer { 49 | fn grow_to(&mut self, current_size: usize) -> Option { 50 | if current_size < self.double_until { 51 | Some(current_size * 2) 52 | } else if current_size < self.limit { 
/// Returns the source code of an expression that is either given directly
/// or as a reference to a script file (`file:<path>`).
///
/// Surrounding whitespace of `expr` is ignored, and so is whitespace
/// between the `file:` prefix and the path. Code given directly is returned
/// as a borrowed slice of the input; file contents are returned as an
/// owned string.
///
/// # Errors
///
/// Returns a message containing the path and the I/O error if the script
/// file cannot be read.
pub fn code_or_file(expr: &'_ str) -> Result<Cow<'_, str>, String> {
    let expr = expr.trim();
    if let Some(path) = expr.strip_prefix("file:") {
        let path = path.trim_start();
        return read_to_string(path)
            .map(Cow::Owned)
            .map_err(|e| format!("Unable to read script file '{path}': {e}"));
    }
    Ok(Cow::Borrowed(expr))
}
-------------------------------------------------------------------------------- /src/test/concat.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn concat() { 5 | with_tmpdir("st_concat_", |td| { 6 | let input = td.multi_file( 7 | ".fastq", 8 | [ 9 | "@id1\nAAA\n+\nAAA\n@id2\nAAA\n+\nAAA\n", 10 | "@id1\nBBB\n+\nBBB\n@id2\nBBB\n+\nBBB\n", 11 | "@id1\nCCC\n+\nCCC\n@id2\nCCC\n+\nCCC\n", 12 | ], 13 | ); 14 | 15 | cmp( 16 | &["concat"], 17 | &input, 18 | "@id1\nAAABBBCCC\n+\nAAABBBCCC\n@id2\nAAABBBCCC\n+\nAAABBBCCC\n", 19 | ); 20 | cmp( 21 | &["concat", "-s2"], 22 | &input, 23 | "@id1\nAAANNBBBNNCCC\n+\nAAAJJBBBJJCCC\n@id2\nAAANNBBBNNCCC\n+\nAAAJJBBBJJCCC\n", 24 | ); 25 | cmp( 26 | &["concat", "-s2", "-c", "-", "--q-char", "~"], 27 | &input, 28 | "@id1\nAAA--BBB--CCC\n+\nAAA~~BBB~~CCC\n@id2\nAAA--BBB--CCC\n+\nAAA~~BBB~~CCC\n", 29 | ); 30 | 31 | // id mismatch 32 | fails( 33 | &["concat"], 34 | td.multi_file(".fasta", [">id1\nATG", ">id\nATG"]), 35 | "ID of record #2 (id) does not match the ID of the first one (id1)", 36 | ); 37 | 38 | // too few records in second input 39 | fails( 40 | &["concat"], 41 | td.multi_file(".fasta", [">id1\nATG\n>id2\nA", ">id1\nATG"]), 42 | "The number of records in input #2 does not match the number of records in input #1", 43 | ); 44 | 45 | // too many records in second input 46 | fails( 47 | &["concat"], 48 | td.multi_file(".fasta", [">id1\nATG", ">id1\nATG\n>id2\nA"]), 49 | "The number of records in input #2 does not match the number of records in input #1", 50 | ); 51 | }); 52 | } 53 | -------------------------------------------------------------------------------- /var_provider/src/func.rs: -------------------------------------------------------------------------------- 1 | //! Functions related to constructing a variable/function enum type with its arguments. 
/// Conversion of a single function argument of type `A` into the type
/// expected by a variable/function.
pub trait FromArg<A>: Sized {
    /// Converts `arg`; `func_name` and `arg_name` are only used for
    /// composing the error message if the conversion fails.
    fn from_arg(func_name: &str, arg_name: &str, arg: A) -> Result<Self, String>;
}

/// String arguments are passed through unchanged (never fails).
impl<'a> FromArg<&'a str> for &'a str {
    fn from_arg(_: &str, _: &str, value: &'a str) -> Result<Self, String> {
        Ok(value)
    }
}

/// Implements `FromArg<&str>` for `$ty` using the conversion `$cnv`.
/// Any conversion error is replaced by the `invalid_value` message.
/// `$what` (a description of the expected value) is accepted but currently
/// not part of the message.
macro_rules! impl_from_arg {
    ($ty:ty, $cnv:expr, $what:expr) => {
        impl FromArg<&str> for $ty {
            fn from_arg(func_name: &str, arg_name: &str, value: &str) -> Result<Self, String> {
                $cnv(value).map_err(|_| invalid_value(func_name, arg_name, value))
            }
        }
    };
}

impl_from_arg!(usize, |s: &str| s.parse(), "an integer number");
impl_from_arg!(f64, |s: &str| s.parse(), "a decimal number");
impl_from_arg!(bool, |s: &str| s.parse(), "a boolean (true/false)");
impl_from_arg!(String, |s: &str| Ok::<_, String>(s.to_string()), "a string");

/// Message for an argument value that could not be converted.
#[inline(never)]
pub fn invalid_value<V: Display>(var_name: &str, arg_name: &str, value: V) -> String {
    format!("Invalid value for argument '{arg_name}' of function '{var_name}': '{value}'")
}

/// Message for a required argument that was not supplied.
#[inline(never)]
pub fn missing_argument(var_name: &str, arg_name: &str) -> String {
    format!("The function '{var_name}' is missing the argument '{arg_name}'")
}

/// Message for a superfluous argument `arg`; `max_args` is the number of
/// arguments the function accepts at most.
#[inline(never)]
pub fn too_many_args<V: Display>(var_name: &str, max_args: usize, arg: V) -> String {
    format!(
        "The function '{}' got an unexpected argument '{}', expecting only {} argument{}",
        var_name,
        arg,
        max_args,
        if max_args == 1 { "" } else { "s" }
    )
}
CliResult = Result; 11 | 12 | #[derive(Debug)] 13 | pub enum CliError { 14 | Io(io::Error), 15 | Other(String), 16 | } 17 | 18 | impl fmt::Display for CliError { 19 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 20 | match *self { 21 | CliError::Io(ref e) => e.fmt(f), 22 | CliError::Other(ref s) => f.write_str(s), 23 | } 24 | } 25 | } 26 | 27 | impl error::Error for CliError { 28 | fn description(&self) -> &str { 29 | "seqtool commandline error" 30 | } 31 | } 32 | 33 | impl From for CliError { 34 | fn from(err: io::Error) -> CliError { 35 | CliError::Io(err) 36 | } 37 | } 38 | 39 | impl<'a> From<&'a str> for CliError { 40 | fn from(err: &'a str) -> CliError { 41 | CliError::Other(err.to_owned()) 42 | } 43 | } 44 | 45 | impl From> for CliError { 46 | fn from(err: csv::IntoInnerError) -> CliError { 47 | CliError::Other(format!("{err}")) 48 | } 49 | } 50 | 51 | macro_rules! from_err(($e:ty) => ( 52 | impl From<$e> for CliError { 53 | fn from(err: $e) -> CliError { 54 | CliError::Other(format!("{err}")) 55 | } 56 | } 57 | )); 58 | 59 | from_err!(String); 60 | from_err!(fmt::Error); 61 | from_err!(seq_io::fasta::Error); 62 | from_err!(seq_io::fastq::Error); 63 | #[cfg(any(feature = "all-commands", feature = "find", feature = "replace"))] 64 | from_err!(regex_lite::Error); 65 | #[cfg(feature = "regex-fast")] 66 | from_err!(regex::Error); 67 | from_err!(Utf8Error); 68 | from_err!(FromUtf8Error); 69 | from_err!(ParseIntError); 70 | from_err!(ParseFloatError); 71 | from_err!(csv::Error); 72 | -------------------------------------------------------------------------------- /src/cmd/sort/vars.rs: -------------------------------------------------------------------------------- 1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType}; 2 | use variable_enum_macro::variable_enum; 3 | 4 | use crate::cmd::shared::key::Key; 5 | use crate::var::{modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder}; 6 | 7 | variable_enum! 
{ 8 | /// # Variables provided by the 'sort' command 9 | /// 10 | /// # Examples 11 | /// 12 | /// Sort by part of the sequence ID, which is obtained using 13 | /// a JavaScript expression. 14 | /// We additionally keep this substring by writing the sort key to a header 15 | /// attribute: 16 | /// 17 | /// `st sort -n '{ id.slice(2, 5) }' -a id_num='{num(key)}' input.fasta` 18 | /// 19 | /// >id001 id_num=1 20 | /// SEQ 21 | /// >id002 id_num=2 22 | /// SEQ 23 | /// (...) 24 | SortVar { 25 | /// The value of the key used for sorting 26 | Key(?), 27 | } 28 | } 29 | 30 | #[derive(Debug, Default)] 31 | pub struct SortVars { 32 | key_id: Option, 33 | } 34 | 35 | impl SortVars { 36 | pub fn set(&mut self, key: &Key, symbols: &mut SymbolTable) { 37 | if let Some(var_id) = self.key_id { 38 | key.write_to_symbol(symbols.get_mut(var_id)); 39 | } 40 | } 41 | } 42 | 43 | impl VarProvider for SortVars { 44 | fn info(&self) -> &dyn DynVarProviderInfo { 45 | &dyn_var_provider!(SortVar) 46 | } 47 | 48 | fn register( 49 | &mut self, 50 | name: &str, 51 | args: &[Arg], 52 | builder: &mut VarBuilder, 53 | ) -> Result)>, String> { 54 | Ok(SortVar::from_func(name, args)?.map(|(var, out_type)| { 55 | let SortVar::Key = var; 56 | let symbol_id = self.key_id.get_or_insert_with(|| builder.increment()); 57 | (*symbol_id, out_type) 58 | })) 59 | } 60 | 61 | fn has_vars(&self) -> bool { 62 | self.key_id.is_some() 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/filter.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[test] 4 | fn filter() { 5 | let fa = ">id\nSEQ\n>id2 a=20\nSEQ\n>id3 a=\nSEQ"; 6 | 7 | cmp( 8 | &["filter", "seqlen > ungapped_seqlen && attr('p') >= 10"], 9 | &*FASTA, 10 | &SEQS[2..].concat(), 11 | ); 12 | cmp(&["filter", "id == 'seq0'"], &*FASTA, SEQS[1]); 13 | cmp(&["filter", "id == undefined"], &*FASTA, ""); 14 | // note: comparison with undefined in 
Javascript returns false, thus only sequences 15 | // with defined attributes are kept 16 | cmp( 17 | &[ 18 | "filter", 19 | "opt_attr('a') != undefined && opt_attr('a') >= 20", 20 | "--to-tsv", 21 | "id", 22 | ], 23 | fa, 24 | "id2\n", 25 | ); 26 | cmp( 27 | &["filter", "opt_attr('a') >= 20", "--to-tsv", "id"], 28 | fa, 29 | "id2\n", 30 | ); 31 | // Javascript Regex: 32 | // currently /regex/ syntax with strings matching any variable/function 33 | // cannot be handled 34 | // cmp( 35 | // &["filter", r"(/id\d+/).test(id)", "--to-tsv", "id"], 36 | // fa, 37 | // "id2\nid3\n", 38 | // ); 39 | cmp( 40 | &[ 41 | "filter", 42 | r"(new RegExp('id\\d+')).test(id)", 43 | "--to-tsv", 44 | "id", 45 | ], 46 | fa, 47 | "id2\nid3\n", 48 | ); 49 | } 50 | 51 | #[test] 52 | fn drop_file() { 53 | with_tmpdir("st_drop_file_", |td| { 54 | let dropped = td.path(".csv"); 55 | let input = "@id1\nSEQ\n+\nJJJ\n@id2\nOTHER\n+\nJJJJJ\n"; 56 | let cmd = &[ 57 | "filter", 58 | "seq != 'SEQ'", 59 | "--fq", 60 | "--to-csv", 61 | "id,seq_num,seq", 62 | "--dropped", 63 | &dropped, 64 | ]; 65 | cmp(cmd, input, "id2,2,OTHER\n"); 66 | assert_eq!(&dropped.content(), "id1,1,SEQ\n"); 67 | }); 68 | } 69 | -------------------------------------------------------------------------------- /src/cmd/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod shared; 2 | 3 | #[cfg(any(feature = "all-commands", feature = "pass", feature = "stat"))] 4 | pub mod pass; 5 | #[cfg(any(feature = "all-commands", feature = "view"))] 6 | pub mod view; 7 | 8 | #[cfg(any(feature = "all-commands", feature = "count"))] 9 | pub mod count; 10 | #[cfg(any(feature = "all-commands", feature = "stat"))] 11 | pub mod stat; 12 | 13 | #[cfg(any(feature = "all-commands", feature = "cmp"))] 14 | pub mod cmp; 15 | #[cfg(any( 16 | all(feature = "expr", feature = "all-commands"), 17 | all(feature = "expr", feature = "filter") 18 | ))] 19 | pub mod filter; 20 | #[cfg(any(feature = "all-commands", 
feature = "head"))] 21 | pub mod head; 22 | #[cfg(any(feature = "all-commands", feature = "interleave"))] 23 | pub mod interleave; 24 | #[cfg(any(feature = "all-commands", feature = "sample"))] 25 | pub mod sample; 26 | #[cfg(any(feature = "all-commands", feature = "slice"))] 27 | pub mod slice; 28 | #[cfg(any(feature = "all-commands", feature = "sort"))] 29 | pub mod sort; 30 | #[cfg(any(feature = "all-commands", feature = "split"))] 31 | pub mod split; 32 | #[cfg(any(feature = "all-commands", feature = "tail"))] 33 | pub mod tail; 34 | #[cfg(any(feature = "all-commands", feature = "unique"))] 35 | pub mod unique; 36 | 37 | #[cfg(any(feature = "all-commands", feature = "concat"))] 38 | pub mod concat; 39 | #[cfg(any(feature = "all-commands", feature = "del"))] 40 | pub mod del; 41 | #[cfg(any(feature = "all-commands", feature = "find"))] 42 | pub mod find; 43 | #[cfg(any(feature = "all-commands", feature = "lower"))] 44 | pub mod lower; 45 | #[cfg(any(feature = "all-commands", feature = "mask"))] 46 | pub mod mask; 47 | #[cfg(any(feature = "all-commands", feature = "replace"))] 48 | pub mod replace; 49 | #[cfg(any(feature = "all-commands", feature = "revcomp"))] 50 | pub mod revcomp; 51 | #[cfg(any(feature = "all-commands", feature = "set"))] 52 | pub mod set; 53 | #[cfg(any(feature = "all-commands", feature = "trim"))] 54 | pub mod trim; 55 | #[cfg(any(feature = "all-commands", feature = "upper"))] 56 | pub mod upper; 57 | -------------------------------------------------------------------------------- /src/cmd/set.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::io::{RecordAttr, RecordEditor}; 7 | use crate::var::varstring::VarString; 8 | 9 | #[derive(Parser, Clone, Debug)] 10 | #[clap(next_help_heading = "'Set' command options")] 11 | pub struct SetCommand { 12 | /// New ID (variables 
allowed) 13 | #[arg(short, long)] 14 | id: Option, 15 | 16 | /// New description (variables allowed) 17 | #[arg(short, long)] 18 | desc: Option, 19 | 20 | /// New sequence (variables allowed) 21 | #[arg(short, long)] 22 | seq: Option, 23 | 24 | #[command(flatten)] 25 | pub common: CommonArgs, 26 | } 27 | 28 | pub fn run(mut cfg: Config, args: SetCommand) -> CliResult<()> { 29 | let mut replacements = vec![]; 30 | if let Some(string) = args.id.as_ref() { 31 | replacements.push((string, RecordAttr::Id)); 32 | } 33 | if let Some(string) = args.desc.as_ref() { 34 | replacements.push((string, RecordAttr::Desc)); 35 | } 36 | if let Some(string) = args.seq.as_ref() { 37 | replacements.push((string, RecordAttr::Seq)); 38 | } 39 | 40 | let mut format_writer = cfg.get_format_writer()?; 41 | cfg.with_io_writer(|io_writer, mut cfg| { 42 | // get String -> VarString 43 | let replacements: Vec<_> = replacements 44 | .iter() 45 | .map(|&(e, attr)| { 46 | let (e, _) = cfg.build_vars(|b| VarString::parse_register(e, b, false))?; 47 | Ok((e, attr)) 48 | }) 49 | .collect::>()?; 50 | 51 | let mut editor = RecordEditor::new(); 52 | 53 | cfg.read(|record, ctx| { 54 | for &(ref expr, attr) in &replacements { 55 | let val = editor.edit(attr); 56 | expr.compose(val, ctx.symbols(), record)?; 57 | } 58 | 59 | format_writer.write(&editor.record(&record), io_writer, ctx)?; 60 | Ok(true) 61 | }) 62 | }) 63 | } 64 | -------------------------------------------------------------------------------- /src/helpers/replace.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | /// Helper function for replacing parts of a given text 4 | /// with a new text and writing the result to an io::Write instance. 5 | /// Requires an iterator over (start, end) positions. 
/// Writes `text` to `out`, with every `(start, end)` byte range yielded by
/// `matches` replaced by `replacement`.
///
/// The ranges must be sorted, non-overlapping and lie within `text`.
///
/// # Errors
///
/// Returns an error of kind `InvalidInput` if a range violates these
/// requirements, and forwards any error of the writer.
#[inline(always)]
pub fn replace_iter<M, W>(
    text: &[u8],
    replacement: &[u8],
    matches: M,
    out: &mut W,
) -> io::Result<()>
where
    M: Iterator<Item = (usize, usize)>,
    W: io::Write + ?Sized,
{
    replace_iter_custom(text, matches, out, |out, _, _| out.write_all(replacement))
}

/// Like `replace_iter`, but with a custom replacement function.
///
/// For every match, `write_replacement` receives the output stream, the
/// matched text and all text following the match, and may write anything to
/// the output. The text between matches and after the last match is copied
/// unchanged.
///
/// # Errors
///
/// Returns an error of kind `InvalidInput` if the ranges are unsorted,
/// overlapping, reversed (`start > end`) or exceed the text length. Errors
/// from the writer or from `write_replacement` are forwarded.
#[inline(always)]
pub fn replace_iter_custom<R, M, W>(
    text: &[u8],
    matches: M,
    out: &mut W,
    mut write_replacement: R,
) -> io::Result<()>
where
    R: FnMut(&mut W, &[u8], &[u8]) -> io::Result<()>,
    M: Iterator<Item = (usize, usize)>,
    W: io::Write + ?Sized,
{
    let mut last_end = 0;
    for (start, end) in matches {
        // Checked slicing: an invalid range is reported as an error instead of
        // aborting the program with a slice-index panic.
        let parts = (
            text.get(last_end..start),
            text.get(start..end),
            text.get(end..),
        );
        let (unmatched, matched, remaining) = match parts {
            (Some(u), Some(m), Some(r)) => (u, m, r),
            _ => return Err(invalid_match_range(start, end, last_end, text.len())),
        };
        out.write_all(unmatched)?;
        write_replacement(out, matched, remaining)?;
        last_end = end;
    }
    // `last_end` was validated against the text length above (or is still 0)
    out.write_all(&text[last_end..])
}

/// Builds the error returned for a match range that cannot be applied.
#[cold]
fn invalid_match_range(start: usize, end: usize, last_end: usize, len: usize) -> io::Error {
    io::Error::new(
        io::ErrorKind::InvalidInput,
        format!(
            "Invalid match range {start}..{end} (previous match ended at {last_end}, \
             text length is {len})"
        ),
    )
}

#[cfg(test)]
mod tests {
    use std::io::Write;

    #[test]
    fn replace_iter() {
        let pos = vec![(1, 2), (4, 6), (7, 8)];
        let text = b"012345678";
        let replaced = b"0x23x6x8";

        let mut out = vec![];
        super::replace_iter_custom(text, pos.iter().cloned(), &mut out, |out, _, _| {
            out.write_all(b"x")
        })
        .unwrap();
        assert_eq!(&out[..], &replaced[..]);

        let mut out = vec![];
        super::replace_iter(text, b"x", pos.iter().cloned(), &mut out).unwrap();
        assert_eq!(&out[..], &replaced[..]);
    }
}
crate::io::{ 9 | output::{fastx::register_attributes, SeqFormatter}, 10 | Record, 11 | }; 12 | 13 | use super::fastx::Attribute; 14 | 15 | pub struct FastqWriter { 16 | format: QualFormat, 17 | } 18 | 19 | impl FastqWriter { 20 | pub fn new( 21 | format: QualFormat, 22 | attrs: &[(Attribute, bool)], 23 | builder: &mut VarBuilder, 24 | ) -> CliResult { 25 | register_attributes(attrs, builder)?; 26 | Ok(Self { format }) 27 | } 28 | } 29 | 30 | impl SeqFormatter for FastqWriter { 31 | fn write_with( 32 | &mut self, 33 | record: &dyn Record, 34 | data: &RecordMeta, 35 | out: &mut dyn io::Write, 36 | qc: &mut QualConverter, 37 | ) -> CliResult<()> { 38 | write_fastq(record, data, out, qc, self.format) 39 | } 40 | } 41 | 42 | fn write_fastq( 43 | record: &dyn Record, 44 | data: &RecordMeta, 45 | mut out: W, 46 | qual_converter: &mut QualConverter, 47 | format: QualFormat, 48 | ) -> CliResult<()> { 49 | // TODO: could use seq_io::fastq::write_to / write_parts, but the sequence is an iterator of segments 50 | let qual = record.qual().ok_or("No quality scores found in input.")?; 51 | 52 | // header 53 | out.write_all(b"@")?; 54 | data.attrs.write_head(record, &mut out, &data.symbols)?; 55 | out.write_all(b"\n")?; 56 | 57 | // sequence 58 | for seq in record.seq_segments() { 59 | out.write_all(seq)?; 60 | } 61 | out.write_all(b"\n+\n")?; 62 | 63 | // quality scores 64 | let qual = qual_converter.convert_to(qual, format).map_err(|e| { 65 | format!( 66 | "Error writing record '{}'. 
{}", 67 | String::from_utf8_lossy(record.id()), 68 | e 69 | ) 70 | })?; 71 | out.write_all(qual)?; 72 | out.write_all(b"\n")?; 73 | 74 | Ok(()) 75 | } 76 | -------------------------------------------------------------------------------- /src/test/split.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use itertools::Itertools; 3 | 4 | use std::str; 5 | 6 | #[test] 7 | fn chunks() { 8 | with_tmpdir("st_split_chunks_", |td| { 9 | for size in 1..5 { 10 | let key = td.persistent_path("f_{chunk}.{default_ext}"); 11 | succeeds(&["split", "-n", &format!("{size}"), "-po", &key], &*FASTA); 12 | 13 | for (i, seqs) in SEQS.iter().chunks(size).into_iter().enumerate() { 14 | let f = td.path(&format!("f_{}.fasta", i + 1)); 15 | assert_eq!(f.content(), seqs.into_iter().join("")); 16 | } 17 | } 18 | }); 19 | } 20 | 21 | #[test] 22 | fn key() { 23 | with_tmpdir("st_split_key_", |td| { 24 | let out_path = td.persistent_path("{id}_{attr(p)}.fasta"); 25 | succeeds(&["split", "-po", &out_path], &*FASTA); 26 | 27 | let expected = &["seq1_2", "seq0_1", "seq3_10", "seq2_11"]; 28 | 29 | for (name, seq) in expected.iter().zip(SEQS) { 30 | let f = td.path(&format!("{name}.fasta")); 31 | assert_eq!(f.content(), seq); 32 | } 33 | }); 34 | } 35 | 36 | #[test] 37 | fn seqlen_count() { 38 | with_tmpdir("st_split_sl_", |td| { 39 | let key = td.persistent_path("{seqlen}.fasta"); 40 | succeeds(&["split", "-o", &key], &*FASTA); 41 | 42 | let out = td.path("25.fasta"); 43 | cmp( 44 | &["split", "-po", &key, "-c", "-"], 45 | &*FASTA, 46 | &format!("{}\t4\n", out.as_str()), 47 | ); 48 | assert_eq!(out.content(), &*FASTA as &str); 49 | }); 50 | } 51 | 52 | #[cfg(feature = "gz")] 53 | #[test] 54 | fn compression() { 55 | with_tmpdir("st_split_compr_", |td| { 56 | let key = td.persistent_path("{id}_{attr(p)}.fasta.gz"); 57 | succeeds(&["split", "-po", &key], &*FASTA); 58 | 59 | let expected = &["seq1_2", "seq0_1", "seq3_10", "seq2_11"]; 60 | 61 | for 
/// Parses a memory size, optionally followed by a unit (B, K, M, G or T,
/// case-insensitive).
///
/// Units are interpreted as powers of 2 (kibibytes, etc.). Surrounding
/// whitespace and whitespace between number and unit are ignored. Decimal
/// numbers are rounded to the nearest integer; values too large for `usize`
/// saturate at `usize::MAX`.
///
/// # Errors
///
/// Returns a message if the string is empty, the number is malformed,
/// negative or not finite, or the unit is unknown.
pub fn parse_bytesize(size: &str) -> Result<usize, String> {
    /// Converts a byte count to `usize`, rejecting values that cannot
    /// represent a size (negative, NaN, infinite).
    fn to_bytes(n: f64) -> Option<usize> {
        if n.is_finite() && n >= 0.0 {
            // float -> int `as` casts saturate at the integer bounds
            Some(n.round() as usize)
        } else {
            None
        }
    }

    let size = size.trim();
    if size.is_empty() {
        return Err("Empty size string.".to_string());
    }
    let invalid = || format!("Invalid size string: '{size}'");

    // plain number without unit
    if let Ok(n) = size.parse::<f64>() {
        return to_bytes(n).ok_or_else(|| invalid());
    }

    // Split off the last *character* (not the last byte), so that a trailing
    // multi-byte character yields an "unknown unit" error instead of a panic.
    let (unit_start, unit) = size
        .char_indices()
        .next_back()
        .ok_or_else(|| "Empty size string.".to_string())?;
    let number = size[..unit_start].trim();

    // the position in this list is the exponent of 1024
    let suffixes = ['B', 'K', 'M', 'G', 'T']; //, "P", "E"]
    let unit_upper = unit.to_ascii_uppercase();
    let pow = suffixes
        .iter()
        .position(|s| *s == unit_upper)
        .ok_or_else(|| format!("Unknown size unit: '{unit}'"))?;

    let n = number.parse::<f64>().map_err(|_| invalid())?;
    to_bytes(n * 1024_f64.powi(pow as i32)).ok_or_else(|| invalid())
}
* 1024.).round() as usize 47 | ); 48 | assert_eq!(parse_bytesize("9 g").unwrap(), 9 * 1024 * 1024 * 1024); 49 | assert_eq!(parse_bytesize("1T").unwrap(), 1024 * 1024 * 1024 * 1024); 50 | assert!(parse_bytesize("x").is_err()); 51 | assert!(parse_bytesize("1x").is_err()); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/cmd/slice.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::{CommonArgs, WORDY_HELP}; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::helpers::rng::Range; 7 | 8 | pub const DESC: &str = "\ 9 | The range is specified as `start:end`, whereby start and end 10 | are the sequence numbers (starting from 1). Open ranges are 11 | possible, in the form `start:` or `:end`. 12 | 13 | The following is equivalent with `st head input.fasta`: 14 | `st slice ':10' input.fasta` 15 | 16 | The following is equivalent with `st tail input.fasta`: 17 | `st slice '-10:' input.fasta` 18 | 19 | The 'slice' command does not extract subsequences; see the 20 | 'trim' command for that."; 21 | #[derive(Parser, Clone, Debug)] 22 | #[clap(next_help_heading = "'Slice' command options")] 23 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 24 | pub struct SliceCommand { 25 | /// Range in form 'start:end' or ':end' or 'start:' 26 | #[arg(value_name = "FROM:TO")] 27 | range: Range, 28 | 29 | #[command(flatten)] 30 | pub common: CommonArgs, 31 | } 32 | 33 | pub fn run(mut cfg: Config, args: SliceCommand) -> CliResult<()> { 34 | let range = args.range; 35 | 36 | let mut format_writer = cfg.get_format_writer()?; 37 | cfg.with_io_writer(|io_writer, mut cfg| { 38 | // convert from 1-based to 0-based coordinates 39 | let mut start = range.start.unwrap_or(1); 40 | if start == 0 { 41 | return fail!("Select ranges are 1-based, zero is not a valid start value"); 42 | } 43 | start -= 1; 44 | let end = range.end; 45 | 46 | let 
mut i = 0; 47 | 48 | cfg.read(|record, ctx| { 49 | // if a start value was specified, skip records 50 | // was thinking about using Itertools::dropping(), but have to check for errors... 51 | if i >= start { 52 | if let Some(e) = end { 53 | if i >= e { 54 | return Ok(false); 55 | } 56 | } 57 | format_writer.write(&record, io_writer, ctx)?; 58 | } 59 | i += 1; 60 | Ok(true) 61 | }) 62 | }) 63 | } 64 | -------------------------------------------------------------------------------- /src/cmd/filter.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::CommonArgs; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::var::{modules::expr::js::parser::Expression, symbols::Value}; 7 | 8 | #[derive(Parser, Clone, Debug)] 9 | #[clap(next_help_heading = "'Filter' command options")] 10 | pub struct FilterCommand { 11 | /// Filter expression 12 | expression: String, 13 | /// Output file for sequences that were removed by filtering. 14 | /// The format is auto-recognized from the extension. 15 | #[arg(short, long, value_name = "FILE")] 16 | dropped: Option, 17 | 18 | #[command(flatten)] 19 | pub common: CommonArgs, 20 | } 21 | 22 | pub fn run(mut cfg: Config, args: FilterCommand) -> CliResult<()> { 23 | let expr = args.expression.trim(); 24 | if expr.starts_with('{') && expr.ends_with('}') { 25 | eprintln!( 26 | "Warning: found filter expression in the form {{ expression }}. \ 27 | The surrounding brackets are unnecessary and should be removed for the \ 28 | expression to work properly." 
29 | ) 30 | } 31 | 32 | let parsed_expr = Expression::parse(expr)?; 33 | let (symbol_id, _) = cfg.build_vars(move |b| b.register_expr(&parsed_expr))?; 34 | 35 | let mut dropped_out = args 36 | .dropped 37 | .as_ref() 38 | .map(|f| cfg.new_output(f)) 39 | .transpose()?; 40 | 41 | let mut format_writer = cfg.get_format_writer()?; 42 | cfg.with_io_writer(|io_writer, mut cfg| { 43 | cfg.read(|record, ctx| { 44 | let v = ctx.symbols().get(symbol_id); 45 | let result = match v.inner() { 46 | Some(Value::Bool(b)) => *b.get(), 47 | _ => { 48 | return fail!( 49 | "Filter expression did not return a boolean (true/false), \ 50 | found '{}' instead", 51 | v 52 | ) 53 | } 54 | }; 55 | 56 | if result { 57 | format_writer.write(&record, io_writer, ctx)?; 58 | } else if let Some((d_writer, d_format_writer)) = dropped_out.as_mut() { 59 | d_format_writer.write(&record, d_writer, ctx)?; 60 | } 61 | Ok(true) 62 | })?; 63 | if let Some((w, _)) = dropped_out { 64 | w.finish()?; 65 | } 66 | Ok(()) 67 | }) 68 | } 69 | -------------------------------------------------------------------------------- /src/cmd/view/color.rs: -------------------------------------------------------------------------------- 1 | use std::str::FromStr; 2 | 3 | use palette::{named, white_point::D65, Srgb}; 4 | 5 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 6 | pub enum ColorSource { 7 | Seq, 8 | Qual, 9 | } 10 | 11 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 12 | pub struct Color { 13 | rgb: (u8, u8, u8), 14 | ansi: AnsiColor, 15 | } 16 | 17 | impl Color { 18 | pub fn from_rgb(c: Srgb) -> Self { 19 | Self { 20 | rgb: (c.red, c.green, c.blue), 21 | ansi: c.into(), 22 | } 23 | } 24 | 25 | pub fn from_str(s: &str) -> Result { 26 | parse_color(s).map(Self::from_rgb) 27 | } 28 | 29 | pub fn to_ratatui(self, rgb: bool) -> ratatui::style::Color { 30 | if rgb { 31 | ratatui::style::Color::Rgb(self.rgb.0, self.rgb.1, self.rgb.2) 32 | } else { 33 | ratatui::style::Color::Indexed(self.ansi.0) 34 | } 35 | } 36 | 37 | pub 
fn to_crossterm(self, rgb: bool) -> crossterm::style::Color { 38 | if rgb { 39 | crossterm::style::Color::Rgb { 40 | r: self.rgb.0, 41 | g: self.rgb.1, 42 | b: self.rgb.2, 43 | } 44 | } else { 45 | crossterm::style::Color::AnsiValue(self.ansi.0) 46 | } 47 | } 48 | } 49 | 50 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] 51 | struct AnsiColor(u8); 52 | 53 | impl From> for AnsiColor { 54 | fn from(c: Srgb) -> Self { 55 | // Simple conversion adapted from the colorsys.rs crate, not using the grayscale ramp 56 | let to_ansi = |c| if c < 75 { 0 } else { (c - 35) / 40 }; 57 | Self(to_ansi(c.red) * 6 * 6 + to_ansi(c.green) * 6 + to_ansi(c.blue) + 16) 58 | } 59 | } 60 | 61 | pub fn parse_color(s: &str) -> Result, String> { 62 | named::from_str(s).or_else(|| Srgb::from_str(s).ok()) 63 | .ok_or_else(|| format!("Invalid color code: '{s}'. The colors must be in Hex format (rrggbb) or a name (e.g. 'cyan')")) 64 | } 65 | 66 | /// chooses the optimal text color based on the brightness/darkness of the background color 67 | pub fn choose_fg(fg_dark: &Color, fg_bright: &Color, bg_col: &Color) -> Color { 68 | let dark_l = palette::Lab::::from(fg_dark.rgb).l as f32; 69 | let bright_l = palette::Lab::::from(fg_bright.rgb).l as f32; 70 | let bg = palette::Lab::::from(bg_col.rgb).l as f32; 71 | if (bright_l - bg) / (bright_l - dark_l) < 0.3 { 72 | *fg_dark 73 | } else { 74 | *fg_bright 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/cmd/sort/mem.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | use std::mem; 3 | use std::path::PathBuf; 4 | 5 | use deepsize::DeepSizeOf; 6 | 7 | use crate::cmd::shared::tmp_store::{Item, TmpWriter}; 8 | use crate::error::CliResult; 9 | 10 | use super::{FileSorter, SortHandle}; 11 | 12 | #[derive(Debug, Clone)] 13 | pub struct MemSorter { 14 | records: Vec>>, 15 | reverse: bool, 16 | mem: usize, 17 | max_mem: usize, 
18 | } 19 | 20 | impl MemSorter { 21 | pub fn new(reverse: bool, max_mem: usize) -> Self { 22 | // we cannot know the exact length of the input, we just initialize 23 | // with capacity that should at least hold some records, while still 24 | // not using too much memory 25 | let records = Vec::with_capacity((max_mem / 400).clamp(1, 10000)); 26 | Self { 27 | mem: records.deep_size_of(), 28 | records, 29 | reverse, 30 | max_mem, 31 | } 32 | } 33 | 34 | pub fn add(&mut self, item: Item>) -> bool { 35 | self.mem += item.deep_size_of(); 36 | self.records.push(item); 37 | self.mem < self.max_mem 38 | } 39 | 40 | fn sort(&mut self) { 41 | if !self.reverse { 42 | self.records.sort_by(|i1, i2| i1.key.cmp(&i2.key)); 43 | } else { 44 | self.records.sort_by(|i1, i2| i2.key.cmp(&i1.key)); 45 | } 46 | } 47 | 48 | pub fn len(&self) -> usize { 49 | self.records.len() 50 | } 51 | 52 | pub fn reverse(&self) -> bool { 53 | self.reverse 54 | } 55 | 56 | pub fn write_sorted(&mut self, io_writer: &mut dyn Write) -> CliResult<()> { 57 | self.sort(); 58 | for item in &self.records { 59 | io_writer.write_all(&item.record)?; 60 | } 61 | Ok(()) 62 | } 63 | 64 | pub fn get_file_sorter( 65 | &mut self, 66 | tmp_dir: PathBuf, 67 | file_limit: usize, 68 | ) -> io::Result { 69 | let mut other = MemSorter::new(self.reverse, self.max_mem); 70 | other.records = mem::take(&mut self.records); 71 | FileSorter::from_mem(other, tmp_dir, file_limit) 72 | } 73 | 74 | pub fn serialize_sorted( 75 | &mut self, 76 | mut writer: TmpWriter>>, 77 | ) -> io::Result<(usize, SortHandle)> { 78 | self.sort(); 79 | for item in &self.records { 80 | writer.write(item)?; 81 | } 82 | let n = self.records.len(); 83 | self.records.clear(); 84 | self.mem = 0; 85 | writer.done().map(|h| (n, h)) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/cmd/sort/file.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 
2 | use std::path::PathBuf; 3 | 4 | use crate::cmd::shared::tmp_store::{Item, TmpHandle, TmpStore}; 5 | use crate::error::CliResult; 6 | use crate::helpers::heap_merge::HeapMerge; 7 | 8 | use super::MemSorter; 9 | 10 | pub type SortHandle = TmpHandle>>; 11 | 12 | #[derive(Debug)] 13 | pub struct FileSorter { 14 | mem_sorter: MemSorter, 15 | tmp_store: TmpStore, 16 | handles: Vec, 17 | n_written: usize, 18 | } 19 | 20 | impl FileSorter { 21 | pub fn from_mem( 22 | mem_sorter: MemSorter, 23 | tmp_dir: PathBuf, 24 | file_limit: usize, 25 | ) -> io::Result { 26 | Ok(Self { 27 | mem_sorter, 28 | handles: Vec::new(), 29 | tmp_store: TmpStore::new(tmp_dir, "st_sort_", file_limit)?, 30 | n_written: 0, 31 | }) 32 | } 33 | 34 | pub fn add(&mut self, item: Item>, quiet: bool) -> CliResult { 35 | if !self.mem_sorter.add(item) { 36 | self.write_to_file(quiet)?; 37 | } 38 | Ok(true) 39 | } 40 | 41 | pub fn write_to_file(&mut self, quiet: bool) -> CliResult<()> { 42 | let writer = self.tmp_store.writer(quiet)?; 43 | let (n, handle) = self.mem_sorter.serialize_sorted(writer)?; 44 | self.n_written += n; 45 | self.handles.push(handle); 46 | Ok(()) 47 | } 48 | 49 | pub fn write_records( 50 | &mut self, 51 | io_writer: &mut dyn Write, 52 | quiet: bool, 53 | verbose: bool, 54 | ) -> CliResult<()> { 55 | // write last chunk of records 56 | self.write_to_file(quiet)?; 57 | 58 | if verbose { 59 | eprintln!( 60 | "Sorted {} records using {} temporary files ({:.1} records per file).", 61 | self.n_written, 62 | self.handles.len(), 63 | self.n_written as f64 / self.handles.len() as f64 64 | ); 65 | } 66 | 67 | // readers for all sorted file chunks 68 | let mut readers = self 69 | .handles 70 | .iter_mut() 71 | .map(|handle| handle.reader()) 72 | .collect::, _>>()?; 73 | // use k-way merging of sorted chunks with a min-heap to obtain 74 | // the final sorted output 75 | let kmerge = HeapMerge::new(&mut readers, self.mem_sorter.reverse())?; 76 | for item in kmerge { 77 | 
io_writer.write_all(&item?.record)?; 78 | } 79 | // clean up 80 | for rdr in readers { 81 | rdr.done()?; 82 | } 83 | Ok(()) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/cmd/cmp/vars.rs: -------------------------------------------------------------------------------- 1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType}; 2 | use variable_enum_macro::variable_enum; 3 | 4 | use crate::cmd::shared::key::Key; 5 | use crate::var::{modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder, VarStore}; 6 | 7 | use super::Category; 8 | 9 | variable_enum! { 10 | /// # Variables/functions provided by the 'cmp' command 11 | /// 12 | /// # Examples 13 | /// 14 | /// Compare two files by ID and sequence hash and store all commonly found 15 | /// records in a new file (some statistics is printed to STDERR): 16 | /// 17 | /// `st cmp input1.fasta input2.fasta --common1 common.fasta` 18 | /// 19 | /// common 942 20 | /// unique1 51 21 | /// unique2 18 22 | CmpVar { 23 | /// Record category: 24 | /// 'common' (record present in both files based on comparison of keys), 25 | /// 'unique1' (record only in first file), 26 | /// or 'unique2' (record only in second file). 
27 | Category(Text), 28 | /// Short category code: 'c' for common, 'u1' for unique1, 'u2' for unique2 29 | CategoryShort(Text), 30 | /// The value of the compared key 31 | Key(?), 32 | } 33 | } 34 | 35 | #[derive(Debug, Default)] 36 | pub struct CmpVars { 37 | vars: VarStore, 38 | } 39 | 40 | impl CmpVars { 41 | pub fn set(&mut self, key: &Key, cat: Category, symbols: &mut SymbolTable) { 42 | for (symbol_id, var) in self.vars.iter() { 43 | match var { 44 | CmpVar::Key => key.write_to_symbol(symbols.get_mut(*symbol_id)), 45 | CmpVar::Category => symbols 46 | .get_mut(*symbol_id) 47 | .inner_mut() 48 | .set_text(cat.long_text().as_bytes()), 49 | CmpVar::CategoryShort => symbols 50 | .get_mut(*symbol_id) 51 | .inner_mut() 52 | .set_text(cat.short_text().as_bytes()), 53 | } 54 | } 55 | } 56 | } 57 | 58 | impl VarProvider for CmpVars { 59 | fn info(&self) -> &dyn DynVarProviderInfo { 60 | &dyn_var_provider!(CmpVar) 61 | } 62 | 63 | fn register( 64 | &mut self, 65 | name: &str, 66 | args: &[Arg], 67 | builder: &mut VarBuilder, 68 | ) -> Result)>, String> { 69 | Ok(CmpVar::from_func(name, args)?.map(|(var, out_type)| { 70 | let symbol_id = builder.store_register(var, &mut self.vars); 71 | (symbol_id, out_type) 72 | })) 73 | } 74 | 75 | fn has_vars(&self) -> bool { 76 | !self.vars.is_empty() 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/io/input/fastq.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use seq_io::fastq::{self, Reader, Record as _}; 4 | use seq_io::policy::BufPolicy; 5 | 6 | use crate::error::CliResult; 7 | use crate::io::{MaybeModified, Record, RecordHeader}; 8 | 9 | use super::fastx::FastxHeaderParser; 10 | use super::SeqReader; 11 | 12 | pub struct FastqReader(pub Reader); 13 | 14 | impl FastqReader 15 | where 16 | R: io::Read, 17 | P: BufPolicy, 18 | { 19 | pub fn new(rdr: R, cap: usize, policy: P) -> Self { 20 | 
FastqReader(Reader::with_capacity(rdr, cap).set_policy(policy)) 21 | } 22 | } 23 | 24 | impl SeqReader for FastqReader 25 | where 26 | R: io::Read, 27 | P: BufPolicy, 28 | { 29 | fn read_next_conditional( 30 | &mut self, 31 | func: &mut dyn FnMut(&dyn Record) -> CliResult, 32 | ) -> Option> { 33 | self.0.next().map(|r| { 34 | let r = FastqRecord::new(r?); 35 | func(&r) 36 | }) 37 | } 38 | } 39 | 40 | // Wrapper for FASTQ record 41 | 42 | pub struct FastqRecord<'a> { 43 | rec: fastq::RefRecord<'a>, 44 | header_parser: FastxHeaderParser, 45 | } 46 | 47 | impl<'a> FastqRecord<'a> { 48 | #[inline(always)] 49 | pub fn new(inner: fastq::RefRecord<'a>) -> FastqRecord<'a> { 50 | FastqRecord { 51 | rec: inner, 52 | header_parser: Default::default(), 53 | } 54 | } 55 | } 56 | 57 | impl Record for FastqRecord<'_> { 58 | fn id(&self) -> &[u8] { 59 | self.header_parser.id_desc(self.rec.head()).0 60 | } 61 | 62 | fn desc(&self) -> Option<&[u8]> { 63 | self.header_parser.id_desc(self.rec.head()).1 64 | } 65 | 66 | fn id_desc(&self) -> (&[u8], Option<&[u8]>) { 67 | self.header_parser.id_desc(self.rec.head()) 68 | } 69 | 70 | fn current_header(&'_ self) -> RecordHeader<'_> { 71 | if let Some((id, desc)) = self.header_parser.parsed_id_desc(self.rec.head()) { 72 | RecordHeader::IdDesc( 73 | MaybeModified::new(id, false), 74 | MaybeModified::new(desc, false), 75 | ) 76 | } else { 77 | RecordHeader::Full(self.rec.head()) 78 | } 79 | } 80 | 81 | fn raw_seq(&self) -> &[u8] { 82 | self.rec.seq() 83 | } 84 | 85 | fn qual(&self) -> Option<&[u8]> { 86 | Some(::qual(&self.rec)) 87 | } 88 | 89 | fn header_delim_pos(&self) -> Option> { 90 | self.header_parser.delim_pos() 91 | } 92 | 93 | fn set_header_delim_pos(&self, delim: Option) { 94 | self.header_parser.set_delim_pos(Some(delim)) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/var/modules/expr/expressions.rs: 
-------------------------------------------------------------------------------- 1 | //! `Expressions` evaluates a list of expressions repeatedly using a single 2 | //! engine. 3 | //! 4 | //! Currently, this is more of an unnecessary wrapper, but the reason for this 5 | //! being a separate module is that a more complicated evaluator featuring 6 | //! two different engines (a fast and simple, and a slower JavaScript engine) 7 | //! may be added in the future. 8 | 9 | use var_provider::VarType; 10 | 11 | use crate::io::Record; 12 | use crate::var::{symbols::SymbolTable, VarBuilder}; 13 | 14 | use super::js::{parser::SimpleAst, replace_register_vars}; 15 | use super::{ExprContext, Expression}; 16 | 17 | #[derive(Debug)] 18 | pub struct Expressions { 19 | expressions: Vec<(usize, String, E)>, 20 | // NOTE: context must come *after* expressions, since rquickjs expressions contain 21 | // Persistent, which should not live longer than the context 22 | // TODO: always ok? 23 | context: E::Context, 24 | } 25 | 26 | impl Expressions { 27 | pub fn new(init_code: Option<&str>) -> Result { 28 | let mut context = E::Context::default(); 29 | context.init(init_code)?; 30 | Ok(Self { 31 | expressions: vec![], 32 | context, 33 | }) 34 | } 35 | 36 | fn lookup(&self, script: &str) -> Option { 37 | self.expressions 38 | .iter() 39 | .find_map(|(id, code, _expr)| if script == code { Some(*id) } else { None }) 40 | } 41 | 42 | pub fn register_expr( 43 | &mut self, 44 | ast: &SimpleAst, 45 | builder: &mut VarBuilder, 46 | ) -> Result<(usize, Option), String> { 47 | if let Some(symbol_id) = self.lookup(ast.script) { 48 | Ok((symbol_id, None)) 49 | } else { 50 | let mut expr = E::default(); 51 | let (code, vars) = replace_register_vars(ast, builder)?; 52 | let symbol_id = builder.increment(); 53 | expr.register(symbol_id, &code, &mut self.context)?; 54 | self.expressions.push((symbol_id, code, expr)); 55 | for var in vars { 56 | self.context.register(&var)?; 57 | } 58 | Ok((symbol_id,
None)) 59 | } 60 | } 61 | 62 | pub fn num_exprs(&self) -> usize { 63 | self.expressions.len() 64 | } 65 | 66 | pub fn eval(&mut self, symbols: &mut SymbolTable, record: &dyn Record) -> Result<(), String> { 67 | self.context 68 | .next_record(symbols, record) 69 | .map_err(|(_, msg)| msg)?; 70 | for (out_id, _, expr) in &mut self.expressions { 71 | expr.eval(symbols.get_mut(*out_id), &mut self.context)?; 72 | } 73 | Ok(()) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/cmd/sort/cli.rs: -------------------------------------------------------------------------------- 1 | use std::cell::LazyCell; 2 | use std::path::PathBuf; 3 | 4 | use clap::Parser; 5 | 6 | use crate::cli::{CommonArgs, WORDY_HELP}; 7 | use crate::helpers::bytesize::parse_bytesize; 8 | 9 | pub const DESC: &str = "\ 10 | The sort key can be 'seq', 'id', or any variable/function, expression, or 11 | text containing them (see help and `st sort -V/--help-vars`). 12 | 13 | Records with identical keys are kept in input order. 14 | 15 | The actual value of the key is available through the 'key' variable. It can 16 | be written to a header attribute or TSV field. 17 | "; 18 | 19 | pub const EXAMPLES: LazyCell = LazyCell::new(|| { 20 | color_print::cformat!( 21 | "\ 22 | st sort seqlen input.fasta 23 | >>id10 24 | SEQ 25 | >>id3 26 | SEQUE 27 | >>id1 28 | SEQUENCE 29 | 30 | " 31 | ) 32 | }); 33 | 34 | #[derive(Parser, Clone, Debug)] 35 | #[clap(next_help_heading = "'Sort' command options")] 36 | #[clap(before_help=DESC, after_help=&*EXAMPLES, help_template=WORDY_HELP)] 37 | pub struct SortCommand { 38 | /// The key used to sort the records. It can be a single variable/function 39 | /// such as 'seq', 'id', a composed string, e.g. '{id}_{desc}', 40 | /// or a comma-delimited list of multiple variables/functions to sort by, 41 | /// e.g. 'seq,attr(a)'. 
In this case, the records are first sorted by sequence, 42 | /// but in case of identical sequences, records are sorted by the header 43 | /// attribute 'a'. 44 | /// 45 | /// To sort by a numeric FASTA attribute in the form '>id;size=123': 46 | /// `st sort 'num(attr(size))' --attr-fmt ';key=value' input.fasta`. 47 | /// 48 | /// Regarding formulas returning mixed text/numbers, the sorted records with 49 | /// text keys will be returned first and the sorted number records after them. 50 | /// Furthermore, NaN and missing values (null/undefined in JS expressions, 51 | /// missing `opt_attr()` values or missing entries in associated metadata) 52 | /// will appear last. 53 | pub key: String, 54 | 55 | /// Sort in reverse order 56 | #[arg(short, long)] 57 | pub reverse: bool, 58 | 59 | /// Maximum amount of memory (approximate) to use for sorting. 60 | /// Either a plain number (bytes) a number with unit (K, M, G, T) 61 | /// based on powers of 2. 62 | #[arg(short = 'M', long, value_name = "SIZE", value_parser = parse_bytesize, default_value = "5G")] 63 | pub max_mem: usize, 64 | 65 | /// Path to temporary directory (only if memory limit is exceeded) 66 | #[arg(long, value_name = "PATH")] 67 | pub temp_dir: Option, 68 | 69 | /// Maximum number of temporary files allowed 70 | #[arg(long, value_name = "N", default_value_t = 1000)] 71 | pub temp_file_limit: usize, 72 | 73 | #[command(flatten)] 74 | pub common: CommonArgs, 75 | } 76 | -------------------------------------------------------------------------------- /src/var/modules/mod.rs: -------------------------------------------------------------------------------- 1 | use std::any::{Any, TypeId}; 2 | use std::fmt::Debug; 3 | 4 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType}; 5 | 6 | use crate::helpers::any::AsAnyMut; 7 | use crate::io::{input::InputConfig, output::OutputConfig, QualConverter, Record}; 8 | 9 | use super::attr::Attributes; 10 | use super::parser::Arg; 11 | use 
super::symbols::SymbolTable; 12 | use super::VarBuilder; 13 | 14 | pub mod attr; 15 | pub mod cnv; 16 | #[cfg(feature = "expr")] 17 | pub mod expr; 18 | pub mod general; 19 | pub mod meta; 20 | pub mod stats; 21 | 22 | /// List of all variable/function provider modules, 23 | /// used to generate the help pages 24 | /// (independently of the variable provider modules themselves) 25 | pub const MODULE_INFO: &[&dyn DynVarProviderInfo] = &[ 26 | &dyn_var_provider!(general::GeneralVar), 27 | &dyn_var_provider!(stats::StatVar), 28 | &dyn_var_provider!(attr::AttrVar), 29 | &dyn_var_provider!(meta::MetaVar), 30 | #[cfg(feature = "expr")] 31 | &dyn_var_provider!(expr::ExprVar), 32 | &dyn_var_provider!(cnv::CnvVar), 33 | ]; 34 | 35 | /// *The* trait for variable/function provider modules. 36 | pub trait VarProvider: Debug + AsAnyMut { 37 | fn info(&self) -> &dyn DynVarProviderInfo; 38 | 39 | /// Tries registering a variable / function with a module 40 | /// and returns `Some(VarType)` or `None` if the type is unknown beforehand. 41 | fn register( 42 | &mut self, 43 | name: &str, 44 | args: &[Arg], 45 | builder: &mut VarBuilder, 46 | ) -> Result)>, String>; 47 | 48 | fn has_vars(&self) -> bool; 49 | 50 | /// Supplies a new record to the variable provider and expects it to 51 | /// update the symbol table with the variable values. 52 | fn set_record( 53 | &mut self, 54 | _rec: &dyn Record, 55 | _sym: &mut SymbolTable, 56 | _attr: &Attributes, 57 | _qc: &mut QualConverter, 58 | ) -> Result<(), String> { 59 | Ok(()) 60 | } 61 | 62 | /// Called on every new output stream (STDOUT or file). 63 | /// Some variable providers may need the information. 64 | /// Additional output files created using `Config::new_output()` are 65 | /// *not* provided here. 66 | fn init_output(&mut self, _: &OutputConfig) -> Result<(), String> { 67 | Ok(()) 68 | } 69 | 70 | /// Called on every new input (STDIN or file). 71 | /// Some variable providers may need the information. 
72 | fn init_input(&mut self, _: &InputConfig) -> Result<(), String> { 73 | Ok(()) 74 | } 75 | 76 | /// Returns the type ID of the given concrete type 77 | /// (used for identifying custom variable providers in `Ctx`) 78 | fn get_type_id(&self) -> TypeId 79 | where 80 | Self: 'static, 81 | { 82 | self.type_id() 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/cmd/unique/map.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use clap::ValueEnum; 4 | 5 | use crate::cmd::shared::tmp_store::Key; 6 | use crate::helpers::write_list::{write_list, write_list_with}; 7 | 8 | use super::DuplicateInfo; 9 | 10 | #[derive(ValueEnum, Debug, Clone, Copy, PartialEq)] 11 | pub enum MapFormat { 12 | /// Sequence ID, reference record ID 13 | Long, 14 | /// Like `long`, but sets the reference record ID to `*` for the reference 15 | /// record itself instead of repeating the same ID twice. 16 | LongStar, 17 | /// Tab-delimited list of all duplicates, with the reference record ID first 18 | /// (e.g. corresponds to Swarm output format). 19 | Wide, 20 | /// Reference ID, comma-delimited list of duplicates including the reference ID 21 | /// (corresponds to mothur `.names` file). 22 | WideComma, 23 | /// Like `wide`, but with the unique key in the first column and all duplicate 24 | /// IDs in the following columns. 
/// Heap entry wrapping a stream item.
///
/// If `reverse` is set, the ordering of the wrapped value is inverted, which
/// allows merging streams that are sorted in descending order.
#[derive(Debug)]
struct Item<T> {
    inner: T,
    reverse: bool,
}

impl<T> Item<T> {
    fn new(inner: T, reverse: bool) -> Self {
        Self { inner, reverse }
    }
}

impl<T: Ord> PartialOrd for Item<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl<T: Ord> PartialEq for Item<T> {
    fn eq(&self, other: &Self) -> bool {
        self.inner == other.inner
    }
}

impl<T: Ord> Eq for Item<T> {}

impl<T: Ord> Ord for Item<T> {
    fn cmp(&self, other: &Self) -> Ordering {
        if !self.reverse {
            self.inner.cmp(&other.inner)
        } else {
            // inverted comparison for descending order
            other.inner.cmp(&self.inner)
        }
    }
}

/// Merges sorted streams using a binary heap.
/// In case of ties, items are sorted by the order of the input streams, in which
/// they occur.
///
/// The streams yield `Result<T, E>`; the first error encountered while pulling
/// from a stream is passed on to the caller.
#[derive(Debug)]
pub struct HeapMerge<T, I, E>
where
    T: Ord + Debug,
    I: Iterator<Item = Result<T, E>>,
    E: Debug,
{
    streams: Box<[I]>,
    // `BinaryHeap` is a max-heap, `Reverse` turns it into a min-heap.
    // The stream index is part of the heap entry and serves as tie-breaker.
    heap: BinaryHeap<Reverse<(Item<T>, usize)>>,
    reverse: bool,
}

impl<T, I, E> HeapMerge<T, I, E>
where
    T: Ord + Debug,
    I: Iterator<Item = Result<T, E>>,
    E: Debug,
{
    /// Creates the merging iterator from a collection of sorted streams.
    ///
    /// With `reverse = true`, the streams are expected to be sorted in
    /// descending order and the merged output is descending as well.
    ///
    /// # Errors
    ///
    /// Returns the error of the first stream whose first item is an `Err`.
    pub fn new<S>(streams: S, reverse: bool) -> Result<Self, E>
    where
        S: IntoIterator<Item = I>,
    {
        let mut streams = streams.into_iter().collect::<Box<[_]>>();
        let mut heap = BinaryHeap::with_capacity(streams.len());
        // seed the heap with the first item of every non-empty stream
        for (i, stream) in streams.iter_mut().enumerate() {
            if let Some(item) = stream.next() {
                heap.push(Reverse((Item::new(item?, reverse), i)));
            }
        }
        Ok(Self {
            heap,
            reverse,
            streams,
        })
    }
}

impl<T, I, E> Iterator for HeapMerge<T, I, E>
where
    T: Ord + Debug,
    I: Iterator<Item = Result<T, E>>,
    E: Debug,
{
    type Item = Result<T, E>;

    fn next(&mut self) -> Option<Self::Item> {
        self.heap.pop().map(|top| {
            let (top_item, top_i) = top.0;
            // refill the heap from the stream that the smallest item came from
            if let Some(next_item) = self.streams[top_i].next() {
                self.heap
                    .push(Reverse((Item::new(next_item?, self.reverse), top_i)));
            }
            Ok(top_item.inner)
        })
    }
}
color; 6 | mod fmt; 7 | mod pager; 8 | mod pal; 9 | 10 | pub use self::cli::*; 11 | use self::color::*; 12 | use self::fmt::*; 13 | use self::pager::*; 14 | use self::pal::*; 15 | 16 | pub fn run(mut cfg: Config, args: ViewCommand) -> CliResult<()> { 17 | if args.color.list_pal { 18 | print_palettes( 19 | &args.color.textcols, 20 | args.color.truecolor.unwrap_or_else(has_truecolor), 21 | )?; 22 | return Ok(()); 23 | } 24 | 25 | // setup colors 26 | use ColorSource::*; 27 | let use_qual = cfg.input_config[0].format.format.has_qual() && !args.general.no_qual; 28 | let (bg, fg, bold) = if use_qual { 29 | if args.general.foreground { 30 | (Some(Qual), Some(Seq), true) 31 | } else { 32 | (Some(Qual), None, false) 33 | } 34 | } else if args.general.foreground { 35 | (None, Some(Seq), false) 36 | } else { 37 | (Some(Seq), None, false) 38 | }; 39 | let mut formatter = Formatter::new(args.general.id_len, args.general.show_desc) 40 | .capabilities( 41 | args.color.truecolor.unwrap_or_else(has_truecolor), 42 | has_utf8(), 43 | ) 44 | .textcols(args.color.textcols.0, args.color.textcols.1)? 45 | .color_config(bg, fg) 46 | .bold(args.general.bold || bold); 47 | 48 | let palettes = args.color.palettes(); 49 | let mut pager = GrowingPager::new(); 50 | let mut terminal = ratatui::init(); 51 | // let mut terminal = ratatui::Terminal::new(ratatui::backend::CrosstermBackend::new(std::io::stdout()))?; 52 | 53 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 54 | enum Progress { 55 | New, 56 | Running, 57 | Done, 58 | } 59 | 60 | let mut progress = Progress::New; 61 | let res = cfg.read(|record, ctx| { 62 | loop { 63 | match pager.check_draw(&mut terminal, false)? 
{ 64 | Status::Ok => {} 65 | Status::MissingLines => { 66 | let (line, len) = 67 | formatter.format(record, &mut ctx.qual_converter, &palettes)?; 68 | progress = Progress::Running; 69 | if use_qual { 70 | pager.set_color_scale(formatter.format_scale(0, (2..47).step_by(2))); 71 | } 72 | pager.add(line, len); 73 | break; 74 | } 75 | Status::Quit => { 76 | progress = Progress::Done; 77 | return Ok(false); 78 | } 79 | } 80 | } 81 | Ok(true) 82 | }); 83 | if res.is_ok() && progress == Progress::Running { 84 | while !matches!(pager.check_draw(&mut terminal, true)?, Status::Quit) {} 85 | } 86 | ratatui::restore(); 87 | res 88 | } 89 | -------------------------------------------------------------------------------- /src/io/output/fa_qual.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io; 3 | use std::path::Path; 4 | 5 | use crate::context::RecordMeta; 6 | use crate::error::CliResult; 7 | use crate::io::QualConverter; 8 | use crate::var::VarBuilder; 9 | 10 | use super::{Attribute, Record, SeqFormatter}; 11 | 12 | pub struct FaQualWriter { 13 | fa_writer: super::fasta::FastaWriter, 14 | // This is a bit awkward: the FASTA writer is not part of this struct, 15 | // (supplied to write(), while the .qual writer is). 16 | // However, this is a special case and not a problem. 
17 | qual_out: io::BufWriter, 18 | wrap: usize, 19 | } 20 | 21 | impl FaQualWriter { 22 | pub fn new( 23 | wrap: Option, 24 | qual_path: Q, 25 | attrs: &[(Attribute, bool)], 26 | builder: &mut VarBuilder, 27 | ) -> CliResult 28 | where 29 | Q: AsRef, 30 | { 31 | let q_handle = File::create(&qual_path).map_err(|e| { 32 | io::Error::other(format!( 33 | "Error creating '{}': {}", 34 | qual_path.as_ref().to_string_lossy(), 35 | e 36 | )) 37 | })?; 38 | 39 | Ok(FaQualWriter { 40 | fa_writer: super::fasta::FastaWriter::new(wrap, attrs, builder)?, 41 | qual_out: io::BufWriter::new(q_handle), 42 | wrap: wrap.unwrap_or(usize::MAX), 43 | }) 44 | } 45 | } 46 | 47 | impl SeqFormatter for FaQualWriter { 48 | fn write_with( 49 | &mut self, 50 | record: &dyn Record, 51 | data: &RecordMeta, 52 | out: &mut dyn io::Write, 53 | qc: &mut QualConverter, 54 | ) -> CliResult<()> { 55 | self.fa_writer.write_with(record, data, out, qc)?; 56 | write_qscores(record, &mut self.qual_out, data, qc, self.wrap) 57 | } 58 | } 59 | 60 | fn write_qscores( 61 | record: &dyn Record, 62 | mut out: W, 63 | data: &RecordMeta, 64 | qual_converter: &mut QualConverter, 65 | wrap: usize, 66 | ) -> CliResult<()> { 67 | let qual = record.qual().ok_or("No quality scores found in input.")?; 68 | // header 69 | out.write_all(b">")?; 70 | data.attrs.write_head(record, &mut out, &data.symbols)?; 71 | out.write_all(b"\n")?; 72 | 73 | // Phred scores 74 | for qline in qual.chunks(wrap) { 75 | if !qline.is_empty() { 76 | let phred_qual = qual_converter.phred_scores(qline).map_err(|e| { 77 | format!( 78 | "Error writing record '{}'. 
{}", 79 | String::from_utf8_lossy(record.id()), 80 | e 81 | ) 82 | })?; 83 | let mut q_iter = phred_qual.scores().iter(); 84 | for q in q_iter.by_ref().take(qline.len() - 1) { 85 | write!(&mut out, "{} ", *q)?; 86 | } 87 | writeln!(&mut out, "{}", q_iter.next().unwrap())?; 88 | } 89 | } 90 | Ok(()) 91 | } 92 | -------------------------------------------------------------------------------- /src/io/input/fasta.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use seq_io::fasta::{self, Reader, Record as _}; 4 | use seq_io::policy::BufPolicy; 5 | 6 | use crate::error::CliResult; 7 | use crate::io::{MaybeModified, Record, RecordHeader, SeqLineIter}; 8 | 9 | use super::fastx::FastxHeaderParser; 10 | use super::SeqReader; 11 | 12 | pub struct FastaReader(pub Reader); 13 | 14 | impl FastaReader 15 | where 16 | R: io::Read, 17 | P: BufPolicy, 18 | { 19 | pub fn new(rdr: R, cap: usize, policy: P) -> Self { 20 | FastaReader(Reader::with_capacity(rdr, cap).set_policy(policy)) 21 | } 22 | } 23 | 24 | impl SeqReader for FastaReader 25 | where 26 | R: io::Read, 27 | P: BufPolicy, 28 | { 29 | fn read_next_conditional( 30 | &mut self, 31 | func: &mut dyn FnMut(&dyn Record) -> CliResult, 32 | ) -> Option> { 33 | self.0.next().map(|r| { 34 | let r = FastaRecord::new(r?); 35 | func(&r) 36 | }) 37 | } 38 | } 39 | 40 | // Wrapper for FASTA record 41 | 42 | pub struct FastaRecord<'a> { 43 | rec: fasta::RefRecord<'a>, 44 | header_parser: FastxHeaderParser, 45 | } 46 | 47 | impl<'a> FastaRecord<'a> { 48 | #[inline(always)] 49 | pub fn new(inner: fasta::RefRecord<'a>) -> FastaRecord<'a> { 50 | FastaRecord { 51 | rec: inner, 52 | header_parser: Default::default(), 53 | } 54 | } 55 | } 56 | 57 | impl Record for FastaRecord<'_> { 58 | fn id(&self) -> &[u8] { 59 | self.header_parser.id_desc(self.rec.head()).0 60 | } 61 | 62 | fn desc(&self) -> Option<&[u8]> { 63 | self.header_parser.id_desc(self.rec.head()).1 64 | } 65 | 66 | fn 
/// Description shown at the top of the `unique` command help page.
// The sort flag is `-s/--sort` (see the `sort` field of `UniqueCommand`).
pub const DESC: &str = "\
The unique key can be 'seq' or any variable/function, expression, or
text containing them (see help and `st unique -V/--help-vars`).

The order of the records is the same as in the input unless the memory limit
is exceeded, in which case temporary files are used and all remaining records
are sorted by the unique key. Use `-s/--sort` to always sort the output
by key.";
56 | #[arg(short = 'M', long, value_name = "SIZE", value_parser = parse_bytesize, default_value = "5G")] 57 | pub max_mem: usize, 58 | 59 | /// Path to temporary directory (only if memory limit is exceeded) 60 | #[arg(long, value_name = "PATH")] 61 | pub temp_dir: Option, 62 | 63 | /// Maximum number of temporary files allowed 64 | #[arg(long, value_name = "N", default_value_t = 1000)] 65 | pub temp_file_limit: usize, 66 | 67 | #[command(flatten)] 68 | pub common: CommonArgs, 69 | } 70 | -------------------------------------------------------------------------------- /src/test/count.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::NA; 2 | 3 | use super::*; 4 | 5 | #[test] 6 | fn simple() { 7 | cmp(&["count"], &*FASTA, "4\n"); 8 | cmp( 9 | &["count", "-k", "attr(p)"], 10 | &*FASTA, 11 | "1\t1\n10\t1\n11\t1\n2\t1\n", 12 | ); 13 | } 14 | 15 | #[test] 16 | fn fixed() { 17 | cmp(&["count"], &*FASTA, "4\n"); 18 | cmp(&["count"], &*FASTA, "4\n"); 19 | cmp(&["count", "-k", "num('2.3')"], &*FASTA, "2.3\t4\n"); 20 | cmp(&["count"], &*FASTA, "4\n"); 21 | cmp(&["count", "-k", "bin('2.3', 1)"], &*FASTA, "(2, 3]\t4\n"); 22 | cmp( 23 | &["count", "-k", "opt_attr(non_existent)"], 24 | &*FASTA, 25 | &format!("{NA}\t4\n"), 26 | ); 27 | } 28 | 29 | #[test] 30 | fn multi() { 31 | let out = "25\t23\t1\n25\t24\t2\n25\t25\t1\n"; 32 | 33 | cmp(&["count", "-k", "seqlen,ungapped_seqlen"], &*FASTA, out); 34 | cmp( 35 | &["count", "-k", "seqlen", "-k", "ungapped_seqlen"], 36 | &*FASTA, 37 | out, 38 | ); 39 | } 40 | 41 | #[test] 42 | fn discrete_bins() { 43 | cmp( 44 | &["count", "-k", "{bin(attr(p), 10)}"], 45 | &*FASTA, 46 | "(0, 10]\t2\n(10, 20]\t2\n", 47 | ); 48 | } 49 | 50 | const FLOAT_FASTA: &str = "\ 51 | >s1 a=1.10000000000002 =1.1 52 | SEQ 53 | >s2 a=0.00000000000001 =1e-14 54 | SEQ 55 | >s3 a=1.10000000000001 =1.1 56 | SEQ 57 | >s4 a=1.1000001 =1.1 with <=6 significant digits 58 | SEQ 59 | >s5 a=0.000000000000011 =1.1e-14 
60 | SEQ 61 | >s6 a=11013452400000000001 =1.101345e19 62 | SEQ 63 | >s7 a=1.10000000000002 =1.1 (same as s1) 64 | SEQ 65 | "; 66 | 67 | #[test] 68 | fn float() { 69 | cmp( 70 | &["count", "-k", "attr(a)"], 71 | FLOAT_FASTA, 72 | "0.00000000000001\t1\n0.000000000000011\t1\n1.10000000000001\t1\n1.10000000000002\t2\n1.1000001\t1\n11013452400000000001\t1\n", 73 | ); 74 | cmp( 75 | &["count", "-k", "num(attr(a))"], 76 | FLOAT_FASTA, 77 | "1e-14\t1\n1.1e-14\t1\n1.10000\t4\n1.10135e19\t1\n", 78 | ); 79 | cmp( 80 | &["count", "-k", "bin(attr(a), 1)"], 81 | FLOAT_FASTA, 82 | "(0, 1]\t2\n(1, 2]\t4\n(1.10135e19, 1.10135e19]\t1\n", 83 | ); 84 | } 85 | 86 | #[test] 87 | fn missing() { 88 | cmp( 89 | &["count", "-k", "{opt_attr(missing)}"], 90 | &*FASTA, 91 | &format!("{NA}\t4\n"), 92 | ); 93 | cmp( 94 | &["count", "-k", "{num(opt_attr(missing))}"], 95 | &*FASTA, 96 | &format!("{NA}\t4\n"), 97 | ); 98 | fails( 99 | &["count", "-k", "{attr(missing)}"], 100 | &*FASTA, 101 | "'missing' not found in record", 102 | ); 103 | 104 | #[cfg(feature = "expr")] 105 | { 106 | cmp( 107 | &["count", "-k", "{opt_attr('missing') + 1}"], 108 | &*FASTA, 109 | "NaN\t4\n", 110 | ); 111 | cmp( 112 | &["count", "-k", "{num(opt_attr('missing')) + 1}"], 113 | &*FASTA, 114 | "NaN\t4\n", 115 | ); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/cmd/find/matcher/mod.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | use bio::alignment::AlignmentOperation; 4 | 5 | use crate::error::CliResult; 6 | 7 | use super::opts::{Algorithm, PatternConfig, SearchConfig, SearchOpts}; 8 | 9 | pub mod approx; 10 | pub mod exact; 11 | pub mod regex; 12 | 13 | pub trait Matcher: Debug + MatcherBoxClone { 14 | fn has_matches(&self, text: &[u8]) -> Result; 15 | 16 | /// This method iterates over all hits and provides these to the 17 | /// given closure. 
The exact hit type may vary depending on the 18 | /// implementation. 19 | /// The looping should be interrupted if the closure returns false. 20 | fn do_search( 21 | &mut self, 22 | text: &[u8], 23 | func: &mut dyn FnMut(&dyn Hit) -> Result, 24 | ) -> Result<(), String>; 25 | } 26 | 27 | pub trait MatcherBoxClone { 28 | fn clone_box(&self) -> Box; 29 | } 30 | 31 | impl MatcherBoxClone for T 32 | where 33 | T: 'static + Matcher + Clone + Send + Sync, 34 | { 35 | fn clone_box(&self) -> Box { 36 | Box::new(self.clone()) 37 | } 38 | } 39 | 40 | impl Clone for Box { 41 | fn clone(&self) -> Box { 42 | self.clone_box() 43 | } 44 | } 45 | 46 | pub trait Hit { 47 | fn get_group(&self, group: usize, out: &mut Match) -> Result<(), String>; 48 | } 49 | 50 | /// contains 0-based coordinates and distance 51 | #[derive(Debug, Clone, Default, Eq, PartialEq)] 52 | pub struct Match { 53 | pub start: usize, 54 | pub end: usize, 55 | pub dist: usize, 56 | pub alignment_path: Vec, 57 | } 58 | 59 | impl Match { 60 | pub fn neg_start1(&self, seq_len: usize) -> i64 { 61 | self.start as i64 - seq_len as i64 62 | } 63 | 64 | pub fn neg_end1(&self, seq_len: usize) -> i64 { 65 | self.end as i64 - seq_len as i64 - 1 66 | } 67 | } 68 | 69 | pub fn get_matcher( 70 | cfg: &PatternConfig, 71 | opts: &SearchOpts, 72 | ) -> CliResult> { 73 | use Algorithm::*; 74 | if cfg.algorithm != Regex && opts.has_regex_groups { 75 | return fail!( 76 | "Match groups > 0 can only be used with regular expression searches (-r/--regex or --regex-unicode)." 
77 | ); 78 | } 79 | let matcher: Box = match cfg.algorithm { 80 | Exact => Box::new(exact::ExactMatcher::new(cfg.pattern.seq.as_bytes())), 81 | Regex => regex::get_matcher( 82 | &cfg.pattern.seq, 83 | opts.hit_limit <= 1, 84 | opts.has_regex_groups, 85 | opts.case_insensitive, 86 | )?, 87 | Myers => approx::get_matcher(&cfg.pattern.seq, cfg.max_dist, cfg.has_ambigs, opts)?, 88 | }; 89 | Ok(matcher) 90 | } 91 | 92 | pub fn get_matchers(cfg: &SearchConfig) -> CliResult>> { 93 | let opts = cfg.get_opts(); 94 | cfg.patterns() 95 | .iter() 96 | .map(|p| get_matcher(p, opts)) 97 | .collect::>>() 98 | } 99 | -------------------------------------------------------------------------------- /profile/README.md: -------------------------------------------------------------------------------- 1 | # Measuring time and memory / comparison to other tools 2 | 3 | The following should work for Ubuntu Linux. 4 | 5 | ```bash 6 | outdir=target/st_benchmark 7 | fq=$outdir/reads.fq 8 | mkdir -p $outdir 9 | ``` 10 | 11 | ## Build the binary 12 | 13 | ```bash 14 | cargo build --release 15 | st=target/release/st 16 | ``` 17 | 18 | 31 | 32 | ## Download sequencing reads 33 | 34 | 54 | 55 | ```bash 56 | wget -qi profile/fastq_urls.txt -O - | zcat > $fq 57 | ls -lh $fq 58 | ``` 59 | 60 | ## Create temporary storage 61 | 62 | We rely on *tmpfs* to store output (and some input) files in memory, 63 | avoiding disk IO latency as much as possible. 
64 | 65 | ```bash 66 | rm -Rf $outdir/workdir && mkdir $outdir/workdir 67 | chmod 777 $outdir/workdir 68 | sudo mount -t tmpfs -o size=10G none $outdir/workdir 69 | mkdir -p $outdir/workdir/tmp 70 | ``` 71 | 72 | Prepare forward primer for searching 73 | 74 | ```bash 75 | # >ITS_S2F 76 | # CGATACTTGGTGTGAAT 77 | # >ITS3 78 | # TCGATGAAGAACGCAGC 79 | cat > $outdir/workdir/primers.fasta <<- EOM 80 | >ITS4 81 | GTCCTCCGCTTATTGATATGC 82 | EOM 83 | ``` 84 | 85 | ## Run the benchmarks 86 | 87 | Before running, disable frequency boost: 88 | 89 | ```bash 90 | # requires cpufrequtils installed 91 | echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost 92 | ``` 93 | 94 | On Ubuntu, disable the indexer for full-text search 95 | 96 | ```bash 97 | echo -n > $outdir/workdir/.trackerignore 98 | ``` 99 | 100 | Run the comparison. The `compare_tools.py` script not only compares runtimes / memory usage, 101 | but in some cases also validates that the output is the same. 102 | See `comparison_commands.yaml`.
103 | 104 | ```bash 105 | export SEQKIT_THREADS=1 106 | $st count $fq # cache the file in memory 107 | scripts/compare_tools.py \ 108 | -b $st -d $outdir/workdir -o profile/comparison.json -t $outdir/workdir/tmp \ 109 | -k main \ 110 | $fq profile/comparison_commands.yaml 111 | # -k main,other 112 | scripts/summarize_comparison.py profile/comparison.json - > profile/comparison.md 113 | ``` 114 | 115 | ## Clean up 116 | 117 | ```bash 118 | rm -Rf $outdir/workdir 119 | ``` 120 | -------------------------------------------------------------------------------- /src/cmd/revcomp.rs: -------------------------------------------------------------------------------- 1 | use std::sync::OnceLock; 2 | 3 | use clap::Parser; 4 | 5 | use crate::cli::{CommonArgs, WORDY_HELP}; 6 | use crate::config::Config; 7 | use crate::error::CliResult; 8 | use crate::helpers::{ 9 | complement::reverse_complement, 10 | seqtype::{SeqType, SeqtypeHelper}, 11 | }; 12 | use crate::io::SeqQualRecord; 13 | 14 | pub const DESC: &str = "\ 15 | The sequence type is automatically detected based on the first record, 16 | unless the `--seqtype` option is used. 17 | 18 | *Note*: Unknown letters are not reversed, but left unchanged. 
19 | 20 | If quality scores are present, their order is just reversed"; 21 | 22 | #[derive(Parser, Clone, Debug)] 23 | #[clap(next_help_heading = "'Revcomp' command options")] 24 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 25 | pub struct RevcompCommand { 26 | /// Number of threads to use 27 | #[arg(short, long, default_value_t = 1)] 28 | threads: u32, 29 | 30 | #[command(flatten)] 31 | pub common: CommonArgs, 32 | } 33 | 34 | #[derive(Default, Clone, Debug)] 35 | struct RevCompRecord { 36 | seq: Vec, 37 | qual: Option>, 38 | seqtype: Option, 39 | } 40 | 41 | // TODO: wait for https://doc.rust-lang.org/std/sync/struct.OnceLock.html#method.get_or_try_init stabilization 42 | static SEQTYPE: OnceLock> = OnceLock::new(); 43 | 44 | pub fn run(mut cfg: Config, args: RevcompCommand) -> CliResult<()> { 45 | let num_threads = args.threads; 46 | 47 | let mut format_writer = cfg.get_format_writer()?; 48 | let typehint = cfg.input_config[0].format.seqtype; 49 | let mut final_seqtype = None; 50 | cfg.with_io_writer(|io_writer, mut cfg| { 51 | cfg.read_parallel_init( 52 | num_threads - 1, 53 | Default::default, 54 | |record, out: &mut Box| { 55 | if out.seqtype.is_none() { 56 | let seqtype = SEQTYPE.get_or_init(|| { 57 | SeqtypeHelper::new(typehint).get_or_guess(record) 58 | }).clone()?; 59 | out.seqtype = Some(seqtype); 60 | } 61 | reverse_complement(record.seq_segments(), &mut out.seq, out.seqtype.unwrap())?; 62 | if let Some(q) = record.qual() { 63 | let qual = out.qual.get_or_insert_with(|| Vec::with_capacity(q.len())); 64 | qual.clear(); 65 | qual.extend(q.iter().rev()); 66 | } 67 | Ok(()) 68 | }, 69 | |record, revcomp_record, ctx| { 70 | if final_seqtype.is_none() { 71 | final_seqtype = revcomp_record.seqtype; 72 | } else if revcomp_record.seqtype != final_seqtype { 73 | // fail if there is a mismatch in sequence types guessed in different threads 74 | return fail!("Could not reliably guess the sequence type. 
Please specify with `--seqtype`"); 75 | } 76 | let rc_rec = SeqQualRecord::new( 77 | &record, 78 | &revcomp_record.seq, 79 | revcomp_record.qual.as_deref(), 80 | ); 81 | format_writer.write(&rc_rec, io_writer, ctx)?; 82 | Ok(true) 83 | }, 84 | ) 85 | })?; 86 | Ok(()) 87 | } 88 | -------------------------------------------------------------------------------- /src/helpers/number.rs: -------------------------------------------------------------------------------- 1 | //! Helpers for number handling 2 | 3 | use std::{ 4 | fmt, 5 | ops::{Deref, DerefMut}, 6 | }; 7 | 8 | use ordered_float::OrderedFloat; 9 | 10 | /// Discretizes a floating-point number into bins of width 'interval' 11 | pub fn bin(num: f64, interval: f64) -> Interval { 12 | let start = (num / interval).floor() * interval; 13 | Interval::new(start, start + interval) 14 | } 15 | 16 | // TODO: consider replacing rust-lexical https://github.com/rustsec/advisory-db/issues/1757 17 | pub fn parse_float(text: &[u8]) -> Result { 18 | lexical::parse(text).map_err(|_| { 19 | format!( 20 | "Could not convert '{}' to a decimal number.", 21 | String::from_utf8_lossy(text) 22 | ) 23 | }) 24 | } 25 | 26 | pub fn parse_int(text: &[u8]) -> Result { 27 | atoi::atoi(text).ok_or_else(|| { 28 | format!( 29 | "Could not convert '{}' to an integer number.", 30 | String::from_utf8_lossy(text) 31 | ) 32 | }) 33 | } 34 | 35 | /// Wrapper used for float values across this crate. 36 | /// It can be sorted/hashed and provides a `Display` implementation that 37 | /// allows to print the numbers in a human-readable way. 
38 | #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)] 39 | #[cfg_attr( 40 | any(feature = "all-commands", feature = "sort", feature = "unique"), 41 | derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize), 42 | archive(compare(PartialEq), check_bytes) 43 | )] 44 | pub struct Float(OrderedFloat); 45 | 46 | impl Float { 47 | pub fn new(f: f64) -> Self { 48 | Self(OrderedFloat(f)) 49 | } 50 | 51 | pub fn inner(&self) -> f64 { 52 | self.0 .0 53 | } 54 | } 55 | 56 | impl Deref for Float { 57 | type Target = f64; 58 | fn deref(&self) -> &Self::Target { 59 | &self.0 60 | } 61 | } 62 | 63 | impl DerefMut for Float { 64 | fn deref_mut(&mut self) -> &mut Self::Target { 65 | &mut self.0 66 | } 67 | } 68 | 69 | impl fmt::Display for Float { 70 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 71 | // TODO: consider replacing rust-lexical https://github.com/rustsec/advisory-db/issues/1757 72 | use lexical::WriteFloatOptions; 73 | let opts = WriteFloatOptions::builder() 74 | .trim_floats(true) 75 | .max_significant_digits(std::num::NonZeroUsize::new(6)) 76 | // matching JS formatting 77 | .nan_string(Some(b"NaN")) 78 | .inf_string(Some(b"Infinity")) 79 | .build() 80 | .unwrap(); 81 | const FMT: u128 = lexical::format::STANDARD; 82 | let formatted = lexical::to_string_with_options::<_, FMT>(self.inner(), &opts); 83 | // ryu::Buffer::new().format($f)}; 84 | write!(f, "{formatted}") 85 | } 86 | } 87 | 88 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)] 89 | #[cfg_attr( 90 | any(feature = "all-commands", feature = "sort", feature = "unique"), 91 | derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize), 92 | archive(compare(PartialEq), check_bytes) 93 | )] 94 | pub struct Interval(pub Float, pub Float); 95 | 96 | impl Interval { 97 | pub fn new(start: f64, end: f64) -> Self { 98 | Self(Float::new(start), Float::new(end)) 99 | } 100 | } 101 | 102 | impl fmt::Display for Interval { 103 | fn fmt(&self, f: &mut 
fmt::Formatter<'_>) -> fmt::Result { 104 | write!(f, "({}, {}]", self.0, self.1) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/cmd/mask.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use crate::cli::{CommonArgs, WORDY_HELP}; 4 | use crate::config::Config; 5 | use crate::error::CliResult; 6 | use crate::helpers::var_range::VarRanges; 7 | use crate::io::SeqQualRecord; 8 | 9 | pub const DESC: &str = "\ 10 | Masks the sequence within a given range or comma delimited list of ranges 11 | by converting to lowercase (soft mask) or replacing with a character (hard 12 | masking). Reverting soft masking is also possible."; 13 | 14 | #[derive(Parser, Clone, Debug)] 15 | #[clap(next_help_heading = "'Mask' command options")] 16 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 17 | pub struct MaskCommand { 18 | /// Range in the form 'start:end' or 'start:' or ':end', 19 | /// The range start/end may be defined by variables/functions, 20 | /// or the variable/function may contain a whole range. 21 | ranges: String, 22 | 23 | /// Do hard masking instead of soft masking, replacing 24 | /// everything in the range(s) with the given character 25 | #[arg(long, value_name = "CHAR")] 26 | hard: Option, 27 | 28 | /// Unmask (convert to uppercase instead of lowercase) 29 | #[arg(long)] 30 | unmask: bool, 31 | 32 | /// Exclusive range: excludes start and end positions 33 | /// from the masked sequence. 34 | /// In the case of unbounded ranges (`start:` or `:end`), the range still 35 | /// extends to the complete end or the start of the sequence. 36 | #[arg(short, long)] 37 | exclusive: bool, 38 | 39 | /// Interpret range as 0-based, with the end not included.
40 | #[arg(short('0'), long)] 41 | zero_based: bool, 42 | 43 | #[command(flatten)] 44 | pub common: CommonArgs, 45 | } 46 | 47 | pub fn run(mut cfg: Config, args: MaskCommand) -> CliResult<()> { 48 | let ranges = &args.ranges; 49 | let hard_mask = args.hard; 50 | let rng0 = args.zero_based; 51 | let exclusive = args.exclusive; 52 | let unmask = args.unmask; 53 | 54 | let mut format_writer = cfg.get_format_writer()?; 55 | cfg.with_io_writer(|io_writer, mut cfg| { 56 | let mut ranges = cfg.build_vars(|b| VarRanges::from_str(ranges, b))?; 57 | let mut seq = Vec::new(); 58 | let mut num_buf = Vec::new(); 59 | 60 | cfg.read(|record, ctx| { 61 | // obtain full sequence 62 | seq.clear(); 63 | let mut seqlen = 0; 64 | for s in record.seq_segments() { 65 | seq.extend_from_slice(s); 66 | seqlen += s.len(); 67 | } 68 | 69 | let calc_ranges = ranges.resolve(ctx.symbols(), record, &mut num_buf)?; 70 | 71 | if let Some(h) = hard_mask { 72 | for rng in calc_ranges { 73 | let (start, end) = rng.adjust(rng0, exclusive)?.resolve(seqlen); 74 | for c in &mut seq[start..end] { 75 | *c = h as u8; 76 | } 77 | } 78 | } else { 79 | for rng in calc_ranges { 80 | let (start, end) = rng.adjust(rng0, exclusive)?.resolve(seqlen); 81 | for c in &mut seq[start..end] { 82 | if unmask { 83 | c.make_ascii_uppercase() 84 | } else { 85 | c.make_ascii_lowercase() 86 | }; 87 | } 88 | } 89 | } 90 | 91 | let rec = SeqQualRecord::new(&record, &seq, None); 92 | format_writer.write(&rec, io_writer, ctx)?; 93 | 94 | Ok(true) 95 | }) 96 | }) 97 | } 98 | -------------------------------------------------------------------------------- /src/test/trim.rs: -------------------------------------------------------------------------------- 1 | use crate::helpers::NA; 2 | 3 | use super::*; 4 | 5 | #[test] 6 | fn trim() { 7 | let seq = "ATGC"; 8 | let fasta = fasta_record(seq); 9 | 10 | cmp(&["trim", ":"], &fasta, &fasta); 11 | cmp(&["trim", "1:"], &fasta, &fasta); 12 | cmp(&["trim", ":1"], &fasta, 
&fasta_record(&seq[..1])); 13 | cmp(&["trim", "2:-2"], &fasta, &fasta_record(&seq[1..3])); 14 | // exclusive 15 | cmp(&["trim", "-e", "1:3"], &fasta, &fasta_record(&seq[1..2])); 16 | cmp(&["trim", "-e", "2:3"], &fasta, &fasta_record(&seq[2..2])); 17 | cmp(&["trim", "-e", "2:4"], &fasta, &fasta_record(&seq[2..3])); 18 | // exclusive + unbounded 19 | cmp(&["trim", "-e", ":3"], &fasta, &fasta_record(&seq[..2])); 20 | cmp(&["trim", "-e", "2:"], &fasta, &fasta_record(&seq[2..])); 21 | // empty seq 22 | cmp(&["trim", "2:1"], &fasta, &fasta_record("")); 23 | } 24 | 25 | #[test] 26 | fn trim0() { 27 | let seq = "ATGC"; 28 | let fasta = fasta_record(seq); 29 | 30 | cmp(&["trim", "-0", "1:3"], &fasta, &fasta_record(&seq[1..3])); 31 | cmp(&["trim", "-0", ":3"], &fasta, &fasta_record(&seq[..3])); 32 | cmp(&["trim", "-0", "2:"], &fasta, &fasta_record(&seq[2..])); 33 | } 34 | 35 | #[test] 36 | fn trim_qual() { 37 | // quality trimming 38 | let fq = "@id\nATGC\n+\n1234\n"; 39 | 40 | cmp(&["trim", "--fq", ":2"], fq, "@id\nAT\n+\n12\n"); 41 | cmp(&["trim", "--fq", "2:3"], fq, "@id\nTG\n+\n23\n"); 42 | } 43 | 44 | #[test] 45 | fn trim_vars() { 46 | let id = "id start=2 end=3 range=2:3"; 47 | let fa = format!(">{id}\nATGC\n"); 48 | let trimmed = format!(">{id}\nTG\n"); 49 | 50 | cmp(&["trim", "{attr(start)}:{attr(end)}"], &fa, &trimmed); 51 | cmp(&["trim", "{attr(range)}"], &fa, &trimmed); 52 | // multiple ranges 53 | // TODO: space not deleted 54 | cmp( 55 | &["trim", "{attr_del(r)}"], 56 | ">id r=1:2,4:4\nATGC\n", 57 | ">id \nATC\n", 58 | ); 59 | } 60 | 61 | #[test] 62 | fn trim_multiline() { 63 | let fa = ">id\nAB\nCDE\nFGHI\nJ"; 64 | let seq = "ABCDEFGHIJ"; 65 | 66 | cmp(&["trim", ":"], fa, &format!(">id\n{seq}\n")); 67 | 68 | for start in 0..seq.len() - 1 { 69 | for end in start..seq.len() { 70 | cmp( 71 | &["trim", "-0", &format!("{start}:{end}")], 72 | fa, 73 | &format!(">id\n{}\n", &seq[start..end]), 74 | ); 75 | } 76 | } 77 | } 78 | 79 | #[test] 80 | fn 
trim_multiline_multirange() { 81 | let fa = ">id\nAB\nC\nDE\nFGHI\nJ"; 82 | 83 | cmp(&["trim", "2:4,6:7"], fa, ">id\nBCDFG\n"); 84 | cmp(&["trim", "-4:-3,-1:"], fa, ">id\nGHJ\n"); 85 | } 86 | 87 | #[test] 88 | fn trim_na() { 89 | cmp(&["trim", &format!("{NA}:")], ">id\nABCDE\n", ">id\nABCDE\n"); 90 | cmp( 91 | &["trim", &format!("{NA}:{NA}")], 92 | ">id\nABCDE\n", 93 | ">id\nABCDE\n", 94 | ); 95 | cmp( 96 | &["trim", "{opt_attr(s)}:{attr(e)}"], 97 | format!(">id s={NA} e=3\nABCDE\n"), 98 | &format!(">id s={NA} e=3\nABC\n"), 99 | ); 100 | fails( 101 | &["trim", "{attr(s)}:{attr(e)}"], 102 | format!(">id s={NA} e=3\nABCDE\n"), 103 | "reserved for missing values", 104 | ); 105 | cmp( 106 | &["trim", "{opt_attr(s)}:{opt_attr(e)}"], 107 | ">id s=3\nABCDE\n", 108 | ">id s=3\nCDE\n", 109 | ); 110 | fails( 111 | &["trim", "{opt_attr(s)}:{opt_attr(e)}"], 112 | ">id s=something\nABCDE\n", 113 | "Could not convert 'something' to an integer number", 114 | ); 115 | } 116 | -------------------------------------------------------------------------------- /scripts/gen_help.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # cargo build 4 | 5 | seqtool=target/debug/st 6 | outdir=../seqtool-docs 7 | main=../seqtool-docs/index.md #_README.md 8 | nav=../seqtool-docs/nav.yml 9 | 10 | # # prepend table of contents if there are H3 headings 11 | # prepend_toc() { 12 | # contents="$1" 13 | # level="$2" 14 | # toc_level="$3" 15 | # toc=`grep "^$level " "$contents" | 16 | # sed -E "s/^$level (.*)/* [\1] #\1/g" | 17 | # awk -F' ' '{ gsub(" ", "-", $2); rep=gsub("[()]", "", $2); print sprintf("%s(%s)", $1, tolower($2)) }'` 18 | 19 | # if [ `printf "$toc" | wc -l` -gt 1 ]; then 20 | # printf "$toc_level Contents\n\n$toc\n\n" | cat - "$contents" > tmp_out 21 | # mv tmp_out "$contents" 22 | # fi 23 | # } 24 | 25 | 26 | # echo -e "---\npermalink: /\ntitle: title\nwide: true\nsidebar:\n \nnav: docs\n---\n" > tmp_out 27 | 28 | echo -e 
"docs:\n - title: Commands\n children:\n" > $nav 29 | 30 | cat doc/_head.md > $main 31 | 32 | # generate command files 33 | 34 | printf "\n## Commands" >> $main 35 | 36 | cmd=( 37 | ">Basic conversion / editing" pass 38 | ">Information about sequences" view count stat 39 | ">Subsetting/shuffling sequences" sort unique filter split sample slice head tail interleave 40 | ">Searching and replacing" find replace 41 | ">Modifying commands" del set trim mask upper lower revcomp concat 42 | ) 43 | 44 | # create one MD file per command 45 | 46 | for c in "${cmd[@]}"; do 47 | echo "$c" 48 | 49 | if [[ "$c" = ">"* ]]; then 50 | # category name 51 | c=$(echo "$c" | cut -c2-) 52 | printf "\n### $c\n" >> $main 53 | continue 54 | fi 55 | 56 | out=$outdir/$c.md 57 | echo -n > $out 58 | 59 | opts=$(stty cols 80 && $seqtool "$c" -h 2>&1 | sed -n '/General options/q;p') 60 | desc=$(echo "$opts" | sed -n '/^ *$/q;p') 61 | 62 | # add command to overview 63 | echo "* **[$c](https://markschl.github.io/seqtool/$c)**: $desc" >> $main 64 | 65 | # add custom help content if file exists in doc dir 66 | desc_f=doc/$c.md 67 | if [ -f $desc_f ]; then 68 | echo "## Details" >> $out 69 | cat $desc_f >> $out 70 | fi 71 | 72 | # add variable help if present 73 | vars=$($seqtool $c --help-vars-md --help-cmd-vars 2>&1 || true) 74 | if [ ! -z "$vars" -a "$vars" != " " ]; then 75 | echo -e "$vars" | sed 's/^ *#/##/g' >> $out 76 | fi 77 | 78 | # prepend_toc $out '###' '##' 79 | 80 | # TODO: why prepend? 
81 | # prepend usage info 82 | # echo -e "---\npermalink: /$c/\ntitle: $c\ntoc: true\nsidebar:\n nav: docs\n---\n" > tmp_out 83 | usage=$(echo "$opts" | sed '/Usage:/,$!d') 84 | printf "$desc\n\n\`\`\`\n$usage\n\`\`\`\n\n[See this page](opts) for the options common to all commands.\n\n" | 85 | cat - $out >> tmp_out 86 | mv tmp_out $out 87 | 88 | echo -e " - title: $c\n url: /$c" >> $nav 89 | 90 | done 91 | 92 | 93 | echo >> $main 94 | cat doc/_desc.md >> $main 95 | 96 | # variables/functions 97 | out=$outdir/variables.md 98 | cp doc/variables.md $outdir/variables.md 99 | # full variables reference 100 | out=$outdir/var_reference.md 101 | $seqtool . --help-vars-md 2>&1 > $out 102 | prepend_toc $out '##' '##' 103 | mv $out tmp_out 104 | echo -e "\n# Variables/functions: full reference\n" > $out 105 | cat tmp_out >> $out 106 | rm tmp_out 107 | 108 | # args common to all commands 109 | out=$outdir/opts.md 110 | printf "\n\n### Options recognized by all commands\n\n" > $out 111 | echo "\`\`\`" >> $out 112 | stty cols 80 && $seqtool pass -h 2>1 | sed '/General options/,$!d' >> $out 113 | echo "\`\`\`" >> $out 114 | 115 | # other files 116 | 117 | # TODO: doc/expressions.md 118 | cp doc/meta.md doc/ranges.md doc/attributes.md $outdir 119 | -------------------------------------------------------------------------------- /src/cmd/concat.rs: -------------------------------------------------------------------------------- 1 | use std::iter::repeat_n; 2 | 3 | use clap::Parser; 4 | 5 | use crate::cli::{CommonArgs, WORDY_HELP}; 6 | use crate::config::Config; 7 | use crate::error::CliResult; 8 | use crate::io::OwnedRecord; 9 | 10 | pub const DESC: &str = "\ 11 | The sequence IDs must be in the same order in all files; 12 | Fails if the IDs don't match."; 13 | 14 | #[derive(Parser, Clone, Debug)] 15 | #[clap(next_help_heading = "'Concat' command options")] 16 | #[clap(before_help=DESC, help_template=WORDY_HELP)] 17 | pub struct ConcatCommand { 18 | /// Don't check if the IDs of 
the records from 19 | /// the different files match 20 | #[arg(short, long, short)] 21 | no_id_check: bool, 22 | 23 | /// Add a spacer of characters inbetween the concatenated 24 | /// sequences. 25 | #[arg(short, long, short)] 26 | spacer: Option, 27 | 28 | /// Character to use as spacer for sequences 29 | #[arg(short('c'), long, default_value = "N")] 30 | s_char: char, 31 | 32 | /// Character to use as spacer for qualities. 33 | /// Defaults to a phred score of 41 (Illumina 1.8+/Phred+33 encoding, which 34 | /// is the default assumed encoding). 35 | #[arg(short = 'Q', long, default_value = "J")] 36 | q_char: char, 37 | 38 | #[command(flatten)] 39 | pub common: CommonArgs, 40 | } 41 | 42 | pub fn run(mut cfg: Config, args: ConcatCommand) -> CliResult<()> { 43 | let id_check = !args.no_id_check; 44 | let spacer_n = args.spacer; 45 | let s_char = args.s_char as u8; 46 | let q_char = args.q_char as u8; 47 | 48 | let mut format_writer = cfg.get_format_writer()?; 49 | cfg.with_io_writer(|io_writer, mut cfg| { 50 | let mut record = OwnedRecord::default(); 51 | let num_readers = cfg.num_readers(); 52 | if num_readers == 0 { 53 | return fail!("Nothing to concatenate!"); 54 | } 55 | let max_idx = num_readers - 1; 56 | 57 | cfg.read_alongside(false, |i, rec, ctx| { 58 | if i == 0 { 59 | // initialize record 60 | record.update_header_from(rec); 61 | record.seq.clear(); 62 | } else if id_check && rec.id() != record.id.as_slice() { 63 | return fail!(format!( 64 | "ID of record #{} ({}) does not match the ID of the first one ({})", 65 | i + 1, 66 | String::from_utf8_lossy(rec.id()), 67 | String::from_utf8_lossy(&record.id) 68 | )); 69 | } 70 | 71 | // extend seq 72 | for s in rec.seq_segments() { 73 | record.seq.extend(s); 74 | } 75 | 76 | // handle qual 77 | if let Some(q) = rec.qual() { 78 | let qual = record.qual.get_or_insert_with(Vec::new); 79 | if i == 0 { 80 | qual.clear(); 81 | } 82 | qual.extend(q); 83 | } 84 | 85 | // spacer 86 | if let Some(n) = spacer_n { 87 | if i 
< max_idx { 88 | record.seq.extend(repeat_n(s_char, n)); 89 | if let Some(q) = record.qual.as_mut() { 90 | q.extend(repeat_n(q_char, n)); 91 | } 92 | } 93 | } 94 | 95 | // write at last 96 | if i == max_idx { 97 | // handle variables (read_alongside requires this to be done manually) 98 | ctx.set_record(&record, 0)?; 99 | format_writer.write(&record, io_writer, ctx)?; 100 | } 101 | Ok(true) 102 | }) 103 | }) 104 | } 105 | -------------------------------------------------------------------------------- /src/test/sample.rs: -------------------------------------------------------------------------------- 1 | extern crate rand; 2 | use rand::{distr::Uniform, prelude::*, seq::IteratorRandom}; 3 | 4 | use crate::cmd::sample::DefaultRng; 5 | 6 | use super::*; 7 | 8 | #[test] 9 | fn simple() { 10 | let input = tmp_file("sample_simple__", ".fasta", &FASTA); 11 | // very simple tests 12 | cmp(&["sample", "-n", "4"], &input, &FASTA); 13 | fails( 14 | &["sample", "-p", "2"], 15 | &input, 16 | "Fractions should be between 0 and 1", 17 | ); 18 | fails( 19 | &["sample", "-p", "1"], 20 | &input, 21 | "Fractions should be between 0 and 1", 22 | ); 23 | } 24 | 25 | #[test] 26 | fn large() { 27 | with_tmpdir("st_sample_large_", |td| { 28 | // RNGs and seeds 29 | // test with integer seed 30 | let seed1 = 602993; 31 | // string seed 32 | let seed2 = "ABCDEFGHIJKLMNOP"; 33 | let mut seed2_array = [0; 32]; 34 | (&mut seed2_array[..]).write_all(seed2.as_bytes()).unwrap(); 35 | let rngs: Vec<(String, Box DefaultRng>)> = vec![ 36 | ( 37 | format!("{seed1}"), 38 | Box::new(|| DefaultRng::seed_from_u64(seed1)), 39 | ), 40 | ( 41 | seed2.to_string(), 42 | Box::new(|| DefaultRng::from_seed(seed2_array)), 43 | ), 44 | ]; 45 | 46 | // input 47 | 48 | let n_records = 1000; 49 | let seqs: Vec<_> = (0..n_records).map(|i| format!(">{i}\nSEQ\n")).collect(); 50 | let fasta = seqs.join(""); 51 | 52 | let input = td.file(".fasta", &fasta); 53 | 54 | for (seed, get_rng) in &rngs { 55 | // test fixed 
number (-n); 56 | for n in [1, 10, 100, 500, 998, 1000] { 57 | // also test different memory limits to ensure that switching 58 | // from sampling whole records to indices only works 59 | for rec_limit in [1, 5, 10, 100, 200, 500, 800, 1000, 10000] { 60 | for two_pass in [false, true] { 61 | // expected output: 62 | // we use reservoir sampling implemented in the rand crate, 63 | // which is a way of validating our own reimplementation. 64 | let mut rng = get_rng(); 65 | let mut indices = (0..n_records).choose_multiple(&mut rng, n); 66 | indices.sort(); // results always in input order 67 | let expected = indices.into_iter().map(|i| seqs[i].clone()).join(""); 68 | // run sample command 69 | let mem_limit = rec_limit * n * 12; 70 | let n = format!("{n}"); 71 | let mem = format!("{mem_limit}"); 72 | let mut args = vec!["sample", "-n", &n, "-s", seed, "--max-mem", &mem]; 73 | if two_pass { 74 | args.push("-2"); 75 | } 76 | cmp(&args, &input, &expected); 77 | } 78 | } 79 | } 80 | 81 | // test probability sampling (-p); 82 | let distr = Uniform::new(0f32, 1.).unwrap(); 83 | for &p in &[0., 0.1, 0.3, 0.5, 0.7, 0.95] { 84 | let mut rng = get_rng(); 85 | let expected = seqs 86 | .iter() 87 | .filter(|&_| distr.sample(&mut rng) < p) 88 | .cloned() 89 | .join(""); 90 | 91 | cmp( 92 | &["sample", "-p", &format!("{p}"), "-s", seed], 93 | &input, 94 | &expected, 95 | ); 96 | } 97 | } 98 | }); 99 | } 100 | -------------------------------------------------------------------------------- /src/cmd/cmp/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::config::Config; 2 | use crate::error::CliResult; 3 | use crate::var::{modules::VarProvider, varstring::register_var_list}; 4 | 5 | mod cli; 6 | mod complete; 7 | mod in_order; 8 | mod output; 9 | mod vars; 10 | 11 | pub use self::cli::*; 12 | pub use self::output::*; 13 | pub use self::vars::*; 14 | 15 | #[derive(Debug, Copy, Clone, Eq, PartialEq)] 16 | pub enum Category { 17 | 
Common, 18 | Unique1, 19 | Unique2, 20 | } 21 | 22 | use self::Category::*; 23 | 24 | impl Category { 25 | fn long_text(self) -> &'static str { 26 | match self { 27 | Common => "common", 28 | Unique1 => "unique1", 29 | Unique2 => "unique2", 30 | } 31 | } 32 | 33 | fn short_text(self) -> &'static str { 34 | match self { 35 | Common => "c", 36 | Unique1 => "u1", 37 | Unique2 => "u2", 38 | } 39 | } 40 | } 41 | 42 | #[derive(Debug, Default, Copy, Clone)] 43 | struct CmpStats { 44 | pub common: u64, 45 | pub unique1: u64, 46 | pub unique2: u64, 47 | } 48 | 49 | /// Factor for adjusting the calculated memory usage (based on size of items) 50 | /// to obtain the approximately correct total memory usage. 51 | /// It corrects for the extra memory that may not be in the calculation otherwise. 52 | static MEM_OVERHEAD: f32 = 1.1; 53 | 54 | pub fn run(mut cfg: Config, mut args: CmpCommand) -> CliResult<()> { 55 | let quiet = args.common.general.quiet; 56 | let two_pass = args.two_pass; 57 | let max_mem = (args.max_mem as f32 / MEM_OVERHEAD) as usize; 58 | 59 | // register variables/functions: 60 | // tuples of (varstring, text buffer) 61 | cfg.set_custom_varmodule(Box::::default())?; 62 | 63 | let mut var_key = Vec::with_capacity(1); 64 | cfg.build_vars(|b| { 65 | for key in &args.key { 66 | register_var_list(key.as_ref(), b, &mut var_key, None, true, true)?; 67 | } 68 | Ok::<_, String>(()) 69 | })?; 70 | 71 | let diff_fields = args 72 | .diff 73 | .as_ref() 74 | .map(|fields| { 75 | let mut vs = Vec::with_capacity(1); 76 | cfg.build_vars(|b| { 77 | for f in fields { 78 | register_var_list(f, b, &mut vs, None, true, true)?; 79 | } 80 | Ok::<_, String>(()) 81 | })?; 82 | Ok::<_, String>(vs) 83 | }) 84 | .transpose()?; 85 | let diff_writer = diff_fields.map(|fields| DiffWriter::new(fields, args.diff_width)); 86 | 87 | let mut out = Output::from_args(&mut args, &mut cfg)?; 88 | 89 | cfg.with_custom_varmod(|v: &mut CmpVars| { 90 | if out.has_combined_output() && !v.has_vars() { 91 
| return fail!( 92 | "Specified mixed output in 'cmp' command ' -o/--output/--output2', \ 93 | but no variables are used to distinguish records. Please specify \ 94 | one of `category`, `category_short` or `key`, or specify unique \ 95 | output instead (--unique1/--unique2)." 96 | ); 97 | } 98 | Ok::<_, String>(()) 99 | })?; 100 | 101 | let stats = if args.in_order { 102 | in_order::cmp_in_order(&mut cfg, &var_key, &mut out, diff_writer, max_mem)? 103 | } else { 104 | complete::cmp_complete( 105 | &mut cfg, 106 | var_key, 107 | &mut out, 108 | diff_writer, 109 | max_mem, 110 | two_pass, 111 | quiet, 112 | )? 113 | }; 114 | if !quiet { 115 | eprintln!( 116 | "common\t{}\nunique1\t{}\nunique2\t{}", 117 | stats.common, stats.unique1, stats.unique2 118 | ); 119 | } 120 | if args.check && (stats.unique1 > 0 || stats.unique2 > 0) { 121 | return fail!("Not an exact match"); 122 | } 123 | Ok(()) 124 | } 125 | -------------------------------------------------------------------------------- /scripts/summarize_comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import re 5 | 6 | 7 | def gen_summary(json_input, md_out): 8 | from html import escape 9 | # def cnv_breaks(s): 10 | # return re.sub(r'[\r\n]', '
', s, re.DOTALL) 11 | 12 | def fmt_bench(d): 13 | return '{}{}
{}'.format( 14 | format_time(d), 15 | ' {:.0f}% CPU'.format(d['cpu']) if abs(100 - d['cpu']) > 5 else '', 16 | format_mem(d), 17 | ) 18 | 19 | def find_best(d): 20 | runs = [d['st']] + list(d.get('other', {}).values()) 21 | if len(runs) > 1: 22 | times = sorted((r['elapsed'], i) for i, r in enumerate(runs)) 23 | if times[0][0] > 0: 24 | runs[times[0][1]]['fastest'] = times[1][0] / times[0][0] 25 | mem = sorted((r['max_mib'], i) for i, r in enumerate(runs)) 26 | if mem[0][0] > 0: 27 | runs[mem[0][1]]['lowest_mem'] = mem[1][0] / mem[0][0] 28 | 29 | def format_time(d): 30 | f = d.get('fastest') 31 | if f is not None: 32 | return '🕓 {:.1f} s 🏆 ({:.1f}x)'.format(d['elapsed'], f) 33 | return '🕓 {:.1f} s'.format(d['elapsed']) 34 | 35 | def format_mem(d): 36 | f = d.get('lowest_mem') 37 | if f is not None: 38 | return '📈 {:.1f} MiB 🏆 ({:.2f}x)'.format(d['max_mib'], f) 39 | return '📈 {:.1f} MiB'.format(d['max_mib']) 40 | 41 | def fmt_output(d): 42 | strip_newlines = lambda msg: re.sub(r'(?ms:[\r\n\s]+$)', '', msg) 43 | out = '' 44 | if d['stdout']: 45 | # TODO: get this to work: https://github.com/squidfunk/mkdocs-material/issues/4964 46 | out += '
🟦 output\n\n```\n{}\n```\n\n
\n'.format(strip_newlines(d['stdout'])) 47 | if d['stderr']: 48 | out += '
 messages\n\n```\n{}\n```\n\n
\n'.format(strip_newlines(d['stderr'])) 49 | return out 50 | 51 | for command, comparisons in json.load(json_input).items(): 52 | md_out.write('## {}\n'.format(command)) 53 | md_out.write('\n\n') 54 | for comparison, d in comparisons.items(): 55 | find_best(d) 56 | st = d['st'] 57 | md_out.write('\n\n\n'.format(escape(d.get('description', comparison)))) 58 | md_out.write('\n\n\n\n\n\n'.format(fmt_bench(st))) 79 | md_out.write('
\n\n{}\n\n\n\n```bash\n{}\n```\n\n{}\n'.format(st['cmd'], fmt_output(st))) 59 | if 'other' in d and d['other']: 60 | md_out.write('
{}\n\n\n'.format( 61 | "  ❙ ".join('{} {}'.format( 62 | escape(tool), format_time(o), 63 | ) 64 | for tool, o in d['other'].items()) 65 | ) 66 | ) 67 | for tool, o in d['other'].items(): 68 | code = ''.format( 69 | o['cmd'].replace('\n', ' '), 70 | fmt_output(o) 71 | ) 72 | md_out.write('\n\n\n\n{}\n\n\n\n\n\n'.format( 73 | escape(tool), 74 | code, 75 | fmt_bench(o) 76 | )) 77 | md_out.write('
\n\n```bash\n{}\n```\n\n{}
{}{}
\n\n
\n\n') 78 | md_out.write('
{}
\n\n') 80 | 81 | if __name__ == '__main__': 82 | import argparse 83 | 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument('json_input', type=argparse.FileType('r')) 86 | parser.add_argument('md_out', type=argparse.FileType('w')) 87 | # parser.add_argument('-m', '--main-only', action='store_true') 88 | args = parser.parse_args() 89 | 90 | gen_summary(**vars(args)) 91 | -------------------------------------------------------------------------------- /src/var/modules/expr/var_provider.rs: -------------------------------------------------------------------------------- 1 | use var_provider::{dyn_var_provider, DynVarProviderInfo, VarType}; 2 | use variable_enum_macro::variable_enum; 3 | 4 | use crate::io::{QualConverter, Record}; 5 | use crate::var::{ 6 | attr::Attributes, modules::VarProvider, parser::Arg, symbols::SymbolTable, VarBuilder, 7 | }; 8 | 9 | use super::code_or_file; 10 | use super::js::{parser::Expression, JsExpr}; 11 | 12 | type Expressions = super::expressions::Expressions; 13 | 14 | variable_enum! { 15 | /// # Expressions (JavaScript) 16 | /// 17 | /// Expressions with variables, from simple mathematical operations to 18 | /// arbitrarily complex JavaScript code. 19 | /// 20 | /// Expressions are always enclosed in { curly brackets }. These brackets 21 | /// are optional for simple variables/functions in some cases, 22 | /// but mandatory for expressions. 23 | /// In addition, the 'filter' command takes an expression (without { brackets }). 24 | /// 25 | /// 26 | /// Instead of JavaScript code, it is possible to refer to a source file 27 | /// using 'file:path.js'. 28 | /// 29 | /// 30 | /// *Returned value*: For simple one-liner expressions, the value is 31 | /// directly used. 32 | /// More complex scripts with multiple statements (if/else, loops, etc.) 33 | /// explicitly require a `return` statement to return the value. 
34 | /// 35 | /// 36 | /// # Examples 37 | /// 38 | /// Calculate the number of ambiguous bases in a set of DNA sequences and 39 | /// add the result as an attribute (ambig=...) to the header 40 | /// 41 | /// `st pass -a ambig='{seqlen - charcount("ACGT")}' seqs.fasta` 42 | /// 43 | /// >id1 ambig=3 44 | /// TCNTTAWTAACCTGATTAN 45 | /// >id2 ambig=0 46 | /// GGAGGATCCGAGCG 47 | /// (...) 48 | /// 49 | /// 50 | /// Discard sequences with >1% ambiguous bases or sequences shorter than 100bp 51 | /// 52 | /// `st filter 'charcount("ACGT") / seqlen >= 0.99 && seqlen >= 100' seqs.fasta` 53 | /// 54 | /// 55 | /// Distribute sequences into different files by a slightly complicated condition. 56 | /// Note that the 'return' statements are necessary here, since this is not a simple expression. 57 | /// With even longer code, consider using an extra script and supplying 58 | /// -o "outdir/{file:code.js}.fasta" instead 59 | /// 60 | /// `st split -po "outdir/{ if (id.startsWith('some_prefix_')) { return 'file_1' } return 'file_2' }.fasta" input.fasta` 61 | /// 62 | /// There should be two files now (`ls file_*.fasta`): 63 | /// file_1.fasta 64 | /// file_2.fasta 65 | ExprVar<'a> { 66 | #[hidden] 67 | ____Expr(?) { expr: Expression<'a> }, 68 | } 69 | } 70 | 71 | #[derive(Debug)] 72 | pub struct ExprVars(Expressions); 73 | 74 | impl ExprVars { 75 | pub fn new(init_code: Option<&str>) -> Result { 76 | let init_code = init_code 77 | .map(|c| Ok::<_, String>(code_or_file(c)?.to_string())) 78 | .transpose()?; 79 | Ok(Self(Expressions::new(init_code.as_deref())?)) 80 | } 81 | } 82 | 83 | impl VarProvider for ExprVars { 84 | fn info(&self) -> &dyn DynVarProviderInfo { 85 | &dyn_var_provider!(ExprVar) 86 | } 87 | 88 | fn register( 89 | &mut self, 90 | name: &str, 91 | args: &[Arg], 92 | builder: &mut VarBuilder, 93 | ) -> Result)>, String> { 94 | if let Some((var, _)) = ExprVar::from_func(name, args)?
{ 95 | let ExprVar::____Expr { expr } = var; 96 | return expr 97 | .with_tree(|ast| Ok(Some(self.0.register_expr(ast, builder)?))) 98 | .and_then(|res| res); 99 | } 100 | Ok(None) 101 | } 102 | 103 | fn has_vars(&self) -> bool { 104 | self.0.num_exprs() > 0 105 | } 106 | 107 | fn set_record( 108 | &mut self, 109 | record: &dyn Record, 110 | symbols: &mut SymbolTable, 111 | _: &Attributes, 112 | _: &mut QualConverter, 113 | ) -> Result<(), String> { 114 | self.0.eval(symbols, record) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/helpers/value.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::io; 3 | use std::mem; 4 | 5 | use deepsize::{Context, DeepSizeOf}; 6 | 7 | use crate::helpers::NA; 8 | use crate::io::Record; 9 | use crate::var::symbols::{OptValue, Value}; 10 | 11 | use super::number::{Float, Interval}; 12 | 13 | /// A simple value type that can be either text, numeric, boolean, interval or undefined/none. 14 | /// Can also be serialized using rkyv (only enabled for sort and unique commands). 15 | /// 16 | /// This type is simpler than the Value type in the symbol table, which often have 17 | /// additional information stored/allocated. 18 | /// Another difference: SimpleValue does not have an integer type, any number will 19 | /// thus behave the same (as float). This is important when sorting/hashing. 
20 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] 21 | #[cfg_attr( 22 | any(feature = "all-commands", feature = "sort", feature = "unique"), 23 | derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize), 24 | archive(compare(PartialEq), check_bytes) 25 | )] 26 | pub enum SimpleValue { 27 | Text(Box<[u8]>), 28 | Number(Float), 29 | Boolean(bool), 30 | Interval(Interval), 31 | None, 32 | } 33 | 34 | impl SimpleValue { 35 | #[inline] 36 | pub fn write(&self, writer: &mut W) -> io::Result<()> { 37 | use SimpleValue::*; 38 | match self { 39 | Text(v) => writer.write_all(v), 40 | Number(v) => write!(writer, "{v}"), 41 | Boolean(v) => write!(writer, "{v}"), 42 | Interval(i) => write!(writer, "{i}"), 43 | None => writer.write_all(NA.as_bytes()), 44 | } 45 | } 46 | 47 | #[inline] 48 | pub fn to_symbol(&self, sym: &mut OptValue) { 49 | use SimpleValue::*; 50 | match self { 51 | Text(t) => sym.inner_mut().set_text(t), 52 | Number(n) => sym.inner_mut().set_float(n.inner()), 53 | Boolean(b) => sym.inner_mut().set_bool(*b), 54 | Interval(i) => sym.inner_mut().set_interval(*i), 55 | None => sym.set_none(), 56 | } 57 | } 58 | 59 | #[inline] 60 | pub fn replace_from_symbol( 61 | &mut self, 62 | sym: &OptValue, 63 | rec: &dyn Record, 64 | text_buf: &mut Vec, 65 | ) { 66 | if let SimpleValue::Text(t) = self { 67 | // If present, take the text buffer from SimpleValue. 68 | // If `text_buf` is already non-empty (allocated), this allocation 69 | // will be lost. But it is assumed that the allocation is always 70 | // either referenced by SimpleValue::Text() or by `text_buf`, never 71 | // both. 
72 | *text_buf = mem::take(t).into_vec(); 73 | } 74 | *self = if let Some(v) = sym.inner() { 75 | match v { 76 | Value::Text(_) | Value::Attr(_) => { 77 | v.as_text(rec, |t| { 78 | text_buf.clear(); 79 | text_buf.extend_from_slice(t); 80 | Ok::<(), ()>(()) 81 | }) 82 | .unwrap(); 83 | SimpleValue::Text(mem::take(text_buf).into_boxed_slice()) 84 | } 85 | Value::Int(v) => SimpleValue::Number(Float::new(*v.get() as f64)), 86 | Value::Float(v) => SimpleValue::Number(Float::new(*v.get())), 87 | Value::Interval(v) => SimpleValue::Interval(*v.get()), 88 | Value::Bool(v) => SimpleValue::Boolean(*v.get()), 89 | } 90 | } else { 91 | SimpleValue::None 92 | }; 93 | } 94 | } 95 | 96 | // Dispaly implementation only for error messages 97 | impl fmt::Display for SimpleValue { 98 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 99 | let mut buf = Vec::new(); 100 | self.write(&mut buf).unwrap(); 101 | f.write_str(&String::from_utf8_lossy(&buf)) 102 | } 103 | } 104 | 105 | impl DeepSizeOf for SimpleValue { 106 | fn deep_size_of_children(&self, _: &mut Context) -> usize { 107 | if let SimpleValue::Text(v) = self { 108 | return v.len(); 109 | } 110 | 0 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/cmd/sort/mod.rs: -------------------------------------------------------------------------------- 1 | use std::env::temp_dir; 2 | use std::io::Write; 3 | use std::path::Path; 4 | 5 | use crate::config::Config; 6 | use crate::error::CliResult; 7 | use crate::helpers::vec_buf::VecFactory; 8 | use crate::var::varstring::register_var_list; 9 | 10 | use super::shared::tmp_store::{Item, Key}; 11 | 12 | pub mod cli; 13 | pub mod file; 14 | pub mod mem; 15 | pub mod vars; 16 | 17 | pub use self::cli::*; 18 | pub use self::file::*; 19 | pub use self::mem::*; 20 | pub use self::vars::*; 21 | 22 | /// Factor for adjusting the calculated memory usage (based on size of items) 23 | /// to obtain the approximately correct total 
memory usage. 24 | /// It corrects for the extra memory used by Vec::sort() and other allocations 25 | /// that may not be in the calculation otherwise. 26 | /// (factor found by memory profiling on Linux) 27 | static MEM_OVERHEAD: f32 = 1.1; 28 | 29 | pub fn run(mut cfg: Config, args: SortCommand) -> CliResult<()> { 30 | let verbose = args.common.general.verbose; 31 | let quiet = args.common.general.quiet; 32 | let max_mem = (args.max_mem as f32 / MEM_OVERHEAD) as usize; 33 | // TODO: not activated, since we use a low limit for testing 34 | // if args.max_mem < 1 << 22 { 35 | // return fail!("The memory limit should be at least 2MiB"); 36 | // } 37 | let mut record_buf_factory = VecFactory::new(); 38 | let tmp_path = args.temp_dir.clone().unwrap_or_else(temp_dir); 39 | let mut sorter = Sorter::new(args.reverse, max_mem); 40 | 41 | cfg.set_custom_varmodule(Box::::default())?; 42 | 43 | let mut format_writer = cfg.get_format_writer()?; 44 | 45 | cfg.with_io_writer(|io_writer, mut cfg| { 46 | // assemble key 47 | let mut varstring_keys = Vec::with_capacity(1); 48 | cfg.build_vars(|b| register_var_list(&args.key, b, &mut varstring_keys, None, true, true))?; 49 | let mut key_values = Key::with_size(varstring_keys.len()); 50 | let mut text_buf = vec![Vec::new(); varstring_keys.len()]; 51 | 52 | cfg.read(|record, ctx| { 53 | // assemble key 54 | key_values.compose_from(&varstring_keys, &mut text_buf, ctx.symbols(), record)?; 55 | ctx.with_custom_varmod(0, |m: &mut SortVars, sym| m.set(&key_values, sym)); 56 | // write formatted record to a buffer 57 | let record_out = 58 | record_buf_factory.get(|out| format_writer.write(&record, out, ctx))?; 59 | // add both to the object that handles the sorting 60 | sorter.add( 61 | Item::new(key_values.clone(), record_out.into_boxed_slice()), 62 | &tmp_path, 63 | args.temp_file_limit, 64 | quiet, 65 | )?; 66 | Ok(true) 67 | })?; 68 | // write sorted output 69 | sorter.write(io_writer, quiet, verbose) 70 | }) 71 | } 72 | 73 | 
#[derive(Debug)] 74 | enum Sorter { 75 | Mem(MemSorter), 76 | File(FileSorter), 77 | } 78 | 79 | impl Sorter { 80 | fn new(reverse: bool, max_mem: usize) -> Self { 81 | Self::Mem(MemSorter::new(reverse, max_mem)) 82 | } 83 | 84 | fn add( 85 | &mut self, 86 | item: Item>, 87 | tmp_path: &Path, 88 | file_limit: usize, 89 | quiet: bool, 90 | ) -> CliResult<()> { 91 | match self { 92 | Self::Mem(m) => { 93 | if !m.add(item) { 94 | if !quiet { 95 | eprintln!( 96 | "Memory limit reached after {} records, writing to temporary file(s). \ 97 | Consider raising the limit (-M/--max-mem) to speed up sorting. \ 98 | Use -q/--quiet to silence this message.", 99 | m.len() 100 | ); 101 | } 102 | let mut f = m.get_file_sorter(tmp_path.to_owned(), file_limit)?; 103 | f.write_to_file(quiet)?; 104 | *self = Self::File(f); 105 | } 106 | } 107 | Self::File(f) => { 108 | f.add(item, quiet)?; 109 | } 110 | } 111 | Ok(()) 112 | } 113 | 114 | fn write(&mut self, io_writer: &mut dyn Write, quiet: bool, verbose: bool) -> CliResult<()> { 115 | match self { 116 | Self::Mem(m) => m.write_sorted(io_writer), 117 | Self::File(f) => f.write_records(io_writer, quiet, verbose), 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/test/cmp.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const FA1: &str = "\ 4 | 1,AAA 5 | 2,AAA 6 | 3,CCC 7 | 5,CCC 8 | 7,TTT 9 | 8,ATG 10 | 9,TGA 11 | "; 12 | 13 | const FA2: &str = "\ 14 | 1,AAA 15 | 3,CCC 16 | 4,CCC 17 | 5,CCC 18 | 6,TTT 19 | 8,GGG 20 | 10,GAT 21 | "; 22 | 23 | const STATS: &str = "\ 24 | common\t3 25 | unique1\t4 26 | unique2\t4 27 | "; 28 | 29 | const STATS_ID: &str = "\ 30 | common\t4 31 | unique1\t3 32 | unique2\t3 33 | "; 34 | 35 | const CATEGORY1: &str = "\ 36 | 1,AAA,common 37 | 2,AAA,unique1 38 | 3,CCC,common 39 | 5,CCC,common 40 | 7,TTT,unique1 41 | 8,ATG,unique1 42 | 9,TGA,unique1 43 | "; 44 | 45 | const CATEGORY2: &str 
/// `cmp` with separate output files for common/unique records, first keyed
/// by ID + sequence (the default), then by ID only.
#[test]
fn cmp_() {
    with_tmpdir("st_cmp_", |td| {
        let out_common1 = td.path("cmp_common1.csv");
        let out_common2 = td.path("cmp_common2.csv");
        let out_unique1 = td.path("cmp_unique1.csv");
        let out_unique2 = td.path("cmp_unique2.csv");

        let input = td.multi_file(".csv", [FA1, FA2]);

        // compare by ID and sequence
        let args = &[
            "cmp",
            "--csv",
            "id,seq",
            "--common1",
            &out_common1,
            "--common2",
            &out_common2,
            "--unique1",
            &out_unique1,
            "--unique2",
            &out_unique2,
        ];
        cmd(args, &input).stderr(STATS);
        assert_eq!(out_common1.content(), COMMON);
        assert_eq!(out_common2.content(), COMMON);
        assert_eq!(out_unique1.content(), UNIQUE1);
        assert_eq!(out_unique2.content(), UNIQUE2);

        // compare by ID only
        let args = &[
            "cmp",
            "-k",
            "id",
            "--csv",
            "id,seq",
            "--common1",
            &out_common1,
            "--common2",
            &out_common2,
            "--unique1",
            &out_unique1,
            "--unique2",
            &out_unique2,
        ];
        cmd(args, &input).stderr(STATS_ID);
        assert_eq!(out_common1.content(), COMMON_ID1);
        assert_eq!(out_common2.content(), COMMON_ID2);
        assert_eq!(out_unique1.content(), UNIQUE_ID1);
        assert_eq!(out_unique2.content(), UNIQUE_ID2);
    });
}

/// `cmp` writing all records of each input to one output file, with the
/// `category` variable appended as an extra CSV column.
#[test]
fn cmp_category() {
    with_tmpdir("st_cmp_category_", |td| {
        let out1 = td.path("cmp_cat1.csv");
        let out2 = td.path("cmp_cat2.csv");

        let input = td.multi_file(".csv", [FA1, FA2]);

        // compare by ID and sequence
        let args = &[
            "cmp",
            "--csv",
            "id,seq",
            "--to-csv",
            "id,seq,category",
            "-o",
            &out1,
            "--output2",
            &out2,
        ];
        cmd(args, &input).stderr(STATS);
        assert_eq!(out1.content(), CATEGORY1);
        assert_eq!(out2.content(), CATEGORY2);

        // compare by ID only
        let args = &[
            "cmp",
            "-k",
            "id",
            "--csv",
            "id,seq",
            "--to-csv",
            "id,seq,category",
            "-o",
            &out1,
            "--output2",
            &out2,
        ];
        cmd(args, &input).stderr(STATS_ID);
        assert_eq!(out1.content(), CATEGORY_ID1);
        assert_eq!(out2.content(), CATEGORY_ID2);
    });
}
"all-commands", feature = "sort", feature = "unique"), 17 | derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize), 18 | archive(compare(PartialEq), check_bytes) 19 | )] 20 | pub enum Key { 21 | Single(SimpleValue), 22 | // This saves time with two values per key (but appears to make >2 values slower) 23 | // TODO: activating this comes with the tradeoff of increased memory usage 24 | // Two([SimpleValue; 2]), 25 | Multiple(Box<[SimpleValue]>), 26 | } 27 | 28 | impl Key { 29 | pub fn with_size(key_size: usize) -> Self { 30 | match key_size { 31 | 0 => panic!(), 32 | 1 => Self::Single(SimpleValue::None), 33 | // 2 => Self::Two([SimpleValue::None, SimpleValue::None]), 34 | _ => Self::Multiple(vec![SimpleValue::None; key_size].into_boxed_slice()), 35 | } 36 | } 37 | 38 | pub fn as_slice(&self) -> &[SimpleValue] { 39 | match self { 40 | Self::Single(v) => std::slice::from_ref(v), 41 | // Self::Two(v) => v, 42 | Self::Multiple(v) => v, 43 | } 44 | } 45 | 46 | pub fn compose_from( 47 | &mut self, 48 | varstrings: &[VarString], 49 | key_buf: &mut [Vec], 50 | symbols: &SymbolTable, 51 | record: &dyn Record, 52 | ) -> Result<(), String> { 53 | match self { 54 | Key::Single(v) => { 55 | debug_assert!(varstrings.len() == 1 && key_buf.len() == 1); 56 | varstrings[0].simple_value(v, &mut key_buf[0], symbols, record)? 
57 | } 58 | // Key::Two(v) => { 59 | // debug_assert!(varstrings.len() == 2 && key_buf.len() == 2); 60 | // for i in 0..2 { 61 | // varstrings[i].into_simple( 62 | // &mut v[i], 63 | // &mut key_buf[i], 64 | // symbols, 65 | // record, 66 | // force_numeric, 67 | // )?; 68 | // } 69 | // } 70 | Key::Multiple(values) => { 71 | debug_assert!(varstrings.len() == values.len() && key_buf.len() == values.len()); 72 | for ((vs, key_buf), val) in varstrings 73 | .iter() 74 | .zip(key_buf.iter_mut()) 75 | .zip(values.iter_mut()) 76 | { 77 | vs.simple_value(val, key_buf, symbols, record)?; 78 | } 79 | } 80 | } 81 | Ok(()) 82 | } 83 | 84 | #[inline] 85 | pub fn write_delimited( 86 | &self, 87 | writer: &mut W, 88 | sep: &[u8], 89 | ) -> io::Result<()> { 90 | write_list_with(self.iter(), sep, writer, |v, o| v.write(o))?; 91 | Ok(()) 92 | } 93 | 94 | pub fn write_to_symbol(&self, sym: &mut OptValue) { 95 | match self { 96 | Key::Single(v) => v.to_symbol(sym), 97 | // Key::Two(v) => { 98 | // let text = sym.inner_mut().mut_text(); 99 | // write_list_with(v, b",", text, |v, o| v.write(o)).unwrap(); 100 | // } 101 | Key::Multiple(values) => { 102 | let text = sym.inner_mut().mut_text(); 103 | write_list_with(values.iter(), b",", text, |v, o| v.write(o)).unwrap(); 104 | } 105 | } 106 | } 107 | } 108 | 109 | // for error messages 110 | impl fmt::Display for Key { 111 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 112 | for (i, k) in self.as_slice().iter().enumerate() { 113 | if i > 0 { 114 | write!(f, ",")?; 115 | } 116 | write!(f, "{k}")?; 117 | } 118 | Ok(()) 119 | } 120 | } 121 | 122 | impl Deref for Key { 123 | type Target = [SimpleValue]; 124 | fn deref(&self) -> &Self::Target { 125 | match self { 126 | Self::Single(v) => std::slice::from_ref(v), 127 | Self::Multiple(v) => v, 128 | } 129 | } 130 | } 131 | 132 | impl DerefMut for Key { 133 | fn deref_mut(&mut self) -> &mut Self::Target { 134 | match self { 135 | Self::Single(v) => std::slice::from_mut(v), 136 | 
Self::Multiple(v) => v, 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/helpers/seqtype.rs: -------------------------------------------------------------------------------- 1 | use crate::io::Record; 2 | 3 | use bio::alphabets::{dna, protein, rna}; 4 | use clap::ValueEnum; 5 | use strum_macros::{Display, EnumString}; 6 | 7 | use SeqType::*; 8 | 9 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Display, EnumString, ValueEnum)] 10 | pub enum SeqType { 11 | #[allow(clippy::upper_case_acronyms)] 12 | DNA, 13 | #[allow(clippy::upper_case_acronyms)] 14 | RNA, 15 | Protein, 16 | Other, 17 | } 18 | 19 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] 20 | pub struct SeqTypeInfo { 21 | pub seqtype: SeqType, 22 | /// has DNA/RNA or protein wildcards (N/X) 23 | pub has_wildcard: bool, 24 | /// has IUPAC ambiguities 25 | pub has_ambiguities: bool, 26 | } 27 | 28 | impl SeqTypeInfo { 29 | pub fn new(ty: SeqType, has_wildcard: bool, has_ambiguities: bool) -> Self { 30 | Self { 31 | seqtype: ty, 32 | has_ambiguities, 33 | has_wildcard, 34 | } 35 | } 36 | } 37 | 38 | // For excluding certain characters when running recognition 39 | fn filter_iter(text: &[u8]) -> impl Iterator { 40 | text.iter() 41 | .filter(|&s| !matches!(s, b'-' | b'.' | b'?' | b' ')) 42 | } 43 | 44 | /// Returns information about the sequence type. In case a type hint is provided, 45 | /// the sequence is still checked for ambiguities and wildcards. 46 | /// Returns Err(typehint) if the type hint does not match the actual sequence. 
47 | pub fn guess_seqtype(text: &[u8], hint: Option) -> Result { 48 | match hint { 49 | Some(DNA) => guess_dna(text).ok_or(DNA), 50 | Some(RNA) => guess_rna(text).ok_or(RNA), 51 | Some(Protein) => guess_protein(text).ok_or(Protein), 52 | Some(Other) => Ok(SeqTypeInfo::new(Other, false, false)), 53 | None => Ok(guess_dna(text) 54 | .or_else(|| guess_rna(text)) 55 | .or_else(|| guess_protein(text)) 56 | .unwrap_or(SeqTypeInfo::new(Other, false, false))), 57 | } 58 | } 59 | 60 | pub fn guess_seqtype_or_fail( 61 | text: &[u8], 62 | hint: Option, 63 | allow_other: bool, 64 | ) -> Result { 65 | let info = guess_seqtype(text, hint).map_err(|hint| { 66 | format!( 67 | "The sequence type '{hint}' provided with `--seqtype` does not appear to be valid \ 68 | for the given sequence. Please make sure that only valid characters are used and \ 69 | note that only standard ambiguities according to IUPAC are recognized \ 70 | (e.g. see https://bioinformatics.org/sms/iupac.html)." 71 | ) 72 | })?; 73 | if !allow_other && info.seqtype == Other { 74 | return Err("Could not guess sequence type, please provide with `--seqtype`".to_string()); 75 | } 76 | Ok(info) 77 | } 78 | 79 | pub fn guess_dna(text: &[u8]) -> Option { 80 | if dna::alphabet().is_word(filter_iter(text)) { 81 | Some(SeqTypeInfo::new(DNA, false, false)) 82 | } else if dna::n_alphabet().is_word(filter_iter(text)) { 83 | Some(SeqTypeInfo::new(DNA, true, false)) 84 | } else if dna::iupac_alphabet().is_word(filter_iter(text)) { 85 | Some(SeqTypeInfo::new(DNA, true, true)) 86 | } else { 87 | None 88 | } 89 | } 90 | 91 | pub fn guess_rna(text: &[u8]) -> Option { 92 | if rna::alphabet().is_word(filter_iter(text)) { 93 | Some(SeqTypeInfo::new(RNA, false, false)) 94 | } else if rna::n_alphabet().is_word(filter_iter(text)) { 95 | Some(SeqTypeInfo::new(RNA, true, false)) 96 | } else if rna::iupac_alphabet().is_word(filter_iter(text)) { 97 | Some(SeqTypeInfo::new(RNA, true, true)) 98 | } else { 99 | None 100 | } 101 | } 102 | 103 | 
#!/bin/sh

# Timing comparison of seqtool (`st`) against other sequence tools.
# Usage: time.sh <fastq> <primer_seq1> <primer_seq2>
# All timings (and the command trace from `set -x`) end up in timing.txt.
#
# NOTE(review): despite the shebang, the script relies on `${@:2}` and on
# `time` being a shell keyword (so that aliases and pipelines can be timed),
# i.e. it presumably needs /bin/sh to be bash -- confirm before running
# on a system where sh is dash.

# FASTQ file
f=$1
# primer seqs. for searching
seq1=$2
seq2=$3


alias s=target/release/st

# prepare
# s . -a gc={s:gc} $f > $f.with_gc.fq
# s . --qual-out $f.qual --to-fa $f > /dev/null
# s . --to-fa $f > $f.fa
# gzip -k $f
# lz4 -k $f
# bzip2 -k $f
# zstd -k $f

# load files into memory
s count $f $f.* -k filename

logfile=timing.txt
exec > $logfile 2>&1
set -x

# conversion
time s . --to-fa $f > /dev/null
time s . --to fastq-illumina $f > /dev/null
time s . --qual $f.qual $f.fa > /dev/null
time s . --to-fq --qual $f.qual $f.fa > /dev/null
time read_fastq -i $f -e base_33 | write_fasta -x > /dev/null
time cat $f | fastq_to_fasta -Q33 > /dev/null
time fastq_to_fasta -Q33 -i $f > /dev/null
time seqtk seq -A $f > /dev/null
time seqkit fq2fa $f > /dev/null
time seqkit convert --from 'Sanger' --to 'Illumina-1.3+' $f > /dev/null

# random subsampling
time s sample -f 0.1 $f > /dev/null
time seqtk sample $f 0.1 > /dev/null
time seqkit sample -p 0.1 $f > /dev/null

# counting
time s count $f
time read_fastq -i $f -e base_33 | count_records -x
time wc -l $f

# reverse complement (note, qualities are only reversed by seqtool)
time fastx_reverse_complement -i $f -Q33 > /dev/null
time read_fastq -i $f -e base_33 | reverse_seq | complement_seq | write_fastq -x > /dev/null
time s revcomp $f > /dev/null
time s revcomp -t4 $f > /dev/null
time seqtk seq -r $f > /dev/null
time seqkit seq -rp $f > /dev/null

# compress
time s . $f > /dev/null
time s . $f --to fastq.lz4 > /dev/null
time s . $f | lz4 -c > /dev/null
time s . $f --to fastq.gz > /dev/null
time s . $f | gzip -c > /dev/null

# decompress
time s . $f.lz4 > /dev/null
time lz4 -dc $f.lz4 | s . --fq > /dev/null
time s . $f.gz > /dev/null
time gzip -dc $f.gz | s . --fq > /dev/null
time seqtk seq $f.gz > /dev/null
# read the decompressed stream from STDIN ("-"); with the file name as
# argument, seqtk would ignore the pipe and decompress the file itself
time gzip -dc $f.gz | seqtk seq - > /dev/null

# DNA -> RNA
time s replace T U $f > /dev/null
time s replace T U $f -t4 > /dev/null
time s find T --rep U $f > /dev/null
time s find T --rep U $f -t4 > /dev/null
time seqkit seq --dna2rna $f > /dev/null
time read_fastq -i $f -e base_33 | transliterate_vals -k SEQ -s T -r U | write_fastq -x > /dev/null
time fasta_nucleotide_changer -i $f -Q33 -r > /dev/null

# GC content "histogram"
time s count -k n:10:{s:gc} $f

# from variable
time s count -k n:10:{a:gc} $f.with_gc.fq

# with expression
time s count -k n:.1:{{s:gc/100}} $f

# filter by length
time s filter 's:seqlen >= 100' $f > /dev/null
time seqtk seq -L 100 $f > /dev/null
time seqkit seq -m 100 $f > /dev/null
time read_fasta -i $f | grab -e 'SEQ_LEN >= 100' | write_fasta -x > /dev/null

# filter by quality
time s filter 's:exp_err < 1' $f --to-fa > /dev/null
time usearch -fastq_filter $f -fastq_maxee 1 -fastaout $f.filter.fa
time vsearch -fastq_filter $f -fastq_maxee 1 -fastaout $f.filter.fa
rm $f.filter.fa

# primer finding

printf ">primer1\n$seq1\n>primer2\n$seq2\n" > _primer_file.fa
fp=_primer_file.fa
printf "$seq1\n$seq2\n" | tr 'YR' 'N' > _primer_list.txt
sed 's/R/[AG]/g' _primer_file.fa > _primer_file_ambig.fa

# run_find <pattern_file> [extra options...]
# Times `st find` with the given extra options, first single-threaded,
# then with 4 threads.
run_find() {
    time s find -v file:$1 $f -a primer={f:name} -a rng={f:range} "${@:2}" > /dev/null
    time s find -v file:$1 $f -a primer={f:name} -a rng={f:range} -t4 "${@:2}" > /dev/null
}

run_find $fp --algo myers
run_find $fp --algo myers -d1
run_find $fp --algo myers -d4
run_find $fp --algo myers -d8
run_find $fp --algo myers -d4 --in-order
run_find $fp --algo myers -d4 --rng ..25
time s find -v file:$fp $f -a d={f:dist} > /dev/null
time s find -v file:$fp $f -a d={f:dist} -t4 > /dev/null
run_find $fp --algo exact
run_find _primer_file_ambig.fa -r --seqtype other

# adapter_removal [extra options...]
# Times AdapterRemoval with the given extra options, first with its default
# thread count, then with 4 threads (same single/multi-threaded pairing as
# run_find above).
adapter_removal() {
    time AdapterRemoval --file1 $f --adapter-list _primer_list.txt --shift 8 \
        --output1 /dev/null --discarded /dev/stdout --settings /dev/null "$@" > /dev/null
    time AdapterRemoval --file1 $f --adapter-list _primer_list.txt --shift 8 --threads 4 \
        --output1 /dev/null --discarded /dev/stdout --settings /dev/null "$@" > /dev/null
}

adapter_removal --mm 1
adapter_removal --mm 4
adapter_removal --mm 8

time cutadapt -a primer1=$seq1$ -a primer2=$seq2$ $f -e 0.23 -y ' primer={name}' > /dev/null
time cutadapt -a primer1=$seq1$ -a primer2=$seq2$ $f -e 0.23 -y ' primer={name}' -j4 > /dev/null

# trim

time s find -f file:$fp $f -a primer={f:name} -a end={f:end} -t5 > $f.find.fq
time s trim -e {a:end}.. $f.find.fq > /dev/null
Bzip2, 61 | #[cfg(feature = "lz4")] 62 | Lz4, 63 | #[cfg(feature = "zstd")] 64 | Zstd, 65 | } 66 | 67 | impl CompressionFormat { 68 | const FORMAT_MAP: &[(&[&str], CompressionFormat)] = &[ 69 | #[cfg(feature = "gz")] 70 | (&["gz", "gzip"], CompressionFormat::Gzip), 71 | #[cfg(feature = "bz2")] 72 | (&["bz2", "bzip2"], CompressionFormat::Bzip2), 73 | #[cfg(feature = "lz4")] 74 | (&["lz4"], CompressionFormat::Lz4), 75 | #[cfg(feature = "zstd")] 76 | (&["zst", "zstd", "zstandard"], CompressionFormat::Zstd), 77 | ]; 78 | 79 | pub fn str_match(s: &str) -> Option { 80 | let s = s.to_ascii_lowercase(); 81 | for (names, format) in Self::FORMAT_MAP { 82 | if names.contains(&s.as_str()) { 83 | return Some(*format); 84 | } 85 | } 86 | None 87 | } 88 | 89 | pub fn recommended_read_bufsize(self) -> usize { 90 | match self { 91 | #[cfg(feature = "zstd")] 92 | CompressionFormat::Zstd => zstd::Decoder::::recommended_output_size(), 93 | _ => DEFAULT_IO_READER_BUFSIZE, 94 | } 95 | } 96 | 97 | pub fn recommended_write_bufsize(self) -> usize { 98 | match self { 99 | #[cfg(feature = "zstd")] 100 | CompressionFormat::Zstd => zstd::Encoder::::recommended_input_size(), 101 | _ => DEFAULT_IO_WRITER_BUFSIZE, 102 | } 103 | } 104 | } 105 | 106 | impl FromStr for CompressionFormat { 107 | type Err = String; 108 | 109 | fn from_str(s: &str) -> Result { 110 | if let Some(format) = CompressionFormat::str_match(s) { 111 | Ok(format) 112 | } else { 113 | let fmt_list = CompressionFormat::FORMAT_MAP 114 | .iter() 115 | .map(|(names, _)| names.join("/")) 116 | .join(", "); 117 | Err(format!( 118 | "Unknown compression format: {s}. Valid formats are: {fmt_list}." 119 | )) 120 | } 121 | } 122 | } 123 | 124 | /// Parses a single or double extension from a path: 125 | /// If the extension is recognized as a compression format, 126 | /// it is returned along with the inner extension. 127 | /// Otherwise, only the outer extension is returned (no compression assumed). 
128 | pub fn parse_compr_ext + ?Sized>( 129 | path: &P, 130 | ) -> (Option, Option<&str>) { 131 | let path = path.as_ref(); 132 | let mut fmt = None; 133 | let mut ext = None; 134 | if let Some(e) = path.extension().and_then(|e| e.to_str()) { 135 | if let Some(f) = CompressionFormat::str_match(e) { 136 | fmt = Some(f); 137 | if let Some(e) = Path::new(path.file_stem().unwrap()).extension() { 138 | ext = e.to_str(); 139 | } 140 | } else { 141 | ext = Some(e); 142 | } 143 | } 144 | (fmt, ext) 145 | } 146 | -------------------------------------------------------------------------------- /src/cmd/replace.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::ToOwned; 2 | use std::str; 3 | 4 | use clap::{value_parser, Parser}; 5 | use memchr::memmem::find_iter; 6 | 7 | use crate::cli::CommonArgs; 8 | use crate::error::CliResult; 9 | use crate::helpers::replace::replace_iter; 10 | use crate::io::{RecordAttr, RecordEditor}; 11 | use crate::Config; 12 | 13 | #[derive(Parser, Clone, Debug)] 14 | #[clap(next_help_heading = "'Replace' command options")] 15 | pub struct ReplaceCommand { 16 | /// Search pattern 17 | pattern: String, 18 | 19 | /// Replacement string, cannot contain variables. 20 | replacement: String, 21 | 22 | /// Replace in IDs instead of sequences 23 | #[arg(short, long)] 24 | id: bool, 25 | 26 | /// Replace in descriptions 27 | #[arg(short, long)] 28 | desc: bool, 29 | 30 | /// Interpret pattern as a regular expression. 31 | /// Unicode characters are supported when searching in IDs/descriptions, 32 | /// but not for sequence searches. 
33 | #[arg(short, long)] 34 | regex: bool, 35 | 36 | /// Number of threads 37 | #[arg(short, long, value_name = "N", default_value_t = 1, value_parser = value_parser!(u32).range(1..))] 38 | threads: u32, 39 | 40 | #[command(flatten)] 41 | pub common: CommonArgs, 42 | } 43 | 44 | pub fn run(mut cfg: Config, args: ReplaceCommand) -> CliResult<()> { 45 | // what should be replaced? 46 | let attr = if args.id { 47 | RecordAttr::Id 48 | } else if args.desc { 49 | RecordAttr::Desc 50 | } else { 51 | RecordAttr::Seq 52 | }; 53 | let pattern = &args.pattern; 54 | let replacement = args.replacement.as_bytes(); 55 | let has_backrefs = replacement.contains(&b'$'); 56 | let regex = args.regex; 57 | let num_threads = args.threads; 58 | 59 | let replacer = get_replacer(pattern, regex, has_backrefs)?; 60 | 61 | let mut format_writer = cfg.get_format_writer()?; 62 | 63 | cfg.with_io_writer(|io_writer, mut cfg| { 64 | cfg.read_parallel( 65 | num_threads - 1, 66 | |record, editor: &mut RecordEditor| { 67 | editor.edit_with_val(attr, &record, false, |text, out| { 68 | replacer.replace(text, replacement, out) 69 | }) 70 | }, 71 | |record, editor, ctx| { 72 | format_writer.write(&editor.record(&record), io_writer, ctx)?; 73 | Ok(true) 74 | }, 75 | ) 76 | })?; 77 | Ok(()) 78 | } 79 | 80 | trait Replacer { 81 | fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec) -> CliResult<()>; 82 | } 83 | 84 | struct BytesReplacer(Vec); 85 | 86 | impl Replacer for BytesReplacer { 87 | fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec) -> CliResult<()> { 88 | let matches = find_iter(text, &self.0).map(|start| (start, start + self.0.len())); 89 | replace_iter(text, replacement, matches, out).unwrap(); 90 | Ok(()) 91 | } 92 | } 93 | 94 | macro_rules! 
regex_replacer_impl { 95 | ($name:ident, $regex:ty, $to_string:expr, $to_bytes:expr) => { 96 | struct $name { 97 | re: $regex, 98 | has_backrefs: bool, 99 | } 100 | 101 | impl $name { 102 | fn new(pattern: &str, has_backrefs: bool) -> CliResult { 103 | Ok(Self { 104 | re: <$regex>::new(pattern)?, 105 | has_backrefs, 106 | }) 107 | } 108 | } 109 | 110 | impl Replacer for $name { 111 | #[allow(clippy::redundant_closure_call)] 112 | fn replace(&self, text: &[u8], replacement: &[u8], out: &mut Vec) -> CliResult<()> { 113 | let search_text = $to_string(text)?; 114 | if !self.has_backrefs { 115 | let matches = self.re.find_iter(search_text).map(|m| (m.start(), m.end())); 116 | replace_iter(text, replacement, matches, out).unwrap(); 117 | } else { 118 | // requires allocations 119 | let repl_text = $to_string(replacement)?; 120 | let replaced = self.re.replace_all(search_text, repl_text); 121 | out.extend_from_slice($to_bytes(replaced.as_ref())); 122 | } 123 | Ok(()) 124 | } 125 | } 126 | }; 127 | } 128 | 129 | cfg_if::cfg_if! 
{ 130 | if #[cfg(feature = "regex-fast")] { 131 | regex_replacer_impl!(BytesRegexReplacer, regex::bytes::Regex, Ok::<_, crate::error::CliError>, |s| s); 132 | } else { 133 | // TODO: no way to operate on byte slices (although it might be added according to regex_lite docs) 134 | regex_replacer_impl!(BytesRegexReplacer, regex_lite::Regex, |t| std::str::from_utf8(t), str::as_bytes); 135 | } 136 | } 137 | 138 | fn get_replacer( 139 | pattern: &str, 140 | regex: bool, 141 | has_backrefs: bool, 142 | ) -> CliResult> { 143 | if regex { 144 | Ok(Box::new(BytesRegexReplacer::new(pattern, has_backrefs)?)) 145 | } else { 146 | Ok(Box::new(BytesReplacer(pattern.as_bytes().to_owned()))) 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "seqtool" 3 | version = "0.4.0-beta.4" 4 | edition = "2021" 5 | authors = ["Markus Schlegel "] 6 | description = "General-purpose tool for reading, modifying and writing biological sequences." 
7 | license = "MIT OR Apache-2.0" 8 | repository = "https://github.com/markschl/seqtool" 9 | homepage = "https://github.com/markschl/seqtool" 10 | readme = "README.md" 11 | build = "build.rs" 12 | 13 | [workspace] 14 | members = ["var_provider", "var_provider/variable_enum_macro"] 15 | 16 | [dependencies] 17 | ahash = "0.8" 18 | xxhash-rust = { version = "0.8", features = ["xxh3"] } 19 | memchr = "2.7" 20 | winnow = { version = "0.7", features = ["simd"] } 21 | vec_map = "0.8" 22 | deepsize = "0.2" 23 | itertools = "0.14" 24 | bytecount = "0.6" 25 | strum = "0.27" 26 | strum_macros = "0.27" 27 | lexical = { version = "7.0", default-features = false, features = ["parse-floats", "parse-integers", "parse", "write-floats"] } 28 | atoi = "2.0" 29 | ordered-float = { version = "5.0", default-features = false, features = ["std", "rkyv", "rkyv_ck"] } 30 | cfg-if = "1.0" 31 | # CLI 32 | clap = { version = "4.5", features = ["derive", "help", "wrap_help", "env"] } 33 | textwrap = { version = "0.16", default-features = false } 34 | color-print = { version = "0.3" } 35 | # I/O 36 | seq_io = "0.3.4" 37 | thread_io = "0.3" 38 | csv = "1.3" 39 | # compression formats (behind feature flags) 40 | bzip2 = { version = "0.6", optional = true } 41 | lz4 = { version = "1.28", optional = true } 42 | zstd = { version = "0.13", default-features = false, features = ["zdict_builder"], optional = true } 43 | # variables / functions 44 | var_provider = { path = "var_provider" } 45 | variable_enum_macro = { path = "var_provider/variable_enum_macro" } 46 | # JS expressions 47 | rquickjs = { version = "0.9", features=["classes", "properties", "rust-alloc", "macro"], optional=true } 48 | phf = { version = "0.13", features = ["macros"], optional = true } 49 | # find, replace, revcomp, view commands 50 | bio = { version = "3.0", default-features = false } 51 | regex-lite = { version = "0.1", optional = true } 52 | regex = { version = "1.11", optional = true } 53 | # view 54 | ratatui = { version = 
"0.29", optional = true, default-features = false, features = ["crossterm"] } 55 | # view, cmp 56 | crossterm ={ version = "0.29", optional = true } 57 | # view 58 | palette = { version = "0.7", default-features = false, features = ["std", "named_from_str"], optional = true } 59 | enterpolation = { version = "0.3", default-features = false, features = ["std", "linear"], optional = true } 60 | # sample 61 | rand = { version = "0.9", optional = true } 62 | rand_xoshiro = { version = "0.7.0", optional = true } 63 | # sort / unique / cmp commands 64 | indexmap = { version = "2.10", optional = true } 65 | # cmp 66 | ringmap = { version = "0.1", optional = true } 67 | # TODO: v0.8 update blocked by https://github.com/reem/rust-ordered-float/issues/163 68 | rkyv = { version = "0.7", optional = true} 69 | byteorder = { version = "1.5", optional = true } 70 | tempfile = { version = "3.20", optional = true } 71 | 72 | [target.'cfg(not(target_os = "windows"))'.dependencies.flate2] 73 | version = "1.1" 74 | default-features = false 75 | features = ["zlib-ng"] 76 | optional = true 77 | 78 | [target.'cfg(target_os = "windows")'.dependencies.flate2] 79 | version = "1.1" 80 | optional = true 81 | 82 | [build-dependencies] 83 | regex-lite = "0.1" 84 | 85 | [dev-dependencies] 86 | assert_cmd = "2.0" 87 | predicates = "3.1" 88 | approx = "0.5" 89 | rand = "0.9" 90 | rand_xoshiro = "0.7" 91 | tempfile = "3.10" 92 | 93 | [features] 94 | default = ["all-commands", "regex-fast", "expr", "gz", "bz2", "lz4", "zstd"] 95 | # JavaScript expressions 96 | expr = ["rquickjs", "phf"] 97 | # Compression formats 98 | gz = ["flate2"] 99 | lz4 = ["dep:lz4"] 100 | zstd = ["dep:zstd"] 101 | bz2 = ["bzip2"] 102 | # Regex searching in find/replace 103 | regex-fast = ["regex"] # adds ~1.4 MiB to binary (Linux) 104 | # Commands 105 | all-commands = [ 106 | "palette", "enterpolation", # view 107 | "ratatui", "crossterm", # view, cmp 108 | "rand", "rand_xoshiro", # sample 109 | "indexmap", "ringmap", "rkyv", 
"byteorder", "tempfile", # sort, unique, cmp 110 | "regex-lite", # find, replace 111 | ] 112 | pass = [] 113 | view = ["palette", "enterpolation", "crossterm"] 114 | count = ["rkyv"] 115 | stat = [] 116 | head = [] 117 | tail = [] 118 | slice = [] 119 | sample = ["rand", "rand_xoshiro"] 120 | sort = ["indexmap", "rkyv", "byteorder", "tempfile"] 121 | unique = ["indexmap", "rkyv", "byteorder", "tempfile"] 122 | filter = [] 123 | split = [] 124 | cmp = ["ringmap", "indexmap"] 125 | interleave = [] 126 | find = ["regex-lite"] 127 | replace = ["regex-lite"] 128 | del = [] 129 | set = [] 130 | trim = [] 131 | mask = [] 132 | upper = [] 133 | lower = [] 134 | revcomp = [] 135 | concat = [] 136 | 137 | [[bin]] 138 | path = "src/main.rs" 139 | name = "st" 140 | 141 | [profile.release] 142 | lto = "thin" 143 | codegen-units = 1 144 | panic = "abort" 145 | strip = true 146 | #debug = true 147 | 148 | # The profile that 'cargo dist' will build with 149 | [profile.dist] 150 | inherits = "release" 151 | 152 | [package.metadata.wix] 153 | upgrade-guid = "41883C8F-F72D-46C8-A526-F415D0511C8F" 154 | path-guid = "12F3E865-70E4-4FA2-BA01-D728DF2B14E1" 155 | license = false 156 | eula = false 157 |
-------------------------------------------------------------------------------- /src/io/input/fa_qual.rs: --------------------------------------------------------------------------------
use std::cmp::min;
use std::fs::File;
use std::io;
use std::path::Path;

use seq_io::{
    fasta::{self, Record as FR},
    policy::BufPolicy,
};

use crate::error::CliResult;
use crate::io::{Record, RecordHeader, SeqLineIter};

use super::SeqReader;

// Reader

/// Reads sequences from a FASTA source and the matching quality scores from a
/// separate QUAL file, which is parsed with the same FASTA reader type.
///
/// `quals` is a buffer holding the scores of the current record; it is cleared
/// and refilled for every record.
pub struct FaQualReader<R: io::Read, P: BufPolicy> {
    fa_rdr: fasta::Reader<R, P>,
    qual_rdr: fasta::Reader<File, P>,
    quals: Vec<u8>,
}

impl<R, P> FaQualReader<R, P>
where
    R: io::Read,
    P: BufPolicy + Clone,
{
    /// Creates a reader over the FASTA source `rdr` and the QUAL file at
    /// `qfile`. Both readers get the buffer capacity `cap` and the buffer
    /// policy `policy`.
    ///
    /// # Errors
    /// Fails with a message naming the path if the QUAL file cannot be opened.
    pub fn new<Q>(rdr: R, cap: usize, policy: P, qfile: Q) -> CliResult<Self>
    where
        Q: AsRef<Path>,
    {
        let qhandle = File::open(&qfile).map_err(|e| {
            format!(
                "Error opening '{}': {}",
                qfile.as_ref().to_string_lossy(),
                e
            )
        })?;

        Ok(FaQualReader {
            fa_rdr: fasta::Reader::with_capacity(rdr, cap).set_policy(policy.clone()),
            qual_rdr: fasta::Reader::with_capacity(qhandle, cap).set_policy(policy),
            quals: vec![],
        })
    }
}

impl<R, P> SeqReader for FaQualReader<R, P>
where
    R: io::Read,
    P: BufPolicy,
{
    /// Reads the next FASTA record together with the next QUAL record and
    /// passes the combined record to `func`. Returns `None` once the FASTA
    /// input is exhausted.
    ///
    /// # Errors
    /// Besides reader errors, fails if the QUAL file has no further record, if
    /// the record IDs differ, if a quality score is not a non-negative
    /// integer, or if the number of scores differs from the sequence length.
    // NOTE(review): the `bool` in the closure/return types was lost in this
    // listing and is reconstructed -- confirm against the `SeqReader` trait.
    fn read_next_conditional(
        &mut self,
        func: &mut dyn FnMut(&dyn Record) -> CliResult<bool>,
    ) -> Option<CliResult<bool>> {
        let quals = &mut self.quals;
        let qual_rdr = &mut self.qual_rdr;

        self.fa_rdr.next().map(|rec| {
            let rec = rec?;

            // quality info
            quals.clear();
            let qrec = qual_rdr.next().ok_or_else(|| {
                format!(
                    "Quality scores in QUAL file missing for record '{}'",
                    String::from_utf8_lossy(rec.id_bytes())
                )
            })??;

            // Compare the raw ID bytes: comparing the UTF-8-validated `id()`
            // results would treat two different non-UTF-8 IDs as equal
            // whenever their decoding errors are equal.
            if qrec.id_bytes() != rec.id_bytes() {
                return fail!(format!(
                    "ID mismatch with QUAL file: '{}' != '{}'",
                    String::from_utf8_lossy(rec.id_bytes()),
                    String::from_utf8_lossy(qrec.id_bytes()),
                ));
            }

            for seq in qrec.seq_lines() {
                parse_quals(seq, quals)?;
            }

            // check sequence length
            // this may have a performance impact
            let seqlen: usize = rec.seq_lines().map(|seq| seq.len()).sum();

            if seqlen != quals.len() {
                return fail!(format!(
                    "The number of quality scores ({}) is not equal to sequence length ({}) in record '{}'",
                    quals.len(), seqlen,
                    String::from_utf8_lossy(rec.id_bytes()),
                ));
            }

            let r = FaQualRecord {
                fa_rec: super::fasta::FastaRecord::new(rec),
                qual: quals,
            };
            func(&r)
        })
    }
}

/// Parses one line of space-separated decimal quality scores and appends them
/// to `out`. Scores above 255 are stored as 255.
///
/// # Errors
/// Returns a message quoting the offending token if it is empty or contains a
/// non-digit byte (this includes the empty tokens produced by consecutive,
/// leading or trailing spaces).
fn parse_quals(line: &[u8], out: &mut Vec<u8>) -> Result<(), String> {
    for qual in line.split(|c| *c == b' ') {
        let q = parse_int(qual).map_err(|_| {
            format!(
                "Invalid quality score found: '{}'",
                String::from_utf8_lossy(qual)
            )
        })?;
        // Clamp first, then narrow: `q as u8` on its own wraps modulo 256
        // (e.g. 300 -> 44), which would make the clamp a no-op.
        out.push(min(q, 255) as u8);
    }
    Ok(())
}

/// Parses a non-empty run of ASCII digits as an unsigned integer.
///
/// Values too large for `usize` saturate at `usize::MAX` instead of
/// overflowing. Returns `Err(())` for empty input or any non-digit byte.
fn parse_int(bytes: &[u8]) -> Result<usize, ()> {
    if bytes.is_empty() {
        return Err(());
    }
    bytes.iter().try_fold(0usize, |acc, &b| {
        if b.is_ascii_digit() {
            Ok(acc.saturating_mul(10).saturating_add((b - b'0') as usize))
        } else {
            Err(())
        }
    })
}

// Wrapper for FASTA record

/// A FASTA record combined with the quality scores parsed from the QUAL file.
/// Everything except `qual()` is delegated to the wrapped FASTA record.
pub struct FaQualRecord<'a> {
    fa_rec: super::fasta::FastaRecord<'a>,
    qual: &'a [u8],
}

impl Record for FaQualRecord<'_> {
    fn id(&self) -> &[u8] {
        self.fa_rec.id()
    }

    fn desc(&self) -> Option<&[u8]> {
        self.fa_rec.desc()
    }

    fn id_desc(&self) -> (&[u8], Option<&[u8]>) {
        self.fa_rec.id_desc()
    }

    fn current_header(&'_ self) -> RecordHeader<'_> {
        self.fa_rec.current_header()
    }

    fn raw_seq(&self) -> &[u8] {
        self.fa_rec.raw_seq()
    }

    /// Always `Some`: the scores read from the QUAL file for this record.
    fn qual(&self) -> Option<&[u8]> {
        Some(self.qual)
    }

    // NOTE(review): the inner types of the next two signatures were lost in
    // this listing; `usize` is reconstructed -- confirm against the `Record`
    // trait.
    fn header_delim_pos(&self) -> Option<Option<usize>> {
        self.fa_rec.header_delim_pos()
    }

    fn set_header_delim_pos(&self, delim: Option<usize>) {
        self.fa_rec.set_header_delim_pos(delim)
    }

    fn has_seq_lines(&self) -> bool {
        self.fa_rec.has_seq_lines()
    }

    fn seq_segments(&'_ self) -> SeqLineIter<'_> {
        self.fa_rec.seq_segments()
    }
}
--------------------------------------------------------------------------------