├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── benchmark.rs ├── src ├── lib.rs └── record.rs └── tests └── test.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "kseq" 3 | version = "0.5.3" 4 | authors = ["Moold "] 5 | edition = "2018" 6 | license = "MIT" 7 | description = "a simple fasta/fastq format parser library" 8 | homepage = "https://github.com/moold/kseq" 9 | repository = "https://github.com/moold/kseq" 10 | readme = "README.md" 11 | keywords = ["fastq", "fasta"] 12 | 13 | [dependencies] 14 | atty = "0.2" 15 | flate2 = { version = ">=1.0.17", features = ["zlib-ng-compat"], default-features = false } 16 | memchr = "2.5" 17 | 18 | [dev-dependencies] 19 | criterion = "0.4" 20 | needletail = "0.4" 21 | 22 | [[bench]] 23 | name = "benchmark" 24 | 
harness = false 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Hu Jiang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Crates.io](https://img.shields.io/crates/d/kseq?logo=rust)](https://github.com/moold/kseq/archive/refs/heads/main.zip) 2 | [![Crates.io](https://img.shields.io/crates/v/kseq)](https://crates.io/crates/kseq) 3 | [![docs.rs](https://img.shields.io/docsrs/kseq)](https://docs.rs/kseq/) 4 | # kseq 5 | `kseq` is a simple fasta/fastq (**fastx**) format parser library for [Rust](https://www.rust-lang.org/). Its main function is to iterate over the records in fastx files (similar to [kseq](https://attractivechaos.github.io/klib/#Kseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser) in `C`). It uses a shared buffer to read and store records, so it is very fast. It supports a **plain** or **gz** fastx file or [`io::stdin`](https://doc.rust-lang.org/std/io/fn.stdin.html), as well as a **fofn** (file-of-file-names) file, which contains multiple plain or gz fastx files (one per line). 6 | 7 | Using `kseq` is very simple. Users only need to call `parse_path` to parse a path or `parse_reader` to parse a reader, and then use the `iter_record` method to get each record. 8 | 9 | - `parse_path` This function takes a path that implements [`AsRef<Path>`](https://doc.rust-lang.org/std/path/struct.Path.html) as input; the path can be a `fastx` file, `-` for [`io::stdin`](https://doc.rust-lang.org/std/io/fn.stdin.html), or a `fofn` file. It returns a `Result` type: 10 | - `Ok(T)`: A struct `T` with the `iter_record` method. 11 | - `Err(E)`: An error `E` including missing input, can't open or read, wrong fastx format or invalid path or file errors. 12 | 13 | - `parse_reader` This function takes a reader that implements [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) as input. It returns a `Result` type: 14 | - `Ok(T)`: A struct `T` with the `iter_record` method. 
15 | - `Err(E)`: An error `E` including missing input, can't open or read, wrong fastx format or invalid path or file errors. 16 | 17 | - `iter_record` This function can be called in a loop; it returns a `Result<Option<Record>>` type: 18 | - `Ok(Some(Record))`: A struct `Record` with methods: 19 | - `head -> &str`: get sequence id/identifier 20 | - `seq -> &str`: get sequence 21 | - `des -> &str`: get sequence description/comment 22 | - `sep -> &str`: get separator 23 | - `qual -> &str`: get quality scores 24 | - `len -> usize`: get sequence length 25 | 26 | ***Note:*** calling `des`, `sep` or `qual` returns `""` if the `Record` doesn't have these attributes. 27 | - `Ok(None)`: Stream has reached `EOF`. 28 | - `Err(ParseError)`: An error [`ParseError`](https://docs.rs/kseq/0.3.0/kseq/record/enum.ParseError.html) including `Io`, `TruncateFile`, `InvalidFastx`, `InvalidFasta` or `InvalidFastq` errors. 29 | 30 | ## Example 31 | ```no_run 32 | use std::env::args; 33 | use std::fs::File; 34 | use kseq::parse_path; 35 | 36 | fn main(){ 37 | let path: String = args().nth(1).unwrap(); 38 | let mut records = parse_path(path).unwrap(); 39 | // let mut records = parse_reader(File::open(path).unwrap()).unwrap(); 40 | while let Some(record) = records.iter_record().unwrap() { 41 | println!("head:{} des:{} seq:{} qual:{} len:{}", 42 | record.head(), record.des(), record.seq(), 43 | record.qual(), record.len()); 44 | } 45 | } 46 | ``` 47 | 48 | ## Installation 49 | ```text 50 | cargo add kseq 51 | ``` 52 | 53 | ## Benchmarking 54 | ```text 55 | cargo bench 56 | ``` -------------------------------------------------------------------------------- /benches/benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use kseq; 3 | use needletail::parser::{FastaReader, FastqReader, FastxReader}; 4 | use std::{io::Cursor, iter}; 5 | 6 | fn simulate_fastq(total: usize) -> Vec { 7 | let mut data: Vec = vec![]; 8 | let 
mut n = 0; 9 | let mut sum = 0; 10 | let mut seq_len = 100; 11 | loop { 12 | n += 1; 13 | data.push(b'@'); 14 | data.extend(n.to_string().as_bytes()); 15 | data.push(b'\n'); 16 | if sum + seq_len > total { 17 | seq_len = total - sum; 18 | } 19 | data.extend(iter::repeat(b'A').take(seq_len)); 20 | data.extend([b'\n', b'+', b'\n']); 21 | data.extend(iter::repeat(b'!').take(seq_len)); 22 | data.push(b'\n'); 23 | sum += seq_len; 24 | seq_len += 2; 25 | if sum >= total { 26 | break; 27 | } 28 | } 29 | // println!("{}", str::from_utf8(&data).unwrap()); 30 | data 31 | } 32 | 33 | fn simulate_fasta(total: usize) -> Vec { 34 | let mut data: Vec = vec![]; 35 | let mut n = 0; 36 | let mut sum = 0; 37 | let mut seq_len = 100; 38 | loop { 39 | n += 1; 40 | data.push(b'>'); 41 | data.extend(n.to_string().as_bytes()); 42 | data.push(b'\n'); 43 | if sum + seq_len > total { 44 | seq_len = total - sum; 45 | } 46 | for _ in 0..seq_len / 100 { 47 | data.extend(iter::repeat(b'A').take(100)); 48 | data.push(b'\n'); 49 | } 50 | data.extend(iter::repeat(b'A').take(seq_len % 100)); 51 | data.push(b'\n'); 52 | sum += seq_len; 53 | seq_len += 2; 54 | if sum >= total { 55 | break; 56 | } 57 | } 58 | // println!("{}", str::from_utf8(&data).unwrap()); 59 | data 60 | } 61 | 62 | fn bench_fasta_file(c: &mut Criterion) { 63 | let n_total = 1_000_000_000; 64 | let data = simulate_fasta(n_total); 65 | 66 | let mut group = c.benchmark_group("FASTA parsing(1GB)"); 67 | group.sample_size(30); 68 | 69 | group.bench_function("kseq", |bench| { 70 | bench.iter(|| { 71 | let mut n_bases = 0; 72 | let mut records = kseq::parse_reader(Cursor::new(&data)).unwrap(); 73 | while let Ok(Some(record)) = records.iter_record() { 74 | n_bases += record.seq().len() as u64; 75 | } 76 | assert_eq!(n_bases, n_total as u64); 77 | }); 78 | }); 79 | 80 | group.bench_function("needletail", |bench| { 81 | bench.iter(|| { 82 | let mut n_bases = 0; 83 | let mut records = FastaReader::new(Cursor::new(&data)); 84 | while let 
Some(Ok(record)) = records.next() { 85 | n_bases += record.seq().len() as u64; 86 | } 87 | assert_eq!(n_bases, n_total as u64); 88 | }); 89 | }); 90 | 91 | group.finish(); 92 | } 93 | 94 | fn bench_fastq_file(c: &mut Criterion) { 95 | let n_total = 1_000_000_000; 96 | let data = simulate_fastq(n_total); 97 | 98 | let mut group = c.benchmark_group("FASTQ parsing(1GB)"); 99 | group.sample_size(30); 100 | 101 | group.bench_function("kseq", |bench| { 102 | bench.iter(|| { 103 | let mut n_bases = 0; 104 | let mut records = kseq::parse_reader(Cursor::new(&data)).unwrap(); 105 | while let Ok(Some(record)) = records.iter_record() { 106 | n_bases += record.seq().len() as u64; 107 | } 108 | assert_eq!(n_bases, n_total as u64); 109 | }); 110 | }); 111 | 112 | group.bench_function("needletail", |bench| { 113 | bench.iter(|| { 114 | let mut n_bases = 0; 115 | let mut records = FastqReader::new(Cursor::new(&data)); 116 | while let Some(Ok(record)) = records.next() { 117 | n_bases += record.seq().len() as u64; 118 | } 119 | assert_eq!(n_bases, n_total as u64); 120 | }); 121 | }); 122 | 123 | group.finish(); 124 | } 125 | 126 | criterion_group!(io, bench_fastq_file, bench_fasta_file); 127 | criterion_main!(io); 128 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc = include_str!("../README.md")] 2 | // Note: kseq is inspired by fastq-rs and kseq in C 3 | 4 | use flate2::read::MultiGzDecoder; 5 | use std::{ 6 | fs::File, 7 | io::{stdin, BufRead, BufReader, Cursor, Error, ErrorKind, Read, Result}, 8 | path::Path, 9 | }; 10 | 11 | pub mod record; 12 | use record::{Fastx, Reader, Readers, Result as ParseResult}; 13 | 14 | /// a reader for a single path or readers for multiple paths 15 | pub enum Paths<'a> { 16 | Reader(Reader<'a>), 17 | Readers(Readers<'a>), 18 | } 19 | 20 | impl<'a> Paths<'a> { 21 | // parse a reader to a Reader or Readers 22 | fn 
new(mut reader: Box, path: &Path) -> Result { 23 | let mut format_bytes = [0u8; 4]; 24 | reader.read_exact(&mut format_bytes)?; 25 | reader = Box::new(Cursor::new(format_bytes.to_vec()).chain(reader)); 26 | if &format_bytes[..2] == b"\x1f\x8b" { 27 | // for gz format 28 | reader = Box::new(BufReader::with_capacity(65536, MultiGzDecoder::new(reader))); 29 | format_bytes.iter_mut().for_each(|m| *m = 0); 30 | reader.read_exact(&mut format_bytes)?; 31 | reader = Box::new(Cursor::new(format_bytes.to_vec()).chain(reader)); 32 | } 33 | 34 | match format_bytes[0] { 35 | b'@' | b'>' => Ok(Paths::Reader(Reader::new(reader))), 36 | _ => { 37 | // for a fofn file 38 | let mut paths = Readers::new(); 39 | let parent = path.parent().unwrap_or_else(|| Path::new("")); 40 | 41 | for _line in reader.lines() { 42 | let _line = _line?; 43 | let line = _line.trim(); 44 | if line.starts_with('#') || line.is_empty() { 45 | continue; 46 | } 47 | let path = parent.join(line); // convert to an absolute path 48 | if path.exists() { 49 | match parse_path(path)? 
{ 50 | Paths::Reader(reader) => paths.readers.push(reader), 51 | Paths::Readers(readers) => paths.readers.extend(readers.readers), 52 | } 53 | } else { 54 | return Err(Error::new( 55 | ErrorKind::InvalidData, 56 | format!("{:?} is not a valid fastq/fasta/fofn file", path), 57 | )); 58 | } 59 | } 60 | Ok(Paths::Readers(paths)) 61 | } 62 | } 63 | } 64 | 65 | /// iterate a fatsx record for a Reader or Readers 66 | pub fn iter_record(&mut self) -> ParseResult> { 67 | match self { 68 | Paths::Reader(t) => t.iter_record(), 69 | Paths::Readers(t) => t.iter_record(), 70 | } 71 | } 72 | } 73 | 74 | /// parse path to a Reader or Readers 75 | pub fn parse_path<'a, P: AsRef + 'a>(path: P) -> Result> { 76 | let path = path.as_ref(); 77 | let reader: Box = if path == Path::new("-") { 78 | if atty::is(atty::Stream::Stdin) { 79 | return Err(Error::new(ErrorKind::InvalidInput, "Missing input")); 80 | } 81 | Box::new(BufReader::with_capacity(65536, stdin())) 82 | } else { 83 | Box::new(BufReader::with_capacity(65536, File::open(path)?)) 84 | }; 85 | Paths::new(reader, path) 86 | } 87 | 88 | /// parse reader to a Reader or Readers 89 | pub fn parse_reader<'a, R: Read + 'a>(reader: R) -> Result> { 90 | Paths::new( 91 | Box::new(BufReader::with_capacity(65536, reader)), 92 | Path::new(""), 93 | ) 94 | } 95 | -------------------------------------------------------------------------------- /src/record.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | error, fmt, 3 | io::{self, ErrorKind}, 4 | str, 5 | }; 6 | 7 | pub type Result = std::result::Result; 8 | 9 | /// The type of error that returned during parsing fastx files 10 | #[derive(Debug)] 11 | pub enum ParseError { 12 | /// IO error 13 | Io(io::Error), 14 | /// A truncated record was found 15 | TruncateFile(String), 16 | /// Not a valid fastx record, the record doesn't start with `>` and '@' 17 | InvalidFastx(String), 18 | /// Not a valid fasta record, the record starts with `>` 
but the sequence length is 0 19 | InvalidFasta(String), 20 | /// Not a valid fastq record, the record start with `@` but the sequence and quality lengths are not equal or 0 21 | InvalidFastq(String), 22 | } 23 | 24 | impl fmt::Display for ParseError { 25 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 26 | match self { 27 | ParseError::Io(err) => write!(f, "IO error: {}", err), 28 | ParseError::TruncateFile(record) => { 29 | write!(f, "Truncate file, problematic record: {}", record) 30 | } 31 | ParseError::InvalidFastx(record) => { 32 | write!(f, "Not a valid fastx record: {}", record) 33 | } 34 | ParseError::InvalidFasta(record) => { 35 | write!(f, "Not a valid fasta record: {}", record) 36 | } 37 | ParseError::InvalidFastq(record) => { 38 | write!(f, "Not a valid fastq record: {}", record) 39 | } 40 | } 41 | } 42 | } 43 | 44 | impl From for ParseError { 45 | fn from(err: io::Error) -> ParseError { 46 | ParseError::Io(err) 47 | } 48 | } 49 | 50 | impl error::Error for ParseError {} 51 | 52 | /// a structure representing the sequence in a fastx file 53 | pub struct Fastx<'a> { 54 | _head: usize, 55 | _des: usize, 56 | _seq: usize, 57 | _sep: usize, 58 | _qual: usize, 59 | _data: &'a Vec, 60 | } 61 | 62 | impl Fastx<'_> { 63 | /// get sequence id/identifier 64 | #[inline] 65 | pub fn head(&self) -> &str { 66 | unsafe { str::from_utf8_unchecked(&self._data[1..self._head]) } 67 | } 68 | 69 | /// get sequence 70 | #[inline] 71 | pub fn seq(&self) -> &str { 72 | unsafe { str::from_utf8_unchecked(&self._data[self._des..self._seq]) } 73 | } 74 | 75 | /// get sequence description/comment 76 | #[inline] 77 | pub fn des(&self) -> &str { 78 | if self._head < self._des { 79 | unsafe { str::from_utf8_unchecked(&self._data[self._head..self._des]) } 80 | } else { 81 | "" 82 | } 83 | } 84 | 85 | /// get separator 86 | #[inline] 87 | pub fn sep(&self) -> &str { 88 | if self._seq < self._sep { 89 | unsafe { str::from_utf8_unchecked(&self._data[self._seq..self._sep]) } 90 | 
} else { 91 | "" 92 | } 93 | } 94 | 95 | /// get quality scores 96 | #[inline] 97 | pub fn qual(&self) -> &str { 98 | if self._sep < self._qual { 99 | unsafe { str::from_utf8_unchecked(&self._data[self._sep..self._qual]) } 100 | } else { 101 | "" 102 | } 103 | } 104 | 105 | /// get sequence length 106 | #[inline] 107 | pub fn len(&self) -> usize { 108 | self._seq - self._des 109 | } 110 | 111 | /// check whether a fastx record is empty 112 | pub fn is_empty(&self) -> bool { 113 | self.len() == 0 114 | } 115 | 116 | /// check whether a fastx record is a fasta record 117 | pub fn is_fasta(&self) -> bool { 118 | (!self._data.is_empty()) && self._data[0] == b'>' 119 | } 120 | 121 | /// check whether a fastx record is a fastq record 122 | pub fn is_fastq(&self) -> bool { 123 | (!self._data.is_empty()) && self._data[0] == b'@' 124 | } 125 | 126 | /// check a fastq record is valid 127 | fn validate_fastq(&self) -> bool { 128 | self.is_fastq() && !self.is_empty() && self._seq - self._des == self._qual - self._sep && self._head > 1 129 | } 130 | 131 | /// check a fasta record is valid 132 | fn validate_fasta(&self) -> bool { 133 | self.is_fasta() && !self.is_empty() && self._head > 1 134 | } 135 | } 136 | 137 | /// a reader with shared buffer 138 | pub struct Reader<'a> { 139 | reader: Box, 140 | data: Vec, 141 | } 142 | 143 | impl<'a> Reader<'a> { 144 | // Create a new Reader 145 | pub(crate) fn new(r: Box) -> Self { 146 | Reader { 147 | reader: r, 148 | data: Vec::with_capacity(1024), 149 | } 150 | } 151 | 152 | // Check if this reader has any data left to be read. 
153 | fn has_data_left(&mut self) -> Result { 154 | loop{ 155 | let available = self.reader.fill_buf().map_err(ParseError::Io)?; 156 | if available.iter().any(|&x| !char::is_whitespace(x as char)){ 157 | return Ok(true); 158 | }else if available.is_empty() { 159 | return Ok(false); 160 | } 161 | let len = available.len(); 162 | self.reader.consume(len); 163 | } 164 | } 165 | 166 | // Return the next byte of the internal buffer 167 | #[allow(dead_code)] 168 | fn next_byte(&mut self) -> Result> { 169 | loop { 170 | match self.reader.fill_buf() { 171 | Ok(n) => return Ok(n.first().copied()), 172 | Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, 173 | Err(e) => return Err(ParseError::Io(e)), 174 | }; 175 | } 176 | } 177 | 178 | // Read all non-newline bytes into data until the newline byte or EOF is reached, 179 | // the newline byte (if found) will not be appended to data. 180 | fn read_line(&mut self, skip_blank_line: bool) -> Result { 181 | let delim = b'\n'; 182 | loop { 183 | let mut n = self.reader.read_until(delim, &mut self.data)?; 184 | // reached EOF 185 | if n == 0 { 186 | return Ok(n); 187 | } 188 | 189 | if self.data.last() == Some(&delim) { 190 | self.data.pop(); 191 | n -= 1; 192 | } 193 | if n != 0 || !skip_blank_line { 194 | return Ok(n); 195 | } 196 | } 197 | } 198 | 199 | // Read all non-newline bytes into data until the delimiter byte or EOF is reached, 200 | // the delimiter (if found) will not be appended to data. 
201 | fn read_until(&mut self, delim: u8) -> Result { 202 | let mut read = 0; 203 | loop { 204 | let (done, used) = { 205 | let available = match self.reader.fill_buf() { 206 | Ok(n) => n, 207 | Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, 208 | Err(e) => return Err(ParseError::Io(e)), 209 | }; 210 | let mut s = 0; 211 | let mut mch = memchr::memchr2_iter(delim, b'\n', available); 212 | loop { 213 | match mch.next() { 214 | Some(i) => { 215 | self.data.extend_from_slice(&available[s..i]); 216 | read += i - s; 217 | s = i + 1; 218 | if available[i] == delim { 219 | break (true, i); 220 | } 221 | } 222 | None => { 223 | self.data.extend_from_slice(&available[s..]); 224 | read += available.len() - s; 225 | break (false, available.len()); 226 | } 227 | } 228 | } 229 | }; 230 | self.reader.consume(used); 231 | if done || used == 0 { 232 | return Ok(read); 233 | } 234 | } 235 | } 236 | 237 | // Read the exact number of non-newline bytes into data. 238 | fn read_exact(&mut self, len: usize) -> Result { 239 | let mut read = 0; 240 | loop { 241 | let (done, used) = { 242 | let available = match self.reader.fill_buf() { 243 | Ok(n) => n, 244 | Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, 245 | Err(e) => return Err(ParseError::Io(e)), 246 | }; 247 | let mut s = 0; 248 | let mut mch = memchr::memchr_iter(b'\n', available); 249 | loop { 250 | match mch.next() { 251 | Some(i) => { 252 | if read + i - s >= len { 253 | let e = len - read + s; 254 | self.data.extend_from_slice(&available[s..e]); 255 | read = len; 256 | break (true, e); 257 | } else { 258 | self.data.extend_from_slice(&available[s..i]); 259 | read += i - s; 260 | } 261 | s = i + 1; 262 | } 263 | None => { 264 | if available.len() - s + read >= len { 265 | let e = len - read + s; 266 | self.data.extend_from_slice(&available[s..e]); 267 | read = len; 268 | break (true, e); 269 | } else { 270 | self.data.extend_from_slice(&available[s..]); 271 | read += available.len() - s; 272 | break 
(false, available.len()); 273 | } 274 | } 275 | } 276 | } 277 | }; 278 | self.reader.consume(used); 279 | if done || used == 0 { 280 | return Ok(read); 281 | } 282 | } 283 | } 284 | 285 | /// iterate over a record from this Reader 286 | pub fn iter_record(&mut self) -> Result> { 287 | // clean the last record 288 | self.data.clear(); 289 | // read sequence head 290 | let des = self.read_line(true)?; 291 | if des == 0 { 292 | // reach the EOF 293 | return Ok(None); 294 | } else if self.data[0] != b'>' && self.data[0] != b'@' { 295 | // safely unwrap 296 | return Err(ParseError::InvalidFastx( 297 | String::from_utf8(self.data.to_owned()).unwrap(), 298 | )); 299 | } 300 | 301 | let head = self 302 | .data 303 | .iter() 304 | .position(|&x| char::is_whitespace(x as char)) 305 | .map_or_else(|| des, |x| x); 306 | let mut seq = des; 307 | let mut sep = seq; 308 | let mut qual = sep; 309 | 310 | let is_fasta = self.data[0] == b'>'; 311 | if is_fasta { 312 | seq += self.read_until(b'>')?; 313 | } else { 314 | seq += self.read_until(b'+')?; 315 | sep = seq + self.read_line(true)?; 316 | qual = sep + self.read_exact(seq - des)?; 317 | } 318 | 319 | if !self.has_data_left()? 
&& (head == 1 || seq == des || (!is_fasta && (sep == seq || qual == sep))){ 320 | // safely unwrap 321 | return Err(ParseError::TruncateFile( 322 | String::from_utf8(self.data.to_owned()).unwrap(), 323 | )); 324 | } 325 | // println!("head:{head} des {des} seq {seq} sep {sep} qual {qual}"); 326 | let fastx = Fastx { 327 | _head: head, 328 | _des: des, 329 | _seq: seq, 330 | _sep: sep, 331 | _qual: qual, 332 | _data: &self.data, 333 | }; 334 | 335 | if is_fasta && !fastx.validate_fasta() { 336 | return Err(ParseError::InvalidFasta(fastx.head().to_string())); 337 | } else if !(is_fasta || fastx.validate_fastq()) { 338 | return Err(ParseError::InvalidFastq(fastx.head().to_string())); 339 | } 340 | Ok(Some(fastx)) 341 | } 342 | } 343 | 344 | /// multiple readers for a fofn file 345 | pub struct Readers<'a> { 346 | index: usize, 347 | pub(crate) readers: Vec>, 348 | } 349 | 350 | impl<'a> Default for Readers<'a> { 351 | fn default() -> Self { 352 | Self::new() 353 | } 354 | } 355 | 356 | impl<'a> Readers<'a> { 357 | /// create a new Readers 358 | pub(crate) fn new() -> Self { 359 | Readers { 360 | index: 0, 361 | readers: Vec::new(), 362 | } 363 | } 364 | 365 | /// iterate over a record from this Readers 366 | pub(crate) fn iter_record(&mut self) -> Result> { 367 | for idx in self.index..self.readers.len() { 368 | if self.readers[idx].has_data_left()? { 369 | return self.readers[idx].iter_record(); 370 | } 371 | self.index += 1; 372 | } 373 | Ok(None) 374 | } 375 | } 376 | -------------------------------------------------------------------------------- /tests/test.rs: -------------------------------------------------------------------------------- 1 | use kseq; 2 | use std::io::Cursor; 3 | use std::result::Result; 4 | 5 | // https://stackoverflow.com/questions/53124930/how-do-you-test-for-a-specific-rust-error 6 | macro_rules! 
assert_err { 7 | ($expression:expr, $($pattern:tt)+) => { 8 | match $expression { 9 | $($pattern)+ => (), 10 | ref e => panic!("expected `{}` but got `{:?}`", stringify!($($pattern)+), e), 11 | } 12 | } 13 | } 14 | 15 | fn count_base(input: Vec) -> Result { 16 | let mut len_bases = 0; 17 | let mut seq_bases = 0; 18 | let mut records = kseq::parse_reader(Cursor::new(input))?; 19 | while let Some(record) = records.iter_record()? { 20 | len_bases += record.len(); 21 | seq_bases += record.seq().len(); 22 | } 23 | assert_eq!(len_bases, seq_bases); 24 | Ok(len_bases) 25 | } 26 | 27 | static BASE_SEQ: &str = "ATGCATGCATGC"; 28 | static BASE_QUAL: &str = "@@@@@@@@@@@@"; 29 | 30 | #[test] 31 | fn test_normal_one_line_fasta() { 32 | let data: Vec = 33 | format!(">1 record1\n{seq}\n>2 record2\n{seq}", seq = BASE_SEQ).into_bytes(); 34 | assert_eq!(BASE_SEQ.len() * 2, count_base(data).unwrap()); 35 | } 36 | 37 | #[test] 38 | fn test_normal_one_line_fastq() { 39 | let data: Vec = format!( 40 | "@1 record1\n{seq}\n+\n{qual}\n@2 record2\n{seq}\n+\n{qual}", 41 | seq = BASE_SEQ, 42 | qual = BASE_QUAL 43 | ) 44 | .into_bytes(); 45 | assert_eq!(BASE_SEQ.len() * 2, count_base(data).unwrap()); 46 | } 47 | 48 | #[test] 49 | fn test_normal_multi_line_fasta() { 50 | let data: Vec = format!( 51 | ">1 record1\n{seq}\n{seq}\n>2 record2\n{seq}\n{seq}", 52 | seq = BASE_SEQ 53 | ) 54 | .into_bytes(); 55 | assert_eq!(BASE_SEQ.len() * 4, count_base(data).unwrap()); 56 | } 57 | 58 | #[test] 59 | fn test_normal_multi_line_fastq() { 60 | let data: Vec = format!( 61 | "@1 record1\n{seq}\n{seq}\n+\n{qual}\n{qual}\n@2 record2\n{seq}\n{seq}\n+\n{qual}{qual}\n", 62 | seq = BASE_SEQ, 63 | qual = BASE_QUAL 64 | ) 65 | .into_bytes(); 66 | assert_eq!(BASE_SEQ.len() * 4, count_base(data).unwrap()); 67 | } 68 | 69 | #[test] 70 | fn test_truncate_fasta_miss_head() { 71 | let data: Vec = format!(">1 record1\n{seq}\n>", seq = BASE_SEQ).into_bytes(); 72 | assert_err!( 73 | count_base(data), 74 | 
Err(kseq::record::ParseError::TruncateFile(_)) 75 | ); 76 | } 77 | 78 | #[test] 79 | fn test_truncate_fasta_miss_seq() { 80 | let data: Vec = format!(">1 record1\n{seq}{seq}\n>2 record2", seq = BASE_SEQ).into_bytes(); 81 | assert_err!( 82 | count_base(data), 83 | Err(kseq::record::ParseError::TruncateFile(_)) 84 | ); 85 | } 86 | 87 | #[test] 88 | fn test_truncate_fastq_miss_head() { 89 | let data: Vec = format!( 90 | "@1 record1\n{seq}{seq}\n+\n{qual}{qual}\n@\n", 91 | seq = BASE_SEQ, 92 | qual = BASE_QUAL 93 | ) 94 | .into_bytes(); 95 | assert_err!( 96 | count_base(data), 97 | Err(kseq::record::ParseError::TruncateFile(_)) 98 | ); 99 | } 100 | 101 | #[test] 102 | fn test_truncate_fastq_miss_seq() { 103 | let data: Vec = format!("@1 record1").into_bytes(); 104 | assert_err!( 105 | count_base(data), 106 | Err(kseq::record::ParseError::TruncateFile(_)) 107 | ); 108 | } 109 | 110 | #[test] 111 | fn test_truncate_fastq_miss_sep() { 112 | let data: Vec = format!("@1 record1\n{seq}{seq}", seq = BASE_SEQ).into_bytes(); 113 | assert_err!( 114 | count_base(data), 115 | Err(kseq::record::ParseError::TruncateFile(_)) 116 | ); 117 | } 118 | 119 | #[test] 120 | fn test_truncate_fastq_miss_qual() { 121 | let data: Vec = format!("@1 record1\n{seq}{seq}\n+", seq = BASE_SEQ).into_bytes(); 122 | assert_err!( 123 | count_base(data), 124 | Err(kseq::record::ParseError::TruncateFile(_)) 125 | ); 126 | } 127 | 128 | #[test] 129 | fn test_invalid_fasta_miss_head() { 130 | let data: Vec = format!(">\n{seq}\n>2 record2\n{seq}", seq = BASE_SEQ).into_bytes(); 131 | assert_err!( 132 | count_base(data), 133 | Err(kseq::record::ParseError::InvalidFasta(_)) 134 | ); 135 | } 136 | 137 | #[test] 138 | fn test_invalid_fasta_miss_seq() { 139 | let data: Vec = format!(">1 record1\n\n>2 record2\n{seq}", seq = BASE_SEQ).into_bytes(); 140 | assert_err!( 141 | count_base(data), 142 | Err(kseq::record::ParseError::InvalidFasta(_)) 143 | ); 144 | } 145 | 146 | #[test] 147 | fn 
test_invalid_fasta_with_seq_len_is_0() { 148 | let data: Vec = format!(">1 record1\n{seq}\n>2 record2\n{seq}", seq = "").into_bytes(); 149 | assert_err!( 150 | count_base(data), 151 | Err(kseq::record::ParseError::InvalidFasta(_)) 152 | ); 153 | } 154 | 155 | #[test] 156 | fn test_invalid_fastq_miss_head() { 157 | let data: Vec = format!( 158 | "@\n{seq}{seq}\n+\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n", 159 | seq = BASE_SEQ, 160 | qual = BASE_QUAL 161 | ) 162 | .into_bytes(); 163 | assert_err!( 164 | count_base(data), 165 | Err(kseq::record::ParseError::InvalidFastq(_)) 166 | ); 167 | } 168 | 169 | #[test] 170 | fn test_invalid_fastq_miss_seq() { 171 | let data: Vec = format!( 172 | "@1 record1\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n", 173 | seq = BASE_SEQ, 174 | qual = BASE_QUAL 175 | ) 176 | .into_bytes(); 177 | assert_err!( 178 | count_base(data), 179 | Err(kseq::record::ParseError::InvalidFastq(_)) 180 | ); 181 | } 182 | 183 | #[test] 184 | fn test_invalid_fastq_miss_sep() { 185 | let data: Vec = format!( 186 | "@1 record1\n{seq}{seq}\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n", 187 | seq = BASE_SEQ, 188 | qual = BASE_QUAL 189 | ) 190 | .into_bytes(); 191 | assert_err!( 192 | count_base(data), 193 | Err(kseq::record::ParseError::InvalidFastq(_)) 194 | ); 195 | } 196 | 197 | #[test] 198 | fn test_invalid_fastq_miss_qual() { 199 | let data: Vec = format!( 200 | "@1 record1\n{seq}{seq}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n", 201 | seq = BASE_SEQ, 202 | qual = BASE_QUAL 203 | ) 204 | .into_bytes(); 205 | assert_err!( 206 | count_base(data), 207 | Err(kseq::record::ParseError::InvalidFastq(_)) 208 | ); 209 | } 210 | 211 | 212 | #[test] 213 | fn test_invalid_fastq_seq_has_diff_len_with_qual() { 214 | let data: Vec = format!( 215 | "@1 record1\n{seq}{seq}\n+\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n", 216 | seq = BASE_SEQ, 217 | qual = &BASE_QUAL[0..BASE_SEQ.len() - 1] 218 | ) 219 | .into_bytes(); 220 
| assert_err!( 221 | count_base(data), 222 | Err(kseq::record::ParseError::InvalidFastx(_)) 223 | ); 224 | } 225 | 226 | #[test] 227 | fn test_invalid_fastq_with_seq_len_is_0() { 228 | let data: Vec = format!( 229 | "@1 record1\n{seq}\n+\n{qual}\n@2 record2\n{seq}\n+\n{qual}\n", 230 | seq = "", 231 | qual = "" 232 | ) 233 | .into_bytes(); 234 | assert_err!( 235 | count_base(data), 236 | Err(kseq::record::ParseError::InvalidFastq(_)) 237 | ); 238 | } 239 | 240 | // #[test] 241 | // fn test_large_fasta() { 242 | // let count = 1_000_000; 243 | // let mut total_len = 0; 244 | // let mut data = Vec::with_capacity(count * 20); 245 | // (0..count).for_each(|x| { 246 | // writeln!(&mut data, ">{x} record{x}\n").expect("Failed to write to Vec"); 247 | // (0..count & 0xff).for_each(|_| { 248 | // total_len += BASE_SEQ.len(); 249 | // writeln!(&mut data, "{BASE_SEQ}").expect("Failed to write to Vec"); 250 | // }); 251 | // writeln!(&mut data, "\n").expect("Failed to write to Vec"); 252 | // }); 253 | 254 | // assert_eq!(count_base(data).unwrap(), total_len); 255 | // } 256 | 257 | // #[test] 258 | // fn test_large_fastq() { 259 | // let count = 1_000_000; 260 | // let mut total_len = 0; 261 | // let mut data = Vec::with_capacity(count * 20); 262 | // (0..count).for_each(|x| { 263 | // writeln!(&mut data, "@{x} record{x}\n").expect("Failed to write to Vec"); 264 | // (0..count & 0xff).for_each(|_| { 265 | // total_len += BASE_SEQ.len(); 266 | // writeln!(&mut data, "{BASE_SEQ}").expect("Failed to write to Vec"); 267 | // }); 268 | // writeln!(&mut data, "\n+\n").expect("Failed to write to Vec"); 269 | // (0..count & 0xff).for_each(|_| { 270 | // writeln!(&mut data, "{BASE_QUAL}").expect("Failed to write to Vec"); 271 | // }); 272 | // writeln!(&mut data, "\n").expect("Failed to write to Vec"); 273 | // }); 274 | 275 | // assert_eq!(count_base(data).unwrap(), total_len); 276 | // } 277 | --------------------------------------------------------------------------------