├── .github
    └── workflows
    │   └── rust.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── benches
    └── benchmark.rs
├── src
    ├── lib.rs
    └── record.rs
└── tests
    └── test.rs


/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
 1 | name: Rust
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 | 
 9 | env:
10 |   CARGO_TERM_COLOR: always
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v3
19 |     - name: Build
20 |       run: cargo build --verbose
21 |     - name: Run tests
22 |       run: cargo test --verbose
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | /target/
 4 | 
 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 7 | Cargo.lock
 8 | 
 9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "kseq"
 3 | version = "0.5.3"
 4 | authors = ["Moold <mooldhu@gmail.com>"]
 5 | edition = "2018"
 6 | license = "MIT"
 7 | description = "a simple fasta/fastq format parser library"
 8 | homepage = "https://github.com/moold/kseq"
 9 | repository = "https://github.com/moold/kseq"
10 | readme = "README.md"
11 | keywords  = ["fastq", "fasta"]
12 | 
13 | [dependencies]
14 | atty = "0.2"
15 | flate2 = { version = ">=1.0.17", features = ["zlib-ng-compat"], default-features = false }
16 | memchr = "2.5"
17 | 
18 | [dev-dependencies]
19 | criterion = "0.4"
20 | needletail = "0.4"
21 | 
22 | [[bench]]
23 | name = "benchmark"
24 | harness = false
25 | 
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Hu Jiang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Crates.io](https://img.shields.io/crates/d/kseq?logo=rust)](https://github.com/moold/kseq/archive/refs/heads/main.zip)
 2 | [![Crates.io](https://img.shields.io/crates/v/kseq)](https://crates.io/crates/kseq)
 3 | [![docs.rs](https://img.shields.io/docsrs/kseq)](https://docs.rs/kseq/)
 4 | # kseq
 5 | `kseq` is a simple fasta/fastq (**fastx**) format parser library for [Rust](https://www.rust-lang.org/). Its main function is to iterate over the records from fastx files (similar to [kseq](https://attractivechaos.github.io/klib/#Kseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser) in `C`). It uses a shared buffer to read and store records, so it is very fast. It supports a **plain** or **gz** fastx file or [`io::stdin`](https://doc.rust-lang.org/std/io/fn.stdin.html), as well as a **fofn** (file-of-file-names) file, which contains multiple plain or gz fastx files (one per line).
 6 | 
 7 | Using `kseq` is very simple. Users only need to call `parse_path` to parse a path or `parse_reader` to parse a reader, and then use `iter_record` method to get each record.
 8 | 
 9 | - `parse_path` This function takes a path that implements [`AsRef<std::path::Path>`](https://doc.rust-lang.org/std/path/struct.Path.html) as input, a path can be a `fastx` file, `-` for [`io::stdin`](https://doc.rust-lang.org/std/io/fn.stdin.html), or a `fofn` file. It returns a `Result` type:
10 | 	- `Ok(T)`: A struct `T` with the `iter_record` method.
11 | 	- `Err(E)`: An error `E`, for example missing input, a file that can't be opened or read, a wrong fastx format, or an invalid path or file.
12 | 
13 | - `parse_reader` This function takes a reader that implements [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) as input. It returns a `Result` type:
14 | 	- `Ok(T)`: A struct `T` with the `iter_record` method.
15 | 	- `Err(E)`: An error `E`, for example missing input, a file that can't be opened or read, a wrong fastx format, or an invalid path or file.
16 | 
17 | - `iter_record` This function can be called in a loop, it returns a `Result<Option<Record>>` type:
18 | 	- `Ok(Some(Record))`: A struct `Record` with methods:
19 | 		- `head -> &str`: get sequence id/identifier
20 | 		- `seq -> &str`:  get sequence
21 | 		- `des -> &str`:  get sequence description/comment
22 | 		- `sep -> &str`:  get separator
23 | 		- `qual -> &str`: get quality scores
24 | 		- `len -> usize`: get sequence length
25 | 
26 | 		***Note:*** calling `des`, `sep` or `qual` returns `""` if the `Record` doesn't have that attribute.
27 | 	- `Ok(None)`: Stream has reached `EOF`.
28 | 	- `Err(ParseError)`: An error [`ParseError`](https://docs.rs/kseq/latest/kseq/record/enum.ParseError.html), which is one of `Io`, `TruncateFile`, `InvalidFastx`, `InvalidFasta` or `InvalidFastq`.
29 | 
30 | ## Example
31 | ```no_run 
32 | use std::env::args;
33 | use std::fs::File;
34 | use kseq::parse_path;
35 | 
36 | fn main(){
37 | 	let path: String = args().nth(1).unwrap();
38 | 	let mut records = parse_path(path).unwrap();
39 | 	// let mut records = parse_reader(File::open(path).unwrap()).unwrap();
40 | 	while let Some(record) = records.iter_record().unwrap() {
41 | 		println!("head:{} des:{} seq:{} qual:{} len:{}", 
42 | 			record.head(), record.des(), record.seq(), 
43 | 			record.qual(), record.len());
44 | 	}
45 | }
46 | ```
47 | 
48 | ## Installation
49 | ```text 
50 | cargo add kseq
51 | ```
52 | 
53 | ## Benchmarking 
54 | ```text
55 | cargo bench
56 | ```


--------------------------------------------------------------------------------
/benches/benchmark.rs:
--------------------------------------------------------------------------------
  1 | use criterion::{criterion_group, criterion_main, Criterion};
  2 | use kseq;
  3 | use needletail::parser::{FastaReader, FastqReader, FastxReader};
  4 | use std::{io::Cursor, iter};
  5 | 
  6 | fn simulate_fastq(total: usize) -> Vec<u8> {
  7 |     let mut data: Vec<u8> = vec![];
  8 |     let mut n = 0;
  9 |     let mut sum = 0;
 10 |     let mut seq_len = 100;
 11 |     loop {
 12 |         n += 1;
 13 |         data.push(b'@');
 14 |         data.extend(n.to_string().as_bytes());
 15 |         data.push(b'\n');
 16 |         if sum + seq_len > total {
 17 |             seq_len = total - sum;
 18 |         }
 19 |         data.extend(iter::repeat(b'A').take(seq_len));
 20 |         data.extend([b'\n', b'+', b'\n']);
 21 |         data.extend(iter::repeat(b'!').take(seq_len));
 22 |         data.push(b'\n');
 23 |         sum += seq_len;
 24 |         seq_len += 2;
 25 |         if sum >= total {
 26 |             break;
 27 |         }
 28 |     }
 29 |     // println!("{}", str::from_utf8(&data).unwrap());
 30 |     data
 31 | }
 32 | 
 33 | fn simulate_fasta(total: usize) -> Vec<u8> {
 34 |     let mut data: Vec<u8> = vec![];
 35 |     let mut n = 0;
 36 |     let mut sum = 0;
 37 |     let mut seq_len = 100;
 38 |     loop {
 39 |         n += 1;
 40 |         data.push(b'>');
 41 |         data.extend(n.to_string().as_bytes());
 42 |         data.push(b'\n');
 43 |         if sum + seq_len > total {
 44 |             seq_len = total - sum;
 45 |         }
 46 |         for _ in 0..seq_len / 100 {
 47 |             data.extend(iter::repeat(b'A').take(100));
 48 |             data.push(b'\n');
 49 |         }
 50 |         data.extend(iter::repeat(b'A').take(seq_len % 100));
 51 |         data.push(b'\n');
 52 |         sum += seq_len;
 53 |         seq_len += 2;
 54 |         if sum >= total {
 55 |             break;
 56 |         }
 57 |     }
 58 |     // println!("{}", str::from_utf8(&data).unwrap());
 59 |     data
 60 | }
 61 | 
 62 | fn bench_fasta_file(c: &mut Criterion) {
 63 |     let n_total = 1_000_000_000;
 64 |     let data = simulate_fasta(n_total);
 65 | 
 66 |     let mut group = c.benchmark_group("FASTA parsing(1GB)");
 67 |     group.sample_size(30);
 68 | 
 69 |     group.bench_function("kseq", |bench| {
 70 |         bench.iter(|| {
 71 |             let mut n_bases = 0;
 72 |             let mut records = kseq::parse_reader(Cursor::new(&data)).unwrap();
 73 |             while let Ok(Some(record)) = records.iter_record() {
 74 |                 n_bases += record.seq().len() as u64;
 75 |             }
 76 |             assert_eq!(n_bases, n_total as u64);
 77 |         });
 78 |     });
 79 | 
 80 |     group.bench_function("needletail", |bench| {
 81 |         bench.iter(|| {
 82 |             let mut n_bases = 0;
 83 |             let mut records = FastaReader::new(Cursor::new(&data));
 84 |             while let Some(Ok(record)) = records.next() {
 85 |                 n_bases += record.seq().len() as u64;
 86 |             }
 87 |             assert_eq!(n_bases, n_total as u64);
 88 |         });
 89 |     });
 90 | 
 91 |     group.finish();
 92 | }
 93 | 
 94 | fn bench_fastq_file(c: &mut Criterion) {
 95 |     let n_total = 1_000_000_000;
 96 |     let data = simulate_fastq(n_total);
 97 | 
 98 |     let mut group = c.benchmark_group("FASTQ parsing(1GB)");
 99 |     group.sample_size(30);
100 | 
101 |     group.bench_function("kseq", |bench| {
102 |         bench.iter(|| {
103 |             let mut n_bases = 0;
104 |             let mut records = kseq::parse_reader(Cursor::new(&data)).unwrap();
105 |             while let Ok(Some(record)) = records.iter_record() {
106 |                 n_bases += record.seq().len() as u64;
107 |             }
108 |             assert_eq!(n_bases, n_total as u64);
109 |         });
110 |     });
111 | 
112 |     group.bench_function("needletail", |bench| {
113 |         bench.iter(|| {
114 |             let mut n_bases = 0;
115 |             let mut records = FastqReader::new(Cursor::new(&data));
116 |             while let Some(Ok(record)) = records.next() {
117 |                 n_bases += record.seq().len() as u64;
118 |             }
119 |             assert_eq!(n_bases, n_total as u64);
120 |         });
121 |     });
122 | 
123 |     group.finish();
124 | }
125 | 
// Register both benchmarks in the `io` group and generate the benchmark entry point
// (the bench target is declared with `harness = false` in Cargo.toml).
criterion_group!(io, bench_fastq_file, bench_fasta_file);
criterion_main!(io);
128 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![doc = include_str!("../README.md")]
 2 | // Note: kseq is inspired by fastq-rs and kseq in C
 3 | 
 4 | use flate2::read::MultiGzDecoder;
 5 | use std::{
 6 |     fs::File,
 7 |     io::{stdin, BufRead, BufReader, Cursor, Error, ErrorKind, Read, Result},
 8 |     path::Path,
 9 | };
10 | 
11 | pub mod record;
12 | use record::{Fastx, Reader, Readers, Result as ParseResult};
13 | 
/// The parser returned by [`parse_path`] and [`parse_reader`]: either a reader for a single
/// fastx stream or the readers for all files listed in a fofn (file-of-file-names) input.
pub enum Paths<'a> {
    /// A single fastx stream whose first byte is `@` or `>`.
    Reader(Reader<'a>),
    /// The readers collected from the paths listed in a fofn input.
    Readers(Readers<'a>),
}
19 | 
20 | impl<'a> Paths<'a> {
21 |     // parse a reader to a Reader or Readers
22 |     fn new(mut reader: Box<dyn BufRead + 'a>, path: &Path) -> Result<Self> {
23 |         let mut format_bytes = [0u8; 4];
24 |         reader.read_exact(&mut format_bytes)?;
25 |         reader = Box::new(Cursor::new(format_bytes.to_vec()).chain(reader));
26 |         if &format_bytes[..2] == b"\x1f\x8b" {
27 |             // for gz format
28 |             reader = Box::new(BufReader::with_capacity(65536, MultiGzDecoder::new(reader)));
29 |             format_bytes.iter_mut().for_each(|m| *m = 0);
30 |             reader.read_exact(&mut format_bytes)?;
31 |             reader = Box::new(Cursor::new(format_bytes.to_vec()).chain(reader));
32 |         }
33 | 
34 |         match format_bytes[0] {
35 |             b'@' | b'>' => Ok(Paths::Reader(Reader::new(reader))),
36 |             _ => {
37 |                 // for a fofn file
38 |                 let mut paths = Readers::new();
39 |                 let parent = path.parent().unwrap_or_else(|| Path::new(""));
40 | 
41 |                 for _line in reader.lines() {
42 |                     let _line = _line?;
43 |                     let line = _line.trim();
44 |                     if line.starts_with('#') || line.is_empty() {
45 |                         continue;
46 |                     }
47 |                     let path = parent.join(line); // convert to an absolute path
48 |                     if path.exists() {
49 |                         match parse_path(path)? {
50 |                             Paths::Reader(reader) => paths.readers.push(reader),
51 |                             Paths::Readers(readers) => paths.readers.extend(readers.readers),
52 |                         }
53 |                     } else {
54 |                         return Err(Error::new(
55 |                             ErrorKind::InvalidData,
56 |                             format!("{:?} is not a valid fastq/fasta/fofn file", path),
57 |                         ));
58 |                     }
59 |                 }
60 |                 Ok(Paths::Readers(paths))
61 |             }
62 |         }
63 |     }
64 | 
65 |     /// iterate a fatsx record for a Reader or Readers
66 |     pub fn iter_record(&mut self) -> ParseResult<Option<Fastx>> {
67 |         match self {
68 |             Paths::Reader(t) => t.iter_record(),
69 |             Paths::Readers(t) => t.iter_record(),
70 |         }
71 |     }
72 | }
73 | 
74 | /// parse path to a Reader or Readers
75 | pub fn parse_path<'a, P: AsRef<Path> + 'a>(path: P) -> Result<Paths<'a>> {
76 |     let path = path.as_ref();
77 |     let reader: Box<dyn BufRead> = if path == Path::new("-") {
78 |         if atty::is(atty::Stream::Stdin) {
79 |             return Err(Error::new(ErrorKind::InvalidInput, "Missing input"));
80 |         }
81 |         Box::new(BufReader::with_capacity(65536, stdin()))
82 |     } else {
83 |         Box::new(BufReader::with_capacity(65536, File::open(path)?))
84 |     };
85 |     Paths::new(reader, path)
86 | }
87 | 
88 | /// parse reader to a Reader or Readers
89 | pub fn parse_reader<'a, R: Read + 'a>(reader: R) -> Result<Paths<'a>> {
90 |     Paths::new(
91 |         Box::new(BufReader::with_capacity(65536, reader)),
92 |         Path::new(""),
93 |     )
94 | }
95 | 


--------------------------------------------------------------------------------
/src/record.rs:
--------------------------------------------------------------------------------
  1 | use std::{
  2 |     error, fmt,
  3 |     io::{self, ErrorKind},
  4 |     str,
  5 | };
  6 | 
/// Result type of the record parser; the error type is always [`ParseError`].
pub type Result<T> = std::result::Result<T, ParseError>;
  8 | 
/// The type of error that is returned while parsing fastx files
#[derive(Debug)]
pub enum ParseError {
    /// IO error
    Io(io::Error),
    /// A truncated record was found at the end of the input; carries the bytes read for it
    TruncateFile(String),
    /// Not a valid fastx record: the record starts with neither `>` nor `@`; carries the
    /// offending line
    InvalidFastx(String),
    /// Not a valid fasta record: the record starts with `>` but its identifier or its
    /// sequence is empty; carries the record identifier
    InvalidFasta(String),
    /// Not a valid fastq record: the record starts with `@` but its identifier or sequence
    /// is empty, or the sequence and quality lengths differ; carries the record identifier
    InvalidFastq(String),
}
 23 | 
 24 | impl fmt::Display for ParseError {
 25 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 26 |         match self {
 27 |             ParseError::Io(err) => write!(f, "IO error: {}", err),
 28 |             ParseError::TruncateFile(record) => {
 29 |                 write!(f, "Truncate file, problematic record: {}", record)
 30 |             }
 31 |             ParseError::InvalidFastx(record) => {
 32 |                 write!(f, "Not a valid fastx record: {}", record)
 33 |             }
 34 |             ParseError::InvalidFasta(record) => {
 35 |                 write!(f, "Not a valid fasta record: {}", record)
 36 |             }
 37 |             ParseError::InvalidFastq(record) => {
 38 |                 write!(f, "Not a valid fastq record: {}", record)
 39 |             }
 40 |         }
 41 |     }
 42 | }
 43 | 
 44 | impl From<io::Error> for ParseError {
 45 |     fn from(err: io::Error) -> ParseError {
 46 |         ParseError::Io(err)
 47 |     }
 48 | }
 49 | 
// Marks `ParseError` as a standard error type; only the trait's default methods are used.
impl error::Error for ParseError {}
 51 | 
 52 | /// a structure representing the sequence in a fastx file
 53 | pub struct Fastx<'a> {
 54 |     _head: usize,
 55 |     _des: usize,
 56 |     _seq: usize,
 57 |     _sep: usize,
 58 |     _qual: usize,
 59 |     _data: &'a Vec<u8>,
 60 | }
 61 | 
 62 | impl Fastx<'_> {
 63 |     /// get sequence id/identifier
 64 |     #[inline]
 65 |     pub fn head(&self) -> &str {
 66 |         unsafe { str::from_utf8_unchecked(&self._data[1..self._head]) }
 67 |     }
 68 | 
 69 |     /// get sequence
 70 |     #[inline]
 71 |     pub fn seq(&self) -> &str {
 72 |         unsafe { str::from_utf8_unchecked(&self._data[self._des..self._seq]) }
 73 |     }
 74 | 
 75 |     /// get sequence description/comment
 76 |     #[inline]
 77 |     pub fn des(&self) -> &str {
 78 |         if self._head < self._des {
 79 |             unsafe { str::from_utf8_unchecked(&self._data[self._head..self._des]) }
 80 |         } else {
 81 |             ""
 82 |         }
 83 |     }
 84 | 
 85 |     /// get separator
 86 |     #[inline]
 87 |     pub fn sep(&self) -> &str {
 88 |         if self._seq < self._sep {
 89 |             unsafe { str::from_utf8_unchecked(&self._data[self._seq..self._sep]) }
 90 |         } else {
 91 |             ""
 92 |         }
 93 |     }
 94 | 
 95 |     /// get quality scores
 96 |     #[inline]
 97 |     pub fn qual(&self) -> &str {
 98 |         if self._sep < self._qual {
 99 |             unsafe { str::from_utf8_unchecked(&self._data[self._sep..self._qual]) }
100 |         } else {
101 |             ""
102 |         }
103 |     }
104 | 
105 |     /// get sequence length
106 |     #[inline]
107 |     pub fn len(&self) -> usize {
108 |         self._seq - self._des
109 |     }
110 | 
111 |     /// check whether a fastx record is empty
112 |     pub fn is_empty(&self) -> bool {
113 |         self.len() == 0
114 |     }
115 | 
116 |     /// check whether a fastx record is a fasta record
117 |     pub fn is_fasta(&self) -> bool {
118 |         (!self._data.is_empty()) && self._data[0] == b'>'
119 |     }
120 | 
121 |     /// check whether a fastx record is a fastq record
122 |     pub fn is_fastq(&self) -> bool {
123 |         (!self._data.is_empty()) && self._data[0] == b'@'
124 |     }
125 | 
126 |     /// check a fastq record is valid
127 |     fn validate_fastq(&self) -> bool {
128 |         self.is_fastq() && !self.is_empty() && self._seq - self._des == self._qual - self._sep && self._head > 1
129 |     }
130 | 
131 |     /// check a fasta record is valid
132 |     fn validate_fasta(&self) -> bool {
133 |         self.is_fasta() && !self.is_empty() && self._head > 1
134 |     }
135 | }
136 | 
/// a reader with shared buffer
pub struct Reader<'a> {
    /// Buffered input the records are read from.
    reader: Box<dyn io::BufRead + 'a>,
    /// Buffer holding the bytes of the current record; cleared at the start of every
    /// `iter_record` call and borrowed by the returned `Fastx`.
    data: Vec<u8>,
}
142 | 
143 | impl<'a> Reader<'a> {
144 |     // Create a new Reader
145 |     pub(crate) fn new(r: Box<dyn io::BufRead + 'a>) -> Self {
146 |         Reader {
147 |             reader: r,
148 |             data: Vec::with_capacity(1024),
149 |         }
150 |     }
151 | 
152 |     // Check if this reader has any data left to be read.
153 |     fn has_data_left(&mut self) -> Result<bool> {
154 |         loop{
155 |             let available = self.reader.fill_buf().map_err(ParseError::Io)?;
156 |             if available.iter().any(|&x| !char::is_whitespace(x as char)){
157 |                 return Ok(true);
158 |             }else if available.is_empty() {
159 |                 return Ok(false);
160 |             }
161 |             let len = available.len();
162 |             self.reader.consume(len);
163 |         }
164 |     }
165 | 
166 |     // Return the next byte of the internal buffer
167 |     #[allow(dead_code)]
168 |     fn next_byte(&mut self) -> Result<Option<u8>> {
169 |         loop {
170 |             match self.reader.fill_buf() {
171 |                 Ok(n) => return Ok(n.first().copied()),
172 |                 Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
173 |                 Err(e) => return Err(ParseError::Io(e)),
174 |             };
175 |         }
176 |     }
177 | 
178 |     // Read all non-newline bytes into data until the newline byte or EOF is reached,
179 |     // the newline byte (if found) will not be appended to data.
180 |     fn read_line(&mut self, skip_blank_line: bool) -> Result<usize> {
181 |         let delim = b'\n';
182 |         loop {
183 |             let mut n = self.reader.read_until(delim, &mut self.data)?;
184 |             // reached EOF
185 |             if n == 0 {
186 |                 return Ok(n);
187 |             }
188 | 
189 |             if self.data.last() == Some(&delim) {
190 |                 self.data.pop();
191 |                 n -= 1;
192 |             }
193 |             if n != 0 || !skip_blank_line {
194 |                 return Ok(n);
195 |             }
196 |         }
197 |     }
198 | 
199 |     // Read all non-newline bytes into data until the delimiter byte or EOF is reached,
200 |     // the delimiter (if found) will not be appended to data.
201 |     fn read_until(&mut self, delim: u8) -> Result<usize> {
202 |         let mut read = 0;
203 |         loop {
204 |             let (done, used) = {
205 |                 let available = match self.reader.fill_buf() {
206 |                     Ok(n) => n,
207 |                     Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
208 |                     Err(e) => return Err(ParseError::Io(e)),
209 |                 };
210 |                 let mut s = 0;
211 |                 let mut mch = memchr::memchr2_iter(delim, b'\n', available);
212 |                 loop {
213 |                     match mch.next() {
214 |                         Some(i) => {
215 |                             self.data.extend_from_slice(&available[s..i]);
216 |                             read += i - s;
217 |                             s = i + 1;
218 |                             if available[i] == delim {
219 |                                 break (true, i);
220 |                             }
221 |                         }
222 |                         None => {
223 |                             self.data.extend_from_slice(&available[s..]);
224 |                             read += available.len() - s;
225 |                             break (false, available.len());
226 |                         }
227 |                     }
228 |                 }
229 |             };
230 |             self.reader.consume(used);
231 |             if done || used == 0 {
232 |                 return Ok(read);
233 |             }
234 |         }
235 |     }
236 | 
237 |     // Read the exact number of non-newline bytes into data.
238 |     fn read_exact(&mut self, len: usize) -> Result<usize> {
239 |         let mut read = 0;
240 |         loop {
241 |             let (done, used) = {
242 |                 let available = match self.reader.fill_buf() {
243 |                     Ok(n) => n,
244 |                     Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
245 |                     Err(e) => return Err(ParseError::Io(e)),
246 |                 };
247 |                 let mut s = 0;
248 |                 let mut mch = memchr::memchr_iter(b'\n', available);
249 |                 loop {
250 |                     match mch.next() {
251 |                         Some(i) => {
252 |                             if read + i - s >= len {
253 |                                 let e = len - read + s;
254 |                                 self.data.extend_from_slice(&available[s..e]);
255 |                                 read = len;
256 |                                 break (true, e);
257 |                             } else {
258 |                                 self.data.extend_from_slice(&available[s..i]);
259 |                                 read += i - s;
260 |                             }
261 |                             s = i + 1;
262 |                         }
263 |                         None => {
264 |                             if available.len() - s + read >= len {
265 |                                 let e = len - read + s;
266 |                                 self.data.extend_from_slice(&available[s..e]);
267 |                                 read = len;
268 |                                 break (true, e);
269 |                             } else {
270 |                                 self.data.extend_from_slice(&available[s..]);
271 |                                 read += available.len() - s;
272 |                                 break (false, available.len());
273 |                             }
274 |                         }
275 |                     }
276 |                 }
277 |             };
278 |             self.reader.consume(used);
279 |             if done || used == 0 {
280 |                 return Ok(read);
281 |             }
282 |         }
283 |     }
284 | 
285 |     /// iterate over a record from this Reader
286 |     pub fn iter_record(&mut self) -> Result<Option<Fastx>> {
287 |         // clean the last record
288 |         self.data.clear();
289 |         // read sequence head
290 |         let des = self.read_line(true)?;
291 |         if des == 0 {
292 |             // reach the EOF
293 |             return Ok(None);
294 |         } else if self.data[0] != b'>' && self.data[0] != b'@' {
295 |             // safely unwrap
296 |             return Err(ParseError::InvalidFastx(
297 |                 String::from_utf8(self.data.to_owned()).unwrap(),
298 |             ));
299 |         }
300 | 
301 |         let head = self
302 |             .data
303 |             .iter()
304 |             .position(|&x| char::is_whitespace(x as char))
305 |             .map_or_else(|| des, |x| x);
306 |         let mut seq = des;
307 |         let mut sep = seq;
308 |         let mut qual = sep;
309 | 
310 |         let is_fasta = self.data[0] == b'>';
311 |         if is_fasta {
312 |             seq += self.read_until(b'>')?;
313 |         } else {
314 |             seq += self.read_until(b'+')?;
315 |             sep = seq + self.read_line(true)?;
316 |             qual = sep + self.read_exact(seq - des)?;
317 |         }
318 | 
319 |         if !self.has_data_left()? && (head == 1 || seq == des || (!is_fasta && (sep == seq || qual == sep))){
320 |             // safely unwrap
321 |             return Err(ParseError::TruncateFile(
322 |                 String::from_utf8(self.data.to_owned()).unwrap(),
323 |             ));
324 |         }
325 |         // println!("head:{head} des {des} seq {seq} sep {sep} qual {qual}");
326 |         let fastx = Fastx {
327 |             _head: head,
328 |             _des: des,
329 |             _seq: seq,
330 |             _sep: sep,
331 |             _qual: qual,
332 |             _data: &self.data,
333 |         };
334 | 
335 |         if is_fasta && !fastx.validate_fasta() {
336 |             return Err(ParseError::InvalidFasta(fastx.head().to_string()));
337 |         } else if !(is_fasta || fastx.validate_fastq()) {
338 |             return Err(ParseError::InvalidFastq(fastx.head().to_string()));
339 |         }
340 |         Ok(Some(fastx))
341 |     }
342 | }
343 | 
/// multiple readers for a fofn file
pub struct Readers<'a> {
    /// Index of the reader that records are currently taken from.
    index: usize,
    /// One reader per fastx input, in the order they were added.
    pub(crate) readers: Vec<Reader<'a>>,
}
349 | 
350 | impl<'a> Default for Readers<'a> {
351 |     fn default() -> Self {
352 |         Self::new()
353 |     }
354 | }
355 | 
356 | impl<'a> Readers<'a> {
357 |     /// create a new Readers
358 |     pub(crate) fn new() -> Self {
359 |         Readers {
360 |             index: 0,
361 |             readers: Vec::new(),
362 |         }
363 |     }
364 | 
365 |     /// iterate over a record from this Readers
366 |     pub(crate) fn iter_record(&mut self) -> Result<Option<Fastx>> {
367 |         for idx in self.index..self.readers.len() {
368 |             if self.readers[idx].has_data_left()? {
369 |                 return self.readers[idx].iter_record();
370 |             }
371 |             self.index += 1;
372 |         }
373 |         Ok(None)
374 |     }
375 | }
376 | 


--------------------------------------------------------------------------------
/tests/test.rs:
--------------------------------------------------------------------------------
  1 | use kseq;
  2 | use std::io::Cursor;
  3 | use std::result::Result;
  4 | 
// https://stackoverflow.com/questions/53124930/how-do-you-test-for-a-specific-rust-error
// Asserts that `$expression` matches the given pattern; otherwise panics with a message
// showing the expected pattern and the actual value, which is printed with `{:?}`.
macro_rules! assert_err {
    ($expression:expr, $($pattern:tt)+) => {
        match $expression {
            $($pattern)+ => (),
            ref e => panic!("expected `{}` but got `{:?}`", stringify!($($pattern)+), e),
        }
    }
}
 14 | 
 15 | fn count_base(input: Vec<u8>) -> Result<usize, kseq::record::ParseError> {
 16 |     let mut len_bases = 0;
 17 |     let mut seq_bases = 0;
 18 |     let mut records = kseq::parse_reader(Cursor::new(input))?;
 19 |     while let Some(record) = records.iter_record()? {
 20 |         len_bases += record.len();
 21 |         seq_bases += record.seq().len();
 22 |     }
 23 |     assert_eq!(len_bases, seq_bases);
 24 |     Ok(len_bases)
 25 | }
 26 | 
// Sequence line shared by the test records (12 bases).
static BASE_SEQ: &str = "ATGCATGCATGC";
// Quality line shared by the test records, the same length as `BASE_SEQ`.
static BASE_QUAL: &str = "@@@@@@@@@@@@";
 29 | 
 30 | #[test]
 31 | fn test_normal_one_line_fasta() {
 32 |     let data: Vec<u8> =
 33 |         format!(">1 record1\n{seq}\n>2 record2\n{seq}", seq = BASE_SEQ).into_bytes();
 34 |     assert_eq!(BASE_SEQ.len() * 2, count_base(data).unwrap());
 35 | }
 36 | 
 37 | #[test]
 38 | fn test_normal_one_line_fastq() {
 39 |     let data: Vec<u8> = format!(
 40 |         "@1 record1\n{seq}\n+\n{qual}\n@2 record2\n{seq}\n+\n{qual}",
 41 |         seq = BASE_SEQ,
 42 |         qual = BASE_QUAL
 43 |     )
 44 |     .into_bytes();
 45 |     assert_eq!(BASE_SEQ.len() * 2, count_base(data).unwrap());
 46 | }
 47 | 
 48 | #[test]
 49 | fn test_normal_multi_line_fasta() {
 50 |     let data: Vec<u8> = format!(
 51 |         ">1 record1\n{seq}\n{seq}\n>2 record2\n{seq}\n{seq}",
 52 |         seq = BASE_SEQ
 53 |     )
 54 |     .into_bytes();
 55 |     assert_eq!(BASE_SEQ.len() * 4, count_base(data).unwrap());
 56 | }
 57 | 
 58 | #[test]
 59 | fn test_normal_multi_line_fastq() {
 60 |     let data: Vec<u8> = format!(
 61 |         "@1 record1\n{seq}\n{seq}\n+\n{qual}\n{qual}\n@2 record2\n{seq}\n{seq}\n+\n{qual}{qual}\n",
 62 |         seq = BASE_SEQ,
 63 |         qual = BASE_QUAL
 64 |     )
 65 |     .into_bytes();
 66 |     assert_eq!(BASE_SEQ.len() * 4, count_base(data).unwrap());
 67 | }
 68 | 
 69 | #[test]
 70 | fn test_truncate_fasta_miss_head() {
 71 |     let data: Vec<u8> = format!(">1 record1\n{seq}\n>", seq = BASE_SEQ).into_bytes();
 72 |     assert_err!(
 73 |         count_base(data),
 74 |         Err(kseq::record::ParseError::TruncateFile(_))
 75 |     );
 76 | }
 77 | 
 78 | #[test]
 79 | fn test_truncate_fasta_miss_seq() {
 80 |     let data: Vec<u8> = format!(">1 record1\n{seq}{seq}\n>2 record2", seq = BASE_SEQ).into_bytes();
 81 |     assert_err!(
 82 |         count_base(data),
 83 |         Err(kseq::record::ParseError::TruncateFile(_))
 84 |     );
 85 | }
 86 | 
 87 | #[test]
 88 | fn test_truncate_fastq_miss_head() {
 89 |     let data: Vec<u8> = format!(
 90 |         "@1 record1\n{seq}{seq}\n+\n{qual}{qual}\n@\n",
 91 |         seq = BASE_SEQ,
 92 |         qual = BASE_QUAL
 93 |     )
 94 |     .into_bytes();
 95 |     assert_err!(
 96 |         count_base(data),
 97 |         Err(kseq::record::ParseError::TruncateFile(_))
 98 |     );
 99 | }
100 | 
/// A FASTQ input that consists only of a header line (no sequence, no
/// trailing newline) is expected to be reported as `TruncateFile`.
#[test]
fn test_truncate_fastq_miss_seq() {
    // The input has no placeholders, so a byte-string literal replaces the
    // argument-less `format!` call; the bytes are identical.
    let data: Vec<u8> = b"@1 record1".to_vec();
    assert_err!(
        count_base(data),
        Err(kseq::record::ParseError::TruncateFile(_))
    );
}
109 | 
/// A FASTQ input that stops after the sequence line, before any `+`
/// separator, is expected to be reported as `TruncateFile`.
#[test]
fn test_truncate_fastq_miss_sep() {
    let fastq = format!("@1 record1\n{seq}{seq}", seq = BASE_SEQ);
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::TruncateFile(_)));
}
118 | 
/// A FASTQ input that stops right after the `+` separator, without any
/// quality line, is expected to be reported as `TruncateFile`.
#[test]
fn test_truncate_fastq_miss_qual() {
    let fastq = format!("@1 record1\n{seq}{seq}\n+", seq = BASE_SEQ);
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::TruncateFile(_)));
}
127 | 
/// A FASTA input whose first record has an empty header (`>` alone on its
/// line) is expected to be reported as `InvalidFasta`.
#[test]
fn test_invalid_fasta_miss_head() {
    let fasta = format!(">\n{seq}\n>2 record2\n{seq}", seq = BASE_SEQ);
    let outcome = count_base(fasta.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFasta(_)));
}
136 | 
/// A FASTA input whose first record has an empty sequence line before the
/// next header is expected to be reported as `InvalidFasta`.
#[test]
fn test_invalid_fasta_miss_seq() {
    let fasta = format!(">1 record1\n\n>2 record2\n{seq}", seq = BASE_SEQ);
    let outcome = count_base(fasta.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFasta(_)));
}
145 | 
/// A FASTA input in which every record's sequence is the empty string is
/// expected to be reported as `InvalidFasta`.
#[test]
fn test_invalid_fasta_with_seq_len_is_0() {
    let fasta = format!(">1 record1\n{seq}\n>2 record2\n{seq}", seq = "");
    let outcome = count_base(fasta.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFasta(_)));
}
154 | 
/// A FASTQ input whose first record has an empty header (`@` alone on its
/// line) is expected to be reported as `InvalidFastq`.
#[test]
fn test_invalid_fastq_miss_head() {
    let fastq = format!(
        "@\n{seq}{seq}\n+\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n",
        seq = BASE_SEQ,
        qual = BASE_QUAL
    );
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFastq(_)));
}
168 | 
/// A FASTQ input whose first record has only a line of `BASE_QUAL` text
/// between its header and the next record's header is expected to be
/// reported as `InvalidFastq`.
#[test]
fn test_invalid_fastq_miss_seq() {
    let fastq = format!(
        "@1 record1\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n",
        seq = BASE_SEQ,
        qual = BASE_QUAL
    );
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFastq(_)));
}
182 | 
/// A FASTQ input whose first record goes from the sequence line straight to
/// the quality line, with no `+` separator in between, is expected to be
/// reported as `InvalidFastq`.
#[test]
fn test_invalid_fastq_miss_sep() {
    let fastq = format!(
        "@1 record1\n{seq}{seq}\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n",
        seq = BASE_SEQ,
        qual = BASE_QUAL
    );
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFastq(_)));
}
196 | 
/// A FASTQ input whose first record has a sequence line followed directly
/// by the next record's header (no separator, no quality) is expected to be
/// reported as `InvalidFastq`.
#[test]
fn test_invalid_fastq_miss_qual() {
    let fastq = format!(
        "@1 record1\n{seq}{seq}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n",
        seq = BASE_SEQ,
        qual = BASE_QUAL
    );
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFastq(_)));
}
210 | 
211 | 
/// A FASTQ input whose quality text is shorter than its sequence is
/// expected to be reported as `InvalidFastx`.
#[test]
fn test_invalid_fastq_seq_has_diff_len_with_qual() {
    // Drop the last quality character. The bound is taken from `BASE_QUAL`
    // itself so the slice stays valid even if the two constants ever stop
    // having the same length.
    let short_qual = &BASE_QUAL[..BASE_QUAL.len() - 1];
    let data: Vec<u8> = format!(
        "@1 record1\n{seq}{seq}\n+\n{qual}{qual}\n@2 record2\n{seq}{seq}\n+\n{qual}{qual}\n",
        seq = BASE_SEQ,
        qual = short_qual
    )
    .into_bytes();
    assert_err!(
        count_base(data),
        Err(kseq::record::ParseError::InvalidFastx(_))
    );
}
225 | 
/// A FASTQ input in which every record has an empty sequence and an empty
/// quality line is expected to be reported as `InvalidFastq`.
#[test]
fn test_invalid_fastq_with_seq_len_is_0() {
    let fastq = format!(
        "@1 record1\n{seq}\n+\n{qual}\n@2 record2\n{seq}\n+\n{qual}\n",
        seq = "",
        qual = ""
    );
    let outcome = count_base(fastq.into_bytes());
    assert_err!(outcome, Err(kseq::record::ParseError::InvalidFastq(_)));
}
239 | 
240 | // #[test]
241 | // fn test_large_fasta() {
242 | //     let count = 1_000_000;
243 | //     let mut total_len = 0;
244 | //     let mut data = Vec::with_capacity(count * 20);
245 | //     (0..count).for_each(|x| {
246 | //         writeln!(&mut data, ">{x} record{x}\n").expect("Failed to write to Vec");
247 | //         (0..count & 0xff).for_each(|_| {
248 | //             total_len += BASE_SEQ.len();
249 | //             writeln!(&mut data, "{BASE_SEQ}").expect("Failed to write to Vec");
250 | //         });
251 | //         writeln!(&mut data, "\n").expect("Failed to write to Vec");
252 | //     });
253 | 
254 | //     assert_eq!(count_base(data).unwrap(), total_len);
255 | // }
256 | 
257 | // #[test]
258 | // fn test_large_fastq() {
259 | //     let count = 1_000_000;
260 | //     let mut total_len = 0;
261 | //     let mut data = Vec::with_capacity(count * 20);
262 | //     (0..count).for_each(|x| {
263 | //         writeln!(&mut data, "@{x} record{x}\n").expect("Failed to write to Vec");
264 | //         (0..count & 0xff).for_each(|_| {
265 | //             total_len += BASE_SEQ.len();
266 | //             writeln!(&mut data, "{BASE_SEQ}").expect("Failed to write to Vec");
267 | //         });
268 | //         writeln!(&mut data, "\n+\n").expect("Failed to write to Vec");
269 | //         (0..count & 0xff).for_each(|_| {
270 | //             writeln!(&mut data, "{BASE_QUAL}").expect("Failed to write to Vec");
271 | //         });
272 | //         writeln!(&mut data, "\n").expect("Failed to write to Vec");
273 | //     });
274 | 
275 | //     assert_eq!(count_base(data).unwrap(), total_len);
276 | // }
277 | 


--------------------------------------------------------------------------------