├── src
    ├── data.rs
    ├── subcommands.rs
    ├── lib.rs
    ├── parser.rs
    ├── bracken.rs
    ├── subcommands
    │   ├── convert_phylo.rs
    │   ├── convert_abundance.rs
    │   ├── combine_phylo.rs
    │   └── combine_abundance.rs
    ├── errors.rs
    ├── main.rs
    ├── io
    │   ├── report.rs
    │   ├── abundance_csv.rs
    │   └── newick.rs
    ├── kraken.rs
    ├── cli
    │   ├── logging.rs
    │   ├── subcommands.rs
    │   └── args.rs
    ├── data
    │   ├── abundance.rs
    │   └── tree.rs
    ├── cli.rs
    ├── io.rs
    └── taxonomy.rs
├── .gitignore
├── ci
    ├── script.sh
    ├── before_deploy.sh
    └── install.sh
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   └── rust.yml
├── tests
    └── sample_data
    │   ├── converted.tree
    │   ├── combined.tree
    │   ├── sample.kreport
    │   ├── converted.csv
    │   ├── sample_2.kreport
    │   └── combined.csv
├── LICENSE-MIT
├── CHANGELOG.md
├── Cargo.toml
├── .travis.yml
├── README.md
└── LICENSE-APACHE


/src/data.rs:
--------------------------------------------------------------------------------
1 | pub mod abundance;
2 | pub mod tree;
3 | 


--------------------------------------------------------------------------------
/src/subcommands.rs:
--------------------------------------------------------------------------------
1 | mod combine_abundance;
2 | mod combine_phylo;
3 | mod convert_abundance;
4 | mod convert_phylo;
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /target
 2 | Cargo.lock
 3 | _test_data
 4 | .vscode
 5 | .Rproj.user
 6 | *.Rproj
 7 | *.tree
 8 | *.newick
 9 | *.graph
10 | log
11 | .Rhistory
12 | TODO.txt
13 | 
14 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | // #![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
 2 | // #![allow(clippy::missing_const_for_fn)]
 3 | // #![allow(clippy::multiple_crate_versions)]
 4 | // #![allow(clippy::missing_errors_doc)]
 5 | // #![allow(clippy::module_name_repetitions)]
 6 | 
 7 | pub mod bracken;
 8 | pub mod data;
 9 | pub mod errors;
10 | pub mod kraken;
11 | pub mod parser;
12 | pub mod taxonomy;
13 | 
14 | #[macro_use]
15 | extern crate serde;
16 | 


--------------------------------------------------------------------------------
/ci/script.sh:
--------------------------------------------------------------------------------
 1 | # This script takes care of testing your crate
 2 | 
 3 | set -ex
 4 | 
 5 | main() {
 6 |     # cross build --target $TARGET
 7 |     cross build --target $TARGET --release
 8 | 
 9 |     if [ ! -z $DISABLE_TESTS ]; then
10 |         return
11 |     fi
12 | 
13 |     # cross test --target $TARGET
14 |     # cross test --target $TARGET --release
15 | 
16 |     # cross run --target $TARGET
17 |     # cross run --target $TARGET --release
18 | }
19 | 
20 | # we don't run the "test phase" when doing deploys
21 | if [ -z $TRAVIS_TAG ]; then
22 |     main
23 | fi
24 | 


--------------------------------------------------------------------------------
/src/parser.rs:
--------------------------------------------------------------------------------
 1 | use nom::IResult;
 2 | 
 3 | use crate::kraken::Indent;
 4 | 
 5 | pub fn spaces_and_rest(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> {
 6 |     nom::multi::fold_many0(
 7 |         nom::bytes::complete::tag("  "),
 8 |         Vec::new(),
 9 |         |mut acc: Vec<_>, item| {
10 |             acc.push(item);
11 |             acc
12 |         },
13 |     )(input)
14 | }
15 | 
16 | pub fn parse_ident_organism_name(input: &[u8]) -> IResult<&[u8], (Indent, &[u8])> {
17 |     let (name, spaces) = spaces_and_rest(input)?;
18 | 
19 |     Ok((&[], (spaces.len(), name)))
20 | }
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[FEATURE] title"
 5 | labels: "[type] feature"
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. 
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: "[problem] bug"
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behaviour:
15 | 
16 | **Expected behaviour**
17 | A clear and concise description of what you expected to happen.
18 | 
19 | **Screenshots**
20 | If applicable, add screenshots to help explain your problem.
21 | 
22 | **Environment (please complete the following information):**
23 |  - OS: [e.g. Windows 10, Linux, MacOS, all]
24 |  - Version [e.g. 0.0.1]
25 | 
26 | **Additional context**
27 | Add any other context about the problem here.
28 | 


--------------------------------------------------------------------------------
/tests/sample_data/converted.tree:
--------------------------------------------------------------------------------
1 | (((((((((Pyrobaculum_neutrophilum:1)Pyrobaculum:1)Thermoproteaceae:1)Thermoproteales:1)Thermoprotei:1)Crenarchaeota:1)Archaea:1,((((((Lactococcus_lactis:1)Lactococcus:1)Streptococcaceae:1)Lactobacillales:1)Bacilli:1)Firmicutes:1,(((((Escherichia_coli:1)Escherichia:1)Enterobacteriaceae:1)Enterobacterales:1)Gammaproteobacteria:1)Proteobacteria:1)Bacteria:1,(((((((Saccharomyces_cerevisiae:1)Saccharomyces:1)Saccharomycetaceae:1)Saccharomycetales:1)Saccharomycetes:1)Ascomycota:1)Fungi:1,((((((((Tursiops_truncatus:1)Tursiops:1)Delphinidae:1)Artiodactyla:1,(((Canis_lupus:1)Canis:1)Canidae:1)Carnivora:1)Laurasiatheria:1,((((Homo_sapiens:1)Homo:1)Hominidae:1)Primates:1)Euarchontoglires:1)Boreoeutheria:1)Mammalia:1)Chordata:1)Metazoa:1)Eukaryota:1)cellular_organisms:1)root:0)unclassified:0;
2 | 


--------------------------------------------------------------------------------
/ci/before_deploy.sh:
--------------------------------------------------------------------------------
 1 | # This script takes care of building your crate and packaging it for release
 2 | 
 3 | set -ex
 4 | 
 5 | main() {
 6 |     local src=$(pwd) \
 7 |           stage=
 8 | 
 9 |     case $TRAVIS_OS_NAME in
10 |         linux)
11 |             stage=$(mktemp -d)
12 |             ;;
13 |         osx)
14 |             stage=$(mktemp -d -t tmp)
15 |             ;;
16 |     esac
17 | 
18 |     test -f Cargo.lock || cargo generate-lockfile
19 | 
20 |     cross build --target $TARGET --release
21 |     
22 |     if [[ $TARGET == *"pc-windows"* ]]; then
23 |         cp target/$TARGET/release/spideog.exe $stage/
24 |     else 
25 |         cp target/$TARGET/release/spideog $stage/
26 |     fi
27 | 
28 |     cd $stage
29 |     tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz *
30 |     cd $src
31 | 
32 |     rm -rf $stage
33 | }
34 | 
35 | main
36 | 


--------------------------------------------------------------------------------
/tests/sample_data/combined.tree:
--------------------------------------------------------------------------------
1 | (((((((((Tilapia_tilapinevirus:1)Tilapinevirus:1)Amnoonviridae:1,((Influenza_B_virus:1)Betainfluenzavirus:1)Orthomyxoviridae:1)Articulavirales:1)Insthoviricetes:1)Negarnaviricota:1)Orthornavirae:1)Viruses:1,(((((((Pyrobaculum_neutrophilum:1)Pyrobaculum:1)Thermoproteaceae:1)Thermoproteales:1)Thermoprotei:1)Crenarchaeota:1)Archaea:1,((((((Lactococcus_lactis:1)Lactococcus:1)Streptococcaceae:1)Lactobacillales:1)Bacilli:1)Firmicutes:1,(((((Escherichia_coli:1)Escherichia:1)Enterobacteriaceae:1)Enterobacterales:1)Gammaproteobacteria:1)Proteobacteria:1)Bacteria:1,(((((((Saccharomyces_pastorianus:1,Saccharomyces_kudriavzevii:1,Saccharomyces_cerevisiae:1)Saccharomyces:1)Saccharomycetaceae:1)Saccharomycetales:1)Saccharomycetes:1)Ascomycota:1)Fungi:1,((((((((Tursiops_truncatus:1)Tursiops:1)Delphinidae:1)Artiodactyla:1,(((Felis_catus:1)Felis:1)Felidae:1,((Canis_lupus:1)Canis:1)Canidae:1)Carnivora:1)Laurasiatheria:1,((((Homo_sapiens:1)Homo:1)Hominidae:1)Primates:1)Euarchontoglires:1)Boreoeutheria:1)Mammalia:1)Chordata:1)Metazoa:1)Eukaryota:1)cellular_organisms:1)root:0)unclassified:0;
2 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2020 Jean Manguy
 2 | 
 3 | Permission is hereby granted, free of charge, to any
 4 | person obtaining a copy of this software and associated
 5 | documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without
 7 | limitation the rights to use, copy, modify, merge,
 8 | publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 | 
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/src/bracken.rs:
--------------------------------------------------------------------------------
 1 | use crate::kraken::Taxon;
 2 | 
 3 | #[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
 4 | pub struct AbundanceValues {
 5 |     pub kraken_assigned_reads: u64,
 6 |     pub added_reads: u64,
 7 |     pub new_est_reads: u64,
 8 |     pub fraction_total_reads: f64,
 9 | }
10 | #[derive(Clone, PartialEq, PartialOrd, Debug, Deserialize)]
11 | pub struct BrackenRecord {
12 |     #[serde(flatten)]
13 |     pub taxon: Taxon,
14 |     #[serde(flatten)]
15 |     pub abundance_values: AbundanceValues,
16 | }
17 | 
18 | #[cfg(test)]
19 | mod tests {
20 |     // use super::*;
21 |     // use indextree::Arena;
22 |     // use std::collections::BTreeMap;
23 | 
24 |     // #[test]
25 |     // fn bracken_works() {
26 |     //     let mut rdr = csv::ReaderBuilder::new()
27 |     //         .has_headers(true)
28 |     //         .delimiter(b'\t')
29 |     //         .from_path(r"C:\Users\Jean\Documents\spideog\_test_data\Sam9_species.bracken")
30 |     //         .unwrap();
31 |     //     // dbg!(rdr);
32 | 
33 |     //     let mut bracken = BTreeMap::new();
34 | 
35 |     //     for result in rdr.deserialize() {
36 |     //         let record: BrackenRecord = result.unwrap();
37 |     //         bracken.insert(record.organism, record.abundance_values);
38 |     //     }
39 | 
40 |     //     println!("{:?}", bracken);
41 |     // }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/subcommands/convert_phylo.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::{Help, Report};
 2 | use eyre::Context;
 3 | use libspideog::data::tree::Tree;
 4 | use tracing::instrument;
 5 | 
 6 | use crate::{
 7 |     cli::subcommands::{ConvertTree, Runner},
 8 |     io::newick::write_newick,
 9 |     io::{report::ParseKrakenReport, Output},
10 | };
11 | 
12 | impl Runner for ConvertTree {
13 |     #[instrument]
14 |     fn run(self) -> Result<(), Report> {
15 |         let input = &self.input.path;
16 | 
17 |         let reader = self.input.open_report()?;
18 |         let mut csv_reader = csv::ReaderBuilder::new()
19 |             .has_headers(self.input.headers)
20 |             .delimiter(b'\t')
21 |             .double_quote(false)
22 |             .flexible(true)
23 |             .from_reader(reader);
24 | 
25 |         let output = Output::from(self.output.file);
26 |         output.try_writtable()?;
27 | 
28 |         let tree: Tree = ParseKrakenReport::parse(&mut csv_reader)
29 |             .wrap_err_with(|| format!("failed to parse file `{}`", &input.display()))
30 |             .suggestion("try using the `--has-headers` option if your Kraken report has headers")?;
31 | 
32 |         let mut writer = output.writer()?;
33 | 
34 |         match self.output.format {
35 |             crate::io::OutputPhyloFormat::Newick => write_newick(&mut writer, &tree)?,
36 |         }
37 | 
38 |         Ok(())
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
 1 | name: Rust
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |   pull_request:
 7 |     branches: [main]
 8 | 
 9 | env:
10 |   CARGO_TERM_COLOR: always
11 | 
12 | jobs:
13 |   check:
14 |     name: Check
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         rust:
19 |           - stable
20 |           - nightly
21 |     steps:
22 |       - uses: actions/checkout@v1
23 |       - uses: actions-rs/toolchain@v1
24 |         with:
25 |           toolchain: ${{ matrix.rust }}
26 |           override: true
27 |       - uses: actions-rs/cargo@v1
28 |         with:
29 |           command: check
30 | 
31 |   test:
32 |     name: Tests
33 |     runs-on: ubuntu-latest
34 |     strategy:
35 |       matrix:
36 |         rust:
37 |           - stable
38 |           - nightly
39 | 
40 |     steps:
41 |       - uses: actions/checkout@v2
42 |       - name: Build
43 |         run: cargo build --verbose
44 |       - name: Run tests
45 |         run: cargo test --verbose
46 | 
47 |   clippy:
48 |     name: Clippy
49 |     runs-on: ubuntu-latest
50 |     strategy:
51 |       matrix:
52 |         rust:
53 |           - stable
54 |     steps:
55 |       - uses: actions/checkout@v1
56 |       - uses: actions-rs/toolchain@v1
57 |         with:
58 |           toolchain: ${{ matrix.rust }}
59 |           override: true
60 |       - run: rustup component add clippy
61 |       - uses: actions-rs/cargo@v1
62 |         with:
63 |           command: clippy
64 |           args: -- -D warnings
65 | 


--------------------------------------------------------------------------------
/src/subcommands/convert_abundance.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::{Help, Report};
 2 | use csv::ReaderBuilder;
 3 | use eyre::Context;
 4 | use libspideog::data::abundance::AbundanceData;
 5 | use tracing::instrument;
 6 | 
 7 | use crate::{
 8 |     cli::subcommands::{ConvertAbundance, Runner},
 9 |     io::{abundance_csv::WriteAbundanceCsv, report::ParseKrakenReport, Output},
10 | };
11 | 
12 | impl Runner for ConvertAbundance {
13 |     #[instrument]
14 |     fn run(self) -> Result<(), Report> {
15 |         let input = &self.input.path;
16 | 
17 |         let reader = self.input.open_report()?;
18 |         let mut csv_reader = ReaderBuilder::new()
19 |             .has_headers(self.input.headers)
20 |             .delimiter(b'\t')
21 |             .double_quote(false)
22 |             .flexible(true)
23 |             .from_reader(reader);
24 | 
25 |         let output = Output::from(self.output.file);
26 |         output.try_writtable()?;
27 | 
28 |         let data: AbundanceData = AbundanceData::parse(&mut csv_reader)
29 |             .wrap_err_with(|| format!("failed to parse file `{}`", &input.display()))
30 |             .suggestion("try using the `--has-headers` option if your Kraken report has headers")?;
31 | 
32 |         let mut writer = output.writer()?;
33 | 
34 |         match self.output.format {
35 |             crate::io::OutputAbundanceFormat::Csv => {
36 |                 data.write_csv(&mut writer)
37 |                     .wrap_err("failed to write output to CSV")?;
38 |             }
39 |         }
40 | 
41 |         Ok(())
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/ci/install.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | main() {
 4 |     local target=
 5 |     if [ $TRAVIS_OS_NAME = linux ]; then
 6 |         target=x86_64-unknown-linux-musl
 7 |         sort=sort
 8 |     else
 9 |         target=x86_64-apple-darwin
10 |         sort=gsort  # for `sort --sort-version`, from brew's coreutils.
11 |     fi
12 | 
13 |     # Builds for iOS are done on OSX, but require the specific target to be
14 |     # installed.
15 |     case $TARGET in
16 |         aarch64-apple-ios)
17 |             rustup target install aarch64-apple-ios
18 |             ;;
19 |         armv7-apple-ios)
20 |             rustup target install armv7-apple-ios
21 |             ;;
22 |         armv7s-apple-ios)
23 |             rustup target install armv7s-apple-ios
24 |             ;;
25 |         i386-apple-ios)
26 |             rustup target install i386-apple-ios
27 |             ;;
28 |         x86_64-apple-ios)
29 |             rustup target install x86_64-apple-ios
30 |             ;;
31 |     esac
32 | 
33 |     # This fetches latest stable release
34 |     local tag=$(git ls-remote --tags --refs --exit-code https://github.com/rust-embedded/cross \
35 |                        | cut -d/ -f3 \
36 |                        | grep -E '^v0.[0-9]+.[0-9]+$' \
37 |                        | $sort --version-sort \
38 |                        | tail -n1)
39 |     curl -LSfs https://japaric.github.io/trust/install.sh | \
40 |         sh -s -- \
41 |            --force \
42 |            --git japaric/cross \
43 |            --tag $tag \
44 |            --target $target
45 | }
46 | 
47 | main
48 | 


--------------------------------------------------------------------------------
/src/errors.rs:
--------------------------------------------------------------------------------
 1 | use crate::{data::tree::IndentedTaxon, taxonomy::Rank};
 2 | use displaydoc::Display;
 3 | use thiserror::Error;
 4 | 
 5 | #[derive(Display, Error, Debug)]
 6 | #[non_exhaustive]
 7 | pub enum SpideogError {
 8 |     /// expected root with no indentation, found indentation level: `{0}`
 9 |     NonZeroIndentRoot(usize),
10 |     /// no suitable parent found for node `{0}` of indent `{1}` and rank `{2}`
11 |     NoSuitableParent(String, usize, Rank),
12 |     /// no node added to the tree
13 |     NoNodeAdded,
14 |     /// failed to parse line `{0}`
15 |     LineParsingError(usize),
16 |     /// node not found
17 |     NodeNotFound,
18 |     /// edge between `{0}` and `{1}` not found
19 |     EdgeNotFound(IndentedTaxon, IndentedTaxon),
20 |     /// parse output error
21 |     ParseOutputPathError,
22 |     /// input file is empty
23 |     EmptyFile,
24 |     /// Kraken parser error
25 |     KrakenParser(#[source] csv::Error),
26 |     /// taxonomy tree is not initialized
27 |     TreeNotInitialized,
28 |     /// failed to parse taxon name and identation
29 |     KrakenIndentParsing,
30 |     /// other
31 |     Other,
32 | }
33 | 
34 | #[derive(Display, Error, Debug)]
35 | #[non_exhaustive]
36 | pub enum TaxRankParsingError {
37 |     /// failed to parse taxonomy rank offset from `{0}`: `{1}` is not a number (0..9)
38 |     OffsetNotANumber(String, char),
39 |     /// failed to parse taxonomy rank from `{0}`: found length `{1}` expected 1 or 2
40 |     InvalidLength(String, usize),
41 |     /// failed to parse taxonomy rank from `{0}`: invalid rank code `{1}` expected R, D, K, P, C, O, F, G, S, U, or -
42 |     InvalidRankCode(String, char),
43 |     /// failed to parse taxonomy rank from `{0}`: cannot infer previous taxonomy rank from previous records
44 |     TaxRankParsingCannotInferRank(String),
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
 1 | // #![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
 2 | // #![allow(clippy::missing_const_for_fn)]
 3 | // #![allow(clippy::multiple_crate_versions)]
 4 | // #![allow(clippy::missing_errors_doc)]
 5 | // #![allow(clippy::module_name_repetitions)]
 6 | 
 7 | #[macro_use]
 8 | extern crate eyre;
 9 | #[macro_use]
10 | extern crate clap;
11 | #[macro_use]
12 | extern crate custom_derive;
13 | #[macro_use]
14 | extern crate enum_derive;
15 | 
16 | mod cli;
17 | mod io;
18 | mod subcommands;
19 | 
20 | use crate::clap::Clap;
21 | use cli::{
22 |     subcommands::{Command, Runner},
23 |     Opts,
24 | };
25 | 
26 | use color_eyre::eyre::Report;
27 | use displaydoc::Display;
28 | use eyre::Context;
29 | use thiserror::Error;
30 | use tracing::instrument;
31 | 
32 | #[derive(Display, Error, Debug)]
33 | #[non_exhaustive]
34 | pub enum BinError {
35 |     /// IO error with `{path}`
36 |     Io {
37 |         #[source]
38 |         err: std::io::Error,
39 |         path: std::path::PathBuf,
40 |     },
41 |     /// encountered multiple errors
42 |     MultipleErrors,
43 | }
44 | 
45 | #[instrument]
46 | fn main() -> Result<(), Report> {
47 |     cli::install_tracing();
48 |     cli::setup_error_hook()?;
49 | 
50 |     let opts: Opts = Opts::parse();
51 | 
52 |     match opts.command {
53 |         Command::ConvertTree(args) => {
54 |             args.run().wrap_err("failed to convert taxonomy tree")?;
55 |         }
56 |         Command::CombineTrees(args) => {
57 |             args.run().wrap_err("failed to combine taxonomy trees")?;
58 |         }
59 |         Command::ConvertAbundance(args) => {
60 |             args.run()
61 |                 .wrap_err("failed to convert taxonomy abundance data")?;
62 |         }
63 |         Command::CombineAbundances(args) => {
64 |             args.run().wrap_err("failed to combine abundance data")?;
65 |         }
66 |     }
67 | 
68 |     Ok(())
69 | }
70 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | All notable changes to this project will be documented in this file.
 3 | 
 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 6 | 
 7 | <!-- next-header -->
 8 | 
 9 | ## [Unreleased] - ReleaseDate
10 | 
11 | ## [0.2.0] - 2020-11-17
12 | 
13 | ### Added
14 | 
15 | - command `convert-abundance`
16 | - command `combine-abundance`
17 | 
18 | 
19 | ### Modified
20 | 
21 | - fixed README typos
22 | 
23 | ## [0.1.2] - 2020-10-29
24 | 
25 | ### Added
26 | 
27 | - command `combine-trees` 
28 |   - read multiple Kraken reports
29 |   - write one Newick taxonomy tree
30 | - second example Kraken report to test combining trees
31 | 
32 | ### Modified
33 | 
34 | - changed `tree` to `convert-tree`
35 |   - read only one file
36 |   - write only one file (default: stdout)
37 | - dev: split codebase between libspideog (src/lib.rs) and spideog (src/main.rs)
38 | - dev: other refactoring and improvements of the codebase
39 | 
40 | ## [0.1.1] - 2020-10-24
41 | 
42 | ### Added 
43 | 
44 | - dev: continous integration builds for linux, osx, and windows
45 | - error: add spantrace
46 | - documentation: example kraken report and output
47 | - documentation: links to downloads
48 | 
49 | ### Modified
50 | 
51 | - bugfix: quotes and round brackets were added to the list of characters to escape in taxon name
52 | - refactor: started to refactor to facilitate unit testing
53 | 
54 | ## [0.1.0] - 2020-10-19
55 | 
56 | ### Added
57 | 
58 | - command `tree` to convert the taxonomy tree from Kraken reports to newick format
59 | 
60 | 
61 | <!-- next-url -->
62 | [Unreleased]: https://github.com/jeanmanguy/spideog/compare/v0.2.0...HEAD
63 | [0.2.0]: https://github.com/jeanmanguy/spideog/compare/v0.1.2...v0.2.0
64 | [0.1.2]: https://github.com/jeanmanguy/spideog/compare/v0.1.1...v0.1.2
65 | [0.1.1]: https://github.com/jeanmanguy/spideog/compare/v0.1.0...v0.1.1
66 | [0.1.0]: https://github.com/jeanmanguy/spideog/releases/tag/v0.1.0


--------------------------------------------------------------------------------
/src/io/report.rs:
--------------------------------------------------------------------------------
 1 | use std::{convert::TryFrom, fs::File};
 2 | 
 3 | use csv::Reader;
 4 | use libspideog::{
 5 |     data::abundance::AbundanceData,
 6 |     data::tree::{IndentedTaxon, Tree},
 7 |     errors::SpideogError,
 8 |     kraken::{Fragments, ReportRecord, Taxon},
 9 | };
10 | use tracing::instrument;
11 | 
12 | pub trait ParseKrakenReport: Sized {
13 |     fn parse(reader: &mut Reader<File>) -> Result<Self, SpideogError>;
14 | }
15 | 
16 | fn parse_origin_tree(
17 |     first_line: Option<Result<ReportRecord, csv::Error>>,
18 | ) -> Result<Tree, SpideogError> {
19 |     let first_line = first_line.ok_or(SpideogError::EmptyFile)?;
20 |     let first_record: ReportRecord = first_line.map_err(SpideogError::KrakenParser)?;
21 |     let origin = IndentedTaxon::try_from(first_record)?;
22 |     let mut taxonomy_tree = Tree::new();
23 |     taxonomy_tree.with_origin(origin);
24 |     Ok(taxonomy_tree)
25 | }
26 | 
27 | impl ParseKrakenReport for Tree {
28 |     #[instrument]
29 |     fn parse(reader: &mut Reader<File>) -> Result<Self, SpideogError> {
30 |         let first_line = reader.deserialize().next();
31 | 
32 |         let mut taxonomy_tree = parse_origin_tree(first_line)?;
33 | 
34 |         for result in reader.deserialize() {
35 |             let record: ReportRecord = result.map_err(SpideogError::KrakenParser)?;
36 |             let node = IndentedTaxon::try_from(record)?;
37 |             let parent = taxonomy_tree.find_valid_parent_for(&node)?;
38 |             taxonomy_tree.child(parent, node);
39 |         }
40 | 
41 |         Ok(taxonomy_tree)
42 |     }
43 | }
44 | 
45 | impl ParseKrakenReport for AbundanceData {
46 |     #[instrument]
47 |     fn parse(reader: &mut Reader<File>) -> Result<Self, SpideogError> {
48 |         let mut data = Self::new();
49 | 
50 |         for result in reader.deserialize() {
51 |             let record: ReportRecord = result.map_err(SpideogError::KrakenParser)?;
52 |             let taxon = Taxon::try_from(record.clone())?;
53 |             let fragments = Fragments::try_from(record)?;
54 |             data.insert(taxon, fragments);
55 |         }
56 | 
57 |         Ok(data)
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/kraken.rs:
--------------------------------------------------------------------------------
 1 | use std::{convert::TryFrom, fmt::Display};
 2 | 
 3 | use tracing::instrument;
 4 | 
 5 | use crate::{errors::SpideogError, parser::parse_ident_organism_name, taxonomy::Rank};
 6 | 
 7 | pub type ReportRecord = (String, u64, u64, Rank, u64, String);
 8 | pub type Indent = usize;
 9 | 
10 | #[derive(Clone, PartialEq, PartialOrd, Debug, Ord, Eq, Hash, Deserialize)]
11 | pub struct Taxon {
12 |     #[serde(rename = "taxonomy_lvl")]
13 |     pub taxonomy_level: Rank,
14 |     pub name: String,
15 |     pub taxonomy_id: u64,
16 | }
17 | 
18 | impl Display for Taxon {
19 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
20 |         write!(
21 |             f,
22 |             "{} taxid:{} rank:{}",
23 |             self.name, self.taxonomy_id, self.taxonomy_level
24 |         )
25 |     }
26 | }
27 | 
28 | impl TryFrom<ReportRecord> for Taxon {
29 |     type Error = SpideogError;
30 | 
31 |     #[instrument]
32 |     fn try_from(value: ReportRecord) -> Result<Self, Self::Error> {
33 |         let (_, (_, name)) = parse_ident_organism_name(value.5.as_bytes())
34 |             .map_err(|_e| SpideogError::KrakenIndentParsing)?;
35 | 
36 |         let taxon = Self {
37 |             taxonomy_level: value.3,
38 |             name: String::from_utf8_lossy(name).trim().to_string(),
39 |             taxonomy_id: value.4,
40 |         };
41 | 
42 |         Ok(taxon)
43 |     }
44 | }
45 | 
46 | #[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize, Default)]
47 | pub struct Fragments {
48 |     pub clade_percentage: f64,
49 |     pub clade_count_reads: u64,
50 |     pub taxon_count_reads: u64,
51 | }
52 | 
53 | impl TryFrom<ReportRecord> for Fragments {
54 |     type Error = SpideogError;
55 | 
56 |     #[instrument]
57 |     fn try_from(value: ReportRecord) -> Result<Self, Self::Error> {
58 |         let percentage = value.0.parse::<f64>().map_err(|_e| SpideogError::Other)?;
59 | 
60 |         let fragments = Self {
61 |             clade_percentage: percentage,
62 |             clade_count_reads: value.1,
63 |             taxon_count_reads: value.1,
64 |         };
65 | 
66 |         Ok(fragments)
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/cli/logging.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::{eyre::Context, Report};
 2 | use log::LevelFilter;
 3 | use simplelog::{ConfigBuilder, TermLogger, TerminalMode, WriteLogger};
 4 | use std::fs::OpenOptions;
 5 | use std::path::PathBuf;
 6 | use tracing::instrument;
 7 | 
 8 | #[derive(Clap, Debug)]
 9 | pub struct Logging {
10 |     /// Log file (stdout if not present)
11 |     #[clap(long, short, parse(from_os_str), global = true)]
12 |     pub log: Option<PathBuf>,
13 |     /// Show addditional information.
14 |     #[clap(long, global = true)]
15 |     pub verbose: bool,
16 | }
17 | 
18 | impl Logging {
19 |     #[instrument]
20 |     pub fn setup(&self) -> Result<(), Report> {
21 |         let verbosity: LevelFilter = if self.verbose {
22 |             LevelFilter::Debug
23 |         } else {
24 |             LevelFilter::Warn
25 |         };
26 | 
27 |         match &self.log {
28 |             Some(filepath) => Self::setup_file_log(verbosity, filepath),
29 |             None => Self::setup_term_log(verbosity),
30 |         }
31 |     }
32 | 
33 |     #[instrument]
34 |     fn setup_file_log(verbosity: LevelFilter, filepath: &PathBuf) -> Result<(), Report> {
35 |         let file = OpenOptions::new()
36 |             .write(true)
37 |             .truncate(false)
38 |             .create(true)
39 |             .open(filepath)?;
40 |         WriteLogger::init(
41 |             verbosity,
42 |             ConfigBuilder::new().set_time_format_str("%F %R%:z").build(),
43 |             file,
44 |         )
45 |         .wrap_err_with(|| {
46 |             format!(
47 |                 "Failed to setup the writer logger for file {}",
48 |                 filepath.display()
49 |             )
50 |         })?;
51 | 
52 |         Ok(())
53 |     }
54 | 
55 |     #[instrument]
56 |     fn setup_term_log(verbosity: LevelFilter) -> Result<(), Report> {
57 |         TermLogger::init(
58 |             verbosity,
59 |             ConfigBuilder::new()
60 |                 .set_time_level(LevelFilter::Off)
61 |                 .build(),
62 |             TerminalMode::Stderr,
63 |         )
64 |         .wrap_err("Failed to setup the writer logger for stdout")?;
65 | 
66 |         Ok(())
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/subcommands/combine_phylo.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::{Help, Report};
 2 | use libspideog::{data::tree::Tree, errors::SpideogError};
 3 | use tracing::instrument;
 4 | 
 5 | use crate::{
 6 |     cli::subcommands::{CombineTrees, Runner},
 7 |     io::{newick::write_newick, report::ParseKrakenReport, Output},
 8 | };
 9 | 
10 | type VecResultTrees = Vec<Result<Tree, SpideogError>>;
11 | 
12 | impl Runner for CombineTrees {
13 |     #[instrument]
14 |     fn run(self) -> Result<(), Report> {
15 |         let readers = self.input.open_reports()?;
16 |         let output = Output::from(self.output.file.clone());
17 |         output.try_writtable()?;
18 | 
19 |         let (ok_trees, errors_trees): (VecResultTrees, VecResultTrees) = readers
20 |             .into_iter()
21 |             .map(|r| -> Result<Tree, SpideogError> {
22 |                 let mut csv_reader = csv::ReaderBuilder::new()
23 |                     .has_headers(self.input.headers)
24 |                     .delimiter(b'\t')
25 |                     .double_quote(false)
26 |                     .flexible(true)
27 |                     .from_reader(r);
28 | 
29 |                 let tree: Tree = ParseKrakenReport::parse(&mut csv_reader)?;
30 | 
31 |                 Ok(tree)
32 |             })
33 |             .partition(Result::is_ok);
34 | 
35 |         if !errors_trees.is_empty() {
36 |             return errors_trees
37 |                 .into_iter()
38 |                 .filter_map(|result| {
39 |                     if let Err(error) = result {
40 |                         Some(error)
41 |                     } else {
42 |                         None
43 |                     }
44 |                 })
45 |                 .fold(Err(eyre!("encountered multiple errors")), |report, e| {
46 |                     report.error(e)
47 |                 });
48 |         }
49 | 
50 |         let mut trees_iter = ok_trees.into_iter().map(Result::unwrap);
51 | 
52 |         let combined_tree = trees_iter.try_fold(Tree::new(), Tree::try_combine_with)?;
53 | 
54 |         let mut writer = output.writer()?;
55 |         match self.output.format {
56 |             crate::io::OutputPhyloFormat::Newick => write_newick(&mut writer, &combined_tree)?,
57 |         }
58 | 
59 |         Ok(())
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/tests/sample_data/sample.kreport:
--------------------------------------------------------------------------------
 1 | 33.64	180	180	U	0	unclassified
 2 | 66.36	355	0	R	1	root
 3 | 66.36	355	0	R1	131567	  cellular organisms
 4 | 41.5	222	0	R2	2759	    Eukaryota
 5 | 33.64	180	0	K	33208	      Metazoa
 6 | 33.64	180	0	P	7711	        Chordata
 7 | 33.64	180	0	C	40674	          Mammalia
 8 | 33.64	180	0	C3	1437010	            Boreoeutheria
 9 | 9.35	50	0	C4	314146	              Euarchontoglires
10 | 9.35	50	0	O	9443	                Primates
11 | 9.35	50	0	F	9604	                  Hominidae
12 | 9.35	50	0	G	9605	                    Homo
13 | 9.35	50	50	S	9606	                      Homo sapiens
14 | 24.3	130	0	C4	314145	              Laurasiatheria
15 | 1.87	10	0	O	33554	                Carnivora
16 | 1.87	10	0	F	9608	                  Canidae
17 | 1.87	10	0	G	9611	                    Canis
18 | 1.87	10	10	S	9612	                      Canis lupus
19 | 22.43	120	0	O	91561	                Artiodactyla
20 | 22.43	120	0	F	9726	                  Delphinidae
21 | 22.43	120	0	G	9738	                    Tursiops
22 | 22.43	120	120	S	9739	                      Tursiops truncatus
23 | 7.85	42	0	K	4751	      Fungi
24 | 7.85	42	0	P	4890	        Ascomycota
25 | 7.85	42	0	C	4891	          Saccharomycetes
26 | 7.85	42	0	O	4892	            Saccharomycetales
27 | 7.85	42	0	F	4893	              Saccharomycetaceae
28 | 7.85	42	0	G	4930	                Saccharomyces
29 | 7.85	42	42	S	4932	                  Saccharomyces cerevisiae
30 | 20.19	108	0	D	2	    Bacteria
31 | 5.61	30	0	P	1224	      Proteobacteria
32 | 5.61	30	0	C	1236	        Gammaproteobacteria
33 | 5.61	30	0	O	91347	          Enterobacterales
34 | 5.61	30	0	F	543	            Enterobacteriaceae
35 | 5.61	30	0	G	561	              Escherichia
36 | 5.61	30	30	S	562	                Escherichia coli
37 | 14.58	78	0	P	1239	      Firmicutes 
38 | 14.58	78	0	C	91061	        Bacilli
39 | 14.58	78	0	O	186826	          Lactobacillales 
40 | 14.58	78	0	F	1300	            Streptococcaceae
41 | 14.58	78	0	G	1357	              Lactococcus
42 | 14.58	78	78	S	1358	                Lactococcus lactis
43 | 4.67	25	0	D	2157	    Archaea
44 | 4.67	25	0	P	28889	      Crenarchaeota 
45 | 4.67	25	0	C	183924	        Thermoprotei 
46 | 4.67	25	0	O	2266	          Thermoproteales 
47 | 4.67	25	0	F	2267	            Thermoproteaceae 
48 | 4.67	25	0	G	2276	              Pyrobaculum 
49 | 4.67	25	25	S	70771	                Pyrobaculum neutrophilum
50 | 


--------------------------------------------------------------------------------
/src/cli/subcommands.rs:
--------------------------------------------------------------------------------
 1 | use super::args::{MultipleReports, OutputAbundance, OutputPhylo, SingleReport};
 2 | #[derive(Clap, Debug)]
 3 | #[non_exhaustive]
 4 | pub enum Command {
 5 |     // Info(Info),
 6 |     ConvertTree(ConvertTree),
 7 |     ConvertAbundance(ConvertAbundance),
 8 |     CombineTrees(CombineTrees),
 9 |     CombineAbundances(CombineAbundances),
10 |     // Track(Track),
11 | }
12 | 
13 | /// Extract diverse information about multiple reports
14 | #[derive(Clap, Debug)]
15 | #[clap(after_help = super::AFTER_HELP)]
16 | pub struct Info {
17 |     #[clap(flatten)]
18 |     pub input: MultipleReports,
19 |     #[clap(flatten)]
20 |     pub output: OutputAbundance,
21 | }
22 | 
23 | /// Track one or multiple species across multiple reports
24 | #[derive(Clap, Debug)]
25 | #[clap(after_help = super::AFTER_HELP)]
26 | pub struct Track {
27 |     #[clap(flatten)]
28 |     pub input: MultipleReports,
29 |     #[clap(flatten)]
30 |     pub output: OutputAbundance,
31 | }
32 | 
33 | /// Convert one report to one taxonomy tree
34 | #[derive(Clap, Debug)]
35 | #[clap(after_help = super::AFTER_HELP)]
36 | pub struct ConvertTree {
37 |     #[clap(flatten)]
38 |     pub input: SingleReport,
39 |     #[clap(flatten)]
40 |     pub output: OutputPhylo,
41 | }
42 | 
43 | /// Convert one report to one abundance table
44 | #[derive(Clap, Debug)]
45 | #[clap(after_help = super::AFTER_HELP)]
46 | pub struct ConvertAbundance {
47 |     #[clap(flatten)]
48 |     pub input: SingleReport,
49 |     #[clap(flatten)]
50 |     pub output: OutputAbundance,
51 | }
52 | 
53 | /// Combine multiple reports to one taxonomy tree
54 | #[derive(Clap, Debug)]
55 | #[clap(after_help = super::AFTER_HELP)]
56 | pub struct CombineTrees {
57 |     #[clap(flatten)]
58 |     pub input: MultipleReports,
59 |     #[clap(flatten)]
60 |     pub output: OutputPhylo,
61 | }
62 | 
63 | /// Merge multiple reports to one abundance table
64 | #[derive(Clap, Debug)]
65 | #[clap(after_help = super::AFTER_HELP)]
66 | pub struct CombineAbundances {
67 |     #[clap(flatten)]
68 |     pub input: MultipleReports,
69 |     #[clap(flatten)]
70 |     pub output: OutputAbundance,
71 |     /// add missing taxons for each sample
72 |     #[clap(long = "add-missing-taxons", takes_value(false))]
73 |     pub add_missing_taxons: bool,
74 | }
75 | 
76 | pub trait Runner {
77 |     fn run(self) -> Result<(), color_eyre::eyre::Report>;
78 | }
79 | 


--------------------------------------------------------------------------------
/src/data/abundance.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::HashMap;
 2 | use std::iter::FromIterator;
 3 | 
 4 | use crate::kraken::{Fragments, Taxon};
 5 | 
 6 | pub type AbundanceData = HashMap<Taxon, Fragments>;
 7 | 
 8 | pub type SampleName = String;
 9 | 
10 | #[derive(Debug, Default, PartialEq)]
11 | pub struct SampleAbundance {
12 |     pub name: SampleName,
13 |     pub dataset: AbundanceData,
14 | }
15 | 
16 | impl SampleAbundance {
17 |     #[must_use]
18 |     pub fn taxons(&self) -> Vec<Taxon> {
19 |         self.dataset.keys().cloned().collect()
20 |     }
21 | }
22 | 
23 | impl From<(SampleName, AbundanceData)> for SampleAbundance {
24 |     fn from(values: (SampleName, AbundanceData)) -> Self {
25 |         Self {
26 |             name: values.0,
27 |             dataset: values.1,
28 |         }
29 |     }
30 | }
31 | 
32 | pub type SamplesAbundanceData = Vec<SampleAbundance>; // FIXME remove
33 | 
34 | #[derive(Debug, Default, PartialEq)]
35 | pub struct Samples {
36 |     pub data: Vec<SampleAbundance>,
37 |     pub unique_taxons: Vec<Taxon>,
38 | }
39 | 
40 | impl Samples {
41 |     #[must_use]
42 |     pub fn new() -> Self {
43 |         Self {
44 |             data: Vec::new(),
45 |             unique_taxons: Vec::new(),
46 |         }
47 |     }
48 | 
49 |     fn add(&mut self, elem: SampleAbundance) {
50 |         let new_taxons = elem.taxons();
51 | 
52 |         for taxon in new_taxons {
53 |             if !self.unique_taxons.contains(&taxon) {
54 |                 self.unique_taxons.push(taxon);
55 |             }
56 |         }
57 | 
58 |         self.data.push(elem);
59 |     }
60 | 
61 |     pub fn add_missing_taxons(&mut self) -> &mut Self {
62 |         for datum in &mut self.data {
63 |             for taxon in &self.unique_taxons {
64 |                 datum
65 |                     .dataset
66 |                     .entry(taxon.clone())
67 |                     .or_insert_with(Fragments::default);
68 |             }
69 |         }
70 | 
71 |         self
72 |     }
73 | }
74 | 
75 | impl FromIterator<(SampleName, AbundanceData)> for Samples {
76 |     fn from_iter<T: IntoIterator<Item = (SampleName, AbundanceData)>>(iter: T) -> Self {
77 |         let mut samples = Self::new();
78 | 
79 |         for i in iter {
80 |             let sample = SampleAbundance::from(i);
81 |             samples.add(sample);
82 |         }
83 | 
84 |         samples
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/cli.rs:
--------------------------------------------------------------------------------
 1 | pub mod args;
 2 | pub mod logging;
 3 | pub mod subcommands;
 4 | 
 5 | use clap::AppSettings;
 6 | use color_eyre::{Report, Result};
 7 | 
 8 | static AFTER_HELP: &str = "Thank you for using Spideog. Please send any feedback, bug report or feature request to the project's github page: https://github.com/jeanmanguy/spideog";
 9 | 
10 | #[derive(Debug, Clap)]
11 | #[clap(author, about, version)]
12 | #[clap(global_setting = AppSettings::ColoredHelp)]
13 | #[clap(global_setting = AppSettings::ColorAuto)]
14 | #[clap(global_setting = AppSettings::DeriveDisplayOrder)]
15 | #[clap(global_setting = AppSettings::DontCollapseArgsInUsage)]
16 | #[clap(global_setting = AppSettings::GlobalVersion)]
17 | #[clap(global_setting = AppSettings::ArgRequiredElseHelp)]
18 | #[clap(global_setting = AppSettings::HelpRequired)]
19 | #[clap(global_setting = AppSettings::UnifiedHelpMessage)]
20 | #[clap(after_help = AFTER_HELP)]
21 | pub struct Opts {
22 |     #[clap(subcommand)]
23 |     pub command: subcommands::Command,
24 |     // #[clap(flatten)]
25 |     // pub logging: logging::Logging,
26 | }
27 | 
28 | pub fn setup_error_hook() -> Result<(), Report> {
29 |     color_eyre::config::HookBuilder::default()
30 |         .add_default_filters()
31 |         .issue_url(concat!(env!("CARGO_PKG_REPOSITORY"), "/issues/new"))
32 |         .add_issue_metadata("version", crate_version!())
33 |         .add_issue_metadata("architecture", std::env::consts::ARCH)
34 |         .add_issue_metadata("OS", std::env::consts::OS)
35 |         .issue_filter(|kind| match kind {
36 |             color_eyre::ErrorKind::NonRecoverable(_) => true,
37 |             color_eyre::ErrorKind::Recoverable(_) => false,
38 |         })
39 |         .install()
40 | }
41 | 
42 | // Boilerplate: https://github.com/yaahc/color-eyre/blob/master/examples/usage.rs
43 | // TODO: adjust for use
44 | // TODO: move to logging.rs?
45 | pub fn install_tracing() {
46 |     use tracing_error::ErrorLayer;
47 |     use tracing_subscriber::prelude::*;
48 |     use tracing_subscriber::{fmt, EnvFilter};
49 | 
50 |     let fmt_layer = fmt::layer().with_target(false);
51 |     let filter_layer = EnvFilter::try_from_default_env()
52 |         .or_else(|_| EnvFilter::try_new("debug"))
53 |         .unwrap();
54 | 
55 |     tracing_subscriber::registry()
56 |         .with(filter_layer)
57 |         .with(fmt_layer)
58 |         .with(ErrorLayer::default())
59 |         .init();
60 | }
61 | 


--------------------------------------------------------------------------------
/tests/sample_data/converted.csv:
--------------------------------------------------------------------------------
 1 | "taxon","taxid","rank","clade_percentage","clade_count_reads","taxon_count_reads"
 2 | "Thermoproteales",2266,"Order_0",4.67,25,25
 3 | "Homo",9605,"Genus_0",9.35,50,50
 4 | "Laurasiatheria",314145,"Class_4",24.3,130,130
 5 | "Artiodactyla",91561,"Order_0",22.43,120,120
 6 | "Delphinidae",9726,"Family_0",22.43,120,120
 7 | "Canidae",9608,"Family_0",1.87,10,10
 8 | "Ascomycota",4890,"Phylum_0",7.85,42,42
 9 | "Proteobacteria",1224,"Phylum_0",5.61,30,30
10 | "Tursiops",9738,"Genus_0",22.43,120,120
11 | "Enterobacterales",91347,"Order_0",5.61,30,30
12 | "Carnivora",33554,"Order_0",1.87,10,10
13 | "Fungi",4751,"Kingdom_0",7.85,42,42
14 | "Homo sapiens",9606,"Species_0",9.35,50,50
15 | "Chordata",7711,"Phylum_0",33.64,180,180
16 | "Lactococcus lactis",1358,"Species_0",14.58,78,78
17 | "Pyrobaculum",2276,"Genus_0",4.67,25,25
18 | "Archaea",2157,"Domain_0",4.67,25,25
19 | "Euarchontoglires",314146,"Class_4",9.35,50,50
20 | "Tursiops truncatus",9739,"Species_0",22.43,120,120
21 | "Primates",9443,"Order_0",9.35,50,50
22 | "Bacteria",2,"Domain_0",20.19,108,108
23 | "Saccharomycetaceae",4893,"Family_0",7.85,42,42
24 | "Thermoprotei",183924,"Class_0",4.67,25,25
25 | "Bacilli",91061,"Class_0",14.58,78,78
26 | "Thermoproteaceae",2267,"Family_0",4.67,25,25
27 | "Saccharomycetales",4892,"Order_0",7.85,42,42
28 | "Crenarchaeota",28889,"Phylum_0",4.67,25,25
29 | "unclassified",0,"Unclassified_0",33.64,180,180
30 | "Hominidae",9604,"Family_0",9.35,50,50
31 | "Gammaproteobacteria",1236,"Class_0",5.61,30,30
32 | "Lactococcus",1357,"Genus_0",14.58,78,78
33 | "Eukaryota",2759,"Root_2",41.5,222,222
34 | "cellular organisms",131567,"Root_1",66.36,355,355
35 | "Saccharomyces",4930,"Genus_0",7.85,42,42
36 | "Canis",9611,"Genus_0",1.87,10,10
37 | "Pyrobaculum neutrophilum",70771,"Species_0",4.67,25,25
38 | "Lactobacillales",186826,"Order_0",14.58,78,78
39 | "root",1,"Root_0",66.36,355,355
40 | "Canis lupus",9612,"Species_0",1.87,10,10
41 | "Enterobacteriaceae",543,"Family_0",5.61,30,30
42 | "Escherichia",561,"Genus_0",5.61,30,30
43 | "Mammalia",40674,"Class_0",33.64,180,180
44 | "Saccharomyces cerevisiae",4932,"Species_0",7.85,42,42
45 | "Saccharomycetes",4891,"Class_0",7.85,42,42
46 | "Boreoeutheria",1437010,"Class_3",33.64,180,180
47 | "Metazoa",33208,"Kingdom_0",33.64,180,180
48 | "Escherichia coli",562,"Species_0",5.61,30,30
49 | "Streptococcaceae",1300,"Family_0",14.58,78,78
50 | "Firmicutes",1239,"Phylum_0",14.58,78,78
51 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "spideog"
 3 | version = "0.2.0"
 4 | authors = ["Jean Manguy <jean@manguy.eu>"]
 5 | edition = "2018"
 6 | description = "Command line utility to analyse and convert Kraken reports"
 7 | publish = false
 8 | readme = "README.md"
 9 | repository = "https://github.com/jeanmanguy/spideog"
10 | documentation = "https://github.com/jeanmanguy/spideog/blob/main/README.md"
11 | keywords = ["cli", "bioinformatics", "metagenomics"]
12 | categories = ["command-line-utilities", "science"]
13 | license = "MIT/Apache-2.0"
14 | 
15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
16 | 
17 | [dependencies]
18 | thiserror = "1.0"
19 | serde = { version = "1.0", features = ["derive"] }
20 | csv = "1"
21 | color-eyre = { version = "0.5", features = ["issue-url"] }
22 | clap = { version = "~3.0.0-beta.2", default-features = false, features = [ "derive", "suggestions", "color", "std" ] }
23 | clap_generate = "3.0.0-beta.2"
24 | nom = "6.0"
25 | daggy = "0.7"
26 | petgraph = "0.5"
27 | log = "0.4"
28 | simplelog = "0.8"
29 | displaydoc = "0.1"
30 | once_cell = "1.4"
31 | atty = "0.2" 
32 | dialoguer = "0.6"
33 | enum_derive = "0.1.7"
34 | custom_derive = "0.1"
35 | eyre = "0.6"
36 | tracing-error = "0.1.2"
37 | tracing = { version = "0.1.17", features = [ "attributes" ] }
38 | tracing-subscriber = "0.2.10"
39 | exitcode = "1.1.2"
40 | 
41 | [dev-dependencies]
42 | test-case = "1.0"
43 | pretty_assertions = "0.6.1"
44 | 
45 | 
46 | [lib]
47 | name = "libspideog"
48 | path = "src/lib.rs"
49 | 
50 | [[bin]]
51 | name = "spideog"
52 | path = "src/main.rs"
53 | 
54 | [profile.dev]
55 | panic = "unwind"
56 | 
57 | [profile.dev.package.backtrace]
58 | opt-level = 3
59 | 
60 | [profile.release]
61 | lto = true
62 | panic = "unwind"
63 | codegen-units = 1
64 | 
65 | 
66 | [package.metadata.release]
67 | disable-publish = true
68 | consolidate-commits = true
69 | no-dev-version = true
70 | 
71 | [[package.metadata.release.pre-release-replacements]]
72 | file = "CHANGELOG.md"
73 | search = "Unreleased"
74 | replace="{{version}}"
75 | 
76 | [[package.metadata.release.pre-release-replacements]]
77 | file = "CHANGELOG.md"
78 | search = "\\.\\.\\.HEAD"
79 | replace="...{{tag_name}}"
80 | exactly = 1
81 | 
82 | [[package.metadata.release.pre-release-replacements]]
83 | file = "CHANGELOG.md"
84 | search = "ReleaseDate"
85 | replace="{{date}}"
86 | 
87 | [[package.metadata.release.pre-release-replacements]]
88 | file="CHANGELOG.md"
89 | search="<!-- next-header -->"
90 | replace="<!-- next-header -->\n\n## [Unreleased] - ReleaseDate"
91 | exactly=1
92 | 
93 | [[package.metadata.release.pre-release-replacements]]
94 | file="CHANGELOG.md"
95 | search="<!-- next-url -->"
96 | replace="<!-- next-url -->\n[Unreleased]: https://github.com/jeanmanguy/{{crate_name}}/compare/{{tag_name}}...HEAD"
97 | exactly=1
98 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Based on the "trust" template v0.1.2
 2 | # https://github.com/japaric/trust/tree/v0.1.2
 3 | 
 4 | os: linux
 5 | dist: xenial
 6 | language: rust
 7 | services: docker
 8 | 
 9 | env:
10 |   global:
11 |     - CRATE_NAME=spideog
12 |     - secure: B39IuyC3UCdPlx6jq7xMB1aVOQiXUQry+0TbgR6CB18Yip9gGVRSkeIRGHnl38t6OkTGQ52U4nWRSigKOkXCJmzPirRgyC28TPGG/O7/SJlSFkhRv83w+9BfyjXyZAuRAaSdjTCxLcvJIMdrgos+r1C8TBEVnbVtg9dVDJEzIo2zvEIbxtOuCALRai7sL595oD5csrQtwR9sJgf4FsfZQRYSeIEvNgDUIEkUsu/38Xkx43ekgLOaGVwJ5qFAzaHY1LAdU5ilV+tHX4k7GBvM4oRFA1on2kBhRBtLj2XW3CFMBJe2kHi3fktMCeHxvZCDDpvs3q0rpj7ddk4kVxhihfY6Wa9KvrsZAbyJptiEDz4piB/xjMA1oGyllNpfI7+0E3Toj1Etmk5NqPt+Iumq6OHw0DQeqlUQZuaXYa1hJX5lxqEO36/YBouP907PrceKWoKEoHlbVZXOBLhy0WZqCmGGUl7QweNXqQ2Va0Ypmv5AbLvlYokop56WzcDlQswcKNJeAMOnOpEH0h5BUg4fmnpFyztL7M04U+JKOk4tFwZ2IbDILhm4zJGTzJaaE8yyUt3XPh4wxsVcftFj22eB+OF03jUmzZdb9raur5zgChwlR95Bs6YrFd2DFkiFk0hirs6AwFMMIAuDN4MhTF1EyGSq8QTObsMjIBGL5RexBhE=
13 | 
14 | jobs:
15 |   include:
16 |     # Linux
17 |     # - env: TARGET=aarch64-unknown-linux-gnu
18 |     # - env: TARGET=arm-unknown-linux-gnueabi
19 |     # - env: TARGET=armv7-unknown-linux-gnueabihf
20 |     # - env: TARGET=i686-unknown-linux-gnu
21 |     # - env: TARGET=i686-unknown-linux-musl
22 |     # - env: TARGET=mips-unknown-linux-gnu
23 |     # - env: TARGET=mips64-unknown-linux-gnuabi64
24 |     # - env: TARGET=mips64el-unknown-linux-gnuabi64
25 |     # - env: TARGET=mipsel-unknown-linux-gnu
26 |     # - env: TARGET=powerpc-unknown-linux-gnu
27 |     # - env: TARGET=powerpc64-unknown-linux-gnu
28 |     # - env: TARGET=powerpc64le-unknown-linux-gnu
29 |     # - env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
30 |     # - env: TARGET=x86_64-unknown-linux-gnu
31 |     - env: TARGET=x86_64-unknown-linux-musl
32 | 
33 |     # OSX
34 |     # - env: TARGET=i686-apple-darwin
35 |     #   os: osx
36 |     - env: TARGET=x86_64-apple-darwin
37 |       os: osx
38 | 
39 |     # *BSD
40 |     # - env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
41 |     # - env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
42 |     # - env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
43 | 
44 |     # Windows
45 |     - env: TARGET=x86_64-pc-windows-gnu
46 | 
47 | before_install:
48 |   - set -e
49 |   - rustup self update
50 | 
51 | install:
52 |   - sh ci/install.sh
53 |   - source ~/.cargo/env || true
54 | 
55 | script:
56 |   - bash ci/script.sh
57 | 
58 | after_script: set +e
59 | 
60 | before_deploy:
61 |   - bash ci/before_deploy.sh
62 | 
63 | deploy:
64 |   - provider: releases
65 |     api_key: $GITHUB_TOKEN
66 | 
67 |     file_glob: true
68 |     file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
69 |     on:
70 |       condition: $TRAVIS_RUST_VERSION = stable
71 |       tags: true
72 | 
73 |     skip_cleanup: true
74 | 
75 | cache: cargo
76 | before_cache:
77 |   # Travis can't cache files that are not readable by "others"
78 |   - chmod -R a+r $HOME/.cargo
79 | 
80 | branches:
81 |   only:
82 |     # release tags
83 |     - /^v\d+\.\d+\.\d+.*$/
84 |     - main
85 | 
86 | notifications:
87 |   email:
88 |     on_success: always
89 | 


--------------------------------------------------------------------------------
/tests/sample_data/sample_2.kreport:
--------------------------------------------------------------------------------
 1 | 30.82	249	249	U	0	unclassified
 2 | 69.18	559	0	R	1	root
 3 | 61.01	493	0	R1	131567	  cellular organisms
 4 | 45.42	367	0	R2	2759	    Eukaryota
 5 | 37.13	300	0	K	33208	      Metazoa
 6 | 37.13	300	0	P	7711	        Chordata
 7 | 37.13	300	0	C	40674	          Mammalia
 8 | 37.13	300	0	C3	1437010	            Boreoeutheria
 9 | 24.75	200	0	C4	314146	              Euarchontoglires
10 | 24.75	200	0	O	9443	                Primates
11 | 24.75	200	0	F	9604	                  Hominidae
12 | 24.75	200	0	G	9605	                    Homo
13 | 24.75	200	200	S	9606	                      Homo sapiens
14 | 12.38	100	0	C4	314145	              Laurasiatheria
15 | 4.95	40	0	O	33554	                Carnivora
16 | 4.95	40	0	F	9681	                  Felidae
17 | 4.95	40	0	G	9682	                    Felis
18 | 4.95	40	40	S	9685	                      Felis catus
19 | 7.43	60	0	O	91561	                Artiodactyla
20 | 7.43	60	0	F	9726	                  Delphinidae
21 | 7.43	60	0	G	9738	                    Tursiops
22 | 7.43	60	60	S	9739	                      Tursiops truncatus
23 | 8.29	67	0	K	4751	      Fungi
24 | 8.29	67	0	P	4890	        Ascomycota
25 | 8.29	67	0	C	4891	          Saccharomycetes
26 | 8.29	67	0	O	4892	            Saccharomycetales
27 | 8.29	67	0	F	4893	              Saccharomycetaceae
28 | 8.29	67	0	G	4930	                Saccharomyces
29 | 1.24	10	10	S	4932	                  Saccharomyces cerevisiae
30 | 1.86	15	15	S	114524	                  Saccharomyces kudriavzevii
31 | 5.20	42	42	S	27292	                  Saccharomyces pastorianus
32 | 14.11	114	0	D	2	    Bacteria
33 | 9.90	80	0	P	1224	      Proteobacteria
34 | 9.90	80	0	C	1236	        Gammaproteobacteria
35 | 9.90	80	0	O	91347	          Enterobacterales
36 | 9.90	80	0	F	543	            Enterobacteriaceae
37 | 9.90	80	0	G	561	              Escherichia
38 | 9.90	80	80	S	562	                Escherichia coli
39 | 4.21	34	0	P	1239	      Firmicutes 
40 | 4.21	34	0	C	91061	        Bacilli
41 | 4.21	34	0	O	186826	          Lactobacillales 
42 | 4.21	34	0	F	1300	            Streptococcaceae
43 | 4.21	34	0	G	1357	              Lactococcus
44 | 4.21	34	34	S	1358	                Lactococcus lactis
45 | 1.49	12	0	D	2157	    Archaea
46 | 1.49	12	0	P	28889	      Crenarchaeota 
47 | 1.49	12	0	C	183924	        Thermoprotei 
48 | 1.49	12	0	O	2266	          Thermoproteales 
49 | 1.49	12	0	F	2267	            Thermoproteaceae 
50 | 1.49	12	0	G	2276	              Pyrobaculum 
51 | 1.49	12	12	S	70771	                Pyrobaculum neutrophilum
52 | 8.17	66	0	D	10239	  Viruses
53 | 8.17	66	0	K	2732396	    Orthornavirae
54 | 8.17	66	0	P	2497569	      Negarnaviricota
55 | 8.17	66	0	C	2497577	        Insthoviricetes
56 | 8.17	66	0	O	2499411	          Articulavirales
57 | 6.68	54	0	F	11308	            Orthomyxoviridae
58 | 6.68	54	0	G	197912	              Betainfluenzavirus
59 | 6.68	54	54	S	11520	                Influenza B virus
60 | 1.49	12	0	F	2501949	            Amnoonviridae
61 | 1.49	12	0	G	2034997	              Tilapinevirus
62 | 1.49	12	12	S	2034996	                Tilapia tilapinevirus
63 | 


--------------------------------------------------------------------------------
/src/subcommands/combine_abundance.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::{Help, Report};
 2 | use eyre::Context;
 3 | use libspideog::{
 4 |     data::abundance::{AbundanceData, SampleName, Samples},
 5 |     errors::SpideogError,
 6 | };
 7 | use std::iter::FromIterator;
 8 | use tracing::instrument;
 9 | 
10 | use crate::{
11 |     cli::subcommands::{CombineAbundances, Runner},
12 |     io::{abundance_csv::WriteAbundanceCsv, report::ParseKrakenReport, Output},
13 | };
14 | 
15 | type VecResultAbundanceData = Vec<Result<(SampleName, AbundanceData), SpideogError>>;
16 | 
17 | impl Runner for CombineAbundances {
18 |     #[instrument]
19 |     fn run(self) -> Result<(), Report> {
20 |         let sample_names: Vec<SampleName> = self
21 |             .input
22 |             .paths
23 |             .iter()
24 |             .map(|p| p.file_stem().unwrap().to_string_lossy().into())
25 |             .collect();
26 |         let readers = self.input.open_reports()?;
27 |         let output = Output::from(self.output.file.clone());
28 |         output.try_writtable()?;
29 | 
30 |         let (ok_abundance_data, errors_abundance_data): (
31 |             VecResultAbundanceData,
32 |             VecResultAbundanceData,
33 |         ) = readers
34 |             .into_iter()
35 |             .zip(sample_names)
36 |             .map(
37 |                 |(file, sample_name)| -> Result<(SampleName, AbundanceData), SpideogError> {
38 |                     let mut csv_reader = csv::ReaderBuilder::new()
39 |                         .has_headers(self.input.headers)
40 |                         .delimiter(b'\t')
41 |                         .double_quote(false)
42 |                         .flexible(true)
43 |                         .from_reader(file);
44 | 
45 |                     let tree: AbundanceData = ParseKrakenReport::parse(&mut csv_reader)?;
46 | 
47 |                     Ok((sample_name, tree))
48 |                 },
49 |             )
50 |             .partition(Result::is_ok);
51 | 
52 |         if !errors_abundance_data.is_empty() {
53 |             return errors_abundance_data
54 |                 .into_iter()
55 |                 .filter_map(|result| {
56 |                     if let Err(error) = result {
57 |                         Some(error)
58 |                     } else {
59 |                         None
60 |                     }
61 |                 })
62 |                 .fold(Err(eyre!("encountered multiple errors")), |report, e| {
63 |                     report.error(e)
64 |                 });
65 |         }
66 | 
67 |         let mut samples = Samples::from_iter(ok_abundance_data.into_iter().map(Result::unwrap));
68 | 
69 |         if self.add_missing_taxons {
70 |             samples.add_missing_taxons();
71 |         }
72 | 
73 |         let mut writer = output.writer()?;
74 |         match self.output.format {
75 |             crate::io::OutputAbundanceFormat::Csv => {
76 |                 samples
77 |                     .write_csv(&mut writer)
78 |                     .wrap_err("failed to write output to CSV")?;
79 |             }
80 |         }
81 | 
82 |         Ok(())
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/io/abundance_csv.rs:
--------------------------------------------------------------------------------
 1 | use color_eyre::Report;
 2 | use csv::Writer;
 3 | use eyre::Context;
 4 | use libspideog::data::abundance::{AbundanceData, Samples};
 5 | use serde::Serialize;
 6 | 
 7 | pub trait WriteAbundanceCsv: Sized {
 8 |     fn write_csv<W: std::io::Write>(self, writer: &mut W) -> Result<(), Report> {
 9 |         let mut csv_writer = csv::WriterBuilder::new()
10 |             .delimiter(b',')
11 |             .quote_style(csv::QuoteStyle::NonNumeric)
12 |             .has_headers(true)
13 |             .from_writer(writer);
14 | 
15 |         self.write_records(&mut csv_writer)?;
16 | 
17 |         Ok(())
18 |     }
19 | 
20 |     fn write_records<W: std::io::Write>(self, csv_writer: &mut Writer<W>) -> Result<(), Report>;
21 | }
22 | 
23 | #[derive(Serialize)]
24 | struct RowAbundanceData {
25 |     #[serde(rename = "taxon")]
26 |     name: String,
27 |     #[serde(rename = "taxid")]
28 |     taxonomy_id: u64,
29 |     #[serde(rename = "rank")]
30 |     taxonomy_level: String,
31 |     clade_percentage: f64,
32 |     clade_count_reads: u64,
33 |     taxon_count_reads: u64,
34 | }
35 | 
36 | #[derive(Serialize)]
37 | struct RowSampleAbundanceData {
38 |     sample: String,
39 |     #[serde(rename = "taxon")]
40 |     name: String,
41 |     #[serde(rename = "taxid")]
42 |     taxonomy_id: u64,
43 |     #[serde(rename = "rank")]
44 |     taxonomy_level: String,
45 |     clade_percentage: f64,
46 |     clade_count_reads: u64,
47 |     taxon_count_reads: u64,
48 | }
49 | 
50 | impl WriteAbundanceCsv for AbundanceData {
51 |     fn write_records<W: std::io::Write>(self, csv_writer: &mut Writer<W>) -> Result<(), Report> {
52 |         for (taxon, abundance_data) in self {
53 |             csv_writer
54 |                 .serialize(RowAbundanceData {
55 |                     name: taxon.name.clone(),
56 |                     taxonomy_id: taxon.taxonomy_id,
57 |                     taxonomy_level: format!("{}", taxon.taxonomy_level),
58 |                     clade_percentage: abundance_data.clade_percentage,
59 |                     clade_count_reads: abundance_data.clade_count_reads,
60 |                     taxon_count_reads: abundance_data.taxon_count_reads,
61 |                 })
62 |                 .wrap_err_with(|| format!("failed to write record for `{}`", taxon.name))?;
63 |         }
64 | 
65 |         Ok(())
66 |     }
67 | }
68 | 
69 | impl WriteAbundanceCsv for Samples {
70 |     fn write_records<W: std::io::Write>(self, csv_writer: &mut Writer<W>) -> Result<(), Report> {
71 |         for sample in self.data {
72 |             for (taxon, abundance_data) in &sample.dataset {
73 |                 csv_writer
74 |                     .serialize(RowSampleAbundanceData {
75 |                         sample: sample.name.clone(),
76 |                         name: taxon.name.clone(),
77 |                         taxonomy_id: taxon.taxonomy_id,
78 |                         taxonomy_level: format!("{}", taxon.taxonomy_level),
79 |                         clade_percentage: abundance_data.clade_percentage,
80 |                         clade_count_reads: abundance_data.clade_count_reads,
81 |                         taxon_count_reads: abundance_data.taxon_count_reads,
82 |                     })
83 |                     .wrap_err_with(|| {
84 |                         format!(
85 |                             "failed to write record for sample `{}` `{}`",
86 |                             sample.name, taxon.name
87 |                         )
88 |                     })?;
89 |             }
90 |         }
91 | 
92 |         Ok(())
93 |     }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/cli/args.rs:
--------------------------------------------------------------------------------
  1 | use std::path::PathBuf;
  2 | 
  3 | use clap::{Clap, ValueHint};
  4 | 
  5 | // #[derive(Clap, Debug, Clone)]
  6 | // pub struct KrakenReport {
  7 | //     /// Kraken reports
  8 | //     #[clap(name = "FILE", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(true))]
  9 | //     pub reports: Vec<PathBuf>,
 10 | // }
 11 | 
 12 | // #[derive(Clap, Debug, PartialEq)]
 13 | // pub enum ExtractKind {
 14 | //     #[clap(alias = "p")]
 15 | //     Phylo,
 16 | //     #[clap(alias = "d")]
 17 | //     Data,
 18 | // }
 19 | 
 20 | // #[derive(Clap, Debug)]
 21 | // pub struct ExtractKind2 {
 22 | //     /// Extract taxonomy tree
 23 | //     #[clap(long, conflicts_with("data"))]
 24 | //     phylo: bool,
 25 | //     /// extract data
 26 | //     #[clap(long, conflicts_with("phylo"))]
 27 | //     data: bool
 28 | // }
 29 | 
 30 | // #[derive(Clap, Debug)]
 31 | // pub struct Extract {
 32 | //     #[clap(arg_enum, name = "kind", case_insensitive(true))]
 33 | //     pub kind: ExtractKind
 34 | // }
 35 | 
 36 | #[derive(Clap, Debug)]
 37 | pub struct SingleReport {
 38 |     /// A single Kraken report
 39 |     #[clap(name = "FILE", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(false), takes_value(true))]
 40 |     pub path: PathBuf,
 41 |     /// Input report format
 42 |     #[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))]
 43 |     pub format: crate::io::InputReportFormat,
 44 |     /// Does the kraken report has headers
 45 |     #[clap(long = "has-headers", takes_value(false))]
 46 |     pub headers: bool
 47 | }
 48 | 
 49 | #[derive(Clap, Debug)]
 50 | pub struct MultipleReports {
 51 |     /// Multiple Kraken reports
 52 |     #[clap(name = "FILES", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(true), takes_value(true))]
 53 |     pub paths: Vec<PathBuf>,
 54 |     /// Input reports format (all reports must have the format)
 55 |     #[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))]
 56 |     pub format: crate::io::InputReportFormat,
 57 |     /// Does the kraken reports have headers (all or none)
 58 |     #[clap(long = "have-headers", takes_value(false))]
 59 |     pub headers: bool
 60 | }
 61 | 
 62 | #[derive(Clap, Debug, Clone)]
 63 | pub struct OutputFile {
 64 |     /// Output file [default: stdout (-)]
 65 |     #[clap(
 66 |         name = "output", 
 67 |         global(true), 
 68 |         long = "output", 
 69 |         parse(from_os_str), 
 70 |         value_hint = ValueHint::AnyPath, 
 71 |         takes_value(true),
 72 |     )]
 73 |     pub path: Option<PathBuf>,
 74 |     /// force overwriting exiting output file
 75 |     #[clap(
 76 |         long, 
 77 |         requires("output"), 
 78 |         global(true),
 79 |     )]
 80 |     pub overwrite: bool,
 81 | }
 82 | 
 83 | #[derive(Clap, Debug)]
 84 | pub struct InputReport {
 85 |     /// Input report format
 86 |     #[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))]
 87 |     pub format: crate::io::InputReportFormat,
 88 | }
 89 | 
 90 | #[derive(Clap, Debug)]
 91 | pub struct OutputPhylo {
 92 |     #[clap(flatten)]
 93 |     pub file: OutputFile,
 94 |     /// Output tree format
 95 |     #[clap(long = "format", name = "output-format", arg_enum, case_insensitive(true), default_value("Newick"))]
 96 |     pub format: crate::io::OutputPhyloFormat,
 97 | }
 98 | 
 99 | 
100 | #[derive(Clap, Debug)]
101 | pub struct OutputAbundance {
102 |     #[clap(flatten)]
103 |     pub file: OutputFile,
104 |     /// Output abundance format
105 |     #[clap(long = "format", name = "output-format", arg_enum, case_insensitive(true), default_value("csv"))]
106 |     pub format: crate::io::OutputAbundanceFormat,
107 | }
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/src/io/newick.rs:
--------------------------------------------------------------------------------
  1 | use color_eyre::Report;
  2 | use daggy::{NodeIndex, Walker};
  3 | use eyre::ContextCompat;
  4 | use libspideog::data::tree::Tree;
  5 | use std::{borrow::Cow, io};
  6 | use tracing::instrument;
  7 | 
  8 | pub fn write_newick<W>(writer: &mut W, tree: &Tree) -> Result<(), Report>
  9 | where
 10 |     W: std::io::Write,
 11 | {
 12 |     write_children_recursively(writer, tree, tree.origin.unwrap(), 0)?; // TODO: add error / panic Tree not initilialised
 13 |     write_end(writer)?;
 14 | 
 15 |     Ok(())
 16 | }
 17 | 
 18 | #[inline]
 19 | pub fn write_name_distance<W, S>(writer: &mut W, name: S, distance: usize) -> Result<(), io::Error>
 20 | where
 21 |     W: io::Write,
 22 |     S: AsRef<str>,
 23 | {
 24 |     write!(writer, "{}", format_name_distance(name, distance))
 25 | }
 26 | 
 27 | #[inline]
 28 | fn format_name_distance<S: AsRef<str>>(name: S, distance: usize) -> String {
 29 |     format!("{}:{}", clean_name(name.as_ref()), distance)
 30 | }
 31 | 
 32 | #[inline]
 33 | pub fn write_end<W>(writer: &mut W) -> Result<(), io::Error>
 34 | where
 35 |     W: io::Write,
 36 | {
 37 |     write!(writer, "{}", format_end())
 38 | }
 39 | 
 40 | #[instrument]
 41 | #[inline]
 42 | fn format_end() -> String {
 43 |     String::from(";\n")
 44 | }
 45 | 
 46 | fn is_trouble(c: char) -> bool {
 47 |     c == ' ' || c == '.' || c == ',' || c == '=' || c == '[' || c == ']' || c == '/' || c == ':'
 48 | }
 49 | 
 50 | // based from https://lise-henry.github.io/articles/optimising_strings.html
 51 | // not going to use regex just for that
 52 | pub fn clean_name<'a, S: Into<Cow<'a, str>>>(input: S) -> Cow<'a, str> {
 53 |     let input = input.into();
 54 | 
 55 |     let first_trouble_character = input.find(is_trouble);
 56 |     if let Some(first_trouble_character) = first_trouble_character {
 57 |         let mut output = String::from(&input[0..first_trouble_character]);
 58 |         output.reserve(input.len() - first_trouble_character);
 59 |         let rest = input[first_trouble_character..].chars();
 60 |         for c in rest {
 61 |             match c {
 62 |                 ' ' | '-' | '/' | ':' => output.push_str("_"),
 63 |                 '.' | ',' | '[' | ']' | '(' | ')' | '\'' | '\"' => {}
 64 |                 _ => output.push(c),
 65 |             }
 66 |         }
 67 |         Cow::Owned(output)
 68 |     } else {
 69 |         input
 70 |     }
 71 | }
 72 | 
 73 | pub fn write_children_recursively<W>(
 74 |     writer: &mut W,
 75 |     tree: &Tree,
 76 |     node: NodeIndex,
 77 |     parent_indent: usize,
 78 | ) -> Result<(), Report>
 79 | where
 80 |     W: io::Write,
 81 | {
 82 |     let mut child_walker = tree.tree.children(node);
 83 |     let mut children = Vec::new();
 84 |     while let Some((_, node)) = child_walker.walk_next(&tree.tree) {
 85 |         children.push(node);
 86 |     }
 87 | 
 88 |     let node_data = tree.tree.node_weight(node).wrap_err("node not found")?;
 89 |     let distance = node_data
 90 |         .indent
 91 |         .checked_sub(parent_indent)
 92 |         .wrap_err_with(|| {
 93 |             format!(
 94 |                 "failed to compute new distance: node {} - parent {}",
 95 |                 node_data.indent, parent_indent
 96 |             )
 97 |         })?;
 98 | 
 99 |     if children.is_empty() {
100 |         write_name_distance(writer, &node_data.taxon.name, distance)?;
101 |     } else {
102 |         writer.write_all(b"(")?;
103 | 
104 |         let mut children_iter = children.iter().peekable();
105 | 
106 |         while let Some(node_id) = children_iter.next() {
107 |             write_children_recursively(writer, tree, *node_id, node_data.indent)?;
108 | 
109 |             // not the last child, add a comma
110 |             if children_iter.peek().is_some() {
111 |                 writer.write_all(b",")?;
112 |             }
113 |         }
114 | 
115 |         writer.write_all(b")")?;
116 | 
117 |         write_name_distance(writer, &node_data.taxon.name, distance)?;
118 |     }
119 | 
120 |     Ok(())
121 | }
122 | 
123 | #[cfg(test)]
124 | mod tests {
125 |     use super::*;
126 |     use test_case::test_case;
127 | 
128 |     #[test_case(&("Homo sapiens", 2), "Homo_sapiens:2")]
129 |     #[test_case(&("Bacteroidetes/Chlorobi group", 1), "Bacteroidetes_Chlorobi_group:1")]
130 |     fn test_format_name_distance<S: AsRef<str>>(input: &(S, usize), expected: S) {
131 |         assert_eq!(
132 |             format_name_distance(input.0.as_ref(), input.1),
133 |             expected.as_ref()
134 |         );
135 |     }
136 | }
137 | 


--------------------------------------------------------------------------------
/src/io.rs:
--------------------------------------------------------------------------------
  1 | use atty::Stream;
  2 | use color_eyre::{Help, Report};
  3 | use csv::Reader;
  4 | use dialoguer::Confirm;
  5 | use std::process;
  6 | use std::{fs::File, fs::OpenOptions, io, path::PathBuf};
  7 | use tracing::instrument;
  8 | 
  9 | use crate::{
 10 |     cli::args::{MultipleReports, SingleReport},
 11 |     BinError,
 12 | };
 13 | 
 14 | pub mod abundance_csv;
 15 | pub mod newick;
 16 | pub mod report;
 17 | 
 18 | /* ---------------------------------- Input --------------------------------- */
 19 | 
 20 | custom_derive! {
 21 |     #[derive(clap::Clap, Debug, PartialEq)]
 22 |     #[derive(EnumFromStr, EnumDisplay)]
 23 |     pub enum InputReportFormat {
 24 |         Kraken,
 25 |     }
 26 | }
 27 | 
 28 | #[instrument]
 29 | pub fn get_reader(input: &PathBuf, headers: bool) -> Result<Reader<File>, csv::Error> {
 30 |     csv::ReaderBuilder::new()
 31 |         .has_headers(headers)
 32 |         .delimiter(b'\t')
 33 |         .double_quote(false)
 34 |         .flexible(true)
 35 |         .from_path(input)
 36 | }
 37 | 
 38 | impl SingleReport {
 39 |     #[instrument]
 40 |     pub fn open_report(&self) -> Result<File, BinError> {
 41 |         let path = &self.path;
 42 |         open_file(path)
 43 |     }
 44 | }
 45 | 
 46 | impl MultipleReports {
 47 |     fn join_errors(errors: Vec<Result<File, BinError>>) -> Result<(), Report> {
 48 |         if errors.is_empty() {
 49 |             return Ok(());
 50 |         }
 51 | 
 52 |         errors
 53 |             .into_iter()
 54 |             .filter_map(|result| {
 55 |                 if let Err(error) = result {
 56 |                     Some(error)
 57 |                 } else {
 58 |                     None
 59 |                 }
 60 |             })
 61 |             .fold(Err(eyre!("encountered multiple errors")), |report, e| {
 62 |                 report.error(e)
 63 |             })
 64 |     }
 65 | 
 66 |     #[instrument]
 67 |     pub fn open_reports(&self) -> Result<Vec<File>, Report> {
 68 |         let readers: Vec<Result<File, BinError>> =
 69 |             self.paths.iter().map(|p| open_file(p)).collect();
 70 | 
 71 |         let (ok, errors) = readers.into_iter().partition(Result::is_ok);
 72 | 
 73 |         Self::join_errors(errors)?;
 74 | 
 75 |         Ok(ok.into_iter().map(Result::unwrap).collect::<Vec<File>>())
 76 |     }
 77 | }
 78 | 
 79 | #[instrument]
 80 | pub fn open_file(path: &PathBuf) -> Result<File, BinError> {
 81 |     let path = path;
 82 |     OpenOptions::new()
 83 |         .read(true)
 84 |         .write(false)
 85 |         .open(path)
 86 |         .map_err(|err| BinError::Io {
 87 |             err,
 88 |             path: path.clone(),
 89 |         })
 90 | }
 91 | 
 92 | /* --------------------------------- OUTPUT --------------------------------- */
 93 | 
 94 | custom_derive! {
 95 |     #[derive(clap::Clap, Debug, PartialEq)]
 96 |     #[derive(EnumFromStr, EnumDisplay)]
 97 |     pub enum OutputPhyloFormat {
 98 |         Newick,
 99 |     }
100 | }
101 | 
102 | custom_derive! {
103 |     #[derive(clap::Clap, Debug, PartialEq)]
104 |     #[derive(EnumFromStr, EnumDisplay)]
105 |     pub enum OutputAbundanceFormat {
106 |         Csv,
107 |     }
108 | }
109 | 
110 | #[derive(Debug, Clone)]
111 | pub enum OutputKind {
112 |     File(PathBuf),
113 |     Stdout,
114 | }
115 | 
116 | #[derive(Debug, Clone)]
117 | pub struct Output {
118 |     pub kind: OutputKind,
119 |     pub overwrite: bool,
120 | }
121 | 
122 | impl From<Option<PathBuf>> for OutputKind {
123 |     fn from(path: Option<PathBuf>) -> Self {
124 |         path.map_or(Self::Stdout, |p| {
125 |             if p == PathBuf::from(r"-") {
126 |                 Self::Stdout
127 |             } else {
128 |                 Self::File(p)
129 |             }
130 |         })
131 |     }
132 | }
133 | 
134 | impl From<crate::cli::args::OutputFile> for Output {
135 |     fn from(clap_output: crate::cli::args::OutputFile) -> Self {
136 |         Self {
137 |             kind: OutputKind::from(clap_output.path),
138 |             overwrite: clap_output.overwrite,
139 |         }
140 |     }
141 | }
142 | 
143 | impl Output {
144 |     pub fn try_writtable(&self) -> Result<(), Report> {
145 |         #[instrument]
146 |         fn internal_can_open_file(path: &PathBuf, overwrite: bool) -> Result<(), Report> {
147 |             if path.exists() {
148 |                 if overwrite {
149 |                 } else if atty::is(Stream::Stdout) {
150 |                     if Confirm::new()
151 |                         .with_prompt(format!("Overwrite `{}`?", path.display()))
152 |                         .interact()?
153 |                     {
154 |                     } else {
155 |                         process::exit(exitcode::NOPERM);
156 |                     }
157 |                 } else {
158 |                     {
159 |                         Err(std::io::Error::new(
160 |                             std::io::ErrorKind::AlreadyExists,
161 |                             "File already exists",
162 |                         ))
163 |                     }?
164 |                 }
165 |             }
166 |             Ok(())
167 |         }
168 | 
169 |         match &self.kind {
170 |             OutputKind::File(path) => internal_can_open_file(path, self.overwrite),
171 |             OutputKind::Stdout => Ok(()),
172 |         }
173 |     }
174 | 
175 |     pub fn writer(&self) -> Result<Box<dyn io::Write>, Report> {
176 |         match &self.kind {
177 |             OutputKind::File(path) => Ok(Box::new(
178 |                 OpenOptions::new()
179 |                     .write(true)
180 |                     .truncate(true)
181 |                     .create(true)
182 |                     .open(path)?,
183 |             ) as Box<dyn io::Write>),
184 |             OutputKind::Stdout => Ok(Box::new(io::stdout()) as Box<dyn io::Write>),
185 |         }
186 |     }
187 | }
188 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 🐦 spideog - Command line utility for Kraken2 reports. <!-- omit in toc -->
  2 | 
  3 | [![lastest version](https://img.shields.io/github/v/release/jeanmanguy/spideog)](https://github.com/jeanmanguy/spideog/releases/tag/v0.1.2-alpha.1)
  4 | 
  5 | [![Build Status](https://travis-ci.com/jeanmanguy/spideog.svg?branch=main)](https://travis-ci.com/jeanmanguy/spideog)
  6 | [![Rust](https://github.com/jeanmanguy/spideog/workflows/Rust/badge.svg?branch=main)](https://github.com/jeanmanguy/spideog/actions?query=workflow%3ARust)
  7 | 
  8 | This is a work in progress. The commands may change between released versions, please read the [CHANGELOG](CHANGELOG).
  9 | 
 10 | - [Goals](#goals)
 11 | - [Installation](#installation)
 12 | - [Usage](#usage)
 13 |   - [`convert-tree`](#convert-tree)
 14 |   - [`convert-abundance`](#convert-abundance)
 15 |   - [`combine-trees`](#combine-trees)
 16 |   - [`combine-abundances`](#combine-abundances)
 17 | - [Contributing](#contributing)
 18 | - [License](#license)
 19 | - [Credits](#credits)
 20 | 
 21 | ## Goals
 22 | 
 23 | The first goal of this project is to convert Kraken reports into standard file formats that can be easily read with R to allow people to craft thier own data visualisations and compute statistics more easily using the tidyverse, vegan, ape, and ggtree/treeio. The second goal is to get summary information from the Kraken reports directly from the command line.
 24 | 
 25 | Supports Kraken reports from [Kraken2](https://github.com/DerrickWood/kraken2) or from [Bracken](https://github.com/jenniferlu717/Bracken). 
 26 | 
 27 | ## Installation
 28 | 
 29 | Binaries for Linux, OSX, and Windows are available in the [Github release page](https://github.com/jeanmanguy/spideog/releases). No dependencies are required.
 30 | 
 31 | ## Usage
 32 | 
 33 | ```sh
 34 | spideog --help
 35 | spideog --version
 36 | spideog convert-tree <REPORT_FILE>
 37 | spideog convert-abundance <REPORT_FILE>
 38 | spideog combine-trees <REPORT_FILE>...
 39 | spideog combine-abundances <REPORT_FILE>...
 40 | ```
 41 | 
 42 | Windows: you will need to add the `.exe` extension to the commands.
 43 | 
 44 | ### `convert-tree`
 45 | 
 46 | Convert the taxonomy tree of a Kraken report to the Newick format.
 47 | 
 48 | The following command will generate the file `converted.tree`.
 49 | 
 50 | ```sh
 51 | spideog convert-tree sample.kreport --output converted.tree
 52 | ```
 53 | 
 54 | ### Example files <!-- omit in toc -->
 55 | 
 56 | - input: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport).
 57 | - output: [tests/sample_data/converted.tree](tests/sample_data/converted.tree)
 58 | 
 59 | #### Options <!-- omit in toc -->
 60 | 
 61 | - `--has-headers` necessary if the input report has headers
 62 | - `--output` output file path
 63 | - `--overwrite` force overwriting if the output file already exist
 64 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment]
 65 | - `--format` output format (default: newick) [Only newick is supported at the moment]
 66 | 
 67 | ### `convert-abundance`
 68 | 
 69 | Convert the abundance data of a Kraken report to the CSV format.
 70 | 
 71 | 
 72 | The following command will generate the file `converted.csv`.
 73 | 
 74 | ```sh
 75 | spideog convert-abundance sample.kreport --output converted.csv
 76 | ```
 77 | 
 78 | 
 79 | ### Example files <!-- omit in toc -->
 80 | 
 81 | - input: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport).
 82 | - output: [tests/sample_data/converted.csv](tests/sample_data/converted.csv)
 83 | 
 84 | #### Options <!-- omit in toc -->
 85 | 
 86 | - `--has-headers` necessary if the input report has headers
 87 | - `--output` output file path
 88 | - `--overwrite` force overwriting if the output file already exist
 89 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment]
 90 | - `--format` output format (default: CSV) [Only CSV is supported at the moment]
 91 | 
 92 | 
 93 | ### `combine-trees`
 94 | 
 95 | Combine and convert taxonomy trees from multiple Kraken report (e.g. from different samples of the same experiment) to the Newick format.
 96 | 
 97 | The following command will generate the file `combined.tree`.
 98 | 
 99 | ```sh
100 | spideog combine-trees sample.kreport sample_2.kreport --output combined.tree
101 | ```
102 | 
103 | ### Example files <!-- omit in toc -->
104 | 
105 | - inputs: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport) and [tests/sample_data/sample_2.kreport](tests/sample_data/sample_2.kreport).
106 | - output: [tests/sample_data/combined.tree](tests/sample_data/combined.tree)
107 | 
108 | #### Options <!-- omit in toc -->
109 | 
110 | - `--has-headers` necessary if the input reports have headers
111 | - `--output` output file path
112 | - `--overwrite` force overwriting if the output file already exist
113 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment]
114 | - `--format` output format (default: newick) [Only newick is supported at the moment]
115 | 
116 | 
117 | ### `combine-abundances`
118 | 
119 | Combine and convert abundance data from multiple Kraken report (e.g. from different samples of the same experiment) to the CSV format.
120 | 
121 | The following command will generate the file `combined.csv`.
122 | 
123 | ```sh
124 | spideog combine-abundances sample.kreport sample_2.kreport --add-missing-taxons --output combined.csv
125 | ```
126 | 
127 | ### Example files <!-- omit in toc -->
128 | 
129 | - inputs: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport) and [tests/sample_data/sample_2.kreport](tests/sample_data/sample_2.kreport).
130 | - output: [tests/sample_data/combined.csv](tests/sample_data/combined.csv)
131 | 
132 | 
133 | #### Options <!-- omit in toc -->
134 | 
135 | - `--add-missing-taxons` add missig taxons in some reports but present in other with zero values
136 | - `--has-headers` necessary if the input report has headers
137 | - `--output` output file path
138 | - `--overwrite` force overwriting if the output file already exist
139 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment]
140 | - `--format` output format (default: CSV) [Only CSV is supported at the moment]
141 | 
142 | 
143 | ## Contributing
144 | 
145 | The project is maintained by Jean Manguy. Please submit a bug report or a feature request [on the Github issues page](https://github.com/jeanmanguy/spideog/issues/new/choose).
146 | 
147 | ## License
148 | 
149 | `spideog` is distributed under the terms of both the MIT license and the
150 | Apache License (Version 2.0).
151 | 
152 | See [LICENSE-APACHE](./LICENSE-APACHE) and [LICENSE-MIT](./LICENSE-MIT) for
153 | details.
154 | 
155 | ## Credits
156 | 
157 | Cover picture: [Robin CC BY Greg Clarke](https://www.flickr.com/photos/leppre/25468458218)
158 | 


--------------------------------------------------------------------------------
/tests/sample_data/combined.csv:
--------------------------------------------------------------------------------
  1 | "sample","taxon","taxid","rank","clade_percentage","clade_count_reads","taxon_count_reads"
  2 | "sample","Pyrobaculum",2276,"Genus_0",4.67,25,25
  3 | "sample","Canis",9611,"Genus_0",1.87,10,10
  4 | "sample","Saccharomyces cerevisiae",4932,"Species_0",7.85,42,42
  5 | "sample","Tursiops truncatus",9739,"Species_0",22.43,120,120
  6 | "sample","Fungi",4751,"Kingdom_0",7.85,42,42
  7 | "sample","Saccharomycetaceae",4893,"Family_0",7.85,42,42
  8 | "sample","Thermoproteaceae",2267,"Family_0",4.67,25,25
  9 | "sample","Amnoonviridae",2501949,"Family_0",0.0,0,0
 10 | "sample","Viruses",10239,"Domain_0",0.0,0,0
 11 | "sample","Artiodactyla",91561,"Order_0",22.43,120,120
 12 | "sample","Chordata",7711,"Phylum_0",33.64,180,180
 13 | "sample","Primates",9443,"Order_0",9.35,50,50
 14 | "sample","Enterobacteriaceae",543,"Family_0",5.61,30,30
 15 | "sample","root",1,"Root_0",66.36,355,355
 16 | "sample","Saccharomycetes",4891,"Class_0",7.85,42,42
 17 | "sample","Thermoprotei",183924,"Class_0",4.67,25,25
 18 | "sample","unclassified",0,"Unclassified_0",33.64,180,180
 19 | "sample","Saccharomyces",4930,"Genus_0",7.85,42,42
 20 | "sample","Euarchontoglires",314146,"Class_4",9.35,50,50
 21 | "sample","Saccharomyces kudriavzevii",114524,"Species_0",0.0,0,0
 22 | "sample","Carnivora",33554,"Order_0",1.87,10,10
 23 | "sample","Influenza B virus",11520,"Species_0",0.0,0,0
 24 | "sample","Canidae",9608,"Family_0",1.87,10,10
 25 | "sample","Lactobacillales",186826,"Order_0",14.58,78,78
 26 | "sample","Thermoproteales",2266,"Order_0",4.67,25,25
 27 | "sample","Archaea",2157,"Domain_0",4.67,25,25
 28 | "sample","Tilapia tilapinevirus",2034996,"Species_0",0.0,0,0
 29 | "sample","Laurasiatheria",314145,"Class_4",24.3,130,130
 30 | "sample","Escherichia",561,"Genus_0",5.61,30,30
 31 | "sample","Homo",9605,"Genus_0",9.35,50,50
 32 | "sample","Saccharomycetales",4892,"Order_0",7.85,42,42
 33 | "sample","Felidae",9681,"Family_0",0.0,0,0
 34 | "sample","Proteobacteria",1224,"Phylum_0",5.61,30,30
 35 | "sample","Boreoeutheria",1437010,"Class_3",33.64,180,180
 36 | "sample","Bacteria",2,"Domain_0",20.19,108,108
 37 | "sample","Betainfluenzavirus",197912,"Genus_0",0.0,0,0
 38 | "sample","Orthomyxoviridae",11308,"Family_0",0.0,0,0
 39 | "sample","Enterobacterales",91347,"Order_0",5.61,30,30
 40 | "sample","Delphinidae",9726,"Family_0",22.43,120,120
 41 | "sample","Mammalia",40674,"Class_0",33.64,180,180
 42 | "sample","Articulavirales",2499411,"Order_0",0.0,0,0
 43 | "sample","Saccharomyces pastorianus",27292,"Species_0",0.0,0,0
 44 | "sample","Felis",9682,"Genus_0",0.0,0,0
 45 | "sample","Pyrobaculum neutrophilum",70771,"Species_0",4.67,25,25
 46 | "sample","Eukaryota",2759,"Root_2",41.5,222,222
 47 | "sample","Insthoviricetes",2497577,"Class_0",0.0,0,0
 48 | "sample","Metazoa",33208,"Kingdom_0",33.64,180,180
 49 | "sample","Homo sapiens",9606,"Species_0",9.35,50,50
 50 | "sample","Ascomycota",4890,"Phylum_0",7.85,42,42
 51 | "sample","Escherichia coli",562,"Species_0",5.61,30,30
 52 | "sample","Negarnaviricota",2497569,"Phylum_0",0.0,0,0
 53 | "sample","Orthornavirae",2732396,"Kingdom_0",0.0,0,0
 54 | "sample","Tilapinevirus",2034997,"Genus_0",0.0,0,0
 55 | "sample","Crenarchaeota",28889,"Phylum_0",4.67,25,25
 56 | "sample","Bacilli",91061,"Class_0",14.58,78,78
 57 | "sample","Streptococcaceae",1300,"Family_0",14.58,78,78
 58 | "sample","Hominidae",9604,"Family_0",9.35,50,50
 59 | "sample","Canis lupus",9612,"Species_0",1.87,10,10
 60 | "sample","Tursiops",9738,"Genus_0",22.43,120,120
 61 | "sample","Lactococcus",1357,"Genus_0",14.58,78,78
 62 | "sample","Felis catus",9685,"Species_0",0.0,0,0
 63 | "sample","Firmicutes",1239,"Phylum_0",14.58,78,78
 64 | "sample","Gammaproteobacteria",1236,"Class_0",5.61,30,30
 65 | "sample","cellular organisms",131567,"Root_1",66.36,355,355
 66 | "sample","Lactococcus lactis",1358,"Species_0",14.58,78,78
 67 | "sample_2","Saccharomyces",4930,"Genus_0",8.29,67,67
 68 | "sample_2","Ascomycota",4890,"Phylum_0",8.29,67,67
 69 | "sample_2","Felis catus",9685,"Species_0",4.95,40,40
 70 | "sample_2","Euarchontoglires",314146,"Class_4",24.75,200,200
 71 | "sample_2","Tilapinevirus",2034997,"Genus_0",1.49,12,12
 72 | "sample_2","Homo sapiens",9606,"Species_0",24.75,200,200
 73 | "sample_2","Tilapia tilapinevirus",2034996,"Species_0",1.49,12,12
 74 | "sample_2","Pyrobaculum",2276,"Genus_0",1.49,12,12
 75 | "sample_2","Streptococcaceae",1300,"Family_0",4.21,34,34
 76 | "sample_2","Escherichia",561,"Genus_0",9.9,80,80
 77 | "sample_2","Laurasiatheria",314145,"Class_4",12.38,100,100
 78 | "sample_2","Betainfluenzavirus",197912,"Genus_0",6.68,54,54
 79 | "sample_2","Proteobacteria",1224,"Phylum_0",9.9,80,80
 80 | "sample_2","Enterobacterales",91347,"Order_0",9.9,80,80
 81 | "sample_2","unclassified",0,"Unclassified_0",30.82,249,249
 82 | "sample_2","Canis",9611,"Genus_0",0.0,0,0
 83 | "sample_2","Delphinidae",9726,"Family_0",7.43,60,60
 84 | "sample_2","Insthoviricetes",2497577,"Class_0",8.17,66,66
 85 | "sample_2","Pyrobaculum neutrophilum",70771,"Species_0",1.49,12,12
 86 | "sample_2","Bacilli",91061,"Class_0",4.21,34,34
 87 | "sample_2","Amnoonviridae",2501949,"Family_0",1.49,12,12
 88 | "sample_2","Mammalia",40674,"Class_0",37.13,300,300
 89 | "sample_2","Tursiops truncatus",9739,"Species_0",7.43,60,60
 90 | "sample_2","Lactococcus lactis",1358,"Species_0",4.21,34,34
 91 | "sample_2","Homo",9605,"Genus_0",24.75,200,200
 92 | "sample_2","Viruses",10239,"Domain_0",8.17,66,66
 93 | "sample_2","root",1,"Root_0",69.18,559,559
 94 | "sample_2","Bacteria",2,"Domain_0",14.11,114,114
 95 | "sample_2","Thermoproteaceae",2267,"Family_0",1.49,12,12
 96 | "sample_2","Articulavirales",2499411,"Order_0",8.17,66,66
 97 | "sample_2","Artiodactyla",91561,"Order_0",7.43,60,60
 98 | "sample_2","Thermoprotei",183924,"Class_0",1.49,12,12
 99 | "sample_2","Saccharomycetes",4891,"Class_0",8.29,67,67
100 | "sample_2","Thermoproteales",2266,"Order_0",1.49,12,12
101 | "sample_2","Saccharomyces pastorianus",27292,"Species_0",5.2,42,42
102 | "sample_2","cellular organisms",131567,"Root_1",61.01,493,493
103 | "sample_2","Archaea",2157,"Domain_0",1.49,12,12
104 | "sample_2","Escherichia coli",562,"Species_0",9.9,80,80
105 | "sample_2","Carnivora",33554,"Order_0",4.95,40,40
106 | "sample_2","Felidae",9681,"Family_0",4.95,40,40
107 | "sample_2","Canidae",9608,"Family_0",0.0,0,0
108 | "sample_2","Boreoeutheria",1437010,"Class_3",37.13,300,300
109 | "sample_2","Chordata",7711,"Phylum_0",37.13,300,300
110 | "sample_2","Felis",9682,"Genus_0",4.95,40,40
111 | "sample_2","Negarnaviricota",2497569,"Phylum_0",8.17,66,66
112 | "sample_2","Saccharomyces cerevisiae",4932,"Species_0",1.24,10,10
113 | "sample_2","Saccharomyces kudriavzevii",114524,"Species_0",1.86,15,15
114 | "sample_2","Orthomyxoviridae",11308,"Family_0",6.68,54,54
115 | "sample_2","Fungi",4751,"Kingdom_0",8.29,67,67
116 | "sample_2","Gammaproteobacteria",1236,"Class_0",9.9,80,80
117 | "sample_2","Lactobacillales",186826,"Order_0",4.21,34,34
118 | "sample_2","Tursiops",9738,"Genus_0",7.43,60,60
119 | "sample_2","Saccharomycetales",4892,"Order_0",8.29,67,67
120 | "sample_2","Hominidae",9604,"Family_0",24.75,200,200
121 | "sample_2","Lactococcus",1357,"Genus_0",4.21,34,34
122 | "sample_2","Orthornavirae",2732396,"Kingdom_0",8.17,66,66
123 | "sample_2","Eukaryota",2759,"Root_2",45.42,367,367
124 | "sample_2","Influenza B virus",11520,"Species_0",6.68,54,54
125 | "sample_2","Enterobacteriaceae",543,"Family_0",9.9,80,80
126 | "sample_2","Primates",9443,"Order_0",24.75,200,200
127 | "sample_2","Metazoa",33208,"Kingdom_0",37.13,300,300
128 | "sample_2","Crenarchaeota",28889,"Phylum_0",1.49,12,12
129 | "sample_2","Firmicutes",1239,"Phylum_0",4.21,34,34
130 | "sample_2","Canis lupus",9612,"Species_0",0.0,0,0
131 | "sample_2","Saccharomycetaceae",4893,"Family_0",8.29,67,67
132 | 


--------------------------------------------------------------------------------
/src/taxonomy.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt::Display;
  2 | 
  3 | use once_cell::sync::Lazy;
  4 | use serde::{Deserialize, Deserializer};
  5 | use std::sync::Mutex;
  6 | use tracing::instrument;
  7 | 
  8 | use crate::errors::TaxRankParsingError;
  9 | 
 10 | static LAST_TAXONOMY_RANK_PARSED: Lazy<Mutex<Option<Rank>>> = Lazy::new(|| Mutex::new(None));
 11 | 
 12 | /// Taxonomy levels
 13 | ///
 14 | /// the u32 offset represents sub-clade (e.g. parvorder, subfamily, etc.)
 15 | #[derive(Clone, PartialEq, Debug, PartialOrd, Ord, Eq, Hash, Copy)]
 16 | pub enum Rank {
 17 |     Unclassified(u32),
 18 |     Root(u32),
 19 |     Domain(u32),
 20 |     Kingdom(u32),
 21 |     Phylum(u32),
 22 |     Class(u32),
 23 |     Order(u32),
 24 |     Family(u32),
 25 |     Genus(u32),
 26 |     Species(u32),
 27 | }
 28 | 
 29 | // TODO: order D1 as below of any R0..9
 30 | impl Display for Rank {
 31 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 32 |         match self {
 33 |             Self::Unclassified(i) => write!(f, "Unclassified_{}", i),
 34 |             Self::Root(i) => write!(f, "Root_{}", i),
 35 |             Self::Domain(i) => write!(f, "Domain_{}", i),
 36 |             Self::Kingdom(i) => write!(f, "Kingdom_{}", i),
 37 |             Self::Phylum(i) => write!(f, "Phylum_{}", i),
 38 |             Self::Class(i) => write!(f, "Class_{}", i),
 39 |             Self::Order(i) => write!(f, "Order_{}", i),
 40 |             Self::Family(i) => write!(f, "Family_{}", i),
 41 |             Self::Genus(i) => write!(f, "Genus_{}", i),
 42 |             Self::Species(i) => write!(f, "Species_{}", i),
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | impl Rank {
 48 |     #[must_use]
 49 |     pub fn plus_one(self) -> Self {
 50 |         match self {
 51 |             Self::Unclassified(i) => Self::Unclassified(i.checked_add(1).unwrap()),
 52 |             Self::Root(i) => Self::Root(i.checked_add(1).unwrap()),
 53 |             Self::Domain(i) => Self::Domain(i.checked_add(1).unwrap()),
 54 |             Self::Kingdom(i) => Self::Kingdom(i.checked_add(1).unwrap()),
 55 |             Self::Phylum(i) => Self::Phylum(i.checked_add(1).unwrap()),
 56 |             Self::Class(i) => Self::Class(i.checked_add(1).unwrap()),
 57 |             Self::Order(i) => Self::Order(i.checked_add(1).unwrap()),
 58 |             Self::Family(i) => Self::Family(i.checked_add(1).unwrap()),
 59 |             Self::Genus(i) => Self::Genus(i.checked_add(1).unwrap()),
 60 |             Self::Species(i) => Self::Species(i.checked_add(1).unwrap()),
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | #[instrument]
 66 | pub fn parse_taxonomy_level(string: &str) -> Result<Rank, TaxRankParsingError> {
 67 |     // TODO: add previous tax rank here, make it purely functional
 68 |     if string.len() > 2 {
 69 |         return Err(TaxRankParsingError::InvalidLength(
 70 |             String::from(string),
 71 |             string.len(),
 72 |         ));
 73 |     }
 74 | 
 75 |     let mut string_chars = string.chars();
 76 | 
 77 |     let letter = string_chars.next().unwrap();
 78 | 
 79 |     let offset: u32 =
 80 |         string_chars
 81 |             .next()
 82 |             .map_or(Ok(0_u32), |number| -> Result<u32, TaxRankParsingError> {
 83 |                 if number.is_ascii_digit() {
 84 |                     Ok(number.to_digit(10_u32).unwrap())
 85 |                 } else {
 86 |                     Err(TaxRankParsingError::OffsetNotANumber(
 87 |                         String::from(string),
 88 |                         number,
 89 |                     ))
 90 |                 }
 91 |             })?;
 92 | 
 93 |     let tax_rank: Rank = match letter {
 94 |         'U' => Ok(Rank::Unclassified(offset)),
 95 |         'R' => Ok(Rank::Root(offset)),
 96 |         'D' => Ok(Rank::Domain(offset)),
 97 |         'K' => Ok(Rank::Kingdom(offset)),
 98 |         'P' => Ok(Rank::Phylum(offset)),
 99 |         'C' => Ok(Rank::Class(offset)),
100 |         'O' => Ok(Rank::Order(offset)),
101 |         'F' => Ok(Rank::Family(offset)),
102 |         'G' => Ok(Rank::Genus(offset)),
103 |         'S' => Ok(Rank::Species(offset)),
104 |         '-' => {
105 |             // TODO: there has to be a better way to do that, maybe without the mutex business
106 |             (*LAST_TAXONOMY_RANK_PARSED.lock().unwrap()).map_or_else(
107 |                 || {
108 |                     Err(TaxRankParsingError::TaxRankParsingCannotInferRank(
109 |                         String::from(string),
110 |                     ))
111 |                 },
112 |                 |x| -> Result<Rank, TaxRankParsingError> { Ok(x.plus_one()) },
113 |             )
114 |         }
115 |         _ => Err(TaxRankParsingError::InvalidRankCode(
116 |             String::from(string),
117 |             letter,
118 |         )),
119 |     }?;
120 | 
121 |     let mut old_tax_rank = LAST_TAXONOMY_RANK_PARSED.lock().unwrap();
122 |     *old_tax_rank = Some(tax_rank);
123 | 
124 |     Ok(tax_rank)
125 | }
126 | 
127 | impl<'de> Deserialize<'de> for Rank {
128 |     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
129 |     where
130 |         D: Deserializer<'de>,
131 |     {
132 |         let string = String::deserialize(deserializer)?;
133 |         parse_taxonomy_level(&string).map_err(serde::de::Error::custom)
134 |     }
135 | 
136 |     fn deserialize_in_place<D>(deserializer: D, place: &mut Self) -> Result<(), D::Error>
137 |     where
138 |         D: Deserializer<'de>,
139 |     {
140 |         // Default implementation just delegates to `deserialize` impl.
141 |         *place = Deserialize::deserialize(deserializer)?;
142 |         Ok(())
143 |     }
144 | }
145 | 
146 | #[cfg(test)]
147 | mod tests {
148 |     use super::*;
149 |     use test_case::test_case;
150 | 
151 |     #[test]
152 |     fn test_order_taxonomy() {
153 |         assert!(Rank::Domain(0) > Rank::Root(1));
154 |         assert!(Rank::Domain(1) > Rank::Domain(0))
155 |     }
156 | 
157 |     #[test_case("U", Rank::Unclassified(0); "ok_U")]
158 |     #[test_case("U1", Rank::Unclassified(1); "ok_U1")]
159 |     #[test_case("R", Rank::Root(0); "ok_R")]
160 |     #[test_case("R1", Rank::Root(1); "ok_R1")]
161 |     #[test_case("P", Rank::Phylum(0); "ok_P")]
162 |     #[test_case("P1", Rank::Phylum(1); "ok_P1")]
163 |     #[test_case("C", Rank::Class(0); "ok_C")]
164 |     #[test_case("C1", Rank::Class(1); "ok_C1")]
165 |     #[test_case("O", Rank::Order(0); "ok_O")]
166 |     #[test_case("O1", Rank::Order(1); "ok_O1")]
167 |     #[test_case("F", Rank::Family(0); "ok_F")]
168 |     #[test_case("F1", Rank::Family(1); "ok_F1")]
169 |     #[test_case("S", Rank::Species(0); "ok_S")]
170 |     #[test_case("S1", Rank::Species(1); "ok_S1")]
171 |     fn test_parse_tax_level(input: &str, expected: Rank) {
172 |         pretty_assertions::assert_eq!(parse_taxonomy_level(input).unwrap(), expected);
173 |     }
174 | 
175 |     #[test]
176 |     fn test_plus_one() {
177 |         pretty_assertions::assert_eq!(Rank::Kingdom(2).plus_one(), Rank::Kingdom(3))
178 |     }
179 | 
180 |     #[test]
181 |     #[should_panic]
182 |     fn test_parse_tax_level_error_too_long() {
183 |         // TODO: implements Eq on errors (fix csv and io errors first)
184 |         parse_taxonomy_level("R11111").unwrap();
185 |     }
186 | 
187 |     #[test]
188 |     #[should_panic]
189 |     fn test_parse_tax_level_error_invalid_code() {
190 |         // TODO: implements Eq on errors (fix csv and io errors first)
191 |         parse_taxonomy_level("L4").unwrap();
192 |     }
193 | 
194 |     #[test]
195 |     #[should_panic]
196 |     fn test_parse_tax_level_error_offsetnotanumber() {
197 |         // TODO: implements Eq on errors (fix csv and io errors first)
198 |         parse_taxonomy_level("RR").unwrap();
199 |     }
200 | 
201 |     #[test]
202 |     #[should_panic]
203 |     fn test_parse_tax_level_error_cannotinferprevious() {
204 |         // reset
205 |         {
206 |             let mut old_tax_rank = LAST_TAXONOMY_RANK_PARSED.lock().unwrap();
207 |             *old_tax_rank = None;
208 |         }
209 |         // TODO: implements Eq on errors (fix csv and io errors first)
210 |         parse_taxonomy_level("-").unwrap();
211 |     }
212 | }
213 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 |                               Apache License
  2 |                         Version 2.0, January 2004
  3 |                      http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 |    "License" shall mean the terms and conditions for use, reproduction,
 10 |    and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |    "Licensor" shall mean the copyright owner or entity authorized by
 13 |    the copyright owner that is granting the License.
 14 | 
 15 |    "Legal Entity" shall mean the union of the acting entity and all
 16 |    other entities that control, are controlled by, or are under common
 17 |    control with that entity. For the purposes of this definition,
 18 |    "control" means (i) the power, direct or indirect, to cause the
 19 |    direction or management of such entity, whether by contract or
 20 |    otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |    outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |    "You" (or "Your") shall mean an individual or Legal Entity
 24 |    exercising permissions granted by this License.
 25 | 
 26 |    "Source" form shall mean the preferred form for making modifications,
 27 |    including but not limited to software source code, documentation
 28 |    source, and configuration files.
 29 | 
 30 |    "Object" form shall mean any form resulting from mechanical
 31 |    transformation or translation of a Source form, including but
 32 |    not limited to compiled object code, generated documentation,
 33 |    and conversions to other media types.
 34 | 
 35 |    "Work" shall mean the work of authorship, whether in Source or
 36 |    Object form, made available under the License, as indicated by a
 37 |    copyright notice that is included in or attached to the work
 38 |    (an example is provided in the Appendix below).
 39 | 
 40 |    "Derivative Works" shall mean any work, whether in Source or Object
 41 |    form, that is based on (or derived from) the Work and for which the
 42 |    editorial revisions, annotations, elaborations, or other modifications
 43 |    represent, as a whole, an original work of authorship. For the purposes
 44 |    of this License, Derivative Works shall not include works that remain
 45 |    separable from, or merely link (or bind by name) to the interfaces of,
 46 |    the Work and Derivative Works thereof.
 47 | 
 48 |    "Contribution" shall mean any work of authorship, including
 49 |    the original version of the Work and any modifications or additions
 50 |    to that Work or Derivative Works thereof, that is intentionally
 51 |    submitted to Licensor for inclusion in the Work by the copyright owner
 52 |    or by an individual or Legal Entity authorized to submit on behalf of
 53 |    the copyright owner. For the purposes of this definition, "submitted"
 54 |    means any form of electronic, verbal, or written communication sent
 55 |    to the Licensor or its representatives, including but not limited to
 56 |    communication on electronic mailing lists, source code control systems,
 57 |    and issue tracking systems that are managed by, or on behalf of, the
 58 |    Licensor for the purpose of discussing and improving the Work, but
 59 |    excluding communication that is conspicuously marked or otherwise
 60 |    designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |    "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |    on behalf of whom a Contribution has been received by Licensor and
 64 |    subsequently incorporated within the Work.
 65 | 
 66 | 2. Grant of Copyright License. Subject to the terms and conditions of
 67 |    this License, each Contributor hereby grants to You a perpetual,
 68 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |    copyright license to reproduce, prepare Derivative Works of,
 70 |    publicly display, publicly perform, sublicense, and distribute the
 71 |    Work and such Derivative Works in Source or Object form.
 72 | 
 73 | 3. Grant of Patent License. Subject to the terms and conditions of
 74 |    this License, each Contributor hereby grants to You a perpetual,
 75 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |    (except as stated in this section) patent license to make, have made,
 77 |    use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |    where such license applies only to those patent claims licensable
 79 |    by such Contributor that are necessarily infringed by their
 80 |    Contribution(s) alone or by combination of their Contribution(s)
 81 |    with the Work to which such Contribution(s) was submitted. If You
 82 |    institute patent litigation against any entity (including a
 83 |    cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |    or a Contribution incorporated within the Work constitutes direct
 85 |    or contributory patent infringement, then any patent licenses
 86 |    granted to You under this License for that Work shall terminate
 87 |    as of the date such litigation is filed.
 88 | 
 89 | 4. Redistribution. You may reproduce and distribute copies of the
 90 |    Work or Derivative Works thereof in any medium, with or without
 91 |    modifications, and in Source or Object form, provided that You
 92 |    meet the following conditions:
 93 | 
 94 |    (a) You must give any other recipients of the Work or
 95 |        Derivative Works a copy of this License; and
 96 | 
 97 |    (b) You must cause any modified files to carry prominent notices
 98 |        stating that You changed the files; and
 99 | 
100 |    (c) You must retain, in the Source form of any Derivative Works
101 |        that You distribute, all copyright, patent, trademark, and
102 |        attribution notices from the Source form of the Work,
103 |        excluding those notices that do not pertain to any part of
104 |        the Derivative Works; and
105 | 
106 |    (d) If the Work includes a "NOTICE" text file as part of its
107 |        distribution, then any Derivative Works that You distribute must
108 |        include a readable copy of the attribution notices contained
109 |        within such NOTICE file, excluding those notices that do not
110 |        pertain to any part of the Derivative Works, in at least one
111 |        of the following places: within a NOTICE text file distributed
112 |        as part of the Derivative Works; within the Source form or
113 |        documentation, if provided along with the Derivative Works; or,
114 |        within a display generated by the Derivative Works, if and
115 |        wherever such third-party notices normally appear. The contents
116 |        of the NOTICE file are for informational purposes only and
117 |        do not modify the License. You may add Your own attribution
118 |        notices within Derivative Works that You distribute, alongside
119 |        or as an addendum to the NOTICE text from the Work, provided
120 |        that such additional attribution notices cannot be construed
121 |        as modifying the License.
122 | 
123 |    You may add Your own copyright statement to Your modifications and
124 |    may provide additional or different license terms and conditions
125 |    for use, reproduction, or distribution of Your modifications, or
126 |    for any such Derivative Works as a whole, provided Your use,
127 |    reproduction, and distribution of the Work otherwise complies with
128 |    the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 |    any Contribution intentionally submitted for inclusion in the Work
132 |    by You to the Licensor shall be under the terms and conditions of
133 |    this License, without any additional terms or conditions.
134 |    Notwithstanding the above, nothing herein shall supersede or modify
135 |    the terms of any separate license agreement you may have executed
136 |    with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 |    names, trademarks, service marks, or product names of the Licensor,
140 |    except as required for reasonable and customary use in describing the
141 |    origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 |    agreed to in writing, Licensor provides the Work (and each
145 |    Contributor provides its Contributions) on an "AS IS" BASIS,
146 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |    implied, including, without limitation, any warranties or conditions
148 |    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |    PARTICULAR PURPOSE. You are solely responsible for determining the
150 |    appropriateness of using or redistributing the Work and assume any
151 |    risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 |    whether in tort (including negligence), contract, or otherwise,
155 |    unless required by applicable law (such as deliberate and grossly
156 |    negligent acts) or agreed to in writing, shall any Contributor be
157 |    liable to You for damages, including any direct, indirect, special,
158 |    incidental, or consequential damages of any character arising as a
159 |    result of this License or out of the use or inability to use the
160 |    Work (including but not limited to damages for loss of goodwill,
161 |    work stoppage, computer failure or malfunction, or any and all
162 |    other commercial damages or losses), even if such Contributor
163 |    has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 |    the Work or Derivative Works thereof, You may choose to offer,
167 |    and charge a fee for, acceptance of support, warranty, indemnity,
168 |    or other liability obligations and/or rights consistent with this
169 |    License. However, in accepting such obligations, You may act only
170 |    on Your own behalf and on Your sole responsibility, not on behalf
171 |    of any other Contributor, and only if You agree to indemnify,
172 |    defend, and hold each Contributor harmless for any liability
173 |    incurred by, or claims asserted against, such Contributor by reason
174 |    of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!)  The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 | 
189 | Copyright 2020 Jean Manguy
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | 	http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/data/tree.rs:
--------------------------------------------------------------------------------
  1 | use color_eyre::Report;
  2 | use core::convert::TryFrom;
  3 | use std::fmt::Display;
  4 | 
  5 | use daggy::{Dag, NodeIndex, Walker};
  6 | use tracing::instrument;
  7 | 
  8 | use crate::{
  9 |     errors::SpideogError,
 10 |     kraken::{ReportRecord, Taxon},
 11 |     parser::parse_ident_organism_name,
 12 | };
 13 | 
 14 | #[derive(Debug, Eq, PartialEq, Clone)]
 15 | pub struct IndentedTaxon {
 16 |     pub indent: usize,
 17 |     pub taxon: Taxon,
 18 | }
 19 | 
 20 | impl Display for IndentedTaxon {
 21 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 22 |         write!(f, "{}", self.taxon)
 23 |     }
 24 | }
 25 | 
 26 | impl IndentedTaxon {
 27 |     #[must_use]
 28 |     pub fn inferior_indent(&self, than: &Self) -> bool {
 29 |         self.indent < than.indent
 30 |     }
 31 | }
 32 | 
 33 | impl TryFrom<ReportRecord> for IndentedTaxon {
 34 |     type Error = SpideogError;
 35 | 
 36 |     #[instrument]
 37 |     fn try_from(value: ReportRecord) -> Result<Self, Self::Error> {
 38 |         let (_, (indent, name)) = parse_ident_organism_name(value.5.as_bytes()).unwrap(); // TODO: make error here
 39 | 
 40 |         let organism_tree = Taxon {
 41 |             taxonomy_level: value.3,
 42 |             name: String::from_utf8_lossy(name).trim().to_string(),
 43 |             taxonomy_id: value.4,
 44 |         };
 45 | 
 46 |         let node = Self {
 47 |             indent,
 48 |             taxon: organism_tree,
 49 |         };
 50 | 
 51 |         Ok(node)
 52 |     }
 53 | }
 54 | 
 55 | pub trait TaxonomyTreeReader<T>: Sized {
 56 |     fn read(_: T) -> Result<Self, Report>;
 57 | }
 58 | 
 59 | #[derive(Debug, Default)]
 60 | pub struct Tree {
 61 |     pub tree: Dag<IndentedTaxon, u32, u32>,
 62 |     pub origin: Option<NodeIndex>,
 63 |     pub last_node_added_id: Option<NodeIndex>,
 64 | }
 65 | 
 66 | impl Tree {
 67 |     #[must_use]
 68 |     pub fn new() -> Self {
 69 |         Self {
 70 |             tree: Dag::new(),
 71 |             origin: None,
 72 |             last_node_added_id: None,
 73 |         }
 74 |     }
 75 | 
 76 |     pub fn with_origin(&mut self, origin: IndentedTaxon) -> &mut Self {
 77 |         let new_node_index = self.tree.add_node(origin);
 78 |         self.origin = Some(new_node_index);
 79 |         self.last_node_added_id = Some(new_node_index);
 80 | 
 81 |         self
 82 |     }
 83 | 
 84 |     pub fn child(&mut self, parent: NodeIndex, node: IndentedTaxon) -> &mut Self {
 85 |         let weight = 1;
 86 |         self.child_with_weight(parent, node, weight)
 87 |     }
 88 | 
 89 |     pub fn child_with_weight(
 90 |         &mut self,
 91 |         parent: NodeIndex,
 92 |         node: IndentedTaxon,
 93 |         weight: u32,
 94 |     ) -> &mut Self {
 95 |         let (_, new_node_id) = self.tree.add_child(parent, weight, node);
 96 |         self.last_node_added_id = Some(new_node_id);
 97 | 
 98 |         self
 99 |     }
100 | 
101 |     //  find a parent with a lower indent value or default to the origin
102 |     pub fn find_valid_parent_for(&self, taxon: &IndentedTaxon) -> Result<NodeIndex, SpideogError> {
103 |         // default value
104 |         let mut parent_id = self
105 |             .origin
106 |             .ok_or_else(|| SpideogError::TreeNotInitialized)?;
107 | 
108 |         let previously_added_node = self
109 |             .last_node_added_id
110 |             .ok_or_else(|| SpideogError::TreeNotInitialized)?;
111 | 
112 |         if self
113 |             .tree
114 |             .node_weight(previously_added_node)
115 |             .ok_or_else(|| SpideogError::NodeNotFound)?
116 |             .inferior_indent(taxon)
117 |         {
118 |             // previously added node is a suitable parent for the next taxon
119 |             return Ok(previously_added_node);
120 |         }
121 | 
122 |         // we need to go up the tree to find an adequate parent
123 |         let mut parent_recursion = self
124 |             .tree
125 |             .recursive_walk(previously_added_node, |g, n| g.parents(n).iter(g).last());
126 | 
127 |         while let Some((_, node_id)) = parent_recursion.walk_next(&self.tree) {
128 |             let node = self
129 |                 .tree
130 |                 .node_weight(node_id)
131 |                 .ok_or_else(|| SpideogError::NodeNotFound)?;
132 | 
133 |             if node.inferior_indent(taxon) {
134 |                 parent_id = node_id;
135 |                 break;
136 |             }
137 |         }
138 | 
139 |         Ok(parent_id)
140 |     }
141 | 
142 |     pub fn try_combine_with(mut self, rhs: Self) -> Result<Self, SpideogError> {
143 |         if self.origin.is_none() {
144 |             return Ok(rhs);
145 |         }
146 | 
147 |         for rhs_edge in rhs.tree.raw_edges().iter() {
148 |             let rhs_edge_source = rhs
149 |                 .tree
150 |                 .node_weight(rhs_edge.source())
151 |                 .ok_or_else(|| SpideogError::NodeNotFound)?;
152 |             let rhs_edge_target = rhs
153 |                 .tree
154 |                 .node_weight(rhs_edge.target())
155 |                 .ok_or_else(|| SpideogError::NodeNotFound)?;
156 | 
157 |             let original_nodes = self.tree.raw_nodes();
158 | 
159 |             let source_in_self: Option<NodeIndex<u32>> =
160 |                 original_nodes.iter().enumerate().find_map(|(index, node)| {
161 |                     if &node.weight == rhs_edge_source {
162 |                         Some(NodeIndex::new(index))
163 |                     } else {
164 |                         None
165 |                     }
166 |                 });
167 | 
168 |             let target_in_self: Option<NodeIndex<u32>> =
169 |                 original_nodes.iter().enumerate().find_map(|(index, node)| {
170 |                     if &node.weight == rhs_edge_target {
171 |                         Some(NodeIndex::new(index))
172 |                     } else {
173 |                         None
174 |                     }
175 |                 });
176 | 
177 |             match (source_in_self, target_in_self) {
178 |                 (None, None) => {
179 |                     // panic
180 |                     // no common node (not even root)
181 |                     // FIXME: what todo?
182 |                     // TODO:  make it an error with more info
183 |                     // dbg!(&rhs_edge_source);
184 |                     // dbg!(&rhs_edge_target);
185 |                     // dbg!(self.tree.node_weight(self.origin.unwrap()));
186 |                     panic!("source and target of an edge in RHS were not found in Self");
187 |                 }
188 |                 (None, Some(_)) => {
189 |                     // panic
190 |                     // possible diamond
191 |                     panic!(
192 |                         "source and edge node in RHS were not found in Self, but target was found"
193 |                     );
194 |                 }
195 |                 (Some(parent), None) => {
196 |                     self.child(parent, rhs_edge_target.clone());
197 |                 }
198 |                 (Some(s), Some(t)) => {
199 |                     // increment weight of edge
200 |                     // FIXME: some issues with different trees, can't found node that exist
201 |                     let original_edge = self.tree.find_edge(s, t).ok_or_else(|| {
202 |                         SpideogError::EdgeNotFound(
203 |                             self.tree.node_weight(s).unwrap().clone(),
204 |                             self.tree.node_weight(t).unwrap().clone(),
205 |                         )
206 |                     })?;
207 | 
208 |                     self.tree
209 |                         .update_edge(
210 |                             s,
211 |                             t,
212 |                             self.tree
213 |                                 .edge_weight(original_edge)
214 |                                 .unwrap_or(&1_u32)
215 |                                 .checked_add(1_u32)
216 |                                 .unwrap_or(1_u32),
217 |                         )
218 |                         .unwrap();
219 |                 }
220 |             }
221 |         }
222 | 
223 |         Ok(self)
224 |     }
225 | }
226 | 
227 | #[cfg(test)]
228 | mod tests {
229 |     use super::*;
230 |     // use test_case::test_case;
231 | 
232 |     #[test]
233 |     fn test_new() {
234 |         let origin = IndentedTaxon {
235 |             indent: 0,
236 |             taxon: Taxon {
237 |                 taxonomy_level: crate::taxonomy::Rank::Root(0),
238 |                 name: "root".to_string(),
239 |                 taxonomy_id: 0,
240 |             },
241 |         };
242 | 
243 |         let mut tree = Tree::new();
244 |         tree.with_origin(origin.clone());
245 | 
246 |         pretty_assertions::assert_eq!(tree.tree.edge_count(), 0);
247 |         pretty_assertions::assert_eq!(tree.tree.node_count(), 1);
248 |         pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(0)), Some(&origin));
249 | 
250 |         // tree.tree.
251 |     }
252 | 
253 |     #[test]
254 |     fn test_child() {
255 |         let origin = IndentedTaxon {
256 |             indent: 0,
257 |             taxon: Taxon {
258 |                 taxonomy_level: crate::taxonomy::Rank::Root(0),
259 |                 name: "root".to_string(),
260 |                 taxonomy_id: 0,
261 |             },
262 |         };
263 | 
264 |         let child = IndentedTaxon {
265 |             indent: 0,
266 |             taxon: Taxon {
267 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
268 |                 name: "child".to_string(),
269 |                 taxonomy_id: 1,
270 |             },
271 |         };
272 | 
273 |         let grand_child = IndentedTaxon {
274 |             indent: 2,
275 |             taxon: Taxon {
276 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
277 |                 name: "grand child".to_string(),
278 |                 taxonomy_id: 2,
279 |             },
280 |         };
281 | 
282 |         let mut tree = Tree::new();
283 |         tree.with_origin(origin.clone());
284 |         tree.child(NodeIndex::new(0), child.clone());
285 |         tree.child(NodeIndex::new(1), grand_child);
286 | 
287 |         pretty_assertions::assert_eq!(tree.tree.edge_count(), 2);
288 |         pretty_assertions::assert_eq!(tree.tree.node_count(), 3);
289 |         pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(0)), Some(&origin));
290 |         pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(1)), Some(&child));
291 | 
292 |         assert!(tree
293 |             .tree
294 |             .find_edge(NodeIndex::new(0), NodeIndex::new(1))
295 |             .is_some());
296 |         assert!(tree
297 |             .tree
298 |             .find_edge(NodeIndex::new(1), NodeIndex::new(2))
299 |             .is_some());
300 | 
301 |         pretty_assertions::assert_eq!(
302 |             tree.tree
303 |                 .parents(NodeIndex::new(2))
304 |                 .iter(&tree.tree)
305 |                 .next()
306 |                 .unwrap()
307 |                 .1,
308 |             NodeIndex::new(1)
309 |         )
310 |     }
311 | 
312 |     #[test]
313 |     fn test_find_valid_parent() {
314 |         let origin = IndentedTaxon {
315 |             indent: 0,
316 |             taxon: Taxon {
317 |                 taxonomy_level: crate::taxonomy::Rank::Root(0),
318 |                 name: "root".to_string(),
319 |                 taxonomy_id: 0,
320 |             },
321 |         };
322 | 
323 |         let child = IndentedTaxon {
324 |             indent: 1,
325 |             taxon: Taxon {
326 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
327 |                 name: "child".to_string(),
328 |                 taxonomy_id: 1,
329 |             },
330 |         };
331 | 
332 |         let grand_child = IndentedTaxon {
333 |             indent: 2,
334 |             taxon: Taxon {
335 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
336 |                 name: "grand child".to_string(),
337 |                 taxonomy_id: 2,
338 |             },
339 |         };
340 | 
341 |         let new_child = IndentedTaxon {
342 |             indent: 2,
343 |             taxon: Taxon {
344 |                 taxonomy_level: crate::taxonomy::Rank::Root(3),
345 |                 name: "new_child".to_string(),
346 |                 taxonomy_id: 3,
347 |             },
348 |         };
349 | 
350 |         let new_child_child = IndentedTaxon {
351 |             indent: 3,
352 |             taxon: Taxon {
353 |                 taxonomy_level: crate::taxonomy::Rank::Root(3),
354 |                 name: "new_child_child".to_string(),
355 |                 taxonomy_id: 4,
356 |             },
357 |         };
358 | 
359 |         let mut tree = Tree::new();
360 |         tree.with_origin(origin);
361 |         tree.child(NodeIndex::new(0), child);
362 |         tree.child(NodeIndex::new(1), grand_child);
363 | 
364 |         let parent = tree.find_valid_parent_for(&new_child).unwrap();
365 |         tree.child(parent, new_child);
366 | 
367 |         // pretty_assertions::assert_eq!(parent, NodeIndex::new(1));
368 | 
369 |         let parent = tree.find_valid_parent_for(&new_child_child).unwrap();
370 |         tree.child(parent, new_child_child);
371 | 
372 |         pretty_assertions::assert_eq!(parent, NodeIndex::new(3));
373 |     }
374 | 
375 |     #[test]
376 |     fn test_try_combine_with() {
377 |         let origin = IndentedTaxon {
378 |             indent: 0,
379 |             taxon: Taxon {
380 |                 taxonomy_level: crate::taxonomy::Rank::Root(0),
381 |                 name: "root".to_string(),
382 |                 taxonomy_id: 0,
383 |             },
384 |         };
385 | 
386 |         let child = IndentedTaxon {
387 |             indent: 1,
388 |             taxon: Taxon {
389 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
390 |                 name: "child".to_string(),
391 |                 taxonomy_id: 1,
392 |             },
393 |         };
394 | 
395 |         let second_child = IndentedTaxon {
396 |             indent: 1,
397 |             taxon: Taxon {
398 |                 taxonomy_level: crate::taxonomy::Rank::Root(1),
399 |                 name: "second child".to_string(),
400 |                 taxonomy_id: 2,
401 |             },
402 |         };
403 | 
404 |         let grand_child = IndentedTaxon {
405 |             indent: 2,
406 |             taxon: Taxon {
407 |                 taxonomy_level: crate::taxonomy::Rank::Root(2),
408 |                 name: "grand child".to_string(),
409 |                 taxonomy_id: 3,
410 |             },
411 |         };
412 | 
413 |         let mut tree_1 = Tree::new();
414 |         tree_1.with_origin(origin.clone());
415 |         tree_1.child(NodeIndex::new(0), child.clone());
416 |         tree_1.child(NodeIndex::new(0), second_child.clone());
417 | 
418 |         let mut tree_2 = Tree::new();
419 |         tree_2.with_origin(origin.clone());
420 |         tree_2.child(NodeIndex::new(0), child.clone());
421 |         tree_2.child(NodeIndex::new(1), grand_child.clone());
422 | 
423 |         let mut expected_tree = Tree::new();
424 |         expected_tree.with_origin(origin);
425 |         expected_tree.child_with_weight(NodeIndex::new(0), child, 2);
426 |         expected_tree.child(NodeIndex::new(0), second_child);
427 |         expected_tree.child(NodeIndex::new(1), grand_child);
428 | 
429 |         let combined_tree = tree_1.try_combine_with(tree_2).unwrap();
430 | 
431 |         pretty_assertions::assert_eq!(
432 |             combined_tree.tree.edge_count(),
433 |             expected_tree.tree.edge_count()
434 |         );
435 | 
436 |         pretty_assertions::assert_eq!(
437 |             combined_tree.tree.node_count(),
438 |             expected_tree.tree.node_count()
439 |         );
440 | 
441 |         pretty_assertions::assert_eq!(
442 |             combined_tree
443 |                 .tree
444 |                 .raw_edges()
445 |                 .iter()
446 |                 .map(|e| e.weight)
447 |                 .collect::<Vec<u32>>(),
448 |             expected_tree
449 |                 .tree
450 |                 .raw_edges()
451 |                 .iter()
452 |                 .map(|e| e.weight)
453 |                 .collect::<Vec<u32>>()
454 |         );
455 |     }
456 | }
457 | 


--------------------------------------------------------------------------------