├── src ├── data.rs ├── subcommands.rs ├── lib.rs ├── parser.rs ├── bracken.rs ├── subcommands │ ├── convert_phylo.rs │ ├── convert_abundance.rs │ ├── combine_phylo.rs │ └── combine_abundance.rs ├── errors.rs ├── main.rs ├── io │ ├── report.rs │ ├── abundance_csv.rs │ └── newick.rs ├── kraken.rs ├── cli │ ├── logging.rs │ ├── subcommands.rs │ └── args.rs ├── data │ ├── abundance.rs │ └── tree.rs ├── cli.rs ├── io.rs └── taxonomy.rs ├── .gitignore ├── ci ├── script.sh ├── before_deploy.sh └── install.sh ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ └── rust.yml ├── tests └── sample_data │ ├── converted.tree │ ├── combined.tree │ ├── sample.kreport │ ├── converted.csv │ ├── sample_2.kreport │ └── combined.csv ├── LICENSE-MIT ├── CHANGELOG.md ├── Cargo.toml ├── .travis.yml ├── README.md └── LICENSE-APACHE /src/data.rs: -------------------------------------------------------------------------------- 1 | pub mod abundance; 2 | pub mod tree; 3 | -------------------------------------------------------------------------------- /src/subcommands.rs: -------------------------------------------------------------------------------- 1 | mod combine_abundance; 2 | mod combine_phylo; 3 | mod convert_abundance; 4 | mod convert_phylo; 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | _test_data 4 | .vscode 5 | .Rproj.user 6 | *.Rproj 7 | *.tree 8 | *.newick 9 | *.graph 10 | log 11 | .Rhistory 12 | TODO.txt 13 | 14 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // #![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] 2 | // #![allow(clippy::missing_const_for_fn)] 3 | // #![allow(clippy::multiple_crate_versions)] 4 | // 
#![allow(clippy::missing_errors_doc)] 5 | // #![allow(clippy::module_name_repetitions)] 6 | 7 | pub mod bracken; 8 | pub mod data; 9 | pub mod errors; 10 | pub mod kraken; 11 | pub mod parser; 12 | pub mod taxonomy; 13 | 14 | #[macro_use] 15 | extern crate serde; 16 | -------------------------------------------------------------------------------- /ci/script.sh: -------------------------------------------------------------------------------- 1 | # This script takes care of testing your crate 2 | 3 | set -ex 4 | 5 | main() { 6 | # cross build --target $TARGET 7 | cross build --target $TARGET --release 8 | 9 | if [ ! -z $DISABLE_TESTS ]; then 10 | return 11 | fi 12 | 13 | # cross test --target $TARGET 14 | # cross test --target $TARGET --release 15 | 16 | # cross run --target $TARGET 17 | # cross run --target $TARGET --release 18 | } 19 | 20 | # we don't run the "test phase" when doing deploys 21 | if [ -z $TRAVIS_TAG ]; then 22 | main 23 | fi 24 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use nom::IResult; 2 | 3 | use crate::kraken::Indent; 4 | 5 | pub fn spaces_and_rest(input: &[u8]) -> IResult<&[u8], Vec<&[u8]>> { 6 | nom::multi::fold_many0( 7 | nom::bytes::complete::tag(" "), 8 | Vec::new(), 9 | |mut acc: Vec<_>, item| { 10 | acc.push(item); 11 | acc 12 | }, 13 | )(input) 14 | } 15 | 16 | pub fn parse_ident_organism_name(input: &[u8]) -> IResult<&[u8], (Indent, &[u8])> { 17 | let (name, spaces) = spaces_and_rest(input)?; 18 | 19 | Ok((&[], (spaces.len(), name))) 20 | } 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] title" 5 | labels: "[type] feature" 6 | assignees: 
'' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: "[problem] bug" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behaviour: 15 | 16 | **Expected behaviour** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. Windows 10, Linux, MacOS, all] 24 | - Version [e.g. 0.0.1] 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 
28 | -------------------------------------------------------------------------------- /tests/sample_data/converted.tree: -------------------------------------------------------------------------------- 1 | (((((((((Pyrobaculum_neutrophilum:1)Pyrobaculum:1)Thermoproteaceae:1)Thermoproteales:1)Thermoprotei:1)Crenarchaeota:1)Archaea:1,((((((Lactococcus_lactis:1)Lactococcus:1)Streptococcaceae:1)Lactobacillales:1)Bacilli:1)Firmicutes:1,(((((Escherichia_coli:1)Escherichia:1)Enterobacteriaceae:1)Enterobacterales:1)Gammaproteobacteria:1)Proteobacteria:1)Bacteria:1,(((((((Saccharomyces_cerevisiae:1)Saccharomyces:1)Saccharomycetaceae:1)Saccharomycetales:1)Saccharomycetes:1)Ascomycota:1)Fungi:1,((((((((Tursiops_truncatus:1)Tursiops:1)Delphinidae:1)Artiodactyla:1,(((Canis_lupus:1)Canis:1)Canidae:1)Carnivora:1)Laurasiatheria:1,((((Homo_sapiens:1)Homo:1)Hominidae:1)Primates:1)Euarchontoglires:1)Boreoeutheria:1)Mammalia:1)Chordata:1)Metazoa:1)Eukaryota:1)cellular_organisms:1)root:0)unclassified:0; 2 | -------------------------------------------------------------------------------- /ci/before_deploy.sh: -------------------------------------------------------------------------------- 1 | # This script takes care of building your crate and packaging it for release 2 | 3 | set -ex 4 | 5 | main() { 6 | local src=$(pwd) \ 7 | stage= 8 | 9 | case $TRAVIS_OS_NAME in 10 | linux) 11 | stage=$(mktemp -d) 12 | ;; 13 | osx) 14 | stage=$(mktemp -d -t tmp) 15 | ;; 16 | esac 17 | 18 | test -f Cargo.lock || cargo generate-lockfile 19 | 20 | cross build --target $TARGET --release 21 | 22 | if [[ $TARGET == *"pc-windows"* ]]; then 23 | cp target/$TARGET/release/spideog.exe $stage/ 24 | else 25 | cp target/$TARGET/release/spideog $stage/ 26 | fi 27 | 28 | cd $stage 29 | tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz * 30 | cd $src 31 | 32 | rm -rf $stage 33 | } 34 | 35 | main 36 | -------------------------------------------------------------------------------- /tests/sample_data/combined.tree: 
-------------------------------------------------------------------------------- 1 | (((((((((Tilapia_tilapinevirus:1)Tilapinevirus:1)Amnoonviridae:1,((Influenza_B_virus:1)Betainfluenzavirus:1)Orthomyxoviridae:1)Articulavirales:1)Insthoviricetes:1)Negarnaviricota:1)Orthornavirae:1)Viruses:1,(((((((Pyrobaculum_neutrophilum:1)Pyrobaculum:1)Thermoproteaceae:1)Thermoproteales:1)Thermoprotei:1)Crenarchaeota:1)Archaea:1,((((((Lactococcus_lactis:1)Lactococcus:1)Streptococcaceae:1)Lactobacillales:1)Bacilli:1)Firmicutes:1,(((((Escherichia_coli:1)Escherichia:1)Enterobacteriaceae:1)Enterobacterales:1)Gammaproteobacteria:1)Proteobacteria:1)Bacteria:1,(((((((Saccharomyces_pastorianus:1,Saccharomyces_kudriavzevii:1,Saccharomyces_cerevisiae:1)Saccharomyces:1)Saccharomycetaceae:1)Saccharomycetales:1)Saccharomycetes:1)Ascomycota:1)Fungi:1,((((((((Tursiops_truncatus:1)Tursiops:1)Delphinidae:1)Artiodactyla:1,(((Felis_catus:1)Felis:1)Felidae:1,((Canis_lupus:1)Canis:1)Canidae:1)Carnivora:1)Laurasiatheria:1,((((Homo_sapiens:1)Homo:1)Hominidae:1)Primates:1)Euarchontoglires:1)Boreoeutheria:1)Mammalia:1)Chordata:1)Metazoa:1)Eukaryota:1)cellular_organisms:1)root:0)unclassified:0; 2 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Jean Manguy 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the 
Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /src/bracken.rs: -------------------------------------------------------------------------------- 1 | use crate::kraken::Taxon; 2 | 3 | #[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] 4 | pub struct AbundanceValues { 5 | pub kraken_assigned_reads: u64, 6 | pub added_reads: u64, 7 | pub new_est_reads: u64, 8 | pub fraction_total_reads: f64, 9 | } 10 | #[derive(Clone, PartialEq, PartialOrd, Debug, Deserialize)] 11 | pub struct BrackenRecord { 12 | #[serde(flatten)] 13 | pub taxon: Taxon, 14 | #[serde(flatten)] 15 | pub abundance_values: AbundanceValues, 16 | } 17 | 18 | #[cfg(test)] 19 | mod tests { 20 | // use super::*; 21 | // use indextree::Arena; 22 | // use std::collections::BTreeMap; 23 | 24 | // #[test] 25 | // fn bracken_works() { 26 | // let mut rdr = csv::ReaderBuilder::new() 27 | // .has_headers(true) 28 | // .delimiter(b'\t') 29 | // .from_path(r"C:\Users\Jean\Documents\spideog\_test_data\Sam9_species.bracken") 30 | // .unwrap(); 31 | // // dbg!(rdr); 32 | 33 | // let mut bracken = BTreeMap::new(); 34 | 35 | // for result in rdr.deserialize() { 36 | // let record: BrackenRecord = result.unwrap(); 37 | // bracken.insert(record.organism, record.abundance_values); 38 | // } 39 | 40 | // println!("{:?}", bracken); 41 | // } 42 | } 43 | -------------------------------------------------------------------------------- 
/src/subcommands/convert_phylo.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::{Help, Report}; 2 | use eyre::Context; 3 | use libspideog::data::tree::Tree; 4 | use tracing::instrument; 5 | 6 | use crate::{ 7 | cli::subcommands::{ConvertTree, Runner}, 8 | io::newick::write_newick, 9 | io::{report::ParseKrakenReport, Output}, 10 | }; 11 | 12 | impl Runner for ConvertTree { 13 | #[instrument] 14 | fn run(self) -> Result<(), Report> { 15 | let input = &self.input.path; 16 | 17 | let reader = self.input.open_report()?; 18 | let mut csv_reader = csv::ReaderBuilder::new() 19 | .has_headers(self.input.headers) 20 | .delimiter(b'\t') 21 | .double_quote(false) 22 | .flexible(true) 23 | .from_reader(reader); 24 | 25 | let output = Output::from(self.output.file); 26 | output.try_writtable()?; 27 | 28 | let tree: Tree = ParseKrakenReport::parse(&mut csv_reader) 29 | .wrap_err_with(|| format!("failed to parse file `{}`", &input.display())) 30 | .suggestion("try using the `--has-headers` option if your Kraken report has headers")?; 31 | 32 | let mut writer = output.writer()?; 33 | 34 | match self.output.format { 35 | crate::io::OutputPhyloFormat::Newick => write_newick(&mut writer, &tree)?, 36 | } 37 | 38 | Ok(()) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | check: 14 | name: Check 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | rust: 19 | - stable 20 | - nightly 21 | steps: 22 | - uses: actions/checkout@v1 23 | - uses: actions-rs/toolchain@v1 24 | with: 25 | toolchain: ${{ matrix.rust }} 26 | override: true 27 | - uses: actions-rs/cargo@v1 28 | with: 29 | command: check 30 | 31 | test: 32 | 
name: Tests 33 | runs-on: ubuntu-latest 34 | strategy: 35 | matrix: 36 | rust: 37 | - stable 38 | - nightly 39 | 40 | steps: 41 | - uses: actions/checkout@v2 42 | - name: Build 43 | run: cargo build --verbose 44 | - name: Run tests 45 | run: cargo test --verbose 46 | 47 | clippy: 48 | name: Clippy 49 | runs-on: ubuntu-latest 50 | strategy: 51 | matrix: 52 | rust: 53 | - stable 54 | steps: 55 | - uses: actions/checkout@v1 56 | - uses: actions-rs/toolchain@v1 57 | with: 58 | toolchain: ${{ matrix.rust }} 59 | override: true 60 | - run: rustup component add clippy 61 | - uses: actions-rs/cargo@v1 62 | with: 63 | command: clippy 64 | args: -- -D warnings 65 | -------------------------------------------------------------------------------- /src/subcommands/convert_abundance.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::{Help, Report}; 2 | use csv::ReaderBuilder; 3 | use eyre::Context; 4 | use libspideog::data::abundance::AbundanceData; 5 | use tracing::instrument; 6 | 7 | use crate::{ 8 | cli::subcommands::{ConvertAbundance, Runner}, 9 | io::{abundance_csv::WriteAbundanceCsv, report::ParseKrakenReport, Output}, 10 | }; 11 | 12 | impl Runner for ConvertAbundance { 13 | #[instrument] 14 | fn run(self) -> Result<(), Report> { 15 | let input = &self.input.path; 16 | 17 | let reader = self.input.open_report()?; 18 | let mut csv_reader = ReaderBuilder::new() 19 | .has_headers(self.input.headers) 20 | .delimiter(b'\t') 21 | .double_quote(false) 22 | .flexible(true) 23 | .from_reader(reader); 24 | 25 | let output = Output::from(self.output.file); 26 | output.try_writtable()?; 27 | 28 | let data: AbundanceData = AbundanceData::parse(&mut csv_reader) 29 | .wrap_err_with(|| format!("failed to parse file `{}`", &input.display())) 30 | .suggestion("try using the `--has-headers` option if your Kraken report has headers")?; 31 | 32 | let mut writer = output.writer()?; 33 | 34 | match self.output.format { 35 | 
crate::io::OutputAbundanceFormat::Csv => { 36 | data.write_csv(&mut writer) 37 | .wrap_err("failed to write output to CSV")?; 38 | } 39 | } 40 | 41 | Ok(()) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /ci/install.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | main() { 4 | local target= 5 | if [ $TRAVIS_OS_NAME = linux ]; then 6 | target=x86_64-unknown-linux-musl 7 | sort=sort 8 | else 9 | target=x86_64-apple-darwin 10 | sort=gsort # for `sort --sort-version`, from brew's coreutils. 11 | fi 12 | 13 | # Builds for iOS are done on OSX, but require the specific target to be 14 | # installed. 15 | case $TARGET in 16 | aarch64-apple-ios) 17 | rustup target install aarch64-apple-ios 18 | ;; 19 | armv7-apple-ios) 20 | rustup target install armv7-apple-ios 21 | ;; 22 | armv7s-apple-ios) 23 | rustup target install armv7s-apple-ios 24 | ;; 25 | i386-apple-ios) 26 | rustup target install i386-apple-ios 27 | ;; 28 | x86_64-apple-ios) 29 | rustup target install x86_64-apple-ios 30 | ;; 31 | esac 32 | 33 | # This fetches latest stable release 34 | local tag=$(git ls-remote --tags --refs --exit-code https://github.com/rust-embedded/cross \ 35 | | cut -d/ -f3 \ 36 | | grep -E '^v0.[0-9]+.[0-9]+$' \ 37 | | $sort --version-sort \ 38 | | tail -n1) 39 | curl -LSfs https://japaric.github.io/trust/install.sh | \ 40 | sh -s -- \ 41 | --force \ 42 | --git japaric/cross \ 43 | --tag $tag \ 44 | --target $target 45 | } 46 | 47 | main 48 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use crate::{data::tree::IndentedTaxon, taxonomy::Rank}; 2 | use displaydoc::Display; 3 | use thiserror::Error; 4 | 5 | #[derive(Display, Error, Debug)] 6 | #[non_exhaustive] 7 | pub enum SpideogError { 8 | /// expected root with no indentation, found indentation 
level: `{0}` 9 | NonZeroIndentRoot(usize), 10 | /// no suitable parent found for node `{0}` of indent `{1}` and rank `{2}` 11 | NoSuitableParent(String, usize, Rank), 12 | /// no node added to the tree 13 | NoNodeAdded, 14 | /// failed to parse line `{0}` 15 | LineParsingError(usize), 16 | /// node not found 17 | NodeNotFound, 18 | /// edge between `{0}` and `{1}` not found 19 | EdgeNotFound(IndentedTaxon, IndentedTaxon), 20 | /// parse output error 21 | ParseOutputPathError, 22 | /// input file is empty 23 | EmptyFile, 24 | /// Kraken parser error 25 | KrakenParser(#[source] csv::Error), 26 | /// taxonomy tree is not initialized 27 | TreeNotInitialized, 28 | /// failed to parse taxon name and identation 29 | KrakenIndentParsing, 30 | /// other 31 | Other, 32 | } 33 | 34 | #[derive(Display, Error, Debug)] 35 | #[non_exhaustive] 36 | pub enum TaxRankParsingError { 37 | /// failed to parse taxonomy rank offset from `{0}`: `{1}` is not a number (0..9) 38 | OffsetNotANumber(String, char), 39 | /// failed to parse taxonomy rank from `{0}`: found length `{1}` expected 1 or 2 40 | InvalidLength(String, usize), 41 | /// failed to parse taxonomy rank from `{0}`: invalid rank code `{1}` expected R, D, K, P, C, O, F, G, S, U, or - 42 | InvalidRankCode(String, char), 43 | /// failed to parse taxonomy rank from `{0}`: cannot infer previous taxonomy rank from previous records 44 | TaxRankParsingCannotInferRank(String), 45 | } 46 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | // #![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)] 2 | // #![allow(clippy::missing_const_for_fn)] 3 | // #![allow(clippy::multiple_crate_versions)] 4 | // #![allow(clippy::missing_errors_doc)] 5 | // #![allow(clippy::module_name_repetitions)] 6 | 7 | #[macro_use] 8 | extern crate eyre; 9 | #[macro_use] 10 | extern crate clap; 11 | #[macro_use] 12 | 
extern crate custom_derive; 13 | #[macro_use] 14 | extern crate enum_derive; 15 | 16 | mod cli; 17 | mod io; 18 | mod subcommands; 19 | 20 | use crate::clap::Clap; 21 | use cli::{ 22 | subcommands::{Command, Runner}, 23 | Opts, 24 | }; 25 | 26 | use color_eyre::eyre::Report; 27 | use displaydoc::Display; 28 | use eyre::Context; 29 | use thiserror::Error; 30 | use tracing::instrument; 31 | 32 | #[derive(Display, Error, Debug)] 33 | #[non_exhaustive] 34 | pub enum BinError { 35 | /// IO error with `{path}` 36 | Io { 37 | #[source] 38 | err: std::io::Error, 39 | path: std::path::PathBuf, 40 | }, 41 | /// encountered multiple errors 42 | MultipleErrors, 43 | } 44 | 45 | #[instrument] 46 | fn main() -> Result<(), Report> { 47 | cli::install_tracing(); 48 | cli::setup_error_hook()?; 49 | 50 | let opts: Opts = Opts::parse(); 51 | 52 | match opts.command { 53 | Command::ConvertTree(args) => { 54 | args.run().wrap_err("failed to convert taxonomy tree")?; 55 | } 56 | Command::CombineTrees(args) => { 57 | args.run().wrap_err("failed to combine taxonomy trees")?; 58 | } 59 | Command::ConvertAbundance(args) => { 60 | args.run() 61 | .wrap_err("failed to convert taxonomy abundance data")?; 62 | } 63 | Command::CombineAbundances(args) => { 64 | args.run().wrap_err("failed to combine abundance data")?; 65 | } 66 | } 67 | 68 | Ok(()) 69 | } 70 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
6 | 7 | 8 | 9 | ## [Unreleased] - ReleaseDate 10 | 11 | ## [0.2.0] - 2020-11-17 12 | 13 | ### Added 14 | 15 | - command `convert-abundance` 16 | - command `combine-abundance` 17 | 18 | 19 | ### Modified 20 | 21 | - fixed README typos 22 | 23 | ## [0.1.2] - 2020-10-29 24 | 25 | ### Added 26 | 27 | - command `combine-trees` 28 | - read multiple Kraken reports 29 | - write one Newick taxonomy tree 30 | - second example Kraken report to test combining trees 31 | 32 | ### Modified 33 | 34 | - changed `tree` to `convert-tree` 35 | - read only one file 36 | - write only one file (default: stdout) 37 | - dev: split codebase between libspideog (src/lib.rs) and spideog (src/main.rs) 38 | - dev: other refactoring and improvements of the codebase 39 | 40 | ## [0.1.1] - 2020-10-24 41 | 42 | ### Added 43 | 44 | - dev: continous integration builds for linux, osx, and windows 45 | - error: add spantrace 46 | - documentation: example kraken report and output 47 | - documentation: links to downloads 48 | 49 | ### Modified 50 | 51 | - bugfix: quotes and round brackets were added to the list of characters to escape in taxon name 52 | - refactor: started to refactor to facilitate unit testing 53 | 54 | ## [0.1.0] - 2020-10-19 55 | 56 | ### Added 57 | 58 | - command `tree` to convert the taxonomy tree from Kraken reports to newick format 59 | 60 | 61 | 62 | [Unreleased]: https://github.com/jeanmanguy/spideog/compare/v0.2.0...HEAD 63 | [0.2.0]: https://github.com/jeanmanguy/spideog/compare/v0.1.2...v0.2.0 64 | [0.1.2]: https://github.com/jeanmanguy/spideog/compare/v0.1.1...v0.1.2 65 | [0.1.1]: https://github.com/jeanmanguy/spideog/compare/v0.1.0...v0.1.1 66 | [0.1.0]: https://github.com/jeanmanguy/spideog/releases/tag/v0.1.0 -------------------------------------------------------------------------------- /src/io/report.rs: -------------------------------------------------------------------------------- 1 | use std::{convert::TryFrom, fs::File}; 2 | 3 | use csv::Reader; 4 | use 
libspideog::{ 5 | data::abundance::AbundanceData, 6 | data::tree::{IndentedTaxon, Tree}, 7 | errors::SpideogError, 8 | kraken::{Fragments, ReportRecord, Taxon}, 9 | }; 10 | use tracing::instrument; 11 | 12 | pub trait ParseKrakenReport: Sized { 13 | fn parse(reader: &mut Reader) -> Result; 14 | } 15 | 16 | fn parse_origin_tree( 17 | first_line: Option>, 18 | ) -> Result { 19 | let first_line = first_line.ok_or(SpideogError::EmptyFile)?; 20 | let first_record: ReportRecord = first_line.map_err(SpideogError::KrakenParser)?; 21 | let origin = IndentedTaxon::try_from(first_record)?; 22 | let mut taxonomy_tree = Tree::new(); 23 | taxonomy_tree.with_origin(origin); 24 | Ok(taxonomy_tree) 25 | } 26 | 27 | impl ParseKrakenReport for Tree { 28 | #[instrument] 29 | fn parse(reader: &mut Reader) -> Result { 30 | let first_line = reader.deserialize().next(); 31 | 32 | let mut taxonomy_tree = parse_origin_tree(first_line)?; 33 | 34 | for result in reader.deserialize() { 35 | let record: ReportRecord = result.map_err(SpideogError::KrakenParser)?; 36 | let node = IndentedTaxon::try_from(record)?; 37 | let parent = taxonomy_tree.find_valid_parent_for(&node)?; 38 | taxonomy_tree.child(parent, node); 39 | } 40 | 41 | Ok(taxonomy_tree) 42 | } 43 | } 44 | 45 | impl ParseKrakenReport for AbundanceData { 46 | #[instrument] 47 | fn parse(reader: &mut Reader) -> Result { 48 | let mut data = Self::new(); 49 | 50 | for result in reader.deserialize() { 51 | let record: ReportRecord = result.map_err(SpideogError::KrakenParser)?; 52 | let taxon = Taxon::try_from(record.clone())?; 53 | let fragments = Fragments::try_from(record)?; 54 | data.insert(taxon, fragments); 55 | } 56 | 57 | Ok(data) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/kraken.rs: -------------------------------------------------------------------------------- 1 | use std::{convert::TryFrom, fmt::Display}; 2 | 3 | use tracing::instrument; 4 | 5 | use 
crate::{errors::SpideogError, parser::parse_ident_organism_name, taxonomy::Rank}; 6 | 7 | pub type ReportRecord = (String, u64, u64, Rank, u64, String); 8 | pub type Indent = usize; 9 | 10 | #[derive(Clone, PartialEq, PartialOrd, Debug, Ord, Eq, Hash, Deserialize)] 11 | pub struct Taxon { 12 | #[serde(rename = "taxonomy_lvl")] 13 | pub taxonomy_level: Rank, 14 | pub name: String, 15 | pub taxonomy_id: u64, 16 | } 17 | 18 | impl Display for Taxon { 19 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 20 | write!( 21 | f, 22 | "{} taxid:{} rank:{}", 23 | self.name, self.taxonomy_id, self.taxonomy_level 24 | ) 25 | } 26 | } 27 | 28 | impl TryFrom for Taxon { 29 | type Error = SpideogError; 30 | 31 | #[instrument] 32 | fn try_from(value: ReportRecord) -> Result { 33 | let (_, (_, name)) = parse_ident_organism_name(value.5.as_bytes()) 34 | .map_err(|_e| SpideogError::KrakenIndentParsing)?; 35 | 36 | let taxon = Self { 37 | taxonomy_level: value.3, 38 | name: String::from_utf8_lossy(name).trim().to_string(), 39 | taxonomy_id: value.4, 40 | }; 41 | 42 | Ok(taxon) 43 | } 44 | } 45 | 46 | #[derive(Clone, PartialEq, PartialOrd, Debug, Serialize, Deserialize, Default)] 47 | pub struct Fragments { 48 | pub clade_percentage: f64, 49 | pub clade_count_reads: u64, 50 | pub taxon_count_reads: u64, 51 | } 52 | 53 | impl TryFrom for Fragments { 54 | type Error = SpideogError; 55 | 56 | #[instrument] 57 | fn try_from(value: ReportRecord) -> Result { 58 | let percentage = value.0.parse::().map_err(|_e| SpideogError::Other)?; 59 | 60 | let fragments = Self { 61 | clade_percentage: percentage, 62 | clade_count_reads: value.1, 63 | taxon_count_reads: value.2, /* FIX: Kraken report column 3 (value.2) holds reads assigned directly to this taxon; value.1 is the clade-rollup count and was previously duplicated here, making clade and direct counts always identical */ 64 | }; 65 | 66 | Ok(fragments) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/cli/logging.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::{eyre::Context, Report}; 2 | use log::LevelFilter; 3 | use 
simplelog::{ConfigBuilder, TermLogger, TerminalMode, WriteLogger}; 4 | use std::fs::OpenOptions; 5 | use std::path::PathBuf; 6 | use tracing::instrument; 7 | 8 | #[derive(Clap, Debug)] 9 | pub struct Logging { 10 | /// Log file (stdout if not present) 11 | #[clap(long, short, parse(from_os_str), global = true)] 12 | pub log: Option, 13 | /// Show addditional information. 14 | #[clap(long, global = true)] 15 | pub verbose: bool, 16 | } 17 | 18 | impl Logging { 19 | #[instrument] 20 | pub fn setup(&self) -> Result<(), Report> { 21 | let verbosity: LevelFilter = if self.verbose { 22 | LevelFilter::Debug 23 | } else { 24 | LevelFilter::Warn 25 | }; 26 | 27 | match &self.log { 28 | Some(filepath) => Self::setup_file_log(verbosity, filepath), 29 | None => Self::setup_term_log(verbosity), 30 | } 31 | } 32 | 33 | #[instrument] 34 | fn setup_file_log(verbosity: LevelFilter, filepath: &PathBuf) -> Result<(), Report> { 35 | let file = OpenOptions::new() 36 | .write(true) 37 | .truncate(false) 38 | .create(true) 39 | .open(filepath)?; 40 | WriteLogger::init( 41 | verbosity, 42 | ConfigBuilder::new().set_time_format_str("%F %R%:z").build(), 43 | file, 44 | ) 45 | .wrap_err_with(|| { 46 | format!( 47 | "Failed to setup the writer logger for file {}", 48 | filepath.display() 49 | ) 50 | })?; 51 | 52 | Ok(()) 53 | } 54 | 55 | #[instrument] 56 | fn setup_term_log(verbosity: LevelFilter) -> Result<(), Report> { 57 | TermLogger::init( 58 | verbosity, 59 | ConfigBuilder::new() 60 | .set_time_level(LevelFilter::Off) 61 | .build(), 62 | TerminalMode::Stderr, 63 | ) 64 | .wrap_err("Failed to setup the writer logger for stdout")?; 65 | 66 | Ok(()) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/subcommands/combine_phylo.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::{Help, Report}; 2 | use libspideog::{data::tree::Tree, errors::SpideogError}; 3 | use tracing::instrument; 4 | 
5 | use crate::{ 6 | cli::subcommands::{CombineTrees, Runner}, 7 | io::{newick::write_newick, report::ParseKrakenReport, Output}, 8 | }; 9 | 10 | type VecResultTrees = Vec>; 11 | 12 | impl Runner for CombineTrees { 13 | #[instrument] 14 | fn run(self) -> Result<(), Report> { 15 | let readers = self.input.open_reports()?; 16 | let output = Output::from(self.output.file.clone()); 17 | output.try_writtable()?; 18 | 19 | let (ok_trees, errors_trees): (VecResultTrees, VecResultTrees) = readers 20 | .into_iter() 21 | .map(|r| -> Result { 22 | let mut csv_reader = csv::ReaderBuilder::new() 23 | .has_headers(self.input.headers) 24 | .delimiter(b'\t') 25 | .double_quote(false) 26 | .flexible(true) 27 | .from_reader(r); 28 | 29 | let tree: Tree = ParseKrakenReport::parse(&mut csv_reader)?; 30 | 31 | Ok(tree) 32 | }) 33 | .partition(Result::is_ok); 34 | 35 | if !errors_trees.is_empty() { 36 | return errors_trees 37 | .into_iter() 38 | .filter_map(|result| { 39 | if let Err(error) = result { 40 | Some(error) 41 | } else { 42 | None 43 | } 44 | }) 45 | .fold(Err(eyre!("encountered multiple errors")), |report, e| { 46 | report.error(e) 47 | }); 48 | } 49 | 50 | let mut trees_iter = ok_trees.into_iter().map(Result::unwrap); 51 | 52 | let combined_tree = trees_iter.try_fold(Tree::new(), Tree::try_combine_with)?; 53 | 54 | let mut writer = output.writer()?; 55 | match self.output.format { 56 | crate::io::OutputPhyloFormat::Newick => write_newick(&mut writer, &combined_tree)?, 57 | } 58 | 59 | Ok(()) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /tests/sample_data/sample.kreport: -------------------------------------------------------------------------------- 1 | 33.64 180 180 U 0 unclassified 2 | 66.36 355 0 R 1 root 3 | 66.36 355 0 R1 131567 cellular organisms 4 | 41.5 222 0 R2 2759 Eukaryota 5 | 33.64 180 0 K 33208 Metazoa 6 | 33.64 180 0 P 7711 Chordata 7 | 33.64 180 0 C 40674 Mammalia 8 | 33.64 180 0 C3 1437010 Boreoeutheria 
9 | 9.35 50 0 C4 314146 Euarchontoglires 10 | 9.35 50 0 O 9443 Primates 11 | 9.35 50 0 F 9604 Hominidae 12 | 9.35 50 0 G 9605 Homo 13 | 9.35 50 50 S 9606 Homo sapiens 14 | 24.3 130 0 C4 314145 Laurasiatheria 15 | 1.87 10 0 O 33554 Carnivora 16 | 1.87 10 0 F 9608 Canidae 17 | 1.87 10 0 G 9611 Canis 18 | 1.87 10 10 S 9612 Canis lupus 19 | 22.43 120 0 O 91561 Artiodactyla 20 | 22.43 120 0 F 9726 Delphinidae 21 | 22.43 120 0 G 9738 Tursiops 22 | 22.43 120 120 S 9739 Tursiops truncatus 23 | 7.85 42 0 K 4751 Fungi 24 | 7.85 42 0 P 4890 Ascomycota 25 | 7.85 42 0 C 4891 Saccharomycetes 26 | 7.85 42 0 O 4892 Saccharomycetales 27 | 7.85 42 0 F 4893 Saccharomycetaceae 28 | 7.85 42 0 G 4930 Saccharomyces 29 | 7.85 42 42 S 4932 Saccharomyces cerevisiae 30 | 20.19 108 0 D 2 Bacteria 31 | 5.61 30 0 P 1224 Proteobacteria 32 | 5.61 30 0 C 1236 Gammaproteobacteria 33 | 5.61 30 0 O 91347 Enterobacterales 34 | 5.61 30 0 F 543 Enterobacteriaceae 35 | 5.61 30 0 G 561 Escherichia 36 | 5.61 30 30 S 562 Escherichia coli 37 | 14.58 78 0 P 1239 Firmicutes 38 | 14.58 78 0 C 91061 Bacilli 39 | 14.58 78 0 O 186826 Lactobacillales 40 | 14.58 78 0 F 1300 Streptococcaceae 41 | 14.58 78 0 G 1357 Lactococcus 42 | 14.58 78 78 S 1358 Lactococcus lactis 43 | 4.67 25 0 D 2157 Archaea 44 | 4.67 25 0 P 28889 Crenarchaeota 45 | 4.67 25 0 C 183924 Thermoprotei 46 | 4.67 25 0 O 2266 Thermoproteales 47 | 4.67 25 0 F 2267 Thermoproteaceae 48 | 4.67 25 0 G 2276 Pyrobaculum 49 | 4.67 25 25 S 70771 Pyrobaculum neutrophilum 50 | -------------------------------------------------------------------------------- /src/cli/subcommands.rs: -------------------------------------------------------------------------------- 1 | use super::args::{MultipleReports, OutputAbundance, OutputPhylo, SingleReport}; 2 | #[derive(Clap, Debug)] 3 | #[non_exhaustive] 4 | pub enum Command { 5 | // Info(Info), 6 | ConvertTree(ConvertTree), 7 | ConvertAbundance(ConvertAbundance), 8 | CombineTrees(CombineTrees), 9 | 
CombineAbundances(CombineAbundances), 10 | // Track(Track), 11 | } 12 | 13 | /// Extract diverse information about multiple reports 14 | #[derive(Clap, Debug)] 15 | #[clap(after_help = super::AFTER_HELP)] 16 | pub struct Info { 17 | #[clap(flatten)] 18 | pub input: MultipleReports, 19 | #[clap(flatten)] 20 | pub output: OutputAbundance, 21 | } 22 | 23 | /// Track one or multiple species across multiple reports 24 | #[derive(Clap, Debug)] 25 | #[clap(after_help = super::AFTER_HELP)] 26 | pub struct Track { 27 | #[clap(flatten)] 28 | pub input: MultipleReports, 29 | #[clap(flatten)] 30 | pub output: OutputAbundance, 31 | } 32 | 33 | /// Convert one report to one taxonomy tree 34 | #[derive(Clap, Debug)] 35 | #[clap(after_help = super::AFTER_HELP)] 36 | pub struct ConvertTree { 37 | #[clap(flatten)] 38 | pub input: SingleReport, 39 | #[clap(flatten)] 40 | pub output: OutputPhylo, 41 | } 42 | 43 | /// Convert one report to one abundance table 44 | #[derive(Clap, Debug)] 45 | #[clap(after_help = super::AFTER_HELP)] 46 | pub struct ConvertAbundance { 47 | #[clap(flatten)] 48 | pub input: SingleReport, 49 | #[clap(flatten)] 50 | pub output: OutputAbundance, 51 | } 52 | 53 | /// Combine multiple reports to one taxonomy tree 54 | #[derive(Clap, Debug)] 55 | #[clap(after_help = super::AFTER_HELP)] 56 | pub struct CombineTrees { 57 | #[clap(flatten)] 58 | pub input: MultipleReports, 59 | #[clap(flatten)] 60 | pub output: OutputPhylo, 61 | } 62 | 63 | /// Merge multiple reports to one abundance table 64 | #[derive(Clap, Debug)] 65 | #[clap(after_help = super::AFTER_HELP)] 66 | pub struct CombineAbundances { 67 | #[clap(flatten)] 68 | pub input: MultipleReports, 69 | #[clap(flatten)] 70 | pub output: OutputAbundance, 71 | /// add missing taxons for each sample 72 | #[clap(long = "add-missing-taxons", takes_value(false))] 73 | pub add_missing_taxons: bool, 74 | } 75 | 76 | pub trait Runner { 77 | fn run(self) -> Result<(), color_eyre::eyre::Report>; 78 | } 79 | 
-------------------------------------------------------------------------------- /src/data/abundance.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::iter::FromIterator; 3 | 4 | use crate::kraken::{Fragments, Taxon}; 5 | 6 | pub type AbundanceData = HashMap; 7 | 8 | pub type SampleName = String; 9 | 10 | #[derive(Debug, Default, PartialEq)] 11 | pub struct SampleAbundance { 12 | pub name: SampleName, 13 | pub dataset: AbundanceData, 14 | } 15 | 16 | impl SampleAbundance { 17 | #[must_use] 18 | pub fn taxons(&self) -> Vec { 19 | self.dataset.keys().cloned().collect() 20 | } 21 | } 22 | 23 | impl From<(SampleName, AbundanceData)> for SampleAbundance { 24 | fn from(values: (SampleName, AbundanceData)) -> Self { 25 | Self { 26 | name: values.0, 27 | dataset: values.1, 28 | } 29 | } 30 | } 31 | 32 | pub type SamplesAbundanceData = Vec; // FIXME remove 33 | 34 | #[derive(Debug, Default, PartialEq)] 35 | pub struct Samples { 36 | pub data: Vec, 37 | pub unique_taxons: Vec, 38 | } 39 | 40 | impl Samples { 41 | #[must_use] 42 | pub fn new() -> Self { 43 | Self { 44 | data: Vec::new(), 45 | unique_taxons: Vec::new(), 46 | } 47 | } 48 | 49 | fn add(&mut self, elem: SampleAbundance) { 50 | let new_taxons = elem.taxons(); 51 | 52 | for taxon in new_taxons { 53 | if !self.unique_taxons.contains(&taxon) { 54 | self.unique_taxons.push(taxon); 55 | } 56 | } 57 | 58 | self.data.push(elem); 59 | } 60 | 61 | pub fn add_missing_taxons(&mut self) -> &mut Self { 62 | for datum in &mut self.data { 63 | for taxon in &self.unique_taxons { 64 | datum 65 | .dataset 66 | .entry(taxon.clone()) 67 | .or_insert_with(Fragments::default); 68 | } 69 | } 70 | 71 | self 72 | } 73 | } 74 | 75 | impl FromIterator<(SampleName, AbundanceData)> for Samples { 76 | fn from_iter>(iter: T) -> Self { 77 | let mut samples = Self::new(); 78 | 79 | for i in iter { 80 | let sample = SampleAbundance::from(i); 81 | 
samples.add(sample); 82 | } 83 | 84 | samples 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | pub mod args; 2 | pub mod logging; 3 | pub mod subcommands; 4 | 5 | use clap::AppSettings; 6 | use color_eyre::{Report, Result}; 7 | 8 | static AFTER_HELP: &str = "Thank you for using Spideog. Please send any feedback, bug report or feature request to the project's github page: https://github.com/jeanmanguy/spideog"; 9 | 10 | #[derive(Debug, Clap)] 11 | #[clap(author, about, version)] 12 | #[clap(global_setting = AppSettings::ColoredHelp)] 13 | #[clap(global_setting = AppSettings::ColorAuto)] 14 | #[clap(global_setting = AppSettings::DeriveDisplayOrder)] 15 | #[clap(global_setting = AppSettings::DontCollapseArgsInUsage)] 16 | #[clap(global_setting = AppSettings::GlobalVersion)] 17 | #[clap(global_setting = AppSettings::ArgRequiredElseHelp)] 18 | #[clap(global_setting = AppSettings::HelpRequired)] 19 | #[clap(global_setting = AppSettings::UnifiedHelpMessage)] 20 | #[clap(after_help = AFTER_HELP)] 21 | pub struct Opts { 22 | #[clap(subcommand)] 23 | pub command: subcommands::Command, 24 | // #[clap(flatten)] 25 | // pub logging: logging::Logging, 26 | } 27 | 28 | pub fn setup_error_hook() -> Result<(), Report> { 29 | color_eyre::config::HookBuilder::default() 30 | .add_default_filters() 31 | .issue_url(concat!(env!("CARGO_PKG_REPOSITORY"), "/issues/new")) 32 | .add_issue_metadata("version", crate_version!()) 33 | .add_issue_metadata("architecture", std::env::consts::ARCH) 34 | .add_issue_metadata("OS", std::env::consts::OS) 35 | .issue_filter(|kind| match kind { 36 | color_eyre::ErrorKind::NonRecoverable(_) => true, 37 | color_eyre::ErrorKind::Recoverable(_) => false, 38 | }) 39 | .install() 40 | } 41 | 42 | // Boilerplate: https://github.com/yaahc/color-eyre/blob/master/examples/usage.rs 43 | // TODO: adjust for use 44 | // 
TODO: move to logging.rs? 45 | pub fn install_tracing() { 46 | use tracing_error::ErrorLayer; 47 | use tracing_subscriber::prelude::*; 48 | use tracing_subscriber::{fmt, EnvFilter}; 49 | 50 | let fmt_layer = fmt::layer().with_target(false); 51 | let filter_layer = EnvFilter::try_from_default_env() 52 | .or_else(|_| EnvFilter::try_new("debug")) 53 | .unwrap(); 54 | 55 | tracing_subscriber::registry() 56 | .with(filter_layer) 57 | .with(fmt_layer) 58 | .with(ErrorLayer::default()) 59 | .init(); 60 | } 61 | -------------------------------------------------------------------------------- /tests/sample_data/converted.csv: -------------------------------------------------------------------------------- 1 | "taxon","taxid","rank","clade_percentage","clade_count_reads","taxon_count_reads" 2 | "Thermoproteales",2266,"Order_0",4.67,25,25 3 | "Homo",9605,"Genus_0",9.35,50,50 4 | "Laurasiatheria",314145,"Class_4",24.3,130,130 5 | "Artiodactyla",91561,"Order_0",22.43,120,120 6 | "Delphinidae",9726,"Family_0",22.43,120,120 7 | "Canidae",9608,"Family_0",1.87,10,10 8 | "Ascomycota",4890,"Phylum_0",7.85,42,42 9 | "Proteobacteria",1224,"Phylum_0",5.61,30,30 10 | "Tursiops",9738,"Genus_0",22.43,120,120 11 | "Enterobacterales",91347,"Order_0",5.61,30,30 12 | "Carnivora",33554,"Order_0",1.87,10,10 13 | "Fungi",4751,"Kingdom_0",7.85,42,42 14 | "Homo sapiens",9606,"Species_0",9.35,50,50 15 | "Chordata",7711,"Phylum_0",33.64,180,180 16 | "Lactococcus lactis",1358,"Species_0",14.58,78,78 17 | "Pyrobaculum",2276,"Genus_0",4.67,25,25 18 | "Archaea",2157,"Domain_0",4.67,25,25 19 | "Euarchontoglires",314146,"Class_4",9.35,50,50 20 | "Tursiops truncatus",9739,"Species_0",22.43,120,120 21 | "Primates",9443,"Order_0",9.35,50,50 22 | "Bacteria",2,"Domain_0",20.19,108,108 23 | "Saccharomycetaceae",4893,"Family_0",7.85,42,42 24 | "Thermoprotei",183924,"Class_0",4.67,25,25 25 | "Bacilli",91061,"Class_0",14.58,78,78 26 | "Thermoproteaceae",2267,"Family_0",4.67,25,25 27 | 
"Saccharomycetales",4892,"Order_0",7.85,42,42 28 | "Crenarchaeota",28889,"Phylum_0",4.67,25,25 29 | "unclassified",0,"Unclassified_0",33.64,180,180 30 | "Hominidae",9604,"Family_0",9.35,50,50 31 | "Gammaproteobacteria",1236,"Class_0",5.61,30,30 32 | "Lactococcus",1357,"Genus_0",14.58,78,78 33 | "Eukaryota",2759,"Root_2",41.5,222,222 34 | "cellular organisms",131567,"Root_1",66.36,355,355 35 | "Saccharomyces",4930,"Genus_0",7.85,42,42 36 | "Canis",9611,"Genus_0",1.87,10,10 37 | "Pyrobaculum neutrophilum",70771,"Species_0",4.67,25,25 38 | "Lactobacillales",186826,"Order_0",14.58,78,78 39 | "root",1,"Root_0",66.36,355,355 40 | "Canis lupus",9612,"Species_0",1.87,10,10 41 | "Enterobacteriaceae",543,"Family_0",5.61,30,30 42 | "Escherichia",561,"Genus_0",5.61,30,30 43 | "Mammalia",40674,"Class_0",33.64,180,180 44 | "Saccharomyces cerevisiae",4932,"Species_0",7.85,42,42 45 | "Saccharomycetes",4891,"Class_0",7.85,42,42 46 | "Boreoeutheria",1437010,"Class_3",33.64,180,180 47 | "Metazoa",33208,"Kingdom_0",33.64,180,180 48 | "Escherichia coli",562,"Species_0",5.61,30,30 49 | "Streptococcaceae",1300,"Family_0",14.58,78,78 50 | "Firmicutes",1239,"Phylum_0",14.58,78,78 51 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spideog" 3 | version = "0.2.0" 4 | authors = ["Jean Manguy "] 5 | edition = "2018" 6 | description = "Command line utility to analyse and convert Kraken reports" 7 | publish = false 8 | readme = "README.md" 9 | repository = "https://github.com/jeanmanguy/spideog" 10 | documentation = "https://github.com/jeanmanguy/spideog/blob/main/README.md" 11 | keywords = ["cli", "bioinformatics", "metagenomics"] 12 | categories = ["command-line-utilities", "science"] 13 | license = "MIT/Apache-2.0" 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | 17 | 
[dependencies] 18 | thiserror = "1.0" 19 | serde = { version = "1.0", features = ["derive"] } 20 | csv = "1" 21 | color-eyre = { version = "0.5", features = ["issue-url"] } 22 | clap = { version = "~3.0.0-beta.2", default-features = false, features = [ "derive", "suggestions", "color", "std" ] } 23 | clap_generate = "3.0.0-beta.2" 24 | nom = "6.0" 25 | daggy = "0.7" 26 | petgraph = "0.5" 27 | log = "0.4" 28 | simplelog = "0.8" 29 | displaydoc = "0.1" 30 | once_cell = "1.4" 31 | atty = "0.2" 32 | dialoguer = "0.6" 33 | enum_derive = "0.1.7" 34 | custom_derive = "0.1" 35 | eyre = "0.6" 36 | tracing-error = "0.1.2" 37 | tracing = { version = "0.1.17", features = [ "attributes" ] } 38 | tracing-subscriber = "0.2.10" 39 | exitcode = "1.1.2" 40 | 41 | [dev-dependencies] 42 | test-case = "1.0" 43 | pretty_assertions = "0.6.1" 44 | 45 | 46 | [lib] 47 | name = "libspideog" 48 | path = "src/lib.rs" 49 | 50 | [[bin]] 51 | name = "spideog" 52 | path = "src/main.rs" 53 | 54 | [profile.dev] 55 | panic = "unwind" 56 | 57 | [profile.dev.package.backtrace] 58 | opt-level = 3 59 | 60 | [profile.release] 61 | lto = true 62 | panic = "unwind" 63 | codegen-units = 1 64 | 65 | 66 | [package.metadata.release] 67 | disable-publish = true 68 | consolidate-commits = true 69 | no-dev-version = true 70 | 71 | [[package.metadata.release.pre-release-replacements]] 72 | file = "CHANGELOG.md" 73 | search = "Unreleased" 74 | replace="{{version}}" 75 | 76 | [[package.metadata.release.pre-release-replacements]] 77 | file = "CHANGELOG.md" 78 | search = "\\.\\.\\.HEAD" 79 | replace="...{{tag_name}}" 80 | exactly = 1 81 | 82 | [[package.metadata.release.pre-release-replacements]] 83 | file = "CHANGELOG.md" 84 | search = "ReleaseDate" 85 | replace="{{date}}" 86 | 87 | [[package.metadata.release.pre-release-replacements]] 88 | file="CHANGELOG.md" 89 | search="" 90 | replace="\n\n## [Unreleased] - ReleaseDate" 91 | exactly=1 92 | 93 | [[package.metadata.release.pre-release-replacements]] 94 | 
file="CHANGELOG.md" 95 | search="" 96 | replace="\n[Unreleased]: https://github.com/jeanmanguy/{{crate_name}}/compare/{{tag_name}}...HEAD" 97 | exactly=1 98 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Based on the "trust" template v0.1.2 2 | # https://github.com/japaric/trust/tree/v0.1.2 3 | 4 | os: linux 5 | dist: xenial 6 | language: rust 7 | services: docker 8 | 9 | env: 10 | global: 11 | - CRATE_NAME=spideog 12 | - secure: B39IuyC3UCdPlx6jq7xMB1aVOQiXUQry+0TbgR6CB18Yip9gGVRSkeIRGHnl38t6OkTGQ52U4nWRSigKOkXCJmzPirRgyC28TPGG/O7/SJlSFkhRv83w+9BfyjXyZAuRAaSdjTCxLcvJIMdrgos+r1C8TBEVnbVtg9dVDJEzIo2zvEIbxtOuCALRai7sL595oD5csrQtwR9sJgf4FsfZQRYSeIEvNgDUIEkUsu/38Xkx43ekgLOaGVwJ5qFAzaHY1LAdU5ilV+tHX4k7GBvM4oRFA1on2kBhRBtLj2XW3CFMBJe2kHi3fktMCeHxvZCDDpvs3q0rpj7ddk4kVxhihfY6Wa9KvrsZAbyJptiEDz4piB/xjMA1oGyllNpfI7+0E3Toj1Etmk5NqPt+Iumq6OHw0DQeqlUQZuaXYa1hJX5lxqEO36/YBouP907PrceKWoKEoHlbVZXOBLhy0WZqCmGGUl7QweNXqQ2Va0Ypmv5AbLvlYokop56WzcDlQswcKNJeAMOnOpEH0h5BUg4fmnpFyztL7M04U+JKOk4tFwZ2IbDILhm4zJGTzJaaE8yyUt3XPh4wxsVcftFj22eB+OF03jUmzZdb9raur5zgChwlR95Bs6YrFd2DFkiFk0hirs6AwFMMIAuDN4MhTF1EyGSq8QTObsMjIBGL5RexBhE= 13 | 14 | jobs: 15 | include: 16 | # Linux 17 | # - env: TARGET=aarch64-unknown-linux-gnu 18 | # - env: TARGET=arm-unknown-linux-gnueabi 19 | # - env: TARGET=armv7-unknown-linux-gnueabihf 20 | # - env: TARGET=i686-unknown-linux-gnu 21 | # - env: TARGET=i686-unknown-linux-musl 22 | # - env: TARGET=mips-unknown-linux-gnu 23 | # - env: TARGET=mips64-unknown-linux-gnuabi64 24 | # - env: TARGET=mips64el-unknown-linux-gnuabi64 25 | # - env: TARGET=mipsel-unknown-linux-gnu 26 | # - env: TARGET=powerpc-unknown-linux-gnu 27 | # - env: TARGET=powerpc64-unknown-linux-gnu 28 | # - env: TARGET=powerpc64le-unknown-linux-gnu 29 | # - env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1 30 | # - env: TARGET=x86_64-unknown-linux-gnu 31 | - env: 
TARGET=x86_64-unknown-linux-musl 32 | 33 | # OSX 34 | # - env: TARGET=i686-apple-darwin 35 | # os: osx 36 | - env: TARGET=x86_64-apple-darwin 37 | os: osx 38 | 39 | # *BSD 40 | # - env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1 41 | # - env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1 42 | # - env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1 43 | 44 | # Windows 45 | - env: TARGET=x86_64-pc-windows-gnu 46 | 47 | before_install: 48 | - set -e 49 | - rustup self update 50 | 51 | install: 52 | - sh ci/install.sh 53 | - source ~/.cargo/env || true 54 | 55 | script: 56 | - bash ci/script.sh 57 | 58 | after_script: set +e 59 | 60 | before_deploy: 61 | - bash ci/before_deploy.sh 62 | 63 | deploy: 64 | - provider: releases 65 | api_key: $GITHUB_TOKEN 66 | 67 | file_glob: true 68 | file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.* 69 | on: 70 | condition: $TRAVIS_RUST_VERSION = stable 71 | tags: true 72 | 73 | skip_cleanup: true 74 | 75 | cache: cargo 76 | before_cache: 77 | # Travis can't cache files that are not readable by "others" 78 | - chmod -R a+r $HOME/.cargo 79 | 80 | branches: 81 | only: 82 | # release tags 83 | - /^v\d+\.\d+\.\d+.*$/ 84 | - main 85 | 86 | notifications: 87 | email: 88 | on_success: always 89 | -------------------------------------------------------------------------------- /tests/sample_data/sample_2.kreport: -------------------------------------------------------------------------------- 1 | 30.82 249 249 U 0 unclassified 2 | 69.18 559 0 R 1 root 3 | 61.01 493 0 R1 131567 cellular organisms 4 | 45.42 367 0 R2 2759 Eukaryota 5 | 37.13 300 0 K 33208 Metazoa 6 | 37.13 300 0 P 7711 Chordata 7 | 37.13 300 0 C 40674 Mammalia 8 | 37.13 300 0 C3 1437010 Boreoeutheria 9 | 24.75 200 0 C4 314146 Euarchontoglires 10 | 24.75 200 0 O 9443 Primates 11 | 24.75 200 0 F 9604 Hominidae 12 | 24.75 200 0 G 9605 Homo 13 | 24.75 200 200 S 9606 Homo sapiens 14 | 12.38 100 0 C4 314145 Laurasiatheria 15 | 4.95 40 0 O 33554 Carnivora 16 | 4.95 40 0 F 9681 Felidae 17 | 4.95 40 0 
G 9682 Felis 18 | 4.95 40 40 S 9685 Felis catus 19 | 7.43 60 0 O 91561 Artiodactyla 20 | 7.43 60 0 F 9726 Delphinidae 21 | 7.43 60 0 G 9738 Tursiops 22 | 7.43 60 60 S 9739 Tursiops truncatus 23 | 8.29 67 0 K 4751 Fungi 24 | 8.29 67 0 P 4890 Ascomycota 25 | 8.29 67 0 C 4891 Saccharomycetes 26 | 8.29 67 0 O 4892 Saccharomycetales 27 | 8.29 67 0 F 4893 Saccharomycetaceae 28 | 8.29 67 0 G 4930 Saccharomyces 29 | 1.24 10 10 S 4932 Saccharomyces cerevisiae 30 | 1.86 15 15 S 114524 Saccharomyces kudriavzevii 31 | 5.20 42 42 S 27292 Saccharomyces pastorianus 32 | 14.11 114 0 D 2 Bacteria 33 | 9.90 80 0 P 1224 Proteobacteria 34 | 9.90 80 0 C 1236 Gammaproteobacteria 35 | 9.90 80 0 O 91347 Enterobacterales 36 | 9.90 80 0 F 543 Enterobacteriaceae 37 | 9.90 80 0 G 561 Escherichia 38 | 9.90 80 80 S 562 Escherichia coli 39 | 4.21 34 0 P 1239 Firmicutes 40 | 4.21 34 0 C 91061 Bacilli 41 | 4.21 34 0 O 186826 Lactobacillales 42 | 4.21 34 0 F 1300 Streptococcaceae 43 | 4.21 34 0 G 1357 Lactococcus 44 | 4.21 34 34 S 1358 Lactococcus lactis 45 | 1.49 12 0 D 2157 Archaea 46 | 1.49 12 0 P 28889 Crenarchaeota 47 | 1.49 12 0 C 183924 Thermoprotei 48 | 1.49 12 0 O 2266 Thermoproteales 49 | 1.49 12 0 F 2267 Thermoproteaceae 50 | 1.49 12 0 G 2276 Pyrobaculum 51 | 1.49 12 12 S 70771 Pyrobaculum neutrophilum 52 | 8.17 66 0 D 10239 Viruses 53 | 8.17 66 0 K 2732396 Orthornavirae 54 | 8.17 66 0 P 2497569 Negarnaviricota 55 | 8.17 66 0 C 2497577 Insthoviricetes 56 | 8.17 66 0 O 2499411 Articulavirales 57 | 6.68 54 0 F 11308 Orthomyxoviridae 58 | 6.68 54 0 G 197912 Betainfluenzavirus 59 | 6.68 54 54 S 11520 Influenza B virus 60 | 1.49 12 0 F 2501949 Amnoonviridae 61 | 1.49 12 0 G 2034997 Tilapinevirus 62 | 1.49 12 12 S 2034996 Tilapia tilapinevirus 63 | -------------------------------------------------------------------------------- /src/subcommands/combine_abundance.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::{Help, Report}; 2 | use 
eyre::Context; 3 | use libspideog::{ 4 | data::abundance::{AbundanceData, SampleName, Samples}, 5 | errors::SpideogError, 6 | }; 7 | use std::iter::FromIterator; 8 | use tracing::instrument; 9 | 10 | use crate::{ 11 | cli::subcommands::{CombineAbundances, Runner}, 12 | io::{abundance_csv::WriteAbundanceCsv, report::ParseKrakenReport, Output}, 13 | }; 14 | 15 | type VecResultAbundanceData = Vec>; 16 | 17 | impl Runner for CombineAbundances { 18 | #[instrument] 19 | fn run(self) -> Result<(), Report> { 20 | let sample_names: Vec = self 21 | .input 22 | .paths 23 | .iter() 24 | .map(|p| p.file_stem().unwrap().to_string_lossy().into()) 25 | .collect(); 26 | let readers = self.input.open_reports()?; 27 | let output = Output::from(self.output.file.clone()); 28 | output.try_writtable()?; 29 | 30 | let (ok_abundance_data, errors_abundance_data): ( 31 | VecResultAbundanceData, 32 | VecResultAbundanceData, 33 | ) = readers 34 | .into_iter() 35 | .zip(sample_names) 36 | .map( 37 | |(file, sample_name)| -> Result<(SampleName, AbundanceData), SpideogError> { 38 | let mut csv_reader = csv::ReaderBuilder::new() 39 | .has_headers(self.input.headers) 40 | .delimiter(b'\t') 41 | .double_quote(false) 42 | .flexible(true) 43 | .from_reader(file); 44 | 45 | let tree: AbundanceData = ParseKrakenReport::parse(&mut csv_reader)?; 46 | 47 | Ok((sample_name, tree)) 48 | }, 49 | ) 50 | .partition(Result::is_ok); 51 | 52 | if !errors_abundance_data.is_empty() { 53 | return errors_abundance_data 54 | .into_iter() 55 | .filter_map(|result| { 56 | if let Err(error) = result { 57 | Some(error) 58 | } else { 59 | None 60 | } 61 | }) 62 | .fold(Err(eyre!("encountered multiple errors")), |report, e| { 63 | report.error(e) 64 | }); 65 | } 66 | 67 | let mut samples = Samples::from_iter(ok_abundance_data.into_iter().map(Result::unwrap)); 68 | 69 | if self.add_missing_taxons { 70 | samples.add_missing_taxons(); 71 | } 72 | 73 | let mut writer = output.writer()?; 74 | match self.output.format { 75 | 
crate::io::OutputAbundanceFormat::Csv => { 76 | samples 77 | .write_csv(&mut writer) 78 | .wrap_err("failed to write output to CSV")?; 79 | } 80 | } 81 | 82 | Ok(()) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/io/abundance_csv.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::Report; 2 | use csv::Writer; 3 | use eyre::Context; 4 | use libspideog::data::abundance::{AbundanceData, Samples}; 5 | use serde::Serialize; 6 | 7 | pub trait WriteAbundanceCsv: Sized { 8 | fn write_csv(self, writer: &mut W) -> Result<(), Report> { 9 | let mut csv_writer = csv::WriterBuilder::new() 10 | .delimiter(b',') 11 | .quote_style(csv::QuoteStyle::NonNumeric) 12 | .has_headers(true) 13 | .from_writer(writer); 14 | 15 | self.write_records(&mut csv_writer)?; 16 | 17 | Ok(()) 18 | } 19 | 20 | fn write_records(self, csv_writer: &mut Writer) -> Result<(), Report>; 21 | } 22 | 23 | #[derive(Serialize)] 24 | struct RowAbundanceData { 25 | #[serde(rename = "taxon")] 26 | name: String, 27 | #[serde(rename = "taxid")] 28 | taxonomy_id: u64, 29 | #[serde(rename = "rank")] 30 | taxonomy_level: String, 31 | clade_percentage: f64, 32 | clade_count_reads: u64, 33 | taxon_count_reads: u64, 34 | } 35 | 36 | #[derive(Serialize)] 37 | struct RowSampleAbundanceData { 38 | sample: String, 39 | #[serde(rename = "taxon")] 40 | name: String, 41 | #[serde(rename = "taxid")] 42 | taxonomy_id: u64, 43 | #[serde(rename = "rank")] 44 | taxonomy_level: String, 45 | clade_percentage: f64, 46 | clade_count_reads: u64, 47 | taxon_count_reads: u64, 48 | } 49 | 50 | impl WriteAbundanceCsv for AbundanceData { 51 | fn write_records(self, csv_writer: &mut Writer) -> Result<(), Report> { 52 | for (taxon, abundance_data) in self { 53 | csv_writer 54 | .serialize(RowAbundanceData { 55 | name: taxon.name.clone(), 56 | taxonomy_id: taxon.taxonomy_id, 57 | taxonomy_level: format!("{}", taxon.taxonomy_level), 58 | 
clade_percentage: abundance_data.clade_percentage, 59 | clade_count_reads: abundance_data.clade_count_reads, 60 | taxon_count_reads: abundance_data.taxon_count_reads, 61 | }) 62 | .wrap_err_with(|| format!("failed to write record for `{}`", taxon.name))?; 63 | } 64 | 65 | Ok(()) 66 | } 67 | } 68 | 69 | impl WriteAbundanceCsv for Samples { 70 | fn write_records(self, csv_writer: &mut Writer) -> Result<(), Report> { 71 | for sample in self.data { 72 | for (taxon, abundance_data) in &sample.dataset { 73 | csv_writer 74 | .serialize(RowSampleAbundanceData { 75 | sample: sample.name.clone(), 76 | name: taxon.name.clone(), 77 | taxonomy_id: taxon.taxonomy_id, 78 | taxonomy_level: format!("{}", taxon.taxonomy_level), 79 | clade_percentage: abundance_data.clade_percentage, 80 | clade_count_reads: abundance_data.clade_count_reads, 81 | taxon_count_reads: abundance_data.taxon_count_reads, 82 | }) 83 | .wrap_err_with(|| { 84 | format!( 85 | "failed to write record for sample `{}` `{}`", 86 | sample.name, taxon.name 87 | ) 88 | })?; 89 | } 90 | } 91 | 92 | Ok(()) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/cli/args.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::{Clap, ValueHint}; 4 | 5 | // #[derive(Clap, Debug, Clone)] 6 | // pub struct KrakenReport { 7 | // /// Kraken reports 8 | // #[clap(name = "FILE", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(true))] 9 | // pub reports: Vec, 10 | // } 11 | 12 | // #[derive(Clap, Debug, PartialEq)] 13 | // pub enum ExtractKind { 14 | // #[clap(alias = "p")] 15 | // Phylo, 16 | // #[clap(alias = "d")] 17 | // Data, 18 | // } 19 | 20 | // #[derive(Clap, Debug)] 21 | // pub struct ExtractKind2 { 22 | // /// Extract taxonomy tree 23 | // #[clap(long, conflicts_with("data"))] 24 | // phylo: bool, 25 | // /// extract data 26 | // #[clap(long, conflicts_with("phylo"))] 
27 | // data: bool 28 | // } 29 | 30 | // #[derive(Clap, Debug)] 31 | // pub struct Extract { 32 | // #[clap(arg_enum, name = "kind", case_insensitive(true))] 33 | // pub kind: ExtractKind 34 | // } 35 | 36 | #[derive(Clap, Debug)] 37 | pub struct SingleReport { 38 | /// A single Kraken report 39 | #[clap(name = "FILE", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(false), takes_value(true))] 40 | pub path: PathBuf, 41 | /// Input report format 42 | #[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))] 43 | pub format: crate::io::InputReportFormat, 44 | /// Does the kraken report has headers 45 | #[clap(long = "has-headers", takes_value(false))] 46 | pub headers: bool 47 | } 48 | 49 | #[derive(Clap, Debug)] 50 | pub struct MultipleReports { 51 | /// Multiple Kraken reports 52 | #[clap(name = "FILES", parse(from_os_str), value_hint = ValueHint::AnyPath, required(true), multiple(true), takes_value(true))] 53 | pub paths: Vec, 54 | /// Input reports format (all reports must have the format) 55 | #[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))] 56 | pub format: crate::io::InputReportFormat, 57 | /// Does the kraken reports have headers (all or none) 58 | #[clap(long = "have-headers", takes_value(false))] 59 | pub headers: bool 60 | } 61 | 62 | #[derive(Clap, Debug, Clone)] 63 | pub struct OutputFile { 64 | /// Output file [default: stdout (-)] 65 | #[clap( 66 | name = "output", 67 | global(true), 68 | long = "output", 69 | parse(from_os_str), 70 | value_hint = ValueHint::AnyPath, 71 | takes_value(true), 72 | )] 73 | pub path: Option, 74 | /// force overwriting exiting output file 75 | #[clap( 76 | long, 77 | requires("output"), 78 | global(true), 79 | )] 80 | pub overwrite: bool, 81 | } 82 | 83 | #[derive(Clap, Debug)] 84 | pub struct InputReport { 85 | /// Input report format 86 | 
#[clap(long = "report-format", name = "report-format", arg_enum, case_insensitive(true), global(true), default_value("Kraken"))] 87 | pub format: crate::io::InputReportFormat, 88 | } 89 | 90 | #[derive(Clap, Debug)] 91 | pub struct OutputPhylo { 92 | #[clap(flatten)] 93 | pub file: OutputFile, 94 | /// Output tree format 95 | #[clap(long = "format", name = "output-format", arg_enum, case_insensitive(true), default_value("Newick"))] 96 | pub format: crate::io::OutputPhyloFormat, 97 | } 98 | 99 | 100 | #[derive(Clap, Debug)] 101 | pub struct OutputAbundance { 102 | #[clap(flatten)] 103 | pub file: OutputFile, 104 | /// Output abundance format 105 | #[clap(long = "format", name = "output-format", arg_enum, case_insensitive(true), default_value("csv"))] 106 | pub format: crate::io::OutputAbundanceFormat, 107 | } 108 | 109 | 110 | -------------------------------------------------------------------------------- /src/io/newick.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::Report; 2 | use daggy::{NodeIndex, Walker}; 3 | use eyre::ContextCompat; 4 | use libspideog::data::tree::Tree; 5 | use std::{borrow::Cow, io}; 6 | use tracing::instrument; 7 | 8 | pub fn write_newick(writer: &mut W, tree: &Tree) -> Result<(), Report> 9 | where 10 | W: std::io::Write, 11 | { 12 | write_children_recursively(writer, tree, tree.origin.unwrap(), 0)?; // TODO: add error / panic Tree not initilialised 13 | write_end(writer)?; 14 | 15 | Ok(()) 16 | } 17 | 18 | #[inline] 19 | pub fn write_name_distance(writer: &mut W, name: S, distance: usize) -> Result<(), io::Error> 20 | where 21 | W: io::Write, 22 | S: AsRef, 23 | { 24 | write!(writer, "{}", format_name_distance(name, distance)) 25 | } 26 | 27 | #[inline] 28 | fn format_name_distance>(name: S, distance: usize) -> String { 29 | format!("{}:{}", clean_name(name.as_ref()), distance) 30 | } 31 | 32 | #[inline] 33 | pub fn write_end(writer: &mut W) -> Result<(), io::Error> 34 | where 35 | 
W: io::Write, 36 | { 37 | write!(writer, "{}", format_end()) 38 | } 39 | 40 | #[instrument] 41 | #[inline] 42 | fn format_end() -> String { 43 | String::from(";\n") 44 | } 45 | 46 | fn is_trouble(c: char) -> bool { 47 | c == ' ' || c == '.' || c == ',' || c == '=' || c == '[' || c == ']' || c == '/' || c == ':' 48 | } 49 | 50 | // based from https://lise-henry.github.io/articles/optimising_strings.html 51 | // not going to use regex just for that 52 | pub fn clean_name<'a, S: Into>>(input: S) -> Cow<'a, str> { 53 | let input = input.into(); 54 | 55 | let first_trouble_character = input.find(is_trouble); 56 | if let Some(first_trouble_character) = first_trouble_character { 57 | let mut output = String::from(&input[0..first_trouble_character]); 58 | output.reserve(input.len() - first_trouble_character); 59 | let rest = input[first_trouble_character..].chars(); 60 | for c in rest { 61 | match c { 62 | ' ' | '-' | '/' | ':' => output.push_str("_"), 63 | '.' | ',' | '[' | ']' | '(' | ')' | '\'' | '\"' => {} 64 | _ => output.push(c), 65 | } 66 | } 67 | Cow::Owned(output) 68 | } else { 69 | input 70 | } 71 | } 72 | 73 | pub fn write_children_recursively( 74 | writer: &mut W, 75 | tree: &Tree, 76 | node: NodeIndex, 77 | parent_indent: usize, 78 | ) -> Result<(), Report> 79 | where 80 | W: io::Write, 81 | { 82 | let mut child_walker = tree.tree.children(node); 83 | let mut children = Vec::new(); 84 | while let Some((_, node)) = child_walker.walk_next(&tree.tree) { 85 | children.push(node); 86 | } 87 | 88 | let node_data = tree.tree.node_weight(node).wrap_err("node not found")?; 89 | let distance = node_data 90 | .indent 91 | .checked_sub(parent_indent) 92 | .wrap_err_with(|| { 93 | format!( 94 | "failed to compute new distance: node {} - parent {}", 95 | node_data.indent, parent_indent 96 | ) 97 | })?; 98 | 99 | if children.is_empty() { 100 | write_name_distance(writer, &node_data.taxon.name, distance)?; 101 | } else { 102 | writer.write_all(b"(")?; 103 | 104 | let mut 
children_iter = children.iter().peekable(); 105 | 106 | while let Some(node_id) = children_iter.next() { 107 | write_children_recursively(writer, tree, *node_id, node_data.indent)?; 108 | 109 | // not the last child, add a comma 110 | if children_iter.peek().is_some() { 111 | writer.write_all(b",")?; 112 | } 113 | } 114 | 115 | writer.write_all(b")")?; 116 | 117 | write_name_distance(writer, &node_data.taxon.name, distance)?; 118 | } 119 | 120 | Ok(()) 121 | } 122 | 123 | #[cfg(test)] 124 | mod tests { 125 | use super::*; 126 | use test_case::test_case; 127 | 128 | #[test_case(&("Homo sapiens", 2), "Homo_sapiens:2")] 129 | #[test_case(&("Bacteroidetes/Chlorobi group", 1), "Bacteroidetes_Chlorobi_group:1")] 130 | fn test_format_name_distance>(input: &(S, usize), expected: S) { 131 | assert_eq!( 132 | format_name_distance(input.0.as_ref(), input.1), 133 | expected.as_ref() 134 | ); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | use atty::Stream; 2 | use color_eyre::{Help, Report}; 3 | use csv::Reader; 4 | use dialoguer::Confirm; 5 | use std::process; 6 | use std::{fs::File, fs::OpenOptions, io, path::PathBuf}; 7 | use tracing::instrument; 8 | 9 | use crate::{ 10 | cli::args::{MultipleReports, SingleReport}, 11 | BinError, 12 | }; 13 | 14 | pub mod abundance_csv; 15 | pub mod newick; 16 | pub mod report; 17 | 18 | /* ---------------------------------- Input --------------------------------- */ 19 | 20 | custom_derive! 
{ 21 | #[derive(clap::Clap, Debug, PartialEq)] 22 | #[derive(EnumFromStr, EnumDisplay)] 23 | pub enum InputReportFormat { 24 | Kraken, 25 | } 26 | } 27 | 28 | #[instrument] 29 | pub fn get_reader(input: &PathBuf, headers: bool) -> Result, csv::Error> { 30 | csv::ReaderBuilder::new() 31 | .has_headers(headers) 32 | .delimiter(b'\t') 33 | .double_quote(false) 34 | .flexible(true) 35 | .from_path(input) 36 | } 37 | 38 | impl SingleReport { 39 | #[instrument] 40 | pub fn open_report(&self) -> Result { 41 | let path = &self.path; 42 | open_file(path) 43 | } 44 | } 45 | 46 | impl MultipleReports { 47 | fn join_errors(errors: Vec>) -> Result<(), Report> { 48 | if errors.is_empty() { 49 | return Ok(()); 50 | } 51 | 52 | errors 53 | .into_iter() 54 | .filter_map(|result| { 55 | if let Err(error) = result { 56 | Some(error) 57 | } else { 58 | None 59 | } 60 | }) 61 | .fold(Err(eyre!("encountered multiple errors")), |report, e| { 62 | report.error(e) 63 | }) 64 | } 65 | 66 | #[instrument] 67 | pub fn open_reports(&self) -> Result, Report> { 68 | let readers: Vec> = 69 | self.paths.iter().map(|p| open_file(p)).collect(); 70 | 71 | let (ok, errors) = readers.into_iter().partition(Result::is_ok); 72 | 73 | Self::join_errors(errors)?; 74 | 75 | Ok(ok.into_iter().map(Result::unwrap).collect::>()) 76 | } 77 | } 78 | 79 | #[instrument] 80 | pub fn open_file(path: &PathBuf) -> Result { 81 | let path = path; 82 | OpenOptions::new() 83 | .read(true) 84 | .write(false) 85 | .open(path) 86 | .map_err(|err| BinError::Io { 87 | err, 88 | path: path.clone(), 89 | }) 90 | } 91 | 92 | /* --------------------------------- OUTPUT --------------------------------- */ 93 | 94 | custom_derive! { 95 | #[derive(clap::Clap, Debug, PartialEq)] 96 | #[derive(EnumFromStr, EnumDisplay)] 97 | pub enum OutputPhyloFormat { 98 | Newick, 99 | } 100 | } 101 | 102 | custom_derive! 
{ 103 | #[derive(clap::Clap, Debug, PartialEq)] 104 | #[derive(EnumFromStr, EnumDisplay)] 105 | pub enum OutputAbundanceFormat { 106 | Csv, 107 | } 108 | } 109 | 110 | #[derive(Debug, Clone)] 111 | pub enum OutputKind { 112 | File(PathBuf), 113 | Stdout, 114 | } 115 | 116 | #[derive(Debug, Clone)] 117 | pub struct Output { 118 | pub kind: OutputKind, 119 | pub overwrite: bool, 120 | } 121 | 122 | impl From> for OutputKind { 123 | fn from(path: Option) -> Self { 124 | path.map_or(Self::Stdout, |p| { 125 | if p == PathBuf::from(r"-") { 126 | Self::Stdout 127 | } else { 128 | Self::File(p) 129 | } 130 | }) 131 | } 132 | } 133 | 134 | impl From for Output { 135 | fn from(clap_output: crate::cli::args::OutputFile) -> Self { 136 | Self { 137 | kind: OutputKind::from(clap_output.path), 138 | overwrite: clap_output.overwrite, 139 | } 140 | } 141 | } 142 | 143 | impl Output { 144 | pub fn try_writtable(&self) -> Result<(), Report> { 145 | #[instrument] 146 | fn internal_can_open_file(path: &PathBuf, overwrite: bool) -> Result<(), Report> { 147 | if path.exists() { 148 | if overwrite { 149 | } else if atty::is(Stream::Stdout) { 150 | if Confirm::new() 151 | .with_prompt(format!("Overwrite `{}`?", path.display())) 152 | .interact()? 153 | { 154 | } else { 155 | process::exit(exitcode::NOPERM); 156 | } 157 | } else { 158 | { 159 | Err(std::io::Error::new( 160 | std::io::ErrorKind::AlreadyExists, 161 | "File already exists", 162 | )) 163 | }? 
164 | } 165 | } 166 | Ok(()) 167 | } 168 | 169 | match &self.kind { 170 | OutputKind::File(path) => internal_can_open_file(path, self.overwrite), 171 | OutputKind::Stdout => Ok(()), 172 | } 173 | } 174 | 175 | pub fn writer(&self) -> Result, Report> { 176 | match &self.kind { 177 | OutputKind::File(path) => Ok(Box::new( 178 | OpenOptions::new() 179 | .write(true) 180 | .truncate(true) 181 | .create(true) 182 | .open(path)?, 183 | ) as Box), 184 | OutputKind::Stdout => Ok(Box::new(io::stdout()) as Box), 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🐦 spideog - Command line utility for Kraken2 reports. 2 | 3 | [![lastest version](https://img.shields.io/github/v/release/jeanmanguy/spideog)](https://github.com/jeanmanguy/spideog/releases/tag/v0.1.2-alpha.1) 4 | 5 | [![Build Status](https://travis-ci.com/jeanmanguy/spideog.svg?branch=main)](https://travis-ci.com/jeanmanguy/spideog) 6 | [![Rust](https://github.com/jeanmanguy/spideog/workflows/Rust/badge.svg?branch=main)](https://github.com/jeanmanguy/spideog/actions?query=workflow%3ARust) 7 | 8 | This is a work in progress. The commands may change between released versions, please read the [CHANGELOG](CHANGELOG). 9 | 10 | - [Goals](#goals) 11 | - [Installation](#installation) 12 | - [Usage](#usage) 13 | - [`convert-tree`](#convert-tree) 14 | - [`convert-abundance`](#convert-abundance) 15 | - [`combine-trees`](#combine-trees) 16 | - [`combine-abundances`](#combine-abundances) 17 | - [Contributing](#contributing) 18 | - [License](#license) 19 | - [Credits](#credits) 20 | 21 | ## Goals 22 | 23 | The first goal of this project is to convert Kraken reports into standard file formats that can be easily read with R to allow people to craft thier own data visualisations and compute statistics more easily using the tidyverse, vegan, ape, and ggtree/treeio. 
The second goal is to get summary information from the Kraken reports directly from the command line. 24 | 25 | Supports Kraken reports from [Kraken2](https://github.com/DerrickWood/kraken2) or from [Bracken](https://github.com/jenniferlu717/Bracken). 26 | 27 | ## Installation 28 | 29 | Binaries for Linux, OSX, and Windows are available in the [Github release page](https://github.com/jeanmanguy/spideog/releases). No dependencies are required. 30 | 31 | ## Usage 32 | 33 | ```sh 34 | spideog --help 35 | spideog --version 36 | spideog convert-tree 37 | spideog convert-abundance 38 | spideog combine-trees ... 39 | spideog combine-abundances ... 40 | ``` 41 | 42 | Windows: you will need to add the `.exe` extension to the commands. 43 | 44 | ### `convert-tree` 45 | 46 | Convert the taxonomy tree of a Kraken report to the Newick format. 47 | 48 | The following command will generate the file `converted.tree`. 49 | 50 | ```sh 51 | spideog convert-tree sample.kreport --output converted.tree 52 | ``` 53 | 54 | ### Example files 55 | 56 | - input: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport). 57 | - output: [tests/sample_data/converted.tree](tests/sample_data/converted.tree) 58 | 59 | #### Options 60 | 61 | - `--has-headers` necessary if the input report has headers 62 | - `--output` output file path 63 | - `--overwrite` force overwriting if the output file already exist 64 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment] 65 | - `--format` output format (default: newick) [Only newick is supported at the moment] 66 | 67 | ### `convert-abundance` 68 | 69 | Convert the abundance data of a Kraken report to the CSV format. 70 | 71 | 72 | The following command will generate the file `converted.csv`. 
73 | 74 | ```sh 75 | spideog convert-abundance sample.kreport --output converted.csv 76 | ``` 77 | 78 | 79 | ### Example files 80 | 81 | - input: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport). 82 | - output: [tests/sample_data/converted.csv](tests/sample_data/converted.csv) 83 | 84 | #### Options 85 | 86 | - `--has-headers` necessary if the input report has headers 87 | - `--output` output file path 88 | - `--overwrite` force overwriting if the output file already exist 89 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment] 90 | - `--format` output format (default: CSV) [Only CSV is supported at the moment] 91 | 92 | 93 | ### `combine-trees` 94 | 95 | Combine and convert taxonomy trees from multiple Kraken report (e.g. from different samples of the same experiment) to the Newick format. 96 | 97 | The following command will generate the file `combined.tree`. 98 | 99 | ```sh 100 | spideog combine-trees sample.kreport sample_2.kreport --output combined.tree 101 | ``` 102 | 103 | ### Example files 104 | 105 | - inputs: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport) and [tests/sample_data/sample_2.kreport](tests/sample_data/sample_2.kreport). 106 | - output: [tests/sample_data/combined.tree](tests/sample_data/combined.tree) 107 | 108 | #### Options 109 | 110 | - `--has-headers` necessary if the input reports have headers 111 | - `--output` output file path 112 | - `--overwrite` force overwriting if the output file already exist 113 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment] 114 | - `--format` output format (default: newick) [Only newick is supported at the moment] 115 | 116 | 117 | ### `combine-abundances` 118 | 119 | Combine and convert abundance data from multiple Kraken report (e.g. from different samples of the same experiment) to the CSV format. 
120 | 121 | The following command will generate the file `combined.csv`. 122 | 123 | ```sh 124 | spideog combine-abundances sample.kreport sample_2.kreport --add-missing-taxons --output combined.csv 125 | ``` 126 | 127 | ### Example files 128 | 129 | - inputs: [tests/sample_data/sample.kreport](tests/sample_data/sample.kreport) and [tests/sample_data/sample_2.kreport](tests/sample_data/sample_2.kreport). 130 | - output: [tests/sample_data/combined.csv](tests/sample_data/combined.csv) 131 | 132 | 133 | #### Options 134 | 135 | - `--add-missing-taxons` add taxa that are missing from some reports but present in others, with zero values 136 | - `--has-headers` necessary if the input reports have headers 137 | - `--output` output file path 138 | - `--overwrite` force overwriting if the output file already exists 139 | - `--report-format` input format (default: Kraken) [Only Kraken reports are supported at the moment] 140 | - `--format` output format (default: CSV) [Only CSV is supported at the moment] 141 | 142 | 143 | ## Contributing 144 | 145 | The project is maintained by Jean Manguy. Please submit a bug report or a feature request [on the Github issues page](https://github.com/jeanmanguy/spideog/issues/new/choose). 146 | 147 | ## License 148 | 149 | `spideog` is distributed under the terms of both the MIT license and the 150 | Apache License (Version 2.0). 151 | 152 | See [LICENSE-APACHE](./LICENSE-APACHE) and [LICENSE-MIT](./LICENSE-MIT) for 153 | details. 
154 | 155 | ## Credits 156 | 157 | Cover picture: [Robin CC BY Greg Clarke](https://www.flickr.com/photos/leppre/25468458218) 158 | -------------------------------------------------------------------------------- /tests/sample_data/combined.csv: -------------------------------------------------------------------------------- 1 | "sample","taxon","taxid","rank","clade_percentage","clade_count_reads","taxon_count_reads" 2 | "sample","Pyrobaculum",2276,"Genus_0",4.67,25,25 3 | "sample","Canis",9611,"Genus_0",1.87,10,10 4 | "sample","Saccharomyces cerevisiae",4932,"Species_0",7.85,42,42 5 | "sample","Tursiops truncatus",9739,"Species_0",22.43,120,120 6 | "sample","Fungi",4751,"Kingdom_0",7.85,42,42 7 | "sample","Saccharomycetaceae",4893,"Family_0",7.85,42,42 8 | "sample","Thermoproteaceae",2267,"Family_0",4.67,25,25 9 | "sample","Amnoonviridae",2501949,"Family_0",0.0,0,0 10 | "sample","Viruses",10239,"Domain_0",0.0,0,0 11 | "sample","Artiodactyla",91561,"Order_0",22.43,120,120 12 | "sample","Chordata",7711,"Phylum_0",33.64,180,180 13 | "sample","Primates",9443,"Order_0",9.35,50,50 14 | "sample","Enterobacteriaceae",543,"Family_0",5.61,30,30 15 | "sample","root",1,"Root_0",66.36,355,355 16 | "sample","Saccharomycetes",4891,"Class_0",7.85,42,42 17 | "sample","Thermoprotei",183924,"Class_0",4.67,25,25 18 | "sample","unclassified",0,"Unclassified_0",33.64,180,180 19 | "sample","Saccharomyces",4930,"Genus_0",7.85,42,42 20 | "sample","Euarchontoglires",314146,"Class_4",9.35,50,50 21 | "sample","Saccharomyces kudriavzevii",114524,"Species_0",0.0,0,0 22 | "sample","Carnivora",33554,"Order_0",1.87,10,10 23 | "sample","Influenza B virus",11520,"Species_0",0.0,0,0 24 | "sample","Canidae",9608,"Family_0",1.87,10,10 25 | "sample","Lactobacillales",186826,"Order_0",14.58,78,78 26 | "sample","Thermoproteales",2266,"Order_0",4.67,25,25 27 | "sample","Archaea",2157,"Domain_0",4.67,25,25 28 | "sample","Tilapia tilapinevirus",2034996,"Species_0",0.0,0,0 29 | 
"sample","Laurasiatheria",314145,"Class_4",24.3,130,130 30 | "sample","Escherichia",561,"Genus_0",5.61,30,30 31 | "sample","Homo",9605,"Genus_0",9.35,50,50 32 | "sample","Saccharomycetales",4892,"Order_0",7.85,42,42 33 | "sample","Felidae",9681,"Family_0",0.0,0,0 34 | "sample","Proteobacteria",1224,"Phylum_0",5.61,30,30 35 | "sample","Boreoeutheria",1437010,"Class_3",33.64,180,180 36 | "sample","Bacteria",2,"Domain_0",20.19,108,108 37 | "sample","Betainfluenzavirus",197912,"Genus_0",0.0,0,0 38 | "sample","Orthomyxoviridae",11308,"Family_0",0.0,0,0 39 | "sample","Enterobacterales",91347,"Order_0",5.61,30,30 40 | "sample","Delphinidae",9726,"Family_0",22.43,120,120 41 | "sample","Mammalia",40674,"Class_0",33.64,180,180 42 | "sample","Articulavirales",2499411,"Order_0",0.0,0,0 43 | "sample","Saccharomyces pastorianus",27292,"Species_0",0.0,0,0 44 | "sample","Felis",9682,"Genus_0",0.0,0,0 45 | "sample","Pyrobaculum neutrophilum",70771,"Species_0",4.67,25,25 46 | "sample","Eukaryota",2759,"Root_2",41.5,222,222 47 | "sample","Insthoviricetes",2497577,"Class_0",0.0,0,0 48 | "sample","Metazoa",33208,"Kingdom_0",33.64,180,180 49 | "sample","Homo sapiens",9606,"Species_0",9.35,50,50 50 | "sample","Ascomycota",4890,"Phylum_0",7.85,42,42 51 | "sample","Escherichia coli",562,"Species_0",5.61,30,30 52 | "sample","Negarnaviricota",2497569,"Phylum_0",0.0,0,0 53 | "sample","Orthornavirae",2732396,"Kingdom_0",0.0,0,0 54 | "sample","Tilapinevirus",2034997,"Genus_0",0.0,0,0 55 | "sample","Crenarchaeota",28889,"Phylum_0",4.67,25,25 56 | "sample","Bacilli",91061,"Class_0",14.58,78,78 57 | "sample","Streptococcaceae",1300,"Family_0",14.58,78,78 58 | "sample","Hominidae",9604,"Family_0",9.35,50,50 59 | "sample","Canis lupus",9612,"Species_0",1.87,10,10 60 | "sample","Tursiops",9738,"Genus_0",22.43,120,120 61 | "sample","Lactococcus",1357,"Genus_0",14.58,78,78 62 | "sample","Felis catus",9685,"Species_0",0.0,0,0 63 | "sample","Firmicutes",1239,"Phylum_0",14.58,78,78 64 | 
"sample","Gammaproteobacteria",1236,"Class_0",5.61,30,30 65 | "sample","cellular organisms",131567,"Root_1",66.36,355,355 66 | "sample","Lactococcus lactis",1358,"Species_0",14.58,78,78 67 | "sample_2","Saccharomyces",4930,"Genus_0",8.29,67,67 68 | "sample_2","Ascomycota",4890,"Phylum_0",8.29,67,67 69 | "sample_2","Felis catus",9685,"Species_0",4.95,40,40 70 | "sample_2","Euarchontoglires",314146,"Class_4",24.75,200,200 71 | "sample_2","Tilapinevirus",2034997,"Genus_0",1.49,12,12 72 | "sample_2","Homo sapiens",9606,"Species_0",24.75,200,200 73 | "sample_2","Tilapia tilapinevirus",2034996,"Species_0",1.49,12,12 74 | "sample_2","Pyrobaculum",2276,"Genus_0",1.49,12,12 75 | "sample_2","Streptococcaceae",1300,"Family_0",4.21,34,34 76 | "sample_2","Escherichia",561,"Genus_0",9.9,80,80 77 | "sample_2","Laurasiatheria",314145,"Class_4",12.38,100,100 78 | "sample_2","Betainfluenzavirus",197912,"Genus_0",6.68,54,54 79 | "sample_2","Proteobacteria",1224,"Phylum_0",9.9,80,80 80 | "sample_2","Enterobacterales",91347,"Order_0",9.9,80,80 81 | "sample_2","unclassified",0,"Unclassified_0",30.82,249,249 82 | "sample_2","Canis",9611,"Genus_0",0.0,0,0 83 | "sample_2","Delphinidae",9726,"Family_0",7.43,60,60 84 | "sample_2","Insthoviricetes",2497577,"Class_0",8.17,66,66 85 | "sample_2","Pyrobaculum neutrophilum",70771,"Species_0",1.49,12,12 86 | "sample_2","Bacilli",91061,"Class_0",4.21,34,34 87 | "sample_2","Amnoonviridae",2501949,"Family_0",1.49,12,12 88 | "sample_2","Mammalia",40674,"Class_0",37.13,300,300 89 | "sample_2","Tursiops truncatus",9739,"Species_0",7.43,60,60 90 | "sample_2","Lactococcus lactis",1358,"Species_0",4.21,34,34 91 | "sample_2","Homo",9605,"Genus_0",24.75,200,200 92 | "sample_2","Viruses",10239,"Domain_0",8.17,66,66 93 | "sample_2","root",1,"Root_0",69.18,559,559 94 | "sample_2","Bacteria",2,"Domain_0",14.11,114,114 95 | "sample_2","Thermoproteaceae",2267,"Family_0",1.49,12,12 96 | "sample_2","Articulavirales",2499411,"Order_0",8.17,66,66 97 | 
"sample_2","Artiodactyla",91561,"Order_0",7.43,60,60 98 | "sample_2","Thermoprotei",183924,"Class_0",1.49,12,12 99 | "sample_2","Saccharomycetes",4891,"Class_0",8.29,67,67 100 | "sample_2","Thermoproteales",2266,"Order_0",1.49,12,12 101 | "sample_2","Saccharomyces pastorianus",27292,"Species_0",5.2,42,42 102 | "sample_2","cellular organisms",131567,"Root_1",61.01,493,493 103 | "sample_2","Archaea",2157,"Domain_0",1.49,12,12 104 | "sample_2","Escherichia coli",562,"Species_0",9.9,80,80 105 | "sample_2","Carnivora",33554,"Order_0",4.95,40,40 106 | "sample_2","Felidae",9681,"Family_0",4.95,40,40 107 | "sample_2","Canidae",9608,"Family_0",0.0,0,0 108 | "sample_2","Boreoeutheria",1437010,"Class_3",37.13,300,300 109 | "sample_2","Chordata",7711,"Phylum_0",37.13,300,300 110 | "sample_2","Felis",9682,"Genus_0",4.95,40,40 111 | "sample_2","Negarnaviricota",2497569,"Phylum_0",8.17,66,66 112 | "sample_2","Saccharomyces cerevisiae",4932,"Species_0",1.24,10,10 113 | "sample_2","Saccharomyces kudriavzevii",114524,"Species_0",1.86,15,15 114 | "sample_2","Orthomyxoviridae",11308,"Family_0",6.68,54,54 115 | "sample_2","Fungi",4751,"Kingdom_0",8.29,67,67 116 | "sample_2","Gammaproteobacteria",1236,"Class_0",9.9,80,80 117 | "sample_2","Lactobacillales",186826,"Order_0",4.21,34,34 118 | "sample_2","Tursiops",9738,"Genus_0",7.43,60,60 119 | "sample_2","Saccharomycetales",4892,"Order_0",8.29,67,67 120 | "sample_2","Hominidae",9604,"Family_0",24.75,200,200 121 | "sample_2","Lactococcus",1357,"Genus_0",4.21,34,34 122 | "sample_2","Orthornavirae",2732396,"Kingdom_0",8.17,66,66 123 | "sample_2","Eukaryota",2759,"Root_2",45.42,367,367 124 | "sample_2","Influenza B virus",11520,"Species_0",6.68,54,54 125 | "sample_2","Enterobacteriaceae",543,"Family_0",9.9,80,80 126 | "sample_2","Primates",9443,"Order_0",24.75,200,200 127 | "sample_2","Metazoa",33208,"Kingdom_0",37.13,300,300 128 | "sample_2","Crenarchaeota",28889,"Phylum_0",1.49,12,12 129 | "sample_2","Firmicutes",1239,"Phylum_0",4.21,34,34 
130 | "sample_2","Canis lupus",9612,"Species_0",0.0,0,0 131 | "sample_2","Saccharomycetaceae",4893,"Family_0",8.29,67,67 132 | -------------------------------------------------------------------------------- /src/taxonomy.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Display; 2 | 3 | use once_cell::sync::Lazy; 4 | use serde::{Deserialize, Deserializer}; 5 | use std::sync::Mutex; 6 | use tracing::instrument; 7 | 8 | use crate::errors::TaxRankParsingError; 9 | 10 | static LAST_TAXONOMY_RANK_PARSED: Lazy>> = Lazy::new(|| Mutex::new(None)); 11 | 12 | /// Taxonomy levels 13 | /// 14 | /// the u32 offset represents sub-clade (e.g. parvorder, subfamily, etc.) 15 | #[derive(Clone, PartialEq, Debug, PartialOrd, Ord, Eq, Hash, Copy)] 16 | pub enum Rank { 17 | Unclassified(u32), 18 | Root(u32), 19 | Domain(u32), 20 | Kingdom(u32), 21 | Phylum(u32), 22 | Class(u32), 23 | Order(u32), 24 | Family(u32), 25 | Genus(u32), 26 | Species(u32), 27 | } 28 | 29 | // TODO: order D1 as below of any R0..9 30 | impl Display for Rank { 31 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 32 | match self { 33 | Self::Unclassified(i) => write!(f, "Unclassified_{}", i), 34 | Self::Root(i) => write!(f, "Root_{}", i), 35 | Self::Domain(i) => write!(f, "Domain_{}", i), 36 | Self::Kingdom(i) => write!(f, "Kingdom_{}", i), 37 | Self::Phylum(i) => write!(f, "Phylum_{}", i), 38 | Self::Class(i) => write!(f, "Class_{}", i), 39 | Self::Order(i) => write!(f, "Order_{}", i), 40 | Self::Family(i) => write!(f, "Family_{}", i), 41 | Self::Genus(i) => write!(f, "Genus_{}", i), 42 | Self::Species(i) => write!(f, "Species_{}", i), 43 | } 44 | } 45 | } 46 | 47 | impl Rank { 48 | #[must_use] 49 | pub fn plus_one(self) -> Self { 50 | match self { 51 | Self::Unclassified(i) => Self::Unclassified(i.checked_add(1).unwrap()), 52 | Self::Root(i) => Self::Root(i.checked_add(1).unwrap()), 53 | Self::Domain(i) => 
Self::Domain(i.checked_add(1).unwrap()), 54 | Self::Kingdom(i) => Self::Kingdom(i.checked_add(1).unwrap()), 55 | Self::Phylum(i) => Self::Phylum(i.checked_add(1).unwrap()), 56 | Self::Class(i) => Self::Class(i.checked_add(1).unwrap()), 57 | Self::Order(i) => Self::Order(i.checked_add(1).unwrap()), 58 | Self::Family(i) => Self::Family(i.checked_add(1).unwrap()), 59 | Self::Genus(i) => Self::Genus(i.checked_add(1).unwrap()), 60 | Self::Species(i) => Self::Species(i.checked_add(1).unwrap()), 61 | } 62 | } 63 | } 64 | 65 | #[instrument] 66 | pub fn parse_taxonomy_level(string: &str) -> Result { 67 | // TODO: add previous tax rank here, make it purely functional 68 | if string.len() > 2 { 69 | return Err(TaxRankParsingError::InvalidLength( 70 | String::from(string), 71 | string.len(), 72 | )); 73 | } 74 | 75 | let mut string_chars = string.chars(); 76 | 77 | let letter = string_chars.next().unwrap(); 78 | 79 | let offset: u32 = 80 | string_chars 81 | .next() 82 | .map_or(Ok(0_u32), |number| -> Result { 83 | if number.is_ascii_digit() { 84 | Ok(number.to_digit(10_u32).unwrap()) 85 | } else { 86 | Err(TaxRankParsingError::OffsetNotANumber( 87 | String::from(string), 88 | number, 89 | )) 90 | } 91 | })?; 92 | 93 | let tax_rank: Rank = match letter { 94 | 'U' => Ok(Rank::Unclassified(offset)), 95 | 'R' => Ok(Rank::Root(offset)), 96 | 'D' => Ok(Rank::Domain(offset)), 97 | 'K' => Ok(Rank::Kingdom(offset)), 98 | 'P' => Ok(Rank::Phylum(offset)), 99 | 'C' => Ok(Rank::Class(offset)), 100 | 'O' => Ok(Rank::Order(offset)), 101 | 'F' => Ok(Rank::Family(offset)), 102 | 'G' => Ok(Rank::Genus(offset)), 103 | 'S' => Ok(Rank::Species(offset)), 104 | '-' => { 105 | // TODO: there has to be a better way to do that, maybe without the mutex business 106 | (*LAST_TAXONOMY_RANK_PARSED.lock().unwrap()).map_or_else( 107 | || { 108 | Err(TaxRankParsingError::TaxRankParsingCannotInferRank( 109 | String::from(string), 110 | )) 111 | }, 112 | |x| -> Result { Ok(x.plus_one()) }, 113 | ) 114 | } 115 | 
_ => Err(TaxRankParsingError::InvalidRankCode( 116 | String::from(string), 117 | letter, 118 | )), 119 | }?; 120 | 121 | let mut old_tax_rank = LAST_TAXONOMY_RANK_PARSED.lock().unwrap(); 122 | *old_tax_rank = Some(tax_rank); 123 | 124 | Ok(tax_rank) 125 | } 126 | 127 | impl<'de> Deserialize<'de> for Rank { 128 | fn deserialize(deserializer: D) -> Result 129 | where 130 | D: Deserializer<'de>, 131 | { 132 | let string = String::deserialize(deserializer)?; 133 | parse_taxonomy_level(&string).map_err(serde::de::Error::custom) 134 | } 135 | 136 | fn deserialize_in_place(deserializer: D, place: &mut Self) -> Result<(), D::Error> 137 | where 138 | D: Deserializer<'de>, 139 | { 140 | // Default implementation just delegates to `deserialize` impl. 141 | *place = Deserialize::deserialize(deserializer)?; 142 | Ok(()) 143 | } 144 | } 145 | 146 | #[cfg(test)] 147 | mod tests { 148 | use super::*; 149 | use test_case::test_case; 150 | 151 | #[test] 152 | fn test_order_taxonomy() { 153 | assert!(Rank::Domain(0) > Rank::Root(1)); 154 | assert!(Rank::Domain(1) > Rank::Domain(0)) 155 | } 156 | 157 | #[test_case("U", Rank::Unclassified(0); "ok_U")] 158 | #[test_case("U1", Rank::Unclassified(1); "ok_U1")] 159 | #[test_case("R", Rank::Root(0); "ok_R")] 160 | #[test_case("R1", Rank::Root(1); "ok_R1")] 161 | #[test_case("P", Rank::Phylum(0); "ok_P")] 162 | #[test_case("P1", Rank::Phylum(1); "ok_P1")] 163 | #[test_case("C", Rank::Class(0); "ok_C")] 164 | #[test_case("C1", Rank::Class(1); "ok_C1")] 165 | #[test_case("O", Rank::Order(0); "ok_O")] 166 | #[test_case("O1", Rank::Order(1); "ok_O1")] 167 | #[test_case("F", Rank::Family(0); "ok_F")] 168 | #[test_case("F1", Rank::Family(1); "ok_F1")] 169 | #[test_case("S", Rank::Species(0); "ok_S")] 170 | #[test_case("S1", Rank::Species(1); "ok_S1")] 171 | fn test_parse_tax_level(input: &str, expected: Rank) { 172 | pretty_assertions::assert_eq!(parse_taxonomy_level(input).unwrap(), expected); 173 | } 174 | 175 | #[test] 176 | fn test_plus_one() 
{ 177 | pretty_assertions::assert_eq!(Rank::Kingdom(2).plus_one(), Rank::Kingdom(3)) 178 | } 179 | 180 | #[test] 181 | #[should_panic] 182 | fn test_parse_tax_level_error_too_long() { 183 | // TODO: implements Eq on errors (fix csv and io errors first) 184 | parse_taxonomy_level("R11111").unwrap(); 185 | } 186 | 187 | #[test] 188 | #[should_panic] 189 | fn test_parse_tax_level_error_invalid_code() { 190 | // TODO: implements Eq on errors (fix csv and io errors first) 191 | parse_taxonomy_level("L4").unwrap(); 192 | } 193 | 194 | #[test] 195 | #[should_panic] 196 | fn test_parse_tax_level_error_offsetnotanumber() { 197 | // TODO: implements Eq on errors (fix csv and io errors first) 198 | parse_taxonomy_level("RR").unwrap(); 199 | } 200 | 201 | #[test] 202 | #[should_panic] 203 | fn test_parse_tax_level_error_cannotinferprevious() { 204 | // reset 205 | { 206 | let mut old_tax_rank = LAST_TAXONOMY_RANK_PARSED.lock().unwrap(); 207 | *old_tax_rank = None; 208 | } 209 | // TODO: implements Eq on errors (fix csv and io errors first) 210 | parse_taxonomy_level("-").unwrap(); 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Jean Manguy 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/data/tree.rs: -------------------------------------------------------------------------------- 1 | use color_eyre::Report; 2 | use core::convert::TryFrom; 3 | use std::fmt::Display; 4 | 5 | use daggy::{Dag, NodeIndex, Walker}; 6 | use tracing::instrument; 7 | 8 | use crate::{ 9 | errors::SpideogError, 10 | kraken::{ReportRecord, Taxon}, 11 | parser::parse_ident_organism_name, 12 | }; 13 | 14 | #[derive(Debug, Eq, PartialEq, Clone)] 15 | pub struct IndentedTaxon { 16 | pub indent: usize, 17 | pub taxon: Taxon, 18 | } 19 | 20 | impl Display for IndentedTaxon { 21 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 22 | write!(f, "{}", self.taxon) 23 | } 24 | } 25 | 26 | impl IndentedTaxon { 27 | #[must_use] 28 | pub fn inferior_indent(&self, than: &Self) -> bool { 29 | self.indent < than.indent 30 | } 31 | } 32 | 33 | impl TryFrom for IndentedTaxon { 34 | type Error = SpideogError; 35 | 36 | #[instrument] 37 | fn try_from(value: ReportRecord) -> Result { 38 | let (_, (indent, name)) = 
parse_ident_organism_name(value.5.as_bytes())
            .expect("parsing the indented organism-name field cannot fail on a valid record"); // TODO: make error here instead of panicking

        let organism_tree = Taxon {
            taxonomy_level: value.3,
            name: String::from_utf8_lossy(name).trim().to_string(),
            taxonomy_id: value.4,
        };

        let node = Self {
            indent,
            taxon: organism_tree,
        };

        Ok(node)
    }
}

/// Builds a taxonomy tree out of some input `T` (e.g. a parsed report).
///
/// NOTE(review): generic parameters in this file were reconstructed from
/// usage after markup stripping — confirm against the original source.
pub trait TaxonomyTreeReader<T>: Sized {
    fn read(_: T) -> Result<Self, Report>;
}

/// A taxonomy tree backed by a DAG.
///
/// Nodes carry [`IndentedTaxon`]s; the `u32` edge weight counts how many
/// times an edge was observed when combining trees
/// (see [`Tree::try_combine_with`]).
#[derive(Debug, Default)]
pub struct Tree {
    pub tree: Dag<IndentedTaxon, u32>,
    /// Root of the tree; `None` until [`Tree::with_origin`] is called.
    pub origin: Option<NodeIndex>,
    /// Most recently inserted node, used by [`Tree::find_valid_parent_for`].
    pub last_node_added_id: Option<NodeIndex>,
}

impl Tree {
    /// Create an empty, uninitialized tree (no origin, no nodes).
    #[must_use]
    pub fn new() -> Self {
        Self {
            tree: Dag::new(),
            origin: None,
            last_node_added_id: None,
        }
    }

    /// Initialize the tree with its root node.
    pub fn with_origin(&mut self, origin: IndentedTaxon) -> &mut Self {
        let new_node_index = self.tree.add_node(origin);
        self.origin = Some(new_node_index);
        self.last_node_added_id = Some(new_node_index);

        self
    }

    /// Add `node` as a child of `parent` with the default edge weight of 1.
    pub fn child(&mut self, parent: NodeIndex, node: IndentedTaxon) -> &mut Self {
        self.child_with_weight(parent, node, 1)
    }

    /// Add `node` as a child of `parent` with an explicit edge weight.
    pub fn child_with_weight(
        &mut self,
        parent: NodeIndex,
        node: IndentedTaxon,
        weight: u32,
    ) -> &mut Self {
        let (_, new_node_id) = self.tree.add_child(parent, weight, node);
        self.last_node_added_id = Some(new_node_id);

        self
    }

    /// Find a suitable parent for `taxon`: the last node added if it is less
    /// indented than `taxon`, otherwise the nearest less-indented ancestor,
    /// defaulting to the origin.
    ///
    /// # Errors
    ///
    /// Returns [`SpideogError::TreeNotInitialized`] when the tree has no
    /// origin or no previously added node, and [`SpideogError::NodeNotFound`]
    /// when a stored index no longer resolves to a node.
    pub fn find_valid_parent_for(&self, taxon: &IndentedTaxon) -> Result<NodeIndex, SpideogError> {
        // Default value if no closer ancestor qualifies.
        let mut parent_id = self.origin.ok_or(SpideogError::TreeNotInitialized)?;

        let previously_added_node = self
            .last_node_added_id
            .ok_or(SpideogError::TreeNotInitialized)?;

        if self
            .tree
            .node_weight(previously_added_node)
            .ok_or(SpideogError::NodeNotFound)?
            .inferior_indent(taxon)
        {
            // Previously added node is a suitable parent for the next taxon.
            return Ok(previously_added_node);
        }

        // We need to go up the tree to find an adequate parent.
        let mut parent_recursion = self
            .tree
            .recursive_walk(previously_added_node, |g, n| g.parents(n).iter(g).last());

        while let Some((_, node_id)) = parent_recursion.walk_next(&self.tree) {
            let node = self
                .tree
                .node_weight(node_id)
                .ok_or(SpideogError::NodeNotFound)?;

            if node.inferior_indent(taxon) {
                parent_id = node_id;
                break;
            }
        }

        Ok(parent_id)
    }

    /// Index of the node in `self` whose weight equals `weight`, if present.
    fn find_node_index(&self, weight: &IndentedTaxon) -> Option<NodeIndex<u32>> {
        self.tree
            .raw_nodes()
            .iter()
            .position(|node| &node.weight == weight)
            .map(NodeIndex::new)
    }

    /// Merge `rhs` into `self`, edge by edge. Edges present in both trees get
    /// their weight incremented (saturating); edges whose target is new are
    /// grafted under the existing source node.
    ///
    /// # Errors
    ///
    /// Returns [`SpideogError::NodeNotFound`] or [`SpideogError::EdgeNotFound`]
    /// when stored indices no longer resolve.
    ///
    /// # Panics
    ///
    /// Panics when an RHS edge has no endpoint in `self` (disjoint trees), or
    /// when only its target exists (would create a diamond).
    /// TODO: make these errors with more info instead of panicking.
    pub fn try_combine_with(mut self, rhs: Self) -> Result<Self, SpideogError> {
        // An uninitialized tree combines into the other operand unchanged.
        if self.origin.is_none() {
            return Ok(rhs);
        }

        for rhs_edge in rhs.tree.raw_edges() {
            let rhs_edge_source = rhs
                .tree
                .node_weight(rhs_edge.source())
                .ok_or(SpideogError::NodeNotFound)?;
            let rhs_edge_target = rhs
                .tree
                .node_weight(rhs_edge.target())
                .ok_or(SpideogError::NodeNotFound)?;

            let source_in_self = self.find_node_index(rhs_edge_source);
            let target_in_self = self.find_node_index(rhs_edge_target);

            match (source_in_self, target_in_self) {
                (None, None) => {
                    // No common node (not even root).
                    panic!("source and target of an edge in RHS were not found in Self");
                }
                (None, Some(_)) => {
                    // Possible diamond.
                    panic!(
                        "source and edge node in RHS were not found in Self, but target was found"
                    );
                }
                (Some(parent), None) => {
                    // Graft the new target under the already-known source.
                    self.child(parent, rhs_edge_target.clone());
                }
                (Some(s), Some(t)) => {
                    // Edge known on both ends: increment its weight.
                    // FIXME: some issues with different trees, can't found node that exist
                    let original_edge = self.tree.find_edge(s, t).ok_or_else(|| {
                        SpideogError::EdgeNotFound(
                            self.tree.node_weight(s).unwrap().clone(),
                            self.tree.node_weight(t).unwrap().clone(),
                        )
                    })?;

                    // Saturate instead of wrapping the weight back to 1 on
                    // overflow (the old `checked_add(1).unwrap_or(1)` reset
                    // the count on overflow, which loses information).
                    let new_weight = self
                        .tree
                        .edge_weight(original_edge)
                        .unwrap_or(&1_u32)
                        .saturating_add(1);

                    self.tree
                        .update_edge(s, t, new_weight)
                        .expect("updating an existing edge cannot create a cycle");
                }
            }
        }

        Ok(self)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new() {
        let origin = IndentedTaxon {
            indent: 0,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(0),
                name: "root".to_string(),
                taxonomy_id: 0,
            },
        };

        let mut tree = Tree::new();
        tree.with_origin(origin.clone());

        pretty_assertions::assert_eq!(tree.tree.edge_count(), 0);
        pretty_assertions::assert_eq!(tree.tree.node_count(), 1);
        pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(0)), Some(&origin));
    }

    #[test]
    fn test_child() {
        let origin = IndentedTaxon {
            indent: 0,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(0),
                name: "root".to_string(),
                taxonomy_id: 0,
            },
        };

        let child = IndentedTaxon {
            indent: 0,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "child".to_string(),
                taxonomy_id: 1,
            },
        };

        let grand_child = IndentedTaxon {
            indent: 2,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "grand child".to_string(),
                taxonomy_id: 2,
            },
        };

        let mut tree = Tree::new();
        tree.with_origin(origin.clone());
        tree.child(NodeIndex::new(0), child.clone());
        tree.child(NodeIndex::new(1), grand_child);

        pretty_assertions::assert_eq!(tree.tree.edge_count(), 2);
        pretty_assertions::assert_eq!(tree.tree.node_count(), 3);
        pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(0)), Some(&origin));
        pretty_assertions::assert_eq!(tree.tree.node_weight(NodeIndex::new(1)), Some(&child));

        assert!(tree
            .tree
            .find_edge(NodeIndex::new(0), NodeIndex::new(1))
            .is_some());
        assert!(tree
            .tree
            .find_edge(NodeIndex::new(1), NodeIndex::new(2))
            .is_some());

        pretty_assertions::assert_eq!(
            tree.tree
                .parents(NodeIndex::new(2))
                .iter(&tree.tree)
                .next()
                .unwrap()
                .1,
            NodeIndex::new(1)
        )
    }

    #[test]
    fn test_find_valid_parent() {
        let origin = IndentedTaxon {
            indent: 0,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(0),
                name: "root".to_string(),
                taxonomy_id: 0,
            },
        };

        let child = IndentedTaxon {
            indent: 1,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "child".to_string(),
                taxonomy_id: 1,
            },
        };

        let grand_child = IndentedTaxon {
            indent: 2,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "grand child".to_string(),
                taxonomy_id: 2,
            },
        };

        let new_child = IndentedTaxon {
            indent: 2,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(3),
                name: "new_child".to_string(),
                taxonomy_id: 3,
            },
        };

        let new_child_child = IndentedTaxon {
            indent: 3,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(3),
                name: "new_child_child".to_string(),
                taxonomy_id: 4,
            },
        };

        let mut tree = Tree::new();
        tree.with_origin(origin);
        tree.child(NodeIndex::new(0), child);
        tree.child(NodeIndex::new(1), grand_child);

        let parent = tree.find_valid_parent_for(&new_child).unwrap();
        tree.child(parent, new_child);

        let parent = tree.find_valid_parent_for(&new_child_child).unwrap();
        tree.child(parent, new_child_child);

        pretty_assertions::assert_eq!(parent, NodeIndex::new(3));
    }

    #[test]
    fn test_try_combine_with() {
        let origin = IndentedTaxon {
            indent: 0,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(0),
                name: "root".to_string(),
                taxonomy_id: 0,
            },
        };

        let child = IndentedTaxon {
            indent: 1,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "child".to_string(),
                taxonomy_id: 1,
            },
        };

        let second_child = IndentedTaxon {
            indent: 1,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(1),
                name: "second child".to_string(),
                taxonomy_id: 2,
            },
        };

        let grand_child = IndentedTaxon {
            indent: 2,
            taxon: Taxon {
                taxonomy_level: crate::taxonomy::Rank::Root(2),
                name: "grand child".to_string(),
                taxonomy_id: 3,
            },
        };

        let mut tree_1 = Tree::new();
        tree_1.with_origin(origin.clone());
        tree_1.child(NodeIndex::new(0), child.clone());
        tree_1.child(NodeIndex::new(0), second_child.clone());

        let mut tree_2 = Tree::new();
        tree_2.with_origin(origin.clone());
        tree_2.child(NodeIndex::new(0), child.clone());
        tree_2.child(NodeIndex::new(1), grand_child.clone());

        let mut expected_tree = Tree::new();
        expected_tree.with_origin(origin);
        expected_tree.child_with_weight(NodeIndex::new(0), child, 2);
        expected_tree.child(NodeIndex::new(0), second_child);
        expected_tree.child(NodeIndex::new(1), grand_child);

        let combined_tree = tree_1.try_combine_with(tree_2).unwrap();

        pretty_assertions::assert_eq!(
            combined_tree.tree.edge_count(),
            expected_tree.tree.edge_count()
        );

        pretty_assertions::assert_eq!(
            combined_tree.tree.node_count(),
            expected_tree.tree.node_count()
        );

        pretty_assertions::assert_eq!(
            combined_tree
                .tree
                .raw_edges()
                .iter()
                .map(|e| e.weight)
                .collect::<Vec<_>>(),
            expected_tree
                .tree
                .raw_edges()
                .iter()
                .map(|e| e.weight)
                .collect::<Vec<_>>()
        );
    }
}