├── .gitignore ├── docs ├── images │ └── logo.png ├── releases.md └── index.md ├── mkdocs.yaml ├── Cargo.toml ├── scripts ├── README.md ├── species.py └── metrics.py ├── LICENSE ├── src ├── main.rs ├── cli.rs └── sketchy.rs ├── .github └── workflows │ ├── rust-ci.yaml │ └── release.yaml ├── README.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | site/ 3 | test_data/ 4 | 5 | .idea/ 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/esteinig/sketchy/HEAD/docs/images/logo.png -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- 1 | 2 | # Release binaries 3 | 4 | Binary executables for releases of the Rust client are available for Linux and MacOS. 5 | 6 | Configure environment variables. 7 | 8 | ``` 9 | VERSION=0.6.0 10 | GITHUB=https://github.com/esteinig/sketchy/releases/download 11 | ``` 12 | 13 | Download release binaries (Linux). 14 | 15 | ``` 16 | TAR=sketchy-${VERSION}-x86_64-unknown-linux-musl.tar.gz 17 | wget ${GITHUB}/${VERSION}/${TAR} 18 | 19 | tar xf $TAR 20 | ``` 21 | 22 | Download release binaries (MacOS). 23 | 24 | ``` 25 | TAR=sketchy-${VERSION}-x86_64-apple-darwin.tar.gz 26 | wget ${GITHUB}/${VERSION}/${TAR} 27 | tar xf $TAR 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- 1 | site_name: Sketchy 2 | theme: 3 | name: material 4 | logo: images/logo.png 5 | palette: 6 | primary: black 7 | accent: white 8 | repo_name: esteinig/sketchy 9 | repo_url: https://github.com/esteinig/sketchy 10 | nav: 11 | - Home: index.md 12 | - Releases: releases.md 13 | markdown_extensions: 14 | - toc: 15 | permalink: true 16 | - markdown.extensions.codehilite: 17 | guess_lang: false 18 | - admonition 19 | - codehilite 20 | - extra 21 | - pymdownx.superfences: 22 | custom_fences: 23 | - name: mermaid 24 | class: mermaid 25 | format: !!python/name:pymdownx.superfences.fence_div_format '' 26 | - pymdownx.tabbed 27 | extra_javascript: 28 | - https://unpkg.com/mermaid@8.4.6/dist/mermaid.min.js -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sketchy-rs" 3 | version = "0.6.0" 4 | authors = ["esteinig "] 5 | description = "Rust command line client for Sketchy" 6 | documentation = "https://github.com/esteinig/sketchy" 7 | homepage = "https://github.com/esteinig/sketchy" 8 | repository = "https://github.com/esteinig/sketchy" 9 | readme = "README.md" 10 | keywords = ["sketchy", "nanopore", "gnt", "mash", "streaming"] 11 | categories = ["science"] 12 | license = "MIT" 13 | edition = "2018" 14 | include = [ 15 | "**/*.rs", 16 | "src/data/*", 17 | "Cargo.toml" 18 | ] 19 | 20 | [dependencies] 21 | anyhow = "1.0" 22 | structopt = "0.3" 23 | clap = "2.33.0" 24 | finch = "0.4.1" 25 | rayon = "1.5.1" 26 | needletail = "0.4.1" 27 | thiserror = "1.0" 28 | csv = "1.1" 29 | 30 | [[bin]] 31 | name = "sketchy" 32 | path = "src/main.rs" -------------------------------------------------------------------------------- /scripts/README.md: 
-------------------------------------------------------------------------------- 1 | # Extracting validation data from the Blackwell collection 2 | 3 | This process extracts the MLST validation data from the Blackwell collection as outlined in the manuscript. 4 | 5 | 1. Download the meta-data JSON (https://figshare.com/ndownloader/files/26578377) 6 | 2. Run the `species.py` application (`python species.py --help`) on the JSON file, this will produce a summary of the meta-data including MLST (`species_data.tsv`), a file for all FTP addresses (`species_ftp.tsv`) and count data for all species (Bracken, `species_counts.tsv`) 7 | 3. Download the assemblies from the FTP addresses for a species of interest, these can be used for sketch construction. Genotype files can be constructed from subsets of the `species_data.tsv` file for the assemblies included in the sketch. 8 | 4. You may want to run a check on the order and congruence between sketch and genotype file. Commands for sketch construction are outlined in the `local sketches` and `genotype files` sections of the [documentation](https://github.com/esteinig/sketchy/blob/master/docs/index.md). 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Eike Steinig 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use crate::cli::Cli; 2 | use crate::cli::Commands::{Check, Info, Predict, Shared, Sketch}; 3 | use crate::sketchy::{PredictConfig, Sketchy}; 4 | use anyhow::Result; 5 | use structopt::StructOpt; 6 | 7 | mod cli; 8 | mod sketchy; 9 | 10 | /// Sketchy application 11 | /// 12 | /// Run the application from arguments provided 13 | /// by the command line interface 14 | /// 15 | /// Hash seed by default is 0; for hashing to 16 | /// replicate Mash, seed must be 42. 
17 | fn main() -> Result<()> { 18 | let args = Cli::from_args(); 19 | let sketchy = Sketchy::new(); 20 | 21 | // Conduct a consensus check when using 22 | 23 | match args.commands { 24 | Sketch { 25 | input, 26 | output, 27 | sketch_size, 28 | kmer_size, 29 | scale, 30 | seed, 31 | } => { 32 | sketchy.sketch(input, output, sketch_size, kmer_size, seed, scale)?; 33 | } 34 | Info { input, params } => { 35 | sketchy.info(input, params)?; 36 | } 37 | Shared { reference, query } => { 38 | sketchy.shared(reference, query)?; 39 | } 40 | Predict { 41 | input, 42 | reference, 43 | genotypes, 44 | top, 45 | limit, 46 | stream, 47 | consensus, 48 | header, 49 | } => { 50 | let config = PredictConfig { 51 | top, 52 | limit, 53 | stream, 54 | consensus, 55 | header, 56 | }; 57 | sketchy.predict(input, reference, genotypes, config)?; 58 | } 59 | Check { 60 | reference, 61 | genotypes, 62 | } => { 63 | sketchy.check(reference, genotypes)?; 64 | } 65 | } 66 | 67 | Ok(()) 68 | } 69 | -------------------------------------------------------------------------------- /.github/workflows/rust-ci.yaml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths-ignore: 8 | - 'README.md' 9 | pull_request: 10 | branches: 11 | - master 12 | paths-ignore: 13 | - 'README.md' 14 | 15 | env: 16 | CARGO_TERM_COLOR: always 17 | 18 | jobs: 19 | check: 20 | name: Check Rust version ${{ matrix.rust }} on OS ${{ matrix.os }} 21 | strategy: 22 | matrix: 23 | os: [ubuntu-latest, macos-latest] 24 | rust: 25 | - stable 26 | - 1.53.0 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - name: Checkout sources 30 | uses: actions/checkout@v2 31 | 32 | - name: Install toolchain 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: ${{ matrix.rust }} 36 | override: true 37 | 38 | - uses: actions/cache@v2 39 | with: 40 | path: | 41 | ~/.cargo/registry 42 | ~/.cargo/git 43 | target 44 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 45 | 46 | - name: Run cargo check 47 | uses: actions-rs/cargo@v1 48 | with: 49 | command: check 50 | 51 | test: 52 | name: Test Rust version ${{ matrix.rust }} on OS ${{ matrix.os }} 53 | strategy: 54 | matrix: 55 | os: [ubuntu-latest, macos-latest] 56 | rust: 57 | - stable 58 | - 1.53.0 59 | runs-on: ${{ matrix.os }} 60 | steps: 61 | - name: Checkout sources 62 | uses: actions/checkout@v2 63 | 64 | - name: Install toolchain 65 | uses: actions-rs/toolchain@v1 66 | with: 67 | toolchain: ${{ matrix.rust }} 68 | override: true 69 | 70 | - uses: actions/cache@v2 71 | with: 72 | path: | 73 | ~/.cargo/registry 74 | ~/.cargo/git 75 | target 76 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 77 | 78 | - name: Run cargo test 79 | uses: actions-rs/cargo@v1 80 | with: 81 | command: test 82 | args: -v --all-targets --no-fail-fast 83 | 84 | fmt: 85 | name: Rustfmt 86 | runs-on: ubuntu-latest 87 | strategy: 88 | matrix: 89 | rust: 90 | - stable 91 | steps: 92 | - name: Checkout sources 93 | uses: actions/checkout@v2 94 | 95 | - name: Install toolchain 96 | uses: actions-rs/toolchain@v1 97 | with: 98 | toolchain: ${{ matrix.rust }} 99 | override: true 100 | 101 | - name: Install rustfmt 102 | run: rustup component add rustfmt 103 | 104 | - name: Run cargo fmt 105 | uses: actions-rs/cargo@v1 106 | with: 107 | command: fmt 108 | args: --all -- --check 109 | 110 | clippy: 111 | name: Clippy 112 | runs-on: ubuntu-latest 113 | strategy: 114 | matrix: 115 | rust: 116 | - stable 117 | steps: 118 | - 
name: Checkout sources 119 | uses: actions/checkout@v2 120 | 121 | - name: Install toolchain 122 | uses: actions-rs/toolchain@v1 123 | with: 124 | toolchain: ${{ matrix.rust }} 125 | override: true 126 | 127 | - name: Install clippy 128 | run: rustup component add clippy 129 | 130 | - name: Run cargo clippy 131 | uses: actions-rs/cargo@v1 132 | with: 133 | command: clippy 134 | args: --all-features --all-targets -- -D warnings -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sketchy 2 | 3 | ![](https://img.shields.io/badge/lang-rust-black.svg) 4 | ![](https://img.shields.io/badge/version-0.6.0-green.svg) 5 | ![](https://img.shields.io/badge/preprint-0.12.0-green.svg) 6 | 7 | Genomic neighbor typing for lineage and genotype inference 8 | 9 | ## Overview 10 | 11 | **`v0.6.0`** 12 | 13 | `Sketchy` is a lineage calling and genotyping tool based on the heuristic principle of genomic neighbor typing developed by [Karel Břinda and colleagues (2020)](https://www.biorxiv.org/content/10.1101/403204v2). It queries species-wide ('hypothesis-agnostic') reference sketches using MinHash and infers associated genotypes based on the closest match, including multi-locus sequence types, susceptibility profiles, virulence factors or other genome-associated features provided by the user. Unlike the original implementation in [`RASE`](https://github.com/c2-d2/rase-pipeline), `sketchy` does not use phylogenetic trees which has some downsides, e.g. for sublineage genotype predictions (see below). 14 | 15 | See the [latest docs](https://esteinig.github.io/sketchy) for install, usage and database building. 16 | 17 | ## Install 18 | 19 | Cargo: 20 | 21 | ``` 22 | cargo install sketchy 23 | ``` 24 | 25 | BioConda: 26 | 27 | ``` 28 | conda install -c bioconda sketchy 29 | ``` 30 | 31 | [Release binaries](https://github.com/esteinig/sketchy/releases) available for download. Reference sketches can be constructed from local [assembly and genotype collections](https://esteinig.github.io/sketchy/#local-sketches). *S. aureus* reference sketches are available in the data availability section below. 32 | 33 | ## Strengths and limitations 34 | 35 | 36 | * Reference sketches and genotype indices can be constructed easily from large genotype collections 37 | * `Sketchy` requires few resources when using small sketch sizes (`s = 1000`) 38 | * `Sketchy` performs best on lineage predictions and lineage-wide genotypes from very few reads - we found that tens to hundreds of reads can often give a good idea of the close matches in the reference sketch (especially when inspecting the top matches using `--top`) 39 | 40 | However: 41 | 42 | * Clade-specific genotype resolution is not as good as when using phylogenetic guide trees (`RASE`) 43 | * Sketch size can be increased to increase performance (`s = 10000`), but resources scale approximately linearly 44 | * `Sketchy` genotype inference may be difficult for species with high rates of homologous recombination 45 | 46 | ## Data availability 47 | 48 | * Reference sketches and genotype files (`s = 1000`, `s = 10000`, `k = 16`) for [*S. aureus*](https://cloudstor.aarnet.edu.au/plus/s/3EBgvXi6sVHW8Ne) (full genotypes including susceptibility predictions and other genotypes), *S. pneumoniae*, *K. pneumoniae*, *P. aeruginosa* and *Neisseria spp.* (MLST) can be found in the [data repository](https://cloudstor.aarnet.edu.au/plus/s/rL0RHYunqhRK3i1). 
49 | * Reference sketches for cross-validation on the simulated species data can be found in this [data repository](https://cloudstor.aarnet.edu.au/plus/s/7ICPoSru6s6EHNY); genome assemblies for all species extracted from the ENA reference collection are available in this [data repository](https://cloudstor.aarnet.edu.au/plus/s/Td3ahBCPP2YAhCU). 50 | * Scripts to extract data from the ENA collection of [Grace Blackwell et al.](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001421) and compute reference metrics can be found in the [scripts directory](scripts/). 51 | * Nanopore reads for the outbreak isolates and genotype surveillance panels in Papua New Guinea (Flongle, Goroka, sequential protocol) are available for download in the [data repository](https://cloudstor.aarnet.edu.au/plus/s/MFkirfq1N6uIosc). Raw sequence data (Illumina / ONT) is being uploaded to NCBI (PRJNA657380). 52 | 53 | ## Preprint 54 | 55 | If you use `sketchy` for research and other applications, please cite: 56 | 57 | > Steinig et al. (2022) - Genomic neighbor typing for bacterial outbreak surveillance - bioRxiv 2022.02.05.479210; doi: https://doi.org/10.1101/2022.02.05.479210 58 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::OsStr; 2 | use std::{ffi::OsString, path::PathBuf}; 3 | use structopt::StructOpt; 4 | use thiserror::Error; 5 | 6 | #[derive(Error, Debug)] 7 | pub enum CliError { 8 | #[error("Scale parameter must be between 0 and 1")] 9 | InvalidScaleRange, 10 | #[error("Scale parameter must be a float")] 11 | InvalidScaleFloat, 12 | } 13 | 14 | /// Bacterial genomic neighbor typing using MinHash 15 | #[derive(Debug, StructOpt)] 16 | #[structopt(name = "sketchy")] 17 | pub struct Cli { 18 | #[structopt(subcommand)] 19 | pub commands: Commands, 20 | } 21 | 22 | #[derive(Debug, StructOpt)] 23 | pub enum Commands { 24 | /// Create a sketch from input sequences 25 | Sketch { 26 | /// Fast{a,q}.{gz,xz,bz}, stdin if not present 27 | #[structopt(short, long, parse(from_os_str), multiple = true)] 28 | input: Option<Vec<PathBuf>>, 29 | /// Output sketch file path. 30 | #[structopt(short, long, parse(from_os_str), required = true)] 31 | output: PathBuf, 32 | /// Sketch size. 33 | #[structopt(short, long, default_value = "1000")] 34 | sketch_size: usize, 35 | /// K-mer size. 36 | #[structopt(short = "k", long, default_value = "16")] 37 | kmer_size: u8, 38 | /// Hash scaler for finch format. 39 | #[structopt( 40 | short = "c", 41 | long, 42 | parse(try_from_str = check_scale_limits), 43 | default_value = "0.001" 44 | )] 45 | scale: f64, 46 | /// Seed for hashing k-mers. 47 | #[structopt(short = "e", long, default_value = "0")] 48 | seed: u64, 49 | }, 50 | /// List sketch genome order, sketch build parameters 51 | Info { 52 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 53 | #[structopt( 54 | short, 55 | long, 56 | parse(try_from_os_str = check_file_exists) 57 | )] 58 | input: PathBuf, 59 | /// Display the sketch build parameters. 
60 | #[structopt(short, long)] 61 | params: bool, 62 | }, 63 | 64 | /// Check match between sketch and genotype file 65 | Check { 66 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 67 | #[structopt( 68 | short, 69 | long, 70 | parse(try_from_os_str = check_file_exists) 71 | )] 72 | reference: PathBuf, 73 | /// Genotype file to validate with sketch file 74 | #[structopt( 75 | short, 76 | long, 77 | parse(try_from_os_str = check_file_exists) 78 | )] 79 | genotypes: PathBuf, 80 | }, 81 | /// Compute shared hashes between two sketches 82 | Shared { 83 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 84 | #[structopt( 85 | short, 86 | long, 87 | parse(try_from_os_str = check_file_exists) 88 | )] 89 | reference: PathBuf, 90 | /// Sketch file, matching format: Mash (.msh) or Finch (.fsh) 91 | #[structopt(short, long)] 92 | query: PathBuf, 93 | }, 94 | /// Predict genotypes from reads or read streams 95 | Predict { 96 | /// Fast{a,q}.{gz,xz,bz}, stdin if not present. 97 | #[structopt( 98 | short, 99 | long, 100 | parse(try_from_os_str = check_file_exists) 101 | )] 102 | input: Option<PathBuf>, 103 | /// Reference sketch, Mash (.msh) or Finch (.fsh) 104 | #[structopt( 105 | short, 106 | long, 107 | parse(try_from_os_str = check_file_exists) 108 | )] 109 | reference: PathBuf, 110 | /// Reference genotype table (.tsv) 111 | #[structopt( 112 | short, 113 | long, 114 | parse(try_from_os_str = check_file_exists) 115 | )] 116 | genotypes: PathBuf, 117 | /// Number of top ranked predictions to output 118 | #[structopt(short, long, default_value = "1")] 119 | top: usize, 120 | /// Number of reads to process, all reads by default 121 | #[structopt(short, long, default_value = "0")] 122 | limit: usize, 123 | /// Output the sum of shared hashes per read 124 | #[structopt(short, long)] 125 | stream: bool, 126 | /// Consensus prediction over top feature values 127 | #[structopt(short, long)] 128 | consensus: bool, 129 | /// Header added to output based on genotype file 130 | #[structopt(short = "H", long)] 131 | header: bool, 132 | }, 133 | } 134 | 135 | fn check_scale_limits(scale: &str) -> Result<f64, CliError> { 136 | match scale.parse::<f64>() { 137 | Ok(x) => match x { 138 | x if (0.0..=1.0).contains(&x) => Ok(x), 139 | _ => Err(CliError::InvalidScaleRange), 140 | }, 141 | _ => Err(CliError::InvalidScaleFloat), 142 | } 143 | } 144 | 145 | fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> { 146 | let path = PathBuf::from(file); 147 | if path.exists() { 148 | Ok(path) 149 | } else { 150 | Err(OsString::from(format!("{:?} does not exist", path))) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | `Sketchy` is a nanopore lineage calling and genotyping tool based on the heuristic principle of [genomic neighbor typing (Břinda et al. 2020)](https://www.biorxiv.org/content/10.1101/403204v2). `Sketchy` queries species-wide (hypothesis-agnostic) reference sketches using MinHash methods to infer genotypes based on the closest reference match. Reference databases and genotypes, such as multi-locus sequence types, susceptibility profiles, or virulence factors, are configurable by users. 4 | 5 | ## Install 6 | 7 | Install the Rust client with Cargo. 8 | 9 | ``` 10 | $ cargo install sketchy 11 | ``` 12 | Or install from BioConda. 13 | ``` 14 | $ conda install -c bioconda sketchy 15 | ``` 16 | 17 | [Release binaries](releases.md) for Linux and MacOS are available.
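Before the command reference below, here is a minimal, illustrative sketch of the genomic neighbor typing idea described in the overview: build a bottom-`s` MinHash sketch for each reference genome, count the hashes it shares with a sketch of the query reads, and report the genotype of the closest match. This is not the `sketchy` implementation (the Rust client builds sketches with the `finch` library and supports Mash-compatible hashing); the sequences, genotypes and the `md5`-based hashing below are stand-ins for illustration only.

```python
# Minimal illustration of genomic neighbor typing (not the sketchy implementation):
# bottom-s MinHash sketches per genome, shared hash counts against a read sketch,
# and the genotype of the closest reference genome as the prediction.
import hashlib


def kmers(seq: str, k: int = 16):
    return (seq[i:i + k] for i in range(len(seq) - k + 1))


def bottom_sketch(seq: str, s: int = 1000, k: int = 16) -> set:
    # Hash every k-mer and keep the s smallest hash values (a bottom-s sketch)
    hashes = sorted(int(hashlib.md5(kmer.encode()).hexdigest(), 16) for kmer in kmers(seq, k))
    return set(hashes[:s])


def shared_hashes(query: set, reference: set) -> int:
    return len(query & reference)


# Hypothetical reference collection: genome sequence and an associated genotype
references = {"ERR129347.fa": ("ACGT" * 500, "ST82"), "ERR121347.fa": ("TTGCA" * 400, "ST93")}
ref_sketches = {name: bottom_sketch(seq) for name, (seq, _) in references.items()}

reads = "ACGTACGTACGTACGTACGTACGTACGT"  # stand-in for a set of nanopore reads
read_sketch = bottom_sketch(reads)

best = max(ref_sketches, key=lambda name: shared_hashes(read_sketch, ref_sketches[name]))
print(best, references[best][1], shared_hashes(read_sketch, ref_sketches[best]))
```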
18 | 19 | 20 | ## Subcommands 21 | 22 | Show the subcommands available in the Rust client. 23 | 24 | ``` 25 | $ sketchy --help 26 | ``` 27 | 28 | ## Predictions 29 | 30 | `Sketchy` predicts the genotype of the genome with the highest number of shared hashes in the reference sketch. Two modes are available: 31 | 32 | 1. **Offline (read sets)**: k-mers are hashed and shared hashes are computed from a complete set of reads. The output lists the `--top` genome matches and their genotypes. 33 | 2. **Online (streaming)**: k-mers are hashed for each incoming read, the sum of shared hashes for each genome in the reference sketch is updated, and the `--top` genome matches and their genotypes, based on the sorted sums of shared hashes at the current read, are printed (a minimal sketch of this update loop is shown below, after the genotype file section). 34 | 35 | Sums of shared hashes from streaming are slightly less informative than shared hashes computed from complete read sets. Streaming requires less memory, but can be slow for large read sets, especially when deploying large reference sketches. 36 | 37 | Predictions require the input sequences (`-i`), a reference sketch (`-r`) and a matching genotype file (`-g`) as described below ([genotype files](#genotype-files)). 38 | 39 | ### Read sets 40 | 41 | Output the best 5 matches against the reference sketch with a header. 42 | 43 | ```console 44 | $ sketchy predict -i seq.fq -r saureus.msh -g saureus.tsv -t 5 -H 45 | ``` 46 | 47 | ### Streaming 48 | 49 | Output the updated best match against the reference sketch from a stream of reads. 50 | 51 | ```console 52 | $ cat seq.fq | sketchy predict -r saureus.msh -g saureus.tsv -s 53 | ``` 54 | 55 | ## Sketches 56 | 57 | ### Species sketches 58 | 59 | Species sketches are available in the [data repository]() and can be built from local reference assembly collections. 60 | 61 | ### Local sketches 62 | 63 | Reference sketches can be built from any collection of assembled genomes for which associated genotype or phenotype data are available. 64 | 65 | Build a default-resolution (`s = 1000`) reference database from any collection of `fasta` files. 66 | 67 | ``` 68 | $ sketchy sketch -i *.fa -k 16 -s 1000 -o ref.msh 69 | ``` 70 | 71 | You can pipe assemblies into the sketch construction using `find`, as wildcard expansions are limited to ~30,000 files. 72 | 73 | ``` 74 | $ find assemblies/ -name "*.fa" | sketchy sketch -k 16 -s 1000 -o ref.msh 75 | ``` 76 | 77 | List the sketch parameters. 78 | 79 | ``` 80 | $ sketchy info -i ref.msh -p 81 | ``` 82 | 83 | ### Genotype files 84 | 85 | Prediction requires a **tab-delimited** genotype index **in the same order and of the same length** as the reference sketch. Names in the genotype index (first column) are the file names of the input genomes used to build the sketch. 86 | 87 | ``` 88 | name mlst tetracycline penicillin methicillin 89 | ERR129347.fa ST82 R R S 90 | ERR121347.fa ST93 S S S 91 | ``` 92 | 93 | List the order of genomes in the sketch, their length (bp) and an estimate of cardinality (bp). 94 | 95 | ``` 96 | $ sketchy info -i ref.msh 97 | ``` 98 | 99 | Check that the genotype file contains the same number and order of genomes as the sketch. 100 | 101 | ``` 102 | $ sketchy check -r ref.msh -g ref.tsv 103 | ``` 104 | 105 | This will output `ok` to `stdout` if the check completes successfully and fail with an error message otherwise.
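To illustrate what the congruence check above verifies, here is a minimal, hypothetical sketch: the genotype table must contain exactly one row per genome in the reference sketch, in the same order, with the first column holding the genome file names. The helper below only mirrors the idea behind `sketchy check`; it is not the actual implementation, and the file names are made up.

```python
# Minimal, hypothetical congruence check between the genome order of a sketch and
# a tab-delimited genotype table (mirrors the idea behind `sketchy check`).
import csv
from typing import List


def check_genotype_index(sketch_names: List[str], genotype_tsv: str) -> str:
    with open(genotype_tsv) as handle:
        rows = list(csv.reader(handle, delimiter="\t"))
    names = [row[0] for row in rows[1:]]  # skip header; first column = genome file name
    if len(names) != len(sketch_names):
        raise ValueError(f"{len(names)} genotype rows but {len(sketch_names)} genomes in sketch")
    for i, (genotype_name, sketch_name) in enumerate(zip(names, sketch_names)):
        if genotype_name != sketch_name:
            raise ValueError(f"Order mismatch at row {i}: {genotype_name} != {sketch_name}")
    return "ok"


# Example with the two hypothetical genomes from the genotype table above:
# check_genotype_index(["ERR129347.fa", "ERR121347.fa"], "ref.tsv")
```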
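The streaming mode described in the predictions section above keeps a running sum of shared hashes for every genome in the reference sketch and re-ranks the genomes after each read. A minimal, self-contained sketch of that update loop follows; the hash sets and genome names are made up for illustration, and this is not the Rust client's implementation.

```python
# Minimal sketch of the streaming update loop: after each read, add the read's
# shared hashes to a running total per reference genome and re-rank the genomes.
# Hash values here are made up; in practice they come from MinHash sketches.
from collections import Counter
from typing import Dict, List, Set


def rank_streaming(reads: List[Set[int]], ref_sketches: Dict[str, Set[int]], top: int = 3):
    totals: Counter = Counter({name: 0 for name in ref_sketches})
    ranked: List = []
    for i, read_hashes in enumerate(reads, start=1):
        for name, sketch in ref_sketches.items():
            totals[name] += len(read_hashes & sketch)  # shared hashes for this read
        ranked = totals.most_common(top)               # current top genome matches
        print(f"read {i}:", ranked)
    return ranked


# Toy example with two reference genomes and two reads
refs = {"genome1.fa": {1, 2, 3, 4}, "genome2.fa": {3, 4, 5, 6}}
rank_streaming([{1, 2, 9}, {3, 4, 5}], refs, top=2)
```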
106 | 107 | ## Sketch validation 108 | 109 | We conducted simulations using `badread` of the following species: 110 | 111 | * *Neisseria spp.* 112 | * *Streptococcus pneumoniae* 113 | * *Klebsiella pneumoniae* 114 | * *Staphylococcus aureus* 115 | * *Pseudomonas aeruginosa* 116 | 117 | Cross-validation sketches (using subsampling of reference assemblies) and read data are available in the [data repository](). 118 | 119 | ## Other 120 | 121 | 122 | ### Shared hashes 123 | 124 | Given two assembled genome sequences, create a sketch at default k-mer size of `k = 16` and sketch size of `s = 1000`. `Mash` configuration can be replicated by setting `--seed 42`. 125 | 126 | Sketch two genome assemblies with identical settings. 127 | 128 | ``` 129 | $ sketchy sketch -i genome1.fa -o genome1.msh 130 | $ sketchy sketch -i genome2.fa -o genome2.msh 131 | ``` 132 | 133 | Compute shared hashes between the reference and query genomes. 134 | 135 | ``` 136 | $ sketchy shared -r genome1.msh -q genome2.msh 137 | 138 | > genome1.fa genome2.fa 360 139 | ``` 140 | 141 | 142 | If multiple sketches are available compute pairwise shared hashes. 143 | 144 | ``` 145 | $ sketchy sketch -i genome1.fa genome2.fa -o multi.msh 146 | $ sketchy shared -r multi.msh -q genome2.msh 147 | 148 | > genome1.fa genome2.fa 360 149 | > genome2.fa genome2.fa 1000 150 | ``` 151 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "[0-9]+.[0-9]+.[0-9]+" 7 | 8 | env: 9 | CICD_INTERMEDIATES_DIR: "_release-intermediates" 10 | 11 | jobs: 12 | publish: 13 | name: Publish 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout sources 17 | uses: actions/checkout@v2 18 | 19 | - name: Install stable toolchain 20 | uses: actions-rs/toolchain@v1 21 | with: 22 | profile: minimal 23 | toolchain: stable 24 | override: true 25 | 26 | - name: Run `cargo publish` - upload to crates.io 27 | uses: actions-rs/cargo@v1 28 | env: 29 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 31 | with: 32 | command: publish 33 | 34 | upload: 35 | name: ${{ matrix.job.os }} (${{ matrix.job.target }}) 36 | runs-on: ${{ matrix.job.os }} 37 | strategy: 38 | fail-fast: false 39 | matrix: 40 | job: 41 | - { os: ubuntu-latest, target: x86_64-unknown-linux-musl, use-cross: true } 42 | - { os: macos-latest, target: x86_64-apple-darwin, use-cross: true } 43 | steps: 44 | - name: Checkout source code 45 | uses: actions/checkout@v2 46 | 47 | - name: Extract crate information 48 | shell: bash 49 | run: | 50 | echo "PROJECT_NAME=sketchy" >> $GITHUB_ENV 51 | echo "PROJECT_VERSION=$(sed -n 's/^version = "\(.*\)"/\1/p' Cargo.toml | head -n1)" >> $GITHUB_ENV 52 | echo "PROJECT_MAINTAINER=$(sed -n 's/^authors = \["\(.*\)"\]/\1/p' Cargo.toml)" >> $GITHUB_ENV 53 | echo "PROJECT_HOMEPAGE=$(sed -n 's/^homepage = "\(.*\)"/\1/p' Cargo.toml)" >> $GITHUB_ENV 54 | - name: Install Rust toolchain 55 | uses: actions-rs/toolchain@v1 56 | with: 57 | toolchain: stable 58 | target: ${{ matrix.job.target }} 59 | override: true 60 | profile: minimal # minimal component installation (ie, no documentation) 61 | 62 | - name: Show version information (Rust, cargo, GCC) 63 | shell: bash 64 | run: | 65 | gcc --version || true 66 | rustup -V 67 | rustup toolchain list 68 | rustup default 69 | cargo -V 70 | rustc -V 71 | - name: Build 72 | uses: 
actions-rs/cargo@v1 73 | with: 74 | use-cross: ${{ matrix.job.use-cross }} 75 | command: build 76 | args: --release --target=${{ matrix.job.target }} 77 | 78 | - name: Strip debug information from executable 79 | id: strip 80 | shell: bash 81 | run: | 82 | # Figure out suffix of binary 83 | EXE_suffix="" 84 | # Figure out what strip tool to use if any 85 | STRIP="strip" 86 | # Setup paths 87 | BIN_DIR="${{ env.CICD_INTERMEDIATES_DIR }}/stripped-release-bin/" 88 | mkdir -p "${BIN_DIR}" 89 | BIN_NAME="${{ env.PROJECT_NAME }}${EXE_suffix}" 90 | BIN_PATH="${BIN_DIR}/${BIN_NAME}" 91 | # Copy the release build binary to the result location 92 | cp "target/${{ matrix.job.target }}/release/${BIN_NAME}" "${BIN_DIR}" 93 | # Also strip if possible 94 | if [ -n "${STRIP}" ]; then 95 | "${STRIP}" "${BIN_PATH}" 96 | fi 97 | # Let subsequent steps know where to find the (stripped) bin 98 | echo ::set-output name=BIN_PATH::${BIN_PATH} 99 | echo ::set-output name=BIN_NAME::${BIN_NAME} 100 | - name: Set testing options 101 | id: test-options 102 | shell: bash 103 | run: | 104 | unset CARGO_TEST_OPTIONS 105 | unset CARGO_TEST_OPTIONS ; case ${{ matrix.job.target }} in arm-* | aarch64-*) CARGO_TEST_OPTIONS="--bin ${PROJECT_NAME}" ;; esac; 106 | echo ::set-output name=CARGO_TEST_OPTIONS::${CARGO_TEST_OPTIONS} 107 | - name: Run tests 108 | uses: actions-rs/cargo@v1 109 | with: 110 | use-cross: ${{ matrix.job.use-cross }} 111 | command: test 112 | args: --target=${{ matrix.job.target }} ${{ steps.test-options.outputs.CARGO_TEST_OPTIONS}} 113 | 114 | - name: Create tarball 115 | id: package 116 | shell: bash 117 | run: | 118 | PKG_suffix=".tar.gz" ; case ${{ matrix.job.target }} in *-pc-windows-*) PKG_suffix=".zip" ;; esac; 119 | PKG_BASENAME=${PROJECT_NAME}-${PROJECT_VERSION}-${{ matrix.job.target }} 120 | PKG_NAME=${PKG_BASENAME}${PKG_suffix} 121 | echo ::set-output name=PKG_NAME::${PKG_NAME} 122 | PKG_STAGING="${{ env.CICD_INTERMEDIATES_DIR }}/package" 123 | ARCHIVE_DIR="${PKG_STAGING}/${PKG_BASENAME}/" 124 | mkdir -p "${ARCHIVE_DIR}" 125 | # Binary 126 | cp "${{ steps.strip.outputs.BIN_PATH }}" "$ARCHIVE_DIR" 127 | # README, LICENSE files 128 | cp "README.md" "LICENSE" "$ARCHIVE_DIR" 129 | # base compressed package 130 | pushd "${PKG_STAGING}/" >/dev/null 131 | case ${{ matrix.job.target }} in 132 | *-pc-windows-*) 7z -y a "${PKG_NAME}" "${PKG_BASENAME}"/* | tail -2 ;; 133 | *) tar czf "${PKG_NAME}" "${PKG_BASENAME}"/* ;; 134 | esac; 135 | popd >/dev/null 136 | # Let subsequent steps know where to find the compressed package 137 | echo ::set-output name=PKG_PATH::"${PKG_STAGING}/${PKG_NAME}" 138 | - name: "Artifact upload: tarball" 139 | uses: actions/upload-artifact@master 140 | with: 141 | name: ${{ steps.package.outputs.PKG_NAME }} 142 | path: ${{ steps.package.outputs.PKG_PATH }} 143 | 144 | - name: Check for release 145 | id: is-release 146 | shell: bash 147 | run: | 148 | unset IS_RELEASE ; if [[ $GITHUB_REF =~ ^refs/tags/[0-9].* ]]; then IS_RELEASE='true' ; fi 149 | echo ::set-output name=IS_RELEASE::${IS_RELEASE} 150 | - name: Publish archives and packages 151 | uses: softprops/action-gh-release@59c3b4891632ff9a897f99a91d7bc557467a3a22 # https://github.com/softprops/action-gh-release/issues/139 152 | if: steps.is-release.outputs.IS_RELEASE 153 | with: 154 | files: | 155 | ${{ steps.package.outputs.PKG_PATH }} 156 | env: 157 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /scripts/species.py: 
-------------------------------------------------------------------------------- 1 | """ ENA assembly collection methods (Blackwell et al. 2021) """ 2 | 3 | import ijson 4 | import pandas 5 | from pathlib import Path 6 | 7 | 8 | def parse_metadata(json_file: Path): 9 | 10 | """ 11 | Parse species, lineage and quality data of genomes in the ENA collection 12 | 13 | json_file: path to the ENA assembly metadata JSON 14 | 15 | --> retains genomes with full MLST (excluding "-") 16 | 17 | * filename: Json1_ENA_metadata 18 | * file address: https://figshare.com/ndownloader/files/26578377 19 | * data citation: 20 | Blackwell, Grace; Hunt, Martin; Malone, Kerri; Lima, Leandro; Horesh, Gal; T. F. Alako, Blaise; et al. (2021): 21 | "Exploring bacterial diversity via a curated and searchable snapshot of archived DNA sequences". 22 | Dataset. https://doi.org/10.6084/m9.figshare.14061752.v1 23 | """ 24 | 25 | data = [] 26 | with json_file.open() as file: 27 | parser = ijson.parse(file) 28 | i = 0 29 | n = 0 30 | current_genome = "" 31 | for prefix, event, value in parser: 32 | if event == 'start_map' and "." not in prefix: 33 | current_genome = prefix 34 | bracken = ("", 0.0) 35 | completeness = None 36 | contamination = None 37 | mlst = None 38 | mlst_species = None 39 | keep = False 40 | heterogeneity = None 41 | if "checkm_results.Completeness" in prefix: 42 | completeness = value 43 | if "checkm_results.Contamination" in prefix: 44 | contamination = value 45 | if "checkm_results.Strain_heterogeneity" in prefix: 46 | heterogeneity = value 47 | if "mlst_results.species" in prefix: 48 | mlst_species = value 49 | if "mlst_results.mlst" in prefix: 50 | mlst = value 51 | if mlst != "-": 52 | keep = True 53 | if "bracken" in prefix and event == "string" and value not in ("NA", "-"): 54 | species = prefix.replace(f"{current_genome}.bracken.", "") 55 | abundance = float(value) 56 | if abundance > bracken[1]: 57 | bracken = (species, abundance) 58 | if event == 'end_map' and "." not in prefix: 59 | if keep: 60 | data.append([ 61 | current_genome, 62 | bracken[0], 63 | bracken[1], 64 | float(completeness), 65 | float(contamination), 66 | float(heterogeneity), 67 | mlst, 68 | mlst_species 69 | ]) 70 | n += 1 71 | # Reset variables to make sure none are 72 | # accidentally used in the next entry 73 | bracken = ("", 0.0) 74 | completeness = None 75 | contamination = None 76 | mlst = None 77 | mlst_species = None 78 | keep = False 79 | heterogeneity = None 80 | # if i == 100000: 81 | # continue # test break 82 | i += 1 83 | 84 | df = pandas.DataFrame( 85 | data, columns=[ 86 | "accession", 87 | "bracken_species", 88 | "bracken_abundance", 89 | "completeness", 90 | "contamination", 91 | "heterogeneity", 92 | "mlst", 93 | "mlst_species" 94 | ] 95 | ) 96 | df.to_csv("meta.tsv", sep="\t", index=False) 97 | 98 | 99 | def clean_metadata(meta_file: Path, assembly_paths: Path): 100 | 101 | """ 102 | Obtain a clean subset of the assembled genomes filtered by assembly quality 103 | 104 | * filter genomes: contamination > 1. && completeness < 99. && heterogeneity > 0.1 105 | * species genome counts, separated into total and > 100 106 | * FTP paths to species assemblies on the EMBL ENA server 107 | 108 | EMBL EBI address: ftp.ebi.ac.uk/pub/databases/ENA2018-bacteria-661k/sampleid_assembly_paths.txt 109 | """ 110 | 111 | with meta_file.open() as meta_file: 112 | df = pandas.read_csv(meta_file, header=0, sep='\t') 113 | contaminated = df[df['contamination'] > 0.1] 114 | fragmented = df[df['completeness'] < 99.] 
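        # The three quality filters above and below flag genomes with CheckM contamination
        # above 0.1, completeness below 99, or strain heterogeneity above 0.1; any genome
        # matching at least one filter is excluded from the clean subset further down.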
115 | heterogenous = df[df['heterogeneity'] > 0.1] 116 | 117 | all_exclude = pandas.concat((contaminated, fragmented, heterogenous)) 118 | unique_exclude = all_exclude.index.unique() 119 | 120 | df_clean = df[~df.index.isin(unique_exclude)] 121 | 122 | species_counts_100 = [] 123 | species_counts_all = [] 124 | for species, data in df_clean.groupby("bracken_species"): 125 | n = len(data) 126 | species_counts_all.append([species, n]) 127 | if n >= 100: 128 | species_counts_100.append([species, n]) 129 | df_clean.to_csv("meta_clean.tsv", index=False, sep="\t") 130 | 131 | df_species_all = pandas.DataFrame(species_counts_all, columns=["species", "n"]).sort_values("n") 132 | df_species_100 = pandas.DataFrame(species_counts_100, columns=["species", "n"]).sort_values("n") 133 | 134 | df_species_all.to_csv("species_counts.tsv", index=False, sep="\t") 135 | df_species_100.to_csv("species_counts_100.tsv", index=False, sep="\t") 136 | 137 | assembly_paths = pandas.read_csv(assembly_paths, sep='\t', header=None, names=['id', 'path']) 138 | 139 | spec_paths = [] 140 | specs = [] 141 | for species, species_df in df_clean.groupby("bracken_species"): 142 | ftp_paths = assembly_paths[assembly_paths['id'].isin(species_df['accession'])] 143 | ftp_paths['path'] = [p.replace("/ebi/ftp", "http://ftp.ebi.ac.uk") for p in ftp_paths["path"]] 144 | 145 | species_paths = ftp_paths.drop(columns="id") 146 | name = f"{species.lower().replace(' ', '_')}" 147 | species_paths['species'] = [name for _ in species_paths.iterrows()] 148 | species_df['species'] = [name for _ in species_df.iterrows()] 149 | spec_paths.append(species_paths) 150 | specs.append(species_df) 151 | 152 | species = pandas.concat(specs) 153 | species.to_csv(f"species_data.tsv", sep='\t', index=False) 154 | 155 | species_paths = pandas.concat(spec_paths) 156 | species_paths.to_csv(f"species_ftp.tsv", sep='\t', index=False, header=False) 157 | 158 | -------------------------------------------------------------------------------- /scripts/metrics.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import warnings 3 | import json 4 | from numpy import nan 5 | from typing import List 6 | from pathlib import Path 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import precision_score 9 | from sklearn.metrics import recall_score 10 | from sklearn.metrics import confusion_matrix 11 | 12 | # in multi label metrics with many labels (e.g. 
sccmec type) 13 | # warnings can be raised because of rare labels that are 14 | # never predicted; these are ignored 15 | warnings.filterwarnings('ignore') 16 | 17 | SKIP_COLUMNS = ["name", "reads", "sketch_id", "shared_hashes"] 18 | 19 | 20 | def compute_metrics( 21 | reference: Path, 22 | prediction: Path = None, 23 | label_config: Path = None, 24 | default_binary: List = ("R", "S"), 25 | output: Path = Path("metrics.tsv"), 26 | verbose: bool = True 27 | ): 28 | """ 29 | Computes summary metrics for predictions against a structured reference 30 | 31 | :param reference: a tab-delimited file with headers containing reference classifications 32 | * must contain a column `name` for file name matching with the prediction file 33 | 34 | :param prediction: a tab-delimited file with headers containing prediction classifications 35 | * must contain a column 'name' with file names matching those in the reference file 36 | * this can be the output from the batch prediction pipeline 37 | 38 | :param label_config: a JSON file in the following structure: 39 | 40 | { 41 | "<column>": { 42 | "binary": true/false, 43 | "labels": ["<positive>", "<negative>"] or null 44 | } 45 | } 46 | If <binary> is true, then a list of two labels can be provided, which correspond to the 47 | positive and negative values for the confusion matrix. 48 | 49 | { 50 | "mlst": { 51 | "binary": false, 52 | "labels": null 53 | }, 54 | "pvl": { 55 | "binary": true, 56 | "labels": ["PVL+", "PVL-"] 57 | }, 58 | "mrsa": { 59 | "binary": true, 60 | "labels": ["MRSA", "MSSA"] 61 | } 62 | } 63 | 64 | """ 65 | 66 | reference_df, prediction_df = process_data(reference=reference, prediction=prediction, verbose=verbose) 67 | 68 | if label_config is not None: 69 | config = read_label_config(file=label_config, default_binary=default_binary) 70 | else: 71 | config = {} 72 | 73 | metrics = get_metrics( 74 | reference=reference_df, prediction=prediction_df, config=config, default_binary=default_binary, verbose=verbose 75 | ) 76 | 77 | metrics.to_csv(output, sep='\t', index=False) 78 | print(metrics) 79 | print(metrics.accuracy.mean()) 80 | 81 | 82 | def process_data(reference: Path, prediction: Path, verbose: bool): 83 | """ 84 | Process reference and prediction tables, ensure that data is clean 85 | """ 86 | 87 | ref = pandas.read_csv(reference, sep="\t", header=0) 88 | pred = pandas.read_csv(prediction, sep="\t", header=0) 89 | 90 | # Retain only predictions that are also in the reference genotype data 91 | prediction_df_clean = pred[pred['name'].isin(ref['name'])] 92 | 93 | # Retain only references that are also in the predictions 94 | ref_df_clean = ref[ref['name'].isin(pred['name'])] 95 | 96 | removed_prediction = len(pred) - len(prediction_df_clean) 97 | removed_ref = len(ref) - len(ref_df_clean) 98 | 99 | if verbose: 100 | print(f"Removed {removed_prediction} entries from prediction data not present in reference data") 101 | print(f"Removed {removed_ref} entries from reference data not present in prediction data") 102 | 103 | # Sort the dataframes in the same order by name 104 | ref_df = ref_df_clean.sort_values('name') 105 | prediction_df = prediction_df_clean.sort_values('name') 106 | 107 | assert ref_df['name'].tolist() == prediction_df['name'].tolist() # names should be unique, therefore sortable 108 | 109 | # Make sure all columns in prediction are present in reference: 110 | 111 | ref_columns = ref_df.columns.tolist() 112 | for column in prediction_df: 113 | if column in SKIP_COLUMNS: 114 | continue 115 | else: 116 | if column not in ref_columns: 117 | raise ValueError(f"Column 
`{column}` not in reference data") 118 | 119 | # Following should be done in the reference genotype files later on! 120 | 121 | # For the S. aureus data predictions can be PVL* (which is PVL negative, missing one gene) 122 | # and resistances can include 'r' instead of 'R' - harmonize these by replacing with 123 | # appropriate value (r -> R, PVL* -> PVL-) 124 | 125 | ref_df = ref_df.replace('r', 'R') 126 | prediction_df = prediction_df.replace('r', 'R') 127 | 128 | if "pvl" in ref_df.columns and "pvl" in prediction_df.columns: 129 | ref_df = ref_df.replace('PVL*', 'PVL-') 130 | prediction_df = prediction_df.replace('PVL*', 'PVL-') 131 | 132 | if "scc" in ref_df.columns and "scc" in prediction_df.columns: 133 | # Also remove leading whitespace from SCCmec types 134 | ref_df['scc'] = ref_df.scc.str.strip() 135 | prediction_df['scc'] = prediction_df.scc.str.strip() 136 | 137 | if "meca" in ref_df.columns and "meca" in prediction_df.columns: 138 | # Drop mecA gene (assembly based) in favour of Mykrobe 139 | # methicillin typing from reads (same genotype) 140 | ref_df = ref_df.drop(columns='meca') 141 | prediction_df = prediction_df.drop(columns='meca') 142 | ref_df = ref_df.reset_index(drop=True) 143 | prediction_df = prediction_df.reset_index(drop=True) 144 | 145 | return ref_df, prediction_df 146 | 147 | 148 | def read_label_config(file: Path, default_binary: list) -> dict: 149 | """ 150 | Reads and validates the label configuration JSON 151 | """ 152 | 153 | with file.open() as json_file: 154 | label_config = json.load(json_file) 155 | 156 | conf = {} 157 | for column, config in label_config.items(): 158 | if "binary" not in config.keys(): 159 | raise ValueError(f"Could not find `binary` key in config for column: {column}") 160 | 161 | binary: bool = config["binary"] 162 | if not isinstance(binary, bool): 163 | raise ValueError(f"Binary value ({column}) is not a boolean") 164 | 165 | conf[column] = { 166 | "binary": binary, "labels": list() 167 | } 168 | 169 | if binary: 170 | if "labels" not in config.keys(): 171 | raise ValueError(f"Could not find `labels` key in config for column: {column}") 172 | 173 | labels: list = config["labels"] 174 | if not isinstance(labels, list): 175 | raise ValueError(f"Label value ({column}) is not one of: List or None") 176 | 177 | if not labels: 178 | print(f"Setting default binary labels ({column}): {default_binary}") 179 | labels = default_binary.copy() 180 | else: 181 | if len(labels) != 2: 182 | raise ValueError(f"Binary labels requires precisely two values ({column})") 183 | 184 | conf[column]["labels"] = labels 185 | 186 | return conf 187 | 188 | 189 | def get_metrics( 190 | reference: pandas.DataFrame, prediction: pandas.DataFrame, config: dict, default_binary: List, verbose: bool 191 | ) -> pandas.DataFrame: 192 | 193 | metrics = [] 194 | for column in prediction.columns: 195 | # Skip irrelevant columns 196 | if column in SKIP_COLUMNS: 197 | continue 198 | else: 199 | 200 | try: 201 | binary = config[column]["binary"] 202 | if verbose: 203 | print(f"Column ({column}) is binary: {binary}") 204 | except KeyError: 205 | if verbose: 206 | print(f"Column ({column}) is binary (default)") 207 | binary = True 208 | 209 | ref_vec = reference[column].tolist() # sorted in process_data 210 | pred_vec = prediction[column].tolist() 211 | 212 | if binary: 213 | try: 214 | labels = config[column]["labels"] 215 | if verbose: 216 | print(f"Column ({column}) labels: {', '.join(default_binary)}") 217 | except KeyError: 218 | if verbose: 219 | print(f"Column ({column}) 
labels: {', '.join(default_binary)} (default)") 220 | labels = default_binary.copy() 221 | 222 | tp, fp, tn, fn, acc, tpr, tnr, ppv, npv = compute_binary_metrics(ref_vec, pred_vec, labels) 223 | else: 224 | tp, fp, tn, fn, tnr = -1, -1, -1, -1, nan 225 | acc = accuracy_score(ref_vec, pred_vec) 226 | tpr = recall_score(ref_vec, pred_vec, average='weighted') 227 | ppv = precision_score(ref_vec, pred_vec, average='weighted') 228 | 229 | metrics.append([column, binary, tp, tn, fp, fn, acc, ppv, tpr, tnr]) 230 | 231 | return pandas.DataFrame( 232 | metrics, 233 | columns=[ 234 | "feature", "binary", "true_positives", "true_negatives", 235 | "false_positives", "false_negatives", "accuracy", "precision", 236 | "recall", "specificity" 237 | ] 238 | ) 239 | 240 | 241 | def compute_binary_metrics(ref: list, pred: list, labels: list): 242 | try: 243 | cm = confusion_matrix(ref, pred, labels=labels) 244 | except ValueError: 245 | print(ref, pred, labels) 246 | raise 247 | 248 | tp = int(cm[0][0]) 249 | fn = int(cm[0][1]) 250 | fp = int(cm[1][0]) 251 | tn = int(cm[1][1]) 252 | 253 | # In all cases, if either the numerator or 254 | # denominator is zero, the metric is undefined 255 | tpr = _metric(tp, (tp + fn)) 256 | tnr = _metric(tn, (tn + fp)) 257 | ppv = _metric(tp, (tp + fp)) 258 | npv = _metric(tn, (tn + fn)) 259 | acc = _metric((tp + tn), (tp + fp + fn + tn)) 260 | 261 | return tp, fp, tn, fn, acc, tpr, tnr, ppv, npv 262 | 263 | 264 | def _metric(numerator: int, denominator: int): 265 | 266 | if numerator == 0: 267 | return nan 268 | if denominator == 0: 269 | return nan 270 | 271 | return numerator/denominator 272 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "ansi_term" 13 | version = "0.11.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" 16 | dependencies = [ 17 | "winapi", 18 | ] 19 | 20 | [[package]] 21 | name = "anyhow" 22 | version = "1.0.44" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" 25 | 26 | [[package]] 27 | name = "atty" 28 | version = "0.2.13" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" 31 | dependencies = [ 32 | "libc", 33 | "winapi", 34 | ] 35 | 36 | [[package]] 37 | name = "autocfg" 38 | version = "1.0.1" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 41 | 42 | [[package]] 43 | name = "bincode" 44 | version = "1.3.3" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" 47 | dependencies = [ 48 | "serde", 49 | ] 50 | 51 | [[package]] 52 | name = "bitflags" 53 | version = "1.2.1" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 56 | 57 | [[package]] 58 | name = "bstr" 59 | version = "0.2.17" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 62 | dependencies = [ 63 | "lazy_static", 64 | "memchr", 65 | "regex-automata", 66 | "serde", 67 | ] 68 | 69 | [[package]] 70 | name = "buf_redux" 71 | version = "0.8.4" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" 74 | dependencies = [ 75 | "memchr", 76 | "safemem", 77 | ] 78 | 79 | [[package]] 80 | name = "bytecount" 81 | version = "0.6.2" 82 | source = "registry+https://github.com/rust-lang/crates.io-index" 83 | checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" 84 | 85 | [[package]] 86 | name = "bzip2" 87 | version = "0.4.3" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" 90 | dependencies = [ 91 | "bzip2-sys", 92 | "libc", 93 | ] 94 | 95 | [[package]] 96 | name = "bzip2-sys" 97 | version = "0.1.11+1.0.8" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" 100 | dependencies = [ 101 | "cc", 102 | "libc", 103 | "pkg-config", 104 | ] 105 | 106 | [[package]] 107 | name = "capnp" 108 | version = "0.14.3" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "ae9b8a7119420b5279ddc2b4ee553ee15bcf4605df6135a26f03ffe153bee97c" 111 | 112 | [[package]] 113 | name = "cc" 114 | version = "1.0.70" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" 117 | 
118 | [[package]] 119 | name = "cfg-if" 120 | version = "1.0.0" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 123 | 124 | [[package]] 125 | name = "clap" 126 | version = "2.33.0" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" 129 | dependencies = [ 130 | "ansi_term", 131 | "atty", 132 | "bitflags", 133 | "strsim", 134 | "textwrap", 135 | "unicode-width", 136 | "vec_map", 137 | ] 138 | 139 | [[package]] 140 | name = "crc32fast" 141 | version = "1.2.1" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" 144 | dependencies = [ 145 | "cfg-if", 146 | ] 147 | 148 | [[package]] 149 | name = "crossbeam-channel" 150 | version = "0.5.1" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 153 | dependencies = [ 154 | "cfg-if", 155 | "crossbeam-utils", 156 | ] 157 | 158 | [[package]] 159 | name = "crossbeam-deque" 160 | version = "0.8.1" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 163 | dependencies = [ 164 | "cfg-if", 165 | "crossbeam-epoch", 166 | "crossbeam-utils", 167 | ] 168 | 169 | [[package]] 170 | name = "crossbeam-epoch" 171 | version = "0.9.5" 172 | source = "registry+https://github.com/rust-lang/crates.io-index" 173 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 174 | dependencies = [ 175 | "cfg-if", 176 | "crossbeam-utils", 177 | "lazy_static", 178 | "memoffset", 179 | "scopeguard", 180 | ] 181 | 182 | [[package]] 183 | name = "crossbeam-utils" 184 | version = "0.8.8" 185 | source = "registry+https://github.com/rust-lang/crates.io-index" 186 | checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" 187 | dependencies = [ 188 | "cfg-if", 189 | "lazy_static", 190 | ] 191 | 192 | [[package]] 193 | name = "csv" 194 | version = "1.1.6" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 197 | dependencies = [ 198 | "bstr", 199 | "csv-core", 200 | "itoa", 201 | "ryu", 202 | "serde", 203 | ] 204 | 205 | [[package]] 206 | name = "csv-core" 207 | version = "0.1.10" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 210 | dependencies = [ 211 | "memchr", 212 | ] 213 | 214 | [[package]] 215 | name = "either" 216 | version = "1.6.1" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 219 | 220 | [[package]] 221 | name = "finch" 222 | version = "0.4.1" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "0f5b421df230ee6000ccb42073103407a8b29c0adc2b5870a346d2fd6281ceec" 225 | dependencies = [ 226 | "bincode", 227 | "capnp", 228 | "memmap", 229 | "murmurhash3", 230 | "ndarray", 231 | "needletail", 232 | "rayon", 233 | "serde", 234 | "serde_json", 235 | "thiserror", 236 | ] 237 | 238 | [[package]] 239 | name = "flate2" 240 | version = "1.0.22" 241 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" 243 | dependencies = [ 244 | "cfg-if", 245 | "crc32fast", 246 | "libc", 247 | "miniz_oxide", 248 | ] 249 | 250 | [[package]] 251 | name = "heck" 252 | version = "0.3.3" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 255 | dependencies = [ 256 | "unicode-segmentation", 257 | ] 258 | 259 | [[package]] 260 | name = "hermit-abi" 261 | version = "0.1.19" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 264 | dependencies = [ 265 | "libc", 266 | ] 267 | 268 | [[package]] 269 | name = "itoa" 270 | version = "0.4.7" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 273 | 274 | [[package]] 275 | name = "lazy_static" 276 | version = "1.4.0" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 279 | 280 | [[package]] 281 | name = "libc" 282 | version = "0.2.95" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" 285 | 286 | [[package]] 287 | name = "lzma-sys" 288 | version = "0.1.17" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "bdb4b7c3eddad11d3af9e86c487607d2d2442d185d848575365c4856ba96d619" 291 | dependencies = [ 292 | "cc", 293 | "libc", 294 | "pkg-config", 295 | ] 296 | 297 | [[package]] 298 | name = "matrixmultiply" 299 | version = "0.2.4" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "916806ba0031cd542105d916a97c8572e1fa6dd79c9c51e7eb43a09ec2dd84c1" 302 | dependencies = [ 303 | "rawpointer", 304 | ] 305 | 306 | [[package]] 307 | name = "memchr" 308 | version = "2.4.0" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" 311 | 312 | [[package]] 313 | name = "memmap" 314 | version = "0.7.0" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" 317 | dependencies = [ 318 | "libc", 319 | "winapi", 320 | ] 321 | 322 | [[package]] 323 | name = "memoffset" 324 | version = "0.6.4" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 327 | dependencies = [ 328 | "autocfg", 329 | ] 330 | 331 | [[package]] 332 | name = "miniz_oxide" 333 | version = "0.4.4" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 336 | dependencies = [ 337 | "adler", 338 | "autocfg", 339 | ] 340 | 341 | [[package]] 342 | name = "murmurhash3" 343 | version = "0.0.5" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" 346 | 347 | [[package]] 348 | name = "ndarray" 349 | version = "0.14.0" 350 | source = "registry+https://github.com/rust-lang/crates.io-index" 351 | checksum = 
"6c0d5c9540a691d153064dc47a4db2504587a75eae07bf1d73f7a596ebc73c04" 352 | dependencies = [ 353 | "matrixmultiply", 354 | "num-complex", 355 | "num-integer", 356 | "num-traits", 357 | "rawpointer", 358 | ] 359 | 360 | [[package]] 361 | name = "needletail" 362 | version = "0.4.1" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "7fb4c43ebd04b0e776119c8fc3bd4c28178619cd04e1f19f600a4ef0282fa3cc" 365 | dependencies = [ 366 | "buf_redux", 367 | "bytecount", 368 | "bzip2", 369 | "flate2", 370 | "memchr", 371 | "xz2", 372 | ] 373 | 374 | [[package]] 375 | name = "num-complex" 376 | version = "0.3.1" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "747d632c0c558b87dbabbe6a82f3b4ae03720d0646ac5b7b4dae89394be5f2c5" 379 | dependencies = [ 380 | "num-traits", 381 | ] 382 | 383 | [[package]] 384 | name = "num-integer" 385 | version = "0.1.44" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" 388 | dependencies = [ 389 | "autocfg", 390 | "num-traits", 391 | ] 392 | 393 | [[package]] 394 | name = "num-traits" 395 | version = "0.2.14" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 398 | dependencies = [ 399 | "autocfg", 400 | ] 401 | 402 | [[package]] 403 | name = "num_cpus" 404 | version = "1.13.0" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 407 | dependencies = [ 408 | "hermit-abi", 409 | "libc", 410 | ] 411 | 412 | [[package]] 413 | name = "pkg-config" 414 | version = "0.3.20" 415 | source = "registry+https://github.com/rust-lang/crates.io-index" 416 | checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb" 417 | 418 | [[package]] 419 | name = "proc-macro-error" 420 | version = "1.0.4" 421 | source = "registry+https://github.com/rust-lang/crates.io-index" 422 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 423 | dependencies = [ 424 | "proc-macro-error-attr", 425 | "proc-macro2", 426 | "quote", 427 | "syn", 428 | "version_check", 429 | ] 430 | 431 | [[package]] 432 | name = "proc-macro-error-attr" 433 | version = "1.0.4" 434 | source = "registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 436 | dependencies = [ 437 | "proc-macro2", 438 | "quote", 439 | "version_check", 440 | ] 441 | 442 | [[package]] 443 | name = "proc-macro2" 444 | version = "1.0.29" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" 447 | dependencies = [ 448 | "unicode-xid", 449 | ] 450 | 451 | [[package]] 452 | name = "quote" 453 | version = "1.0.9" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" 456 | dependencies = [ 457 | "proc-macro2", 458 | ] 459 | 460 | [[package]] 461 | name = "rawpointer" 462 | version = "0.2.1" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" 465 | 466 | [[package]] 467 | name = "rayon" 468 | version = "1.5.1" 469 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 471 | dependencies = [ 472 | "autocfg", 473 | "crossbeam-deque", 474 | "either", 475 | "rayon-core", 476 | ] 477 | 478 | [[package]] 479 | name = "rayon-core" 480 | version = "1.9.1" 481 | source = "registry+https://github.com/rust-lang/crates.io-index" 482 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 483 | dependencies = [ 484 | "crossbeam-channel", 485 | "crossbeam-deque", 486 | "crossbeam-utils", 487 | "lazy_static", 488 | "num_cpus", 489 | ] 490 | 491 | [[package]] 492 | name = "regex-automata" 493 | version = "0.1.10" 494 | source = "registry+https://github.com/rust-lang/crates.io-index" 495 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 496 | 497 | [[package]] 498 | name = "ryu" 499 | version = "1.0.5" 500 | source = "registry+https://github.com/rust-lang/crates.io-index" 501 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 502 | 503 | [[package]] 504 | name = "safemem" 505 | version = "0.3.3" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" 508 | 509 | [[package]] 510 | name = "scopeguard" 511 | version = "1.1.0" 512 | source = "registry+https://github.com/rust-lang/crates.io-index" 513 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 514 | 515 | [[package]] 516 | name = "serde" 517 | version = "1.0.126" 518 | source = "registry+https://github.com/rust-lang/crates.io-index" 519 | checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" 520 | dependencies = [ 521 | "serde_derive", 522 | ] 523 | 524 | [[package]] 525 | name = "serde_derive" 526 | version = "1.0.126" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" 529 | dependencies = [ 530 | "proc-macro2", 531 | "quote", 532 | "syn", 533 | ] 534 | 535 | [[package]] 536 | name = "serde_json" 537 | version = "1.0.64" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" 540 | dependencies = [ 541 | "itoa", 542 | "ryu", 543 | "serde", 544 | ] 545 | 546 | [[package]] 547 | name = "sketchy-rs" 548 | version = "0.6.0" 549 | dependencies = [ 550 | "anyhow", 551 | "clap", 552 | "csv", 553 | "finch", 554 | "needletail", 555 | "rayon", 556 | "structopt", 557 | "thiserror", 558 | ] 559 | 560 | [[package]] 561 | name = "strsim" 562 | version = "0.8.0" 563 | source = "registry+https://github.com/rust-lang/crates.io-index" 564 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 565 | 566 | [[package]] 567 | name = "structopt" 568 | version = "0.3.23" 569 | source = "registry+https://github.com/rust-lang/crates.io-index" 570 | checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" 571 | dependencies = [ 572 | "clap", 573 | "lazy_static", 574 | "structopt-derive", 575 | ] 576 | 577 | [[package]] 578 | name = "structopt-derive" 579 | version = "0.4.16" 580 | source = "registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" 582 | dependencies = [ 583 | "heck", 584 | "proc-macro-error", 585 | "proc-macro2", 586 | "quote", 587 | 
"syn", 588 | ] 589 | 590 | [[package]] 591 | name = "syn" 592 | version = "1.0.77" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0" 595 | dependencies = [ 596 | "proc-macro2", 597 | "quote", 598 | "unicode-xid", 599 | ] 600 | 601 | [[package]] 602 | name = "textwrap" 603 | version = "0.11.0" 604 | source = "registry+https://github.com/rust-lang/crates.io-index" 605 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 606 | dependencies = [ 607 | "unicode-width", 608 | ] 609 | 610 | [[package]] 611 | name = "thiserror" 612 | version = "1.0.29" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" 615 | dependencies = [ 616 | "thiserror-impl", 617 | ] 618 | 619 | [[package]] 620 | name = "thiserror-impl" 621 | version = "1.0.29" 622 | source = "registry+https://github.com/rust-lang/crates.io-index" 623 | checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" 624 | dependencies = [ 625 | "proc-macro2", 626 | "quote", 627 | "syn", 628 | ] 629 | 630 | [[package]] 631 | name = "unicode-segmentation" 632 | version = "1.8.0" 633 | source = "registry+https://github.com/rust-lang/crates.io-index" 634 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 635 | 636 | [[package]] 637 | name = "unicode-width" 638 | version = "0.1.7" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" 641 | 642 | [[package]] 643 | name = "unicode-xid" 644 | version = "0.2.2" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 647 | 648 | [[package]] 649 | name = "vec_map" 650 | version = "0.8.1" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" 653 | 654 | [[package]] 655 | name = "version_check" 656 | version = "0.9.3" 657 | source = "registry+https://github.com/rust-lang/crates.io-index" 658 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 659 | 660 | [[package]] 661 | name = "winapi" 662 | version = "0.3.8" 663 | source = "registry+https://github.com/rust-lang/crates.io-index" 664 | checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" 665 | dependencies = [ 666 | "winapi-i686-pc-windows-gnu", 667 | "winapi-x86_64-pc-windows-gnu", 668 | ] 669 | 670 | [[package]] 671 | name = "winapi-i686-pc-windows-gnu" 672 | version = "0.4.0" 673 | source = "registry+https://github.com/rust-lang/crates.io-index" 674 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 675 | 676 | [[package]] 677 | name = "winapi-x86_64-pc-windows-gnu" 678 | version = "0.4.0" 679 | source = "registry+https://github.com/rust-lang/crates.io-index" 680 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 681 | 682 | [[package]] 683 | name = "xz2" 684 | version = "0.1.6" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" 687 | dependencies = [ 688 | "lzma-sys", 689 | ] 690 | -------------------------------------------------------------------------------- 
/src/sketchy.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use finch::serialization::{
3 |     read_finch_file, read_mash_file, write_finch_file, write_mash_file, Sketch,
4 | };
5 | use finch::sketch_schemes::{KmerCount, SketchParams};
6 | use finch::statistics::cardinality;
7 | use needletail::{parse_fastx_file, parse_fastx_stdin, FastxReader};
8 | use rayon::prelude::*;
9 | use std::cmp::Ordering;
10 | use std::collections::HashMap;
11 | use std::fs::File;
12 | use std::io::prelude::*;
13 | use std::io::BufReader;
14 | use std::path::{Path, PathBuf};
15 | use thiserror::Error;
16 | 
17 | #[derive(Error, Debug)]
18 | pub enum SketchyError {
19 |     #[error("reference sketch identifier {0} does not match genotype identifier {1} at line {2}")]
20 |     InvalidIdentifier(String, String, String),
21 |     #[error("reference sketch and genotype table must have the same length")]
22 |     InvalidSize,
23 |     #[error("reference sketch file must have Mash (.msh) or Finch (.fsh) extension")]
24 |     InvalidExtension,
25 |     #[error("reference ({0}) {1} ({2}) does not match query ({3}) {1} ({4})")]
26 |     InvalidSketchMatch(String, String, String, String, String),
27 |     #[error("consensus genotype could not be computed")]
28 |     InvalidConsensusGenotype,
29 |     #[error("--top must be an odd number when using --consensus")]
30 |     InvalidConsensusTop,
31 |     #[error("failed to open file")]
32 |     IOError(#[from] std::io::Error),
33 |     #[error("failed to open file with Finch")]
34 |     FinchError(#[from] finch::errors::FinchError),
35 |     #[error("failed to open genotype file or record with CSV")]
36 |     GenotypeTableError(#[from] csv::Error),
37 |     #[error("failed to open Fastx file or record with Needletail")]
38 |     FastxError(#[from] needletail::errors::ParseError),
39 | }
40 | 
41 | /// A `Struct` used for configuring the
42 | /// parameters of the predict method
43 | #[derive(Debug, PartialEq, Eq)]
44 | pub struct PredictConfig {
45 |     pub top: usize,
46 |     pub limit: usize,
47 |     pub stream: bool,
48 |     pub consensus: bool,
49 |     pub header: bool,
50 | }
51 | 
52 | /// A `Struct` used for building the formatted
53 | /// Sketchy reference sketch
54 | #[derive(Debug, PartialEq, Eq)]
55 | pub struct Sketchy {}
56 | 
57 | impl Sketchy {
58 |     /// Create a new reference sketch instance
59 |     pub fn new() -> Self {
60 |         Sketchy {}
61 |     }
62 | 
63 |     /// Prediction method for Sketchy
64 |     ///
65 |     /// Computes shared hashes between the input reads and each reference sketch and prints the top genotype predictions.
66 |     pub fn predict(
67 |         &self,
68 |         fastx: Option<PathBuf>,
69 |         reference: PathBuf,
70 |         genotypes: PathBuf,
71 |         config: PredictConfig,
72 |     ) -> Result<(), SketchyError> {
73 |         // First check that top_results is odd when consensus is true
74 |         if config.consensus {
75 |             let is_odd = matches!(config.top % 2, 1);
76 |             if !is_odd {
77 |                 return Err(SketchyError::InvalidConsensusTop);
78 |             }
79 |         }
80 | 
81 |         let reference_sketches = self._read_sketch(reference)?;
82 |         let reference_params = &reference_sketches[0].sketch_params;
83 | 
84 |         let mut min_scale = 0.;
85 |         if let Some(ref_scale) = reference_params.hash_info().3 {
86 |             min_scale = ref_scale;
87 |         }
88 | 
89 |         let fastx_reader = match fastx {
90 |             Some(file) => parse_fastx_file(file)?,
91 |             None => parse_fastx_stdin()?,
92 |         };
93 | 
94 |         let (geno, header) = self._read_genotypes(&genotypes)?;
95 |         let geno_map = self._genotype_hashmap(geno)?;
96 | 
97 |         // Header is printed here, regardless of streaming or direct prediction mode
98 | 
99 |         if config.header {
100 |             println!("reads\tsketch_id\tshared_hashes\t{}", header);
101 |         }
102 | 
103 |         if config.stream {
104 |             self._sum_of_shared_hashes(
105 |                 fastx_reader,
106 |                 reference_params,
107 |                 &reference_sketches,
108 |                 geno_map,
109 |                 min_scale,
110 |                 config,
111 |             )?;
112 |         } else {
113 |             self._shared_hashes(
114 |                 fastx_reader,
115 |                 reference_params,
116 |                 &reference_sketches,
117 |                 geno_map,
118 |                 min_scale,
119 |                 config,
120 |             )?;
121 |         }
122 | 
123 |         Ok(())
124 |     }
125 |     /// Sketch building method for Sketchy
126 |     ///
127 |     /// Sketches genome assemblies (paths given as arguments or on stdin) and writes a Mash (.msh) or Finch scaled (.fsh) sketch file.
128 |     pub fn sketch(
129 |         &self,
130 |         input: Option<Vec<PathBuf>>,
131 |         output: PathBuf,
132 |         sketch_size: usize,
133 |         kmer_size: u8,
134 |         seed: u64,
135 |         scale: f64,
136 |     ) -> Result<(), SketchyError> {
137 |         let files = match input {
138 |             None => {
139 |                 let stdin = std::io::stdin();
140 |                 let files = stdin
141 |                     .lock()
142 |                     .lines()
143 |                     .map(|x| PathBuf::from(x.unwrap()))
144 |                     .collect();
145 |                 files
146 |             }
147 |             Some(f) => f,
148 |         };
149 | 
150 |         let sketch_params =
151 |             self._get_sketch_params_from_extension(&output, sketch_size, kmer_size, scale, seed)?;
152 | 
153 |         let mut writer = File::create(&output)?;
154 |         let sketches = self._sketch_files(&sketch_params, files)?;
155 | 
156 |         match &sketch_params {
157 |             SketchParams::Mash { .. } => {
158 |                 write_mash_file(&mut writer, &sketches)?;
159 |             }
160 |             SketchParams::Scaled { .. } => {
161 |                 write_finch_file(&mut writer, &sketches)?;
162 |             }
163 |             SketchParams::AllCounts { .. } => unreachable!(), // AllCounts params are never produced here
164 |         }
165 | 
166 |         Ok(())
167 |     }
168 |     /// Information method for Sketchy
169 |     ///
170 |     /// Given a sketch input, list the sketch identifiers, the size of the sequence the
171 |     /// sketch was built from (bp) and the estimated uniqueness of the sequence (bp)
172 |     pub fn info(&self, input: PathBuf, build: bool) -> Result<(), SketchyError> {
173 |         let sketches = self._read_sketch(input)?;
174 | 
175 |         if build {
176 |             let rep_sketch = &sketches[0]; // assumes all sketch params same
177 |             let rep_params = &rep_sketch.sketch_params;
178 | 
179 |             match rep_params {
180 |                 SketchParams::Scaled {
181 |                     kmers_to_sketch,
182 |                     kmer_length,
183 |                     scale,
184 |                     hash_seed,
185 |                 } => {
186 |                     println! {"type=scaled sketch_size={:} kmer_size={:} scale={:} seed={:}", kmers_to_sketch, kmer_length, scale, hash_seed}
{"type=scaled sketch_size={:} kmer_size={:} scale={:} seed={:}", kmers_to_sketch, kmer_length, scale, hash_seed} 187 | } 188 | SketchParams::Mash { 189 | kmers_to_sketch, 190 | final_size: _, 191 | kmer_length, 192 | no_strict: _, 193 | hash_seed, 194 | } => { 195 | println! {"type=mash sketch_size={:} kmer_size={:} seed={:}", kmers_to_sketch, kmer_length, hash_seed} 196 | } 197 | SketchParams::AllCounts { .. } => unimplemented!(), 198 | }; 199 | } else { 200 | for sketch in &sketches { 201 | let kmers = &sketch.hashes; 202 | if let Ok(c) = cardinality(kmers) { 203 | println!("{} {} {}", &sketch.name, &sketch.seq_length, c); 204 | } 205 | } 206 | } 207 | Ok(()) 208 | } 209 | /// Checks that sketches in the input refernce sketch and identifiers in 210 | /// the genotype table are in the same order and that the reference sketch 211 | /// collection and genotype table are of the same size 212 | pub fn check(&self, input: PathBuf, genotypes: PathBuf) -> Result<(), SketchyError> { 213 | let sketches = self._read_sketch(input)?; 214 | let (genotype_data, _) = self 215 | ._read_genotypes(&genotypes) 216 | .expect("Could not read genotype file"); 217 | 218 | // Check for same order of identifiers in sketch and genotype table 219 | for (i, (sketch, genotype)) in sketches.iter().zip(&genotype_data).enumerate() { 220 | match sketch.name == genotype[0] { 221 | true => continue, 222 | false => SketchyError::InvalidIdentifier( 223 | sketch.name.to_owned(), 224 | genotype[0].to_owned(), 225 | i.to_string(), 226 | ), 227 | }; 228 | } 229 | // Check for same size of sketch collection and genotype table 230 | if sketches.len() != genotype_data.len() { 231 | Err(SketchyError::InvalidSize) 232 | } else { 233 | println!("ok"); 234 | Ok(()) 235 | } 236 | } 237 | /// Compute and print shared hashes between reference and query sketches 238 | pub fn shared(&self, reference: PathBuf, query: PathBuf) -> Result<(), SketchyError> { 239 | let reference_sketches = self._read_sketch(reference)?; 240 | let query_sketches = self._read_sketch(query)?; 241 | 242 | // Scale of sketches is inferred from first sketch in file --> might need to implement 243 | // an empty file check which is not implemented in the finch::readers 244 | let mut min_scale = 0.; 245 | if let Some(scale1) = &query_sketches[0].sketch_params.hash_info().3 { 246 | if let Some(scale2) = &reference_sketches[0].sketch_params.hash_info().3 { 247 | min_scale = f64::min(*scale1, *scale2); 248 | } 249 | } 250 | // Pairwise shared hashes computation 251 | for ref_sketch in &reference_sketches { 252 | for query_sketch in &query_sketches { 253 | let compatible = ref_sketch 254 | .sketch_params 255 | .check_compatibility(&query_sketch.sketch_params); 256 | let common = match compatible { 257 | None => Ok(self._common_hashes( 258 | &ref_sketch.hashes, 259 | &query_sketch.hashes, 260 | min_scale, 261 | )), 262 | Some(incomp) => Err(SketchyError::InvalidSketchMatch( 263 | ref_sketch.name.to_owned(), 264 | incomp.0.to_string(), 265 | incomp.1, 266 | query_sketch.name.to_owned(), 267 | incomp.2, 268 | )), 269 | }; 270 | println!( 271 | "{:} {:} {:}", 272 | ref_sketch.name, 273 | query_sketch.name, 274 | common.unwrap() 275 | ); // unwrap should be safe here 276 | } 277 | } 278 | Ok(()) 279 | } 280 | 281 | fn _shared_hashes( 282 | &self, 283 | mut fastx_reader: Box, 284 | reference_params: &SketchParams, 285 | reference_sketches: &[Sketch], 286 | geno_map: HashMap>, 287 | min_scale: f64, 288 | config: PredictConfig, 289 | ) -> Result<(), SketchyError> { 290 | // Sketcher 
291 |         let mut sketcher = reference_params.create_sketcher();
292 |         // Kmers are extracted for all reads
293 |         let mut read = 0;
294 |         while let Some(record) = fastx_reader.next() {
295 |             sketcher.process(record?);
296 |             read += 1;
297 |             if read == config.limit {
298 |                 break;
299 |             }
300 |         }
301 |         // Hashed kmers and counts are extracted across all reads
302 |         let read_hashes = sketcher.to_vec();
303 | 
304 |         let mut result_vec = vec![];
305 |         for ref_sketch in reference_sketches {
306 |             // Shared hashes are computed for each ref sketch
307 |             let shared_hashes = self._common_hashes(&ref_sketch.hashes, &read_hashes, min_scale);
308 |             result_vec.push((&ref_sketch.name, shared_hashes, &geno_map[&ref_sketch.name]));
309 |         }
310 |         result_vec.sort_by(|a, b| b.1.cmp(&a.1));
311 | 
312 |         self._print_results(result_vec, read, config.top, config.consensus)?;
313 | 
314 |         Ok(())
315 |     }
316 | 
317 |     fn _sum_of_shared_hashes(
318 |         &self,
319 |         mut fastx_reader: Box<dyn FastxReader>,
320 |         reference_params: &SketchParams,
321 |         reference_sketches: &[Sketch],
322 |         geno_map: HashMap<String, Vec<String>>,
323 |         min_scale: f64,
324 |         config: PredictConfig,
325 |     ) -> Result<(), SketchyError> {
326 |         let mut sum_of_shared_hashes = vec![0; reference_sketches.len()];
327 |         let mut read = 1;
328 |         while let Some(record) = fastx_reader.next() {
329 |             let mut result_vec = vec![];
330 |             // At each read we create a new sketcher based on the reference sketch
331 |             let mut sketcher = reference_params.create_sketcher();
332 |             // Kmers are then extracted for the record and hashed
333 |             sketcher.process(record?);
334 |             // Hashed kmers and counts are extracted for this read
335 |             let read_hashes = sketcher.to_vec();
336 |             // With each read, we compute the shared hashes with the reference sketch
337 |             for (i, ref_sketch) in reference_sketches.iter().enumerate() {
338 |                 let shared_hashes =
339 |                     self._common_hashes(&ref_sketch.hashes, &read_hashes, min_scale);
340 |                 // Finally the sum of shared hashes is updated
341 |                 sum_of_shared_hashes[i] += shared_hashes;
342 |                 result_vec.push((
343 |                     &ref_sketch.name,
344 |                     sum_of_shared_hashes[i],
345 |                     &geno_map[&ref_sketch.name],
346 |                 ));
347 |             }
348 |             result_vec.sort_by(|a, b| b.1.cmp(&a.1));
349 |             self._print_results(result_vec, read, config.top, config.consensus)?;
350 |             read += 1;
351 |             if read == config.limit + 1 {
352 |                 break;
353 |             }
354 |         }
355 |         Ok(())
356 |     }
357 | 
358 |     fn _print_results(
359 |         &self,
360 |         result_vec: Vec<(&String, u64, &Vec<String>)>,
361 |         read: usize,
362 |         top_results: usize,
363 |         consensus: bool,
364 |     ) -> Result<(), SketchyError> {
365 |         if consensus {
366 |             // For consensus calling, ignore reference genome names and shared hashes
367 |             // and for each genotype feature, gather the calls in a new vector
368 |             let ngenotypes = result_vec[0].2.len();
369 |             let mut genotype_features: Vec<Vec<&String>> = vec![Vec::new(); ngenotypes];
370 | 
371 |             for (_, _, genotypes) in result_vec[..top_results].iter() {
372 |                 for (j, genotype) in genotypes.iter().enumerate() {
373 |                     genotype_features[j].push(genotype)
374 |                 }
375 |             }
376 |             // For each feature consensus vector, call the most frequent value
377 |             // as the consensus. CLI implements a strict rule for only using
378 |             // odd --top values when using consensus calling
379 |             let mut consensus_genotype: Vec<String> = Vec::new();
380 |             for consensus_feature in genotype_features.iter() {
381 |                 let mut counts = HashMap::new();
382 |                 for genotype in consensus_feature {
383 |                     *counts.entry(genotype).or_insert(0) += 1;
384 |                 }
385 |                 let consensus_value = self._get_consensus_value(counts)?;
386 |                 consensus_genotype.push(consensus_value)
387 |             }
388 |             println!("{:}\t-\t-\t{:}", read, consensus_genotype.join("\t"));
389 |         } else {
390 |             // If not computing consensus simply iterate over top results and print to console
391 |             for (name, shared_hashes, genotype) in result_vec[..top_results].iter() {
392 |                 println!(
393 |                     "{:}\t{:}\t{:}\t{:}",
394 |                     read,
395 |                     name,
396 |                     shared_hashes,
397 |                     genotype.join("\t")
398 |                 );
399 |             }
400 |         }
401 |         Ok(())
402 |     }
403 | 
404 |     fn _get_consensus_value(
405 |         &self,
406 |         counts: HashMap<&&String, usize>,
407 |     ) -> Result<String, SketchyError> {
408 |         let consensus_value = counts.iter().max_by(|a, b| a.1.cmp(b.1)).map(|(k, _v)| k);
409 |         match consensus_value {
410 |             Some(value) => Ok(value.to_string()),
411 |             None => Err(SketchyError::InvalidConsensusGenotype),
412 |         }
413 |     }
414 |     /// Analogue of `finch::distance::raw_distance` reduced to extracting common hashes
415 |     ///
416 |     /// Assumes hashes are sorted - not sure if we need to implement the check here, in particular
417 |     /// because we implement a read-by-read shared hashes computation in the prediction method
418 |     /// and this may increase cost in the end. Need to test.
419 |     fn _common_hashes(
420 |         &self,
421 |         ref_hashes: &[KmerCount],
422 |         query_hashes: &[KmerCount],
423 |         min_scale: f64,
424 |     ) -> u64 {
425 |         let mut i: usize = 0;
426 |         let mut j: usize = 0;
427 |         let mut common: u64 = 0;
428 |         while let (Some(query), Some(refer)) = (query_hashes.get(i), ref_hashes.get(j)) {
429 |             match query.hash.cmp(&refer.hash) {
430 |                 Ordering::Less => i += 1,
431 |                 Ordering::Greater => j += 1,
432 |                 Ordering::Equal => {
433 |                     common += 1;
434 |                     i += 1;
435 |                     j += 1;
436 |                 }
437 |             }
438 |         }
439 |         // At this point we've exhausted one of the two sketches, but we may have
440 |         // more counts in the other to compare if these were scaled sketches
441 |         if min_scale > 0. {
442 |             let max_hash = u64::max_value() / min_scale.recip() as u64;
443 |             while query_hashes
444 |                 .get(i)
445 |                 .map(|kmer_count| kmer_count.hash < max_hash)
446 |                 .unwrap_or(false)
447 |             {
448 |                 i += 1;
449 |             }
450 |             while ref_hashes
451 |                 .get(j)
452 |                 .map(|kmer_count| kmer_count.hash < max_hash)
453 |                 .unwrap_or(false)
454 |             {
455 |                 j += 1;
456 |             }
457 |         }
458 |         common
459 |     }
460 | 
461 |     /// Analogous method to `finch::sketch_files` excluding filtering options
462 |     ///
463 |     /// Filtering excluded, as we are not interested in sketching read files
464 |     /// but assembled reference genome sequences for the genotype database.
465 |     fn _sketch_files(
466 |         &self,
467 |         sketch_params: &SketchParams,
468 |         sequence_files: Vec<PathBuf>,
469 |     ) -> Result<Vec<Sketch>, SketchyError> {
470 |         sequence_files
471 |             .par_iter()
472 |             .map(|file| {
473 |                 let mut sketcher = sketch_params.create_sketcher();
474 |                 let mut fastx_reader = parse_fastx_file(file)?;
475 | 
476 |                 while let Some(record) = fastx_reader.next() {
477 |                     sketcher.process(record?);
478 |                 }
479 | 
480 |                 let sketch_hashes = sketcher.to_vec();
481 |                 let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
482 | 
483 |                 Ok(Sketch {
484 |                     name: file.file_name().unwrap().to_str().unwrap().to_string(),
485 |                     seq_length,
486 |                     num_valid_kmers,
487 |                     comment: "".to_string(),
488 |                     hashes: sketch_hashes,
489 |                     filter_params: finch::filtering::FilterParams::default(), // no filter params
490 |                     sketch_params: sketch_params.clone(),
491 |                 })
492 |             })
493 |             .collect()
494 |     }
495 | 
496 |     /// Read a sketch file into a vector of sketches based on the extension of the file
497 |     fn _read_sketch(&self, sketch_file: PathBuf) -> Result<Vec<Sketch>, SketchyError> {
498 |         let sketch_ext = match sketch_file.extension() {
499 |             None => Err(SketchyError::InvalidExtension),
500 |             Some(os_str) => match os_str.to_str() {
501 |                 Some("msh") => Ok("msh"),
502 |                 Some("fsh") => Ok("fsh"),
503 |                 _ => Err(SketchyError::InvalidExtension),
504 |             },
505 |         };
506 | 
507 |         let mut reader = BufReader::new(File::open(&sketch_file)?);
508 | 
509 |         match sketch_ext {
510 |             Ok("msh") => {
511 |                 let sketches = read_mash_file(&mut reader)?;
512 |                 // Fixing the sketch param object, as the parameters are not written to file for some reason [MASH]
513 |                 let sketches_with_params = sketches
514 |                     .iter()
515 |                     .map(|sketch| {
516 |                         // Rethink if really necessary, currently used only to instantiate a sketcher in the main
517 |                         // streaming function and to output a summary statistic - so technically, we only need to add
518 |                         // the sketch size to SketchParams::Mash in the first sketch after reading into these methods
519 |                         // rather than fixing all of them, which may slow down with very large sketch collections
520 |                         let mut new_sketch = sketch.clone();
521 |                         new_sketch.sketch_params = SketchParams::Mash {
522 |                             kmers_to_sketch: sketch.hashes.len(),
523 |                             final_size: sketch.hashes.len(),
524 |                             kmer_length: sketch.sketch_params.k(),
525 |                             no_strict: false,
526 |                             hash_seed: sketch.sketch_params.hash_info().2,
527 |                         };
528 |                         new_sketch
529 |                     })
530 |                     .collect();
531 |                 Ok(sketches_with_params)
532 |             }
533 |             Ok("fsh") => Ok(read_finch_file(&mut reader)?),
534 |             _ => Err(SketchyError::InvalidExtension),
535 |         }
536 |     }
537 | 
538 |     fn _read_genotypes(
539 |         &self,
540 |         genotype_file: &Path,
541 |     ) -> Result<(Vec<Vec<String>>, String), SketchyError> {
542 |         let mut reader = csv::ReaderBuilder::new()
543 |             .delimiter(b'\t')
544 |             .has_headers(true)
545 |             .from_path(genotype_file)?;
546 |         let mut genotypes: Vec<Vec<String>> = vec![];
547 |         for result in reader.records() {
548 |             let record = result?;
549 |             let str_vec: Vec<String> = record.iter().map(|field| field.to_string()).collect();
550 |             genotypes.push(str_vec);
551 |         }
552 |         let header: Vec<String> = reader
553 |             .headers()?
554 |             .into_iter()
555 |             .map(|field| field.to_string())
556 |             .collect();
557 |         let header_str = header[1..].join("\t"); // exclude first column identifier
558 |         Ok((genotypes, header_str))
559 |     }
560 | 
561 |     fn _genotype_hashmap(
562 |         &self,
563 |         genotypes: Vec<Vec<String>>,
564 |     ) -> Result<HashMap<String, Vec<String>>, SketchyError> {
565 |         let genotype_map: HashMap<String, Vec<String>> = genotypes
566 |             .iter()
567 |             .map(|gvec| (gvec[0].to_owned(), gvec[1..].to_owned()))
568 |             .collect();
569 | 
570 |         Ok(genotype_map)
571 |     }
572 | 
573 |     fn _get_sketch_params_from_extension(
574 |         &self,
575 |         output: &Path,
576 |         sketch_size: usize,
577 |         kmer_size: u8,
578 |         scale: f64,
579 |         seed: u64,
580 |     ) -> Result<SketchParams, SketchyError> {
581 |         match output.extension() {
582 |             None => Err(SketchyError::InvalidExtension),
583 |             Some(os_str) => match os_str.to_str() {
584 |                 Some("msh") => Ok(SketchParams::Mash {
585 |                     kmers_to_sketch: sketch_size,
586 |                     final_size: sketch_size,
587 |                     no_strict: false,
588 |                     kmer_length: kmer_size,
589 |                     hash_seed: seed,
590 |                 }),
591 |                 Some("fsh") => Ok(SketchParams::Scaled {
592 |                     kmers_to_sketch: sketch_size,
593 |                     kmer_length: kmer_size,
594 |                     scale,
595 |                     hash_seed: seed,
596 |                 }),
597 |                 _ => Err(SketchyError::InvalidExtension),
598 |             },
599 |         }
600 |     }
601 | }
602 | 
--------------------------------------------------------------------------------
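The core logic of the predict workflow above reduces to two small routines: the two-pointer walk over sorted min-hash values in `_common_hashes` and the majority vote over genotype columns in `_get_consensus_value`. Below is a minimal, self-contained sketch of both ideas, using plain sorted `u64` hashes and string genotype calls instead of the finch `KmerCount` and `Sketch` types; the names `common_hashes` and `consensus` are illustrative only and not part of the crate.

```
// Simplified illustration of the shared-hash count and consensus call
// in src/sketchy.rs; plain u64 hashes stand in for finch KmerCount.
use std::cmp::Ordering;
use std::collections::HashMap;

/// Count hashes present in both sorted hash lists (unscaled case).
fn common_hashes(ref_hashes: &[u64], query_hashes: &[u64]) -> u64 {
    let (mut i, mut j, mut common) = (0usize, 0usize, 0u64);
    while let (Some(q), Some(r)) = (query_hashes.get(i), ref_hashes.get(j)) {
        match q.cmp(r) {
            Ordering::Less => i += 1,
            Ordering::Greater => j += 1,
            Ordering::Equal => {
                common += 1;
                i += 1;
                j += 1;
            }
        }
    }
    common
}

/// Majority vote over the genotype calls of the top-ranked references.
fn consensus<'a>(calls: &[&'a str]) -> Option<&'a str> {
    let mut counts: HashMap<&str, usize> = HashMap::new();
    for &call in calls {
        *counts.entry(call).or_insert(0) += 1;
    }
    counts.into_iter().max_by_key(|(_, n)| *n).map(|(call, _)| call)
}

fn main() {
    // Sorted min-hash values for a query read set and two references
    let query = [2u64, 5, 7, 11, 19];
    let ref_a = [2u64, 3, 7, 11, 23];
    let ref_b = [5u64, 13, 17, 29, 31];

    println!("ref_a shared: {}", common_hashes(&ref_a, &query)); // 3
    println!("ref_b shared: {}", common_hashes(&ref_b, &query)); // 1

    // Consensus over the MLST calls of the top three references
    println!("consensus: {:?}", consensus(&["ST93", "ST93", "ST239"])); // Some("ST93")
}
```

The hash map vote can still tie when more than two distinct calls appear among the top references, which is presumably why the CLI restricts `--consensus` to odd `--top` values: it removes the most common two-way ties.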