├── .gitignore ├── docs ├── images │ └── logo.png ├── releases.md └── index.md ├── mkdocs.yaml ├── Cargo.toml ├── scripts ├── README.md ├── species.py └── metrics.py ├── LICENSE ├── src ├── main.rs ├── cli.rs └── sketchy.rs ├── .github └── workflows │ ├── rust-ci.yaml │ └── release.yaml ├── README.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | site/ 3 | test_data/ 4 | 5 | .idea/ 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/esteinig/sketchy/HEAD/docs/images/logo.png -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- 1 | 2 | # Release binaries 3 | 4 | Binary executables for releases of the Rust client are available for Linux and MacOS. 5 | 6 | Configure environment variables. 7 | 8 | ``` 9 | VERSION=0.6.0 10 | GITHUB=https://github.com/esteinig/sketchy/releases/download 11 | ``` 12 | 13 | Download release binaries (Linux). 14 | 15 | ``` 16 | TAR=sketchy-${VERSION}-x86_64-unknown-linux-musl.tar.gz 17 | wget ${GITHUB}/${VERSION}/${TAR} 18 | 19 | tar xf $TAR 20 | ``` 21 | 22 | Download release binaries (MacOS). 23 | 24 | ``` 25 | TAR=sketchy-${VERSION}-x86_64-apple-darwin.tar.gz 26 | wget ${GITHUB}/${VERSION}/${TAR} 27 | tar xf $TAR 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- 1 | site_name: Sketchy 2 | theme: 3 | name: material 4 | logo: images/logo.png 5 | palette: 6 | primary: black 7 | accent: white 8 | repo_name: esteinig/sketchy 9 | repo_url: https://github.com/esteinig/sketchy 10 | nav: 11 | - Home: index.md 12 | - Releases: releases.md 13 | markdown_extensions: 14 | - toc: 15 | permalink: true 16 | - markdown.extensions.codehilite: 17 | guess_lang: false 18 | - admonition 19 | - codehilite 20 | - extra 21 | - pymdownx.superfences: 22 | custom_fences: 23 | - name: mermaid 24 | class: mermaid 25 | format: !!python/name:pymdownx.superfences.fence_div_format '' 26 | - pymdownx.tabbed 27 | extra_javascript: 28 | - https://unpkg.com/mermaid@8.4.6/dist/mermaid.min.js -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sketchy-rs" 3 | version = "0.6.0" 4 | authors = ["esteinig "] 5 | description = "Rust command line client for Sketchy" 6 | documentation = "https://github.com/esteinig/sketchy" 7 | homepage = "https://github.com/esteinig/sketchy" 8 | repository = "https://github.com/esteinig/sketchy" 9 | readme = "README.md" 10 | keywords = ["sketchy", "nanopore", "gnt", "mash", "streaming"] 11 | categories = ["science"] 12 | license = "MIT" 13 | edition = "2018" 14 | include = [ 15 | "**/*.rs", 16 | "src/data/*", 17 | "Cargo.toml" 18 | ] 19 | 20 | [dependencies] 21 | anyhow = "1.0" 22 | structopt = "0.3" 23 | clap = "2.33.0" 24 | finch = "0.4.1" 25 | rayon = "1.5.1" 26 | needletail = "0.4.1" 27 | thiserror = "1.0" 28 | csv = "1.1" 29 | 30 | [[bin]] 31 | name = "sketchy" 32 | path = "src/main.rs" -------------------------------------------------------------------------------- /scripts/README.md: 
-------------------------------------------------------------------------------- 1 | # Extracting validation data from the Blackwell collection 2 | 3 | This process extracts the MLST validation data from the Blackwell collection as outlined in the manuscript. 4 | 5 | 1. Download the meta-data JSON (https://figshare.com/ndownloader/files/26578377) 6 | 2. Run the `species.py` application (`python species.py --help`) on the JSON file, this will produce a summary of the meta-data including MLST (`species_data.tsv`), a file for all FTP addresses (`species_ftp.tsv`) and count data for all species (Bracken, `species_counts.tsv`) 7 | 3. Download the assemblies from the FTP addresses for a species of interest, these can be used for sketch construction. Genotype files can be constructed from subsets of the `species_data.tsv` file for the assemblies included in the sketch. 8 | 4. You may want to run a check on the order and congruence between sketch and genotype file. Commands for sketch construction are outlined in the `local sketches` and `genotype files` sections of the [documentation](https://github.com/esteinig/sketchy/blob/master/docs/index.md). 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Eike Steinig 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use crate::cli::Cli; 2 | use crate::cli::Commands::{Check, Info, Predict, Shared, Sketch}; 3 | use crate::sketchy::{PredictConfig, Sketchy}; 4 | use anyhow::Result; 5 | use structopt::StructOpt; 6 | 7 | mod cli; 8 | mod sketchy; 9 | 10 | /// Sketchy application 11 | /// 12 | /// Run the application from arguments provided 13 | /// by the command line interface 14 | /// 15 | /// Hash seed by default is 0; for hashing to 16 | /// replicate Mash, seed must be 42. 
17 | fn main() -> Result<()> { 18 | let args = Cli::from_args(); 19 | let sketchy = Sketchy::new(); 20 | 21 | // Conduct a consensus check when using 22 | 23 | match args.commands { 24 | Sketch { 25 | input, 26 | output, 27 | sketch_size, 28 | kmer_size, 29 | scale, 30 | seed, 31 | } => { 32 | sketchy.sketch(input, output, sketch_size, kmer_size, seed, scale)?; 33 | } 34 | Info { input, params } => { 35 | sketchy.info(input, params)?; 36 | } 37 | Shared { reference, query } => { 38 | sketchy.shared(reference, query)?; 39 | } 40 | Predict { 41 | input, 42 | reference, 43 | genotypes, 44 | top, 45 | limit, 46 | stream, 47 | consensus, 48 | header, 49 | } => { 50 | let config = PredictConfig { 51 | top, 52 | limit, 53 | stream, 54 | consensus, 55 | header, 56 | }; 57 | sketchy.predict(input, reference, genotypes, config)?; 58 | } 59 | Check { 60 | reference, 61 | genotypes, 62 | } => { 63 | sketchy.check(reference, genotypes)?; 64 | } 65 | } 66 | 67 | Ok(()) 68 | } 69 | -------------------------------------------------------------------------------- /.github/workflows/rust-ci.yaml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths-ignore: 8 | - 'README.md' 9 | pull_request: 10 | branches: 11 | - master 12 | paths-ignore: 13 | - 'README.md' 14 | 15 | env: 16 | CARGO_TERM_COLOR: always 17 | 18 | jobs: 19 | check: 20 | name: Check Rust version ${{ matrix.rust }} on OS ${{ matrix.os }} 21 | strategy: 22 | matrix: 23 | os: [ubuntu-latest, macos-latest] 24 | rust: 25 | - stable 26 | - 1.53.0 27 | runs-on: ${{ matrix.os }} 28 | steps: 29 | - name: Checkout sources 30 | uses: actions/checkout@v2 31 | 32 | - name: Install toolchain 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: ${{ matrix.rust }} 36 | override: true 37 | 38 | - uses: actions/cache@v2 39 | with: 40 | path: | 41 | ~/.cargo/registry 42 | ~/.cargo/git 43 | target 44 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 45 | 46 | - name: Run cargo check 47 | uses: actions-rs/cargo@v1 48 | with: 49 | command: check 50 | 51 | test: 52 | name: Test Rust version ${{ matrix.rust }} on OS ${{ matrix.os }} 53 | strategy: 54 | matrix: 55 | os: [ubuntu-latest, macos-latest] 56 | rust: 57 | - stable 58 | - 1.53.0 59 | runs-on: ${{ matrix.os }} 60 | steps: 61 | - name: Checkout sources 62 | uses: actions/checkout@v2 63 | 64 | - name: Install toolchain 65 | uses: actions-rs/toolchain@v1 66 | with: 67 | toolchain: ${{ matrix.rust }} 68 | override: true 69 | 70 | - uses: actions/cache@v2 71 | with: 72 | path: | 73 | ~/.cargo/registry 74 | ~/.cargo/git 75 | target 76 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 77 | 78 | - name: Run cargo test 79 | uses: actions-rs/cargo@v1 80 | with: 81 | command: test 82 | args: -v --all-targets --no-fail-fast 83 | 84 | fmt: 85 | name: Rustfmt 86 | runs-on: ubuntu-latest 87 | strategy: 88 | matrix: 89 | rust: 90 | - stable 91 | steps: 92 | - name: Checkout sources 93 | uses: actions/checkout@v2 94 | 95 | - name: Install toolchain 96 | uses: actions-rs/toolchain@v1 97 | with: 98 | toolchain: ${{ matrix.rust }} 99 | override: true 100 | 101 | - name: Install rustfmt 102 | run: rustup component add rustfmt 103 | 104 | - name: Run cargo fmt 105 | uses: actions-rs/cargo@v1 106 | with: 107 | command: fmt 108 | args: --all -- --check 109 | 110 | clippy: 111 | name: Clippy 112 | runs-on: ubuntu-latest 113 | strategy: 114 | matrix: 115 | rust: 116 | - stable 117 | steps: 118 | - 
name: Checkout sources 119 | uses: actions/checkout@v2 120 | 121 | - name: Install toolchain 122 | uses: actions-rs/toolchain@v1 123 | with: 124 | toolchain: ${{ matrix.rust }} 125 | override: true 126 | 127 | - name: Install clippy 128 | run: rustup component add clippy 129 | 130 | - name: Run cargo clippy 131 | uses: actions-rs/cargo@v1 132 | with: 133 | command: clippy 134 | args: --all-features --all-targets -- -D warnings -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sketchy 2 | 3 | ![](https://img.shields.io/badge/lang-rust-black.svg) 4 | ![](https://img.shields.io/badge/version-0.6.0-green.svg) 5 | ![](https://img.shields.io/badge/preprint-0.12.0-green.svg) 6 | 7 | Genomic neighbor typing for lineage and genotype inference 8 | 9 | ## Overview 10 | 11 | **`v0.6.0`** 12 | 13 | `Sketchy` is a lineage calling and genotyping tool based on the heuristic principle of genomic neighbor typing developed by [Karel Břinda and colleagues (2020)](https://www.biorxiv.org/content/10.1101/403204v2). It queries species-wide ('hypothesis-agnostic') reference sketches using MinHash and infers associated genotypes based on the closest match, including multi-locus sequence types, susceptibility profiles, virulence factors or other genome-associated features provided by the user. Unlike the original implementation in [`RASE`](https://github.com/c2-d2/rase-pipeline), `sketchy` does not use phylogenetic trees which has some downsides, e.g. for sublineage genotype predictions (see below). 14 | 15 | See the [latest docs](https://esteinig.github.io/sketchy) for install, usage and database building. 16 | 17 | ## Install 18 | 19 | Cargo: 20 | 21 | ``` 22 | cargo install sketchy 23 | ``` 24 | 25 | BioConda: 26 | 27 | ``` 28 | conda install -c bioconda sketchy 29 | ``` 30 | 31 | [Release binaries](https://github.com/esteinig/sketchy/releases) available for download. Reference sketches can be constructed from local [assembly and genotype collections](https://esteinig.github.io/sketchy/#local-sketches). *S. aureus* reference sketches are available in the data availability section below. 32 | 33 | ## Strengths and limitations 34 | 35 | 36 | * Reference sketches and genotype indices can be constructed easily from large genotype collections 37 | * `Sketchy` requires few resources when using small sketch sizes (`s = 1000`) 38 | * `Sketchy` performs best on lineage predictions and lineage-wide genotypes from very few reads - we found that tens to hundreds of reads can often give a good idea of the close matches in the reference sketch (especially when inspecting the top matches using `--top`) 39 | 40 | However: 41 | 42 | * Clade-specific genotype resolution is not as good as when using phylogenetic guide trees (`RASE`) 43 | * Sketch size can be increased to increase performance (`s = 10000`), but resources scale approximately linearly 44 | * `Sketchy` genotype inference may be difficult for species with high rates of homologous recombination 45 | 46 | ## Data availability 47 | 48 | * Reference sketches and genotype files (`s = 1000`, `s = 10000`, `k = 16`) for [*S. aureus*](https://cloudstor.aarnet.edu.au/plus/s/3EBgvXi6sVHW8Ne) (full genotypes including susceptibility predictions and other genotypes), *S. pneumoniae*, *K. pneumoniae*, *P. aeruginosa* and *Neisseria spp.* (MLST) can be found in the [data repository](https://cloudstor.aarnet.edu.au/plus/s/rL0RHYunqhRK3i1). 
49 | * Reference sketches for cross-validation on the simulated species data can be found in this [data repository](https://cloudstor.aarnet.edu.au/plus/s/7ICPoSru6s6EHNY); genome assemblies for all species extracted from the ENA reference collection are available in this [data repository](https://cloudstor.aarnet.edu.au/plus/s/Td3ahBCPP2YAhCU). 50 | * Scripts to extract data from the ENA collection of [Grace Blackwell et al.](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3001421) and compute reference metrics can be found in the [scripts directory](scripts/). 51 | * Nanopore reads for the outbreak isolates and genotype surveillance panels in Papua New Guinea (Flongle, Goroka, sequential protocol) are available for download in the [data repository](https://cloudstor.aarnet.edu.au/plus/s/MFkirfq1N6uIosc). Raw sequence data (Illumina / ONT) is being uploaded to NCBI (PRJNA657380). 52 | 53 | ## Preprint 54 | 55 | If you use `sketchy` for research and other applications, please cite: 56 | 57 | > Steinig et al. (2022) - Genomic neighbor typing for bacterial outbreak surveillance - bioRxiv 2022.02.05.479210; doi: https://doi.org/10.1101/2022.02.05.479210 58 | -------------------------------------------------------------------------------- /src/cli.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::OsStr; 2 | use std::{ffi::OsString, path::PathBuf}; 3 | use structopt::StructOpt; 4 | use thiserror::Error; 5 | 6 | #[derive(Error, Debug)] 7 | pub enum CliError { 8 | #[error("Scale parameter must be between 0 and 1")] 9 | InvalidScaleRange, 10 | #[error("Scale parameter must be a float")] 11 | InvalidScaleFloat, 12 | } 13 | 14 | /// Bacterial genomic neighbor typing using MinHash 15 | #[derive(Debug, StructOpt)] 16 | #[structopt(name = "sketchy")] 17 | pub struct Cli { 18 | #[structopt(subcommand)] 19 | pub commands: Commands, 20 | } 21 | 22 | #[derive(Debug, StructOpt)] 23 | pub enum Commands { 24 | /// Create a sketch from input sequences 25 | Sketch { 26 | /// Fast{a,q}.{gz,xz,bz}, stdin if not present 27 | #[structopt(short, long, parse(from_os_str), multiple = true)] 28 | input: Option<Vec<PathBuf>>, 29 | /// Output sketch file path. 30 | #[structopt(short, long, parse(from_os_str), required = true)] 31 | output: PathBuf, 32 | /// Sketch size. 33 | #[structopt(short, long, default_value = "1000")] 34 | sketch_size: usize, 35 | /// K-mer size. 36 | #[structopt(short = "k", long, default_value = "16")] 37 | kmer_size: u8, 38 | /// Hash scaler for finch format. 39 | #[structopt( 40 | short = "c", 41 | long, 42 | parse(try_from_str = check_scale_limits), 43 | default_value = "0.001" 44 | )] 45 | scale: f64, 46 | /// Seed for hashing k-mers. 47 | #[structopt(short = "e", long, default_value = "0")] 48 | seed: u64, 49 | }, 50 | /// List sketch genome order, sketch build parameters 51 | Info { 52 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 53 | #[structopt( 54 | short, 55 | long, 56 | parse(try_from_os_str = check_file_exists) 57 | )] 58 | input: PathBuf, 59 | /// Display the sketch build parameters. 
60 | #[structopt(short, long)] 61 | params: bool, 62 | }, 63 | 64 | /// Check match between sketch and genotype file 65 | Check { 66 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 67 | #[structopt( 68 | short, 69 | long, 70 | parse(try_from_os_str = check_file_exists) 71 | )] 72 | reference: PathBuf, 73 | /// Genotype file to validate with sketch file 74 | #[structopt( 75 | short, 76 | long, 77 | parse(try_from_os_str = check_file_exists) 78 | )] 79 | genotypes: PathBuf, 80 | }, 81 | /// Compute shared hashes between two sketches 82 | Shared { 83 | /// Sketch file, format: Mash (.msh) or Finch (.fsh) 84 | #[structopt( 85 | short, 86 | long, 87 | parse(try_from_os_str = check_file_exists) 88 | )] 89 | reference: PathBuf, 90 | /// Sketch file, matching format: Mash (.msh) or Finch (.fsh) 91 | #[structopt(short, long)] 92 | query: PathBuf, 93 | }, 94 | /// Predict genotypes from reads or read streams 95 | Predict { 96 | /// Fast{a,q}.{gz,xz,bz}, stdin if not present. 97 | #[structopt( 98 | short, 99 | long, 100 | parse(try_from_os_str = check_file_exists) 101 | )] 102 | input: Option<PathBuf>, 103 | /// Reference sketch, Mash (.msh) or Finch (.fsh) 104 | #[structopt( 105 | short, 106 | long, 107 | parse(try_from_os_str = check_file_exists) 108 | )] 109 | reference: PathBuf, 110 | /// Reference genotype table (.tsv) 111 | #[structopt( 112 | short, 113 | long, 114 | parse(try_from_os_str = check_file_exists) 115 | )] 116 | genotypes: PathBuf, 117 | /// Number of top ranked predictions to output 118 | #[structopt(short, long, default_value = "1")] 119 | top: usize, 120 | /// Number of reads to process, all reads by default 121 | #[structopt(short, long, default_value = "0")] 122 | limit: usize, 123 | /// Output the sum of shared hashes per read 124 | #[structopt(short, long)] 125 | stream: bool, 126 | /// Consensus prediction over top feature values 127 | #[structopt(short, long)] 128 | consensus: bool, 129 | /// Header added to output based on genotype file 130 | #[structopt(short = "H", long)] 131 | header: bool, 132 | }, 133 | } 134 | 135 | fn check_scale_limits(scale: &str) -> Result<f64, CliError> { 136 | match scale.parse::<f64>() { 137 | Ok(x) => match x { 138 | x if (0.0..=1.0).contains(&x) => Ok(x), 139 | _ => Err(CliError::InvalidScaleRange), 140 | }, 141 | _ => Err(CliError::InvalidScaleFloat), 142 | } 143 | } 144 | 145 | fn check_file_exists(file: &OsStr) -> Result<PathBuf, OsString> { 146 | let path = PathBuf::from(file); 147 | if path.exists() { 148 | Ok(path) 149 | } else { 150 | Err(OsString::from(format!("{:?} does not exist", path))) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | `Sketchy` is a nanopore lineage calling and genotyping tool based on the heuristic principle of [genomic neighbor typing (Břinda et al. 2020)](https://www.biorxiv.org/content/10.1101/403204v2). `Sketchy` queries species-wide (hypothesis-agnostic) reference sketches using MinHash methods to infer genotypes based on the closest reference match. Reference databases and genotypes, such as multi-locus sequence types, susceptibility profiles, or virulence factors, are configurable by users. 4 | 5 | ## Install 6 | 7 | Install the Rust client with Cargo. 8 | 9 | ``` 10 | $ cargo install sketchy 11 | ``` 12 | Or install from BioConda. 13 | ``` 14 | $ conda install -c bioconda sketchy 15 | ``` 16 | 17 | [Release binaries](releases.md) for Linux and MacOS are available.
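Before the command reference below, here is a minimal, illustrative sketch of the genomic neighbor typing idea described in the overview: build a bottom-`s` MinHash sketch for each reference genome, count the hashes it shares with a sketch of the query reads, and report the genotype of the closest match. This is not the `sketchy` implementation (the Rust client builds sketches with the `finch` library and supports Mash-compatible hashing); the sequences, genotypes and the `md5`-based hashing below are stand-ins for illustration only.

```python
# Minimal illustration of genomic neighbor typing (not the sketchy implementation):
# bottom-s MinHash sketches per genome, shared hash counts against a read sketch,
# and the genotype of the closest reference genome as the prediction.
import hashlib


def kmers(seq: str, k: int = 16):
    return (seq[i:i + k] for i in range(len(seq) - k + 1))


def bottom_sketch(seq: str, s: int = 1000, k: int = 16) -> set:
    # Hash every k-mer and keep the s smallest hash values (a bottom-s sketch)
    hashes = sorted(int(hashlib.md5(kmer.encode()).hexdigest(), 16) for kmer in kmers(seq, k))
    return set(hashes[:s])


def shared_hashes(query: set, reference: set) -> int:
    return len(query & reference)


# Hypothetical reference collection: genome sequence and an associated genotype
references = {"ERR129347.fa": ("ACGT" * 500, "ST82"), "ERR121347.fa": ("TTGCA" * 400, "ST93")}
ref_sketches = {name: bottom_sketch(seq) for name, (seq, _) in references.items()}

reads = "ACGTACGTACGTACGTACGTACGTACGT"  # stand-in for a set of nanopore reads
read_sketch = bottom_sketch(reads)

best = max(ref_sketches, key=lambda name: shared_hashes(read_sketch, ref_sketches[name]))
print(best, references[best][1], shared_hashes(read_sketch, ref_sketches[best]))
```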
18 | 19 | 20 | ## Subcommands 21 | 22 | Show the subcommands available in the Rust client. 23 | 24 | ``` 25 | $ sketchy --help 26 | ``` 27 | 28 | ## Predictions 29 | 30 | `Sketchy` predicts the genotype of the genome with the highest number of shared hashes in the reference sketch. Two modes are available: 31 | 32 | 1. **Offline (read sets)**: k-mers are hashed and shared hashes are computed from a complete set of reads. The output lists the `--top` genome matches and their genotypes. 33 | 2. **Online (streaming)**: k-mers are hashed for each incoming read, the sum of shared hashes for each genome in the reference sketch is updated, and the `--top` genome matches and their genotypes, based on the sorted sums of shared hashes at the current read, are printed (a minimal sketch of this update loop is shown below, after the genotype file section). 34 | 35 | Sums of shared hashes from streaming are slightly less informative than shared hashes computed from complete read sets. Streaming requires less memory, but can be slow for large read sets, especially when deploying large reference sketches. 36 | 37 | Predictions require the input sequences (`-i`), a reference sketch (`-r`) and a matching genotype file (`-g`) as described below ([genotype files](#genotype-files)). 38 | 39 | ### Read sets 40 | 41 | Output the best 5 matches against the reference sketch with a header. 42 | 43 | ```console 44 | $ sketchy predict -i seq.fq -r saureus.msh -g saureus.tsv -t 5 -H 45 | ``` 46 | 47 | ### Streaming 48 | 49 | Output the updated best match against the reference sketch from a stream of reads. 50 | 51 | ```console 52 | $ cat seq.fq | sketchy predict -r saureus.msh -g saureus.tsv -s 53 | ``` 54 | 55 | ## Sketches 56 | 57 | ### Species sketches 58 | 59 | Species sketches are available in the [data repository]() and can be built from local reference assembly collections. 60 | 61 | ### Local sketches 62 | 63 | Reference sketches can be built from any collection of assembled genomes for which associated genotype or phenotype data are available. 64 | 65 | Build a default-resolution (`s = 1000`) reference database from any collection of `fasta` files. 66 | 67 | ``` 68 | $ sketchy sketch -i *.fa -k 16 -s 1000 -o ref.msh 69 | ``` 70 | 71 | You can pipe assemblies into the sketch construction using `find`, as wildcard expansions are limited to ~30,000 files. 72 | 73 | ``` 74 | $ find assemblies/ -name "*.fa" | sketchy sketch -k 16 -s 1000 -o ref.msh 75 | ``` 76 | 77 | List the sketch parameters. 78 | 79 | ``` 80 | $ sketchy info -i ref.msh -p 81 | ``` 82 | 83 | ### Genotype files 84 | 85 | Prediction requires a **tab-delimited** genotype index **in the same order and of the same length** as the reference sketch. Names in the genotype index (first column) are the file names of the input genomes used to build the sketch. 86 | 87 | ``` 88 | name mlst tetracycline penicillin methicillin 89 | ERR129347.fa ST82 R R S 90 | ERR121347.fa ST93 S S S 91 | ``` 92 | 93 | List the order of genomes in the sketch, their length (bp) and an estimate of cardinality (bp). 94 | 95 | ``` 96 | $ sketchy info -i ref.msh 97 | ``` 98 | 99 | Check that the genotype file contains the same number and order of genomes as the sketch. 100 | 101 | ``` 102 | $ sketchy check -r ref.msh -g ref.tsv 103 | ``` 104 | 105 | This will output `ok` to `stdout` if the check completes successfully and fail with an error message otherwise.
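To illustrate what the congruence check above verifies, here is a minimal, hypothetical sketch: the genotype table must contain exactly one row per genome in the reference sketch, in the same order, with the first column holding the genome file names. The helper below only mirrors the idea behind `sketchy check`; it is not the actual implementation, and the file names are made up.

```python
# Minimal, hypothetical congruence check between the genome order of a sketch and
# a tab-delimited genotype table (mirrors the idea behind `sketchy check`).
import csv
from typing import List


def check_genotype_index(sketch_names: List[str], genotype_tsv: str) -> str:
    with open(genotype_tsv) as handle:
        rows = list(csv.reader(handle, delimiter="\t"))
    names = [row[0] for row in rows[1:]]  # skip header; first column = genome file name
    if len(names) != len(sketch_names):
        raise ValueError(f"{len(names)} genotype rows but {len(sketch_names)} genomes in sketch")
    for i, (genotype_name, sketch_name) in enumerate(zip(names, sketch_names)):
        if genotype_name != sketch_name:
            raise ValueError(f"Order mismatch at row {i}: {genotype_name} != {sketch_name}")
    return "ok"


# Example with the two hypothetical genomes from the genotype table above:
# check_genotype_index(["ERR129347.fa", "ERR121347.fa"], "ref.tsv")
```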
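The streaming mode described in the predictions section above keeps a running sum of shared hashes for every genome in the reference sketch and re-ranks the genomes after each read. A minimal, self-contained sketch of that update loop follows; the hash sets and genome names are made up for illustration, and this is not the Rust client's implementation.

```python
# Minimal sketch of the streaming update loop: after each read, add the read's
# shared hashes to a running total per reference genome and re-rank the genomes.
# Hash values here are made up; in practice they come from MinHash sketches.
from collections import Counter
from typing import Dict, List, Set


def rank_streaming(reads: List[Set[int]], ref_sketches: Dict[str, Set[int]], top: int = 3):
    totals: Counter = Counter({name: 0 for name in ref_sketches})
    ranked: List = []
    for i, read_hashes in enumerate(reads, start=1):
        for name, sketch in ref_sketches.items():
            totals[name] += len(read_hashes & sketch)  # shared hashes for this read
        ranked = totals.most_common(top)               # current top genome matches
        print(f"read {i}:", ranked)
    return ranked


# Toy example with two reference genomes and two reads
refs = {"genome1.fa": {1, 2, 3, 4}, "genome2.fa": {3, 4, 5, 6}}
rank_streaming([{1, 2, 9}, {3, 4, 5}], refs, top=2)
```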
106 | 107 | ## Sketch validation 108 | 109 | We conducted simulations using `badread` of the following species: 110 | 111 | * *Neisseria spp.* 112 | * *Streptococcus pneumoniae* 113 | * *Klebsiella pneumoniae* 114 | * *Staphylococcus aureus* 115 | * *Pseudomonas aeruginosa* 116 | 117 | Cross-validation sketches (using subsampling of reference assemblies) and read data are available in the [data repository](). 118 | 119 | ## Other 120 | 121 | 122 | ### Shared hashes 123 | 124 | Given two assembled genome sequences, create a sketch at default k-mer size of `k = 16` and sketch size of `s = 1000`. `Mash` configuration can be replicated by setting `--seed 42`. 125 | 126 | Sketch two genome assemblies with identical settings. 127 | 128 | ``` 129 | $ sketchy sketch -i genome1.fa -o genome1.msh 130 | $ sketchy sketch -i genome2.fa -o genome2.msh 131 | ``` 132 | 133 | Compute shared hashes between the reference and query genomes. 134 | 135 | ``` 136 | $ sketchy shared -r genome1.msh -q genome2.msh 137 | 138 | > genome1.fa genome2.fa 360 139 | ``` 140 | 141 | 142 | If multiple sketches are available compute pairwise shared hashes. 143 | 144 | ``` 145 | $ sketchy sketch -i genome1.fa genome2.fa -o multi.msh 146 | $ sketchy shared -r multi.msh -q genome2.msh 147 | 148 | > genome1.fa genome2.fa 360 149 | > genome2.fa genome2.fa 1000 150 | ``` 151 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - "[0-9]+.[0-9]+.[0-9]+" 7 | 8 | env: 9 | CICD_INTERMEDIATES_DIR: "_release-intermediates" 10 | 11 | jobs: 12 | publish: 13 | name: Publish 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout sources 17 | uses: actions/checkout@v2 18 | 19 | - name: Install stable toolchain 20 | uses: actions-rs/toolchain@v1 21 | with: 22 | profile: minimal 23 | toolchain: stable 24 | override: true 25 | 26 | - name: Run `cargo publish` - upload to crates.io 27 | uses: actions-rs/cargo@v1 28 | env: 29 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 30 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 31 | with: 32 | command: publish 33 | 34 | upload: 35 | name: ${{ matrix.job.os }} (${{ matrix.job.target }}) 36 | runs-on: ${{ matrix.job.os }} 37 | strategy: 38 | fail-fast: false 39 | matrix: 40 | job: 41 | - { os: ubuntu-latest, target: x86_64-unknown-linux-musl, use-cross: true } 42 | - { os: macos-latest, target: x86_64-apple-darwin, use-cross: true } 43 | steps: 44 | - name: Checkout source code 45 | uses: actions/checkout@v2 46 | 47 | - name: Extract crate information 48 | shell: bash 49 | run: | 50 | echo "PROJECT_NAME=sketchy" >> $GITHUB_ENV 51 | echo "PROJECT_VERSION=$(sed -n 's/^version = "\(.*\)"/\1/p' Cargo.toml | head -n1)" >> $GITHUB_ENV 52 | echo "PROJECT_MAINTAINER=$(sed -n 's/^authors = \["\(.*\)"\]/\1/p' Cargo.toml)" >> $GITHUB_ENV 53 | echo "PROJECT_HOMEPAGE=$(sed -n 's/^homepage = "\(.*\)"/\1/p' Cargo.toml)" >> $GITHUB_ENV 54 | - name: Install Rust toolchain 55 | uses: actions-rs/toolchain@v1 56 | with: 57 | toolchain: stable 58 | target: ${{ matrix.job.target }} 59 | override: true 60 | profile: minimal # minimal component installation (ie, no documentation) 61 | 62 | - name: Show version information (Rust, cargo, GCC) 63 | shell: bash 64 | run: | 65 | gcc --version || true 66 | rustup -V 67 | rustup toolchain list 68 | rustup default 69 | cargo -V 70 | rustc -V 71 | - name: Build 72 | uses: 
actions-rs/cargo@v1 73 | with: 74 | use-cross: ${{ matrix.job.use-cross }} 75 | command: build 76 | args: --release --target=${{ matrix.job.target }} 77 | 78 | - name: Strip debug information from executable 79 | id: strip 80 | shell: bash 81 | run: | 82 | # Figure out suffix of binary 83 | EXE_suffix="" 84 | # Figure out what strip tool to use if any 85 | STRIP="strip" 86 | # Setup paths 87 | BIN_DIR="${{ env.CICD_INTERMEDIATES_DIR }}/stripped-release-bin/" 88 | mkdir -p "${BIN_DIR}" 89 | BIN_NAME="${{ env.PROJECT_NAME }}${EXE_suffix}" 90 | BIN_PATH="${BIN_DIR}/${BIN_NAME}" 91 | # Copy the release build binary to the result location 92 | cp "target/${{ matrix.job.target }}/release/${BIN_NAME}" "${BIN_DIR}" 93 | # Also strip if possible 94 | if [ -n "${STRIP}" ]; then 95 | "${STRIP}" "${BIN_PATH}" 96 | fi 97 | # Let subsequent steps know where to find the (stripped) bin 98 | echo ::set-output name=BIN_PATH::${BIN_PATH} 99 | echo ::set-output name=BIN_NAME::${BIN_NAME} 100 | - name: Set testing options 101 | id: test-options 102 | shell: bash 103 | run: | 104 | unset CARGO_TEST_OPTIONS 105 | unset CARGO_TEST_OPTIONS ; case ${{ matrix.job.target }} in arm-* | aarch64-*) CARGO_TEST_OPTIONS="--bin ${PROJECT_NAME}" ;; esac; 106 | echo ::set-output name=CARGO_TEST_OPTIONS::${CARGO_TEST_OPTIONS} 107 | - name: Run tests 108 | uses: actions-rs/cargo@v1 109 | with: 110 | use-cross: ${{ matrix.job.use-cross }} 111 | command: test 112 | args: --target=${{ matrix.job.target }} ${{ steps.test-options.outputs.CARGO_TEST_OPTIONS}} 113 | 114 | - name: Create tarball 115 | id: package 116 | shell: bash 117 | run: | 118 | PKG_suffix=".tar.gz" ; case ${{ matrix.job.target }} in *-pc-windows-*) PKG_suffix=".zip" ;; esac; 119 | PKG_BASENAME=${PROJECT_NAME}-${PROJECT_VERSION}-${{ matrix.job.target }} 120 | PKG_NAME=${PKG_BASENAME}${PKG_suffix} 121 | echo ::set-output name=PKG_NAME::${PKG_NAME} 122 | PKG_STAGING="${{ env.CICD_INTERMEDIATES_DIR }}/package" 123 | ARCHIVE_DIR="${PKG_STAGING}/${PKG_BASENAME}/" 124 | mkdir -p "${ARCHIVE_DIR}" 125 | # Binary 126 | cp "${{ steps.strip.outputs.BIN_PATH }}" "$ARCHIVE_DIR" 127 | # README, LICENSE files 128 | cp "README.md" "LICENSE" "$ARCHIVE_DIR" 129 | # base compressed package 130 | pushd "${PKG_STAGING}/" >/dev/null 131 | case ${{ matrix.job.target }} in 132 | *-pc-windows-*) 7z -y a "${PKG_NAME}" "${PKG_BASENAME}"/* | tail -2 ;; 133 | *) tar czf "${PKG_NAME}" "${PKG_BASENAME}"/* ;; 134 | esac; 135 | popd >/dev/null 136 | # Let subsequent steps know where to find the compressed package 137 | echo ::set-output name=PKG_PATH::"${PKG_STAGING}/${PKG_NAME}" 138 | - name: "Artifact upload: tarball" 139 | uses: actions/upload-artifact@master 140 | with: 141 | name: ${{ steps.package.outputs.PKG_NAME }} 142 | path: ${{ steps.package.outputs.PKG_PATH }} 143 | 144 | - name: Check for release 145 | id: is-release 146 | shell: bash 147 | run: | 148 | unset IS_RELEASE ; if [[ $GITHUB_REF =~ ^refs/tags/[0-9].* ]]; then IS_RELEASE='true' ; fi 149 | echo ::set-output name=IS_RELEASE::${IS_RELEASE} 150 | - name: Publish archives and packages 151 | uses: softprops/action-gh-release@59c3b4891632ff9a897f99a91d7bc557467a3a22 # https://github.com/softprops/action-gh-release/issues/139 152 | if: steps.is-release.outputs.IS_RELEASE 153 | with: 154 | files: | 155 | ${{ steps.package.outputs.PKG_PATH }} 156 | env: 157 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /scripts/species.py: 
-------------------------------------------------------------------------------- 1 | """ ENA assembly collection methods (Blackwell et al. 2021) """ 2 | 3 | import ijson 4 | import pandas 5 | from pathlib import Path 6 | 7 | 8 | def parse_metadata(json_file: Path): 9 | 10 | """ 11 | Parse species, lineage and quality data of genomes in the ENA collection 12 | 13 | json_file: path to the ENA assembly metadata JSON 14 | 15 | --> retains genomes with full MLST (excluding "-") 16 | 17 | * filename: Json1_ENA_metadata 18 | * file address: https://figshare.com/ndownloader/files/26578377 19 | * data citation: 20 | Blackwell, Grace; Hunt, Martin; Malone, Kerri; Lima, Leandro; Horesh, Gal; T. F. Alako, Blaise; et al. (2021): 21 | "Exploring bacterial diversity via a curated and searchable snapshot of archived DNA sequences". 22 | Dataset. https://doi.org/10.6084/m9.figshare.14061752.v1 23 | """ 24 | 25 | data = [] 26 | with json_file.open() as file: 27 | parser = ijson.parse(file) 28 | i = 0 29 | n = 0 30 | current_genome = "" 31 | for prefix, event, value in parser: 32 | if event == 'start_map' and "." not in prefix: 33 | current_genome = prefix 34 | bracken = ("", 0.0) 35 | completeness = None 36 | contamination = None 37 | mlst = None 38 | mlst_species = None 39 | keep = False 40 | heterogeneity = None 41 | if "checkm_results.Completeness" in prefix: 42 | completeness = value 43 | if "checkm_results.Contamination" in prefix: 44 | contamination = value 45 | if "checkm_results.Strain_heterogeneity" in prefix: 46 | heterogeneity = value 47 | if "mlst_results.species" in prefix: 48 | mlst_species = value 49 | if "mlst_results.mlst" in prefix: 50 | mlst = value 51 | if mlst != "-": 52 | keep = True 53 | if "bracken" in prefix and event == "string" and value not in ("NA", "-"): 54 | species = prefix.replace(f"{current_genome}.bracken.", "") 55 | abundance = float(value) 56 | if abundance > bracken[1]: 57 | bracken = (species, abundance) 58 | if event == 'end_map' and "." not in prefix: 59 | if keep: 60 | data.append([ 61 | current_genome, 62 | bracken[0], 63 | bracken[1], 64 | float(completeness), 65 | float(contamination), 66 | float(heterogeneity), 67 | mlst, 68 | mlst_species 69 | ]) 70 | n += 1 71 | # Reset variables to make sure none are 72 | # accidentally used in the next entry 73 | bracken = ("", 0.0) 74 | completeness = None 75 | contamination = None 76 | mlst = None 77 | mlst_species = None 78 | keep = False 79 | heterogeneity = None 80 | # if i == 100000: 81 | # continue # test break 82 | i += 1 83 | 84 | df = pandas.DataFrame( 85 | data, columns=[ 86 | "accession", 87 | "bracken_species", 88 | "bracken_abundance", 89 | "completeness", 90 | "contamination", 91 | "heterogeneity", 92 | "mlst", 93 | "mlst_species" 94 | ] 95 | ) 96 | df.to_csv("meta.tsv", sep="\t", index=False) 97 | 98 | 99 | def clean_metadata(meta_file: Path, assembly_paths: Path): 100 | 101 | """ 102 | Obtain a clean subset of the assembled genomes filtered by assembly quality 103 | 104 | * filter genomes: contamination > 1. && completeness < 99. && heterogeneity > 0.1 105 | * species genome counts, separated into total and > 100 106 | * FTP paths to species assemblies on the EMBL ENA server 107 | 108 | EMBL EBI address: ftp.ebi.ac.uk/pub/databases/ENA2018-bacteria-661k/sampleid_assembly_paths.txt 109 | """ 110 | 111 | with meta_file.open() as meta_file: 112 | df = pandas.read_csv(meta_file, header=0, sep='\t') 113 | contaminated = df[df['contamination'] > 0.1] 114 | fragmented = df[df['completeness'] < 99.] 
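        # The three quality filters above and below flag genomes with CheckM contamination
        # above 0.1, completeness below 99, or strain heterogeneity above 0.1; any genome
        # matching at least one filter is excluded from the clean subset further down.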
115 | heterogenous = df[df['heterogeneity'] > 0.1] 116 | 117 | all_exclude = pandas.concat((contaminated, fragmented, heterogenous)) 118 | unique_exclude = all_exclude.index.unique() 119 | 120 | df_clean = df[~df.index.isin(unique_exclude)] 121 | 122 | species_counts_100 = [] 123 | species_counts_all = [] 124 | for species, data in df_clean.groupby("bracken_species"): 125 | n = len(data) 126 | species_counts_all.append([species, n]) 127 | if n >= 100: 128 | species_counts_100.append([species, n]) 129 | df_clean.to_csv("meta_clean.tsv", index=False, sep="\t") 130 | 131 | df_species_all = pandas.DataFrame(species_counts_all, columns=["species", "n"]).sort_values("n") 132 | df_species_100 = pandas.DataFrame(species_counts_100, columns=["species", "n"]).sort_values("n") 133 | 134 | df_species_all.to_csv("species_counts.tsv", index=False, sep="\t") 135 | df_species_100.to_csv("species_counts_100.tsv", index=False, sep="\t") 136 | 137 | assembly_paths = pandas.read_csv(assembly_paths, sep='\t', header=None, names=['id', 'path']) 138 | 139 | spec_paths = [] 140 | specs = [] 141 | for species, species_df in df_clean.groupby("bracken_species"): 142 | ftp_paths = assembly_paths[assembly_paths['id'].isin(species_df['accession'])] 143 | ftp_paths['path'] = [p.replace("/ebi/ftp", "http://ftp.ebi.ac.uk") for p in ftp_paths["path"]] 144 | 145 | species_paths = ftp_paths.drop(columns="id") 146 | name = f"{species.lower().replace(' ', '_')}" 147 | species_paths['species'] = [name for _ in species_paths.iterrows()] 148 | species_df['species'] = [name for _ in species_df.iterrows()] 149 | spec_paths.append(species_paths) 150 | specs.append(species_df) 151 | 152 | species = pandas.concat(specs) 153 | species.to_csv(f"species_data.tsv", sep='\t', index=False) 154 | 155 | species_paths = pandas.concat(spec_paths) 156 | species_paths.to_csv(f"species_ftp.tsv", sep='\t', index=False, header=False) 157 | 158 | -------------------------------------------------------------------------------- /scripts/metrics.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import warnings 3 | import json 4 | from numpy import nan 5 | from typing import List 6 | from pathlib import Path 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import precision_score 9 | from sklearn.metrics import recall_score 10 | from sklearn.metrics import confusion_matrix 11 | 12 | # in multi label metrics with many labels (e.g. 
sccmec type) 13 | # warnings can be raised because of rare labels that are 14 | # never predicted; these are ignored 15 | warnings.filterwarnings('ignore') 16 | 17 | SKIP_COLUMNS = ["name", "reads", "sketch_id", "shared_hashes"] 18 | 19 | 20 | def compute_metrics( 21 | reference: Path, 22 | prediction: Path = None, 23 | label_config: Path = None, 24 | default_binary: List = ("R", "S"), 25 | output: Path = Path("metrics.tsv"), 26 | verbose: bool = True 27 | ): 28 | """ 29 | Computes summary metrics for predictions against a structured reference 30 | 31 | :param reference: a tab-delimited file with headers containing reference classifications 32 | * must contain a column `name` for file name matching with the prediction file 33 | 34 | :param prediction: a tab-delimited file with headers containing prediction classifications 35 | * must contain a column 'name' with file names matching those in the reference file 36 | * this can be the output from the batch prediction pipeline 37 | 38 | :param label_config: a JSON file in the following structure: 39 | 40 | { 41 | "<column>": { 42 | "binary": true/false, 43 | "labels": ["<positive>", "<negative>"] or null 44 | } 45 | } 46 | If <binary> is true, then a list of two labels can be provided, which correspond to the 47 | positive and negative values for the confusion matrix. 48 | 49 | { 50 | "mlst": { 51 | "binary": false, 52 | "labels": null 53 | }, 54 | "pvl": { 55 | "binary": true, 56 | "labels": ["PVL+", "PVL-"] 57 | }, 58 | "mrsa": { 59 | "binary": true, 60 | "labels": ["MRSA", "MSSA"] 61 | } 62 | } 63 | 64 | """ 65 | 66 | reference_df, prediction_df = process_data(reference=reference, prediction=prediction, verbose=verbose) 67 | 68 | if label_config is not None: 69 | config = read_label_config(file=label_config, default_binary=default_binary) 70 | else: 71 | config = {} 72 | 73 | metrics = get_metrics( 74 | reference=reference_df, prediction=prediction_df, config=config, default_binary=default_binary, verbose=verbose 75 | ) 76 | 77 | metrics.to_csv(output, sep='\t', index=False) 78 | print(metrics) 79 | print(metrics.accuracy.mean()) 80 | 81 | 82 | def process_data(reference: Path, prediction: Path, verbose: bool): 83 | """ 84 | Process reference and prediction tables, ensure that data is clean 85 | """ 86 | 87 | ref = pandas.read_csv(reference, sep="\t", header=0) 88 | pred = pandas.read_csv(prediction, sep="\t", header=0) 89 | 90 | # Retain only predictions that are also in the reference genotype data 91 | prediction_df_clean = pred[pred['name'].isin(ref['name'])] 92 | 93 | # Retain only references that are also in the predictions 94 | ref_df_clean = ref[ref['name'].isin(pred['name'])] 95 | 96 | removed_prediction = len(pred) - len(prediction_df_clean) 97 | removed_ref = len(ref) - len(ref_df_clean) 98 | 99 | if verbose: 100 | print(f"Removed {removed_prediction} entries from prediction data not present in reference data") 101 | print(f"Removed {removed_ref} entries from reference data not present in prediction data") 102 | 103 | # Sort the dataframes in the same order by name 104 | ref_df = ref_df_clean.sort_values('name') 105 | prediction_df = prediction_df_clean.sort_values('name') 106 | 107 | assert ref_df['name'].tolist() == prediction_df['name'].tolist() # names should be unique, therefore sortable 108 | 109 | # Make sure all columns in prediction are present in reference: 110 | 111 | ref_columns = ref_df.columns.tolist() 112 | for column in prediction_df: 113 | if column in SKIP_COLUMNS: 114 | continue 115 | else: 116 | if column not in ref_columns: 117 | raise ValueError(f"Column 
`{column}` not in reference data") 118 | 119 | # Following should be done in the reference genotype files later on! 120 | 121 | # For the S. aureus data predictions can be PVL* (which is PVL negative, missing one gene) 122 | # and resistances can include 'r' instead of 'R' - harmonize these by replacing with 123 | # appropriate value (r -> R, PVL* -> PVL-) 124 | 125 | ref_df = ref_df.replace('r', 'R') 126 | prediction_df = prediction_df.replace('r', 'R') 127 | 128 | if "pvl" in ref_df.columns and "pvl" in prediction_df.columns: 129 | ref_df = ref_df.replace('PVL*', 'PVL-') 130 | prediction_df = prediction_df.replace('PVL*', 'PVL-') 131 | 132 | if "scc" in ref_df.columns and "scc" in prediction_df.columns: 133 | # Also remove leading whitespace from SCCmec types 134 | ref_df['scc'] = ref_df.scc.str.strip() 135 | prediction_df['scc'] = prediction_df.scc.str.strip() 136 | 137 | if "meca" in ref_df.columns and "meca" in prediction_df.columns: 138 | # Drop mecA gene (assembly based) in favour of Mykrobe 139 | # methicillin typing from reads (same genotype) 140 | ref_df = ref_df.drop(columns='meca') 141 | prediction_df = prediction_df.drop(columns='meca') 142 | ref_df = ref_df.reset_index(drop=True) 143 | prediction_df = prediction_df.reset_index(drop=True) 144 | 145 | return ref_df, prediction_df 146 | 147 | 148 | def read_label_config(file: Path, default_binary: list) -> dict: 149 | """ 150 | Reads and validates the label configuration JSON 151 | """ 152 | 153 | with file.open() as json_file: 154 | label_config = json.load(json_file) 155 | 156 | conf = {} 157 | for column, config in label_config.items(): 158 | if "binary" not in config.keys(): 159 | raise ValueError(f"Could not find `binary` key in config for column: {column}") 160 | 161 | binary: bool = config["binary"] 162 | if not isinstance(binary, bool): 163 | raise ValueError(f"Binary value ({column}) is not a boolean") 164 | 165 | conf[column] = { 166 | "binary": binary, "labels": list() 167 | } 168 | 169 | if binary: 170 | if "labels" not in config.keys(): 171 | raise ValueError(f"Could not find `labels` key in config for column: {column}") 172 | 173 | labels: list = config["labels"] 174 | if not isinstance(labels, list): 175 | raise ValueError(f"Label value ({column}) is not one of: List or None") 176 | 177 | if not labels: 178 | print(f"Setting default binary labels ({column}): {default_binary}") 179 | labels = default_binary.copy() 180 | else: 181 | if len(labels) != 2: 182 | raise ValueError(f"Binary labels requires precisely two values ({column})") 183 | 184 | conf[column]["labels"] = labels 185 | 186 | return conf 187 | 188 | 189 | def get_metrics( 190 | reference: pandas.DataFrame, prediction: pandas.DataFrame, config: dict, default_binary: List, verbose: bool 191 | ) -> pandas.DataFrame: 192 | 193 | metrics = [] 194 | for column in prediction.columns: 195 | # Skip irrelevant columns 196 | if column in SKIP_COLUMNS: 197 | continue 198 | else: 199 | 200 | try: 201 | binary = config[column]["binary"] 202 | if verbose: 203 | print(f"Column ({column}) is binary: {binary}") 204 | except KeyError: 205 | if verbose: 206 | print(f"Column ({column}) is binary (default)") 207 | binary = True 208 | 209 | ref_vec = reference[column].tolist() # sorted in process_data 210 | pred_vec = prediction[column].tolist() 211 | 212 | if binary: 213 | try: 214 | labels = config[column]["labels"] 215 | if verbose: 216 | print(f"Column ({column}) labels: {', '.join(default_binary)}") 217 | except KeyError: 218 | if verbose: 219 | print(f"Column ({column}) 
labels: {', '.join(default_binary)} (default)") 220 | labels = default_binary.copy() 221 | 222 | tp, fp, tn, fn, acc, tpr, tnr, ppv, npv = compute_binary_metrics(ref_vec, pred_vec, labels) 223 | else: 224 | tp, fp, tn, fn, tnr = -1, -1, -1, -1, nan 225 | acc = accuracy_score(ref_vec, pred_vec) 226 | tpr = recall_score(ref_vec, pred_vec, average='weighted') 227 | ppv = precision_score(ref_vec, pred_vec, average='weighted') 228 | 229 | metrics.append([column, binary, tp, tn, fp, fn, acc, ppv, tpr, tnr]) 230 | 231 | return pandas.DataFrame( 232 | metrics, 233 | columns=[ 234 | "feature", "binary", "true_positives", "true_negatives", 235 | "false_positives", "false_negatives", "accuracy", "precision", 236 | "recall", "specificity" 237 | ] 238 | ) 239 | 240 | 241 | def compute_binary_metrics(ref: list, pred: list, labels: list): 242 | try: 243 | cm = confusion_matrix(ref, pred, labels=labels) 244 | except ValueError: 245 | print(ref, pred, labels) 246 | raise 247 | 248 | tp = int(cm[0][0]) 249 | fn = int(cm[0][1]) 250 | fp = int(cm[1][0]) 251 | tn = int(cm[1][1]) 252 | 253 | # In all cases, if either the numerator or 254 | # denominator is zero, the metric is undefined 255 | tpr = _metric(tp, (tp + fn)) 256 | tnr = _metric(tn, (tn + fp)) 257 | ppv = _metric(tp, (tp + fp)) 258 | npv = _metric(tn, (tn + fn)) 259 | acc = _metric((tp + tn), (tp + fp + fn + tn)) 260 | 261 | return tp, fp, tn, fn, acc, tpr, tnr, ppv, npv 262 | 263 | 264 | def _metric(numerator: int, denominator: int): 265 | 266 | if numerator == 0: 267 | return nan 268 | if denominator == 0: 269 | return nan 270 | 271 | return numerator/denominator 272 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "ansi_term" 13 | version = "0.11.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" 16 | dependencies = [ 17 | "winapi", 18 | ] 19 | 20 | [[package]] 21 | name = "anyhow" 22 | version = "1.0.44" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" 25 | 26 | [[package]] 27 | name = "atty" 28 | version = "0.2.13" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" 31 | dependencies = [ 32 | "libc", 33 | "winapi", 34 | ] 35 | 36 | [[package]] 37 | name = "autocfg" 38 | version = "1.0.1" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 41 | 42 | [[package]] 43 | name = "bincode" 44 | version = "1.3.3" 45 | source = "registry+https://github.com/rust-lang/crates.io-index" 46 | checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" 47 | dependencies = [ 48 | "serde", 49 | ] 50 | 51 | [[package]] 52 | name = "bitflags" 53 | version = "1.2.1" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" 56 | 57 | [[package]] 58 | name = "bstr" 59 | version = "0.2.17" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 62 | dependencies = [ 63 | "lazy_static", 64 | "memchr", 65 | "regex-automata", 66 | "serde", 67 | ] 68 | 69 | [[package]] 70 | name = "buf_redux" 71 | version = "0.8.4" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" 74 | dependencies = [ 75 | "memchr", 76 | "safemem", 77 | ] 78 | 79 | [[package]] 80 | name = "bytecount" 81 | version = "0.6.2" 82 | source = "registry+https://github.com/rust-lang/crates.io-index" 83 | checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" 84 | 85 | [[package]] 86 | name = "bzip2" 87 | version = "0.4.3" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" 90 | dependencies = [ 91 | "bzip2-sys", 92 | "libc", 93 | ] 94 | 95 | [[package]] 96 | name = "bzip2-sys" 97 | version = "0.1.11+1.0.8" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" 100 | dependencies = [ 101 | "cc", 102 | "libc", 103 | "pkg-config", 104 | ] 105 | 106 | [[package]] 107 | name = "capnp" 108 | version = "0.14.3" 109 | source = "registry+https://github.com/rust-lang/crates.io-index" 110 | checksum = "ae9b8a7119420b5279ddc2b4ee553ee15bcf4605df6135a26f03ffe153bee97c" 111 | 112 | [[package]] 113 | name = "cc" 114 | version = "1.0.70" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" 117 | 
118 | [[package]] 119 | name = "cfg-if" 120 | version = "1.0.0" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 123 | 124 | [[package]] 125 | name = "clap" 126 | version = "2.33.0" 127 | source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" 129 | dependencies = [ 130 | "ansi_term", 131 | "atty", 132 | "bitflags", 133 | "strsim", 134 | "textwrap", 135 | "unicode-width", 136 | "vec_map", 137 | ] 138 | 139 | [[package]] 140 | name = "crc32fast" 141 | version = "1.2.1" 142 | source = "registry+https://github.com/rust-lang/crates.io-index" 143 | checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" 144 | dependencies = [ 145 | "cfg-if", 146 | ] 147 | 148 | [[package]] 149 | name = "crossbeam-channel" 150 | version = "0.5.1" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 153 | dependencies = [ 154 | "cfg-if", 155 | "crossbeam-utils", 156 | ] 157 | 158 | [[package]] 159 | name = "crossbeam-deque" 160 | version = "0.8.1" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 163 | dependencies = [ 164 | "cfg-if", 165 | "crossbeam-epoch", 166 | "crossbeam-utils", 167 | ] 168 | 169 | [[package]] 170 | name = "crossbeam-epoch" 171 | version = "0.9.5" 172 | source = "registry+https://github.com/rust-lang/crates.io-index" 173 | checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" 174 | dependencies = [ 175 | "cfg-if", 176 | "crossbeam-utils", 177 | "lazy_static", 178 | "memoffset", 179 | "scopeguard", 180 | ] 181 | 182 | [[package]] 183 | name = "crossbeam-utils" 184 | version = "0.8.8" 185 | source = "registry+https://github.com/rust-lang/crates.io-index" 186 | checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" 187 | dependencies = [ 188 | "cfg-if", 189 | "lazy_static", 190 | ] 191 | 192 | [[package]] 193 | name = "csv" 194 | version = "1.1.6" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 197 | dependencies = [ 198 | "bstr", 199 | "csv-core", 200 | "itoa", 201 | "ryu", 202 | "serde", 203 | ] 204 | 205 | [[package]] 206 | name = "csv-core" 207 | version = "0.1.10" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 210 | dependencies = [ 211 | "memchr", 212 | ] 213 | 214 | [[package]] 215 | name = "either" 216 | version = "1.6.1" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 219 | 220 | [[package]] 221 | name = "finch" 222 | version = "0.4.1" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "0f5b421df230ee6000ccb42073103407a8b29c0adc2b5870a346d2fd6281ceec" 225 | dependencies = [ 226 | "bincode", 227 | "capnp", 228 | "memmap", 229 | "murmurhash3", 230 | "ndarray", 231 | "needletail", 232 | "rayon", 233 | "serde", 234 | "serde_json", 235 | "thiserror", 236 | ] 237 | 238 | [[package]] 239 | name = "flate2" 240 | version = "1.0.22" 241 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" 243 | dependencies = [ 244 | "cfg-if", 245 | "crc32fast", 246 | "libc", 247 | "miniz_oxide", 248 | ] 249 | 250 | [[package]] 251 | name = "heck" 252 | version = "0.3.3" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 255 | dependencies = [ 256 | "unicode-segmentation", 257 | ] 258 | 259 | [[package]] 260 | name = "hermit-abi" 261 | version = "0.1.19" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 264 | dependencies = [ 265 | "libc", 266 | ] 267 | 268 | [[package]] 269 | name = "itoa" 270 | version = "0.4.7" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" 273 | 274 | [[package]] 275 | name = "lazy_static" 276 | version = "1.4.0" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 279 | 280 | [[package]] 281 | name = "libc" 282 | version = "0.2.95" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" 285 | 286 | [[package]] 287 | name = "lzma-sys" 288 | version = "0.1.17" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "bdb4b7c3eddad11d3af9e86c487607d2d2442d185d848575365c4856ba96d619" 291 | dependencies = [ 292 | "cc", 293 | "libc", 294 | "pkg-config", 295 | ] 296 | 297 | [[package]] 298 | name = "matrixmultiply" 299 | version = "0.2.4" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "916806ba0031cd542105d916a97c8572e1fa6dd79c9c51e7eb43a09ec2dd84c1" 302 | dependencies = [ 303 | "rawpointer", 304 | ] 305 | 306 | [[package]] 307 | name = "memchr" 308 | version = "2.4.0" 309 | source = "registry+https://github.com/rust-lang/crates.io-index" 310 | checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" 311 | 312 | [[package]] 313 | name = "memmap" 314 | version = "0.7.0" 315 | source = "registry+https://github.com/rust-lang/crates.io-index" 316 | checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" 317 | dependencies = [ 318 | "libc", 319 | "winapi", 320 | ] 321 | 322 | [[package]] 323 | name = "memoffset" 324 | version = "0.6.4" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" 327 | dependencies = [ 328 | "autocfg", 329 | ] 330 | 331 | [[package]] 332 | name = "miniz_oxide" 333 | version = "0.4.4" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 336 | dependencies = [ 337 | "adler", 338 | "autocfg", 339 | ] 340 | 341 | [[package]] 342 | name = "murmurhash3" 343 | version = "0.0.5" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" 346 | 347 | [[package]] 348 | name = "ndarray" 349 | version = "0.14.0" 350 | source = "registry+https://github.com/rust-lang/crates.io-index" 351 | checksum = 
"6c0d5c9540a691d153064dc47a4db2504587a75eae07bf1d73f7a596ebc73c04" 352 | dependencies = [ 353 | "matrixmultiply", 354 | "num-complex", 355 | "num-integer", 356 | "num-traits", 357 | "rawpointer", 358 | ] 359 | 360 | [[package]] 361 | name = "needletail" 362 | version = "0.4.1" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "7fb4c43ebd04b0e776119c8fc3bd4c28178619cd04e1f19f600a4ef0282fa3cc" 365 | dependencies = [ 366 | "buf_redux", 367 | "bytecount", 368 | "bzip2", 369 | "flate2", 370 | "memchr", 371 | "xz2", 372 | ] 373 | 374 | [[package]] 375 | name = "num-complex" 376 | version = "0.3.1" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "747d632c0c558b87dbabbe6a82f3b4ae03720d0646ac5b7b4dae89394be5f2c5" 379 | dependencies = [ 380 | "num-traits", 381 | ] 382 | 383 | [[package]] 384 | name = "num-integer" 385 | version = "0.1.44" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" 388 | dependencies = [ 389 | "autocfg", 390 | "num-traits", 391 | ] 392 | 393 | [[package]] 394 | name = "num-traits" 395 | version = "0.2.14" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 398 | dependencies = [ 399 | "autocfg", 400 | ] 401 | 402 | [[package]] 403 | name = "num_cpus" 404 | version = "1.13.0" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" 407 | dependencies = [ 408 | "hermit-abi", 409 | "libc", 410 | ] 411 | 412 | [[package]] 413 | name = "pkg-config" 414 | version = "0.3.20" 415 | source = "registry+https://github.com/rust-lang/crates.io-index" 416 | checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb" 417 | 418 | [[package]] 419 | name = "proc-macro-error" 420 | version = "1.0.4" 421 | source = "registry+https://github.com/rust-lang/crates.io-index" 422 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 423 | dependencies = [ 424 | "proc-macro-error-attr", 425 | "proc-macro2", 426 | "quote", 427 | "syn", 428 | "version_check", 429 | ] 430 | 431 | [[package]] 432 | name = "proc-macro-error-attr" 433 | version = "1.0.4" 434 | source = "registry+https://github.com/rust-lang/crates.io-index" 435 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 436 | dependencies = [ 437 | "proc-macro2", 438 | "quote", 439 | "version_check", 440 | ] 441 | 442 | [[package]] 443 | name = "proc-macro2" 444 | version = "1.0.29" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" 447 | dependencies = [ 448 | "unicode-xid", 449 | ] 450 | 451 | [[package]] 452 | name = "quote" 453 | version = "1.0.9" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" 456 | dependencies = [ 457 | "proc-macro2", 458 | ] 459 | 460 | [[package]] 461 | name = "rawpointer" 462 | version = "0.2.1" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" 465 | 466 | [[package]] 467 | name = "rayon" 468 | version = "1.5.1" 469 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" 471 | dependencies = [ 472 | "autocfg", 473 | "crossbeam-deque", 474 | "either", 475 | "rayon-core", 476 | ] 477 | 478 | [[package]] 479 | name = "rayon-core" 480 | version = "1.9.1" 481 | source = "registry+https://github.com/rust-lang/crates.io-index" 482 | checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" 483 | dependencies = [ 484 | "crossbeam-channel", 485 | "crossbeam-deque", 486 | "crossbeam-utils", 487 | "lazy_static", 488 | "num_cpus", 489 | ] 490 | 491 | [[package]] 492 | name = "regex-automata" 493 | version = "0.1.10" 494 | source = "registry+https://github.com/rust-lang/crates.io-index" 495 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 496 | 497 | [[package]] 498 | name = "ryu" 499 | version = "1.0.5" 500 | source = "registry+https://github.com/rust-lang/crates.io-index" 501 | checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" 502 | 503 | [[package]] 504 | name = "safemem" 505 | version = "0.3.3" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" 508 | 509 | [[package]] 510 | name = "scopeguard" 511 | version = "1.1.0" 512 | source = "registry+https://github.com/rust-lang/crates.io-index" 513 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 514 | 515 | [[package]] 516 | name = "serde" 517 | version = "1.0.126" 518 | source = "registry+https://github.com/rust-lang/crates.io-index" 519 | checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" 520 | dependencies = [ 521 | "serde_derive", 522 | ] 523 | 524 | [[package]] 525 | name = "serde_derive" 526 | version = "1.0.126" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" 529 | dependencies = [ 530 | "proc-macro2", 531 | "quote", 532 | "syn", 533 | ] 534 | 535 | [[package]] 536 | name = "serde_json" 537 | version = "1.0.64" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" 540 | dependencies = [ 541 | "itoa", 542 | "ryu", 543 | "serde", 544 | ] 545 | 546 | [[package]] 547 | name = "sketchy-rs" 548 | version = "0.6.0" 549 | dependencies = [ 550 | "anyhow", 551 | "clap", 552 | "csv", 553 | "finch", 554 | "needletail", 555 | "rayon", 556 | "structopt", 557 | "thiserror", 558 | ] 559 | 560 | [[package]] 561 | name = "strsim" 562 | version = "0.8.0" 563 | source = "registry+https://github.com/rust-lang/crates.io-index" 564 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 565 | 566 | [[package]] 567 | name = "structopt" 568 | version = "0.3.23" 569 | source = "registry+https://github.com/rust-lang/crates.io-index" 570 | checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" 571 | dependencies = [ 572 | "clap", 573 | "lazy_static", 574 | "structopt-derive", 575 | ] 576 | 577 | [[package]] 578 | name = "structopt-derive" 579 | version = "0.4.16" 580 | source = "registry+https://github.com/rust-lang/crates.io-index" 581 | checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" 582 | dependencies = [ 583 | "heck", 584 | "proc-macro-error", 585 | "proc-macro2", 586 | "quote", 587 | 
"syn", 588 | ] 589 | 590 | [[package]] 591 | name = "syn" 592 | version = "1.0.77" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0" 595 | dependencies = [ 596 | "proc-macro2", 597 | "quote", 598 | "unicode-xid", 599 | ] 600 | 601 | [[package]] 602 | name = "textwrap" 603 | version = "0.11.0" 604 | source = "registry+https://github.com/rust-lang/crates.io-index" 605 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 606 | dependencies = [ 607 | "unicode-width", 608 | ] 609 | 610 | [[package]] 611 | name = "thiserror" 612 | version = "1.0.29" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" 615 | dependencies = [ 616 | "thiserror-impl", 617 | ] 618 | 619 | [[package]] 620 | name = "thiserror-impl" 621 | version = "1.0.29" 622 | source = "registry+https://github.com/rust-lang/crates.io-index" 623 | checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" 624 | dependencies = [ 625 | "proc-macro2", 626 | "quote", 627 | "syn", 628 | ] 629 | 630 | [[package]] 631 | name = "unicode-segmentation" 632 | version = "1.8.0" 633 | source = "registry+https://github.com/rust-lang/crates.io-index" 634 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 635 | 636 | [[package]] 637 | name = "unicode-width" 638 | version = "0.1.7" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" 641 | 642 | [[package]] 643 | name = "unicode-xid" 644 | version = "0.2.2" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 647 | 648 | [[package]] 649 | name = "vec_map" 650 | version = "0.8.1" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" 653 | 654 | [[package]] 655 | name = "version_check" 656 | version = "0.9.3" 657 | source = "registry+https://github.com/rust-lang/crates.io-index" 658 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 659 | 660 | [[package]] 661 | name = "winapi" 662 | version = "0.3.8" 663 | source = "registry+https://github.com/rust-lang/crates.io-index" 664 | checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" 665 | dependencies = [ 666 | "winapi-i686-pc-windows-gnu", 667 | "winapi-x86_64-pc-windows-gnu", 668 | ] 669 | 670 | [[package]] 671 | name = "winapi-i686-pc-windows-gnu" 672 | version = "0.4.0" 673 | source = "registry+https://github.com/rust-lang/crates.io-index" 674 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 675 | 676 | [[package]] 677 | name = "winapi-x86_64-pc-windows-gnu" 678 | version = "0.4.0" 679 | source = "registry+https://github.com/rust-lang/crates.io-index" 680 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 681 | 682 | [[package]] 683 | name = "xz2" 684 | version = "0.1.6" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" 687 | dependencies = [ 688 | "lzma-sys", 689 | ] 690 | -------------------------------------------------------------------------------- 
/src/sketchy.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use finch::serialization::{
3 |     read_finch_file, read_mash_file, write_finch_file, write_mash_file, Sketch,
4 | };
5 | use finch::sketch_schemes::{KmerCount, SketchParams};
6 | use finch::statistics::cardinality;
7 | use needletail::{parse_fastx_file, parse_fastx_stdin, FastxReader};
8 | use rayon::prelude::*;
9 | use std::cmp::Ordering;
10 | use std::collections::HashMap;
11 | use std::fs::File;
12 | use std::io::prelude::*;
13 | use std::io::BufReader;
14 | use std::path::{Path, PathBuf};
15 | use thiserror::Error;
16 | 
17 | #[derive(Error, Debug)]
18 | pub enum SketchyError {
19 |     #[error("reference sketch identifier {0} does not match genotype identifier {1} at line {2}")]
20 |     InvalidIdentifier(String, String, String),
21 |     #[error("reference sketch and genotype table must have the same length")]
22 |     InvalidSize,
23 |     #[error("reference sketch file must have Mash (.msh) or Finch (.fsh) extension")]
24 |     InvalidExtension,
25 |     #[error("reference ({0}) {1} ({2}) does not match query ({3}) {1} ({4})")]
26 |     InvalidSketchMatch(String, String, String, String, String),
27 |     #[error("consensus genotype could not be computed")]
28 |     InvalidConsensusGenotype,
29 |     #[error("--top must be an odd number when using --consensus")]
30 |     InvalidConsensusTop,
31 |     #[error("failed to open file")]
32 |     IOError(#[from] std::io::Error),
33 |     #[error("failed to open file with Finch")]
34 |     FinchError(#[from] finch::errors::FinchError),
35 |     #[error("failed to open genotype file or record with CSV")]
36 |     GenotypeTableError(#[from] csv::Error),
37 |     #[error("failed to open Fastx file or record with Needletail")]
38 |     FastxError(#[from] needletail::errors::ParseError),
39 | }
40 | 
41 | /// A `Struct` used for configuring the
42 | /// parameters of the predict method
43 | #[derive(Debug, PartialEq, Eq)]
44 | pub struct PredictConfig {
45 |     pub top: usize,
46 |     pub limit: usize,
47 |     pub stream: bool,
48 |     pub consensus: bool,
49 |     pub header: bool,
50 | }
51 | 
52 | /// A `Struct` used for building the formatted
53 | /// Sketchy reference sketch
54 | #[derive(Debug, PartialEq, Eq)]
55 | pub struct Sketchy {}
56 | 
57 | impl Sketchy {
58 |     /// Create a new reference sketch instance
59 |     pub fn new() -> Self {
60 |         Sketchy {}
61 |     }
62 | 
63 |     /// Prediction method for Sketchy
64 |     ///
65 |     /// Computes shared hashes between the input reads and each reference sketch and prints the top genotype predictions.
66 |     pub fn predict(
67 |         &self,
68 |         fastx: Option<PathBuf>,
69 |         reference: PathBuf,
70 |         genotypes: PathBuf,
71 |         config: PredictConfig,
72 |     ) -> Result<(), SketchyError> {
73 |         // First check that top_results is odd when consensus is true
74 |         if config.consensus {
75 |             let is_odd = matches!(config.top % 2, 1);
76 |             if !is_odd {
77 |                 return Err(SketchyError::InvalidConsensusTop);
78 |             }
79 |         }
80 | 
81 |         let reference_sketches = self._read_sketch(reference)?;
82 |         let reference_params = &reference_sketches[0].sketch_params;
83 | 
84 |         let mut min_scale = 0.;
85 |         if let Some(ref_scale) = reference_params.hash_info().3 {
86 |             min_scale = ref_scale;
87 |         }
88 | 
89 |         let fastx_reader = match fastx {
90 |             Some(file) => parse_fastx_file(file)?,
91 |             None => parse_fastx_stdin()?,
92 |         };
93 | 
94 |         let (geno, header) = self._read_genotypes(&genotypes)?;
95 |         let geno_map = self._genotype_hashmap(geno)?;
96 | 
97 |         // Header is printed here, regardless of streaming or direct prediction mode
98 | 
99 |         if config.header {
100 |             println!("reads\tsketch_id\tshared_hashes\t{}", header);
101 |         }
102 | 
103 |         if config.stream {
104 |             self._sum_of_shared_hashes(
105 |                 fastx_reader,
106 |                 reference_params,
107 |                 &reference_sketches,
108 |                 geno_map,
109 |                 min_scale,
110 |                 config,
111 |             )?;
112 |         } else {
113 |             self._shared_hashes(
114 |                 fastx_reader,
115 |                 reference_params,
116 |                 &reference_sketches,
117 |                 geno_map,
118 |                 min_scale,
119 |                 config,
120 |             )?;
121 |         }
122 | 
123 |         Ok(())
124 |     }
125 |     /// Sketch building method for Sketchy
126 |     ///
127 |     /// Sketches genome assemblies (paths given as arguments or on stdin) and writes a Mash (.msh) or Finch scaled (.fsh) sketch file.
128 |     pub fn sketch(
129 |         &self,
130 |         input: Option<Vec<PathBuf>>,
131 |         output: PathBuf,
132 |         sketch_size: usize,
133 |         kmer_size: u8,
134 |         seed: u64,
135 |         scale: f64,
136 |     ) -> Result<(), SketchyError> {
137 |         let files = match input {
138 |             None => {
139 |                 let stdin = std::io::stdin();
140 |                 let files = stdin
141 |                     .lock()
142 |                     .lines()
143 |                     .map(|x| PathBuf::from(x.unwrap()))
144 |                     .collect();
145 |                 files
146 |             }
147 |             Some(f) => f,
148 |         };
149 | 
150 |         let sketch_params =
151 |             self._get_sketch_params_from_extension(&output, sketch_size, kmer_size, scale, seed)?;
152 | 
153 |         let mut writer = File::create(&output)?;
154 |         let sketches = self._sketch_files(&sketch_params, files)?;
155 | 
156 |         match &sketch_params {
157 |             SketchParams::Mash { .. } => {
158 |                 write_mash_file(&mut writer, &sketches)?;
159 |             }
160 |             SketchParams::Scaled { .. } => {
161 |                 write_finch_file(&mut writer, &sketches)?;
162 |             }
163 |             SketchParams::AllCounts { .. } => unreachable!(), // AllCounts params are never produced here
164 |         }
165 | 
166 |         Ok(())
167 |     }
168 |     /// Information method for Sketchy
169 |     ///
170 |     /// Given a sketch input, list the sketch identifiers, the size of the sequence the
171 |     /// sketch was built from (bp) and the estimated uniqueness of the sequence (bp)
172 |     pub fn info(&self, input: PathBuf, build: bool) -> Result<(), SketchyError> {
173 |         let sketches = self._read_sketch(input)?;
174 | 
175 |         if build {
176 |             let rep_sketch = &sketches[0]; // assumes all sketch params same
177 |             let rep_params = &rep_sketch.sketch_params;
178 | 
179 |             match rep_params {
180 |                 SketchParams::Scaled {
181 |                     kmers_to_sketch,
182 |                     kmer_length,
183 |                     scale,
184 |                     hash_seed,
185 |                 } => {
186 |                     println! {"type=scaled sketch_size={:} kmer_size={:} scale={:} seed={:}", kmers_to_sketch, kmer_length, scale, hash_seed}
{"type=scaled sketch_size={:} kmer_size={:} scale={:} seed={:}", kmers_to_sketch, kmer_length, scale, hash_seed} 187 | } 188 | SketchParams::Mash { 189 | kmers_to_sketch, 190 | final_size: _, 191 | kmer_length, 192 | no_strict: _, 193 | hash_seed, 194 | } => { 195 | println! {"type=mash sketch_size={:} kmer_size={:} seed={:}", kmers_to_sketch, kmer_length, hash_seed} 196 | } 197 | SketchParams::AllCounts { .. } => unimplemented!(), 198 | }; 199 | } else { 200 | for sketch in &sketches { 201 | let kmers = &sketch.hashes; 202 | if let Ok(c) = cardinality(kmers) { 203 | println!("{} {} {}", &sketch.name, &sketch.seq_length, c); 204 | } 205 | } 206 | } 207 | Ok(()) 208 | } 209 | /// Checks that sketches in the input refernce sketch and identifiers in 210 | /// the genotype table are in the same order and that the reference sketch 211 | /// collection and genotype table are of the same size 212 | pub fn check(&self, input: PathBuf, genotypes: PathBuf) -> Result<(), SketchyError> { 213 | let sketches = self._read_sketch(input)?; 214 | let (genotype_data, _) = self 215 | ._read_genotypes(&genotypes) 216 | .expect("Could not read genotype file"); 217 | 218 | // Check for same order of identifiers in sketch and genotype table 219 | for (i, (sketch, genotype)) in sketches.iter().zip(&genotype_data).enumerate() { 220 | match sketch.name == genotype[0] { 221 | true => continue, 222 | false => SketchyError::InvalidIdentifier( 223 | sketch.name.to_owned(), 224 | genotype[0].to_owned(), 225 | i.to_string(), 226 | ), 227 | }; 228 | } 229 | // Check for same size of sketch collection and genotype table 230 | if sketches.len() != genotype_data.len() { 231 | Err(SketchyError::InvalidSize) 232 | } else { 233 | println!("ok"); 234 | Ok(()) 235 | } 236 | } 237 | /// Compute and print shared hashes between reference and query sketches 238 | pub fn shared(&self, reference: PathBuf, query: PathBuf) -> Result<(), SketchyError> { 239 | let reference_sketches = self._read_sketch(reference)?; 240 | let query_sketches = self._read_sketch(query)?; 241 | 242 | // Scale of sketches is inferred from first sketch in file --> might need to implement 243 | // an empty file check which is not implemented in the finch::readers 244 | let mut min_scale = 0.; 245 | if let Some(scale1) = &query_sketches[0].sketch_params.hash_info().3 { 246 | if let Some(scale2) = &reference_sketches[0].sketch_params.hash_info().3 { 247 | min_scale = f64::min(*scale1, *scale2); 248 | } 249 | } 250 | // Pairwise shared hashes computation 251 | for ref_sketch in &reference_sketches { 252 | for query_sketch in &query_sketches { 253 | let compatible = ref_sketch 254 | .sketch_params 255 | .check_compatibility(&query_sketch.sketch_params); 256 | let common = match compatible { 257 | None => Ok(self._common_hashes( 258 | &ref_sketch.hashes, 259 | &query_sketch.hashes, 260 | min_scale, 261 | )), 262 | Some(incomp) => Err(SketchyError::InvalidSketchMatch( 263 | ref_sketch.name.to_owned(), 264 | incomp.0.to_string(), 265 | incomp.1, 266 | query_sketch.name.to_owned(), 267 | incomp.2, 268 | )), 269 | }; 270 | println!( 271 | "{:} {:} {:}", 272 | ref_sketch.name, 273 | query_sketch.name, 274 | common.unwrap() 275 | ); // unwrap should be safe here 276 | } 277 | } 278 | Ok(()) 279 | } 280 | 281 | fn _shared_hashes( 282 | &self, 283 | mut fastx_reader: Box, 284 | reference_params: &SketchParams, 285 | reference_sketches: &[Sketch], 286 | geno_map: HashMap>, 287 | min_scale: f64, 288 | config: PredictConfig, 289 | ) -> Result<(), SketchyError> { 290 | // Sketcher 
291 |         let mut sketcher = reference_params.create_sketcher();
292 |         // Kmers are extracted for all reads
293 |         let mut read = 0;
294 |         while let Some(record) = fastx_reader.next() {
295 |             sketcher.process(record?);
296 |             read += 1;
297 |             if read == config.limit {
298 |                 break;
299 |             }
300 |         }
301 |         // Hashed kmers and counts are extracted across all reads
302 |         let read_hashes = sketcher.to_vec();
303 | 
304 |         let mut result_vec = vec![];
305 |         for ref_sketch in reference_sketches {
306 |             // Shared hashes are computed for each ref sketch
307 |             let shared_hashes = self._common_hashes(&ref_sketch.hashes, &read_hashes, min_scale);
308 |             result_vec.push((&ref_sketch.name, shared_hashes, &geno_map[&ref_sketch.name]));
309 |         }
310 |         result_vec.sort_by(|a, b| b.1.cmp(&a.1));
311 | 
312 |         self._print_results(result_vec, read, config.top, config.consensus)?;
313 | 
314 |         Ok(())
315 |     }
316 | 
317 |     fn _sum_of_shared_hashes(
318 |         &self,
319 |         mut fastx_reader: Box<dyn FastxReader>,
320 |         reference_params: &SketchParams,
321 |         reference_sketches: &[Sketch],
322 |         geno_map: HashMap<String, Vec<String>>,
323 |         min_scale: f64,
324 |         config: PredictConfig,
325 |     ) -> Result<(), SketchyError> {
326 |         let mut sum_of_shared_hashes = vec![0; reference_sketches.len()];
327 |         let mut read = 1;
328 |         while let Some(record) = fastx_reader.next() {
329 |             let mut result_vec = vec![];
330 |             // At each read we create a new sketcher based on the reference sketch
331 |             let mut sketcher = reference_params.create_sketcher();
332 |             // Kmers are then extracted for the record and hashed
333 |             sketcher.process(record?);
334 |             // Hashed kmers and counts are extracted for this read
335 |             let read_hashes = sketcher.to_vec();
336 |             // With each read, we compute the shared hashes with the reference sketch
337 |             for (i, ref_sketch) in reference_sketches.iter().enumerate() {
338 |                 let shared_hashes =
339 |                     self._common_hashes(&ref_sketch.hashes, &read_hashes, min_scale);
340 |                 // Finally the sum of shared hashes is updated
341 |                 sum_of_shared_hashes[i] += shared_hashes;
342 |                 result_vec.push((
343 |                     &ref_sketch.name,
344 |                     sum_of_shared_hashes[i],
345 |                     &geno_map[&ref_sketch.name],
346 |                 ));
347 |             }
348 |             result_vec.sort_by(|a, b| b.1.cmp(&a.1));
349 |             self._print_results(result_vec, read, config.top, config.consensus)?;
350 |             read += 1;
351 |             if read == config.limit + 1 {
352 |                 break;
353 |             }
354 |         }
355 |         Ok(())
356 |     }
357 | 
358 |     fn _print_results(
359 |         &self,
360 |         result_vec: Vec<(&String, u64, &Vec<String>)>,
361 |         read: usize,
362 |         top_results: usize,
363 |         consensus: bool,
364 |     ) -> Result<(), SketchyError> {
365 |         if consensus {
366 |             // For consensus calling, ignore reference genome names and shared hashes
367 |             // and for each genotype feature, gather the calls in a new vector
368 |             let ngenotypes = result_vec[0].2.len();
369 |             let mut genotype_features: Vec<Vec<&String>> = vec![Vec::new(); ngenotypes];
370 | 
371 |             for (_, _, genotypes) in result_vec[..top_results].iter() {
372 |                 for (j, genotype) in genotypes.iter().enumerate() {
373 |                     genotype_features[j].push(genotype)
374 |                 }
375 |             }
376 |             // For each feature consensus vector, call the most frequent value
377 |             // as the consensus. CLI implements a strict rule for only using
378 |             // odd --top values when using consensus calling
379 |             let mut consensus_genotype: Vec<String> = Vec::new();
380 |             for consensus_feature in genotype_features.iter() {
381 |                 let mut counts = HashMap::new();
382 |                 for genotype in consensus_feature {
383 |                     *counts.entry(genotype).or_insert(0) += 1;
384 |                 }
385 |                 let consensus_value = self._get_consensus_value(counts)?;
386 |                 consensus_genotype.push(consensus_value)
387 |             }
388 |             println!("{:}\t-\t-\t{:}", read, consensus_genotype.join("\t"));
389 |         } else {
390 |             // If not computing consensus simply iterate over top results and print to console
391 |             for (name, shared_hashes, genotype) in result_vec[..top_results].iter() {
392 |                 println!(
393 |                     "{:}\t{:}\t{:}\t{:}",
394 |                     read,
395 |                     name,
396 |                     shared_hashes,
397 |                     genotype.join("\t")
398 |                 );
399 |             }
400 |         }
401 |         Ok(())
402 |     }
403 | 
404 |     fn _get_consensus_value(
405 |         &self,
406 |         counts: HashMap<&&String, usize>,
407 |     ) -> Result<String, SketchyError> {
408 |         let consensus_value = counts.iter().max_by(|a, b| a.1.cmp(b.1)).map(|(k, _v)| k);
409 |         match consensus_value {
410 |             Some(value) => Ok(value.to_string()),
411 |             None => Err(SketchyError::InvalidConsensusGenotype),
412 |         }
413 |     }
414 |     /// Analogue of `finch::distance::raw_distance` reduced to extracting common hashes
415 |     ///
416 |     /// Assumes hashes are sorted - not sure if we need to implement the check here, in particular
417 |     /// because we implement a read-by-read shared hashes computation in the prediction method
418 |     /// and this may increase cost in the end. Need to test.
419 |     fn _common_hashes(
420 |         &self,
421 |         ref_hashes: &[KmerCount],
422 |         query_hashes: &[KmerCount],
423 |         min_scale: f64,
424 |     ) -> u64 {
425 |         let mut i: usize = 0;
426 |         let mut j: usize = 0;
427 |         let mut common: u64 = 0;
428 |         while let (Some(query), Some(refer)) = (query_hashes.get(i), ref_hashes.get(j)) {
429 |             match query.hash.cmp(&refer.hash) {
430 |                 Ordering::Less => i += 1,
431 |                 Ordering::Greater => j += 1,
432 |                 Ordering::Equal => {
433 |                     common += 1;
434 |                     i += 1;
435 |                     j += 1;
436 |                 }
437 |             }
438 |         }
439 |         // At this point we've exhausted one of the two sketches, but we may have
440 |         // more counts in the other to compare if these were scaled sketches
441 |         if min_scale > 0. {
442 |             let max_hash = u64::max_value() / min_scale.recip() as u64;
443 |             while query_hashes
444 |                 .get(i)
445 |                 .map(|kmer_count| kmer_count.hash < max_hash)
446 |                 .unwrap_or(false)
447 |             {
448 |                 i += 1;
449 |             }
450 |             while ref_hashes
451 |                 .get(j)
452 |                 .map(|kmer_count| kmer_count.hash < max_hash)
453 |                 .unwrap_or(false)
454 |             {
455 |                 j += 1;
456 |             }
457 |         }
458 |         common
459 |     }
460 | 
461 |     /// Analogous method to `finch::sketch_files` excluding filtering options
462 |     ///
463 |     /// Filtering excluded, as we are not interested in sketching read files
464 |     /// but assembled reference genome sequences for the genotype database.
465 |     fn _sketch_files(
466 |         &self,
467 |         sketch_params: &SketchParams,
468 |         sequence_files: Vec<PathBuf>,
469 |     ) -> Result<Vec<Sketch>, SketchyError> {
470 |         sequence_files
471 |             .par_iter()
472 |             .map(|file| {
473 |                 let mut sketcher = sketch_params.create_sketcher();
474 |                 let mut fastx_reader = parse_fastx_file(file)?;
475 | 
476 |                 while let Some(record) = fastx_reader.next() {
477 |                     sketcher.process(record?);
478 |                 }
479 | 
480 |                 let sketch_hashes = sketcher.to_vec();
481 |                 let (seq_length, num_valid_kmers) = sketcher.total_bases_and_kmers();
482 | 
483 |                 Ok(Sketch {
484 |                     name: file.file_name().unwrap().to_str().unwrap().to_string(),
485 |                     seq_length,
486 |                     num_valid_kmers,
487 |                     comment: "".to_string(),
488 |                     hashes: sketch_hashes,
489 |                     filter_params: finch::filtering::FilterParams::default(), // no filter params
490 |                     sketch_params: sketch_params.clone(),
491 |                 })
492 |             })
493 |             .collect()
494 |     }
495 | 
496 |     /// Read a sketch file into a vector of sketches based on the extension of the file
497 |     fn _read_sketch(&self, sketch_file: PathBuf) -> Result<Vec<Sketch>, SketchyError> {
498 |         let sketch_ext = match sketch_file.extension() {
499 |             None => Err(SketchyError::InvalidExtension),
500 |             Some(os_str) => match os_str.to_str() {
501 |                 Some("msh") => Ok("msh"),
502 |                 Some("fsh") => Ok("fsh"),
503 |                 _ => Err(SketchyError::InvalidExtension),
504 |             },
505 |         };
506 | 
507 |         let mut reader = BufReader::new(File::open(&sketch_file)?);
508 | 
509 |         match sketch_ext {
510 |             Ok("msh") => {
511 |                 let sketches = read_mash_file(&mut reader)?;
512 |                 // Fixing the sketch param object, as the parameters are not written to file for some reason [MASH]
513 |                 let sketches_with_params = sketches
514 |                     .iter()
515 |                     .map(|sketch| {
516 |                         // Rethink if really necessary, currently used only to instantiate a sketcher in the main
517 |                         // streaming function and to output a summary statistic - so technically, we only need to add
518 |                         // the sketch size to SketchParams::Mash in the first sketch after reading into these methods
519 |                         // rather than fixing all of them, which may slow down with very large sketch collections
520 |                         let mut new_sketch = sketch.clone();
521 |                         new_sketch.sketch_params = SketchParams::Mash {
522 |                             kmers_to_sketch: sketch.hashes.len(),
523 |                             final_size: sketch.hashes.len(),
524 |                             kmer_length: sketch.sketch_params.k(),
525 |                             no_strict: false,
526 |                             hash_seed: sketch.sketch_params.hash_info().2,
527 |                         };
528 |                         new_sketch
529 |                     })
530 |                     .collect();
531 |                 Ok(sketches_with_params)
532 |             }
533 |             Ok("fsh") => Ok(read_finch_file(&mut reader)?),
534 |             _ => Err(SketchyError::InvalidExtension),
535 |         }
536 |     }
537 | 
538 |     fn _read_genotypes(
539 |         &self,
540 |         genotype_file: &Path,
541 |     ) -> Result<(Vec<Vec<String>>, String), SketchyError> {
542 |         let mut reader = csv::ReaderBuilder::new()
543 |             .delimiter(b'\t')
544 |             .has_headers(true)
545 |             .from_path(genotype_file)?;
546 |         let mut genotypes: Vec<Vec<String>> = vec![];
547 |         for result in reader.records() {
548 |             let record = result?;
549 |             let str_vec: Vec<String> = record.iter().map(|field| field.to_string()).collect();
550 |             genotypes.push(str_vec);
551 |         }
552 |         let header: Vec<String> = reader
553 |             .headers()?
554 |             .into_iter()
555 |             .map(|field| field.to_string())
556 |             .collect();
557 |         let header_str = header[1..].join("\t"); // exclude first column identifier
558 |         Ok((genotypes, header_str))
559 |     }
560 | 
561 |     fn _genotype_hashmap(
562 |         &self,
563 |         genotypes: Vec<Vec<String>>,
564 |     ) -> Result<HashMap<String, Vec<String>>, SketchyError> {
565 |         let genotype_map: HashMap<String, Vec<String>> = genotypes
566 |             .iter()
567 |             .map(|gvec| (gvec[0].to_owned(), gvec[1..].to_owned()))
568 |             .collect();
569 | 
570 |         Ok(genotype_map)
571 |     }
572 | 
573 |     fn _get_sketch_params_from_extension(
574 |         &self,
575 |         output: &Path,
576 |         sketch_size: usize,
577 |         kmer_size: u8,
578 |         scale: f64,
579 |         seed: u64,
580 |     ) -> Result<SketchParams, SketchyError> {
581 |         match output.extension() {
582 |             None => Err(SketchyError::InvalidExtension),
583 |             Some(os_str) => match os_str.to_str() {
584 |                 Some("msh") => Ok(SketchParams::Mash {
585 |                     kmers_to_sketch: sketch_size,
586 |                     final_size: sketch_size,
587 |                     no_strict: false,
588 |                     kmer_length: kmer_size,
589 |                     hash_seed: seed,
590 |                 }),
591 |                 Some("fsh") => Ok(SketchParams::Scaled {
592 |                     kmers_to_sketch: sketch_size,
593 |                     kmer_length: kmer_size,
594 |                     scale,
595 |                     hash_seed: seed,
596 |                 }),
597 |                 _ => Err(SketchyError::InvalidExtension),
598 |             },
599 |         }
600 |     }
601 | }
602 | 
--------------------------------------------------------------------------------
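The core logic of the predict workflow above reduces to two small routines: the two-pointer walk over sorted min-hash values in `_common_hashes` and the majority vote over genotype columns in `_get_consensus_value`. Below is a minimal, self-contained sketch of both ideas, using plain sorted `u64` hashes and string genotype calls instead of the finch `KmerCount` and `Sketch` types; the names `common_hashes` and `consensus` are illustrative only and not part of the crate.

```
// Simplified illustration of the shared-hash count and consensus call
// in src/sketchy.rs; plain u64 hashes stand in for finch KmerCount.
use std::cmp::Ordering;
use std::collections::HashMap;

/// Count hashes present in both sorted hash lists (unscaled case).
fn common_hashes(ref_hashes: &[u64], query_hashes: &[u64]) -> u64 {
    let (mut i, mut j, mut common) = (0usize, 0usize, 0u64);
    while let (Some(q), Some(r)) = (query_hashes.get(i), ref_hashes.get(j)) {
        match q.cmp(r) {
            Ordering::Less => i += 1,
            Ordering::Greater => j += 1,
            Ordering::Equal => {
                common += 1;
                i += 1;
                j += 1;
            }
        }
    }
    common
}

/// Majority vote over the genotype calls of the top-ranked references.
fn consensus<'a>(calls: &[&'a str]) -> Option<&'a str> {
    let mut counts: HashMap<&str, usize> = HashMap::new();
    for &call in calls {
        *counts.entry(call).or_insert(0) += 1;
    }
    counts.into_iter().max_by_key(|(_, n)| *n).map(|(call, _)| call)
}

fn main() {
    // Sorted min-hash values for a query read set and two references
    let query = [2u64, 5, 7, 11, 19];
    let ref_a = [2u64, 3, 7, 11, 23];
    let ref_b = [5u64, 13, 17, 29, 31];

    println!("ref_a shared: {}", common_hashes(&ref_a, &query)); // 3
    println!("ref_b shared: {}", common_hashes(&ref_b, &query)); // 1

    // Consensus over the MLST calls of the top three references
    println!("consensus: {:?}", consensus(&["ST93", "ST93", "ST239"])); // Some("ST93")
}
```

The hash map vote can still tie when more than two distinct calls appear among the top references, which is presumably why the CLI restricts `--consensus` to odd `--top` values: it removes the most common two-way ties.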