├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE.txt ├── README.md ├── benchmark ├── README.md ├── download │ ├── README.md │ ├── download_genbank.sh │ └── download_refseq.py ├── evaluation │ ├── Makefile │ ├── README.md │ ├── getNMI.py │ └── src │ │ ├── analysisPurity.cpp │ │ ├── calLabel.cpp │ │ ├── calPurity.cpp │ │ ├── checkTaxonomyStatus.cpp │ │ ├── getRepresentativeList.cpp │ │ ├── groundTruth.cpp │ │ ├── groundTruth.h │ │ ├── kseq.h │ │ ├── mapGenome.cpp │ │ └── precalLabel.cpp ├── generateList.sh └── simulate │ ├── Makefile │ ├── README.md │ └── src │ ├── create_containment_bacteria.cpp │ ├── kseq.h │ └── simulate_longSequence.cpp ├── install.sh ├── rabbittclust.png ├── src ├── CLI11.hpp ├── MST.cpp ├── MST.h ├── MST_IO.cpp ├── MST_IO.h ├── SketchInfo.cpp ├── SketchInfo.h ├── Sketch_IO.cpp ├── Sketch_IO.h ├── ThreadPool.h ├── ThreadPool.hxx ├── UnionFind.h ├── common.hpp ├── greedy.cpp ├── greedy.h ├── kseq.h ├── main.cpp ├── sub_command.cpp └── sub_command.h └── version_history ├── history.md ├── v.2.0.0.md ├── v.2.1.0.md ├── v.2.2.0.md └── v.2.2.1.md /.gitignore: -------------------------------------------------------------------------------- 1 | **/* 2 | !src/ 3 | !src/* 4 | !.gitignore 5 | !.gitmodules 6 | !CMakeLists.txt 7 | !LICENSE.txt 8 | !README.md 9 | !experient.md 10 | !install.sh 11 | !rabbittclust.png 12 | 13 | !version_history 14 | !version_history/* 15 | 16 | !benchmark/ 17 | !benchmark/README.md 18 | !benchmark/download_genomes.py 19 | !benchmark/generateList.sh 20 | 21 | !benchmark/download/ 22 | !benchmark/download/README.md 23 | !benchmark/download/download_genbank.sh 24 | !benchmark/download/download_refseq.py 25 | !benchmark/download/bact_GenBank.list.gz 26 | 27 | !benchmark/simulate/ 28 | !benchmark/simulate/README.md 29 | !benchmark/simulate/Makefile 30 | !benchmark/simulate/src/ 31 | !benchmark/simulate/src/* 32 | 33 | !benchmark/evaluation/ 34 | !benchmark/evaluation/Makefile 35 | !benchmark/evaluation/README.md 36 | 
!benchmark/evaluation/getNMI.py 37 | !benchmark/evaluation/src/ 38 | !benchmark/evaluation/src/* 39 | 40 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "RabbitFX"] 2 | path = RabbitFX 3 | url = https://github.com/RabbitBio/RabbitFX.git 4 | [submodule "RabbitSketch"] 5 | path = RabbitSketch 6 | url = https://github.com/RabbitBio/RabbitSketch.git 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(rabbitTClust) 4 | 5 | set(CMAKE_INSTALL_PREFIX ..) 6 | option(USE_RABBITFX "parse input fasta file with RabbitFX" OFF) 7 | option(USE_DEBUG "print the debug information" ON) 8 | option(USE_Timer "print the time information" ON) 9 | option(USE_GREEDY "use greedy incremental cluster" ON) 10 | # Each option above is translated into a -D compile definition further down. 11 | 12 | 13 | find_package(OpenMP REQUIRED) 14 | if(OPENMP_FOUND) 15 | #message("openmp found") 16 | # NOTE(review): this pins the compiler to /usr/bin/g++ regardless of the user's toolchain; confirm this is intended. 17 | set(CMAKE_CXX_COMPILER "/usr/bin/g++") 18 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 20 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") 21 | set(EXECUTABLE_OUTPUT_PATH .) 
22 | endif() 23 | 24 | set(CMAKE_CXX_STANDARD 14) 25 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 26 | #set(CMAKE_CXX_FLAGS "-g -O3 -D THREADPOOL_MINHASH -D DEBUG -D Timer ${CMAKE_CXX_FLAGS}") 27 | # Prepend the base flags; the space before ${CMAKE_CXX_FLAGS} keeps inherited flags from being glued onto -Wno-unused-result. 28 | set(CMAKE_CXX_FLAGS "-g -O3 -Wno-format -Wno-unused-result ${CMAKE_CXX_FLAGS}") 29 | 30 | if(USE_DEBUG) 31 | set(CMAKE_CXX_FLAGS "-D DEBUG ${CMAKE_CXX_FLAGS}") 32 | endif() 33 | 34 | if(USE_Timer) 35 | set(CMAKE_CXX_FLAGS "-D Timer ${CMAKE_CXX_FLAGS}") 36 | endif() 37 | 38 | if(USE_GREEDY) 39 | set(CMAKE_CXX_FLAGS "-D GREEDY_CLUST ${CMAKE_CXX_FLAGS}") 40 | endif() 41 | 42 | if(USE_RABBITFX) 43 | set(CMAKE_CXX_FLAGS "-D RABBIT_FX ${CMAKE_CXX_FLAGS}") 44 | include_directories(src RabbitSketch/build/include RabbitFX/build/include) 45 | link_directories(RabbitSketch/build/lib RabbitFX/build/lib) 46 | else() 47 | set(CMAKE_CXX_FLAGS "-D THREADPOOL_MINHASH ${CMAKE_CXX_FLAGS}") 48 | include_directories(src RabbitSketch/build/include) 49 | link_directories(RabbitSketch/build/lib) 50 | endif() 51 | 52 | ##include_directories(src RabbitSketch/build/include RabbitFX/build/include) 53 | #include_directories(src RabbitSketch/build/include) 54 | # 55 | ##link_directories(RabbitSketch/build/lib RabbitFX/build/io) 56 | #link_directories(RabbitSketch/build/lib) 57 | 58 | aux_source_directory(src DIR_SRCS) 59 | 60 | if(USE_GREEDY) 61 | add_executable(clust-greedy ${DIR_SRCS}) 62 | if(USE_RABBITFX) 63 | target_link_libraries(clust-greedy rabbitsketch_static rabbitfx z) 64 | else() 65 | target_link_libraries(clust-greedy rabbitsketch_static z) 66 | endif() 67 | 68 | else() 69 | add_executable(clust-mst ${DIR_SRCS}) 70 | if(USE_RABBITFX) 71 | target_link_libraries(clust-mst rabbitsketch_static rabbitfx z) 72 | else() 73 | target_link_libraries(clust-mst rabbitsketch_static z) 74 | endif() 75 | endif() 76 | 77 | 78 | 79 | if(USE_GREEDY) 80 | install(TARGETS clust-greedy DESTINATION ./) 81 | else() 82 | install(TARGETS clust-mst DESTINATION ./) 83 | endif() 84 | 85 | 
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | PURPOSE 2 | 3 | RabbitTClust is a fast and memory-efficient genome clustering tool based 4 | on sketch-based distance estimation. Our approach enables efficient processing 5 | of large-scale datasets by combining dimensionality reduction techniques with 6 | streaming and parallelization on modern multi-core platforms. It is implemented 7 | in C++ and is distributed with: 8 | CLI11 9 | https://github.com/CLIUtils/CLI11 10 | 3-Clause BSD License 11 | 12 | KSeq 13 | https://github.com/attractivechaos/klib 14 | MIT License 15 | 16 | MurmurHash3 17 | code.google.com/p/smhasher/wiki/MurmurHash3 18 | Public domain 19 | 20 | Robin_Hood Unordered Map and Set 21 | https://github.com/martinus/robin-hood-hashing 22 | MIT License 23 | 24 | ThreadPool 25 | https://github.com/marbl/Mash/blob/master/src/mash/ThreadPool.h 26 | COPYRIGHT LICENSE 27 | Copyright © 2015, Battelle National Biodefense Institute (BNBI); 28 | all rights reserved. Authored by: Brian Ondov, Todd Treangen, 29 | Sergey Koren, and Adam Phillippy 30 | 31 | This Software was prepared for the Department of Homeland Security 32 | (DHS) by the Battelle National Biodefense Institute, LLC (BNBI) as 33 | part of contract HSHQDC-07-C-00020 to manage and operate the National 34 | Biodefense Analysis and Countermeasures Center (NBACC), a Federally 35 | Funded Research and Development Center. 36 | 37 | RabbitFX 38 | https://github.com/RabbitBio/RabbitFX 39 | GNU GENERAL PUBLIC LICENSE 40 | 41 | RabbitSketch 42 | https://github.com/RabbitBio/RabbitSketch 43 | MIT License 44 | 45 | 46 | COPYRIGHT LICENSE 47 | 48 | Copyright © 2021, Shandong University (SDU); all rights reserved. 49 | Authored by Xiaoming Xu and Zekun Yin. 
50 | 51 | Redistribution and use in source and binary forms, with or without 52 | modification, are permitted provided that the following conditions are 53 | met: 54 | 55 | 1. Redistributions of source code must retain the above copyright 56 | notice, this list of conditions and the following disclaimer. 57 | 58 | 2. Redistributions in binary form must reproduce the above copyright 59 | notice, this list of conditions and the following disclaimer in the 60 | documentation and/or other materials provided with the distribution. 61 | 62 | 3. Neither the name of the copyright holder nor the names of its 63 | contributors may be used to endorse or promote products derived from 64 | this software without specific prior written permission. 65 | 66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 70 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 71 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 72 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 73 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 74 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 75 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 76 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![install with conda]( 2 | https://anaconda.org/bioconda/rabbittclust/badges/version.svg)](https://anaconda.org/bioconda/rabbittclust) 3 | [![install with conda]( 4 | https://anaconda.org/bioconda/rabbittclust/badges/latest_release_date.svg)](https://anaconda.org/bioconda/rabbittclust) 5 | [![install with conda]( 6 | https://anaconda.org/bioconda/rabbittclust/badges/platforms.svg)](https://anaconda.org/bioconda/rabbittclust) 7 | [![install with conda]( 8 | https://anaconda.org/bioconda/rabbittclust/badges/downloads.svg)](https://anaconda.org/bioconda/rabbittclust) 9 | 10 | ![RabbitTClust](rabbittclust.png) 11 | 12 | # `RabbitTClust v.2.3.0` 13 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations. 14 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms. 15 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 16 | 17 | ## Installation 18 | `RabbitTClust v.2.3.0` can only support 64-bit Linux Systems. 19 | 20 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](version_history/history.md) document. 21 | 22 | ### Install from bioconda 23 | RabbitTClust is available from [Bioconda](https://anaconda.org/bioconda/rabbittclust). 24 | 25 | Ensure that your machine supports at least AVX2 instructions. 
26 | 27 | 28 | ### Install from source code 29 | #### Dependancy 30 | * cmake v.3.0 or later 31 | * c++14 32 | * [zlib](https://zlib.net/) 33 | 34 | #### Compile and install 35 | ```bash 36 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 37 | cd RabbitTClust 38 | ./install.sh 39 | ``` 40 | ## Usage 41 | ```bash 42 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust 43 | Usage: ./clust-mst [OPTIONS] 44 | Options: 45 | -h,--help Print this help message and exit 46 | -t,--threads INT set the thread number, default all CPUs of the platform 47 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 48 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 49 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 50 | -l,--list input is genome list, one genome per line 51 | -e,--no-save not save the intermediate files, such as sketches or MST 52 | -d,--threshold FLOAT set the distance threshold for clustering 53 | -o,--output TEXT REQUIRED set the output name of cluster result 54 | -i,--input TEXT Excludes: --append 55 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 56 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 57 | --premsted TEXT clustering by the pre-generated mst files rather than genomes for clust-mst 58 | --newick-tree output the newick tree format file for clust-mst 59 | --fast use the kssd algorithm for sketching and distance computing for clust-mst 60 | --append TEXT Excludes: --input 61 | append genome file or file list with the pre-generated sketch or MST files 62 | 63 | # clust-greedy, greedy incremental clustering module for RabbitTClust 64 | Usage: ./clust-greedy [OPTIONS] 65 | Options: 
66 | -h,--help Print this help message and exit 67 | -t,--threads INT set the thread number, default all CPUs of the platform 68 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 69 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 70 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 71 | -l,--list input is genome list, one genome per line 72 | -e,--no-save not save the intermediate files, such as sketches or MST 73 | -d,--threshold FLOAT set the distance threshold for clustering 74 | -o,--output TEXT REQUIRED set the output name of cluster result 75 | -i,--input TEXT Excludes: --append 76 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 77 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 78 | --append TEXT Excludes: --input 79 | append genome file or file list with the pre-generated sketch or MST files 80 | ``` 81 | 82 | ## Example: 83 | ```bash 84 | # input is a file list, one genome path per line: 85 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust 86 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust 87 | 88 | # input is a single genome file in FASTA format, one genome as a sequence: 89 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust 90 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust 91 | 92 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options. 
93 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust 94 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust 95 | 96 | 97 | # for redundancy detection with clust-greedy, input is a genome file list: 98 | # use -d to specify the distance threshold corresponding to various degrees of redundancy. 99 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out 100 | 101 | # v.2.1.0 or later 102 | # the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15. 103 | # this folder contains the sketch and MST files. 104 | # to generate clusters from the existing MST with a distance threshold of 0.045: 105 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 106 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045: 107 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 108 | 109 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001: 110 | # folder 2023_05_06_09-37-23 contains the sketch files. 111 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust 112 | 113 | # v.2.2.0 or later 114 | # to cluster incrementally from existing sketches (presketch_A_dir) plus an appended genome set (genome_B.list): 115 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust 116 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust 117 | 118 | # v.2.2.1 or later 119 | # output the newick tree format for clust-mst, use the --newick-tree flag. 120 | ./clust-mst -l -i bacteria.list --newick-tree -o bacteria.mst.clust 121 | 122 | # v.2.3.0 or later 123 | # use the efficient Kssd sketch strategy for clust-mst, use the --fast flag. 
124 | ./clust-mst --fast -l -i bacteria.list -o bacteria.fast.mst.clust 125 | ``` 126 | ## Output 127 | The output file is in a CD-HIT output format and is slightly different when running with or without `-l` input option. 128 | When using the `-l` option, the input is expected to be a FASTA file list, with each file representing a genome. Without the `-l` option, the input should be a single FASTA file, with each sequence representing a genome. 129 | 130 | #### Output format for a FASTA file list input 131 | With `-l` option, the tab-delimited values in the lines beginning with tab delimiters are: 132 | * local index in a cluster 133 | * global index of the genome 134 | * genome length 135 | * genome file name (including genome assembly accession number) 136 | * sequence name (first sequence in the genome file) 137 | * sequence comment (remaining part of the line) 138 | 139 | **Example:** 140 | ```txt 141 | the cluster 0 is: 142 | 0 0 14782125nt bacteria/GCF_000418325.1_ASM41832v1_genomic.fna NC_021658.1 Sorangium cellulosum So0157-2, complete sequence 143 | 1 1 14598830nt bacteria/GCF_004135755.1_ASM413575v1_genomic.fna NZ_CP012672.1 Sorangium cellulosum strain So ce836 chromosome, complete genome 144 | 145 | the cluster 1 is: 146 | 0 2 14557589nt bacteria/GCF_002950945.1_ASM295094v1_genomic.fna NZ_CP012673.1 Sorangium cellulosum strain So ce26 chromosome, complete genome 147 | 148 | the cluster 2 is: 149 | 0 3 13673866nt bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna NZ_JAHKRM010000001.1 Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence 150 | 151 | ...... 
152 | ``` 153 | 154 | #### Output format for a single FASTA file input 155 | Without `-l` option, the tab-delimited values in the lines beginning with tab delimiters are: 156 | * local index in a cluster 157 | * global index of the genome 158 | * genome length 159 | * sequence name 160 | * sequence comment (remaining part of this line) 161 | 162 | **Example:** 163 | ```txt 164 | the cluster 0 is: 165 | 0 0 11030030nt NZ_GG657755.1 Streptomyces himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence 166 | 1 1 11008137nt NZ_RIBZ01000339.1 Streptomyces sp. NEAU-LD23 C2041, whole genome shotgun sequence 167 | 168 | the cluster 1 is: 169 | 0 2 11006208nt NZ_KL647031.1 Nonomuraea candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence 170 | 171 | the cluster 2 is: 172 | 0 3 10940472nt NZ_VTHK01000001.1 Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence 173 | 174 | ...... 175 | ``` 176 | 177 | #### Output the newick tree format (v.2.2.1 or later) 178 | When the `--newick-tree` option is used, an additional output file will be generated in the Newick tree format with a suffix name of ".newick.tree". 179 | 180 | 181 | # Bug Report 182 | We highly appreciate all bug reports, comments, and suggestions from our users. 183 | Please feel free to raise any concerns or feedback with us without hesitation by opening an `issue`. 184 | 185 | ## Cite 186 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6 187 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # benchmarking path 2 | 3 | ## [sub-directory simulate](./simulate/README.md) 4 | The script for simulating genome sequence. 
5 | 6 | ## [sub-directory download](./download/README.md) 7 | The script for downloading the genomes from NCBI RefSeq and GenBank. 8 | 9 | ## [sub-directory evaluation](./evaluation/README.md) 10 | The script for evaluating the clustering quality. 11 | -------------------------------------------------------------------------------- /benchmark/download/README.md: -------------------------------------------------------------------------------- 1 | # download the genomes from NCBI RefSeq and GenBank 2 | RabbitTClust supports an input list of genomes in plain FASTA format and in gzip-compressed format. 3 | We recommend using the decompressed genome files as input to filter out the broken download compressed files. 4 | If you have to use the input list of the compressed files, you must check the md5 value. 5 | 6 | For the input of a single FASTA file (each sequence means a genome), RabbitTClust only supports decompressed FASTA format. 7 | 8 | ## download genomes from RefSeq 9 | The download script comes from [Bonsai](https://github.com/dnbaker/bonsai/tree/ac6f8c7ee1b2ae1128970a8f6dc01ddad19fdb37). 10 | 11 | The latest release of RefSeq bacterial genomes can be downloaded by `download_refseq.py` as follows: 12 | 13 | * `python3 download_refseq.py bacteria` 14 | * `python3 download_refseq.py -h` for more detailed help information. 
15 | 16 | ## download genomes from GenBank 17 | 18 | The latest release of GenBank bacterial genomes can be downloaded by `download_genbank.sh` as follows: 19 | * `./download_genbank.sh` 20 | -------------------------------------------------------------------------------- /benchmark/download/download_genbank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt 4 | awk -F '\t' 'NR>2 {print $20}' assembly_summary.txt >ftp.list 5 | 6 | outputDir="genbankDir" 7 | echo $# 8 | if [ $# -ge 1 ] 9 | then 10 | outputDir=$1 11 | fi 12 | mkdir -p $outputDir 13 | cat ftp.list | while read line 14 | do 15 | fname=$(echo $line | grep -o 'GCA_.*' | sed 's/$/_genomic.fna.gz/') 16 | #echo "$line/$fname" 17 | wget -c "$line/$fname" ; 18 | mv "$fname" $outputDir 19 | done 20 | -------------------------------------------------------------------------------- /benchmark/download/download_refseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import multiprocessing 4 | import gzip 5 | import os 6 | from subprocess import check_call as cc, CalledProcessError 7 | from enum import IntEnum 8 | argv = sys.argv 9 | 10 | 11 | if sys.version_info[0] != 3: 12 | raise Exception("Python 3 required") 13 | 14 | 15 | class ExitCodes(IntEnum): 16 | EXIT_SUCCESS = 0 17 | EXIT_FAILURE = 1 18 | 19 | 20 | def is_valid_gzip(fn, lazy=False, use_pigz=False): 21 | ''' 22 | We could instead use gunzip -t to check, but that actual requires 23 | iterating through the whole file, which is very slow. This is lazy, 24 | but at least it makes sure that it's a gzip file. 25 | 26 | lazy simply tries to see if the first 10 lines can be read. 27 | It isn't very safe. 28 | 29 | use_pigz uses pigz instead of gzip. A bad idea if a number of processes 30 | have already been spawned. 
31 | ''' 32 | if lazy: 33 | try: 34 | cc("gzip -dc %s | head &>/dev/null" % fn, shell=True) 35 | return True 36 | except CalledProcessError: 37 | return False 38 | # lazy has already returned. This is the "else". 39 | cmd = ("pigz" if use_pigz else "gzip") + " -dc " 40 | try: 41 | cc(cmd + " -t " + fn, shell=True) 42 | sys.stderr.write(fn + " is valid") 43 | return True 44 | except CalledProcessError: 45 | sys.stderr.write("Corrupted file " + fn + ". Delete, try again.") 46 | return False 47 | 48 | 49 | def xfirstline(fn): 50 | # Works on python3, not 2. 51 | first_two = open(fn, "rb").read(2) 52 | return next((gzip.open if first_two == b"\x1f\x8b" else open)(fn)) 53 | 54 | 55 | FTP_BASENAME = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/" 56 | 57 | ALL_CLADES_MAP = { 58 | "archaea": FTP_BASENAME + "archaea/", 59 | "bacteria": FTP_BASENAME + "bacteria/", 60 | "fungi": FTP_BASENAME + "fungi/", 61 | "viral": FTP_BASENAME + "viral/", 62 | "plant": FTP_BASENAME + "plant/", 63 | "protozoa": FTP_BASENAME + "protozoa/", 64 | "human": FTP_BASENAME + "vertebrate_mammalian/Homo_sapiens", 65 | "vertebrate_mammalian": FTP_BASENAME + "vertebrate_mammalian/", 66 | "vertebrate_other": FTP_BASENAME + "vertebrate_other/" 67 | } 68 | 69 | DEFAULT_CLADES = [ 70 | "archaea", "bacteria", "viral", "human" 71 | ] 72 | 73 | DEFAULT_CLADES_STR = ", ".join(DEFAULT_CLADES) 74 | ALL_CLADES_STR = ", ".join(ALL_CLADES_MAP.keys()) 75 | 76 | TAX_PATH = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" 77 | 78 | 79 | def get_clade_map(clades): 80 | if clades[0] == "default": 81 | return {k: v for k, v in ALL_CLADES_MAP.items() if k in DEFAULT_CLADES} 82 | if clades[0] == "all": 83 | return ALL_CLADES_MAP 84 | ret = {} 85 | clades = [i.lower() for i in clades] 86 | for clade in clades: 87 | if clade in ALL_CLADES_MAP: 88 | ret[clade] = ALL_CLADES_MAP[clade] 89 | else: 90 | raise ValueError("clade %s not available. Abort!" 
% clade) 91 | return ret 92 | 93 | 94 | def parse_assembly(fn, fnidmap): 95 | print(fn) 96 | to_fetch = [] 97 | for line in open(fn, encoding='utf8'): 98 | if line[0] == '#': 99 | continue 100 | s = line.split("\t") 101 | if len(s) < 14: 102 | print(s) 103 | raise Exception("Not long enough") 104 | if ("latest" not in line or # Complete genome 105 | (("Complete Genome" not in line and 106 | "GRCh" not in line and "Full" not in line)) or 107 | any(i in line.lower() for 108 | i in ["contig", "supercontig"])): 109 | continue 110 | print(s[19], file=sys.stderr) 111 | fn = "%s_genomic.fna.gz" % ([i for i in s[19].split("/") if i][-1]) 112 | fnidmap[fn] = int(s[5]) 113 | index = len(s) - 1 114 | while "ftp" not in s[index] and index > 0: 115 | index = index - 1 116 | if index: 117 | to_fetch.append(s[index] + "/" + fn) 118 | else: 119 | print("No link found, continue", file=sys.stderr) 120 | continue 121 | #raise RuntimeError("ftp link not found. line: %s" % line[:-1]) 122 | return to_fetch 123 | 124 | 125 | def retry_cc(tup): 126 | cstr, die = tup 127 | RETRY_LIMIT = 10 128 | r = 0 129 | while r < RETRY_LIMIT: 130 | try: 131 | print(cstr, file=sys.stderr) 132 | cc(cstr, shell=True) 133 | return 134 | except CalledProcessError: 135 | print("retry number", r, file=sys.stderr) 136 | r += 1 137 | if die: 138 | raise Exception( 139 | "Could not download via %s " 140 | "even after %i attempts." % (cstr, RETRY_LIMIT)) 141 | else: 142 | sys.stderr.write( 143 | "Could not download %s even after %i attempts" % ( 144 | cstr, RETRY_LIMIT)) 145 | 146 | 147 | def getopts(): 148 | import argparse 149 | a = argparse.ArgumentParser() 150 | a.add_argument("--idmap", "-m", help="Path to which to write nameidmap.", 151 | default="nameidmap.txt") 152 | a.add_argument("--ref", "-r", help="Name of folder for references.") 153 | a.add_argument("clades", nargs="+", help="Clades to use." 154 | " default includes %s. all includes %s." 
% ( 155 | DEFAULT_CLADES_STR, ALL_CLADES_STR)) 156 | a.add_argument("--threads", "-p", 157 | help="Number of threads to use while downloading.", 158 | type=int, default=16) 159 | a.add_argument("--lazy", "-l", action='store_true', 160 | help="Don't check full gzipped file contents.") 161 | a.add_argument("--die", "-d", action='store_true') 162 | return a.parse_args() 163 | 164 | 165 | def check_path(fn, lazy=False): 166 | print("Checking path " + fn) 167 | if os.path.isfile(fn): 168 | if not is_valid_gzip(fn, lazy=lazy): 169 | cc("rm " + fn, shell=True) 170 | 171 | 172 | def check_path_lazy(path): 173 | check_path(path, lazy=True) 174 | 175 | 176 | def main(): 177 | global TAX_PATH 178 | tax_path = TAX_PATH # Make global variable local 179 | args = getopts() 180 | ref = args.ref if args.ref else "ref" 181 | if argv[1:] and argv[1] == "nodes": 182 | if not os.path.isfile("%s/nodes.dmp" % ref): 183 | cc("curl {tax_path} -o {ref}/" 184 | "taxdump.tgz && tar -zxvf {ref}/taxdump.tgz" 185 | " && mv nodes.dmp {ref}/nodes.dmp".format(**locals()), 186 | shell=True) 187 | return 0 188 | if not os.path.isdir(ref): 189 | os.makedirs(ref) 190 | clades = args.clades if args.clades else DEFAULT_CLADES 191 | for clade in clades: 192 | try: 193 | assert clade in ALL_CLADES_MAP or clade in ["all", "default"] 194 | except AssertionError: 195 | print("Clade %s not 'all', 'default', or one of the valid " 196 | "clades: %s" % (clade, ALL_CLADES_STR), file=sys.stderr) 197 | sys.exit(ExitCodes.EXIT_FAILURE) 198 | to_dl = get_clade_map(clades) 199 | print("About to download clades %s" % ", ".join(to_dl), file=sys.stderr) 200 | nameidmap = {} 201 | for clade in to_dl: 202 | cladeidmap = {} 203 | if not os.path.isdir(ref + "/" + clade): 204 | os.makedirs(ref + "/" + clade) 205 | if not os.path.isfile("%s/%s/as.%s.txt" % (ref, clade, clade)): 206 | cstr = ("curl %s/assembly_summary.txt " 207 | "-o %s/%s/as.%s.txt") % (to_dl[clade], ref, clade, clade) 208 | print(cstr) 209 | cc(cstr, 
shell=True) 210 | to_dl[clade] = parse_assembly("%s/%s/as.%s.txt" % 211 | (ref, clade, clade), cladeidmap) 212 | spoool = multiprocessing.Pool(args.threads) 213 | spoool.map(check_path_lazy if args.lazy else check_path, 214 | ("/".join([ref, clade, s.split("/")[-1]]) for 215 | s in to_dl[clade])) 216 | cstrs = [("curl %s -o %s/%s/%s" % 217 | (s, ref, clade, s.split("/")[-1])) for 218 | s in to_dl[clade] if not os.path.isfile( 219 | "%s/%s/%s" % (ref, clade, s.split("/")[-1]))] 220 | # If nodes.dmp hasn't been downloaded, grab it. 221 | if not os.path.isfile("%s/nodes.dmp" % ref): 222 | cstrs.append("curl {tax_path} -o {ref}/" 223 | "taxdump.tgz && tar -zxvf {ref}/taxdump.tgz" 224 | " && mv nodes.dmp {ref}/nodes.dmp".format(**locals())) 225 | spoool.map(retry_cc, ((cs, args.die) for cs in cstrs)) 226 | # Replace pathnames with seqids 227 | for fn in list(cladeidmap.keys()): 228 | try: 229 | #print(ref, clade, fn) 230 | cladeidmap[xfirstline("/".join( 231 | [ref, clade, fn] 232 | )).decode().split()[0][1:]] = cladeidmap[fn] 233 | del cladeidmap[fn] 234 | except FileNotFoundError: 235 | if args.die: 236 | raise 237 | nameidmap.update(cladeidmap) 238 | print("Done with all clades", file=sys.stderr) 239 | with open(ref + "/" + args.idmap, "w") as f: 240 | fw = f.write 241 | for k, v in nameidmap.items(): 242 | fw(k + "\t" + str(v) + "\n") 243 | return ExitCodes.EXIT_SUCCESS 244 | 245 | 246 | if __name__ == "__main__": 247 | sys.exit(main()) 248 | -------------------------------------------------------------------------------- /benchmark/evaluation/Makefile: -------------------------------------------------------------------------------- 1 | all: calLabel calPurity getRepresentativeList analysisPurity checkTaxonomyStatus 2 | 3 | calLabel: src/calLabel.cpp src/groundTruth.cpp 4 | g++ -O3 ./src/calLabel.cpp ./src/groundTruth.cpp -o calLabel 5 | calPurity: src/calPurity.cpp src/groundTruth.cpp 6 | g++ -O3 ./src/calPurity.cpp ./src/groundTruth.cpp -o calPurity 7 | 
getRepresentativeList: src/getRepresentativeList.cpp 8 | g++ -O3 ./src/getRepresentativeList.cpp -o getRepresentativeList 9 | analysisPurity: src/analysisPurity.cpp 10 | g++ -O3 ./src/analysisPurity.cpp -o analysisPurity 11 | checkTaxonomyStatus: src/checkTaxonomyStatus.cpp 12 | g++ -O3 ./src/checkTaxonomyStatus.cpp -o checkTaxonomyStatus 13 | 14 | clean: 15 | rm calLabel calPurity getRepresentativeList analysisPurity checkTaxonomyStatus 16 | 17 | -------------------------------------------------------------------------------- /benchmark/evaluation/README.md: -------------------------------------------------------------------------------- 1 | # evaluation script 2 | Run `make` to compile the script. 3 | 4 | ## calLabel && getNMI.py 5 | The `calLabel` is used for generating the label file from the clustering result of RabbitTClust. 6 | 7 | **==============================================================================** 8 | Example: 9 | `./calLabel bacteria.groundTruth -l bacteria.mst.clust bacteria.mst.label` 10 | It will generate the label file (`bacteria.mst.label`) from the clustering result (`bacteria.mst.clust`) of RabbitTClust. 11 | Subsequently, run `python3 getNMI.py bacteria.mst.label` to get the NMI score. 
12 | 13 | **==============================================================================** 14 | * run as: `./calLabel groundTruth sketchOption clustFile labelFile` 15 | * Parameter 0 (`./calLabel`) is the application name. 16 | * Parameter 1 (`groundTruth`) is an input file: the groundTruth file, one record per line (the first line is a header). 17 | * Parameter 2 (`sketchOption`) is the sketch option: `-l` means sketchByFile (the input was a genome list), `-i` means sketchBySequence (the input was a single genome file). 18 | * Parameter 3 (`clustFile`) is an input file: the cluster result file generated by RabbitTClust. 19 | * Parameter 4 (`labelFile`) is the output file: the label file derived from the groundTruth. 20 | 21 | ## calPurity 22 | `calPurity` computes the purity of a RabbitTClust clustering result. 23 | 24 | **==============================================================================** 25 | Example: 26 | `./calPurity -l bacteria.groundTruth bacteria.clust bacteria.purity` 27 | It computes the total purity and the coverage of the clustering result and generates three information files: `bacteria.purity`, `bacteria.purity.accession.purity`, and `bacteria.purity.accession.unpurity`. 28 | * `bacteria.purity` holds the detailed purity of each cluster. 29 | * `bacteria.purity.accession.purity` lists the first genome of each pure cluster. 30 | * `bacteria.purity.accession.unpurity` lists the first dominant genome and the impure genomes of each cluster.
31 | 32 | **==============================================================================** 33 | * run as: `./calPurity options(-l, -i) groundTruth clustFile bacteria.purity` 34 | * Parameter 0 (`./calPurity`) is the application name. 35 | * Parameter 1 (`options(-l, -i)`) is the sketch option used for the clustering: `-l` or `-i`. 36 | * Parameter 2 (`groundTruth`) is an input file: the groundTruth file, one record per line. 37 | * Parameter 3 (`clustFile`) is an input file: the cluster result file from RabbitTClust. 38 | * Parameter 4 (`bacteria.purity`) is the output file: the purity report (`bacteria.purity`); the accession files (one record per line) are written next to it. 39 | 40 | ## getRepresentativeList 41 | `getRepresentativeList` generates the list of representative genomes from a clustering result. 42 | **==============================================================================** 43 | Example: 44 | `./getRepresentativeList -l bacteria.greedy.clust bacteria_representative.list` 45 | It chooses the representative genome of each cluster and writes the list of these representative genomes. 46 | **==============================================================================** 47 | * run as: `./getRepresentativeList -i/-l clustFile representative_list` 48 | * Parameter 0 (`./getRepresentativeList`) is the application name. 49 | * Parameter 1 (`-i/-l`) is the sketch option of the cluster file: `-i` means sketchBySequence, `-l` means sketchByFile. 50 | * Parameter 2 (`clustFile`) is an input file: the cluster result from RabbitTClust. 51 | * Parameter 3 (`representative_list`) is the output file: the list of representative genome files or sequence names.
52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /benchmark/evaluation/getNMI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from sklearn import metrics 3 | import numpy as np 4 | import sys 5 | import pandas as pd 6 | 7 | def classification_report_cvs(report): 8 | report_data = [] 9 | lines = report.split('\n') 10 | #for line in lines: 11 | # report_data.append(line) 12 | #dataframe = pd.DataFrame.from_dict(report_data) 13 | #dataframe.to_csv(fileName+'.csv', index=False) 14 | line = lines[len(lines)-2] #for weighted precision, recall, and F1-score 15 | return line 16 | 17 | def getF1(arr): 18 | prediction = arr[0,:] 19 | groundTruth = arr[1,:] 20 | report = metrics.classification_report(groundTruth, prediction, digits=3, zero_division=0) 21 | line = classification_report_cvs(report) 22 | return line 23 | 24 | def getNMI(arr): 25 | a = arr[0,:] 26 | b = arr[1,:] 27 | result = metrics.normalized_mutual_info_score(a, b) 28 | return result 29 | 30 | if __name__ == "__main__": 31 | fileList = [sys.argv[1]] 32 | for file in fileList: 33 | originFile = np.loadtxt(file) 34 | F1 = getF1(originFile) 35 | NMI = getNMI(originFile) 36 | #print("result F1 of {} is: {} \n".format(file, F1)) 37 | #print("result NMI of {} is: {} \n".format(file, NMI)) 38 | print("{}:\t{}".format(file, NMI)) 39 | -------------------------------------------------------------------------------- /benchmark/evaluation/src/analysisPurity.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Data: 2022/8/2 3 | * 4 | * 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace 
std;

//#define LEVEL "family"
#define LEVEL "genus"
//#define LEVEL "species"

void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);

// rank name -> taxid for one genome and its ancestors.
typedef unordered_map<string, int> Lineage;
// taxid -> (parent taxid, rank name), as loaded from nodes.dmp.
typedef unordered_map<int, pair<int, string> > TaxNodes;

// Returns the taxid recorded for `rank`, or 0 when the lineage has no such rank.
static int rankId(const Lineage& lineage, const string& rank){
	Lineage::const_iterator it = lineage.find(rank);
	return it == lineage.end() ? 0 : it->second;
}

// Walks from `taxid` up to the root (taxid 1) and records rank -> taxid.
// A "no rank" node is recorded only while the genus has not been reached, so
// the "no rank" entry describes the lineage below the genus instead of being
// overwritten by a "no rank" ancestor near the root.
// The walk stops at a taxid that is missing from `nodes` or is its own parent.
static Lineage collectLineage(const TaxNodes& nodes, int taxid){
	Lineage lineage;
	bool reachedGenus = false;
	int curId = taxid;
	while(true){
		TaxNodes::const_iterator it = nodes.find(curId);
		if(it == nodes.end()) break;
		const string& rank = it->second.second;
		if(rank == "genus") reachedGenus = true;
		if(rank != "no rank" || !reachedGenus) lineage[rank] = curId;
		if(curId == 1 || it->second.first == curId) break;
		curId = it->second.first;
	}
	return lineage;
}

// Writes "<label> <accession> species no_rank genus family order" (tab separated).
// nodes.dmp spells the rank "no rank", so that is the key looked up for the
// no_rank column.
static void writeRankRow(ofstream& out, char label, const string& accession, const Lineage& lineage){
	static const char* const columns[] = {"species", "no rank", "genus", "family", "order"};
	out << label << '\t' << accession;
	for(size_t i = 0; i < sizeof(columns) / sizeof(columns[0]); i++){
		out << '\t' << rankId(lineage, columns[i]);
	}
	out << endl;
}

// Writes "<label> <accession>" followed by every "taxid(rank)" pair of the lineage.
static void writeAllRanksRow(ofstream& out, char label, const string& accession, const Lineage& lineage){
	out << label << '\t' << accession << '\t';
	for(Lineage::const_iterator it = lineage.begin(); it != lineage.end(); ++it){
		out << it->second << "(" << it->first << ")" << '\t';
	}
	out << endl;
}

// Splits the impure clusters reported by calPurity by whether the impure
// genomes share the representative's taxon at LEVEL.
//   argv[1] nodes.dmp, argv[2] purity accession file, argv[3] output prefix.
// Output: <prefix>.same  - members agreeing with the representative at LEVEL,
//         <prefix>.diff  - members that differ at LEVEL,
//         <prefix>.same0 - agreeing members whose representative has no LEVEL taxid.
// Returns 0 on success (and for -h/--help), 1 on a usage or file error.
int main(int argc , char *argv[]){
	string application = argv[0];
	vector<string> args, descriptions;
	args.push_back(application);
	descriptions.push_back("the application name");

	//========= parameters need changing ========
	//The example is with parameters of specific numbers.
	//args is the tutorial names.
	string pwd = "/RabbitTClust/benchmark/evaluation/src/analysisPurity.cpp";
	string dependency = "None";
	// Same order as the argv parsing below: nodes.dmp comes first.
	string example = application + " nodes.dmp purity.accession outputFile";
	args.push_back("nodes.dmp");
	args.push_back("purity.accession");
	args.push_back("outputFile");
	descriptions.push_back("input file, the taxonomy nodes.dmp file");
	descriptions.push_back("input file, the purity solved result file, from the calPurity file per line");
	descriptions.push_back("output file, the final output file");

	//-------- no changing -----------
	assert(args.size() == descriptions.size());
	// The help request is tested first and by value: comparing argv[1] with a
	// string literal using == compares pointers and never matches.
	if(argc == 2 && (string(argv[1]) == "-h" || string(argv[1]) == "--help")){
		printInfo(pwd, dependency, example, args, descriptions);
		return 0;
	}
	if(argc != (int)args.size()){
		printInfo(pwd, dependency, example, args, descriptions);
		return 1;
	}

	//======== specific implement ========
	string nodesFile = argv[1];
	string inputFile = argv[2];
	string outputFile = argv[3];
	string line;
	ifstream ifs0(nodesFile);
	if(!ifs0){
		cerr << "error open: " << nodesFile << endl;
		return 1;
	}
	TaxNodes id_preIDRank_map;
	while(getline(ifs0, line)){
		vector<string> vstr;
		boost::split(vstr, line, boost::is_any_of("\t|"), boost::token_compress_on);
		if(vstr.size() < 3) continue; // blank or truncated line
		int curId = stoi(vstr[0]);
		int preId = stoi(vstr[1]);
		id_preIDRank_map.insert({curId, make_pair(preId, vstr[2])});
	}
	ifs0.close();
	cerr << "the size of id_preIDRank_map is: " << id_preIDRank_map.size() << endl;

	ifstream ifs1(inputFile);
	if(!ifs1){
		cerr << "error open: " << inputFile << endl;
		return 1;
	}
	ofstream ofs0(outputFile + ".same");
	ofstream ofs1(outputFile + ".diff");
	ofstream ofs2(outputFile + ".same0");
	if(!ofs0 || !ofs1 || !ofs2){
		cerr << "error open output files with prefix: " << outputFile << endl;
		return 1;
	}
	ofs0 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;
	ofs1 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;
	ofs2 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;

	// State of the cluster currently being read.
	vector<string> repAccessionArr; // lines that do not start with a tab
	vector<string> badAccessionArr; // lines that start with a tab
	Lineage repClass;               // merged lineage of the representative lines
	vector<Lineage> badClassArr;    // one lineage per entry of badAccessionArr

	string cmpLevel = LEVEL;

	// Emits the cluster collected so far and resets the state.
	auto flushCluster = [&](){
		// operator[] on purpose: a representative without a cmpLevel taxid gets
		// an explicit 0 entry, which then shows up in the .same0 listing.
		const int repLevelId = repClass[cmpLevel];
		ofstream& sameOut = (repLevelId != 0) ? ofs0 : ofs2;

		bool allEqual = true;
		for(size_t i = 0; i < badClassArr.size(); i++){
			if(rankId(badClassArr[i], cmpLevel) != repLevelId){
				allEqual = false;
				break;
			}
		}

		if(allEqual){
			// .same gets the fixed columns, .same0 the full "taxid(rank)" listing.
			for(size_t i = 0; i < repAccessionArr.size(); i++){
				if(repLevelId != 0) writeRankRow(ofs0, '+', repAccessionArr[i], repClass);
				else writeAllRanksRow(ofs2, '+', repAccessionArr[i], repClass);
			}
			for(size_t i = 0; i < badClassArr.size(); i++){
				if(repLevelId != 0) writeRankRow(ofs0, '-', badAccessionArr[i], badClassArr[i]);
				else writeAllRanksRow(ofs2, '-', badAccessionArr[i], badClassArr[i]);
			}
			sameOut << endl;
		}
		else{
			for(size_t i = 0; i < repAccessionArr.size(); i++){
				writeRankRow(ofs1, '+', repAccessionArr[i], repClass);
			}
			bool hasEqual = false;
			for(size_t i = 0; i < badClassArr.size(); i++){
				if(rankId(badClassArr[i], cmpLevel) != repLevelId){
					writeRankRow(ofs1, '-', badAccessionArr[i], badClassArr[i]);
					continue;
				}
				if(!hasEqual){
					// The representatives head the agreeing members once.
					for(size_t j = 0; j < repAccessionArr.size(); j++){
						writeRankRow(sameOut, '+', repAccessionArr[j], repClass);
					}
					hasEqual = true;
				}
				writeRankRow(sameOut, '-', badAccessionArr[i], badClassArr[i]);
			}
			if(hasEqual) sameOut << endl;
			ofs1 << endl;
		}

		repClass.clear();
		badClassArr.clear();
		badAccessionArr.clear();
		repAccessionArr.clear();
	};

	while(getline(ifs1, line)){
		if(line.length() == 0){ //finish a cluster
			flushCluster();
			continue;
		}

		string accession;
		int curId = 0; // stays 0 (not a taxid in nodes.dmp) when the line cannot be parsed
		stringstream ss(line);
		ss >> accession >> curId;
		if(id_preIDRank_map.count(curId) == 0){
			cerr << "the id: " << curId << " is not in the taxonomy" << endl;
			continue;
		}
		Lineage lineage = collectLineage(id_preIDRank_map, curId);
		if(line[0] != '\t'){
			repAccessionArr.push_back(accession);
			// Several representative lines merge into one lineage; later lines win.
			for(Lineage::const_iterator it = lineage.begin(); it != lineage.end(); ++it){
				repClass[it->first] = it->second;
			}
		}
		else{
			badAccessionArr.push_back(accession);
			badClassArr.push_back(lineage);
		}
	}
	// A file that does not end with a blank line still has a pending cluster.
	if(!repAccessionArr.empty() || !badAccessionArr.empty()){
		flushCluster();
	}
	ifs1.close();
	ofs0.close();
	ofs1.close();
	ofs2.close();

	cerr << "finished" << endl;
	return 0;
}

// Prints the usage text to stderr: the example call, the source path, the
// dependency note and one line per parameter (args[i] with descriptions[i]).
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
	assert(args.size() == descriptions.size());
	cerr << endl;
	cerr << "example: " << example << endl;
	cerr << endl;
	cerr << "source file path: " << pwd << endl;
	cerr << endl;
	cerr << "dependency: " << dependency << endl;
	cerr << endl;
	cerr << "run as: ";
	for(size_t i = 0; i < args.size(); i++){
		cerr << args[i] << ' ';
	}
	cerr << endl;
	for(size_t i = 0; i < args.size(); i++){
		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", (int)i, args[i].c_str(), descriptions[i].c_str());
	}
}
-------------------------------------------------------------------------------- /benchmark/evaluation/src/calLabel.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Data: 2022/6/9 3 | * 4 | */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "groundTruth.h" 19 | 20 | 21 | using namespace std; 22 | 23 | struct LabNum{ 24 | int label; 25 | int number; 26 | }; 27 | 28 | struct GlobalLabelInfo{ 29 | int clustId; 30 | int labelNumber; 31 | }; 32 | 33 | struct PosNum{ 34 | int startPos; 35 | int clustSize; 36 | }; 37 | 38 | struct IdNum{ 39 | int id; 40 | int number; 41 | }; 42 | 43 | bool cmpLabNum(LabNum ln1, LabNum ln2){ 44 | return ln1.number > ln2.number; 45 | } 46 | 47 | bool cmpIdNum(IdNum in1, IdNum in2){ 48 | return in1.number > in2.number; 49 | } 50 | 51 | void printInfo(string pwd,
string dependency, string example, vector args, vector descriptions); 52 | 53 | int updateLabel(vector< vector > &labNumArr, unordered_map &globalMap, int clustId, int &badLabel, vector &resLabelArr); 54 | 55 | void calLabelFile(string groundTruth, string clustFile, string labelFile); 56 | 57 | void calLabelSequence(string groundTruth, string clustFile, string labelFile); 58 | 59 | int main(int argc , char *argv[]){ 60 | string application = argv[0]; 61 | vector args, descriptions; 62 | args.push_back(application); 63 | descriptions.push_back("the application name"); 64 | 65 | //========= parameters need changing ======== 66 | //The example is with parameters of specific numbers. 67 | //args is the tutorial names. 68 | string pwd = "RabbitTClust/benchmark/evaluation/src"; 69 | string dependency = "None"; 70 | string example = application + " bacteria.groundTruth -l bacteria.mst.clust bacteria.mst.label"; 71 | args.push_back("groundTruth"); 72 | args.push_back("sketchOption"); 73 | args.push_back("clustFile"); 74 | args.push_back("labelFile"); 75 | descriptions.push_back("input file, groundTruth file, per line(first line is header)"); 76 | descriptions.push_back("input option, sketch options, -l or -i, -l means sketchByFile, -i means sketchBySequence"); 77 | descriptions.push_back("input file, cluster result file need to be labeled"); 78 | descriptions.push_back("output file, label file according the groundTruth"); 79 | 80 | //-------- no changing ----------- 81 | assert(args.size() == descriptions.size()); 82 | if(argc != args.size()) { 83 | printInfo(pwd, dependency, example, args, descriptions); 84 | return 1; 85 | } 86 | else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help")) 87 | { 88 | printInfo(pwd, dependency, example, args, descriptions); 89 | return 1; 90 | } 91 | 92 | //======== specific implement ======== 93 | string groundTruth = argv[1]; 94 | string option = argv[2]; 95 | string clustFile = argv[3]; 96 | string labelFile = argv[4]; 97 | 98 | 
bool sketchByFile; 99 | if(option == "-l") sketchByFile = true; 100 | else if(option == "-i") sketchByFile = false; 101 | else{ 102 | cerr << "error option: " << option << ", need -l or -i " << endl; 103 | return 1; 104 | } 105 | if(sketchByFile){ 106 | calLabelFile(groundTruth, clustFile, labelFile); 107 | } 108 | else{ 109 | calLabelSequence(groundTruth, clustFile, labelFile); 110 | } 111 | 112 | return 0; 113 | } 114 | void calLabelSequence(string groundTruth, string clustFile, string labelFile){ 115 | //--------for groundTruth-------------- 116 | unordered_map seqName_taxid_map; 117 | unordered_map taxid_organismName_map; 118 | getGroundTruthBySequence(groundTruth, seqName_taxid_map, taxid_organismName_map); 119 | 120 | //--------for cluster file-------------------------- 121 | vector ourClust; 122 | vector standardClust; 123 | unordered_map standardMap; 124 | unordered_map curMap; 125 | vector> labNumArr; 126 | vector posArr; 127 | int startPos = 0; 128 | string line; 129 | 130 | int numNotInGroundTruth = 0; 131 | ifstream ifs1(clustFile); 132 | if(!ifs1){ 133 | cerr << "error open: " << clustFile << endl; 134 | exit(1); 135 | } 136 | while(getline(ifs1, line)){ 137 | if(line[0] != '\t'){ 138 | if(curMap.size() != 0){ 139 | int clustSize = 0; 140 | vector curClustInfo; 141 | for(auto x : curMap){ 142 | LabNum ln; 143 | ln.label = x.first; 144 | ln.number = x.second; 145 | curClustInfo.push_back(ln); 146 | clustSize += x.second; 147 | } 148 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 149 | labNumArr.push_back(curClustInfo); 150 | PosNum pn; 151 | pn.startPos = startPos; 152 | pn.clustSize = clustSize; 153 | posArr.push_back(pn); 154 | startPos += clustSize; 155 | unordered_map().swap(curMap); 156 | } 157 | } 158 | else{ 159 | stringstream ss; 160 | ss << line; 161 | int curId, genomeId; 162 | string genomeSize, fileName, genomeName; 163 | ss >> curId >> genomeId >> genomeSize >> genomeName; 164 | string key = genomeName; 165 | 
if(seqName_taxid_map.find(key) == seqName_taxid_map.end()){ 166 | numNotInGroundTruth++; 167 | continue; 168 | } 169 | else{ 170 | int curLabel = seqName_taxid_map[key]; 171 | standardClust.push_back(curLabel); 172 | curMap.insert({curLabel, 0}); 173 | curMap[curLabel]++; 174 | } 175 | } 176 | } 177 | if(curMap.size() != 0){ 178 | int clustSize = 0; 179 | vector curClustInfo; 180 | for(auto x : curMap){ 181 | LabNum ln; 182 | ln.label = x.first; 183 | ln.number = x.second; 184 | curClustInfo.push_back(ln); 185 | clustSize += x.second; 186 | } 187 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 188 | labNumArr.push_back(curClustInfo); 189 | PosNum pn; 190 | pn.startPos = startPos; 191 | pn.clustSize = clustSize; 192 | posArr.push_back(pn); 193 | startPos += clustSize; 194 | unordered_map().swap(curMap); 195 | } 196 | 197 | //-------------for update labels------------------------------------ 198 | unordered_map globalMap; 199 | int badLabel = -1; 200 | int clustNumber = labNumArr.size(); 201 | vector resLabelArr; 202 | resLabelArr.resize(clustNumber); 203 | for(int i = 0; i < clustNumber; i++) 204 | { 205 | badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 206 | } 207 | for(int i = 0; i < posArr.size(); i++) 208 | { 209 | int startPos = posArr[i].startPos; 210 | int clustSize = posArr[i].clustSize; 211 | for(int j = 0; j < clustSize; j++) 212 | { 213 | ourClust.push_back(resLabelArr[i]); 214 | } 215 | } 216 | cerr << "the number of which not in the groundTruth is: " << numNotInGroundTruth << endl; 217 | cerr << "the size of ourClust is: " << ourClust.size() << endl; 218 | cerr << "the size of standardClust is: " << standardClust.size() << endl; 219 | if(ourClust.size() != standardClust.size()) 220 | { 221 | cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl; 222 | return; 223 | } 224 | 225 | //--------------------for output labels------------------------------------- 226 | ofstream ofs(labelFile); 
227 | for(int i = 0; i < ourClust.size(); i++) 228 | ofs << ourClust[i] << ' '; 229 | ofs << endl; 230 | for(int i = 0; i < standardClust.size(); i++) 231 | ofs << standardClust[i] << ' '; 232 | ofs << endl; 233 | ofs.close(); 234 | 235 | ofstream ofs1(labelFile+".humanReadable"); 236 | for(int i = 0; i < ourClust.size(); i++) 237 | { 238 | ofs1 << ourClust[i] << '\t' << standardClust[i] << endl; 239 | } 240 | ofs1.close(); 241 | } 242 | 243 | void calLabelFile(string groundTruth, string clustFile, string labelFile){ 244 | //--------for groundTruth-------------- 245 | unordered_map accession_taxid_map; 246 | unordered_map taxid_organismName_map; 247 | getGroundTruthByFile(groundTruth, accession_taxid_map, taxid_organismName_map); 248 | 249 | //--------for cluster file-------------------------- 250 | vector ourClust; 251 | vector standardClust; 252 | unordered_map standardMap; 253 | unordered_map curMap; 254 | vector> labNumArr; 255 | vector posArr; 256 | int startPos = 0; 257 | 258 | int numNotInGroundTruth = 0; 259 | ifstream ifs1(clustFile); 260 | if(!ifs1){ 261 | cerr << "error open: " << clustFile << endl; 262 | exit(1); 263 | } 264 | string line; 265 | while(getline(ifs1, line)){ 266 | if(line[0] != '\t'){ 267 | if(curMap.size() != 0){ 268 | int clustSize = 0; 269 | vector curClustInfo; 270 | for(auto x : curMap){ 271 | LabNum ln; 272 | ln.label = x.first; 273 | ln.number = x.second; 274 | curClustInfo.push_back(ln); 275 | clustSize += x.second; 276 | } 277 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 278 | labNumArr.push_back(curClustInfo); 279 | PosNum pn; 280 | pn.startPos = startPos; 281 | pn.clustSize = clustSize; 282 | posArr.push_back(pn); 283 | startPos += clustSize; 284 | unordered_map().swap(curMap); 285 | } 286 | } 287 | else{ 288 | stringstream ss; 289 | ss << line; 290 | int curId, genomeId; 291 | string genomeSize, fileName, genomeName; 292 | ss >> curId >> genomeId >> genomeSize >> fileName >> genomeName; 293 | int 
startIndex = fileName.find_last_of('/'); 294 | int endIndex = fileName.find_first_of('_', startIndex+5); 295 | if(endIndex == -1) endIndex = fileName.find('.', startIndex+5); 296 | string key = fileName.substr(startIndex+1, endIndex-startIndex-1); 297 | if(accession_taxid_map.find(key) == accession_taxid_map.end()){ 298 | numNotInGroundTruth++; 299 | continue; 300 | } 301 | else{ 302 | int curLabel = accession_taxid_map[key]; 303 | standardClust.push_back(curLabel); 304 | curMap.insert({curLabel, 0}); 305 | curMap[curLabel]++; 306 | } 307 | } 308 | } 309 | if(curMap.size() != 0){ 310 | int clustSize = 0; 311 | vector curClustInfo; 312 | for(auto x : curMap){ 313 | LabNum ln; 314 | ln.label = x.first; 315 | ln.number = x.second; 316 | curClustInfo.push_back(ln); 317 | clustSize += x.second; 318 | } 319 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 320 | labNumArr.push_back(curClustInfo); 321 | PosNum pn; 322 | pn.startPos = startPos; 323 | pn.clustSize = clustSize; 324 | posArr.push_back(pn); 325 | startPos += clustSize; 326 | unordered_map().swap(curMap); 327 | } 328 | 329 | //-------------for update labels------------------------------------ 330 | unordered_map globalMap; 331 | int badLabel = -1; 332 | int clustNumber = labNumArr.size(); 333 | vector resLabelArr; 334 | resLabelArr.resize(clustNumber); 335 | for(int i = 0; i < clustNumber; i++) 336 | { 337 | badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 338 | } 339 | for(int i = 0; i < posArr.size(); i++) 340 | { 341 | int startPos = posArr[i].startPos; 342 | int clustSize = posArr[i].clustSize; 343 | for(int j = 0; j < clustSize; j++) 344 | { 345 | ourClust.push_back(resLabelArr[i]); 346 | } 347 | } 348 | cerr << "the number of which not in the groundTruth is: " << numNotInGroundTruth << endl; 349 | cerr << "the size of ourClust is: " << ourClust.size() << endl; 350 | cerr << "the size of standardClust is: " << standardClust.size() << endl; 351 | if(ourClust.size() != 
standardClust.size()) 352 | { 353 | cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl; 354 | return; 355 | } 356 | 357 | //--------------------for output labels------------------------------------- 358 | ofstream ofs(labelFile); 359 | for(int i = 0; i < ourClust.size(); i++) 360 | ofs << ourClust[i] << ' '; 361 | ofs << endl; 362 | for(int i = 0; i < standardClust.size(); i++) 363 | ofs << standardClust[i] << ' '; 364 | ofs << endl; 365 | ofs.close(); 366 | 367 | //ofstream ofs1(labelFile+".humanReadable"); 368 | //for(int i = 0; i < ourClust.size(); i++) 369 | //{ 370 | // ofs1 << ourClust[i] << '\t' << standardClust[i] << endl; 371 | //} 372 | //ofs1.close(); 373 | 374 | } 375 | 376 | int updateLabel(vector< vector > &labNumArr, unordered_map &globalMap, int clustId, int &badLabel, vector &resLabelArr)//return the new badLabel 377 | { 378 | bool isBad = true; 379 | while(labNumArr[clustId].size() != 0 && isBad) 380 | { 381 | int curLabel = labNumArr[clustId][0].label; 382 | int curNumber = labNumArr[clustId][0].number; 383 | if(globalMap.find(curLabel) == globalMap.end())//new label 384 | { 385 | GlobalLabelInfo glab; 386 | glab.clustId = clustId; 387 | glab.labelNumber = curNumber; 388 | globalMap.insert({curLabel, glab}); 389 | resLabelArr[clustId] = curLabel; 390 | isBad = false; 391 | } 392 | else//label collison with previous cluster 393 | { 394 | int preClustId = globalMap[curLabel].clustId; 395 | int preNumber = globalMap[curLabel].labelNumber; 396 | if(curNumber > preNumber)//the previous cluster is defeated, need to update the previous cluster. 
397 | { 398 | resLabelArr[clustId] = curLabel; 399 | isBad = false; 400 | globalMap[curLabel].clustId = clustId; 401 | globalMap[curLabel].labelNumber = curNumber; 402 | badLabel = updateLabel(labNumArr, globalMap, preClustId, badLabel, resLabelArr); 403 | } 404 | else//current cluster can not defeat the previous cluster, just erase the biggest label to try new biggest label 405 | {} 406 | } 407 | 408 | labNumArr[clustId].erase(labNumArr[clustId].begin());//erase the biggest label in this cluster 409 | }//end while 410 | if(isBad) 411 | { 412 | resLabelArr[clustId] = badLabel; 413 | badLabel--;//update the newBadLabel 414 | } 415 | return badLabel; 416 | } 417 | 418 | 419 | void printInfo(string pwd, string dependency, string example, vector args, vector descriptions){ 420 | assert(args.size() == descriptions.size()); 421 | cerr << endl; 422 | cerr << "example: " << example << endl; 423 | cerr << endl; 424 | cerr << "source file path: " << pwd << endl; 425 | cerr << endl; 426 | cerr << "dependency: " << dependency << endl; 427 | cerr << endl; 428 | cerr << "run as: "; 429 | for(int i = 0; i < args.size(); i++){ 430 | cerr << args[i] << ' '; 431 | } 432 | cerr << endl; 433 | for(int i = 0; i < args.size(); i++){ 434 | fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str()); 435 | } 436 | } 437 | -------------------------------------------------------------------------------- /benchmark/evaluation/src/checkTaxonomyStatus.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Data: 2022/8/3 3 | * 4 | * 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | 27 | using namespace std; 28 | 29 | void printInfo(string pwd, string dependency, string example, 
vector args, vector descriptions); 30 | 31 | int main(int argc , char *argv[]){ 32 | string application = argv[0]; 33 | vector args, descriptions; 34 | args.push_back(application); 35 | descriptions.push_back("the application name"); 36 | 37 | //========= parameters need changing ======== 38 | //The example is with parameters of specific numbers. 39 | //args is the tutorial names. 40 | string pwd = "RabbitTClust/benchmark/evaluation/src/checkTaxonomyStatus.cpp"; 41 | string dependency = "None"; 42 | string example = application + " ANI_report_prokaryotes.txt greedy.ans output"; 43 | args.push_back("ANI_report_prokaryotes.txt"); 44 | args.push_back("greedy.ans"); 45 | args.push_back("output"); 46 | descriptions.push_back("input file, the ANI_report_prokaryotes file, per line"); 47 | descriptions.push_back("input file, the analysis from the analysisPurity, per line"); 48 | descriptions.push_back("output file, the output file of result"); 49 | 50 | //-------- no changing ----------- 51 | assert(args.size() == descriptions.size()); 52 | if(argc != args.size()) { 53 | printInfo(pwd, dependency, example, args, descriptions); 54 | return 1; 55 | } 56 | else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help")) 57 | { 58 | printInfo(pwd, dependency, example, args, descriptions); 59 | return 1; 60 | } 61 | 62 | //======== specific implement ======== 63 | string aniFile = argv[1]; 64 | string anaFile = argv[2]; 65 | string outputFile = argv[3]; 66 | 67 | string line; 68 | unordered_set accessionSet; 69 | unordered_map accSpeciesTaxidMap; 70 | unordered_map accExcludedFromRefseqMap; 71 | unordered_map accBestMatchSpeciesTaxidMap; 72 | unordered_map accBestMatchStatusMap; 73 | unordered_map accQcoverageMap; 74 | unordered_map accScoverageMap; 75 | ifstream ifs0(aniFile); 76 | getline(ifs0, line);//header line 77 | int index = 0; 78 | int calNumBestMatchStatus = 0; 79 | int calNumExcludedFromRefseq = 0; 80 | while(getline(ifs0, line)){ 81 | string accession, 
excluded_from_refseq, best_match_status; 82 | int species_taxid, best_match_species_taxid; 83 | double qcoverage, scoverage; 84 | vector vstr; 85 | boost::split(vstr, line, boost::is_any_of("\t"), boost::token_compress_on); 86 | accession = vstr[0]; 87 | species_taxid = vstr[1] != "na" ? stoi(vstr[1]) : 0; 88 | best_match_species_taxid = vstr[2] != "na" ? stoi(vstr[2]) : 0; 89 | best_match_status = vstr[3]; 90 | excluded_from_refseq = vstr[4]; 91 | qcoverage = vstr[5] != "na" ? stod(vstr[5]) : 0.0; 92 | scoverage = vstr[6] != "na" ? stod(vstr[6]) : 0.0; 93 | if(best_match_status == "species-match") calNumBestMatchStatus++; 94 | if(excluded_from_refseq == "na"){ 95 | calNumExcludedFromRefseq++; 96 | //cout << accession << endl; 97 | } 98 | 99 | accSpeciesTaxidMap.insert({accession, species_taxid}); 100 | accExcludedFromRefseqMap.insert({accession, excluded_from_refseq}); 101 | accBestMatchSpeciesTaxidMap.insert({accession, best_match_species_taxid}); 102 | accBestMatchStatusMap.insert({accession, best_match_status}); 103 | accQcoverageMap.insert({accession, qcoverage}); 104 | accScoverageMap.insert({accession, scoverage}); 105 | accessionSet.insert({accession}); 106 | } 107 | ifs0.close(); 108 | cerr << "the size of accSpeciesTaxidMap is: " << accSpeciesTaxidMap.size() << endl; 109 | cerr << "the best_match_status of species_match is: " << calNumBestMatchStatus << ", the percent is: " << (double)calNumBestMatchStatus / accSpeciesTaxidMap.size() << endl; 110 | cerr << "the excluded_from_refseq of na is: " << calNumExcludedFromRefseq << ", the percent is: " << (double)calNumExcludedFromRefseq / accSpeciesTaxidMap.size() << endl; 111 | //exit(0); 112 | 113 | ifstream ifs1(anaFile); 114 | string outputFile0 = outputFile + ".species_taxid.check"; 115 | string outputFile1 = outputFile + ".best_match_species_taxid.check"; 116 | string outputFile2 = outputFile + ".exclude_from_refseq.check"; 117 | string outputFile3 = outputFile + ".best_match_status.check"; 118 | string 
outputFile4 = outputFile + ".perfect.check"; 119 | string outputFile5 = outputFile + ".coverage.check"; 120 | ofstream ofs0(outputFile0); 121 | ofstream ofs1(outputFile1); 122 | ofstream ofs2(outputFile2); 123 | ofstream ofs3(outputFile3); 124 | ofstream ofs4(outputFile4); 125 | ofstream ofs5(outputFile5); 126 | ofs0 << "label\taccession\tassembly_taxid\ttaxonomy_taxid" << endl; 127 | ofs1 << "label\taccession\tassembly_taxid\tbest_match_species_taxid" << endl; 128 | ofs2 << "label\taccession\texclude_from_refseq" << endl; 129 | ofs3 << "label\taccession\tbest_match_status" << endl; 130 | ofs4 << "label\taccession\tassembly_taxid" << endl; 131 | ofs5 << "label\taccession\tqcoverage\tscoverage" << endl; 132 | getline(ifs1, line);//header line 133 | int numNotInTaxonomy = 0; 134 | int totalNumber_rep = 0; 135 | int totalNumber_bad = 0; 136 | int perfectNum_rep = 0; 137 | int perfectNum_bad = 0; 138 | int numNotEqualTaxid_rep = 0; 139 | int numNotEqualTaxid_bad = 0; 140 | int numNotEqualBestMatch_rep = 0; 141 | int numNotEqualBestMatch_bad = 0; 142 | int numNotBestMatchSpeciesLevel_rep = 0; 143 | int numNotBestMatchSpeciesLevel_bad = 0; 144 | int numExclude_from_refSeq_rep = 0; 145 | int numExclude_from_refSeq_bad = 0; 146 | 147 | vector matchStatusArr; 148 | matchStatusArr.push_back("species-match"); 149 | matchStatusArr.push_back("subspecies-match"); 150 | matchStatusArr.push_back("synonym-match"); 151 | matchStatusArr.push_back("derived-species-match"); 152 | matchStatusArr.push_back("genus-match"); 153 | matchStatusArr.push_back("approved-mismatch"); 154 | matchStatusArr.push_back("mismatch"); 155 | matchStatusArr.push_back("below-threshold-match"); 156 | matchStatusArr.push_back("below-threshold-mismatch"); 157 | matchStatusArr.push_back("low-coverage"); 158 | 159 | vector numMatchStatusRepArr; 160 | vector numMatchStatusBadArr; 161 | for(int i = 0; i < 10; i++){ 162 | numMatchStatusRepArr.push_back(0); 163 | numMatchStatusBadArr.push_back(0); 164 | } 165 | 166 | 
//int rep_numSpeciesMatch = 0; 167 | //int rep_numSubSpeciesMatch = 0; 168 | //int rep_numSynonymMatch = 0; 169 | //int rep_numDerivedSpeciesMatch = 0; 170 | //int rep_numGenusMatch = 0; 171 | //int rep_numApprovedMismatch = 0; 172 | //int rep_numMismatch_rep = 0; 173 | //int rep_numBelowThresholdMatch = 0; 174 | //int rep_numBelowThresholdMismatch = 0; 175 | //int rep_numLowCoverage = 0; 176 | 177 | //int bad_numSpeciesMatch = 0; 178 | //int bad_numSubSpeciesMatch = 0; 179 | //int bad_numSynonymMatch = 0; 180 | //int bad_numDerivedSpeciesMatch = 0; 181 | //int bad_numGenusMatch = 0; 182 | //int bad_numApprovedMismatch = 0; 183 | //int bad_numMismatch = 0; 184 | //int bad_numBelowThresholdMatch = 0; 185 | //int bad_numBelowThresholdMismatch = 0; 186 | //int bad_numLowCoverage = 0; 187 | 188 | while(getline(ifs1, line)){ 189 | if(line.length() == 0){ 190 | ofs0 << endl; 191 | ofs1 << endl; 192 | ofs2 << endl; 193 | ofs3 << endl; 194 | //ofs4 << endl; 195 | ofs5 << endl; 196 | //cout << endl; 197 | continue; 198 | } 199 | string index, accession; 200 | int species, no_rank, genus, family, order; 201 | stringstream ss; 202 | ss << line; 203 | ss >> index >> accession >> species >> no_rank >> genus >> family >> order; 204 | if(accessionSet.count(accession) == 0){ 205 | //cerr << "the accession is not in the taxonomy file" << endl; 206 | //cout << line << endl; 207 | numNotInTaxonomy++; 208 | continue; 209 | } 210 | 211 | int taxSID = accSpeciesTaxidMap[accession]; 212 | int taxBMID = accBestMatchSpeciesTaxidMap[accession]; 213 | string taxEFR = accExcludedFromRefseqMap[accession]; 214 | string taxBMS = accBestMatchStatusMap[accession]; 215 | double qcoverage = accQcoverageMap[accession]; 216 | double scoverage = accScoverageMap[accession]; 217 | if(index == "+"){ 218 | totalNumber_rep++; 219 | if(species != taxSID) numNotEqualTaxid_rep++; 220 | if(taxSID != taxBMID) numNotEqualBestMatch_rep++; 221 | if(taxEFR != "na") numExclude_from_refSeq_rep++; 222 | if(taxBMS != 
"species-match") numNotBestMatchSpeciesLevel_rep++; 223 | 224 | for(int i = 0; i < 10; i++){ 225 | if(taxBMS == matchStatusArr[i]) numMatchStatusRepArr[i]++; 226 | } 227 | 228 | if(species == taxSID && taxSID == taxBMID && taxEFR == "na" && taxBMS == "species-match"){ 229 | perfectNum_rep++; 230 | ofs4 << line << endl; 231 | } 232 | } 233 | else{ 234 | totalNumber_bad++; 235 | if(species != taxSID) numNotEqualTaxid_bad++; 236 | if(taxSID != taxBMID) numNotEqualBestMatch_bad++; 237 | if(taxEFR != "na") numExclude_from_refSeq_bad++; 238 | if(taxBMS != "species-match") numNotBestMatchSpeciesLevel_bad++; 239 | 240 | for(int i = 0; i < 10; i++){ 241 | if(taxBMS == matchStatusArr[i]) numMatchStatusBadArr[i]++; 242 | } 243 | 244 | //if(taxBMS == "mismatch"|| taxBMS == "below-threshold-match" || taxBMS == "below-threshold-mismatch" || taxBMS == "low-coverage") numNotBestMatchSpeciesLevel_bad++; 245 | //if(species == taxSID && taxSID == taxBMID && taxEFR == "na" && taxBMS == "species-match"){ 246 | //if(species == taxSID && taxEFR == "na" && taxBMS == "species-match"){ 247 | if(species == taxBMID && taxEFR == "na"){ 248 | perfectNum_bad++; 249 | ofs4 << line << endl; 250 | } 251 | } 252 | 253 | ofs0 << index << '\t' << accession << '\t' << species << '\t' << taxSID << endl; 254 | ofs1 << index << '\t' << accession << '\t' << species << '\t' << taxBMID << endl; 255 | ofs2 << index << '\t' << accession << '\t' << taxEFR << endl; 256 | ofs3 << index << '\t' << accession << '\t' << taxBMS << endl; 257 | ofs5 << index << '\t' << accession << '\t' << qcoverage << '\t' << scoverage << endl; 258 | } 259 | ofs0.close(); 260 | ofs1.close(); 261 | ofs2.close(); 262 | ofs3.close(); 263 | ofs5.close(); 264 | 265 | cerr << "finished" << endl; 266 | cerr << "the number not in the taxonomy is: " << numNotInTaxonomy << endl; 267 | cerr << "for representative genomes, the total number is: " << totalNumber_rep << endl; 268 | cerr << "\tthe numNotEqualtaxid of assembly-summary and taxonomy is: 
" << numNotEqualTaxid_rep << endl; 269 | cerr << "\t\tthe percentange is: " << (double)numNotEqualTaxid_rep / totalNumber_rep << endl; 270 | cerr << "\tthe numNotEqualBestMatch of species-taxid and best-species-taxid is: " << numNotEqualBestMatch_rep << endl; 271 | cerr << "\t\tthe percentange is: " << (double)numNotEqualBestMatch_rep / totalNumber_rep << endl; 272 | cerr << "\tthe numExclude_from_refSeq_rep is: " << numExclude_from_refSeq_rep << endl; 273 | cerr << "\t\tthe percentange is: " << (double)numExclude_from_refSeq_rep / totalNumber_rep << endl; 274 | cerr << "\tthe numNotBestMatchSpeciesLevel_rep is: " << numNotBestMatchSpeciesLevel_rep << endl; 275 | cerr << "\t\tthe percentange is: " << (double)numNotBestMatchSpeciesLevel_rep / totalNumber_rep << endl; 276 | cerr << "\tthe perfectNum_rep is: " << perfectNum_rep << endl; 277 | cerr << "\t\tthe percentange is: " << (double)perfectNum_rep / totalNumber_rep << endl; 278 | 279 | cerr << "for bad genomes, the total number is: " << totalNumber_bad << endl; 280 | cerr << "\tthe numNotEqualtaxid of assembly-summary and taxonomy is: " << numNotEqualTaxid_bad << endl; 281 | cerr << "\t\tthe percentange is: " << (double)numNotEqualTaxid_bad / totalNumber_bad << endl; 282 | cerr << "\tthe numNotEqualBestMatch of species-taxid and best-species-taxid is: " << numNotEqualBestMatch_bad << endl; 283 | cerr << "\t\tthe percentange is: " << (double)numNotEqualBestMatch_bad / totalNumber_bad << endl; 284 | cerr << "\tthe numExclude_from_refSeq_bad is: " << numExclude_from_refSeq_bad << endl; 285 | cerr << "\t\tthe percentange is: " << (double)numExclude_from_refSeq_bad / totalNumber_bad << endl; 286 | cerr << "\tthe numNotBestMatchSpeciesLevel_bad is: " << numNotBestMatchSpeciesLevel_bad << endl; 287 | cerr << "\t\tthe percentange is: " << (double)numNotBestMatchSpeciesLevel_bad / totalNumber_bad << endl; 288 | cerr << "\tthe perfectNum_bad is: " << perfectNum_bad << endl; 289 | cerr << "\t\tthe percentange is: " << 
(double)perfectNum_bad / totalNumber_bad << endl; 290 | 291 | 292 | cerr << "====================================================================================" << endl; 293 | int tmpGoodTotalNumber = 0; 294 | for(int i = 0; i < 10; i++){ 295 | cerr << "the number of rep " << matchStatusArr[i] << " is: " << numMatchStatusRepArr[i] << ", and percent is: " << (double)numMatchStatusRepArr[i] / totalNumber_rep << endl; 296 | tmpGoodTotalNumber += numMatchStatusRepArr[i]; 297 | } 298 | cerr << "the total good number is: " << tmpGoodTotalNumber << endl; 299 | 300 | cerr << "====================================================================================" << endl; 301 | int tmpBadTotalNumber = 0; 302 | for(int i = 0; i < 10; i++){ 303 | cerr << "the number of bad " << matchStatusArr[i] << " is: " << numMatchStatusBadArr[i] << ", and percent is: " << (double)numMatchStatusBadArr[i] / totalNumber_bad << endl; 304 | tmpBadTotalNumber += numMatchStatusBadArr[i]; 305 | } 306 | cerr << "the total bad number is: " << tmpBadTotalNumber << endl; 307 | 308 | 309 | return 0; 310 | } 311 | 312 | void printInfo(string pwd, string dependency, string example, vector args, vector descriptions){ 313 | assert(args.size() == descriptions.size()); 314 | cerr << endl; 315 | cerr << "example: " << example << endl; 316 | cerr << endl; 317 | cerr << "source file path: " << pwd << endl; 318 | cerr << endl; 319 | cerr << "dependency: " << dependency << endl; 320 | cerr << endl; 321 | cerr << "run as: "; 322 | for(int i = 0; i < args.size(); i++){ 323 | cerr << args[i] << ' '; 324 | } 325 | cerr << endl; 326 | for(int i = 0; i < args.size(); i++){ 327 | fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str()); 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /benchmark/evaluation/src/getRepresentativeList.cpp: -------------------------------------------------------------------------------- 1 | /* 
/* Author: Xiaoming Xu
 * Data: 2022/7/16
 *
 * getRepresentativeList reads a cluster file and, for every run of
 * tab-indented member lines, writes one field of the first member line
 * to the output list (the file name with -l, the sequence name with -i).
 */
// NOTE(review): the #include targets were stripped from this copy of the
// file; the headers below are the ones the visible code needs.
#include <cassert>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>


using namespace std;

void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);

/* Entry point.
 * argv[1]: -i or -l, argv[2]: cluster file, argv[3]: output list.
 * Returns 0 on success and 1 on a usage error or when a file cannot be opened.
 */
int main(int argc , char *argv[]){
	const string application = argv[0];
	vector<string> args, descriptions;
	args.push_back(application);
	descriptions.push_back("the application name");

	//========= parameters need changing ========
	//The example is with parameters of specific numbers.
	//args is the tutorial names.
	const string pwd = "RabbitTClust/benchmark/evaluation/src/getRepresentativeList.cpp";
	const string dependency = "None";
	const string example = application + " -l bacteria.greedy.clust bacteria_representative.list";
	args.push_back("-i/-l");
	args.push_back("clustFile");
	args.push_back("representative_list");
	descriptions.push_back("input parameter, sketch parameter for the cluster file, -i means sketchBySequence, -l means sketchByFile");
	descriptions.push_back("input file, the cluster result from RabbitTClust");
	descriptions.push_back("output file, the representative list of genomes file or sequences");

	//-------- no changing -----------
	assert(args.size() == descriptions.size());
	// Any wrong argument count prints the usage text. A lone -h/--help has
	// argc == 2 and therefore lands here as well, so no separate help branch
	// is needed (the old one compared char pointers and was unreachable).
	if(argc != static_cast<int>(args.size())) {
		printInfo(pwd, dependency, example, args, descriptions);
		return 1;
	}

	//======== specific implement ========
	const string option = argv[1];
	if(option != "-l" && option != "-i"){
		cerr << "error option: " << option << ", need -l or -i option" << endl;
		return 1;
	}

	const string clustFile = argv[2];
	const string outputFile = argv[3];

	ifstream ifs(clustFile);
	if(!ifs){
		cerr << "error open: " << clustFile << endl;
		return 1;
	}
	ofstream ofs(outputFile);
	if(!ofs){
		cerr << "error open: " << outputFile << endl;
		return 1;
	}

	string line;
	// True after a line that does not start with a tab; the next tab-indented
	// line is then taken as the representative and the flag is cleared.
	bool isClust = false;
	while(getline(ifs, line)){
		if(line.empty() || line[0] != '\t'){
			isClust = true;
			continue;
		}
		if(!isClust) continue; // later members of the same cluster
		isClust = false;

		istringstream ss(line);
		int curId, globalId;
		string length, fileName, seqName;
		if(option == "-l"){
			ss >> curId >> globalId >> length >> fileName >> seqName;
			ofs << fileName << endl;
		}
		else{ // option == "-i", guaranteed by the check above
			ss >> curId >> globalId >> length >> seqName;
			ofs << seqName << endl;
		}
	}

	return 0;
}

/* Print the usage text to stderr: example command, source path, dependency,
 * the argument names on one line, then one description line per argument.
 * args and descriptions must have the same length.
 */
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
	assert(args.size() == descriptions.size());
	cerr << endl;
	cerr << "example: " << example << endl;
	cerr << endl;
	cerr << "source file path: " << pwd << endl;
	cerr << endl;
	cerr << "dependency: " << dependency << endl;
	cerr << endl;
	cerr << "run as: ";
	for(size_t i = 0; i < args.size(); i++){
		cerr << args[i] << ' ';
	}
	cerr << endl;
	for(size_t i = 0; i < args.size(); i++){
		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", static_cast<int>(i), args[i].c_str(), descriptions[i].c_str());
	}
}
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

/* Shared loader for both ground-truth entry points.
 *
 * Skips the first (header) line, then reads every following line as
 *   <name> <taxid> <organism name words...>
 * and records name -> taxid and taxid -> organism name. The organism name is
 * the remaining words joined by single spaces, with no trailing space.
 * Lines without a name and an integer taxid are skipped. Existing map entries
 * are kept (insert does not overwrite).
 * Prints "error open: <path>" and exits with status 1 if the file cannot be opened.
 */
static void loadGroundTruthTable(const std::string& groundTruth, std::unordered_map<std::string, int>& name_taxid_map, std::unordered_map<int, std::string>& taxid_organismName_map){
	std::ifstream ifs0(groundTruth);
	if(!ifs0){
		std::cerr << "error open: " << groundTruth << std::endl;
		std::exit(1);
	}
	std::string line;
	std::getline(ifs0, line);//for the header line
	while(std::getline(ifs0, line)){
		std::istringstream ss(line);
		std::string name, tmpStr, organismName;
		int taxid = 0;
		// A blank or malformed line would otherwise insert an empty name
		// with an indeterminate taxid.
		if(!(ss >> name >> taxid)) continue;
		while(ss >> tmpStr){
			// Separator goes before every word but the first, so there is
			// nothing to trim afterwards (the old substr() result was discarded).
			if(!organismName.empty()) organismName += ' ';
			organismName += tmpStr;
		}
		name_taxid_map.insert({name, taxid});
		taxid_organismName_map.insert({taxid, organismName});
	}
}

/* Load a ground-truth table whose first column is a sequence name.
 * Fills seqName_taxid_map and taxid_organismName_map; see loadGroundTruthTable.
 */
void getGroundTruthBySequence(std::string groundTruth, std::unordered_map<std::string, int>& seqName_taxid_map, std::unordered_map<int, std::string>& taxid_organismName_map){
	loadGroundTruthTable(groundTruth, seqName_taxid_map, taxid_organismName_map);
}

/* Load a ground-truth table whose first column is an assembly accession.
 * Fills accession_taxid_map and taxid_organismName_map; see loadGroundTruthTable.
 */
void getGroundTruthByFile(std::string groundTruth, std::unordered_map<std::string, int>& accession_taxid_map, std::unordered_map<int, std::string>& taxid_organismName_map){
	loadGroundTruthTable(groundTruth, accession_taxid_map, taxid_organismName_map);
}
-------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #ifdef USE_MALLOC_WRAPPERS 36 | # include "malloc_wrap.h" 37 | #endif 38 | 39 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 40 | #define KS_SEP_TAB 1 // isspace() && !' 
' 41 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 42 | #define KS_SEP_MAX 2 43 | 44 | #define __KS_TYPE(type_t) \ 45 | typedef struct __kstream_t { \ 46 | unsigned char *buf; \ 47 | int begin, end, is_eof; \ 48 | type_t f; \ 49 | } kstream_t; 50 | 51 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 52 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 53 | 54 | #define __KS_BASIC(type_t, __bufsize) \ 55 | static inline kstream_t *ks_init(type_t f) \ 56 | { \ 57 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 58 | ks->f = f; \ 59 | ks->buf = (unsigned char*)malloc(__bufsize); \ 60 | return ks; \ 61 | } \ 62 | static inline void ks_destroy(kstream_t *ks) \ 63 | { \ 64 | if (ks) { \ 65 | free(ks->buf); \ 66 | free(ks); \ 67 | } \ 68 | } 69 | 70 | #define __KS_GETC(__read, __bufsize) \ 71 | static inline int ks_getc(kstream_t *ks) \ 72 | { \ 73 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 74 | if (ks->begin >= ks->end) { \ 75 | ks->begin = 0; \ 76 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 77 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \ 78 | } \ 79 | return (int)ks->buf[ks->begin++]; \ 80 | } 81 | 82 | #ifndef KSTRING_T 83 | #define KSTRING_T kstring_t 84 | typedef struct __kstring_t { 85 | size_t l, m; 86 | char *s; 87 | } kstring_t; 88 | #endif 89 | 90 | #ifndef kroundup32 91 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 92 | #endif 93 | 94 | #define __KS_GETUNTIL(__read, __bufsize) \ 95 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 96 | { \ 97 | int gotany = 0; \ 98 | if (dret) *dret = 0; \ 99 | str->l = append? 
str->l : 0; \ 100 | for (;;) { \ 101 | int i; \ 102 | if (ks->begin >= ks->end) { \ 103 | if (!ks->is_eof) { \ 104 | ks->begin = 0; \ 105 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 106 | if (ks->end == 0) { ks->is_eof = 1; break; } \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! */ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | gotany = 1; \ 128 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 129 | str->l = str->l + (i - ks->begin); \ 130 | ks->begin = i + 1; \ 131 | if (i < ks->end) { \ 132 | if (dret) *dret = ks->buf[i]; \ 133 | break; \ 134 | } \ 135 | } \ 136 | if (!gotany && ks_eof(ks)) return -1; \ 137 | if (str->s == 0) { \ 138 | str->m = 1; \ 139 | str->s = (char*)calloc(1, 1); \ 140 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 141 | str->s[str->l] = '\0'; \ 142 | return str->l; \ 143 | } \ 144 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 145 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 146 | 147 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 148 | __KS_TYPE(type_t) \ 149 | __KS_BASIC(type_t, __bufsize) \ 150 | __KS_GETC(__read, __bufsize) \ 151 | __KS_GETUNTIL(__read, __bufsize) 152 | 153 | #define 
kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 154 | 155 | #define __KSEQ_BASIC(SCOPE, type_t) \ 156 | SCOPE kseq_t *kseq_init(type_t fd) \ 157 | { \ 158 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 159 | s->f = ks_init(fd); \ 160 | return s; \ 161 | } \ 162 | SCOPE void kseq_destroy(kseq_t *ks) \ 163 | { \ 164 | if (!ks) return; \ 165 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 166 | ks_destroy(ks->f); \ 167 | free(ks); \ 168 | } 169 | 170 | /* Return value: 171 | >=0 length of the sequence (normal) 172 | -1 end-of-file 173 | -2 truncated quality string 174 | */ 175 | #define __KSEQ_READ(SCOPE) \ 176 | SCOPE int kseq_read(kseq_t *seq) \ 177 | { \ 178 | int c; \ 179 | kstream_t *ks = seq->f; \ 180 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 181 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 182 | if (c == -1) return -1; /* end of file */ \ 183 | seq->last_char = c; \ 184 | } /* else: the first header char has been read in the previous call */ \ 185 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 186 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 187 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 188 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 189 | seq->seq.m = 256; \ 190 | seq->seq.s = (char*)malloc(seq->seq.m); \ 191 | } \ 192 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 193 | if (c == '\n') continue; /* skip empty lines */ \ 194 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 195 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 196 | } \ 197 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 198 | if (seq->seq.l + 1 >= seq->seq.m) { /* 
seq->seq.s[seq->seq.l] below may be out of boundary */ \ 199 | seq->seq.m = seq->seq.l + 2; \ 200 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 201 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 202 | } \ 203 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 204 | if (c != '+') return seq->seq.l; /* FASTA */ \ 205 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 206 | seq->qual.m = seq->seq.m; \ 207 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 208 | } \ 209 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 210 | if (c == -1) return -2; /* error: no quality string */ \ 211 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 212 | seq->last_char = 0; /* we have not come to the next header line */ \ 213 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 214 | return seq->seq.l; \ 215 | } 216 | 217 | #define __KSEQ_TYPE(type_t) \ 218 | typedef struct { \ 219 | kstring_t name, comment, seq, qual; \ 220 | int last_char; \ 221 | kstream_t *f; \ 222 | } kseq_t; 223 | 224 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 225 | KSTREAM_INIT(type_t, __read, 16384) \ 226 | __KSEQ_TYPE(type_t) \ 227 | __KSEQ_BASIC(SCOPE, type_t) \ 228 | __KSEQ_READ(SCOPE) 229 | 230 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 231 | 232 | #define KSEQ_DECLARE(type_t) \ 233 | __KS_TYPE(type_t) \ 234 | __KSEQ_TYPE(type_t) \ 235 | extern kseq_t *kseq_init(type_t fd); \ 236 | void kseq_destroy(kseq_t *ks); \ 237 | int kseq_read(kseq_t *seq); 238 | 239 | #endif 240 | -------------------------------------------------------------------------------- /benchmark/evaluation/src/mapGenome.cpp: -------------------------------------------------------------------------------- 1 | /* There maybe not only one sequence in the genome file. 
2 | * mapGenome.cpp is used to check whether multi sequences within a genome file have the same nomenclature type in the comment description or not. 3 | * The result have show that all the sequences within the same genome file have the same nomenclature type. 4 | * 5 | * 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "kseq.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | KSEQ_INIT(gzFile, gzread); 19 | using namespace std; 20 | struct Type{ 21 | string type; 22 | int number; 23 | }; 24 | 25 | 26 | int main(int argc, char *argv[]){ 27 | if(argc < 2) return 1; 28 | string inputFile = argv[1]; 29 | string outputFile = "mapType.out"; 30 | fstream fs(inputFile); 31 | string line; 32 | vector fileList; 33 | 34 | while(getline(fs, line)) 35 | { 36 | fileList.push_back(line); 37 | } 38 | cout << "the size of fileList: " << fileList.size() << endl; 39 | 40 | vector genomeType[fileList.size()]; 41 | FILE *fp = fopen(outputFile.c_str(), "w"); 42 | int tmpIndex = 0; 43 | int subIndex = fileList.size() / 100; 44 | 45 | //#pragma omp parallel for num_threads(48) 46 | for(int i = 0; i < fileList.size(); i++) 47 | { 48 | if(i > tmpIndex * subIndex) 49 | { 50 | tmpIndex++; 51 | cerr << tmpIndex << endl; 52 | } 53 | gzFile fp1 = gzopen(fileList[i].c_str(), "r"); 54 | kseq_t * ks1; 55 | ks1 = kseq_init(fp1); 56 | //cerr << "read the file " << fileList[i] << endl; 57 | unordered_map genomeMap; 58 | while(1) 59 | { 60 | int length = kseq_read(ks1); 61 | if(length < 0) break; 62 | string name = ks1->name.s; 63 | string comment = ks1->comment.s; 64 | stringstream ss; 65 | ss << comment; 66 | string type0, type1, type2; 67 | ss >> type0 >> type1 >> type2; 68 | if(type0.substr(0, 10) == "UNVERIFIED") 69 | { 70 | type0 = type1; 71 | type1 = type2; 72 | } 73 | if(type0.back() == ',') type0.pop_back(); 74 | if(type1.back() == ',') type1.pop_back(); 75 | 76 | string key = type0 + '\t' + type1; 77 | genomeMap.insert({key, 0}); 78 | genomeMap[key]++; 79 
| } 80 | gzclose(fp1); 81 | kseq_destroy(ks1); 82 | if(genomeMap.size() != 1) 83 | { 84 | cerr << "there are not only one class in the file: " << fileList[i] << endl; 85 | for(auto x : genomeMap) 86 | { 87 | cerr << "\t" << x.first << "\t" << x.second << endl; 88 | } 89 | } 90 | 91 | for(auto x : genomeMap) 92 | { 93 | //cout << x.first << "\t" << x.second << endl; 94 | fprintf(fp, "%s\t%d\n", x.first.c_str(), x.second); 95 | genomeType[i].push_back({x.first, x.second}); 96 | } 97 | fprintf(fp, "\n"); 98 | unordered_map().swap(genomeMap); 99 | } 100 | cerr << "finished" << endl; 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /benchmark/evaluation/src/precalLabel.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Email: xiaoming.xu@mail.sdu.edu.cn 3 | * Data: 2022/2/18 4 | * 5 | * calF1.cpp is used as preprocessing of the evaluation of precision, recall, F1-score, and NMI for bacteria, refseq, half-Bacteria, sub-Bacteria datasets. 6 | * The ground truth labels of genomes are as species_taxid which reveals nomenclature of gene feature. 7 | * The parameter -i and -l corresponding to the cluster of genomes served as sequences and files. 8 | * The input cluster files are in the CD-HIT format. 
/* Author: Xiaoming Xu
 * Email: xiaoming.xu@mail.sdu.edu.cn
 * Data: 2022/2/18
 *
 * calF1.cpp is used as preprocessing of the evaluation of precision, recall, F1-score, and NMI for bacteria, refseq, half-Bacteria, sub-Bacteria datasets.
 * The ground truth labels of genomes are as species_taxid which reveals nomenclature of gene feature.
 * The parameter -i and -l corresponding to the cluster of genomes served as sequences and files.
 * The input cluster files are in the CD-HIT format.
 *
 *
 */


// NOTE(review): the #include targets were stripped from this copy of the file;
// the headers below are the ones the visible code of this file uses.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using namespace std;

// A ground-truth label and the number of members of one cluster carrying it.
struct LabNum{
	int label;
	int number;
};

// For a label already claimed: the cluster holding it and with how many members.
struct GlobalLabelInfo{
	int clustId;
	int labelNumber;
};

// Start offset and size of one cluster.
struct PosNum{
	int startPos;
	int clustSize;
};

// An id paired with a count.
struct IdNum{
	int id;
	int number;
};

// Ordering for std::sort: larger number first.
bool cmpLabNum(LabNum ln1, LabNum ln2){
	return ln2.number < ln1.number;
}

// Ordering for std::sort: larger number first.
bool cmpIdNum(IdNum in1, IdNum in2){
	return in2.number < in1.number;
}

// Print the command-line usage text to stderr, one line per sentence.
inline void printInfo()
{
	static const char *const usageLines[] = {
		"run with: ./calLabel RabbitTClust -l(-i) groundTruth bacteria.out bacteria.f1",
		"The second argument (RabbitTClust) is applications, including RabbitTClust, MeshClust2, MeshClust3, gclust or Mothur ",
		"For the third argument, -l means genomes served as files, -i means genomes served as sequences",
		"The fourth argument (groundTruth) is the ground truth from assembly_bacteria.txt of the ",
		"The fifth argument (bacteria.out) is the cluster result from RabbitTClust, MeshClust2, gclust or Mothur ",
		"The sixth argument (bacteria.f1) is the output file path"
	};
	for(const char *text : usageLines)
	{
		cerr << text << endl;
	}
}
65 | */ 66 | int updateLabel(vector< vector > &labNumArr, unordered_map &globalMap, int clustId, int &badLabel, vector &resLabelArr)//return the new badLabel 67 | { 68 | bool isBad = true; 69 | while(labNumArr[clustId].size() != 0 && isBad) 70 | { 71 | int curLabel = labNumArr[clustId][0].label; 72 | int curNumber = labNumArr[clustId][0].number; 73 | if(globalMap.find(curLabel) == globalMap.end())//new label 74 | { 75 | GlobalLabelInfo glab; 76 | glab.clustId = clustId; 77 | glab.labelNumber = curNumber; 78 | globalMap.insert({curLabel, glab}); 79 | resLabelArr[clustId] = curLabel; 80 | isBad = false; 81 | } 82 | else//label collison with previous cluster 83 | { 84 | int preClustId = globalMap[curLabel].clustId; 85 | int preNumber = globalMap[curLabel].labelNumber; 86 | if(curNumber > preNumber)//the previous cluster is defeated, need to update the previous cluster. 87 | { 88 | resLabelArr[clustId] = curLabel; 89 | isBad = false; 90 | globalMap[curLabel].clustId = clustId; 91 | globalMap[curLabel].labelNumber = curNumber; 92 | badLabel = updateLabel(labNumArr, globalMap, preClustId, badLabel, resLabelArr); 93 | } 94 | else//current cluster can not defeat the previous cluster, just erase the biggest label to try new biggest label 95 | {} 96 | } 97 | 98 | labNumArr[clustId].erase(labNumArr[clustId].begin());//erase the biggest label in this cluster 99 | }//end while 100 | if(isBad) 101 | { 102 | resLabelArr[clustId] = badLabel; 103 | badLabel--;//update the newBadLabel 104 | } 105 | return badLabel; 106 | } 107 | 108 | void calF1(string application, string argument, string groundTruth, string inputFile, string outputFile) 109 | { 110 | if(application != "MeshClust3" && application != "MeshClust2" && application != "RabbitTClust" && application != "Mothur" && application != "gclust") 111 | { 112 | printInfo(); 113 | return; 114 | } 115 | ofstream ofs(outputFile); 116 | ofstream ofs1(outputFile+".humanReadable"); 117 | 118 | fstream fs0(groundTruth); 119 | string line; 
120 | 121 | unordered_map groundTruthMapFile; 122 | unordered_map groundTruthMapSeq; 123 | unordered_set groundTruthClustNumber; 124 | 125 | getline(fs0,line); 126 | while(getline(fs0, line)) 127 | { 128 | string assembly_accession, genomeName, species_taxid; 129 | stringstream ss; 130 | ss << line; 131 | //ss >> assembly_accession >> genomeName >> species_taxid; 132 | ss >> assembly_accession >> species_taxid >> genomeName; 133 | //cerr << species_taxid << endl; 134 | groundTruthMapFile.insert({assembly_accession, stoi(species_taxid)}); 135 | groundTruthMapSeq.insert({genomeName, stoi(species_taxid)}); 136 | groundTruthClustNumber.insert(stoi(species_taxid)); 137 | 138 | } 139 | cerr << "the groundTruthClustNumber size(not for this dataset) is: " << groundTruthClustNumber.size() << endl; 140 | 141 | 142 | fstream fs(inputFile); 143 | 144 | int curStandardIndex = 0; 145 | vector ourClust; 146 | vector standardClust; 147 | unordered_map standardMap; 148 | unordered_map curMap; 149 | 150 | int startPos = 0; 151 | vector< vector > labNumArr; 152 | vector posArr; 153 | 154 | int numNotIngroundTruth = 0; 155 | 156 | if(application == "MeshClust3") 157 | { 158 | int curId; 159 | string genomeSize, genomeName, fileName; 160 | while(getline(fs, line)) 161 | { 162 | if(line.length() == 0)//finish a cluster 163 | { 164 | if(curMap.size() != 0) 165 | { 166 | int clustSize = 0; 167 | vector curClustInfo; 168 | for(auto x : curMap) 169 | { 170 | LabNum ln; 171 | ln.label = x.first; 172 | ln.number = x.second; 173 | curClustInfo.push_back(ln); 174 | clustSize += x.second; 175 | } 176 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 177 | labNumArr.push_back(curClustInfo); 178 | 179 | PosNum pn; 180 | pn.startPos = startPos; 181 | pn.clustSize = clustSize; 182 | posArr.push_back(pn); 183 | 184 | startPos += clustSize; 185 | 186 | unordered_map().swap(curMap); 187 | } 188 | } 189 | else 190 | { 191 | stringstream ss; 192 | ss << line; 193 | ss >> curId >> 
genomeName; 194 | genomeName = genomeName.substr(1); 195 | if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end()) 196 | { 197 | //cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl; 198 | numNotIngroundTruth++; 199 | continue; 200 | } 201 | else 202 | { 203 | int curLabel = groundTruthMapSeq[genomeName]; 204 | standardClust.push_back(curLabel); 205 | curMap.insert({curLabel, 0}); 206 | curMap[curLabel]++; 207 | } 208 | } 209 | } 210 | } 211 | else 212 | { 213 | 214 | while(getline(fs, line)) 215 | { 216 | if(line.length() == 0) continue; 217 | if(application == "MeshClust2") 218 | { 219 | if(line[0] == '>') 220 | { 221 | if(curMap.size() != 0) 222 | { 223 | int clustSize = 0; 224 | vector curClustInfo; 225 | 226 | for(auto x : curMap) 227 | { 228 | LabNum ln; 229 | ln.label = x.first; 230 | ln.number = x.second; 231 | curClustInfo.push_back(ln); 232 | clustSize += x.second; 233 | } 234 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 235 | labNumArr.push_back(curClustInfo); 236 | 237 | PosNum pn; 238 | pn.startPos = startPos; 239 | pn.clustSize = clustSize; 240 | posArr.push_back(pn); 241 | 242 | startPos += clustSize; 243 | 244 | unordered_map().swap(curMap); 245 | } 246 | } 247 | else{ 248 | stringstream ss; 249 | ss << line; 250 | int curId, genomeId; 251 | string genomeSize, fileName, genomeName; 252 | string type0, type1, type2; 253 | if(argument == "-l") 254 | ss >> curId >> fileName >>genomeSize >> genomeName >> type0 >> type1 >> type2; 255 | else if(argument == "-i") 256 | ss >> curId >>genomeSize >> genomeName >> type0 >> type1 >> type2; 257 | else 258 | { 259 | cerr << "error argument, need -l or -i " << endl; 260 | printInfo(); 261 | return; 262 | } 263 | 264 | genomeName = genomeName.substr(1); 265 | if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end()) 266 | { 267 | //cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" 
<< endl; 268 | numNotIngroundTruth++; 269 | continue; 270 | } 271 | else 272 | { 273 | int curLabel = groundTruthMapSeq[genomeName]; 274 | standardClust.push_back(curLabel); 275 | curMap.insert({curLabel, 0}); 276 | curMap[curLabel]++; 277 | } 278 | 279 | } 280 | }//end MeshClust2 281 | else //other application(RabbitTClust, gclust, Mothur) 282 | { 283 | if(line[0] != '\t') 284 | { 285 | if(curMap.size() != 0) 286 | { 287 | int clustSize = 0; 288 | vector curClustInfo; 289 | 290 | for(auto x : curMap) 291 | { 292 | LabNum ln; 293 | ln.label = x.first; 294 | ln.number = x.second; 295 | curClustInfo.push_back(ln); 296 | clustSize += x.second; 297 | } 298 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 299 | labNumArr.push_back(curClustInfo); 300 | 301 | PosNum pn; 302 | pn.startPos = startPos; 303 | pn.clustSize = clustSize; 304 | posArr.push_back(pn); 305 | 306 | startPos += clustSize; 307 | 308 | unordered_map().swap(curMap); 309 | } 310 | } 311 | else{ 312 | stringstream ss; 313 | ss << line; 314 | int curId, genomeId; 315 | string genomeSize, fileName, genomeName; 316 | string type0, type1, type2; 317 | if(application == "RabbitTClust") 318 | { 319 | if(argument == "-l") 320 | { 321 | ss >> curId >> genomeId >> genomeSize >> fileName >> genomeName >> type0 >> type1 >> type2; 322 | int startIndex = fileName.find_last_of('/'); 323 | //cerr << fileName << endl; 324 | //cerr << startIndex << endl; 325 | int endIndex = fileName.find('_', startIndex + 5); 326 | if(fileName.find('_', startIndex+5) == -1) 327 | endIndex = fileName.find('.', startIndex+5); 328 | //int endIndex = std::max(fileName.find('_', startIndex + 5), fileName.find('.', startIndex + 5)); 329 | //cerr << endIndex << endl; 330 | string key = fileName.substr(startIndex+1, endIndex -startIndex -1); 331 | //cerr << key << endl; 332 | //exit(0); 333 | if(groundTruthMapFile.find(key) == groundTruthMapFile.end()) 334 | { 335 | //cerr << "the key: " << key << " is not in the groundTruth!" 
<< endl; 336 | numNotIngroundTruth++; 337 | continue;//skip this label 338 | } 339 | else 340 | { 341 | int curLabel = groundTruthMapFile[key]; 342 | standardClust.push_back(curLabel); 343 | curMap.insert({curLabel, 0}); 344 | curMap[curLabel]++; 345 | } 346 | } 347 | else if(argument == "-i") 348 | { 349 | ss >> curId >> genomeId >> genomeSize >> genomeName >> type0 >> type1 >> type2; 350 | if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end()) 351 | { 352 | //cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl; 353 | numNotIngroundTruth++; 354 | continue; 355 | } 356 | else 357 | { 358 | int curLabel = groundTruthMapSeq[genomeName]; 359 | standardClust.push_back(curLabel); 360 | curMap.insert({curLabel, 0}); 361 | curMap[curLabel]++; 362 | } 363 | } 364 | } 365 | else if(application == "Mothur")//TODO 366 | { 367 | ss >> genomeName >> type0 >> type1 >> type2; 368 | if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end()) 369 | { 370 | //cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl; 371 | numNotIngroundTruth++; 372 | continue; 373 | } 374 | else 375 | { 376 | int curLabel = groundTruthMapSeq[genomeName]; 377 | standardClust.push_back(curLabel); 378 | curMap.insert({curLabel, 0}); 379 | curMap[curLabel]++; 380 | } 381 | } 382 | else if(application == "gclust")//TODO 383 | { 384 | ss >> curId >> genomeSize >> genomeName >> type0 >> type1 >> type2; 385 | if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end()) 386 | { 387 | //cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" 
<< endl; 388 | numNotIngroundTruth++; 389 | continue; 390 | } 391 | else 392 | { 393 | int curLabel = groundTruthMapSeq[genomeName]; 394 | standardClust.push_back(curLabel); 395 | curMap.insert({curLabel, 0}); 396 | curMap[curLabel]++; 397 | } 398 | } 399 | else 400 | { 401 | cerr << "error application, need RabbitTClust, Mothur, gclust or MeshClust2" << endl; 402 | printInfo(); 403 | return; 404 | } 405 | 406 | }//end a cluster calculation 407 | } 408 | }//end while 409 | } 410 | 411 | if(curMap.size() != 0) 412 | { 413 | int clustSize = 0; 414 | vector curClustInfo; 415 | 416 | for(auto x : curMap) 417 | { 418 | LabNum ln; 419 | ln.label = x.first; 420 | ln.number = x.second; 421 | curClustInfo.push_back(ln); 422 | clustSize += x.second; 423 | } 424 | std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum); 425 | labNumArr.push_back(curClustInfo); 426 | 427 | PosNum pn; 428 | pn.startPos = startPos; 429 | pn.clustSize = clustSize; 430 | posArr.push_back(pn); 431 | 432 | startPos += clustSize; 433 | 434 | unordered_map().swap(curMap); 435 | } 436 | 437 | //update the labels 438 | unordered_map globalMap; 439 | int badLabel = -1; 440 | vector resLabelArr; 441 | int clustNumber = labNumArr.size(); 442 | resLabelArr.resize(clustNumber); 443 | for(int i = 0; i < clustNumber; i++) 444 | { 445 | badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 446 | } 447 | 448 | //generate the result 449 | for(int i = 0; i < posArr.size(); i++) 450 | { 451 | int startPos = posArr[i].startPos; 452 | int clustSize = posArr[i].clustSize; 453 | for(int j = 0; j < clustSize; j++) 454 | { 455 | ourClust.push_back(resLabelArr[i]); 456 | } 457 | } 458 | 459 | cerr << "the number of which not in the groundTruth is: " << numNotIngroundTruth << endl; 460 | cerr << "the size of ourClust is: " << ourClust.size() << endl; 461 | cerr << "the size of standardClust is: " << standardClust.size() << endl; 462 | 463 | if(ourClust.size() != standardClust.size()) 464 | { 465 | 
cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl; 466 | return; 467 | } 468 | for(int i = 0; i < ourClust.size(); i++) 469 | { 470 | ofs1 << ourClust[i] << '\t' << standardClust[i] << endl; 471 | } 472 | 473 | for(int i = 0; i < ourClust.size(); i++) 474 | ofs << ourClust[i] << ' '; 475 | ofs << endl; 476 | 477 | for(int i = 0; i < standardClust.size(); i++) 478 | ofs << standardClust[i] << ' '; 479 | ofs << endl; 480 | 481 | } 482 | 483 | int main(int argc, char* argv[]){ 484 | if(argc < 6){ 485 | printInfo(); 486 | return 1; 487 | } 488 | string application = argv[1]; 489 | string argument = argv[2]; 490 | string groundTruth = argv[3]; 491 | string inputFile = argv[4]; 492 | string outputFile = argv[5]; 493 | 494 | calF1(application, argument, groundTruth, inputFile, outputFile); 495 | 496 | return 0; 497 | 498 | } 499 | -------------------------------------------------------------------------------- /benchmark/generateList.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #set -x 3 | 4 | cd ref/ 5 | resName="refList" 6 | if [ -f "$resName" ]; then 7 | echo "the file exist, remove" 8 | rm $resName 9 | fi 10 | 11 | for dir in archaea bacteria fungi viral plant protozoa human vertebrate_mammalian vertebrate_other 12 | do 13 | #echo $dir 14 | ls $dir/*.fna.gz >$dir.gz.list 15 | cat $dir.gz.list | while read line 16 | do 17 | #echo $line 18 | gunzip $line 19 | done 20 | rm $dir.gz.list 21 | 22 | ls $dir/*.fna > $dir.list 23 | cat $dir.list | while read line 24 | do 25 | echo `pwd`/$line >>$resName 26 | done 27 | rm $dir.list 28 | done 29 | 30 | mv $resName ../ 31 | cd ../ 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /benchmark/simulate/Makefile: -------------------------------------------------------------------------------- 1 | all: simulate-longSequence create-containment-bacteria 2 | simulate-longSequence: 
src/simulate_longSequence.cpp 3 | g++ -O3 src/simulate_longSequence.cpp -o simulate-longSequence 4 | create-containment-bacteria: src/create_containment_bacteria.cpp 5 | g++ -O3 src/create_containment_bacteria.cpp -o create-containment-bacteria -lz 6 | 7 | clean: 8 | rm simulate-longSequence create-containment-bacteria 9 | 10 | -------------------------------------------------------------------------------- /benchmark/simulate/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for simulating genome sequences 2 | 3 | ## simulate-longSequence 4 | `simulate-longSequence` is used to generate simulated sequences with a predefined sequence length and mutation rate. 5 | 6 | Example: `simulate-longSequence 10 20 300 1000000 simulate_10_20_300_1M` 7 | It will generate `20` clusters, each containing one seed sequence plus `300` mutated sequences, with a mutation rate of `0.01` (10/1000) and an approximate length of `1,000,000`. 8 | There will be three output files named `simulate_10_20_300_1M_seed.fna`, `simulate_10_20_300_1M_total.fna`, and `simulate_10_20_300_1M_groundTruth`, which correspond to the seed sequence file, the total sequence file, and the cluster ground-truth file, respectively. 9 | 10 | * Run as `simulate-longSequence mutation_rate*1000(integer) numSeedSeqs numEachClusts seqLength output` 11 | * The 0 parameter(`./simulate-longSequence`) is the application name. 12 | * The 1 parameter(`mutation_rate*1000`) is the mutation rate multiplied by 1000, given as an integer (e.g. `10` means a mutation rate of `0.01`). 13 | * The 2 parameter(`numSeedSeqs`) is the number of seed sequences (number of clusters). 14 | * The 3 parameter(`numEachClusts`) is the number of mutated sequences generated from each seed sequence in a cluster. 15 | * The 4 parameter(`seqLength`) is the approximate length for each sequence. 16 | * The 5 parameter(`output`) is the prefix name for the ground truth, seed sequences and total simulated sequences files. 
17 | 18 | ## create-containment-bacteria 19 | `create-containment-bacteria` is used to generate genomes by truncating each sequence of an original seed bacterial genome to a random proportion (between 0.0 and 1.0) of its length. 20 | 21 | Example: `create-containment-bacteria input.list 8 50 simulatePath` 22 | It will generate `8` clusters, each with `50` sequences. 23 | Each simulated genome in a cluster is generated by keeping a random proportion (between 0.0 and 1.0) of the length of each sequence in an original seed bacterial genome. 24 | The seed bacterial genomes come from the `input.list`, which contains one original genome path per line. 25 | The similarities between the seed genomes in the `input.list` should be low to ensure low inter-cluster similarity. 26 | There will be `408` genome files (8 seed genomes and 400 generated genomes) in the `simulatePath` folder. 27 | 28 | * Run as: `./create-containment-bacteria input.list num_of_clust num_genomes_each_clust simulatePath` 29 | * The 0 parameter(`./create-containment-bacteria`) is the application name 30 | * The 1 parameter(`input.list`) is input parameter, genome file list, one genome path per line. 
31 | * The 2 parameter(`num_of_clust`) is input parameter, the number of clusters 32 | * The 3 parameter(`num_genomes_each_clust`) is input parameter, the number of genomes in each cluster 33 | * The 4 parameter(`simulatePath`) is output path, the output file path 34 | -------------------------------------------------------------------------------- /benchmark/simulate/src/create_containment_bacteria.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Data: 2022/5/31 3 | * 4 | * 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "kseq.h" 25 | 26 | 27 | KSEQ_INIT(gzFile, gzread); 28 | 29 | using namespace std; 30 | 31 | void printInfo(string pwd, string dependency, string example, vector args, vector descriptions); 32 | 33 | int main(int argc , char *argv[]){ 34 | string application = argv[0]; 35 | vector args, descriptions; 36 | args.push_back(application); 37 | descriptions.push_back("the application name"); 38 | 39 | //========= parameters need changing ======== 40 | //The example is with parameters of specific numbers. 41 | //args is the tutorial names. 
42 | string pwd = "RabbitTClust/benchmark/simulate/src/create_containment_bacteria.cpp"; 43 | string dependency = "kseq.h"; 44 | string example = application + " simulateList num_of_clust num_genomes_each_clust simulatePath"; 45 | args.push_back("simulateList"); 46 | args.push_back("num_of_clust"); 47 | args.push_back("num_genomes_each_clust"); 48 | args.push_back("simulatePath"); 49 | descriptions.push_back("input file, genome file list, one genome path per line"); 50 | descriptions.push_back("input parameter, the number of clusters"); 51 | descriptions.push_back("input parameter, the number of genomes in each cluster"); 52 | descriptions.push_back("output path, the output file path"); 53 | 54 | //-------- no changing ----------- 55 | assert(args.size() == descriptions.size()); 56 | if(argc != args.size()) { 57 | printInfo(pwd, dependency, example, args, descriptions); 58 | return 1; 59 | } 60 | else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help")) 61 | { 62 | printInfo(pwd, dependency, example, args, descriptions); 63 | return 1; 64 | } 65 | 66 | //======== specific implement ======== 67 | string inputList = argv[1]; 68 | int numClusts = stoi(argv[2]); 69 | int numGenomePerClust = stoi(argv[3]); 70 | string outputPath = argv[4]; 71 | string cmd0 = "mkdir -p " + outputPath; 72 | system(cmd0.c_str()); 73 | 74 | //get randoms 75 | vector randArr; 76 | for(int i = 0; i < numClusts* numGenomePerClust; i++) 77 | { 78 | int randNumber = rand() % 1000; 79 | randArr.push_back((double)randNumber/1000); 80 | } 81 | 82 | fstream fs0(inputList); 83 | unordered_set seedSet; 84 | vector seedArr; 85 | string line; 86 | 87 | while(getline(fs0, line)) 88 | { 89 | int startIndex = line.find('/'); 90 | int endIndex = line.find('.'); 91 | string key = line.substr(startIndex + 1, endIndex-startIndex-1); 92 | if(seedSet.find(key) == seedSet.end()) 93 | { 94 | seedArr.push_back(line); 95 | seedSet.insert(key); 96 | } 97 | } 98 | //cerr << "the size of seedArr is: " << 
seedArr.size() << endl; 99 | 100 | //string groundTruthFile = outputPath + "/groundTruth"; 101 | //ofstream ofs(groundTruthFile); 102 | int seedArrSize = seedArr.size(); 103 | int minNumber = std::min(numClusts, seedArrSize); 104 | for(int i = 0; i < minNumber; i++) 105 | { 106 | string cp_command = "cp " + seedArr[i] + " " + outputPath + '/'; 107 | cerr << cp_command << endl; 108 | system(cp_command.c_str()); 109 | gzFile fp1 = gzopen(seedArr[i].c_str(), "r"); 110 | if(!fp1){ 111 | cerr << "cannot open file: " << seedArr[i] << endl; 112 | continue; 113 | } 114 | kseq_t *ks1 = kseq_init(fp1); 115 | vector bufArr; 116 | 117 | FILE *fpArr[numGenomePerClust]; 118 | int startIndex = seedArr[i].find('/'); 119 | string keyName = seedArr[i].substr(startIndex+1); 120 | int indexStr = keyName.find_last_of('.'); 121 | keyName = keyName.substr(0, indexStr); 122 | 123 | for(int j = 0; j < numGenomePerClust; j++) 124 | { 125 | string outputName = outputPath + '/' + keyName + '.' + to_string(j) + ".fna"; 126 | fpArr[j] = fopen(outputName.c_str(), "w"); 127 | string writeBuffer(""); 128 | bufArr.push_back(writeBuffer); 129 | } 130 | 131 | int index = 0; 132 | while(1) 133 | { 134 | int length = kseq_read(ks1); 135 | if(length < 0) break; 136 | string name = ks1->name.s; 137 | string comment = ks1->comment.s; 138 | string content = ks1->seq.s; 139 | string headLine = '>' + name + ' ' + comment + '\n'; 140 | for(int j = 0; j < numGenomePerClust; j++) 141 | { 142 | bufArr[j] += headLine; 143 | 144 | int readLength = length * randArr[i*numGenomePerClust + j]; 145 | //cerr << "the readLength is: " << readLength << endl; 146 | for(int k = 0; k < readLength; k +=80) 147 | { 148 | int actualLen = std::min(80, readLength - k); 149 | string tmpContent = content.substr(k, actualLen); 150 | bufArr[j] += tmpContent + '\n'; 151 | } 152 | index++; 153 | } 154 | } 155 | for(int j = 0; j < numGenomePerClust; j++) 156 | { 157 | fwrite(bufArr[j].c_str(), sizeof(char), bufArr[j].length(), fpArr[j]); 
158 | fclose(fpArr[j]); 159 | } 160 | 161 | gzclose(fp1); 162 | kseq_destroy(ks1); 163 | } 164 | //ofs.close(); 165 | 166 | return 0; 167 | } 168 | 169 | void printInfo(string pwd, string dependency, string example, vector args, vector descriptions){ 170 | assert(args.size() == descriptions.size()); 171 | cerr << endl; 172 | cerr << "example: " << example << endl; 173 | cerr << endl; 174 | cerr << "source file path: " << pwd << endl; 175 | cerr << endl; 176 | cerr << "dependency: " << dependency << endl; 177 | cerr << endl; 178 | cerr << "run as: "; 179 | for(int i = 0; i < args.size(); i++){ 180 | cerr << args[i] << ' '; 181 | } 182 | cerr << endl; 183 | for(int i = 0; i < args.size(); i++){ 184 | fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str()); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /benchmark/simulate/src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #ifdef USE_MALLOC_WRAPPERS 36 | # include "malloc_wrap.h" 37 | #endif 38 | 39 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 40 | #define KS_SEP_TAB 1 // isspace() && !' ' 41 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 42 | #define KS_SEP_MAX 2 43 | 44 | #define __KS_TYPE(type_t) \ 45 | typedef struct __kstream_t { \ 46 | unsigned char *buf; \ 47 | int begin, end, is_eof; \ 48 | type_t f; \ 49 | } kstream_t; 50 | 51 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 52 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 53 | 54 | #define __KS_BASIC(type_t, __bufsize) \ 55 | static inline kstream_t *ks_init(type_t f) \ 56 | { \ 57 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 58 | ks->f = f; \ 59 | ks->buf = (unsigned char*)malloc(__bufsize); \ 60 | return ks; \ 61 | } \ 62 | static inline void ks_destroy(kstream_t *ks) \ 63 | { \ 64 | if (ks) { \ 65 | free(ks->buf); \ 66 | free(ks); \ 67 | } \ 68 | } 69 | 70 | #define __KS_GETC(__read, __bufsize) \ 71 | static inline int ks_getc(kstream_t *ks) \ 72 | { \ 73 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 74 | if (ks->begin >= ks->end) { \ 75 | ks->begin = 0; \ 76 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 77 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \ 78 | } \ 79 | return (int)ks->buf[ks->begin++]; \ 80 | } 81 | 82 | #ifndef KSTRING_T 83 | #define KSTRING_T kstring_t 84 | typedef struct __kstring_t { 85 | size_t l, m; 86 | char *s; 87 | } kstring_t; 88 | #endif 89 | 90 | #ifndef 
kroundup32 91 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 92 | #endif 93 | 94 | #define __KS_GETUNTIL(__read, __bufsize) \ 95 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 96 | { \ 97 | int gotany = 0; \ 98 | if (dret) *dret = 0; \ 99 | str->l = append? str->l : 0; \ 100 | for (;;) { \ 101 | int i; \ 102 | if (ks->begin >= ks->end) { \ 103 | if (!ks->is_eof) { \ 104 | ks->begin = 0; \ 105 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 106 | if (ks->end == 0) { ks->is_eof = 1; break; } \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! 
*/ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | gotany = 1; \ 128 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 129 | str->l = str->l + (i - ks->begin); \ 130 | ks->begin = i + 1; \ 131 | if (i < ks->end) { \ 132 | if (dret) *dret = ks->buf[i]; \ 133 | break; \ 134 | } \ 135 | } \ 136 | if (!gotany && ks_eof(ks)) return -1; \ 137 | if (str->s == 0) { \ 138 | str->m = 1; \ 139 | str->s = (char*)calloc(1, 1); \ 140 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 141 | str->s[str->l] = '\0'; \ 142 | return str->l; \ 143 | } \ 144 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 145 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 146 | 147 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 148 | __KS_TYPE(type_t) \ 149 | __KS_BASIC(type_t, __bufsize) \ 150 | __KS_GETC(__read, __bufsize) \ 151 | __KS_GETUNTIL(__read, __bufsize) 152 | 153 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 154 | 155 | #define __KSEQ_BASIC(SCOPE, type_t) \ 156 | SCOPE kseq_t *kseq_init(type_t fd) \ 157 | { \ 158 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 159 | s->f = ks_init(fd); \ 160 | return s; \ 161 | } \ 162 | SCOPE void kseq_destroy(kseq_t *ks) \ 163 | { \ 164 | if (!ks) return; \ 165 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 166 | ks_destroy(ks->f); \ 167 | free(ks); \ 168 | } 169 | 170 | /* Return value: 171 | >=0 length of the sequence (normal) 172 | -1 end-of-file 173 | -2 truncated quality string 174 | */ 175 | #define __KSEQ_READ(SCOPE) \ 176 | SCOPE int kseq_read(kseq_t *seq) \ 177 | { \ 178 | int c; \ 179 | kstream_t *ks = seq->f; \ 180 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 181 | while 
((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 182 | if (c == -1) return -1; /* end of file */ \ 183 | seq->last_char = c; \ 184 | } /* else: the first header char has been read in the previous call */ \ 185 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 186 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 187 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 188 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 189 | seq->seq.m = 256; \ 190 | seq->seq.s = (char*)malloc(seq->seq.m); \ 191 | } \ 192 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 193 | if (c == '\n') continue; /* skip empty lines */ \ 194 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 195 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 196 | } \ 197 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 198 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 199 | seq->seq.m = seq->seq.l + 2; \ 200 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 201 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 202 | } \ 203 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 204 | if (c != '+') return seq->seq.l; /* FASTA */ \ 205 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 206 | seq->qual.m = seq->seq.m; \ 207 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 208 | } \ 209 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 210 | if (c == -1) return -2; /* error: no quality string */ \ 211 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 212 | seq->last_char = 0; /* we have not come to the next header line */ \ 213 | 
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 214 | return seq->seq.l; \ 215 | } 216 | 217 | #define __KSEQ_TYPE(type_t) \ 218 | typedef struct { \ 219 | kstring_t name, comment, seq, qual; \ 220 | int last_char; \ 221 | kstream_t *f; \ 222 | } kseq_t; 223 | 224 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 225 | KSTREAM_INIT(type_t, __read, 16384) \ 226 | __KSEQ_TYPE(type_t) \ 227 | __KSEQ_BASIC(SCOPE, type_t) \ 228 | __KSEQ_READ(SCOPE) 229 | 230 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 231 | 232 | #define KSEQ_DECLARE(type_t) \ 233 | __KS_TYPE(type_t) \ 234 | __KSEQ_TYPE(type_t) \ 235 | extern kseq_t *kseq_init(type_t fd); \ 236 | void kseq_destroy(kseq_t *ks); \ 237 | int kseq_read(kseq_t *seq); 238 | 239 | #endif 240 | -------------------------------------------------------------------------------- /benchmark/simulate/src/simulate_longSequence.cpp: -------------------------------------------------------------------------------- 1 | /* Author: Xiaoming Xu 2 | * Data: 2022/5/12 3 | * 4 | * See the LICENSE.txt file included with this software for licence information. 
5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | void printInfo(string pwd, string example, vector args, vector descriptions){ 19 | assert(args.size() == descriptions.size()); 20 | cerr << endl; 21 | cerr << "example: " << example << endl; 22 | cerr << endl; 23 | cerr << "source file path: " << pwd << endl; 24 | cerr << endl; 25 | cerr << "run as: "; 26 | for(int i = 0; i < args.size(); i++){ 27 | cerr << args[i] << ' '; 28 | } 29 | cerr << endl; 30 | for(int i = 0; i < args.size(); i++){ 31 | fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str()); 32 | } 33 | } 34 | 35 | 36 | int main(int argc , char *argv[]){ 37 | string pwd = "RabbitTClust/benchmark/simulate/src/simulate_longSequence.cpp"; 38 | string application = argv[0]; 39 | string example = application + " 10 20 300 1000000 simulate_10_20_300_1M"; 40 | vector args, descriptions; 41 | args.push_back(application); 42 | args.push_back("mutation_rate*1000(integer)"); 43 | args.push_back("numSeedSeqs"); 44 | args.push_back("numEachClusts"); 45 | args.push_back("seqLength"); 46 | args.push_back("output"); 47 | descriptions.push_back("the application name"); 48 | descriptions.push_back("the mutation rate"); 49 | descriptions.push_back("the number of seed sequence (number of clusters)"); 50 | descriptions.push_back("the number sequence in a cluster generate from each seed sequence"); 51 | descriptions.push_back("the approximate length for each sequence"); 52 | descriptions.push_back("the prefix name for groundTruth, seedSequences and totalSimulateSequence fasta files"); 53 | 54 | assert(args.size() == descriptions.size()); 55 | 56 | if(argc != args.size()) { 57 | printInfo(pwd, example, args, descriptions); 58 | return -1; 59 | } 60 | else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help")) 61 | { 62 | printInfo(pwd, example, args, 
descriptions); 63 | return 1; 64 | } 65 | 66 | const char nucs[4] = { 'A','T','G','C'}; 67 | 68 | int erate = std::atoi(argv[1]); 69 | int numClusts = std::stoi(argv[2]); 70 | int numEachClusts = std::stoi(argv[3]); 71 | int seqLength = std::stoi(argv[4]); 72 | string outPrefix = argv[5]; 73 | string outSeedFile = outPrefix + "_seed.fna"; 74 | string outTotalFile = outPrefix + "_total.fna"; 75 | string outGroundTruth = outPrefix + "_groundTruth"; 76 | 77 | cerr << "the error rate is: " << double(erate)/1000 << endl; 78 | cerr << "the number of clusters is: " << numClusts << endl; 79 | cerr << "the number of sequences in each cluster is: " << numEachClusts << endl; 80 | cerr << "the approximate sequence length is: " << seqLength << endl; 81 | cerr << "the output seed sequences file is: " << outSeedFile << endl; 82 | cerr << "the output total sequences file is: " << outTotalFile << endl; 83 | cerr << "the groundTruth file is: " << outGroundTruth << endl; 84 | 85 | FILE * fp0 = fopen(outSeedFile.c_str(), "w"); 86 | FILE * fp1 = fopen(outTotalFile.c_str(), "w"); 87 | FILE * fp2 = fopen(outGroundTruth.c_str(), "w"); 88 | 89 | string key1 = "seqName"; 90 | string key2 = "taxid"; 91 | fprintf(fp2, "%s\t%s\n",key1.c_str(), key2.c_str()); 92 | 93 | for(int i = 0; i < numClusts; i++) 94 | { 95 | struct timeval tv; 96 | gettimeofday(&tv, NULL); 97 | srand(tv.tv_usec); 98 | string seqName = ">seq_" + to_string(i); 99 | string groundTruthName = seqName.substr(1); 100 | fprintf(fp2, "%s\t%d\n", groundTruthName.c_str(), i); 101 | string seqComment = "Seed sequence " + to_string(i) + " to generate mutations"; 102 | string infoLine = seqName + '\t' + seqComment + '\n'; 103 | string seedSeq(""); 104 | for(int i = 0; i < seqLength; i++) 105 | { 106 | char newC = nucs[random()%4]; 107 | seedSeq += newC; 108 | } 109 | //output the seed sequence into outSeedFile 110 | int infoLineLen = infoLine.length(); 111 | fwrite(infoLine.c_str(), sizeof(char), infoLineLen, fp0); 112 | 
fwrite(infoLine.c_str(), sizeof(char), infoLineLen, fp1); 113 | int seedLen = seedSeq.length(); 114 | string outSeedSeq(""); 115 | for(int k = 0; k < seedLen; k += 80) 116 | { 117 | int curLength = std::min(80, seedLen-k); 118 | string tmpLine = seedSeq.substr(k, curLength); 119 | outSeedSeq += tmpLine + '\n'; 120 | } 121 | int outSeedSeqLen = outSeedSeq.length(); 122 | fwrite(outSeedSeq.c_str(), sizeof(char), outSeedSeqLen, fp0); 123 | fwrite(outSeedSeq.c_str(), sizeof(char), outSeedSeqLen, fp1); 124 | 125 | 126 | //for generate mutation sequences 127 | for(int j = 0; j < numEachClusts; j++) 128 | { 129 | string mutationName = seqName + "_mutation_" + to_string(j); 130 | string groundTruthMuName = mutationName.substr(1); 131 | fprintf(fp2, "%s\t%d\n", groundTruthMuName.c_str(), i); 132 | string mutationComment = "mutation sequence " + to_string(j) + " from seedSequence " + to_string(i); 133 | string mutaInfoLine = mutationName + '\t' + mutationComment + '\n'; 134 | string mutationSeq(""); 135 | for(int t = 0; t < seedSeq.length(); t++) 136 | { 137 | if(random()%1000 < erate){ 138 | int mut = random()%3; 139 | if(mut == 0)//sub 140 | { 141 | while(1){ 142 | char newc = nucs[random()%4]; 143 | if(newc != seedSeq[t]){ 144 | mutationSeq += newc; 145 | break; 146 | } 147 | } 148 | } 149 | else if(mut == 1)// ins 150 | { 151 | mutationSeq += nucs[random()%4]; 152 | t = t - 1; 153 | } 154 | else//del 155 | continue; 156 | }//end if mutation 157 | else// no mutation 158 | mutationSeq += seedSeq[t]; 159 | } 160 | int mutaInfoLineLen = mutaInfoLine.length(); 161 | fwrite(mutaInfoLine.c_str(), sizeof(char), mutaInfoLineLen, fp1); 162 | int mutationLen = mutationSeq.length(); 163 | string outMutationSeq(""); 164 | for(int k = 0; k < mutationLen; k += 80) 165 | { 166 | int curLength = std::min(80, mutationLen-k); 167 | string tmpLine = mutationSeq.substr(k, curLength); 168 | outMutationSeq += tmpLine + '\n'; 169 | } 170 | int outMutationSeqLen = outMutationSeq.length(); 171 | 
fwrite(outMutationSeq.c_str(), sizeof(char), outMutationSeqLen, fp1); 172 | } 173 | } 174 | fclose(fp0); 175 | fclose(fp1); 176 | fclose(fp2); 177 | 178 | cerr << "finish generate mutation files with multithread " << endl; 179 | 180 | return 0; 181 | } 182 | 183 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | #make rabbitSketch library 4 | cd RabbitSketch && 5 | mkdir -p build && cd build && 6 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. && 7 | make -j8 && make install && 8 | cd ../../ && 9 | 10 | #make rabbitFX library 11 | cd RabbitFX && 12 | mkdir -p build && cd build && 13 | cmake -DCMAKE_INSTALL_PREFIX=. .. && 14 | make -j8 && make install && 15 | cd ../../ && 16 | 17 | #compile the clust-greedy 18 | mkdir -p build && cd build && 19 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 20 | make -j8 && make install && 21 | cd ../ && 22 | 23 | #compile the clust-mst 24 | cd build && 25 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. 
&& 26 | make -j8 && make install && 27 | cd ../ 28 | -------------------------------------------------------------------------------- /rabbittclust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RabbitBio/RabbitTClust/056ad8b0067688994a1f1529300a947f4ba6da0d/rabbittclust.png -------------------------------------------------------------------------------- /src/MST.h: -------------------------------------------------------------------------------- 1 | #ifndef H_MST_GRAPH 2 | #define H_MST_GRAPH 3 | 4 | #include 5 | #include 6 | #include 7 | #include "SketchInfo.h" 8 | 9 | struct NeighborNode{ 10 | int id; 11 | double distance; 12 | NeighborNode(int i, double d){ 13 | id = i; 14 | distance = d; 15 | } 16 | }; 17 | 18 | struct EdgeInfo{ 19 | int preNode; 20 | int sufNode; 21 | double dist; 22 | }; 23 | 24 | 25 | struct Graph{ 26 | int node; 27 | int curNeighbor; 28 | std::vector neighbor; 29 | 30 | }; 31 | 32 | 33 | struct MST{ 34 | std::unordered_set nodes; 35 | std::vector edges; 36 | 37 | }; 38 | 39 | bool cmpEdge(EdgeInfo e1, EdgeInfo e2); 40 | 41 | bool cmpNeighbor(NeighborNode n1, NeighborNode n2); 42 | 43 | std::vector kruskalAlgorithm(std::vectorgraph, int vertices); 44 | 45 | vector generateMST(vector& sketches, string sketchFunc, int threads); 46 | 47 | vector append_MST(vector& pre_sketches, vector& append_sketches, int sketch_func_id, int threads, int ** &denseArr, int denseSpan, uint64_t* &aniArr); 48 | 49 | vector modifyMST(vector& sketches, int start_index, int sketch_func_id, int threads, bool no_dense, int** &denseArr, int denseSpan, uint64_t* &aniArr); 50 | 51 | vector compute_kssd_mst(vector& sketches, KssdParameters info, const string folder_path, int start_index, bool no_dense, bool isContainment, int threads, int** &denseArr, int denseSpan, uint64_t* &aniArr); 52 | 53 | std::vector generateForest(std::vector mst, double threshhold); 54 | 55 | std::vector > 
generateCluster(std::vector forest, int vertices); 56 | 57 | vector> generateClusterWithBfs(vector forest, int vertices); 58 | 59 | vector modifyForest(vector forset, vector noiseArr, int threads); 60 | 61 | typedef pair PairInt; 62 | vector getNoiseNode(vector densePairArr, int alpha); 63 | 64 | string get_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file); 65 | string get_kssd_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file); 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /src/MST_IO.cpp: -------------------------------------------------------------------------------- 1 | #include "MST_IO.h" 2 | #include "Sketch_IO.h" 3 | #include 4 | using namespace std; 5 | 6 | 7 | inline bool cmpSketchLength(ClusterInfo c1, ClusterInfo c2){ 8 | return c1.length > c2.length; 9 | } 10 | 11 | void loadDense(int** &denseArr, string folderPath, int& denseSpan, int& genome_number){ 12 | string file_dense = folderPath + '/' + "mst.dense"; 13 | FILE* fp_dense = fopen(file_dense.c_str(), "r"); 14 | if(!fp_dense){ 15 | cerr << "ERROR: saveDense(), cannot open the file: " << file_dense; 16 | exit(1); 17 | } 18 | fread(&genome_number, sizeof(int), 1, fp_dense); 19 | fread(&denseSpan, sizeof(int), 1, fp_dense); 20 | denseArr = new int*[denseSpan]; 21 | for(int i = 0; i < denseSpan; i++){ 22 | denseArr[i] = new int[genome_number]; 23 | fread(denseArr[i], sizeof(int), genome_number, fp_dense); 24 | } 25 | fclose(fp_dense); 26 | cerr << "-----read the dense file from: " << file_dense << endl; 27 | } 28 | 29 | void loadANI(string folderPath, uint64_t* &aniArr, int sketch_func_id){ 30 | if(sketch_func_id != 0 && sketch_func_id != 1){ 31 | cerr << "ERROR: saveANI(), save ANI can only support MinHash and KSSD functions" << endl; 32 | return; 33 | } 34 | string file_ani = folderPath + '/' + "mst.ani"; 35 | FILE* fp_ani = fopen(file_ani.c_str(), "r"); 36 | if(!fp_ani){ 37 | cerr << "ERROR: 
saveANI(), cannot open file: " << file_ani << endl; 38 | exit(1); 39 | } 40 | aniArr = new uint64_t[101]; 41 | fread(aniArr, sizeof(uint64_t), 101, fp_ani); 42 | fclose(fp_ani); 43 | cerr << "-----read the ani file from: " << file_ani << endl; 44 | } 45 | 46 | void loadMST(string folderPath, vector& mst) 47 | { 48 | //load the mst edge 49 | string file_mst = folderPath + '/' + "edge.mst"; 50 | FILE* fp_mst = fopen(file_mst.c_str(), "r"); 51 | if(!fp_mst){ 52 | cerr << "ERROR: loadMST(), cannot open the file: " << file_mst << endl; 53 | exit(1); 54 | } 55 | size_t mst_size; 56 | fread(&mst_size, sizeof(size_t), 1, fp_mst); 57 | int preNode, sufNode; 58 | double dist; 59 | for(size_t i = 0; i < mst_size; i++){ 60 | fread(&preNode, sizeof(int), 1, fp_mst); 61 | fread(&sufNode, sizeof(int), 1, fp_mst); 62 | fread(&dist, sizeof(double), 1, fp_mst); 63 | EdgeInfo tmpEdge{preNode, sufNode, dist}; 64 | mst.push_back(tmpEdge); 65 | //cout << preNode << '\t' << sufNode << '\t' << dist << endl; 66 | } 67 | fclose(fp_mst); 68 | cerr << "-----read the mst file from " << file_mst << endl; 69 | } 70 | 71 | void printKssdResult(vector>& cluster, vector& sketches, bool sketchByFile, string outputFile) 72 | { 73 | //cerr << "output the result into: " << outputFile << endl; 74 | FILE *fp = fopen(outputFile.c_str(), "w"); 75 | if(!fp){ 76 | cerr << "Error in printKssdResult(), cannot open file: " << outputFile << endl; 77 | exit(1); 78 | } 79 | 80 | if(sketchByFile) 81 | { 82 | for(int i = 0; i < cluster.size(); i++){ 83 | fprintf(fp, "the cluster %d is: \n", i); 84 | for(int j = 0; j < cluster[i].size(); j++) 85 | { 86 | int curId = cluster[i][j]; 87 | fprintf(fp, "\t%5d\t%6d\t%12dnt\t%20s\t%20s\t%s\n", j, curId, sketches[curId].totalSeqLength, sketches[curId].fileName.c_str(), sketches[curId].fileSeqs[0].name.c_str(), sketches[curId].fileSeqs[0].comment.c_str()); 88 | } 89 | fprintf(fp, "\n"); 90 | } 91 | }//end sketchByFile 92 | 93 | else//sketch by sequence 94 | { 95 | for(int i = 
0; i < cluster.size(); i++){ 96 | fprintf(fp, "the cluster %d is: \n", i); 97 | for(int j = 0; j < cluster[i].size(); j++) 98 | { 99 | int curId = cluster[i][j]; 100 | fprintf(fp, "\t%6d\t%6d\t%12dnt\t%20s\t%s\n", j, curId, sketches[curId].seqInfo.length, sketches[curId].seqInfo.name.c_str(), sketches[curId].seqInfo.comment.c_str()); 101 | } 102 | fprintf(fp, "\n"); 103 | } 104 | }//end sketchBySequence 105 | fclose(fp); 106 | 107 | } 108 | 109 | void printResult(vector>& cluster, vector& sketches, bool sketchByFile, string outputFile) 110 | { 111 | //cerr << "output the result into: " << outputFile << endl; 112 | FILE *fp = fopen(outputFile.c_str(), "w"); 113 | if(!fp){ 114 | cerr << "Error in printResult(), cannot open file: " << outputFile << endl; 115 | exit(1); 116 | } 117 | 118 | if(sketchByFile) 119 | { 120 | for(int i = 0; i < cluster.size(); i++){ 121 | fprintf(fp, "the cluster %d is: \n", i); 122 | for(int j = 0; j < cluster[i].size(); j++) 123 | { 124 | int curId = cluster[i][j]; 125 | fprintf(fp, "\t%5d\t%6d\t%12dnt\t%20s\t%20s\t%s\n", j, curId, sketches[curId].totalSeqLength, sketches[curId].fileName.c_str(), sketches[curId].fileSeqs[0].name.c_str(), sketches[curId].fileSeqs[0].comment.c_str()); 126 | } 127 | fprintf(fp, "\n"); 128 | } 129 | }//end sketchByFile 130 | 131 | else//sketch by sequence 132 | { 133 | for(int i = 0; i < cluster.size(); i++){ 134 | fprintf(fp, "the cluster %d is: \n", i); 135 | for(int j = 0; j < cluster[i].size(); j++) 136 | { 137 | int curId = cluster[i][j]; 138 | fprintf(fp, "\t%6d\t%6d\t%12dnt\t%20s\t%s\n", j, curId, sketches[curId].seqInfo.length, sketches[curId].seqInfo.name.c_str(), sketches[curId].seqInfo.comment.c_str()); 139 | } 140 | fprintf(fp, "\n"); 141 | } 142 | }//end sketchBySequence 143 | fclose(fp); 144 | 145 | } 146 | 147 | void saveKssdMST(vector& sketches, vector& mst, string folderPath, bool sketchByFile){ 148 | save_kssd_genome_info(sketches, folderPath, "mst", sketchByFile); 149 | string file_mst = 
folderPath + '/' + "edge.mst"; 150 | FILE* fp_mst = fopen(file_mst.c_str(), "w+"); 151 | if(!fp_mst){ 152 | cerr << "ERROR: saveKsdMST(), cannot open the file: " << file_mst << endl; 153 | exit(1); 154 | } 155 | size_t mst_size = mst.size(); 156 | fwrite(&mst_size, sizeof(size_t), 1, fp_mst); 157 | for(size_t i = 0; i < mst.size(); i++){ 158 | fwrite(&mst[i].preNode, sizeof(int), 1, fp_mst); 159 | fwrite(&mst[i].sufNode, sizeof(int), 1, fp_mst); 160 | fwrite(&mst[i].dist, sizeof(double), 1, fp_mst); 161 | } 162 | fclose(fp_mst); 163 | cerr << "-----save the kssd mst into: " << folderPath << endl; 164 | } 165 | 166 | void saveMST(vector& sketches, vector& mst, string folderPath, bool sketchByFile){ 167 | save_genome_info(sketches, folderPath, "mst", sketchByFile); 168 | string file_mst = folderPath + '/' + "edge.mst"; 169 | FILE* fp_mst = fopen(file_mst.c_str(), "w+"); 170 | if(!fp_mst){ 171 | cerr << "ERROR: saveMST(), cannot open the file: " << file_mst << endl; 172 | exit(1); 173 | } 174 | size_t mst_size = mst.size(); 175 | fwrite(&mst_size, sizeof(size_t), 1, fp_mst); 176 | for(size_t i = 0; i < mst.size(); i++){ 177 | fwrite(&mst[i].preNode, sizeof(int), 1, fp_mst); 178 | fwrite(&mst[i].sufNode, sizeof(int), 1, fp_mst); 179 | fwrite(&mst[i].dist, sizeof(double), 1, fp_mst); 180 | } 181 | fclose(fp_mst); 182 | cerr << "-----save the mst into: " << folderPath << endl; 183 | } 184 | 185 | void saveDense(string folderPath, int** denseArr, int denseSpan, int genome_number){ 186 | string file_dense = folderPath + '/' + "mst.dense"; 187 | FILE* fp_dense = fopen(file_dense.c_str(), "w+"); 188 | if(!fp_dense){ 189 | cerr << "ERROR: saveDense(), cannot open the file: " << file_dense; 190 | exit(1); 191 | } 192 | fwrite(&genome_number, sizeof(int), 1, fp_dense); 193 | fwrite(&denseSpan, sizeof(int), 1, fp_dense); 194 | for(int i = 0; i < denseSpan; i++){ 195 | fwrite(denseArr[i], sizeof(int), genome_number, fp_dense); 196 | } 197 | fclose(fp_dense); 198 | cerr << 
"-----save the dense file into: " << folderPath << endl; 199 | } 200 | 201 | void saveANI(string folderPath, uint64_t* aniArr, int sketch_func_id){ 202 | 203 | if(sketch_func_id != 0 && sketch_func_id != 1){ 204 | cerr << "ERROR: saveANI(), save ANI can only support MinHash and KSSD functions" << endl; 205 | return; 206 | } 207 | string file_ani = folderPath + '/' + "mst.ani"; 208 | FILE* fp_ani = fopen(file_ani.c_str(), "w+"); 209 | if(!fp_ani){ 210 | cerr << "ERROR: saveANI(), cannot open file: " << file_ani << endl; 211 | exit(1); 212 | } 213 | fwrite(aniArr, sizeof(uint64_t), 101, fp_ani); 214 | fclose(fp_ani); 215 | cerr << "-----save the ani file into: " << file_ani << endl; 216 | } 217 | 218 | void print_kssd_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file, string output){ 219 | string res_newick_tree = get_kssd_newick_tree(sketches, mst, sketch_by_file); 220 | FILE* fp_tree = fopen(output.c_str(), "w"); 221 | if(!fp_tree){ 222 | cerr << "ERROR: print_newick_tree(), cannot write file: " << output << endl; 223 | exit(1); 224 | } 225 | fprintf(fp_tree, "%s\n", res_newick_tree.c_str()); 226 | fclose(fp_tree); 227 | } 228 | 229 | void print_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file, string output){ 230 | string res_newick_tree = get_newick_tree(sketches, mst, sketch_by_file); 231 | FILE* fp_tree = fopen(output.c_str(), "w"); 232 | if(!fp_tree){ 233 | cerr << "ERROR: print_newick_tree(), cannot write file: " << output << endl; 234 | exit(1); 235 | } 236 | fprintf(fp_tree, "%s\n", res_newick_tree.c_str()); 237 | fclose(fp_tree); 238 | } 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /src/MST_IO.h: -------------------------------------------------------------------------------- 1 | #ifndef H_MST_IO 2 | #define H_MST_IO 3 | 4 | #include 5 | #include 6 | #include "SketchInfo.h"//include 7 | #include "MST.h" //include 8 | #include "common.hpp" 9 | 10 | 
struct ClusterInfo{ 11 | int id; 12 | uint64_t length; 13 | }; 14 | 15 | void print_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file, string output); 16 | void printResult(std::vector>& clusterOrigin, std::vector& sketches, bool sketchByFile, string outputFile); 17 | void printKssdResult(vector>& cluster, vector& sketches, bool sketchByFile, string outputFile); 18 | void print_kssd_newick_tree(const vector& sketches, const vector& mst, bool sketch_by_file, string output); 19 | 20 | void loadMST(string folderPath, vector& mst); 21 | void loadDense(int** &denseArr, string folderPath, int& denseSpan, int& genome_number); 22 | void loadANI(string folderPath, uint64_t* &aniArr, int sketch_func_id); 23 | 24 | void saveMST(vector& sketches, vector& mst, string folderPath, bool sketchByFile); 25 | void saveKssdMST(vector& sketches, vector& mst, string folderPath, bool sketchByFile); 26 | void saveDense(string folderPath, int** denseArr, int denseSpan, int genome_number); 27 | void saveANI(string folderPath, uint64_t* aniArr, int sketch_func_id); 28 | 29 | #endif 30 | 31 | -------------------------------------------------------------------------------- /src/SketchInfo.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SKETCH_INFO 2 | #define H_SKETCH_INFO 3 | 4 | #include "Sketch.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | //for sequence information 14 | struct SequenceInfo{ 15 | string name; 16 | string comment; 17 | int strand; 18 | int length; 19 | }; 20 | 21 | typedef vector Vec_SeqInfo; 22 | struct SketchInfo{ 23 | int id; 24 | string fileName;//for sketch files; 25 | uint64_t totalSeqLength; 26 | Vec_SeqInfo fileSeqs;//for sketch files; 27 | SequenceInfo seqInfo;//for sketch sequence; 28 | bool isContainment = false; 29 | 30 | Sketch::MinHash* minHash; 31 | Sketch::KSSD* KSSD; 32 | Sketch::WMinHash* WMinHash; 33 | 
Sketch::HyperLogLog* HLL; 34 | Sketch::OrderMinHash * OMH; 35 | }; 36 | 37 | struct KssdSketchInfo{ 38 | int id; 39 | string fileName; 40 | uint64_t totalSeqLength; 41 | Vec_SeqInfo fileSeqs; 42 | SequenceInfo seqInfo; 43 | bool use64; 44 | vector hash32_arr; 45 | vector hash64_arr; 46 | }; 47 | 48 | struct KssdParameters{ 49 | int id; 50 | int half_k; 51 | int half_subk; 52 | int drlevel; 53 | int genomeNumber; 54 | }; 55 | 56 | 57 | bool cmpGenomeSize(SketchInfo s1, SketchInfo s2); 58 | bool cmpSeqSize(SketchInfo s1, SketchInfo s2); 59 | 60 | void calSize(bool sketchByFile, string inputFile, int threads, uint64_t minLen, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize); 61 | bool sketchSequences(string inputFile, int kmerSize, int sketchSize, int minLen, string sketchFunc, bool isContainment, int containCompress, vector& sketches, int threads); 62 | bool sketchFiles(string inputFile, uint64_t minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector& sketches, int threads); 63 | bool cmpSketch(SketchInfo s1, SketchInfo s2); 64 | //bool sketchFileWithKssd(const string inputFile, const uint64_t minLen, const int kmerSize, const int drlevel, vector& sketches, int threads); 65 | bool sketchFileWithKssd(const string inputFile, const uint64_t minLen, int kmerSize, const int drlevel, vector& sketches, KssdParameters& info, int threads); 66 | bool sketchSequencesWithKssd(const string inputFile, const int minLen, const int kmerSize, const int drlevel, vector& sketches, KssdParameters& info, int threads); 67 | void transSketches(const vector& sketches, const KssdParameters& info, const string folder_path, int numThreads); 68 | 69 | 70 | 71 | #endif //H_SKETCH_INFO 72 | -------------------------------------------------------------------------------- /src/Sketch_IO.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SKETCH_IO 2 | #define H_SKETCH_IO 3 | #include 
"SketchInfo.h" 4 | #include "common.hpp" 5 | 6 | void read_sketch_parameters(string folder_path, int& sketch_func_id, int& kmer_size, bool& is_containment, int& contain_compress, int& sketch_size, int& half_k, int& half_subk, int& drlevel); 7 | void save_genome_info(vector& sketches, string folderPath, string type, bool sketchByFile); 8 | void save_kssd_genome_info(const vector& sketches, const string folderPath, const string type, bool sketchByFile); 9 | void saveSketches(vector& sketches, string folderPath, bool sketchByFile, string sketchFunc, bool isContainment, int containCompress, int sketchSize, int kmerSize); 10 | void saveKssdSketches(const vector& sketches, const KssdParameters info, const string folderPath, bool sketchByFile); 11 | 12 | bool loadSketches(string folderPath, int threads, vector& sketches, int& sketch_func_id); 13 | bool load_genome_info(string folderPath, string type, vector& sketches); 14 | bool load_kssd_genome_info(string folderPath, string type, vector& sketches); 15 | bool loadKssdSketches(string folderPath, int threads, vector& sketches, KssdParameters& info); 16 | #endif 17 | -------------------------------------------------------------------------------- /src/ThreadPool.h: -------------------------------------------------------------------------------- 1 | // Copyright © 2015, Battelle National Biodefense Institute (BNBI); 2 | // all rights reserved. Authored by: Brian Ondov, Todd Treangen, 3 | // Sergey Koren, and Adam Phillippy 4 | // 5 | // See the LICENSE.txt file included with this software for license information. 
6 | 7 | #ifndef ThreadPool_h 8 | #define ThreadPool_h 9 | 10 | #include 11 | #include 12 | 13 | template 14 | class ThreadPool 15 | { 16 | public: 17 | 18 | ThreadPool(TypeOutput * (* functionNew)(TypeInput *), unsigned int threadCountNew); 19 | ~ThreadPool(); 20 | 21 | bool outputAvailable() const; 22 | TypeOutput * popOutputWhenAvailable(); // output must be deleted by calling function 23 | bool running() const; 24 | void runWhenThreadAvailable(TypeInput * input); // thread deletes input when finished 25 | void runWhenThreadAvailable(TypeInput * input, TypeOutput * (* functionNew)(TypeInput *)); // thread deletes input when finished 26 | 27 | private: 28 | 29 | struct OutputQueueNode 30 | { 31 | // used to preserve input order when outputting 32 | 33 | OutputQueueNode * prev; 34 | OutputQueueNode * next; 35 | 36 | TypeOutput * output; 37 | bool ready; 38 | }; 39 | 40 | unsigned int threadCount; 41 | 42 | pthread_t * threads; 43 | 44 | static void * thread(void *); 45 | 46 | TypeOutput * (* function)(TypeInput *); 47 | TypeInput * inputCurrent; 48 | OutputQueueNode * outputQueueNodeCurrent; 49 | 50 | pthread_mutex_t * mutexInput; 51 | pthread_mutex_t * mutexOutput; 52 | 53 | pthread_cond_t * condInput; 54 | pthread_cond_t * condOutput; 55 | 56 | OutputQueueNode * outputQueueHead; 57 | OutputQueueNode * outputQueueTail; 58 | 59 | bool finished; 60 | friend void * thread(void *); 61 | }; 62 | 63 | 64 | #include "ThreadPool.hxx" 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /src/ThreadPool.hxx: -------------------------------------------------------------------------------- 1 | // Copyright © 2015, Battelle National Biodefense Institute (BNBI); 2 | // all rights reserved. Authored by: Brian Ondov, Todd Treangen, 3 | // Sergey Koren, and Adam Phillippy 4 | // 5 | // See the LICENSE.txt file included with this software for license information. 
6 | 7 | #include "ThreadPool.h" 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | ThreadPool::ThreadPool(TypeOutput * (* functionNew)(TypeInput *), unsigned int threadCountNew) 14 | : 15 | threadCount(threadCountNew), 16 | function(functionNew) 17 | { 18 | mutexInput = new pthread_mutex_t(); 19 | mutexOutput = new pthread_mutex_t(); 20 | 21 | condInput = new pthread_cond_t(); 22 | condOutput = new pthread_cond_t(); 23 | 24 | pthread_mutex_init(mutexInput, NULL); 25 | pthread_mutex_init(mutexOutput, NULL); 26 | 27 | pthread_cond_init(condInput, NULL); 28 | pthread_cond_init(condOutput, NULL); 29 | 30 | inputCurrent = 0; 31 | 32 | outputQueueHead = 0; 33 | outputQueueTail = 0; 34 | 35 | finished = false; 36 | 37 | threads = new pthread_t[threadCount]; 38 | 39 | for ( int i = 0; i < threadCount; i++ ) 40 | { 41 | pthread_create(&threads[i], NULL, &ThreadPool::thread, this); 42 | } 43 | } 44 | 45 | template 46 | ThreadPool::~ThreadPool() 47 | { 48 | pthread_mutex_lock(mutexInput); 49 | finished = true; 50 | pthread_cond_broadcast(condInput); 51 | pthread_mutex_unlock(mutexInput); 52 | 53 | for ( int i = 0; i < threadCount; i++ ) 54 | { 55 | pthread_join(threads[i], NULL); 56 | } 57 | 58 | delete [] threads; 59 | 60 | while ( outputQueueHead != 0 ) 61 | { 62 | OutputQueueNode * next = outputQueueHead->next; 63 | delete outputQueueHead; 64 | outputQueueHead = next; 65 | } 66 | 67 | delete mutexInput; 68 | delete mutexOutput; 69 | 70 | delete condInput; 71 | delete condOutput; 72 | } 73 | 74 | template 75 | bool ThreadPool::outputAvailable() const 76 | { 77 | bool available; 78 | 79 | pthread_mutex_lock(mutexOutput); 80 | available = outputQueueHead != 0 && outputQueueHead->ready; 81 | pthread_mutex_unlock(mutexOutput); 82 | 83 | return available; 84 | } 85 | 86 | template 87 | TypeOutput * ThreadPool::popOutputWhenAvailable() 88 | { 89 | pthread_mutex_lock(mutexOutput); 90 | 91 | if ( outputQueueHead == 0 ) 92 | { 93 | // TODO: error? 
94 | std::cerr << "ERROR: waiting for output when no output queued\n"; 95 | pthread_mutex_unlock(mutexOutput); 96 | return 0; 97 | } 98 | 99 | while ( ! outputQueueHead->ready ) 100 | { 101 | pthread_cond_wait(condOutput, mutexOutput); 102 | } 103 | 104 | TypeOutput * output = outputQueueHead->output; 105 | 106 | OutputQueueNode * next = outputQueueHead->next; 107 | 108 | if ( outputQueueTail == outputQueueHead ) 109 | { 110 | outputQueueTail = 0; 111 | } 112 | 113 | delete outputQueueHead; 114 | outputQueueHead = next; 115 | pthread_mutex_unlock(mutexOutput); 116 | 117 | return output; 118 | } 119 | 120 | template 121 | void ThreadPool::runWhenThreadAvailable(TypeInput * input) 122 | { 123 | runWhenThreadAvailable(input, function); 124 | } 125 | 126 | template 127 | void ThreadPool::runWhenThreadAvailable(TypeInput * input, TypeOutput * (* functionNew)(TypeInput *)) 128 | { 129 | pthread_mutex_lock(mutexInput); 130 | 131 | while ( inputCurrent != 0 ) 132 | { 133 | pthread_cond_wait(condInput, mutexInput); 134 | } 135 | 136 | inputCurrent = input; 137 | function = functionNew; 138 | 139 | // enqueue output while input locked (to preserve order) 140 | // 141 | OutputQueueNode * outputQueueNode = new OutputQueueNode(); 142 | outputQueueNode->next = 0; 143 | outputQueueNode->ready = false; 144 | // 145 | pthread_mutex_lock(mutexOutput); 146 | // 147 | if ( outputQueueHead == 0 ) 148 | { 149 | outputQueueHead = outputQueueNode; 150 | } 151 | // 152 | outputQueueNode->prev = outputQueueTail; 153 | // 154 | if ( outputQueueTail != 0 ) 155 | { 156 | outputQueueTail->next = outputQueueNode; 157 | } 158 | // 159 | outputQueueTail = outputQueueNode; 160 | // 161 | pthread_mutex_unlock(mutexOutput); 162 | 163 | outputQueueNodeCurrent = outputQueueNode; 164 | 165 | pthread_mutex_unlock(mutexInput); 166 | pthread_cond_broadcast(condInput); 167 | } 168 | 169 | template 170 | bool ThreadPool::running() const 171 | { 172 | bool running; 173 | 174 | pthread_mutex_lock(mutexOutput); 
175 | running = outputQueueHead != 0; 176 | pthread_mutex_unlock(mutexOutput); 177 | 178 | return running; 179 | } 180 | 181 | template 182 | void * ThreadPool::thread(void * arg) 183 | { 184 | ThreadPool * threadPool = (ThreadPool *)arg; 185 | TypeInput * input; 186 | OutputQueueNode * outputQueueNode; 187 | 188 | while ( ! threadPool->finished ) 189 | { 190 | // wait for input 191 | // 192 | pthread_mutex_lock(threadPool->mutexInput); 193 | // 194 | while ( ! threadPool->finished && threadPool->inputCurrent == 0 ) 195 | { 196 | pthread_cond_wait(threadPool->condInput, threadPool->mutexInput); 197 | } 198 | 199 | if ( threadPool->finished ) 200 | { 201 | pthread_mutex_unlock(threadPool->mutexInput); 202 | return 0; 203 | } 204 | // 205 | input = threadPool->inputCurrent; 206 | outputQueueNode = threadPool->outputQueueNodeCurrent; 207 | threadPool->inputCurrent = 0; 208 | TypeOutput * (* function)(TypeInput *) = threadPool->function; 209 | 210 | pthread_mutex_unlock(threadPool->mutexInput); 211 | 212 | pthread_cond_broadcast(threadPool->condInput); 213 | 214 | // run function 215 | // 216 | outputQueueNode->output = function(input); 217 | 218 | delete input; 219 | 220 | // signal output 221 | // 222 | outputQueueNode->ready = true; 223 | // 224 | pthread_mutex_lock(threadPool->mutexOutput); 225 | pthread_cond_broadcast(threadPool->condOutput); 226 | pthread_mutex_unlock(threadPool->mutexOutput); 227 | } 228 | 229 | return NULL; 230 | } 231 | -------------------------------------------------------------------------------- /src/UnionFind.h: -------------------------------------------------------------------------------- 1 | #ifndef UNIONFIND_H 2 | #define UNIONFIND_H 3 | 4 | 5 | class UnionFind { 6 | int *parent, *ranks, _size; 7 | public: 8 | UnionFind(){ 9 | } 10 | UnionFind(int size){ 11 | parent = new int[size]; ranks = new int[size]; 12 | for(int element = 0 ; element < size ; element++){ 13 | parent[element] = element , ranks[element] = 0 ; 14 | } 15 | _size = 
size; 16 | } 17 | void resize(int size){ 18 | parent = new int[size]; ranks = new int[size]; 19 | for(int element = 0 ; element < size ; element++){ 20 | parent[element] = element , ranks[element] = 0 ; 21 | } 22 | _size = size; 23 | } 24 | int find(int element){ 25 | if(parent[element] == element){ 26 | return element; 27 | } 28 | else{ 29 | return parent[element] = find(parent[element]); // Path Compression algorithm 30 | } 31 | } 32 | bool connected(int x,int y){ 33 | if(find(x) == find(y)){ 34 | return true; 35 | } 36 | else{ 37 | return false; 38 | } 39 | } 40 | void merge(int x,int y){ 41 | x = find(x); 42 | y = find(y); 43 | if(x != y){ // Union by Rank algorithm 44 | if(ranks[x] > ranks[y]){ 45 | parent[y] = x; 46 | } 47 | else if(ranks[x] < ranks[y]){ 48 | parent[x] = y; 49 | } 50 | else{ 51 | parent[x] = y; ranks[y] ++ ; 52 | } 53 | _size--; 54 | } 55 | } 56 | void clear(){ 57 | delete [] parent; delete [] ranks; 58 | } 59 | int size(){ 60 | return _size; 61 | } 62 | }; 63 | 64 | 65 | 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /src/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPP_COMMON 2 | #define HPP_COMMON 3 | 4 | /* 5 | * The parameter.h is for the basic parameter of sketch, such as sketchSize, kmerSize. 6 | * KMER_SIZE is the kmer size for slide-window of all sketch functions. 7 | * MINHASH_SKETCH_SIZE is the fixed sketch size for minHash resemblance computing. 8 | * SKETCH_COMPRESS_SEQUENCE is the proportion sketch size with genome sequences for containment computing. 9 | * SKETCH_COMPRESS_GENOME is the proportion sketch size with genome size for containment computing. 10 | * WMH_SKETCH_SIZE is the sketch size for WeightedMinHash. 11 | * WINDOW_SIZE is the window size for minimizer in WeightedMinHash. 12 | * HLL_SKETCH_BIT is the bit number to define the sketch size for HyperLogLog. 
13 | * 14 | */ 15 | 16 | #include 17 | #include 18 | 19 | //#define KMER_SIZE 21 20 | //#define MINHASH_SKETCH_SIZE 10000 21 | //#define SKETCH_COMPRESS_SEQUENCE 1000 22 | //#define SKETCH_COMPRESS_GENOME 1000 23 | #define WMH_SKETCH_SIZE 50 24 | #define WINDOW_SIZE 20 25 | #define HLL_SKETCH_BIT 10 26 | #define DENSE_SPAN 100; 27 | 28 | #include 29 | inline double get_sec(){ 30 | struct timeval tv; 31 | gettimeofday(&tv, NULL); 32 | return (double)tv.tv_sec + (double)tv.tv_usec/1000000; 33 | } 34 | 35 | #include 36 | inline const string currentDataTime(){ 37 | time_t now = time(0); 38 | struct tm tstruct; 39 | char buf[80]; 40 | tstruct = *localtime(&now); 41 | strftime(buf, sizeof(buf), "%Y_%m_%d_%H-%M-%S", &tstruct); 42 | 43 | return buf; 44 | } 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/greedy.cpp: -------------------------------------------------------------------------------- 1 | #ifdef GREEDY_CLUST 2 | #include "greedy.h" 3 | #include 4 | #include 5 | using namespace std; 6 | 7 | /* @brief Generating clusters by greedy incremental algorthm. 8 | * 9 | * @param[in] sketches sketch array including hash values and informations for each genome sketch. 10 | * @param[in] sketchFunc sketch Function including MinHash and KSSD 11 | * @param[in] threshold distance threshold for cluster, genomes with distance below this threshold are clustered together. 12 | * @param[in] threads Thread number for multiThreading 13 | * @return cluster result two-dimention array, each array in result is a cluster, and each element in a cluster 14 | * is a genome. 
15 | */ 16 | vector> greedyCluster(vector& sketches, int sketch_func_id, double threshold, int threads) 17 | { 18 | int numGenomes = sketches.size(); 19 | int * clustLabels = new int[numGenomes]; 20 | memset(clustLabels, 0, numGenomes*sizeof(int)); 21 | vector > cluster; 22 | vector representiveArr; 23 | map > semiClust; 24 | representiveArr.push_back(0); 25 | semiClust.insert({0, vector()}); 26 | 27 | for(int j = 1; j < numGenomes; j++){ 28 | map distMapCenter; 29 | #pragma omp parallel for num_threads(threads) 30 | for(int i = 0; i < representiveArr.size(); i++){ 31 | int repId = representiveArr[i]; 32 | double dist; 33 | if(sketch_func_id == 0){ 34 | if(sketches[repId].isContainment) 35 | dist = sketches[repId].minHash->containDistance(sketches[j].minHash); 36 | //dist = 1.0 - sketches[repId].minHash->containJaccard(sketches[j].minHash); 37 | else 38 | dist = sketches[repId].minHash->distance(sketches[j].minHash); 39 | } 40 | else if(sketch_func_id == 1){ 41 | dist = sketches[repId].KSSD->distance(sketches[j].KSSD); 42 | } 43 | else{ 44 | cerr << "can only support MinHash and KSSD with greedy incremental clust" << endl; 45 | exit(1); 46 | } 47 | if(dist <= threshold){ 48 | clustLabels[j] = 1; 49 | #pragma omp critical 50 | { 51 | distMapCenter.insert({dist, repId}); 52 | } 53 | //break; 54 | } 55 | }//end for i 56 | if(clustLabels[j] == 0){//this genome is a representative genome 57 | representiveArr.push_back(j); 58 | semiClust.insert({j, vector()}); 59 | } 60 | else{//this genome is a redundant genome, get the nearest representive genome as its center 61 | auto it = distMapCenter.begin(); 62 | int repId = it->second; 63 | semiClust[repId].push_back(j); 64 | } 65 | map().swap(distMapCenter); 66 | if(j % 10000 == 0) cerr << "---finished cluster: " << j << endl; 67 | 68 | }//end for j 69 | //cerr << "the representiveArr size is : " << representiveArr.size() << endl; 70 | 71 | for(auto x : semiClust){ 72 | int center = x.first; 73 | vector redundantArr = x.second; 
74 | vector curClust; 75 | curClust.push_back(center); 76 | curClust.insert(curClust.end(), redundantArr.begin(), redundantArr.end()); 77 | cluster.push_back(curClust); 78 | } 79 | return cluster; 80 | } 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/greedy.h: -------------------------------------------------------------------------------- 1 | #ifdef GREEDY_CLUST 2 | #ifndef H_GREEDY 3 | #define H_GREEDY 4 | 5 | #include "SketchInfo.h" 6 | vector> greedyCluster(vector& sketches, int sketch_func_id, double threshold, int threads); 7 | 8 | 9 | 10 | #endif 11 | #endif 12 | -------------------------------------------------------------------------------- /src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #ifdef USE_MALLOC_WRAPPERS 36 | # include "malloc_wrap.h" 37 | #endif 38 | 39 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 40 | #define KS_SEP_TAB 1 // isspace() && !' ' 41 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 42 | #define KS_SEP_MAX 2 43 | 44 | #define __KS_TYPE(type_t) \ 45 | typedef struct __kstream_t { \ 46 | unsigned char *buf; \ 47 | int begin, end, is_eof; \ 48 | type_t f; \ 49 | } kstream_t; 50 | 51 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 52 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 53 | 54 | #define __KS_BASIC(type_t, __bufsize) \ 55 | static inline kstream_t *ks_init(type_t f) \ 56 | { \ 57 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 58 | ks->f = f; \ 59 | ks->buf = (unsigned char*)malloc(__bufsize); \ 60 | return ks; \ 61 | } \ 62 | static inline void ks_destroy(kstream_t *ks) \ 63 | { \ 64 | if (ks) { \ 65 | free(ks->buf); \ 66 | free(ks); \ 67 | } \ 68 | } 69 | 70 | #define __KS_GETC(__read, __bufsize) \ 71 | static inline int ks_getc(kstream_t *ks) \ 72 | { \ 73 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 74 | if (ks->begin >= ks->end) { \ 75 | ks->begin = 0; \ 76 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 77 | if (ks->end == 0) { ks->is_eof = 1; return -1;} \ 78 | } \ 79 | return (int)ks->buf[ks->begin++]; \ 80 | } 81 | 82 | #ifndef KSTRING_T 83 | #define KSTRING_T kstring_t 84 | typedef struct __kstring_t { 85 | size_t l, m; 86 | char *s; 87 | } kstring_t; 88 | #endif 89 | 90 | #ifndef 
kroundup32 91 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 92 | #endif 93 | 94 | #define __KS_GETUNTIL(__read, __bufsize) \ 95 | static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 96 | { \ 97 | int gotany = 0; \ 98 | if (dret) *dret = 0; \ 99 | str->l = append? str->l : 0; \ 100 | for (;;) { \ 101 | int i; \ 102 | if (ks->begin >= ks->end) { \ 103 | if (!ks->is_eof) { \ 104 | ks->begin = 0; \ 105 | ks->end = __read(ks->f, ks->buf, __bufsize); \ 106 | if (ks->end == 0) { ks->is_eof = 1; break; } \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! 
*/ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | gotany = 1; \ 128 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 129 | str->l = str->l + (i - ks->begin); \ 130 | ks->begin = i + 1; \ 131 | if (i < ks->end) { \ 132 | if (dret) *dret = ks->buf[i]; \ 133 | break; \ 134 | } \ 135 | } \ 136 | if (!gotany && ks_eof(ks)) return -1; \ 137 | if (str->s == 0) { \ 138 | str->m = 1; \ 139 | str->s = (char*)calloc(1, 1); \ 140 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 141 | str->s[str->l] = '\0'; \ 142 | return str->l; \ 143 | } \ 144 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 145 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 146 | 147 | #define KSTREAM_INIT(type_t, __read, __bufsize) \ 148 | __KS_TYPE(type_t) \ 149 | __KS_BASIC(type_t, __bufsize) \ 150 | __KS_GETC(__read, __bufsize) \ 151 | __KS_GETUNTIL(__read, __bufsize) 152 | 153 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 154 | 155 | #define __KSEQ_BASIC(SCOPE, type_t) \ 156 | SCOPE kseq_t *kseq_init(type_t fd) \ 157 | { \ 158 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 159 | s->f = ks_init(fd); \ 160 | return s; \ 161 | } \ 162 | SCOPE void kseq_destroy(kseq_t *ks) \ 163 | { \ 164 | if (!ks) return; \ 165 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 166 | ks_destroy(ks->f); \ 167 | free(ks); \ 168 | } 169 | 170 | /* Return value: 171 | >=0 length of the sequence (normal) 172 | -1 end-of-file 173 | -2 truncated quality string 174 | */ 175 | #define __KSEQ_READ(SCOPE) \ 176 | SCOPE int kseq_read(kseq_t *seq) \ 177 | { \ 178 | int c; \ 179 | kstream_t *ks = seq->f; \ 180 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 181 | while 
((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 182 | if (c == -1) return -1; /* end of file */ \ 183 | seq->last_char = c; \ 184 | } /* else: the first header char has been read in the previous call */ \ 185 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 186 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 187 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 188 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 189 | seq->seq.m = 256; \ 190 | seq->seq.s = (char*)malloc(seq->seq.m); \ 191 | } \ 192 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 193 | if (c == '\n') continue; /* skip empty lines */ \ 194 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 195 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 196 | } \ 197 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 198 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 199 | seq->seq.m = seq->seq.l + 2; \ 200 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 201 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 202 | } \ 203 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 204 | if (c != '+') return seq->seq.l; /* FASTA */ \ 205 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 206 | seq->qual.m = seq->seq.m; \ 207 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 208 | } \ 209 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ 210 | if (c == -1) return -2; /* error: no quality string */ \ 211 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 212 | seq->last_char = 0; /* we have not come to the next header line */ \ 213 | 
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 214 | return seq->seq.l; \ 215 | } 216 | 217 | #define __KSEQ_TYPE(type_t) \ 218 | typedef struct { \ 219 | kstring_t name, comment, seq, qual; \ 220 | int last_char; \ 221 | kstream_t *f; \ 222 | } kseq_t; 223 | 224 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 225 | KSTREAM_INIT(type_t, __read, 16384) \ 226 | __KSEQ_TYPE(type_t) \ 227 | __KSEQ_BASIC(SCOPE, type_t) \ 228 | __KSEQ_READ(SCOPE) 229 | 230 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 231 | 232 | #define KSEQ_DECLARE(type_t) \ 233 | __KS_TYPE(type_t) \ 234 | __KSEQ_TYPE(type_t) \ 235 | extern kseq_t *kseq_init(type_t fd); \ 236 | void kseq_destroy(kseq_t *ks); \ 237 | int kseq_read(kseq_t *seq); 238 | 239 | #endif 240 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This version is for result checking using single thread. 3 | * The input parameter and the final output is the same with the final version. 4 | * 5 | * The input can be a single file with numbers of sequences to create sketches by sequences. 6 | * And it can alse be a single file with list of genome files to create sketches by files(genomes). 7 | * This is strategy of clustering based distance computing by sequences or by genomes. 8 | * 9 | * The program includes several sections: 10 | * section 1: read the input arguments and init the parameters. 11 | * section 2: read genome files and create the sketches. 12 | * section 3: compute the distance matrix and generate the Minimum Spanning Tree(MST) or greedy incremental clustering. 13 | * section 4: generate the clusters with the MST using different distance threshold. 
14 | * 15 | * Author: Xiaoming Xu 16 | * Mar 5, 2021 17 | * 18 | */ 19 | #include 20 | #include "SketchInfo.h" 21 | #include "Sketch.h"// need to add the include path in Makefile. 22 | #include 23 | #include "MST.h" 24 | #include 25 | #include "UnionFind.h" 26 | #include 27 | #include "common.hpp" 28 | #include "MST_IO.h" 29 | #include 30 | #include "Sketch_IO.h" 31 | 32 | #ifdef GREEDY_CLUST 33 | #include "greedy.h" 34 | #endif 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include "CLI11.hpp" 42 | #include "sub_command.h" 43 | 44 | #ifdef GREEDY_CLUST 45 | #else 46 | #endif 47 | 48 | 49 | using namespace std; 50 | 51 | int main(int argc, char * argv[]){ 52 | #ifdef GREEDY_CLUST 53 | CLI::App app{"clust-greedy v.2.2.1, greedy incremental clustering module for RabbitTClust"}; 54 | #else 55 | CLI::App app{"clust-mst v.2.2.1, minimum-spanning-tree-based module for RabbitTClust"}; 56 | #endif 57 | //section 1: init parameters 58 | int argIndex = 1; 59 | string inputFile = "genome.fna"; 60 | string inputFile1 = "genome.info"; 61 | string sketchFunc = "MinHash"; 62 | string outputFile = "result.out"; 63 | int threads = 1; 64 | threads = get_nprocs_conf(); 65 | bool sketchByFile = false; 66 | bool isContainment = false; 67 | bool isJaccard = false; 68 | bool useFile = false; 69 | double threshold = 0.05; 70 | int kmerSize = 21; 71 | int sketchSize = 1000; 72 | int containCompress = 1000; 73 | int drlevel = 3; 74 | bool mstLoadSketch = false; 75 | string mstSketchFile = "sketch.info"; 76 | bool isSetKmer = false; 77 | uint64_t minLen = 10000; 78 | string folder_path; 79 | bool is_newick_tree = false; 80 | bool is_fast = false; 81 | bool no_dense = false; 82 | 83 | bool noSave = false; 84 | 85 | auto option_threads = app.add_option("-t, --threads", threads, "set the thread number, default all CPUs of the platform"); 86 | auto option_min_len = app.add_option("-m, --min-length", minLen, "set the filter minimum length (minLen), genome length less than 
minLen will be ignore, default 10,000"); 87 | 88 | auto option_containment = app.add_option("-c, --containment", containCompress, "use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress"); 89 | auto option_kmer_size = app.add_option("-k, --kmer-size", kmerSize, "set the kmer size"); 90 | auto option_sketch_size = app.add_option("-s, --sketch-size", sketchSize, "set the sketch size for Jaccard Index and Mash distance, default 1000"); 91 | 92 | auto flag_input_list = app.add_flag("-l, --list", sketchByFile, "input is genome list, one genome per line"); 93 | auto flag_no_save = app.add_flag("-e, --no-save", noSave, "not save the intermediate files, such as sketches or MST"); 94 | auto option_threshold = app.add_option("-d, --threshold", threshold, "set the distance threshold for clustering"); 95 | auto option_output = app.add_option("-o, --output", outputFile, "set the output name of cluster result"); 96 | auto option_input = app.add_option("-i, --input", inputFile, "set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)"); 97 | auto option_presketched = app.add_option("--presketched", folder_path, "clustering by the pre-generated sketch files rather than genomes"); 98 | #ifndef GREEDY_CLUST 99 | auto option_premsted = app.add_option("--premsted", folder_path, "clustering by the pre-generated mst files rather than genomes for clust-mst"); 100 | auto flag_newick_tree = app.add_flag("--newick-tree", is_newick_tree, "output the newick tree format file for clust-mst"); 101 | auto flag_is_fast = app.add_flag("--fast", is_fast, "use the kssd algorithm for sketching and distance computing for clust-mst"); 102 | auto option_drlevel = app.add_option("--drlevel", drlevel, "set the dimention reduction level for Kssd sketches, default 3 with a dimention reduction of 1/4096"); 103 | auto flag_no_dense = app.add_flag("--no-dense", no_dense, "not calculate the 
density and ANI datas"); 104 | #endif 105 | auto option_append = app.add_option("--append", inputFile, "append genome file or file list with the pre-generated sketch or MST files"); 106 | 107 | option_output->required(); 108 | option_append->excludes(option_input); 109 | 110 | CLI11_PARSE(app, argc, argv); 111 | 112 | if(threads < 1){ 113 | fprintf(stderr, "-----Invalid thread number %d\n", threads); 114 | return 1; 115 | } 116 | if(option_threads){ 117 | fprintf(stderr, "-----set the thread number %d\n", threads); 118 | } 119 | if(*option_min_len){ 120 | fprintf(stderr, "-----set the filter minimum length: %ld\n", minLen); 121 | } 122 | if(*option_containment){ 123 | isContainment = true; 124 | fprintf(stderr, "-----use AAF distance with containment coefficient, the sketch size is in porportion with 1/%d\n", containCompress); 125 | } 126 | if(*option_kmer_size){ 127 | isSetKmer = true; 128 | fprintf(stderr, "-----set kmerSize: %d\n", kmerSize); 129 | } 130 | if(*option_sketch_size){ 131 | isJaccard = true; 132 | fprintf(stderr, "-----set sketchSize: %d\n", sketchSize); 133 | } 134 | if(*option_threshold){ 135 | fprintf(stderr, "-----set threshold: %d\n", threshold); 136 | } 137 | 138 | 139 | #ifndef GREEDY_CLUST 140 | //======clust-mst========================================================================= 141 | if(is_fast){ 142 | if(*option_premsted && !*option_append){ 143 | clust_from_mst_fast(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads); 144 | return 0; 145 | } 146 | if(*option_presketched && !*option_append){ 147 | clust_from_sketch_fast(folder_path, outputFile, is_newick_tree, no_dense, isContainment, threshold, threads); 148 | return 0; 149 | } 150 | if(*option_append && !*option_premsted && !*option_presketched){ 151 | cerr << "ERROR: option --append, option --presketched or --premsted needed" << endl; 152 | return 1; 153 | } 154 | if(*option_append && (*option_presketched || *option_premsted)){ 155 | 
append_clust_mst_fast(folder_path, inputFile, outputFile, is_newick_tree, no_dense, sketchByFile, isContainment, minLen, noSave, threshold, threads); 156 | return 0; 157 | } 158 | if(!tune_kssd_parameters(sketchByFile, isSetKmer, inputFile, threads, minLen, isContainment, kmerSize, threshold, drlevel)){ 159 | return 1; 160 | } 161 | clust_from_genome_fast(inputFile, outputFile, folder_path, is_newick_tree, no_dense, sketchByFile, isContainment, kmerSize, threshold, drlevel, minLen, noSave, threads); 162 | return 0; 163 | } 164 | 165 | if(*option_premsted && !*option_append){ 166 | clust_from_mst(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads); 167 | return 0; 168 | } 169 | if(*option_append && !*option_presketched && !*option_premsted){ 170 | cerr << "ERROR option --append, option --presketched or --premsted needed" << endl; 171 | return 1; 172 | } 173 | if(*option_append && (*option_premsted || *option_presketched)){ 174 | append_clust_mst(folder_path, inputFile, outputFile, is_newick_tree, no_dense, sketchByFile, minLen, noSave, threshold, threads); 175 | return 0; 176 | } 177 | //======clust-mst========================================================================= 178 | #else 179 | //======clust-greedy====================================================================== 180 | if(*option_append && !*option_presketched){ 181 | cerr << "ERROR option --append, option --presketched needed" << endl; 182 | return 1; 183 | } 184 | if(*option_append && *option_presketched){ 185 | append_clust_greedy(folder_path, inputFile, outputFile, sketchByFile, minLen, noSave, threshold, threads); 186 | return 0; 187 | } 188 | //======clust-greedy====================================================================== 189 | #endif 190 | 191 | if(*option_presketched && !*option_append){ 192 | clust_from_sketches(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads); 193 | return 0; 194 | } 195 | 196 | if(!tune_parameters(sketchByFile, 
isSetKmer, inputFile, threads, minLen, isContainment, isJaccard, kmerSize, threshold, containCompress, sketchSize)){ 197 | return 1; 198 | } 199 | 200 | 201 | clust_from_genomes(inputFile, outputFile, is_newick_tree, sketchByFile, no_dense, kmerSize, sketchSize, threshold,sketchFunc, isContainment, containCompress, minLen, folder_path, noSave, threads); 202 | 203 | return 0; 204 | }//end main 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /src/sub_command.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "SketchInfo.h" 3 | #include "MST.h" 4 | #include "Sketch_IO.h" 5 | #include "common.hpp" 6 | #include "MST_IO.h" 7 | #include "greedy.h" 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | void compute_sketches(vector& sketches, string inputFile, string& folder_path, bool sketchByFile, int minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, bool isSave, int threads); 13 | 14 | void compute_clusters(vector& sketches, bool sketchByFile, string outputFile, bool is_newick_tree, bool no_dense, string folder_path, int sketch_func_id, double threshold, bool isSave, int threads); 15 | 16 | void clust_from_genomes(string inputFile, string outputFile, bool is_newick_tree, bool sketchByFile, bool no_dense, int kmerSize, int sketchSize, double threshold, string sketchFunc, bool isContainment, int containCompress, int minLen, string folder_path, bool noSave, int threads); 17 | 18 | bool tune_parameters(bool sketchByFile, bool isSetKmer, string inputFile, int threads, int minLen, bool& isContainment, bool& isJaccard, int& kmerSize, double& threshold, int& containCompress, int& sketchSize); 19 | bool tune_kssd_parameters(bool sketchByFile, bool isSetKmer, string inputFile, int threads, int minLen, bool& isContainment, int& kmerSize, double& 
threshold, int &drlevel); 20 | 21 | void clust_from_sketches(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads); 22 | 23 | void clust_from_mst(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads); 24 | void clust_from_mst_fast(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads); 25 | 26 | void append_clust_mst(string folder_path, string input_file, string output_file, bool is_newick_tree, bool no_dense, bool sketch_by_file, int min_len, bool no_save, double threshold, int threads); 27 | void append_clust_mst_fast(string folder_path, string input_file, string output_file, bool is_newick_tree, bool no_dense, bool sketch_by_file, bool isContainment, int min_len, bool no_save, double threshold, int threads); 28 | 29 | void append_clust_greedy(string folder_path, string input_file, string output_file, bool sketch_by_file, int min_len, bool no_save, double threshold, int threads); 30 | 31 | void compute_kssd_sketches(vector& sketches, KssdParameters& info, bool isSave, const string inputFile, string& folder_path, bool sketchByFile, const int minLen, const int kmerSize, const int drlevel, int threads); 32 | void compute_kssd_clusters(vector& sketches, const KssdParameters info, bool sketchByFile, bool no_dense, bool isContainment, const string folder_path, string outputFile, bool is_newick_tree, double threshold, bool isSave, int threads); 33 | 34 | void clust_from_genome_fast(const string inputFile, string outputFile, string folder_path, bool is_newick_tree, bool no_dense, bool sketchByFile, bool isContainment, const int kmerSize, const double threshold, const int drlevel, const int minLen, bool noSave, int threads); 35 | void clust_from_sketch_fast(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, bool isContainment, double threshold, int threads); 36 | 37 | 
-------------------------------------------------------------------------------- /version_history/history.md: -------------------------------------------------------------------------------- 1 | # Latest version: `v.2.3.0` 2 | * add `--fast` option for `clust-mst` to use the more efficient Kssd sketch strategy when computing the all-vs-all genome distances. 3 | * The `--fast` option can work together with `--append`, `--presketched`, and `--premsted` options. 4 | * the `--drlevel` is used for setting the dimension reduction level for Kssd sketches. Default value is 3, which corresponds to a dimension reduction of $1 / 2^{(4*3)} = 1/4096$. 5 | 6 | ## [`v.2.2.1`](v.2.2.1.md) 7 | * add `--newick-tree` option to output the Newick tree format for `clust-mst`. 8 | 9 | ## [`v.2.2.0`](v.2.2.0.md) 10 | * support incrementally clustering by option `--append` accompanied with `--presketched` or `--premsted` options. 11 | 12 | Note: 13 | * When considering the clustering of the genome set `A+B` using a pre-generated sketch `A_sketch` and an appending genome set `B`, it is important to note that the sketch parameter for the pre-generated sketch `A_sketch` and the appending set `B` may differ from that of the whole genome set `A+B`. However, the impact of changes in the genome lengths of set `B` on the automatically generated parameters will be minimal if they are not significant. 14 | 15 | * This is because the sketch parameters, including the $k$-mer size, sketch size, and containment compress ratio, for the appending genome set `B` are the same as those of the pre-generated sketch `A_sketch`. Additionally, the automatic parameter generation method, which is carried out using the `tune_parameters()` function, depends on whole genome information such as minimum, maximum, and mean genome length.
16 | Therefore, the changes in the genome lengths of the appending set `B` are unlikely to have a significant effect on the automatically generated parameters if they are not substantial. 17 | 18 | * In the context of genome clustering, the sketches are sorted by unstable sort in a decreasing order of their genome length. Consequently, the order of sketches may undergo slight changes if there are genomes with identical lengths. However, this does not significantly affect the outcome of the clustering process. 19 | 20 | ## [`v.2.1.0`](v.2.1.0.md) 21 | * change the parameter parsing by [CLI11](https://github.com/CLIUtils/CLI11). 22 | * save the intermediate files (sketch, mst files) in binary format. 23 | * abrogate the `-f` option for loading pre-generated sketch or MST file, replaced by `--presketched` and `--premsted` option. 24 | 25 | More details by `clust-mst --help` or `clust-greedy --help`. 26 | 27 | ## `v.2.0.3` 28 | * add the parameter `-m` to set the minimum genome length (*minLen*), genomes with lengths less than *minLen* will be ignored. 29 | 30 | ## `v.2.0.2` 31 | * update the `calSize` of gz files for automatically generating $k$-mer size . 32 | 33 | ## `v.2.0.1` 34 | * Update the latest version of [robin-hood-hashing](https://github.com/martinus/robin-hood-hashing) to solve the compile error with `g++ 12.0+`. 35 | 36 | ## [`v.2.0.0`](v.2.0.0.md) 37 | * Add the `clust-greedy` module for greedy incremental clustering. 38 | * Last MST-based clustering module is `clust-mst` module. 39 | 40 | 41 | ## `v.1.0.0` 42 | * First version of RabbitTClust, large-scaled genome clustering tool based on sketch technique and Minimum Spanning Tree (MST). -------------------------------------------------------------------------------- /version_history/v.2.0.0.md: -------------------------------------------------------------------------------- 1 | # `v.2.0.0` 2 | ## Installation 3 | RabbitTClust version 2.0 can only support 64-bit Linux Systems. 
4 | 5 | ### Dependency 6 | * cmake v.3.0 or later 7 | * c++14 8 | * [zlib](https://zlib.net/) 9 | 10 | ### Compile and install automatically 11 | ```bash 12 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 13 | cd RabbitTClust 14 | ./install.sh 15 | ``` 16 | 17 | ### Compile and install manually 18 | ```bash 19 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 20 | cd RabbitTClust 21 | 22 | #make rabbitSketch library 23 | cd RabbitSketch && 24 | mkdir -p build && cd build && 25 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. && 26 | make -j8 && make install && 27 | cd ../../ && 28 | 29 | #make rabbitFX library 30 | cd RabbitFX && 31 | mkdir -p build && cd build && 32 | cmake -DCMAKE_INSTALL_PREFIX=. .. && 33 | make -j8 && make install && 34 | cd ../../ && 35 | 36 | #compile the clust-greedy 37 | mkdir -p build && cd build && 38 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 39 | make -j8 && make install && 40 | cd ../ && 41 | 42 | #compile the clust-mst 43 | cd build && 44 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. && 45 | make -j8 && make install && 46 | cd ../ 47 | ``` 48 | 49 | ## Usage 50 | ```bash 51 | usage: clust-mst [-h] [-l] [-t] [-d] [-F] [-i] [-o] 52 | usage: clust-mst [-h] [-f] [-E] [-d] [-i] [-o] 53 | usage: clust-greedy [-h] [-l] [-t] [-d] [-F] [-i] [-o] 54 | usage: clust-greedy [-h] [-f] [-d] [-i] [-o] 55 | -h : this help message 56 | -k : set kmer size, default 21, for both clust-mst and clust-greedy 57 | -s : set sketch size, default 1000, for both clust-mst and clust-greedy 58 | -c : set sampling ratio to compute variable sketchSize, sketchSize = genomeSize/samplingRatio, only support with MinHash sketch function, for clust-greedy 59 | -d : set the distance threshold, default 0.05 for both clust-mst and clust-greedy 60 | -t : set the thread number, default take full usage of platform cores number, for both clust-mst and clust-greedy 61 | -l : input is a file list, not a single genome file.
Lines in the input file list specify paths to genome files, one per line, for both clust-mst and clust-greedy 62 | -i : path of input file. One file list or single genome file. Two input file with -f and -E option 63 | -f : two input files, genomeInfo and MSTInfo files for clust-mst; genomeInfo and sketchInfo files for clust-greedy 64 | -E : two input files, genomeInfo and sketchInfo for clust-mst 65 | -F : set the sketch function, including MinHash and KSSD, default MinHash, for both clust-mst and clust-greedy 66 | -o : path of output file, for both clust-mst and clust-greedy 67 | -e : not save the intermediate file generated from the origin genome file, such as the GenomeInfo, MSTInfo, and SketchInfo files, for both clust-mst and clust-greedy 68 | 69 | ``` 70 | 71 | ## Example: 72 | ```bash 73 | #input is a file list, one genome path per line: 74 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust 75 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust 76 | 77 | #input is a single genome file in FASTA format, one genome as a sequence: 78 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust 79 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust 80 | 81 | #the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options. 82 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust 83 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust 84 | 85 | 86 | #for redundancy detection with clust-greedy, input is a genome file list: 87 | #use -d to specify the distance threshold corresponding to various degrees of redundancy. 
88 | ./clust-greedy -d 0.001 -l -i bacteriaList -o bacteria.out 89 | 90 | #to generate clusters from an existing MST with a distance threshold of 0.045: 91 | #ATTENTION: the -f option must come before the -i option 92 | ./clust-mst -d 0.045 -f -i bact_refseq.list.MinHashGenomeInfo bact_refseq.list.MinHashMSTInfo -o bact_refseq.mst.d.045.clust 93 | 94 | #to generate clusters from existing sketches of clust-greedy with a distance threshold of 0.001: 95 | #ATTENTION: the -f option must come before the -i option 96 | ./clust-greedy -d 0.001 -f -i bact_genbank.list.MinHashGenomeInfo bact_genbank.list.MinHashSketchInfo -o bact_genbank.greedy.d.001.clust 97 | 98 | ``` 99 | ## Output 100 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*). 101 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome. 102 | 103 | #### Output format for a FASTA file list input 104 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are: 105 | * local index in a cluster 106 | * global index of the genome 107 | * genome length 108 | * genome file name (including genome assembly accession number) 109 | * sequence name (first sequence in the genome file) 110 | * sequence comment (remaining part of the line) 111 | 112 | **Example:** 113 | ```txt 114 | the cluster 0 is: 115 | 0 0 14782125nt bacteria/GCF_000418325.1_ASM41832v1_genomic.fna NC_021658.1 Sorangium cellulosum So0157-2, complete sequence 116 | 1 1 14598830nt bacteria/GCF_004135755.1_ASM413575v1_genomic.fna NZ_CP012672.1 Sorangium cellulosum strain So ce836 chromosome, complete genome 117 | 118 | the cluster 1 is: 119 | 0 2 14557589nt bacteria/GCF_002950945.1_ASM295094v1_genomic.fna NZ_CP012673.1 Sorangium cellulosum strain So ce26 chromosome, complete genome 120 | 121 | the cluster 2 is: 122 | 0 3 13673866nt bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna 
NZ_JAHKRM010000001.1 Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence 123 | 124 | ...... 125 | ``` 126 | 127 | #### Output format for a single FASTA file input 128 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are: 129 | * local index in a cluster 130 | * global index of the genome 131 | * genome length 132 | * sequence name 133 | * sequence comment (remaining part of this line) 134 | 135 | **Example:** 136 | ```txt 137 | the cluster 0 is: 138 | 0 0 11030030nt NZ_GG657755.1 Streptomyces himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence 139 | 1 1 11008137nt NZ_RIBZ01000339.1 Streptomyces sp. NEAU-LD23 C2041, whole genome shotgun sequence 140 | 141 | the cluster 1 is: 142 | 0 2 11006208nt NZ_KL647031.1 Nonomuraea candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence 143 | 144 | the cluster 2 is: 145 | 0 3 10940472nt NZ_VTHK01000001.1 Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence 146 | 147 | ...... 148 | ``` 149 | 150 | 151 | # Bug Report 152 | All bug reports, comments and suggestions are welcome. 153 | 154 | ## Cite 155 | [Xu, X. et al. (2022). RabbitTClust: enabling fast clustering analysis of 156 | millions bacteria genomes with minhash sketches. bioRxiv.](https://doi.org/10.1101/2022.10.13.512052) 157 | -------------------------------------------------------------------------------- /version_history/v.2.1.0.md: -------------------------------------------------------------------------------- 1 | # `RabbitTClust v.2.1.0` 2 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations. 3 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms. 
4 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 5 | 6 | ## Installation 7 | RabbitTClust version 2.1.0 supports only 64-bit Linux systems. 8 | 9 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document. 10 | 11 | ### Dependency 12 | * cmake v.3.0 or later 13 | * c++14 14 | * [zlib](https://zlib.net/) 15 | 16 | ### Compile and install automatically 17 | ```bash 18 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 19 | cd RabbitTClust 20 | ./install.sh 21 | ``` 22 | 23 | ### Compile and install manually 24 | ```bash 25 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 26 | cd RabbitTClust 27 | 28 | #make rabbitSketch library 29 | cd RabbitSketch && 30 | mkdir -p build && cd build && 31 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. && 32 | make -j8 && make install && 33 | cd ../../ && 34 | 35 | #make rabbitFX library 36 | cd RabbitFX && 37 | mkdir -p build && cd build && 38 | cmake -DCMAKE_INSTALL_PREFIX=. .. && 39 | make -j8 && make install && 40 | cd ../../ && 41 | 42 | #compile the clust-greedy 43 | mkdir -p build && cd build && 44 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 45 | make -j8 && make install && 46 | cd ../ && 47 | 48 | #compile the clust-mst 49 | cd build && 50 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. 
&& 51 | make -j8 && make install && 52 | cd ../ 53 | ``` 54 | 55 | ## Usage 56 | ```bash 57 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust 58 | Usage: ./clust-mst [OPTIONS] 59 | Options: 60 | -h,--help Print this help message and exit 61 | -t,--threads INT set the thread number, default all CPUs of the platform 62 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 63 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress 64 | -k,--kmer-size INT set the kmer size 65 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 66 | -l,--inputlist input is genome list, one genome per line 67 | -e,--no-save not save the intermediate files, such as sketches or MST 68 | -d,--threshold FLOAT set the distance threshold for clustering 69 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 70 | -o,--output TEXT REQUIRED set the output name of cluster result 71 | -i,--input TEXT set the input file 72 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 73 | --premsted TEXT clustering by the pre-generated mst files rather than genomes for clust-mst 74 | 75 | # clust-greedy, greedy incremental clustering module for RabbitTClust 76 | Usage: ./clust-greedy [OPTIONS] 77 | Options: 78 | -h,--help Print this help message and exit 79 | -t,--threads INT set the thread number, default all CPUs of the platform 80 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 81 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress 82 | -k,--kmer-size INT set the kmer size 83 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash 
distance, default 1000 84 | -l,--inputlist input is genome list, one genome per line 85 | -e,--no-save not save the intermediate files, such as sketches or MST 86 | -d,--threshold FLOAT set the distance threshold for clustering 87 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 88 | -o,--output TEXT REQUIRED set the output name of cluster result 89 | -i,--input TEXT set the input file 90 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 91 | ``` 92 | 93 | ## Example: 94 | ```bash 95 | #input is a file list, one genome path per line: 96 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust 97 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust 98 | 99 | #input is a single genome file in FASTA format, one genome as a sequence: 100 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust 101 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust 102 | 103 | #the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options. 104 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust 105 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust 106 | 107 | 108 | #for redundancy detection with clust-greedy, input is a genome file list: 109 | #use -d to specify the distance threshold corresponding to various degrees of redundancy. 110 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out 111 | 112 | #for last running of clust-mst, it generated a folder name in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15. 113 | #this folder contains the sketch, mst files. 
114 | #for generator cluster from exist MST with a distance threshold of 0.045: 115 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15 -o bacteria.mst.d.045.clust 116 | #for generator cluster from exist sketches files of clust-mst with a distance threshold of 0.045: 117 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15 -o bacteria.mst.d.045.clust 118 | 119 | #for generator cluster from exist sketches of clust-greedy with a distance threshold of 0.001: 120 | # folder 2023_05_06_08-49-15 contains the sketch files. 121 | ./clust-greedy -d 0.001 --presketched 2023_05_06_08-49-15 -o bact_genbank.greedy.d.001.clust 122 | ``` 123 | ## Output 124 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*). 125 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome. 126 | 127 | #### Output format for a FASTA file list input 128 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are: 129 | * local index in a cluster 130 | * global index of the genome 131 | * genome length 132 | * genome file name (including genome assembly accession number) 133 | * sequence name (first sequence in the genome file) 134 | * sequence comment (remaining part of the line) 135 | 136 | **Example:** 137 | ```txt 138 | the cluster 0 is: 139 | 0 0 14782125nt bacteria/GCF_000418325.1_ASM41832v1_genomic.fna NC_021658.1 Sorangium cellulosum So0157-2, complete sequence 140 | 1 1 14598830nt bacteria/GCF_004135755.1_ASM413575v1_genomic.fna NZ_CP012672.1 Sorangium cellulosum strain So ce836 chromosome, complete genome 141 | 142 | the cluster 1 is: 143 | 0 2 14557589nt bacteria/GCF_002950945.1_ASM295094v1_genomic.fna NZ_CP012673.1 Sorangium cellulosum strain So ce26 chromosome, complete genome 144 | 145 | the cluster 2 is: 146 | 0 3 13673866nt bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna NZ_JAHKRM010000001.1 
Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence 147 | 148 | ...... 149 | ``` 150 | 151 | #### Output format for a single FASTA file input 152 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are: 153 | * local index in a cluster 154 | * global index of the genome 155 | * genome length 156 | * sequence name 157 | * sequence comment (remaining part of this line) 158 | 159 | **Example:** 160 | ```txt 161 | the cluster 0 is: 162 | 0 0 11030030nt NZ_GG657755.1 Streptomyces himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence 163 | 1 1 11008137nt NZ_RIBZ01000339.1 Streptomyces sp. NEAU-LD23 C2041, whole genome shotgun sequence 164 | 165 | the cluster 1 is: 166 | 0 2 11006208nt NZ_KL647031.1 Nonomuraea candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence 167 | 168 | the cluster 2 is: 169 | 0 3 10940472nt NZ_VTHK01000001.1 Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence 170 | 171 | ...... 172 | ``` 173 | 174 | 175 | # Bug Report 176 | We highly appreciate all bug reports, comments, and suggestions from our users. 177 | Please feel free to raise any concerns or feedback with us without hesitation by `issue`. 178 | 179 | ## Cite 180 | [Xu, X. et al. (2022). RabbitTClust: enabling fast clustering analysis of 181 | millions bacteria genomes with minhash sketches. bioRxiv.](https://doi.org/10.1101/2022.10.13.512052) 182 | -------------------------------------------------------------------------------- /version_history/v.2.2.0.md: -------------------------------------------------------------------------------- 1 | # `RabbitTClust v.2.2.0` 2 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations. 3 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms. 
4 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 5 | 6 | ## Installation 7 | `RabbitTClust v.2.2.0` can only support 64-bit Linux Systems. 8 | 9 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document. 10 | 11 | ### Dependancy 12 | * cmake v.3.0 or later 13 | * c++14 14 | * [zlib](https://zlib.net/) 15 | 16 | ### Compile and install 17 | ```bash 18 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 19 | cd RabbitTClust 20 | ./install.sh 21 | ``` 22 | ## Usage 23 | ```bash 24 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust 25 | Usage: ./clust-mst [OPTIONS] 26 | Options: 27 | -h,--help Print this help message and exit 28 | -t,--threads INT set the thread number, default all CPUs of the platform 29 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 30 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 31 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 32 | -l,--list input is genome list, one genome per line 33 | -e,--no-save not save the intermediate files, such as sketches or MST 34 | -d,--threshold FLOAT set the distance threshold for clustering 35 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 36 | -o,--output TEXT REQUIRED set the output name of cluster result 37 | -i,--input TEXT Excludes: --append 38 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 39 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 40 | --premsted TEXT clustering by the 
pre-generated mst files rather than genomes for clust-mst 41 | --append TEXT Excludes: --input 42 | append genome file or file list with the pre-generated sketch or MST files 43 | 44 | # clust-greedy, greedy incremental clustering module for RabbitTClust 45 | Usage: ./clust-greedy [OPTIONS] 46 | Options: 47 | -h,--help Print this help message and exit 48 | -t,--threads INT set the thread number, default all CPUs of the platform 49 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 50 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 51 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 52 | -l,--list input is genome list, one genome per line 53 | -e,--no-save not save the intermediate files, such as sketches or MST 54 | -d,--threshold FLOAT set the distance threshold for clustering 55 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 56 | -o,--output TEXT REQUIRED set the output name of cluster result 57 | -i,--input TEXT Excludes: --append 58 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 59 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 60 | --append TEXT Excludes: --input 61 | append genome file or file list with the pre-generated sketch or MST files 62 | ``` 63 | 64 | ## Example: 65 | ```bash 66 | # input is a file list, one genome path per line: 67 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust 68 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust 69 | 70 | # input is a single genome file in FASTA format, one genome as a sequence: 71 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust 72 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust 73 | 
74 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options. 75 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust 76 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust 77 | 78 | 79 | # for redundancy detection with clust-greedy, input is a genome file list: 80 | # use -d to specify the distance threshold corresponding to various degrees of redundancy. 81 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out 82 | 83 | # v.2.1.0 or later 84 | # the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15. 85 | # this folder contains the sketch and MST files. 86 | # to generate clusters from the existing MST with a distance threshold of 0.045: 87 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 88 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045: 89 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 90 | 91 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001: 92 | # folder 2023_05_06_09-37-23 contains the sketch files. 93 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust 94 | 95 | # v.2.2.0 or later 96 | # to cluster incrementally from existing sketches (presketch_A_dir) plus an appended genome set (genome_B.list): 97 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust 98 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust 99 | ``` 100 | ## Output 101 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*). 
102 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome. 103 | 104 | #### Output format for a FASTA file list input 105 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are: 106 | * local index in a cluster 107 | * global index of the genome 108 | * genome length 109 | * genome file name (including genome assembly accession number) 110 | * sequence name (first sequence in the genome file) 111 | * sequence comment (remaining part of the line) 112 | 113 | **Example:** 114 | ```txt 115 | the cluster 0 is: 116 | 0 0 14782125nt bacteria/GCF_000418325.1_ASM41832v1_genomic.fna NC_021658.1 Sorangium cellulosum So0157-2, complete sequence 117 | 1 1 14598830nt bacteria/GCF_004135755.1_ASM413575v1_genomic.fna NZ_CP012672.1 Sorangium cellulosum strain So ce836 chromosome, complete genome 118 | 119 | the cluster 1 is: 120 | 0 2 14557589nt bacteria/GCF_002950945.1_ASM295094v1_genomic.fna NZ_CP012673.1 Sorangium cellulosum strain So ce26 chromosome, complete genome 121 | 122 | the cluster 2 is: 123 | 0 3 13673866nt bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna NZ_JAHKRM010000001.1 Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence 124 | 125 | ...... 126 | ``` 127 | 128 | #### Output format for a single FASTA file input 129 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are: 130 | * local index in a cluster 131 | * global index of the genome 132 | * genome length 133 | * sequence name 134 | * sequence comment (remaining part of this line) 135 | 136 | **Example:** 137 | ```txt 138 | the cluster 0 is: 139 | 0 0 11030030nt NZ_GG657755.1 Streptomyces himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence 140 | 1 1 11008137nt NZ_RIBZ01000339.1 Streptomyces sp. 
NEAU-LD23 C2041, whole genome shotgun sequence 141 | 142 | the cluster 1 is: 143 | 0 2 11006208nt NZ_KL647031.1 Nonomuraea candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence 144 | 145 | the cluster 2 is: 146 | 0 3 10940472nt NZ_VTHK01000001.1 Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence 147 | 148 | ...... 149 | ``` 150 | 151 | 152 | # Bug Report 153 | We highly appreciate all bug reports, comments, and suggestions from our users. 154 | Please feel free to raise any concerns or feedback with us without hesitation by `issue`. 155 | 156 | ## Cite 157 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6 158 | -------------------------------------------------------------------------------- /version_history/v.2.2.1.md: -------------------------------------------------------------------------------- 1 | ![RabbitTClust](../rabbittclust.png) 2 | 3 | # `RabbitTClust v.2.2.1` 4 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations. 5 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms. 6 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 7 | 8 | ## Installation 9 | `RabbitTClust v.2.2.1` can only support 64-bit Linux Systems. 10 | 11 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document. 
12 | 13 | ### Dependancy 14 | * cmake v.3.0 or later 15 | * c++14 16 | * [zlib](https://zlib.net/) 17 | 18 | ### Compile and install 19 | ```bash 20 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git 21 | cd RabbitTClust 22 | ./install.sh 23 | ``` 24 | ## Usage 25 | ```bash 26 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust 27 | Usage: ./clust-mst [OPTIONS] 28 | Options: 29 | -h,--help Print this help message and exit 30 | -t,--threads INT set the thread number, default all CPUs of the platform 31 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 32 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 33 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 34 | -l,--list input is genome list, one genome per line 35 | -e,--no-save not save the intermediate files, such as sketches or MST 36 | -d,--threshold FLOAT set the distance threshold for clustering 37 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 38 | -o,--output TEXT REQUIRED set the output name of cluster result 39 | -i,--input TEXT Excludes: --append 40 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 41 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 42 | --premsted TEXT clustering by the pre-generated mst files rather than genomes for clust-mst 43 | --newick-tree output the newick tree format file for clust-mst 44 | --append TEXT Excludes: --input 45 | append genome file or file list with the pre-generated sketch or MST files 46 | 47 | # clust-greedy, greedy incremental clustering module for RabbitTClust 48 | Usage: ./clust-greedy [OPTIONS] 49 | Options: 50 | -h,--help Print this help message 
and exit 51 | -t,--threads INT set the thread number, default all CPUs of the platform 52 | -m,--min-length UINT set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000 53 | -c,--containment INT use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress -k,--kmer-size INT set the kmer size 54 | -s,--sketch-size INT set the sketch size for Jaccard Index and Mash distance, default 1000 55 | -l,--list input is genome list, one genome per line 56 | -e,--no-save not save the intermediate files, such as sketches or MST 57 | -d,--threshold FLOAT set the distance threshold for clustering 58 | -F,--function TEXT set the sketch function, such as MinHash, KSSD, default MinHash 59 | -o,--output TEXT REQUIRED set the output name of cluster result 60 | -i,--input TEXT Excludes: --append 61 | set the input file, single FASTA genome file (without -l option) or genome list file (with -l option) 62 | --presketched TEXT clustering by the pre-generated sketch files rather than genomes 63 | --append TEXT Excludes: --input 64 | append genome file or file list with the pre-generated sketch or MST files 65 | ``` 66 | 67 | ## Example: 68 | ```bash 69 | # input is a file list, one genome path per line: 70 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust 71 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust 72 | 73 | # input is a single genome file in FASTA format, one genome as a sequence: 74 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust 75 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust 76 | 77 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options. 
78 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust 79 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust 80 | 81 | 82 | # for redundancy detection with clust-greedy, input is a genome file list: 83 | # use -d to specify the distance threshold corresponding to various degrees of redundancy. 84 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out 85 | 86 | # v.2.1.0 or later 87 | # the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15. 88 | # this folder contains the sketch and MST files. 89 | # to generate clusters from the existing MST with a distance threshold of 0.045: 90 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 91 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045: 92 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust 93 | 94 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001: 95 | # folder 2023_05_06_09-37-23 contains the sketch files. 96 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust 97 | 98 | # v.2.2.0 or later 99 | # to cluster incrementally from existing sketches (presketch_A_dir) plus an appended genome set (genome_B.list): 100 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust 101 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust 102 | 103 | # v.2.2.1 or later 104 | # to output the Newick tree format with clust-mst, use the --newick-tree flag. 
105 | ./clust-mst -l -i bacteria.list --newick-tree -o bacteria.mst.clust 106 | ``` 107 | ## Output 108 | The output file is in a CD-HIT output format and is slightly different when running with or without the `-l` input option. 109 | When using the `-l` option, the input is expected to be a FASTA file list, with each file representing a genome. Without the `-l` option, the input should be a single FASTA file, with each sequence representing a genome. 110 | 111 | #### Output format for a FASTA file list input 112 | With the `-l` option, the tab-delimited values in the lines beginning with tab delimiters are: 113 | * local index in a cluster 114 | * global index of the genome 115 | * genome length 116 | * genome file name (including genome assembly accession number) 117 | * sequence name (first sequence in the genome file) 118 | * sequence comment (remaining part of the line) 119 | 120 | **Example:** 121 | ```txt 122 | the cluster 0 is: 123 | 0 0 14782125nt bacteria/GCF_000418325.1_ASM41832v1_genomic.fna NC_021658.1 Sorangium cellulosum So0157-2, complete sequence 124 | 1 1 14598830nt bacteria/GCF_004135755.1_ASM413575v1_genomic.fna NZ_CP012672.1 Sorangium cellulosum strain So ce836 chromosome, complete genome 125 | 126 | the cluster 1 is: 127 | 0 2 14557589nt bacteria/GCF_002950945.1_ASM295094v1_genomic.fna NZ_CP012673.1 Sorangium cellulosum strain So ce26 chromosome, complete genome 128 | 129 | the cluster 2 is: 130 | 0 3 13673866nt bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna NZ_JAHKRM010000001.1 Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence 131 | 132 | ...... 
133 | ``` 134 | 135 | #### Output format for a single FASTA file input 136 | Without the `-l` option, the tab-delimited values in the lines beginning with tab delimiters are: 137 | * local index in a cluster 138 | * global index of the genome 139 | * genome length 140 | * sequence name 141 | * sequence comment (remaining part of this line) 142 | 143 | **Example:** 144 | ```txt 145 | the cluster 0 is: 146 | 0 0 11030030nt NZ_GG657755.1 Streptomyces himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence 147 | 1 1 11008137nt NZ_RIBZ01000339.1 Streptomyces sp. NEAU-LD23 C2041, whole genome shotgun sequence 148 | 149 | the cluster 1 is: 150 | 0 2 11006208nt NZ_KL647031.1 Nonomuraea candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence 151 | 152 | the cluster 2 is: 153 | 0 3 10940472nt NZ_VTHK01000001.1 Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence 154 | 155 | ...... 156 | ``` 157 | 158 | #### Output the newick tree format (v.2.2.1 or later) 159 | When the `--newick-tree` option is used, an additional output file will be generated in the Newick tree format with a suffix name of ".newick.tree". 160 | 161 | 162 | # Bug Report 163 | We highly appreciate all bug reports, comments, and suggestions from our users. 164 | Please feel free to raise any concerns or feedback by opening an issue. 165 | 166 | ## Cite 167 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6 168 | --------------------------------------------------------------------------------