├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE.txt
├── README.md
├── benchmark
    ├── README.md
    ├── download
    │   ├── README.md
    │   ├── download_genbank.sh
    │   └── download_refseq.py
    ├── evaluation
    │   ├── Makefile
    │   ├── README.md
    │   ├── getNMI.py
    │   └── src
    │   │   ├── analysisPurity.cpp
    │   │   ├── calLabel.cpp
    │   │   ├── calPurity.cpp
    │   │   ├── checkTaxonomyStatus.cpp
    │   │   ├── getRepresentativeList.cpp
    │   │   ├── groundTruth.cpp
    │   │   ├── groundTruth.h
    │   │   ├── kseq.h
    │   │   ├── mapGenome.cpp
    │   │   └── precalLabel.cpp
    ├── generateList.sh
    └── simulate
    │   ├── Makefile
    │   ├── README.md
    │   └── src
    │       ├── create_containment_bacteria.cpp
    │       ├── kseq.h
    │       └── simulate_longSequence.cpp
├── install.sh
├── rabbittclust.png
├── src
    ├── CLI11.hpp
    ├── MST.cpp
    ├── MST.h
    ├── MST_IO.cpp
    ├── MST_IO.h
    ├── SketchInfo.cpp
    ├── SketchInfo.h
    ├── Sketch_IO.cpp
    ├── Sketch_IO.h
    ├── ThreadPool.h
    ├── ThreadPool.hxx
    ├── UnionFind.h
    ├── common.hpp
    ├── greedy.cpp
    ├── greedy.h
    ├── kseq.h
    ├── main.cpp
    ├── sub_command.cpp
    └── sub_command.h
└── version_history
    ├── history.md
    ├── v.2.0.0.md
    ├── v.2.1.0.md
    ├── v.2.2.0.md
    └── v.2.2.1.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | **/*
 2 | !src/
 3 | !src/*
 4 | !.gitignore
 5 | !.gitmodules
 6 | !CMakeLists.txt
 7 | !LICENSE.txt
 8 | !README.md
 9 | !experient.md
10 | !install.sh
11 | !rabbittclust.png
12 | 
13 | !version_history
14 | !version_history/*
15 | 
16 | !benchmark/
17 | !benchmark/README.md
18 | !benchmark/download_genomes.py
19 | !benchmark/generateList.sh
20 | 
21 | !benchmark/download/
22 | !benchmark/download/README.md
23 | !benchmark/download/download_genbank.sh
24 | !benchmark/download/download_refseq.py
25 | !benchmark/download/bact_GenBank.list.gz
26 | 
27 | !benchmark/simulate/
28 | !benchmark/simulate/README.md
29 | !benchmark/simulate/Makefile
30 | !benchmark/simulate/src/
31 | !benchmark/simulate/src/*
32 | 
33 | !benchmark/evaluation/
34 | !benchmark/evaluation/Makefile
35 | !benchmark/evaluation/README.md
36 | !benchmark/evaluation/getNMI.py
37 | !benchmark/evaluation/src/
38 | !benchmark/evaluation/src/*
39 | 
40 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "RabbitFX"]
2 | 	path = RabbitFX
3 | 	url = https://github.com/RabbitBio/RabbitFX.git
4 | [submodule "RabbitSketch"]
5 | 	path = RabbitSketch
6 | 	url = https://github.com/RabbitBio/RabbitSketch.git
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.0)
 2 | 
 3 | project(rabbitTClust)
 4 | 
 5 | set(CMAKE_INSTALL_PREFIX ..)
 6 | option(USE_RABBITFX "parse input fasta file with RabbitFX" OFF)
 7 | option(USE_DEBUG "print the debug information" ON)
 8 | option(USE_Timer "print the time information" ON)
 9 | option(USE_GREEDY "use greedy incremental cluster" ON)
10 | 
11 | 
12 | 
13 | find_package(OpenMP REQUIRED)
14 | if(OPENMP_FOUND)
15 | #message("openmp found")
16 | 
17 | set(CMAKE_CXX_COMPILER "/usr/bin/g++")
18 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
20 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
21 | set(EXECUTABLE_OUTPUT_PATH .)
22 | endif()
23 | 
24 | set(CMAKE_CXX_STANDARD 14)
25 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
26 | #set(CMAKE_CXX_FLAGS "-g -O3 -D THREADPOOL_MINHASH -D DEBUG -D Timer ${CMAKE_CXX_FLAGS}")
27 | 
28 | set(CMAKE_CXX_FLAGS "-g -O3 -Wno-format -Wno-unused-result${CMAKE_CXX_FLAGS}")
29 | 
30 | if(USE_DEBUG)
31 | 	set(CMAKE_CXX_FLAGS "-D DEBUG ${CMAKE_CXX_FLAGS}")
32 | endif()
33 | 
34 | if(USE_Timer)
35 | 	set(CMAKE_CXX_FLAGS "-D Timer ${CMAKE_CXX_FLAGS}")
36 | endif()
37 | 
38 | if(USE_GREEDY)
39 | 	set(CMAKE_CXX_FLAGS "-D GREEDY_CLUST ${CMAKE_CXX_FLAGS}")
40 | endif()
41 | 
42 | if(USE_RABBITFX)
43 | 	set(CMAKE_CXX_FLAGS "-D RABBIT_FX ${CMAKE_CXX_FLAGS}")
44 | 	include_directories(src RabbitSketch/build/include RabbitFX/build/include)
45 | 	link_directories(RabbitSketch/build/lib RabbitFX/build/lib)
46 | else()
47 | 	set(CMAKE_CXX_FLAGS "-D THREADPOOL_MINHASH ${CMAKE_CXX_FLAGS}")
48 | 	include_directories(src RabbitSketch/build/include)
49 | 	link_directories(RabbitSketch/build/lib)
50 | endif()
51 | 	
52 | ##include_directories(src RabbitSketch/build/include RabbitFX/build/include)
53 | #include_directories(src RabbitSketch/build/include)
54 | #
55 | ##link_directories(RabbitSketch/build/lib RabbitFX/build/io)
56 | #link_directories(RabbitSketch/build/lib)
57 | 
58 | aux_source_directory(src DIR_SRCS)
59 | 
60 | if(USE_GREEDY)
61 | add_executable(clust-greedy ${DIR_SRCS})
62 | if(USE_RABBITFX)
63 | 	target_link_libraries(clust-greedy rabbitsketch_static rabbitfx z)
64 | else()
65 | 	target_link_libraries(clust-greedy rabbitsketch_static z)
66 | endif()
67 | 
68 | else()
69 | add_executable(clust-mst ${DIR_SRCS})
70 | if(USE_RABBITFX)
71 | 	target_link_libraries(clust-mst rabbitsketch_static rabbitfx z)
72 | else()
73 | 	target_link_libraries(clust-mst rabbitsketch_static z)
74 | endif()
75 | endif()
76 | 
77 | 
78 | 
79 | if(USE_GREEDY)
80 | install(TARGETS clust-greedy DESTINATION ./)
81 | else()
82 | install(TARGETS clust-mst DESTINATION ./)
83 | endif()
84 | 
85 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | PURPOSE
 2 | 
 3 | RabbitTClust is a fast and memory-efficient genome clustering tool based
 4 | on sketch-based distance estimation. Our approach enables efficient processing
 5 | of large-scale datasets by combining dimensionality reduction techniques with
 6 | streaming and parallelization on modern multi-core platforms. It is implemented
 7 | in C++ and is distributed with:
 8 | CLI11
 9 | 	https://github.com/CLIUtils/CLI11
10 | 	3-Clause BSD License
11 | 
12 | KSeq
13 | 	https://github.com/attractivechaos/klib
14 |   MIT License
15 | 
16 | MurmurHash3
17 |   code.google.com/p/smhasher/wiki/MurmurHash3
18 |   Public domain
19 | 
20 | Robin_Hood Unordered Map and Set
21 |   https://github.com/martinus/robin-hood-hashing
22 | 	MIT License
23 | 
24 | ThreadPool 
25 | 	https://github.com/marbl/Mash/blob/master/src/mash/ThreadPool.h
26 |   COPYRIGHT LICENSE
27 |   Copyright © 2015, Battelle National Biodefense Institute (BNBI);
28 |   all rights reserved. Authored by: Brian Ondov, Todd Treangen,
29 |   Sergey Koren, and Adam Phillippy
30 |   
31 |   This Software was prepared for the Department of Homeland Security
32 |   (DHS) by the Battelle National Biodefense Institute, LLC (BNBI) as
33 |   part of contract HSHQDC-07-C-00020 to manage and operate the National
34 |   Biodefense Analysis and Countermeasures Center (NBACC), a Federally
35 |   Funded Research and Development Center.
36 | 
37 | RabbitFX
38 | 	https://github.com/RabbitBio/RabbitFX
39 | 	GNU GENERAL PUBLIC LICENSE
40 | 
41 | RabbitSketch
42 | 	https://github.com/RabbitBio/RabbitSketch
43 | 	MIT License
44 | 
45 | 
46 | COPYRIGHT LICENSE
47 | 
48 | Copyright © 2021, Shandong University (SDU); all rights reserved.
49 | Authored by Xiaoming Xu and Zekun Yin.
50 | 
51 | Redistribution and use in source and binary forms, with or without
52 | modification, are permitted provided that the following conditions are
53 | met:
54 | 
55 | 1. Redistributions of source code must retain the above copyright
56 | notice, this list of conditions and the following disclaimer.
57 | 
58 | 2. Redistributions in binary form must reproduce the above copyright
59 | notice, this list of conditions and the following disclaimer in the
60 | documentation and/or other materials provided with the distribution.
61 | 
62 | 3. Neither the name of the copyright holder nor the names of its
63 | contributors may be used to endorse or promote products derived from
64 | this software without specific prior written permission.
65 | 
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
70 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
71 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
72 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
73 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
74 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
75 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
76 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
77 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![install with conda](
  2 | https://anaconda.org/bioconda/rabbittclust/badges/version.svg)](https://anaconda.org/bioconda/rabbittclust)
  3 | [![install with conda](
  4 | https://anaconda.org/bioconda/rabbittclust/badges/latest_release_date.svg)](https://anaconda.org/bioconda/rabbittclust)
  5 | [![install with conda](
  6 | https://anaconda.org/bioconda/rabbittclust/badges/platforms.svg)](https://anaconda.org/bioconda/rabbittclust)
  7 | [![install with conda](
  8 | https://anaconda.org/bioconda/rabbittclust/badges/downloads.svg)](https://anaconda.org/bioconda/rabbittclust)
  9 | 
 10 | ![RabbitTClust](rabbittclust.png)
 11 | 
 12 | # `RabbitTClust v.2.3.0`
 13 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations.
 14 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms.
 15 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 
 16 | 
 17 | ## Installation
 18 | `RabbitTClust v.2.3.0` only supports 64-bit Linux systems.
 19 | 
 20 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](version_history/history.md) document.
 21 | 
 22 | ### Install from bioconda 
 23 | RabbitTClust is available from [Bioconda](https://anaconda.org/bioconda/rabbittclust).
 24 | 
 25 | Ensure that your machine supports at least AVX2 instructions.
 26 | 
 27 | 
 28 | ### Install from source code
 29 | #### Dependencies
 30 | * cmake v.3.0 or later
 31 | * c++14
 32 | * [zlib](https://zlib.net/)
 33 | 
 34 | #### Compile and install
 35 | ```bash
 36 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 37 | cd RabbitTClust
 38 | ./install.sh
 39 | ```
 40 | ## Usage
 41 | ```bash
 42 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust
 43 | Usage: ./clust-mst [OPTIONS]
 44 | Options:
 45 |   -h,--help                   Print this help message and exit
 46 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 47 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 48 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 49 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 50 |   -l,--list                   input is genome list, one genome per line
 51 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 52 |   -d,--threshold FLOAT        set the distance threshold for clustering
 53 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 54 |   -i,--input TEXT Excludes: --append
 55 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 56 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 57 |   --premsted TEXT             clustering by the pre-generated mst files rather than genomes for clust-mst
 58 |   --newick-tree               output the newick tree format file for clust-mst
 59 |   --fast                      use the kssd algorithm for sketching and distance computing for clust-mst
 60 |   --append TEXT Excludes: --input
 61 |                               append genome file or file list with the pre-generated sketch or MST files
 62 | 
 63 | # clust-greedy, greedy incremental clustering module for RabbitTClust
 64 | Usage: ./clust-greedy [OPTIONS]
 65 | Options:
 66 |   -h,--help                   Print this help message and exit
 67 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 68 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 69 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 70 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 71 |   -l,--list                   input is genome list, one genome per line
 72 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 73 |   -d,--threshold FLOAT        set the distance threshold for clustering
 74 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 75 |   -i,--input TEXT Excludes: --append
 76 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 77 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 78 |   --append TEXT Excludes: --input
 79 |                               append genome file or file list with the pre-generated sketch or MST files
 80 | ```
 81 | 
 82 | ## Example:
 83 | ```bash
 84 | # input is a file list, one genome path per line:
 85 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust
 86 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust
 87 | 
 88 | # input is a single genome file in FASTA format, one genome as a sequence:
 89 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust
 90 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust
 91 | 
 92 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options.
 93 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust
 94 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust
 95 | 
 96 | 
 97 | # for redundancy detection with clust-greedy, input is a genome file list:
 98 | # use -d to specify the distance threshold corresponding to various degrees of redundancy.
 99 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out
100 | 
101 | # v.2.1.0 or later
102 | # each run of clust-mst generates a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15.
103 | # this folder contains the sketch and MST files.
104 | # to generate clusters from an existing MST with a distance threshold of 0.045:
105 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
106 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045:
107 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
108 | 
109 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001:
110 | # folder 2023_05_06_09-37-23 contains the sketch files.
111 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust
112 | 
113 | # v.2.2.0 or later
114 | # to cluster incrementally from existing partial sketches (presketch_A_dir) plus an appended genome set (genome_B.list):
115 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust
116 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust
117 | 
118 | # v.2.2.1 or later
119 | # output the newick tree format for clust-mst, use the --newick-tree flag.
120 | ./clust-mst -l -i bacteria.list --newick-tree -o bacteria.mst.clust 
121 | 
122 | # v.2.3.0 or later
123 | # use the efficient Kssd sketch strategy for clust-mst, use the --fast flag.
124 | ./clust-mst --fast -l -i bacteria.list -o bacteria.fast.mst.clust
125 | ```
126 | ## Output
127 | The output file is in a CD-HIT output format and is slightly different when running with or without `-l` input option.  
128 | When using the `-l` option, the input is expected to be a FASTA file list, with each file representing a genome. Without the `-l` option, the input should be a single FASTA file, with each sequence representing a genome.
129 | 
130 | #### Output format for a FASTA file list input
131 | With the `-l` option, the tab-delimited values in the lines beginning with tab delimiters are:
132 | * local index in a cluster
133 | * global index of the genome
134 | * genome length
135 | * genome file name (including genome assembly accession number)
136 | * sequence name (first sequence in the genome file)
137 | * sequence comment (remaining part of the line)
138 | 
139 | **Example:**
140 | ```txt
141 | the cluster 0 is:
142 |     0   0   14782125nt  bacteria/GCF_000418325.1_ASM41832v1_genomic.fna     NC_021658.1     Sorangium cellulosum So0157-2, complete sequence
143 |     1   1   14598830nt  bacteria/GCF_004135755.1_ASM413575v1_genomic.fna    NZ_CP012672.1   Sorangium cellulosum strain So ce836 chromosome, complete genome
144 | 
145 | the cluster 1 is:
146 |     0   2   14557589nt  bacteria/GCF_002950945.1_ASM295094v1_genomic.fna    NZ_CP012673.1   Sorangium cellulosum strain So ce26 chromosome, complete genome
147 | 
148 | the cluster 2 is:
149 |     0   3   13673866nt  bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna   NZ_JAHKRM010000001.1    Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence
150 | 
151 | ......
152 | ```
153 | 
154 | #### Output format for a single FASTA file input
155 | Without `-l` option, the tab-delimited values in the lines beginning with tab delimiters are:
156 | * local index in a cluster
157 | * global index of the genome
158 | * genome length
159 | * sequence name 
160 | * sequence comment (remaining part of this line)
161 | 
162 | **Example:**
163 | ```txt
164 | the cluster 0 is:
165 |     0   0   11030030nt  NZ_GG657755.1   Streptomyces  himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence
166 |     1   1   11008137nt  NZ_RIBZ01000339.1   Streptomyces  sp. NEAU-LD23 C2041, whole genome shotgun sequence
167 | 
168 | the cluster 1 is:
169 |     0   2   11006208nt  NZ_KL647031.1   Nonomuraea  candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence
170 |     
171 | the cluster 2 is:
172 |     0   3   10940472nt  NZ_VTHK01000001.1   Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence
173 | 
174 | ......
175 | ```
176 | 
177 | #### Output the newick tree format (v.2.2.1 or later)
178 | When the `--newick-tree` option is used, an additional output file will be generated in the Newick tree format with a suffix name of ".newick.tree".
179 | 
180 | 
181 | # Bug Report
182 | We highly appreciate all bug reports, comments, and suggestions from our users.  
183 | Please feel free to raise any concerns or feedback by opening an `issue`.
184 | 
185 | ## Cite
186 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6
187 | 


--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
 1 | # benchmarking path
 2 | 
 3 | ## [sub-directory simulate](./simulate/README.md)
 4 | The script for simulating genome sequence.
 5 | 
 6 | ## [sub-directory download](./download/README.md)
 7 | The script for downloading the genomes from NCBI RefSeq and GenBank.
 8 | 
 9 | ## [sub-directory evaluation](./evaluation/README.md)
10 | The script for evaluating the clustering quality.
11 | 


--------------------------------------------------------------------------------
/benchmark/download/README.md:
--------------------------------------------------------------------------------
 1 | # download the genomes from NCBI RefSeq and GenBank
 2 | RabbitTClust supports an input list of genomes in original FASTA format and gzip-compressed format.
 3 | We recommend using the decompressed genome files as input to filter out the broken download compressed files.
 4 | If you have to use the input list of the compressed files, you must check the md5 value.
 5 | 
 6 | For the input of a single FASTA file (each sequence means a genome), RabbitTClust only supports decompressed FASTA format.
 7 | 
 8 | ## download genomes from RefSeq
 9 | The download script comes from [Bonsai](https://github.com/dnbaker/bonsai/tree/ac6f8c7ee1b2ae1128970a8f6dc01ddad19fdb37).
10 | 
11 | The latest release of RefSeq bacterial genomes can be downloaded by `download_refseq.py` as follows:
12 | 
13 | * `python3 download_refseq.py bacteria`  
14 | * `python3 download_refseq.py -h` for more detailed help information.
15 | 
16 | ## download genomes from GenBank
17 | 
18 | The latest release of GenBank bacterial genomes can be downloaded by `download_genbank.sh` as follows:
19 | * `./download_genbank.sh`
20 | 


--------------------------------------------------------------------------------
/benchmark/download/download_genbank.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt
 4 | awk -F '\t' 'NR>2 {print $20}' assembly_summary.txt >ftp.list
 5 | 
 6 | outputDir="genbankDir"
 7 | echo $#
 8 | if [ $# -ge 1 ]
 9 | then
10 | 	outputDir=$1
11 | fi
12 | mkdir -p $outputDir
13 | cat ftp.list | while read line
14 | do
15 |     fname=$(echo $line | grep -o 'GCA_.*' | sed 's/$/_genomic.fna.gz/')
16 |     #echo "$line/$fname"
17 |     wget -c "$line/$fname" ;
18 | 		mv "$fname" $outputDir
19 | done
20 | 


--------------------------------------------------------------------------------
/benchmark/download/download_refseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | import multiprocessing
  4 | import gzip
  5 | import os
  6 | from subprocess import check_call as cc, CalledProcessError
  7 | from enum import IntEnum
  8 | argv = sys.argv
  9 | 
 10 | 
 11 | if sys.version_info[0] != 3:
 12 |     raise Exception("Python 3 required")
 13 | 
 14 | 
 15 | class ExitCodes(IntEnum):
 16 |     EXIT_SUCCESS = 0
 17 |     EXIT_FAILURE = 1
 18 | 
 19 | 
 20 | def is_valid_gzip(fn, lazy=False, use_pigz=False):
 21 |     '''
 22 |     We could instead use gunzip -t to check, but that actual requires
 23 |     iterating through the whole file, which is very slow. This is lazy,
 24 |     but at least it makes sure that it's a gzip file.
 25 | 
 26 |     lazy simply tries to see if the first 10 lines can be read.
 27 |     It isn't very safe.
 28 | 
 29 |     use_pigz uses pigz instead of gzip. A bad idea if a number of processes
 30 |     have already been spawned.
 31 |     '''
 32 |     if lazy:
 33 |         try:
 34 |             cc("gzip -dc %s | head &>/dev/null" % fn, shell=True)
 35 |             return True
 36 |         except CalledProcessError:
 37 |             return False
 38 |     # lazy has already returned. This is the "else".
 39 |     cmd = ("pigz" if use_pigz else "gzip") + " -dc "
 40 |     try:
 41 |         cc(cmd + " -t " + fn, shell=True)
 42 |         sys.stderr.write(fn + " is valid")
 43 |         return True
 44 |     except CalledProcessError:
 45 |         sys.stderr.write("Corrupted file " + fn + ". Delete, try again.")
 46 |         return False
 47 | 
 48 | 
 49 | def xfirstline(fn):
 50 |     # Works on python3, not 2.
 51 |     first_two = open(fn, "rb").read(2)
 52 |     return next((gzip.open if first_two == b"\x1f\x8b" else open)(fn))
 53 | 
 54 | 
 55 | FTP_BASENAME = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/"
 56 | 
 57 | ALL_CLADES_MAP = {
 58 |     "archaea": FTP_BASENAME + "archaea/",
 59 |     "bacteria": FTP_BASENAME + "bacteria/",
 60 |     "fungi": FTP_BASENAME + "fungi/",
 61 |     "viral": FTP_BASENAME + "viral/",
 62 |     "plant": FTP_BASENAME + "plant/",
 63 |     "protozoa": FTP_BASENAME + "protozoa/",
 64 |     "human": FTP_BASENAME + "vertebrate_mammalian/Homo_sapiens",
 65 |     "vertebrate_mammalian": FTP_BASENAME + "vertebrate_mammalian/",
 66 |     "vertebrate_other": FTP_BASENAME + "vertebrate_other/"
 67 | }
 68 | 
 69 | DEFAULT_CLADES = [
 70 |     "archaea", "bacteria", "viral", "human"
 71 | ]
 72 | 
 73 | DEFAULT_CLADES_STR = ", ".join(DEFAULT_CLADES)
 74 | ALL_CLADES_STR = ", ".join(ALL_CLADES_MAP.keys())
 75 | 
 76 | TAX_PATH = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
 77 | 
 78 | 
 79 | def get_clade_map(clades):
 80 |     if clades[0] == "default":
 81 |         return {k: v for k, v in ALL_CLADES_MAP.items() if k in DEFAULT_CLADES}
 82 |     if clades[0] == "all":
 83 |         return ALL_CLADES_MAP
 84 |     ret = {}
 85 |     clades = [i.lower() for i in clades]
 86 |     for clade in clades:
 87 |         if clade in ALL_CLADES_MAP:
 88 |             ret[clade] = ALL_CLADES_MAP[clade]
 89 |         else:
 90 |             raise ValueError("clade %s not available. Abort!" % clade)
 91 |     return ret
 92 | 
 93 | 
 94 | def parse_assembly(fn, fnidmap):
 95 |     print(fn)
 96 |     to_fetch = []
 97 |     for line in open(fn, encoding='utf8'):
 98 |         if line[0] == '#':
 99 |             continue
100 |         s = line.split("\t")
101 |         if len(s) < 14:
102 |             print(s)
103 |             raise Exception("Not long enough")
104 |         if ("latest" not in line or  # Complete genome
105 |                 (("Complete Genome" not in line and
106 |                   "GRCh" not in line and "Full" not in line)) or
107 |                 any(i in line.lower() for
108 |                     i in ["contig", "supercontig"])):
109 |             continue
110 |         print(s[19], file=sys.stderr)
111 |         fn = "%s_genomic.fna.gz" % ([i for i in s[19].split("/") if i][-1])
112 |         fnidmap[fn] = int(s[5])
113 |         index = len(s) - 1
114 |         while "ftp" not in s[index] and index > 0:
115 |             index = index - 1
116 |         if index:
117 |             to_fetch.append(s[index] + "/" + fn)
118 |         else:
119 |             print("No link found, continue", file=sys.stderr)
120 |             continue
121 |             #raise RuntimeError("ftp link not found. line: %s" % line[:-1])
122 |     return to_fetch
123 | 
124 | 
def retry_cc(tup):
    """Run a shell command, retrying up to RETRY_LIMIT times.

    tup is a (cstr, die) pair: cstr is the command line handed to the shell
    and die selects what happens when every attempt fails -- raise an
    Exception (truthy) or only write a message to stderr (falsy).
    Returns None.
    """
    cstr, die = tup
    RETRY_LIMIT = 10
    for attempt in range(RETRY_LIMIT):
        # Echo the command before each attempt.
        print(cstr, file=sys.stderr)
        try:
            cc(cstr, shell=True)
        except CalledProcessError:
            print("retry number", attempt, file=sys.stderr)
            continue
        return
    # Every attempt failed.
    if not die:
        sys.stderr.write(
            "Could not download %s even after %i attempts" % (
                cstr, RETRY_LIMIT))
        return
    raise Exception(
        "Could not download via %s "
        "even after %i attempts." % (cstr, RETRY_LIMIT))
145 | 
146 | 
def getopts():
    """Define the command-line interface and return the parsed arguments.

    The returned namespace has the attributes idmap, ref, clades, threads,
    lazy and die.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser()
    clades_help = ("Clades to use."
                   " default includes %s. all includes %s." % (
                       DEFAULT_CLADES_STR, ALL_CLADES_STR))
    parser.add_argument("--idmap", "-m", default="nameidmap.txt",
                        help="Path to which to write nameidmap.")
    parser.add_argument("--ref", "-r", help="Name of folder for references.")
    # One or more clade names are mandatory.
    parser.add_argument("clades", nargs="+", help=clades_help)
    parser.add_argument("--threads", "-p", type=int, default=16,
                        help="Number of threads to use while downloading.")
    parser.add_argument("--lazy", "-l", action='store_true',
                        help="Don't check full gzipped file contents.")
    parser.add_argument("--die", "-d", action='store_true')
    return parser.parse_args()
163 | 
164 | 
def check_path(fn, lazy=False):
    """Delete fn when it exists as a file but fails the gzip check.

    fn   -- path to examine; nothing happens when it is not an existing file
    lazy -- forwarded to is_valid_gzip
    """
    print("Checking path " + fn)
    if os.path.isfile(fn) and not is_valid_gzip(fn, lazy=lazy):
        # Remove in-process rather than through `rm` in a shell, so paths
        # with spaces or shell metacharacters are handled literally.
        os.remove(fn)
170 | 
171 | 
def check_path_lazy(path):
    """Single-argument form of check_path with lazy checking switched on."""
    check_path(fn=path, lazy=True)
174 | 
175 | 
def main():
    """Download the requested clades and write the name/id map.

    Steps, per the options returned by getopts():
      1. Create the reference folder (args.ref, "ref" when not given).
      2. When the first command-line argument is "nodes" and nodes.dmp is
         missing from the reference folder, fetch and unpack the taxonomy
         dump and return 0.
      3. Validate the clade names; exit with EXIT_FAILURE on an unknown one.
      4. For each clade: fetch its assembly summary if absent, parse it,
         delete already-present files that fail the gzip check, download the
         missing files, then re-key the clade's id map by the first token of
         each downloaded file's first line.
      5. Write "<key>\\t<value>" lines to <ref>/<args.idmap>.

    Returns ExitCodes.EXIT_SUCCESS (or 0 for the "nodes" shortcut).
    """
    tax_path = TAX_PATH
    args = getopts()
    ref = args.ref if args.ref else "ref"
    # Create the folder first: the taxonomy download below writes into it.
    if not os.path.isdir(ref):
        os.makedirs(ref)
    taxdump_cmd = ("curl {tax_path} -o {ref}/"
                   "taxdump.tgz && tar -zxvf {ref}/taxdump.tgz"
                   " && mv nodes.dmp {ref}/nodes.dmp").format(
                       tax_path=tax_path, ref=ref)
    if argv[1:] and argv[1] == "nodes":
        if not os.path.isfile("%s/nodes.dmp" % ref):
            cc(taxdump_cmd, shell=True)
            return 0
    clades = args.clades if args.clades else DEFAULT_CLADES
    for clade in clades:
        # Plain test instead of assert, which is stripped under `python -O`.
        if clade not in ALL_CLADES_MAP and clade not in ["all", "default"]:
            print("Clade %s not 'all', 'default', or one of the valid "
                  "clades: %s" % (clade, ALL_CLADES_STR), file=sys.stderr)
            sys.exit(ExitCodes.EXIT_FAILURE)
    # Work on a copy: entries are replaced by URL lists below, which must
    # not leak into the table get_clade_map may have returned.
    to_dl = dict(get_clade_map(clades))
    print("About to download clades %s" % ", ".join(to_dl), file=sys.stderr)
    nameidmap = {}
    for clade in to_dl:
        cladeidmap = {}
        clade_dir = ref + "/" + clade
        if not os.path.isdir(clade_dir):
            os.makedirs(clade_dir)
        summary = "%s/%s/as.%s.txt" % (ref, clade, clade)
        if not os.path.isfile(summary):
            cstr = ("curl %s/assembly_summary.txt "
                    "-o %s/%s/as.%s.txt") % (to_dl[clade], ref, clade, clade)
            print(cstr)
            cc(cstr, shell=True)
        to_dl[clade] = parse_assembly(summary, cladeidmap)
        # The with-block shuts the worker pool down once this clade is done;
        # previously one pool per clade was left running.
        with multiprocessing.Pool(args.threads) as pool:
            # Validate what is already on disk before deciding what to fetch.
            pool.map(check_path_lazy if args.lazy else check_path,
                     ("/".join([ref, clade, s.split("/")[-1]]) for
                      s in to_dl[clade]))
            cstrs = [("curl %s -o %s/%s/%s" %
                     (s, ref, clade, s.split("/")[-1])) for
                     s in to_dl[clade] if not os.path.isfile(
                         "%s/%s/%s" % (ref, clade, s.split("/")[-1]))]
            # If nodes.dmp hasn't been downloaded, grab it.
            if not os.path.isfile("%s/nodes.dmp" % ref):
                cstrs.append(taxdump_cmd)
            pool.map(retry_cc, ((cs, args.die) for cs in cstrs))
        # Replace pathnames with seqids
        for fn in list(cladeidmap.keys()):
            try:
                header = xfirstline("/".join([ref, clade, fn]))
                cladeidmap[header.decode().split()[0][1:]] = cladeidmap[fn]
                del cladeidmap[fn]
            except FileNotFoundError:
                # A missing file is tolerated unless --die was given.
                if args.die:
                    raise
        nameidmap.update(cladeidmap)
    print("Done with all clades", file=sys.stderr)
    with open(ref + "/" + args.idmap, "w") as f:
        for k, v in nameidmap.items():
            f.write(k + "\t" + str(v) + "\n")
    return ExitCodes.EXIT_SUCCESS
244 | 
245 | 
# Run main() only when this file is executed as a script, and hand its return
# value to the shell as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
248 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/Makefile:
--------------------------------------------------------------------------------
 1 | all: calLabel calPurity getRepresentativeList analysisPurity checkTaxonomyStatus
 2 | 
 3 | calLabel: src/calLabel.cpp src/groundTruth.cpp
 4 | 	g++ -O3 ./src/calLabel.cpp ./src/groundTruth.cpp -o calLabel
 5 | calPurity: src/calPurity.cpp src/groundTruth.cpp
 6 | 	g++ -O3 ./src/calPurity.cpp ./src/groundTruth.cpp -o calPurity
 7 | getRepresentativeList: src/getRepresentativeList.cpp
 8 | 	g++ -O3 ./src/getRepresentativeList.cpp -o getRepresentativeList
 9 | analysisPurity: src/analysisPurity.cpp
10 | 	g++ -O3 ./src/analysisPurity.cpp -o analysisPurity
11 | checkTaxonomyStatus: src/checkTaxonomyStatus.cpp
12 | 	g++ -O3 ./src/checkTaxonomyStatus.cpp -o checkTaxonomyStatus
13 | 
14 | clean:
15 | 	rm calLabel calPurity getRepresentativeList analysisPurity checkTaxonomyStatus
16 | 
17 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/README.md:
--------------------------------------------------------------------------------
 1 | # evaluation script  
 2 | Run `make` to compile the script.
 3 | 
 4 | ## calLabel && getNMI.py
 5 | `calLabel` generates the label file from the clustering result of RabbitTClust.
 6 | 
 7 | **==============================================================================**  
 8 | Example:  
 9 | `./calLabel bacteria.groundTruth -l bacteria.mst.clust bacteria.mst.label`  
10 | It will generate the label file (`bacteria.mst.label`) from the clustering result (`bacteria.mst.clust`) of RabbitTClust.  
11 | Subsequently, run `python3 getNMI.py bacteria.mst.label` to get the NMI score.
12 | 
13 | **==============================================================================**  
14 | * run as: `./calLabel groundTruth sketchOption clustFile labelFile`
15 |   * The 0 parameter(`./calLabel`) is the application name
16 |   * The 1 parameter(`groundTruth`) is input file, groundTruth file, `<accession, taxid, organismName>` per line (first line is header)
17 |   * The 2 parameter(`sketchOption`) is input option, sketch options, `-l` means sketchByFile (input as a genome list), `-i` means sketchBySequence (input as a single genome file)
18 |   * The 3 parameter(`clustFile`) is input file, cluster result file generated from RabbitTClust
19 |   * The 4 parameter(`labelFile`) is output file, label file according to the groundTruth 
20 | 
21 | ## calPurity 
22 | The `calPurity` is used for computing the purity of the clustering result of RabbitTClust.
23 | 
24 | **==============================================================================**  
25 | Example:  
26 | `./calPurity -l bacteria.groundTruth bacteria.clust bacteria.purity`  
27 | It will compute the total purity and the coverage of the clustering result and generate three information files: `bacteria.purity`, `bacteria.purity.accession.purity`, and `bacteria.purity.accession.unpurity`.
28 | * `bacteria.purity` is the detail purity for each cluster.
29 | * `bacteria.purity.accession.purity` lists the first genome of each pure cluster.
30 | * `bacteria.purity.accession.unpurity` lists, for each impure cluster, the first dominant genome and the genomes that make the cluster impure.
31 | 
32 | **==============================================================================**  
33 | * run as: `./calPurity options(-l, -i) groundTruth clustFile bacteria.purity`  
34 |   * The 0 parameter(`./calPurity`) is the application name
35 |   * The 1 parameter(`options(-l, -i)`) is input option, sketch option for clust, -l or -i
36 |   * The 2 parameter(`groundTruth`) is input file, the groundTruth file, `<assembly_accession, species_taxid, genomeName>` per line
37 |   * The 3 parameter(`clustFile`) is input file, cluster result file from RabbitTClust
38 |   * The 4 parameter(`bacteria.purity`) is output file, output purity info file, including total result file(`bacteria.purity`) and accession file(`<accession, taxid>` per line)
39 | 
40 | ## getRepresentativeList
41 | `getRepresentativeList` generates the representative genome list from the clustering result.  
42 | **==============================================================================**  
43 | Example:  
44 | `./getRepresentativeList -l bacteria.greedy.clust bacteria_representative.list`  
45 | It will choose the representative genome for each cluster and generate a list of these representative genomes.
46 | **==============================================================================**  
47 | * run as: `./getRepresentativeList -i/-l clustFile representative_list`  
48 |   * The 0 parameter(`./getRepresentativeList`) is the application name
49 |   * The 1 parameter(`-i/-l`) is input parameter, sketch parameter for the cluster file, -i means sketchBySequence, -l means sketchByFile
50 |   * The 2 parameter(`clustFile`) is input file, the cluster result from RabbitTClust
51 |   * The 3 parameter(`representative_list`) is output file, the representative list of genomes file or sequences name.
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/getNMI.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | from sklearn import metrics
 3 | import numpy as np
 4 | import sys
 5 | import pandas as pd
 6 | 
 7 | def classification_report_cvs(report):
 8 |     report_data = []
 9 |     lines = report.split('\n')
10 |     #for line in lines:
11 |     #    report_data.append(line)
12 |     #dataframe = pd.DataFrame.from_dict(report_data)
13 |     #dataframe.to_csv(fileName+'.csv', index=False)
14 |     line = lines[len(lines)-2] #for weighted precision, recall, and F1-score
15 |     return line
16 | 
def getF1(arr):
    """Return the summary row of the classification report for a label array.

    Row 0 of ``arr`` holds the predicted labels and row 1 the ground-truth
    labels.  The report is rendered with three digits and ``zero_division=0``,
    then reduced to a single line by ``classification_report_cvs``.
    """
    report = metrics.classification_report(
        arr[1, :],  # ground truth
        arr[0, :],  # prediction
        digits=3,
        zero_division=0,
    )
    return classification_report_cvs(report)
23 | 
def getNMI(arr):
    """Return the normalized mutual information between rows 0 and 1 of ``arr``."""
    return metrics.normalized_mutual_info_score(arr[0, :], arr[1, :])
29 | 
if __name__ == "__main__":
    # Usage: python3 getNMI.py labelFile [labelFile ...]
    # Each label file holds two whitespace-separated rows of labels; one
    # "<file>:\t<NMI>" line is printed per file.
    if len(sys.argv) < 2:
        print("usage: python3 getNMI.py labelFile [labelFile ...]",
              file=sys.stderr)
        sys.exit(1)
    for file in sys.argv[1:]:
        # ndmin=2 keeps a label file with a single column two-dimensional, so
        # the arr[0, :] / arr[1, :] row indexing in getNMI still works.
        originFile = np.loadtxt(file, ndmin=2)
        # The F1 report used to be computed here and then thrown away; call
        # getF1(originFile) explicitly if that row is wanted again.
        NMI = getNMI(originFile)
        print("{}:\t{}".format(file, NMI))
39 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/analysisPurity.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
  2 |  * Data: 2022/8/2
  3 |  *
  4 |  * 
  5 |  */
  6 | #include <iostream>
  7 | #include <stdlib.h>
  8 | #include <string>
  9 | #include <cassert>
 10 | #include <fstream>
 11 | #include <vector>
 12 | #include <sstream>
 13 | #include <cstdio>
 14 | #include <algorithm>
 15 | #include <unordered_set>
 16 | #include <unordered_map>
 17 | #include <sys/sysinfo.h>
 18 | #include <omp.h>
 19 | #include <set>
 20 | #include <math.h>
 21 | #include <boost/algorithm/string/classification.hpp>
 22 | #include <boost/algorithm/string/split.hpp>
 23 | #include <zlib.h>
 24 | 
 25 | using namespace std;
 26 | 
 27 | //#define LEVEL "family"
 28 | #define LEVEL "genus"
 29 | //#define LEVEL "species"
 30 | 
 31 | void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);
 32 | 
 33 | int main(int argc , char *argv[]){
 34 | 	string application = argv[0];
 35 | 	vector<string> args, descriptions;
 36 | 	args.push_back(application);
 37 | 	descriptions.push_back("the application name");
 38 | 
 39 | 	//========= parameters need changing ========
 40 | 	//The example is with parameters of specific numbers.
 41 | 	//args is the tutorial names.
 42 | 	string pwd = "/RabbitTClust/benchmark/evaluation/src/analysisPurity.cpp";
 43 | 	string dependency = "None";
 44 | 	string example = application + " purity.accession nodes.dmp outputFile";
 45 | 	args.push_back("nodes.dmp");
 46 | 	args.push_back("purity.accession");
 47 | 	args.push_back("outputFile");
 48 | 	descriptions.push_back("input file, the taxonomy nodes.dmp file");
 49 | 	descriptions.push_back("input file, the purity solved result file, from the calPurity file <accession, species-taxid> per line");
 50 | 	descriptions.push_back("output file, the final output file");
 51 | 
 52 | 	//-------- no changing -----------
 53 | 	assert(args.size() == descriptions.size());
 54 |   if(argc != args.size()) {
 55 | 		printInfo(pwd, dependency, example, args, descriptions);
 56 |     return 1;
 57 |   }
 58 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 59 | 	{
 60 | 		printInfo(pwd, dependency, example, args, descriptions);
 61 | 		return 1;
 62 | 	}
 63 | 
 64 | 	//======== specific implement ========
 65 | 	string nodesFile = argv[1];
 66 | 	string inputFile = argv[2];
 67 | 	string outputFile = argv[3];
 68 | 	string line;
 69 | 	ifstream ifs0(nodesFile);
 70 | 	unordered_map<int, pair<int, string>> id_preIDRank_map;
 71 | 	while(getline(ifs0, line)){
 72 | 		vector<string> vstr;
 73 | 		boost::split(vstr, line, boost::is_any_of("\t|"), boost::token_compress_on);
 74 | 		int curId = stoi(vstr[0]);
 75 | 		int preId = stoi(vstr[1]);
 76 | 		string rank = vstr[2];
 77 | 		pair<int, string> p(preId, rank);
 78 | 		id_preIDRank_map.insert({curId, p});
 79 | 	}
 80 | 	ifs0.close();
 81 | 	cerr << "the size of id_preIDRank_map is: " << id_preIDRank_map.size() << endl;
 82 | 
 83 | 	ifstream ifs1(inputFile);
 84 | 	string outputFile0 = outputFile + ".same";
 85 | 	string outputFile1 = outputFile + ".diff";
 86 | 	string outputFile2 = outputFile + ".same0";
 87 | 	ofstream ofs0(outputFile0);
 88 | 	ofstream ofs1(outputFile1);
 89 | 	ofstream ofs2(outputFile2);
 90 | 	ofs0 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;
 91 | 	ofs1 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;
 92 | 	ofs2 << "label\taccession\tspecies\tno_rank\tgenus\tfamily\torder" << endl;
 93 | 
 94 | 	//string repAccession;
 95 | 	vector<string> repAccessionArr;
 96 | 	vector<string> badAccessionArr;
 97 | 	unordered_map<string, int> repClass;
 98 | 	vector<unordered_map<string, int>> badClassArr;
 99 | 
100 | 	int index = 0;
101 | 	string cmpLevel = LEVEL;
102 | 
103 | 	while(getline(ifs1, line)){
104 | 		//cout << index++ << endl;
105 | 		bool getGenus = false;
106 | 		if(line.length() == 0){ //finish a cluster
107 | 			bool isEqual = true;
108 | 			for(auto x : badClassArr){
109 | 				if(x[cmpLevel] != repClass[cmpLevel]){
110 | 					isEqual = false;
111 | 					break;
112 | 				}
113 | 			}
114 | 			if(isEqual){
115 | 				if(repClass[cmpLevel] != 0){
116 | 					for(auto repAccession : repAccessionArr){
117 | 						ofs0 << '+' << '\t' << repAccession << '\t';
118 | 						ofs0 << repClass["species"] << '\t' << repClass["no_rank"] << '\t' << repClass["genus"] << '\t' << repClass["family"] << '\t' << repClass["order"] << endl;
119 | 					}
120 | 					for(int i = 0; i < badClassArr.size(); i++){
121 | 						unordered_map<string, int> x = badClassArr[i];
122 | 						ofs0 << '-' <<'\t' << badAccessionArr[i] << '\t' << x["species"] << '\t' << x["no_rank"] << '\t' << x["genus"] << '\t' << x["family"] << '\t' << x["order"] << endl;
123 | 					}
124 | 					ofs0 << endl;
125 | 				}
126 | 				else{
127 | 					for(auto repAccession : repAccessionArr){
128 | 						ofs2 << '+' << '\t' << repAccession << '\t';
129 | 						for(auto x : repClass){
130 | 							ofs2 << x.second << "(" << x.first << ")" << '\t';
131 | 						}
132 | 						ofs2 << endl;
133 | 					}
134 | 					for(int i = 0; i < badClassArr.size(); i++){
135 | 						ofs2 << '-' << '\t' << badAccessionArr[i] << '\t';
136 | 						for(auto x : badClassArr[i]){
137 | 							ofs2 << x.second << "(" << x.first << ")" << '\t';
138 | 						}
139 | 						ofs2 << endl;
140 | 					}
141 | 					ofs2<< endl;
142 | 				}
143 | 			}
144 | 			else{
145 | 				for(auto repAccession : repAccessionArr){
146 | 					ofs1 << '+' << '\t' << repAccession << '\t';
147 | 					ofs1 << repClass["species"] << '\t' << repClass["no_rank"] << '\t' << repClass["genus"] << '\t' << repClass["family"] << '\t' << repClass["order"] << endl;
148 | 				}
149 | 				bool hasEqual = false;
150 | 				for(int i = 0; i < badClassArr.size(); i++){
151 | 					unordered_map<string, int> x = badClassArr[i];
152 | 					if(x[cmpLevel] != repClass[cmpLevel]){
153 | 						ofs1 << '-' << '\t' << badAccessionArr[i] << '\t' << x["species"] << '\t' << x["no_rank"] << '\t' << x["genus"] << '\t' << x["family"] << '\t' << x["order"] << endl;
154 | 					}
155 | 					else{
156 | 						if(!hasEqual){
157 | 							if(repClass[cmpLevel] != 0){
158 | 								for(auto repAccession : repAccessionArr){
159 | 									ofs0 << '+' << '\t' << repAccession << '\t';
160 | 									ofs0 << repClass["species"] << '\t' << repClass["no_rank"] << '\t' << repClass["genus"] << '\t' << repClass["family"] << '\t' << repClass["order"] << endl;
161 | 								}
162 | 							}
163 | 							else{
164 | 								for(auto repAccession : repAccessionArr){
165 | 									ofs2 << '+' << '\t' << repAccession << '\t';
166 | 									ofs2 << repClass["species"] << '\t' << repClass["no_rank"] << '\t' << repClass["genus"] << '\t' << repClass["family"] << '\t' << repClass["order"] << endl;
167 | 								}
168 | 							}
169 | 							hasEqual = true;
170 | 						}
171 | 						if(repClass[cmpLevel] != 0){
172 | 							ofs0 << '-' << '\t' << badAccessionArr[i] << '\t' << x["species"] << '\t' << x["no_rank"] << '\t' << x["genus"] << '\t' << x["family"] << '\t' << x["order"] << endl;
173 | 						}
174 | 						else{
175 | 							ofs2 << '-' << '\t' << badAccessionArr[i] << '\t' << x["species"] << '\t' << x["no_rank"] << '\t' << x["genus"] << '\t' << x["family"] << '\t' << x["order"] << endl;
176 | 						}
177 | 					}
178 | 				}//end for loop of badClassArr
179 | 				if(hasEqual){
180 | 					if(repClass[cmpLevel] != 0)
181 | 						ofs0 << endl;
182 | 					else
183 | 						ofs2 << endl;
184 | 				}
185 | 				ofs1 << endl;
186 | 
187 | 			}
188 | 			unordered_map<string, int>().swap(repClass);
189 | 			vector<unordered_map<string, int>>().swap(badClassArr);
190 | 			vector<string>().swap(badAccessionArr);
191 | 			vector<string>().swap(repAccessionArr);
192 | 			continue;
193 | 		}
194 | 
195 | 		string accession;
196 | 		int curId;
197 | 		stringstream ss;
198 | 		ss << line;
199 | 		ss >> accession >> curId;
200 | 		if(id_preIDRank_map.count(curId) == 0){
201 | 			cerr << "the id: " << curId  << " is not in the taxonomy" << endl;
202 | 			continue;
203 | 		}
204 | 		if(line[0] != '\t'){
205 | 			repAccessionArr.push_back(accession);
206 | 			//repAccession = accession;
207 | 			if(id_preIDRank_map.count(curId) > 0){
208 | 				string rank = id_preIDRank_map[curId].second;
209 | 				repClass.insert({rank, curId});
210 | 				repClass[rank] = curId;
211 | 			}
212 | 			while(id_preIDRank_map.count(curId) > 0 && curId != 1){
213 | 				curId = id_preIDRank_map[curId].first;
214 | 				string rank = id_preIDRank_map[curId].second;
215 | 				if(rank == "no rank"){
216 | 					if(!getGenus){
217 | 						repClass.insert({rank, curId});
218 | 						repClass[rank] = curId;
219 | 					}
220 | 				}
221 | 				else if(rank == "genus"){
222 | 					getGenus = true;
223 | 				}
224 | 				repClass.insert({rank, curId});
225 | 				repClass[rank] = curId;
226 | 			}
227 | 		}//end if line[0] != \t
228 | 		else{
229 | 			badAccessionArr.push_back(accession);
230 | 			unordered_map<string, int> curBadClass;
231 | 			if(id_preIDRank_map.count(curId) > 0){
232 | 				string rank = id_preIDRank_map[curId].second;
233 | 				curBadClass.insert({rank, curId});
234 | 				curBadClass[rank] = curId;
235 | 			}
236 | 			while(id_preIDRank_map.count(curId) > 0 && curId != 1){
237 | 				curId = id_preIDRank_map[curId].first;
238 | 				string rank = id_preIDRank_map[curId].second;
239 | 				if(rank == "no rank"){
240 | 					if(!getGenus){
241 | 						curBadClass.insert({rank, curId});
242 | 						curBadClass[rank] = curId;
243 | 					}
244 | 				}
245 | 				else if(rank == "genus"){
246 | 					getGenus = true;
247 | 				}
248 | 				curBadClass.insert({rank, curId});
249 | 				curBadClass[rank] = curId;
250 | 			}
251 | 			badClassArr.push_back(curBadClass);
252 | 			unordered_map<string, int>().swap(curBadClass);
253 | 		}
254 | 
255 | 	}
256 | 	ifs1.close();
257 | 	ofs0.close();
258 | 	ofs1.close();
259 | 	ofs2.close();
260 | 	
261 | 
262 | 	cerr << "finished" << endl;
263 |   return 0;
264 | }
265 | 
266 | void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
267 | 	assert(args.size() == descriptions.size());
268 | 	cerr << endl;
269 | 	cerr << "example: " << example << endl;
270 | 	cerr << endl;
271 | 	cerr << "source file path: " << pwd << endl;
272 | 	cerr << endl;
273 | 	cerr << "dependency: " << dependency << endl;
274 | 	cerr << endl;
275 | 	cerr << "run as: ";
276 | 	for(int i = 0; i < args.size(); i++){
277 | 		cerr << args[i] << ' ';
278 | 	}
279 | 	cerr << endl;
280 | 	for(int i = 0; i < args.size(); i++){
281 | 		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str());
282 | 	}
283 | }
284 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/calLabel.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
  2 |  * Data: 2022/6/9
  3 |  * 
  4 |  */
  5 | #include <iostream>
  6 | #include <stdlib.h>
  7 | #include <string>
  8 | #include <cassert>
  9 | #include <fstream>
 10 | #include <vector>
 11 | #include <sstream>
 12 | #include <cstdio>
 13 | #include <algorithm>
 14 | #include <unordered_set>
 15 | #include <unordered_map>
 16 | #include <sys/sysinfo.h>
 17 | #include <omp.h>
 18 | #include "groundTruth.h"
 19 | 
 20 | 
 21 | using namespace std;
 22 | 
// Tally of one ground-truth label inside a single cluster.
struct LabNum{
	int label;  // ground-truth label (taxid) found in the cluster
	int number; // how many members of the cluster carry that label
};
 27 | 
// Current owner of a label in updateLabel's global label table.
struct GlobalLabelInfo{
	int clustId;     // index of the cluster that currently holds the label
	int labelNumber; // member count with which that cluster claimed the label
};
 32 | 
// Position and size of one cluster in the flattened label sequence.
struct PosNum{
	int startPos;  // running total of the sizes of all earlier clusters
	int clustSize; // number of labelled members in this cluster
};
 37 | 
// An id paired with a count; cmpIdNum orders these by descending count.
// NOTE(review): no other use is visible in this part of the file -- confirm
// what `id` identifies before relying on it.
struct IdNum{
	int id;
	int number;
};
 42 | 
 43 | bool cmpLabNum(LabNum ln1, LabNum ln2){
 44 | 	return ln1.number > ln2.number;
 45 | }
 46 | 
 47 | bool cmpIdNum(IdNum in1, IdNum in2){
 48 | 	return in1.number > in2.number;
 49 | }
 50 | 
 51 | void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);
 52 | 
 53 | int updateLabel(vector< vector<LabNum> > &labNumArr, unordered_map<int, GlobalLabelInfo> &globalMap, int clustId, int &badLabel, vector<int> &resLabelArr);
 54 | 
 55 | void calLabelFile(string groundTruth, string clustFile, string labelFile);
 56 | 
 57 | void calLabelSequence(string groundTruth, string clustFile, string labelFile);
 58 | 
 59 | int main(int argc , char *argv[]){
 60 | 	string application = argv[0];
 61 | 	vector<string> args, descriptions;
 62 | 	args.push_back(application);
 63 | 	descriptions.push_back("the application name");
 64 | 
 65 | 	//========= parameters need changing ========
 66 | 	//The example is with parameters of specific numbers.
 67 | 	//args is the tutorial names.
 68 | 	string pwd = "RabbitTClust/benchmark/evaluation/src";
 69 | 	string dependency = "None";
 70 | 	string example = application + " bacteria.groundTruth -l bacteria.mst.clust bacteria.mst.label";
 71 | 	args.push_back("groundTruth");
 72 | 	args.push_back("sketchOption");
 73 | 	args.push_back("clustFile");
 74 | 	args.push_back("labelFile");
 75 | 	descriptions.push_back("input file, groundTruth file, <accession, taxid, organismName> per line(first line is header)");
 76 | 	descriptions.push_back("input option, sketch options, -l or -i, -l means sketchByFile, -i means sketchBySequence");
 77 | 	descriptions.push_back("input file, cluster result file need to be labeled");
 78 | 	descriptions.push_back("output file, label file according the groundTruth");
 79 | 
 80 | 	//-------- no changing -----------
 81 | 	assert(args.size() == descriptions.size());
 82 |   if(argc != args.size()) {
 83 | 		printInfo(pwd, dependency, example, args, descriptions);
 84 |     return 1;
 85 |   }
 86 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 87 | 	{
 88 | 		printInfo(pwd, dependency, example, args, descriptions);
 89 | 		return 1;
 90 | 	}
 91 | 
 92 | 	//======== specific implement ========
 93 | 	string groundTruth = argv[1];
 94 | 	string option = argv[2];
 95 | 	string clustFile = argv[3];
 96 | 	string labelFile = argv[4];
 97 | 
 98 | 	bool sketchByFile;
 99 | 	if(option == "-l")	sketchByFile = true;
100 | 	else if(option == "-i") sketchByFile = false;
101 | 	else{
102 | 		cerr << "error option: " << option << ", need -l or -i " << endl;
103 | 		return 1;
104 | 	}
105 | 	if(sketchByFile){
106 | 		calLabelFile(groundTruth, clustFile, labelFile);
107 | 	}
108 | 	else{
109 | 		calLabelSequence(groundTruth, clustFile, labelFile);
110 | 	}
111 | 
112 |   return 0;
113 | }
114 | void calLabelSequence(string groundTruth, string clustFile, string labelFile){
115 | 	//--------for groundTruth--------------
116 | 	unordered_map<string, int> seqName_taxid_map;
117 | 	unordered_map<int, string> taxid_organismName_map;
118 | 	getGroundTruthBySequence(groundTruth, seqName_taxid_map, taxid_organismName_map);
119 | 
120 | 	//--------for cluster file--------------------------
121 | 	vector<int> ourClust;
122 | 	vector<int> standardClust;
123 | 	unordered_map<string, int> standardMap;
124 | 	unordered_map<int, int> curMap;
125 | 	vector<vector<LabNum>> labNumArr;
126 | 	vector<PosNum> posArr;
127 | 	int startPos = 0;
128 | 	string line;
129 | 	
130 | 	int numNotInGroundTruth = 0;
131 | 	ifstream ifs1(clustFile);
132 | 	if(!ifs1){
133 | 		cerr << "error open: " << clustFile << endl;
134 | 		exit(1);
135 | 	}
136 | 	while(getline(ifs1, line)){
137 | 		if(line[0] != '\t'){
138 | 			if(curMap.size() != 0){
139 | 				int clustSize = 0;
140 | 				vector<LabNum> curClustInfo;
141 | 				for(auto x : curMap){
142 | 					LabNum ln;
143 | 					ln.label = x.first;
144 | 					ln.number = x.second;
145 | 					curClustInfo.push_back(ln);
146 | 					clustSize += x.second;
147 | 				}
148 | 				std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
149 | 				labNumArr.push_back(curClustInfo);
150 | 				PosNum pn;
151 | 				pn.startPos = startPos;
152 | 				pn.clustSize = clustSize;
153 | 				posArr.push_back(pn);
154 | 				startPos += clustSize;
155 | 				unordered_map<int, int>().swap(curMap);
156 | 			}
157 | 		}
158 | 		else{
159 | 			stringstream ss;
160 | 			ss << line;
161 | 			int curId, genomeId;
162 | 			string genomeSize, fileName, genomeName;
163 | 			ss >> curId >> genomeId >> genomeSize >> genomeName;
164 | 			string key = genomeName;
165 | 			if(seqName_taxid_map.find(key) == seqName_taxid_map.end()){
166 | 				numNotInGroundTruth++;
167 | 				continue;
168 | 			}
169 | 			else{
170 | 				int curLabel = seqName_taxid_map[key];
171 | 				standardClust.push_back(curLabel);
172 | 				curMap.insert({curLabel, 0});
173 | 				curMap[curLabel]++;
174 | 			}
175 | 		}
176 | 	}
177 | 	if(curMap.size() != 0){
178 | 		int clustSize = 0;
179 | 		vector<LabNum> curClustInfo;
180 | 		for(auto x : curMap){
181 | 			LabNum ln;
182 | 			ln.label = x.first;
183 | 			ln.number = x.second;
184 | 			curClustInfo.push_back(ln);
185 | 			clustSize += x.second;
186 | 		}
187 | 		std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
188 | 		labNumArr.push_back(curClustInfo);
189 | 		PosNum pn;
190 | 		pn.startPos = startPos;
191 | 		pn.clustSize = clustSize;
192 | 		posArr.push_back(pn);
193 | 		startPos += clustSize;
194 | 		unordered_map<int, int>().swap(curMap);
195 | 	}
196 | 
197 | 	//-------------for update labels------------------------------------
198 | 	unordered_map<int, GlobalLabelInfo> globalMap;
199 | 	int badLabel = -1;
200 | 	int clustNumber = labNumArr.size();
201 | 	vector<int> resLabelArr;
202 | 	resLabelArr.resize(clustNumber); 
203 | 	for(int i = 0; i < clustNumber; i++)
204 | 	{
205 | 		badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 
206 | 	}
207 | 	for(int i = 0; i < posArr.size(); i++)
208 | 	{
209 | 		int startPos = posArr[i].startPos;
210 | 		int clustSize = posArr[i].clustSize;
211 | 		for(int j = 0; j < clustSize; j++)
212 | 		{
213 | 			ourClust.push_back(resLabelArr[i]);
214 | 		}
215 | 	}
216 | 	cerr << "the number of which not in the groundTruth is: " << numNotInGroundTruth << endl;
217 | 	cerr << "the size of ourClust is: " << ourClust.size() << endl;
218 | 	cerr << "the size of standardClust is: " << standardClust.size() << endl;
219 | 	if(ourClust.size() != standardClust.size())
220 | 	{
221 | 		cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl;
222 | 		return;
223 | 	}
224 | 
225 | 	//--------------------for output labels-------------------------------------
226 | 	ofstream ofs(labelFile);
227 | 	for(int i = 0; i < ourClust.size(); i++)
228 | 		ofs << ourClust[i] << ' ';
229 | 	ofs << endl;
230 | 	for(int i = 0; i < standardClust.size(); i++)
231 | 		ofs << standardClust[i] << ' ';
232 | 	ofs << endl;
233 | 	ofs.close();
234 | 
235 | 	ofstream ofs1(labelFile+".humanReadable");
236 | 	for(int i = 0; i < ourClust.size(); i++)
237 | 	{
238 | 		ofs1 << ourClust[i] << '\t' << standardClust[i] << endl;
239 | 	}
240 | 	ofs1.close();
241 | }
242 | 
243 | void calLabelFile(string groundTruth, string clustFile, string labelFile){
244 | 	//--------for groundTruth--------------
245 | 	unordered_map<string, int> accession_taxid_map;
246 | 	unordered_map<int, string> taxid_organismName_map;
247 | 	getGroundTruthByFile(groundTruth, accession_taxid_map, taxid_organismName_map);
248 | 
249 | 	//--------for cluster file--------------------------
250 | 	vector<int> ourClust;
251 | 	vector<int> standardClust;
252 | 	unordered_map<string, int> standardMap;
253 | 	unordered_map<int, int> curMap;
254 | 	vector<vector<LabNum>> labNumArr;
255 | 	vector<PosNum> posArr;
256 | 	int startPos = 0;
257 | 	
258 | 	int numNotInGroundTruth = 0;
259 | 	ifstream ifs1(clustFile);
260 | 	if(!ifs1){
261 | 		cerr << "error open: " << clustFile << endl;
262 | 		exit(1);
263 | 	}
264 | 	string line;
265 | 	while(getline(ifs1, line)){
266 | 		if(line[0] != '\t'){
267 | 			if(curMap.size() != 0){
268 | 				int clustSize = 0;
269 | 				vector<LabNum> curClustInfo;
270 | 				for(auto x : curMap){
271 | 					LabNum ln;
272 | 					ln.label = x.first;
273 | 					ln.number = x.second;
274 | 					curClustInfo.push_back(ln);
275 | 					clustSize += x.second;
276 | 				}
277 | 				std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
278 | 				labNumArr.push_back(curClustInfo);
279 | 				PosNum pn;
280 | 				pn.startPos = startPos;
281 | 				pn.clustSize = clustSize;
282 | 				posArr.push_back(pn);
283 | 				startPos += clustSize;
284 | 				unordered_map<int, int>().swap(curMap);
285 | 			}
286 | 		}
287 | 		else{
288 | 			stringstream ss;
289 | 			ss << line;
290 | 			int curId, genomeId;
291 | 			string genomeSize, fileName, genomeName;
292 | 			ss >> curId >> genomeId >> genomeSize >> fileName >> genomeName;
293 | 			int startIndex = fileName.find_last_of('/');
294 | 			int endIndex = fileName.find_first_of('_', startIndex+5);
295 | 			if(endIndex == -1)	endIndex = fileName.find('.', startIndex+5);
296 | 			string key = fileName.substr(startIndex+1, endIndex-startIndex-1);
297 | 			if(accession_taxid_map.find(key) == accession_taxid_map.end()){
298 | 				numNotInGroundTruth++;
299 | 				continue;
300 | 			}
301 | 			else{
302 | 				int curLabel = accession_taxid_map[key];
303 | 				standardClust.push_back(curLabel);
304 | 				curMap.insert({curLabel, 0});
305 | 				curMap[curLabel]++;
306 | 			}
307 | 		}
308 | 	}
309 | 	if(curMap.size() != 0){
310 | 		int clustSize = 0;
311 | 		vector<LabNum> curClustInfo;
312 | 		for(auto x : curMap){
313 | 			LabNum ln;
314 | 			ln.label = x.first;
315 | 			ln.number = x.second;
316 | 			curClustInfo.push_back(ln);
317 | 			clustSize += x.second;
318 | 		}
319 | 		std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
320 | 		labNumArr.push_back(curClustInfo);
321 | 		PosNum pn;
322 | 		pn.startPos = startPos;
323 | 		pn.clustSize = clustSize;
324 | 		posArr.push_back(pn);
325 | 		startPos += clustSize;
326 | 		unordered_map<int, int>().swap(curMap);
327 | 	}
328 | 
329 | 	//-------------for update labels------------------------------------
330 | 	unordered_map<int, GlobalLabelInfo> globalMap;
331 | 	int badLabel = -1;
332 | 	int clustNumber = labNumArr.size();
333 | 	vector<int> resLabelArr;
334 | 	resLabelArr.resize(clustNumber); 
335 | 	for(int i = 0; i < clustNumber; i++)
336 | 	{
337 | 		badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 
338 | 	}
339 | 	for(int i = 0; i < posArr.size(); i++)
340 | 	{
341 | 		int startPos = posArr[i].startPos;
342 | 		int clustSize = posArr[i].clustSize;
343 | 		for(int j = 0; j < clustSize; j++)
344 | 		{
345 | 			ourClust.push_back(resLabelArr[i]);
346 | 		}
347 | 	}
348 | 	cerr << "the number of which not in the groundTruth is: " << numNotInGroundTruth << endl;
349 | 	cerr << "the size of ourClust is: " << ourClust.size() << endl;
350 | 	cerr << "the size of standardClust is: " << standardClust.size() << endl;
351 | 	if(ourClust.size() != standardClust.size())
352 | 	{
353 | 		cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl;
354 | 		return;
355 | 	}
356 | 
357 | 	//--------------------for output labels-------------------------------------
358 | 	ofstream ofs(labelFile);
359 | 	for(int i = 0; i < ourClust.size(); i++)
360 | 		ofs << ourClust[i] << ' ';
361 | 	ofs << endl;
362 | 	for(int i = 0; i < standardClust.size(); i++)
363 | 		ofs << standardClust[i] << ' ';
364 | 	ofs << endl;
365 | 	ofs.close();
366 | 
367 | 	//ofstream ofs1(labelFile+".humanReadable");
368 | 	//for(int i = 0; i < ourClust.size(); i++)
369 | 	//{
370 | 	//	ofs1 << ourClust[i] << '\t' << standardClust[i] << endl;
371 | 	//}
372 | 	//ofs1.close();
373 | 
374 | }
375 | 
376 | int updateLabel(vector< vector<LabNum> > &labNumArr, unordered_map<int, GlobalLabelInfo> &globalMap, int clustId, int &badLabel, vector<int> &resLabelArr)//return the new badLabel
377 | {
378 | 	bool isBad = true;
379 | 	while(labNumArr[clustId].size() != 0 && isBad)
380 | 	{
381 | 		int curLabel = labNumArr[clustId][0].label;
382 | 		int curNumber = labNumArr[clustId][0].number;
383 | 		if(globalMap.find(curLabel) == globalMap.end())//new label 
384 | 		{
385 | 			GlobalLabelInfo glab;
386 | 			glab.clustId = clustId;
387 | 			glab.labelNumber = curNumber;
388 | 			globalMap.insert({curLabel, glab});
389 | 			resLabelArr[clustId] = curLabel;
390 | 			isBad = false;
391 | 		}
392 | 		else//label collison with previous cluster
393 | 		{
394 | 			int preClustId = globalMap[curLabel].clustId;
395 | 			int preNumber = globalMap[curLabel].labelNumber;
396 | 			if(curNumber > preNumber)//the previous cluster is defeated, need to update the previous cluster.
397 | 			{
398 | 				resLabelArr[clustId] = curLabel;
399 | 				isBad = false;
400 | 				globalMap[curLabel].clustId = clustId;
401 | 				globalMap[curLabel].labelNumber = curNumber;
402 | 				badLabel = updateLabel(labNumArr, globalMap, preClustId, badLabel, resLabelArr);
403 | 			}
404 | 			else//current cluster can not defeat the previous cluster, just erase the biggest label to try new biggest label
405 | 			{}
406 | 		}
407 | 
408 | 		labNumArr[clustId].erase(labNumArr[clustId].begin());//erase the biggest label in this cluster
409 | 	}//end while
410 | 	if(isBad)
411 | 	{
412 | 		resLabelArr[clustId] = badLabel;
413 | 		badLabel--;//update the newBadLabel
414 | 	}
415 | 	return badLabel;
416 | }
417 | 
418 | 
/* Write the usage text of a tool to standard error.
 *
 * pwd          - path of the tool's source file, printed as-is
 * dependency   - free-text dependency note
 * example      - a sample command line
 * args         - argument names, in command-line order
 * descriptions - one description per entry of args (asserted)
 */
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
	assert(args.size() == descriptions.size());
	const size_t argCount = args.size();

	cerr << endl
	     << "example: " << example << endl
	     << endl
	     << "source file path: " << pwd << endl
	     << endl
	     << "dependency: " << dependency << endl
	     << endl;

	// one-line synopsis
	cerr << "run as: ";
	for(size_t pos = 0; pos < argCount; ++pos)
		cerr << args[pos] << ' ';
	cerr << endl;

	// per-argument explanation, numbered from 0
	for(size_t pos = 0; pos < argCount; ++pos){
		const char *argName = args[pos].c_str();
		const char *argText = descriptions[pos].c_str();
		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", (int)pos, argName, argText);
	}
}
437 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/checkTaxonomyStatus.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
  2 |  * Data: 2022/8/3
  3 |  *
  4 |  * 
  5 |  */
  6 | #include <iostream>
  7 | #include <stdlib.h>
  8 | #include <string>
  9 | #include <cassert>
 10 | #include <fstream>
 11 | #include <vector>
 12 | #include <sstream>
 13 | #include <cstdio>
 14 | #include <algorithm>
 15 | #include <unordered_set>
 16 | #include <unordered_map>
 17 | #include <sys/sysinfo.h>
 18 | #include <omp.h>
 19 | #include <set>
 20 | #include <math.h>
 21 | #include <boost/algorithm/string/classification.hpp>
 22 | #include <boost/algorithm/string/split.hpp>
 23 | #include <zlib.h>
 24 | 
 25 | 
 26 | 
 27 | using namespace std;
 28 | 
// Prints the usage text (example, source path, dependency, argument list) to stderr.
// args and descriptions must have the same length; defined at the end of this file.
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);
 30 | 
 31 | int main(int argc , char *argv[]){
 32 | 	string application = argv[0];
 33 | 	vector<string> args, descriptions;
 34 | 	args.push_back(application);
 35 | 	descriptions.push_back("the application name");
 36 | 
 37 | 	//========= parameters need changing ========
 38 | 	//The example is with parameters of specific numbers.
 39 | 	//args is the tutorial names.
 40 | 	string pwd = "RabbitTClust/benchmark/evaluation/src/checkTaxonomyStatus.cpp";
 41 | 	string dependency = "None";
 42 | 	string example = application + " ANI_report_prokaryotes.txt greedy.ans output";
 43 | 	args.push_back("ANI_report_prokaryotes.txt");
 44 | 	args.push_back("greedy.ans");
 45 | 	args.push_back("output");
 46 | 	descriptions.push_back("input file, the ANI_report_prokaryotes file, <genbank-accession, species-taxid, best-match-species-taxid, best-match-status, excluded-from-refseq> per line");
 47 | 	descriptions.push_back("input file, the analysis from the analysisPurity, <index(+/-), genbank-accession, species-taxid, no-rank, genus-taxid, family-taxid, order-taxid> per line");
 48 | 	descriptions.push_back("output file, the output file of result");
 49 | 
 50 | 	//-------- no changing -----------
 51 | 	assert(args.size() == descriptions.size());
 52 |   if(argc != args.size()) {
 53 | 		printInfo(pwd, dependency, example, args, descriptions);
 54 |     return 1;
 55 |   }
 56 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 57 | 	{
 58 | 		printInfo(pwd, dependency, example, args, descriptions);
 59 | 		return 1;
 60 | 	}
 61 | 
 62 | 	//======== specific implement ========
 63 | 	string aniFile = argv[1];
 64 | 	string anaFile = argv[2];
 65 | 	string outputFile = argv[3];
 66 | 
 67 | 	string line;
 68 | 	unordered_set<string> accessionSet;
 69 | 	unordered_map<string, int> accSpeciesTaxidMap;
 70 | 	unordered_map<string, string> accExcludedFromRefseqMap;
 71 | 	unordered_map<string, int> accBestMatchSpeciesTaxidMap;
 72 | 	unordered_map<string, string> accBestMatchStatusMap;
 73 | 	unordered_map<string, double> accQcoverageMap;
 74 | 	unordered_map<string, double> accScoverageMap;
 75 | 	ifstream ifs0(aniFile);
 76 | 	getline(ifs0, line);//header line
 77 | 	int index = 0;
 78 | 	int calNumBestMatchStatus = 0;
 79 | 	int calNumExcludedFromRefseq = 0;
 80 | 	while(getline(ifs0, line)){
 81 | 		string accession, excluded_from_refseq, best_match_status;
 82 | 		int species_taxid,  best_match_species_taxid;
 83 | 		double qcoverage, scoverage;
 84 | 		vector<string> vstr;
 85 | 		boost::split(vstr, line, boost::is_any_of("\t"), boost::token_compress_on);
 86 | 		accession = vstr[0];
 87 | 		species_taxid = vstr[1] != "na" ? stoi(vstr[1]) : 0;
 88 | 		best_match_species_taxid = vstr[2] != "na" ? stoi(vstr[2]) : 0;
 89 | 		best_match_status = vstr[3];
 90 | 		excluded_from_refseq = vstr[4];
 91 | 		qcoverage = vstr[5] != "na" ? stod(vstr[5]) : 0.0;
 92 | 		scoverage = vstr[6] != "na" ? stod(vstr[6]) : 0.0;
 93 | 		if(best_match_status == "species-match")	calNumBestMatchStatus++;
 94 | 		if(excluded_from_refseq == "na"){
 95 | 			calNumExcludedFromRefseq++;
 96 | 			//cout << accession << endl;
 97 | 		}
 98 | 
 99 | 		accSpeciesTaxidMap.insert({accession, species_taxid});
100 | 		accExcludedFromRefseqMap.insert({accession, excluded_from_refseq});
101 | 		accBestMatchSpeciesTaxidMap.insert({accession, best_match_species_taxid});
102 | 		accBestMatchStatusMap.insert({accession, best_match_status});
103 | 		accQcoverageMap.insert({accession, qcoverage});
104 | 		accScoverageMap.insert({accession, scoverage});
105 | 		accessionSet.insert({accession});
106 | 	}
107 | 	ifs0.close();
108 | 	cerr << "the size of accSpeciesTaxidMap is: " << accSpeciesTaxidMap.size() << endl;
109 | 	cerr << "the best_match_status of species_match is: " << calNumBestMatchStatus << ", the percent is: " << (double)calNumBestMatchStatus / accSpeciesTaxidMap.size() << endl;
110 | 	cerr << "the excluded_from_refseq of na is: " << calNumExcludedFromRefseq << ", the percent is: " << (double)calNumExcludedFromRefseq / accSpeciesTaxidMap.size() << endl;
111 | 	//exit(0);
112 | 
113 | 	ifstream ifs1(anaFile);
114 | 	string outputFile0 = outputFile + ".species_taxid.check";
115 | 	string outputFile1 = outputFile + ".best_match_species_taxid.check";
116 | 	string outputFile2 = outputFile + ".exclude_from_refseq.check";
117 | 	string outputFile3 = outputFile + ".best_match_status.check";
118 | 	string outputFile4 = outputFile + ".perfect.check";
119 | 	string outputFile5 = outputFile + ".coverage.check";
120 | 	ofstream ofs0(outputFile0);
121 | 	ofstream ofs1(outputFile1);
122 | 	ofstream ofs2(outputFile2);
123 | 	ofstream ofs3(outputFile3);
124 | 	ofstream ofs4(outputFile4);
125 | 	ofstream ofs5(outputFile5);
126 | 	ofs0 << "label\taccession\tassembly_taxid\ttaxonomy_taxid" << endl;
127 | 	ofs1 << "label\taccession\tassembly_taxid\tbest_match_species_taxid" << endl;
128 | 	ofs2 << "label\taccession\texclude_from_refseq" << endl;
129 | 	ofs3 << "label\taccession\tbest_match_status" << endl;
130 | 	ofs4 << "label\taccession\tassembly_taxid" << endl;
131 | 	ofs5 << "label\taccession\tqcoverage\tscoverage" << endl;
132 | 	getline(ifs1, line);//header line
133 | 	int numNotInTaxonomy = 0;
134 | 	int totalNumber_rep = 0;
135 | 	int totalNumber_bad = 0;
136 | 	int perfectNum_rep = 0;
137 | 	int perfectNum_bad = 0;
138 | 	int numNotEqualTaxid_rep = 0;
139 | 	int numNotEqualTaxid_bad = 0;
140 | 	int numNotEqualBestMatch_rep = 0;
141 | 	int numNotEqualBestMatch_bad = 0;
142 | 	int numNotBestMatchSpeciesLevel_rep = 0;
143 | 	int numNotBestMatchSpeciesLevel_bad = 0;
144 | 	int numExclude_from_refSeq_rep = 0;
145 | 	int numExclude_from_refSeq_bad = 0;
146 | 
147 | 	vector<string> matchStatusArr;
148 | 	matchStatusArr.push_back("species-match");
149 | 	matchStatusArr.push_back("subspecies-match");
150 | 	matchStatusArr.push_back("synonym-match");
151 | 	matchStatusArr.push_back("derived-species-match");
152 | 	matchStatusArr.push_back("genus-match");
153 | 	matchStatusArr.push_back("approved-mismatch");
154 | 	matchStatusArr.push_back("mismatch");
155 | 	matchStatusArr.push_back("below-threshold-match");
156 | 	matchStatusArr.push_back("below-threshold-mismatch");
157 | 	matchStatusArr.push_back("low-coverage");
158 | 	
159 | 	vector<int> numMatchStatusRepArr;
160 | 	vector<int> numMatchStatusBadArr;
161 | 	for(int i = 0; i < 10; i++){
162 | 		numMatchStatusRepArr.push_back(0);
163 | 		numMatchStatusBadArr.push_back(0);
164 | 	}
165 | 
166 | 	//int rep_numSpeciesMatch = 0;
167 | 	//int rep_numSubSpeciesMatch = 0;
168 | 	//int rep_numSynonymMatch = 0;
169 | 	//int rep_numDerivedSpeciesMatch = 0;
170 | 	//int rep_numGenusMatch = 0;
171 | 	//int rep_numApprovedMismatch = 0;
172 | 	//int rep_numMismatch_rep = 0;
173 | 	//int rep_numBelowThresholdMatch = 0;
174 | 	//int rep_numBelowThresholdMismatch = 0;
175 | 	//int rep_numLowCoverage = 0;
176 | 
177 | 	//int bad_numSpeciesMatch = 0;
178 | 	//int bad_numSubSpeciesMatch = 0;
179 | 	//int bad_numSynonymMatch = 0;
180 | 	//int bad_numDerivedSpeciesMatch = 0;
181 | 	//int bad_numGenusMatch = 0;
182 | 	//int bad_numApprovedMismatch = 0;
183 | 	//int bad_numMismatch = 0;
184 | 	//int bad_numBelowThresholdMatch = 0;
185 | 	//int bad_numBelowThresholdMismatch = 0;
186 | 	//int bad_numLowCoverage = 0;
187 | 
188 | 	while(getline(ifs1, line)){
189 | 		if(line.length() == 0){
190 | 			ofs0 << endl;
191 | 			ofs1 << endl;
192 | 			ofs2 << endl;
193 | 			ofs3 << endl;
194 | 			//ofs4 << endl;
195 | 			ofs5 << endl;
196 | 			//cout << endl;
197 | 			continue;
198 | 		}
199 | 		string index, accession;
200 | 		int species, no_rank, genus, family, order;
201 | 		stringstream ss;
202 | 		ss << line;
203 | 		ss >> index >> accession >> species >> no_rank >> genus >> family >> order;
204 | 		if(accessionSet.count(accession) == 0){
205 | 			//cerr << "the accession is not in the taxonomy file" << endl;
206 | 			//cout << line << endl;
207 | 			numNotInTaxonomy++;
208 | 			continue;
209 | 		}
210 | 
211 | 		int taxSID = accSpeciesTaxidMap[accession];
212 | 		int taxBMID = accBestMatchSpeciesTaxidMap[accession];
213 | 		string taxEFR = accExcludedFromRefseqMap[accession];
214 | 		string taxBMS = accBestMatchStatusMap[accession];
215 | 		double qcoverage = accQcoverageMap[accession];
216 | 		double scoverage = accScoverageMap[accession];
217 | 		if(index == "+"){
218 | 			totalNumber_rep++;
219 | 			if(species != taxSID) numNotEqualTaxid_rep++;
220 | 			if(taxSID != taxBMID) numNotEqualBestMatch_rep++;
221 | 			if(taxEFR != "na") numExclude_from_refSeq_rep++;
222 | 			if(taxBMS != "species-match") numNotBestMatchSpeciesLevel_rep++;
223 | 
224 | 			for(int i = 0; i < 10; i++){
225 | 				if(taxBMS == matchStatusArr[i])	numMatchStatusRepArr[i]++;
226 | 			}
227 | 
228 | 			if(species == taxSID && taxSID == taxBMID && taxEFR == "na" && taxBMS == "species-match"){
229 | 				perfectNum_rep++;
230 | 				ofs4 << line << endl;
231 | 			}
232 | 		}
233 | 		else{
234 | 			totalNumber_bad++;
235 | 			if(species != taxSID) numNotEqualTaxid_bad++;
236 | 			if(taxSID != taxBMID) numNotEqualBestMatch_bad++;
237 | 			if(taxEFR != "na") numExclude_from_refSeq_bad++;
238 | 			if(taxBMS != "species-match") numNotBestMatchSpeciesLevel_bad++;
239 | 			
240 | 			for(int i = 0; i < 10; i++){
241 | 				if(taxBMS == matchStatusArr[i])	numMatchStatusBadArr[i]++;
242 | 			}
243 | 
244 | 			//if(taxBMS == "mismatch"|| taxBMS == "below-threshold-match" || taxBMS == "below-threshold-mismatch" || taxBMS == "low-coverage") numNotBestMatchSpeciesLevel_bad++;
245 | 			//if(species == taxSID && taxSID == taxBMID && taxEFR == "na" && taxBMS == "species-match"){
246 | 			//if(species == taxSID &&  taxEFR == "na" && taxBMS == "species-match"){
247 | 			if(species == taxBMID && taxEFR == "na"){
248 | 				perfectNum_bad++;
249 | 				ofs4 << line << endl;
250 | 			}
251 | 		}
252 | 
253 | 		ofs0 << index << '\t' << accession << '\t' << species << '\t' << taxSID << endl;
254 | 		ofs1 << index << '\t' << accession << '\t' << species << '\t' << taxBMID << endl;
255 | 		ofs2 << index << '\t' << accession << '\t' << taxEFR << endl;
256 | 		ofs3 << index << '\t' << accession << '\t' << taxBMS << endl;
257 | 		ofs5 << index << '\t' << accession << '\t' << qcoverage << '\t' << scoverage << endl;
258 | 	}
259 | 	ofs0.close();
260 | 	ofs1.close();
261 | 	ofs2.close();
262 | 	ofs3.close();
263 | 	ofs5.close();
264 | 
265 | 	cerr << "finished" << endl;
266 | 	cerr << "the number not in the taxonomy is: " << numNotInTaxonomy << endl;
267 | 	cerr << "for representative genomes, the total number is: " << totalNumber_rep << endl;
268 | 	cerr << "\tthe numNotEqualtaxid of assembly-summary and taxonomy is: " << numNotEqualTaxid_rep << endl;
269 | 	cerr << "\t\tthe percentange is: " << (double)numNotEqualTaxid_rep / totalNumber_rep << endl;
270 | 	cerr << "\tthe numNotEqualBestMatch of species-taxid and best-species-taxid is: " << numNotEqualBestMatch_rep << endl;
271 | 	cerr << "\t\tthe percentange is: " << (double)numNotEqualBestMatch_rep / totalNumber_rep << endl;
272 | 	cerr << "\tthe numExclude_from_refSeq_rep is: " << numExclude_from_refSeq_rep << endl;
273 | 	cerr << "\t\tthe percentange is: " << (double)numExclude_from_refSeq_rep / totalNumber_rep << endl;
274 | 	cerr << "\tthe numNotBestMatchSpeciesLevel_rep is: " << numNotBestMatchSpeciesLevel_rep << endl;
275 | 	cerr << "\t\tthe percentange is: " << (double)numNotBestMatchSpeciesLevel_rep / totalNumber_rep << endl;
276 | 	cerr << "\tthe perfectNum_rep is: " << perfectNum_rep << endl;
277 | 	cerr << "\t\tthe percentange is: " << (double)perfectNum_rep / totalNumber_rep << endl;
278 | 
279 | 	cerr << "for bad genomes, the total number is: " << totalNumber_bad << endl;
280 | 	cerr << "\tthe numNotEqualtaxid of assembly-summary and taxonomy is: " << numNotEqualTaxid_bad << endl;
281 | 	cerr << "\t\tthe percentange is: " << (double)numNotEqualTaxid_bad / totalNumber_bad << endl;
282 | 	cerr << "\tthe numNotEqualBestMatch of species-taxid and best-species-taxid is: " << numNotEqualBestMatch_bad << endl;
283 | 	cerr << "\t\tthe percentange is: " << (double)numNotEqualBestMatch_bad / totalNumber_bad << endl;
284 | 	cerr << "\tthe numExclude_from_refSeq_bad is: " << numExclude_from_refSeq_bad << endl;
285 | 	cerr << "\t\tthe percentange is: " << (double)numExclude_from_refSeq_bad / totalNumber_bad << endl;
286 | 	cerr << "\tthe numNotBestMatchSpeciesLevel_bad is: " << numNotBestMatchSpeciesLevel_bad << endl;
287 | 	cerr << "\t\tthe percentange is: " << (double)numNotBestMatchSpeciesLevel_bad / totalNumber_bad << endl;
288 | 	cerr << "\tthe perfectNum_bad is: " << perfectNum_bad << endl;
289 | 	cerr << "\t\tthe percentange is: " << (double)perfectNum_bad / totalNumber_bad << endl;
290 | 
291 | 
292 | 	cerr << "====================================================================================" << endl;
293 | 	int tmpGoodTotalNumber = 0;
294 | 	for(int i = 0; i < 10; i++){
295 | 		cerr << "the number of rep " << matchStatusArr[i] << " is: " << numMatchStatusRepArr[i] << ", and percent is: " << (double)numMatchStatusRepArr[i] / totalNumber_rep << endl;
296 | 		tmpGoodTotalNumber += numMatchStatusRepArr[i];
297 | 	}
298 | 	cerr << "the total good number is: " << tmpGoodTotalNumber << endl;
299 | 
300 | 	cerr << "====================================================================================" << endl;
301 | 	int tmpBadTotalNumber = 0;	
302 | 	for(int i = 0; i < 10; i++){
303 | 		cerr << "the number of bad " << matchStatusArr[i] << " is: " << numMatchStatusBadArr[i] << ", and percent is: " << (double)numMatchStatusBadArr[i] / totalNumber_bad << endl;
304 | 		tmpBadTotalNumber += numMatchStatusBadArr[i];
305 | 	}
306 | 	cerr << "the total bad number is: " << tmpBadTotalNumber << endl;
307 | 
308 | 
309 |   return 0;
310 | }
311 | 
/* Write the usage text of this tool to standard error.
 *
 * pwd          - path of the tool's source file, printed as-is
 * dependency   - free-text dependency note
 * example      - a sample command line
 * args         - argument names, in command-line order
 * descriptions - one description per entry of args (asserted)
 */
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
	assert(args.size() == descriptions.size());
	const size_t argCount = args.size();

	cerr << endl
	     << "example: " << example << endl
	     << endl
	     << "source file path: " << pwd << endl
	     << endl
	     << "dependency: " << dependency << endl
	     << endl;

	// one-line synopsis
	cerr << "run as: ";
	for(size_t pos = 0; pos < argCount; ++pos)
		cerr << args[pos] << ' ';
	cerr << endl;

	// per-argument explanation, numbered from 0
	for(size_t pos = 0; pos < argCount; ++pos){
		const char *argName = args[pos].c_str();
		const char *argText = descriptions[pos].c_str();
		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", (int)pos, argName, argText);
	}
}
330 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/getRepresentativeList.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
  2 |  * Data: 2022/7/16
  3 |  *
  4 |  * 
  5 |  */
  6 | #include <iostream>
  7 | #include <stdlib.h>
  8 | #include <string>
  9 | #include <cassert>
 10 | #include <fstream>
 11 | #include <vector>
 12 | #include <sstream>
 13 | #include <cstdio>
 14 | #include <algorithm>
 15 | #include <unordered_set>
 16 | #include <unordered_map>
 17 | #include <sys/sysinfo.h>
 18 | #include <omp.h>
 19 | #include <set>
 20 | #include <math.h>
 21 | #include <zlib.h>
 22 | 
 23 | 
 24 | using namespace std;
 25 | 
// Prints the usage text (example, source path, dependency, argument list) to stderr.
// args and descriptions must have the same length; defined at the end of this file.
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);
 27 | 
 28 | int main(int argc , char *argv[]){
 29 | 	string application = argv[0];
 30 | 	vector<string> args, descriptions;
 31 | 	args.push_back(application);
 32 | 	descriptions.push_back("the application name");
 33 | 
 34 | 	//========= parameters need changing ========
 35 | 	//The example is with parameters of specific numbers.
 36 | 	//args is the tutorial names.
 37 | 	string pwd = "RabbitTClust/benchmark/evaluation/src/getRepresentativeList.cpp";
 38 | 	string dependency = "None";
 39 | 	string example = application + " -l bacteria.greedy.clust bacteria_representative.list";
 40 | 	args.push_back("-i/-l");
 41 | 	args.push_back("clustFile");
 42 | 	args.push_back("representative_list");
 43 | 	descriptions.push_back("input parameter, sketch parameter for the cluster file, -i means sketchBySequence, -l means sketchByFile");
 44 | 	descriptions.push_back("input file, the cluster result from RabbitTClust");
 45 | 	descriptions.push_back("output file, the representative list of genomes file or sequences");
 46 | 
 47 | 	//-------- no changing -----------
 48 | 	assert(args.size() == descriptions.size());
 49 |   if(argc != args.size()) {
 50 | 		printInfo(pwd, dependency, example, args, descriptions);
 51 |     return 1;
 52 |   }
 53 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 54 | 	{
 55 | 		printInfo(pwd, dependency, example, args, descriptions);
 56 | 		return 1;
 57 | 	}
 58 | 
 59 | 	//======== specific implement ========
 60 | 	string option = argv[1];
 61 | 	if(option != "-l" && option != "-i"){
 62 | 		cerr << "error option: " << option << ", need -l or -i option" << endl;
 63 | 		return 1;
 64 | 	}
 65 | 
 66 | 	string clustFile = argv[2];
 67 | 	string outputFile = argv[3];
 68 | 
 69 | 	ifstream ifs(clustFile);
 70 | 	string line;
 71 | 	ofstream ofs(outputFile);
 72 | 	bool isClust = false;
 73 | 	while(getline(ifs, line)){
 74 | 		if(line[0] != '\t'){
 75 | 			isClust = true;
 76 | 		}
 77 | 		else if(isClust){
 78 | 			isClust = false;
 79 | 			stringstream ss;
 80 | 			int curId, globalId;
 81 | 			string length, fileName, seqName, comment, tmpComment;
 82 | 			ss << line;
 83 | 			if(option == "-l"){
 84 | 				ss >> curId >> globalId >> length >> fileName >> seqName;
 85 | 				ofs << fileName << endl;
 86 | 			}
 87 | 			else if(option == "-i"){
 88 | 				ss >> curId >> globalId >> length >> seqName;
 89 | 				ofs << seqName << endl;
 90 | 			}
 91 | 		}
 92 | 		
 93 | 	}
 94 | 	ifs.close();
 95 | 
 96 |   return 0;
 97 | }
 98 | 
 99 | void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
100 | 	assert(args.size() == descriptions.size());
101 | 	cerr << endl;
102 | 	cerr << "example: " << example << endl;
103 | 	cerr << endl;
104 | 	cerr << "source file path: " << pwd << endl;
105 | 	cerr << endl;
106 | 	cerr << "dependency: " << dependency << endl;
107 | 	cerr << endl;
108 | 	cerr << "run as: ";
109 | 	for(int i = 0; i < args.size(); i++){
110 | 		cerr << args[i] << ' ';
111 | 	}
112 | 	cerr << endl;
113 | 	for(int i = 0; i < args.size(); i++){
114 | 		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str());
115 | 	}
116 | }
117 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/groundTruth.cpp:
--------------------------------------------------------------------------------
 1 | #include "groundTruth.h"
 2 | #include <sstream>
 3 | 
 4 | void getGroundTruthBySequence(string groundTruth, unordered_map<string, int>& seqName_taxid_map, unordered_map<int, string>& taxid_organismName_map){
 5 | 
 6 | 	ifstream ifs0(groundTruth);
 7 | 	if(!ifs0){
 8 | 		cerr << "error open: " << groundTruth << endl;
 9 | 		exit(1);
10 | 	}
11 | 	string line;
12 | 	getline(ifs0, line);//for the header line
13 | 	while(getline(ifs0, line)){
14 | 		stringstream ss;
15 | 		string seqName, organismName(""), tmpStr;
16 | 		int taxid;
17 | 		ss << line;
18 | 		ss >> seqName >> taxid;
19 | 		while(ss >> tmpStr){
20 | 			organismName += tmpStr + ' ';
21 | 		}
22 | 		organismName.substr(0, organismName.length()-1);
23 | 		seqName_taxid_map.insert({seqName, taxid});
24 | 		taxid_organismName_map.insert({taxid, organismName});
25 | 	}
26 | 	ifs0.close();
27 | }
28 | 
/* Load a genome-file-level ground-truth table.
 *
 * The first line of the file is skipped as a header. Every following line is
 * read as whitespace-separated "<accession> <taxid> <organism name words...>".
 *
 * groundTruth            - path of the table
 * accession_taxid_map    - receives accession -> taxid
 * taxid_organismName_map - receives taxid -> organism name (words joined by a
 *                          single space, no trailing space)
 *
 * Both maps are filled with insert(), so the first entry for a key is kept.
 * Blank lines are skipped. If the taxid field is missing or not a number the
 * accession is recorded with taxid 0 and an empty organism name.
 * Prints an error and exits with status 1 when the file cannot be opened.
 */
void getGroundTruthByFile(string groundTruth, unordered_map<string, int>& accession_taxid_map, unordered_map<int, string>& taxid_organismName_map){

	ifstream ifs0(groundTruth);
	if(!ifs0){
		cerr << "error open: " << groundTruth << endl;
		exit(1);
	}
	string line;
	getline(ifs0, line);//for the header line
	while(getline(ifs0, line)){
		istringstream ss(line);
		string accession, organismName, tmpStr;
		int taxid = 0;
		if(!(ss >> accession))	continue;//blank line: nothing to record
		ss >> taxid;//on failure taxid stays 0 and the word loop below reads nothing
		while(ss >> tmpStr){
			// separator goes in front of every word but the first, so no trailing space
			if(!organismName.empty())	organismName += ' ';
			organismName += tmpStr;
		}
		accession_taxid_map.insert({accession, taxid});
		taxid_organismName_map.insert({taxid, organismName});
	}
	ifs0.close();
}
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/groundTruth.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <unordered_map>
 3 | #include <string>
 4 | #include <fstream>
 5 | 
 6 | using namespace std;
 7 | 
// Reads the ground-truth table at path `groundTruth` (first line is a header; each
// further line starts with a sequence name followed by a taxid and the organism name)
// and fills seqName_taxid_map (sequence name -> taxid) and taxid_organismName_map
// (taxid -> organism name). Exits with status 1 if the file cannot be opened.
void getGroundTruthBySequence(string groundTruth, unordered_map<string, int>& seqName_taxid_map, unordered_map<int, string>& taxid_organismName_map);
 9 | 
10 | 
// Reads the ground-truth table at path `groundTruth` (first line is a header; each
// further line starts with an accession followed by a taxid and the organism name)
// and fills accession_taxid_map (accession -> taxid) and taxid_organismName_map
// (taxid -> organism name). Exits with status 1 if the file cannot be opened.
void getGroundTruthByFile(string groundTruth, unordered_map<string, int>& accession_taxid_map, unordered_map<int, string>& taxid_organismName_map);
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
 35 | #ifdef USE_MALLOC_WRAPPERS
 36 | #  include "malloc_wrap.h"
 37 | #endif
 38 | 
 39 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 40 | #define KS_SEP_TAB   1 // isspace() && !' '
 41 | #define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
 42 | #define KS_SEP_MAX   2
 43 | 
/* Declares kstream_t, a buffered reader state over a handle of type `type_t`:
 *   buf        - read buffer (allocated by ks_init)
 *   begin, end - the unread bytes of buf are [begin, end)
 *   is_eof     - set once the read function has returned 0
 *   f          - the underlying handle passed to the read function */
#define __KS_TYPE(type_t)						\
	typedef struct __kstream_t {				\
		unsigned char *buf;						\
		int begin, end, is_eof;					\
		type_t f;								\
	} kstream_t;
 50 | 
/* ks_eof: non-zero when the stream hit end-of-file AND the buffer is fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* ks_rewind: clears is_eof and empties the buffer; it does not touch the handle f itself. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 53 | 
/* Defines the constructor/destructor pair for kstream_t.
 *   ks_init(f)     - allocates a zeroed kstream_t plus a buffer of __bufsize bytes
 *                    and stores f. The calloc/malloc results are not checked.
 *   ks_destroy(ks) - frees the buffer and the struct (NULL is accepted); the
 *                    handle f is left untouched. */
#define __KS_BASIC(type_t, __bufsize)								\
	static inline kstream_t *ks_init(type_t f)						\
	{																\
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
		ks->f = f;													\
		ks->buf = (unsigned char*)malloc(__bufsize);				\
		return ks;													\
	}																\
	static inline void ks_destroy(kstream_t *ks)					\
	{																\
		if (ks) {													\
			free(ks->buf);											\
			free(ks);												\
		}															\
	}
 69 | 
/* Defines ks_getc(ks): returns the next byte of the stream as an int, or -1 at
 * end-of-file. When the buffer is exhausted it is refilled with
 * __read(f, buf, __bufsize); a return of 0 from __read marks end-of-file.
 * NOTE(review): only `== 0` is tested, so a negative (error) return from __read
 * is stored in `end` and not reported as an error - confirm whether the read
 * function used with this header can return a negative value. */
#define __KS_GETC(__read, __bufsize)						\
	static inline int ks_getc(kstream_t *ks)				\
	{														\
		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
		if (ks->begin >= ks->end) {							\
			ks->begin = 0;									\
			ks->end = __read(ks->f, ks->buf, __bufsize);	\
			if (ks->end == 0) { ks->is_eof = 1; return -1;}	\
		}													\
		return (int)ks->buf[ks->begin++];					\
	}
 81 | 
/* Growable string used by the readers below: s is the buffer, l the number of
 * characters currently stored and m the allocated size of s. The KSTRING_T
 * guard skips this definition when KSTRING_T is already defined. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
 89 | 
/* kroundup32(x): rounds x up, in place, to a power of two by smearing the highest
 * set bit of x-1 into all lower bits and adding 1 (shifts cover 32 bits).
 * x must be an lvalue; it is modified and evaluated several times. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 93 | 
/* Defines ks_getuntil2() and its wrapper ks_getuntil().
 *
 * ks_getuntil2(ks, delimiter, str, dret, append) copies bytes from the stream
 * into str until a delimiter is met, refilling the buffer as needed:
 *   delimiter - KS_SEP_SPACE / KS_SEP_TAB / KS_SEP_LINE, or a literal character
 *               code when the value is greater than KS_SEP_MAX
 *   str       - destination; grown with realloc (size rounded by kroundup32)
 *   dret      - if non-NULL, receives the delimiter byte that ended the read,
 *               or 0 when the read ended at end-of-file
 *   append    - non-zero keeps the existing content of str and appends to it
 * The delimiter itself is consumed but not stored. For KS_SEP_LINE a trailing
 * '\r' is dropped when the string is longer than one character. The result is
 * always NUL-terminated.
 * Returns the length of str, or -1 if nothing was read and the stream is at EOF.
 * NOTE(review): as in ks_getc, a negative return from __read is not handled here.
 *
 * ks_getuntil(ks, delimiter, str, dret) is ks_getuntil2 with append = 0. */
#define __KS_GETUNTIL(__read, __bufsize)								\
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{																	\
		int gotany = 0;													\
		if (dret) *dret = 0;											\
		str->l = append? str->l : 0;									\
		for (;;) {														\
			int i;														\
			if (ks->begin >= ks->end) {									\
				if (!ks->is_eof) {										\
					ks->begin = 0;										\
					ks->end = __read(ks->f, ks->buf, __bufsize);		\
					if (ks->end == 0) { ks->is_eof = 1; break; }		\
				} else break;											\
			}															\
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (ks->buf[i] == delimiter) break;					\
			} else if (delimiter == KS_SEP_SPACE) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i])) break;						\
			} else if (delimiter == KS_SEP_TAB) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */						\
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
				str->m = str->l + (i - ks->begin) + 1;					\
				kroundup32(str->m);										\
				str->s = (char*)realloc(str->s, str->m);				\
			}															\
			gotany = 1;													\
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin);							\
			ks->begin = i + 1;											\
			if (i < ks->end) {											\
				if (dret) *dret = ks->buf[i];							\
				break;													\
			}															\
		}																\
		if (!gotany && ks_eof(ks)) return -1;							\
		if (str->s == 0) {												\
			str->m = 1;													\
			str->s = (char*)calloc(1, 1);								\
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0';											\
		return str->l;													\
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
146 | 
/* KSTREAM_INIT(type_t, __read, __bufsize): instantiates the whole buffered-stream
 * layer for one handle type - the kstream_t struct, ks_init/ks_destroy, ks_getc
 * and ks_getuntil/ks_getuntil2 - using __read as the refill function and
 * __bufsize as the buffer size. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t)							\
	__KS_BASIC(type_t, __bufsize)				\
	__KS_GETC(__read, __bufsize)				\
	__KS_GETUNTIL(__read, __bufsize)
152 | 
/* kseq_rewind(ks): resets the parser state of a kseq_t - clears last_char and the
 * EOF flag / buffer positions of its stream. The underlying handle is not repositioned here. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
154 | 
/* Defines the constructor/destructor pair for kseq_t; SCOPE is pasted in front
 * of each definition (storage/linkage specifiers).
 *   kseq_init(fd)    - allocates a zeroed kseq_t and wraps fd in a new kstream_t.
 *                      The calloc result is not checked.
 *   kseq_destroy(ks) - frees the name, comment, seq and qual buffers, the stream
 *                      and the struct (NULL is accepted); fd itself is left untouched. */
#define __KSEQ_BASIC(SCOPE, type_t)										\
	SCOPE kseq_t *kseq_init(type_t fd)									\
	{																	\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
		s->f = ks_init(fd);												\
		return s;														\
	}																	\
	SCOPE void kseq_destroy(kseq_t *ks)									\
	{																	\
		if (!ks) return;												\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f);												\
		free(ks);														\
	}
169 | 
/* Define kseq_read(seq) with linkage SCOPE: read the next FASTA/FASTQ record into seq.
   The header name goes to seq->name, the rest of the header line (if any) to seq->comment,
   the sequence lines are concatenated into seq->seq and, for FASTQ, the quality into seq->qual.
   comment.l, seq.l and qual.l are reset to 0 on every call, but comment.s is only rewritten
   when the header actually has a comment, so callers should rely on comment.l rather than
   on comment.s alone.
   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
216 | 
/* Declare kseq_t, the per-record parser state:
 *   name, comment, seq, qual  string buffers filled by kseq_read
 *   last_char                 header character ('>' or '@') already consumed by the previous call, or 0
 *   f                         the buffered input stream
 * The type_t argument is not used by the expansion.
 */
#define __KSEQ_TYPE(type_t)						\
	typedef struct {							\
		kstring_t name, comment, seq, qual;		\
		int last_char;							\
		kstream_t *f;							\
	} kseq_t;
223 | 
/* Instantiate the whole reader for one handle type: the stream layer with a 16384-byte
 * buffer size argument, the kseq_t type, and kseq_init/kseq_destroy/kseq_read defined
 * with linkage SCOPE. */
#define KSEQ_INIT2(SCOPE, type_t, __read)		\
	KSTREAM_INIT(type_t, __read, 16384)			\
	__KSEQ_TYPE(type_t)							\
	__KSEQ_BASIC(SCOPE, type_t)					\
	__KSEQ_READ(SCOPE)
229 | 
/* Convenience form of KSEQ_INIT2 that gives the generated functions `static` linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
231 | 
/* Declaration-only counterpart of KSEQ_INIT2: emits the stream and kseq_t types plus
 * prototypes for kseq_init, kseq_destroy and kseq_read, without defining the functions. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
238 | 
239 | #endif
240 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/mapGenome.cpp:
--------------------------------------------------------------------------------
/* A genome file may contain more than one sequence.
 * mapGenome.cpp checks whether all sequences within a genome file have the same nomenclature type in their comment (description) field.
 * The results show that all sequences within the same genome file have the same nomenclature type.
 *
 *
 */
  7 | 
  8 | #include <iostream>
  9 | #include <fstream>
 10 | #include <string>
 11 | #include "kseq.h"
 12 | #include <zlib.h>
 13 | #include <sstream>
 14 | #include <unordered_map>
 15 | #include <vector>
 16 | 
 17 | 
 18 | KSEQ_INIT(gzFile, gzread);
 19 | using namespace std;
// One nomenclature key found in a genome file together with its occurrence count.
struct Type{
	string type;	// the "type0<TAB>type1" key built from the sequence comment
	int number;		// how many sequences of the file produced this key
};
 24 | 
 25 | 
 26 | int main(int argc, char *argv[]){
 27 | 	if(argc < 2) return 1;	
 28 | 	string inputFile = argv[1];
 29 | 	string outputFile = "mapType.out";
 30 | 	fstream fs(inputFile);
 31 | 	string line;
 32 | 	vector<string> fileList;
 33 | 
 34 | 	while(getline(fs, line))
 35 | 	{
 36 | 		fileList.push_back(line);
 37 | 	}
 38 | 	cout << "the size of fileList: " << fileList.size() << endl;
 39 | 
 40 | 	vector<Type> genomeType[fileList.size()];
 41 | 	FILE *fp = fopen(outputFile.c_str(), "w");
 42 | 	int tmpIndex = 0;
 43 | 	int subIndex = fileList.size() / 100;
 44 | 	
 45 | 	//#pragma omp parallel for num_threads(48)
 46 | 	for(int i = 0; i < fileList.size(); i++)
 47 | 	{
 48 | 		if(i > tmpIndex * subIndex)
 49 | 		{
 50 | 			tmpIndex++;
 51 | 			cerr << tmpIndex << endl;
 52 | 		}
 53 | 		gzFile fp1 = gzopen(fileList[i].c_str(), "r");
 54 | 		kseq_t * ks1;
 55 | 		ks1 = kseq_init(fp1);
 56 | 		//cerr << "read the file " << fileList[i] << endl;
 57 | 		unordered_map<string, int> genomeMap;
 58 | 		while(1)
 59 | 		{
 60 | 			int length = kseq_read(ks1);
 61 | 			if(length < 0) break;
 62 | 			string name = ks1->name.s;
 63 | 			string comment = ks1->comment.s;
 64 | 			stringstream ss;	
 65 | 			ss << comment;
 66 | 			string type0, type1, type2;
 67 | 			ss >> type0 >> type1 >> type2;
 68 | 			if(type0.substr(0, 10) == "UNVERIFIED")
 69 | 			{
 70 | 				type0 = type1;
 71 | 				type1 = type2;
 72 | 			}
 73 | 			if(type0.back() == ',') type0.pop_back();
 74 | 			if(type1.back() == ',') type1.pop_back();
 75 | 
 76 | 			string key = type0 + '\t' + type1;
 77 | 			genomeMap.insert({key, 0});
 78 | 			genomeMap[key]++;
 79 | 		}
 80 | 		gzclose(fp1);
 81 | 		kseq_destroy(ks1);
 82 | 		if(genomeMap.size() != 1)
 83 | 		{
 84 | 			cerr << "there are not only one class in the file: " << fileList[i] << endl;
 85 | 			for(auto x : genomeMap)
 86 | 			{
 87 | 				cerr << "\t" << x.first << "\t" << x.second << endl;
 88 | 			}
 89 | 		}
 90 | 
 91 | 		for(auto x : genomeMap)
 92 | 		{
 93 | 			//cout << x.first << "\t" << x.second << endl;
 94 | 			fprintf(fp, "%s\t%d\n", x.first.c_str(), x.second);
 95 | 			genomeType[i].push_back({x.first, x.second});
 96 | 		}
 97 | 		fprintf(fp, "\n");
 98 | 		unordered_map<string, int>().swap(genomeMap);
 99 | 	}
100 | 	cerr << "finished" << endl;
101 | 
102 | 	return 0;
103 | }
104 | 


--------------------------------------------------------------------------------
/benchmark/evaluation/src/precalLabel.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu 
  2 |  * Email: xiaoming.xu@mail.sdu.edu.cn
  3 |  * Data: 2022/2/18
  4 |  *
  5 |  * calF1.cpp is used as preprocessing of the evaluation of precision, recall, F1-score, and NMI for bacteria, refseq, half-Bacteria, sub-Bacteria datasets.
  6 |  * The ground truth labels of genomes are as species_taxid which reveals nomenclature of gene feature.
  7 |  * The parameter -i and -l corresponding to the cluster of genomes served as sequences and files.
  8 |  * The input cluster files are in the CD-HIT format.
  9 |  *
 10 |  *
 11 |  */
 12 | 
 13 | 
 14 | #include <iostream>
 15 | #include <fstream>
 16 | #include <string>
 17 | #include <vector>
 18 | #include <unordered_map>
 19 | #include <sstream>
 20 | #include <algorithm>
 21 | #include <cstdlib>
 22 | #include <cstdio>
 23 | #include <unordered_set>
 24 | 
 25 | using namespace std;
// A ground-truth label and the number of members of one cluster that carry it.
struct LabNum{
	int label;
	int number;
};

// Current owner of a label in the global assignment: the cluster that holds it
// and the member count with which it claimed the label.
struct GlobalLabelInfo{
	int clustId;
	int labelNumber;
};

// Position of a cluster in the flattened member order: offset of its first
// member and the number of members it contributes.
struct PosNum{
	int startPos;
	int clustSize;
};

// An identifier paired with a count (ordered by cmpIdNum).
struct IdNum{
	int id;
	int number;
};
 45 | 
 46 | bool cmpLabNum(LabNum ln1, LabNum ln2){
 47 | 	return ln1.number > ln2.number;
 48 | }
 49 | 
 50 | bool cmpIdNum(IdNum in1, IdNum in2){
 51 | 	return in1.number > in2.number;
 52 | }
 53 | 
 54 | inline void printInfo()
 55 | {
 56 | 	cerr << "run with: ./calLabel RabbitTClust -l(-i) groundTruth bacteria.out bacteria.f1" << endl;
 57 | 	cerr << "The second argument (RabbitTClust) is applications, including RabbitTClust, MeshClust2, MeshClust3, gclust or Mothur " << endl;
 58 | 	cerr << "For the third argument, -l means genomes served as files, -i means genomes served as sequences" << endl;
 59 | 	cerr << "The fourth argument (groundTruth) is the ground truth from assembly_bacteria.txt of the <assembly_accession genomeName species_taxid> " << endl;
 60 | 	cerr << "The fifth argument (bacteria.out) is the cluster result from RabbitTClust, MeshClust2, gclust or Mothur " << endl;
 61 | 	cerr << "The sixth argument (bacteria.f1) is the output file path" << endl;
 62 | }
 63 | 
 64 | /* The output result is th resLabelArr with size of cluster number, each element is the label for the cluster.
 65 |  */
 66 | int updateLabel(vector< vector<LabNum> > &labNumArr, unordered_map<int, GlobalLabelInfo> &globalMap, int clustId, int &badLabel, vector<int> &resLabelArr)//return the new badLabel
 67 | {
 68 | 	bool isBad = true;
 69 | 	while(labNumArr[clustId].size() != 0 && isBad)
 70 | 	{
 71 | 		int curLabel = labNumArr[clustId][0].label;
 72 | 		int curNumber = labNumArr[clustId][0].number;
 73 | 		if(globalMap.find(curLabel) == globalMap.end())//new label 
 74 | 		{
 75 | 			GlobalLabelInfo glab;
 76 | 			glab.clustId = clustId;
 77 | 			glab.labelNumber = curNumber;
 78 | 			globalMap.insert({curLabel, glab});
 79 | 			resLabelArr[clustId] = curLabel;
 80 | 			isBad = false;
 81 | 		}
 82 | 		else//label collison with previous cluster
 83 | 		{
 84 | 			int preClustId = globalMap[curLabel].clustId;
 85 | 			int preNumber = globalMap[curLabel].labelNumber;
 86 | 			if(curNumber > preNumber)//the previous cluster is defeated, need to update the previous cluster.
 87 | 			{
 88 | 				resLabelArr[clustId] = curLabel;
 89 | 				isBad = false;
 90 | 				globalMap[curLabel].clustId = clustId;
 91 | 				globalMap[curLabel].labelNumber = curNumber;
 92 | 				badLabel = updateLabel(labNumArr, globalMap, preClustId, badLabel, resLabelArr);
 93 | 			}
 94 | 			else//current cluster can not defeat the previous cluster, just erase the biggest label to try new biggest label
 95 | 			{}
 96 | 		}
 97 | 
 98 | 		labNumArr[clustId].erase(labNumArr[clustId].begin());//erase the biggest label in this cluster
 99 | 	}//end while
100 | 	if(isBad)
101 | 	{
102 | 		resLabelArr[clustId] = badLabel;
103 | 		badLabel--;//update the newBadLabel
104 | 	}
105 | 	return badLabel;
106 | }
107 | 
108 | void calF1(string application, string argument, string groundTruth, string inputFile, string outputFile)
109 | {	
110 | 	if(application != "MeshClust3" && application != "MeshClust2" && application != "RabbitTClust" && application != "Mothur" && application != "gclust")
111 | 	{
112 | 		printInfo();
113 | 		return;
114 | 	}
115 | 	ofstream ofs(outputFile);
116 | 	ofstream ofs1(outputFile+".humanReadable");
117 | 
118 | 	fstream fs0(groundTruth);
119 | 	string line;
120 | 
121 | 	unordered_map<string, int> groundTruthMapFile;
122 | 	unordered_map<string, int> groundTruthMapSeq;
123 | 	unordered_set<int> groundTruthClustNumber;
124 | 
125 | 	getline(fs0,line);
126 | 	while(getline(fs0, line))
127 | 	{
128 | 		string assembly_accession, genomeName, species_taxid;
129 | 		stringstream ss;
130 | 		ss << line;
131 | 		//ss >> assembly_accession >> genomeName >> species_taxid;
132 | 		ss >> assembly_accession >> species_taxid >> genomeName;
133 | 		//cerr << species_taxid << endl;
134 | 		groundTruthMapFile.insert({assembly_accession, stoi(species_taxid)});
135 | 		groundTruthMapSeq.insert({genomeName, stoi(species_taxid)});
136 | 		groundTruthClustNumber.insert(stoi(species_taxid));
137 | 
138 | 	}
139 | 	cerr << "the groundTruthClustNumber size(not for this dataset) is: " << groundTruthClustNumber.size() << endl;
140 | 
141 | 	
142 | 	fstream fs(inputFile);
143 | 
144 | 	int curStandardIndex = 0;
145 | 	vector<int> ourClust;
146 | 	vector<int> standardClust;
147 | 	unordered_map<string, int> standardMap;
148 | 	unordered_map<int, int> curMap;
149 | 
150 | 	int startPos = 0;
151 | 	vector< vector<LabNum> > labNumArr;
152 | 	vector<PosNum> posArr;
153 | 
154 | 	int numNotIngroundTruth = 0;
155 | 
156 | 	if(application == "MeshClust3")
157 | 	{
158 | 		int curId;
159 | 		string genomeSize, genomeName, fileName;
160 | 		while(getline(fs, line))
161 | 		{
162 | 			if(line.length() == 0)//finish a cluster
163 | 			{
164 | 				if(curMap.size() != 0)
165 | 				{
166 | 					int clustSize = 0;
167 | 					vector<LabNum> curClustInfo;
168 | 					for(auto x : curMap)
169 | 					{
170 | 						LabNum ln;
171 | 						ln.label = x.first;
172 | 						ln.number = x.second;
173 | 						curClustInfo.push_back(ln);
174 | 						clustSize += x.second;
175 | 					}
176 | 					std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
177 | 					labNumArr.push_back(curClustInfo);
178 | 
179 | 					PosNum pn;
180 | 					pn.startPos = startPos;
181 | 					pn.clustSize = clustSize;
182 | 					posArr.push_back(pn);
183 | 
184 | 					startPos += clustSize;
185 | 
186 | 					unordered_map<int, int>().swap(curMap);
187 | 				}
188 | 			}
189 | 			else
190 | 			{
191 | 				stringstream ss;
192 | 				ss << line;
193 | 				ss >> curId >> genomeName;
194 | 				genomeName = genomeName.substr(1);
195 | 				if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end())
196 | 				{
197 | 					//cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl;
198 | 					numNotIngroundTruth++;
199 | 					continue;
200 | 				}
201 | 				else
202 | 				{
203 | 					int curLabel = groundTruthMapSeq[genomeName];
204 | 					standardClust.push_back(curLabel);
205 | 					curMap.insert({curLabel, 0});
206 | 					curMap[curLabel]++;
207 | 				}
208 | 			}
209 | 		}
210 | 	}
211 | 	else
212 | 	{
213 | 
214 | 		while(getline(fs, line))
215 | 		{
216 | 			if(line.length() == 0) continue;
217 | 			if(application == "MeshClust2")
218 | 			{
219 | 				if(line[0] == '>')
220 | 				{
221 | 					if(curMap.size() != 0)
222 | 					{
223 | 						int clustSize = 0;
224 | 						vector<LabNum> curClustInfo;
225 | 
226 | 						for(auto x : curMap)
227 | 						{
228 | 							LabNum ln;
229 | 							ln.label = x.first;
230 | 							ln.number = x.second;
231 | 							curClustInfo.push_back(ln);
232 | 							clustSize += x.second;
233 | 						}
234 | 						std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
235 | 						labNumArr.push_back(curClustInfo);
236 | 
237 | 						PosNum pn;
238 | 						pn.startPos = startPos;
239 | 						pn.clustSize = clustSize;
240 | 						posArr.push_back(pn);
241 | 
242 | 						startPos += clustSize;
243 | 
244 | 						unordered_map<int, int>().swap(curMap);
245 | 					}
246 | 				}
247 | 				else{
248 | 					stringstream ss;
249 | 					ss << line;
250 | 					int curId, genomeId;
251 | 					string genomeSize, fileName, genomeName;
252 | 					string type0, type1, type2;
253 | 					if(argument == "-l")
254 | 						ss >> curId >> fileName >>genomeSize >> genomeName >> type0 >> type1 >> type2;
255 | 					else if(argument == "-i")
256 | 						ss >> curId >>genomeSize >> genomeName >> type0 >> type1 >> type2;
257 | 					else
258 | 					{
259 | 						cerr << "error argument, need -l or -i " << endl;
260 | 						printInfo();
261 | 						return;
262 | 					}
263 | 
264 | 					genomeName = genomeName.substr(1);
265 | 					if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end())
266 | 					{
267 | 						//cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl;
268 | 						numNotIngroundTruth++;
269 | 						continue;
270 | 					}
271 | 					else
272 | 					{
273 | 						int curLabel = groundTruthMapSeq[genomeName];
274 | 						standardClust.push_back(curLabel);
275 | 						curMap.insert({curLabel, 0});
276 | 						curMap[curLabel]++;
277 | 					}
278 | 
279 | 				}
280 | 			}//end MeshClust2
281 | 			else //other application(RabbitTClust, gclust, Mothur)
282 | 			{
283 | 				if(line[0] != '\t')
284 | 				{
285 | 					if(curMap.size() != 0)
286 | 					{
287 | 						int clustSize = 0;
288 | 						vector<LabNum> curClustInfo;
289 | 
290 | 						for(auto x : curMap)
291 | 						{
292 | 							LabNum ln;
293 | 							ln.label = x.first;
294 | 							ln.number = x.second;
295 | 							curClustInfo.push_back(ln);
296 | 							clustSize += x.second;
297 | 						}
298 | 						std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
299 | 						labNumArr.push_back(curClustInfo);
300 | 
301 | 						PosNum pn;
302 | 						pn.startPos = startPos;
303 | 						pn.clustSize = clustSize;
304 | 						posArr.push_back(pn);
305 | 
306 | 						startPos += clustSize;
307 | 
308 | 						unordered_map<int, int>().swap(curMap);
309 | 					}
310 | 				}
311 | 				else{
312 | 					stringstream ss;
313 | 					ss << line;
314 | 					int curId, genomeId;
315 | 					string genomeSize, fileName, genomeName;
316 | 					string type0, type1, type2;
317 | 					if(application == "RabbitTClust")
318 | 					{
319 | 						if(argument == "-l")
320 | 						{
321 | 							ss >> curId >> genomeId >> genomeSize >> fileName >> genomeName >> type0 >> type1 >> type2;
322 | 							int startIndex = fileName.find_last_of('/');
323 | 							//cerr << fileName << endl;
324 | 							//cerr << startIndex << endl;
325 | 							int endIndex = fileName.find('_', startIndex + 5);
326 | 							if(fileName.find('_', startIndex+5) == -1)
327 | 								endIndex = fileName.find('.', startIndex+5);
328 | 							//int endIndex = std::max(fileName.find('_', startIndex + 5), fileName.find('.', startIndex + 5));
329 | 							//cerr << endIndex << endl;
330 | 							string key = fileName.substr(startIndex+1, endIndex -startIndex -1);
331 | 							//cerr << key << endl;
332 | 							//exit(0);
333 | 							if(groundTruthMapFile.find(key) == groundTruthMapFile.end())
334 | 							{
335 | 								//cerr << "the key: " << key << " is not in the groundTruth!" << endl;
336 | 								numNotIngroundTruth++;
337 | 								continue;//skip this label
338 | 							}
339 | 							else
340 | 							{
341 | 								int curLabel = groundTruthMapFile[key];
342 | 								standardClust.push_back(curLabel);
343 | 								curMap.insert({curLabel, 0});
344 | 								curMap[curLabel]++;
345 | 							}
346 | 						}
347 | 						else if(argument == "-i")
348 | 						{
349 | 							ss >> curId >> genomeId >> genomeSize >> genomeName >> type0 >> type1 >> type2;
350 | 							if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end())
351 | 							{
352 | 								//cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl;
353 | 								numNotIngroundTruth++;
354 | 								continue;
355 | 							}
356 | 							else
357 | 							{
358 | 								int curLabel = groundTruthMapSeq[genomeName];
359 | 								standardClust.push_back(curLabel);
360 | 								curMap.insert({curLabel, 0});
361 | 								curMap[curLabel]++;
362 | 							}
363 | 						}
364 | 					}
365 | 					else if(application == "Mothur")//TODO
366 | 					{
367 | 						ss >> genomeName >> type0 >> type1 >> type2;
368 | 						if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end())
369 | 						{
370 | 							//cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl;
371 | 							numNotIngroundTruth++;
372 | 							continue;
373 | 						}
374 | 						else
375 | 						{
376 | 							int curLabel = groundTruthMapSeq[genomeName];
377 | 							standardClust.push_back(curLabel);
378 | 							curMap.insert({curLabel, 0});
379 | 							curMap[curLabel]++;
380 | 						}
381 | 					}
382 | 					else if(application == "gclust")//TODO
383 | 					{
384 | 						ss >> curId >> genomeSize >> genomeName >> type0 >> type1 >> type2;
385 | 						if(groundTruthMapSeq.find(genomeName) == groundTruthMapSeq.end())
386 | 						{
387 | 							//cerr << "the genomeName: " << genomeName << " is not in the groundTruthMapSeq!" << endl;
388 | 							numNotIngroundTruth++;
389 | 							continue;
390 | 						}
391 | 						else
392 | 						{
393 | 							int curLabel = groundTruthMapSeq[genomeName];
394 | 							standardClust.push_back(curLabel);
395 | 							curMap.insert({curLabel, 0});
396 | 							curMap[curLabel]++;
397 | 						}
398 | 					}
399 | 					else
400 | 					{
401 | 						cerr << "error application, need RabbitTClust, Mothur, gclust or MeshClust2" << endl;
402 | 						printInfo();
403 | 						return;
404 | 					}
405 | 
406 | 				}//end a cluster calculation
407 | 			}
408 | 		}//end while
409 | 	}
410 | 
411 | 	if(curMap.size() != 0)
412 | 	{
413 | 		int clustSize = 0;
414 | 		vector<LabNum> curClustInfo;
415 | 
416 | 		for(auto x : curMap)
417 | 		{
418 | 			LabNum ln;
419 | 			ln.label = x.first;
420 | 			ln.number = x.second;
421 | 			curClustInfo.push_back(ln);
422 | 			clustSize += x.second;
423 | 		}
424 | 		std::sort(curClustInfo.begin(), curClustInfo.end(), cmpLabNum);
425 | 		labNumArr.push_back(curClustInfo);
426 | 
427 | 		PosNum pn;
428 | 		pn.startPos = startPos;
429 | 		pn.clustSize = clustSize;
430 | 		posArr.push_back(pn);
431 | 
432 | 		startPos += clustSize;
433 | 
434 | 		unordered_map<int, int>().swap(curMap);
435 | 	}
436 | 
437 | 	//update the labels
438 | 	unordered_map<int, GlobalLabelInfo> globalMap;
439 | 	int badLabel = -1;
440 | 	vector<int> resLabelArr;
441 | 	int clustNumber = labNumArr.size();
442 | 	resLabelArr.resize(clustNumber); 
443 | 	for(int i = 0; i < clustNumber; i++)
444 | 	{
445 | 		badLabel = updateLabel(labNumArr, globalMap, i, badLabel, resLabelArr); 
446 | 	}
447 | 	
448 | 	//generate the result
449 | 	for(int i = 0; i < posArr.size(); i++)
450 | 	{
451 | 		int startPos = posArr[i].startPos;
452 | 		int clustSize = posArr[i].clustSize;
453 | 		for(int j = 0; j < clustSize; j++)
454 | 		{
455 | 			ourClust.push_back(resLabelArr[i]);
456 | 		}
457 | 	}
458 | 
459 | 	cerr << "the number of which not in the groundTruth is: " << numNotIngroundTruth << endl;
460 | 	cerr << "the size of ourClust is: " << ourClust.size() << endl;
461 | 	cerr << "the size of standardClust is: " << standardClust.size() << endl;
462 | 	
463 | 	if(ourClust.size() != standardClust.size())
464 | 	{
465 | 		cerr << "the size of ourClust is not equal to the standardClust, exit()" << endl;
466 | 		return;
467 | 	}
468 | 	for(int i = 0; i < ourClust.size(); i++)
469 | 	{
470 | 		ofs1 << ourClust[i] << '\t' << standardClust[i] << endl;
471 | 	}
472 | 
473 | 	for(int i = 0; i < ourClust.size(); i++)
474 | 		ofs << ourClust[i] << ' ';
475 | 	ofs << endl;
476 | 	
477 | 	for(int i = 0; i < standardClust.size(); i++)
478 | 		ofs << standardClust[i] << ' ';
479 | 	ofs << endl;
480 | 
481 | }
482 | 
483 | int main(int argc, char* argv[]){
484 | 	if(argc < 6){
485 | 		printInfo();
486 | 		return 1;
487 | 	}
488 | 	string application = argv[1];
489 | 	string argument = argv[2];
490 | 	string groundTruth = argv[3];
491 | 	string inputFile = argv[4];
492 | 	string outputFile = argv[5];
493 | 
494 | 	calF1(application, argument, groundTruth, inputFile, outputFile);
495 | 
496 | 	return 0;
497 | 
498 | }
499 | 


--------------------------------------------------------------------------------
/benchmark/generateList.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | #set -x 
 3 | 
 4 | cd ref/
 5 | resName="refList"
 6 | if [ -f "$resName" ]; then
 7 | 	echo "the file exist, remove"
 8 | 	rm $resName
 9 | fi
10 | 
11 | for dir in archaea bacteria fungi viral plant protozoa human vertebrate_mammalian vertebrate_other
12 | do
13 | 	#echo $dir
14 | 	ls $dir/*.fna.gz >$dir.gz.list
15 | 	cat $dir.gz.list | while read line
16 | 	do 
17 | 		#echo $line
18 | 		gunzip $line
19 | 	done
20 | 	rm $dir.gz.list
21 | 
22 | 	ls $dir/*.fna > $dir.list
23 | 	cat  $dir.list | while read line
24 | 	do
25 | 		echo `pwd`/$line >>$resName
26 | 	done
27 | 	rm $dir.list
28 | done
29 | 
30 | mv $resName ../
31 | cd ../
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/benchmark/simulate/Makefile:
--------------------------------------------------------------------------------
 1 | all: simulate-longSequence create-containment-bacteria
 2 | simulate-longSequence: src/simulate_longSequence.cpp
 3 | 	g++ -O3 src/simulate_longSequence.cpp -o simulate-longSequence
 4 | create-containment-bacteria: src/create_containment_bacteria.cpp
 5 | 	g++ -O3 src/create_containment_bacteria.cpp -o create-containment-bacteria -lz
 6 | 
 7 | clean:
 8 | 	rm simulate-longSequence create-containment-bacteria
 9 | 	
10 | 


--------------------------------------------------------------------------------
/benchmark/simulate/README.md:
--------------------------------------------------------------------------------
 1 | # The script for simulating genome sequence
 2 | 
 3 | ## simulate-longSequence
 4 | `simulate-longSequence` is used to generate simulated sequences with a predefined sequence length and mutation rate.
 5 | 
 6 | Example: `simulate-longSequence 10 20 300 1000000 simulate_10_20_300_1M`  
 7 | It will generate `20` clusters, each with `300` sequences with a mutation rate of `0.01` (10/1000) and an approximate length of `1,000,000`.
There will be three output files named `simulate_10_20_300_1M_seed.fna`, `simulate_10_20_300_1M_total.fna`, and `simulate_10_20_300_1M.groundTruth`, which correspond to the seed sequence file, the total sequence file, and the cluster ground truth file, respectively.
 9 | 
10 | * Run as `simulate-longSequence mutation_rate*1000(integer) numSeedSeqs numEachClusts seqLength output`
11 |   * The 0 parameter(`./simulate-longSequence`) is the application name.
12 |   * The 1 parameter(`mutation_rate*1000`) is to set the mutation rate
13 |   * The 2 parameter(`numSeedSeqs`) is the number of seed sequences (number of clusters).
14 |   * The 3 parameter(`numEachClusts`) is the sequence number in a cluster generated from each seed sequence.
15 |   * The 4 parameter(`seqLength`) is the approximate length for each sequence.
16 |   * The 5 parameter(`output`) is the prefix name for ground truth, seed sequences and total simulated sequences FASTA files.
17 | 
18 | ## create-containment-bacteria
19 | `create-containment-bacteria` is used to generate genomes by cutting random proportions ranging from 0.0 to 1.0 in the original seed bacterial genome length.
20 | 
21 | Example: `create-containment-bacteria input.list 8 50 simulatePath`  
22 | It will generate `8` clusters, each with `50` sequences. 
23 | Each simulated sequence in the cluster is generated by cutting random proportions ranging from 0.0 to 1.0 in the length of an original seed bacteria genome.
24 | The seed bacteria genomes come from the `input.list`, which contains one origin genome path per line.
The similarities between the seed genomes in the `input.list` should be low to ensure low inter-cluster similarity. 
There will be `408` genome files (8 seed genomes and 400 generated genomes) in the `simulatePath` folder.
27 | 
28 | * Run as: `./create-containment-bacteria input.list num_of_clust num_genomes_each_clust simulatePath`
29 |   * The 0 parameter(`./create-containment-bacteria`) is the application name
30 |   * The 1 parameter(`input.list`) is input parameter, genome file list, one genome path per line. 
31 |   * The 2 parameter(`num_of_clust`) is input parameter, the number of clusters
32 |   * The 3 parameter(`num_genomes_each_clust`) is input parameter, the number of genomes in each cluster
  * The 4 parameter(`simulatePath`) is the output directory; the seed genomes are copied into it and the generated genome files are written there
34 | 


--------------------------------------------------------------------------------
/benchmark/simulate/src/create_containment_bacteria.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
  2 |  * Data: 2022/5/31
  3 |  *
  4 |  * 
  5 |  */
  6 | #include <iostream>
  7 | #include <stdlib.h>
  8 | #include <string>
  9 | #include <cassert>
 10 | #include <fstream>
 11 | #include <vector>
 12 | #include <sstream>
 13 | #include <cstdio>
 14 | #include <algorithm>
 15 | #include <unordered_set>
 16 | #include <unordered_map>
 17 | #include <sys/sysinfo.h>
 18 | #include <omp.h>
 19 | #include <set>
 20 | #include <math.h>
 21 | #include <boost/algorithm/string/classification.hpp>
 22 | #include <boost/algorithm/string/split.hpp>
 23 | #include <zlib.h>
 24 | #include "kseq.h"
 25 | 
 26 | 
 27 | KSEQ_INIT(gzFile, gzread);
 28 | 
 29 | using namespace std;
 30 | 
 31 | void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions);
 32 | 
 33 | int main(int argc , char *argv[]){
 34 | 	string application = argv[0];
 35 | 	vector<string> args, descriptions;
 36 | 	args.push_back(application);
 37 | 	descriptions.push_back("the application name");
 38 | 
 39 | 	//========= parameters need changing ========
 40 | 	//The example is with parameters of specific numbers.
 41 | 	//args is the tutorial names.
 42 | 	string pwd = "RabbitTClust/benchmark/simulate/src/create_containment_bacteria.cpp";
 43 | 	string dependency = "kseq.h";
 44 | 	string example = application + " simulateList num_of_clust num_genomes_each_clust simulatePath";
 45 | 	args.push_back("simulateList");
 46 | 	args.push_back("num_of_clust");
 47 | 	args.push_back("num_genomes_each_clust");
 48 | 	args.push_back("simulatePath");
 49 | 	descriptions.push_back("input file, genome file list, one genome path per line");
 50 | 	descriptions.push_back("input parameter, the number of clusters");
 51 | 	descriptions.push_back("input parameter, the number of genomes in each cluster");
 52 | 	descriptions.push_back("output path, the output file path");
 53 | 
 54 | 	//-------- no changing -----------
 55 | 	assert(args.size() == descriptions.size());
 56 |   if(argc != args.size()) {
 57 | 		printInfo(pwd, dependency, example, args, descriptions);
 58 |     return 1;
 59 |   }
 60 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 61 | 	{
 62 | 		printInfo(pwd, dependency, example, args, descriptions);
 63 | 		return 1;
 64 | 	}
 65 | 
 66 | 	//======== specific implement ========
 67 | 	string inputList = argv[1];
 68 | 	int numClusts = stoi(argv[2]);
 69 | 	int numGenomePerClust = stoi(argv[3]);
 70 | 	string outputPath = argv[4];
 71 | 	string cmd0 = "mkdir -p " + outputPath;
 72 | 	system(cmd0.c_str());
 73 | 
 74 | 	//get randoms
 75 | 	vector<double> randArr;
 76 | 	for(int i = 0; i < numClusts* numGenomePerClust; i++)
 77 | 	{
 78 | 		int randNumber = rand() % 1000;
 79 | 		randArr.push_back((double)randNumber/1000);
 80 | 	}
 81 | 
 82 | 	fstream fs0(inputList);
 83 | 	unordered_set<string> seedSet;
 84 | 	vector<string> seedArr;
 85 | 	string line;
 86 | 	
 87 | 	while(getline(fs0, line))
 88 | 	{
 89 | 		int startIndex = line.find('/');
 90 | 		int endIndex = line.find('.');
 91 | 		string key = line.substr(startIndex + 1, endIndex-startIndex-1);
 92 | 		if(seedSet.find(key) == seedSet.end())
 93 | 		{
 94 | 			seedArr.push_back(line);
 95 | 			seedSet.insert(key);
 96 | 		}
 97 | 	}
 98 | 	//cerr << "the size of seedArr is: " << seedArr.size() << endl;
 99 | 	
100 | 	//string groundTruthFile = outputPath + "/groundTruth";
101 | 	//ofstream ofs(groundTruthFile);
102 | 	int seedArrSize = seedArr.size();
103 | 	int minNumber = std::min(numClusts, seedArrSize);
104 | 	for(int i = 0; i < minNumber; i++)
105 | 	{
106 | 		string cp_command = "cp " + seedArr[i] + " " + outputPath + '/';
107 | 		cerr << cp_command << endl;
108 | 		system(cp_command.c_str());
109 | 		gzFile fp1 = gzopen(seedArr[i].c_str(), "r");
110 | 		if(!fp1){
111 | 			cerr << "cannot open file: " << seedArr[i] << endl;
112 | 			continue;
113 | 		}
114 | 		kseq_t *ks1 = kseq_init(fp1);
115 | 		vector<string> bufArr;
116 | 
117 | 		FILE *fpArr[numGenomePerClust];
118 | 		int startIndex = seedArr[i].find('/');
119 | 		string keyName = seedArr[i].substr(startIndex+1);
120 | 		int indexStr = keyName.find_last_of('.');
121 | 		keyName = keyName.substr(0, indexStr);
122 | 
123 | 		for(int j = 0; j < numGenomePerClust; j++)
124 | 		{
125 | 			string outputName = outputPath + '/' + keyName + '.' + to_string(j) + ".fna";
126 | 			fpArr[j] = fopen(outputName.c_str(), "w");
127 | 			string writeBuffer("");
128 | 			bufArr.push_back(writeBuffer);
129 | 		}
130 | 
131 | 		int index = 0;
132 | 		while(1)
133 | 		{
134 | 			int length = kseq_read(ks1);
135 | 			if(length < 0) break;
136 | 			string name = ks1->name.s;
137 | 			string comment = ks1->comment.s;
138 | 			string content = ks1->seq.s;
139 | 			string headLine = '>' + name + ' ' + comment + '\n';
140 | 			for(int j = 0; j < numGenomePerClust; j++)
141 | 			{
142 | 				bufArr[j] += headLine;
143 | 
144 | 				int readLength = length * randArr[i*numGenomePerClust + j];
145 | 				//cerr << "the readLength is: " << readLength << endl;
146 | 				for(int k = 0; k < readLength; k +=80)
147 | 				{
148 | 					int actualLen = std::min(80, readLength - k);
149 | 					string tmpContent = content.substr(k, actualLen);
150 | 					bufArr[j] += tmpContent + '\n';
151 | 				}
152 | 				index++;
153 | 			}
154 | 		}
155 | 		for(int j = 0; j < numGenomePerClust; j++)
156 | 		{
157 | 			fwrite(bufArr[j].c_str(), sizeof(char), bufArr[j].length(), fpArr[j]);
158 | 			fclose(fpArr[j]);
159 | 		}
160 | 
161 | 		gzclose(fp1);
162 | 		kseq_destroy(ks1);
163 | 	}
164 | 	//ofs.close();
165 | 
166 |   return 0;
167 | }
168 | 
/* Write the usage banner to stderr: the example command line, the path of this source
 * file, the external dependency, the expected argument list, and one description line
 * per argument. `args` and `descriptions` must have the same length. */
void printInfo(string pwd, string dependency, string example, vector<string> args, vector<string> descriptions){
	assert(args.size() == descriptions.size());
	const size_t argCount = args.size();

	cerr << '\n'
	     << "example: " << example << "\n\n"
	     << "source file path: " << pwd << "\n\n"
	     << "dependency: " << dependency << "\n\n"
	     << "run as: ";
	for(const string &arg : args)
		cerr << arg << ' ';
	cerr << '\n';

	// One line per positional argument, numbered from 0 (the application name itself).
	for(size_t idx = 0; idx < argCount; ++idx)
		cerr << "\tThe " << idx << " parameter(" << args[idx] << ") is " << descriptions[idx] << '\n';
}
187 | 


--------------------------------------------------------------------------------
/benchmark/simulate/src/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
 35 | #ifdef USE_MALLOC_WRAPPERS
 36 | #  include "malloc_wrap.h"
 37 | #endif
 38 | 
/* Delimiter selectors understood by ks_getuntil()/ks_getuntil2(). A delimiter argument
 * greater than KS_SEP_MAX is instead compared against each byte as a literal character. */
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB   1 // isspace() && !' '
#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX   2 // largest selector value
 43 | 
/* Declare kstream_t, a buffered reader over a handle of type `type_t`.
 *   buf        read buffer (allocated by ks_init)
 *   begin,end  indices into buf delimiting the bytes not yet consumed
 *   is_eof     set once the read function has reported end of input
 *   f          the underlying handle passed to the read function */
#define __KS_TYPE(type_t)						\
	typedef struct __kstream_t {				\
		unsigned char *buf;						\
		int begin, end, is_eof;					\
		type_t f;								\
	} kstream_t;
 50 | 
/* ks_eof: true once end of input was seen AND every buffered byte has been consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* ks_rewind: clear the EOF flag and buffer indices. Only the bookkeeping fields are reset;
 * this macro does not touch the underlying handle. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 53 | 
/* Define ks_init() and ks_destroy() for kstream_t.
 *   ks_init(f)      allocates a zeroed kstream_t wrapping handle `f` plus a read buffer of
 *                   __bufsize bytes. NOTE(review): the calloc/malloc results are not checked.
 *   ks_destroy(ks)  frees the buffer and the stream object (NULL is accepted); the wrapped
 *                   handle `f` is not closed here. */
#define __KS_BASIC(type_t, __bufsize)								\
	static inline kstream_t *ks_init(type_t f)						\
	{																\
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
		ks->f = f;													\
		ks->buf = (unsigned char*)malloc(__bufsize);				\
		return ks;													\
	}																\
	static inline void ks_destroy(kstream_t *ks)					\
	{																\
		if (ks) {													\
			free(ks->buf);											\
			free(ks);												\
		}															\
	}
 69 | 
 70 | #define __KS_GETC(__read, __bufsize)						\
 71 | 	static inline int ks_getc(kstream_t *ks)				\
 72 | 	{														\
 73 | 		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
 74 | 		if (ks->begin >= ks->end) {							\
 75 | 			ks->begin = 0;									\
 76 | 			ks->end = __read(ks->f, ks->buf, __bufsize);	\
 77 | 			if (ks->end == 0) { ks->is_eof = 1; return -1;}	\
 78 | 		}													\
 79 | 		return (int)ks->buf[ks->begin++];					\
 80 | 	}
 81 | 
/* Growable character buffer used for every parsed field.
 *   l  number of bytes currently stored (excluding the terminating NUL)
 *   m  number of bytes allocated for s
 *   s  heap buffer, grown with realloc() by the readers
 * Guarded by KSTRING_T so a definition supplied by another header is reused. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
 89 | 
/* Round x up, in place, to the next power of two (x is left unchanged when it already is
 * one). The bit smear stops at a 16-bit shift, so only the low 32 bits are covered even
 * when x is a wider type such as size_t. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 93 | 
 94 | #define __KS_GETUNTIL(__read, __bufsize)								\
 95 | 	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
 96 | 	{																	\
 97 | 		int gotany = 0;													\
 98 | 		if (dret) *dret = 0;											\
 99 | 		str->l = append? str->l : 0;									\
100 | 		for (;;) {														\
101 | 			int i;														\
102 | 			if (ks->begin >= ks->end) {									\
103 | 				if (!ks->is_eof) {										\
104 | 					ks->begin = 0;										\
105 | 					ks->end = __read(ks->f, ks->buf, __bufsize);		\
106 | 					if (ks->end == 0) { ks->is_eof = 1; break; }		\
107 | 				} else break;											\
108 | 			}															\
109 | 			if (delimiter == KS_SEP_LINE) { \
110 | 				for (i = ks->begin; i < ks->end; ++i) \
111 | 					if (ks->buf[i] == '\n') break; \
112 | 			} else if (delimiter > KS_SEP_MAX) {						\
113 | 				for (i = ks->begin; i < ks->end; ++i)					\
114 | 					if (ks->buf[i] == delimiter) break;					\
115 | 			} else if (delimiter == KS_SEP_SPACE) {						\
116 | 				for (i = ks->begin; i < ks->end; ++i)					\
117 | 					if (isspace(ks->buf[i])) break;						\
118 | 			} else if (delimiter == KS_SEP_TAB) {						\
119 | 				for (i = ks->begin; i < ks->end; ++i)					\
120 | 					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
121 | 			} else i = 0; /* never come to here! */						\
122 | 			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
123 | 				str->m = str->l + (i - ks->begin) + 1;					\
124 | 				kroundup32(str->m);										\
125 | 				str->s = (char*)realloc(str->s, str->m);				\
126 | 			}															\
127 | 			gotany = 1;													\
128 | 			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
129 | 			str->l = str->l + (i - ks->begin);							\
130 | 			ks->begin = i + 1;											\
131 | 			if (i < ks->end) {											\
132 | 				if (dret) *dret = ks->buf[i];							\
133 | 				break;													\
134 | 			}															\
135 | 		}																\
136 | 		if (!gotany && ks_eof(ks)) return -1;							\
137 | 		if (str->s == 0) {												\
138 | 			str->m = 1;													\
139 | 			str->s = (char*)calloc(1, 1);								\
140 | 		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
141 | 		str->s[str->l] = '\0';											\
142 | 		return str->l;													\
143 | 	} \
144 | 	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
145 | 	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
146 | 
/* Instantiate the whole buffered-stream layer for one handle type: the kstream_t type,
 * ks_init/ks_destroy, ks_getc and ks_getuntil/ks_getuntil2, all reading through
 * __read(handle, buffer, size) with a buffer of __bufsize bytes. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t)							\
	__KS_BASIC(type_t, __bufsize)				\
	__KS_GETC(__read, __bufsize)				\
	__KS_GETUNTIL(__read, __bufsize)
152 | 
/* Reset a kseq_t's parser state (last_char) and its stream's EOF flag and buffer indices
 * to zero. Only these fields are reset; the macro does not touch the underlying handle. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
154 | 
/* Define kseq_init() and kseq_destroy() with storage class SCOPE.
 *   kseq_init(fd)     allocates a zeroed kseq_t and a kstream_t wrapping `fd`.
 *                     NOTE(review): the calloc result is not checked.
 *   kseq_destroy(ks)  frees the four field buffers, the stream and the record itself
 *                     (NULL is accepted); the wrapped handle is not closed here. */
#define __KSEQ_BASIC(SCOPE, type_t)										\
	SCOPE kseq_t *kseq_init(type_t fd)									\
	{																	\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
		s->f = ks_init(fd);												\
		return s;														\
	}																	\
	SCOPE void kseq_destroy(kseq_t *ks)									\
	{																	\
		if (!ks) return;												\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f);												\
		free(ks);														\
	}
169 | 
/* Define kseq_read() with storage class SCOPE: parse the next FASTA or FASTQ record from
 * seq->f into seq->name, seq->comment, seq->seq and (FASTQ only) seq->qual.
 *
 * Steps, in order:
 *   1. unless the previous call already consumed a header marker, skip input up to the
 *      next '>' or '@';
 *   2. read the name up to the first whitespace, then the rest of the header line as the
 *      comment;
 *   3. append sequence lines until the next '>', '+' or '@' (or end of input);
 *   4. if a '+' line follows, skip it and read quality lines until the quality string is
 *      at least as long as the sequence.
 *
 * Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
216 | 
/* Declare kseq_t, one parsed sequence record plus its reader state.
 *   name, comment, seq, qual  fields of the most recently read record
 *   last_char                 header marker ('>' or '@') already consumed by the previous
 *                             kseq_read() call, or 0 if none
 *   f                         the buffered input stream */
#define __KSEQ_TYPE(type_t)						\
	typedef struct {							\
		kstring_t name, comment, seq, qual;		\
		int last_char;							\
		kstream_t *f;							\
	} kseq_t;
223 | 
/* Instantiate the complete reader (stream layer with a 16384-byte buffer, kseq_t,
 * kseq_init/kseq_destroy and kseq_read) for handle type `type_t` read through `__read`.
 * SCOPE is the storage class given to the generated kseq_* functions. */
#define KSEQ_INIT2(SCOPE, type_t, __read)		\
	KSTREAM_INIT(type_t, __read, 16384)			\
	__KSEQ_TYPE(type_t)							\
	__KSEQ_BASIC(SCOPE, type_t)					\
	__KSEQ_READ(SCOPE)

/* Usual entry point: same as KSEQ_INIT2 with the kseq_* functions declared `static`. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
231 | 
/* Emit only the kstream_t/kseq_t types and the prototypes of kseq_init, kseq_destroy and
 * kseq_read, without defining the functions. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
238 | 
239 | #endif
240 | 


--------------------------------------------------------------------------------
/benchmark/simulate/src/simulate_longSequence.cpp:
--------------------------------------------------------------------------------
  1 | /* Author: Xiaoming Xu
 * Date: 2022/5/12
  3 |  * 
  4 |  * See the LICENSE.txt file included with this software for licence information.
  5 |  */
#include <algorithm>
#include <assert.h>
#include <cassert>
#include <cstdio>
#include <ctime>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdlib.h>
#include <string>
#include <sys/time.h>
#include <vector>
 16 | 
 17 | using namespace std;
 18 | void printInfo(string pwd, string example, vector<string> args, vector<string> descriptions){
 19 | 	assert(args.size() == descriptions.size());
 20 | 	cerr << endl;
 21 | 	cerr << "example: " << example << endl;
 22 | 	cerr << endl;
 23 | 	cerr << "source file path: " << pwd << endl;
 24 | 	cerr << endl;
 25 | 	cerr << "run as: ";
 26 | 	for(int i = 0; i < args.size(); i++){
 27 | 		cerr << args[i] << ' ';
 28 | 	}
 29 | 	cerr << endl;
 30 | 	for(int i = 0; i < args.size(); i++){
 31 | 		fprintf(stderr, "\tThe %d parameter(%s) is %s\n", i, args[i].c_str(), descriptions[i].c_str());
 32 | 	}
 33 | }
 34 | 
 35 | 
 36 | int main(int argc , char *argv[]){
 37 | 	string pwd = "RabbitTClust/benchmark/simulate/src/simulate_longSequence.cpp";
 38 | 	string application = argv[0];
 39 | 	string example = application + " 10 20 300 1000000 simulate_10_20_300_1M";
 40 | 	vector<string> args, descriptions;
 41 | 	args.push_back(application);
 42 | 	args.push_back("mutation_rate*1000(integer)");
 43 | 	args.push_back("numSeedSeqs");
 44 | 	args.push_back("numEachClusts");
 45 | 	args.push_back("seqLength");
 46 | 	args.push_back("output");
 47 | 	descriptions.push_back("the application name");
 48 | 	descriptions.push_back("the mutation rate");
 49 | 	descriptions.push_back("the number of seed sequence (number of clusters)");
 50 | 	descriptions.push_back("the number sequence in a cluster generate from each seed sequence");
 51 | 	descriptions.push_back("the approximate length for each sequence");
 52 | 	descriptions.push_back("the prefix name for groundTruth, seedSequences and totalSimulateSequence fasta files");
 53 | 
 54 | 	assert(args.size() == descriptions.size());
 55 | 
 56 |   if(argc != args.size()) {
 57 | 		printInfo(pwd, example, args, descriptions);
 58 |     return -1;
 59 |   }
 60 | 	else if(argc == 2 && (argv[1] == "-h" || argv[1] == "--help"))
 61 | 	{
 62 | 		printInfo(pwd, example, args, descriptions);
 63 | 		return 1;
 64 | 	}
 65 | 
 66 |   const char nucs[4] = { 'A','T','G','C'};
 67 | 
 68 |   int erate = std::atoi(argv[1]);
 69 | 	int numClusts = std::stoi(argv[2]);
 70 | 	int numEachClusts = std::stoi(argv[3]);
 71 | 	int seqLength = std::stoi(argv[4]);
 72 | 	string outPrefix = argv[5];
 73 | 	string outSeedFile = outPrefix + "_seed.fna";
 74 | 	string outTotalFile = outPrefix + "_total.fna";
 75 | 	string outGroundTruth = outPrefix + "_groundTruth";
 76 | 
 77 | 	cerr << "the error rate is: " << double(erate)/1000 << endl;
 78 | 	cerr << "the number of clusters is: " << numClusts << endl;
 79 | 	cerr << "the number of sequences in each cluster is: " << numEachClusts << endl;
 80 | 	cerr << "the approximate sequence length is: " << seqLength << endl;
 81 | 	cerr << "the output seed sequences file is: " << outSeedFile << endl;
 82 | 	cerr << "the output total sequences file is: " << outTotalFile << endl;
 83 | 	cerr << "the groundTruth file is: " << outGroundTruth << endl;
 84 | 
 85 | 	FILE * fp0 = fopen(outSeedFile.c_str(), "w");
 86 | 	FILE * fp1 = fopen(outTotalFile.c_str(), "w");
 87 | 	FILE * fp2 = fopen(outGroundTruth.c_str(), "w");
 88 | 
 89 | 	string key1 = "seqName";
 90 | 	string key2 = "taxid";
 91 | 	fprintf(fp2, "%s\t%s\n",key1.c_str(), key2.c_str());
 92 | 	
 93 | 	for(int i = 0; i < numClusts; i++)
 94 | 	{
 95 | 		struct timeval tv;
 96 | 		gettimeofday(&tv, NULL);
 97 |   	srand(tv.tv_usec);
 98 | 		string seqName = ">seq_" + to_string(i);
 99 | 		string groundTruthName = seqName.substr(1);
100 | 		fprintf(fp2, "%s\t%d\n", groundTruthName.c_str(), i);
101 | 		string seqComment = "Seed sequence " + to_string(i) + " to generate mutations";
102 | 		string infoLine = seqName + '\t' + seqComment + '\n';
103 | 		string seedSeq("");
104 | 		for(int i = 0; i < seqLength; i++)
105 | 		{
106 | 			char newC = nucs[random()%4];
107 | 			seedSeq += newC;
108 | 		}
109 | 		//output the seed sequence into outSeedFile
110 | 		int infoLineLen = infoLine.length();
111 | 		fwrite(infoLine.c_str(), sizeof(char), infoLineLen, fp0);
112 | 		fwrite(infoLine.c_str(), sizeof(char), infoLineLen, fp1);
113 | 		int seedLen = seedSeq.length();
114 | 		string outSeedSeq("");
115 | 		for(int k = 0; k < seedLen; k += 80)
116 | 		{
117 | 			int curLength = std::min(80, seedLen-k);
118 | 			string tmpLine = seedSeq.substr(k, curLength);
119 | 			outSeedSeq += tmpLine + '\n';
120 | 		}
121 | 		int outSeedSeqLen = outSeedSeq.length();
122 | 		fwrite(outSeedSeq.c_str(), sizeof(char), outSeedSeqLen, fp0);
123 | 		fwrite(outSeedSeq.c_str(), sizeof(char), outSeedSeqLen, fp1);
124 | 
125 | 
126 | 		//for generate mutation sequences
127 | 		for(int j = 0; j < numEachClusts; j++)
128 | 		{
129 | 			string mutationName = seqName + "_mutation_" + to_string(j);
130 | 			string groundTruthMuName = mutationName.substr(1);
131 | 			fprintf(fp2, "%s\t%d\n", groundTruthMuName.c_str(), i);
132 | 			string mutationComment = "mutation sequence " + to_string(j) + " from seedSequence " + to_string(i);
133 | 			string mutaInfoLine = mutationName + '\t' + mutationComment + '\n';
134 | 			string mutationSeq("");
135 | 			for(int t = 0; t < seedSeq.length(); t++)
136 | 			{
137 | 				if(random()%1000 < erate){
138 | 					int mut = random()%3;
139 | 					if(mut == 0)//sub
140 | 					{
141 | 						while(1){
142 | 							char newc = nucs[random()%4];
143 | 							if(newc != seedSeq[t]){
144 | 								mutationSeq += newc;
145 | 								break;
146 | 							}
147 | 						}
148 | 					}
149 | 					else if(mut == 1)// ins
150 | 					{
151 | 						mutationSeq += nucs[random()%4];
152 | 						t = t - 1;
153 | 					}
154 | 					else//del
155 | 						continue;
156 | 				}//end if mutation
157 | 				else// no mutation
158 | 					mutationSeq += seedSeq[t];
159 | 			}
160 | 			int mutaInfoLineLen = mutaInfoLine.length();
161 | 			fwrite(mutaInfoLine.c_str(), sizeof(char), mutaInfoLineLen, fp1);
162 | 			int mutationLen = mutationSeq.length();
163 | 			string outMutationSeq("");
164 | 			for(int k = 0; k < mutationLen; k += 80)
165 | 			{
166 | 				int curLength = std::min(80, mutationLen-k);
167 | 				string tmpLine = mutationSeq.substr(k, curLength);
168 | 				outMutationSeq += tmpLine + '\n';
169 | 			}
170 | 			int outMutationSeqLen = outMutationSeq.length();
171 | 			fwrite(outMutationSeq.c_str(), sizeof(char), outMutationSeqLen, fp1);
172 | 		}
173 | 	}
174 | 	fclose(fp0);
175 | 	fclose(fp1);
176 | 	fclose(fp2);
177 | 
178 | 	cerr << "finish generate mutation files with multithread " << endl;
179 | 
180 |   return 0;
181 | }
182 | 
183 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | set -x
 2 | 
 3 | #make rabbitSketch library
 4 | cd RabbitSketch &&
 5 | mkdir -p build && cd build &&
 6 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. &&
 7 | make -j8 && make install &&
 8 | cd ../../ &&
 9 | 
10 | #make rabbitFX library
11 | cd RabbitFX && 
12 | mkdir -p build && cd build &&
13 | cmake -DCMAKE_INSTALL_PREFIX=. .. &&
14 | make -j8 && make install && 
15 | cd ../../ &&
16 | 
17 | #compile the clust-greedy
18 | mkdir -p build && cd build &&
19 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 
20 | make -j8 && make install &&
21 | cd ../ &&
22 | 
23 | #compile the clust-mst
24 | cd build &&
25 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. &&
26 | make -j8 && make install &&
27 | cd ../ 
28 | 


--------------------------------------------------------------------------------
/rabbittclust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RabbitBio/RabbitTClust/056ad8b0067688994a1f1529300a947f4ba6da0d/rabbittclust.png


--------------------------------------------------------------------------------
/src/MST.h:
--------------------------------------------------------------------------------
 1 | #ifndef H_MST_GRAPH
 2 | #define H_MST_GRAPH
 3 | 
 4 | #include <iostream>
 5 | #include <vector>
 6 | #include <unordered_set>
 7 | #include "SketchInfo.h"
 8 | 
 9 | struct NeighborNode{
10 | 	int id;
11 | 	double distance;
12 | 	NeighborNode(int i, double d){
13 | 		id = i;
14 | 		distance = d;
15 | 	}
16 | };
17 | 
// One weighted edge between two nodes.
// The struct is brace-initialised in {preNode, sufNode, dist} order elsewhere in the
// project, so it must stay a plain aggregate with this member order.
struct EdgeInfo{
	int preNode;   // id of one endpoint
	int sufNode;   // id of the other endpoint
	double dist;   // weight of the edge (presumably the distance between the two sketches)
};
23 | 
24 | 
// Adjacency record of a single node.
struct Graph{
	int node;                            // id of the node this record belongs to
	int curNeighbor;                     // NOTE(review): looks like a cursor into `neighbor` — confirm against users
	std::vector<NeighborNode> neighbor;  // the node's neighbours with their distances

};
31 | 
32 | 
// A tree described by a set of node ids and a list of edges.
struct MST{
	std::unordered_set<int> nodes;  // ids of the nodes
	std::vector<EdgeInfo> edges;    // the edges

};
38 | 
39 | bool cmpEdge(EdgeInfo e1, EdgeInfo e2);
40 | 
41 | bool cmpNeighbor(NeighborNode n1, NeighborNode n2);
42 | 
43 | std::vector<EdgeInfo> kruskalAlgorithm(std::vector<EdgeInfo>graph, int vertices);
44 | 
45 | vector<EdgeInfo> generateMST(vector<SketchInfo>& sketches, string sketchFunc, int threads);
46 | 
47 | vector<EdgeInfo> append_MST(vector<SketchInfo>& pre_sketches, vector<SketchInfo>& append_sketches, int sketch_func_id, int threads, int ** &denseArr, int denseSpan, uint64_t* &aniArr);
48 | 
49 | vector<EdgeInfo> modifyMST(vector<SketchInfo>& sketches, int start_index, int sketch_func_id, int threads, bool no_dense, int** &denseArr, int denseSpan, uint64_t* &aniArr);
50 | 
51 | vector<EdgeInfo> compute_kssd_mst(vector<KssdSketchInfo>& sketches, KssdParameters info, const string folder_path, int start_index, bool no_dense, bool isContainment, int threads, int** &denseArr, int denseSpan, uint64_t* &aniArr);
52 | 
53 | std::vector<EdgeInfo> generateForest(std::vector <EdgeInfo> mst, double threshhold);
54 | 
55 | std::vector <std::vector<int> > generateCluster(std::vector<EdgeInfo> forest, int vertices);
56 | 
57 | vector<vector<int>> generateClusterWithBfs(vector<EdgeInfo> forest, int vertices);
58 | 
59 | vector<EdgeInfo> modifyForest(vector<EdgeInfo> forset, vector<int> noiseArr, int threads);
60 | 
61 | typedef pair<int, int> PairInt;
62 | vector<int> getNoiseNode(vector<PairInt> densePairArr, int alpha);
63 | 
64 | string get_newick_tree(const vector<SketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file);
65 | string get_kssd_newick_tree(const vector<KssdSketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file);
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/src/MST_IO.cpp:
--------------------------------------------------------------------------------
  1 | #include "MST_IO.h"
  2 | #include "Sketch_IO.h"
  3 | #include <ctime>
  4 | using namespace std;
  5 | 
  6 | 
  7 | inline bool cmpSketchLength(ClusterInfo c1, ClusterInfo c2){
  8 | 	return c1.length > c2.length;
  9 | }
 10 | 
 11 | void loadDense(int** &denseArr, string folderPath, int& denseSpan, int& genome_number){
 12 | 	string file_dense = folderPath + '/' + "mst.dense";
 13 | 	FILE* fp_dense = fopen(file_dense.c_str(), "r");
 14 | 	if(!fp_dense){
 15 | 		cerr << "ERROR: saveDense(), cannot open the file: " << file_dense;
 16 | 		exit(1);
 17 | 	}
 18 | 	fread(&genome_number, sizeof(int), 1, fp_dense);
 19 | 	fread(&denseSpan, sizeof(int), 1, fp_dense);
 20 | 	denseArr = new int*[denseSpan];
 21 | 	for(int i = 0; i < denseSpan; i++){
 22 | 		denseArr[i] = new int[genome_number];
 23 | 		fread(denseArr[i], sizeof(int), genome_number, fp_dense);
 24 | 	}
 25 | 	fclose(fp_dense);
 26 | 	cerr << "-----read the dense file from: " << file_dense << endl;
 27 | }
 28 | 
 29 | void loadANI(string folderPath, uint64_t* &aniArr, int sketch_func_id){
 30 | 	if(sketch_func_id != 0 && sketch_func_id != 1){
 31 | 		cerr << "ERROR: saveANI(), save ANI can only support MinHash and KSSD functions" << endl;
 32 | 		return;
 33 | 	}
 34 | 	string file_ani = folderPath + '/' + "mst.ani";
 35 | 	FILE* fp_ani = fopen(file_ani.c_str(), "r");
 36 | 	if(!fp_ani){
 37 | 		cerr << "ERROR: saveANI(), cannot open file: " << file_ani << endl;
 38 | 		exit(1);
 39 | 	}
 40 | 	aniArr = new uint64_t[101];
 41 | 	fread(aniArr, sizeof(uint64_t), 101, fp_ani);
 42 | 	fclose(fp_ani);
 43 | 	cerr << "-----read the ani file from: " << file_ani << endl;
 44 | }
 45 | 
 46 | void loadMST(string folderPath, vector<EdgeInfo>& mst)
 47 | {
 48 | 	//load the mst edge 
 49 | 	string file_mst = folderPath + '/' + "edge.mst";
 50 | 	FILE* fp_mst = fopen(file_mst.c_str(), "r");
 51 | 	if(!fp_mst){
 52 | 		cerr << "ERROR: loadMST(), cannot open the file: " <<  file_mst << endl;
 53 | 		exit(1);
 54 | 	}
 55 | 	size_t mst_size;
 56 | 	fread(&mst_size, sizeof(size_t), 1, fp_mst);
 57 | 	int preNode, sufNode;
 58 | 	double dist;
 59 | 	for(size_t i = 0; i < mst_size; i++){
 60 | 		fread(&preNode, sizeof(int), 1, fp_mst);
 61 | 		fread(&sufNode, sizeof(int), 1, fp_mst);
 62 | 		fread(&dist, sizeof(double), 1, fp_mst);
 63 | 		EdgeInfo tmpEdge{preNode, sufNode, dist};
 64 | 		mst.push_back(tmpEdge);
 65 | 		//cout << preNode << '\t' << sufNode << '\t' << dist << endl;
 66 | 	}
 67 | 	fclose(fp_mst);
 68 | 	cerr << "-----read the mst file from " << file_mst << endl;
 69 | }
 70 | 
 71 | void printKssdResult(vector<vector<int>>& cluster, vector<KssdSketchInfo>& sketches, bool sketchByFile, string outputFile)
 72 | {
 73 | 	//cerr << "output the result into: " << outputFile << endl;
 74 | 	FILE *fp = fopen(outputFile.c_str(), "w");
 75 | 	if(!fp){
 76 | 		cerr << "Error in printKssdResult(), cannot open file: " << outputFile << endl;
 77 | 		exit(1);
 78 | 	}
 79 | 	
 80 | 	if(sketchByFile)
 81 | 	{
 82 | 		for(int i = 0; i < cluster.size(); i++){
 83 | 			fprintf(fp, "the cluster %d is: \n", i);
 84 | 			for(int j = 0; j < cluster[i].size(); j++)
 85 | 			{
 86 | 				int curId = cluster[i][j];
 87 | 				fprintf(fp, "\t%5d\t%6d\t%12dnt\t%20s\t%20s\t%s\n", j, curId, sketches[curId].totalSeqLength, sketches[curId].fileName.c_str(),  sketches[curId].fileSeqs[0].name.c_str(), sketches[curId].fileSeqs[0].comment.c_str());
 88 | 			}
 89 | 			fprintf(fp, "\n");
 90 | 		}
 91 | 	}//end sketchByFile
 92 | 
 93 | 	else//sketch by sequence
 94 | 	{
 95 | 		for(int i = 0; i < cluster.size(); i++){
 96 | 			fprintf(fp, "the cluster %d is: \n", i);
 97 | 			for(int j = 0; j < cluster[i].size(); j++)
 98 | 			{
 99 | 				int curId = cluster[i][j];		
100 | 				fprintf(fp, "\t%6d\t%6d\t%12dnt\t%20s\t%s\n", j, curId, sketches[curId].seqInfo.length, sketches[curId].seqInfo.name.c_str(), sketches[curId].seqInfo.comment.c_str());
101 | 			}
102 | 			fprintf(fp, "\n");
103 | 		}
104 | 	}//end sketchBySequence
105 | 	fclose(fp);
106 | 
107 | }
108 | 
109 | void printResult(vector<vector<int>>& cluster, vector<SketchInfo>& sketches, bool sketchByFile, string outputFile)
110 | {
111 | 	//cerr << "output the result into: " << outputFile << endl;
112 | 	FILE *fp = fopen(outputFile.c_str(), "w");
113 | 	if(!fp){
114 | 		cerr << "Error in printResult(), cannot open file: " << outputFile << endl;
115 | 		exit(1);
116 | 	}
117 | 	
118 | 	if(sketchByFile)
119 | 	{
120 | 		for(int i = 0; i < cluster.size(); i++){
121 | 			fprintf(fp, "the cluster %d is: \n", i);
122 | 			for(int j = 0; j < cluster[i].size(); j++)
123 | 			{
124 | 				int curId = cluster[i][j];
125 | 				fprintf(fp, "\t%5d\t%6d\t%12dnt\t%20s\t%20s\t%s\n", j, curId, sketches[curId].totalSeqLength, sketches[curId].fileName.c_str(),  sketches[curId].fileSeqs[0].name.c_str(), sketches[curId].fileSeqs[0].comment.c_str());
126 | 			}
127 | 			fprintf(fp, "\n");
128 | 		}
129 | 	}//end sketchByFile
130 | 
131 | 	else//sketch by sequence
132 | 	{
133 | 		for(int i = 0; i < cluster.size(); i++){
134 | 			fprintf(fp, "the cluster %d is: \n", i);
135 | 			for(int j = 0; j < cluster[i].size(); j++)
136 | 			{
137 | 				int curId = cluster[i][j];		
138 | 				fprintf(fp, "\t%6d\t%6d\t%12dnt\t%20s\t%s\n", j, curId, sketches[curId].seqInfo.length, sketches[curId].seqInfo.name.c_str(), sketches[curId].seqInfo.comment.c_str());
139 | 			}
140 | 			fprintf(fp, "\n");
141 | 		}
142 | 	}//end sketchBySequence
143 | 	fclose(fp);
144 | 
145 | }
146 | 
147 | void saveKssdMST(vector<KssdSketchInfo>& sketches, vector<EdgeInfo>& mst, string folderPath, bool sketchByFile){
148 | 	save_kssd_genome_info(sketches, folderPath, "mst", sketchByFile);
149 | 	string file_mst = folderPath + '/' + "edge.mst";
150 | 	FILE* fp_mst = fopen(file_mst.c_str(), "w+");
151 | 	if(!fp_mst){
152 | 		cerr << "ERROR: saveKsdMST(), cannot open the file: " <<  file_mst << endl;
153 | 		exit(1);
154 | 	}
155 | 	size_t mst_size = mst.size();
156 | 	fwrite(&mst_size, sizeof(size_t), 1, fp_mst);
157 | 	for(size_t i = 0; i < mst.size(); i++){
158 | 		fwrite(&mst[i].preNode, sizeof(int), 1, fp_mst);
159 | 		fwrite(&mst[i].sufNode, sizeof(int), 1, fp_mst);
160 | 		fwrite(&mst[i].dist, sizeof(double), 1, fp_mst);
161 | 	}
162 | 	fclose(fp_mst);
163 | 	cerr << "-----save the kssd mst into: " << folderPath << endl;
164 | }
165 | 
166 | void saveMST(vector<SketchInfo>& sketches, vector<EdgeInfo>& mst, string folderPath, bool sketchByFile){
167 | 	save_genome_info(sketches, folderPath, "mst", sketchByFile);
168 | 	string file_mst = folderPath + '/' + "edge.mst";
169 | 	FILE* fp_mst = fopen(file_mst.c_str(), "w+");
170 | 	if(!fp_mst){
171 | 		cerr << "ERROR: saveMST(), cannot open the file: " <<  file_mst << endl;
172 | 		exit(1);
173 | 	}
174 | 	size_t mst_size = mst.size();
175 | 	fwrite(&mst_size, sizeof(size_t), 1, fp_mst);
176 | 	for(size_t i = 0; i < mst.size(); i++){
177 | 		fwrite(&mst[i].preNode, sizeof(int), 1, fp_mst);
178 | 		fwrite(&mst[i].sufNode, sizeof(int), 1, fp_mst);
179 | 		fwrite(&mst[i].dist, sizeof(double), 1, fp_mst);
180 | 	}
181 | 	fclose(fp_mst);
182 | 	cerr << "-----save the mst into: " << folderPath << endl;
183 | }
184 | 
/**
 * Write the dense matrix to <folderPath>/mst.dense.
 *
 * File layout, all values as raw native ints: genome_number, denseSpan, then denseSpan
 * rows of genome_number values each. loadDense() reads this layout.
 *
 * @param folderPath     directory the file is created in.
 * @param denseArr       denseSpan row pointers, each to genome_number ints.
 * @param denseSpan      number of rows.
 * @param genome_number  number of values per row.
 *
 * Terminates the process with exit(1) when the file cannot be opened or when any write
 * (including the final flush done by fclose) fails.
 */
void saveDense(string folderPath, int** denseArr, int denseSpan, int genome_number){
	string file_dense = folderPath + '/' + "mst.dense";
	FILE* fp_dense = fopen(file_dense.c_str(), "w+");
	if(!fp_dense){
		cerr << "ERROR: saveDense(), cannot open the file: " << file_dense << endl;
		exit(1);
	}
	bool written = fwrite(&genome_number, sizeof(int), 1, fp_dense) == 1
		&& fwrite(&denseSpan, sizeof(int), 1, fp_dense) == 1;
	for(int i = 0; written && i < denseSpan; i++){
		// A row with no values is trivially complete; fwrite reports 0 items for it.
		if(genome_number > 0)
			written = fwrite(denseArr[i], sizeof(int), genome_number, fp_dense) == (size_t)genome_number;
	}
	// fclose() flushes the buffered data, so its failure also means lost output.
	if(fclose(fp_dense) != 0)
		written = false;
	if(!written){
		cerr << "ERROR: saveDense(), failed to write the file: " << file_dense << endl;
		exit(1);
	}
	cerr << "-----save the dense file into: " << folderPath << endl;
}
200 | 
/*
 * Write the ANI array to <folderPath>/mst.ani as 101 consecutive uint64_t values.
 * Only sketch_func_id 0 and 1 are accepted (the error text names them as MinHash
 * and KSSD); for any other id an error is printed and nothing is written.
 * Calls exit(1) when the file cannot be opened.
 * NOTE(review): aniArr must hold at least 101 elements -- presumably one counter
 * per integer percentage 0..100; confirm against the producer of the array.
 */
void saveANI(string folderPath, uint64_t* aniArr, int sketch_func_id){
	const bool supported = (sketch_func_id == 0) || (sketch_func_id == 1);
	if(!supported){
		cerr << "ERROR: saveANI(), save ANI can only support MinHash and KSSD functions" << endl;
		return;
	}

	const string aniPath = folderPath + '/' + "mst.ani";
	FILE* out = fopen(aniPath.c_str(), "w+");
	if(out == NULL){
		cerr << "ERROR: saveANI(), cannot open file: " << aniPath << endl;
		exit(1);
	}

	fwrite(aniArr, sizeof(uint64_t), 101, out);
	fclose(out);
	cerr << "-----save the ani file into: " << aniPath << endl;
}
217 | 
218 | void print_kssd_newick_tree(const vector<KssdSketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file, string output){
219 | 	string res_newick_tree = get_kssd_newick_tree(sketches, mst, sketch_by_file);
220 | 	FILE* fp_tree = fopen(output.c_str(), "w");
221 | 	if(!fp_tree){
222 | 		cerr << "ERROR: print_newick_tree(), cannot write file: " << output << endl;
223 | 		exit(1);
224 | 	}
225 | 	fprintf(fp_tree, "%s\n", res_newick_tree.c_str());
226 | 	fclose(fp_tree);
227 | }
228 | 
229 | void print_newick_tree(const vector<SketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file, string output){
230 | 	string res_newick_tree = get_newick_tree(sketches, mst, sketch_by_file);
231 | 	FILE* fp_tree = fopen(output.c_str(), "w");
232 | 	if(!fp_tree){
233 | 		cerr << "ERROR: print_newick_tree(), cannot write file: " << output << endl;
234 | 		exit(1);
235 | 	}
236 | 	fprintf(fp_tree, "%s\n", res_newick_tree.c_str());
237 | 	fclose(fp_tree);
238 | }
239 | 
240 | 
241 | 
242 | 


--------------------------------------------------------------------------------
/src/MST_IO.h:
--------------------------------------------------------------------------------
 1 | #ifndef H_MST_IO
 2 | #define H_MST_IO
 3 | 
 4 | #include <fstream>
 5 | #include <sstream>
 6 | #include "SketchInfo.h"//include <iostream> <string>
 7 | #include "MST.h" //include <vector>
 8 | #include "common.hpp"
 9 | 
// Pairs a cluster id with a 64-bit length value.
// NOTE(review): the meaning of `length` is not visible here -- confirm at the use sites.
struct ClusterInfo
{
	int id;          // cluster identifier
	uint64_t length; // length associated with the cluster
};
14 | 
15 | void print_newick_tree(const vector<SketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file, string output);
16 | void printResult(std::vector<std::vector<int>>& clusterOrigin, std::vector<SketchInfo>& sketches, bool sketchByFile, string outputFile);
17 | void printKssdResult(vector<vector<int>>& cluster, vector<KssdSketchInfo>& sketches, bool sketchByFile, string outputFile);
18 | void print_kssd_newick_tree(const vector<KssdSketchInfo>& sketches, const vector<EdgeInfo>& mst, bool sketch_by_file, string output);
19 | 
20 | void loadMST(string folderPath, vector<EdgeInfo>& mst);
21 | void loadDense(int** &denseArr, string folderPath, int& denseSpan, int& genome_number);
22 | void loadANI(string folderPath, uint64_t* &aniArr, int sketch_func_id);
23 | 
24 | void saveMST(vector<SketchInfo>& sketches, vector<EdgeInfo>& mst, string folderPath, bool sketchByFile);
25 | void saveKssdMST(vector<KssdSketchInfo>& sketches, vector<EdgeInfo>& mst, string folderPath, bool sketchByFile);
26 | void saveDense(string folderPath, int** denseArr, int denseSpan, int genome_number);
27 | void saveANI(string folderPath, uint64_t* aniArr, int sketch_func_id);
28 | 
29 | #endif
30 | 
31 | 


--------------------------------------------------------------------------------
/src/SketchInfo.h:
--------------------------------------------------------------------------------
 1 | #ifndef H_SKETCH_INFO
 2 | #define H_SKETCH_INFO
 3 | 
 4 | #include "Sketch.h"
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <stdint.h>
 8 | #include <vector>
 9 | #include <random>
10 | 
11 | using namespace std;
12 | 
13 | //for sequence information
// Description of a single sequence.
struct SequenceInfo
{
	string name;    // sequence name
	string comment; // comment attached to the sequence
	int strand;     // strand indicator
	int length;     // sequence length
};
20 | 
21 | typedef vector<SequenceInfo> Vec_SeqInfo;
22 | struct SketchInfo{
23 | 	int id;
24 | 	string fileName;//for sketch files;
25 | 	uint64_t totalSeqLength;
26 | 	Vec_SeqInfo fileSeqs;//for sketch files;
27 | 	SequenceInfo seqInfo;//for sketch sequence;
28 | 	bool isContainment = false;
29 | 	
30 | 	Sketch::MinHash* minHash;
31 | 	Sketch::KSSD* KSSD;
32 | 	Sketch::WMinHash* WMinHash;
33 | 	Sketch::HyperLogLog* HLL;
34 | 	Sketch::OrderMinHash * OMH;
35 | };
36 | 
37 | struct KssdSketchInfo{
38 | 	int id;
39 | 	string fileName;
40 | 	uint64_t totalSeqLength;
41 | 	Vec_SeqInfo fileSeqs;
42 | 	SequenceInfo seqInfo;
43 | 	bool use64;
44 | 	vector<uint32_t> hash32_arr;
45 | 	vector<uint64_t> hash64_arr;
46 | };
47 | 
// Parameter set stored alongside KSSD sketches.
struct KssdParameters
{
	int id;
	int half_k;
	int half_subk;
	int drlevel;
	int genomeNumber; // number of genomes
};
55 | 
56 | 
57 | bool cmpGenomeSize(SketchInfo s1, SketchInfo s2);
58 | bool cmpSeqSize(SketchInfo s1, SketchInfo s2);
59 | 
60 | void calSize(bool sketchByFile, string inputFile, int threads, uint64_t minLen, uint64_t &maxSize, uint64_t& minSize, uint64_t& averageSize);
61 | bool sketchSequences(string inputFile, int kmerSize, int sketchSize, int minLen, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads);
62 | bool sketchFiles(string inputFile, uint64_t minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, vector<SketchInfo>& sketches, int threads);
63 | bool cmpSketch(SketchInfo s1, SketchInfo s2);
64 | //bool sketchFileWithKssd(const string inputFile, const uint64_t minLen, const int kmerSize, const int drlevel, vector<KssdSketchInfo>& sketches, int threads);
65 | bool sketchFileWithKssd(const string inputFile, const uint64_t minLen, int kmerSize, const int drlevel, vector<KssdSketchInfo>& sketches, KssdParameters& info, int threads);
66 | bool sketchSequencesWithKssd(const string inputFile, const int minLen, const int kmerSize, const int drlevel, vector<KssdSketchInfo>& sketches, KssdParameters& info, int threads);
67 | void transSketches(const vector<KssdSketchInfo>& sketches, const KssdParameters& info, const string folder_path, int numThreads);
68 | 
69 | 
70 | 
71 | #endif //H_SKETCH_INFO
72 | 


--------------------------------------------------------------------------------
/src/Sketch_IO.h:
--------------------------------------------------------------------------------
 1 | #ifndef H_SKETCH_IO
 2 | #define H_SKETCH_IO
 3 | #include "SketchInfo.h"
 4 | #include "common.hpp"
 5 | 
 6 | void read_sketch_parameters(string folder_path, int& sketch_func_id, int& kmer_size, bool& is_containment, int& contain_compress, int& sketch_size, int& half_k, int& half_subk, int& drlevel);
 7 | void save_genome_info(vector<SketchInfo>& sketches, string folderPath, string type, bool sketchByFile);
 8 | void save_kssd_genome_info(const vector<KssdSketchInfo>& sketches, const string folderPath, const string type, bool sketchByFile);
 9 | void saveSketches(vector<SketchInfo>& sketches, string folderPath, bool sketchByFile, string sketchFunc, bool isContainment, int containCompress, int sketchSize, int kmerSize);
10 | void saveKssdSketches(const vector<KssdSketchInfo>& sketches, const KssdParameters info, const string folderPath, bool sketchByFile);
11 | 
12 | bool loadSketches(string folderPath, int threads, vector<SketchInfo>& sketches, int& sketch_func_id);
13 | bool load_genome_info(string folderPath, string type, vector<SketchInfo>& sketches);
14 | bool load_kssd_genome_info(string folderPath, string type, vector<KssdSketchInfo>& sketches);
15 | bool loadKssdSketches(string folderPath, int threads, vector<KssdSketchInfo>& sketches, KssdParameters& info);
16 | #endif
17 | 


--------------------------------------------------------------------------------
/src/ThreadPool.h:
--------------------------------------------------------------------------------
 1 | // Copyright © 2015, Battelle National Biodefense Institute (BNBI);
 2 | // all rights reserved. Authored by: Brian Ondov, Todd Treangen,
 3 | // Sergey Koren, and Adam Phillippy
 4 | //
 5 | // See the LICENSE.txt file included with this software for license information.
 6 | 
 7 | #ifndef ThreadPool_h
 8 | #define ThreadPool_h
 9 | 
10 | #include <pthread.h>
11 | #include <queue>
12 | 
// Pool of pthreads that run a function TypeOutput * f(TypeInput *) on submitted
// inputs. Outputs are handed back in submission order through a linked queue of
// OutputQueueNode entries.
//
// Ownership: the worker thread deletes each input after running the function;
// each output returned by popOutputWhenAvailable() must be deleted by the caller.
//
// The pool owns raw pthread handles, mutexes, condition variables and queue
// nodes, so copying it would lead to double deletion. Copy construction and
// copy assignment are therefore declared private and left undefined.
template <class TypeInput, class TypeOutput>
class ThreadPool
{
public:
    
    // Starts threadCountNew worker threads that run functionNew by default.
    ThreadPool(TypeOutput * (* functionNew)(TypeInput *), unsigned int threadCountNew);
    // Signals the workers to finish, joins them and frees the pool's resources.
    ~ThreadPool();
    
    // True when the oldest queued output has been computed.
    bool outputAvailable() const;
    TypeOutput * popOutputWhenAvailable(); // output must be deleted by calling function
    // True while at least one output is still queued (ready or not).
    bool running() const;
    void runWhenThreadAvailable(TypeInput * input); // thread deletes input when finished
    void runWhenThreadAvailable(TypeInput * input, TypeOutput * (* functionNew)(TypeInput *)); // thread deletes input when finished
    
private:
    
    // non-copyable: declared but intentionally not defined
    ThreadPool(const ThreadPool &);
    ThreadPool & operator=(const ThreadPool &);
    
    struct OutputQueueNode
    {
        // used to preserve input order when outputting
        
        OutputQueueNode * prev;
        OutputQueueNode * next;
        
        TypeOutput * output;
        bool ready;
    };
    
    unsigned int threadCount;
    
    pthread_t * threads;
    
    // entry point of every worker thread; receives the pool as its argument
    static void * thread(void *);
    
    // single-slot hand-off from the submitting thread to the workers
    TypeOutput * (* function)(TypeInput *);
    TypeInput * inputCurrent;
    OutputQueueNode * outputQueueNodeCurrent;
    
    pthread_mutex_t * mutexInput;
    pthread_mutex_t * mutexOutput;
    
    pthread_cond_t * condInput;
    pthread_cond_t * condOutput;
    
    OutputQueueNode * outputQueueHead;
    OutputQueueNode * outputQueueTail;
    
    bool finished;
    friend void * thread(void *);
};
62 | 
63 | 
64 | #include "ThreadPool.hxx"
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/src/ThreadPool.hxx:
--------------------------------------------------------------------------------
  1 | // Copyright © 2015, Battelle National Biodefense Institute (BNBI);
  2 | // all rights reserved. Authored by: Brian Ondov, Todd Treangen,
  3 | // Sergey Koren, and Adam Phillippy
  4 | //
  5 | // See the LICENSE.txt file included with this software for license information.
  6 | 
  7 | #include "ThreadPool.h"
  8 | #include <stdlib.h>
  9 | #include <stdio.h>
 10 | #include <iostream>
 11 | 
 12 | template <class TypeInput, class TypeOutput>
 13 | ThreadPool<TypeInput, TypeOutput>::ThreadPool(TypeOutput * (* functionNew)(TypeInput *), unsigned int threadCountNew)
 14 |     :
 15 |     threadCount(threadCountNew),
 16 |     function(functionNew)
 17 | {
 18 |     mutexInput = new pthread_mutex_t();
 19 |     mutexOutput = new pthread_mutex_t();
 20 |     
 21 |     condInput = new pthread_cond_t();
 22 |     condOutput = new pthread_cond_t();
 23 |     
 24 |     pthread_mutex_init(mutexInput, NULL);
 25 |     pthread_mutex_init(mutexOutput, NULL);
 26 |     
 27 |     pthread_cond_init(condInput, NULL);
 28 |     pthread_cond_init(condOutput, NULL);
 29 |     
 30 |     inputCurrent = 0;
 31 |     
 32 |     outputQueueHead = 0;
 33 |     outputQueueTail = 0;
 34 |     
 35 |     finished = false;
 36 |     
 37 |     threads = new pthread_t[threadCount];
 38 |     
 39 |     for ( int i = 0; i < threadCount; i++ )
 40 |     {
 41 |         pthread_create(&threads[i], NULL, &ThreadPool::thread, this);
 42 |     }
 43 | }
 44 | 
 45 | template <class TypeInput, class TypeOutput>
 46 | ThreadPool<TypeInput, TypeOutput>::~ThreadPool()
 47 | {
 48 |     pthread_mutex_lock(mutexInput);
 49 |     finished = true;
 50 |     pthread_cond_broadcast(condInput);
 51 |     pthread_mutex_unlock(mutexInput);
 52 |     
 53 |     for ( int i = 0; i < threadCount; i++ )
 54 |     {
 55 |         pthread_join(threads[i], NULL);
 56 |     }
 57 |     
 58 |     delete [] threads;
 59 |     
 60 |     while ( outputQueueHead != 0 )
 61 |     {
 62 |         OutputQueueNode * next = outputQueueHead->next;
 63 |         delete outputQueueHead;
 64 |         outputQueueHead = next;
 65 |     }
 66 |     
 67 |     delete mutexInput;
 68 |     delete mutexOutput;
 69 |     
 70 |     delete condInput;
 71 |     delete condOutput;
 72 | }
 73 | 
 74 | template <class TypeInput, class TypeOutput>
 75 | bool ThreadPool<TypeInput, TypeOutput>::outputAvailable() const
 76 | {
 77 |     bool available;
 78 |     
 79 |     pthread_mutex_lock(mutexOutput);
 80 |     available = outputQueueHead != 0 && outputQueueHead->ready;
 81 |     pthread_mutex_unlock(mutexOutput);
 82 |     
 83 |     return available;
 84 | }
 85 | 
 86 | template <class TypeInput, class TypeOutput>
 87 | TypeOutput * ThreadPool<TypeInput, TypeOutput>::popOutputWhenAvailable()
 88 | {
 89 |     pthread_mutex_lock(mutexOutput);
 90 |     
 91 |     if ( outputQueueHead == 0 )
 92 |     {
 93 |         // TODO: error?
 94 |         std::cerr << "ERROR: waiting for output when no output queued\n";
 95 |         pthread_mutex_unlock(mutexOutput);
 96 |         return 0;
 97 |     }
 98 |     
 99 |     while ( ! outputQueueHead->ready )
100 |     {
101 |         pthread_cond_wait(condOutput, mutexOutput);
102 |     }
103 |     
104 |     TypeOutput * output = outputQueueHead->output;
105 |     
106 |     OutputQueueNode * next = outputQueueHead->next;
107 |     
108 |     if ( outputQueueTail == outputQueueHead )
109 |     {
110 |         outputQueueTail = 0;
111 |     }
112 |     
113 |     delete outputQueueHead;
114 |     outputQueueHead = next;
115 |     pthread_mutex_unlock(mutexOutput);
116 |     
117 |     return output;
118 | }
119 | 
120 | template <class TypeInput, class TypeOutput>
121 | void ThreadPool<TypeInput, TypeOutput>::runWhenThreadAvailable(TypeInput * input)
122 | {
123 | 	runWhenThreadAvailable(input, function);
124 | }
125 | 
126 | template <class TypeInput, class TypeOutput>
127 | void ThreadPool<TypeInput, TypeOutput>::runWhenThreadAvailable(TypeInput * input, TypeOutput * (* functionNew)(TypeInput *))
128 | {
129 |     pthread_mutex_lock(mutexInput);
130 |     
131 |     while ( inputCurrent != 0 )
132 |     {
133 |         pthread_cond_wait(condInput, mutexInput);
134 |     }
135 |     
136 |     inputCurrent = input;
137 |     function = functionNew;
138 |     
139 |     // enqueue output while input locked (to preserve order)
140 |     //
141 |     OutputQueueNode * outputQueueNode = new OutputQueueNode();
142 |     outputQueueNode->next = 0;
143 |     outputQueueNode->ready = false;
144 |     //
145 |     pthread_mutex_lock(mutexOutput);
146 |     //
147 |     if ( outputQueueHead == 0 )
148 |     {
149 |         outputQueueHead = outputQueueNode;
150 |     }
151 |     //
152 |     outputQueueNode->prev = outputQueueTail;
153 |     //
154 |     if ( outputQueueTail != 0 )
155 |     {
156 |         outputQueueTail->next = outputQueueNode;
157 |     }
158 |     //
159 |     outputQueueTail = outputQueueNode;
160 |     //
161 |     pthread_mutex_unlock(mutexOutput);
162 |     
163 |     outputQueueNodeCurrent = outputQueueNode;
164 |     
165 |     pthread_mutex_unlock(mutexInput);
166 |     pthread_cond_broadcast(condInput);
167 | }
168 | 
169 | template <class TypeInput, class TypeOutput>
170 | bool ThreadPool<TypeInput, TypeOutput>::running() const
171 | {
172 |     bool running;
173 |     
174 |     pthread_mutex_lock(mutexOutput);
175 |     running = outputQueueHead != 0;
176 |     pthread_mutex_unlock(mutexOutput);
177 |     
178 |     return running;
179 | }
180 | 
181 | template <class TypeInput, class TypeOutput>
182 | void * ThreadPool<TypeInput, TypeOutput>::thread(void * arg)
183 | {
184 |     ThreadPool * threadPool = (ThreadPool *)arg;
185 |     TypeInput * input;
186 |     OutputQueueNode * outputQueueNode;
187 |     
188 |     while ( ! threadPool->finished )
189 |     {
190 |         // wait for input
191 |         //
192 |         pthread_mutex_lock(threadPool->mutexInput);
193 |         //
194 |         while ( ! threadPool->finished && threadPool->inputCurrent == 0 )
195 |         {
196 |             pthread_cond_wait(threadPool->condInput, threadPool->mutexInput);
197 |         }
198 |         
199 |         if ( threadPool->finished )
200 |         {
201 |             pthread_mutex_unlock(threadPool->mutexInput);
202 |             return 0;
203 |         }
204 |         //
205 |         input = threadPool->inputCurrent;
206 |         outputQueueNode = threadPool->outputQueueNodeCurrent;
207 |         threadPool->inputCurrent = 0;
208 |         TypeOutput * (* function)(TypeInput *) = threadPool->function;
209 |         
210 |         pthread_mutex_unlock(threadPool->mutexInput);
211 |         
212 |         pthread_cond_broadcast(threadPool->condInput);
213 |         
214 |         // run function
215 |         //
216 |         outputQueueNode->output = function(input);
217 |         
218 |         delete input;
219 |         
220 |         // signal output
221 |         //
222 |         outputQueueNode->ready = true;
223 |         //
224 |         pthread_mutex_lock(threadPool->mutexOutput);
225 |         pthread_cond_broadcast(threadPool->condOutput);
226 |         pthread_mutex_unlock(threadPool->mutexOutput);
227 |     }
228 |     
229 |     return NULL;
230 | }
231 | 


--------------------------------------------------------------------------------
/src/UnionFind.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNIONFIND_H
 2 | #define UNIONFIND_H
 3 | 
 4 | 
 5 | class UnionFind {
 6 |     int *parent, *ranks, _size;
 7 | public:
 8 |     UnionFind(){
 9 |     }
10 |     UnionFind(int size){
11 |         parent = new int[size]; ranks = new int[size];
12 |         for(int element = 0 ; element < size ; element++){
13 |             parent[element] = element , ranks[element] = 0 ;
14 |         }
15 |         _size = size;
16 |     }
17 |     void resize(int size){
18 |         parent = new int[size]; ranks = new int[size];
19 |         for(int element = 0 ; element < size ; element++){
20 |             parent[element] = element , ranks[element] = 0 ;
21 |         }
22 |         _size = size;
23 |     }
24 |     int find(int element){
25 |         if(parent[element] == element){
26 |             return element;
27 |         }
28 |         else{
29 |             return parent[element] = find(parent[element]);          // Path Compression algorithm
30 |         }
31 |     }
32 |     bool connected(int x,int y){
33 |         if(find(x) == find(y)){
34 |             return true;
35 |         }
36 |         else{
37 |             return false;
38 |         }
39 |     }
40 |     void merge(int x,int y){
41 |         x = find(x);
42 |         y = find(y);
43 |         if(x != y){                                                   // Union by Rank algorithm
44 |             if(ranks[x] > ranks[y]){
45 |                 parent[y] = x;
46 |             }
47 |             else if(ranks[x] < ranks[y]){
48 |                 parent[x] = y;
49 |             }
50 |             else{
51 |                 parent[x] = y; ranks[y] ++ ;
52 |             }
53 |             _size--;
54 |         }
55 |     }
56 |     void clear(){
57 |         delete [] parent; delete [] ranks;
58 |     }
59 |     int size(){
60 |         return _size;
61 |     }
62 | };
63 | 
64 | 
65 | 
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/src/common.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef HPP_COMMON
 2 | #define HPP_COMMON
 3 | 
 4 | /*
 5 |  * This header holds the basic sketch parameters, such as the sketch size and the k-mer size.
 6 |  * 	KMER_SIZE is the k-mer size of the sliding window used by all sketch functions.
 7 |  * 	MINHASH_SKETCH_SIZE is the fixed sketch size for MinHash resemblance computation.
 8 |  * 	SKETCH_COMPRESS_SEQUENCE sets the sketch size in proportion to the sequence length for containment computation.
 9 |  * 	SKETCH_COMPRESS_GENOME sets the sketch size in proportion to the genome size for containment computation.
10 |  * 	WMH_SKETCH_SIZE is the sketch size for WeightedMinHash.
11 |  * 	WINDOW_SIZE is the minimizer window size in WeightedMinHash.
12 |  * 	HLL_SKETCH_BIT is the number of bits that defines the sketch size for HyperLogLog.
13 |  *
14 |  */
15 | 
16 | #include <cstdio>
17 | #include <iostream>
18 | 
// The first four parameters are no longer compile-time constants; the values
// below are kept for reference only.
//#define KMER_SIZE 21
//#define MINHASH_SKETCH_SIZE 10000
//#define SKETCH_COMPRESS_SEQUENCE 1000
//#define SKETCH_COMPRESS_GENOME 1000
#define WMH_SKETCH_SIZE 50
#define WINDOW_SIZE 20
#define HLL_SKETCH_BIT 10
// No trailing semicolon: a macro that expands to `100;` cannot be used inside
// an expression, a condition or an argument list.
#define DENSE_SPAN 100
27 | 
28 | #include <sys/time.h>
// Current wall-clock time from gettimeofday(), in seconds including the
// microsecond fraction.
inline double get_sec(){
	struct timeval now;
	gettimeofday(&now, NULL);
	const double wholeSeconds = static_cast<double>(now.tv_sec);
	const double fraction = static_cast<double>(now.tv_usec) / 1000000;
	return wholeSeconds + fraction;
}
34 | 
35 | #include <ctime>
// Current local date and time formatted as "YYYY_MM_DD_HH-MM-SS".
// Returns an empty string if the time cannot be converted or formatted.
//
// The return type is spelled std::string so that this header does not depend on
// the including file having a `using namespace std;` in effect.
inline const std::string currentDataTime(){
	const time_t now = time(0);
	struct tm tstruct;
	// localtime_r() writes into caller-owned storage; localtime() returns a
	// pointer to a buffer shared by all callers and may return NULL.
	if(localtime_r(&now, &tstruct) == NULL){
		return std::string();
	}
	char buf[80];
	if(strftime(buf, sizeof(buf), "%Y_%m_%d_%H-%M-%S", &tstruct) == 0){
		return std::string();
	}

	return buf;
}
45 | 
46 | #endif
47 | 


--------------------------------------------------------------------------------
/src/greedy.cpp:
--------------------------------------------------------------------------------
 1 | #ifdef GREEDY_CLUST
 2 | #include "greedy.h"
 3 | #include <map>
 4 | #include <omp.h>
 5 | using namespace std;
 6 | 
 7 | /* @brief									Generating clusters by greedy incremental algorthm.
 8 |  *
 9 |  * @param[in] sketches		sketch array including hash values and informations for each genome sketch.
10 |  * @param[in] sketchFunc	sketch Function including MinHash and KSSD
11 |  * @param[in] threshold		distance threshold for cluster, genomes with distance below this threshold are clustered together.
12 |  * @param[in] threads			Thread number for multiThreading
13 |  * @return								cluster result two-dimention array, each array in result is a cluster, and each element in a cluster
14 |  * 												is a genome.
15 |  */
16 | vector<vector<int>> greedyCluster(vector<SketchInfo>& sketches, int sketch_func_id, double threshold, int threads)
17 | {
18 | 	int numGenomes = sketches.size();
19 | 	int * clustLabels = new int[numGenomes];
20 | 	memset(clustLabels, 0, numGenomes*sizeof(int));
21 | 	vector<vector<int> > cluster;
22 | 	vector<int> representiveArr;
23 | 	map<int, vector<int> > semiClust;
24 | 	representiveArr.push_back(0);
25 | 	semiClust.insert({0, vector<int>()});
26 | 
27 | 	for(int j = 1; j < numGenomes; j++){
28 | 		map<double, int> distMapCenter;
29 | 		#pragma omp parallel for num_threads(threads)
30 | 		for(int i = 0; i < representiveArr.size(); i++){
31 | 			int repId = representiveArr[i];
32 | 			double dist;
33 | 			if(sketch_func_id == 0){
34 | 				if(sketches[repId].isContainment)
35 | 					dist = sketches[repId].minHash->containDistance(sketches[j].minHash);
36 | 					//dist = 1.0 - sketches[repId].minHash->containJaccard(sketches[j].minHash);
37 | 				else
38 | 					dist = sketches[repId].minHash->distance(sketches[j].minHash);
39 | 			}
40 | 			else if(sketch_func_id == 1){
41 | 				dist = sketches[repId].KSSD->distance(sketches[j].KSSD);
42 | 			}
43 | 			else{
44 | 				cerr << "can only support MinHash and KSSD with greedy incremental clust" << endl;
45 | 				exit(1);
46 | 			}
47 | 			if(dist <= threshold){
48 | 				clustLabels[j] = 1;
49 | 				#pragma omp critical
50 | 				{
51 | 					distMapCenter.insert({dist, repId});
52 | 				}
53 | 				//break;
54 | 			}
55 | 		}//end for i
56 | 		if(clustLabels[j] == 0){//this genome is a representative genome
57 | 			representiveArr.push_back(j);
58 | 			semiClust.insert({j, vector<int>()});
59 | 		}
60 | 		else{//this genome is a redundant genome, get the nearest representive genome as its center
61 | 			auto it = distMapCenter.begin();
62 | 			int repId = it->second;
63 | 			semiClust[repId].push_back(j);
64 | 		}
65 | 		map<double, int>().swap(distMapCenter);
66 | 		if(j % 10000 == 0) cerr << "---finished cluster: " << j << endl;
67 | 		
68 | 	}//end for j
69 | 	//cerr << "the representiveArr size is : " << representiveArr.size() << endl;
70 | 
71 | 	for(auto x : semiClust){
72 | 		int center = x.first;
73 | 		vector<int> redundantArr = x.second;
74 | 		vector<int> curClust;
75 | 		curClust.push_back(center);
76 | 		curClust.insert(curClust.end(), redundantArr.begin(), redundantArr.end());
77 | 		cluster.push_back(curClust);
78 | 	}
79 | 	return cluster;
80 | }
81 | 
82 | #endif
83 | 


--------------------------------------------------------------------------------
/src/greedy.h:
--------------------------------------------------------------------------------
 1 | #ifdef GREEDY_CLUST
 2 | #ifndef H_GREEDY
 3 | #define H_GREEDY
 4 | 
 5 | #include "SketchInfo.h"
 6 | vector<vector<int>> greedyCluster(vector<SketchInfo>& sketches, int sketch_func_id, double threshold, int threads);
 7 | 
 8 | 
 9 | 
10 | #endif
11 | #endif
12 | 


--------------------------------------------------------------------------------
/src/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
 35 | #ifdef USE_MALLOC_WRAPPERS
 36 | #  include "malloc_wrap.h"
 37 | #endif
 38 | 
/* Delimiter selectors understood by ks_getuntil()/ks_getuntil2(). Any delimiter
   value greater than KS_SEP_MAX is treated as a literal character. */
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB   1 // isspace() && !' '
#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX   2

/* Declares kstream_t, a buffered reader over a handle of type type_t:
   buf is the buffer, [begin, end) the unread part of it, is_eof the
   end-of-input flag and f the underlying handle. */
#define __KS_TYPE(type_t)						\
	typedef struct __kstream_t {				\
		unsigned char *buf;						\
		int begin, end, is_eof;					\
		type_t f;								\
	} kstream_t;

/* ks_eof: true once end-of-input was seen and the buffer is fully consumed.
   ks_rewind: resets the buffer positions and the end-of-input flag to 0
   (the underlying handle itself is not touched by this macro). */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 53 | 
/* Defines ks_init() and ks_destroy() for kstream_t.
   ks_init: allocates a zeroed kstream_t around handle f plus a buffer of
   __bufsize bytes; the results of calloc/malloc are not checked.
   ks_destroy: frees the buffer and the stream; a null stream is accepted.
   The handle f is not closed here. */
#define __KS_BASIC(type_t, __bufsize)								\
	static inline kstream_t *ks_init(type_t f)						\
	{																\
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
		ks->f = f;													\
		ks->buf = (unsigned char*)malloc(__bufsize);				\
		return ks;													\
	}																\
	static inline void ks_destroy(kstream_t *ks)					\
	{																\
		if (ks) {													\
			free(ks->buf);											\
			free(ks);												\
		}															\
	}
 69 | 
/* Defines ks_getc(): returns the next byte of the stream as an int, or -1 at
   end of input. When the buffer is exhausted it is refilled with
   __read(f, buf, __bufsize); a return value of 0 from __read marks end of input.
   NOTE(review): a negative return value from __read is not treated specially
   here -- confirm that the reader in use cannot report errors that way. */
#define __KS_GETC(__read, __bufsize)						\
	static inline int ks_getc(kstream_t *ks)				\
	{														\
		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
		if (ks->begin >= ks->end) {							\
			ks->begin = 0;									\
			ks->end = __read(ks->f, ks->buf, __bufsize);	\
			if (ks->end == 0) { ks->is_eof = 1; return -1;}	\
		}													\
		return (int)ks->buf[ks->begin++];					\
	}
 81 | 
/* Growable string: l is the current length, m the allocated capacity and s the
   character buffer. Guarded by KSTRING_T so that another header may supply it. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x up, in place, to the next power of two. The shifts stop at 16, so
   only the low 32 bits of x are smeared. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 93 | 
/* Defines ks_getuntil2() and its wrapper ks_getuntil().
   ks_getuntil2 reads from the stream into str up to (not including) the next
   delimiter, refilling the buffer with __read as needed.
     delimiter: KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE, or a literal character
                when greater than KS_SEP_MAX.
     dret:      when non-null, receives the delimiter character that ended the
                read, or 0 if the input ended first.
     append:    non-zero appends to the existing content of str, zero overwrites it.
   The string buffer is grown with realloc (capacity rounded by kroundup32); the
   realloc result is not checked. For KS_SEP_LINE a trailing '\r' is dropped.
   Returns the resulting length of str, or -1 when nothing could be read because
   the stream is at end of input.
   ks_getuntil is ks_getuntil2 with append = 0. */
#define __KS_GETUNTIL(__read, __bufsize)								\
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{																	\
		int gotany = 0;													\
		if (dret) *dret = 0;											\
		str->l = append? str->l : 0;									\
		for (;;) {														\
			int i;														\
			if (ks->begin >= ks->end) {									\
				if (!ks->is_eof) {										\
					ks->begin = 0;										\
					ks->end = __read(ks->f, ks->buf, __bufsize);		\
					if (ks->end == 0) { ks->is_eof = 1; break; }		\
				} else break;											\
			}															\
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (ks->buf[i] == delimiter) break;					\
			} else if (delimiter == KS_SEP_SPACE) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i])) break;						\
			} else if (delimiter == KS_SEP_TAB) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */						\
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
				str->m = str->l + (i - ks->begin) + 1;					\
				kroundup32(str->m);										\
				str->s = (char*)realloc(str->s, str->m);				\
			}															\
			gotany = 1;													\
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin);							\
			ks->begin = i + 1;											\
			if (i < ks->end) {											\
				if (dret) *dret = ks->buf[i];							\
				break;													\
			}															\
		}																\
		if (!gotany && ks_eof(ks)) return -1;							\
		if (str->s == 0) {												\
			str->m = 1;													\
			str->s = (char*)calloc(1, 1);								\
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0';											\
		return str->l;													\
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
146 | 
/* Instantiates the whole stream layer for one handle type: the kstream_t type
   plus ks_init, ks_destroy, ks_getc, ks_getuntil2 and ks_getuntil, reading
   through __read with a buffer of __bufsize bytes. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t)							\
	__KS_BASIC(type_t, __bufsize)				\
	__KS_GETC(__read, __bufsize)				\
	__KS_GETUNTIL(__read, __bufsize)

/* Resets a kseq_t reader: clears last_char and the buffer state of its stream.
   The underlying handle itself is not repositioned by this macro. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
154 | 
/* Defines kseq_init() and kseq_destroy() with linkage/storage given by SCOPE.
   kseq_init: allocates a zeroed kseq_t and a stream over handle fd; the calloc
   result is not checked.
   kseq_destroy: frees the name, comment, seq and qual buffers, the stream and
   the kseq_t itself; a null pointer is accepted. The handle is not closed here. */
#define __KSEQ_BASIC(SCOPE, type_t)										\
	SCOPE kseq_t *kseq_init(type_t fd)									\
	{																	\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
		s->f = ks_init(fd);												\
		return s;														\
	}																	\
	SCOPE void kseq_destroy(kseq_t *ks)									\
	{																	\
		if (!ks) return;												\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f);												\
		free(ks);														\
	}
169 | 
170 | /* kseq_read(): parse the next FASTA/FASTQ record from seq->f into seq. Return value:
171 |    >=0  length of the sequence (normal)
172 |    -1   end-of-file
173 |    -2   quality string missing, or its length differs from the sequence length
174 |  */
175 | #define __KSEQ_READ(SCOPE) \
176 | 	SCOPE int kseq_read(kseq_t *seq) \
177 | 	{ \
178 | 		int c; \
179 | 		kstream_t *ks = seq->f; \
180 | 		if (seq->last_char == 0) { /* then jump to the next header line */ \
181 | 			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
182 | 			if (c == -1) return -1; /* end of file */ \
183 | 			seq->last_char = c; \
184 | 		} /* else: the first header char has been read in the previous call */ \
185 | 		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
186 | 		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
187 | 		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
188 | 		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
189 | 			seq->seq.m = 256; \
190 | 			seq->seq.s = (char*)malloc(seq->seq.m); \
191 | 		} \
192 | 		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
193 | 			if (c == '\n') continue; /* skip empty lines */ \
194 | 			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
195 | 			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
196 | 		} \
197 | 		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
198 | 		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
199 | 			seq->seq.m = seq->seq.l + 2; \
200 | 			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
201 | 			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
202 | 		} \
203 | 		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
204 | 		if (c != '+') return seq->seq.l; /* FASTA */ \
205 | 		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
206 | 			seq->qual.m = seq->seq.m; \
207 | 			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
208 | 		} \
209 | 		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
210 | 		if (c == -1) return -2; /* error: no quality string */ \
211 | 		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
212 | 		seq->last_char = 0;	/* we have not come to the next header line */ \
213 | 		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
214 | 		return seq->seq.l; \
215 | 	}
216 | /* kseq_t: one parsed record (name, comment, seq, qual); last_char is the header character ('>' or '@') that kseq_read() already consumed for the next record, or 0 if none; f is the input stream. */
217 | #define __KSEQ_TYPE(type_t)						\
218 | 	typedef struct {							\
219 | 		kstring_t name, comment, seq, qual;		\
220 | 		int last_char;							\
221 | 		kstream_t *f;							\
222 | 	} kseq_t;
223 | /* Emit the complete kstream + kseq implementation with linkage SCOPE for handle type type_t and read function __read, passing 16384 as the stream buffer size. */
224 | #define KSEQ_INIT2(SCOPE, type_t, __read)		\
225 | 	KSTREAM_INIT(type_t, __read, 16384)			\
226 | 	__KSEQ_TYPE(type_t)							\
227 | 	__KSEQ_BASIC(SCOPE, type_t)					\
228 | 	__KSEQ_READ(SCOPE)
229 | /* Same as KSEQ_INIT2 with SCOPE fixed to static. */
230 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
231 | /* Declare the kstream/kseq types and the prototypes of kseq_init(), kseq_destroy() and kseq_read() without defining the functions. */
232 | #define KSEQ_DECLARE(type_t) \
233 | 	__KS_TYPE(type_t) \
234 | 	__KSEQ_TYPE(type_t) \
235 | 	extern kseq_t *kseq_init(type_t fd); \
236 | 	void kseq_destroy(kseq_t *ks); \
237 | 	int kseq_read(kseq_t *seq);
238 | 
239 | #endif
240 | 


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This version is for result checking using single thread.
  3 |  * The input parameter and the final output is the same with the final version.
  4 |  *
  5 |  * The input can be a single FASTA file containing many sequences, in which case one sketch is created per sequence.
  6 |  * It can also be a file listing genome files (one per line), in which case one sketch is created per file (genome).
  7 |  * This determines whether the distances used for clustering are computed between sequences or between genomes.
  8 |  *
  9 |  * The program includes several sections:
 10 |  * section 1: read the input arguments and init the parameters.
 11 |  * section 2: read genome files and create the sketches.
 12 |  * section 3: compute the distance matrix and generate the Minimum Spanning Tree(MST) or greedy incremental clustering.
 13 |  * section 4: generate the clusters with the MST using different distance threshold.
 14 |  *
 15 |  * Author: Xiaoming Xu
 16 |  * Mar 5, 2021
 17 |  *
 18 |  */
 19 | #include <iostream>
 20 | #include "SketchInfo.h"
 21 | #include "Sketch.h"// need to add the include path in Makefile.
 22 | #include <zlib.h>
 23 | #include "MST.h"
 24 | #include <omp.h>
 25 | #include "UnionFind.h"
 26 | #include <algorithm>
 27 | #include "common.hpp"
 28 | #include "MST_IO.h"
 29 | #include <math.h>
 30 | #include "Sketch_IO.h"
 31 | 
 32 | #ifdef GREEDY_CLUST
 33 | #include "greedy.h"
 34 | #endif
 35 | 
 36 | #include <fstream>
 37 | #include <sstream>
 38 | #include <sys/sysinfo.h>
 39 | #include <sys/stat.h>
 40 | 
 41 | #include "CLI11.hpp"
 42 | #include "sub_command.h"
 43 | 
 44 | #ifdef GREEDY_CLUST
 45 | #else
 46 | #endif
 47 | 
 48 | 
 49 | using namespace std;
 50 | 
 51 | int main(int argc, char * argv[]){
 52 | 	// Shared entry point of clust-mst and clust-greedy; GREEDY_CLUST selects the variant at compile time.
 53 | 	// Parses the command line with CLI11, echoes the explicitly-set options to stderr, then dispatches
 54 | 	// to one of the clustering drivers declared in sub_command.h.
 55 | 	// Returns 0 after a driver ran; returns 1 on an invalid thread count, on --append without a
 56 | 	// pre-generated folder, or when parameter tuning fails. CLI11_PARSE returns by itself on parse errors.
 57 | 	#ifdef GREEDY_CLUST
 58 | 		CLI::App app{"clust-greedy v.2.2.1, greedy incremental clustering module for RabbitTClust"};
 59 | 	#else
 60 | 		CLI::App app{"clust-mst v.2.2.1, minimum-spanning-tree-based module for RabbitTClust"};
 61 | 	#endif
 62 | 	//section 1: init parameters
 63 | 	string inputFile = "genome.fna";
 64 | 	string sketchFunc = "MinHash";
 65 | 	string outputFile = "result.out";
 66 | 	int threads = get_nprocs_conf(); // default: all configured CPUs of the platform
 67 | 	bool sketchByFile = false;
 68 | 	bool isContainment = false;
 69 | 	bool isJaccard = false;
 70 | 	double threshold = 0.05;
 71 | 	int kmerSize = 21;
 72 | 	int sketchSize = 1000;
 73 | 	int containCompress = 1000;
 74 | 	int drlevel = 3;
 75 | 	bool isSetKmer = false;
 76 | 	uint64_t minLen = 10000;
 77 | 	// folder_path is the shared destination of both --presketched and --premsted.
 78 | 	string folder_path;
 79 | 	bool is_newick_tree = false;
 80 | 	bool is_fast = false;
 81 | 	bool no_dense = false;
 82 | 
 83 | 	bool noSave = false;
 84 | 
 85 | 	auto option_threads = app.add_option("-t, --threads", threads,  "set the thread number, default all CPUs of the platform");
 86 | 	auto option_min_len = app.add_option("-m, --min-length", minLen, "set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000");
 87 | 
 88 | 	auto option_containment = app.add_option("-c, --containment", containCompress, "use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress");
 89 | 	auto option_kmer_size = app.add_option("-k, --kmer-size", kmerSize, "set the kmer size");
 90 | 	auto option_sketch_size = app.add_option("-s, --sketch-size", sketchSize, "set the sketch size for Jaccard Index and Mash distance, default 1000");
 91 | 
 92 | 	auto flag_input_list = app.add_flag("-l, --list", sketchByFile, "input is genome list, one genome per line");
 93 | 	auto flag_no_save = app.add_flag("-e, --no-save", noSave, "not save the intermediate files, such as sketches or MST");
 94 | 	auto option_threshold = app.add_option("-d, --threshold", threshold, "set the distance threshold for clustering");
 95 | 	auto option_output = app.add_option("-o, --output", outputFile, "set the output name of cluster result");
 96 | 	auto option_input = app.add_option("-i, --input", inputFile, "set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)");
 97 | 	auto option_presketched = app.add_option("--presketched", folder_path, "clustering by the pre-generated sketch files rather than genomes");
 98 | #ifndef GREEDY_CLUST
 99 | 	auto option_premsted = app.add_option("--premsted", folder_path, "clustering by the pre-generated mst files rather than genomes for clust-mst");
100 | 	auto flag_newick_tree = app.add_flag("--newick-tree", is_newick_tree, "output the newick tree format file for clust-mst");
101 | 	auto flag_is_fast = app.add_flag("--fast", is_fast, "use the kssd algorithm for sketching and distance computing for clust-mst");
102 | 	auto option_drlevel = app.add_option("--drlevel", drlevel, "set the dimention reduction level for Kssd sketches, default 3 with a dimention reduction of 1/4096");
103 | 	auto flag_no_dense = app.add_flag("--no-dense", no_dense, "not calculate the density and ANI datas");
104 | #endif
105 | 	auto option_append = app.add_option("--append", inputFile, "append genome file or file list with the pre-generated sketch or MST files");
106 | 
107 | 	option_output->required();
108 | 	option_append->excludes(option_input);
109 | 
110 | 	CLI11_PARSE(app, argc, argv);
111 | 
112 | 	if(threads < 1){
113 | 		fprintf(stderr, "-----Invalid thread number %d\n", threads);
114 | 		return 1;
115 | 	}
116 | 	if(*option_threads){ // test the option itself; the raw pointer is never null
117 | 		fprintf(stderr, "-----set the thread number %d\n", threads);
118 | 	}
119 | 	if(*option_min_len){
120 | 		fprintf(stderr, "-----set the filter minimum length: %llu\n", (unsigned long long)minLen);
121 | 	}
122 | 	if(*option_containment){
123 | 		isContainment = true;
124 | 		fprintf(stderr, "-----use AAF distance with containment coefficient, the sketch size is in porportion with 1/%d\n", containCompress);
125 | 	}
126 | 	if(*option_kmer_size){
127 | 		isSetKmer = true;
128 | 		fprintf(stderr, "-----set kmerSize: %d\n", kmerSize);
129 | 	}
130 | 	if(*option_sketch_size){
131 | 		isJaccard = true;
132 | 		fprintf(stderr, "-----set sketchSize:  %d\n", sketchSize);
133 | 	}
134 | 	if(*option_threshold){
135 | 		fprintf(stderr, "-----set threshold:  %g\n", threshold); // threshold is a double, so %d would be undefined behaviour
136 | 	}
137 | 
138 | 
139 | #ifndef GREEDY_CLUST
140 | //======clust-mst=========================================================================
141 | 	if(is_fast){
142 | 		if(*option_premsted && !*option_append){
143 | 			clust_from_mst_fast(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads);
144 | 			return 0;
145 | 		}
146 | 		if(*option_presketched && !*option_append){
147 | 			clust_from_sketch_fast(folder_path, outputFile, is_newick_tree, no_dense, isContainment, threshold, threads);
148 | 			return 0;
149 | 		}
150 | 		if(*option_append && !*option_premsted && !*option_presketched){
151 | 			cerr << "ERROR: option --append, option --presketched or --premsted needed" << endl;
152 | 			return 1;
153 | 		}
154 | 		if(*option_append && (*option_presketched || *option_premsted)){
155 | 			append_clust_mst_fast(folder_path, inputFile, outputFile, is_newick_tree, no_dense, sketchByFile, isContainment, minLen, noSave, threshold, threads);
156 | 			return 0;
157 | 		}
158 | 		if(!tune_kssd_parameters(sketchByFile, isSetKmer, inputFile, threads, minLen, isContainment, kmerSize, threshold, drlevel)){
159 | 			return 1;
160 | 		}
161 | 		clust_from_genome_fast(inputFile, outputFile, folder_path, is_newick_tree, no_dense, sketchByFile, isContainment, kmerSize, threshold, drlevel, minLen, noSave, threads);
162 | 		return 0;
163 | 	}
164 | 
165 | 	if(*option_premsted && !*option_append){
166 | 		clust_from_mst(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads);
167 | 		return 0;
168 | 	}
169 | 	if(*option_append && !*option_presketched && !*option_premsted){
170 | 		cerr << "ERROR option --append, option --presketched or --premsted needed" << endl;
171 | 		return 1;
172 | 	}
173 | 	if(*option_append && (*option_premsted || *option_presketched)){
174 | 		append_clust_mst(folder_path, inputFile, outputFile, is_newick_tree, no_dense, sketchByFile, minLen, noSave, threshold, threads);
175 | 		return 0;
176 | 	}
177 | //======clust-mst=========================================================================
178 | #else
179 | //======clust-greedy======================================================================
180 | 	if(*option_append && !*option_presketched){
181 | 		cerr << "ERROR option --append, option --presketched needed" << endl;
182 | 		return 1;
183 | 	}
184 | 	if(*option_append && *option_presketched){
185 | 		append_clust_greedy(folder_path, inputFile, outputFile, sketchByFile, minLen, noSave, threshold, threads);
186 | 		return 0;
187 | 	}
188 | //======clust-greedy======================================================================
189 | #endif
190 | 	
191 | 	if(*option_presketched && !*option_append){
192 | 		clust_from_sketches(folder_path, outputFile, is_newick_tree, no_dense, threshold, threads);
193 | 		return 0;
194 | 	}
195 | 
196 | 	if(!tune_parameters(sketchByFile, isSetKmer, inputFile, threads, minLen, isContainment, isJaccard, kmerSize, threshold, containCompress, sketchSize)){
197 | 		return 1;
198 | 	}
199 | 
200 | 	
201 | 	clust_from_genomes(inputFile, outputFile, is_newick_tree, sketchByFile, no_dense, kmerSize, sketchSize, threshold,sketchFunc, isContainment, containCompress, minLen, folder_path, noSave, threads);
202 | 
203 | 	return 0;
204 | }//end main
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | 
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 


--------------------------------------------------------------------------------
/src/sub_command.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include "SketchInfo.h"
 3 | #include "MST.h"
 4 | #include "Sketch_IO.h"
 5 | #include "common.hpp"
 6 | #include "MST_IO.h"
 7 | #include "greedy.h"
 8 | #include <vector>
 9 | #include <string>
10 | using namespace std;
11 | // NOTE(review): this header has no include guard or #pragma once, and the using-directive above is visible to every includer.
12 | void compute_sketches(vector<SketchInfo>& sketches, string inputFile, string& folder_path, bool sketchByFile, int minLen, int kmerSize, int sketchSize, string sketchFunc, bool isContainment, int containCompress, bool isSave, int threads);
13 | // NOTE(review): main.cpp stores minLen as uint64_t, while every minLen/min_len parameter in this header is int -- confirm the narrowing is intended.
14 | void compute_clusters(vector<SketchInfo>& sketches, bool sketchByFile, string outputFile, bool is_newick_tree, bool no_dense, string folder_path, int sketch_func_id, double threshold, bool isSave, int threads);
15 | 
16 | void clust_from_genomes(string inputFile, string outputFile, bool is_newick_tree, bool sketchByFile, bool no_dense, int kmerSize, int sketchSize, double threshold, string sketchFunc, bool isContainment, int containCompress, int minLen, string folder_path, bool noSave, int threads);
17 | // Parameter tuning: main.cpp exits with code 1 when either function returns false.
18 | bool tune_parameters(bool sketchByFile, bool isSetKmer, string inputFile, int threads, int minLen, bool& isContainment, bool& isJaccard, int& kmerSize, double& threshold, int& containCompress, int& sketchSize);
19 | bool tune_kssd_parameters(bool sketchByFile, bool isSetKmer, string inputFile, int threads, int minLen, bool& isContainment, int& kmerSize, double& threshold, int &drlevel);
20 | // Called by main.cpp for --presketched without --append when --fast is not set.
21 | void clust_from_sketches(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads);
22 | // Called by main.cpp for --premsted without --append; the _fast variant is used when --fast is set.
23 | void clust_from_mst(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads);
24 | void clust_from_mst_fast(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, double threshold, int threads);
25 | // Called by main.cpp (clust-mst build) when --append is combined with --presketched or --premsted; the _fast variant is used when --fast is set.
26 | void append_clust_mst(string folder_path, string input_file, string output_file, bool is_newick_tree, bool no_dense, bool sketch_by_file, int min_len, bool no_save, double threshold, int threads);
27 | void append_clust_mst_fast(string folder_path, string input_file, string output_file, bool is_newick_tree, bool no_dense, bool sketch_by_file, bool isContainment, int min_len, bool no_save, double threshold, int threads);
28 | // Called by main.cpp (clust-greedy build) when --append is combined with --presketched.
29 | void append_clust_greedy(string folder_path, string input_file, string output_file, bool sketch_by_file, int min_len, bool no_save, double threshold, int threads);
30 | // Helpers operating on KssdSketchInfo/KssdParameters; main.cpp does not call these directly.
31 | void compute_kssd_sketches(vector<KssdSketchInfo>& sketches, KssdParameters& info, bool isSave, const string inputFile, string& folder_path, bool sketchByFile, const int minLen, const int kmerSize, const int drlevel, int threads);
32 | void compute_kssd_clusters(vector<KssdSketchInfo>& sketches, const KssdParameters info, bool sketchByFile, bool no_dense, bool isContainment, const string folder_path, string outputFile, bool is_newick_tree, double threshold, bool isSave, int threads);
33 | // --fast entry points called by main.cpp: clustering from genomes, or from --presketched sketches without --append.
34 | void clust_from_genome_fast(const string inputFile, string outputFile, string folder_path, bool is_newick_tree, bool no_dense, bool sketchByFile, bool isContainment, const int kmerSize, const double threshold, const int drlevel, const int minLen, bool noSave, int threads);
35 | void clust_from_sketch_fast(string folder_path, string outputFile, bool is_newick_tree, bool no_dense, bool isContainment, double threshold, int threads);
36 | 
37 | 


--------------------------------------------------------------------------------
/version_history/history.md:
--------------------------------------------------------------------------------
 1 | # Latest version: `v.2.3.0` 
 2 | * add `--fast` option for `clust-mst` to use the more efficient Kssd sketch strategy when computing the all-vs-all genome distances.
 3 |     * The `--fast` option can work together with `--append`, `--presketched`, and `--premsted` options.
 4 |     * the `--drlevel` option sets the dimension reduction level for Kssd sketches. The default value is 3, which corresponds to a dimension reduction of $1 / 2^{(4*3)} = 1/4096$.
 5 | 
 6 | ## [`v.2.2.1`](v.2.2.1.md)
 7 | * add `--newick-tree` option to output the Newick tree format for `clust-mst`.
 8 | 
 9 | ## [`v.2.2.0`](v.2.2.0.md)
10 | * support incremental clustering via the `--append` option, used together with the `--presketched` or `--premsted` option.
11 | 
12 | Note:  
13 | * When clustering the genome set `A+B` from a pre-generated sketch `A_sketch` and an appended genome set `B`, note that the sketch parameters used for `A_sketch` and `B` may differ from those that would be generated automatically for the whole genome set `A+B`. However, the difference will be minimal unless the genome lengths in set `B` differ significantly from those in set `A`.
14 | 
15 |     * This is because the appended genome set `B` is sketched with the same parameters ($k$-mer size, sketch size, and containment compress ratio) as the pre-generated sketch `A_sketch`, whereas the automatic parameter generation in the `tune_parameters()` function depends on statistics of the whole genome set, such as the minimum, maximum, and mean genome length.
16 |     Therefore, unless the genome lengths in the appended set `B` shift these statistics substantially, the automatically generated parameters for `A+B` would be close to the ones already in use.
17 | 
18 | * In the context of genome clustering, the sketches are sorted by unstable sort in a decreasing order of their genome length. Consequently, the order of sketches may undergo slight changes if there are genomes with identical lengths. However, this does not significantly affect the outcome of the clustering process.
19 | 
20 | ## [`v.2.1.0`](v.2.1.0.md)
21 | * change the parameter parsing by [CLI11](https://github.com/CLIUtils/CLI11).
22 | * save the intermediate files (sketch, mst files) in binary format.
23 | * remove the `-f` option for loading pre-generated sketch or MST files; it is replaced by the `--presketched` and `--premsted` options.
24 | 
25 | More details by `clust-mst --help` or `clust-greedy --help`.
26 | 
27 | ## `v.2.0.3`
28 | * add the parameter `-m` to set the minimum genome length (*minLen*), genomes with lengths less than *minLen* will be ignored.
29 | 
30 | ## `v.2.0.2`
31 | * update the `calSize` of gz files for automatically generating $k$-mer size .
32 | 
33 | ## `v.2.0.1`
34 | * Update the latest version of [robin-hood-hashing](https://github.com/martinus/robin-hood-hashing) to solve the compile error with `g++ 12.0+`.
35 | 
36 | ## [`v.2.0.0`](v.2.0.0.md)
37 | * Add the `clust-greedy` module for greedy incremental clustering. 
38 | * The previous MST-based clustering module is now the `clust-mst` module.
39 | 
40 | 
41 | ## `v.1.0.0`
42 | * First version of RabbitTClust, a large-scale genome clustering tool based on sketching techniques and the Minimum Spanning Tree (MST).


--------------------------------------------------------------------------------
/version_history/v.2.0.0.md:
--------------------------------------------------------------------------------
  1 | # `v.2.0.0`
  2 | ## Installation
  3 | RabbitTClust version 2.0 can only support 64-bit Linux Systems.
  4 | 
  5 | ### Dependency
  6 | * cmake v.3.0 or later
  7 | * c++14
  8 | * [zlib](https://zlib.net/)
  9 | 
 10 | ### Compile and install automatically
 11 | ```bash
 12 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 13 | cd RabbitTClust
 14 | ./install.sh
 15 | ```
 16 | 
 17 | ### Compile and install manually 
 18 | ```bash
 19 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 20 | cd RabbitTClust
 21 | 
 22 | #make rabbitSketch library
 23 | cd RabbitSketch &&
 24 | mkdir -p build && cd build &&
 25 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. &&
 26 | make -j8 && make install &&
 27 | cd ../../ &&
 28 | 
 29 | #make rabbitFX library
 30 | cd RabbitFX && 
 31 | mkdir -p build && cd build &&
 32 | cmake -DCMAKE_INSTALL_PREFIX=. .. &&
 33 | make -j8 && make install && 
 34 | cd ../../ &&
 35 | 
 36 | #compile the clust-greedy
 37 | mkdir -p build && cd build &&
 38 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 
 39 | make -j8 && make install &&
 40 | cd ../ &&
 41 | 
 42 | #compile the clust-mst
 43 | cd build &&
 44 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. &&
 45 | make -j8 && make install &&
 46 | cd ../ 
 47 | ```
 48 | 
 49 | ## Usage
 50 | ```bash
 51 | usage: clust-mst [-h] [-l] [-t] <int> [-d] <double> [-F] <string> [-i] <string> [-o] <string>
 52 | usage: clust-mst [-h] [-f] [-E] [-d] <double> [-i] <string> <string> [-o] <string>
 53 | usage: clust-greedy [-h] [-l] [-t] <int> [-d] <double> [-F] <string> [-i] <string> [-o] <string>
 54 | usage: clust-greedy [-h] [-f] [-d] <double> [-i] <string> <string> [-o] <string>
 55 | -h         : this help message
 56 | -k <int>   : set kmer size, default 21, for both clust-mst and clust-greedy
 57 | -s <int>   : set sketch size, default 1000, for both clust-mst and clust-greedy
 58 | -c <int>   : set sampling ratio to compute variable sketchSize, sketchSize = genomeSize/samplingRatio, only support with MinHash sketch function, for clust-greedy
 59 | -d <double>: set the distance threshold, default 0.05 for both clust-mst and clust-greedy
 60 | -t <int>   : set the thread number, default take full usage of platform cores number, for both clust-mst and clust-greedy
 61 | -l         : input is a file list, not a single genome file. Lines in the input file list specify paths to genome files, one per line, for both clust-mst and clust-greedy
 62 | -i <string>: path of input file. One file list or single genome file. Two input file with -f and -E option
 63 | -f         : two input files, genomeInfo and MSTInfo files for clust-mst; genomeInfo and sketchInfo files for clust-greedy 
 64 | -E         : two input files, genomeInfo and sketchInfo for clust-mst
 65 | -F <string>: set the sketch function, including MinHash and KSSD, default MinHash, for both clust-mst and clust-greedy
 66 | -o <string>: path of output file, for both clust-mst and clust-greedy
 67 | -e         : not save the intermediate file generated from the origin genome file, such as the GenomeInfo, MSTInfo, and SketchInfo files, for both clust-mst and clust-greedy
 68 | 
 69 | ```
 70 | 
 71 | ## Example:
 72 | ```bash
 73 | #input is a file list, one genome path per line:
 74 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust
 75 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust
 76 | 
 77 | #input is a single genome file in FASTA format, one genome as a sequence:
 78 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust
 79 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust
 80 | 
 81 | #the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options.
 82 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust
 83 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust
 84 | 
 85 | 
 86 | #for redundancy detection with clust-greedy, input is a genome file list:
 87 | #use -d to specify the distance threshold corresponding to various degrees of redundancy.
 88 | ./clust-greedy -d 0.001 -l -i bacteriaList -o bacteria.out
 89 | 
 90 | #for generating clusters from an existing MST with a distance threshold of 0.045:
 91 | #ATTENTION: the -f option must come before the -i option
 92 | ./clust-mst -d 0.045 -f -i bact_refseq.list.MinHashGenomeInfo bact_refseq.list.MinHashMSTInfo -o bact_refseq.mst.d.045.clust
 93 | 
 94 | #for generating clusters from existing sketches of clust-greedy with a distance threshold of 0.001:
 95 | #ATTENTION: the -f option must come before the -i option
 96 | ./clust-greedy -d 0.001 -f -i bact_genbank.list.MinHashGenomeInfo bact_genbank.list.MinHashSketchInfo -o bact_genbank.greedy.d.001.clust
 97 | 
 98 | ```
 99 | ## Output
100 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*).  
101 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome.
102 | 
103 | #### Output format for a FASTA file list input
104 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are:
105 | * local index in a cluster
106 | * global index of the genome
107 | * genome length
108 | * genome file name (including genome assembly accession number)
109 | * sequence name (first sequence in the genome file)
110 | * sequence comment (remaining part of the line)
111 | 
112 | **Example:**
113 | ```txt
114 | the cluster 0 is:
115 |     0   0   14782125nt  bacteria/GCF_000418325.1_ASM41832v1_genomic.fna     NC_021658.1     Sorangium cellulosum So0157-2, complete sequence
116 |     1   1   14598830nt  bacteria/GCF_004135755.1_ASM413575v1_genomic.fna    NZ_CP012672.1   Sorangium cellulosum strain So ce836 chromosome, complete genome
117 | 
118 | the cluster 1 is:
119 |     0   2   14557589nt  bacteria/GCF_002950945.1_ASM295094v1_genomic.fna    NZ_CP012673.1   Sorangium cellulosum strain So ce26 chromosome, complete genome
120 | 
121 | the cluster 2 is:
122 |     0   3   13673866nt  bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna   NZ_JAHKRM010000001.1    Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence
123 | 
124 | ......
125 | ```
126 | 
127 | #### Output format for a single FASTA file input
128 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are:
129 | * local index in a cluster
130 | * global index of the genome
131 | * genome length
132 | * sequence name 
133 | * sequence comment (remaining part of this line)
134 | 
135 | **Example:**
136 | ```txt
137 | the cluster 0 is:
138 |     0   0   11030030nt  NZ_GG657755.1   Streptomyces  himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence
139 |     1   1   11008137nt  NZ_RIBZ01000339.1   Streptomyces  sp. NEAU-LD23 C2041, whole genome shotgun sequence
140 | 
141 | the cluster 1 is:
142 |     0   2   11006208nt  NZ_KL647031.1   Nonomuraea  candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence
143 |     
144 | the cluster 2 is:
145 |     0   3   10940472nt  NZ_VTHK01000001.1   Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence
146 | 
147 | ......
148 | ```
149 | 
150 | 
151 | # Bug Report
152 | All bug reports, comments and suggestions are welcome.
153 | 
154 | ## Cite
155 | [Xu, X. et al. (2022). RabbitTClust: enabling fast clustering analysis of
156 | millions bacteria genomes with minhash sketches. bioRxiv.](https://doi.org/10.1101/2022.10.13.512052)
157 | 


--------------------------------------------------------------------------------
/version_history/v.2.1.0.md:
--------------------------------------------------------------------------------
  1 | # `RabbitTClust v.2.1.0`
  2 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations.
  3 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms.
  4 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 
  5 | 
  6 | ## Installation
  7 | RabbitTClust version 2.1.0 can only support 64-bit Linux Systems.
  8 | 
  9 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document.
 10 | 
 11 | ### Dependency
 12 | * cmake v.3.0 or later
 13 | * c++14
 14 | * [zlib](https://zlib.net/)
 15 | 
 16 | ### Compile and install automatically
 17 | ```bash
 18 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 19 | cd RabbitTClust
 20 | ./install.sh
 21 | ```
 22 | 
 23 | ### Compile and install manually 
 24 | ```bash
 25 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 26 | cd RabbitTClust
 27 | 
 28 | #make rabbitSketch library
 29 | cd RabbitSketch &&
 30 | mkdir -p build && cd build &&
 31 | cmake -DCXXAPI=ON -DCMAKE_INSTALL_PREFIX=. .. &&
 32 | make -j8 && make install &&
 33 | cd ../../ &&
 34 | 
 35 | #make rabbitFX library
 36 | cd RabbitFX && 
 37 | mkdir -p build && cd build &&
 38 | cmake -DCMAKE_INSTALL_PREFIX=. .. &&
 39 | make -j8 && make install && 
 40 | cd ../../ &&
 41 | 
 42 | #compile the clust-greedy
 43 | mkdir -p build && cd build &&
 44 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=ON .. && 
 45 | make -j8 && make install &&
 46 | cd ../ &&
 47 | 
 48 | #compile the clust-mst
 49 | cd build &&
 50 | cmake -DUSE_RABBITFX=ON -DUSE_GREEDY=OFF .. &&
 51 | make -j8 && make install &&
 52 | cd ../ 
 53 | ```
 54 | 
 55 | ## Usage
 56 | ```bash
 57 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust
 58 | Usage: ./clust-mst [OPTIONS]
 59 | Options:
 60 |   -h,--help                   Print this help message and exit
 61 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 62 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 63 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress
 64 |   -k,--kmer-size INT          set the kmer size
 65 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 66 |   -l,--inputlist              input is genome list, one genome per line
 67 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 68 |   -d,--threshold FLOAT        set the distance threshold for clustering
 69 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 70 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 71 |   -i,--input TEXT             set the input file
 72 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 73 |   --premsted TEXT             clustering by the pre-generated mst files rather than genomes for clust-mst
 74 | 
 75 | # clust-greedy, greedy incremental clustering module for RabbitTClust
 76 | Usage: ./clust-greedy [OPTIONS]
 77 | Options:
 78 |   -h,--help                   Print this help message and exit
 79 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 80 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 81 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress
 82 |   -k,--kmer-size INT          set the kmer size
 83 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 84 |   -l,--inputlist              input is genome list, one genome per line
 85 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 86 |   -d,--threshold FLOAT        set the distance threshold for clustering
 87 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 88 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 89 |   -i,--input TEXT             set the input file
 90 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 91 | ```
 92 | 
 93 | ## Example:
 94 | ```bash
 95 | #input is a file list, one genome path per line:
 96 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust
 97 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust
 98 | 
 99 | #input is a single genome file in FASTA format, one genome as a sequence:
100 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust
101 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust
102 | 
103 | #the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options.
104 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust
105 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust
106 | 
107 | 
108 | #for redundancy detection with clust-greedy, input is a genome file list:
109 | #use -d to specify the distance threshold corresponding to various degrees of redundancy.
110 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out
111 | 
112 | #the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15.
113 | #this folder contains the sketch and MST files.
114 | #to generate clusters from the existing MST with a distance threshold of 0.045:
115 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15 -o bacteria.mst.d.045.clust
116 | #to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045:
117 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15 -o bacteria.mst.d.045.clust
118 | 
119 | #to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001:
120 | # folder 2023_05_06_08-49-15 contains the sketch files.
121 | ./clust-greedy -d 0.001 --presketched 2023_05_06_08-49-15 -o bact_genbank.greedy.d.001.clust
122 | ```
123 | ## Output
124 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*).  
125 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome.
126 | 
127 | #### Output format for a FASTA file list input
128 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are:
129 | * local index in a cluster
130 | * global index of the genome
131 | * genome length
132 | * genome file name (including genome assembly accession number)
133 | * sequence name (first sequence in the genome file)
134 | * sequence comment (remaining part of the line)
135 | 
136 | **Example:**
137 | ```txt
138 | the cluster 0 is:
139 |     0   0   14782125nt  bacteria/GCF_000418325.1_ASM41832v1_genomic.fna     NC_021658.1     Sorangium cellulosum So0157-2, complete sequence
140 |     1   1   14598830nt  bacteria/GCF_004135755.1_ASM413575v1_genomic.fna    NZ_CP012672.1   Sorangium cellulosum strain So ce836 chromosome, complete genome
141 | 
142 | the cluster 1 is:
143 |     0   2   14557589nt  bacteria/GCF_002950945.1_ASM295094v1_genomic.fna    NZ_CP012673.1   Sorangium cellulosum strain So ce26 chromosome, complete genome
144 | 
145 | the cluster 2 is:
146 |     0   3   13673866nt  bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna   NZ_JAHKRM010000001.1    Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence
147 | 
148 | ......
149 | ```
150 | 
151 | #### Output format for a single FASTA file input
152 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are:
153 | * local index in a cluster
154 | * global index of the genome
155 | * genome length
156 | * sequence name 
157 | * sequence comment (remaining part of this line)
158 | 
159 | **Example:**
160 | ```txt
161 | the cluster 0 is:
162 |     0   0   11030030nt  NZ_GG657755.1   Streptomyces  himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence
163 |     1   1   11008137nt  NZ_RIBZ01000339.1   Streptomyces  sp. NEAU-LD23 C2041, whole genome shotgun sequence
164 | 
165 | the cluster 1 is:
166 |     0   2   11006208nt  NZ_KL647031.1   Nonomuraea  candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence
167 |     
168 | the cluster 2 is:
169 |     0   3   10940472nt  NZ_VTHK01000001.1   Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence
170 | 
171 | ......
172 | ```
173 | 
174 | 
175 | # Bug Report
176 | We highly appreciate all bug reports, comments, and suggestions from our users.  
177 | Please feel free to raise any concerns or feedback by opening an `issue` on GitHub.
178 | 
179 | ## Cite
180 | [Xu, X. et al. (2022). RabbitTClust: enabling fast clustering analysis of
181 | millions bacteria genomes with minhash sketches. bioRxiv.](https://doi.org/10.1101/2022.10.13.512052)
182 | 


--------------------------------------------------------------------------------
/version_history/v.2.2.0.md:
--------------------------------------------------------------------------------
  1 | # `RabbitTClust v.2.2.0`
  2 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations.
  3 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms.
  4 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 
  5 | 
  6 | ## Installation
  7 | `RabbitTClust v.2.2.0` can only support 64-bit Linux Systems.
  8 | 
  9 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document.
 10 | 
 11 | ### Dependencies
 12 | * cmake v.3.0 or later
 13 | * c++14
 14 | * [zlib](https://zlib.net/)
 15 | 
 16 | ### Compile and install
 17 | ```bash
 18 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 19 | cd RabbitTClust
 20 | ./install.sh
 21 | ```
 22 | ## Usage
 23 | ```bash
 24 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust
 25 | Usage: ./clust-mst [OPTIONS]
 26 | Options:
 27 |   -h,--help                   Print this help message and exit
 28 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 29 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 30 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 31 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 32 |   -l,--list                   input is genome list, one genome per line
 33 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 34 |   -d,--threshold FLOAT        set the distance threshold for clustering
 35 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 36 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 37 |   -i,--input TEXT Excludes: --append
 38 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 39 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 40 |   --premsted TEXT             clustering by the pre-generated mst files rather than genomes for clust-mst
 41 |   --append TEXT Excludes: --input
 42 |                               append genome file or file list with the pre-generated sketch or MST files
 43 | 
 44 | # clust-greedy, greedy incremental clustering module for RabbitTClust
 45 | Usage: ./clust-greedy [OPTIONS]
 46 | Options:
 47 |   -h,--help                   Print this help message and exit
 48 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 49 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 50 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 51 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 52 |   -l,--list                   input is genome list, one genome per line
 53 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 54 |   -d,--threshold FLOAT        set the distance threshold for clustering
 55 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 56 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 57 |   -i,--input TEXT Excludes: --append
 58 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 59 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 60 |   --append TEXT Excludes: --input
 61 |                               append genome file or file list with the pre-generated sketch or MST files
 62 | ```
 63 | 
 64 | ## Example:
 65 | ```bash
 66 | # input is a file list, one genome path per line:
 67 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust
 68 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust
 69 | 
 70 | # input is a single genome file in FASTA format, one genome as a sequence:
 71 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust
 72 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust
 73 | 
 74 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options.
 75 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust
 76 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust
 77 | 
 78 | 
 79 | # for redundancy detection with clust-greedy, input is a genome file list:
 80 | # use -d to specify the distance threshold corresponding to various degrees of redundancy.
 81 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out
 82 | 
 83 | # v.2.1.0 or later
 84 | # the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15.
 85 | # this folder contains the sketch and MST files.
 86 | # to generate clusters from the existing MST with a distance threshold of 0.045:
 87 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
 88 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045:
 89 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
 90 | 
 91 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001:
 92 | # folder 2023_05_06_09-37-23 contains the sketch files.
 93 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust
 94 | 
 95 | # v.2.2.0 or later
 96 | # to cluster incrementally, start from the existing sketches of genome set A (the --presketched folder) and append a new genome set (genome_B.list):
 97 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust
 98 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust
 99 | ```
100 | ## Output
101 | The output file is in a CD-HIT output format and is slightly different when running with varying input options (*-l* and *-i*).  
102 | Option *-l* means input as a FASTA file list, one file per genome, and *-i* means input as a single FASTA file, one sequence per genome.
103 | 
104 | #### Output format for a FASTA file list input
105 | With *-l* option, the tab-delimited values in the lines beginning with tab delimiters are:
106 | * local index in a cluster
107 | * global index of the genome
108 | * genome length
109 | * genome file name (including genome assembly accession number)
110 | * sequence name (first sequence in the genome file)
111 | * sequence comment (remaining part of the line)
112 | 
113 | **Example:**
114 | ```txt
115 | the cluster 0 is:
116 |     0   0   14782125nt  bacteria/GCF_000418325.1_ASM41832v1_genomic.fna     NC_021658.1     Sorangium cellulosum So0157-2, complete sequence
117 |     1   1   14598830nt  bacteria/GCF_004135755.1_ASM413575v1_genomic.fna    NZ_CP012672.1   Sorangium cellulosum strain So ce836 chromosome, complete genome
118 | 
119 | the cluster 1 is:
120 |     0   2   14557589nt  bacteria/GCF_002950945.1_ASM295094v1_genomic.fna    NZ_CP012673.1   Sorangium cellulosum strain So ce26 chromosome, complete genome
121 | 
122 | the cluster 2 is:
123 |     0   3   13673866nt  bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna   NZ_JAHKRM010000001.1    Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence
124 | 
125 | ......
126 | ```
127 | 
128 | #### Output format for a single FASTA file input
129 | With *-i* option, the tab-delimited values in the lines beginning with tab delimiters are:
130 | * local index in a cluster
131 | * global index of the genome
132 | * genome length
133 | * sequence name 
134 | * sequence comment (remaining part of this line)
135 | 
136 | **Example:**
137 | ```txt
138 | the cluster 0 is:
139 |     0   0   11030030nt  NZ_GG657755.1   Streptomyces  himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence
140 |     1   1   11008137nt  NZ_RIBZ01000339.1   Streptomyces  sp. NEAU-LD23 C2041, whole genome shotgun sequence
141 | 
142 | the cluster 1 is:
143 |     0   2   11006208nt  NZ_KL647031.1   Nonomuraea  candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence
144 |     
145 | the cluster 2 is:
146 |     0   3   10940472nt  NZ_VTHK01000001.1   Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence
147 | 
148 | ......
149 | ```
150 | 
151 | 
152 | # Bug Report
153 | We highly appreciate all bug reports, comments, and suggestions from our users.  
154 | Please feel free to raise any concerns or feedback by opening an `issue` on GitHub.
155 | 
156 | ## Cite
157 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6
158 | 


--------------------------------------------------------------------------------
/version_history/v.2.2.1.md:
--------------------------------------------------------------------------------
  1 | ![RabbitTClust](../rabbittclust.png)
  2 | 
  3 | # `RabbitTClust v.2.2.1`
  4 | RabbitTClust is a fast and memory-efficient genome clustering tool based on sketch-based distance estimations.
  5 | It enables processing of large-scale datasets by combining dimensionality reduction techniques with streaming and parallelization on modern multi-core platforms.
  6 | RabbitTClust supports classical single-linkage hierarchical (clust-mst) and greedy incremental clustering (clust-greedy) algorithms for different scenarios. 
  7 | 
  8 | ## Installation
  9 | `RabbitTClust v.2.2.1` can only support 64-bit Linux Systems.
 10 | 
 11 | The detailed update information for this version, as well as the version history, can be found in the [`version_history`](./history.md) document.
 12 | 
 13 | ### Dependencies
 14 | * cmake v.3.0 or later
 15 | * c++14
 16 | * [zlib](https://zlib.net/)
 17 | 
 18 | ### Compile and install
 19 | ```bash
 20 | git clone --recursive https://github.com/RabbitBio/RabbitTClust.git
 21 | cd RabbitTClust
 22 | ./install.sh
 23 | ```
 24 | ## Usage
 25 | ```bash
 26 | # clust-mst, minimum-spanning-tree-based module for RabbitTClust
 27 | Usage: ./clust-mst [OPTIONS]
 28 | Options:
 29 |   -h,--help                   Print this help message and exit
 30 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 31 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 32 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 33 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 34 |   -l,--list                   input is genome list, one genome per line
 35 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 36 |   -d,--threshold FLOAT        set the distance threshold for clustering
 37 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 38 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 39 |   -i,--input TEXT Excludes: --append
 40 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 41 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 42 |   --premsted TEXT             clustering by the pre-generated mst files rather than genomes for clust-mst
 43 |   --newick-tree               output the newick tree format file for clust-mst
 44 |   --append TEXT Excludes: --input
 45 |                               append genome file or file list with the pre-generated sketch or MST files
 46 | 
 47 | # clust-greedy, greedy incremental clustering module for RabbitTClust
 48 | Usage: ./clust-greedy [OPTIONS]
 49 | Options:
 50 |   -h,--help                   Print this help message and exit
 51 |   -t,--threads INT            set the thread number, default all CPUs of the platform
 52 |   -m,--min-length UINT        set the filter minimum length (minLen), genome length less than minLen will be ignore, default 10,000
 53 |   -c,--containment INT        use AAF distance with containment coefficient, set the containCompress, the sketch size is in proportion with 1/containCompress  -k,--kmer-size INT          set the kmer size
 54 |   -s,--sketch-size INT        set the sketch size for Jaccard Index and Mash distance, default 1000
 55 |   -l,--list                   input is genome list, one genome per line
 56 |   -e,--no-save                not save the intermediate files, such as sketches or MST
 57 |   -d,--threshold FLOAT        set the distance threshold for clustering
 58 |   -F,--function TEXT          set the sketch function, such as MinHash, KSSD, default MinHash
 59 |   -o,--output TEXT REQUIRED   set the output name of cluster result
 60 |   -i,--input TEXT Excludes: --append
 61 |                               set the input file, single FASTA genome file (without -l option) or genome list file (with -l option)
 62 |   --presketched TEXT          clustering by the pre-generated sketch files rather than genomes
 63 |   --append TEXT Excludes: --input
 64 |                               append genome file or file list with the pre-generated sketch or MST files
 65 | ```
 66 | 
 67 | ## Example:
 68 | ```bash
 69 | # input is a file list, one genome path per line:
 70 | ./clust-mst -l -i bact_refseq.list -o bact_refseq.mst.clust
 71 | ./clust-greedy -l -i bact_genbank.list -o bact_genbank.greedy.clust
 72 | 
 73 | # input is a single genome file in FASTA format, one genome as a sequence:
 74 | ./clust-mst -i bacteria.fna -o bacteria.mst.clust
 75 | ./clust-greedy -i bacteria.fna -o bacteria.greedy.clust
 76 | 
 77 | # the sketch size (reciprocal of sampling proportion), kmer size, and distance threshold can be specified by -s (-c), -k, and -d options.
 78 | ./clust-mst -l -k 21 -s 1000 -d 0.05 -i bact_refseq.list -o bact_refseq.mst.clust
 79 | ./clust-greedy -l -k 21 -c 1000 -d 0.05 -i bact_genbank.list -o bact_genbank.greedy.clust
 80 | 
 81 | 
 82 | # for redundancy detection with clust-greedy, input is a genome file list:
 83 | # use -d to specify the distance threshold corresponding to various degrees of redundancy.
 84 | ./clust-greedy -d 0.001 -l -i bacteria.list -o bacteria.out
 85 | 
 86 | # v.2.1.0 or later
 87 | # the last run of clust-mst generated a folder named in year_month_day_hour-minute-second format, such as 2023_05_06_08-49-15.
 88 | # this folder contains the sketch and MST files.
 89 | # to generate clusters from the existing MST with a distance threshold of 0.045:
 90 | ./clust-mst -d 0.045 --premsted 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
 91 | # to generate clusters from the existing sketch files of clust-mst with a distance threshold of 0.045:
 92 | ./clust-mst -d 0.045 --presketched 2023_05_06_08-49-15/ -o bact_refseq.mst.d.045.clust
 93 | 
 94 | # to generate clusters from the existing sketches of clust-greedy with a distance threshold of 0.001:
 95 | # folder 2023_05_06_09-37-23 contains the sketch files.
 96 | ./clust-greedy -d 0.001 --presketched 2023_05_06_09-37-23/ -o bact_genbank.greedy.d.001.clust
 97 | 
 98 | # v.2.2.0 or later
 99 | # to cluster incrementally, start from the existing sketches of genome set A (the --presketched folder) and append a new genome set (genome_B.list):
100 | ./clust-mst --presketched 2023_05_06_08-49-15/ -l --append genome_B.list -o append_refseq.mst.clust
101 | ./clust-greedy --presketched 2023_05_06_09-37-23/ -l --append genome_B.list -o append_genbank.greedy.clust
102 | 
103 | # v.2.2.1 or later
104 | # output the newick tree format for clust-mst, use the --newick-tree flag.
105 | ./clust-mst -l -i bacteria.list --newick-tree -o bacteria.mst.clust 
106 | ```
107 | ## Output
108 | The output file is in a CD-HIT output format and is slightly different when running with or without `-l` input option.  
109 | When using the `-l` option, the input is expected to be a FASTA file list, with each file representing a genome. Without the `-l` option, the input should be a single FASTA file, with each sequence representing a genome.
110 | 
111 | #### Output format for a FASTA file list input
112 | With the `-l` option, the tab-delimited values in the lines beginning with tab delimiters are:
113 | * local index in a cluster
114 | * global index of the genome
115 | * genome length
116 | * genome file name (including genome assembly accession number)
117 | * sequence name (first sequence in the genome file)
118 | * sequence comment (remaining part of the line)
119 | 
120 | **Example:**
121 | ```txt
122 | the cluster 0 is:
123 |     0   0   14782125nt  bacteria/GCF_000418325.1_ASM41832v1_genomic.fna     NC_021658.1     Sorangium cellulosum So0157-2, complete sequence
124 |     1   1   14598830nt  bacteria/GCF_004135755.1_ASM413575v1_genomic.fna    NZ_CP012672.1   Sorangium cellulosum strain So ce836 chromosome, complete genome
125 | 
126 | the cluster 1 is:
127 |     0   2   14557589nt  bacteria/GCF_002950945.1_ASM295094v1_genomic.fna    NZ_CP012673.1   Sorangium cellulosum strain So ce26 chromosome, complete genome
128 | 
129 | the cluster 2 is:
130 |     0   3   13673866nt  bacteria/GCF_019396345.1_ASM1939634v1_genomic.fna   NZ_JAHKRM010000001.1    Nonomuraea guangzhouensis strain CGMCC 4.7101 NODE_1, whole genome shotgun sequence
131 | 
132 | ......
133 | ```
134 | 
135 | #### Output format for a single FASTA file input
136 | Without `-l` option, the tab-delimited values in the lines beginning with tab delimiters are:
137 | * local index in a cluster
138 | * global index of the genome
139 | * genome length
140 | * sequence name 
141 | * sequence comment (remaining part of this line)
142 | 
143 | **Example:**
144 | ```txt
145 | the cluster 0 is:
146 |     0   0   11030030nt  NZ_GG657755.1   Streptomyces  himastatinicus ATCC 53653 supercont1.2, whole genome shotgun sequence
147 |     1   1   11008137nt  NZ_RIBZ01000339.1   Streptomyces  sp. NEAU-LD23 C2041, whole genome shotgun sequence
148 | 
149 | the cluster 1 is:
150 |     0   2   11006208nt  NZ_KL647031.1   Nonomuraea  candida strain NRRL B-24552 Doro1_scaffold1, whole genome shotgun sequence
151 |     
152 | the cluster 2 is:
153 |     0   3   10940472nt  NZ_VTHK01000001.1   Amycolatopsis anabasis strain EGI 650086 RDPYD18112716_A.Scaf1, whole genome shotgun sequence
154 | 
155 | ......
156 | ```
157 | 
158 | #### Output the newick tree format (v.2.2.1 or later)
159 | When the `--newick-tree` option is used, an additional output file will be generated in the Newick tree format with a suffix name of ".newick.tree".
160 | 
161 | 
162 | # Bug Report
163 | We highly appreciate all bug reports, comments, and suggestions from our users.  
164 | Please feel free to raise any concerns or feedback by opening an `issue` on GitHub.
165 | 
166 | ## Cite
167 | Xu, X., Yin, Z., Yan, L. et al. RabbitTClust: enabling fast clustering analysis of millions of bacteria genomes with MinHash sketches. Genome Biol 24, 121 (2023). https://doi.org/10.1186/s13059-023-02961-6
168 | 


--------------------------------------------------------------------------------