├── .gitmodules ├── CMakeLists.txt ├── CMakeLists_zlib.txt.in ├── Dockerfile ├── Doxyfile ├── LICENSE.txt ├── README.md ├── api_examples ├── Makefile ├── api_example.cpp └── sequences.fna ├── build └── .keep ├── doc └── html │ ├── classsbwt_1_1Argv-members.html │ ├── classsbwt_1_1Argv.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Argv-members.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Argv.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Disk__Instream-members.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Disk__Instream.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Kmer__stream__from__KMC__DB-members.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Kmer__stream__from__KMC__DB.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Node__stream__merger-members.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1Node__stream__merger.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1SimpleSortedKmerDB-members.html │ ├── classsbwt_1_1KMC__construction__helper__classes_1_1SimpleSortedKmerDB.html │ ├── classsbwt_1_1Kmer-members.html │ ├── classsbwt_1_1Kmer.html │ ├── classsbwt_1_1NodeBOSSInMemoryConstructor-members.html │ ├── classsbwt_1_1NodeBOSSInMemoryConstructor.html │ ├── classsbwt_1_1NodeBOSSKMCConstructor-members.html │ ├── classsbwt_1_1NodeBOSSKMCConstructor.html │ ├── classsbwt_1_1Progress__printer-members.html │ ├── classsbwt_1_1Progress__printer.html │ ├── classsbwt_1_1SBWT-members.html │ ├── classsbwt_1_1SBWT.html │ ├── classsbwt_1_1SubsetConcatRank-members.html │ ├── classsbwt_1_1SubsetConcatRank.html │ ├── classsbwt_1_1SubsetMatrixRank-members.html │ ├── classsbwt_1_1SubsetMatrixRank.html │ ├── classsbwt_1_1SubsetMatrixSelectSupport-members.html │ ├── classsbwt_1_1SubsetMatrixSelectSupport.html │ ├── classsbwt_1_1SubsetSplitRank-members.html │ ├── classsbwt_1_1SubsetSplitRank.html │ ├── classsbwt_1_1SubsetWT-members.html │ ├── 
classsbwt_1_1SubsetWT.html │ ├── classsbwt_1_1Temp__File__Manager-members.html │ ├── classsbwt_1_1Temp__File__Manager.html │ ├── classsbwt_1_1kmer__colex__compare-members.html │ ├── classsbwt_1_1kmer__colex__compare.html │ ├── classsbwt_1_1mod__ef__vector-members.html │ ├── classsbwt_1_1mod__ef__vector.html │ ├── classsbwt_1_1rank__support__mod__ef-members.html │ ├── classsbwt_1_1rank__support__mod__ef.html │ ├── classsbwt_1_1throwing__ifstream-members.html │ ├── classsbwt_1_1throwing__ifstream.html │ ├── classsbwt_1_1throwing__ofstream-members.html │ ├── classsbwt_1_1throwing__ofstream.html │ ├── doxygen.css │ ├── dynsections.js │ ├── jquery.js │ ├── menu.js │ ├── menudata.js │ ├── structsbwt_1_1SBWT_1_1BuildConfig-members.html │ ├── structsbwt_1_1SBWT_1_1BuildConfig.html │ ├── tabs.css │ └── variants_8hh_source.html ├── ecoli_dataset ├── README.md └── coli_files.txt ├── example_data ├── coli3.fna └── queries.fastq ├── include └── sbwt │ ├── EM_sort │ ├── Block.hh │ ├── EM_sort.hh │ ├── ParallelBoundedQueue.hh │ ├── bit_level_stuff.hh │ └── generic_EM_classes.hh │ ├── Kmer.hh │ ├── MEF.hpp │ ├── NodeBOSSInMemoryConstructor.hh │ ├── SBWT.hh │ ├── SubsetConcatRank.hh │ ├── SubsetMatrixRank.hh │ ├── SubsetMatrixSelectSupport.hh │ ├── SubsetSplitRank.hh │ ├── SubsetWT.hh │ ├── TempFileManager.hh │ ├── commands.hh │ ├── cxxopts.hpp │ ├── globals.hh │ ├── kmc_construct.hh │ ├── kmc_construct_helper_classes.hh │ ├── run_kmc.hh │ ├── stdlib_printing.hh │ ├── suffix_group_optimization.hh │ ├── throwing_streams.hh │ ├── variants.hh │ └── zstr │ ├── strict_fstream.hpp │ └── zstr.hpp ├── src ├── CLI │ ├── sbwt.cpp │ ├── sbwt_ascii_export.cpp │ ├── sbwt_build.cpp │ ├── sbwt_build_from_plain_matrix.cpp │ └── sbwt_search.cpp ├── EM_sort │ ├── Block.cpp │ └── EM_sort.cpp ├── globals.cpp ├── kmc_construct_helper_classes.cpp ├── run_kmc.cpp └── suffix_group_optimization.cpp ├── temp └── .keep ├── tests ├── query_benchmark.cpp ├── setup_tests.hh ├── test_CLI.hh ├── test_EM_sort.hh 
├── test_kmer.hh ├── test_large.hh ├── test_main.cpp ├── test_misc.hh └── test_small.hh └── version.h.in /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sdsl-lite"] 2 | path = sdsl-lite 3 | url = https://github.com/iosfwd/sdsl-lite/ 4 | [submodule "googletest"] 5 | path = googletest 6 | url = https://github.com/iosfwd/googletest 7 | [submodule "KMC"] 8 | path = KMC 9 | url = https://github.com/algbio/KMC 10 | [submodule "SeqIO"] 11 | path = SeqIO 12 | url = https://github.com/jnalanko/SeqIO 13 | -------------------------------------------------------------------------------- /CMakeLists_zlib.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.2) 2 | 3 | project(zlib-get NONE) 4 | include(ExternalProject) 5 | 6 | ExternalProject_Add(zlib-download 7 | GIT_REPOSITORY https://github.com/madler/zlib.git 8 | GIT_TAG v1.2.11 9 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/zlib" 10 | BUILD_IN_SOURCE 1 11 | CONFIGURE_COMMAND ${CMAKE_BINARY_DIR}/external/zlib/configure --static 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | # Set some timezone or otherwise tzdata hangs the build. 4 | ENV TZ=Asia/Dubai 5 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 6 | 7 | RUN apt-get update && apt-get install -y g++ gcc cmake git python3-dev g++-10 libz-dev libbz2-dev 8 | 9 | RUN git clone https://github.com/algbio/SBWT 10 | WORKDIR /SBWT 11 | RUN git checkout master 12 | 13 | WORKDIR /SBWT/build 14 | RUN cmake .. 
-DCMAKE_CXX_COMPILER=g++-10 -DMAX_KMER_LENGTH=32 -DBUILD_TESTS=1 15 | RUN make -j8 16 | run /SBWT/build/bin/sbwt 17 | -------------------------------------------------------------------------------- /api_examples/Makefile: -------------------------------------------------------------------------------- 1 | # Yeah, this include and linking process is a bit messy currently, sorry about that. 2 | 3 | SDSL_INCLUDES=-I ../build/external/sdsl-lite/build/include 4 | DIVSUFSORT_INCLUDES=-I ../build/external/sdsl-lite/build/external/libdivsufsort/include 5 | SEQIO_INCLUDES=-I ../build/external/SeqIO/include 6 | SBWT_INCLUDES=-I ../include -I ../include/sbwt 7 | 8 | ALL_INCLUDES=${SDSL_INCLUDES} ${DIVSUFSORT_INCLUDES} ${SEQIO_INCLUDES} ${SBWT_INCLUDES} 9 | 10 | SBWT_LIBS=-L $(shell pwd)/../build/external/sdsl-lite/build/lib/ 11 | KMC_OBJECTS=../build/external/KMC/build/libkmc_core.a ../build/external/KMC/build/libkmc_tools.a 12 | 13 | api_example: 14 | ${CXX} -g -std=c++17 api_example.cpp ../build/libsbwt_static.a ${KMC_OBJECTS} ${ALL_INCLUDES} ${SBWT_LIBS} -lsdsl -lz -o api_example -Wno-deprecated-declarations -pthread -O3 15 | 16 | 17 | -------------------------------------------------------------------------------- /api_examples/api_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "throwing_streams.hh" 3 | #include "globals.hh" 4 | #include "variants.hh" 5 | #include "SeqIO/SeqIO.hh" 6 | #include "SubsetMatrixSelectSupport.hh" 7 | 8 | using namespace std; 9 | using namespace sbwt; 10 | 11 | int main(int argc, char** argv){ 12 | 13 | int64_t k = 6; 14 | 15 | // Build the index 16 | plain_matrix_sbwt_t::BuildConfig config; 17 | config.k = k; 18 | config.build_streaming_support = true; // One extra bit vector for speeding up positive streaming queries 19 | config.precalc_k = 4; // Speed up search by precalculating all 4^p p-mer intervals 20 | config.input_files = {"sequences.fna"}; 21 | config.n_threads = 
4; 22 | config.ram_gigas = 4; 23 | plain_matrix_sbwt_t sbwt(config); 24 | 25 | // Search for k-mer GATGGC 26 | cout << sbwt.search("GATGGC") << endl; 27 | 28 | // Search for all k-mers of TAATGCTGTAGC 29 | for(int64_t colex_rank : sbwt.streaming_search("TAATGCTGTAGC")){ 30 | cout << colex_rank << endl; 31 | } 32 | 33 | // Dump all k-mers out of the data structure at once (fast) 34 | string kmer_dump = sbwt.reconstruct_all_kmers(); 35 | for(int64_t i = 0; i < kmer_dump.size(); i += k){ 36 | string kmer = kmer_dump.substr(i, k); 37 | 38 | // If the k-mer is not a dummy k-mer, print it 39 | if(kmer[0] != '$') cout << kmer << endl; 40 | } 41 | cout << "--" << endl; 42 | 43 | // List k-mers one by one 44 | SubsetMatrixSelectSupport select_support(sbwt.get_subset_rank_structure()); 45 | 46 | vector buf(k+1); // The k-mer will be written here 47 | for(int64_t i = 0; i < sbwt.number_of_subsets(); i++){ 48 | sbwt.get_kmer_fast(i, buf.data(), select_support); 49 | 50 | // If the k-mer is not a dummy k-mer, print it 51 | if(buf[0] != '$') cout << buf.data() << endl; 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /api_examples/sequences.fna: -------------------------------------------------------------------------------- 1 | >1 2 | CCCGTGATGGCTA 3 | >2 4 | TAATGCTGTAGC 5 | >3 6 | TGGCTCGTGTAGTCGA 7 | -------------------------------------------------------------------------------- /build/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SBWT/b6e683096979774b69b9e15156f2d9863c909edd/build/.keep -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1Argv-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::Argv Member List
70 |
71 |
72 | 73 |

This is the complete list of members for sbwt::Argv, including all inherited members.

74 | 75 | 76 | 77 | 78 | 79 |
Argv(vector< string > v) (defined in sbwt::Argv)sbwt::Argvinline
array (defined in sbwt::Argv)sbwt::Argv
size (defined in sbwt::Argv)sbwt::Argv
~Argv() (defined in sbwt::Argv)sbwt::Argvinline
80 | 81 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1Argv.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::Argv Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 72 |
73 |
sbwt::Argv Class Reference
74 |
75 |
76 | 77 | 79 | 81 | 82 |

78 | Public Member Functions

80 |  Argv (vector< string > v)
 
83 | 85 | 87 | 88 | 90 | 91 |

84 | Public Attributes

86 | char ** array = NULL
 
89 | int64_t size = 0
 
92 |
The documentation for this class was generated from the following file: 95 |
96 | 97 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Argv-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::KMC_construction_helper_classes::Argv Member List
70 |
71 | 80 | 81 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Argv.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::KMC_construction_helper_classes::Argv Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 72 |
73 |
sbwt::KMC_construction_helper_classes::Argv Class Reference
74 |
75 |
76 | 77 | 79 | 81 | 82 |

78 | Public Member Functions

80 |  Argv (vector< string > v)
 
83 | 85 | 87 | 88 | 90 | 91 |

84 | Public Attributes

86 | char ** array = NULL
 
89 | int64_t size = 0
 
92 |
The documentation for this class was generated from the following file: 95 |
96 | 97 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Disk__Instream-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::KMC_construction_helper_classes::Disk_Instream Member List
70 |
71 | 81 | 82 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Disk__Instream.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::KMC_construction_helper_classes::Disk_Instream Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::KMC_construction_helper_classes::Disk_Instream Class Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 | 83 | 84 | 86 | 87 | 89 | 90 |

77 | Public Member Functions

79 |  Disk_Instream (string filename)
 
82 | bool stream_done () const
 
85 | Node stream_next ()
 
88 | Node peek_next ()
 
91 |
The documentation for this class was generated from the following file: 94 |
95 | 96 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Kmer__stream__from__KMC__DB-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::KMC_construction_helper_classes::Kmer_stream_from_KMC_DB Member List
70 |
71 | 80 | 81 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Kmer__stream__from__KMC__DB.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::KMC_construction_helper_classes::Kmer_stream_from_KMC_DB Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::KMC_construction_helper_classes::Kmer_stream_from_KMC_DB Class Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 | 83 | 84 | 86 | 87 |

77 | Public Member Functions

79 |  Kmer_stream_from_KMC_DB (string KMC_db_path, bool add_revcomps)
 
82 | bool done ()
 
85 | Kmer< MAX_KMER_LENGTH > next ()
 
88 |
The documentation for this class was generated from the following file: 91 |
92 | 93 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Node__stream__merger-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::KMC_construction_helper_classes::Node_stream_merger Member List
70 |
71 | 79 | 80 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1KMC__construction__helper__classes_1_1Node__stream__merger.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::KMC_construction_helper_classes::Node_stream_merger Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::KMC_construction_helper_classes::Node_stream_merger Class Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 | 83 | 84 | 86 | 87 |

77 | Public Member Functions

79 |  Node_stream_merger (Disk_Instream &A, Disk_Instream &B)
 
82 | bool stream_done ()
 
85 | Node stream_next ()
 
88 |
The documentation for this class was generated from the following file: 91 |
92 | 93 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1NodeBOSSKMCConstructor-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::NodeBOSSKMCConstructor< nodeboss_t > Member List
70 |
71 |
72 | 73 |

This is the complete list of members for sbwt::NodeBOSSKMCConstructor< nodeboss_t >, including all inherited members.

74 | 75 | 76 | 77 | 78 | 79 |
add_prefixes(kmer_t z, seq_io::Buffered_ofstream<> &out, char *buf) (defined in sbwt::NodeBOSSKMCConstructor< nodeboss_t >)sbwt::NodeBOSSKMCConstructor< nodeboss_t >inline
build(const vector< string > &input_files, nodeboss_t &nodeboss, int64_t k, int64_t n_threads, int64_t ram_gigas, bool streaming_support, int64_t min_abundance, int64_t max_abundance, int64_t precalc_k) (defined in sbwt::NodeBOSSKMCConstructor< nodeboss_t >)sbwt::NodeBOSSKMCConstructor< nodeboss_t >inline
build_bit_vectors_from_sorted_streams(const string &nodefile, const string &dummyfile, sdsl::bit_vector &A_bits_sdsl, sdsl::bit_vector &C_bits_sdsl, sdsl::bit_vector &G_bits_sdsl, sdsl::bit_vector &T_bits_sdsl, sdsl::bit_vector &suffix_group_starts_sdsl, int64_t k) (defined in sbwt::NodeBOSSKMCConstructor< nodeboss_t >)sbwt::NodeBOSSKMCConstructor< nodeboss_t >inline
write_nodes_and_dummies(const string &KMC_db_path, const string &nodes_outfile, const string &dummies_outfile, int64_t n_kmers) (defined in sbwt::NodeBOSSKMCConstructor< nodeboss_t >)sbwt::NodeBOSSKMCConstructor< nodeboss_t >inline
80 | 81 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1NodeBOSSKMCConstructor.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::NodeBOSSKMCConstructor< nodeboss_t > Class Template Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::NodeBOSSKMCConstructor< nodeboss_t > Class Template Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 | 83 | 84 | 86 | 87 | 89 | 90 |

77 | Public Member Functions

79 | void add_prefixes (kmer_t z, seq_io::Buffered_ofstream<> &out, char *buf)
 
82 | void build_bit_vectors_from_sorted_streams (const string &nodefile, const string &dummyfile, sdsl::bit_vector &A_bits_sdsl, sdsl::bit_vector &C_bits_sdsl, sdsl::bit_vector &G_bits_sdsl, sdsl::bit_vector &T_bits_sdsl, sdsl::bit_vector &suffix_group_starts_sdsl, int64_t k)
 
85 | void write_nodes_and_dummies (const string &KMC_db_path, const string &nodes_outfile, const string &dummies_outfile, int64_t n_kmers)
 
88 | void build (const vector< string > &input_files, nodeboss_t &nodeboss, int64_t k, int64_t n_threads, int64_t ram_gigas, bool streaming_support, int64_t min_abundance, int64_t max_abundance, int64_t precalc_k)
 
91 |
The documentation for this class was generated from the following file: 94 |
95 | 96 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1Progress__printer-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::Progress_printer Member List
70 |
71 |
72 | 73 |

This is the complete list of members for sbwt::Progress_printer, including all inherited members.

74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 |
first_print (defined in sbwt::Progress_printer)sbwt::Progress_printer
job_done() (defined in sbwt::Progress_printer)sbwt::Progress_printerinline
n_jobs (defined in sbwt::Progress_printer)sbwt::Progress_printer
next_print (defined in sbwt::Progress_printer)sbwt::Progress_printer
processed (defined in sbwt::Progress_printer)sbwt::Progress_printer
Progress_printer(int64_t n_jobs, int64_t total_prints) (defined in sbwt::Progress_printer)sbwt::Progress_printerinline
total_prints (defined in sbwt::Progress_printer)sbwt::Progress_printer
83 | 84 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1Temp__File__Manager-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::Temp_File_Manager Member List
70 |
71 |
72 | 73 |

This is the complete list of members for sbwt::Temp_File_Manager, including all inherited members.

74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 |
create_filename() (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
create_filename(string prefix) (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
create_filename(string prefix, string suffix) (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
delete_all_files() (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
delete_file(string filename) (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
get_dir() (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
set_dir(string temp_dir) (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
Temp_File_Manager() (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
~Temp_File_Manager() (defined in sbwt::Temp_File_Manager)sbwt::Temp_File_Managerinline
85 | 86 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1Temp__File__Manager.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::Temp_File_Manager Class Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::Temp_File_Manager Class Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 | 83 | 84 | 86 | 87 | 89 | 90 | 92 | 93 | 95 | 96 | 98 | 99 |

77 | Public Member Functions

79 | void set_dir (string temp_dir)
 
82 | string get_dir ()
 
85 | string create_filename ()
 
88 | string create_filename (string prefix)
 
91 | string create_filename (string prefix, string suffix)
 
94 | void delete_file (string filename)
 
97 | void delete_all_files ()
 
100 |
The documentation for this class was generated from the following file: 103 |
104 | 105 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1kmer__colex__compare-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 |
69 |
sbwt::kmer_colex_compare< max_len > Member List
70 |
71 |
72 | 73 |

This is the complete list of members for sbwt::kmer_colex_compare< max_len >, including all inherited members.

74 | 75 | 76 |
operator()(const Kmer< max_len > &A, const Kmer< max_len > &B) (defined in sbwt::kmer_colex_compare< max_len >)sbwt::kmer_colex_compare< max_len >inline
77 | 78 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /doc/html/classsbwt_1_1kmer__colex__compare.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: sbwt::kmer_colex_compare< max_len > Struct Template Reference 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | 27 | 28 | 29 |
24 |
SBWT 25 |
26 |
30 |
31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
53 |
54 | 55 | 56 |
57 | 60 |
61 | 62 | 66 |
67 |
68 | 71 |
72 |
sbwt::kmer_colex_compare< max_len > Struct Template Reference
73 |
74 |
75 | 76 | 78 | 80 | 81 |

77 | Public Member Functions

79 | bool operator() (const Kmer< max_len > &A, const Kmer< max_len > &B)
 
82 |
The documentation for this struct was generated from the following file: 85 |
86 | 87 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /doc/html/dynsections.js: -------------------------------------------------------------------------------- 1 | /* 2 | @licstart The following is the entire license notice for the 3 | JavaScript code in this file. 4 | 5 | Copyright (C) 1997-2017 by Dimitri van Heesch 6 | 7 | This program is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation; either version 2 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License along 18 | with this program; if not, write to the Free Software Foundation, Inc., 19 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 20 | 21 | @licend The above is the entire license notice 22 | for the JavaScript code in this file 23 | */ 24 | function toggleVisibility(linkObj) 25 | { 26 | var base = $(linkObj).attr('id'); 27 | var summary = $('#'+base+'-summary'); 28 | var content = $('#'+base+'-content'); 29 | var trigger = $('#'+base+'-trigger'); 30 | var src=$(trigger).attr('src'); 31 | if (content.is(':visible')===true) { 32 | content.hide(); 33 | summary.show(); 34 | $(linkObj).addClass('closed').removeClass('opened'); 35 | $(trigger).attr('src',src.substring(0,src.length-8)+'closed.png'); 36 | } else { 37 | content.show(); 38 | summary.hide(); 39 | $(linkObj).removeClass('closed').addClass('opened'); 40 | $(trigger).attr('src',src.substring(0,src.length-10)+'open.png'); 41 | } 42 | return false; 43 | } 44 | 45 | function updateStripes() 46 | { 47 | $('table.directory tr'). 
48 | removeClass('even').filter(':visible:even').addClass('even'); 49 | } 50 | 51 | function toggleLevel(level) 52 | { 53 | $('table.directory tr').each(function() { 54 | var l = this.id.split('_').length-1; 55 | var i = $('#img'+this.id.substring(3)); 56 | var a = $('#arr'+this.id.substring(3)); 57 | if (l'+ 31 | data.children[i].text+''+ 32 | makeTree(data.children[i],relPath)+''; 33 | } 34 | result+=''; 35 | } 36 | return result; 37 | } 38 | 39 | $('#main-nav').append(makeTree(menudata,relPath)); 40 | $('#main-nav').children(':first').addClass('sm sm-dox').attr('id','main-menu'); 41 | if (searchEnabled) { 42 | if (serverSide) { 43 | $('#main-menu').append('
  • '); 44 | } else { 45 | $('#main-menu').append('
  • '); 46 | } 47 | } 48 | $('#main-menu').smartmenus(); 49 | } 50 | /* @license-end */ 51 | -------------------------------------------------------------------------------- /doc/html/menudata.js: -------------------------------------------------------------------------------- 1 | /* 2 | @licstart The following is the entire license notice for the 3 | JavaScript code in this file. 4 | 5 | Copyright (C) 1997-2019 by Dimitri van Heesch 6 | 7 | This program is free software; you can redistribute it and/or modify 8 | it under the terms of version 2 of the GNU General Public License as published by 9 | the Free Software Foundation 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License along 17 | with this program; if not, write to the Free Software Foundation, Inc., 18 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
19 | 20 | @licend The above is the entire license notice 21 | for the JavaScript code in this file 22 | */ 23 | var menudata={children:[ 24 | {text:"Main Page",url:"index.html"}, 25 | {text:"Classes",url:"annotated.html",children:[ 26 | {text:"Class List",url:"annotated.html"}, 27 | {text:"Class Index",url:"classes.html"}, 28 | {text:"Class Hierarchy",url:"inherits.html"}, 29 | {text:"Class Members",url:"functions.html",children:[ 30 | {text:"All",url:"functions.html",children:[ 31 | {text:"b",url:"functions.html#index_b"}, 32 | {text:"c",url:"functions.html#index_c"}, 33 | {text:"d",url:"functions.html#index_d"}, 34 | {text:"f",url:"functions.html#index_f"}, 35 | {text:"g",url:"functions.html#index_g"}, 36 | {text:"h",url:"functions.html#index_h"}, 37 | {text:"i",url:"functions.html#index_i"}, 38 | {text:"k",url:"functions.html#index_k"}, 39 | {text:"l",url:"functions.html#index_l"}, 40 | {text:"m",url:"functions.html#index_m"}, 41 | {text:"n",url:"functions.html#index_n"}, 42 | {text:"p",url:"functions.html#index_p"}, 43 | {text:"r",url:"functions.html#index_r"}, 44 | {text:"s",url:"functions.html#index_s"}, 45 | {text:"t",url:"functions.html#index_t"}, 46 | {text:"u",url:"functions.html#index_u"}]}, 47 | {text:"Functions",url:"functions_func.html",children:[ 48 | {text:"c",url:"functions_func.html#index_c"}, 49 | {text:"d",url:"functions_func.html#index_d"}, 50 | {text:"f",url:"functions_func.html#index_f"}, 51 | {text:"g",url:"functions_func.html#index_g"}, 52 | {text:"h",url:"functions_func.html#index_h"}, 53 | {text:"l",url:"functions_func.html#index_l"}, 54 | {text:"n",url:"functions_func.html#index_n"}, 55 | {text:"p",url:"functions_func.html#index_p"}, 56 | {text:"r",url:"functions_func.html#index_r"}, 57 | {text:"s",url:"functions_func.html#index_s"}, 58 | {text:"u",url:"functions_func.html#index_u"}]}, 59 | {text:"Variables",url:"functions_vars.html"}]}]}, 60 | {text:"Files",url:"files.html",children:[ 61 | {text:"File List",url:"files.html"}]}]} 62 | 
-------------------------------------------------------------------------------- /doc/html/structsbwt_1_1SBWT_1_1BuildConfig-members.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SBWT: Member List 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |
    19 |
    20 | 21 | 22 | 23 | 27 | 28 | 29 |
    24 |
    SBWT 25 |
    26 |
    30 |
    31 | 32 | 33 | 38 | 39 | 40 | 47 | 48 | 49 |
    53 |
    54 | 55 | 56 |
    57 | 60 |
    61 | 62 | 66 |
    67 |
    68 |
    69 |
    sbwt::SBWT< subset_rank_t >::BuildConfig Member List
    70 |
    71 | 85 | 86 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /ecoli_dataset/README.md: -------------------------------------------------------------------------------- 1 | The file `coli_files.txt` contains all the accession numbers of E. coli genomes included in the experiment in the paper. The dataset can be downloaded from [Zenodo](https://zenodo.org/record/6577997#.YpOdXTlBxH6). 2 | -------------------------------------------------------------------------------- /include/sbwt/EM_sort/Block.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "bit_level_stuff.hh" 15 | #include "SeqIO/buffered_streams.hh" 16 | 17 | namespace sbwt{ 18 | 19 | using namespace std; 20 | 21 | class Generic_Block{ 22 | public: 23 | virtual void sort(const std::function& cmp) = 0; 24 | virtual void write_to_file(string filename) = 0; 25 | virtual ~Generic_Block(){} 26 | }; 27 | 28 | // Takes a pointer to a buffer. Read to the buffer the whole record including the 8-byte size value. 29 | // The buffer might be realloc'd. Returns a bool whether read was successful. 30 | // buffer_len must be greater than 0 (otherwise buffer doubling does not work) 31 | bool read_variable_binary_record(seq_io::Buffered_ifstream<>& input, char** buffer, int64_t* buffer_len); 32 | 33 | // Block of records of variable length. The first 8 bytes of a record are a big-endian integer L 34 | // that tells size of the record. Then follow L-8 bytes which is the "payload" of the record. 35 | class Variable_binary_block : public Generic_Block{ 36 | public: 37 | 38 | char* data; // A buffer. Not necessarily all full of data. 
Contains concatenated records 39 | int64_t data_len = 0; 40 | int64_t next_start = 0; 41 | 42 | vector starts; 43 | 44 | Variable_binary_block(){ 45 | data = (char*) malloc(1 * sizeof(char)); 46 | data_len = 1; 47 | } 48 | 49 | ~Variable_binary_block(){ 50 | free(data); 51 | } 52 | 53 | void double_space(){ 54 | data = (char*)realloc(data, 2 * data_len * sizeof(char)); 55 | data_len *= 2; 56 | } 57 | 58 | virtual void sort(const std::function& cmp){ 59 | auto cmp_wrap = [&](int64_t x, int64_t y){ 60 | return cmp(data+x,data+y); 61 | }; 62 | std::sort(starts.begin(), starts.end(), cmp_wrap); 63 | } 64 | 65 | void add_record(const char* record){ 66 | int64_t space_left = data_len - next_start; 67 | int64_t rec_len = parse_big_endian_LL(record); 68 | while(space_left < rec_len){ 69 | double_space(); 70 | space_left = data_len - next_start; 71 | } 72 | 73 | memcpy(data+next_start, record, rec_len); 74 | starts.push_back(next_start); 75 | next_start += rec_len; 76 | } 77 | 78 | virtual void write_to_file(string filename){ 79 | seq_io::Buffered_ofstream<> out(filename, ios::binary); 80 | for(int64_t i = 0; i < starts.size(); i++){ 81 | int64_t length = parse_big_endian_LL(data + starts[i]); 82 | out.write(data + starts[i], length); 83 | } 84 | } 85 | 86 | int64_t estimate_size_in_bytes(){ 87 | return next_start + starts.size() * sizeof(int64_t); 88 | } 89 | }; 90 | 91 | // RETURN VALUE MUST BE FREED BY CALLER 92 | Variable_binary_block* get_next_variable_binary_block(seq_io::Buffered_ifstream<>& input, int64_t B); 93 | 94 | // Records of exactly record_size bytes each 95 | class Constant_binary_block : public Generic_Block{ 96 | public: 97 | 98 | public: 99 | 100 | char* data; // A buffer. Not necessarily all full of data. Contains concatenated records 101 | int64_t data_len = 0; 102 | int64_t next_start = 0; 103 | int64_t record_size; 104 | vector starts; // We sort this vector. The data array is not touched in sorting. 
105 | 106 | Constant_binary_block(int64_t record_size) : record_size(record_size) { 107 | data = (char*) malloc(1 * sizeof(char)); 108 | data_len = 1; 109 | } 110 | 111 | ~Constant_binary_block(){ 112 | free(data); 113 | } 114 | 115 | void double_space(){ 116 | data = (char*)realloc(data, 2 * data_len * sizeof(char)); 117 | data_len *= 2; 118 | } 119 | 120 | virtual void sort(const std::function& cmp){ 121 | auto cmp_wrap = [&](int64_t x, int64_t y){ 122 | return cmp(data+x,data+y); 123 | }; 124 | std::sort(starts.begin(), starts.end(), cmp_wrap); 125 | } 126 | 127 | void add_record(const char* record){ 128 | int64_t space_left = data_len - next_start; 129 | while(space_left < record_size){ 130 | double_space(); 131 | space_left = data_len - next_start; 132 | } 133 | 134 | memcpy(data+next_start, record, record_size); 135 | starts.push_back(next_start); 136 | next_start += record_size; 137 | } 138 | 139 | virtual void write_to_file(string filename){ 140 | seq_io::Buffered_ofstream<> out(filename, ios::binary); 141 | for(int64_t i = 0; i < starts.size(); i++){ 142 | out.write(data + starts[i], record_size); 143 | } 144 | } 145 | 146 | int64_t estimate_size_in_bytes(){ 147 | return next_start + starts.size() * sizeof(int64_t); 148 | } 149 | 150 | }; 151 | 152 | // Reads up to B bytes into a new block 153 | // THE RETURN VALUE MUST BE FREED BY THE CALLER 154 | Constant_binary_block* get_next_constant_binary_block(seq_io::Buffered_ifstream<>& input, int64_t B, int64_t record_size); 155 | 156 | } -------------------------------------------------------------------------------- /include/sbwt/EM_sort/EM_sort.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../globals.hh" 15 | #include "Block.hh" 16 | #include "ParallelBoundedQueue.hh" 17 | #include 
"generic_EM_classes.hh" 18 | 19 | namespace sbwt{ 20 | 21 | /* 22 | Design: 23 | 24 | There are two operating modes: EM_BINARY and EM_CONSTANT_BINARY. The mode determines 25 | how records are represented on disk and in memory. 26 | 27 | 28 | EM_BINARY: 29 | - Records are in binary such that the first 8 bytes are the big-endian representation 30 | of a 64-bit integer x. Then follow x-8 bytes of data. The comparison function takes 31 | as parameters pointers to the starts of the x-byte blocks of data of the two records. 32 | When passing around records, the size x is always at the start of the data array. 33 | - Internal workings: The records are read from disk into an object of class Block, which 34 | has a char-array which will store the concatenation of all records. A vector 'starts' 35 | stores the start position of each record. Sorting is done by permuting 'starts' and keeping 36 | the data in place. The data is written back to disk according to the permutation of starts. 37 | 38 | EM_CONSTANT_BINARY is similar, but there is no record size included in the records and they 39 | are assumed to be a given constant size instead. 
40 | */ 41 | 42 | using namespace std; 43 | 44 | // Interprets the strings as integers (no leading zeros allowed) and returns: 45 | // -1 if x < y 46 | // 0 if x = y 47 | // 1 if x > y 48 | int compare_as_numbers(const char* x, const char* y); 49 | bool memcmp_variable_binary_records(const char* x, const char* y); 50 | void copy_file(string infile, string outfile, int64_t buf_size); 51 | 52 | // Constant size records of record_size bytes each 53 | void EM_sort_constant_binary(string infile, string outfile, const std::function& cmp, int64_t RAM_bytes, int64_t record_size, int64_t n_threads); 54 | 55 | // Binary format of record: the first 8 bytes give the length x of the whole record, then follow x-8 bytes of data 56 | // NOTE(review): an earlier note described a k-way merge parameter k, but this signature has no such parameter 57 | void EM_sort_variable_length_records(string infile, string outfile, const std::function& cmp, int64_t RAM_bytes, int64_t n_threads); 58 | 59 | } -------------------------------------------------------------------------------- /include/sbwt/EM_sort/bit_level_stuff.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../globals.hh" 4 | #include "SeqIO/buffered_streams.hh" 5 | #include 6 | 7 | namespace sbwt{ 8 | 9 | inline int64_t byte_to_int(char c){ 10 | return static_cast(*reinterpret_cast(&c)); 11 | } 12 | 13 | inline int64_t parse_big_endian_LL(const char* A){ 14 | int64_t x = 0; 15 | x |= byte_to_int(A[7]) << 0; 16 | x |= byte_to_int(A[6]) << 8; 17 | x |= byte_to_int(A[5]) << 16; 18 | x |= byte_to_int(A[4]) << 24; 19 | x |= byte_to_int(A[3]) << 32; 20 | x |= byte_to_int(A[2]) << 40; 21 | x |= byte_to_int(A[1]) << 48; 22 | x |= byte_to_int(A[0]) << 56; 23 | return x; 24 | } 25 | 26 | // Get the byte with index byte_idx in the big-endian representation of x 27 | inline char get_byte(int64_t x, int64_t byte_idx){ 28 | // How does casting int to char work? 
29 | // Quote from the book The C++ Programming Language, by Bjarne Stroustrup: 30 | // > If the destination type is unsigned, the resulting value is simply as many bits from 31 | // > the source as will fit in the destination (high-order bits are thrown away if necessary). 32 | // > More precisely, the result is the least unsigned integer congruent to the source integer 33 | // > modulo 2 to the n-th, where n is the number of bits used to represent the unsigned type. 34 | // > If the destination type is signed, the value is unchanged if it can be represented in the 35 | // > destination type, otherwise, the value is implementation-defined. 36 | 37 | // Therefore, we can cast to an unsigned char and we should be ok. But the code that calls 38 | // this function needs a signed char, because ofstream::write takes only signed chars. 39 | // Therefore, to avoid implementation-defined behavior, we need to do a reinterpret-cast 40 | // to a signed value. 41 | unsigned char c = static_cast((x >> (7 - byte_idx)*8) & 0xff); 42 | return *reinterpret_cast(&c); 43 | } 44 | 45 | inline void write_big_endian_LL(seq_io::Buffered_ofstream<>& out, int64_t x){ 46 | char c; 47 | 48 | c = get_byte(x,0); out.write(&c,1); 49 | c = get_byte(x,1); out.write(&c,1); 50 | c = get_byte(x,2); out.write(&c,1); 51 | c = get_byte(x,3); out.write(&c,1); 52 | c = get_byte(x,4); out.write(&c,1); 53 | c = get_byte(x,5); out.write(&c,1); 54 | c = get_byte(x,6); out.write(&c,1); 55 | c = get_byte(x,7); out.write(&c,1); 56 | } 57 | 58 | 59 | inline void write_big_endian_LL(char* buf, int64_t x){ 60 | buf[0] = get_byte(x,0); 61 | buf[1] = get_byte(x,1); 62 | buf[2] = get_byte(x,2); 63 | buf[3] = get_byte(x,3); 64 | buf[4] = get_byte(x,4); 65 | buf[5] = get_byte(x,5); 66 | buf[6] = get_byte(x,6); 67 | buf[7] = get_byte(x,7); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /include/sbwt/SubsetConcatRank.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "globals.hh" 7 | #include 8 | 9 | namespace sbwt{ 10 | 11 | using namespace std; 12 | 13 | template 14 | class SubsetConcatRank{ 15 | 16 | concat_WT_t concat; 17 | L_bitvec_t L; // Marks the first element from each set in concat with a zero 18 | L_select0_t L_ss0; 19 | 20 | public: 21 | 22 | // Count of character c in subsets up to pos, not including pos 23 | int64_t rank(int64_t pos, char c) const{ 24 | return concat.rank(L_ss0.select(pos+1), c); 25 | } 26 | 27 | bool contains(int64_t pos, char c) const{ 28 | // TODO: faster 29 | int64_t r1 = this->rank(pos, c); 30 | int64_t r2 = this->rank(pos+1, c); 31 | return r1 != r2; 32 | } 33 | 34 | SubsetConcatRank(){} 35 | 36 | SubsetConcatRank(const sdsl::bit_vector& A_bits, const sdsl::bit_vector& C_bits, const sdsl::bit_vector& G_bits, const sdsl::bit_vector& T_bits){ 37 | assert(A_bits.size() == C_bits.size() && C_bits.size() == G_bits.size() && G_bits.size() == T_bits.size()); 38 | 39 | std::string concat_str; 40 | vector L_vec_bool; 41 | int64_t n = A_bits.size(); 42 | for(int64_t i = 0; i < n; i++){ 43 | if(A_bits[i] == 1) concat_str.push_back('A'); 44 | if(C_bits[i] == 1) concat_str.push_back('C'); 45 | if(G_bits[i] == 1) concat_str.push_back('G'); 46 | if(T_bits[i] == 1) concat_str.push_back('T'); 47 | 48 | if(A_bits[i] + C_bits[i] + G_bits[i] + T_bits[i] == 0) 49 | concat_str.push_back('$'); 50 | 51 | L_vec_bool.push_back(0); 52 | while(L_vec_bool.size() < concat_str.size()) L_vec_bool.push_back(1); 53 | } 54 | 55 | L_vec_bool.push_back(0); // End sentinel to avoid a special case in select. 
56 | 57 | // Construct the final L bitvector from L_vec_bool 58 | sdsl::bit_vector L_sdsl(L_vec_bool.size()); 59 | for(int64_t i = 0; i < L_vec_bool.size(); i++) L_sdsl[i] = L_vec_bool[i]; 60 | L = L_bitvec_t(L_sdsl); 61 | 62 | // Init supports 63 | sdsl::util::init_support(this->L_ss0, &(this->L)); 64 | sdsl::construct_im(concat, concat_str.c_str(), 1); // 1: file format is a sequence, not a serialized sdsl object 65 | } 66 | 67 | int64_t serialize(ostream& os) const{ 68 | int64_t written = 0; 69 | written += concat.serialize(os); 70 | written += L.serialize(os); 71 | written += L_ss0.serialize(os); 72 | return written; 73 | } 74 | 75 | void load(istream& is){ 76 | concat.load(is); 77 | L.load(is); 78 | L_ss0.load(is); 79 | L_ss0.set_vector(&L); 80 | } 81 | 82 | 83 | SubsetConcatRank(const SubsetConcatRank& other){ 84 | assert(&other != this); // What on earth are you trying to do? 85 | operator=(other); 86 | } 87 | 88 | SubsetConcatRank& operator=(const SubsetConcatRank& other){ 89 | if(&other != this){ 90 | this->concat = other.concat; 91 | this->L = other.L; 92 | this->L_ss0 = other.L_ss0; 93 | 94 | this->L_ss0.set_vector(&(this->L)); 95 | 96 | return *this; 97 | } else return *this; // Assignment to self -> do nothing. 
98 | } 99 | 100 | }; 101 | 102 | } -------------------------------------------------------------------------------- /include/sbwt/SubsetMatrixRank.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "globals.hh" 7 | #include 8 | 9 | namespace sbwt{ 10 | 11 | using namespace std; 12 | 13 | template 14 | class SubsetMatrixRank{ 15 | 16 | public: 17 | 18 | // Bit vectors 19 | bitvector_t A_bits; 20 | bitvector_t C_bits; 21 | bitvector_t G_bits; 22 | bitvector_t T_bits; 23 | 24 | // Rank supports 25 | rank_support_t A_bits_rs; 26 | rank_support_t C_bits_rs; 27 | rank_support_t G_bits_rs; 28 | rank_support_t T_bits_rs; 29 | 30 | // Count of character c in subsets up to pos, not including pos 31 | int64_t rank(int64_t pos, char c) const{ 32 | if(c == 'A') return A_bits_rs.rank(pos); 33 | if(c == 'C') return C_bits_rs.rank(pos); 34 | if(c == 'G') return G_bits_rs.rank(pos); 35 | if(c == 'T') return T_bits_rs.rank(pos); 36 | return 0; 37 | } 38 | 39 | bool contains(int64_t pos, char c) const{ 40 | // Returns true if the set with index pos contains character c 41 | switch(c){ 42 | case 'A': return A_bits[pos]; 43 | case 'C': return C_bits[pos]; 44 | case 'G': return G_bits[pos]; 45 | case 'T': return T_bits[pos]; 46 | default: return false; 47 | } 48 | } 49 | 50 | SubsetMatrixRank(){} 51 | 52 | SubsetMatrixRank(const sdsl::bit_vector& A_bits, const sdsl::bit_vector& C_bits, const sdsl::bit_vector& G_bits, const sdsl::bit_vector& T_bits) 53 | : A_bits(A_bits), C_bits(C_bits), G_bits(G_bits), T_bits(T_bits){ 54 | sdsl::util::init_support(this->A_bits_rs, &(this->A_bits)); 55 | sdsl::util::init_support(this->C_bits_rs, &(this->C_bits)); 56 | sdsl::util::init_support(this->G_bits_rs, &(this->G_bits)); 57 | sdsl::util::init_support(this->T_bits_rs, &(this->T_bits)); 58 | } 59 | 60 | SubsetMatrixRank(const SubsetMatrixRank& other){ 61 | assert(&other != this); // What on 
earth are you trying to do? 62 | operator=(other); 63 | } 64 | 65 | SubsetMatrixRank& operator=(const SubsetMatrixRank& other){ 66 | if(&other != this){ 67 | this->A_bits = other.A_bits; 68 | this->C_bits = other.C_bits; 69 | this->G_bits = other.G_bits; 70 | this->T_bits = other.T_bits; 71 | 72 | this->A_bits_rs = other.A_bits_rs; 73 | this->C_bits_rs = other.C_bits_rs; 74 | this->G_bits_rs = other.G_bits_rs; 75 | this->T_bits_rs = other.T_bits_rs; 76 | 77 | this->A_bits_rs.set_vector(&(this->A_bits)); 78 | this->C_bits_rs.set_vector(&(this->C_bits)); 79 | this->G_bits_rs.set_vector(&(this->G_bits)); 80 | this->T_bits_rs.set_vector(&(this->T_bits)); 81 | 82 | return *this; 83 | } else return *this; // Assignment to self -> do nothing. 84 | } 85 | 86 | int64_t serialize(ostream& os) const{ 87 | int64_t written = 0; 88 | written += A_bits.serialize(os); 89 | written += C_bits.serialize(os); 90 | written += G_bits.serialize(os); 91 | written += T_bits.serialize(os); 92 | 93 | written += A_bits_rs.serialize(os); 94 | written += C_bits_rs.serialize(os); 95 | written += G_bits_rs.serialize(os); 96 | written += T_bits_rs.serialize(os); 97 | 98 | write_log("MatrixRank bit vectors total " + to_string((double)written/A_bits.size()*8) + " bits total per node", LogLevel::MINOR); 99 | return written; 100 | } 101 | 102 | void load(istream& is){ 103 | A_bits.load(is); 104 | C_bits.load(is); 105 | G_bits.load(is); 106 | T_bits.load(is); 107 | 108 | if(std::is_same, rank_support_t>::value){ 109 | // Special handling needed for rank_support_v5 because of a design flaw in sdsl 110 | A_bits_rs.load(is, &A_bits); 111 | C_bits_rs.load(is, &C_bits); 112 | G_bits_rs.load(is, &G_bits); 113 | T_bits_rs.load(is, &T_bits); 114 | } else{ 115 | A_bits_rs.load(is); 116 | C_bits_rs.load(is); 117 | G_bits_rs.load(is); 118 | T_bits_rs.load(is); 119 | 120 | A_bits_rs.set_vector(&A_bits); 121 | C_bits_rs.set_vector(&C_bits); 122 | G_bits_rs.set_vector(&G_bits); 123 | T_bits_rs.set_vector(&T_bits); 
124 | } 125 | } 126 | 127 | }; 128 | 129 | } -------------------------------------------------------------------------------- /include/sbwt/SubsetMatrixSelectSupport.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "globals.hh" 6 | #include "SubsetMatrixRank.hh" 7 | #include 8 | 9 | namespace sbwt{ 10 | 11 | using namespace std; 12 | 13 | /** A subset select support based on bitvector select support on indicator bitvectors for each character. 14 | * This class does not own the bit vectors it is pointing to. But it does own the select support data. 15 | */ 16 | template 17 | class SubsetMatrixSelectSupport{ 18 | 19 | public: 20 | 21 | // Select supports 22 | typename bitvector_t::select_1_type A_bits_ss; 23 | typename bitvector_t::select_1_type C_bits_ss; 24 | typename bitvector_t::select_1_type G_bits_ss; 25 | typename bitvector_t::select_1_type T_bits_ss; 26 | 27 | int64_t select(int64_t pos, char c) const{ 28 | if(c == 'A') return A_bits_ss.select(pos); 29 | if(c == 'C') return C_bits_ss.select(pos); 30 | if(c == 'G') return G_bits_ss.select(pos); 31 | if(c == 'T') return T_bits_ss.select(pos); 32 | return 0; 33 | } 34 | 35 | SubsetMatrixSelectSupport(){} 36 | 37 | /** Warning: this select structure points to internal vectors of `mr`, so the select support 38 | * can be used only as long as those pointers are valid. 39 | */ 40 | template 41 | SubsetMatrixSelectSupport(const SubsetMatrixRank& mr){ 42 | sdsl::util::init_support(this->A_bits_ss, &mr.A_bits); 43 | sdsl::util::init_support(this->C_bits_ss, &mr.C_bits); 44 | sdsl::util::init_support(this->G_bits_ss, &mr.G_bits); 45 | sdsl::util::init_support(this->T_bits_ss, &mr.T_bits); 46 | } 47 | 48 | SubsetMatrixSelectSupport(const SubsetMatrixSelectSupport& other){ 49 | assert(&other != this); // What on earth are you trying to do? 
50 | operator=(other); 51 | } 52 | 53 | SubsetMatrixSelectSupport& operator=(const SubsetMatrixSelectSupport& other){ 54 | if(&other != this){ 55 | 56 | this->A_bits_ss = other.A_bits_ss; 57 | this->C_bits_ss = other.C_bits_ss; 58 | this->G_bits_ss = other.G_bits_ss; 59 | this->T_bits_ss = other.T_bits_ss; 60 | 61 | return *this; 62 | } else return *this; // Assignment to self -> do nothing. 63 | } 64 | 65 | int64_t serialize(ostream& os) const{ 66 | int64_t written = 0; 67 | 68 | written += A_bits_ss.serialize(os); 69 | written += C_bits_ss.serialize(os); 70 | written += G_bits_ss.serialize(os); 71 | written += T_bits_ss.serialize(os); 72 | 73 | return written; 74 | } 75 | 76 | /** Warning: this select structure points to internal vectors of `mr`, so the select support 77 | * can be used only as long as those pointers are valid. 78 | */ 79 | template 80 | void load(istream& is, const SubsetMatrixRank& mr){ 81 | 82 | A_bits_ss.load(is); 83 | C_bits_ss.load(is); 84 | G_bits_ss.load(is); 85 | T_bits_ss.load(is); 86 | 87 | A_bits_ss.set_vector(&mr.A_bits); 88 | C_bits_ss.set_vector(&mr.C_bits); 89 | G_bits_ss.set_vector(&mr.G_bits); 90 | T_bits_ss.set_vector(&mr.T_bits); 91 | } 92 | 93 | }; 94 | 95 | } -------------------------------------------------------------------------------- /include/sbwt/SubsetWT.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "sdsl/wavelet_trees.hpp" 7 | #include "globals.hh" 8 | #include 9 | #include 10 | 11 | namespace sbwt{ 12 | 13 | template 14 | class SubsetWT{ 15 | 16 | public: 17 | 18 | WT_type ACGT_wt; 19 | WT_type AC_wt; 20 | WT_type GT_wt; 21 | 22 | char to_char(bool left, bool right) const{ 23 | if(!left && !right) return '0'; 24 | if(!left && right) return '1'; 25 | if(left && !right) return '2'; 26 | if(left && right) return '3'; 27 | return 0; // Make compiler happy 28 | } 29 | 30 | std::string 
bitvector_pair_to_string(sdsl::bit_vector& v1, sdsl::bit_vector& v2) const{ 31 | assert(v1.size() == v2.size()); 32 | string S(v1.size(), '\0'); 33 | for(int64_t i = 0; i < v1.size(); i++){ 34 | S[i] = to_char(v1[i], v2[i]); 35 | } 36 | return S; 37 | } 38 | 39 | SubsetWT(){} 40 | 41 | SubsetWT(const sdsl::bit_vector& A_bits, const sdsl::bit_vector& C_bits, const sdsl::bit_vector& G_bits, const sdsl::bit_vector& T_bits){ 42 | sdsl::bit_vector AC_bv; 43 | sdsl::bit_vector GT_bv; 44 | sdsl::bit_vector A_bv; 45 | sdsl::bit_vector C_bv; 46 | sdsl::bit_vector G_bv; 47 | sdsl::bit_vector T_bv; 48 | 49 | assert(A_bits.size() == C_bits.size() && C_bits.size() == G_bits.size() && G_bits.size() == T_bits.size()); 50 | int64_t n = A_bits.size(); 51 | AC_bv.resize(n); 52 | GT_bv.resize(n); 53 | int64_t AC_total = 0; 54 | int64_t GT_total = 0; 55 | for(int64_t i = 0; i < n; i++){ 56 | AC_bv[i] = A_bits[i] || C_bits[i]; 57 | GT_bv[i] = G_bits[i] || T_bits[i]; 58 | AC_total += AC_bv[i]; 59 | GT_total += GT_bv[i]; 60 | } 61 | 62 | A_bv.resize(AC_total); 63 | C_bv.resize(AC_total); 64 | G_bv.resize(GT_total); 65 | T_bv.resize(GT_total); 66 | 67 | for(int64_t i = 0, j = 0; i < n; i++){ 68 | if(AC_bv[i]){ 69 | A_bv[j] = A_bits[i]; 70 | C_bv[j] = C_bits[i]; 71 | j++; 72 | } 73 | } 74 | 75 | for(int64_t i = 0, j = 0; i < n; i++){ 76 | if(GT_bv[i]){ 77 | G_bv[j] = G_bits[i]; 78 | T_bv[j] = T_bits[i]; 79 | j++; 80 | } 81 | } 82 | 83 | string ACGT_string = bitvector_pair_to_string(AC_bv, GT_bv); 84 | string AC_string = bitvector_pair_to_string(A_bv, C_bv); 85 | string GT_string = bitvector_pair_to_string(G_bv, T_bv); 86 | 87 | sdsl::construct_im(ACGT_wt, ACGT_string.c_str(), 1); // 1: file format is a sequence, not a serialized sdsl object 88 | sdsl::construct_im(AC_wt, AC_string.c_str(), 1); // 1: file format is a sequence, not a serialized sdsl object 89 | sdsl::construct_im(GT_wt, GT_string.c_str(), 1); // 1: file format is a sequence, not a serialized sdsl object 90 | 91 | } 92 | 93 | 
// Count of character c in subsets up to pos, not including pos 94 | int64_t rank(int64_t pos, char c) const{ 95 | assert(c == 'A' || c == 'C' || c == 'G' || c == 'T'); 96 | if(c == 'A'){ 97 | int64_t x = ACGT_wt.rank(pos, to_char(1,0)) + ACGT_wt.rank(pos, to_char(1,1)); 98 | return AC_wt.rank(x, to_char(1,0)) + AC_wt.rank(x, to_char(1,1)); 99 | } 100 | if(c == 'C'){ 101 | int64_t x = ACGT_wt.rank(pos, to_char(1,0)) + ACGT_wt.rank(pos, to_char(1,1)); 102 | return AC_wt.rank(x, to_char(0,1)) + AC_wt.rank(x, to_char(1,1)); 103 | } 104 | if(c == 'G'){ 105 | int64_t x = ACGT_wt.rank(pos, to_char(0,1)) + ACGT_wt.rank(pos, to_char(1,1)); 106 | return GT_wt.rank(x, to_char(1,0)) + GT_wt.rank(x, to_char(1,1)); 107 | } 108 | if(c == 'T'){ 109 | int64_t x = ACGT_wt.rank(pos, to_char(0,1)) + ACGT_wt.rank(pos, to_char(1,1)); 110 | return GT_wt.rank(x, to_char(0,1)) + GT_wt.rank(x, to_char(1,1)); 111 | } 112 | return 0; 113 | } 114 | 115 | bool contains(int64_t pos, char c) const{ 116 | // TODO: faster 117 | int64_t r1 = this->rank(pos, c); 118 | int64_t r2 = this->rank(pos+1, c); 119 | return r1 != r2; 120 | } 121 | 122 | int64_t serialize(ostream& os) const{ 123 | int64_t written = 0; 124 | written += ACGT_wt.serialize(os); 125 | written += AC_wt.serialize(os); 126 | written += GT_wt.serialize(os); 127 | return written; 128 | } 129 | 130 | void load(istream& is){ 131 | ACGT_wt.load(is); 132 | AC_wt.load(is); 133 | GT_wt.load(is); 134 | } 135 | 136 | 137 | }; 138 | 139 | } -------------------------------------------------------------------------------- /include/sbwt/TempFileManager.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "stdlib.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace sbwt{ 19 | 20 | using namespace std; 21 | 22 | class Temp_File_Manager{ 23 | 
24 | // Cleans up all allocated files in the end automatically 25 | 26 | private: 27 | 28 | string get_random_string(int64_t length){ 29 | string S; 30 | for(int64_t i = 0; i < length; i++){ 31 | S += alphabet[dist(urandom)]; 32 | } 33 | return S; 34 | } 35 | 36 | std::uniform_int_distribution dist; 37 | std::random_device urandom; 38 | string temp_dir = "."; 39 | vector alphabet; 40 | set used_names; 41 | std::mutex mutex; 42 | 43 | void check_dir_exists(string path){ 44 | struct stat info; 45 | if( stat( path.c_str(), &info ) != 0 ){ 46 | printf( "Error: cannot access %s\n", path.c_str() ); 47 | exit(1); 48 | } 49 | else if( info.st_mode & S_IFDIR ){ 50 | // All good 51 | } 52 | else{ 53 | printf( "Error: %s is not a directory\n", path.c_str() ); 54 | exit(1); 55 | } 56 | } 57 | 58 | public: 59 | 60 | 61 | Temp_File_Manager() : urandom("/dev/urandom") { 62 | for(char c = 'a'; c <= 'z'; c++) alphabet.push_back(c); 63 | for(char c = 'A'; c <= 'Z'; c++) alphabet.push_back(c); 64 | for(char c = '0'; c <= '9'; c++) alphabet.push_back(c); 65 | dist = std::uniform_int_distribution(0, alphabet.size()-1); 66 | } 67 | 68 | void set_dir(string temp_dir){ // Call before create_filename; otherwise the default temp_dir "." is used 69 | check_dir_exists(temp_dir); 70 | this->temp_dir = temp_dir; 71 | } 72 | 73 | string get_dir(){ 74 | return this->temp_dir; 75 | } 76 | 77 | string create_filename(){ 78 | return create_filename("",""); 79 | } 80 | 81 | string create_filename(string prefix){ 82 | return create_filename(prefix, ""); 83 | } 84 | 85 | string create_filename(string prefix, string suffix){ 86 | // Make sure only one thread runs in this function at once 87 | std::lock_guard lg(mutex); 88 | if(temp_dir == ""){ 89 | cerr << "Error: temp dir not set" << endl; 90 | exit(1); 91 | } 92 | while(true){ 93 | string name = temp_dir + "/" + prefix + get_random_string(10) + suffix; 94 | auto desc = open(name.c_str(), O_CREAT | O_EXCL, S_IRWXU); // Fails if file exists, otherwise creates it 95 | 
if(desc != -1){ 96 | used_names.insert(name); 97 | close(desc); 98 | // Success. Let's delete the empty file that was created 99 | std::filesystem::remove(name); 100 | return name; 101 | } else if(errno != EEXIST){ 102 | cerr << std::strerror(errno) << " " << name << endl; 103 | close(desc); 104 | exit(1); 105 | } 106 | } 107 | } // The lock guard goes out of scope and is destructed 108 | 109 | // delete_file: delete a file before the end of the lifetime of the manager 110 | void delete_file(string filename){ 111 | if(used_names.count(filename) == 0){ 112 | cerr << "Error: tried to delete a temp file that thet temp file manager did not create: " << filename << endl; 113 | exit(1); 114 | } 115 | std::filesystem::remove(filename.c_str()); 116 | used_names.erase(filename); 117 | } 118 | 119 | void delete_all_files(){ 120 | for(string name : used_names) remove(name.c_str()); 121 | used_names.clear(); 122 | } 123 | 124 | ~Temp_File_Manager(){ 125 | delete_all_files(); 126 | } 127 | 128 | }; 129 | 130 | } -------------------------------------------------------------------------------- /include/sbwt/commands.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | std::vector get_available_variants(); 7 | 8 | int build_main(int argc, char** argv); 9 | int search_main(int argc, char** argv); 10 | int build_from_plain_main(int argc, char** argv); 11 | int ascii_export_main(int argc, char** argv); 12 | -------------------------------------------------------------------------------- /include/sbwt/globals.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "TempFileManager.hh" 7 | 8 | namespace sbwt{ 9 | 10 | using namespace std; 11 | 12 | #ifndef MAX_KMER_LENGTH 13 | #define MAX_KMER_LENGTH 32 14 | #endif 15 | 16 | // Table mapping ascii values of characters to their reverse complements, 17 
| // lower-case to lower case, upper-case to upper-case. Non-ACGT characters 18 | // are mapped to themselves. 19 | static constexpr unsigned char rc_table[256] = 20 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21 | 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 22 | 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 23 | 56, 57, 58, 59, 60, 61, 62, 63, 64, 84, 66, 71, 68, 69, 70, 67, 72, 73, 24 | 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 65, 85, 86, 87, 88, 89, 90, 91, 25 | 92, 93, 94, 95, 96, 116, 98, 103, 100, 101, 102, 99, 104, 105, 106, 107, 26 | 108, 109, 110, 111, 112, 113, 114, 115, 97, 117, 118, 119, 120, 121, 122, 27 | 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 28 | 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 29 | 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 30 | 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 31 | 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 32 | 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 33 | 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 34 | 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 35 | 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; 36 | 37 | // Table mapping ACGT to 0123 38 | static constexpr int8_t from_ACGT_to_0123_lookup_table[256] = 39 | {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; 40 | 41 | // Table mapping 0123 to ACGT 42 | static constexpr char from_0123_to_ACGT_lookup_table[4] = {'A','C','G','T'}; 43 | 44 | // ACGT -> 0123 45 | constexpr int64_t DNA_to_char_idx(char c){ 46 | return from_ACGT_to_0123_lookup_table[(uint8_t)c]; 47 | } 48 | 49 | // 0123 -> ACGT 50 | constexpr char char_idx_to_DNA(int64_t i){ 51 | assert(i >= 0 && i < 4); 52 | return from_0123_to_ACGT_lookup_table[i]; 53 | } 54 | 55 | // ACGT -> TGCA 56 | constexpr char get_rc(char c){ 57 | return rc_table[(unsigned char)c]; 58 | } 59 | 60 | 61 | enum LogLevel {OFF = 0, MAJOR = 1, MINOR = 2, DEBUG = 3}; 62 | 63 | vector readlines(string filename); 64 | long long cur_time_millis(); 65 | long long cur_time_micros(); 66 | double seconds_since_program_start(); 67 | string getTimeString(); 68 | void set_log_level(LogLevel level); 69 | LogLevel get_log_level(); 70 | void write_log(string message, LogLevel level); 71 | void check_true(bool condition, string error_message); 72 | string get_rc(const string& s); // Reverse complement 73 | Temp_File_Manager& get_temp_file_manager(); 74 | 75 | void check_readable(string filename); 76 | void check_writable(string filename); 77 | 78 | Temp_File_Manager& get_temp_file_manager(); 79 | 80 | int64_t serialize_string(const string& S, ostream& out); // Returns the number of bytes written 81 | string load_string(istream& in); // Loads string serialized by serialize_string 82 | 83 | class Progress_printer{ 84 
| 85 | public: 86 | 87 | int64_t n_jobs; 88 | int64_t processed; 89 | int64_t total_prints; 90 | int64_t next_print; 91 | bool first_print; 92 | 93 | Progress_printer(int64_t n_jobs, int64_t total_prints) : n_jobs(n_jobs), processed(0), total_prints(total_prints), next_print(0), first_print(true) {} 94 | 95 | void job_done(){ 96 | if(sbwt::get_log_level() >= LogLevel::MINOR){ 97 | if(next_print == processed){ 98 | //string erase(current_string.size() + 1, '\b'); // Backspace characters. +1 For the endline 99 | if(!first_print) cerr << '\r' << flush; // Erase current line 100 | first_print = false; 101 | 102 | int64_t progress_percent = round(100 * ((double)processed / n_jobs)); 103 | cerr << to_string(progress_percent) + "%" << flush; 104 | 105 | next_print += n_jobs / total_prints; 106 | } 107 | processed++; 108 | if(processed == n_jobs) cerr << "\r100%" << endl; // No more prints coming 109 | } 110 | } 111 | 112 | }; 113 | 114 | 115 | class Argv{ // Class for turning a vector into char** 116 | private: 117 | 118 | // Forbid copying the class because it wont work right 119 | Argv(Argv const& other); 120 | Argv& operator=(Argv const& other); 121 | 122 | public: 123 | 124 | char** array = NULL; 125 | int64_t size = 0; 126 | 127 | Argv(vector v){ 128 | array = (char**)malloc(sizeof(char*) * v.size()); 129 | // Copy contents of v into array 130 | for(int64_t i = 0; i < v.size(); i++){ 131 | char* s = (char*)malloc(sizeof(char) * (v[i].size() + 1)); // +1: space for '\0' at the end 132 | for(int64_t j = 0; j < v[i].size(); j++){ 133 | s[j] = v[i][j]; // Can't use strcpy because s.c_str() is const 134 | } 135 | s[v[i].size()] = '\0'; 136 | array[i] = s; 137 | } 138 | size = v.size(); 139 | } 140 | 141 | ~Argv(){ 142 | for(int64_t i = 0; i < size; i++) free(array[i]); 143 | free(array); 144 | } 145 | 146 | }; 147 | 148 | } -------------------------------------------------------------------------------- /include/sbwt/kmc_construct_helper_classes.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Kmer.hh" 4 | #include 5 | #include "SeqIO/buffered_streams.hh" 6 | #include "EM_sort/EM_sort.hh" 7 | #include 8 | #include 9 | #include 10 | 11 | class CKMCFile; // Defined in KMC 12 | class CKmerAPI; // Defined in KMC 13 | 14 | namespace sbwt{ 15 | 16 | namespace KMC_construction_helper_classes{ 17 | 18 | 19 | typedef Kmer kmer_t; 20 | 21 | struct Node{ 22 | kmer_t kmer; 23 | char edge_flags; 24 | 25 | Node(); 26 | Node(kmer_t kmer); 27 | 28 | static inline int64_t size_in_bytes(){ 29 | return kmer_t::size_in_bytes() + sizeof(char); // char is the edge flags 30 | } 31 | 32 | void set(char c); 33 | bool has(char c) const; 34 | bool operator==(const Node &other) const; 35 | bool operator!=(const Node &other) const; 36 | bool operator<(const Node &other) const; 37 | string to_string() const; 38 | void serialize(char* buf); 39 | void load(const char* buf); 40 | 41 | }; 42 | 43 | class Argv{ // Class for turning a vector into char** 44 | private: 45 | 46 | // Forbid copying the class because it wont work right 47 | Argv(Argv const& other); 48 | Argv& operator=(Argv const& other); 49 | 50 | public: 51 | 52 | char** array = NULL; 53 | int64_t size = 0; 54 | 55 | Argv(vector v); 56 | 57 | ~Argv(); 58 | 59 | }; 60 | 61 | // Also gives reverse complements if asked 62 | class Kmer_stream_from_KMC_DB{ 63 | 64 | private: 65 | 66 | CKMCFile* kmer_database; 67 | CKmerAPI* kmer_object; 68 | 69 | uint32_t _kmer_length; 70 | uint32_t _mode; 71 | uint32_t _counter_size; 72 | uint32_t _lut_prefix_length; 73 | uint32_t _signature_len; 74 | uint32_t _min_count; 75 | uint64_t _max_count; 76 | uint64_t _total_kmers; 77 | 78 | bool add_revcomps; 79 | std::string str; 80 | std::string str_revcomp; 81 | bool revcomp_next = false; 82 | 83 | char get_rc(char c); 84 | 85 | void reverse_complement(string& S); 86 | 87 | public: 88 | 89 | Kmer_stream_from_KMC_DB(string KMC_db_path, 
bool add_revcomps); 90 | 91 | bool done(); 92 | Kmer next(); 93 | 94 | ~Kmer_stream_from_KMC_DB(); 95 | }; 96 | 97 | class SimpleSortedKmerDB{ 98 | 99 | private: 100 | 101 | string filename; 102 | int64_t cursor = 0; 103 | int64_t n_kmers = 0; 104 | seq_io::Buffered_ifstream<> in; 105 | vector char_block_starts; 106 | 107 | public: 108 | 109 | // From KMC database. KMC database must be sorted!! 110 | SimpleSortedKmerDB(Kmer_stream_from_KMC_DB& sorted_kmc_db, string filename) : filename(filename), char_block_starts(256, INT64_MAX) { 111 | seq_io::Buffered_ofstream<> out(filename); 112 | char kmer_write_buf[Kmer::size_in_bytes()]; 113 | 114 | while(!sorted_kmc_db.done()){ 115 | Kmer kmer = sorted_kmc_db.next(); 116 | kmer.serialize(kmer_write_buf); 117 | out.write(kmer_write_buf, Kmer::size_in_bytes()); 118 | 119 | char_block_starts[kmer.last()] = min(char_block_starts[kmer.last()], n_kmers); 120 | 121 | n_kmers++; 122 | } 123 | 124 | for(char c : "ACGT"){ 125 | char_block_starts[c] = min(char_block_starts[c], n_kmers); // One past end 126 | } 127 | 128 | out.flush(); 129 | in.open(filename, ios::binary); 130 | } 131 | 132 | // Clones the object 133 | SimpleSortedKmerDB(const SimpleSortedKmerDB& other){ 134 | filename = other.filename; 135 | cursor = other.cursor; 136 | n_kmers = other.n_kmers; 137 | in.open(filename, ios::binary); 138 | char_block_starts = other.char_block_starts; 139 | } 140 | 141 | int64_t get_char_block_start(char c){ 142 | return char_block_starts[c]; 143 | } 144 | 145 | bool done(){ 146 | return cursor >= n_kmers; 147 | } 148 | 149 | void seek_to_char_block(char c){ 150 | cursor = char_block_starts[c]; 151 | in.open(filename, ios::binary); 152 | in.seekg(cursor * Kmer::size_in_bytes()); 153 | } 154 | 155 | Kmer next(){ 156 | char kmer_load_buf[Kmer::size_in_bytes()]; 157 | in.read(kmer_load_buf, Kmer::size_in_bytes()); 158 | 159 | Kmer x; 160 | x.load(kmer_load_buf); 161 | 162 | cursor++; 163 | return x; 164 | } 165 | 166 | }; 167 | 168 | // 
This stream will always start with an empty k-mer with an empty edge label set 169 | class Disk_Instream{ 170 | 171 | private: 172 | 173 | // Have pointer members -> no copying 174 | Disk_Instream(Disk_Instream const&) = delete; 175 | Disk_Instream& operator=(Disk_Instream const&) = delete; 176 | 177 | bool all_read = false; 178 | seq_io::Buffered_ifstream<> in; 179 | char* in_buffer; 180 | 181 | Node top; // Default-initialized to an empty k-mer and an empty edge set 182 | 183 | void update_top(); 184 | 185 | public: 186 | 187 | Disk_Instream(string filename); 188 | bool stream_done() const; 189 | Node stream_next(); 190 | Node peek_next(); 191 | ~Disk_Instream(); 192 | 193 | }; 194 | 195 | // A single sorted stream out of two sorted streams 196 | class Node_stream_merger{ 197 | 198 | Disk_Instream& A; 199 | Disk_Instream& B; 200 | 201 | public: 202 | 203 | Node_stream_merger(Disk_Instream& A, Disk_Instream& B); 204 | 205 | bool stream_done(); 206 | 207 | Node stream_next(); 208 | 209 | }; 210 | 211 | } // End of namespace KMC_construction_helper_classes 212 | } // End of namepace sbwt -------------------------------------------------------------------------------- /include/sbwt/run_kmc.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | // A wrapper to KMC construction and sorting 8 | 9 | namespace sbwt{ 10 | 11 | using namespace std; 12 | 13 | // Returns the KMC database prefix and the number of distinct k-mers that had abundance within the given bounds 14 | pair run_kmc(const vector& input_files, int64_t k, int64_t n_threads, int64_t ram_gigas, int64_t min_abundance, int64_t max_abundance); 15 | 16 | // Sort a KMC database 17 | void sort_kmc_db(const string& input_db_file, const string& output_db_file, int64_t n_threads); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /include/sbwt/stdlib_printing.hh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace sbwt{ 11 | 12 | using namespace std; 13 | 14 | template 15 | ostream& operator<<(ostream& os, const unordered_map& v){ 16 | os << "["; 17 | for(auto it = v.begin(); it != v.end(); it++) { 18 | if(it != v.begin()) os << ", "; 19 | os << it->first << ": " << it->second; 20 | } 21 | os << "]"; 22 | return os; 23 | } 24 | 25 | template 26 | ostream& operator<<(ostream& os, const map& v){ 27 | os << "{"; 28 | for(auto it = v.begin(); it != v.end(); it++) { 29 | if(it != v.begin()) os << ", "; 30 | os << it->first << ": " << it->second; 31 | } 32 | os << "}"; 33 | return os; 34 | } 35 | 36 | template 37 | ostream& operator<<(ostream& os, const vector& v){ 38 | os << "["; 39 | for(auto it = v.begin(); it != v.end(); it++) { 40 | if(it != v.begin()) os << ", "; 41 | os << *it; 42 | } 43 | os << "]"; 44 | return os; 45 | } 46 | 47 | template 48 | ostream& operator<<(ostream& os, const set& v){ 49 | os << "["; 50 | for(auto it = v.begin(); it != v.end(); it++) { 51 | if(it != v.begin()) os << ", "; 52 | os << *it; 53 | } 54 | os << "]"; 55 | return os; 56 | } 57 | 58 | template 59 | ostream& operator<<(ostream& os, const multiset& v){ 60 | os << "["; 61 | for(auto it = v.begin(); it != v.end(); it++) { 62 | if(it != v.begin()) os << ", "; 63 | os << *it; 64 | } 65 | os << "]"; 66 | return os; 67 | } 68 | 69 | template 70 | ostream& operator<<(ostream& os, const pair& x){ 71 | os << "(" << x.first << ", " << x.second << ")"; 72 | return os; 73 | } 74 | 75 | } -------------------------------------------------------------------------------- /include/sbwt/suffix_group_optimization.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sdsl/bit_vectors.hpp" 4 | #include 5 | 6 | namespace sbwt{ 7 | 8 | using namespace 
std; 9 | 10 | // Entropy of distribution P 11 | double entropy(const vector& P); 12 | 13 | // Pushes the bits to the left end of the suffix group 14 | void push_bits_left(sdsl::bit_vector& A_bits, 15 | sdsl::bit_vector& C_bits, 16 | sdsl::bit_vector& G_bits, 17 | sdsl::bit_vector& T_bits, 18 | const sdsl::bit_vector& suffix_group_marks); 19 | 20 | // Maximally spreads the bits inside a suffix group. Assumes the bits have already been pushed to the left 21 | void spread_bits_after_push_left(sdsl::bit_vector& A_bits, 22 | sdsl::bit_vector& C_bits, 23 | sdsl::bit_vector& G_bits, 24 | sdsl::bit_vector& T_bits, 25 | const sdsl::bit_vector& suffix_group_marks); 26 | 27 | sdsl::bit_vector mark_suffix_groups(const sdsl::bit_vector& A_bits, 28 | const sdsl::bit_vector& C_bits, 29 | const sdsl::bit_vector& G_bits, 30 | const sdsl::bit_vector& T_bits, 31 | int64_t k); 32 | 33 | double compute_column_entropy(const sdsl::bit_vector& A_bits, 34 | const sdsl::bit_vector& C_bits, 35 | const sdsl::bit_vector& G_bits, 36 | const sdsl::bit_vector& T_bits); 37 | 38 | } -------------------------------------------------------------------------------- /include/sbwt/throwing_streams.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sbwt{ 13 | 14 | using namespace std; 15 | 16 | class throwing_ofstream{ 17 | 18 | public: 19 | 20 | // No copying, just moving 21 | throwing_ofstream(throwing_ofstream const& other) = delete; 22 | throwing_ofstream& operator=(throwing_ofstream const& other) = delete; 23 | throwing_ofstream(throwing_ofstream&&) = default; 24 | throwing_ofstream& operator = (throwing_ofstream&&) = default; 25 | 26 | throwing_ofstream() {} 27 | 28 | string filename; 29 | ofstream stream; 30 | 31 | // If file open fails, throws an error 32 | throwing_ofstream(string filename, ios_base::openmode mode = 
ios_base::out) : filename(filename) { 33 | stream.open(filename, mode); 34 | if(!stream.good()){ 35 | throw runtime_error("Error opening file: " + filename); 36 | } 37 | } 38 | 39 | // Throws an error if write failed 40 | void write(const char* data, int64_t n){ 41 | stream.write(data, n); 42 | if(!stream.good()){ 43 | throw(runtime_error("Error writing to file " + filename)); 44 | } 45 | } 46 | 47 | // Throws an error if file open failed 48 | void open(string filename, ios_base::openmode mode = ios_base::out){ 49 | stream.open(filename, mode); 50 | if(!stream.good()){ 51 | throw runtime_error("Error opening file: " + filename); 52 | } 53 | } 54 | 55 | void close(){ 56 | stream.close(); 57 | } 58 | 59 | void flush(){ 60 | stream.flush(); 61 | } 62 | }; 63 | 64 | template 65 | throwing_ofstream& operator<<(throwing_ofstream& os, const T& t){ 66 | os.stream << t; 67 | if(!os.stream.good()) 68 | throw runtime_error("Error writing type " + string(typeid(T).name()) + " to file " + os.filename); 69 | return os; 70 | } 71 | 72 | class throwing_ifstream{ 73 | 74 | public: 75 | 76 | // No copying, just moving 77 | throwing_ifstream(throwing_ifstream const& other) = delete; 78 | throwing_ifstream& operator=(throwing_ifstream const& other) = delete; 79 | throwing_ifstream(throwing_ifstream&&) = default; 80 | throwing_ifstream& operator = (throwing_ifstream&&) = default; 81 | 82 | string filename; 83 | ifstream stream; 84 | 85 | throwing_ifstream() {} 86 | // If file open failed, throws an error 87 | throwing_ifstream(string filename, ios_base::openmode mode = ios_base::in) : filename(filename) { 88 | stream.open(filename, mode); 89 | if(!stream.good()){ 90 | throw runtime_error("Error opening file: '" + filename + "'"); 91 | } 92 | } 93 | 94 | // If read succeeded, returns true 95 | // If read failed due to EOF, returns false. 96 | // If read failed due to other problems, throws an error. 
97 | bool getline(string& line){ 98 | std::getline(stream, line); 99 | if(!stream.good() && !stream.eof()) 100 | throw runtime_error("Error reading from file '" + filename + "'"); 101 | 102 | return !stream.eof(); 103 | } 104 | 105 | // If read succeeded, returns true 106 | // If read failed due to EOF, returns false. (this means we can use the pattern while(in.read(data,n)){...} 107 | // If read failed due to other problems, throws an error. 108 | bool read(char* data, int64_t n){ 109 | stream.read(data, n); 110 | if(!stream.good() && !stream.eof()) 111 | throw runtime_error("Error reading from file '" + filename + "'"); 112 | return !stream.eof(); 113 | } 114 | 115 | // Returns number of characters read by the previous read operation 116 | int64_t gcount(){ 117 | return stream.gcount(); 118 | } 119 | 120 | // If open fails, throws an error 121 | void open(string filename, ios_base::openmode mode = ios_base::in){ 122 | stream.open(filename, mode); 123 | if(!stream.good()){ 124 | throw runtime_error("Error opening file: '" + filename + "'"); 125 | } 126 | } 127 | 128 | void close(){ 129 | stream.close(); 130 | } 131 | 132 | }; 133 | 134 | // Returns the stream object 135 | // If read failed and it's not due to EOF, throws an error 136 | template 137 | throwing_ifstream& operator>>(throwing_ifstream& is, T& t){ 138 | is.stream >> t; 139 | if(!is.stream.good() && !is.stream.eof()) 140 | throw runtime_error("Error reading type " + string(typeid(T).name()) + " from file " + is.filename); 141 | return is; 142 | } 143 | 144 | } -------------------------------------------------------------------------------- /include/sbwt/variants.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "cxxopts.hpp" 6 | #include "globals.hh" 7 | #include "SBWT.hh" 8 | #include "SubsetWT.hh" 9 | #include "stdlib_printing.hh" 10 | #include "SubsetSplitRank.hh" 11 | #include "SubsetMatrixRank.hh" 12 | 
#include "SubsetConcatRank.hh" 13 | #include 14 | #include "MEF.hpp" 15 | 16 | namespace sbwt{ 17 | 18 | /* NOTE(review): the template argument lists of the typedefs in this header look truncated in this copy (for example SBWT followed directly by >>) -- restore them from the upstream variants.hh before relying on this text. */ // matrices 19 | typedef SBWT>> plain_matrix_sbwt_t; 20 | typedef SBWT, sdsl::rrr_vector<>::rank_1_type>> rrr_matrix_sbwt_t; 21 | typedef SBWT, mod_ef_vector<>::rank_1_type>> mef_matrix_sbwt_t; // Currently does not support extracting all k-mers because mod_ef_vector does not support access. 22 | 23 | // splits 24 | typedef SBWT, 25 | sdsl::bit_vector, sdsl::rank_support_v5<>>> plain_split_sbwt_t; 26 | 27 | typedef SBWT, sdsl::rrr_vector<>::rank_1_type, 28 | sdsl::bit_vector, sdsl::rank_support_v5<>>> rrr_split_sbwt_t; 29 | 30 | 31 | typedef SBWT, mod_ef_vector<>::rank_1_type, 32 | sdsl::bit_vector, sdsl::rank_support_v5<>>> mef_split_sbwt_t; // Currently does not support extracting all k-mers because mod_ef_vector does not support access. 33 | 34 | // concats 35 | typedef SBWT, 39 | sdsl::select_support_scan<1>, 40 | sdsl::select_support_scan<0>>> 41 | > plain_concat_sbwt_t; 42 | 43 | typedef SBWT, 44 | sd_vector<>::select_0_type, 45 | sdsl::wt_blcd, 46 | rrr_vector<>::rank_1_type, 47 | rrr_vector<>::select_1_type, 48 | rrr_vector<>::select_0_type>> 49 | > mef_concat_sbwt_t; // Currently does not support extracting all k-mers because mod_ef_vector does not support access. 
50 | 51 | // wavelet trees 52 | typedef SBWT, 54 | sdsl::select_support_scan<1>, 55 | sdsl::select_support_scan<0>>> 56 | > plain_sswt_sbwt_t; 57 | 58 | 59 | typedef SBWT, 60 | sdsl::rrr_vector<>::rank_1_type, 61 | rrr_vector<>::select_1_type, 62 | rrr_vector<>::select_0_type>> 63 | > rrr_sswt_sbwt_t; 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/CLI/sbwt.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "commands.hh" 6 | #include "globals.hh" 7 | 8 | using namespace std; 9 | 10 | static vector commands = {"build", "build-variant", "search", "ascii-export"}; 11 | 12 | /* Prints the list of available subcommands to cerr, using argv[0] as the program name. */ void print_help(int argc, char** argv){ 13 | (void) argc; // Unused parameter 14 | cerr << "Available commands: " << endl; 15 | for(string S : commands) cerr << " " << argv[0] << " " << S << endl; 16 | cerr << "Running a command without arguments prints the usage instructions for the command." << endl; 17 | } 18 | 19 | /* Entry point. Prints the help and returns 0 when run with no arguments or with --help / -h. Otherwise shifts argv left by one and dispatches on the original argv[1] to build_main, search_main, build_from_plain_main or ascii_export_main. An unknown command throws runtime_error; every std::exception is caught, reported on cerr, and turned into return code 1. */ int main(int argc, char** argv){ 20 | 21 | #ifndef __BMI2__ 22 | cerr << "WARNING: This program was compiled for a CPU without support for the BMI2 instruction set. The performance of the Elias-Fano variants will be very bad." 
<< endl; 23 | #endif 24 | 25 | sbwt::write_log("Maximum k-mer length is set to " + to_string(MAX_KMER_LENGTH), sbwt::LogLevel::MAJOR); 26 | 27 | if(argc == 1){ 28 | print_help(argc, argv); 29 | return 0; 30 | } 31 | 32 | string command = argv[1]; 33 | if(command == "--help" || command == "-h"){ 34 | print_help(argc, argv); 35 | return 0; 36 | } 37 | 38 | // Drop the first element of argv 39 | for(int64_t i = 1; i < argc; i++) argv[i-1] = argv[i]; 40 | argc--; 41 | 42 | try{ 43 | if(command == "build") return build_main(argc, argv); 44 | else if(command == "search") return search_main(argc, argv); 45 | else if(command == "build-variant") return build_from_plain_main(argc, argv); 46 | else if(command == "ascii-export") return ascii_export_main(argc, argv); 47 | else{ 48 | throw std::runtime_error("Invalid command: " + command); 49 | return 1; 50 | } 51 | } catch (const std::runtime_error &e){ 52 | std::cerr << "Runtime error: " << e.what() << '\n'; 53 | return 1; 54 | } catch(const std::exception& e){ 55 | std::cerr << "Error: " << e.what() << '\n'; 56 | return 1; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/CLI/sbwt_ascii_export.cpp: -------------------------------------------------------------------------------- 1 | #include "globals.hh" 2 | #include "throwing_streams.hh" 3 | #include "cxxopts.hpp" 4 | #include "SBWT.hh" 5 | #include "SubsetMatrixRank.hh" 6 | #include "SeqIO/SeqIO.hh" 7 | #include "variants.hh" 8 | #include "commands.hh" 9 | #include "throwing_streams.hh" 10 | 11 | using namespace std; 12 | using namespace sbwt; 13 | 14 | /* Writes the metadata and then the sets of the given index to out, flushing after each part. */ template 15 | void export_sbwt_variant(const sbwt_t& sbwt, seq_io::Buffered_ofstream<>& out) { 16 | sbwt.ascii_export_metadata(out); 17 | out.flush(); 18 | sbwt.ascii_export_sets(out); 19 | out.flush(); 20 | } 21 | 22 | /* The ascii-export subcommand. 
Parses --out-file / --index-file / --help; prints the usage and exits with status 1 when run without arguments or with --help. Reads the variant name from the index file, returns 1 if it is not among get_available_variants() or if it is one of the mef-* variants, and otherwise loads the index and exports it to the output file. Returns 0 on success. */ int ascii_export_main(int argc, char** argv){ 23 | 24 | cxxopts::Options options(argv[0], "\nExport SBWT in an ASCII format. 
First comes three lines of metadata:\n\nversion: \nk: \nnumber_of_sets: \nnumber_of_kmers: \n\nThis is followed by a line of the form \"sbwt: \". The format of is as follows: Each set is written as a string of characters. A non-empty set is a string of DNA characters (ACGT), such that the last character of the set is written in lower case. Empty sets are represented as a single '$'. The representations of the sets are concatenated together. For example, the sequence {A,C}, {A,T}, {}, {C}, {A,G,T} is represented as AcAt$cAGt."); 25 | 26 | options.add_options() 27 | ("o,out-file", "Output filename.", cxxopts::value()) 28 | ("i,index-file", "Index input file.", cxxopts::value()) 29 | ("h,help", "Print usage") 30 | ; 31 | 32 | int64_t old_argc = argc; // Must store this because the parser modifies it 33 | auto opts = options.parse(argc, argv); 34 | 35 | if (old_argc == 1 || opts.count("help")){ 36 | std::cerr << options.help() << std::endl; 37 | exit(1); 38 | } 39 | 40 | string indexfile = opts["index-file"].as(); 41 | string outfile = opts["out-file"].as(); 42 | 43 | vector variants = get_available_variants(); 44 | 45 | throwing_ifstream in(indexfile, ios::binary); 46 | string variant = load_string(in.stream); // read variant type 47 | if(std::find(variants.begin(), variants.end(), variant) == variants.end()){ 48 | cerr << "Error loading index from file: unrecognized variant specified in the file" << endl; 49 | return 1; 50 | } 51 | 52 | write_log("Loading and exporting the index variant " + variant, LogLevel::MAJOR); 53 | seq_io::Buffered_ofstream<> out(outfile); 54 | 55 | if (variant == "plain-matrix"){ 56 | plain_matrix_sbwt_t sbwt; 57 | sbwt.load(in.stream); 58 | export_sbwt_variant(sbwt, out); 59 | } 60 | if (variant == "rrr-matrix"){ 61 | rrr_matrix_sbwt_t sbwt; 62 | sbwt.load(in.stream); 63 | export_sbwt_variant(sbwt, out); 64 | } 65 | if (variant == "mef-matrix"){ 66 | cerr << "Error: Index export does not work for mef-matrix because mef does not implement 
access to the sets" << endl; 67 | return 1; 68 | } 69 | if (variant == "plain-split"){ 70 | plain_split_sbwt_t sbwt; 71 | sbwt.load(in.stream); 72 | export_sbwt_variant(sbwt, out); 73 | } 74 | if (variant == "rrr-split"){ 75 | rrr_split_sbwt_t sbwt; 76 | sbwt.load(in.stream); 77 | export_sbwt_variant(sbwt, out); 78 | } 79 | if (variant == "mef-split"){ 80 | cerr << "Error: Index export does not work for mef-split because mef does not implement access to the sets" << endl; 81 | return 1; 82 | } 83 | if (variant == "plain-concat"){ 84 | plain_concat_sbwt_t sbwt; 85 | sbwt.load(in.stream); 86 | export_sbwt_variant(sbwt, out); 87 | } 88 | if (variant == "mef-concat"){ 89 | cerr << "Error: Index export does not work for mef-concat because mef does not implement access to the sets" << endl; 90 | return 1; 91 | } 92 | if (variant == "plain-subsetwt"){ 93 | plain_sswt_sbwt_t sbwt; 94 | sbwt.load(in.stream); 95 | export_sbwt_variant(sbwt, out); 96 | } 97 | if (variant == "rrr-subsetwt"){ 98 | rrr_sswt_sbwt_t sbwt; 99 | sbwt.load(in.stream); 100 | export_sbwt_variant(sbwt, out); 101 | } 102 | 103 | return 0; 104 | } -------------------------------------------------------------------------------- /src/CLI/sbwt_build_from_plain_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include "globals.hh" 2 | #include "throwing_streams.hh" 3 | #include "cxxopts.hpp" 4 | #include "SBWT.hh" 5 | #include "SubsetMatrixRank.hh" 6 | #include "SeqIO/SeqIO.hh" 7 | #include "variants.hh" 8 | #include "commands.hh" 9 | 10 | 11 | using namespace std; 12 | 13 | /* The build-variant subcommand. Sets the log level to MAJOR, parses --in-file / --out-file / --variant (default plain-matrix), and prints the usage and exits with status 1 when run without arguments or with --help. 
Returns 1 for an unknown variant or when the input file does not hold a plain-matrix index; otherwise loads the plain-matrix index, builds the requested variant from its A/C/G/T bit vectors and streaming support, serializes it to the output file, logs the space used, and returns 0. */ int build_from_plain_main(int argc, char** argv){ 14 | 15 | sbwt::set_log_level(sbwt::LogLevel::MAJOR); 16 | 17 | cxxopts::Options options(argv[0], "Construct an SBWT variant from a plain matrix SBWT."); 18 | 19 | vector variants = get_available_variants(); 20 | string all_variants_string; 21 | for(string variant : variants) all_variants_string += " " + variant; 22 | 23 | 
options.add_options() 24 | ("i,in-file", "Index file of a plain matrix SBWT.", cxxopts::value()) 25 | ("o,out-file", "Output file for the constructed variant.", cxxopts::value()) 26 | ("variant", "The SBWT variant to build. Available variants:" + all_variants_string, cxxopts::value()->default_value("plain-matrix")) 27 | ("h,help", "Print usage") 28 | ; 29 | 30 | int64_t old_argc = argc; // Must store this because the parser modifies it 31 | auto opts = options.parse(argc, argv); 32 | 33 | if (old_argc == 1 || opts.count("help")){ 34 | std::cerr << options.help() << std::endl; 35 | exit(1); 36 | } 37 | 38 | string variant = opts["variant"].as(); 39 | if(std::find(variants.begin(), variants.end(), variant) == variants.end()){ 40 | cerr << "Error: unknown variant: " << variant << endl; 41 | cerr << "Available variants are:" << all_variants_string << endl; 42 | return 1; 43 | } 44 | 45 | string out_file = opts["out-file"].as(); 46 | sbwt::check_writable(out_file); 47 | 48 | string in_file = opts["in-file"].as(); 49 | sbwt::check_readable(in_file); 50 | 51 | sbwt::throwing_ifstream in(in_file, ios::binary); 52 | string variant_on_disk = sbwt::load_string(in.stream); // read variant type 53 | if(variant_on_disk != "plain-matrix"){ 54 | cerr << "Error input is not a plain-matrix SBWT." 
<< endl; 55 | return 1; 56 | } 57 | 58 | sbwt::plain_matrix_sbwt_t matrixboss_plain; 59 | write_log("Reading input.", sbwt::LogLevel::MAJOR); 60 | matrixboss_plain.load(in.stream); 61 | 62 | sbwt::write_log("Building variant " + variant, sbwt::LogLevel::MAJOR); 63 | 64 | const sdsl::bit_vector& A_bits = matrixboss_plain.get_subset_rank_structure().A_bits; 65 | const sdsl::bit_vector& C_bits = matrixboss_plain.get_subset_rank_structure().C_bits; 66 | const sdsl::bit_vector& G_bits = matrixboss_plain.get_subset_rank_structure().G_bits; 67 | const sdsl::bit_vector& T_bits = matrixboss_plain.get_subset_rank_structure().T_bits; 68 | const sdsl::bit_vector& ssupport = matrixboss_plain.get_streaming_support(); 69 | int64_t n_kmers = matrixboss_plain.number_of_kmers(); 70 | int64_t k = matrixboss_plain.get_k(); 71 | int64_t precalc_k = matrixboss_plain.get_precalc_k(); 72 | 73 | int64_t bytes_written = 0; 74 | sbwt::throwing_ofstream out(out_file, ios::binary); 75 | 76 | sbwt::serialize_string(variant, out.stream); 77 | if (variant == "plain-matrix"){ 78 | bytes_written = matrixboss_plain.serialize(out.stream); 79 | } 80 | if (variant == "rrr-matrix"){ 81 | sbwt::rrr_matrix_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 82 | bytes_written = sbwt.serialize(out.stream); 83 | } 84 | if (variant == "mef-matrix"){ 85 | sbwt::mef_matrix_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 86 | bytes_written = sbwt.serialize(out.stream); 87 | } 88 | if (variant == "plain-split"){ 89 | sbwt::plain_split_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 90 | bytes_written = sbwt.serialize(out.stream); 91 | } 92 | if (variant == "rrr-split"){ 93 | sbwt::rrr_split_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 94 | bytes_written = sbwt.serialize(out.stream); 95 | } 96 | if (variant == "mef-split"){ 97 | sbwt::mef_split_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, 
ssupport, k, n_kmers, precalc_k); 98 | bytes_written = sbwt.serialize(out.stream); 99 | } 100 | if (variant == "plain-concat"){ 101 | sbwt::plain_concat_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 102 | bytes_written = sbwt.serialize(out.stream); 103 | } 104 | if (variant == "mef-concat"){ 105 | sbwt::mef_concat_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 106 | bytes_written = sbwt.serialize(out.stream); 107 | } 108 | if (variant == "plain-subsetwt"){ 109 | sbwt::plain_sswt_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 110 | bytes_written = sbwt.serialize(out.stream); 111 | } 112 | if (variant == "rrr-subsetwt"){ 113 | sbwt::rrr_sswt_sbwt_t sbwt(A_bits, C_bits, G_bits, T_bits, ssupport, k, n_kmers, precalc_k); 114 | bytes_written = sbwt.serialize(out.stream); 115 | } 116 | 117 | sbwt::write_log("Built variant " + variant + " to file " + out_file, sbwt::LogLevel::MAJOR); 118 | sbwt::write_log("Space on disk: " + 119 | to_string(bytes_written * 8.0 / matrixboss_plain.number_of_subsets()) + " bits per column, " + 120 | to_string(bytes_written * 8.0 / matrixboss_plain.number_of_kmers()) + " bits per k-mer" , 121 | sbwt::LogLevel::MAJOR); 122 | 123 | return 0; 124 | } 125 | -------------------------------------------------------------------------------- /src/EM_sort/Block.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "EM_sort/bit_level_stuff.hh" 13 | #include "EM_sort/Block.hh" 14 | #include 15 | 16 | using namespace std; 17 | using namespace sbwt; 18 | 19 | /* Reads one length-prefixed record into *buffer. The first 8 bytes are read as the record length (decoded with parse_big_endian_LL), which counts the 8 header bytes themselves, since the payload read is rec_len - 8 bytes. 
The buffer is doubled with realloc until it can hold rec_len bytes; *buffer and *buffer_len are updated accordingly. Returns true if the header read returned more than 0 bytes, false otherwise. NOTE(review): the realloc result and a header read shorter than 8 bytes are not checked. */ bool sbwt::read_variable_binary_record(seq_io::Buffered_ifstream<>& input, char** buffer, int64_t* buffer_len){ 20 | assert(*buffer_len > 0); 21 | char rec_len_buf[8]; 22 | int64_t bytes_read = input.read(rec_len_buf, 8); 
// Try to read the length of the record 23 | while(bytes_read > 0){ 24 | // Read was successful 25 | int64_t rec_len = parse_big_endian_LL(rec_len_buf); 26 | while(*buffer_len < rec_len){ // Make space in the buffer if needed 27 | *buffer = (char*)realloc(*buffer, *(buffer_len)*2); 28 | *buffer_len *= 2; 29 | } 30 | memcpy(*buffer, rec_len_buf, 8); 31 | bytes_read = input.read(*buffer + 8, rec_len - 8); // Read the payload 32 | return true; 33 | } 34 | return false; 35 | } 36 | 37 | /* Reads variable-length records from input into a newly allocated block until the block's estimated size exceeds B bytes or no more records can be read. The block is created with new and returned as a raw pointer; presumably the caller is responsible for deleting it -- confirm against the callers. */ Variable_binary_block* sbwt::get_next_variable_binary_block(seq_io::Buffered_ifstream<>& input, int64_t B){ 38 | int64_t buffer_len = 1024; // MUST HAVE AT LEAST 8 BYTES 39 | char* buffer = (char*)malloc(buffer_len); 40 | Variable_binary_block* block = new Variable_binary_block(); 41 | 42 | while(read_variable_binary_record(input, &buffer, &buffer_len)){ 43 | block->add_record(buffer); 44 | if(block->estimate_size_in_bytes() > B) break; 45 | } 46 | 47 | free(buffer); 48 | return block; 49 | } 50 | 51 | /* Same as above for fixed-size records: reads record_size bytes at a time into a newly allocated block until a read returns 0 or the block's estimated size exceeds B bytes. 
*/ Constant_binary_block* sbwt::get_next_constant_binary_block(seq_io::Buffered_ifstream<>& input, int64_t B, int64_t record_size){ 52 | Constant_binary_block* block = new Constant_binary_block(record_size); 53 | char* buf = (char*)malloc(record_size); 54 | while(true){ 55 | if(input.read(buf, record_size) == 0) break; // end of file 56 | block->add_record(buf); 57 | if(block->estimate_size_in_bytes() > B) break; 58 | } 59 | 60 | free(buf); 61 | return block; 62 | } 63 | -------------------------------------------------------------------------------- /src/globals.cpp: -------------------------------------------------------------------------------- 1 | #include "globals.hh" 2 | #include "throwing_streams.hh" 3 | #include "SeqIO/SeqIO.hh" 4 | #include "SeqIO/buffered_streams.hh" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "zstr/zstr.hpp" 12 | 13 | using namespace std::chrono; 14 | using namespace sbwt; 15 | 16 | string sbwt::get_rc(const string& 
S){ 17 | string T = S; 18 | std::reverse(T.begin(), T.end()); 19 | for(char& c : T) c = get_rc(c); 20 | return T; 21 | } 22 | 23 | vector sbwt::readlines(string filename){ 24 | vector lines; 25 | string line; 26 | throwing_ifstream in(filename); 27 | while(getline(in.stream,line)){ 28 | lines.push_back(line); 29 | } 30 | return lines; 31 | } 32 | 33 | 34 | Temp_File_Manager& sbwt::get_temp_file_manager(){ 35 | static Temp_File_Manager temp_file_manager; // Singleton 36 | return temp_file_manager; 37 | } 38 | 39 | void sbwt::check_readable(string filename){ 40 | throwing_ifstream F(filename); // Throws on failure 41 | } 42 | 43 | // Also clears the file 44 | void sbwt::check_writable(string filename){ 45 | throwing_ofstream F(filename, std::ofstream::out | std::ofstream::app); // Throws on failure 46 | } 47 | 48 | // Returns the number of bytes written 49 | int64_t sbwt::serialize_string(const string& S, ostream& out){ 50 | int64_t size = S.size(); 51 | out.write((char*)&size, sizeof(size)); 52 | out.write(S.data(), size); 53 | return sizeof(size) + size; 54 | } 55 | 56 | string sbwt::load_string(istream& in){ 57 | int64_t size; 58 | in.read((char*)&size, sizeof(size)); 59 | string S(size, '\0'); 60 | in.read((char*)&S[0], size); // The C++ standard guarantees that std::string is stored contiguously in memory 61 | return S; 62 | } 63 | 64 | long long sbwt::cur_time_millis(){ 65 | return (std::chrono::duration_cast< milliseconds >(high_resolution_clock::now().time_since_epoch())).count(); 66 | } 67 | 68 | long long sbwt::cur_time_micros(){ 69 | return (std::chrono::duration_cast< microseconds >(high_resolution_clock::now().time_since_epoch())).count(); 70 | } 71 | 72 | static long long int program_start_millis = cur_time_millis(); 73 | static long long int program_start_micros = cur_time_micros(); 74 | 75 | double sbwt::seconds_since_program_start(){ 76 | return (cur_time_micros() - program_start_micros) / 1000.0; 77 | } 78 | 79 | string sbwt::getTimeString(){ 80 | 
std::time_t result = std::time(NULL); 81 | string time = std::asctime(std::localtime(&result)); 82 | return time.substr(0,time.size() - 1); // Trim the trailing newline 83 | } 84 | 85 | static LogLevel loglevel = MAJOR; 86 | void sbwt::set_log_level(LogLevel level){ 87 | loglevel = level; 88 | } 89 | LogLevel sbwt::get_log_level(){ 90 | return loglevel; 91 | } 92 | 93 | static std::mutex write_log_mutex; 94 | void sbwt::write_log(string message, LogLevel level){ 95 | if(level <= loglevel){ 96 | std::lock_guard lock(write_log_mutex); 97 | std::streamsize default_precision = std::cout.precision(); 98 | 99 | std::cerr << 100 | std::setprecision(4) << std::fixed << 101 | seconds_since_program_start() << 102 | std::setprecision(default_precision) << 103 | " " << getTimeString() << " " << message << std::endl; 104 | } 105 | } 106 | 107 | void sbwt::check_true(bool condition, string error_message){ 108 | if(!condition){ 109 | throw std::runtime_error(error_message); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/kmc_construct_helper_classes.cpp: -------------------------------------------------------------------------------- 1 | #include "Kmer.hh" 2 | #include 3 | #include "kmc_api/kmc_file.h" 4 | #include "include/kmc_runner.h" // In KMC/include 5 | #include "SeqIO/buffered_streams.hh" 6 | #include "EM_sort/EM_sort.hh" 7 | #include "kmc_construct_helper_classes.hh" 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sbwt{ 13 | 14 | namespace KMC_construction_helper_classes{ 15 | 16 | Node::Node() : kmer(), edge_flags(0) {} 17 | Node::Node(kmer_t kmer) : kmer(kmer), edge_flags(0) {} 18 | 19 | static inline int64_t size_in_bytes(){ 20 | return kmer_t::size_in_bytes() + sizeof(char); // char is the edge flags 21 | } 22 | 23 | void Node::set(char c){ 24 | if(c == 'A') edge_flags |= 1 << 0; 25 | else if(c == 'C') edge_flags |= 1 << 1; 26 | else if(c == 'G') edge_flags |= 1 << 2; 27 | else if(c == 'T') 
edge_flags |= 1 << 3; 28 | } 29 | 30 | bool Node::has(char c) const{ 31 | if(c == 'A') return edge_flags & (1 << 0); 32 | else if(c == 'C') return edge_flags & (1 << 1); 33 | else if(c == 'G') return edge_flags & (1 << 2); 34 | else if(c == 'T') return edge_flags & (1 << 3); 35 | return false; 36 | } 37 | 38 | bool Node::operator==(const Node &other) const{ 39 | return this->kmer == other.kmer && this->edge_flags == other.edge_flags; 40 | } 41 | 42 | bool Node::operator!=(const Node &other) const{ 43 | return !(*this == other); 44 | } 45 | 46 | bool Node::operator<(const Node &other) const{ 47 | if(this->kmer < other.kmer) return true; 48 | if(this->kmer == other.kmer && this->edge_flags < other.edge_flags) return true; 49 | return false; 50 | } 51 | 52 | string Node::to_string() const{ 53 | string S = kmer.to_string() + ": "; 54 | S += has('A') ? '1' : '0'; 55 | S += has('C') ? '1' : '0'; 56 | S += has('G') ? '1' : '0'; 57 | S += has('T') ? '1' : '0'; 58 | return S; 59 | } 60 | 61 | void Node::serialize(char* buf){ 62 | kmer.serialize(buf); 63 | buf[size_in_bytes()-1] = edge_flags; 64 | } 65 | 66 | void Node::load(const char* buf){ 67 | kmer.load(buf); 68 | edge_flags = buf[size_in_bytes()-1]; 69 | } 70 | 71 | 72 | Argv::Argv(vector v){ 73 | array = (char**)malloc(sizeof(char*) * v.size()); 74 | // Copy contents of v into array 75 | for(int64_t i = 0; i < v.size(); i++){ 76 | char* s = (char*)malloc(sizeof(char) * (v[i].size() + 1)); // +1: space for '\0' at the end 77 | for(int64_t j = 0; j < v[i].size(); j++){ 78 | s[j] = v[i][j]; // Can't use strcpy because s.c_str() is const 79 | } 80 | s[v[i].size()] = '\0'; 81 | array[i] = s; 82 | } 83 | size = v.size(); 84 | } 85 | 86 | Argv::~Argv(){ 87 | for(int64_t i = 0; i < size; i++) free(array[i]); 88 | free(array); 89 | } 90 | 91 | char Kmer_stream_from_KMC_DB::get_rc(char c){ 92 | switch(c){ 93 | case 'A': return 'T'; 94 | case 'T': return 'A'; 95 | case 'C': return 'G'; 96 | case 'G': return 'C'; 97 | default: 
cerr << "Error getting reverse complement from " << c << endl; exit(1); 98 | } 99 | } 100 | 101 | void Kmer_stream_from_KMC_DB::reverse_complement(string& S){ 102 | std::reverse(S.begin(), S.end()); 103 | for(char& c : S) c = get_rc(c); 104 | } 105 | 106 | 107 | Kmer_stream_from_KMC_DB::Kmer_stream_from_KMC_DB(string KMC_db_path, bool add_revcomps) : add_revcomps(add_revcomps) { 108 | kmer_database = new CKMCFile(); 109 | if (!kmer_database->OpenForListing(KMC_db_path)){ 110 | throw std::runtime_error("Error opening KMC database " + KMC_db_path); 111 | } 112 | 113 | kmer_database->Info(_kmer_length, _mode, _counter_size, _lut_prefix_length, _signature_len, _min_count, _max_count, _total_kmers); 114 | kmer_object = new CKmerAPI(_kmer_length); 115 | } 116 | 117 | Kmer_stream_from_KMC_DB::~Kmer_stream_from_KMC_DB(){ 118 | delete kmer_database; 119 | delete kmer_object; 120 | } 121 | 122 | bool Kmer_stream_from_KMC_DB::done(){ 123 | return (!add_revcomps || !revcomp_next) && kmer_database->Eof(); 124 | } 125 | 126 | Kmer Kmer_stream_from_KMC_DB::next(){ 127 | if(add_revcomps && revcomp_next){ 128 | revcomp_next = false; 129 | return Kmer(str_revcomp); 130 | } 131 | 132 | //float counter_f; 133 | uint32 counter_i; 134 | /*if(_mode){ //quake compatible mode 135 | kmer_database.ReadNextKmer(kmer_object, counter_f); 136 | } 137 | else { */ 138 | kmer_database->ReadNextKmer(*kmer_object, counter_i); 139 | //} 140 | 141 | kmer_object->to_string(str); 142 | if(add_revcomps){ 143 | str_revcomp = str; 144 | reverse_complement(str_revcomp); 145 | if(str != str_revcomp) revcomp_next = true; 146 | } 147 | 148 | std::reverse(str.begin(), str.end()); // Return reverses so that they are in colex order 149 | 150 | return Kmer(str); 151 | 152 | } 153 | 154 | void Disk_Instream::update_top(){ 155 | in.read(in_buffer, Node::size_in_bytes()); 156 | if(in.eof()){ 157 | all_read = true; 158 | return; 159 | } 160 | top.load(in_buffer); 161 | } 162 | 163 | Disk_Instream::Disk_Instream(string 
filename) { 164 | in.open(filename, ios_base::binary); 165 | in_buffer = (char*)malloc(Node::size_in_bytes()); 166 | } 167 | 168 | bool Disk_Instream::stream_done() const{ 169 | return all_read; 170 | } 171 | 172 | Node Disk_Instream::stream_next(){ 173 | Node ret = top; 174 | update_top(); 175 | return ret; 176 | } 177 | 178 | Node Disk_Instream::peek_next(){ 179 | return top; 180 | } 181 | 182 | Disk_Instream::~Disk_Instream(){ 183 | free(in_buffer); 184 | } 185 | 186 | Node_stream_merger::Node_stream_merger(Disk_Instream& A, Disk_Instream& B) : A(A), B(B){} 187 | 188 | bool Node_stream_merger::stream_done(){ 189 | return A.stream_done() && B.stream_done(); 190 | } 191 | 192 | Node Node_stream_merger::stream_next(){ 193 | if(A.stream_done()) return B.stream_next(); 194 | if(B.stream_done()) return A.stream_next(); 195 | if(A.peek_next() < B.peek_next()) return A.stream_next(); 196 | else return B.stream_next(); 197 | } 198 | 199 | } // End of namespace KMC_construction_helper_classes 200 | } // End of namepace sbwt -------------------------------------------------------------------------------- /src/suffix_group_optimization.cpp: -------------------------------------------------------------------------------- 1 | #include "suffix_group_optimization.hh" 2 | #include 3 | 4 | using namespace std; 5 | 6 | namespace sbwt{ 7 | 8 | // Entropy of distribution P 9 | double entropy(const vector& P){ 10 | double ans = 0; 11 | for(double p : P){ 12 | if(p != 0 && p != 1){ 13 | ans += p * log2(1.0 / p); 14 | } 15 | } 16 | return ans; 17 | } 18 | 19 | // Pushes the bits to the left end of the suffix group 20 | void push_bits_left(sdsl::bit_vector& A_bits, 21 | sdsl::bit_vector& C_bits, 22 | sdsl::bit_vector& G_bits, 23 | sdsl::bit_vector& T_bits, 24 | const sdsl::bit_vector& suffix_group_marks){ 25 | 26 | for(int64_t i = (int64_t)A_bits.size() - 1; i >= 1; i--){ 27 | if(suffix_group_marks[i] == 0){ 28 | 29 | // Push left 30 | A_bits[i-1] = A_bits[i-1] | A_bits[i]; 31 | 
C_bits[i-1] = C_bits[i-1] | C_bits[i]; 32 | G_bits[i-1] = G_bits[i-1] | G_bits[i]; 33 | T_bits[i-1] = T_bits[i-1] | T_bits[i]; 34 | 35 | // Clear 36 | A_bits[i] = 0; 37 | C_bits[i] = 0; 38 | G_bits[i] = 0; 39 | T_bits[i] = 0; 40 | } 41 | } 42 | } 43 | 44 | // Maximally spreads the bits inside a suffix group. Assumes the bits have already been pushed to the left 45 | void spread_bits_after_push_left(sdsl::bit_vector& A_bits, 46 | sdsl::bit_vector& C_bits, 47 | sdsl::bit_vector& G_bits, 48 | sdsl::bit_vector& T_bits, 49 | const sdsl::bit_vector& suffix_group_marks){ 50 | 51 | vector M = {&A_bits, &C_bits, &G_bits, &T_bits}; // Matrix 52 | for(int64_t i = 0; i < (int64_t)A_bits.size()-1; i++){ 53 | if(suffix_group_marks[i+1] == 0){ // Column i and i+1 have the same suffix group 54 | // Keep topmost 1-bit where it is, move everything else to the right 55 | int64_t top = 0; 56 | while(top < 4 && (*M[top])[i] == 0) top++; 57 | // top is now the row of the topmost 1-bit in the column 58 | for(int64_t j = top+1; j < 4; j++){ 59 | (*M[j])[i+1] = (*M[j])[i]; 60 | (*M[j])[i] = 0; 61 | } 62 | } 63 | } 64 | } 65 | 66 | sdsl::bit_vector mark_suffix_groups(const sdsl::bit_vector& A_bits, 67 | const sdsl::bit_vector& C_bits, 68 | const sdsl::bit_vector& G_bits, 69 | const sdsl::bit_vector& T_bits, 70 | int64_t k){ 71 | 72 | int64_t n_nodes = A_bits.size(); 73 | vector C_array(4); 74 | 75 | vector last; // last[i] = incoming character to node i 76 | last.push_back('$'); 77 | 78 | C_array[0] = last.size(); 79 | for(int64_t i = 0; i < n_nodes; i++) if(A_bits[i]) last.push_back('A'); 80 | 81 | C_array[1] = last.size(); 82 | for(int64_t i = 0; i < n_nodes; i++) if(C_bits[i]) last.push_back('C'); 83 | 84 | C_array[2] = last.size(); 85 | for(int64_t i = 0; i < n_nodes; i++) if(G_bits[i]) last.push_back('G'); 86 | 87 | C_array[3] = last.size(); 88 | for(int64_t i = 0; i < n_nodes; i++) if(T_bits[i]) last.push_back('T'); 89 | 90 | if(last.size() != n_nodes){ 91 | cerr << "BUG " << 
last.size() << " " << n_nodes << endl; 92 | exit(1); 93 | } 94 | 95 | // Mark suffix group starts 96 | sdsl::bit_vector suffix_group_starts(n_nodes); 97 | for(int64_t i = 0; i < n_nodes; i++) suffix_group_starts[i] = 0; 98 | 99 | for(int64_t round = 0; round < k-1; round++){ 100 | for(int64_t i = 0; i < n_nodes; i++){ 101 | if(i == 0 || last[i] != last[i-1]) 102 | suffix_group_starts[i] = 1; 103 | } 104 | 105 | // Propagate the labels one step forward in the graph 106 | vector propagated(n_nodes, '$'); 107 | int64_t A_ptr = C_array[0]; 108 | int64_t C_ptr = C_array[1]; 109 | int64_t G_ptr = C_array[2]; 110 | int64_t T_ptr = C_array[3]; 111 | for(int64_t i = 0; i < n_nodes; i++){ 112 | if(A_bits[i]) propagated[A_ptr++] = last[i]; 113 | if(C_bits[i]) propagated[C_ptr++] = last[i]; 114 | if(G_bits[i]) propagated[G_ptr++] = last[i]; 115 | if(T_bits[i]) propagated[T_ptr++] = last[i]; 116 | } 117 | last = propagated; 118 | } 119 | 120 | return suffix_group_starts; 121 | } 122 | 123 | double compute_column_entropy(const sdsl::bit_vector& A_bits, 124 | const sdsl::bit_vector& C_bits, 125 | const sdsl::bit_vector& G_bits, 126 | const sdsl::bit_vector& T_bits){ 127 | map, int64_t> counts; 128 | for(int64_t i = 0; i < A_bits.size(); i++){ 129 | vector column = {(bool)A_bits[i], (bool)C_bits[i], (bool)G_bits[i], (bool)T_bits[i]}; 130 | counts[column]++; 131 | } 132 | vector P; 133 | for(const auto &[key, value] : counts){ 134 | P.push_back((double)value / A_bits.size()); 135 | } 136 | return entropy(P); 137 | } 138 | 139 | } // Namespace sbwt -------------------------------------------------------------------------------- /temp/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SBWT/b6e683096979774b69b9e15156f2d9863c909edd/temp/.keep -------------------------------------------------------------------------------- /tests/query_benchmark.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "throwing_streams.hh" 3 | #include "globals.hh" 4 | #include "variants.hh" 5 | #include "SeqIO/SeqIO.hh" 6 | 7 | using namespace std; 8 | using namespace sbwt; 9 | 10 | void sequential_access_benchmark(const plain_matrix_sbwt_t& sbwt){ 11 | int64_t max_queries = 1e5; 12 | write_log("Accessing up to " + to_string(max_queries) + " k-mers sequentially", LogLevel::MAJOR); 13 | int64_t k = sbwt.get_k(); 14 | char buf[k]; 15 | int64_t buf0_sum = 0; // To prevent the compiler from optimizing the queries away 16 | int64_t micros_start = cur_time_micros(); 17 | int64_t n_queries = 0; 18 | for(int64_t i = 0; i < min((int64_t)1e5, sbwt.number_of_subsets()); i++){ 19 | sbwt.get_kmer(i, buf); 20 | buf0_sum += buf[0]; 21 | n_queries++; 22 | } 23 | int64_t micros_end = cur_time_micros(); 24 | cout << "Total queries: " << n_queries << endl; 25 | cout << "Sequential access us / kmer: " << (double) (micros_end - micros_start) / n_queries << endl; 26 | cout << "Checksum: " << buf0_sum << endl; 27 | } 28 | 29 | void sequential_access_with_select_support_benchmark(const plain_matrix_sbwt_t& sbwt){ 30 | write_log("Building select support", LogLevel::MAJOR); 31 | SubsetMatrixSelectSupport ss = sbwt.get_subset_rank_structure().build_select_support(); 32 | int64_t max_queries = 1e5; 33 | write_log("Accessing up to " + to_string(max_queries) + " k-mers sequentially with select support", LogLevel::MAJOR); 34 | int64_t k = sbwt.get_k(); 35 | char buf[k]; 36 | int64_t buf0_sum = 0; // To prevent the compiler from optimizing the queries away 37 | int64_t micros_start = cur_time_micros(); 38 | int64_t n_queries = 0; 39 | for(int64_t i = 0; i < min((int64_t)1e5, sbwt.number_of_subsets()); i++){ 40 | sbwt.get_kmer_fast(i, buf, ss); 41 | buf0_sum += buf[0]; 42 | n_queries++; 43 | } 44 | int64_t micros_end = cur_time_micros(); 45 | cout << "Total queries: " << n_queries << endl; 46 | cout << 
"Sequential access us / kmer with select support: " << (double) (micros_end - micros_start) / n_queries << endl; 47 | cout << "Checksum: " << buf0_sum << endl; 48 | } 49 | 50 | void search_benchmark(const plain_matrix_sbwt_t& sbwt, const string& queryfile){ 51 | int64_t k = sbwt.get_k(); 52 | int64_t max_queries = 1e5; 53 | write_log("Searching up to " + to_string(max_queries) + " k-mers individually", LogLevel::MAJOR); 54 | seq_io::Reader> reader(queryfile); 55 | 56 | int64_t micros_start = cur_time_micros(); 57 | int64_t total_colex_rank = 0; // To prevent the compiler from optimizing the queries away 58 | int64_t n_queries = 0; 59 | while(true){ 60 | int64_t len = reader.get_next_read_to_buffer(); 61 | if(len == 0) break; 62 | bool done = false; 63 | for(int64_t i = 0; i < len - k + 1; i++){ 64 | total_colex_rank += sbwt.search(reader.read_buf + i); 65 | n_queries++; 66 | if(n_queries == max_queries){ 67 | done = true; 68 | break; 69 | } 70 | } 71 | if(done) break; 72 | } 73 | int64_t micros_end = cur_time_micros(); 74 | 75 | cout << "Total queries: " << n_queries << endl; 76 | cout << "Individual search us / kmer: " << (double) (micros_end - micros_start) / n_queries << endl; 77 | cout << "Checksum: " << total_colex_rank << endl; 78 | } 79 | 80 | void streaming_search_benchmark(const plain_matrix_sbwt_t& sbwt, const string& queryfile){ 81 | int64_t k = sbwt.get_k(); 82 | int64_t max_queries = 1e5; 83 | write_log("Searching up to " + to_string(max_queries) + " k-mers with streaming search", LogLevel::MAJOR); 84 | seq_io::Reader> reader(queryfile); 85 | 86 | int64_t micros_start = cur_time_micros(); 87 | int64_t total_colex_rank = 0; // To prevent the compiler from optimizing the queries away 88 | int64_t n_queries = 0; 89 | while(true){ 90 | int64_t len = reader.get_next_read_to_buffer(); 91 | if(len == 0) break; 92 | for(int64_t x : sbwt.streaming_search(reader.read_buf, len)){ 93 | n_queries++; 94 | total_colex_rank += x * (n_queries <= max_queries); // Don't 
add to checksum the extra queries 95 | } 96 | if(n_queries >= max_queries) break; 97 | } 98 | int64_t micros_end = cur_time_micros(); 99 | 100 | cout << "Total queries: " << n_queries << endl; 101 | cout << "Individual search us / kmer: " << (double) (micros_end - micros_start) / n_queries << endl; 102 | cout << "Checksum: " << total_colex_rank << endl; 103 | 104 | } 105 | 106 | int main(int argc, char** argv){ 107 | if(argc != 3){ 108 | cerr << "Usage: " << argv[0] << " index.sbwt queries.fna" << endl; 109 | cerr << "Currently only supports the plain matrix variant and uncompressed queries" << endl; 110 | return 1; 111 | } 112 | 113 | string indexfile = argv[1]; 114 | string queryfile = argv[2]; 115 | 116 | throwing_ifstream in(indexfile, ios::binary); 117 | string variant = load_string(in.stream); // read variant type 118 | if (variant != "plain-matrix"){ 119 | cerr << "Error: only plain-matrix variant is supported currently" << endl; 120 | return 1; 121 | } 122 | 123 | write_log("Loading the index", LogLevel::MAJOR); 124 | plain_matrix_sbwt_t sbwt; 125 | sbwt.load(in.stream); 126 | 127 | sequential_access_benchmark(sbwt); 128 | sequential_access_with_select_support_benchmark(sbwt); 129 | search_benchmark(sbwt, queryfile); 130 | streaming_search_benchmark(sbwt, queryfile); 131 | 132 | } -------------------------------------------------------------------------------- /tests/setup_tests.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "globals.hh" 6 | #include "version.h" 7 | #include "throwing_streams.hh" 8 | 9 | using namespace sbwt; 10 | 11 | class TestLogger{ 12 | public: 13 | bool verbose = false; 14 | 15 | // This is to make std::endl compile and work with the logger 16 | TestLogger& operator<<(std::ostream& (*f)(std::ostream&)){ 17 | if(verbose) f(std::cerr); 18 | return *this; 19 | } 20 | }; 21 | 22 | template 23 | TestLogger& operator<<(TestLogger& L, const T& 
t){ 24 | if(L.verbose) cerr << t; 25 | return L; 26 | } 27 | 28 | TestLogger logger; // Pipe things you want to print into this object with the '<<' operator 29 | 30 | string string_to_temp_file(const string& S, const string& suffix = ""){ 31 | string filename = get_temp_file_manager().create_filename("", suffix); 32 | throwing_ofstream out(filename); 33 | out.write(S.data(), S.size()); 34 | return filename; 35 | } 36 | 37 | set get_all_kmers(const vector& input, int64_t k){ 38 | set kmers; 39 | for(string x : input) 40 | for(int64_t i = 0; i < x.size() - k + 1; i++) 41 | kmers.insert(x.substr(i,k)); 42 | return kmers; 43 | } 44 | 45 | const std::string generate_random_kmer(int64_t k) { 46 | std::string s; 47 | for (int64_t i = 0; i < k; i++) { 48 | const int r = std::rand() % 4; 49 | switch (r) { 50 | case (0): s += 'A'; break; 51 | case (1): s += 'C'; break; 52 | case (2): s += 'G'; break; 53 | case (3): s += 'T'; break; 54 | default: break; 55 | } 56 | } 57 | return s; 58 | } 59 | 60 | void write_seqs_to_fasta_file(const vector& v, const string& filename){ 61 | throwing_ofstream out(filename); 62 | for(string S : v) out.stream << ">\n" << S << "\n"; 63 | } 64 | 65 | void write_seqs_to_fasta_file(const vector& seqs, const vector& headers, const string& filename){ 66 | throwing_ofstream out(filename); 67 | assert(seqs.size() == headers.size()); 68 | for(int64_t i = 0; i < seqs.size(); i++) 69 | out.stream << ">" << headers[i] << "\n" << seqs[i] << "\n"; 70 | } 71 | 72 | // Null-terminator not written 73 | void write_to_file(const string& S, const string& filename){ 74 | throwing_ofstream out(filename); 75 | out.stream.write(S.c_str(), S.size()); 76 | } 77 | 78 | bool files_are_equal(const std::string& p1, const std::string& p2) { 79 | //https://stackoverflow.com/questions/6163611/compare-two-files/6163627 80 | throwing_ifstream f1(p1, std::ifstream::binary|std::ifstream::ate); 81 | throwing_ifstream f2(p2, std::ifstream::binary|std::ifstream::ate); 82 | 83 | if 
(f1.stream.tellg() != f2.stream.tellg()) { 84 | return false; //size mismatch 85 | } 86 | 87 | //seek back to beginning and use std::equal to compare contents 88 | f1.stream.seekg(0, std::ifstream::beg); 89 | f2.stream.seekg(0, std::ifstream::beg); 90 | return std::equal(std::istreambuf_iterator(f1.stream.rdbuf()), 91 | std::istreambuf_iterator(), 92 | std::istreambuf_iterator(f2.stream.rdbuf())); 93 | } 94 | 95 | 96 | 97 | void enable_test_logging(){logger.verbose = true; } 98 | void disable_test_logging(){logger.verbose = false; } 99 | 100 | void setup_tests(int argc, char** argv){ 101 | 102 | if(system("mkdir -p temp") != 0){ 103 | cerr << "Error creating directory ./temp" << endl; 104 | exit(1); 105 | } 106 | 107 | if(system("mkdir -p test_data") != 0){ 108 | cerr << "Error creating directory ./test_data" << endl; 109 | exit(1); 110 | } 111 | 112 | if(system("mkdir -p test_out") != 0){ 113 | cerr << "Error creating directory ./test_out" << endl; 114 | exit(1); 115 | } 116 | 117 | bool verbose = false; 118 | for(int64_t i = 1; i < argc; i++) 119 | if(argv[i] == string("--verbose") || argv[i] == string("-v")) verbose = true; 120 | 121 | get_temp_file_manager().set_dir("temp"); 122 | 123 | verbose ? enable_test_logging() : disable_test_logging(); // test logger 124 | verbose ? 
set_log_level(LogLevel::DEBUG) : set_log_level(LogLevel::OFF); // main logger 125 | 126 | ::testing::InitGoogleTest(&argc, argv); 127 | 128 | srand(247829347); 129 | 130 | } -------------------------------------------------------------------------------- /tests/test_CLI.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "setup_tests.hh" 4 | #include "SeqIO/buffered_streams.hh" 5 | #include "globals.hh" 6 | #include "commands.hh" 7 | #include "variants.hh" 8 | #include 9 | 10 | using namespace sbwt; 11 | 12 | string read_gzipped_file(string filename){ 13 | seq_io::zstr::ifstream in(filename); 14 | string S; 15 | char c; 16 | while(in.get(c)) S += c; 17 | return S; 18 | } 19 | 20 | TEST(CLI, end_to_end_build_and_query){ 21 | vector seqs1 = {"ACTAGTGTAGCTACAAA","ATGTGCTGATGCTAGCATTTTTTT"}; 22 | vector seqs2 = {"GTGTACTAGTGTGTAGTCGAT"}; 23 | string seqfile1 = get_temp_file_manager().create_filename("",".fna.gz"); 24 | string seqfile2 = get_temp_file_manager().create_filename("",".fna.gz"); 25 | 26 | { 27 | seq_io::Writer writer(seqfile1); 28 | for(string seq : seqs1) writer.write_sequence(seq.c_str(), seq.size()); 29 | } // End of scope flushes stream (would not flush properly with flush() because of how zstr works) 30 | 31 | { 32 | seq_io::Writer writer(seqfile2); 33 | for(string seq : seqs2) writer.write_sequence(seq.c_str(), seq.size()); 34 | } // End of scope flushes stream (would not flush properly with flush() because of how zstr works) 35 | 36 | string seqfile_list_file = get_temp_file_manager().create_filename("",".txt"); 37 | write_to_file(seqfile1 +"\n" + seqfile2 + "\n", seqfile_list_file); 38 | 39 | // Construct the index 40 | string indexfile = get_temp_file_manager().create_filename("",".sbwt"); 41 | plain_matrix_sbwt_t sbwt; 42 | string tempdir = get_temp_file_manager().get_dir(); 43 | vector build_args = 
{"build","-i",seqfile_list_file,"-o",indexfile,"-k","6","--add-reverse-complements","--temp-dir",tempdir,"--precalc-length","4"}; 44 | Argv build_argv(build_args); 45 | build_main(build_argv.size, build_argv.array); 46 | 47 | // Queries 48 | 49 | vector queries = {"GGAGAACTAGTGTAGCTACAAAGAGAG", "AGTGTGTAGCAAAATGTGCTGATGCTAGCAAAAAAAA", "CTCTACACACTTC"}; 50 | 51 | string q1 = get_temp_file_manager().create_filename("",".fq"); 52 | string q2 = get_temp_file_manager().create_filename("",".fna"); 53 | string q3 = get_temp_file_manager().create_filename("",".fq.gz"); 54 | string q4 = get_temp_file_manager().create_filename("",".fna.gz"); 55 | 56 | { 57 | // Artifical scope to flush the streams in the scope at the end of the scope. 58 | // It has to be done this way because the flush method of zstr::ofstream does 59 | // not actually flush anything. 60 | 61 | seq_io::Writer w1(q1); 62 | seq_io::Writer w2(q2); 63 | seq_io::Writer w3(q3); 64 | seq_io::Writer w4(q4); 65 | 66 | for(string S : queries){ 67 | w1.write_sequence(S.c_str(), S.size()); 68 | w2.write_sequence(S.c_str(), S.size()); 69 | w3.write_sequence(S.c_str(), S.size()); 70 | w4.write_sequence(S.c_str(), S.size()); 71 | } 72 | } 73 | 74 | string o1 = get_temp_file_manager().create_filename("",".txt"); 75 | string o2 = get_temp_file_manager().create_filename("",".txt"); 76 | string o3 = get_temp_file_manager().create_filename("",".txt"); 77 | string o4 = get_temp_file_manager().create_filename("",".txt"); 78 | 79 | string input_file_list = get_temp_file_manager().create_filename("",".txt"); 80 | write_to_file(q1 + "\n" + q2 + "\n" + q3 + "\n" + q4 + "\n", input_file_list); 81 | 82 | string output_file_list = get_temp_file_manager().create_filename("",".txt"); 83 | write_to_file(o1 + "\n" + o2 + "\n" + o3 + "\n" + o4 + "\n", output_file_list); 84 | 85 | vector args = {"search", "-o", output_file_list, "-i", indexfile, "-q", input_file_list}; 86 | Argv ARGS(args); 87 | 88 | search_main(ARGS.size, ARGS.array); 89 | 
90 | string correct_answer = "-1 -1 -1 -1 -1 74 55 77 22 47 36 70 19 31 8 4 3 -1 -1 -1 -1 -1 \n57 78 23 47 36 -1 -1 -1 -1 -1 52 -1 -1 39 73 54 15 65 53 38 72 20 46 35 11 -1 -1 -1 -1 2 2 2 \n-1 -1 26 5 25 66 -1 -1 \n"; 91 | string answer_file = get_temp_file_manager().create_filename("",".txt"); 92 | write_to_file(correct_answer, answer_file); 93 | 94 | ASSERT_TRUE(files_are_equal(answer_file, o1)); 95 | ASSERT_TRUE(files_are_equal(o1, o2)); 96 | ASSERT_TRUE(files_are_equal(o2, o3)); 97 | ASSERT_TRUE(files_are_equal(o3, o4)); 98 | 99 | string output_file_list_gz = get_temp_file_manager().create_filename("",".txt"); 100 | write_to_file(o1 + ".gz\n" + o2 + ".gz\n" + o3 + ".gz\n" + o4 + ".gz\n", output_file_list_gz); 101 | 102 | vector args_gz = {"search", "-o", output_file_list_gz, "-i", indexfile, "-q", input_file_list, "--gzip-output"}; 103 | Argv ARGS_gz(args_gz); 104 | 105 | search_main(ARGS_gz.size, ARGS_gz.array); 106 | 107 | ASSERT_EQ(read_gzipped_file(o1 + ".gz"), correct_answer); 108 | ASSERT_EQ(read_gzipped_file(o2 + ".gz"), correct_answer); 109 | ASSERT_EQ(read_gzipped_file(o3 + ".gz"), correct_answer); 110 | ASSERT_EQ(read_gzipped_file(o4 + ".gz"), correct_answer); 111 | 112 | 113 | } 114 | 115 | -------------------------------------------------------------------------------- /tests/test_kmer.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "globals.hh" 4 | #include "setup_tests.hh" 5 | #include 6 | #include "Kmer.hh" 7 | 8 | using namespace sbwt; 9 | 10 | 11 | char get_random_DNA_char(){ 12 | int64_t r = rand() % 4; 13 | if(r == 0) return 'A'; 14 | else if(r == 1) return 'C'; 15 | else if(r == 2) return 'G'; 16 | else return 'T'; 17 | } 18 | 19 | string debug_test_get_random_DNA_string(int64_t len){ 20 | string S; 21 | for(int64_t i = 0; i < len; i++){ 22 | S += get_random_DNA_char(); 23 | } 24 | return S; 25 | } 26 | 27 | TEST(KMER, basic){ 28 | for(int64_t len = 1; len <= 255; len++){ 
29 | string S = debug_test_get_random_DNA_string(len); 30 | Kmer<255> kmer(S); 31 | 32 | // Check that the constructor worked right 33 | ASSERT_TRUE(kmer.get_k() == S.size()); 34 | for(int64_t i = 0; i < len; i++){ 35 | ASSERT_TRUE(kmer.get(i) == S[i]); 36 | } 37 | 38 | // Test copy 39 | Kmer<255> kmer_copy = kmer.copy(); 40 | ASSERT_TRUE(kmer_copy == kmer); 41 | 42 | // Edit characters randomly 43 | for(int64_t i = 0; i < 1000; i++){ 44 | int64_t idx = rand() % S.size(); 45 | char c = get_random_DNA_char(); 46 | kmer.set(idx, c); 47 | S[idx] = c; 48 | ASSERT_TRUE(kmer.get(idx) == S[idx]); 49 | } 50 | 51 | // Check that the edits worked out right 52 | for(int64_t i = 0; i < len; i++){ 53 | ASSERT_TRUE(kmer.get(i) == S[i]); 54 | } 55 | 56 | // Test drop left 57 | Kmer<255> left = kmer.copy().dropleft(); 58 | ASSERT_TRUE(left.get_k() == len-1); 59 | for(int64_t i = 0; i < len-1; i++){ 60 | ASSERT_TRUE(left.get(i) == S[i+1]); 61 | } 62 | 63 | // Test append left 64 | Kmer<255> left_append = left.copy().appendleft(kmer.first()); 65 | ASSERT_TRUE(left_append.get_k() == len); 66 | ASSERT_TRUE(left_append == kmer); 67 | 68 | // Test drop right 69 | Kmer<255> right = kmer.copy().dropright(); 70 | ASSERT_TRUE(right.get_k() == len-1); 71 | for(int64_t i = 0; i < len-1; i++){ 72 | ASSERT_TRUE(right.get(i) == S[i]); 73 | } 74 | 75 | // Test append right 76 | Kmer<255> right_append = right.copy().appendright(kmer.last()); 77 | ASSERT_TRUE(right_append.get_k() == len); 78 | ASSERT_TRUE(right_append == kmer); 79 | } 80 | } 81 | 82 | TEST(KMER, serialization){ 83 | const string S = debug_test_get_random_DNA_string(255); 84 | const Kmer<255> kmer(S); 85 | 86 | // Test serialization to file 87 | string filename = get_temp_file_manager().create_filename("kmer-serialization"); 88 | ofstream out(filename, ios_base::binary); 89 | kmer.serialize(out); 90 | out.flush(); 91 | ifstream in(filename, ios_base::binary); 92 | Kmer<255> loaded; 93 | loaded.load(in); 94 | ASSERT_TRUE(kmer == 
// Colexicographic "less than": returns true iff A read back-to-front is
// lexicographically smaller than B read back-to-front (std::string ordering).
// Used as the reference ordering that Kmer::operator< is checked against.
bool colex_compare(std::string A, std::string B){
    const std::string reversed_a(A.rbegin(), A.rend());
    const std::string reversed_b(B.rbegin(), B.rend());
    return reversed_a.compare(reversed_b) < 0;
}
150 | } else{ 151 | ASSERT_TRUE(colex_compare(strings[i], strings[j]) == (kmers[i] < kmers[j])); 152 | } 153 | } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /tests/test_main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "stdlib_printing.hh" 12 | #include "globals.hh" 13 | #include "setup_tests.hh" 14 | #include "test_kmer.hh" 15 | #include "test_small.hh" 16 | #include "test_large.hh" 17 | #include "test_misc.hh" 18 | #include "test_EM_sort.hh" 19 | #include "test_CLI.hh" 20 | #include 21 | 22 | int main(int argc, char **argv) { 23 | try{ 24 | setup_tests(argc, argv); 25 | return RUN_ALL_TESTS(); 26 | } catch (const std::runtime_error &e){ 27 | std::cerr << "Runtime error: " << e.what() << '\n'; 28 | return 1; 29 | } catch(const std::exception& e){ 30 | std::cerr << "Error: " << e.what() << '\n'; 31 | return 1; 32 | } 33 | } -------------------------------------------------------------------------------- /tests/test_misc.hh: -------------------------------------------------------------------------------- 1 | #include "setup_tests.hh" 2 | #include "kmc_construct.hh" 3 | #include "globals.hh" 4 | #include 5 | 6 | using namespace sbwt; 7 | 8 | TEST(MISC, test_rc){ 9 | ASSERT_EQ(get_rc('A'), 'T'); 10 | ASSERT_EQ(get_rc('C'), 'G'); 11 | ASSERT_EQ(get_rc('G'), 'C'); 12 | ASSERT_EQ(get_rc('T'), 'A'); 13 | ASSERT_EQ(get_rc('a'), 't'); 14 | ASSERT_EQ(get_rc('c'), 'g'); 15 | ASSERT_EQ(get_rc('g'), 'c'); 16 | ASSERT_EQ(get_rc('t'), 'a'); 17 | ASSERT_EQ(get_rc('N'), 'N'); 18 | } 19 | 20 | void create_rc_file_test(const string& file_extension){ 21 | 22 | vector seqs1 = {"ACAGT", "CGAG", "CGGACG"}; 23 | vector seqs2 = {"AGAT", "GAGA", "AAAAAA"}; 24 | string f1 = get_temp_file_manager().create_filename("",file_extension); 25 | string f2 = 
get_temp_file_manager().create_filename("",file_extension); 26 | vector oldfiles = {f1,f2}; 27 | 28 | // Write to files 29 | seq_io::Writer w1(f1); 30 | seq_io::Writer w2(f2); 31 | for(string seq : seqs1) w1.write_sequence(seq.c_str(), seq.size()); 32 | for(string seq : seqs2) w2.write_sequence(seq.c_str(), seq.size()); 33 | w1.flush(); 34 | w2.flush(); 35 | 36 | // Create file list 37 | string filelist = get_temp_file_manager().create_filename("",".txt"); 38 | throwing_ofstream filelist_out(filelist); 39 | filelist_out.stream << f1 << "\n" << f1 << "\n"; 40 | filelist_out.close(); 41 | 42 | vector newfiles; 43 | for(string old : oldfiles) 44 | newfiles.push_back(get_temp_file_manager().create_filename("",".rc" + file_extension)); 45 | 46 | seq_io::create_reverse_complement_files< 47 | seq_io::Reader>, 48 | seq_io::Writer>>(oldfiles, newfiles); 49 | ASSERT_EQ(newfiles.size(), oldfiles.size()); 50 | 51 | // Check 52 | for(int64_t i = 0; i < newfiles.size(); i++){ 53 | seq_io::Reader sr1(oldfiles[i]); 54 | seq_io::Reader sr2(newfiles[i]); 55 | while(true){ 56 | string s1 = sr1.get_next_read(); 57 | if(s1 == "") break; 58 | string s2 = sr2.get_next_read(); 59 | ASSERT_FALSE(s2 == ""); 60 | logger << s1 << endl << get_rc(s2) << endl << "--" << endl; 61 | ASSERT_EQ(s1, get_rc(s2)); 62 | } 63 | string s = sr2.get_next_read(); 64 | ASSERT_TRUE(s == ""); 65 | } 66 | 67 | } 68 | 69 | TEST(MISC, create_rc_files){ 70 | create_rc_file_test(".fna"); 71 | create_rc_file_test(".fq"); 72 | } -------------------------------------------------------------------------------- /version.h.in: -------------------------------------------------------------------------------- 1 | #ifndef SBWT_VERSION_H 2 | #define SBWT_VERSION_H 3 | // *DON'T MODIFY MANUALLY*, CMake automatically updates this file! 
4 | // Defines variables containing the build version and timestamp 5 | 6 | #define SBWT_BUILD_VERSION "@SBWT_BUILD_VERSION@" 7 | #define SBWT_BUILD_TIMESTAMP "@SBWT_BUILD_TIMESTAMP@" 8 | 9 | #endif 10 | 11 | --------------------------------------------------------------------------------