├── .clang-format ├── .github └── workflows │ ├── build.yml │ └── codeql.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── hooks ├── post-merge └── pre-commit ├── img ├── fulgor.png ├── fulgor_index_size.png └── fulgor_on_dark.png ├── include ├── GGCAT.hpp ├── build_util.hpp ├── builders │ ├── builder.hpp │ ├── differential_builder.hpp │ ├── meta_builder.hpp │ └── meta_differential_builder.hpp ├── color_sets │ ├── differential.hpp │ ├── hybrid.hpp │ ├── meta.hpp │ └── meta_differential.hpp ├── filenames.hpp ├── index.hpp ├── index_types.hpp └── util.hpp ├── src ├── color_sets.cpp ├── index.cpp ├── ps_full_intersection.cpp └── ps_threshold_union.cpp ├── test_data └── salmonella_10 │ ├── SAL_AA7743AA.fasta.gz │ ├── SAL_BA0010AA.fasta.gz │ ├── SAL_CA3280AA.fasta.gz │ ├── SAL_FA0063AA.fasta.gz │ ├── SAL_FA6579AA.fasta.gz │ ├── SAL_GA5038AA.fasta.gz │ ├── SAL_HA1487AA.fasta.gz │ ├── SAL_HA3099AA.fasta.gz │ ├── SAL_HA8439AA.fasta.gz │ └── SAL_HA8462AA.fasta.gz └── tools ├── build.cpp ├── fulgor.cpp ├── permute.cpp ├── pseudoalign.cpp └── util.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: true 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: true 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | 
AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeComma 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: true 45 | BreakConstructorInitializers: BeforeComma 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | 
PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: false 128 | SortUsingDeclarations: false 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | StatementMacros: 146 | - Q_UNUSED 147 | - QT_REQUIRE_VERSION 148 | TabWidth: 8 149 | UseTab: Never 150 | ... 
151 | 152 | 153 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | compiler: 12 | - { name: gcc, version: 11} 13 | - { name: gcc, version: 12} 14 | - { name: gcc, version: 13} 15 | - { name: gcc, version: 14} 16 | - { name: clang, version: 16} 17 | - { name: clang, version: 17} 18 | - { name: clang, version: 18} 19 | name: Build (${{ matrix.compiler.name }} ${{ matrix.compiler.version }}) 20 | runs-on: ubuntu-24.04 21 | steps: 22 | - name: Install dependencies 23 | run: | 24 | sudo add-apt-repository universe 25 | sudo apt-get update 26 | sudo apt-get install --assume-yes --no-install-recommends ca-certificates cmake git 27 | - name: Install GCC 28 | if: ${{ matrix.compiler.name == 'gcc' }} 29 | run: | 30 | sudo apt-get install --assume-yes --no-install-recommends gcc-${{ matrix.compiler.version }} g++-${{ matrix.compiler.version }} 31 | echo "CC=/usr/bin/gcc-${{ matrix.compiler.version }}" >> $GITHUB_ENV 32 | echo "CXX=/usr/bin/g++-${{ matrix.compiler.version }}" >> $GITHUB_ENV 33 | - name: Install Clang 34 | if: ${{ matrix.compiler.name == 'clang' }} 35 | run: | 36 | sudo apt-get install --assume-yes --no-install-recommends clang-${{ matrix.compiler.version }} 37 | echo "CC=/usr/bin/clang-${{ matrix.compiler.version }}" >> $GITHUB_ENV 38 | echo "CXX=/usr/bin/clang++-${{ matrix.compiler.version }}" >> $GITHUB_ENV 39 | - uses: actions/checkout@v4 40 | with: 41 | submodules: recursive 42 | - name: Build 43 | run: | 44 | cmake -B ./build 45 | cmake --build ./build --parallel 46 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | 
pull_request: 6 | 7 | jobs: 8 | analyze: 9 | name: Analyze 10 | runs-on: ubuntu-latest 11 | permissions: 12 | actions: read 13 | contents: read 14 | security-events: write 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | with: 20 | submodules: recursive 21 | 22 | - name: Initialize CodeQL 23 | uses: github/codeql-action/init@v2 24 | with: 25 | languages: 'cpp' 26 | 27 | - name: Creating build directory 28 | run: cmake -E make_directory ./build 29 | 30 | - name: Run CMake 31 | working-directory: ./build 32 | run: cmake .. -DCMAKE_BUILD_TYPE=Release 33 | 34 | - name: Compilation 35 | working-directory: ./build 36 | run: cmake --build . --config Release 37 | 38 | # Perform Analysis 39 | - name: Perform CodeQL Analysis 40 | uses: github/codeql-action/analyze@v2 41 | with: 42 | category: "/language:cpp" 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | build 3 | debug_build 4 | test_data 5 | .idea 6 | *.swp 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/sshash"] 2 | path = external/sshash 3 | url = https://github.com/jermp/sshash 4 | [submodule "external/sketch"] 5 | path = external/sketch 6 | url = https://github.com/dnbaker/sketch 7 | [submodule "external/ggcat"] 8 | path = external/ggcat 9 | url = https://github.com/algbio/ggcat 10 | [submodule "external/smhasher"] 11 | path = external/smhasher 12 | url = https://github.com/aappleby/smhasher 13 | [submodule "external/FQFeeder"] 14 | path = external/FQFeeder 15 | url = https://github.com/rob-p/FQFeeder 16 | [submodule "external/kmeans"] 17 | path = external/kmeans 18 | url = https://github.com/jermp/kmeans 19 | 
-------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5...4.0) 2 | project(FULGOR) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | if (NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE "Release") 7 | endif() 8 | 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 10 | 11 | if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")) 12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -msse4.2 -mpopcnt") 14 | endif() 15 | 16 | if (UNIX) 17 | 18 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function") 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") 22 | 23 | if (FULGOR_USE_SANITIZERS) 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 25 | endif() 26 | 27 | endif() 28 | 29 | MESSAGE(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 30 | MESSAGE(STATUS "Compiling for processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}") 31 | MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}") 32 | 33 | include_directories(.) 
34 | include_directories(external/sshash/external/pthash/include) 35 | include_directories(external/sshash/external/pthash/external/bits/include) 36 | include_directories(external/sshash/external/pthash/external/fastmod) 37 | include_directories(external/sshash/external/pthash/external/xxHash) 38 | include_directories(external/sshash/external/pthash/external/mm_file/include) 39 | include_directories(external/sshash/external/pthash/external/bits/external/essentials/include) 40 | include_directories(external/sshash) 41 | include_directories(external/FQFeeder/include) 42 | include_directories(external/sketch/include) 43 | include_directories(external/sketch/include/blaze) 44 | 45 | ### GGCAT #### 46 | 47 | add_custom_target( 48 | ggcat_cpp_api 49 | COMMAND make 50 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/ggcat/crates/capi/ggcat-cpp-api 51 | ) 52 | set(GGCAT_CPP_BINDINGS ${CMAKE_SOURCE_DIR}/external/ggcat/crates/capi/ggcat-cpp-api/lib/libggcat_cpp_bindings.a) 53 | set(GGCAT_CXX_INTEROP ${CMAKE_SOURCE_DIR}/external/ggcat/crates/capi/ggcat-cpp-api/lib/libggcat_cxx_interop.a) 54 | set(GGCAT_API ${CMAKE_SOURCE_DIR}/external/ggcat/crates/capi/ggcat-cpp-api/lib/libggcat_api.a) 55 | 56 | 57 | ### Fulgor ### 58 | 59 | add_executable(fulgor tools/fulgor.cpp) 60 | add_dependencies(fulgor ggcat_cpp_api) 61 | target_link_libraries(fulgor 62 | z 63 | ${GGCAT} 64 | ${GGCAT_API} 65 | ${GGCAT_CPP_BINDINGS} 66 | ${GGCAT_CXX_INTEROP} 67 | ${CMAKE_DL_LIBS} 68 | ) 69 | 70 | if (UNIX) 71 | if (APPLE) 72 | MESSAGE(STATUS "linking with rt should not be necessary on OSX; not adding rt") 73 | else() 74 | target_link_libraries(fulgor rt) 75 | endif() 76 | endif() 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2023-2025 Giulio Ermanno Pibiri, Alessio Campanelli, and COMBINE lab 4 | 5 | Permission is hereby granted, 
free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 13 | in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 | OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build](https://github.com/jermp/fulgor/actions/workflows/build.yml/badge.svg)](https://github.com/jermp/fulgor/actions/workflows/build.yml) 2 | [![CodeQL](https://github.com/jermp/fulgor/actions/workflows/codeql.yml/badge.svg)](https://github.com/jermp/fulgor/actions/workflows/codeql.yml) 3 | 4 | 5 | 6 | Logo 7 | 8 | 9 | **Fulgor** is a *colored de Bruijn graph* index for large-scale matching and color queries, powered by [SSHash](https://github.com/jermp/sshash) and [GGCAT](https://github.com/algbio/GGCAT). 
10 | 11 | The Fulgor index is described in the following papers: 12 | 13 | - [**Fulgor: A Fast and Compact k-mer Index for Large-Scale Matching and Color Queries**](https://almob.biomedcentral.com/articles/10.1186/s13015-024-00251-9) (Algorithms for Molecular Biology, ALMOB 2024), and 14 | 15 | - [**Meta-colored compacted de Bruijn graphs**](https://link.springer.com/chapter/10.1007/978-1-0716-3989-4_9) (International Conference on Research in Computational Molecular Biology, RECOMB 2024). 16 | 17 | - [**Where the patterns are: repetition-aware compression for colored de Bruijn graphs**](https://www.liebertpub.com/doi/10.1089/cmb.2024.0714) (Journal of Computational Biology, JCB 2024). 18 | 19 | Please, cite these papers if you use Fulgor. 20 | 21 | ### Table of contents 22 | * [Dependencies](#dependencies) 23 | * [Compiling the code](#compiling-the-code) 24 | * [Tools and usage](#tools-and-usage) 25 | * [Quick start](#quick-start) 26 | * [Indexing an example Salmonella Enterica pangenome](#indexing-an-example-salmonella-enterica-pangenome) 27 | * [Pseudoalignment output format](#pseudoalignment-output-format) 28 | 29 | 30 | Dependencies 31 | ------------ 32 | 33 | #### GGCAT 34 | 35 | The code uses the [GGCAT](https://github.com/algbio/GGCAT) Rust library, 36 | so make sure you have Rust installed. If not, Rust can be installed as recommended [here](https://www.rust-lang.org/tools/install), with 37 | 38 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 39 | 40 | #### zlib 41 | 42 | If you do not have `zlib` installed, you can do 43 | 44 | sudo apt-get install zlib1g 45 | 46 | if you are on Linux/Ubuntu, or 47 | 48 | brew install zlib 49 | 50 | if you are using MacOS. 51 | 52 | 53 | Compiling the code 54 | ------------------ 55 | 56 | The code is tested on Linux with `gcc` and on MacOS with `clang`. 57 | To build the code, [`CMake`](https://cmake.org/) is required. 
58 | 59 | First clone the repository with 60 | 61 | git clone https://github.com/jermp/fulgor.git 62 | 63 | and then do 64 | 65 | git submodule update --init --recursive 66 | 67 | to pull all necessary submodules before compilation. 68 | 69 | To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following, within the parent `fulgor` directory: 70 | 71 | mkdir build 72 | cd build 73 | cmake .. 74 | make -j 75 | 76 | For a testing environment, use the following instead: 77 | 78 | mkdir debug_build 79 | cd debug_build 80 | cmake .. -D CMAKE_BUILD_TYPE=Debug -D FULGOR_USE_SANITIZERS=On 81 | make -j 82 | 83 | 84 | Tools and usage 85 | --------------- 86 | 87 | There is one executable called `fulgor` after the compilation, which can be used to run a tool. 88 | Run `./fulgor` to see a list of available tools. 89 | 90 | == Fulgor: a colored de Bruijn graph index ================================ 91 | 92 | Usage: ./fulgor ... 93 | 94 | Tools: 95 | build build a Fulgor index 96 | pseudoalign pseudoalign reads to references 97 | stats print index statistics 98 | print-filenames print all reference filenames 99 | 100 | Advanced tools: 101 | permute permute the reference names of a Fulgor index 102 | dump write unitigs and color sets in text format 103 | color build a meta- or a diff- or a meta-diff- Fulgor index 104 | 105 | For large-scale indexing, it could be necessary to increase the number of file descriptors that can be opened simultaneously: 106 | 107 | ulimit -n 2048 108 | 109 | 110 | Quick start 111 | ----------- 112 | 113 | This short demo shows how to index the 10-genome collection 114 | in the folder `test_data/salmonella_10` with Fulgor. 115 | We will use the standard value k = 31. 116 | 117 | First create a list of filenames (with absolute paths) for the files in `test_data/salmonella_10`. 
118 | From `fulgor/test_data`, do 119 | 120 | find $(pwd)/salmonella_10/* > salmonella_10_filenames.txt 121 | 122 | Then, from `fulgor/build`, run 123 | 124 | ./fulgor build -l ../test_data/salmonella_10_filenames.txt -o ../test_data/salmonella_10 -k 31 -m 19 -d tmp_dir -g 1 -t 1 --verbose --check 125 | 126 | to build an index that will be serialized to the file `test_data/salmonella_10.fur`. 127 | 128 | 129 | Indexing an example Salmonella Enterica pangenome 130 | ------------------------------------------------- 131 | 132 | In this example, we will build a Fulgor index, with k = 31, for the 4,546 Salmonella genomes that can be downloaded from [here](https://zenodo.org/record/1323684) 133 | with (assuming you have `wget` installed) 134 | 135 | wget https://zenodo.org/records/1323684/files/Salmonella_enterica.zip 136 | unzip Salmonella_enterica.zip 137 | 138 | We assume all commands are issued from within the home (`~/`) directory. 139 | 140 | After download, create a list of all `.fasta` filenames with 141 | 142 | find $(pwd)/Salmonella_enterica/Genomes/*.fasta > salmonella_4546_filenames.txt 143 | 144 | and, from `fulgor/build`, run 145 | 146 | ./fulgor build -l ~/salmonella_4546_filenames.txt -o ~/Salmonella_enterica/salmonella_4546 -k 31 -m 20 -d tmp_dir -g 8 -t 8 --verbose --check 147 | 148 | which will create an index named `~/Salmonella_enterica/salmonella_4546.fur` of 0.266 GB. 149 | 150 | We can now pseudoalign the reads from SRR801268, as follows. 
151 | 152 | First, download the reads in `~/` with 153 | 154 | cd 155 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR801/SRR801268/SRR801268_1.fastq.gz 156 | 157 | and then process them with 158 | 159 | ./fulgor pseudoalign -i ~/Salmonella_enterica/salmonella_4546.fur -q ~/SRR801268_1.fastq.gz -t 8 --verbose -o /dev/null 160 | 161 | mapped 6584304 reads 162 | elapsed = 130133 millisec / 130.133 sec / 2.16888 min / 19.7641 musec/read 163 | num_mapped_reads 5796427/6584304 (88.034%) 164 | 165 | using 8 parallel threads and writing the mapping output to `/dev/null`. 166 | 167 | To partition the index and obtain a meta-colored Fulgor index, do: 168 | 169 | ./fulgor color -i ~/Salmonella_enterica/salmonella_4546.fur -d tmp_dir --meta --check 170 | 171 | We can change the option `--meta` to `--diff` to create a differential-colored index, or use 172 | both options, `--meta --diff`, to create a meta-differential-colored index. 173 | See the table below. 174 | 175 | | command | output file | size (GB) | compression factor | 176 | |:----------------------|:------------------------|:---------:|:------------------:| 177 | | `color --meta` | `salmonella_4546.mfur` | 0.11769 | 2.26 | 178 | | `color --diff` | `salmonella_4546.dfur` | 0.11076 | 2.40 | 179 | | `color --meta --diff` | `salmonella_4546.mdfur` | 0.09389 | 2.84 | 180 | 181 | 182 | The following table is taken from the paper *"Where the patterns are: repetition-aware compression for colored de Bruijn graphs"* and shows the size of the various Fulgor indexes on several larger pangenomes. 183 | 184 | ![Index size](./img/fulgor_index_size.png) 185 | 186 | 187 | Pseudoalignment output format 188 | ----------------------------- 189 | 190 | The tool `pseudoalign` writes the result to an output file, in plain text format, specified with the option `-o [output-filename]`. 
191 | 192 | This file has one line for each mapped read, formatted as follows: 193 | 194 | [read-name][TAB][list-length][TAB][list] 195 | 196 | where `[list]` is a TAB-separated list of increasing integers, of length `[list-length]`, representing the list of reference identifiers to which the read is mapped. (`[TAB]` is the character `\t`.) 197 | 198 | #### Example 199 | 200 | NODE_11_length_149361_cov_9.71634_ID_21 1 0 201 | NODE_3406_length_341_cov_20.0437_ID_681 1 0 202 | NODE_4745_length_118_cov_12.7931_ID_949 3 0 3 7 203 | NODE_102_length_2047_cov_18.1471_ID_203 1 0 204 | NODE_477_length_1163_cov_22.0531_ID_953 2 0 8 205 | NODE_9_length_173161_cov_9.33695_ID_17 1 0 206 | NODE_22_length_45757_cov_12.1361_ID_43 1 0 207 | 208 | #### Important note 209 | 210 | If pseudoalignment is performed against a **meta-colored** 211 | or a **meta-differential-colored** Fulgor index, 212 | the reference identifiers in the pseudoalignment output might **not** correspond to the ones assigned following the input-file order as specified with option `-l` during index construction. 213 | This is because the meta-colored index re-assigns identifiers to references to improve index compression. 214 | 215 | In this case, the reference identifiers in the pseudoalignment output 216 | are consistent with the ones returned by the `print-filenames` tool. 
217 | -------------------------------------------------------------------------------- /hooks/post-merge: -------------------------------------------------------------------------------- 1 | # /bin/bash 2 | 3 | # run 'git config core.hooksPath hooks' in the root directory of the project to enable git hooks 4 | 5 | git submodule update --init --recursive -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | # /bin/bash 2 | 3 | # run 'chmod +x ./hooks/*' and 'git config core.hooksPath hooks' in the root directory of the project to enable git hooks 4 | # run 'git commit --no-verify' to skip this check (UNSAFE!) 5 | 6 | set -e 7 | 8 | RED='\033[0;31m' 9 | GREEN='\033[0;32m' 10 | YELLOW='\033[1;33m' 11 | NC='\033[0m' # No Color 12 | 13 | primary_directory='build' 14 | 15 | echo "${GREEN} == pre-commit hook started == ${NC}" 16 | 17 | git submodule update --init --recursive 18 | { 19 | echo " == trying directory '${primary_directory}' == " 20 | cd "${primary_directory}" 21 | } || { 22 | echo " == '${primary_directory}' directory not found, trying directory 'debug_build' == " 23 | cd debug_build 24 | } || { 25 | echo "${RED} == '${primary_directory}' and 'debug_build' directories not found == ${NC}" 26 | echo "${YELLOW} == create a directory '${primary_directory}' or 'debug_build' and execute '$ cmake ..' 
inside it${NC}" 27 | echo "${YELLOW} or edit the variable 'primary_directory' in file 'hooks/pre-commit' == ${NC}" 28 | exit 1 29 | } 30 | 31 | make -j 32 | 33 | echo "${GREEN} == pre-commit hook done == ${NC}" 34 | -------------------------------------------------------------------------------- /img/fulgor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/img/fulgor.png -------------------------------------------------------------------------------- /img/fulgor_index_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/img/fulgor_index_size.png -------------------------------------------------------------------------------- /img/fulgor_on_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/img/fulgor_on_dark.png -------------------------------------------------------------------------------- /include/GGCAT.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "external/ggcat/crates/capi/ggcat-cpp-api/include/ggcat.hh" 7 | #include "util.hpp" 8 | 9 | namespace fulgor { 10 | 11 | struct GGCAT { 12 | GGCAT() : m_k(0), m_instance(nullptr) {} 13 | 14 | ~GGCAT() { 15 | try { 16 | /* remove GGCAT's tmp files */ 17 | std::remove((m_graph_file).c_str()); 18 | std::remove((m_color_sets_file).c_str()); 19 | } catch (std::exception const& e) { std::cerr << e.what() << std::endl; } 20 | } 21 | 22 | void build(build_configuration const& build_config) { 23 | { 24 | std::ifstream in(build_config.filenames_list); 25 | if (!in.is_open()) throw std::runtime_error("error in opening file"); 26 | std::string filename; 27 
| while (in >> filename) m_filenames.push_back(filename); 28 | std::cout << "about to process " << m_filenames.size() << " files..." << std::endl; 29 | in.close(); 30 | } 31 | 32 | std::string basename = 33 | build_config.tmp_dirname + "/" + util::filename(build_config.file_base_name); 34 | m_color_sets_file = basename + ".ggcat.color_sets.dat"; 35 | m_graph_file = basename + ".ggcat.fa"; 36 | m_k = build_config.k; 37 | 38 | ggcat::GGCATConfig config; 39 | config.use_temp_dir = true; 40 | config.temp_dir = build_config.tmp_dirname; 41 | config.memory = build_config.ram_limit_in_GiB; 42 | config.prefer_memory = true; 43 | config.total_threads_count = build_config.num_threads; 44 | config.intermediate_compression_level = -1; 45 | config.use_stats_file = false; 46 | config.stats_file = ""; 47 | 48 | // GGCAT bug: 49 | // This leaks memory (not much) but it can't be easily fixed because 50 | // this memory is allocated in the Rust API of GGCAT and freed only at the 51 | // end of the program. 
52 | m_instance = ggcat::GGCATInstance::create(config); 53 | 54 | std::vector color_names; 55 | color_names.reserve(m_filenames.size()); 56 | for (uint64_t i = 0; i != m_filenames.size(); ++i) { 57 | color_names.push_back(std::to_string(i)); 58 | } 59 | 60 | constexpr bool forward_only = false; 61 | constexpr bool output_color_sets = true; 62 | constexpr size_t min_multiplicity = 1; 63 | m_instance->build_graph_from_files( 64 | ggcat::Slice(m_filenames.data(), m_filenames.size()), m_graph_file, m_k, 65 | build_config.num_threads, forward_only, min_multiplicity, 66 | ggcat::ExtraElaborationStep_UnitigLinks, output_color_sets, 67 | ggcat::Slice(color_names.data(), color_names.size())); 68 | } 69 | 70 | void loop_through_unitigs(std::function const /* unitig */, // 71 | ggcat::Slice const /* color_set */, // 72 | bool /* same_color_set */)> 73 | callback, 74 | uint64_t num_threads = 1) const // 75 | { 76 | if (m_k == 0) throw std::runtime_error("graph must be built first"); 77 | m_instance->dump_unitigs(m_graph_file, m_k, num_threads, num_threads == 1, callback, true); 78 | } 79 | 80 | uint64_t num_colors() const { return m_filenames.size(); } 81 | std::vector const& filenames() const { return m_filenames; } 82 | 83 | private: 84 | uint64_t m_k; 85 | ggcat::GGCATInstance* m_instance; 86 | std::vector m_filenames; 87 | std::string m_graph_file; 88 | std::string m_color_sets_file; 89 | }; 90 | 91 | } // namespace fulgor 92 | -------------------------------------------------------------------------------- /include/build_util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "external/sketch/include/sketch/hll.h" 4 | #include "external/kmeans/include/kmeans.hpp" 5 | 6 | namespace fulgor { 7 | 8 | void build_reference_sketches(index_type const& index, 9 | uint64_t p, // use 2^p bytes per HLL sketch 10 | uint64_t num_threads, // num. 
threads for construction 11 | std::string output_filename // where the sketches will be serialized 12 | ) { 13 | assert(num_threads > 0); 14 | 15 | const uint64_t num_colors = index.num_colors(); 16 | typename sketch::hll_t::HashType hasher; 17 | auto const& u2c = index.get_u2c(); 18 | auto const& u2c_rank1_index = index.get_u2c_rank1_index(); 19 | auto const& ccs = index.get_color_sets(); 20 | const uint64_t num_color_sets = ccs.num_color_sets(); 21 | const uint64_t num_ones = u2c_rank1_index.num_ones(); 22 | assert(u2c.num_bits() > 0); 23 | const uint64_t last_pos = u2c.num_bits() - 1; 24 | assert(num_color_sets == num_ones); 25 | 26 | if (num_ones < num_threads) { 27 | throw std::runtime_error("there are only " + std::to_string(num_color_sets) + 28 | ": reduce the number of threads."); 29 | } 30 | 31 | std::vector> thread_sketches( 32 | num_threads, std::vector(num_colors, sketch::hll_t(p))); 33 | 34 | struct slice { 35 | uint64_t begin; // start position in u2c 36 | uint64_t color_id_begin, color_id_end; // [..) 37 | }; 38 | std::vector thread_slices; 39 | 40 | /* compute load */ 41 | uint64_t load = 0; 42 | { 43 | uint64_t pop_count = 0; 44 | uint64_t prev_pos = 0; 45 | auto unary_it = u2c.begin(); 46 | for (uint64_t color_id = 0; color_id != num_color_sets; ++color_id) { 47 | uint64_t curr_pos = pop_count != num_ones ? 
unary_it.next() : last_pos; 48 | uint64_t num_unitigs = curr_pos - prev_pos + 1; 49 | auto it = ccs.color_set(color_id); 50 | uint64_t size = it.size(); 51 | load += size * num_unitigs; 52 | pop_count += 1; 53 | prev_pos = curr_pos + 1; 54 | } 55 | } 56 | 57 | const uint64_t load_per_thread = load / num_threads; 58 | if (load_per_thread == 0) { 59 | throw std::runtime_error("load is too small: reduce the number of threads"); 60 | } 61 | 62 | { 63 | uint64_t prev_pos = 0; 64 | auto unary_it = u2c.begin(); 65 | slice s; 66 | s.begin = 0; 67 | s.color_id_begin = 0; 68 | uint64_t cur_load = 0; 69 | 70 | for (uint64_t color_id = 0; color_id != num_color_sets; ++color_id) { 71 | uint64_t curr_pos = color_id != num_color_sets - 1 ? unary_it.next() : last_pos; 72 | 73 | uint64_t num_unitigs = curr_pos - prev_pos + 1; 74 | auto it = ccs.color_set(color_id); 75 | uint64_t size = it.size(); 76 | cur_load += size * num_unitigs; 77 | prev_pos = curr_pos + 1; 78 | 79 | if (cur_load >= load_per_thread or color_id == num_color_sets - 1) { 80 | s.color_id_end = color_id + 1; 81 | thread_slices.push_back(s); 82 | s.begin = prev_pos; 83 | s.color_id_begin = color_id + 1; 84 | cur_load = 0; 85 | } 86 | } 87 | 88 | num_threads = thread_slices.size(); 89 | } 90 | 91 | auto exe = [&](uint64_t thread_id) { 92 | assert(thread_id < thread_slices.size()); 93 | auto& sketches = thread_sketches[thread_id]; 94 | auto s = thread_slices[thread_id]; 95 | uint64_t prev_pos = s.begin; 96 | std::vector hashes; 97 | auto unary_it = u2c.get_iterator_at(s.begin); 98 | for (uint64_t color_id = s.color_id_begin; color_id != s.color_id_end; ++color_id) { 99 | uint64_t curr_pos = color_id != num_color_sets - 1 ? 
unary_it.next() : last_pos; 100 | auto it = ccs.color_set(color_id); 101 | const uint64_t size = it.size(); 102 | hashes.reserve(curr_pos - prev_pos + 1); 103 | for (uint64_t unitig_id = prev_pos; unitig_id <= curr_pos; ++unitig_id) { 104 | assert(unitig_id < u2c.num_bits()); 105 | assert(index.u2c(unitig_id) == color_id); 106 | hashes.push_back(hasher.hash(unitig_id)); 107 | } 108 | for (uint64_t i = 0; i != size; ++i, ++it) { 109 | uint32_t ref_id = *it; 110 | assert(ref_id < num_colors); 111 | for (auto hash : hashes) sketches[ref_id].add(hash); 112 | } 113 | prev_pos = curr_pos + 1; 114 | hashes.clear(); 115 | } 116 | }; 117 | 118 | std::vector threads(num_threads); 119 | for (uint64_t thread_id = 0; thread_id != num_threads; ++thread_id) { 120 | threads[thread_id] = std::thread(exe, thread_id); 121 | } 122 | for (auto& t : threads) { 123 | if (t.joinable()) t.join(); 124 | } 125 | 126 | /* merge sketches into thread_sketches[0] */ 127 | for (uint64_t i = 0; i != num_colors; ++i) { 128 | auto& sketch = thread_sketches[0][i]; 129 | for (uint64_t thread_id = 1; thread_id != num_threads; ++thread_id) { 130 | sketch += thread_sketches[thread_id][i]; 131 | } 132 | } 133 | 134 | std::ofstream out(output_filename, std::ios::binary); 135 | if (!out.is_open()) throw std::runtime_error("cannot open file"); 136 | const uint64_t num_bytes = 1ULL << p; 137 | out.write(reinterpret_cast(&num_bytes), 8); 138 | out.write(reinterpret_cast(&num_colors), 8); 139 | for (auto const& x : thread_sketches[0]) { 140 | assert(x.m() == num_bytes); 141 | assert(x.m() == x.core().size()); 142 | uint8_t const* data = x.data(); 143 | out.write(reinterpret_cast(data), num_bytes); 144 | } 145 | out.close(); 146 | } 147 | 148 | template 149 | void build_colors_sketches_sliced( 150 | uint64_t num_colors, uint64_t num_color_sets, function colors, 151 | uint64_t p, // use 2^p bytes per HLL sketch 152 | uint64_t num_threads, // num. 
threads for construction 153 | std::string output_filename, // where the sketches will be serialized 154 | double left, double right) // 155 | { 156 | assert(num_threads > 0); 157 | 158 | const double min_size = left * num_colors; 159 | const double max_size = right * num_colors; 160 | assert(min_size >= 0); 161 | assert(max_size <= num_colors); 162 | 163 | if (num_color_sets < num_threads) { num_threads = num_color_sets; } 164 | 165 | std::vector filtered_colors; 166 | std::vector filtered_colors_ids; 167 | for (uint64_t color_id = 0; color_id != num_color_sets; ++color_id) { 168 | auto it = colors(color_id); 169 | uint64_t size = it.size(); 170 | if (size > min_size && size <= max_size) { 171 | filtered_colors.push_back(it); 172 | filtered_colors_ids.push_back(color_id); 173 | } 174 | } 175 | const uint64_t partition_size = filtered_colors.size(); 176 | 177 | struct slice { 178 | uint64_t begin, end; // [..) 179 | }; 180 | std::vector thread_slices; 181 | 182 | uint64_t load = 0; 183 | { 184 | for (auto it : filtered_colors) { load += it.size(); } 185 | } 186 | 187 | uint64_t load_per_thread = load / num_threads; 188 | { 189 | slice s; 190 | s.begin = 0; 191 | uint64_t curr_load = 0; 192 | 193 | for (uint64_t i = 0; i != partition_size; ++i) { 194 | auto it = filtered_colors[i]; 195 | curr_load += it.size(); 196 | if (curr_load >= load_per_thread || i == partition_size - 1) { 197 | s.end = i + 1; 198 | thread_slices.push_back(s); 199 | s.begin = i + 1; 200 | curr_load = 0; 201 | } 202 | } 203 | assert(thread_slices.size() <= num_threads); 204 | } 205 | num_threads = thread_slices.size(); 206 | std::vector> thread_sketches(num_threads); 207 | 208 | auto exe = [&](uint64_t thread_id) { 209 | assert(thread_id < thread_slices.size()); 210 | auto& sketches = thread_sketches[thread_id]; 211 | auto s = thread_slices[thread_id]; 212 | sketches = std::vector(s.end - s.begin, sketch::hll_t(p)); 213 | 214 | for (uint64_t color_id = s.begin; color_id != s.end; ++color_id) { 
215 | auto it = filtered_colors[color_id]; 216 | const uint64_t size = it.size(); 217 | assert(size > 0); 218 | for (uint64_t i = 0; i < size; ++i, ++it) { 219 | uint64_t ref_id = *it; 220 | assert(ref_id < num_colors); 221 | sketches[color_id - s.begin].addh(ref_id); 222 | } 223 | } 224 | }; 225 | 226 | std::vector threads(num_threads); 227 | for (uint64_t thread_id = 0; thread_id != num_threads; ++thread_id) { 228 | threads[thread_id] = std::thread(exe, thread_id); 229 | } 230 | for (auto& t : threads) { 231 | if (t.joinable()) t.join(); 232 | } 233 | 234 | std::ofstream out(output_filename, std::ios::binary); 235 | if (!out.is_open()) throw std::runtime_error("cannot open file"); 236 | const uint64_t num_bytes = 1ULL << p; 237 | out.write(reinterpret_cast(&num_bytes), 8); 238 | out.write(reinterpret_cast(&num_colors), 8); 239 | out.write(reinterpret_cast(&partition_size), 8); 240 | for (auto const color_id : filtered_colors_ids) { 241 | out.write(reinterpret_cast(&color_id), 8); 242 | } 243 | for (auto const& sketch : thread_sketches) { 244 | for (auto const& x : sketch) { 245 | assert(x.m() == num_bytes); 246 | assert(x.m() == x.core().size()); 247 | uint8_t const* data = x.data(); 248 | out.write(reinterpret_cast(data), num_bytes); 249 | } 250 | } 251 | out.close(); 252 | } 253 | 254 | } // namespace fulgor -------------------------------------------------------------------------------- /include/builders/builder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "include/index.hpp" 4 | #include "include/GGCAT.hpp" 5 | 6 | namespace fulgor { 7 | 8 | struct buffer { 9 | buffer(uint64_t capacity) : m_capacity(capacity), m_size(0), m_num_sets(0) { 10 | m_buffer.resize(capacity); 11 | } 12 | 13 | bool insert(uint32_t* data, uint32_t size) { 14 | if (m_size + size + 1 > m_capacity) { return false; } 15 | memcpy(m_buffer.data() + m_size, &size, sizeof(uint32_t)); 16 | memcpy(m_buffer.data() + m_size 
+ 1, data, sizeof(uint32_t) * size); 17 | 18 | m_size += size + 1; 19 | m_num_sets++; 20 | 21 | return true; 22 | } 23 | 24 | std::vector content() { return m_buffer; } 25 | 26 | uint32_t size() { return m_size; } 27 | uint64_t capacity() { return m_capacity; } 28 | uint32_t num_sets() { return m_num_sets; } 29 | 30 | uint32_t get(uint32_t i) { return m_buffer[i]; } 31 | uint32_t operator[](uint32_t i) { return get(i); } 32 | uint32_t* data() { return m_buffer.data(); } 33 | 34 | void clear() { 35 | m_size = 0; 36 | m_num_sets = 0; 37 | } 38 | 39 | private: 40 | uint64_t m_capacity, m_size, m_num_sets; 41 | std::vector m_buffer; 42 | }; 43 | 44 | template 45 | struct index::builder { 46 | builder() {} 47 | 48 | builder(build_configuration const& build_config) : m_build_config(build_config) {} 49 | 50 | void build(index& idx) { 51 | if (idx.m_k2u.size() != 0) throw std::runtime_error("index already built"); 52 | 53 | essentials::timer timer; 54 | 55 | { 56 | essentials::logger("step 1. build colored compacted dBG"); 57 | timer.start(); 58 | m_ccdbg.build(m_build_config); 59 | m_build_config.num_colors = m_ccdbg.num_colors(); 60 | timer.stop(); 61 | std::cout << "** building the ccdBG took " << timer.elapsed() << " seconds / " 62 | << timer.elapsed() / 60 << " minutes" << std::endl; 63 | timer.reset(); 64 | } 65 | 66 | std::string input_filename_for_sshash = m_build_config.tmp_dirname + "/" + 67 | util::filename(m_build_config.file_base_name) + 68 | ".sshash.fa"; 69 | 70 | { 71 | essentials::logger("step 2. 
build m_u2c and m_color_sets"); 72 | timer.start(); 73 | 74 | uint64_t num_unitigs = 0; 75 | uint64_t num_distinct_color_sets = 0; 76 | 77 | uint64_t num_threads = m_build_config.num_threads; 78 | constexpr uint64_t MAX_BUFFER_SIZE = 1 << 28; // 250e6 uint32_t 79 | 80 | bits::bit_vector::builder u2c_builder; 81 | 82 | /* write unitigs to fasta file for SSHash */ 83 | std::ofstream out(input_filename_for_sshash.c_str()); 84 | if (!out.is_open()) throw std::runtime_error("cannot open output file"); 85 | 86 | typename ColorSets::builder main_builder(m_build_config.num_colors); 87 | std::vector thread_builders(num_threads, 88 | m_build_config.num_colors); 89 | std::vector threads(num_threads); 90 | std::vector thread_buffers( 91 | num_threads, min(m_build_config.num_colors * 10000, MAX_BUFFER_SIZE)); 92 | assert(thread_buffers[0].capacity() > m_build_config.num_colors); 93 | uint32_t curr_thread = 0; 94 | std::atomic appending_thread = 0; 95 | 96 | auto process_and_append = [&](uint64_t thread_id) { 97 | buffer b = thread_buffers[thread_id]; 98 | thread_builders[thread_id].clear(); 99 | uint32_t pos = 0; 100 | for (uint32_t i = 0; i < b.num_sets(); i++) { 101 | uint32_t size = b[pos++]; 102 | thread_builders[thread_id].process(b.data() + pos, size); 103 | pos += size; 104 | } 105 | while (appending_thread != thread_id) {} 106 | main_builder.append(thread_builders[thread_id]); 107 | appending_thread = (appending_thread + 1) % num_threads; 108 | }; 109 | 110 | m_ccdbg.loop_through_unitigs([&](ggcat::Slice const unitig, 111 | ggcat::Slice const color_set, 112 | bool same_color_set) { 113 | assert(curr_thread >= 0); 114 | assert(curr_thread < num_threads); 115 | try { 116 | if (!same_color_set) { 117 | num_distinct_color_sets += 1; 118 | if (num_unitigs > 0) u2c_builder.set(num_unitigs - 1, 1); 119 | 120 | /* fill buffers */ 121 | if (!thread_buffers[curr_thread].insert(color_set.data, color_set.size)) { 122 | threads[curr_thread] = std::thread(process_and_append, 
curr_thread); 123 | const uint32_t next_thread = (curr_thread + 1) % num_threads; 124 | if (threads[next_thread].joinable()) { threads[next_thread].join(); } 125 | 126 | curr_thread = next_thread; 127 | 128 | thread_buffers[curr_thread].clear(); 129 | thread_buffers[curr_thread].insert(color_set.data, color_set.size); 130 | } 131 | } 132 | u2c_builder.push_back(0); 133 | 134 | /* 135 | Rewrite unitigs in color-set order. 136 | This is *not* the same order in which 137 | unitigs are written in the ggcat.fa file. 138 | */ 139 | out << ">\n"; 140 | out.write(unitig.data, unitig.size); 141 | out << '\n'; 142 | 143 | num_unitigs += 1; 144 | 145 | } catch (std::exception const& e) { 146 | std::cerr << e.what() << std::endl; 147 | exit(1); 148 | } 149 | }); 150 | 151 | threads[curr_thread] = std::thread(process_and_append, curr_thread); 152 | for (auto& t : threads) { 153 | if (t.joinable()) t.join(); 154 | } 155 | 156 | out.close(); 157 | 158 | assert(num_unitigs > 0); 159 | assert(num_unitigs < (uint64_t(1) << 32)); 160 | 161 | std::cout << "num_unitigs " << num_unitigs << std::endl; 162 | std::cout << "num_distinct_color_sets " << num_distinct_color_sets << std::endl; 163 | 164 | u2c_builder.set(num_unitigs - 1, 1); 165 | u2c_builder.build(idx.m_u2c); 166 | idx.m_u2c_rank1_index.build(idx.m_u2c); 167 | assert(idx.m_u2c.num_bits() == num_unitigs); 168 | assert(idx.m_u2c_rank1_index.num_ones() == num_distinct_color_sets); 169 | 170 | std::cout << "m_u2c.num_bits() " << idx.m_u2c.num_bits() << std::endl; 171 | std::cout << "m_u2c_rank1_index.num_ones() " << idx.m_u2c_rank1_index.num_ones() 172 | << std::endl; 173 | 174 | main_builder.build(idx.m_color_sets); 175 | 176 | timer.stop(); 177 | std::cout << "** building m_u2c and m_color_sets took " << timer.elapsed() 178 | << " seconds / " << timer.elapsed() / 60 << " minutes" << std::endl; 179 | timer.reset(); 180 | } 181 | 182 | { 183 | essentials::logger("step 3. 
build m_k2u"); 184 | timer.start(); 185 | 186 | sshash::build_configuration sshash_config; 187 | sshash_config.k = m_build_config.k; 188 | sshash_config.m = m_build_config.m; 189 | sshash_config.canonical_parsing = m_build_config.canonical_parsing; 190 | sshash_config.verbose = m_build_config.verbose; 191 | sshash_config.tmp_dirname = m_build_config.tmp_dirname; 192 | sshash_config.print(); 193 | idx.m_k2u.build(input_filename_for_sshash, sshash_config); 194 | try { // remove unitig file 195 | std::remove(input_filename_for_sshash.c_str()); 196 | } catch (std::exception const& e) { std::cerr << e.what() << std::endl; } 197 | 198 | timer.stop(); 199 | std::cout << "** building m_k2u took " << timer.elapsed() << " seconds / " 200 | << timer.elapsed() / 60 << " minutes" << std::endl; 201 | timer.reset(); 202 | } 203 | 204 | { 205 | essentials::logger("step 4. write filenames"); 206 | timer.start(); 207 | idx.m_filenames.build(m_ccdbg.filenames()); 208 | timer.stop(); 209 | std::cout << "** writing filenames took " << timer.elapsed() << " seconds / " 210 | << timer.elapsed() / 60 << " minutes" << std::endl; 211 | timer.reset(); 212 | } 213 | 214 | if (m_build_config.check) // 215 | { 216 | essentials::logger("step 5. 
check correctness..."); 217 | m_ccdbg.loop_through_unitigs( 218 | [&](ggcat::Slice const unitig, // 219 | ggcat::Slice const color_set, // 220 | bool /* same_color_set */) // 221 | { 222 | auto lookup_result = idx.m_k2u.lookup_advanced(unitig.data); 223 | const uint64_t unitig_id = lookup_result.contig_id; 224 | const uint64_t color_id = idx.u2c(unitig_id); 225 | for (uint64_t i = 1; i != unitig.size - idx.m_k2u.k() + 1; ++i) { 226 | uint64_t got = idx.m_k2u.lookup_advanced(unitig.data + i).contig_id; 227 | if (got != unitig_id) { 228 | std::cout << "got unitig_id " << got << " but expected " << unitig_id 229 | << std::endl; 230 | return; 231 | } 232 | } 233 | auto fwd_it = idx.m_color_sets.color_set(color_id); 234 | const uint64_t size = fwd_it.size(); 235 | if (size != color_set.size) { 236 | std::cout << "got color_set size " << size << " but expected " 237 | << color_set.size << std::endl; 238 | return; 239 | } 240 | for (uint64_t i = 0; i != size; ++i, ++fwd_it) { 241 | uint32_t ref = *fwd_it; 242 | if (ref != color_set.data[i]) { 243 | std::cout << "got ref " << ref << " but expected " << color_set.data[i] 244 | << std::endl; 245 | return; 246 | } 247 | } 248 | }, 249 | m_build_config.num_threads // 250 | ); 251 | essentials::logger("DONE!"); 252 | } 253 | } 254 | 255 | private: 256 | build_configuration m_build_config; 257 | GGCAT m_ccdbg; 258 | }; 259 | 260 | } // namespace fulgor 261 | -------------------------------------------------------------------------------- /include/builders/meta_differential_builder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "include/index.hpp" 4 | #include "include/build_util.hpp" 5 | 6 | #include 7 | 8 | namespace fulgor { 9 | 10 | template 11 | struct index::meta_differential_builder { 12 | meta_differential_builder() {} 13 | 14 | meta_differential_builder(build_configuration const& build_config) 15 | : m_build_config(build_config) {} 16 | 17 | void 
build(index& idx) { 18 | if (idx.m_k2u.size() != 0) throw std::runtime_error("index already built"); 19 | 20 | meta_index_type meta_index; 21 | essentials::logger("step 1. loading index to be partitioned"); 22 | essentials::load(meta_index, m_build_config.index_filename_to_partition.c_str()); 23 | essentials::logger("DONE"); 24 | 25 | const uint64_t num_color_sets = meta_index.num_color_sets(); 26 | 27 | essentials::timer timer; 28 | uint64_t num_partitions = meta_index.get_color_sets().num_partitions(); 29 | 30 | meta_differential::builder builder; 31 | builder.init(meta_index.num_colors(), num_partitions); 32 | 33 | std::vector> partial_permutations(num_partitions); 34 | 35 | { 36 | essentials::logger("step 2. building differential partial/meta colors"); 37 | timer.start(); 38 | 39 | std::vector const& pc = meta_index.get_color_sets().partial_colors(); 40 | assert(pc.size() == num_partitions); 41 | 42 | for (uint64_t i = 0; i < num_partitions; i++) { 43 | std::cout << " Partition " << i << " / " << num_partitions << std::endl; 44 | differential_permuter dp(m_build_config); 45 | dp.permute(pc[i]); 46 | 47 | differential::builder diff_builder; 48 | diff_builder.init_color_sets_builder(dp.num_colors()); 49 | 50 | auto const& permutation = dp.permutation(); 51 | auto const& references = dp.references(); 52 | 53 | partial_permutations[i].resize(permutation.size()); 54 | uint64_t original_id = 0; 55 | 56 | for (auto& reference : references) { 57 | diff_builder.encode_representative(reference); 58 | } 59 | for (auto& [cluster_id, color_id] : permutation) { 60 | auto it = pc[i].color_set(color_id); 61 | diff_builder.encode_color_set( 62 | cluster_id, references[cluster_id], it.size(), [&it]() -> void { ++it; }, 63 | [&it]() -> uint64_t { return *it; }); 64 | partial_permutations[i][color_id] = original_id++; 65 | } 66 | differential d; 67 | diff_builder.build(d); 68 | builder.process_partition(d); 69 | d.print_stats(); 70 | } 71 | 72 | timer.stop(); 73 | std::cout << "** 
building partial/meta colors took " << timer.elapsed() << " seconds / " 74 | << timer.elapsed() / 60 << " minutes" << std::endl; 75 | timer.reset(); 76 | } 77 | 78 | std::vector permutation(num_color_sets); 79 | 80 | { 81 | essentials::logger("step 5. build differential-meta colors"); 82 | timer.start(); 83 | 84 | std::vector counts; 85 | std::map, uint64_t> meta_partitions; 86 | std::vector> partition_sets; 87 | std::vector color_set_to_partition_set(num_color_sets); 88 | uint64_t num_partition_sets = 0; 89 | for (uint64_t color_id = 0; color_id < num_color_sets; color_id++) { 90 | auto it = meta_index.get_color_sets().color_set(color_id); 91 | const uint64_t size = it.meta_color_set_size(); 92 | std::vector partition_set(size); 93 | for (uint64_t i = 0; i < size; ++i, it.next_partition_id()) { 94 | partition_set[i] = it.partition_id(); 95 | } 96 | if (meta_partitions.count(partition_set) == 0) { 97 | meta_partitions[partition_set] = num_partition_sets++; 98 | partition_sets.push_back(partition_set); 99 | counts.push_back(0); 100 | } 101 | color_set_to_partition_set[color_id] = meta_partitions[partition_set]; 102 | counts[meta_partitions[partition_set]]++; 103 | } 104 | 105 | builder.init_meta_color_partition_sets(num_partition_sets); 106 | for (uint64_t partition_set_id = 0; partition_set_id < num_partition_sets; 107 | partition_set_id++) { 108 | builder.process_meta_color_partition_set(partition_sets[partition_set_id]); 109 | } 110 | 111 | std::vector cum_sum = {0}; 112 | uint64_t prev_val = 0; 113 | for (uint64_t count : counts) { 114 | prev_val += count; 115 | cum_sum.push_back(prev_val); 116 | } 117 | 118 | for (uint64_t color_id = 0; color_id < num_color_sets; color_id++) { 119 | permutation[cum_sum[color_set_to_partition_set[color_id]]++] = color_id; 120 | } 121 | for (uint64_t permuted_id = 0; permuted_id < num_color_sets; permuted_id++) { 122 | uint64_t original_color_id = permutation[permuted_id]; 123 | uint64_t partition_set_id = 
color_set_to_partition_set[original_color_id]; 124 | auto it = meta_index.get_color_sets().color_set(original_color_id); 125 | const uint64_t size = it.meta_color_set_size(); 126 | std::vector relative_colors; 127 | relative_colors.reserve(size); 128 | for (uint64_t i = 0; i < size; i++, it.next_partition_id()) { 129 | it.update_partition(); 130 | uint64_t partition_id = partition_sets[partition_set_id][i]; 131 | relative_colors.push_back( 132 | partial_permutations[partition_id] 133 | [it.meta_color() - it.num_color_sets_before()]); 134 | } 135 | builder.process_metacolor_set(partition_set_id, partition_sets[partition_set_id], 136 | relative_colors); 137 | } 138 | 139 | builder.build(idx.m_color_sets); 140 | 141 | timer.stop(); 142 | std::cout << "** building differential-meta colors took " << timer.elapsed() 143 | << " seconds / " << timer.elapsed() / 60 << " minutes" << std::endl; 144 | timer.reset(); 145 | } 146 | 147 | { 148 | essentials::logger("step 6. build u2c and k2u"); 149 | timer.start(); 150 | 151 | const std::string permuted_unitigs_filename = 152 | m_build_config.tmp_dirname + "/permuted_unitigs.fa"; 153 | std::ofstream out(permuted_unitigs_filename.c_str()); 154 | if (!out.is_open()) throw std::runtime_error("cannot open output file"); 155 | 156 | auto const& u2c = meta_index.get_u2c(); 157 | bits::darray1 d; // for select_1 on u2c 158 | d.build(u2c); 159 | 160 | const uint64_t num_unitigs = u2c.num_bits(); 161 | bits::bit_vector::builder u2c_builder(num_unitigs + 1, 0); 162 | 163 | auto const& dict = meta_index.get_k2u(); 164 | const uint64_t k = dict.k(); 165 | 166 | uint64_t pos = 0; 167 | for (uint64_t new_color_id = 0; new_color_id != num_color_sets; ++new_color_id) { 168 | uint64_t old_color_id = permutation[new_color_id]; 169 | uint64_t old_unitig_id_end = num_unitigs; 170 | if (old_color_id < num_color_sets - 1) { 171 | old_unitig_id_end = d.select(u2c, old_color_id) + 1; 172 | } 173 | uint64_t old_unitig_id_begin = 0; 174 | if (old_color_id 
> 0) old_unitig_id_begin = d.select(u2c, old_color_id - 1) + 1; 175 | 176 | // num. unitigs that have the same color 177 | pos += old_unitig_id_end - old_unitig_id_begin; 178 | // cout << "[" << new_color_id << "] " << pos << "\n"; 179 | assert(pos - 1 < u2c_builder.num_bits()); 180 | 181 | u2c_builder.set(pos - 1, 1); 182 | 183 | for (uint64_t i = old_unitig_id_begin; i != old_unitig_id_end; ++i) { 184 | auto it = dict.at_contig_id(i); 185 | out << ">\n"; 186 | auto [_, kmer] = it.next(); 187 | out << kmer; 188 | while (it.has_next()) { 189 | auto [_, kmer] = it.next(); 190 | out << kmer[k - 1]; // overlaps! 191 | } 192 | out << '\n'; 193 | } 194 | } 195 | 196 | assert(pos == num_unitigs); 197 | out.close(); 198 | u2c_builder.build(idx.m_u2c); 199 | idx.m_u2c_rank1_index.build(idx.m_u2c); 200 | 201 | /* build a new sshash::dictionary on the permuted unitigs */ 202 | sshash::build_configuration sshash_config; 203 | sshash_config.k = dict.k(); 204 | sshash_config.m = dict.m(); 205 | sshash_config.canonical_parsing = dict.canonicalized(); 206 | sshash_config.verbose = m_build_config.verbose; 207 | sshash_config.tmp_dirname = m_build_config.tmp_dirname; 208 | sshash_config.print(); 209 | idx.m_k2u.build(permuted_unitigs_filename, sshash_config); 210 | assert(idx.get_k2u().size() == dict.size()); 211 | try { // remove unitig file 212 | std::remove(permuted_unitigs_filename.c_str()); 213 | } catch (std::exception const& e) { std::cerr << e.what() << std::endl; } 214 | 215 | timer.stop(); 216 | std::cout << "** building u2c and k2u took " << timer.elapsed() << " seconds / " 217 | << timer.elapsed() / 60 << " minutes" << std::endl; 218 | timer.reset(); 219 | } 220 | 221 | { 222 | essentials::logger("step 7. 
copying filenames"); 223 | timer.start(); 224 | idx.m_filenames = meta_index.get_filenames(); 225 | timer.stop(); 226 | std::cout << "** copying filenames took " << timer.elapsed() << " seconds / " 227 | << timer.elapsed() / 60 << " minutes" << std::endl; 228 | timer.reset(); 229 | } 230 | 231 | if (m_build_config.check) { 232 | essentials::logger("step 8. check correctness..."); 233 | timer.start(); 234 | 235 | uint64_t slice_size = ceil(idx.m_k2u.num_contigs() / m_build_config.num_threads); 236 | 237 | auto exe = [&](uint64_t thread_id) { 238 | uint64_t l = slice_size * thread_id; 239 | uint64_t r = min(slice_size * (thread_id + 1), idx.m_k2u.num_contigs()); 240 | 241 | for (uint64_t unitig_id = l; unitig_id < r; ++unitig_id) { 242 | auto it = idx.get_k2u().at_contig_id(unitig_id); 243 | while (it.has_next()) { 244 | auto [_, kmer] = it.next(); 245 | uint64_t new_contig_id = 246 | idx.get_k2u().lookup_advanced(kmer.c_str()).contig_id; 247 | if (new_contig_id != unitig_id) { 248 | std::cout << "expected " << unitig_id << " but found " << new_contig_id 249 | << std::endl; 250 | continue; 251 | } 252 | uint64_t old_contig_id = 253 | meta_index.get_k2u().lookup_advanced(kmer.c_str()).contig_id; 254 | 255 | uint64_t new_color_id = idx.u2c(new_contig_id); 256 | uint64_t old_color_id = meta_index.u2c(old_contig_id); 257 | 258 | auto exp_it = meta_index.color_set(old_color_id); 259 | auto res_it = idx.color_set(new_color_id); 260 | if (res_it.size() != exp_it.size()) { 261 | std::cout << "Error while checking color " << new_color_id 262 | << ", different sizes: expected " << exp_it.size() 263 | << " but got " << res_it.size() << std::endl; 264 | continue; 265 | } 266 | for (uint64_t j = 0; j < exp_it.size(); ++j, ++exp_it, ++res_it) { 267 | auto exp = *exp_it; 268 | auto got = *res_it; 269 | if (exp != got) { 270 | std::cout << "Error while checking color " << new_color_id 271 | << ", mismatch at position " << j << ": expected " << exp 272 | << " but got " << got << 
std::endl; 273 | } 274 | } 275 | } 276 | } 277 | }; 278 | 279 | std::vector threads(m_build_config.num_threads); 280 | for (uint64_t thread_id = 0; thread_id != m_build_config.num_threads; ++thread_id) { 281 | threads[thread_id] = std::thread(exe, thread_id); 282 | } 283 | for (auto& t : threads) { 284 | if (t.joinable()) t.join(); 285 | } 286 | 287 | timer.stop(); 288 | std::cout << "** checking correctness took " << timer.elapsed() << " seconds / " 289 | << timer.elapsed() / 60 << " minutes" << std::endl; 290 | essentials::logger("DONE!"); 291 | } 292 | } 293 | 294 | private: 295 | build_configuration m_build_config; 296 | }; 297 | 298 | } // namespace fulgor 299 | -------------------------------------------------------------------------------- /include/color_sets/differential.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fulgor { 4 | 5 | struct differential { 6 | static const index_t type = index_t::DIFF; 7 | 8 | struct builder { 9 | builder() : m_prev_cluster_id(0) { 10 | m_color_set_offsets.push_back(0); 11 | m_representative_offsets.push_back(0); 12 | } 13 | 14 | void init_color_sets_builder(uint64_t num_colors) { 15 | m_num_colors = num_colors; 16 | m_num_total_integers = 0; 17 | m_num_sets = 0; 18 | } 19 | 20 | void encode_representative(std::vector const& representative) { 21 | uint64_t size = representative.size(); 22 | bits::util::write_delta(m_bvb, size); 23 | m_num_total_integers += size + 1; // size plus size number 24 | m_num_sets += 1; 25 | if (size > 0) { 26 | uint32_t prev_val = representative[0]; 27 | bits::util::write_delta(m_bvb, prev_val); 28 | for (uint64_t i = 1; i < size; ++i) { 29 | uint32_t val = representative[i]; 30 | assert(val >= prev_val + 1); 31 | bits::util::write_delta(m_bvb, val - (prev_val + 1)); 32 | prev_val = val; 33 | } 34 | } 35 | m_representative_offsets.push_back(m_bvb.num_bits()); 36 | } 37 | 38 | void encode_color_set(uint64_t cluster_id, 
std::vector const& representative, 39 | uint64_t it_size, function next, function get) // 40 | { 41 | std::vector differential_set; 42 | uint64_t ref_size = representative.size(); 43 | differential_set.reserve(ref_size + it_size); 44 | 45 | if (cluster_id != m_prev_cluster_id) { 46 | m_prev_cluster_id = cluster_id; 47 | m_clusters.set(m_clusters.num_bits() - 1); 48 | } 49 | m_clusters.push_back(false); 50 | 51 | uint64_t i = 0, j = 0; 52 | while (i < it_size && j < ref_size) { 53 | if (get() == representative[j]) { 54 | i += 1; 55 | j += 1; 56 | next(); 57 | } else if (get() < representative[j]) { 58 | differential_set.push_back(get()); 59 | i += 1; 60 | next(); 61 | } else { 62 | differential_set.push_back(representative[j]); 63 | j += 1; 64 | } 65 | } 66 | while (i < it_size) { 67 | differential_set.push_back(get()); 68 | next(); 69 | i += 1; 70 | } 71 | while (j < ref_size) { 72 | differential_set.push_back(representative[j]); 73 | j += 1; 74 | } 75 | 76 | uint64_t size = differential_set.size(); 77 | bits::util::write_delta(m_bvb, size); 78 | bits::util::write_delta(m_bvb, it_size); 79 | 80 | // size plus differential_set size plus original set size 81 | m_num_total_integers += size + 2; 82 | 83 | m_num_sets += 1; 84 | 85 | if (size > 0) { 86 | uint32_t prev_val = differential_set[0]; 87 | bits::util::write_delta(m_bvb, prev_val); 88 | for (uint64_t pos = 1; pos < size; ++pos) { 89 | uint32_t val = differential_set[pos]; 90 | assert(val >= prev_val + 1); 91 | bits::util::write_delta(m_bvb, val - (prev_val + 1)); 92 | prev_val = val; 93 | } 94 | } 95 | 96 | uint64_t last_offset = m_representative_offsets[m_representative_offsets.size() - 1]; 97 | m_color_set_offsets.push_back(m_bvb.num_bits() - last_offset); 98 | } 99 | 100 | void build(differential& d) { 101 | d.m_num_colors = m_num_colors; 102 | m_bvb.build(d.m_color_sets); 103 | m_clusters.build(d.m_clusters); 104 | d.m_clusters_rank1_index.build(d.m_clusters); 105 | 
d.m_representative_offsets.encode(m_representative_offsets.begin(), // 106 | m_representative_offsets.size(), // 107 | m_representative_offsets.back()); 108 | d.m_color_set_offsets.encode(m_color_set_offsets.begin(), // 109 | m_color_set_offsets.size(), // 110 | m_color_set_offsets.back()); 111 | 112 | std::cout << "processed " << m_num_sets << " color sets\n"; 113 | std::cout << "m_num_total_integers " << m_num_total_integers << '\n'; 114 | 115 | std::cout << " total bits for ints = " << 8 * d.m_color_sets.num_bytes() << '\n'; 116 | std::cout << " total bits per offset = " 117 | << 8 * (d.m_color_set_offsets.num_bytes() + 118 | d.m_representative_offsets.num_bytes()) 119 | << " (differences: " << 8 * d.m_color_set_offsets.num_bytes() 120 | << ", representatives: " << 8 * d.m_representative_offsets.num_bytes() 121 | << ")\n"; 122 | std::cout << " offsets: " 123 | << 8.0 * 124 | (d.m_color_set_offsets.num_bytes() + 125 | d.m_representative_offsets.num_bytes()) / 126 | m_num_total_integers 127 | << " bits/int\n"; 128 | std::cout << " color sets: " 129 | << (8.0 * d.m_color_sets.num_bytes()) / m_num_total_integers << " bits/int\n"; 130 | } 131 | 132 | private: 133 | bits::bit_vector::builder m_bvb, m_clusters; 134 | uint64_t m_num_total_integers, m_num_sets; 135 | 136 | uint64_t m_num_colors; 137 | uint64_t m_prev_cluster_id; 138 | std::vector m_representative_offsets, m_color_set_offsets; 139 | }; 140 | 141 | struct forward_iterator { 142 | forward_iterator() {} 143 | 144 | forward_iterator(differential const* ptr, uint64_t set_begin, uint64_t representative_begin) 145 | : m_ptr(ptr) 146 | , m_differential_set_begin(set_begin) 147 | , m_representative_begin(representative_begin) { 148 | rewind(); 149 | } 150 | 151 | void rewind() { 152 | init(); 153 | update_curr_val(); 154 | } 155 | 156 | void full_rewind() { init(); } 157 | 158 | uint32_t size() const { return m_size; } 159 | 160 | uint64_t value() const { return m_curr_val; } 161 | uint64_t operator*() const { 
return value(); } 162 | 163 | void next() { 164 | if (m_pos_in_representative >= m_representative_size && 165 | m_pos_in_differential_set >= m_differential_set_size) { 166 | m_curr_val = num_colors(); 167 | return; 168 | } 169 | if (m_pos_in_representative >= m_representative_size || 170 | m_curr_differential_val < m_curr_representative_val) { 171 | next_differential_val(); 172 | } else if (m_pos_in_differential_set >= m_differential_set_size || 173 | m_curr_representative_val < m_curr_differential_val) { 174 | next_representative_val(); 175 | } 176 | update_curr_val(); 177 | } 178 | void operator++() { next(); } 179 | 180 | void next_geq(const uint64_t lower_bound) { 181 | assert(lower_bound <= num_colors()); 182 | while (value() < lower_bound) next(); 183 | assert(value() >= lower_bound); 184 | } 185 | 186 | uint32_t num_colors() const { return m_ptr->m_num_colors; } 187 | uint64_t differential_set_size() const { return m_differential_set_size; } 188 | int encoding_type() const { return encoding_t::symmetric_difference; } 189 | 190 | uint64_t representative_begin() const { return m_representative_begin; } 191 | 192 | void next_representative_val() { 193 | m_pos_in_representative += 1; 194 | m_prev_representative_val = m_curr_representative_val; 195 | if (m_pos_in_representative < m_representative_size) { 196 | m_curr_representative_val = 197 | m_prev_representative_val + bits::util::read_delta(m_representative_it) + 1; 198 | } else { 199 | m_curr_representative_val = num_colors(); 200 | } 201 | } 202 | 203 | uint32_t representative_val() const { return m_curr_representative_val; } 204 | 205 | void next_differential_val() { 206 | m_pos_in_differential_set += 1; 207 | m_prev_differential_val = m_curr_differential_val; 208 | if (m_pos_in_differential_set < m_differential_set_size) { 209 | m_curr_differential_val = 210 | m_prev_differential_val + bits::util::read_delta(m_differential_set_it) + 1; 211 | } else { 212 | m_curr_differential_val = num_colors(); 213 | } 
214 | } 215 | 216 | uint32_t differential_val() const { return m_curr_differential_val; } 217 | 218 | private: 219 | differential const* m_ptr; 220 | uint64_t m_differential_set_begin, m_representative_begin; 221 | uint64_t m_representative_size, m_differential_set_size; 222 | uint64_t m_pos_in_differential_set, m_pos_in_representative; 223 | uint32_t m_curr_representative_val, m_curr_differential_val; 224 | uint32_t m_prev_representative_val, m_prev_differential_val; 225 | uint32_t m_curr_val; 226 | uint32_t m_size; 227 | bits::bit_vector::iterator m_representative_it, m_differential_set_it; 228 | 229 | /* Point both bit iterators at their lists and decode the headers. */ void init() { 230 | m_differential_set_it = // 231 | (m_ptr->m_color_sets).get_iterator_at(m_differential_set_begin); 232 | m_representative_it = // 233 | (m_ptr->m_color_sets).get_iterator_at(m_representative_begin); 234 | 235 | /* header: both streams start with their list length; the differential stream then stores m_size, the value returned by size() */ m_differential_set_size = bits::util::read_delta(m_differential_set_it); 236 | m_representative_size = bits::util::read_delta(m_representative_it); 237 | m_size = bits::util::read_delta(m_differential_set_it); 238 | 239 | /* first values; an empty list starts directly at the end marker num_colors() */ m_curr_differential_val = m_differential_set_size == 0 240 | ? num_colors() 241 | : bits::util::read_delta(m_differential_set_it); 242 | m_prev_differential_val = 0; 243 | m_curr_representative_val = m_representative_size == 0 244 | ? 
num_colors() 245 | : bits::util::read_delta(m_representative_it); 246 | m_prev_representative_val = 0; 247 | 248 | m_pos_in_differential_set = 0; 249 | m_pos_in_representative = 0; 250 | } 251 | 252 | /* Skip values present in both lists (they cancel out), then expose the smaller head. */ void update_curr_val() { 253 | while (m_curr_representative_val == m_curr_differential_val && 254 | m_pos_in_representative <= m_representative_size && 255 | m_pos_in_differential_set <= m_differential_set_size) { 256 | next_differential_val(); 257 | next_representative_val(); 258 | } 259 | m_curr_val = min(m_curr_differential_val, m_curr_representative_val); 260 | } 261 | }; 262 | 263 | typedef forward_iterator iterator_type; 264 | 265 | /* Iterator over color set color_id: differential lists are addressed past the last representative offset, and the representative is picked with the rank of color_id in m_clusters. */ forward_iterator color_set(uint64_t color_id) const { 266 | assert(color_id < num_color_sets()); 267 | uint64_t last_representative = m_representative_offsets.access(num_partitions()); 268 | uint64_t set_begin = m_color_set_offsets.access(color_id) + last_representative; 269 | uint64_t representative_begin = 270 | m_representative_offsets.access(m_clusters_rank1_index.rank1(m_clusters, color_id)); 271 | return forward_iterator(this, set_begin, representative_begin); 272 | } 273 | 274 | uint64_t num_color_sets() const { return m_color_set_offsets.size() - 1; } 275 | uint64_t num_partitions() const { return m_clusters_rank1_index.num_ones() + 1; } 276 | uint64_t num_colors() const { return m_num_colors; } 277 | 278 | /* size of the data members, in bits */ uint64_t num_bits() const { 279 | return (sizeof(m_num_colors) + m_representative_offsets.num_bytes() + 280 | m_color_set_offsets.num_bytes() + m_color_sets.num_bytes() + 281 | m_clusters.num_bytes() + m_clusters_rank1_index.num_bytes()) * 282 | 8; 283 | } 284 | 285 | void print_stats() const; 286 | 287 | /* NOTE(review): "template" is not followed by a parameter list here or below; it looks lost in this listing (Visitor and T are otherwise undeclared) - confirm against the original header. */ template 288 | void visit(Visitor& visitor) { 289 | visit_impl(visitor, *this); 290 | } 291 | 292 | template 293 | void visit(Visitor& visitor) const { 294 | visit_impl(visitor, *this); 295 | } 296 | 297 | private: 298 | template 299 | static void visit_impl(Visitor& visitor, T&& t) { 300 | visitor.visit(t.m_num_colors); 
301 | visitor.visit(t.m_representative_offsets); 302 | visitor.visit(t.m_color_set_offsets); 303 | visitor.visit(t.m_color_sets); 304 | visitor.visit(t.m_clusters); 305 | visitor.visit(t.m_clusters_rank1_index); 306 | } 307 | 308 | uint32_t m_num_colors; 309 | bits::elias_fano m_representative_offsets, m_color_set_offsets; 310 | bits::bit_vector m_color_sets; 311 | bits::bit_vector m_clusters; 312 | bits::rank9 m_clusters_rank1_index; 313 | }; 314 | 315 | } // namespace fulgor 316 | -------------------------------------------------------------------------------- /include/color_sets/hybrid.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fulgor { 4 | 5 | /* Color sets written into one bit stream; builder::process() picks one of three encodings per set from the set size. */ struct hybrid { 6 | static const index_t type = index_t::HYBRID; 7 | 8 | struct builder { 9 | builder() : m_num_color_sets(0) {} 10 | builder(uint64_t num_colors) { init(num_colors); } 11 | 12 | /* Derive the two size thresholds from num_colors, write an initial offset of 0 and reset the counters. */ void init(uint64_t num_colors) { 13 | m_num_colors = num_colors; 14 | 15 | /* if set contains < sparse_set_threshold_size ints, code it with gaps+delta */ 16 | m_sparse_set_threshold_size = 0.25 * m_num_colors; 17 | 18 | /* if set contains >= very_dense_set_threshold_size ints, code it as a complementary set 19 | with gaps+delta */ 20 | m_very_dense_set_threshold_size = 0.75 * m_num_colors; 21 | /* otherwise: code it as a bitmap of m_num_colors bits */ 22 | 23 | std::cout << "m_num_colors " << m_num_colors << std::endl; 24 | std::cout << "m_sparse_set_threshold_size " << m_sparse_set_threshold_size << std::endl; 25 | std::cout << "m_very_dense_set_threshold_size " << m_very_dense_set_threshold_size 26 | << std::endl; 27 | 28 | m_bvb.reserve(320 * essentials::GB); 29 | m_offsets.push_back(0); 30 | 31 | m_num_color_sets = 0; 32 | m_num_total_integers = 0; 33 | } 34 | 35 | /* Append one color set of size values. NOTE(review): the first branch reads color_set[0], so callers presumably never pass size == 0 - confirm. */ void process(uint32_t const* color_set, const uint64_t size) // 36 | { 37 | bits::util::write_delta(m_bvb, size); /* encode size */ 38 | if (size < m_sparse_set_threshold_size) { 39 | 
/* sparse set: first value as is, then gaps val - (prev_val + 1) */ uint32_t prev_val = color_set[0]; 40 | bits::util::write_delta(m_bvb, prev_val); 41 | for (uint64_t i = 1; i != size; ++i) { 42 | uint32_t val = color_set[i]; 43 | assert(val >= prev_val + 1); 44 | bits::util::write_delta(m_bvb, val - (prev_val + 1)); 45 | prev_val = val; 46 | } 47 | } else if (size < m_very_dense_set_threshold_size) { 48 | /* medium set: bitmap of m_num_colors bits with one bit set per color */ bits::bit_vector::builder bvb; 49 | bvb.resize(m_num_colors); 50 | for (uint64_t i = 0; i != size; ++i) bvb.set(color_set[i]); 51 | m_bvb.append(bvb); 52 | } else { 53 | /* dense set: write the colors absent from color_set (its complement), again as gaps */ bool first = true; 54 | uint32_t val = 0; 55 | uint32_t prev_val = -1; /* wraps to the largest uint32_t, so prev_val + 1 is 0 until a value has been written */ 56 | uint32_t written = 0; 57 | for (uint64_t i = 0; i != size; ++i) { 58 | uint32_t x = color_set[i]; 59 | while (val < x) { 60 | if (first) { 61 | bits::util::write_delta(m_bvb, val); 62 | first = false; 63 | ++written; 64 | } else { 65 | assert(val >= prev_val + 1); 66 | bits::util::write_delta(m_bvb, val - (prev_val + 1)); 67 | ++written; 68 | } 69 | prev_val = val; 70 | ++val; 71 | } 72 | assert(val == x); 73 | val++; // skip x 74 | } 75 | /* absent colors larger than the last color of the set */ while (val < m_num_colors) { 76 | assert(val >= prev_val + 1); 77 | bits::util::write_delta(m_bvb, val - (prev_val + 1)); 78 | prev_val = val; 79 | ++val; 80 | ++written; 81 | } 82 | assert(val == m_num_colors); 83 | /* complementary_set_size = m_num_colors - size */ 84 | assert(m_num_colors - size <= m_num_colors); 85 | assert(written == m_num_colors - size); 86 | } 87 | /* end of this set in the bit stream = start of the next one */ m_offsets.push_back(m_bvb.num_bits()); 88 | m_num_total_integers += size; 89 | m_num_color_sets += 1; 90 | if (m_num_color_sets % 500000 == 0) { 91 | std::cout << " processed " << m_num_color_sets << " color sets" << std::endl; 92 | } 93 | } 94 | 95 | /* Append all sets of hb; its offsets are shifted by the current end of this builder's bit stream. */ void append(hybrid::builder& hb) { 96 | if (hb.m_num_color_sets == 0) return; 97 | m_bvb.append(hb.m_bvb); 98 | assert(m_offsets.size() > 0); 99 | uint64_t delta = m_offsets.back(); 100 | m_offsets.reserve(m_offsets.size() + hb.m_offsets.size()); 101 | for (uint64_t i = 1; i != hb.m_offsets.size(); ++i) { 102 | 
m_offsets.push_back(hb.m_offsets[i] + delta); 103 | } 104 | m_num_color_sets += hb.m_num_color_sets; 105 | m_num_total_integers += hb.m_num_total_integers; 106 | assert(m_num_color_sets == m_offsets.size() - 1); 107 | } 108 | 109 | /* Move the result into h: parameters, encoded offsets and the bit vector; prints size statistics. */ void build(hybrid& h) { 110 | h.m_num_colors = m_num_colors; 111 | h.m_sparse_set_threshold_size = m_sparse_set_threshold_size; 112 | h.m_very_dense_set_threshold_size = m_very_dense_set_threshold_size; 113 | 114 | std::cout << "processed " << m_num_color_sets << " color sets" << std::endl; 115 | std::cout << "m_num_total_integers " << m_num_total_integers << std::endl; 116 | assert(m_num_color_sets == m_offsets.size() - 1); 117 | 118 | h.m_offsets.encode(m_offsets.begin(), m_offsets.size(), m_offsets.back()); 119 | m_bvb.build(h.m_color_sets); 120 | 121 | std::cout << " total bits for ints = " << 8 * h.m_color_sets.num_bytes() << std::endl; 122 | std::cout << " total bits per offsets = " << 8 * h.m_offsets.num_bytes() << std::endl; 123 | std::cout << " total bits = " 124 | << 8 * (h.m_color_sets.num_bytes() + h.m_offsets.num_bytes()) << std::endl; 125 | std::cout << " offsets: " << (8.0 * h.m_offsets.num_bytes()) / m_num_total_integers 126 | << " bits/int" << std::endl; 127 | std::cout << " color sets: " 128 | << (8.0 * h.m_color_sets.num_bytes()) / m_num_total_integers << " bits/int" 129 | << std::endl; 130 | } 131 | 132 | /* Empty the builder and initialise it again for the same number of colors. */ void clear() { 133 | m_offsets.clear(); 134 | m_bvb.clear(); 135 | init(m_num_colors); 136 | } 137 | 138 | private: 139 | uint32_t m_num_colors; 140 | uint32_t m_sparse_set_threshold_size; 141 | uint32_t m_very_dense_set_threshold_size; 142 | uint64_t m_num_color_sets; 143 | uint64_t m_num_total_integers; 144 | 145 | bits::bit_vector::builder m_bvb; 146 | std::vector m_offsets; 147 | }; 148 | 149 | /* Forward iterator over one color set; rewind() selects the decoding from the stored set size. */ struct forward_iterator { 150 | forward_iterator() {} 151 | 152 | forward_iterator(hybrid const* ptr, uint64_t begin) 153 | : m_ptr(ptr) 154 | , m_bitmap_begin(begin) 155 | , m_color_sets_begin(begin) 156 | , 
m_num_colors(ptr->m_num_colors) { 157 | rewind(); 158 | } 159 | 160 | /* Reset all state, read the set size and position on the first color. */ void rewind() { 161 | m_pos_in_set = 0; 162 | m_pos_in_comp_set = 0; 163 | m_comp_set_size = 0; 164 | m_comp_val = -1; 165 | m_prev_val = -1; 166 | m_curr_val = 0; 167 | m_it = (m_ptr->m_color_sets).get_iterator_at(m_color_sets_begin); 168 | m_size = bits::util::read_delta(m_it); 169 | /* set m_encoding_type and read the first value */ 170 | if (m_size < m_ptr->m_sparse_set_threshold_size) { 171 | m_encoding_type = encoding_t::delta_gaps; 172 | m_curr_val = bits::util::read_delta(m_it); 173 | } else if (m_size < m_ptr->m_very_dense_set_threshold_size) { 174 | m_encoding_type = encoding_t::bitmap; 175 | m_bitmap_begin = m_it.position(); // after m_size 176 | m_it.skip_to(m_bitmap_begin); 177 | uint64_t pos = m_it.next(); 178 | assert(pos >= m_bitmap_begin); 179 | m_curr_val = pos - m_bitmap_begin; 180 | } else { 181 | /* only the absent colors are stored: load the first one and skip forward to the first color that is present */ m_encoding_type = encoding_t::complement_delta_gaps; 182 | m_comp_set_size = m_num_colors - m_size; 183 | if (m_comp_set_size > 0) m_comp_val = bits::util::read_delta(m_it); 184 | next_comp_val(); 185 | } 186 | } 187 | 188 | /* this is needed to annul the next_comp_val() done in the constructor 189 | if we want to iterate through the complemented set */ 190 | void reinit_for_complemented_set_iteration() { 191 | assert(m_encoding_type == encoding_t::complement_delta_gaps); 192 | m_pos_in_comp_set = 0; 193 | m_prev_val = -1; 194 | m_curr_val = 0; 195 | m_it = (m_ptr->m_color_sets).get_iterator_at(m_color_sets_begin); 196 | bits::util::read_delta(m_it); /* skip m_size */ 197 | if (m_comp_set_size > 0) { 198 | m_comp_val = bits::util::read_delta(m_it); 199 | } else { 200 | m_comp_val = m_num_colors; 201 | } 202 | } 203 | 204 | uint64_t value() const { return m_curr_val; } 205 | uint64_t comp_value() const { return m_comp_val; } 206 | uint64_t operator*() const { return value(); } 207 | 208 | /* Advance to the next color; saturates at m_num_colors after the last one. */ void next() { 209 | if (m_encoding_type == encoding_t::complement_delta_gaps) { 210 | 
/* complement encoding: try the next color, then skip it for as long as it is an absent one */ ++m_curr_val; 211 | if (m_curr_val >= m_num_colors) { // saturate 212 | m_curr_val = m_num_colors; 213 | return; 214 | } 215 | next_comp_val(); 216 | } else if (m_encoding_type == encoding_t::delta_gaps) { 217 | m_pos_in_set += 1; 218 | if (m_pos_in_set >= m_size) { // saturate 219 | m_curr_val = m_num_colors; 220 | return; 221 | } 222 | /* gap encoding: next = previous + gap + 1 */ m_prev_val = m_curr_val; 223 | m_curr_val = bits::util::read_delta(m_it) + (m_prev_val + 1); 224 | } else { 225 | assert(m_encoding_type == encoding_t::bitmap); 226 | m_pos_in_set += 1; 227 | if (m_pos_in_set >= m_size) { // saturate 228 | m_curr_val = m_num_colors; 229 | return; 230 | } 231 | /* bitmap: m_it.next() yields a bit position (presumably the next set bit); the color is its distance from m_bitmap_begin */ uint64_t pos = m_it.next(); 232 | assert(pos >= m_bitmap_begin); 233 | m_curr_val = pos - m_bitmap_begin; 234 | } 235 | } 236 | 237 | /* Advance over the stored absent colors themselves (m_comp_val); saturates at m_num_colors. */ void next_comp() { 238 | ++m_pos_in_comp_set; 239 | if (m_pos_in_comp_set >= m_comp_set_size) { // saturate 240 | m_comp_val = m_num_colors; 241 | return; 242 | } 243 | m_prev_val = m_comp_val; 244 | m_comp_val = bits::util::read_delta(m_it) + (m_prev_val + 1); 245 | } 246 | 247 | void operator++() { next(); } 248 | 249 | /* update the state of the iterator to the element 250 | which is greater-than or equal-to lower_bound */ 251 | void next_geq(const uint64_t lower_bound) { 252 | assert(lower_bound <= num_colors()); 253 | if (m_encoding_type == encoding_t::complement_delta_gaps) { 254 | if (value() > lower_bound) return; 255 | next_geq_comp_val(lower_bound); 256 | /* lower_bound may be an absent color, and so may the colors right after it (e.g. absent {3,4} and lower_bound 3): start at lower_bound and let next_comp_val() skip the whole run, which also keeps m_comp_val ahead of m_curr_val for later next() calls */ m_curr_val = lower_bound; next_comp_val(); 257 | } else { 258 | while (value() < lower_bound) next(); 259 | } 260 | assert(value() >= lower_bound); 261 | } 262 | 263 | uint32_t size() const { return m_size; } 264 | uint32_t num_colors() const { return m_num_colors; } 265 | int encoding_type() const { return m_encoding_type; } 266 | 267 | private: 268 | hybrid const* m_ptr; 269 | uint64_t m_bitmap_begin; 270 | uint64_t m_color_sets_begin; 271 | uint32_t m_num_colors; 272 | int m_encoding_type; 273 | 274 | bits::bit_vector::iterator m_it; 
275 | uint32_t m_pos_in_set; 276 | uint32_t m_size; 277 | 278 | uint32_t m_pos_in_comp_set; 279 | uint32_t m_comp_set_size; 280 | 281 | uint32_t m_comp_val; 282 | uint32_t m_prev_val; 283 | uint32_t m_curr_val; 284 | 285 | /* While the current color is an absent one, step past it and load the next absent color; stops reading once all m_comp_set_size absent colors were consumed. */ void next_comp_val() { 286 | while (m_curr_val == m_comp_val) { 287 | ++m_curr_val; 288 | ++m_pos_in_comp_set; 289 | if (m_pos_in_comp_set >= m_comp_set_size) break; 290 | m_prev_val = m_comp_val; 291 | m_comp_val = bits::util::read_delta(m_it) + (m_prev_val + 1); 292 | } 293 | } 294 | 295 | /* Advance m_comp_val to the first absent color >= lower_bound (or stop when the absent colors run out). */ void next_geq_comp_val(const uint64_t lower_bound) { 296 | while (m_comp_val < lower_bound) { 297 | ++m_pos_in_comp_set; 298 | if (m_pos_in_comp_set >= m_comp_set_size) break; 299 | m_prev_val = m_comp_val; 300 | m_comp_val = bits::util::read_delta(m_it) + (m_prev_val + 1); 301 | } 302 | } 303 | }; 304 | 305 | typedef forward_iterator iterator_type; 306 | 307 | /* Iterator positioned at the start of set color_set_id, located through m_offsets. */ forward_iterator color_set(uint64_t color_set_id) const { 308 | assert(color_set_id < num_color_sets()); 309 | uint64_t begin = m_offsets.access(color_set_id); 310 | return forward_iterator(this, begin); 311 | } 312 | 313 | uint32_t num_colors() const { return m_num_colors; } 314 | uint64_t num_color_sets() const { return m_offsets.size() - 1; } 315 | 316 | /* size of the data members, in bits */ uint64_t num_bits() const { 317 | return (sizeof(m_num_colors) + sizeof(m_sparse_set_threshold_size) + 318 | sizeof(m_very_dense_set_threshold_size) + m_offsets.num_bytes() + 319 | m_color_sets.num_bytes()) * 320 | 8; 321 | } 322 | 323 | void print_stats() const; 324 | 325 | /* NOTE(review): "template" is not followed by a parameter list here or below; it looks lost in this listing (Visitor and T are otherwise undeclared) - confirm against the original header. */ template 326 | void visit(Visitor& visitor) { 327 | visit_impl(visitor, *this); 328 | } 329 | 330 | template 331 | void visit(Visitor& visitor) const { 332 | visit_impl(visitor, *this); 333 | } 334 | 335 | private: 336 | template 337 | static void visit_impl(Visitor& visitor, T&& t) { 338 | visitor.visit(t.m_num_colors); 339 | visitor.visit(t.m_sparse_set_threshold_size); 340 | visitor.visit(t.m_very_dense_set_threshold_size); 341 | visitor.visit(t.m_offsets); 342 | 
visitor.visit(t.m_color_sets); 343 | } 344 | 345 | uint32_t m_num_colors; 346 | uint32_t m_sparse_set_threshold_size; 347 | uint32_t m_very_dense_set_threshold_size; 348 | 349 | bits::elias_fano m_offsets; 350 | bits::bit_vector m_color_sets; 351 | }; 352 | 353 | } // namespace fulgor 354 | -------------------------------------------------------------------------------- /include/color_sets/meta.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fulgor { 4 | 5 | /* Two-level color sets: each set is a list of meta colors, and each meta color selects one partial color set inside a partition (see forward_iterator::update_partition()). NOTE(review): the template parameter list is missing in this listing; the body refers to ColorSets, so presumably it declared that name - confirm. */ template 6 | struct meta { 7 | static const index_t type = index_t::META; 8 | 9 | /* start of a partition: its first color and the number of partial color sets stored in the partitions before it */ struct partition_endpoint { 10 | template 11 | void visit(Visitor& visitor) { 12 | visitor.visit(min_color); 13 | visitor.visit(num_color_sets_before); 14 | } 15 | uint32_t min_color; 16 | uint32_t num_color_sets_before; 17 | }; 18 | 19 | struct builder { 20 | builder() : m_offset(0) { m_meta_color_sets_offsets.push_back(0); } 21 | 22 | /* Size the meta-color storage and build the partition endpoints from running totals of num_color_sets_in_partitions. NOTE(review): partition_sizes must start with 0 and is read at one index past the number of partitions, so it presumably holds cumulative color counts - confirm. */ void init_meta_color_sets_builder( 23 | uint64_t num_integers_in_metacolor_sets, uint64_t num_color_sets, 24 | std::vector const& partition_sizes, 25 | std::vector const& num_color_sets_in_partitions) // 26 | { 27 | m_meta_color_sets_builder.resize(num_integers_in_metacolor_sets, 28 | std::ceil(std::log2(num_color_sets))); 29 | m_partition_endpoints.reserve(num_color_sets_in_partitions.size()); 30 | assert(partition_sizes.front() == 0); 31 | m_partition_endpoints.push_back({partition_sizes[0], 0}); 32 | for (uint32_t i = 0, val = 0; i != num_color_sets_in_partitions.size(); ++i) { 33 | val += num_color_sets_in_partitions[i]; 34 | m_partition_endpoints.push_back({partition_sizes[i + 1], val}); 35 | } 36 | } 37 | 38 | /* record the number of colors and create one (still uninitialised) color-set builder per partition */ void init_color_sets_builder(uint64_t num_colors, uint64_t num_partitions) { 39 | m_num_colors = num_colors; 40 | m_color_sets_builders.resize(num_partitions); 41 | } 42 | 43 | void init_partition(uint64_t partition_id, uint64_t num_colors_in_partition) { 44 | assert(partition_id < m_color_sets_builders.size()); 45 
| m_color_sets_builders[partition_id].init(num_colors_in_partition); 46 | } 47 | 48 | /* forward one partial color set to the builder of its partition */ void process_color_set(uint64_t partition_id, uint32_t const* color_set, 49 | const uint64_t size) { 50 | assert(partition_id < m_color_sets_builders.size()); 51 | m_color_sets_builders[partition_id].process(color_set, size); 52 | } 53 | 54 | /* Store one meta color set as its size followed by its meta colors, and record the running offset. */ void process_metacolor_set(uint32_t const* metacolor_set, const uint64_t size) { 55 | assert(size < (1ULL << m_meta_color_sets_builder.width())); 56 | m_meta_color_sets_builder.push_back(size); 57 | for (uint64_t i = 0; i != size; ++i) { 58 | m_meta_color_sets_builder.push_back(metacolor_set[i]); 59 | } 60 | m_offset += size + 1; 61 | m_meta_color_sets_offsets.push_back(m_offset); 62 | } 63 | 64 | /* Build every partition's color sets into m and hand over the meta color sets, their offsets and the endpoints. */ void build(meta& m) { 65 | m.m_num_colors = m_num_colors; 66 | m_meta_color_sets_builder.build(m.m_meta_color_sets); 67 | m.m_partial_color_sets.resize(m_color_sets_builders.size()); 68 | for (uint64_t i = 0; i != m_color_sets_builders.size(); ++i) { 69 | m_color_sets_builders[i].build(m.m_partial_color_sets[i]); 70 | } 71 | m.m_meta_color_sets_offsets.encode(m_meta_color_sets_offsets.begin(), 72 | m_meta_color_sets_offsets.size(), 73 | m_meta_color_sets_offsets.back()); 74 | m.m_partition_endpoints.swap(m_partition_endpoints); 75 | } 76 | 77 | private: 78 | bits::compact_vector::builder m_meta_color_sets_builder; 79 | std::vector m_color_sets_builders; 80 | 81 | uint64_t m_num_colors; 82 | uint64_t m_offset; 83 | std::vector m_meta_color_sets_offsets; 84 | 85 | std::vector m_partition_endpoints; 86 | }; 87 | 88 | /* Forward iterator over one color set: walks the meta colors in order and, for each, the partial color set it selects, adding the partition's min_color to every value. */ struct forward_iterator { 89 | forward_iterator(meta const* ptr, uint64_t begin) 90 | : m_ptr(ptr) 91 | , m_begin(begin) 92 | , m_meta_color_set_size((m_ptr->m_meta_color_sets)[m_begin]) { 93 | rewind(); 94 | } 95 | 96 | /* restart at the first meta color and open its partial color set */ void rewind() { 97 | init(); 98 | assert(m_meta_color_set_size > 0); 99 | change_partition(); 100 | } 101 | 102 | void init() { 103 | m_pos_in_meta_color_list = 0; 104 | m_partition_id = 0; 105 | m_partition_min_color = 0; 
106 | } 107 | 108 | uint64_t value() const { return m_curr_val; } 109 | uint64_t operator*() const { return value(); } 110 | 111 | /* true while the position inside the current partial set has not reached its size */ bool has_next() const { return m_pos_in_curr_partition != m_curr_partition_size; } 112 | /* step inside the current partial set and refresh the exposed value */ void next_in_partition() { 113 | m_pos_in_curr_partition += 1; 114 | m_curr_partition_it.next(); 115 | update_curr_val(); 116 | } 117 | 118 | /* Advance by one color; moves on to the next meta color when the current partial set is on its last value and saturates at num_colors() after the last meta color. */ void next() { 119 | if (m_pos_in_curr_partition == m_curr_partition_size - 1) { 120 | if (m_pos_in_meta_color_list == meta_color_set_size() - 1) { // saturate 121 | m_curr_val = num_colors(); 122 | return; 123 | } 124 | m_pos_in_meta_color_list += 1; 125 | change_partition(); 126 | } else { 127 | next_in_partition(); 128 | } 129 | } 130 | void operator++() { next(); } 131 | 132 | /* update the state of the iterator to the element 133 | which is greater-than or equal-to lower_bound */ 134 | void next_geq(const uint64_t lower_bound) { 135 | assert(lower_bound <= num_colors()); 136 | while (value() < lower_bound) next(); 137 | assert(value() >= lower_bound); 138 | } 139 | 140 | /* Warning: this might be slow. 
*/ 141 | /* Total number of colors: sum of the sizes of all partial sets referenced by this meta color set. NOTE(review): the sum is accumulated in 64 bits but returned as uint32_t. */ uint32_t size() const { 142 | uint64_t n = 0; 143 | for (uint32_t i = 0, partition_id = 0; i != meta_color_set_size(); ++i) { 144 | uint32_t meta_color = (m_ptr->m_meta_color_sets)[m_begin + 1 + i]; 145 | partition_id = update_partition_id(meta_color, partition_id); 146 | uint32_t num_color_sets_before = 147 | (m_ptr->m_partition_endpoints)[partition_id].num_color_sets_before; 148 | n += (m_ptr->m_partial_color_sets)[partition_id] 149 | .color_set(meta_color - num_color_sets_before) 150 | .size(); 151 | } 152 | return n; 153 | } 154 | 155 | uint32_t partial_set_size() const { return m_curr_partition_it.size(); } 156 | 157 | uint32_t meta_color() const { return m_curr_meta_color; } 158 | 159 | /* load the meta color at the current list position and advance m_partition_id to the partition containing it */ void read_partition_id() { 160 | m_curr_meta_color = (m_ptr->m_meta_color_sets)[m_begin + 1 + m_pos_in_meta_color_list]; 161 | m_partition_id = update_partition_id(m_curr_meta_color, m_partition_id); 162 | } 163 | 164 | /* Move to the partition of the next meta color; saturates at num_partitions(). */ void next_partition_id() { 165 | m_pos_in_meta_color_list += 1; 166 | if (m_pos_in_meta_color_list == meta_color_set_size()) { // saturate 167 | m_partition_id = num_partitions(); 168 | return; 169 | } 170 | read_partition_id(); 171 | } 172 | 173 | void next_geq_partition_id(const uint32_t lower_bound) { 174 | assert(lower_bound <= num_partitions()); 175 | while (partition_id() < lower_bound) next_partition_id(); 176 | assert(partition_id() >= lower_bound); 177 | } 178 | 179 | /* Load the bounds of the current partition and open the partial color set selected by the current meta color. */ void update_partition() { 180 | /* update partition min/max color */ 181 | auto const& endpoints = m_ptr->m_partition_endpoints; 182 | m_partition_min_color = endpoints[m_partition_id].min_color; 183 | m_partition_max_color = endpoints[m_partition_id + 1].min_color; 184 | 185 | /* meta colors are numbered across partitions; subtracting the sets of earlier partitions gives the index inside this one */ uint32_t num_color_sets_before = endpoints[m_partition_id].num_color_sets_before; 186 | m_curr_partition_it = (m_ptr->m_partial_color_sets)[m_partition_id].color_set( 187 | m_curr_meta_color - num_color_sets_before); 188 | m_curr_partition_size = m_curr_partition_it.size(); 189 | 
assert(m_curr_partition_size > 0); 190 | m_pos_in_curr_partition = 0; 191 | 192 | update_curr_val(); 193 | } 194 | 195 | /* read the meta color at the current position, then open the partial set it selects */ void change_partition() { 196 | read_partition_id(); 197 | update_partition(); 198 | } 199 | 200 | uint32_t partition_id() const { return m_partition_id; } 201 | uint32_t meta_color_set_size() const { return m_meta_color_set_size; } 202 | uint32_t num_colors() const { return m_ptr->num_colors(); } 203 | uint32_t num_partitions() const { return m_ptr->num_partitions(); } 204 | uint32_t partition_min_color() const { return m_partition_min_color; } 205 | uint32_t partition_max_color() const { return m_partition_max_color; } 206 | uint32_t num_color_sets_before() const { 207 | return m_ptr->m_partition_endpoints[m_partition_id].num_color_sets_before; 208 | } 209 | 210 | private: 211 | meta const* m_ptr; 212 | typename ColorSets::iterator_type m_curr_partition_it; 213 | uint64_t m_begin; 214 | uint32_t m_curr_meta_color, m_curr_val; 215 | uint32_t m_meta_color_set_size, m_pos_in_meta_color_list; 216 | uint32_t m_curr_partition_size, m_pos_in_curr_partition; 217 | uint32_t m_partition_id; 218 | uint32_t m_partition_min_color, m_partition_max_color; 219 | 220 | /* exposed color = value inside the partial set + first color of the partition */ void update_curr_val() { m_curr_val = m_curr_partition_it.value() + m_partition_min_color; } 221 | 222 | /* Advance partition_id while meta_color is not below the num_color_sets_before of the following endpoint; returns the resulting id. */ uint32_t update_partition_id(const uint32_t meta_color, uint32_t partition_id) const { 223 | auto const& endpoints = m_ptr->m_partition_endpoints; 224 | while (partition_id + 1 < endpoints.size() and 225 | meta_color >= endpoints[partition_id + 1].num_color_sets_before) { 226 | partition_id += 1; 227 | } 228 | assert(partition_id < m_ptr->num_partitions()); 229 | return partition_id; 230 | } 231 | }; 232 | 233 | typedef forward_iterator iterator_type; 234 | 235 | /* Iterator for color set color_set_id, starting at its offset in m_meta_color_sets. */ forward_iterator color_set(uint64_t color_set_id) const { 236 | assert(color_set_id < num_color_sets()); 237 | uint64_t begin = m_meta_color_sets_offsets.access(color_set_id); 238 | return forward_iterator(this, begin); 239 | } 240 
| 241 | std::vector const& partial_colors() const { return m_partial_color_sets; } 242 | 243 | uint32_t num_colors() const { return m_num_colors; } 244 | uint64_t num_color_sets() const { return m_meta_color_sets_offsets.size() - 1; } 245 | uint64_t num_partitions() const { return m_partition_endpoints.size() - 1; } 246 | 247 | /* size in bits: all partial color sets plus the other data members */ uint64_t num_bits() const { 248 | uint64_t num_bits_colors = sizeof(size_t) * 8; // for std::vector::size 249 | for (auto const& c : m_partial_color_sets) num_bits_colors += c.num_bits(); 250 | return num_bits_colors + 251 | (m_meta_color_sets_offsets.num_bytes() + m_meta_color_sets.num_bytes() + 252 | essentials::vec_bytes(m_partition_endpoints) + sizeof(m_num_colors)) * 253 | 8; 254 | } 255 | 256 | void print_stats() const; 257 | 258 | /* NOTE(review): "template" is not followed by a parameter list here or below; it looks lost in this listing (Visitor and T are otherwise undeclared) - confirm against the original header. */ template 259 | void visit(Visitor& visitor) { 260 | visit_impl(visitor, *this); 261 | } 262 | 263 | template 264 | void visit(Visitor& visitor) const { 265 | visit_impl(visitor, *this); 266 | } 267 | 268 | private: 269 | template 270 | static void visit_impl(Visitor& visitor, T&& t) { 271 | visitor.visit(t.m_num_colors); 272 | visitor.visit(t.m_meta_color_sets); 273 | visitor.visit(t.m_meta_color_sets_offsets); 274 | visitor.visit(t.m_partial_color_sets); 275 | visitor.visit(t.m_partition_endpoints); 276 | } 277 | 278 | uint32_t m_num_colors; 279 | bits::compact_vector m_meta_color_sets; 280 | bits::elias_fano m_meta_color_sets_offsets; 281 | std::vector m_partial_color_sets; 282 | std::vector m_partition_endpoints; 283 | }; 284 | 285 | } // namespace fulgor 286 | -------------------------------------------------------------------------------- /include/color_sets/meta_differential.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fulgor { 4 | 5 | /* Meta color sets whose partitions are `differential` color sets; each set is a list of partitions plus one relative color per listed partition (see builder::process_metacolor_set()). */ struct meta_differential { 6 | static const index_t type = index_t::META_DIFF; 7 | 8 | /* per partition: its first color and how many color sets it stores (filled in builder::process_partition()) */ struct partition_endpoint { 9 | template 10 | void visit(Visitor& visitor) { 11 | visitor.visit(min_color); 12 
| visitor.visit(num_color_sets); 13 | } 14 | uint64_t min_color; 15 | uint64_t num_color_sets; 16 | }; 17 | 18 | struct builder { 19 | builder() : m_prev_docs(0), m_prev_partition_set_id(0) { 20 | m_partition_sets_offsets.push_back(0); 21 | m_relative_colors_offsets.push_back(0); 22 | } 23 | 24 | /* record the number of colors and reserve room for num_partitions partitions */ void init(uint64_t num_colors, uint64_t num_partitions) { 25 | m_num_colors = num_colors; 26 | m_partition_endpoints.reserve(num_partitions); 27 | m_partial_color_sets.reserve(num_partitions); 28 | } 29 | 30 | void init_meta_color_partition_sets(uint64_t num_sets) { 31 | m_num_partition_sets = num_sets; 32 | m_partition_sets_offsets.reserve(num_sets); 33 | } 34 | 35 | /* Write one partition set: its size, its first id, then the differences between consecutive ids, all delta-coded; the end position is recorded as an offset. NOTE(review): reads partition_set[0], so the set is presumably never empty - confirm. */ void process_meta_color_partition_set(vector& partition_set) { 36 | uint64_t size = partition_set.size(); 37 | uint64_t prev_val = partition_set[0]; 38 | 39 | bits::util::write_delta(m_partition_sets, size); 40 | bits::util::write_delta(m_partition_sets, prev_val); 41 | for (uint64_t i = 1; i < size; i++) { 42 | assert(prev_val < partition_set[i]); 43 | bits::util::write_delta(m_partition_sets, partition_set[i] - prev_val); 44 | prev_val = partition_set[i]; 45 | } 46 | 47 | m_partition_sets_offsets.push_back(m_partition_sets.num_bits()); 48 | } 49 | 50 | /* Register partition d; its first color is the total number of colors of the partitions added before it. */ void process_partition(differential& d) { 51 | m_partial_color_sets.push_back(d); 52 | m_partition_endpoints.push_back({m_prev_docs, d.num_color_sets()}); 53 | m_prev_docs += d.num_colors(); 54 | } 55 | 56 | /* Append the relative colors of one meta color set. One bit per call is pushed to m_partition_sets_partitions, and the previously pushed bit is set whenever partition_set_id differs from the previous call. */ void process_metacolor_set(uint64_t partition_set_id, // 57 | vector& partition_set, // 58 | vector& relative_colors) // 59 | { 60 | assert(partition_set.size() == relative_colors.size()); 61 | if (partition_set_id != m_prev_partition_set_id) { 62 | m_prev_partition_set_id = partition_set_id; 63 | m_partition_sets_partitions.set(m_partition_sets_partitions.num_bits() - 1); 64 | } 65 | m_partition_sets_partitions.push_back(false); 66 | 67 | uint64_t size = partition_set.size(); 68 | for (uint64_t i = 0; i < size; i++) { 69 | uint64_t partition_id = 
partition_set[i]; 70 | uint64_t relative_id = relative_colors[i]; 71 | /* each relative color is written with msbll(number of color sets in its partition) + 1 bits */ uint64_t partition_size = m_partition_endpoints[partition_id].num_color_sets; 72 | m_relative_colors.append_bits(relative_id, bits::util::msbll(partition_size) + 1); 73 | } 74 | m_relative_colors_offsets.push_back(m_relative_colors.num_bits()); 75 | } 76 | 77 | /* Move everything into m: swap in the partitions and endpoints, build the bit vectors and the rank index, encode both offset sequences. */ void build(meta_differential& m) { 78 | m.m_num_colors = m_num_colors; 79 | m.m_num_partition_sets = m_num_partition_sets; 80 | m.m_partial_color_sets.swap(m_partial_color_sets); 81 | 82 | m_relative_colors.build(m.m_relative_colors); 83 | m_partition_sets.build(m.m_partition_sets); 84 | m_partition_sets_partitions.build(m.m_partition_sets_partitions); 85 | m.m_partition_sets_partitions_rank1_index.build(m.m_partition_sets_partitions); 86 | 87 | m.m_partition_sets_offsets.encode(m_partition_sets_offsets.begin(), 88 | m_partition_sets_offsets.size(), 89 | m_partition_sets_offsets.back()); 90 | m.m_relative_colors_offsets.encode(m_relative_colors_offsets.begin(), 91 | m_relative_colors_offsets.size(), 92 | m_relative_colors_offsets.back()); 93 | 94 | m.m_partition_endpoints.swap(m_partition_endpoints); 95 | } 96 | 97 | private: 98 | std::vector m_partial_color_sets; 99 | 100 | bits::bit_vector::builder m_relative_colors; 101 | bits::bit_vector::builder m_partition_sets; 102 | bits::bit_vector::builder m_partition_sets_partitions; 103 | 104 | std::vector m_partition_sets_offsets; 105 | std::vector m_relative_colors_offsets; 106 | 107 | uint64_t m_num_colors, m_prev_docs; 108 | uint64_t m_num_partition_sets; 109 | 110 | uint64_t m_prev_partition_set_id; 111 | 112 | std::vector m_partition_endpoints; 113 | }; 114 | 115 | /* Forward iterator over one color set, given the bit positions where its partition set and its relative colors start. */ struct forward_iterator { 116 | forward_iterator(meta_differential const* ptr, // 117 | uint64_t begin_partition_set, uint64_t begin_rel) 118 | : m_ptr(ptr) 119 | , m_begin_partition_set(begin_partition_set) 120 | , m_begin_rel(begin_rel) // 121 | { 122 | rewind(); 123 | assert(m_meta_color_set_size > 0); 124 | } 125 | 126 | 
void rewind() { 127 | init(); 128 | change_partition(); 129 | } 130 | 131 | void init() { 132 | m_num_color_sets_before = 0; 133 | m_pos_in_meta_color = 0; 134 | m_pos_in_partial_color = 0; 135 | m_curr_partition_id = 0; 136 | m_partition_set_id = (m_ptr->m_partition_sets).get_iterator_at(m_begin_partition_set); 137 | m_relative_colors_it = (m_ptr->m_relative_colors).get_iterator_at(m_begin_rel); 138 | m_meta_color_set_size = bits::util::read_delta(m_partition_set_id); 139 | } 140 | 141 | uint64_t value() const { return m_curr_val; } 142 | uint64_t operator*() const { return value(); } 143 | 144 | bool has_next() const { return m_pos_in_partial_color != m_curr_partition_size; } 145 | 146 | void next() { 147 | if (m_pos_in_partial_color == m_curr_partition_size - 1) { 148 | if (m_pos_in_meta_color == meta_color_set_size() - 1) { // saturate 149 | m_curr_val = num_colors(); 150 | return; 151 | } 152 | m_pos_in_meta_color += 1; 153 | change_partition(); 154 | } else { 155 | next_in_partition(); 156 | } 157 | } 158 | void operator++() { next(); } 159 | 160 | /* update the state of the iterator to the element 161 | which is greater-than or equal-to lower_bound */ 162 | void next_geq(const uint64_t lower_bound) { 163 | assert(lower_bound <= num_colors()); 164 | while (value() < lower_bound) next(); 165 | assert(value() >= lower_bound); 166 | } 167 | 168 | void next_in_partition() { 169 | m_pos_in_partial_color += 1; 170 | ++m_curr_partition_it; 171 | update_curr_val(); 172 | } 173 | 174 | void change_partition() { 175 | read_partition_id(); 176 | update_partition(); 177 | } 178 | 179 | void next_partition_id() { 180 | m_pos_in_meta_color += 1; 181 | if (m_pos_in_meta_color == meta_color_set_size()) { 182 | m_curr_partition_id = num_partitions(); 183 | return; 184 | } 185 | read_partition_id(); 186 | } 187 | 188 | void read_partition_id() { 189 | uint64_t delta = bits::util::read_delta(m_partition_set_id); 190 | for (uint64_t i = 0; i < delta; i++) { 191 | 
m_num_color_sets_before += 192 | m_ptr->m_partition_endpoints[m_curr_partition_id + i].num_color_sets; 193 | } 194 | m_curr_partition_id += delta; 195 | uint8_t relative_color_size = 196 | bits::util::msbll( 197 | m_ptr->m_partition_endpoints[m_curr_partition_id].num_color_sets) + 198 | 1; 199 | m_curr_relative_color = m_relative_colors_it.take(relative_color_size); 200 | } 201 | 202 | void next_geq_partition_id(const uint32_t min_color) { 203 | assert(min_color <= num_partitions()); 204 | while (partition_id() < min_color) next_partition_id(); 205 | assert(partition_id() >= min_color); 206 | } 207 | 208 | void update_partition() { 209 | m_partition_min_color = m_ptr->m_partition_endpoints[m_curr_partition_id].min_color; 210 | 211 | m_pos_in_partial_color = 0; 212 | m_curr_partition_it = 213 | m_ptr->m_partial_color_sets[m_curr_partition_id].color_set(m_curr_relative_color); 214 | m_curr_partition_size = m_curr_partition_it.size(); 215 | 216 | update_curr_val(); 217 | } 218 | 219 | uint64_t size() const { 220 | uint64_t size = 0; 221 | auto partition_set_it = // 222 | (m_ptr->m_partition_sets).get_iterator_at(m_begin_partition_set); 223 | auto rel_it = // 224 | (m_ptr->m_relative_colors).get_iterator_at(m_begin_rel); 225 | uint64_t partition_id = 0; 226 | bits::util::read_delta(partition_set_it); 227 | for (uint64_t i = 0; i != m_meta_color_set_size; ++i) { 228 | partition_id += bits::util::read_delta(partition_set_it); 229 | uint8_t relative_color_size = 230 | bits::util::msbll(m_ptr->m_partition_endpoints[partition_id].num_color_sets) + 231 | 1; 232 | uint64_t relative_color = rel_it.take(relative_color_size); 233 | size += m_ptr->m_partial_color_sets[partition_id].color_set(relative_color).size(); 234 | } 235 | return size; 236 | } 237 | 238 | uint32_t partial_set_size() const { return m_curr_partition_it.size(); } 239 | 240 | uint32_t partition_id() const { return m_curr_partition_id; } 241 | uint32_t partition_min_color() const { return m_partition_min_color; 
} 242 | uint32_t partition_max_color() const { 243 | return m_partition_min_color + m_curr_partition_it.num_colors(); 244 | } 245 | uint32_t meta_color() const { return m_num_color_sets_before + m_curr_relative_color; } 246 | uint32_t num_colors() const { return m_ptr->num_colors(); } 247 | uint32_t num_partitions() const { return m_ptr->num_partitions(); } 248 | uint64_t meta_color_set_size() const { return m_meta_color_set_size; } 249 | 250 | differential::iterator_type partition_it() const { return m_curr_partition_it; } 251 | 252 | private: 253 | meta_differential const* m_ptr; 254 | 255 | differential::iterator_type m_curr_partition_it; 256 | bits::bit_vector::iterator m_partition_set_id, m_relative_colors_it; 257 | 258 | uint64_t m_meta_color_set_size; 259 | uint64_t m_begin_partition_set, m_begin_rel; 260 | uint64_t m_pos_in_meta_color, m_pos_in_partial_color; 261 | uint64_t m_curr_relative_color; 262 | uint64_t m_curr_partition_id, m_curr_partition_size; 263 | uint64_t m_curr_val; 264 | uint64_t m_partition_min_color; 265 | uint64_t m_num_color_sets_before; 266 | 267 | void update_curr_val() { m_curr_val = m_partition_min_color + *m_curr_partition_it; } 268 | }; 269 | 270 | typedef forward_iterator iterator_type; 271 | 272 | forward_iterator color_set(uint64_t color_set_id) const { 273 | assert(color_set_id < num_color_sets()); 274 | uint64_t begin_partition_set = 275 | m_partition_sets_offsets.access(m_partition_sets_partitions_rank1_index.rank1( 276 | m_partition_sets_partitions, color_set_id)); 277 | uint64_t begin_rel = m_relative_colors_offsets.access(color_set_id); 278 | return forward_iterator(this, begin_partition_set, begin_rel); 279 | } 280 | 281 | uint32_t num_colors() const { return m_num_colors; } 282 | uint64_t num_color_sets() const { return m_relative_colors_offsets.size() - 1; } 283 | uint64_t num_partitions() const { return m_partition_endpoints.size(); } 284 | uint64_t num_partition_sets() const { return m_num_partition_sets; } 285 | 286 
| uint64_t num_bits() const { 287 | uint64_t num_bits_partial_color_sets = 0; 288 | for (auto const& c : m_partial_color_sets) num_bits_partial_color_sets += c.num_bits(); 289 | return num_bits_partial_color_sets + // 290 | (sizeof(size_t) + sizeof(m_num_colors) + sizeof(m_num_partition_sets) + 291 | m_relative_colors_offsets.num_bytes() + m_partition_sets_offsets.num_bytes() + 292 | essentials::vec_bytes(m_partition_endpoints) + m_relative_colors.num_bytes() + 293 | m_partition_sets.num_bytes() + m_partition_sets_partitions.num_bytes() + 294 | m_partition_sets_partitions_rank1_index.num_bytes()) * 295 | 8; 296 | } 297 | 298 | void print_stats() const; 299 | 300 | template 301 | void visit(Visitor& visitor) { 302 | visit_impl(visitor, *this); 303 | } 304 | 305 | template 306 | void visit(Visitor& visitor) const { 307 | visit_impl(visitor, *this); 308 | } 309 | 310 | private: 311 | template 312 | static void visit_impl(Visitor& visitor, T&& t) { 313 | visitor.visit(t.m_num_colors); 314 | visitor.visit(t.m_num_partition_sets); 315 | visitor.visit(t.m_partition_sets_offsets); 316 | visitor.visit(t.m_relative_colors_offsets); 317 | visitor.visit(t.m_partition_endpoints); 318 | visitor.visit(t.m_partial_color_sets); 319 | visitor.visit(t.m_relative_colors); 320 | visitor.visit(t.m_partition_sets); 321 | visitor.visit(t.m_partition_sets_partitions); 322 | visitor.visit(t.m_partition_sets_partitions_rank1_index); 323 | } 324 | 325 | uint32_t m_num_colors; 326 | uint32_t m_num_partition_sets; 327 | bits::elias_fano m_partition_sets_offsets, m_relative_colors_offsets; 328 | std::vector m_partition_endpoints; 329 | std::vector m_partial_color_sets; 330 | bits::bit_vector m_relative_colors; 331 | bits::bit_vector m_partition_sets; 332 | bits::bit_vector m_partition_sets_partitions; 333 | bits::rank9 m_partition_sets_partitions_rank1_index; 334 | }; 335 | 336 | } // namespace fulgor 337 | -------------------------------------------------------------------------------- 
/include/filenames.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fulgor { 4 | 5 | struct filenames { 6 | void build(std::vector const& filenames) { 7 | uint32_t offset = 0; 8 | m_offsets.push_back(offset); 9 | for (auto const& f : filenames) { 10 | std::copy(f.begin(), f.end(), std::back_inserter(m_chars)); 11 | offset += f.size(); 12 | m_offsets.push_back(offset); 13 | } 14 | } 15 | 16 | std::string_view operator[](uint64_t i) const { 17 | uint32_t begin = m_offsets[i]; 18 | uint32_t end = m_offsets[i + 1]; 19 | return {m_chars.data() + begin, end - begin}; 20 | } 21 | 22 | uint64_t num_bits() const { 23 | return essentials::vec_bytes(m_offsets) * 8 + essentials::vec_bytes(m_chars) * 8; 24 | } 25 | 26 | template 27 | void visit(Visitor& visitor) { 28 | visit_impl(visitor, *this); 29 | } 30 | 31 | template 32 | void visit(Visitor& visitor) const { 33 | visit_impl(visitor, *this); 34 | } 35 | 36 | private: 37 | template 38 | static void visit_impl(Visitor& visitor, T&& t) { 39 | visitor.visit(t.m_offsets); 40 | visitor.visit(t.m_chars); 41 | } 42 | 43 | std::vector m_offsets; 44 | std::vector m_chars; 45 | }; 46 | 47 | } // namespace fulgor -------------------------------------------------------------------------------- /include/index.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "external/sshash/include/dictionary.hpp" 4 | #include "external/sshash/external/pthash/external/bits/include/integer_codes.hpp" 5 | #include "external/sshash/external/pthash/external/bits/include/bit_vector.hpp" 6 | #include "external/sshash/external/pthash/external/bits/include/rank9.hpp" 7 | 8 | #include "filenames.hpp" 9 | #include "util.hpp" 10 | 11 | namespace fulgor { 12 | 13 | using kmer_type = sshash::default_kmer_t; 14 | using sshash_type = sshash::dictionary; 15 | 16 | template 17 | struct index { 18 | typedef ColorSets 
color_sets_type; 19 | 20 | struct builder; 21 | struct meta_builder; 22 | struct differential_builder; 23 | struct meta_differential_builder; 24 | 25 | typename color_sets_type::iterator_type color_set(uint64_t color_set_id) const { 26 | assert(color_set_id < num_color_sets()); 27 | return m_color_sets.color_set(color_set_id); 28 | } 29 | 30 | /* from unitig_id to color_set_id */ 31 | uint64_t u2c(uint64_t unitig_id) const { return m_u2c_rank1_index.rank1(m_u2c, unitig_id); } 32 | 33 | void pseudoalign_full_intersection(std::string const& sequence, // 34 | std::vector& results) const; 35 | 36 | void pseudoalign_threshold_union(std::string const& sequence, // 37 | std::vector& results, // 38 | const double threshold) const; 39 | 40 | std::string_view filename(uint64_t color) const { 41 | assert(color < num_colors()); 42 | return m_filenames[color]; 43 | } 44 | 45 | void print_stats() const; 46 | void dump(std::string const& basename) const; 47 | 48 | uint64_t k() const { return m_k2u.k(); } 49 | uint64_t num_colors() const { return m_color_sets.num_colors(); } 50 | uint64_t num_unitigs() const { return m_k2u.num_contigs(); } 51 | uint64_t num_color_sets() const { return m_color_sets.num_color_sets(); } 52 | 53 | sshash_type const& get_k2u() const { return m_k2u; } 54 | bits::bit_vector const& get_u2c() const { return m_u2c; } 55 | bits::rank9 const& get_u2c_rank1_index() const { return m_u2c_rank1_index; } 56 | ColorSets const& get_color_sets() const { return m_color_sets; } 57 | filenames const& get_filenames() const { return m_filenames; } 58 | 59 | template 60 | void visit(Visitor& visitor) { 61 | visit_impl(visitor, *this); 62 | } 63 | 64 | template 65 | void visit(Visitor& visitor) const { 66 | visit_impl(visitor, *this); 67 | } 68 | 69 | uint64_t num_bits() const { 70 | return m_k2u.num_bits() + (m_u2c.num_bytes() + m_u2c_rank1_index.num_bytes()) * 8 + 71 | m_color_sets.num_bits() + m_filenames.num_bits(); 72 | } 73 | 74 | private: 75 | template 76 | static 
void visit_impl(Visitor& visitor, T&& t) { 77 | visitor.visit(t.m_k2u); 78 | visitor.visit(t.m_u2c); 79 | visitor.visit(t.m_u2c_rank1_index); 80 | visitor.visit(t.m_color_sets); 81 | visitor.visit(t.m_filenames); 82 | } 83 | 84 | sshash_type m_k2u; 85 | bits::bit_vector m_u2c; 86 | bits::rank9 m_u2c_rank1_index; 87 | ColorSets m_color_sets; 88 | filenames m_filenames; 89 | }; 90 | 91 | } // namespace fulgor 92 | -------------------------------------------------------------------------------- /include/index_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "index.hpp" 4 | 5 | #include "builders/builder.hpp" 6 | #include "color_sets/hybrid.hpp" 7 | 8 | namespace fulgor { 9 | typedef index hybrid_colors_index_type; 10 | typedef hybrid_colors_index_type index_type; // in use 11 | } // namespace fulgor 12 | 13 | #include "builders/meta_builder.hpp" 14 | #include "color_sets/meta.hpp" 15 | 16 | namespace fulgor { 17 | typedef index> meta_hybrid_colors_index_type; 18 | typedef meta_hybrid_colors_index_type meta_index_type; // in use 19 | } // namespace fulgor 20 | 21 | #include "builders/differential_builder.hpp" 22 | #include "color_sets/differential.hpp" 23 | 24 | namespace fulgor { 25 | typedef index differential_colors_index_type; 26 | typedef differential_colors_index_type differential_index_type; // in use 27 | } // namespace fulgor 28 | 29 | #include "color_sets/meta_differential.hpp" 30 | #include "builders/meta_differential_builder.hpp" 31 | 32 | namespace fulgor { 33 | typedef index meta_differential_colors_index_type; 34 | typedef meta_differential_colors_index_type meta_differential_index_type; // in use 35 | } // namespace fulgor 36 | -------------------------------------------------------------------------------- /include/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 
7 | #include 8 | #include 9 | #include // for std::set_intersection 10 | 11 | #include "external/smhasher/src/City.h" 12 | #include "external/smhasher/src/City.cpp" 13 | 14 | namespace fulgor { 15 | 16 | enum class index_t { HYBRID, DIFF, META, META_DIFF }; 17 | enum encoding_t { delta_gaps, bitmap, complement_delta_gaps, symmetric_difference }; 18 | 19 | namespace constants { 20 | constexpr double invalid_threshold = -1.0; 21 | constexpr uint64_t default_ram_limit_in_GiB = 8; 22 | static const std::string default_tmp_dirname("."); 23 | static const std::string fulgor_filename_extension("fur"); 24 | static const std::string meta_colored_fulgor_filename_extension("mfur"); 25 | static const std::string diff_colored_fulgor_filename_extension("dfur"); 26 | static const std::string meta_diff_colored_fulgor_filename_extension("mdfur"); 27 | } // namespace constants 28 | 29 | struct build_configuration { 30 | build_configuration() 31 | : k(31) 32 | , m(20) 33 | , num_threads(1) 34 | , ram_limit_in_GiB(constants::default_ram_limit_in_GiB) 35 | , num_colors(0) 36 | , tmp_dirname(constants::default_tmp_dirname) 37 | // 38 | , verbose(false) 39 | , canonical_parsing(true) 40 | , check(false) 41 | // 42 | , meta_colored(false) 43 | , diff_colored(false) // 44 | {} 45 | 46 | uint32_t k; // kmer length 47 | uint32_t m; // minimizer length 48 | uint32_t num_threads; // for building and checking correctness 49 | uint32_t ram_limit_in_GiB; 50 | uint64_t num_colors; 51 | 52 | std::string tmp_dirname; 53 | std::string file_base_name; 54 | std::string filenames_list; 55 | 56 | std::string index_filename_to_partition; 57 | 58 | bool verbose; 59 | bool canonical_parsing; 60 | bool check; 61 | 62 | bool meta_colored; 63 | bool diff_colored; 64 | }; 65 | 66 | namespace util { 67 | 68 | static void print_cmd(int argc, char** argv) { 69 | for (int i = 0; i != argc; ++i) std::cout << argv[i] << ' '; 70 | std::cout << std::endl; 71 | } 72 | 73 | std::string filename(std::string const& path) { 
return path.substr(path.find_last_of("/\\") + 1); } 74 | 75 | template 76 | bool check_intersection(std::vector& iterators, 77 | std::vector const& got) // 78 | { 79 | if (iterators.empty()) return true; 80 | 81 | /* re-init iterators */ 82 | for (auto& it : iterators) it.rewind(); 83 | 84 | /* decompress the color sets */ 85 | const uint32_t num_colors = iterators[0].num_colors(); 86 | std::vector> sets(iterators.size()); 87 | for (uint64_t i = 0; i != iterators.size(); ++i) { 88 | auto& it = iterators[i]; 89 | uint32_t val = it.value(); 90 | while (val < num_colors) { 91 | sets[i].push_back(val); 92 | it.next(); 93 | val = it.value(); 94 | } 95 | } 96 | 97 | /* compute intersectiom using std::set_intersection */ 98 | std::vector expected; 99 | if (iterators.size() > 1) { 100 | std::vector l = sets[0]; 101 | for (uint64_t i = 1; i != sets.size(); ++i) { 102 | auto r = sets[i]; 103 | expected.clear(); 104 | std::set_intersection(l.begin(), l.end(), r.begin(), r.end(), 105 | std::back_inserter(expected)); 106 | if (i != sets.size() - 1) l.swap(expected); 107 | } 108 | } else { 109 | expected.swap(sets[0]); 110 | } 111 | 112 | /* compare the results */ 113 | if (expected.size() != got.size()) { 114 | std::cerr << "expected intersection size " << expected.size() << " but got " << got.size() 115 | << std::endl; 116 | return false; 117 | } 118 | for (uint64_t i = 0; i != got.size(); ++i) { 119 | if (expected[i] != got[i]) { 120 | std::cerr << "error at " << i << "/" << got.size() << ": expected " << expected[i] 121 | << " but got " << got[i] << std::endl; 122 | return false; 123 | } 124 | } 125 | 126 | return true; 127 | } 128 | 129 | template 130 | bool check_union(std::vector& iterators, // 131 | std::vector const& got, const uint64_t min_score) // 132 | { 133 | if (iterators.empty()) return true; 134 | 135 | /* re-init iterators */ 136 | for (auto& p : iterators) p.item.rewind(); 137 | 138 | /* compute the num. 
occs of each color */ 139 | const uint32_t num_colors = iterators[0].item.num_colors(); 140 | std::vector scores(num_colors, 0); 141 | for (auto& [it, score] : iterators) { 142 | uint32_t val = it.value(); 143 | while (val < num_colors) { 144 | scores[val] += score; 145 | it.next(); 146 | val = it.value(); 147 | } 148 | } 149 | 150 | /* compare the results */ 151 | uint64_t expected_size = 0; 152 | auto it = got.begin(); 153 | for (uint64_t i = 0; i != num_colors; ++i) { 154 | if (scores[i] >= min_score) { 155 | if (it == got.end()) { 156 | std::cerr << "error: more elements than expected in thershold-union result" 157 | << std::endl; 158 | return false; 159 | } 160 | if (i != *it) { 161 | std::cerr << "error at " << expected_size << "/" << got.size() << ": expected " << i 162 | << " but got " << *it << std::endl; 163 | return false; 164 | } 165 | ++expected_size; 166 | ++it; 167 | } 168 | } 169 | 170 | if (expected_size != got.size()) { 171 | std::cerr << "expected thershold-union size " << expected_size << " but got " << got.size() 172 | << std::endl; 173 | return false; 174 | } 175 | 176 | return true; 177 | } 178 | 179 | __uint128_t hash128(char const* bytes, uint64_t num_bytes, const uint64_t seed = 1234567890) { 180 | auto ret = CityHash128WithSeed(bytes, num_bytes, {seed, seed}); 181 | __uint128_t out = 0; 182 | out += __uint128_t(ret.first); 183 | out += __uint128_t(ret.second) << 64; 184 | return out; 185 | } 186 | 187 | struct hasher_uint128_t { 188 | uint64_t operator()(const __uint128_t x) const { return static_cast(x) ^ (x >> 64); } 189 | }; 190 | 191 | } // namespace util 192 | } // namespace fulgor 193 | -------------------------------------------------------------------------------- /src/color_sets.cpp: -------------------------------------------------------------------------------- 1 | #include "include/color_sets/hybrid.hpp" 2 | #include "include/color_sets/differential.hpp" 3 | #include "include/color_sets/meta.hpp" 4 | #include 
"include/color_sets/meta_differential.hpp" 5 | 6 | namespace fulgor { 7 | 8 | void hybrid::print_stats() const // 9 | { 10 | const uint64_t num_buckets = 10; 11 | assert(num_buckets > 0); 12 | uint64_t bucket_size = m_num_colors / num_buckets; 13 | std::vector color_set_size_upperbounds; 14 | for (uint64_t i = 0, curr_color_set_size_upper_bound = bucket_size; i != num_buckets; 15 | ++i, curr_color_set_size_upper_bound += bucket_size) { 16 | if (i == num_buckets - 1) curr_color_set_size_upper_bound = m_num_colors; 17 | color_set_size_upperbounds.push_back(curr_color_set_size_upper_bound); 18 | } 19 | 20 | std::vector num_bits_per_bucket; 21 | std::vector num_color_sets_per_bucket; 22 | std::vector num_ints_per_bucket; 23 | num_bits_per_bucket.resize(num_buckets, 0); 24 | num_color_sets_per_bucket.resize(num_buckets, 0); 25 | num_ints_per_bucket.resize(num_buckets, 0); 26 | 27 | uint64_t num_total_integers = 0; 28 | for (uint64_t color_set_id = 0; color_set_id != m_offsets.size() - 1; ++color_set_id) { 29 | uint64_t offset = m_offsets.access(color_set_id); 30 | auto it = m_color_sets.get_iterator_at(offset); 31 | uint32_t color_set_size = bits::util::read_delta(it); 32 | uint64_t num_bits = m_offsets.access(color_set_id + 1) - offset; 33 | auto bucket_it = std::upper_bound(color_set_size_upperbounds.begin(), 34 | color_set_size_upperbounds.end(), color_set_size); 35 | if (bucket_it != color_set_size_upperbounds.begin() and 36 | *(bucket_it - 1) == color_set_size) { 37 | --bucket_it; 38 | } 39 | uint64_t bucket_index = std::distance(color_set_size_upperbounds.begin(), bucket_it); 40 | num_bits_per_bucket[bucket_index] += num_bits; 41 | num_color_sets_per_bucket[bucket_index] += 1; 42 | num_ints_per_bucket[bucket_index] += color_set_size; 43 | num_total_integers += color_set_size; 44 | } 45 | 46 | std::cout << "Color sets space breakdown:\n"; 47 | uint64_t integers = 0; 48 | uint64_t bits = 0; 49 | const uint64_t total_bits = num_bits(); 50 | for (uint64_t i = 0, 
curr_color_set_size_upper_bound = 0; i != num_buckets; ++i) { 51 | if (i == num_buckets - 1) { 52 | curr_color_set_size_upper_bound = m_num_colors; 53 | } else { 54 | curr_color_set_size_upper_bound += bucket_size; 55 | } 56 | if (num_color_sets_per_bucket[i] > 0) { 57 | uint64_t n = num_ints_per_bucket[i]; 58 | integers += n; 59 | bits += num_bits_per_bucket[i]; 60 | std::cout << " num. color_sets of size > " 61 | << (curr_color_set_size_upper_bound - bucket_size) 62 | << " and <= " << curr_color_set_size_upper_bound << ": " 63 | << num_color_sets_per_bucket[i] << " (" 64 | << (num_color_sets_per_bucket[i] * 100.0) / num_color_sets() 65 | << "%) -- integers: " << n << " (" << (n * 100.0) / num_total_integers 66 | << "%) -- bits/int: " << static_cast(num_bits_per_bucket[i]) / n 67 | << " -- " << static_cast(num_bits_per_bucket[i]) / total_bits * 100.0 68 | << "\% of total space" << '\n'; 69 | } 70 | } 71 | assert(integers == num_total_integers); 72 | assert(std::accumulate(num_color_sets_per_bucket.begin(), num_color_sets_per_bucket.end(), 73 | uint64_t(0)) == num_color_sets()); 74 | std::cout << " colors: " << static_cast(bits) / integers << " bits/int" << std::endl; 75 | std::cout << " offsets: " 76 | << ((sizeof(m_num_colors) + sizeof(m_sparse_set_threshold_size) + 77 | sizeof(m_very_dense_set_threshold_size) + m_offsets.num_bytes()) * 78 | 8.0) / 79 | integers 80 | << " bits/int" << std::endl; 81 | } 82 | 83 | template 84 | void meta::print_stats() const // 85 | { 86 | std::cout << "Color statistics:\n"; 87 | std::cout << " Number of partitions: " << num_partitions() << '\n'; 88 | uint64_t num_bits_colors = 0; 89 | 90 | uint64_t num_partial_color_sets_very_dense = 0; 91 | uint64_t num_partial_color_sets_dense = 0; 92 | uint64_t num_partial_color_sets_sparse = 0; 93 | uint64_t num_total_partial_colors = 0; 94 | 95 | for (auto const& pcs : m_partial_color_sets) { 96 | // pcs.print_stats(); 97 | const uint64_t n = pcs.num_color_sets(); 98 | 
num_total_partial_colors += n; 99 | for (uint64_t i = 0; i != n; ++i) { 100 | auto it = pcs.color_set(i); 101 | if (it.encoding_type() == encoding_t::complement_delta_gaps) { 102 | ++num_partial_color_sets_very_dense; 103 | } else if (it.encoding_type() == encoding_t::bitmap) { 104 | ++num_partial_color_sets_dense; 105 | } else { 106 | assert(it.encoding_type() == encoding_t::delta_gaps); 107 | ++num_partial_color_sets_sparse; 108 | } 109 | } 110 | 111 | num_bits_colors += pcs.num_bits(); 112 | } 113 | 114 | assert(num_total_partial_colors > 0); 115 | assert(num_bits() > 0); 116 | 117 | std::cout << " num_partial_color_sets_very_dense = " << num_partial_color_sets_very_dense 118 | << " / " << num_total_partial_colors << " (" 119 | << (num_partial_color_sets_very_dense * 100.0) / num_total_partial_colors << "%)" 120 | << std::endl; 121 | std::cout << " num_partial_color_sets_dense = " << num_partial_color_sets_dense << " / " 122 | << num_total_partial_colors << " (" 123 | << (num_partial_color_sets_dense * 100.0) / num_total_partial_colors << "%)" 124 | << std::endl; 125 | std::cout << " num_partial_color_sets_sparse = " << num_partial_color_sets_sparse << " / " 126 | << num_total_partial_colors << " (" 127 | << (num_partial_color_sets_sparse * 100.0) / num_total_partial_colors << "%)" 128 | << std::endl; 129 | 130 | std::cout << " partial colors: " << num_bits_colors / 8 << " bytes (" 131 | << (num_bits_colors * 100.0) / num_bits() << "%)\n"; 132 | std::cout << " meta colors: " 133 | << m_meta_color_sets.num_bytes() + m_meta_color_sets_offsets.num_bytes() << " bytes (" 134 | << ((m_meta_color_sets.num_bytes() + m_meta_color_sets_offsets.num_bytes()) * 8 * 135 | 100.0) / 136 | num_bits() 137 | << "%)\n"; 138 | std::cout << " other: " << essentials::vec_bytes(m_partition_endpoints) << " bytes (" 139 | << ((essentials::vec_bytes(m_partition_endpoints) * 8) * 100.0) / num_bits() 140 | << "%)\n"; 141 | std::cout << " partition endpoints: "; 142 | for (auto p : 
m_partition_endpoints) std::cout << p.min_color << " "; 143 | std::cout << std::endl; 144 | } 145 | 146 | void differential::print_stats() const // 147 | { 148 | std::cout << "Color statistics:\n"; 149 | std::cout << " Number of partitions: " << num_partitions() << std::endl; 150 | 151 | uint64_t num_bits_representative_offsets = m_representative_offsets.num_bytes() * 8; 152 | uint64_t num_bits_color_sets_offsets = m_color_set_offsets.num_bytes() * 8; 153 | uint64_t num_bits_color_sets = m_color_sets.num_bytes() * 8; 154 | 155 | const uint64_t num_clusters = m_clusters.num_bits(); 156 | uint64_t num_representatives = 0; 157 | uint64_t num_differential_color_sets = 0; 158 | uint64_t num_metadata = 0; 159 | 160 | uint64_t size_representatives = 0; 161 | uint64_t size_differentials = 0; 162 | 163 | uint64_t num_colors_tenth = num_colors() / 10; 164 | 165 | std::vector distribution(11, 0); 166 | 167 | for (uint64_t representative_id = 0; representative_id < num_partitions(); 168 | representative_id++) // 169 | { 170 | uint64_t representative_begin = m_representative_offsets.access(representative_id); 171 | auto it = m_color_sets.get_iterator_at(representative_begin); 172 | uint64_t prev_position = it.position(); 173 | 174 | uint64_t size = bits::util::read_delta(it); 175 | size_representatives += size; 176 | num_metadata += it.position() - prev_position; 177 | prev_position = it.position(); 178 | 179 | for (uint64_t i = 0; i < size; i++) { 180 | bits::util::read_delta(it); 181 | num_representatives += it.position() - prev_position; 182 | prev_position = it.position(); 183 | } 184 | } 185 | uint64_t last_representative = m_representative_offsets.access(num_partitions()); 186 | for (uint64_t color_id = 0; color_id < num_color_sets(); color_id++) // 187 | { 188 | uint64_t color_set_begin = m_color_set_offsets.access(color_id) + last_representative; 189 | auto it = m_color_sets.get_iterator_at(color_set_begin); 190 | uint64_t prev_position = it.position(); 191 | 192 | 
uint64_t size = bits::util::read_delta(it); 193 | size_differentials += size; 194 | num_metadata += it.position() - prev_position; 195 | prev_position = it.position(); 196 | 197 | bits::util::read_delta(it); // original color_set size 198 | num_metadata += it.position() - prev_position; 199 | prev_position = it.position(); 200 | 201 | for (uint64_t i = 0; i < size; i++) { 202 | bits::util::read_delta(it); 203 | uint64_t delta_size = it.position() - prev_position; 204 | num_differential_color_sets += delta_size; 205 | 206 | prev_position = it.position(); 207 | } 208 | uint64_t q = 0; 209 | if (num_colors_tenth != 0) { 210 | q = size / (num_colors_tenth) > 10 ? 10 : size / (num_colors_tenth); 211 | } 212 | 213 | distribution[q]++; 214 | } 215 | 216 | assert(num_bits() > 0); 217 | assert(num_bits_color_sets > 0); 218 | 219 | std::cout << " representative offsets: " << num_bits_representative_offsets / 8 << " bytes (" 220 | << (num_bits_representative_offsets * 100.0) / num_bits() << "%)" << std::endl; 221 | std::cout << " average representative set size: " 222 | << size_representatives * 1. / num_partitions() << " ints" << std::endl; 223 | std::cout << " average differential set size: " << size_differentials * 1. 
/ num_color_sets() 224 | << " ints" << std::endl; 225 | std::cout << " differential color set offsets: " << num_bits_color_sets_offsets / 8 226 | << " bytes (" << (num_bits_color_sets_offsets * 100.0) / num_bits() << "%)" 227 | << std::endl; 228 | std::cout << " clusters: " << num_clusters / 8 << " bytes (" 229 | << (num_clusters * 100.0) / num_bits() << "%)" << std::endl; 230 | std::cout << " differential color sets: " << num_bits_color_sets / 8 << " bytes (" 231 | << (num_bits_color_sets * 100.0) / num_bits() << "%)" << std::endl; 232 | std::cout << " representatives: " << num_representatives / 8 << " bytes (" 233 | << (num_representatives * 100.0) / num_bits_color_sets << "%)" << std::endl; 234 | std::cout << " differential color sets: " << num_differential_color_sets / 8 << " bytes (" 235 | << (num_differential_color_sets * 100.0) / num_bits_color_sets << "%)" << std::endl; 236 | std::cout << " metadata: " << num_metadata / 8 << " bytes (" 237 | << (num_metadata * 100.0) / num_bits_color_sets << "%)" << std::endl; 238 | std::cout << " differential color sets size distribution:" << std::endl; 239 | for (uint64_t partition = 0; partition < distribution.size(); partition++) { 240 | std::cout << distribution[partition] << " "; 241 | } 242 | std::cout << std::endl; 243 | } 244 | 245 | void meta_differential::print_stats() const // 246 | { 247 | std::cout << "Color statistics:\n"; 248 | std::cout << " Number of partitions: " << num_partitions() << '\n'; 249 | std::cout << " Number of partition sets: " << num_partition_sets() << '\n'; 250 | 251 | uint64_t num_bits_meta_color_sets = 252 | 8 * (m_relative_colors_offsets.num_bytes() + m_partition_sets_offsets.num_bytes() + 253 | m_relative_colors.num_bytes() + m_partition_sets.num_bytes() + 254 | m_partition_sets_partitions.num_bytes() + 255 | m_partition_sets_partitions_rank1_index.num_bytes()); 256 | 257 | uint64_t num_bits_partial_color_sets = 0; 258 | for (auto const& pcs : m_partial_color_sets) 
num_bits_partial_color_sets += pcs.num_bits(); 259 | 260 | assert(num_bits() > 0); 261 | std::cout << " partial color sets: " << num_bits_partial_color_sets / 8 << " bytes (" 262 | << (num_bits_partial_color_sets * 100.0) / num_bits() << "%)\n"; 263 | std::cout << " meta color sets: " << num_bits_meta_color_sets / 8 << " bytes (" 264 | << (num_bits_meta_color_sets * 100.0) / num_bits() << "%)\n"; 265 | std::cout << " other: " << essentials::vec_bytes(m_partition_endpoints) << " bytes (" 266 | << ((essentials::vec_bytes(m_partition_endpoints) * 8) * 100.0) / num_bits() 267 | << "%)\n"; 268 | } 269 | 270 | } // namespace fulgor 271 | -------------------------------------------------------------------------------- /src/index.cpp: -------------------------------------------------------------------------------- 1 | #include "include/index.hpp" 2 | 3 | namespace fulgor { 4 | 5 | template 6 | void index::print_stats() const { 7 | const uint64_t total_bits = num_bits(); 8 | assert(total_bits > 0); 9 | auto const& k2u = get_k2u(); 10 | auto const& u2c = get_u2c(); 11 | auto const& u2c_rank1_index = get_u2c_rank1_index(); 12 | auto const& color_sets = get_color_sets(); 13 | auto const& filenames = get_filenames(); 14 | 15 | std::cout << "total index size: " << total_bits / 8 << " [B] -- " 16 | << essentials::convert(total_bits / 8, essentials::GB) << " [GB]" << '\n'; 17 | std::cout << "SPACE BREAKDOWN:\n"; 18 | std::cout << " dBG (SSHash): " << k2u.num_bits() / 8 << " bytes / " 19 | << essentials::convert(k2u.num_bits() / 8, essentials::GB) << " GB (" 20 | << (k2u.num_bits() * 100.0) / total_bits << "%)\n"; 21 | std::cout << " Color sets: " << color_sets.num_bits() / 8 << " bytes / " 22 | << essentials::convert(color_sets.num_bits() / 8, essentials::GB) << " GB (" 23 | << (color_sets.num_bits() * 100.0) / total_bits << "%)\n"; 24 | uint64_t other_bits = 25 | (u2c.num_bytes() + u2c_rank1_index.num_bytes()) * 8 + filenames.num_bits(); 26 | std::cout << " Other: " << other_bits 
/ 8 << " bytes / " 27 | << essentials::convert(other_bits / 8, essentials::GB) << " GB (" 28 | << (other_bits * 100.0) / total_bits << "%)\n"; 29 | std::cout << " Map from unitig_id to color_set_id: " 30 | << u2c.num_bytes() + u2c_rank1_index.num_bytes() << " bytes / " 31 | << essentials::convert(u2c.num_bytes() + u2c_rank1_index.num_bytes(), essentials::GB) 32 | << " GB (" 33 | << ((u2c.num_bytes() + u2c_rank1_index.num_bytes()) * 8 * 100.0) / total_bits 34 | << "%)\n"; 35 | std::cout << " filenames: " << filenames.num_bits() / 8 << " bytes / " 36 | << essentials::convert(filenames.num_bits() / 8, essentials::GB) << " GB (" 37 | << (filenames.num_bits() * 100.0) / total_bits << "%)\n"; 38 | 39 | uint64_t num_ints_in_color_sets = 0; 40 | uint64_t num_color_sets = color_sets.num_color_sets(); 41 | std::cout << "Color id range 0.." << num_colors() - 1 << '\n'; 42 | std::cout << "Number of distinct color sets: " << num_color_sets << '\n'; 43 | for (uint64_t color_set_id = 0; color_set_id != num_color_sets; ++color_set_id) { 44 | uint64_t list_size = color_sets.color_set(color_set_id).size(); 45 | num_ints_in_color_sets += list_size; 46 | } 47 | std::cout << "Number of ints in distinct color sets: " << num_ints_in_color_sets << " (" 48 | << static_cast(color_sets.num_bits()) / num_ints_in_color_sets 49 | << " bits/int)\n"; 50 | std::cout << "k: " << k2u.k() << '\n'; 51 | std::cout << "m: " << k2u.m() << " (minimizer length used in SSHash)\n"; 52 | std::cout << "Number of kmers in dBG: " << k2u.size() << " (" 53 | << static_cast(k2u.num_bits()) / k2u.size() << " bits/kmer)\n"; 54 | std::cout << "Number of unitigs in dBG: " << k2u.num_contigs() << std::endl; 55 | 56 | color_sets.print_stats(); 57 | } 58 | 59 | template 60 | void index::dump(std::string const& basename) const { 61 | /* metadata file */ 62 | std::ofstream metadata_file(basename + ".metadata.txt"); 63 | if (!metadata_file.is_open()) throw std::runtime_error("cannot open output file"); 64 | metadata_file << 
"k=" << k() << '\n'; 65 | metadata_file << "num_colors=" << num_colors() << '\n'; 66 | metadata_file << "num_unitigs=" << num_unitigs() << '\n'; 67 | metadata_file << "num_color_sets=" << num_color_sets() << '\n'; 68 | metadata_file.close(); 69 | 70 | /* unitigs file */ 71 | std::ofstream unitigs_file(basename + ".unitigs.fa"); 72 | if (!unitigs_file.is_open()) throw std::runtime_error("cannot open output file"); 73 | const uint64_t u = num_unitigs(); 74 | const uint64_t kmer_length = k(); 75 | for (uint64_t unitig_id = 0; unitig_id != u; ++unitig_id) { 76 | auto it = m_k2u.at_contig_id(unitig_id); 77 | const uint64_t color_set_id = u2c(unitig_id); 78 | unitigs_file << "> unitig_id=" << unitig_id << " color_set_id=" << color_set_id << '\n'; 79 | auto [_, kmer] = it.next(); 80 | unitigs_file << kmer; 81 | while (it.has_next()) { 82 | auto [_, kmer] = it.next(); 83 | unitigs_file << kmer[kmer_length - 1]; // overlaps! 84 | } 85 | unitigs_file << '\n'; 86 | } 87 | unitigs_file.close(); 88 | 89 | /* color_sets file */ 90 | std::ofstream color_sets_file(basename + ".color_sets.txt"); 91 | if (!color_sets_file.is_open()) throw std::runtime_error("cannot open output file"); 92 | auto const& color_sets = get_color_sets(); 93 | const uint64_t n = num_color_sets(); 94 | for (uint64_t color_set_id = 0; color_set_id != n; ++color_set_id) { 95 | auto it = color_sets.color_set(color_set_id); 96 | const uint32_t size = it.size(); 97 | color_sets_file << "color_set_id=" << color_set_id << " size=" << size << ' '; 98 | for (uint32_t j = 0; j != size; ++j) { 99 | color_sets_file << it.value(); 100 | it.next(); 101 | if (j != size - 1) color_sets_file << ' '; 102 | } 103 | color_sets_file << '\n'; 104 | } 105 | color_sets_file.close(); 106 | } 107 | 108 | } // namespace fulgor -------------------------------------------------------------------------------- /src/ps_full_intersection.cpp: -------------------------------------------------------------------------------- 1 | #include 
"include/index.hpp" 2 | #include "external/sshash/include/query/streaming_query_canonical_parsing.hpp" 3 | 4 | namespace fulgor { 5 | 6 | template 7 | void next_geq_intersect(Iterator const& begin, Iterator const& end, // 8 | std::vector& colors, const uint32_t num_colors) // 9 | { 10 | uint32_t candidate = begin->value(); 11 | uint64_t i = 1; 12 | const uint64_t size = end - begin; 13 | while (candidate < num_colors) { 14 | for (; i != size; ++i) { 15 | (begin + i)->next_geq(candidate); 16 | uint32_t val = (begin + i)->value(); 17 | if (val != candidate) { 18 | candidate = val; 19 | i = 0; 20 | break; 21 | } 22 | } 23 | if (i == size) { 24 | colors.push_back(candidate); 25 | begin->next(); 26 | candidate = begin->value(); 27 | i = 1; 28 | } 29 | } 30 | } 31 | 32 | template 33 | void intersect(std::vector& iterators, // 34 | std::vector& colors, // 35 | std::vector& complement_set) // 36 | { 37 | assert(colors.empty()); 38 | assert(complement_set.empty()); 39 | 40 | if (iterators.empty()) return; 41 | std::sort(iterators.begin(), iterators.end(), 42 | [](auto const& x, auto const& y) { return x.size() < y.size(); }); 43 | 44 | const uint32_t num_colors = iterators[0].num_colors(); 45 | uint32_t num_sparse = 0; 46 | while (num_sparse != iterators.size() && 47 | iterators[num_sparse].encoding_type() != encoding_t::complement_delta_gaps) { 48 | ++num_sparse; 49 | } 50 | 51 | if (num_sparse == 0) { 52 | /* step 1: take the union of complementary sets */ 53 | for (auto& it : iterators) it.reinit_for_complemented_set_iteration(); 54 | 55 | uint32_t candidate = (*std::min_element(iterators.begin(), iterators.end(), 56 | [](auto const& x, auto const& y) { 57 | return x.comp_value() < y.comp_value(); 58 | })) 59 | .comp_value(); 60 | 61 | complement_set.reserve(num_colors); 62 | while (candidate < num_colors) { 63 | uint32_t next_candidate = num_colors; 64 | for (uint64_t i = 0; i != iterators.size(); ++i) { 65 | if (iterators[i].comp_value() == candidate) 
iterators[i].next_comp(); 66 | /* compute next minimum */ 67 | if (iterators[i].comp_value() < next_candidate) { 68 | next_candidate = iterators[i].comp_value(); 69 | } 70 | } 71 | complement_set.push_back(candidate); 72 | assert(next_candidate > candidate); 73 | candidate = next_candidate; 74 | } 75 | 76 | /* step 2: compute the intersection by scanning complement_set */ 77 | candidate = 0; 78 | for (uint32_t i : complement_set) { 79 | while (candidate < i) { 80 | colors.push_back(candidate); 81 | candidate += 1; 82 | } 83 | candidate += 1; // skip the candidate because it is equal to complement_set[i] 84 | } 85 | while (candidate < num_colors) { 86 | colors.push_back(candidate); 87 | candidate += 1; 88 | } 89 | 90 | return; 91 | } 92 | 93 | std::vector complement_union(num_colors, true); 94 | for (uint32_t i = num_sparse; i < iterators.size(); ++i) { 95 | auto it = iterators[i]; 96 | it.reinit_for_complemented_set_iteration(); 97 | while (it.comp_value() < num_colors) { 98 | complement_union[it.comp_value()] = false; 99 | it.next_comp(); 100 | } 101 | } 102 | 103 | /* traditional intersection code based on next_geq() and next() */ 104 | 105 | assert(iterators[0].encoding_type() != encoding_t::complement_delta_gaps); 106 | uint32_t candidate = iterators[0].value(); 107 | uint64_t i = 1; 108 | uint64_t size = num_sparse; 109 | 110 | while (candidate < num_colors) { 111 | for (; i != size; ++i) { 112 | iterators[i].next_geq(candidate); 113 | uint32_t val = iterators[i].value(); 114 | if (val != candidate) { 115 | candidate = val; 116 | i = 0; 117 | break; 118 | } 119 | } 120 | if (i == size) { 121 | if (complement_union[candidate]) { colors.push_back(candidate); } 122 | iterators[0].next(); 123 | candidate = iterators[0].value(); 124 | i = 1; 125 | } 126 | } 127 | } 128 | 129 | template 130 | void diff_intersect(std::vector& iterators, std::vector& colors, 131 | uint32_t lower_bound = 0) // 132 | { 133 | if (iterators.empty()) return; 134 | const uint32_t num_colors 
= iterators[0].num_colors(); 135 | 136 | std::sort(iterators.begin(), iterators.end(), [](const Iterator& a, const Iterator& b) { 137 | return a.representative_begin() < b.representative_begin(); 138 | }); 139 | 140 | const uint32_t num_iterators = iterators.size(); 141 | uint32_t num_partitions = 1; 142 | { 143 | uint32_t prev_partition = iterators[0].representative_begin(); 144 | for (const auto& it : iterators) { 145 | uint32_t partition_id = it.representative_begin(); 146 | if (partition_id != prev_partition) { 147 | prev_partition = partition_id; 148 | ++num_partitions; 149 | } 150 | } 151 | } 152 | 153 | std::vector> partitions(num_partitions); 154 | 155 | { 156 | std::vector counts(num_colors, 0); 157 | uint32_t partition_id = 0; 158 | uint32_t partition_size = 0; 159 | 160 | for (uint32_t iterator_id = 0; iterator_id < num_iterators; iterator_id++) { 161 | Iterator it = iterators[iterator_id]; 162 | partition_size++; 163 | 164 | bool is_last_in_partition = 165 | iterator_id + 1 == num_iterators || 166 | iterators[iterator_id + 1].representative_begin() != it.representative_begin(); 167 | 168 | if (partition_size == 1 && is_last_in_partition) { 169 | // if one element in partition, decode the color set 170 | for (uint32_t i = 0; i < it.size(); ++i, ++it) { 171 | partitions[partition_id].push_back(*it); 172 | } 173 | partition_id++; 174 | partition_size = 0; 175 | continue; 176 | } 177 | 178 | it.full_rewind(); 179 | 180 | uint32_t val = it.differential_val(); 181 | while (val != num_colors) { 182 | ++counts[val]; 183 | it.next_differential_val(); 184 | val = it.differential_val(); 185 | } 186 | 187 | if (is_last_in_partition) { 188 | it.full_rewind(); 189 | val = it.representative_val(); 190 | for (uint32_t color = 0; color < num_colors; color++) { 191 | if (val < color) { 192 | it.next_representative_val(); 193 | val = it.representative_val(); 194 | } 195 | if ((counts[color] == partition_size && val != color) || 196 | (counts[color] == 0 && val == color)) 
{ 197 | partitions[partition_id].push_back(color); 198 | } 199 | } 200 | partition_id++; 201 | partition_size = 0; 202 | fill(counts.begin(), counts.end(), 0); 203 | } 204 | } 205 | } 206 | 207 | std::sort(partitions.begin(), partitions.end(), 208 | [](auto const& x, auto const& y) { return x.size() < y.size(); }); 209 | 210 | std::vector::iterator> its(num_partitions); 211 | for (uint32_t i = 0; i < num_partitions; i++) { 212 | if (partitions[i].empty()) return; 213 | its[i] = partitions[i].begin(); 214 | } 215 | 216 | uint32_t candidate = *its[0]; 217 | uint64_t i = 1; 218 | while (candidate < num_colors) { 219 | for (; i != its.size(); ++i) { 220 | while (its[i] != partitions[i].end() && *its[i] < candidate) ++its[i]; 221 | if (its[i] == partitions[i].end()) { 222 | candidate = num_colors; 223 | break; 224 | } 225 | uint32_t val = *its[i]; 226 | if (val != candidate) { 227 | candidate = val; 228 | i = 0; 229 | break; 230 | } 231 | } 232 | if (i == its.size()) { 233 | colors.push_back(candidate + lower_bound); 234 | ++its[0]; 235 | if (its[0] == partitions[0].end()) break; 236 | candidate = *its[0]; 237 | i = 1; 238 | } 239 | } 240 | } 241 | 242 | template 243 | void meta_intersect(std::vector& iterators, std::vector& colors, 244 | std::vector& partition_ids) { 245 | assert(colors.empty()); 246 | assert(partition_ids.empty()); 247 | 248 | if (iterators.empty()) return; 249 | 250 | for (auto it : iterators) { 251 | while (it.partition_id() != it.num_partitions()) it.next_partition_id(); 252 | it.init(); 253 | it.change_partition(); 254 | } 255 | 256 | std::sort(iterators.begin(), iterators.end(), [](auto const& x, auto const& y) { 257 | return x.meta_color_set_size() < y.meta_color_set_size(); 258 | }); 259 | 260 | /* step 1: determine partitions in common */ 261 | const uint32_t num_partitions = iterators[0].num_partitions(); 262 | partition_ids.reserve(num_partitions); // at most 263 | 264 | uint32_t candidate = iterators[0].partition_id(); 265 | uint64_t i = 1; 
266 | while (candidate < num_partitions) { 267 | for (; i != iterators.size(); ++i) { 268 | iterators[i].next_geq_partition_id(candidate); 269 | uint32_t val = iterators[i].partition_id(); 270 | if (val != candidate) { 271 | candidate = val; 272 | i = 0; 273 | break; 274 | } 275 | } 276 | if (i == iterators.size()) { 277 | partition_ids.push_back(candidate); 278 | iterators[0].next_partition_id(); 279 | candidate = iterators[0].partition_id(); 280 | i = 1; 281 | } 282 | } 283 | 284 | /* step 2: intersect partial colors in the same partitions only */ 285 | for (auto& it : iterators) { 286 | it.init(); 287 | it.change_partition(); 288 | } 289 | for (auto partition_id : partition_ids) { 290 | bool same_meta_color = true; 291 | auto& front_it = iterators.front(); 292 | front_it.next_geq_partition_id(partition_id); 293 | front_it.update_partition(); 294 | uint32_t meta_color = front_it.meta_color(); 295 | 296 | for (uint32_t i = 1; i != iterators.size(); ++i) { 297 | auto& it = iterators[i]; 298 | it.next_geq_partition_id(partition_id); 299 | it.update_partition(); 300 | if (it.meta_color() != meta_color) same_meta_color = false; 301 | } 302 | 303 | if (same_meta_color) { // do not intersect, just write the whole partial color once 304 | while (front_it.has_next()) { 305 | uint32_t val = front_it.value(); 306 | colors.push_back(val); 307 | front_it.next_in_partition(); 308 | } 309 | } else { // intersect partial colors in the partition 310 | std::sort(iterators.begin(), iterators.end(), [](const Iterator& a, const Iterator& b) { 311 | return a.partial_set_size() < b.partial_set_size() || 312 | (a.partial_set_size() == b.partial_set_size() && 313 | a.meta_color() < b.meta_color()); 314 | }); 315 | 316 | uint64_t back_pos = 0; 317 | for (uint64_t curr_pos = 1; curr_pos < iterators.size(); curr_pos++) { 318 | if (iterators[curr_pos].meta_color() != iterators[back_pos].meta_color()) { 319 | std::swap(iterators[++back_pos], iterators[curr_pos]); 320 | } 321 | } 322 | auto 
end_it = iterators.begin() + back_pos + 1; 323 | 324 | if constexpr (is_differential) { 325 | std::vector diff_iterators; 326 | diff_iterators.reserve(end_it - iterators.begin()); 327 | std::transform(iterators.begin(), end_it, back_inserter(diff_iterators), 328 | [](Iterator a) { return a.partition_it(); }); 329 | uint32_t lower_bound = 330 | iterators[0].partition_max_color() - diff_iterators[0].num_colors(); 331 | diff_intersect(diff_iterators, colors, lower_bound); 332 | } else { 333 | const uint32_t num_colors = iterators[0].partition_max_color(); 334 | next_geq_intersect(iterators.begin(), end_it, colors, num_colors); 335 | } 336 | } 337 | } 338 | } 339 | 340 | template 341 | void index::pseudoalign_full_intersection(std::string const& sequence, 342 | std::vector& colors) const { 343 | if (sequence.length() < m_k2u.k()) return; 344 | colors.clear(); 345 | std::vector unitig_ids; 346 | 347 | { /* stream through */ 348 | sshash::streaming_query_canonical_parsing query(&m_k2u); 349 | query.start(); 350 | const uint64_t num_kmers = sequence.length() - m_k2u.k() + 1; 351 | for (uint64_t i = 0, prev_unitig_id = -1; i != num_kmers; ++i) { 352 | char const* kmer = sequence.data() + i; 353 | auto answer = query.lookup_advanced(kmer); 354 | if (answer.kmer_id != sshash::constants::invalid_uint64) { // kmer is positive 355 | if (answer.contig_id != prev_unitig_id) { 356 | unitig_ids.push_back(answer.contig_id); 357 | prev_unitig_id = answer.contig_id; 358 | } 359 | } 360 | } 361 | } 362 | 363 | /* here we use it to hold the color class ids; 364 | in meta_intersect we use it to hold the partition ids */ 365 | std::vector tmp; 366 | std::vector iterators; 367 | 368 | /* deduplicate unitig_ids */ 369 | std::sort(unitig_ids.begin(), unitig_ids.end()); 370 | auto end_unitigs = std::unique(unitig_ids.begin(), unitig_ids.end()); 371 | tmp.reserve(end_unitigs - unitig_ids.begin()); 372 | for (auto it = unitig_ids.begin(); it != end_unitigs; ++it) { 373 | uint32_t unitig_id = 
*it; 374 | uint32_t color_set_id = u2c(unitig_id); 375 | tmp.push_back(color_set_id); 376 | } 377 | 378 | /* deduplicate color class ids */ 379 | std::sort(tmp.begin(), tmp.end()); 380 | auto end_tmp = std::unique(tmp.begin(), tmp.end()); 381 | iterators.reserve(end_tmp - tmp.begin()); 382 | for (auto it = tmp.begin(); it != end_tmp; ++it) { 383 | uint64_t color_set_id = *it; 384 | auto fwd_it = m_color_sets.color_set(color_set_id); 385 | iterators.push_back(fwd_it); 386 | } 387 | 388 | tmp.clear(); // don't need color class ids anymore 389 | if constexpr (ColorSets::type == index_t::META) { 390 | meta_intersect(iterators, colors, tmp); 391 | } else if constexpr (ColorSets::type == index_t::META_DIFF) { 392 | meta_intersect(iterators, colors, tmp); 393 | } else if constexpr (ColorSets::type == index_t::DIFF) { 394 | diff_intersect(iterators, colors); 395 | } else if constexpr (ColorSets::type == index_t::HYBRID) { 396 | intersect(iterators, colors, tmp); 397 | } 398 | 399 | assert(util::check_intersection(iterators, colors)); 400 | } 401 | 402 | } // namespace fulgor 403 | -------------------------------------------------------------------------------- /src/ps_threshold_union.cpp: -------------------------------------------------------------------------------- 1 | #include // for std::accumulate 2 | 3 | #include "include/index.hpp" 4 | #include "external/sshash/include/query/streaming_query_canonical_parsing.hpp" 5 | 6 | namespace fulgor { 7 | 8 | template 9 | struct scored { 10 | T item; 11 | uint32_t score; 12 | }; 13 | 14 | typedef scored scored_id; 15 | 16 | template 17 | void merge(std::vector& iterators, std::vector& colors, int64_t min_score) { 18 | if (iterators.empty()) return; 19 | 20 | uint32_t num_colors = iterators[0].item.num_colors(); 21 | std::vector scores(num_colors, 0); 22 | for (auto& it : iterators) { 23 | if (it.item.encoding_type() == encoding_t::complement_delta_gaps) { 24 | it.item.reinit_for_complemented_set_iteration(); 25 | min_score -= 
it.score; 26 | while (it.item.comp_value() < num_colors) { 27 | scores[it.item.comp_value()] -= it.score; 28 | it.item.next_comp(); 29 | } 30 | } else { 31 | uint32_t size = it.item.size(); 32 | for (uint32_t i = 0; i < size; ++i, it.item.next()) { 33 | scores[it.item.value()] += it.score; 34 | } 35 | } 36 | } 37 | for (uint32_t color = 0; color < num_colors; color++) { 38 | if (scores[color] >= min_score) colors.push_back(color); 39 | } 40 | } 41 | 42 | template 43 | void merge_meta(std::vector& iterators, std::vector& colors, 44 | const uint64_t min_score) { 45 | if (iterators.empty()) return; 46 | 47 | const uint32_t num_partitions = iterators[0].item.num_partitions(); 48 | const uint32_t num_colors = iterators[0].item.num_colors(); 49 | std::vector partition_ids; 50 | partition_ids.reserve(num_partitions); 51 | 52 | // the number of partitions is relatively small, so this does not impact efficiency 53 | uint32_t candidate_partition = 54 | (*std::min_element(iterators.begin(), iterators.end(), [](auto const& x, auto const& y) { 55 | return x.item.partition_id() < y.item.partition_id(); 56 | })).item.partition_id(); 57 | 58 | while (candidate_partition < num_partitions) { 59 | uint32_t next_partition = num_partitions; 60 | uint32_t score = 0; 61 | for (uint64_t i = 0; i != iterators.size(); ++i) { 62 | if (iterators[i].item.partition_id() == candidate_partition) { 63 | score += iterators[i].score; 64 | iterators[i].item.next_partition_id(); 65 | } 66 | /* compute next minimum */ 67 | if (iterators[i].item.partition_id() < next_partition) { 68 | next_partition = iterators[i].item.partition_id(); 69 | } 70 | } 71 | if (score >= min_score) partition_ids.push_back(candidate_partition); 72 | assert(next_partition > candidate_partition); 73 | candidate_partition = next_partition; 74 | } 75 | 76 | std::vector scores(num_colors, 0); 77 | for (auto& it : iterators) { 78 | it.item.init(); 79 | it.item.change_partition(); 80 | } 81 | for (auto partition_id : partition_ids) 
{ 82 | uint32_t upper_bound = 0; 83 | for (auto& it : iterators) { 84 | it.item.next_geq_partition_id(partition_id); 85 | if (it.item.partition_id() == partition_id) { 86 | it.item.update_partition(); 87 | upper_bound = it.item.partition_max_color(); 88 | } 89 | } 90 | 91 | std::sort(iterators.begin(), iterators.end(), [](const Iterator& a, const Iterator& b) { 92 | return a.item.partition_id() < b.item.partition_id() || 93 | (a.item.partition_id() == b.item.partition_id() && 94 | a.item.meta_color() < b.item.meta_color()); 95 | }); 96 | 97 | uint64_t meta_score = iterators.front().score; 98 | auto process_meta = [&](Iterator& it) { 99 | while (it.item.value() < upper_bound) { 100 | scores[it.item.value()] += meta_score; 101 | it.item.next(); 102 | } 103 | }; 104 | uint64_t i = 1; 105 | for (; i < iterators.size(); ++i) { 106 | auto& it = iterators[i]; 107 | if (it.item.partition_id() != partition_id) break; 108 | if (it.item.meta_color() != iterators[i - 1].item.meta_color()) { 109 | process_meta(iterators[i - 1]); 110 | meta_score = 0; 111 | } 112 | meta_score += it.score; 113 | } 114 | process_meta(iterators[i - 1]); 115 | } 116 | 117 | for (uint32_t color = 0; color < num_colors; color++) { 118 | if (scores[color] >= min_score) colors.push_back(color); 119 | } 120 | } 121 | 122 | template 123 | void merge_diff(std::vector& iterators, std::vector& colors, 124 | const uint64_t min_score) { 125 | if (iterators.empty()) return; 126 | const uint32_t num_colors = iterators[0].item.num_colors(); 127 | const uint32_t num_iterators = iterators.size(); 128 | 129 | std::sort(iterators.begin(), iterators.end(), [](const Iterator& a, const Iterator& b) { 130 | return a.item.representative_begin() < b.item.representative_begin(); 131 | }); 132 | 133 | std::vector partition_scores(num_colors, 0); 134 | std::vector scores(num_colors, 0); 135 | uint32_t score = 0; 136 | uint32_t partition_size = 0; 137 | for (uint32_t iterator_id = 0; iterator_id < num_iterators; iterator_id++) 
{ 138 | Iterator it = iterators[iterator_id]; 139 | partition_size++; 140 | score += it.score; 141 | 142 | bool is_last_in_partition = iterator_id + 1 == num_iterators || 143 | iterators[iterator_id + 1].item.representative_begin() != 144 | it.item.representative_begin(); 145 | 146 | if (partition_size == 1 && is_last_in_partition) { 147 | for (uint32_t i = 0; i < it.item.size(); ++i, it.item.next()) { 148 | scores[it.item.value()] += it.score; 149 | } 150 | score = 0; 151 | partition_size = 0; 152 | continue; 153 | } 154 | 155 | it.item.full_rewind(); 156 | 157 | uint32_t val = it.item.differential_val(); 158 | while (val != num_colors) { 159 | partition_scores[val] += it.score; 160 | it.item.next_differential_val(); 161 | val = it.item.differential_val(); 162 | } 163 | 164 | if (is_last_in_partition) { 165 | it.item.full_rewind(); 166 | val = it.item.representative_val(); 167 | for (uint32_t color = 0; color < num_colors; color++) { 168 | if (val == color) { 169 | scores[color] += score - partition_scores[color]; 170 | it.item.next_representative_val(); 171 | val = it.item.representative_val(); 172 | } else { 173 | scores[color] += partition_scores[color]; 174 | } 175 | } 176 | score = 0; 177 | partition_size = 0; 178 | fill(partition_scores.begin(), partition_scores.end(), 0); 179 | } 180 | } 181 | 182 | for (uint32_t color = 0; color < num_colors; color++) { 183 | if (scores[color] >= min_score) colors.push_back(color); 184 | } 185 | } 186 | 187 | template 188 | void merge_metadiff(std::vector& iterators, std::vector& colors, 189 | const uint64_t min_score) { 190 | if (iterators.empty()) return; 191 | 192 | const uint32_t num_partitions = iterators[0].item.num_partitions(); 193 | const uint32_t num_colors = iterators[0].item.num_colors(); 194 | const uint32_t num_iterators = iterators.size(); 195 | std::vector partition_ids; 196 | partition_ids.reserve(num_partitions); 197 | 198 | // the number of partitions is relatively small, so this does not impact 
efficiency 199 | uint32_t candidate_partition = 200 | std::min_element(iterators.begin(), iterators.end(), [](auto const& x, auto const& y) { 201 | return x.item.partition_id() < y.item.partition_id(); 202 | })->item.partition_id(); 203 | 204 | while (candidate_partition < num_partitions) { 205 | uint32_t next_partition = num_partitions; 206 | uint32_t score = 0; 207 | for (uint64_t i = 0; i != iterators.size(); ++i) { 208 | if (iterators[i].item.partition_id() == candidate_partition) { 209 | score += iterators[i].score; 210 | iterators[i].item.next_partition_id(); 211 | } 212 | /* compute next minimum */ 213 | if (iterators[i].item.partition_id() < next_partition) { 214 | next_partition = iterators[i].item.partition_id(); 215 | } 216 | } 217 | if (score >= min_score) partition_ids.push_back(candidate_partition); 218 | assert(next_partition > candidate_partition); 219 | candidate_partition = next_partition; 220 | } 221 | 222 | std::vector scores(num_colors, 0); 223 | std::vector partition_scores(num_colors, 0); 224 | for (auto& it : iterators) { 225 | it.item.init(); 226 | it.item.change_partition(); 227 | } 228 | for (auto partition_id : partition_ids) { 229 | uint32_t num_partition_colors = 0; 230 | uint32_t lower_bound = 0; 231 | uint32_t num_sets = 0; 232 | for (auto& it : iterators) { 233 | it.item.next_geq_partition_id(partition_id); 234 | if (it.item.partition_id() == partition_id) { 235 | it.item.update_partition(); 236 | num_sets++; 237 | } 238 | } 239 | 240 | std::sort(iterators.begin(), iterators.end(), [&](const Iterator& a, const Iterator& b) { 241 | uint32_t a_part = a.item.partition_id(); 242 | uint32_t b_part = b.item.partition_id(); 243 | if (a_part == partition_id && b_part == partition_id) { 244 | uint32_t a_repr = a.item.partition_it().representative_begin(); 245 | uint32_t a_meta = a.item.meta_color(); 246 | uint32_t b_repr = b.item.partition_it().representative_begin(); 247 | uint32_t b_meta = b.item.meta_color(); 248 | return a_meta < b_meta 
|| (a_meta == b_meta && a_repr < b_repr); 249 | } 250 | return a_part < b_part; 251 | }); 252 | 253 | lower_bound = iterators.front().item.partition_min_color(); 254 | num_partition_colors = iterators.front().item.partition_it().num_colors(); 255 | 256 | uint32_t partition_score = 0; 257 | uint32_t partition_size = 0; 258 | uint32_t meta_score = 0; 259 | for (uint32_t iterator_id = 0; iterator_id < num_iterators; iterator_id++) { 260 | Iterator it = iterators[iterator_id]; 261 | if (it.item.partition_id() != partition_id) break; 262 | meta_score += it.score; 263 | num_sets--; 264 | partition_size++; 265 | if (num_sets != 0 && 266 | iterators[iterator_id + 1].item.meta_color() == it.item.meta_color()) 267 | continue; 268 | 269 | auto diff_it = it.item.partition_it(); 270 | partition_score += meta_score; 271 | 272 | bool is_last_in_partition = 273 | num_sets == 0 || 274 | iterators[iterator_id + 1].item.partition_it().representative_begin() != 275 | diff_it.representative_begin(); 276 | 277 | if (is_last_in_partition && partition_size == 1) { 278 | for (uint32_t i = 0; i < diff_it.size(); ++i, ++diff_it) { 279 | scores[lower_bound + *diff_it] += meta_score; 280 | } 281 | partition_score = 0; 282 | partition_size = 0; 283 | meta_score = 0; 284 | continue; 285 | } 286 | 287 | diff_it.full_rewind(); 288 | 289 | uint32_t val = diff_it.differential_val(); 290 | while (val != num_partition_colors) { 291 | partition_scores[val] += meta_score; 292 | diff_it.next_differential_val(); 293 | val = diff_it.differential_val(); 294 | } 295 | meta_score = 0; 296 | 297 | if (is_last_in_partition) { 298 | diff_it.full_rewind(); 299 | val = diff_it.representative_val(); 300 | for (uint32_t color = 0; color < num_partition_colors; color++) { 301 | if (val == color) { 302 | scores[lower_bound + color] += partition_score - partition_scores[color]; 303 | diff_it.next_representative_val(); 304 | val = diff_it.representative_val(); 305 | } else { 306 | scores[lower_bound + color] += 
partition_scores[color]; 307 | } 308 | } 309 | partition_score = 0; 310 | partition_size = 0; 311 | fill(partition_scores.begin(), partition_scores.begin() + num_partition_colors, 0); 312 | } 313 | } 314 | } 315 | for (uint32_t color = 0; color < num_colors; color++) { 316 | if (scores[color] >= min_score) colors.push_back(color); 317 | } 318 | } 319 | 320 | template 321 | void index::pseudoalign_threshold_union(std::string const& sequence, 322 | std::vector& colors, 323 | const double threshold) const { 324 | if (sequence.length() < m_k2u.k()) return; 325 | colors.clear(); 326 | 327 | std::vector unitig_ids; 328 | uint64_t num_positive_kmers_in_sequence = 0; 329 | { /* stream through with multiplicities */ 330 | sshash::streaming_query_canonical_parsing query(&m_k2u); 331 | query.start(); 332 | const uint64_t num_kmers = sequence.length() - m_k2u.k() + 1; 333 | for (uint64_t i = 0, prev_unitig_id = -1; i != num_kmers; ++i) { 334 | char const* kmer = sequence.data() + i; 335 | auto answer = query.lookup_advanced(kmer); 336 | if (answer.kmer_id != sshash::constants::invalid_uint64) { // kmer is positive 337 | num_positive_kmers_in_sequence += 1; 338 | if (answer.contig_id != prev_unitig_id) { 339 | unitig_ids.push_back({answer.contig_id, 1}); 340 | prev_unitig_id = answer.contig_id; 341 | } else { 342 | assert(!unitig_ids.empty()); 343 | unitig_ids.back().score += 1; 344 | } 345 | } 346 | } 347 | } 348 | 349 | /* num_positive_kmers_in_sequence must be equal to the sum of the scores */ 350 | assert(num_positive_kmers_in_sequence == 351 | std::accumulate(unitig_ids.begin(), unitig_ids.end(), uint64_t(0), 352 | [](uint64_t curr_sum, auto const& u) { return curr_sum + u.score; })); 353 | 354 | std::vector color_set_ids; 355 | std::vector> iterators; 356 | 357 | /* deduplicate unitig_ids */ 358 | std::sort(unitig_ids.begin(), unitig_ids.end(), 359 | [](auto const& x, auto const& y) { return x.item < y.item; }); 360 | uint32_t prev_unitig_id = -1; 361 | for (uint64_t i = 
0; i != unitig_ids.size(); ++i) { 362 | uint32_t unitig_id = unitig_ids[i].item; 363 | if (unitig_id != prev_unitig_id) { 364 | uint32_t color_set_id = u2c(unitig_id); 365 | color_set_ids.push_back({color_set_id, unitig_ids[i].score}); 366 | prev_unitig_id = unitig_id; 367 | } else { 368 | assert(!color_set_ids.empty()); 369 | color_set_ids.back().score += unitig_ids[i].score; 370 | } 371 | } 372 | 373 | /* deduplicate color_set_ids */ 374 | std::sort(color_set_ids.begin(), color_set_ids.end(), 375 | [](auto const& x, auto const& y) { return x.item < y.item; }); 376 | uint32_t prev_color_set_id = -1; 377 | for (uint64_t i = 0; i != color_set_ids.size(); ++i) { 378 | uint64_t color_set_id = color_set_ids[i].item; 379 | if (color_set_id != prev_color_set_id) { 380 | auto fwd_it = m_color_sets.color_set(color_set_id); 381 | iterators.push_back({fwd_it, color_set_ids[i].score}); 382 | prev_color_set_id = color_set_id; 383 | } else { 384 | assert(!iterators.empty()); 385 | iterators.back().score += color_set_ids[i].score; 386 | } 387 | } 388 | 389 | const uint64_t min_score = static_cast(num_positive_kmers_in_sequence) * threshold; 390 | 391 | if constexpr (ColorSets::type == index_t::META) { 392 | merge_meta(iterators, colors, min_score); 393 | } else if constexpr (ColorSets::type == index_t::DIFF) { 394 | merge_diff(iterators, colors, min_score); 395 | } else if constexpr (ColorSets::type == index_t::META_DIFF) { 396 | merge_metadiff(iterators, colors, min_score); 397 | } else if constexpr (ColorSets::type == index_t::HYBRID) { 398 | merge(iterators, colors, min_score); 399 | } 400 | 401 | assert(util::check_union(iterators, colors, min_score)); 402 | } 403 | 404 | } // namespace fulgor 405 | -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_AA7743AA.fasta.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_AA7743AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_BA0010AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_BA0010AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_CA3280AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_CA3280AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_FA0063AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_FA0063AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_FA6579AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_FA6579AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_GA5038AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_GA5038AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_HA1487AA.fasta.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_HA1487AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_HA3099AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_HA3099AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_HA8439AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_HA8439AA.fasta.gz -------------------------------------------------------------------------------- /test_data/salmonella_10/SAL_HA8462AA.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jermp/fulgor/caf3f5ae6c5d76457047b93e9d27dcb7c993fccc/test_data/salmonella_10/SAL_HA8462AA.fasta.gz -------------------------------------------------------------------------------- /tools/build.cpp: -------------------------------------------------------------------------------- 1 | using namespace fulgor; 2 | 3 | void meta_color(build_configuration const& build_config, const bool force) // 4 | { 5 | std::string output_filename = build_config.index_filename_to_partition.substr( 6 | 0, build_config.index_filename_to_partition.length() - 7 | constants::fulgor_filename_extension.length() - 1) + 8 | "." + constants::meta_colored_fulgor_filename_extension; 9 | 10 | if (std::filesystem::exists(output_filename)) { 11 | std::cerr << "An index with the name '" << output_filename << "' alreay exists." 
12 | << std::endl; 13 | if (force) { 14 | std::cerr << "Option '--force' specified: re-building the index." << std::endl; 15 | } else { 16 | std::cerr << "Use option '--force' to re-build the index." << std::endl; 17 | return; 18 | } 19 | } 20 | 21 | essentials::timer timer; 22 | timer.start(); 23 | meta_index_type index; 24 | typename meta_index_type::meta_builder builder(build_config); 25 | builder.build(index); 26 | index.print_stats(); 27 | timer.stop(); 28 | essentials::logger("DONE"); 29 | std::cout << "** building the index took " << timer.elapsed() << " seconds / " 30 | << timer.elapsed() / 60 << " minutes" << std::endl; 31 | 32 | essentials::logger("saving index to disk..."); 33 | essentials::save(index, output_filename.c_str()); 34 | essentials::logger("DONE"); 35 | } 36 | 37 | void diff_color(build_configuration const& build_config, const bool force) // 38 | { 39 | std::string output_filename = build_config.index_filename_to_partition.substr( 40 | 0, build_config.index_filename_to_partition.length() - 41 | constants::fulgor_filename_extension.length() - 1) + 42 | "." + constants::diff_colored_fulgor_filename_extension; 43 | 44 | if (std::filesystem::exists(output_filename)) { 45 | std::cerr << "An index with the name '" << output_filename << "' alreay exists." 46 | << std::endl; 47 | if (force) { 48 | std::cerr << "Option '--force' specified: re-building the index." << std::endl; 49 | } else { 50 | std::cerr << "Use option '--force' to re-build the index." 
<< std::endl; 51 | return; 52 | } 53 | } 54 | 55 | essentials::timer timer; 56 | timer.start(); 57 | differential_index_type index; 58 | typename differential_index_type::differential_builder builder(build_config); 59 | builder.build(index); 60 | index.print_stats(); 61 | timer.stop(); 62 | essentials::logger("DONE"); 63 | std::cout << "** building the index took " << timer.elapsed() << " seconds / " 64 | << timer.elapsed() / 60 << " minutes" << std::endl; 65 | 66 | essentials::logger("saving index to disk..."); 67 | essentials::save(index, output_filename.c_str()); 68 | essentials::logger("DONE"); 69 | } 70 | 71 | void meta_diff_color(build_configuration const& build_config, const bool force) // 72 | { 73 | std::string output_filename = build_config.index_filename_to_partition.substr( 74 | 0, build_config.index_filename_to_partition.length() - 75 | constants::fulgor_filename_extension.length() - 1) + 76 | "." + constants::meta_diff_colored_fulgor_filename_extension; 77 | 78 | if (std::filesystem::exists(output_filename)) { 79 | std::cerr << "An index with the name '" << output_filename << "' alreay exists." 80 | << std::endl; 81 | if (force) { 82 | std::cerr << "Option '--force' specified: re-building the index." << std::endl; 83 | } else { 84 | std::cerr << "Use option '--force' to re-build the index." << std::endl; 85 | return; 86 | } 87 | } 88 | 89 | /* first build a meta-colored Fulgor index */ 90 | meta_color(build_config, force); 91 | 92 | essentials::timer timer; 93 | timer.start(); 94 | build_configuration meta_diff_build_config = build_config; 95 | meta_diff_build_config.index_filename_to_partition = 96 | build_config.index_filename_to_partition.substr( 97 | 0, build_config.index_filename_to_partition.length() - 98 | constants::fulgor_filename_extension.length() - 1) + 99 | "." 
+ constants::meta_colored_fulgor_filename_extension; 100 | meta_differential_index_type index; 101 | typename meta_differential_index_type::meta_differential_builder builder( 102 | meta_diff_build_config); 103 | builder.build(index); 104 | index.print_stats(); 105 | timer.stop(); 106 | essentials::logger("DONE"); 107 | std::cout << "** building the index took " << timer.elapsed() << " seconds / " 108 | << timer.elapsed() / 60 << " minutes" << std::endl; 109 | 110 | essentials::logger("saving index to disk..."); 111 | essentials::save(index, output_filename.c_str()); 112 | essentials::logger("DONE"); 113 | } 114 | 115 | int build(int argc, char** argv) { 116 | cmd_line_parser::parser parser(argc, argv); 117 | parser.add("filenames_list", "Filenames list.", "-l", true); 118 | parser.add("file_base_name", "File basename.", "-o", true); 119 | parser.add("k", "K-mer length (must be <= " + std::to_string(kmer_type::max_k) + ").", "-k", 120 | true); 121 | parser.add("m", "Minimizer length (must be < k).", "-m", true); 122 | parser.add( 123 | "tmp_dirname", 124 | "Temporary directory used for construction in external memory. Default is directory '" + 125 | constants::default_tmp_dirname + "'.", 126 | "-d", false); 127 | parser.add("RAM", 128 | "RAM limit in GiB. 
Default value is " + 129 | std::to_string(constants::default_ram_limit_in_GiB) + ".", 130 | "-g", false); 131 | parser.add("num_threads", "Number of threads (default is 1).", "-t", false); 132 | parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); 133 | parser.add("check", "Check correctness after index construction (it might take some time).", 134 | "--check", false, true); 135 | parser.add("force", "Re-build the index even when an index with the same name is found.", 136 | "--force", false, true); 137 | parser.add("meta", "Build a meta-colored index.", "--meta", false, true); 138 | parser.add("diff", "Build a differential-colored index.", "--diff", false, true); 139 | 140 | if (!parser.parse()) return 1; 141 | util::print_cmd(argc, argv); 142 | 143 | build_configuration build_config; 144 | build_config.file_base_name = parser.get("file_base_name"); 145 | std::string output_filename = 146 | build_config.file_base_name + "." + constants::fulgor_filename_extension; 147 | build_config.index_filename_to_partition = output_filename; 148 | bool force = parser.get("force"); 149 | build_config.meta_colored = parser.get("meta"); 150 | build_config.diff_colored = parser.get("diff"); 151 | 152 | if (parser.parsed("tmp_dirname")) { 153 | build_config.tmp_dirname = parser.get("tmp_dirname"); 154 | essentials::create_directory(build_config.tmp_dirname); 155 | } 156 | if (parser.parsed("num_threads")) { 157 | build_config.num_threads = parser.get("num_threads"); 158 | } 159 | 160 | if (std::filesystem::exists(output_filename)) { 161 | std::cerr << "An index with the name '" << output_filename << "' alreay exists." 162 | << std::endl; 163 | if (force) { 164 | std::cerr << "Option '--force' specified: re-building the index." << std::endl; 165 | } else { 166 | std::cerr << "Use option '--force' to re-build the index." 
<< std::endl; 167 | if (build_config.meta_colored and build_config.diff_colored) { 168 | std::cerr << "Consider using: \"./fulgor color -i " << output_filename << " -d " 169 | << build_config.tmp_dirname << " -t " 170 | << std::to_string(build_config.num_threads) << " --diff --meta\"" 171 | << std::endl; 172 | } else if (build_config.meta_colored) { 173 | std::cerr << "Consider using: \"./fulgor color -i " << output_filename << " -d " 174 | << build_config.tmp_dirname << " -t " 175 | << std::to_string(build_config.num_threads) << " --meta\"" << std::endl; 176 | } else if (build_config.diff_colored) { 177 | std::cerr << "Consider using: \"./fulgor color -i " << output_filename << " -d " 178 | << build_config.tmp_dirname << " -t " 179 | << std::to_string(build_config.num_threads) << " --diff\"" << std::endl; 180 | } 181 | return 1; 182 | } 183 | } 184 | 185 | auto k = parser.get("k"); 186 | auto m = parser.get("m"); 187 | build_config.k = k; 188 | build_config.m = m; 189 | build_config.verbose = parser.get("verbose"); 190 | build_config.check = parser.get("check"); 191 | build_config.filenames_list = parser.get("filenames_list"); 192 | if (parser.get("RAM")) { 193 | build_config.ram_limit_in_GiB = parser.get("RAM"); 194 | } 195 | 196 | essentials::timer timer; 197 | timer.start(); 198 | 199 | index_type index; 200 | typename index_type::builder builder(build_config); 201 | builder.build(index); 202 | index.print_stats(); 203 | 204 | timer.stop(); 205 | essentials::logger("DONE"); 206 | std::cout << "** building the index took " << timer.elapsed() << " seconds / " 207 | << timer.elapsed() / 60 << " minutes" << std::endl; 208 | 209 | essentials::logger("saving index to disk..."); 210 | essentials::save(index, output_filename.c_str()); 211 | essentials::logger("DONE"); 212 | 213 | if (build_config.meta_colored and build_config.diff_colored) { 214 | meta_diff_color(build_config, force); 215 | } else if (build_config.meta_colored) { 216 | meta_color(build_config, force); 
217 | } else if (build_config.diff_colored) { 218 | diff_color(build_config, force); 219 | } 220 | 221 | return 0; 222 | } 223 | 224 | int color(int argc, char** argv) { 225 | cmd_line_parser::parser parser(argc, argv); 226 | parser.add("index_filename", "The Fulgor index filename to partition.", "-i", true); 227 | parser.add( 228 | "tmp_dirname", 229 | "Temporary directory used for construction in external memory. Default is directory '" + 230 | constants::default_tmp_dirname + "'.", 231 | "-d", false); 232 | parser.add("num_threads", "Number of threads (default is 1).", "-t", false); 233 | parser.add("verbose", "Verbose output during construction.", "--verbose", false, true); 234 | parser.add("check", "Check correctness after index construction (it might take some time).", 235 | "--check", false, true); 236 | parser.add("force", "Re-build the index even when an index with the same name is found.", 237 | "--force", false, true); 238 | parser.add("meta", "Build a meta-colored index.", "--meta", false, true); 239 | parser.add("diff", "Build a differential-colored index.", "--diff", false, true); 240 | 241 | if (!parser.parse()) return 1; 242 | util::print_cmd(argc, argv); 243 | 244 | build_configuration build_config; 245 | build_config.index_filename_to_partition = parser.get("index_filename"); 246 | if (!sshash::util::ends_with(build_config.index_filename_to_partition, 247 | "." + constants::fulgor_filename_extension)) { 248 | std::cerr << "Error: the file to partition must have extension \"." 249 | << constants::fulgor_filename_extension 250 | << "\". Have you first built a Fulgor index with the tool \"build\"?" 
251 | << std::endl; 252 | return 1; 253 | } 254 | 255 | if (parser.parsed("tmp_dirname")) { 256 | build_config.tmp_dirname = parser.get("tmp_dirname"); 257 | essentials::create_directory(build_config.tmp_dirname); 258 | } 259 | if (parser.parsed("num_threads")) { 260 | build_config.num_threads = parser.get("num_threads"); 261 | } 262 | build_config.check = parser.get("check"); 263 | build_config.meta_colored = parser.get("meta"); 264 | build_config.diff_colored = parser.get("diff"); 265 | build_config.verbose = parser.get("verbose"); 266 | bool force = parser.get("force"); 267 | 268 | if (build_config.meta_colored and build_config.diff_colored) { 269 | meta_diff_color(build_config, force); 270 | } else if (build_config.meta_colored) { 271 | meta_color(build_config, force); 272 | } else if (build_config.diff_colored) { 273 | diff_color(build_config, force); 274 | } else { 275 | std::cerr << "Either \"--meta\" or \"--diff\" should be specified." << std::endl; 276 | return 1; 277 | } 278 | 279 | return 0; 280 | } 281 | -------------------------------------------------------------------------------- /tools/fulgor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "external/sshash/external/gz/zip_stream.cpp" 5 | #include "external/sshash/src/build.cpp" 6 | #include "external/sshash/src/dictionary.cpp" 7 | #include "external/sshash/src/info.cpp" 8 | 9 | #include "include/index_types.hpp" 10 | #include "src/index.cpp" 11 | #include "src/color_sets.cpp" 12 | #include "external/sshash/external/pthash/external/cmd_line_parser/include/parser.hpp" 13 | 14 | #include "util.cpp" 15 | #include "build.cpp" 16 | #include "permute.cpp" 17 | #include "pseudoalign.cpp" 18 | 19 | int help(char* arg0) { 20 | std::cout << "== Fulgor: a colored de Bruijn graph index " 21 | "================================" 22 | << std::endl 23 | << std::endl; 24 | 25 | std::cout << "Usage: " << arg0 << " ...\n\n"; 26 | 27 | 
std::cout << "Tools:\n" 28 | << " build build a Fulgor index\n" 29 | << " pseudoalign perform pseudoalignment to a Fulgor index\n" 30 | << " stats print index statistics\n" 31 | << " print-filenames print all reference filenames\n" 32 | << std::endl; 33 | 34 | std::cout << "Advanced tools:\n" 35 | << " permute permute the reference names of a Fulgor index\n" 36 | << " dump write unitigs and color sets in text format\n" 37 | << " color build a meta- or a diff- or a meta-diff- Fulgor index\n" 38 | << std::endl; 39 | 40 | return 1; 41 | } 42 | 43 | int main(int argc, char** argv) { 44 | if (argc < 2) return help(argv[0]); 45 | 46 | auto tool = std::string(argv[1]); 47 | 48 | /* basic tools */ 49 | if (tool == "build") { 50 | return build(argc - 1, argv + 1); 51 | } else if (tool == "pseudoalign") { 52 | return pseudoalign(argc - 1, argv + 1); 53 | } else if (tool == "stats") { 54 | return stats(argc - 1, argv + 1); 55 | } else if (tool == "print-filenames") { 56 | return print_filenames(argc - 1, argv + 1); 57 | } 58 | 59 | /* advanced tools */ 60 | else if (tool == "permute") { 61 | return permute(argc - 1, argv + 1); 62 | } else if (tool == "dump") { 63 | return dump(argc - 1, argv + 1); 64 | } else if (tool == "color") { 65 | return color(argc - 1, argv + 1); 66 | } 67 | 68 | std::cout << "Unsupported tool '" << tool << "'.\n" << std::endl; 69 | 70 | return help(argv[0]); 71 | } 72 | -------------------------------------------------------------------------------- /tools/permute.cpp: -------------------------------------------------------------------------------- 1 | using namespace fulgor; 2 | 3 | int permute(int argc, char** argv) { 4 | cmd_line_parser::parser parser(argc, argv); 5 | parser.add("index_filename", 6 | "The Fulgor index filename from which we permute the reference names.", "-i", true); 7 | parser.add( 8 | "tmp_dirname", 9 | "Temporary directory used for construction in external memory. 
Default is directory '" + 10 | constants::default_tmp_dirname + "'.", 11 | "-d", false); 12 | parser.add("output_filename", "Output file where to save the permuted filenames.", "-o", true); 13 | if (!parser.parse()) return 1; 14 | util::print_cmd(argc, argv); 15 | 16 | build_configuration build_config; 17 | if (parser.parsed("tmp_dirname")) { 18 | build_config.tmp_dirname = parser.get("tmp_dirname"); 19 | essentials::create_directory(build_config.tmp_dirname); 20 | } 21 | 22 | auto index_filename = parser.get("index_filename"); 23 | 24 | if (!sshash::util::ends_with(index_filename, "." + constants::fulgor_filename_extension)) { 25 | std::cerr << "Error: the file to partition must have extension \"." 26 | << constants::fulgor_filename_extension 27 | << "\". Have you first built a Fulgor index with the tool \"build\"?" 28 | << std::endl; 29 | return 1; 30 | } 31 | 32 | essentials::timer timer; 33 | timer.start(); 34 | 35 | index_type index; 36 | essentials::logger("step 1. loading index to be partitioned..."); 37 | essentials::load(index, index_filename.c_str()); 38 | essentials::logger("DONE"); 39 | 40 | permuter p(build_config); 41 | p.permute(index); 42 | auto const& filenames = p.filenames(); 43 | 44 | std::ofstream out(parser.get("output_filename").c_str()); 45 | if (!out.is_open()) { 46 | std::cerr << "cannot open output filename" << std::endl; 47 | return 1; 48 | } 49 | for (auto const& fn : filenames) out << fn << '\n'; 50 | out.close(); 51 | 52 | timer.stop(); 53 | essentials::logger("DONE"); 54 | std::cout << "** permuting the reference names took " << timer.elapsed() << " seconds / " 55 | << timer.elapsed() / 60 << " minutes" << std::endl; 56 | 57 | return 0; 58 | } -------------------------------------------------------------------------------- /tools/pseudoalign.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "external/sshash/external/gz/zip_stream.hpp" 6 | 
#include "external/FQFeeder/include/FastxParser.hpp" 7 | #include "external/FQFeeder/src/FastxParser.cpp" 8 | 9 | #include "src/ps_full_intersection.cpp" 10 | #include "src/ps_threshold_union.cpp" 11 | 12 | using namespace fulgor; 13 | 14 | enum class pseudoalignment_algorithm : uint8_t { FULL_INTERSECTION, THRESHOLD_UNION }; 15 | 16 | std::string to_string(pseudoalignment_algorithm algo, double threshold) { 17 | std::string o; 18 | switch (algo) { 19 | case pseudoalignment_algorithm::FULL_INTERSECTION: 20 | o = "full-intersection"; 21 | break; 22 | case pseudoalignment_algorithm::THRESHOLD_UNION: 23 | o = "threshold-union (threshold = " + std::to_string(threshold) + ")"; 24 | break; 25 | } 26 | return o; 27 | } 28 | 29 | template 30 | int pseudoalign(FulgorIndex const& index, fastx_parser::FastxParser& rparser, 31 | std::atomic& num_reads, std::atomic& num_mapped_reads, 32 | pseudoalignment_algorithm algo, const double threshold, std::ofstream& out_file, 33 | std::mutex& iomut, std::mutex& ofile_mut) // 34 | { 35 | std::vector colors; // result of pseudoalignment 36 | std::stringstream ss; 37 | uint64_t buff_size = 0; 38 | constexpr uint64_t buff_thresh = 50; 39 | 40 | auto rg = rparser.getReadGroup(); 41 | while (rparser.refill(rg)) { 42 | for (auto const& record : rg) { 43 | switch (algo) { 44 | case pseudoalignment_algorithm::FULL_INTERSECTION: 45 | index.pseudoalign_full_intersection(record.seq, colors); 46 | break; 47 | case pseudoalignment_algorithm::THRESHOLD_UNION: 48 | index.pseudoalign_threshold_union(record.seq, colors, threshold); 49 | break; 50 | default: 51 | break; 52 | } 53 | buff_size += 1; 54 | if (!colors.empty()) { 55 | num_mapped_reads += 1; 56 | ss << record.name << '\t' << colors.size(); 57 | for (auto c : colors) { ss << "\t" << c; } 58 | ss << '\n'; 59 | } else { 60 | ss << record.name << "\t0\n"; 61 | } 62 | num_reads += 1; 63 | colors.clear(); 64 | if (num_reads > 0 and num_reads % 1000000 == 0) { 65 | iomut.lock(); 66 | std::cout << 
"mapped " << num_reads << " reads" << std::endl; 67 | iomut.unlock(); 68 | } 69 | if (buff_size > buff_thresh) { 70 | std::string outs = ss.str(); 71 | ss.str(""); 72 | ofile_mut.lock(); 73 | out_file.write(outs.data(), outs.size()); 74 | ofile_mut.unlock(); 75 | buff_size = 0; 76 | } 77 | } 78 | } 79 | 80 | // dump anything left in the buffer 81 | if (buff_size > 0) { 82 | std::string outs = ss.str(); 83 | ss.str(""); 84 | ofile_mut.lock(); 85 | out_file.write(outs.data(), outs.size()); 86 | ofile_mut.unlock(); 87 | buff_size = 0; 88 | } 89 | 90 | return 0; 91 | } 92 | 93 | template 94 | int pseudoalign(std::string const& index_filename, std::string const& query_filename, 95 | std::string const& output_filename, uint64_t num_threads, double threshold, 96 | pseudoalignment_algorithm ps_alg, const bool verbose) { 97 | FulgorIndex index; 98 | if (verbose) essentials::logger("loading index from disk..."); 99 | essentials::load(index, index_filename.c_str()); 100 | if (verbose) essentials::logger("DONE"); 101 | 102 | std::cerr << "query mode : " << to_string(ps_alg, threshold) << "\n"; 103 | 104 | std::ifstream is(query_filename.c_str()); 105 | if (!is.good()) { 106 | std::cerr << "error in opening the file '" + query_filename + "'" << std::endl; 107 | return 1; 108 | } 109 | 110 | if (verbose) essentials::logger("performing queries from file '" + query_filename + "'..."); 111 | essentials::timer t; 112 | t.start(); 113 | 114 | std::atomic num_mapped_reads{0}; 115 | std::atomic num_reads{0}; 116 | 117 | auto query_filenames = std::vector({query_filename}); 118 | assert(num_threads >= 2); 119 | fastx_parser::FastxParser rparser(query_filenames, num_threads, 120 | num_threads - 1); 121 | 122 | rparser.start(); 123 | std::vector workers; 124 | std::mutex iomut; 125 | std::mutex ofile_mut; 126 | 127 | std::ofstream out_file; 128 | out_file.open(output_filename, std::ios::out | std::ios::trunc); 129 | if (!out_file) { 130 | std::cerr << "could not open output file " + 
output_filename << std::endl; 131 | return 1; 132 | } 133 | 134 | for (uint64_t i = 1; i != num_threads; ++i) { 135 | workers.push_back(std::thread([&index, &rparser, &num_reads, &num_mapped_reads, ps_alg, 136 | threshold, &out_file, &iomut, &ofile_mut]() { 137 | pseudoalign(index, rparser, num_reads, num_mapped_reads, ps_alg, threshold, out_file, 138 | iomut, ofile_mut); 139 | })); 140 | } 141 | 142 | for (auto& w : workers) { w.join(); } 143 | rparser.stop(); 144 | 145 | t.stop(); 146 | if (verbose) essentials::logger("DONE"); 147 | 148 | if (verbose) { 149 | std::cout << "mapped " << num_reads << " reads" << std::endl; 150 | std::cout << "elapsed = " << t.elapsed() << " millisec / "; 151 | std::cout << t.elapsed() / 1000 << " sec / "; 152 | std::cout << t.elapsed() / 1000 / 60 << " min / "; 153 | std::cout << (t.elapsed() * 1000) / num_reads << " musec/read" << std::endl; 154 | std::cout << "num_mapped_reads " << num_mapped_reads << "/" << num_reads << " (" 155 | << (num_mapped_reads * 100.0) / num_reads << "%)" << std::endl; 156 | } 157 | 158 | return 0; 159 | } 160 | 161 | int pseudoalign(int argc, char** argv) { 162 | cmd_line_parser::parser parser(argc, argv); 163 | 164 | parser.add("index_filename", "The Fulgor index filename.", "-i", true); 165 | parser.add("query_filename", "Query filename in FASTA/FASTQ format (optionally gzipped).", "-q", 166 | true); 167 | parser.add("output_filename", 168 | "File where output will be written. You can specify \"/dev/stdout\" to write " 169 | "output to stdout. In this case, it is also recommended to use the --verbose flag " 170 | "to avoid printing status messages to stdout.", 171 | "-o", true); 172 | parser.add("num_threads", "Number of threads (default is 1).", "-t", false); 173 | parser.add("verbose", "Verbose output during query (default is false).", "--verbose", false, 174 | true); 175 | parser.add("threshold", 176 | "Threshold for threshold_union algorithm. 
It must be a float in (0.0,1.0].", "-r", 177 | false); 178 | if (!parser.parse()) return 1; 179 | 180 | auto index_filename = parser.get("index_filename"); 181 | auto query_filename = parser.get("query_filename"); 182 | auto output_filename = parser.get("output_filename"); 183 | 184 | uint64_t num_threads = 1; 185 | if (parser.parsed("num_threads")) num_threads = parser.get("num_threads"); 186 | if (num_threads == 1) { 187 | num_threads += 1; 188 | std::cerr 189 | << "1 thread was specified, but an additional thread will be allocated for parsing" 190 | << std::endl; 191 | } 192 | 193 | double threshold = constants::invalid_threshold; 194 | if (parser.parsed("threshold")) threshold = parser.get("threshold"); 195 | if (threshold == 0.0 or threshold > 1.0) { 196 | std::cerr << "threshold must be a float in (0.0,1.0]" << std::endl; 197 | return 1; 198 | } 199 | 200 | auto ps_alg = pseudoalignment_algorithm::FULL_INTERSECTION; 201 | if (threshold != constants::invalid_threshold) { 202 | ps_alg = pseudoalignment_algorithm::THRESHOLD_UNION; 203 | } 204 | 205 | bool verbose = parser.get("verbose"); 206 | 207 | if (verbose) util::print_cmd(argc, argv); 208 | 209 | if (sshash::util::ends_with(index_filename, 210 | constants::meta_diff_colored_fulgor_filename_extension)) { 211 | return pseudoalign(index_filename, query_filename, 212 | output_filename, num_threads, threshold, 213 | ps_alg, verbose); 214 | } else if (sshash::util::ends_with(index_filename, 215 | constants::meta_colored_fulgor_filename_extension)) { 216 | return pseudoalign(index_filename, query_filename, output_filename, 217 | num_threads, threshold, ps_alg, verbose); 218 | } else if (sshash::util::ends_with(index_filename, 219 | constants::diff_colored_fulgor_filename_extension)) { 220 | return pseudoalign(index_filename, query_filename, output_filename, 221 | num_threads, threshold, ps_alg, verbose); 222 | } else if (sshash::util::ends_with(index_filename, constants::fulgor_filename_extension)) { 223 | return 
pseudoalign(index_filename, query_filename, output_filename, num_threads, 224 | threshold, ps_alg, verbose); 225 | } 226 | 227 | std::cerr << "Wrong index filename supplied." << std::endl; 228 | 229 | return 1; 230 | } 231 | -------------------------------------------------------------------------------- /tools/util.cpp: -------------------------------------------------------------------------------- 1 | using namespace fulgor; 2 | 3 | bool is_meta(std::string const& index_filename) { 4 | return sshash::util::ends_with(index_filename, 5 | constants::meta_colored_fulgor_filename_extension); 6 | } 7 | 8 | bool is_meta_diff(std::string const& index_filename) { 9 | return sshash::util::ends_with(index_filename, 10 | constants::meta_diff_colored_fulgor_filename_extension); 11 | } 12 | 13 | bool is_diff(std::string const& index_filename) { 14 | return sshash::util::ends_with(index_filename, 15 | constants::diff_colored_fulgor_filename_extension); 16 | } 17 | 18 | bool is_hybrid(std::string const& index_filename) { 19 | return sshash::util::ends_with(index_filename, constants::fulgor_filename_extension); 20 | } 21 | 22 | template 23 | void print_stats(std::string const& index_filename) { 24 | FulgorIndex index; 25 | essentials::logger("loading index from disk..."); 26 | essentials::load(index, index_filename.c_str()); 27 | essentials::logger("DONE"); 28 | index.print_stats(); 29 | } 30 | 31 | template 32 | void print_filenames(std::string const& index_filename) { 33 | FulgorIndex index; 34 | essentials::logger("loading index from disk..."); 35 | essentials::load(index, index_filename.c_str()); 36 | essentials::logger("DONE"); 37 | for (uint64_t i = 0; i != index.num_colors(); ++i) { 38 | std::cout << i << '\t' << index.filename(i) << '\n'; 39 | } 40 | } 41 | 42 | template 43 | void dump(std::string const& index_filename, std::string const& basename) { 44 | FulgorIndex index; 45 | essentials::logger("loading index from disk..."); 46 | essentials::load(index, 
index_filename.c_str()); 47 | essentials::logger("DONE"); 48 | index.dump(basename); 49 | } 50 | 51 | int stats(int argc, char** argv) { 52 | cmd_line_parser::parser parser(argc, argv); 53 | parser.add("index_filename", "The Fulgor index filename.", "-i", true); 54 | if (!parser.parse()) return 1; 55 | util::print_cmd(argc, argv); 56 | auto index_filename = parser.get("index_filename"); 57 | if (is_meta(index_filename)) { 58 | print_stats(index_filename); 59 | } else if (is_meta_diff(index_filename)) { 60 | print_stats(index_filename); 61 | } else if (is_diff(index_filename)) { 62 | print_stats(index_filename); 63 | } else if (is_hybrid(index_filename)) { 64 | print_stats(index_filename); 65 | } else { 66 | std::cerr << "Wrong filename supplied." << std::endl; 67 | return 1; 68 | } 69 | return 0; 70 | } 71 | 72 | int print_filenames(int argc, char** argv) { 73 | cmd_line_parser::parser parser(argc, argv); 74 | parser.add("index_filename", "The Fulgor index filename.", "-i", true); 75 | if (!parser.parse()) return 1; 76 | util::print_cmd(argc, argv); 77 | auto index_filename = parser.get("index_filename"); 78 | if (is_meta_diff(index_filename)) { 79 | print_filenames(index_filename); 80 | } else if (is_meta(index_filename)) { 81 | print_filenames(index_filename); 82 | } else if (is_diff(index_filename)) { 83 | print_filenames(index_filename); 84 | } else if (is_hybrid(index_filename)) { 85 | print_filenames(index_filename); 86 | } else { 87 | std::cerr << "Wrong filename supplied." 
<< std::endl; 88 | return 1; 89 | } 90 | return 0; 91 | } 92 | 93 | int dump(int argc, char** argv) { 94 | cmd_line_parser::parser parser(argc, argv); 95 | parser.add("index_filename", "The Fulgor index filename.", "-i", true); 96 | if (!parser.parse()) return 1; 97 | util::print_cmd(argc, argv); 98 | auto index_filename = parser.get("index_filename"); 99 | if (is_meta_diff(index_filename)) { 100 | std::string basename{index_filename.data(), 101 | index_filename.length() - 102 | constants::meta_diff_colored_fulgor_filename_extension.length() - 103 | 1}; 104 | dump(index_filename, basename); 105 | } else if (is_meta(index_filename)) { 106 | std::string basename{index_filename.data(), 107 | index_filename.length() - 108 | constants::meta_colored_fulgor_filename_extension.length() - 1}; 109 | dump(index_filename, basename); 110 | } else if (is_diff(index_filename)) { 111 | std::string basename{index_filename.data(), 112 | index_filename.length() - 113 | constants::diff_colored_fulgor_filename_extension.length() - 1}; 114 | dump(index_filename, basename); 115 | } else if (is_hybrid(index_filename)) { 116 | std::string basename{ 117 | index_filename.data(), 118 | index_filename.length() - constants::fulgor_filename_extension.length() - 1}; 119 | dump(index_filename, basename); 120 | } else { 121 | std::cerr << "Wrong filename supplied." << std::endl; 122 | return 1; 123 | } 124 | return 0; 125 | } 126 | --------------------------------------------------------------------------------