├── .clang-format ├── .clang-tidy ├── .github └── workflows │ ├── ci_linux.yaml │ └── clang-format.yaml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── benchmark └── run_benchmark.sh ├── doc └── Doxyfile.in ├── include ├── popvcf └── popvcf.hpp ├── src ├── CMakeLists.txt ├── decode.cpp ├── decode.hpp ├── encode.cpp ├── encode.hpp ├── in.constants.hpp ├── io.hpp ├── main.cpp ├── sequence_utils.cpp └── sequence_utils.hpp └── test └── create_test_data.sh /.clang-format: -------------------------------------------------------------------------------- 1 | # run on whole project with: 2 | # find src/ test/ -name "*.hpp" -o -name "*.cpp" | xargs clang-format -i --style=file 3 | --- 4 | Language: Cpp 5 | BasedOnStyle: LLVM 6 | AlignOperands: AlignAfterOperator 7 | AllowAllArgumentsOnNextLine: false 8 | AllowShortEnumsOnASingleLine: false 9 | AllowShortFunctionsOnASingleLine: None 10 | AlwaysBreakAfterDefinitionReturnType: None 11 | AlwaysBreakAfterReturnType: None 12 | AlwaysBreakBeforeMultilineStrings: true 13 | AlwaysBreakTemplateDeclarations: Yes 14 | BinPackArguments: false 15 | BinPackParameters: false 16 | BreakBeforeBraces: Allman 17 | BraceWrapping: 18 | AfterCaseLabel: true 19 | AfterClass: true 20 | AfterControlStatement: Always 21 | AfterEnum: true 22 | AfterFunction: true 23 | AfterNamespace: true 24 | AfterObjCDeclaration: true 25 | AfterStruct: true 26 | AfterUnion: true 27 | AfterExternBlock: true 28 | BeforeCatch: true 29 | BeforeElse: true 30 | BeforeLambdaBody: true 31 | BeforeWhile: true 32 | IndentBraces: false 33 | SplitEmptyFunction: false 34 | SplitEmptyRecord: false 35 | SplitEmptyNamespace: false 36 | BreakBeforeInheritanceComma: true 37 | BreakConstructorInitializers: AfterColon 38 | BreakInheritanceList: AfterColon 39 | ColumnLimit: 120 40 | CommentPragmas: \/\/! 
41 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 42 | ConstructorInitializerIndentWidth: 2 43 | ContinuationIndentWidth: 2 44 | DeriveLineEnding: false 45 | IncludeBlocks: Regroup 46 | IncludeCategories: 47 | - Regex: '^<(sys/)?[a-z_]+(\.h)?>$' 48 | Priority: 1 49 | - Regex: '^ 3 | -*, 4 | performance-*, 5 | bugprone-*, 6 | -bugprone-narrowing-conversions, 7 | # readability-*, 8 | # -readability-magic-numbers, 9 | # -readability-qualified-auto, 10 | # -readability-braces-around-statements, 11 | # -readability-uppercase-literal-suffix, 12 | # -readability-avoid-const-params-in-decls, 13 | # -readability-function-size, 14 | # -readability-function-cognitive-complexity, 15 | # -readability-container-size-empty 16 | # clang-analyzer-*, 17 | # -clang-analyzer-osx*, 18 | # modernize-*, 19 | # -modernize-use-trailing-return-type, 20 | 21 | WarningsAsErrors: '' 22 | #HeaderFilterRegex: 'graphtyper/' 23 | AnalyzeTemporaryDtors: false 24 | FormatStyle: none 25 | CheckOptions: 26 | - { key: readability-identifier-naming.NamespaceCase, value: lower_case } 27 | ... 
28 | -------------------------------------------------------------------------------- /.github/workflows/ci_linux.yaml: -------------------------------------------------------------------------------- 1 | name: CI on Linux 2 | 3 | on: 4 | # Trigger workflow on pull requests of any branch 5 | pull_request: 6 | # Trigger workflow on pushes to following branches 7 | push: 8 | branches: 9 | - master 10 | - dev 11 | 12 | env: 13 | TZ: Atlantic/Reykjavik 14 | 15 | defaults: 16 | run: 17 | shell: bash -ex {0} 18 | 19 | jobs: 20 | build: 21 | name: ${{ matrix.name }} 22 | runs-on: ubuntu-18.04 23 | timeout-minutes: 120 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | include: 28 | - name: "GCC8 Release" 29 | pkg: "g++-8" 30 | cxx: "g++-8" 31 | cc: "gcc-8" 32 | build_type: Release 33 | 34 | - name: "GCC9 Debug" 35 | pkg: "g++-9" 36 | cxx: "g++-9" 37 | cc: "gcc-9" 38 | build_type: Debug 39 | 40 | - name: "GCC10 Debug C++20" 41 | pkg: "g++-10" 42 | cxx: "g++-10" 43 | cc: "gcc-10" 44 | cxx_flags: "-std=c++20" 45 | build_type: Debug 46 | 47 | - name: "GCC11 Release C++20" 48 | pkg: "g++-11" 49 | cxx: "g++-11" 50 | cc: "gcc-11" 51 | cxx_flags: "-std=c++20" 52 | build_type: Release 53 | 54 | - name: "Clang10 Debug C++20" 55 | pkg: "clang-10" 56 | cxx: "clang++-10" 57 | cc: "clang-10" 58 | cxx_flags: "-std=c++20" 59 | build_type: Debug 60 | 61 | - name: "Clang12 Release C++20" 62 | pkg: "clang-12" 63 | cxx: "clang++-12" 64 | cc: "clang-12" 65 | cxx_flags: "-std=c++20" 66 | build_type: Release 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v2 70 | with: 71 | path: src 72 | fetch-depth: 2 73 | submodules: recursive 74 | 75 | - name: Add package source 76 | run: | 77 | echo 'APT::Acquire::Retries "5";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null 78 | sudo add-apt-repository --no-update --yes ppa:ubuntu-toolchain-r/ppa 79 | sudo add-apt-repository --no-update --yes ppa:ubuntu-toolchain-r/test 80 | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | 
sudo apt-key add - 81 | sudo add-apt-repository --no-update --yes "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" 82 | sudo add-apt-repository --no-update --yes "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main" 83 | sudo add-apt-repository --no-update --yes "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-12 main" 84 | sudo apt-get update 85 | 86 | - name: Install Build dependencies 87 | run: sudo apt-get install --yes cmake ccache 88 | 89 | - name: Install compiler ${{ matrix.name }} 90 | run: sudo apt-get install --yes ${{ matrix.pkg }} 91 | 92 | - name: Install zstd 93 | run: sudo apt-get install --yes zstd libzstd-dev 94 | 95 | - name: Load ccache 96 | uses: actions/cache@v2 97 | with: 98 | path: .ccache 99 | key: ${{ runner.os }}-${{ matrix.name }}-ccache-${{ github.ref }}-${{ github.run_number }} 100 | # Restoring: From current branch, otherwise from base branch, otherwise from any branch. 101 | restore-keys: | 102 | ${{ runner.os }}-${{ matrix.name }}-ccache-${{ github.ref }} 103 | ${{ runner.os }}-${{ matrix.name }}-ccache-${{ github.base_ref }} 104 | ${{ runner.os }}-${{ matrix.name }}-ccache- 105 | 106 | - name: Tool versions 107 | run: | 108 | env cmake --version 109 | env ${{ matrix.cxx }} --version 110 | 111 | - name: Configure tests 112 | env: 113 | CXX: ${{ matrix.cxx }} 114 | CC: ${{ matrix.cc }} 115 | run: | 116 | mkdir build 117 | cd build 118 | cmake ../src -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_FLAGS="${{ matrix.cxx_flags }}" 119 | 120 | - name: Build tests 121 | env: 122 | CCACHE_BASEDIR: ${{ github.workspace }} 123 | CCACHE_DIR: ${{ github.workspace }}/.ccache 124 | CCACHE_COMPRESS: true 125 | CCACHE_COMPRESSLEVEL: 6 126 | CCACHE_MAXSIZE: 500M 127 | run: | 128 | ccache -p || true 129 | cd build 130 | make -k -j2 131 | ccache -s || true 132 | 133 | - name: Run tests 134 | run: | 135 | cd build 136 | make test 137 | -------------------------------------------------------------------------------- 
/.github/workflows/clang-format.yaml: -------------------------------------------------------------------------------- 1 | name: Format-check 2 | 3 | on: 4 | # Trigger workflow on pull requests of any branch 5 | pull_request: 6 | # Trigger workflow on pushes to following branches 7 | push: 8 | branches: 9 | - master 10 | - dev 11 | 12 | env: 13 | TZ: Atlantic/Reykjavik 14 | 15 | defaults: 16 | run: 17 | shell: bash -ex {0} 18 | 19 | jobs: 20 | build: 21 | name: ${{ matrix.name }} 22 | runs-on: ubuntu-18.04 23 | timeout-minutes: 120 24 | strategy: 25 | fail-fast: true 26 | matrix: 27 | include: 28 | - name: "Clang12 Release C++20" 29 | pkg: "clang-12" 30 | format: "clang-format-12" 31 | cxx: "clang++-12" 32 | cc: "clang-12" 33 | cxx_flags: "-std=c++20" 34 | build_type: Release 35 | steps: 36 | - name: Checkout 37 | uses: actions/checkout@v2 38 | with: 39 | path: src 40 | fetch-depth: 2 41 | submodules: recursive 42 | 43 | - name: Add package source 44 | run: | 45 | echo 'APT::Acquire::Retries "5";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null 46 | sudo add-apt-repository --no-update --yes ppa:ubuntu-toolchain-r/ppa 47 | sudo add-apt-repository --no-update --yes ppa:ubuntu-toolchain-r/test 48 | wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - 49 | sudo add-apt-repository --no-update --yes "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-12 main" 50 | sudo apt-get update 51 | 52 | - name: Install CMake 53 | run: sudo apt-get install --yes cmake 54 | 55 | - name: Install clang-format 56 | run: sudo apt-get install --yes ${{ matrix.pkg }} ${{ matrix.format }} 57 | 58 | - name: Install zstd 59 | run: sudo apt-get install --yes zstd libzstd-dev 60 | 61 | - name: Tool versions 62 | run: | 63 | env cmake --version 64 | env ${{ matrix.cxx }} --version 65 | env ${{ matrix.format }} --version 66 | - name: Configure tests 67 | env: 68 | CXX: ${{ matrix.cxx }} 69 | CC: ${{ matrix.cc }} 70 | run: | 71 | mkdir build 72 | cd build 73 | 
cmake ../src -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_FLAGS="${{ matrix.cxx_flags }}" 74 | - name: Run tests 75 | run: | 76 | cd build 77 | make check_format 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.* 2 | /.git* 3 | /build*/ 4 | *vcf 5 | *.gz 6 | *.popvcf 7 | *.popvcf.gz 8 | /benchmark/.* 9 | *.tbi 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "submodules/parallel-hashmap"] 2 | path = submodules/parallel-hashmap 3 | url = https://github.com/greg7mdp/parallel-hashmap.git 4 | ignore = dirty 5 | [submodule "submodules/paw"] 6 | path = submodules/paw 7 | url = https://github.com/hannespetur/paw.git 8 | ignore = dirty 9 | [submodule "submodules/htslib"] 10 | path = submodules/htslib 11 | url = https://github.com/samtools/htslib.git 12 | ignore = dirty 13 | [submodule "submodules/libdeflate"] 14 | path = submodules/libdeflate 15 | url = https://github.com/ebiggers/libdeflate.git 16 | ignore = dirty 17 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.8) 2 | project (popvcf LANGUAGES C CXX) 3 | 4 | include(ExternalProject) 5 | 6 | # Build popvcf in release by default 7 | if(NOT CMAKE_BUILD_TYPE) 8 | set(CMAKE_BUILD_TYPE "RELEASE" CACHE STRING "Choose the type of build." FORCE) 9 | endif() 10 | 11 | # Build paw (dependency) in release by default 12 | if(NOT PAW_CMAKE_BUILD_TYPE) 13 | set(PAW_CMAKE_BUILD_TYPE "RELEASE" CACHE STRING "Choose the type of build for paw." 
FORCE) 14 | endif() 15 | 16 | message (STATUS "Build type: ${CMAKE_BUILD_TYPE}") 17 | 18 | set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake/") 19 | set (STATIC_DIR "" CACHE STRING "Build in 'static' mode and include libraries in this directory.") 20 | 21 | ############ 22 | ## popvcf ## 23 | ############ 24 | set (popvcf_VERSION_MAJOR 1) 25 | set (popvcf_VERSION_MINOR 1) 26 | set (popvcf_VERSION_PATCH 1) 27 | set (popvcf_VERSION ${popvcf_VERSION_MAJOR}.${popvcf_VERSION_MINOR}.${popvcf_VERSION_PATCH}) 28 | 29 | add_subdirectory(src) # Exposes "popvcf_sources", which contains all source files of popvcf 30 | add_library(popvcf_objects OBJECT ${popvcf_sources}) 31 | 32 | target_compile_features(popvcf_objects PUBLIC cxx_std_17) 33 | target_compile_options(popvcf_objects PUBLIC -Wall -Wextra -Wfatal-errors -pedantic -Wno-variadic-macros -march=x86-64 -mtune=generic) 34 | add_dependencies(popvcf_objects project_paw) # popvcf depends on building paw 35 | 36 | target_include_directories(popvcf_objects PUBLIC 37 | ${CMAKE_CURRENT_SOURCE_DIR}/src 38 | ${CMAKE_CURRENT_BINARY_DIR}/include/popvcf) 39 | 40 | # Add popvcf executable, built from main.cpp plus the objects of the popvcf_objects OBJECT library 41 | add_executable(popvcf src/main.cpp $<TARGET_OBJECTS:popvcf_objects>) 42 | 43 | # Inherit include directories, compile features and compile options from popvcf_objects 44 | target_include_directories(popvcf PRIVATE $<TARGET_PROPERTY:popvcf_objects,INCLUDE_DIRECTORIES>) 45 | target_compile_features(popvcf PRIVATE $<TARGET_PROPERTY:popvcf_objects,COMPILE_FEATURES>) 46 | target_compile_options(popvcf PRIVATE $<TARGET_PROPERTY:popvcf_objects,COMPILE_OPTIONS>) 47 | 48 | # configure a header file to pass some of the CMake settings to the source code 49 | configure_file ( 50 | ${PROJECT_SOURCE_DIR}/src/in.constants.hpp 51 | ${PROJECT_BINARY_DIR}/include/popvcf/constants.hpp) 52 | 53 | target_include_directories(popvcf_objects PUBLIC ${PROJECT_BINARY_DIR}/include) 54 | 55 | ######################## 56 | ## Build dependencies ## 57 | ######################## 58 | ### paw ### 59 | if (CMAKE_BUILD_TYPE STREQUAL "DEBUG") 60 | message (STATUS "paw in debug mode") 61 | 62 | ExternalProject_Add( 63 | project_paw 64 | SOURCE_DIR
${CMAKE_CURRENT_SOURCE_DIR}/submodules/paw 65 | BUILD_IN_SOURCE 0 66 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/paw 67 | CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -H${CMAKE_CURRENT_SOURCE_DIR}/submodules/paw -B${CMAKE_CURRENT_BINARY_DIR}/paw -DCMAKE_BUILD_TYPE=${PAW_CMAKE_BUILD_TYPE} -DFORCE_AVX_FAST=1 68 | BUILD_COMMAND $(MAKE) -C ${CMAKE_CURRENT_BINARY_DIR}/paw static 69 | INSTALL_COMMAND "") 70 | else() 71 | message (STATUS "paw in non-debug mode") 72 | 73 | ExternalProject_Add( 74 | project_paw 75 | SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/submodules/paw 76 | BUILD_IN_SOURCE 0 77 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/paw 78 | CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -H${CMAKE_CURRENT_SOURCE_DIR}/submodules/paw -B${CMAKE_CURRENT_BINARY_DIR}/paw -DCMAKE_BUILD_TYPE=${PAW_CMAKE_BUILD_TYPE} -DFORCE_AVX_FAST=1 79 | BUILD_COMMAND $(MAKE) -C ${CMAKE_CURRENT_BINARY_DIR}/paw static 80 | INSTALL_COMMAND "") 81 | endif() 82 | 83 | add_library(paw STATIC IMPORTED) 84 | target_include_directories(popvcf_objects SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/submodules/paw/include) 85 | target_include_directories(popvcf_objects SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/paw/include) 86 | set(paw_location ${CMAKE_CURRENT_BINARY_DIR}/paw/lib/libpaw.a) 87 | message(STATUS "paw target location is ${paw_location}") 88 | set_property(TARGET paw PROPERTY IMPORTED_LOCATION ${paw_location}) 89 | add_dependencies(paw project_paw) 90 | add_dependencies(popvcf_objects paw) 91 | target_link_libraries(popvcf PUBLIC paw) 92 | 93 | ### htslib 94 | if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/submodules/htslib/Makefile) 95 | message(FATAL_ERROR "htslib not found. 
Run 'git submodule update --init' to fetch all submodules.") 96 | endif() 97 | 98 | ExternalProject_Add( 99 | project_htslib 100 | BUILD_IN_SOURCE 1 101 | SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/htslib 102 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/htslib 103 | CONFIGURE_COMMAND cp -a ${CMAKE_CURRENT_SOURCE_DIR}/submodules/htslib ${CMAKE_CURRENT_BINARY_DIR}/ COMMAND autoheader COMMAND autoconf COMMAND ${CMAKE_CURRENT_BINARY_DIR}/htslib/configure --disable-libcurl --disable-gcs --disable-lzma --disable-bz2 --with-libdeflate 104 | "CFLAGS=${MYCFLAGS} -g -Wall -O3 ${CMAKE_C_FLAGS} -I${CMAKE_CURRENT_BINARY_DIR}/libdeflate" 105 | "LDFLAGS=${MYLDFLAGS} -L${CMAKE_CURRENT_BINARY_DIR}/libdeflate" 106 | "CC=${CMAKE_C_COMPILER}" 107 | BUILD_COMMAND $(MAKE) -C ${CMAKE_CURRENT_BINARY_DIR}/htslib libhts.a 108 | INSTALL_COMMAND "" 109 | ) 110 | 111 | add_library(htslib STATIC IMPORTED) 112 | 113 | target_include_directories(popvcf_objects SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/htslib) 114 | 115 | set(htslib_location ${CMAKE_CURRENT_BINARY_DIR}/htslib/libhts.a) 116 | message(STATUS "htslib target location is ${htslib_location}") 117 | set_property(TARGET htslib PROPERTY IMPORTED_LOCATION ${htslib_location}) 118 | add_dependencies(htslib project_htslib) 119 | add_dependencies(project_htslib libdeflate) 120 | add_dependencies(popvcf_objects htslib) 121 | target_link_libraries(popvcf PUBLIC ${htslib_location}) 122 | 123 | ### libdeflate 124 | if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/submodules/libdeflate/Makefile) 125 | message(FATAL_ERROR "libdeflate not found. 
Run 'git submodule update --init' to fetch all submodules.") 126 | endif () 127 | 128 | ExternalProject_Add( 129 | project_libdeflate 130 | BUILD_IN_SOURCE 1 131 | SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/libdeflate 132 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/libdeflate 133 | CONFIGURE_COMMAND cp -a ${CMAKE_CURRENT_SOURCE_DIR}/submodules/libdeflate ${CMAKE_CURRENT_BINARY_DIR}/ 134 | BUILD_COMMAND $(MAKE) -C ${CMAKE_CURRENT_BINARY_DIR}/libdeflate 135 | PREFIX="${CMAKE_CURRENT_BINARY_DIR}/libdeflate" 136 | "CC=${CMAKE_C_COMPILER}" 137 | "CFLAGS=-fPIC -O3" libdeflate.a 138 | INSTALL_COMMAND "" 139 | ) 140 | 141 | add_library(libdeflate STATIC IMPORTED) 142 | set(libdeflate_location ${CMAKE_CURRENT_BINARY_DIR}/libdeflate/libdeflate.a) 143 | message(STATUS "libdeflate target location is ${libdeflate_location}") 144 | 145 | set_property(TARGET libdeflate PROPERTY IMPORTED_LOCATION ${libdeflate_location}) 146 | add_dependencies(libdeflate project_libdeflate) 147 | target_link_libraries(popvcf PUBLIC libdeflate) 148 | 149 | ### parallel_hashmap ### 150 | target_include_directories(popvcf_objects SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/submodules/parallel-hashmap) 151 | 152 | ### threads ### 153 | if (STATIC_DIR STREQUAL "") 154 | find_package(Threads) 155 | target_link_libraries(popvcf PUBLIC ${CMAKE_THREAD_LIBS_INIT}) 156 | else() 157 | target_link_libraries(popvcf PUBLIC "${STATIC_DIR}/libpthread.a") 158 | endif() 159 | 160 | ### rt and filesystem ### 161 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 162 | message(STATUS "Using GCC") 163 | 164 | if (STATIC_DIR STREQUAL "") 165 | target_link_libraries(popvcf PUBLIC "rt") 166 | # target_link_libraries(popvcf PUBLIC "stdc++fs") 167 | else() 168 | target_link_libraries(popvcf PUBLIC "${STATIC_DIR}/librt.a") 169 | endif() 170 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 171 | message(STATUS "Using Clang") 172 | else() 173 | message(WARNING "Unsupported compiler") 174 | endif () 175 | 176 | ### zlib ### 177 | message 
(STATUS "Checking for zlib") 178 | find_package(ZLIB REQUIRED) 179 | target_include_directories(popvcf_objects SYSTEM PUBLIC ${ZLIB_INCLUDE_DIRS}) 180 | 181 | if (STATIC_DIR STREQUAL "") 182 | target_link_libraries(popvcf PUBLIC ${ZLIB_LIBRARIES}) 183 | else() 184 | target_link_libraries(popvcf PUBLIC "${STATIC_DIR}/libz.a") 185 | endif() 186 | 187 | ### GCC ### 188 | 189 | # LOCAL binaries have static GCC, PREBUILT are all static 190 | if (STATIC_DIR STREQUAL "") 191 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++") 192 | else () 193 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static") 194 | endif () 195 | 196 | ################ 197 | # clang-format # 198 | ################ 199 | # Add clang-format check target 200 | find_program(CLANG_FORMAT "clang-format-12") 201 | 202 | if (CLANG_FORMAT STREQUAL "" OR CLANG_FORMAT STREQUAL "CLANG_FORMAT-NOTFOUND") 203 | find_program(CLANG_FORMAT "clang-format") 204 | endif() 205 | 206 | if (NOT CLANG_FORMAT STREQUAL "" AND NOT CLANG_FORMAT STREQUAL "CLANG_FORMAT-NOTFOUND") 207 | add_custom_target (check_format "find" "${CMAKE_CURRENT_SOURCE_DIR}/src" "-name" "\"*.hpp\"" "-o" "-name" "\"*.cpp\"" "-exec" "${CLANG_FORMAT}" "--style=file" "--dry-run" "--Werror" "{}" "+" COMMENT "Checking all files format with clang-format.") 208 | endif() 209 | 210 | ################# 211 | # Documentation # 212 | ################# 213 | ## Adds a target to generate API documentation with Doxygen 214 | find_package(Doxygen) 215 | option(BUILD_DOCUMENTATION "Create and install the HTML based API documentation (requires Doxygen)" ${DOXYGEN_FOUND}) 216 | 217 | if(BUILD_DOCUMENTATION) 218 | if(NOT DOXYGEN_FOUND) 219 | message(FATAL_ERROR "Doxygen is needed to build the documentation.") 220 | endif() 221 | 222 | set(doxyfile_in ${CMAKE_CURRENT_SOURCE_DIR}/doc/Doxyfile.in) 223 | set(doxyfile ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) 224 | 225 | configure_file(${doxyfile_in} ${doxyfile} @ONLY) 226 | 227 | 
add_custom_target(doc 228 | COMMAND ${DOXYGEN_EXECUTABLE} ${doxyfile} 229 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 230 | COMMENT "Generating API documentation with Doxygen" 231 | VERBATIM) 232 | 233 | #install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/html DESTINATION share/doc) 234 | endif() 235 | 236 | ########### 237 | # ARCHIVE # 238 | ########### 239 | add_custom_target(archive 240 | COMMAND sh -c "bash .git-archive-all.sh --format tar.gz --prefix popvcf-v${popvcf_VERSION}/ ${CMAKE_CURRENT_BINARY_DIR}/popvcf-v${popvcf_VERSION}.tar.gz" 241 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 242 | COMMENT "Generating an archive which includes submodules." 243 | VERBATIM) 244 | 245 | ########### 246 | # Testing # 247 | ########### 248 | 249 | enable_testing(true) 250 | add_test(NAME test_popvcf COMMAND sh -c "set -e; sh ${CMAKE_CURRENT_SOURCE_DIR}/test/create_test_data.sh > test.vcf ; ${CMAKE_CURRENT_BINARY_DIR}/popvcf encode test.vcf -Oz > test.popvcf.gz ; ${CMAKE_CURRENT_BINARY_DIR}/popvcf decode test.popvcf.gz > test.new.vcf ; diff test.vcf test.new.vcf") 251 | 252 | # TODO add tabix to workflow or make popvcf build the index 253 | # tabix -p vcf test.popvcf.gz ; ${CMAKE_CURRENT_BINARY_DIR}/popvcf decode test.popvcf.gz --region=chr2:10000-10200 | grep -v ^# | wc -l | grep -q -w -F 2 254 | 255 | set_tests_properties(test_popvcf PROPERTIES DEPENDS popvcf) 256 | 257 | ########### 258 | ## Other ## 259 | ########### 260 | # Get the current working git branch 261 | execute_process( 262 | COMMAND git rev-parse --abbrev-ref HEAD 263 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 264 | OUTPUT_VARIABLE GIT_BRANCH 265 | OUTPUT_STRIP_TRAILING_WHITESPACE) 266 | 267 | # Get the latest abbreviated SHA commit hash of the working branch 268 | execute_process( 269 | COMMAND git log -1 --format=%h 270 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 271 | OUTPUT_VARIABLE GIT_COMMIT_SHORT_HASH 272 | OUTPUT_STRIP_TRAILING_WHITESPACE) 273 | 274 | # Get the latest SHA commit hash of the working 
branch 275 | execute_process( 276 | COMMAND git rev-parse HEAD 277 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 278 | OUTPUT_VARIABLE GIT_COMMIT_LONG_HASH 279 | OUTPUT_STRIP_TRAILING_WHITESPACE) 280 | 281 | # Get the number of lines of uncommitted changes reported by git diff 282 | execute_process( 283 | COMMAND git diff COMMAND wc -l 284 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 285 | OUTPUT_VARIABLE GIT_NUM_DIRTY_LINES 286 | OUTPUT_STRIP_TRAILING_WHITESPACE) 287 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## popVCF 2 | 3 | popVCF losslessly encodes a multi-sample VCF to reduce its disk footprint. VCF fields are encoded by pointing to other exactly identical fields in the same row or in the row above. popVCF gains little on a single-sample VCF, but the compression ratio can go up to 40+ on large population VCFs, or 5x more compressed than standard bgzip compression. The compression ratio varies a lot between data sets; see below for benchmarks on several different data sets. 4 | 5 | Files are encoded with the "popvcf encode" command, and by encoding with the "-Oz" flag you can directly write the output in bgzip format. You can then decode the file back to VCF using the "popvcf decode" command. The decode subcommand can also query a region using the option "--region=chrN:A-B". 6 | 7 | On 64-bit Linux, you can get the latest static binary from the [Release page](https://github.com/DecodeGenetics/popvcf/releases). 8 | 9 | ### Benchmarks 10 | 11 | We have benchmarked popVCF against a few other compression methods on some large population VCF data. In all experiments, we report wall-clock time measured with /usr/bin/time, using a single CPU thread. The VCF data was read from and written to an SSD. spVCF was run with the "--no-squeeze" option to prevent any lossy compression. The benchmark script is in the benchmark/ directory.
In the WGS benchmarks, we had to exclude genozip and VCFShark as they were unable to compress the data because of repeated runtime errors. 12 | 13 | Benchmarked versions: popVCF v1.1.0, spVCF v1.2.0-0-gbecb461, htslib+bcftools v1.14 (with libdeflate), Genozip 13.0.11, VCFShark v1.1. 14 | 15 | #### GraphTyper UK biobank WGS-487k individual data 16 | 17 | | Method/format | Compression ratio | Compared to bgzip | 18 | | ------------- | ----------------- | ----------------- | 19 | | popVCF+bgzip | 37.6x | 4.4x | 20 | | spVCF+bgzip | 17.2x | 2.0x | 21 | | BCF | 10.5x | 1.2x | 22 | | bgzip (VCF) | 8.6x | 1.0x | 23 | 24 | #### Deep Variant/GLnexus WES-200k individual data 25 | 26 | | Method/format | Compression ratio | Compared to bgzip | Compression speed (MB/s) | Decompression speed (MB/s) | 27 | | ------------- | ----------------- | ----------------- | ------------------------ | -------------------------- | 28 | | popVCF+bgzip | 102.9x | 6.9x | 194.0 | 490.7 | 29 | | spVCF+bgzip | 43.8x | 2.9x | 129.7 | 281.5 | 30 | | Genozip | 35.0x | 2.3x | 18.0 | 17.3 | 31 | | VCFShark | 28.3x | 1.9x | 22.8 | 21.7 | 32 | | BCF | 14.0x | 0.94x | 62.4 | 175.2 | 33 | | bgzip (VCF) | 14.9x | 1.0x | 91.6 | 521.3 | 34 | 35 | #### GATK UK biobank WGS-150k individual data 36 | 37 | | Method/format | Compression ratio | Compared to bgzip | Compression speed (MB/s) | Decompression speed (MB/s) | 38 | | ------------- | ----------------- | ----------------- | ------------------------ | -------------------------- | 39 | | popVCF+bgzip | 20.1x | 2.8x | 102.2 | 295.0 | 40 | | spVCF+bgzip | 10.0x | 1.4x | 58.8 | 165.7 | 41 | | BCF | 6.7x | 0.94x | 55.6 | 174.2 | 42 | | bgzip (VCF) | 7.1x | 1.0x | 58.5 | 474.7 | 43 | 44 | ### Usage 45 | 46 | ```sh 47 | popvcf encode my.vcf > my.popvcf 48 | popvcf decode my.popvcf > my.new.vcf 49 | diff my.vcf my.new.vcf # Should be the same 50 | 51 | # It is also possible to bgzip, tabix index and query 52 | popvcf encode my.vcf -Oz > my.popvcf.gz 53 | tabix 
my.popvcf.gz 54 | popvcf decode my.popvcf.gz > my.new2.vcf 55 | popvcf decode my.popvcf.gz --region=chrN:A-B > my.region.vcf # Random access a region using the tabix index 56 | ``` 57 | 58 | ### Building 59 | A feature-complete C++17 compiler is required for building popVCF, i.e. GCC 8/Clang 10 or newer. 60 | 61 | ```sh 62 | git clone --recursive https://github.com/DecodeGenetics/popvcf.git popvcf # Clone the repository 63 | cd popvcf 64 | mkdir build-release 65 | cd build-release 66 | cmake .. 67 | make -j3 popvcf 68 | ``` 69 | 70 | ### Known limitations 71 | 72 | * Each VCF genotype field is assumed to be no larger than the popVCF buffer size (256kb). Site data may exceed this limit though (e.g. the INFO field). 73 | * Each VCF genotype field is assumed to start with a digit (0-9), a period (.), or a dash (-). Any VCF record with a GT field fulfills this requirement. Subsequent characters can be any other printable characters. 74 | 75 | ### License 76 | MIT -------------------------------------------------------------------------------- /benchmark/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | #set -o xtrace 5 | 6 | popvcf=$1 7 | vcf=$2 8 | level="$3" 9 | #level="-l9" 10 | 11 | if [[ -z $vcf ]]; then 12 | echo "Usage: $0 [level]" >&2 13 | echo " i.e.
$0 ./popvcf test.vcf \"-l9\"" 14 | exit 1 15 | fi 16 | 17 | if [[ ${vcf} != "test.vcf" ]]; then 18 | cp $vcf test.vcf 19 | fi 20 | 21 | #echo "cat test.vcf | bgzip -c > test.pipe.vcf.gz" 22 | #time cat test.vcf | bgzip -c > test.pipe.vcf.gz 23 | 24 | run () 25 | ( 26 | echo "$1" 27 | eval time "$1" 28 | ) 29 | 30 | #echo "bgzip -k -f test.vcf" 31 | #time bgzip -k -f test.vcf 32 | 33 | 34 | echo "== Compression times ==" 35 | run "bgzip --stdout --force --threads 1 test.vcf ${level} > test.vcf.gz" 36 | run "${popvcf} encode test.vcf -o test.vcf.popvcf" 37 | run "${popvcf} encode test.vcf ${level} -Oz -o test.vcf.popvcf.gz" 38 | run "spvcf encode --quiet --no-squeeze test.vcf > test.vcf.spvcf" 39 | run "spvcf encode --quiet --no-squeeze test.vcf | bgzip -c ${level} > test.vcf.spvcf.gz" 40 | 41 | echo "== Decompression times ==" 42 | md5sum test.vcf 43 | run "${popvcf} decode test.vcf.popvcf > test.vcf.popvcf.vcf" 44 | md5sum test.vcf.popvcf.vcf ; rm -f test.vcf.popvcf.vcf 45 | run "${popvcf} decode test.vcf.popvcf.gz > test.vcf.popvcf.gz.vcf" 46 | md5sum test.vcf.popvcf.gz.vcf ; rm -f test.vcf.popvcf.gz.vcf 47 | run "bgzip -dc test.vcf.spvcf.gz | spvcf decode --quiet > test.vcf.spvcf.gz.vcf" 48 | md5sum test.vcf.spvcf.gz.vcf ; rm -f test.vcf.spvcf.gz.vcf 49 | run "bgzip -dc test.vcf.gz > test.vcf.gz.vcf" 50 | md5sum test.vcf.gz.vcf ; rm -f test.vcf.gz.vcf 51 | 52 | echo "== Index construction times ==" 53 | run "tabix -p vcf -f test.vcf.gz" 54 | run "tabix -p vcf -f test.vcf.popvcf.gz" 55 | run "tabix -p vcf -f test.vcf.spvcf.gz" 56 | 57 | echo "== Query times ==" 58 | region=$(grep -v ^# test.vcf | cut -f1,2 | head -n 20 | tail -n 1 | awk '{print $1":"$2"-"$2+100}') 59 | run "tabix test.vcf.gz ${region} > /dev/null" 60 | run "${popvcf} decode test.vcf.popvcf.gz --region=${region} > /dev/null" 61 | run "spvcf tabix test.vcf.spvcf.gz ${region} | spvcf decode - > /dev/null" 62 | 63 | ls -lh test.*gz 64 | ls -l test.*gz 65 | 66 | original_size=$(find -L . 
-name "test.vcf" -printf "%s\n") 67 | find . -name "test.*gz" -printf "%f\t%s\n" | awk -v os="${original_size}" '{print $1"\t"$2"\t"os/$2}' 68 | 69 | # cleanup 70 | #echo test.* | tr ' ' '\n' | grep -vP "^test.vcf$" | grep -vP "^test.vcf.gz$" | xargs rm 71 | -------------------------------------------------------------------------------- /doc/Doxyfile.in: -------------------------------------------------------------------------------- 1 | # Doxyfile 1.6.1 2 | 3 | # This file describes the settings to be used by the documentation system 4 | # doxygen (www.doxygen.org) for a project 5 | # 6 | # All text after a hash (#) is considered a comment and will be ignored 7 | # The format is: 8 | # TAG = value [value, ...] 9 | # For lists items can also be appended using: 10 | # TAG += value [value, ...] 11 | # Values that contain spaces should be placed between quotes (" ") 12 | 13 | #--------------------------------------------------------------------------- 14 | # Project related configuration options 15 | #--------------------------------------------------------------------------- 16 | 17 | # This tag specifies the encoding used for all characters in the config file 18 | # that follow. The default is UTF-8 which is also the encoding used for all 19 | # text before the first occurrence of this tag. Doxygen uses libiconv (or the 20 | # iconv built into libc) for the transcoding. See 21 | # http://www.gnu.org/software/libiconv for the list of possible encodings. 22 | 23 | DOXYFILE_ENCODING = UTF-8 24 | 25 | # The PROJECT_NAME tag is a single word (or a sequence of words surrounded 26 | # by quotes) that should identify the project. 27 | 28 | PROJECT_NAME = "@CMAKE_PROJECT_NAME@" 29 | 30 | # The PROJECT_NUMBER tag can be used to enter a project or revision number. 31 | # This could be handy for archiving the generated documentation or 32 | # if some version control system is used. 
33 | # popvcf_VERSION is set in the top-level CMakeLists.txt and substituted by configure_file. 34 | PROJECT_NUMBER = @popvcf_VERSION@ 35 | 36 | # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 37 | # base path where the generated documentation will be put. 38 | # If a relative path is entered, it will be relative to the location 39 | # where doxygen was started. If left blank the current directory will be used. 40 | 41 | OUTPUT_DIRECTORY = doc 42 | 43 | # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 44 | # 4096 sub-directories (in 2 levels) under the output directory of each output 45 | # format and will distribute the generated files over these directories. 46 | # Enabling this option can be useful when feeding doxygen a huge amount of 47 | # source files, where putting all generated files in the same directory would 48 | # otherwise cause performance problems for the file system. 49 | 50 | CREATE_SUBDIRS = NO 51 | 52 | # The OUTPUT_LANGUAGE tag is used to specify the language in which all 53 | # documentation generated by doxygen is written. Doxygen will use this 54 | # information to generate all constant output in the proper language. 55 | # The default language is English, other supported languages are: 56 | # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 57 | # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, 58 | # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English 59 | # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, 60 | # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, 61 | # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 62 | 63 | OUTPUT_LANGUAGE = English 64 | 65 | # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 66 | # include brief member descriptions after the members that are listed in 67 | # the file and class documentation (similar to JavaDoc). 68 | # Set to NO to disable this.
69 | 70 | BRIEF_MEMBER_DESC = YES 71 | 72 | # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 73 | # the brief description of a member or function before the detailed description. 74 | # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 75 | # brief descriptions will be completely suppressed. 76 | 77 | REPEAT_BRIEF = YES 78 | 79 | # This tag implements a quasi-intelligent brief description abbreviator 80 | # that is used to form the text in various listings. Each string 81 | # in this list, if found as the leading text of the brief description, will be 82 | # stripped from the text and the result after processing the whole list, is 83 | # used as the annotated text. Otherwise, the brief description is used as-is. 84 | # If left blank, the following values are used ("$name" is automatically 85 | # replaced with the name of the entity): "The $name class" "The $name widget" 86 | # "The $name file" "is" "provides" "specifies" "contains" 87 | # "represents" "a" "an" "the" 88 | 89 | ABBREVIATE_BRIEF = 90 | 91 | # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 92 | # Doxygen will generate a detailed section even if there is only a brief 93 | # description. 94 | 95 | ALWAYS_DETAILED_SEC = NO 96 | 97 | # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 98 | # inherited members of a class in the documentation of that class as if those 99 | # members were ordinary class members. Constructors, destructors and assignment 100 | # operators of the base classes will not be shown. 101 | 102 | INLINE_INHERITED_MEMB = NO 103 | 104 | # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 105 | # path before files name in the file list and in the header files. If set 106 | # to NO the shortest path that makes the file name unique will be used. 
107 | 108 | FULL_PATH_NAMES = YES 109 | 110 | # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 111 | # can be used to strip a user-defined part of the path. Stripping is 112 | # only done if one of the specified strings matches the left-hand part of 113 | # the path. The tag can be used to show relative paths in the file list. 114 | # If left blank the directory from which doxygen is run is used as the 115 | # path to strip. 116 | 117 | STRIP_FROM_PATH = . 118 | 119 | # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 120 | # the path mentioned in the documentation of a class, which tells 121 | # the reader which header file to include in order to use a class. 122 | # If left blank only the name of the header file containing the class 123 | # definition is used. Otherwise one should specify the include paths that 124 | # are normally passed to the compiler using the -I flag. 125 | 126 | STRIP_FROM_INC_PATH = 127 | 128 | # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 129 | # (but less readable) file names. This can be useful if your file system 130 | # doesn't support long names like on DOS, Mac, or CD-ROM. 131 | 132 | SHORT_NAMES = NO 133 | 134 | # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 135 | # will interpret the first line (until the first dot) of a JavaDoc-style 136 | # comment as the brief description. If set to NO, the JavaDoc 137 | # comments will behave just like regular Qt-style comments 138 | # (thus requiring an explicit @brief command for a brief description.) 139 | 140 | JAVADOC_AUTOBRIEF = NO 141 | 142 | # If the QT_AUTOBRIEF tag is set to YES then Doxygen will 143 | # interpret the first line (until the first dot) of a Qt-style 144 | # comment as the brief description. If set to NO, the comments 145 | # will behave just like regular Qt-style comments (thus requiring 146 | # an explicit \brief command for a brief description.)
147 | 148 | QT_AUTOBRIEF = NO 149 | 150 | # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 151 | # treat a multi-line C++ special comment block (i.e. a block of //! or /// 152 | # comments) as a brief description. This used to be the default behaviour. 153 | # The new default is to treat a multi-line C++ comment block as a detailed 154 | # description. Set this tag to YES if you prefer the old behaviour instead. 155 | 156 | MULTILINE_CPP_IS_BRIEF = NO 157 | 158 | # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 159 | # member inherits the documentation from any documented member that it 160 | # re-implements. 161 | 162 | INHERIT_DOCS = YES 163 | 164 | # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 165 | # a new page for each member. If set to NO, the documentation of a member will 166 | # be part of the file/class/namespace that contains it. 167 | 168 | SEPARATE_MEMBER_PAGES = NO 169 | 170 | # The TAB_SIZE tag can be used to set the number of spaces in a tab. 171 | # Doxygen uses this value to replace tabs by spaces in code fragments. 172 | 173 | TAB_SIZE = 2 174 | 175 | # This tag can be used to specify a number of aliases that acts 176 | # as commands in the documentation. An alias has the form "name=value". 177 | # For example adding "sideeffect=\par Side Effects:\n" will allow you to 178 | # put the command \sideeffect (or @sideeffect) in the documentation, which 179 | # will result in a user-defined paragraph with heading "Side Effects:". 180 | # You can put \n's in the value part of an alias to insert newlines. 181 | 182 | ALIASES = 183 | 184 | # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 185 | # sources only. Doxygen will then generate output that is more tailored for C. 186 | # For instance, some of the names that are used will be different. The list 187 | # of all members will be omitted, etc. 
188 | 189 | OPTIMIZE_OUTPUT_FOR_C = YES 190 | 191 | # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 192 | # sources only. Doxygen will then generate output that is more tailored for 193 | # Java. For instance, namespaces will be presented as packages, qualified 194 | # scopes will look different, etc. 195 | 196 | OPTIMIZE_OUTPUT_JAVA = NO 197 | 198 | # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 199 | # sources only. Doxygen will then generate output that is more tailored for 200 | # Fortran. 201 | 202 | OPTIMIZE_FOR_FORTRAN = NO 203 | 204 | # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 205 | # sources. Doxygen will then generate output that is tailored for 206 | # VHDL. 207 | 208 | OPTIMIZE_OUTPUT_VHDL = NO 209 | 210 | # Doxygen selects the parser to use depending on the extension of the files it parses. 211 | # With this tag you can assign which parser to use for a given extension. 212 | # Doxygen has a built-in mapping, but you can override or extend it using this tag. 213 | # The format is ext=language, where ext is a file extension, and language is one of 214 | # the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, 215 | # Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat 216 | # .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), 217 | # use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. 218 | 219 | EXTENSION_MAPPING = 220 | 221 | # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 222 | # to include (a tag file for) the STL sources as input, then you should 223 | # set this tag to YES in order to let doxygen match functions declarations and 224 | # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 225 | # func(std::string) {}). 
This also makes the inheritance and collaboration 226 | # diagrams that involve STL classes more complete and accurate. 227 | 228 | BUILTIN_STL_SUPPORT = NO 229 | 230 | # If you use Microsoft's C++/CLI language, you should set this option to YES to 231 | # enable parsing support. 232 | 233 | CPP_CLI_SUPPORT = NO 234 | 235 | # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 236 | # Doxygen will parse them like normal C++ but will assume all classes use public 237 | # instead of private inheritance when no explicit protection keyword is present. 238 | 239 | SIP_SUPPORT = NO 240 | 241 | # For Microsoft's IDL there are propget and propput attributes to indicate getter 242 | # and setter methods for a property. Setting this option to YES (the default) 243 | # will make doxygen replace the get and set methods by a property in the 244 | # documentation. This will only work if the methods are indeed getting or 245 | # setting a simple type. If this is not the case, or you want to show the 246 | # methods anyway, you should set this option to NO. 247 | 248 | IDL_PROPERTY_SUPPORT = YES 249 | 250 | # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 251 | # tag is set to YES, then doxygen will reuse the documentation of the first 252 | # member in the group (if any) for the other members of the group. By default 253 | # all members of a group must be documented explicitly. 254 | 255 | DISTRIBUTE_GROUP_DOC = NO 256 | 257 | # Set the SUBGROUPING tag to YES (the default) to allow class member groups of 258 | # the same type (for instance a group of public functions) to be put as a 259 | # subgroup of that type (e.g. under the Public Functions section). Set it to 260 | # NO to prevent subgrouping. Alternatively, this can be done per class using 261 | # the \nosubgrouping command.
262 | 263 | SUBGROUPING = YES 264 | 265 | # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 266 | # is documented as struct, union, or enum with the name of the typedef. So 267 | # typedef struct TypeS {} TypeT, will appear in the documentation as a struct 268 | # with name TypeT. When disabled the typedef will appear as a member of a file, 269 | # namespace, or class. And the struct will be named TypeS. This can typically 270 | # be useful for C code in case the coding convention dictates that all compound 271 | # types are typedef'ed and only the typedef is referenced, never the tag name. 272 | 273 | TYPEDEF_HIDES_STRUCT = NO 274 | 275 | # The SYMBOL_CACHE_SIZE determines the size of the internal cache used to 276 | # determine which symbols to keep in memory and which to flush to disk. 277 | # When the cache is full, less often used symbols will be written to disk. 278 | # For small to medium size projects (<1000 input files) the default value is 279 | # probably good enough. For larger projects a too small cache size can cause 280 | # doxygen to be busy swapping symbols to and from disk most of the time 281 | # causing a significant performance penalty. 282 | # If the system has enough physical memory increasing the cache will improve the 283 | # performance by keeping more symbols in memory. Note that the value works on 284 | # a logarithmic scale so increasing the size by one will roughly double the 285 | # memory usage. The cache size is given by this formula: 286 | # 2^(16+SYMBOL_CACHE_SIZE).
The valid range is 0..9, the default is 0, 287 | # corresponding to a cache size of 2^16 = 65536 symbols 288 | 289 | # SYMBOL_CACHE_SIZE = 0 290 | 291 | #--------------------------------------------------------------------------- 292 | # Build related configuration options 293 | #--------------------------------------------------------------------------- 294 | 295 | # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 296 | # documentation are documented, even if no documentation was available. 297 | # Private class members and static file members will be hidden unless 298 | # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES 299 | 300 | EXTRACT_ALL = YES 301 | 302 | # If the EXTRACT_PRIVATE tag is set to YES all private members of a class 303 | # will be included in the documentation. 304 | 305 | EXTRACT_PRIVATE = NO 306 | 307 | # If the EXTRACT_STATIC tag is set to YES all static members of a file 308 | # will be included in the documentation. 309 | 310 | EXTRACT_STATIC = YES 311 | 312 | # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 313 | # defined locally in source files will be included in the documentation. 314 | # If set to NO only classes defined in header files are included. 315 | 316 | EXTRACT_LOCAL_CLASSES = YES 317 | 318 | # This flag is only useful for Objective-C code. When set to YES local 319 | # methods, which are defined in the implementation section but not in 320 | # the interface are included in the documentation. 321 | # If set to NO (the default) only methods in the interface are included. 322 | 323 | EXTRACT_LOCAL_METHODS = NO 324 | 325 | # If this flag is set to YES, the members of anonymous namespaces will be 326 | # extracted and appear in the documentation as a namespace called 327 | # 'anonymous_namespace{file}', where file will be replaced with the base 328 | # name of the file that contains the anonymous namespace. By default 329 | # anonymous namespace are hidden. 
330 | 331 | EXTRACT_ANON_NSPACES = NO 332 | 333 | # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 334 | # undocumented members of documented classes, files or namespaces. 335 | # If set to NO (the default) these members will be included in the 336 | # various overviews, but no documentation section is generated. 337 | # This option has no effect if EXTRACT_ALL is enabled. 338 | 339 | HIDE_UNDOC_MEMBERS = NO 340 | 341 | # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 342 | # undocumented classes that are normally visible in the class hierarchy. 343 | # If set to NO (the default) these classes will be included in the various 344 | # overviews. This option has no effect if EXTRACT_ALL is enabled. 345 | 346 | HIDE_UNDOC_CLASSES = NO 347 | 348 | # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 349 | # friend (class|struct|union) declarations. 350 | # If set to NO (the default) these declarations will be included in the 351 | # documentation. 352 | 353 | HIDE_FRIEND_COMPOUNDS = NO 354 | 355 | # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 356 | # documentation blocks found inside the body of a function. 357 | # If set to NO (the default) these blocks will be appended to the 358 | # function's detailed documentation block. 359 | 360 | HIDE_IN_BODY_DOCS = NO 361 | 362 | # The INTERNAL_DOCS tag determines if documentation 363 | # that is typed after a \internal command is included. If the tag is set 364 | # to NO (the default) then the documentation will be excluded. 365 | # Set it to YES to include the internal documentation. 366 | 367 | INTERNAL_DOCS = NO 368 | 369 | # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 370 | # file names in lower-case letters. If set to YES upper-case letters are also 371 | # allowed. This is useful if you have classes or files whose names only differ 372 | # in case and if your file system supports case sensitive file names. 
Windows 373 | # and Mac users are advised to set this option to NO. 374 | 375 | CASE_SENSE_NAMES = YES 376 | 377 | # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 378 | # will show members with their full class and namespace scopes in the 379 | # documentation. If set to YES the scope will be hidden. 380 | 381 | HIDE_SCOPE_NAMES = NO 382 | 383 | # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 384 | # will put a list of the files that are included by a file in the documentation 385 | # of that file. 386 | 387 | SHOW_INCLUDE_FILES = YES 388 | 389 | # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 390 | # is inserted in the documentation for inline members. 391 | 392 | INLINE_INFO = YES 393 | 394 | # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 395 | # will sort the (detailed) documentation of file and class members 396 | # alphabetically by member name. If set to NO the members will appear in 397 | # declaration order. 398 | 399 | SORT_MEMBER_DOCS = YES 400 | 401 | # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 402 | # brief documentation of file, namespace and class members alphabetically 403 | # by member name. If set to NO (the default) the members will appear in 404 | # declaration order. 405 | 406 | SORT_BRIEF_DOCS = NO 407 | 408 | # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. 
409 | 410 | SORT_MEMBERS_CTORS_1ST = NO 411 | 412 | # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 413 | # hierarchy of group names into alphabetical order. If set to NO (the default) 414 | # the group names will appear in their defined order. 415 | 416 | SORT_GROUP_NAMES = NO 417 | 418 | # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 419 | # sorted by fully-qualified names, including namespaces. If set to 420 | # NO (the default), the class list will be sorted only by class name, 421 | # not including the namespace part. 422 | # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 423 | # Note: This option applies only to the class list, not to the 424 | # alphabetical list. 425 | 426 | SORT_BY_SCOPE_NAME = NO 427 | 428 | # The GENERATE_TODOLIST tag can be used to enable (YES) or 429 | # disable (NO) the todo list. This list is created by putting \todo 430 | # commands in the documentation. 431 | 432 | GENERATE_TODOLIST = YES 433 | 434 | # The GENERATE_TESTLIST tag can be used to enable (YES) or 435 | # disable (NO) the test list. This list is created by putting \test 436 | # commands in the documentation. 437 | 438 | GENERATE_TESTLIST = YES 439 | 440 | # The GENERATE_BUGLIST tag can be used to enable (YES) or 441 | # disable (NO) the bug list. This list is created by putting \bug 442 | # commands in the documentation. 443 | 444 | GENERATE_BUGLIST = YES 445 | 446 | # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 447 | # disable (NO) the deprecated list. This list is created by putting 448 | # \deprecated commands in the documentation. 449 | 450 | GENERATE_DEPRECATEDLIST= YES 451 | 452 | # The ENABLED_SECTIONS tag can be used to enable conditional 453 | # documentation sections, marked by \if sectionname ... \endif. 
454 | 455 | ENABLED_SECTIONS = 456 | 457 | # The MAX_INITIALIZER_LINES tag determines the maximum number of lines 458 | # the initial value of a variable or define consists of for it to appear in 459 | # the documentation. If the initializer consists of more lines than specified 460 | # here it will be hidden. Use a value of 0 to hide initializers completely. 461 | # The appearance of the initializer of individual variables and defines in the 462 | # documentation can be controlled using \showinitializer or \hideinitializer 463 | # command in the documentation regardless of this setting. 464 | 465 | MAX_INITIALIZER_LINES = 30 466 | 467 | # Set the SHOW_USED_FILES tag to NO to disable the list of files generated 468 | # at the bottom of the documentation of classes and structs. If set to YES the 469 | # list will mention the files that were used to generate the documentation. 470 | 471 | SHOW_USED_FILES = YES 472 | 473 | # If the sources in your project are distributed over multiple directories 474 | # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 475 | # in the documentation. The default is NO. 476 | 477 | # Obsolete # SHOW_DIRECTORIES = NO 478 | 479 | # Set the SHOW_FILES tag to NO to disable the generation of the Files page. 480 | # This will remove the Files entry from the Quick Index and from the 481 | # Folder Tree View (if specified). The default is YES. 482 | 483 | SHOW_FILES = YES 484 | 485 | # Set the SHOW_NAMESPACES tag to NO to disable the generation of the 486 | # Namespaces page. 487 | # This will remove the Namespaces entry from the Quick Index 488 | # and from the Folder Tree View (if specified). The default is YES. 489 | 490 | SHOW_NAMESPACES = YES 491 | 492 | # The FILE_VERSION_FILTER tag can be used to specify a program or script that 493 | # doxygen should invoke to get the current version for each file (typically from 494 | # the version control system). 
Doxygen will invoke the program by executing (via 495 | # popen()) the command <command> <input-file>, where <command> is the value of 496 | # the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 497 | # provided by doxygen. Whatever the program writes to standard output 498 | # is used as the file version. See the manual for examples. 499 | 500 | FILE_VERSION_FILTER = 501 | 502 | # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by 503 | # doxygen. The layout file controls the global structure of the generated output files 504 | # in an output format independent way. To create the layout file that represents 505 | # doxygen's defaults, run doxygen with the -l option. You can optionally specify a 506 | # file name after the option, if omitted DoxygenLayout.xml will be used as the name 507 | # of the layout file. 508 | 509 | LAYOUT_FILE = 510 | 511 | #--------------------------------------------------------------------------- 512 | # configuration options related to warning and progress messages 513 | #--------------------------------------------------------------------------- 514 | 515 | # The QUIET tag can be used to turn on/off the messages that are generated 516 | # by doxygen. Possible values are YES and NO. If left blank NO is used. 517 | 518 | QUIET = NO 519 | 520 | # The WARNINGS tag can be used to turn on/off the warning messages that are 521 | # generated by doxygen. Possible values are YES and NO. If left blank 522 | # NO is used. 523 | 524 | WARNINGS = YES 525 | 526 | # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 527 | # for undocumented members. If EXTRACT_ALL is set to YES then this flag will 528 | # automatically be disabled.
529 | 530 | WARN_IF_UNDOCUMENTED = YES 531 | 532 | # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 533 | # potential errors in the documentation, such as not documenting some 534 | # parameters in a documented function, or documenting parameters that 535 | # don't exist or using markup commands wrongly. 536 | 537 | WARN_IF_DOC_ERROR = YES 538 | 539 | # This WARN_NO_PARAMDOC option can be enabled to get warnings for 540 | # functions that are documented, but have no documentation for their parameters 541 | # or return value. If set to NO (the default) doxygen will only warn about 542 | # wrong or incomplete parameter documentation, but not about the absence of 543 | # documentation. 544 | 545 | WARN_NO_PARAMDOC = NO 546 | 547 | # The WARN_FORMAT tag determines the format of the warning messages that 548 | # doxygen can produce. The string should contain the $file, $line, and $text 549 | # tags, which will be replaced by the file and line number from which the 550 | # warning originated and the warning text. Optionally the format may contain 551 | # $version, which will be replaced by the version of the file (if it could 552 | # be obtained via FILE_VERSION_FILTER) 553 | 554 | WARN_FORMAT = 555 | 556 | # The WARN_LOGFILE tag can be used to specify a file to which warning 557 | # and error messages should be written. If left blank the output is written 558 | # to stderr. 559 | 560 | WARN_LOGFILE = 561 | 562 | #--------------------------------------------------------------------------- 563 | # configuration options related to the input files 564 | #--------------------------------------------------------------------------- 565 | 566 | # The INPUT tag can be used to specify the files and/or directories that contain 567 | # documented source files. You may enter file names like "myfile.cpp" or 568 | # directories like "/usr/src/myproject". Separate the files or directories 569 | # with spaces.
570 | 571 | INPUT = @PROJECT_SOURCE_DIR@/include/weaver \ 572 | @PROJECT_SOURCE_DIR@/doc \ 573 | @PROJECT_SOURCE_DIR@/test 574 | 575 | # This tag can be used to specify the character encoding of the source files 576 | # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 577 | # also the default input encoding. Doxygen uses libiconv (or the iconv built 578 | # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 579 | # the list of possible encodings. 580 | 581 | INPUT_ENCODING = UTF-8 582 | 583 | # If the value of the INPUT tag contains directories, you can use the 584 | # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 585 | # and *.h) to filter out the source-files in the directories. If left 586 | # blank the following patterns are tested: 587 | # *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 588 | # *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 589 | 590 | FILE_PATTERNS = *.cpp *.hpp *.dox 591 | 592 | # The RECURSIVE tag can be used to specify whether or not subdirectories 593 | # should be searched for input files as well. Possible values are YES and NO. 594 | # If left blank NO is used. 595 | 596 | RECURSIVE = YES 597 | 598 | # The EXCLUDE tag can be used to specify files and/or directories that should be 599 | # excluded from the INPUT source files. This way you can easily exclude a 600 | # subdirectory from a directory tree whose root is specified with the INPUT tag. 601 | 602 | EXCLUDE = 603 | 604 | # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 605 | # directories that are symbolic links (a Unix filesystem feature) are excluded 606 | # from the input. 607 | 608 | EXCLUDE_SYMLINKS = NO 609 | 610 | # If the value of the INPUT tag contains directories, you can use the 611 | # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 612 | # certain files from those directories.
Note that the wildcards are matched 613 | # against the file with absolute path, so to exclude all test directories 614 | # for example use the pattern */test/* 615 | 616 | EXCLUDE_PATTERNS = 617 | 618 | # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 619 | # (namespaces, classes, functions, etc.) that should be excluded from the 620 | # output. The symbol name can be a fully qualified name, a word, or if the 621 | # wildcard * is used, a substring. Examples: ANamespace, AClass, 622 | # AClass::ANamespace, ANamespace::*Test 623 | 624 | EXCLUDE_SYMBOLS = 625 | 626 | # The EXAMPLE_PATH tag can be used to specify one or more files or 627 | # directories that contain example code fragments that are included (see 628 | # the \include command). 629 | 630 | EXAMPLE_PATH = 631 | 632 | # If the value of the EXAMPLE_PATH tag contains directories, you can use the 633 | # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 634 | # and *.h) to filter out the source-files in the directories. If left 635 | # blank all files are included. 636 | 637 | EXAMPLE_PATTERNS = 638 | 639 | # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 640 | # searched for input files to be used with the \include or \dontinclude 641 | # commands irrespective of the value of the RECURSIVE tag. 642 | # Possible values are YES and NO. If left blank NO is used. 643 | 644 | EXAMPLE_RECURSIVE = NO 645 | 646 | # The IMAGE_PATH tag can be used to specify one or more files or 647 | # directories that contain image that are included in the documentation (see 648 | # the \image command). 649 | 650 | IMAGE_PATH = 651 | 652 | # The INPUT_FILTER tag can be used to specify a program that doxygen should 653 | # invoke to filter for each input file. Doxygen will invoke the filter program 654 | # by executing (via popen()) the command <filter> <input-file>, where <filter> 655 | # is the value of the INPUT_FILTER tag, and <input-file> is the name of an 656 | # input file.
Doxygen will then use the output that the filter program writes 657 | # to standard output. 658 | # If FILTER_PATTERNS is specified, this tag will be 659 | # ignored. 660 | 661 | INPUT_FILTER = 662 | 663 | # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 664 | # basis. 665 | # Doxygen will compare the file name with each pattern and apply the 666 | # filter if there is a match. 667 | # The filters are a list of the form: 668 | # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 669 | # info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 670 | # is applied to all files. 671 | 672 | FILTER_PATTERNS = 673 | 674 | # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 675 | # INPUT_FILTER) will be used to filter the input files when producing source 676 | # files to browse (i.e. when SOURCE_BROWSER is set to YES). 677 | 678 | FILTER_SOURCE_FILES = NO 679 | 680 | #--------------------------------------------------------------------------- 681 | # configuration options related to source browsing 682 | #--------------------------------------------------------------------------- 683 | 684 | # If the SOURCE_BROWSER tag is set to YES then a list of source files will 685 | # be generated. Documented entities will be cross-referenced with these sources. 686 | # Note: To get rid of all source code in the generated output, make sure also 687 | # VERBATIM_HEADERS is set to NO. 688 | 689 | SOURCE_BROWSER = YES 690 | 691 | # Setting the INLINE_SOURCES tag to YES will include the body 692 | # of functions and classes directly in the documentation. 693 | 694 | INLINE_SOURCES = NO 695 | 696 | # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 697 | # doxygen to hide any special comment blocks from generated source code 698 | # fragments. Normal C and C++ comments will always remain visible. 
699 | 700 | STRIP_CODE_COMMENTS = YES 701 | 702 | # If the REFERENCED_BY_RELATION tag is set to YES 703 | # then for each documented function all documented 704 | # functions referencing it will be listed. 705 | 706 | REFERENCED_BY_RELATION = YES 707 | 708 | # If the REFERENCES_RELATION tag is set to YES 709 | # then for each documented function all documented entities 710 | # called/used by that function will be listed. 711 | 712 | REFERENCES_RELATION = YES 713 | 714 | # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 715 | # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 716 | # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 717 | # link to the source code. 718 | # Otherwise they will link to the documentation. 719 | 720 | REFERENCES_LINK_SOURCE = YES 721 | 722 | # If the USE_HTAGS tag is set to YES then the references to source code 723 | # will point to the HTML generated by the htags(1) tool instead of doxygen 724 | # built-in source browser. The htags tool is part of GNU's global source 725 | # tagging system (see http://www.gnu.org/software/global/global.html). You 726 | # will need version 4.8.6 or higher. 727 | 728 | USE_HTAGS = NO 729 | 730 | # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 731 | # will generate a verbatim copy of the header file for each class for 732 | # which an include is specified. Set to NO to disable this. 733 | 734 | VERBATIM_HEADERS = YES 735 | 736 | #--------------------------------------------------------------------------- 737 | # configuration options related to the alphabetical class index 738 | #--------------------------------------------------------------------------- 739 | 740 | # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 741 | # of all compounds will be generated. Enable this if the project 742 | # contains a lot of classes, structs, unions or interfaces. 
743 | 744 | ALPHABETICAL_INDEX = YES 745 | 746 | # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 747 | # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 748 | # in which this list will be split (can be a number in the range [1..20]) 749 | 750 | COLS_IN_ALPHA_INDEX = 5 751 | 752 | # In case all classes in a project start with a common prefix, all 753 | # classes will be put under the same header in the alphabetical index. 754 | # The IGNORE_PREFIX tag can be used to specify one or more prefixes that 755 | # should be ignored while generating the index headers. 756 | 757 | IGNORE_PREFIX = 758 | 759 | #--------------------------------------------------------------------------- 760 | # configuration options related to the HTML output 761 | #--------------------------------------------------------------------------- 762 | 763 | # If the GENERATE_HTML tag is set to YES (the default) Doxygen will 764 | # generate HTML output. 765 | 766 | GENERATE_HTML = YES 767 | 768 | # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 769 | # If a relative path is entered the value of OUTPUT_DIRECTORY will be 770 | # put in front of it. If left blank `html' will be used as the default path. 771 | 772 | HTML_OUTPUT = 773 | 774 | # The HTML_FILE_EXTENSION tag can be used to specify the file extension for 775 | # each generated HTML page (for example: .htm,.php,.asp). If it is left blank 776 | # doxygen will generate files with .html extension. 777 | 778 | HTML_FILE_EXTENSION = .html 779 | 780 | # The HTML_HEADER tag can be used to specify a personal HTML header for 781 | # each generated HTML page. If it is left blank doxygen will generate a 782 | # standard header. 783 | 784 | HTML_HEADER = 785 | 786 | # The HTML_FOOTER tag can be used to specify a personal HTML footer for 787 | # each generated HTML page. If it is left blank doxygen will generate a 788 | # standard footer. 
789 | 790 | HTML_FOOTER = 791 | 792 | # If the HTML_TIMESTAMP tag is set to YES then the generated HTML 793 | # documentation will contain the timestamp. 794 | 795 | HTML_TIMESTAMP = NO 796 | 797 | # The HTML_STYLESHEET tag can be used to specify a user-defined cascading 798 | # style sheet that is used by each HTML page. It can be used to 799 | # fine-tune the look of the HTML output. If the tag is left blank doxygen 800 | # will generate a default style sheet. Note that doxygen will try to copy 801 | # the style sheet file to the HTML output directory, so don't put your own 802 | # stylesheet in the HTML output directory as well, or it will be erased! 803 | 804 | HTML_STYLESHEET = 805 | 806 | # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 807 | # files or namespaces will be aligned in HTML using tables. If set to 808 | # NO a bullet list will be used. 809 | 810 | # Obsolete # HTML_ALIGN_MEMBERS = YES 811 | 812 | # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 813 | # documentation will contain sections that can be hidden and shown after the 814 | # page has loaded. For this to work a browser that supports 815 | # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, 816 | # Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). 817 | 818 | HTML_DYNAMIC_SECTIONS = NO 819 | 820 | # If the GENERATE_DOCSET tag is set to YES, additional index files 821 | # will be generated that can be used as input for Apple's Xcode 3 822 | # integrated development environment, introduced with OSX 10.5 (Leopard). 823 | # To create a documentation set, doxygen will generate a Makefile in the 824 | # HTML output directory. Running make will produce the docset in that 825 | # directory and running "make install" will install the docset in 826 | # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 827 | # it at startup.
828 | # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. 829 | 830 | GENERATE_DOCSET = NO 831 | 832 | # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 833 | # feed. A documentation feed provides an umbrella under which multiple 834 | # documentation sets from a single provider (such as a company or product suite) 835 | # can be grouped. 836 | 837 | DOCSET_FEEDNAME = "Doxygen generated docs" 838 | 839 | # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 840 | # should uniquely identify the documentation set bundle. This should be a 841 | # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 842 | # will append .docset to the name. 843 | 844 | DOCSET_BUNDLE_ID = org.doxygen.Project 845 | 846 | # If the GENERATE_HTMLHELP tag is set to YES, additional index files 847 | # will be generated that can be used as input for tools like the 848 | # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 849 | # of the generated HTML documentation. 850 | 851 | GENERATE_HTMLHELP = NO 852 | 853 | # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 854 | # be used to specify the file name of the resulting .chm file. You 855 | # can add a path in front of the file if the result should not be 856 | # written to the html output directory. 857 | 858 | CHM_FILE = 859 | 860 | # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 861 | # be used to specify the location (absolute path including file name) of 862 | # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 863 | # the HTML help compiler on the generated index.hhp. 864 | 865 | HHC_LOCATION = 866 | 867 | # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 868 | # controls if a separate .chi index file is generated (YES) or that 869 | # it should be included in the master .chm file (NO). 
870 | 871 | GENERATE_CHI = NO 872 | 873 | # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 874 | # is used to encode HtmlHelp index (hhk), content (hhc) and project file 875 | # content. 876 | 877 | CHM_INDEX_ENCODING = 878 | 879 | # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 880 | # controls whether a binary table of contents is generated (YES) or a 881 | # normal table of contents (NO) in the .chm file. 882 | 883 | BINARY_TOC = NO 884 | 885 | # The TOC_EXPAND flag can be set to YES to add extra items for group members 886 | # to the contents of the HTML help documentation and to the tree view. 887 | 888 | TOC_EXPAND = NO 889 | 890 | # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER 891 | # are set, an additional index file will be generated that can be used as input for 892 | # Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated 893 | # HTML documentation. 894 | 895 | GENERATE_QHP = NO 896 | 897 | # If the QHG_LOCATION tag is specified, the QCH_FILE tag can 898 | # be used to specify the file name of the resulting .qch file. 899 | # The path specified is relative to the HTML output folder. 900 | 901 | QCH_FILE = 902 | 903 | # The QHP_NAMESPACE tag specifies the namespace to use when generating 904 | # Qt Help Project output. For more information please see 905 | # http://doc.trolltech.com/qthelpproject.html#namespace 906 | 907 | QHP_NAMESPACE = 908 | 909 | # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 910 | # Qt Help Project output. For more information please see 911 | # http://doc.trolltech.com/qthelpproject.html#virtual-folders 912 | 913 | QHP_VIRTUAL_FOLDER = doc 914 | 915 | # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. 
916 | # For more information please see 917 | # http://doc.trolltech.com/qthelpproject.html#custom-filters 918 | 919 | QHP_CUST_FILTER_NAME = 920 | 921 | # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see 922 | # Qt Help Project / Custom Filters. 923 | 924 | QHP_CUST_FILTER_ATTRS = 925 | 926 | # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's 927 | # filter section matches. 928 | # Qt Help Project / Filter Attributes. 929 | 930 | QHP_SECT_FILTER_ATTRS = 931 | 932 | # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 933 | # be used to specify the location of Qt's qhelpgenerator. 934 | # If non-empty doxygen will try to run qhelpgenerator on the generated 935 | # .qhp file. 936 | 937 | QHG_LOCATION = 938 | 939 | # The DISABLE_INDEX tag can be used to turn on/off the condensed index at 940 | # top of each HTML page. The value NO (the default) enables the index and 941 | # the value YES disables it. 942 | 943 | DISABLE_INDEX = NO 944 | 945 | # This tag can be used to set the number of enum values (range [1..20]) 946 | # that doxygen will group on one line in the generated HTML documentation. 947 | 948 | ENUM_VALUES_PER_LINE = 4 949 | 950 | # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 951 | # structure should be generated to display hierarchical information. 952 | # If the tag value is set to YES, a side panel will be generated 953 | # containing a tree-like index structure (just like the one that 954 | # is generated for HTML Help). For this to work a browser that supports 955 | # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). 956 | # Windows users are probably better off using the HTML help feature.
957 | 958 | GENERATE_TREEVIEW = YES 959 | 960 | # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, 961 | # and Class Hierarchy pages using a tree view instead of an ordered list. 962 | 963 | # Obsolete # USE_INLINE_TREES = NO 964 | 965 | # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 966 | # used to set the initial width (in pixels) of the frame in which the tree 967 | # is shown. 968 | 969 | TREEVIEW_WIDTH = 250 970 | 971 | # Use this tag to change the font size of Latex formulas included 972 | # as images in the HTML documentation. The default is 10. Note that 973 | # when you change the font size after a successful doxygen run you need 974 | # to manually remove any form_*.png images from the HTML output directory 975 | # to force them to be regenerated. 976 | 977 | FORMULA_FONTSIZE = 10 978 | 979 | # When the SEARCHENGINE tag is enabled, doxygen will generate a search box for the HTML output. The underlying search engine uses javascript 980 | # and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP) 981 | # there is already a search function so this one should typically 982 | # be disabled. 983 | 984 | SEARCHENGINE = YES 985 | 986 | #--------------------------------------------------------------------------- 987 | # configuration options related to the LaTeX output 988 | #--------------------------------------------------------------------------- 989 | 990 | # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 991 | # generate Latex output. 992 | 993 | GENERATE_LATEX = NO 994 | 995 | # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 996 | # If a relative path is entered the value of OUTPUT_DIRECTORY will be 997 | # put in front of it. If left blank `latex' will be used as the default path.
998 | 999 | LATEX_OUTPUT = 1000 | 1001 | # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 1002 | # invoked. If left blank `latex' will be used as the default command name. 1003 | 1004 | LATEX_CMD_NAME = latex 1005 | 1006 | # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 1007 | # generate index for LaTeX. If left blank `makeindex' will be used as the 1008 | # default command name. 1009 | 1010 | MAKEINDEX_CMD_NAME = makeindex 1011 | 1012 | # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 1013 | # LaTeX documents. This may be useful for small projects and may help to 1014 | # save some trees in general. 1015 | 1016 | COMPACT_LATEX = NO 1017 | 1018 | # The PAPER_TYPE tag can be used to set the paper type that is used 1019 | # by the printer. Possible values are: a4, a4wide, letter, legal and 1020 | # executive. If left blank a4wide will be used. 1021 | 1022 | PAPER_TYPE = a4wide 1023 | 1024 | # The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX 1025 | # packages that should be included in the LaTeX output. 1026 | 1027 | EXTRA_PACKAGES = 1028 | 1029 | # The LATEX_HEADER tag can be used to specify a personal LaTeX header for 1030 | # the generated latex document. The header should contain everything until 1031 | # the first chapter. If it is left blank doxygen will generate a 1032 | # standard header. Notice: only use this tag if you know what you are doing! 1033 | 1034 | LATEX_HEADER = 1035 | 1036 | # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 1037 | # is prepared for conversion to pdf (using ps2pdf). The pdf file will 1038 | # contain links (just like the HTML output) instead of page references. 1039 | # This makes the output suitable for online browsing using a pdf viewer. 1040 | 1041 | PDF_HYPERLINKS = NO 1042 | 1043 | # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 1044 | # plain latex in the generated Makefile.
Set this option to YES to get a 1045 | # higher quality PDF documentation. 1046 | 1047 | USE_PDFLATEX = NO 1048 | 1049 | # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 1050 | # command to the generated LaTeX files. This will instruct LaTeX to keep 1051 | # running if errors occur, instead of asking the user for help. 1052 | # This option is also used when generating formulas in HTML. 1053 | 1054 | LATEX_BATCHMODE = NO 1055 | 1056 | # If LATEX_HIDE_INDICES is set to YES then doxygen will not 1057 | # include the index chapters (such as File Index, Compound Index, etc.) 1058 | # in the output. 1059 | 1060 | LATEX_HIDE_INDICES = NO 1061 | 1062 | # If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER. 1063 | 1064 | LATEX_SOURCE_CODE = NO 1065 | 1066 | #--------------------------------------------------------------------------- 1067 | # configuration options related to the RTF output 1068 | #--------------------------------------------------------------------------- 1069 | 1070 | # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 1071 | # The RTF output is optimized for Word 97 and may not look very pretty with 1072 | # other RTF readers or editors. 1073 | 1074 | GENERATE_RTF = NO 1075 | 1076 | # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 1077 | # If a relative path is entered the value of OUTPUT_DIRECTORY will be 1078 | # put in front of it. If left blank `rtf' will be used as the default path. 1079 | 1080 | RTF_OUTPUT = 1081 | 1082 | # If the COMPACT_RTF tag is set to YES Doxygen generates more compact 1083 | # RTF documents. This may be useful for small projects and may help to 1084 | # save some trees in general. 
1085 | 1086 | COMPACT_RTF = NO 1087 | 1088 | # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 1089 | # will contain hyperlink fields. The RTF file will 1090 | # contain links (just like the HTML output) instead of page references. 1091 | # This makes the output suitable for online browsing using WORD or other 1092 | # programs which support those fields. 1093 | # Note: wordpad (write) and others do not support links. 1094 | 1095 | RTF_HYPERLINKS = NO 1096 | 1097 | # Load stylesheet definitions from file. Syntax is similar to doxygen's 1098 | # config file, i.e. a series of assignments. You only have to provide 1099 | # replacements, missing definitions are set to their default value. 1100 | 1101 | RTF_STYLESHEET_FILE = 1102 | 1103 | # Set optional variables used in the generation of an rtf document. 1104 | # Syntax is similar to doxygen's config file. 1105 | 1106 | RTF_EXTENSIONS_FILE = 1107 | 1108 | #--------------------------------------------------------------------------- 1109 | # configuration options related to the man page output 1110 | #--------------------------------------------------------------------------- 1111 | 1112 | # If the GENERATE_MAN tag is set to YES (the default) Doxygen will 1113 | # generate man pages 1114 | 1115 | GENERATE_MAN = YES 1116 | 1117 | # The MAN_OUTPUT tag is used to specify where the man pages will be put. 1118 | # If a relative path is entered the value of OUTPUT_DIRECTORY will be 1119 | # put in front of it. If left blank `man' will be used as the default path. 1120 | 1121 | MAN_OUTPUT = 1122 | 1123 | # The MAN_EXTENSION tag determines the extension that is added to 1124 | # the generated man pages (default is the subroutine's section .3) 1125 | 1126 | MAN_EXTENSION = 1127 | 1128 | # If the MAN_LINKS tag is set to YES and Doxygen generates man output, 1129 | # then it will generate one additional man file for each entity 1130 | # documented in the real man page(s). 
These additional files 1131 | # only source the real man page, but without them the man command 1132 | # would be unable to find the correct page. The default is NO. 1133 | 1134 | MAN_LINKS = NO 1135 | 1136 | #--------------------------------------------------------------------------- 1137 | # configuration options related to the XML output 1138 | #--------------------------------------------------------------------------- 1139 | 1140 | # If the GENERATE_XML tag is set to YES Doxygen will 1141 | # generate an XML file that captures the structure of 1142 | # the code including all documentation. 1143 | 1144 | GENERATE_XML = NO 1145 | 1146 | # The XML_OUTPUT tag is used to specify where the XML pages will be put. 1147 | # If a relative path is entered the value of OUTPUT_DIRECTORY will be 1148 | # put in front of it. If left blank `xml' will be used as the default path. 1149 | 1150 | XML_OUTPUT = xml 1151 | 1152 | # The XML_SCHEMA tag can be used to specify an XML schema, 1153 | # which can be used by a validating XML parser to check the 1154 | # syntax of the XML files. 1155 | 1156 | # XML_SCHEMA = 1157 | 1158 | # The XML_DTD tag can be used to specify an XML DTD, 1159 | # which can be used by a validating XML parser to check the 1160 | # syntax of the XML files. 1161 | 1162 | # XML_DTD = 1163 | 1164 | # If the XML_PROGRAMLISTING tag is set to YES Doxygen will 1165 | # dump the program listings (including syntax highlighting 1166 | # and cross-referencing information) to the XML output. Note that 1167 | # enabling this will significantly increase the size of the XML output. 
1168 | 1169 | XML_PROGRAMLISTING = YES 1170 | 1171 | #--------------------------------------------------------------------------- 1172 | # configuration options for the AutoGen Definitions output 1173 | #--------------------------------------------------------------------------- 1174 | 1175 | # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 1176 | # generate an AutoGen Definitions (see autogen.sf.net) file 1177 | # that captures the structure of the code including all 1178 | # documentation. Note that this feature is still experimental 1179 | # and incomplete at the moment. 1180 | 1181 | GENERATE_AUTOGEN_DEF = NO 1182 | 1183 | #--------------------------------------------------------------------------- 1184 | # configuration options related to the Perl module output 1185 | #--------------------------------------------------------------------------- 1186 | 1187 | # If the GENERATE_PERLMOD tag is set to YES Doxygen will 1188 | # generate a Perl module file that captures the structure of 1189 | # the code including all documentation. Note that this 1190 | # feature is still experimental and incomplete at the 1191 | # moment. 1192 | 1193 | GENERATE_PERLMOD = NO 1194 | 1195 | # If the PERLMOD_LATEX tag is set to YES Doxygen will generate 1196 | # the necessary Makefile rules, Perl scripts and LaTeX code to be able 1197 | # to generate PDF and DVI output from the Perl module output. 1198 | 1199 | PERLMOD_LATEX = NO 1200 | 1201 | # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 1202 | # nicely formatted so it can be parsed by a human reader. 1203 | # This is useful 1204 | # if you want to understand what is going on. 1205 | # On the other hand, if this 1206 | # tag is set to NO the size of the Perl module output will be much smaller 1207 | # and Perl will parse it just the same. 
1208 | 1209 | PERLMOD_PRETTY = YES 1210 | 1211 | # The names of the make variables in the generated doxyrules.make file 1212 | # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 1213 | # This is useful so different doxyrules.make files included by the same 1214 | # Makefile don't overwrite each other's variables. 1215 | 1216 | PERLMOD_MAKEVAR_PREFIX = 1217 | 1218 | #--------------------------------------------------------------------------- 1219 | # Configuration options related to the preprocessor 1220 | #--------------------------------------------------------------------------- 1221 | 1222 | # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 1223 | # evaluate all C-preprocessor directives found in the sources and include 1224 | # files. 1225 | 1226 | ENABLE_PREPROCESSING = YES 1227 | 1228 | # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 1229 | # names in the source code. If set to NO (the default) only conditional 1230 | # compilation will be performed. Macro expansion can be done in a controlled 1231 | # way by setting EXPAND_ONLY_PREDEF to YES. 1232 | 1233 | MACRO_EXPANSION = NO 1234 | 1235 | # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 1236 | # then the macro expansion is limited to the macros specified with the 1237 | # PREDEFINED and EXPAND_AS_DEFINED tags. 1238 | 1239 | EXPAND_ONLY_PREDEF = NO 1240 | 1241 | # If the SEARCH_INCLUDES tag is set to YES (the default) the include files 1242 | # in the INCLUDE_PATH (see below) will be searched if a #include is found. 1243 | 1244 | SEARCH_INCLUDES = YES 1245 | 1246 | # The INCLUDE_PATH tag can be used to specify one or more directories that 1247 | # contain include files that are not input files but should be processed by 1248 | # the preprocessor.
1249 | 1250 | INCLUDE_PATH = 1251 | 1252 | # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 1253 | # patterns (like *.h and *.hpp) to filter out the header-files in the 1254 | # directories. If left blank, the patterns specified with FILE_PATTERNS will 1255 | # be used. 1256 | 1257 | INCLUDE_FILE_PATTERNS = 1258 | 1259 | # The PREDEFINED tag can be used to specify one or more macro names that 1260 | # are defined before the preprocessor is started (similar to the -D option of 1261 | # gcc). The argument of the tag is a list of macros of the form: name 1262 | # or name=definition (no spaces). If the definition and the = are 1263 | # omitted =1 is assumed. To prevent a macro definition from being 1264 | # undefined via #undef or recursively expanded use the := operator 1265 | # instead of the = operator. 1266 | 1267 | PREDEFINED = 1268 | 1269 | # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 1270 | # this tag can be used to specify a list of macro names that should be expanded. 1271 | # The macro definition that is found in the sources will be used. 1272 | # Use the PREDEFINED tag if you want to use a different macro definition. 1273 | 1274 | EXPAND_AS_DEFINED = 1275 | 1276 | # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 1277 | # doxygen's preprocessor will remove all function-like macros that are alone 1278 | # on a line, have an all uppercase name, and do not end with a semicolon. Such 1279 | # function macros are typically used for boiler-plate code, and will confuse 1280 | # the parser if not removed. 1281 | 1282 | SKIP_FUNCTION_MACROS = YES 1283 | 1284 | #--------------------------------------------------------------------------- 1285 | # Configuration::additions related to external references 1286 | #--------------------------------------------------------------------------- 1287 | 1288 | # The TAGFILES option can be used to specify one or more tagfiles. 
1289 | # Optionally an initial location of the external documentation 1290 | # can be added for each tagfile. The format of a tag file without 1291 | # this location is as follows: 1292 | # 1293 | # TAGFILES = file1 file2 ... 1294 | # Adding location for the tag files is done as follows: 1295 | # 1296 | # TAGFILES = file1=loc1 "file2 = loc2" ... 1297 | # where "loc1" and "loc2" can be relative or absolute paths or 1298 | # URLs. If a location is present for each tag, the installdox tool 1299 | # does not have to be run to correct the links. 1300 | # Note that each tag file must have a unique name 1301 | # (where the name does NOT include the path) 1302 | # If a tag file is not located in the directory in which doxygen 1303 | # is run, you must also specify the path to the tagfile here. 1304 | 1305 | TAGFILES = 1306 | 1307 | # When a file name is specified after GENERATE_TAGFILE, doxygen will create 1308 | # a tag file that is based on the input files it reads. 1309 | 1310 | GENERATE_TAGFILE = 1311 | 1312 | # If the ALLEXTERNALS tag is set to YES all external classes will be listed 1313 | # in the class index. If set to NO only the inherited external classes 1314 | # will be listed. 1315 | 1316 | ALLEXTERNALS = NO 1317 | 1318 | # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 1319 | # in the modules index. If set to NO, only the current project's groups will 1320 | # be listed. 1321 | 1322 | EXTERNAL_GROUPS = YES 1323 | 1324 | # The PERL_PATH should be the absolute path and name of the perl script 1325 | # interpreter (i.e. the result of `which perl'). 
1326 | 1327 | PERL_PATH = 1328 | 1329 | #--------------------------------------------------------------------------- 1330 | # Configuration options related to the dot tool 1331 | #--------------------------------------------------------------------------- 1332 | 1333 | # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 1334 | # generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base 1335 | # or super classes. Setting the tag to NO turns the diagrams off. Note that 1336 | # this option is superseded by the HAVE_DOT option below. This is only a 1337 | # fallback. It is recommended to install and use dot, since it yields more 1338 | # powerful graphs. 1339 | 1340 | CLASS_DIAGRAMS = NO 1341 | 1342 | # You can define message sequence charts within doxygen comments using the \msc 1343 | # command. Doxygen will then run the mscgen tool (see 1344 | # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 1345 | # documentation. The MSCGEN_PATH tag allows you to specify the directory where 1346 | # the mscgen tool resides. If left empty the tool is assumed to be found in the 1347 | # default search path. 1348 | 1349 | MSCGEN_PATH = 1350 | 1351 | # If set to YES, the inheritance and collaboration graphs will hide 1352 | # inheritance and usage relations if the target is undocumented 1353 | # or is not a class. 1354 | 1355 | HIDE_UNDOC_RELATIONS = YES 1356 | 1357 | # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 1358 | # available from the path. This tool is part of Graphviz, a graph visualization 1359 | # toolkit from AT&T and Lucent Bell Labs. The other options in this section 1360 | # have no effect if this option is set to NO (the default) 1361 | 1362 | HAVE_DOT = YES 1363 | 1364 | # By default doxygen will write a font called FreeSans.ttf to the output 1365 | # directory and reference it in all dot files that doxygen generates.
This 1366 | # font does not include all possible unicode characters however, so when you need 1367 | # these (or just want a differently looking font) you can specify the font name 1368 | # using DOT_FONTNAME. You need to make sure dot is able to find the font, 1369 | # which can be done by putting it in a standard location or by setting the 1370 | # DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory 1371 | # containing the font. 1372 | 1373 | #DOT_FONTNAME = FreeSans 1374 | 1375 | # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 1376 | # The default size is 10pt. 1377 | 1378 | DOT_FONTSIZE = 10 1379 | 1380 | # By default doxygen will tell dot to use the output directory to look for the 1381 | # FreeSans.ttf font (which doxygen will put there itself). If you specify a 1382 | # different font using DOT_FONTNAME you can set the path where dot 1383 | # can find it using this tag. 1384 | 1385 | DOT_FONTPATH = 1386 | 1387 | # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 1388 | # will generate a graph for each documented class showing the direct and 1389 | # indirect inheritance relations. Setting this tag to YES will force 1390 | # the CLASS_DIAGRAMS tag to NO. 1391 | 1392 | CLASS_GRAPH = YES 1393 | 1394 | # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 1395 | # will generate a graph for each documented class showing the direct and 1396 | # indirect implementation dependencies (inheritance, containment, and 1397 | # class references variables) of the class with other documented classes.
1398 | 1399 | COLLABORATION_GRAPH = YES 1400 | 1401 | # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 1402 | # will generate a graph for groups, showing the direct groups dependencies 1403 | 1404 | GROUP_GRAPHS = YES 1405 | 1406 | # If the UML_LOOK tag is set to YES doxygen will generate inheritance and 1407 | # collaboration diagrams in a style similar to the OMG's Unified Modeling 1408 | # Language. 1409 | 1410 | UML_LOOK = NO 1411 | 1412 | # If set to YES, the inheritance and collaboration graphs will show the 1413 | # relations between templates and their instances. 1414 | 1415 | TEMPLATE_RELATIONS = NO 1416 | 1417 | # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 1418 | # tags are set to YES then doxygen will generate a graph for each documented 1419 | # file showing the direct and indirect include dependencies of the file with 1420 | # other documented files. 1421 | 1422 | INCLUDE_GRAPH = YES 1423 | 1424 | # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 1425 | # HAVE_DOT tags are set to YES then doxygen will generate a graph for each 1426 | # documented header file showing the documented files that directly or 1427 | # indirectly include this file. 1428 | 1429 | INCLUDED_BY_GRAPH = YES 1430 | 1431 | # If the CALL_GRAPH and HAVE_DOT options are set to YES then 1432 | # doxygen will generate a call dependency graph for every global function 1433 | # or class method. Note that enabling this option will significantly increase 1434 | # the time of a run. So in most cases it will be better to enable call graphs 1435 | # for selected functions only using the \callgraph command. 1436 | 1437 | CALL_GRAPH = NO 1438 | 1439 | # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 1440 | # doxygen will generate a caller dependency graph for every global function 1441 | # or class method. Note that enabling this option will significantly increase 1442 | # the time of a run. 
So in most cases it will be better to enable caller 1443 | # graphs for selected functions only using the \callergraph command. 1444 | 1445 | CALLER_GRAPH = NO 1446 | 1447 | # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 1448 | # will generate a graphical hierarchy of all classes instead of a textual one. 1449 | 1450 | GRAPHICAL_HIERARCHY = YES 1451 | 1452 | # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 1453 | # then doxygen will show the dependencies a directory has on other directories 1454 | # in a graphical way. The dependency relations are determined by the #include 1455 | # relations between the files in the directories. 1456 | 1457 | DIRECTORY_GRAPH = YES 1458 | 1459 | # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 1460 | # generated by dot. Possible values are png, jpg, or gif. 1461 | # If left blank png will be used. 1462 | 1463 | DOT_IMAGE_FORMAT = png 1464 | 1465 | # The tag DOT_PATH can be used to specify the path where the dot tool can be 1466 | # found. If left blank, it is assumed the dot tool can be found in the path. 1467 | 1468 | DOT_PATH = 1469 | 1470 | # The DOTFILE_DIRS tag can be used to specify one or more directories that 1471 | # contain dot files that are included in the documentation (see the 1472 | # \dotfile command). 1473 | 1474 | DOTFILE_DIRS = 1475 | 1476 | # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 1477 | # nodes that will be shown in the graph. If the number of nodes in a graph 1478 | # becomes larger than this value, doxygen will truncate the graph, which is 1479 | # visualized by representing a node as a red box. Note that if the 1480 | # number of direct children of the root node in a graph is already larger than 1481 | # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 1482 | # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
1483 | 1484 | DOT_GRAPH_MAX_NODES = 100 1485 | 1486 | # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 1487 | # graphs generated by dot. A depth value of 3 means that only nodes reachable 1488 | # from the root by following a path via at most 3 edges will be shown. Nodes 1489 | # that lie further from the root node will be omitted. Note that setting this 1490 | # option to 1 or 2 may greatly reduce the computation time needed for large 1491 | # code bases. Also note that the size of a graph can be further restricted by 1492 | # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. 1493 | 1494 | MAX_DOT_GRAPH_DEPTH = 0 1495 | 1496 | # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 1497 | # background. This is disabled by default, because dot on Windows does not 1498 | # seem to support this out of the box. Warning: Depending on the platform used, 1499 | # enabling this option may lead to badly anti-aliased labels on the edges of 1500 | # a graph (i.e. they become hard to read). 1501 | 1502 | DOT_TRANSPARENT = NO 1503 | 1504 | # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output 1505 | # files in one run (i.e. multiple -o and -T options on the command line). This 1506 | # makes dot run faster, but since only newer versions of dot (>1.8.10) 1507 | # support this, this feature is disabled by default. 1508 | 1509 | DOT_MULTI_TARGETS = YES 1510 | 1511 | # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 1512 | # generate a legend page explaining the meaning of the various boxes and 1513 | # arrows in the dot generated graphs. 1514 | 1515 | GENERATE_LEGEND = YES 1516 | 1517 | # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 1518 | # remove the intermediate dot files that are used to generate 1519 | # the various graphs. 
1520 | 1521 | DOT_CLEANUP = NO 1522 | -------------------------------------------------------------------------------- /include/popvcf: -------------------------------------------------------------------------------- 1 | ../src -------------------------------------------------------------------------------- /include/popvcf.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../src/decode.hpp" 4 | #include "../src/encode.hpp" 5 | #include "../src/sequence_utils.hpp" 6 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.8) 2 | 3 | # Update with "find src -name "*.?pp" | sort | awk '$1 !~ /main.cpp/{print " "$1}'" in project root directory 4 | set(popvcf_sources 5 | src/encode.cpp 6 | src/encode.hpp 7 | src/decode.cpp 8 | src/decode.hpp 9 | src/sequence_utils.cpp 10 | src/sequence_utils.hpp 11 | PARENT_SCOPE) 12 | -------------------------------------------------------------------------------- /src/decode.cpp: -------------------------------------------------------------------------------- 1 | #include "decode.hpp" 2 | 3 | #include //std::copy 4 | #include // std::array 5 | #include 6 | #include // std::stdin 7 | #include // std::memmove 8 | #include // std::cerr 9 | #include 10 | #include 11 | #include // std::string 12 | #include // std::vector 13 | 14 | #include "io.hpp" 15 | #include "sequence_utils.hpp" // ascii_cstring_to_int 16 | 17 | #include "htslib/bgzf.h" 18 | #include "htslib/hts.h" 19 | #include "htslib/kseq.h" 20 | #include "htslib/tbx.h" 21 | 22 | namespace popvcf 23 | { 24 | void decode_file(std::string const & input_fn, bool const is_bgzf_input) 25 | { 26 | Tdec_array_buf buffer_in; // input buffer 27 | std::vector buffer_out; // output buffer 28 | DecodeData dd; // data used to keep track of buffers while decoding 
29 | 30 | /// Input streams 31 | popvcf::bgzf_ptr in_bgzf(nullptr, popvcf::close_bgzf); 32 | popvcf::file_ptr in_vcf(nullptr, popvcf::close_vcf_nop); 33 | 34 | /// Open input file based on options 35 | if (is_bgzf_input) 36 | in_bgzf = popvcf::open_bgzf(input_fn, "r"); 37 | else 38 | in_vcf = popvcf::open_vcf(input_fn, "r"); 39 | 40 | buffer_out.reserve(16 * DEC_BUFFER_SIZE); 41 | 42 | /// Read first batch of data 43 | if (is_bgzf_input) 44 | dd.in_size = bgzf_read(in_bgzf.get(), buffer_in.data(), DEC_BUFFER_SIZE); 45 | else 46 | dd.in_size = fread(buffer_in.data(), 1, DEC_BUFFER_SIZE, in_vcf.get()); 47 | 48 | long new_bytes = dd.in_size; 49 | 50 | /// Outer loop - loop while there is some data to decode from the input stream 51 | while (new_bytes != 0) 52 | { 53 | decode_buffer(buffer_out, buffer_in, dd); 54 | 55 | /// Write buffer_out to stdout 56 | fwrite(buffer_out.data(), 1, buffer_out.size(), stdout); 57 | buffer_out.resize(0); // Clears output buffer, but does not deallocate 58 | new_bytes = -static_cast(dd.in_size); 59 | 60 | /// Read more data 61 | if (is_bgzf_input) 62 | dd.in_size += bgzf_read(in_bgzf.get(), buffer_in.data() + dd.in_size, DEC_BUFFER_SIZE - dd.in_size); 63 | else 64 | dd.in_size += fread(buffer_in.data() + dd.in_size, 1, DEC_BUFFER_SIZE - dd.in_size, in_vcf.get()); 65 | 66 | new_bytes += dd.in_size; 67 | } /// ends outer loop 68 | 69 | assert(buffer_out.size() == 0); 70 | 71 | if (dd.in_size != 0) 72 | { 73 | std::cerr << "[popvcf] WARNING: Unexpected ending of the VCF data, possibly the file is truncated.\n"; 74 | 75 | // write output buffer 76 | fwrite(buffer_in.data(), 1, dd.in_size, stdout); // write output buffer 77 | } 78 | } 79 | 80 | void decode_region(std::string const & popvcf_fn, std::string const & region) 81 | { 82 | assert(region.size() > 0); 83 | std::vector buffer_in; // input buffer 84 | buffer_in.reserve(DEC_BUFFER_SIZE); 85 | std::vector buffer_out; // output buffer 86 | DecodeData dd; // data used to keep track of 
buffers while decoding 87 | 88 | /// parse region 89 | std::string chrom; 90 | long begin{-1}; 91 | long end{std::numeric_limits::max()}; 92 | 93 | if (auto colon = region.find(':'); colon == std::string::npos) 94 | { 95 | chrom = region; 96 | } 97 | else 98 | { 99 | chrom = region.substr(0, colon); 100 | 101 | if (auto dash = region.find('-', colon + 1); dash == std::string::npos) 102 | { 103 | auto ret = std::from_chars(region.data() + colon + 1, region.data() + region.size(), begin); 104 | 105 | if (ret.ec != std::errc()) 106 | throw std::runtime_error("Could not parse region: " + region); 107 | 108 | end = begin; 109 | } 110 | else 111 | { 112 | auto ret_begin = std::from_chars(region.data() + colon + 1, region.data() + dash, begin); 113 | 114 | if (ret_begin.ec != std::errc()) 115 | throw std::runtime_error("Could not parse region: " + region); 116 | 117 | auto ret_end = std::from_chars(region.data() + dash + 1, region.data() + region.size(), end); 118 | 119 | if (ret_end.ec != std::errc()) 120 | throw std::runtime_error("Could not parse region: " + region); 121 | } 122 | 123 | dd.begin = begin; 124 | dd.end = end; 125 | } 126 | 127 | /// Determine the region to query 128 | std::string safe_region = chrom; 129 | long safe_begin; 130 | 131 | if (begin >= 0) 132 | { 133 | safe_begin = std::max(1l, (begin / 10000l) * 10000l); 134 | safe_region.push_back(':'); 135 | safe_region.append(std::to_string(std::max(1l, safe_begin))); 136 | safe_region.push_back('-'); 137 | safe_region.append(std::to_string(end)); 138 | } 139 | else 140 | { 141 | safe_begin = 0; 142 | } 143 | 144 | /// Input streams 145 | popvcf::hts_file_ptr in_bgzf = popvcf::open_hts_file(popvcf_fn.c_str(), "r"); // open popvcf.gz 146 | popvcf::tbx_t_ptr in_tbx = popvcf::open_tbx_t(popvcf_fn.c_str()); // open popvcf.gz.tbi 147 | popvcf::hts_itr_t_ptr in_it = popvcf::open_hts_itr_t(in_tbx.get(), safe_region.c_str()); // query region 148 | 149 | /// Write the header lines 150 | kstring_t str = {0, 0, 0}; 
151 | 152 | while (hts_getline(in_bgzf.get(), KS_SEP_LINE, &str) >= 0) 153 | { 154 | if (!str.l || str.s[0] != in_tbx->conf.meta_char) 155 | break; 156 | 157 | fwrite(str.s, 1, str.l, stdout); 158 | fputs("\n", stdout); 159 | } 160 | 161 | // return here, after writing header, if there are no records in the region 162 | if (in_it == nullptr) 163 | { 164 | free(str.s); 165 | return; 166 | } 167 | 168 | int ret = tbx_itr_next(in_bgzf.get(), in_tbx.get(), in_it.get(), &str); 169 | 170 | while (ret > 0) 171 | { 172 | long vcf_pos = get_vcf_pos(str.s, str.s + str.l); 173 | 174 | if (vcf_pos >= safe_begin) 175 | { 176 | buffer_in.insert(buffer_in.end(), str.s, str.s + str.l); 177 | buffer_in.push_back('\n'); 178 | 179 | decode_buffer(buffer_out, buffer_in, dd); 180 | 181 | /// Write buffer_out to stdout 182 | fwrite(buffer_out.data(), 1, buffer_out.size(), stdout); 183 | 184 | /// Clears output buffer, but does not deallocate 185 | buffer_out.resize(0); 186 | 187 | /// Check if end position has been passed 188 | if (vcf_pos > dd.end) 189 | break; 190 | } 191 | 192 | /// Read more data 193 | ret = tbx_itr_next(in_bgzf.get(), in_tbx.get(), in_it.get(), &str); 194 | } 195 | 196 | free(str.s); 197 | } 198 | 199 | } // namespace popvcf 200 | -------------------------------------------------------------------------------- /src/decode.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include "sequence_utils.hpp" 16 | 17 | #include 18 | 19 | namespace popvcf 20 | { 21 | class DecodeData 22 | { 23 | public: 24 | std::size_t field{0}; //!< Current vcf field 25 | std::size_t in_size{0}; //!< Size of input buffer. 26 | std::size_t b{0}; //!< Field begin index in input buffer. 
27 | std::size_t i{b}; //!< Curent index in input buffer 28 | bool header_line{true}; //!< True iff in header line 29 | bool in_region{true}; //!< True iff in region 30 | 31 | int64_t begin{-1}; 32 | int64_t end{std::numeric_limits::max()}; 33 | 34 | std::vector prev_field2uid{}; 35 | std::vector prev_unique_fields{}; 36 | phmap::flat_hash_map prev_map_to_unique_fields{}; 37 | 38 | int32_t stored_alt{0}; 39 | int32_t n_alt{-1}; 40 | std::string next_contig{}; 41 | std::vector field2uid{}; 42 | std::vector unique_fields{}; 43 | phmap::flat_hash_map map_to_unique_fields{}; 44 | 45 | inline void clear_line(int32_t next_n_alt) 46 | { 47 | next_n_alt += stored_alt; 48 | stored_alt = 0; 49 | 50 | if (next_n_alt == n_alt) 51 | { 52 | std::swap(prev_field2uid, field2uid); 53 | std::swap(prev_unique_fields, unique_fields); 54 | std::swap(prev_map_to_unique_fields, map_to_unique_fields); 55 | } 56 | 57 | n_alt = next_n_alt; 58 | field2uid.resize(0); 59 | unique_fields.resize(0); 60 | map_to_unique_fields.clear(); 61 | } 62 | }; 63 | 64 | template 65 | inline void set_input_size(Tbuffer_in & buffer_in, DecodeData & dd) 66 | { 67 | dd.in_size = buffer_in.size(); 68 | } 69 | 70 | template <> 71 | inline void set_input_size(Tdec_array_buf & /*buffer_in*/, DecodeData & /*dd*/) 72 | { 73 | // Do nothing. 74 | // NOTE: dd.in_size must be set prior to calling decode_buffer in arrays 75 | } 76 | 77 | //! Decodes an input buffer. Output is written in \a buffer_out . 
78 | template 79 | inline void decode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, DecodeData & dd) 80 | { 81 | set_input_size(buffer_in, dd); 82 | std::size_t constexpr N_FIELDS_SITE_DATA{9}; 83 | 84 | // inner loop - Loops over each character in the input buffer 85 | while (dd.i < dd.in_size) 86 | { 87 | char const b_in = buffer_in[dd.i]; 88 | 89 | if (b_in != '\t' && b_in != '\n') 90 | { 91 | ++dd.i; // we are in a vcf field 92 | continue; 93 | } 94 | 95 | if (dd.field == 0) 96 | { 97 | dd.header_line = buffer_in[dd.b] == '#'; // check if in header line 98 | 99 | if (not dd.header_line) 100 | { 101 | ++dd.i; // include '\t' 102 | dd.next_contig.assign(&buffer_in[dd.b], dd.i - dd.b); 103 | 104 | /// Do not print this line until we know if we are inside the region or not 105 | dd.b = dd.i; 106 | ++dd.field; 107 | continue; 108 | } 109 | } 110 | else if (not dd.header_line) 111 | { 112 | if (dd.field == 1) /*POS field */ 113 | { 114 | long pos{}; 115 | std::from_chars(&buffer_in[dd.b], &buffer_in[dd.i], pos); // get pos 116 | dd.in_region = pos >= dd.begin && pos <= dd.end; 117 | 118 | if (!is_region || dd.in_region) /*print contig if we are inside the region*/ 119 | buffer_out.insert(buffer_out.end(), dd.next_contig.begin(), dd.next_contig.end()); 120 | } 121 | else if (dd.field == 4) /* ALT field */ 122 | { 123 | int32_t next_n_alt = std::count(&buffer_in[dd.b], &buffer_in[dd.i], ','); 124 | dd.clear_line(next_n_alt); 125 | } 126 | } 127 | 128 | if (dd.header_line || dd.field < N_FIELDS_SITE_DATA) 129 | { 130 | // write field without any encoding 131 | ++dd.i; // adds '\t' or '\n' 132 | 133 | if (!is_region || dd.in_region) 134 | buffer_out.insert(buffer_out.end(), &buffer_in[dd.b], &buffer_in[dd.i]); 135 | } 136 | else 137 | { 138 | long field_idx = dd.field - N_FIELDS_SITE_DATA; 139 | assert(field_idx == static_cast(dd.field2uid.size())); 140 | 141 | while (buffer_in[dd.b] == '$' || buffer_in[dd.b] == '&') 142 | { 143 | assert(dd.b < dd.i); 144 | 
assert(field_idx < static_cast(dd.prev_field2uid.size())); 145 | assert(dd.prev_field2uid[field_idx] < static_cast(dd.prev_unique_fields.size())); 146 | 147 | std::string const & prior_field = dd.prev_unique_fields[dd.prev_field2uid[field_idx]]; 148 | 149 | if (buffer_in[dd.b] == '$') 150 | { 151 | /* Unique field in this line. Same as field above. */ 152 | dd.map_to_unique_fields.insert(std::pair(prior_field, dd.unique_fields.size())); 153 | dd.field2uid.push_back(dd.unique_fields.size()); 154 | dd.unique_fields.push_back(prior_field); 155 | } 156 | else 157 | { 158 | /* Duplicate field in this line. Same as field above. */ 159 | assert(buffer_in[dd.b] == '&'); 160 | auto find_it = dd.map_to_unique_fields.find(prior_field); 161 | assert(find_it != dd.map_to_unique_fields.end()); 162 | dd.field2uid.push_back(find_it->second); 163 | } 164 | 165 | ++dd.b; 166 | ++dd.field; 167 | ++field_idx; 168 | 169 | if (!is_region || dd.in_region) 170 | { 171 | buffer_out.insert(buffer_out.end(), prior_field.begin(), prior_field.end()); 172 | 173 | if (dd.b < dd.i) 174 | buffer_out.push_back('\t'); 175 | } 176 | } 177 | 178 | if (buffer_in[dd.b] == '\n') 179 | { 180 | if (!is_region || dd.in_region) 181 | buffer_out.push_back('\n'); 182 | 183 | ++dd.i; 184 | } 185 | else if (buffer_in[dd.b] == '%') 186 | { 187 | // Unique field within the line but was seen in the previous line 188 | ++dd.b; // Get over '%' 189 | uint32_t const prev_unique_index = ascii_cstring_to_int(&buffer_in[dd.b], &buffer_in[dd.i++]); 190 | assert(prev_unique_index < dd.prev_unique_fields.size()); 191 | std::string const & prior_field = dd.prev_unique_fields[prev_unique_index]; 192 | 193 | dd.map_to_unique_fields.insert(std::pair(prior_field, dd.unique_fields.size())); 194 | dd.field2uid.push_back(dd.unique_fields.size()); 195 | dd.unique_fields.push_back(prior_field); 196 | 197 | if (!is_region || dd.in_region) 198 | { 199 | buffer_out.insert(buffer_out.end(), prior_field.begin(), prior_field.end()); 200 | 
buffer_out.push_back(b_in); 201 | } 202 | } 203 | else if (buffer_in[dd.b] >= ':') 204 | { 205 | // same as earler field in the same line 206 | uint32_t const unique_index = ascii_cstring_to_int(&buffer_in[dd.b], &buffer_in[dd.i++]); 207 | assert(unique_index < dd.unique_fields.size()); 208 | dd.field2uid.push_back(unique_index); 209 | std::string const & prior_field = dd.unique_fields[unique_index]; 210 | 211 | if (!is_region || dd.in_region) 212 | { 213 | buffer_out.insert(buffer_out.end(), prior_field.begin(), prior_field.end()); 214 | buffer_out.push_back(b_in); 215 | } 216 | } 217 | else 218 | { 219 | // add a new unique field and write field without any encoding 220 | auto insert_it = dd.map_to_unique_fields.insert( 221 | std::pair(std::piecewise_construct, 222 | std::forward_as_tuple(&buffer_in[dd.b], dd.i - dd.b), 223 | std::forward_as_tuple(dd.unique_fields.size()))); 224 | 225 | assert(insert_it.second == true); 226 | dd.field2uid.push_back(dd.unique_fields.size()); 227 | dd.unique_fields.push_back(insert_it.first->first); 228 | ++dd.i; 229 | 230 | if (!is_region || dd.in_region) 231 | buffer_out.insert(buffer_out.end(), &buffer_in[dd.b], &buffer_in[dd.i]); 232 | } 233 | 234 | // assert((field_idx + 1) == static_cast(dd.field2uid.size())); 235 | } 236 | 237 | assert(b_in == buffer_in[dd.i - 1]); 238 | dd.b = dd.i; 239 | 240 | if (b_in == '\n') 241 | dd.field = 0; 242 | else 243 | ++dd.field; 244 | } // ends inner loop 245 | 246 | if (dd.field >= 3 && dd.field < N_FIELDS_SITE_DATA) 247 | { 248 | // write field without updating the field index 249 | if (!is_region || dd.in_region) 250 | buffer_out.insert(buffer_out.end(), &buffer_in[dd.b], &buffer_in[dd.i]); 251 | 252 | if (dd.field == 4) /*store the number of ALT alleles if we are in the ALT field*/ 253 | dd.stored_alt = std::count(&buffer_in[dd.b], &buffer_in[dd.i], ','); 254 | 255 | dd.i = 0; 256 | } 257 | else 258 | { 259 | // write data to the beginning of the input buffer 260 | 
std::copy(&buffer_in[dd.b], &buffer_in[dd.i], &buffer_in[0]); 261 | dd.i = dd.i - dd.b; 262 | } 263 | 264 | dd.b = 0; 265 | dd.in_size = dd.i; 266 | resize_input_buffer(buffer_in, dd.i); 267 | } 268 | 269 | //! Decode an encoded popVCF 270 | void decode_file(std::string const & popvcf_fn, bool const is_bgzf_input); 271 | 272 | //! Decode a region with a bgzf file and tabix index. 273 | void decode_region(std::string const & popvcf_fn, std::string const & region); 274 | 275 | } // namespace popvcf 276 | -------------------------------------------------------------------------------- /src/encode.cpp: -------------------------------------------------------------------------------- 1 | #include "encode.hpp" 2 | 3 | #include // std::array 4 | #include 5 | #include // std::cerr 6 | #include // std::string 7 | #include 8 | 9 | #include // phmap::flat_hash_map 10 | 11 | #include "io.hpp" 12 | #include "sequence_utils.hpp" // int_to_ascii 13 | 14 | #include "htslib/bgzf.h" 15 | 16 | class BGZF; 17 | 18 | namespace popvcf 19 | { 20 | void encode_file(std::string const & input_fn, 21 | bool const is_bgzf_input, 22 | std::string const & output_fn, 23 | std::string const & output_mode, 24 | bool const is_bgzf_output, 25 | int const compression_threads) 26 | { 27 | Tenc_array_buf buffer_in; // input buffer 28 | std::vector buffer_out; // output buffer 29 | EncodeData ed; // encode data struct 30 | 31 | /// Open input file streams 32 | popvcf::bgzf_ptr in_bgzf(nullptr, popvcf::close_bgzf); // bgzf input stream 33 | popvcf::file_ptr in_vcf(nullptr, popvcf::close_vcf_nop); // vcf input stream 34 | 35 | if (is_bgzf_input) 36 | in_bgzf = popvcf::open_bgzf(input_fn, "r"); 37 | else 38 | in_vcf = popvcf::open_vcf(input_fn, "r"); 39 | 40 | /// Open output file streams 41 | popvcf::bgzf_ptr out_bgzf(nullptr, popvcf::close_bgzf); // bgzf output stream 42 | popvcf::file_ptr out_vcf(nullptr, popvcf::close_vcf_nop); // vcf output stream 43 | 44 | if (is_bgzf_output) 45 | { 46 | out_bgzf = 
popvcf::open_bgzf(output_fn.c_str(), output_mode.c_str()); 47 | 48 | if (compression_threads > 1) 49 | bgzf_mt(out_bgzf.get(), compression_threads, 256); 50 | } 51 | else 52 | { 53 | out_vcf = popvcf::open_vcf(output_fn, output_mode); 54 | } 55 | 56 | /// Read first buffer of input data 57 | if (is_bgzf_input) 58 | ed.in_size = bgzf_read(in_bgzf.get(), buffer_in.data(), ENC_BUFFER_SIZE); 59 | else 60 | ed.in_size = fread(buffer_in.data(), 1, ENC_BUFFER_SIZE, in_vcf.get()); 61 | 62 | long new_bytes = ed.in_size; 63 | 64 | // loop until all data has been read 65 | while (new_bytes != 0) 66 | { 67 | // encode the input buffer and write to output buffer 68 | encode_buffer(buffer_out, buffer_in, ed); 69 | 70 | // write output buffer 71 | if (out_bgzf != nullptr) 72 | popvcf::write_bgzf(out_bgzf.get(), buffer_out.data(), buffer_out.size()); 73 | else 74 | fwrite(buffer_out.data(), 1, buffer_out.size(), out_vcf.get()); // write output buffer 75 | 76 | buffer_out.resize(0); 77 | new_bytes = -static_cast(ed.in_size); 78 | 79 | // attempt to read more data from input 80 | if (is_bgzf_input) 81 | ed.in_size += bgzf_read(in_bgzf.get(), buffer_in.data() + ed.in_size, ENC_BUFFER_SIZE - ed.in_size); 82 | else 83 | ed.in_size += fread(buffer_in.data() + ed.in_size, 1, ENC_BUFFER_SIZE - ed.in_size, in_vcf.get()); 84 | 85 | new_bytes += ed.in_size; 86 | } 87 | 88 | if (ed.in_size != 0) 89 | { 90 | std::cerr << "[popvcf] WARNING: Unexpected ending of the VCF data, possibly the file is truncated.\n"; 91 | 92 | // write output buffer 93 | if (out_bgzf != nullptr) 94 | popvcf::write_bgzf(out_bgzf.get(), buffer_in.data(), ed.in_size); 95 | else 96 | fwrite(buffer_in.data(), 1, ed.in_size, out_vcf.get()); // write output buffer 97 | } 98 | } 99 | 100 | } // namespace popvcf 101 | -------------------------------------------------------------------------------- /src/encode.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 
4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "sequence_utils.hpp" 12 | 13 | namespace popvcf 14 | { 15 | class EncodeData 16 | { 17 | public: 18 | std::size_t field{0}; //!< current vcf field. 19 | std::size_t in_size{0}; //!< Size of inut buffer. 20 | std::size_t b{0}; //!< begin index in buffer_in 21 | std::size_t i{b}; //!< index in buffer_in 22 | bool header_line{true}; //!< True iff in header line 23 | 24 | /* Data fields from previous line. */ 25 | std::vector prev_unique_fields{}; 26 | std::vector prev_field2uid{}; 27 | phmap::flat_hash_map prev_map_to_unique_fields{}; 28 | 29 | /* Data fields from current line. */ 30 | std::string contig{}; 31 | int64_t pos{0}; 32 | int32_t stored_alt{0}; 33 | int32_t n_alt{-1}; 34 | std::vector unique_fields{}; 35 | std::vector field2uid{}; 36 | phmap::flat_hash_map map_to_unique_fields{}; 37 | 38 | /* Data fields for the next line. */ 39 | std::string next_contig{}; 40 | int64_t next_pos{0}; 41 | 42 | inline void clear_line(int64_t next_pos, int32_t next_n_alt) 43 | { 44 | next_n_alt += stored_alt; 45 | stored_alt = 0; 46 | 47 | if (next_contig != contig || (next_pos / 10000) != (pos / 10000)) 48 | { 49 | /// Previous line is not available, clear values 50 | prev_unique_fields.resize(0); 51 | prev_field2uid.resize(0); 52 | prev_map_to_unique_fields.clear(); 53 | } 54 | else if (next_n_alt == n_alt) 55 | { 56 | /// Only swap out from this line if we have the same amount of alts 57 | std::swap(prev_unique_fields, unique_fields); 58 | std::swap(prev_field2uid, field2uid); 59 | std::swap(prev_map_to_unique_fields, map_to_unique_fields); 60 | } 61 | 62 | /// Clear data from this line for the next 63 | contig = next_contig; 64 | pos = next_pos; 65 | n_alt = next_n_alt; 66 | unique_fields.resize(0); 67 | field2uid.resize(0); 68 | map_to_unique_fields.clear(); 69 | } 70 | }; 71 | 72 | template 73 | inline void set_input_size(Tbuffer_in & buffer_in, EncodeData & ed) 74 | { 75 | ed.in_size 
= buffer_in.size(); 76 | } 77 | 78 | template <> 79 | inline void set_input_size(Tenc_array_buf & /*buffer_in*/, EncodeData & /*ed*/) 80 | { 81 | // Do nothing. 82 | // NOTE: dd.in_size must be set prior to calling decode_buffer in arrays 83 | } 84 | 85 | //! Encodes an input buffer. Output is written in \a buffer_out. 86 | template 87 | inline void encode_buffer(Tbuffer_out & buffer_out, Tbuffer_in & buffer_in, EncodeData & ed) 88 | { 89 | set_input_size(buffer_in, ed); 90 | buffer_out.reserve(ENC_BUFFER_SIZE); 91 | std::size_t constexpr N_FIELDS_SITE_DATA{9}; // how many fields of the VCF contains site data 92 | int64_t next_pos{0}; 93 | 94 | while (ed.i < ed.in_size) 95 | { 96 | char const b_in = buffer_in[ed.i]; 97 | 98 | if (b_in != '\t' && b_in != '\n') 99 | { 100 | ++ed.i; 101 | continue; // we are in a vcf field 102 | } 103 | 104 | if (ed.field == 0) /*CHROM field*/ 105 | { 106 | // check if in header line and store contig 107 | ed.header_line = buffer_in[ed.b] == '#'; // check if in header line 108 | 109 | if (not ed.header_line) 110 | ed.next_contig.assign(&buffer_in[ed.b], ed.i - ed.b); 111 | } 112 | else if (not ed.header_line) 113 | { 114 | if (ed.field == 1) /*POS field*/ 115 | { 116 | std::from_chars(&buffer_in[ed.b], &buffer_in[ed.i], next_pos); 117 | } 118 | else if (ed.field == 4) /*ALT field*/ 119 | { 120 | int32_t next_n_alt = std::count(&buffer_in[ed.b], &buffer_in[ed.i], ','); 121 | ed.clear_line(next_pos, next_n_alt); 122 | } 123 | } 124 | 125 | if (ed.header_line || ed.field < N_FIELDS_SITE_DATA) 126 | { 127 | ++ed.i; // adds '\t' or '\n' and then insert the field to the output buffer 128 | buffer_out.insert(buffer_out.end(), &buffer_in[ed.b], &buffer_in[ed.i]); 129 | } 130 | else 131 | { 132 | assert(buffer_in[ed.b] >= '!'); 133 | assert(buffer_in[ed.b] <= '9'); 134 | 135 | // check if it is in the current line 136 | auto insert_it = ed.map_to_unique_fields.insert( 137 | std::pair(std::piecewise_construct, 138 | 
std::forward_as_tuple(&buffer_in[ed.b], ed.i - ed.b), 139 | std::forward_as_tuple(ed.unique_fields.size()))); 140 | 141 | long const field_idx = ed.field - N_FIELDS_SITE_DATA; 142 | assert(field_idx == static_cast(ed.field2uid.size())); 143 | 144 | if (insert_it.second == true) 145 | { 146 | ed.field2uid.push_back(ed.unique_fields.size()); 147 | ed.unique_fields.emplace_back(&buffer_in[ed.b], ed.i - ed.b); 148 | 149 | if (field_idx < static_cast(ed.prev_field2uid.size()) && 150 | ed.prev_unique_fields[ed.prev_field2uid[field_idx]] == ed.unique_fields[insert_it.first->second]) 151 | { 152 | /* Case 0: unique and same as above. */ 153 | buffer_out.push_back('$'); 154 | 155 | if (b_in == '\n') /* never skip newline */ 156 | buffer_out.push_back('\n'); 157 | 158 | ++ed.i; 159 | } 160 | else 161 | { 162 | // check if it is in the previous line 163 | auto prev_find_it = ed.prev_map_to_unique_fields.find(insert_it.first->first); 164 | 165 | if (prev_find_it == ed.prev_map_to_unique_fields.end()) 166 | { 167 | /* Case 1: Field is unique in the current line and is not in the previous line. */ 168 | ++ed.i; // adds '\t' or '\n' 169 | buffer_out.insert(buffer_out.end(), &buffer_in[ed.b], &buffer_in[ed.i]); 170 | } 171 | else 172 | { 173 | /* Case 2: Field is unique in the current line but identical to a field in the previous line. */ 174 | buffer_out.push_back('%'); 175 | popvcf::to_chars(prev_find_it->second, buffer_out); 176 | buffer_out.push_back(buffer_in[ed.i]); // write '\t' or '\n' 177 | ++ed.i; 178 | } 179 | } 180 | } 181 | else 182 | { 183 | ed.field2uid.push_back(insert_it.first->second); 184 | 185 | if (field_idx < static_cast(ed.prev_field2uid.size()) && 186 | ed.prev_unique_fields[ed.prev_field2uid[field_idx]] == ed.unique_fields[insert_it.first->second]) 187 | { 188 | /* Case 3: Field is not unique and same has the field above. 
*/ 189 | buffer_out.push_back('&'); 190 | 191 | if (b_in == '\n') /* never skip newline */ 192 | buffer_out.push_back('\n'); 193 | 194 | ++ed.i; 195 | } 196 | else 197 | { 198 | /* Case 4: Field is a duplicate in the current line. */ 199 | popvcf::to_chars(insert_it.first->second, buffer_out); 200 | buffer_out.push_back(buffer_in[ed.i]); // write '\t' or '\n' 201 | ++ed.i; 202 | } 203 | } 204 | 205 | assert((field_idx + 1) == static_cast(ed.field2uid.size())); 206 | assert(ed.field2uid[0] == 0); 207 | } 208 | 209 | assert(b_in == buffer_in[ed.i - 1]); // i should have been already incremented here 210 | ed.b = ed.i; // set begin index of next field 211 | 212 | // check if we need to clear line or increment field 213 | if (b_in == '\n') 214 | ed.field = 0; // reset field index 215 | else 216 | ++ed.field; 217 | } // ends inner loop 218 | 219 | if (ed.field >= 3 && ed.field < N_FIELDS_SITE_DATA) 220 | { 221 | // write the data even if the field is not complete 222 | buffer_out.insert(buffer_out.end(), &buffer_in[ed.b], &buffer_in[ed.i]); 223 | 224 | if (ed.field == 4) /*ALT field*/ 225 | ed.stored_alt = std::count(&buffer_in[ed.b], &buffer_in[ed.i], ','); 226 | 227 | ed.i = 0; 228 | } 229 | else 230 | { 231 | // copy the remaining data to the beginning of the input buffer 232 | std::copy(&buffer_in[ed.b], &buffer_in[ed.i], &buffer_in[0]); 233 | ed.i = ed.i - ed.b; 234 | } 235 | 236 | ed.b = 0; 237 | ed.in_size = ed.i; 238 | resize_input_buffer(buffer_in, ed.i); 239 | } 240 | 241 | //! 
Encode a gzipped file and write to stdout 242 | void encode_file(std::string const & input_fn, 243 | bool const is_bgzf_input, 244 | std::string const & output_fn, 245 | std::string const & output_mode, 246 | bool const is_bgzf_output, 247 | int const compression_threads); 248 | 249 | } // namespace popvcf 250 | -------------------------------------------------------------------------------- /src/in.constants.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /*! 3 | * \file in.constants.hpp 4 | * \brief Global constants, macros and configurations set by CMake. 5 | */ 6 | 7 | #include 8 | #include // strrchr 9 | #include 10 | #include 11 | 12 | // clang-format off 13 | // CMake variables 14 | #define popvcf_VERSION_MAJOR @popvcf_VERSION_MAJOR@ 15 | #define popvcf_VERSION_MINOR @popvcf_VERSION_MINOR@ 16 | #define popvcf_VERSION_PATCH @popvcf_VERSION_PATCH@ 17 | #define popvcf_SOURCE_DIRECTORY "@PROJECT_SOURCE_DIR@" 18 | #define popvcf_BINARY_DIRECTORY "@PROJECT_BINARY_DIR@" 19 | #define GIT_BRANCH "@GIT_BRANCH@" 20 | #define GIT_COMMIT_SHORT_HASH "@GIT_COMMIT_SHORT_HASH@" 21 | #define GIT_COMMIT_LONG_HASH "@GIT_COMMIT_LONG_HASH@" 22 | #define GIT_NUM_DIRTY_LINES "@GIT_NUM_DIRTY_LINES@" 23 | // clang-format on 24 | 25 | namespace popvcf 26 | { 27 | // Macros 28 | #define S1_popvcf_internal__(x) #x 29 | #define S2_popvcf_internal__(x) S1_popvcf_internal__(x) 30 | #define _HERE_ (strrchr("/" __FILE__ ":" S2_popvcf_internal__(__LINE__), '/') + 1) 31 | 32 | } // namespace popvcf 33 | -------------------------------------------------------------------------------- /src/io.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "htslib/bgzf.h" 9 | #include "htslib/hts.h" 10 | #include "htslib/kseq.h" 11 | #include "htslib/tbx.h" 12 | 13 | class BGZF; 14 | 15 | namespace popvcf 16 | { 17 | using 
file_ptr = std::unique_ptr; //!< Type definition for a smart FILE pointer. 18 | using bgzf_ptr = std::unique_ptr; //!< Type definition for a smart BGZF pointer. 19 | using hts_file_ptr = std::unique_ptr; //!< Type definition for a smart htsFile pointer. 20 | using tbx_t_ptr = std::unique_ptr; //!< Type definition for a smart tbx_t pointer. 21 | using hts_itr_t_ptr = std::unique_ptr; //!< Type definition for a hts_itr_t pointer. 22 | 23 | //! Closes a VCF file stream, i.e. stdout/stdin 24 | inline void close_vcf_nop(FILE *) 25 | { 26 | } 27 | 28 | //! Closes VCF file 29 | inline void close_vcf(FILE * f) 30 | { 31 | if (f != nullptr) 32 | { 33 | fclose(f); 34 | } 35 | } 36 | 37 | //! Opens a VCF file either from filename or stdout/stdin 38 | inline file_ptr open_vcf(std::string const & fn, std::string const & filemode) 39 | { 40 | if (fn == "-") 41 | { 42 | if (filemode == "r") 43 | return file_ptr(stdin, popvcf::close_vcf_nop); 44 | else 45 | return file_ptr(stdout, popvcf::close_vcf_nop); 46 | } 47 | 48 | file_ptr in_vcf(fopen(fn.c_str(), filemode.c_str()), popvcf::close_vcf); 49 | 50 | if (in_vcf == nullptr) 51 | { 52 | std::cerr << "[popvcf] ERROR: Opening VCF file " << fn << std::endl; 53 | std::exit(1); 54 | } 55 | 56 | return in_vcf; 57 | } 58 | 59 | inline void write_bgzf(BGZF * bgzf, const char * data, std::size_t const size) 60 | { 61 | assert(bgzf != nullptr); 62 | std::size_t const written_bytes = bgzf_write(bgzf, data, size); 63 | 64 | if (written_bytes != size) 65 | { 66 | std::cerr << "[popvcf] WARNING: Problem writing bgzf data. " << written_bytes << " bytes written but expected " 67 | << size << " bytes.\n"; 68 | std::exit(1); 69 | } 70 | } 71 | 72 | inline void close_bgzf(BGZF * bgzf) 73 | { 74 | if (bgzf != nullptr) 75 | { 76 | if (bgzf_close(bgzf) != 0) 77 | { 78 | std::cerr << "[popvcf] ERROR: Failed closing bgzf file." 
<< std::endl; 79 | std::exit(1); 80 | } 81 | } 82 | } 83 | 84 | inline bgzf_ptr open_bgzf(std::string const & fn, std::string const & filemode) 85 | { 86 | bgzf_ptr in_bgzf(bgzf_open(fn.c_str(), filemode.c_str()), popvcf::close_bgzf); 87 | 88 | if (in_bgzf == nullptr) 89 | { 90 | std::cerr << "[popvcf] ERROR: Opening bgzf file " << fn << std::endl; 91 | std::exit(1); 92 | } 93 | 94 | return in_bgzf; 95 | } 96 | 97 | inline void close_hts_file(htsFile * f) 98 | { 99 | if (f != nullptr) 100 | { 101 | if (hts_close(f) != 0) 102 | { 103 | std::cerr << "[popvcf] ERROR: Failed closing hts file." << std::endl; 104 | std::exit(1); 105 | } 106 | } 107 | } 108 | 109 | inline hts_file_ptr open_hts_file(const char * fn, const char * fm) 110 | { 111 | hts_file_ptr ptr(hts_open(fn, fm), popvcf::close_hts_file); 112 | 113 | if (ptr == nullptr) 114 | { 115 | std::cerr << "ERROR: Could not open file " << fn << std::endl; 116 | std::exit(1); 117 | } 118 | 119 | return ptr; 120 | } 121 | 122 | inline void close_tbx_t(tbx_t * f) 123 | { 124 | if (f != nullptr) 125 | tbx_destroy(f); 126 | } 127 | 128 | inline tbx_t_ptr open_tbx_t(const char * fn) 129 | { 130 | tbx_t_ptr ptr(tbx_index_load(fn), popvcf::close_tbx_t); 131 | 132 | if (ptr == nullptr) 133 | { 134 | std::cerr << "[popvcf] ERROR: Could not open file " << fn << std::endl; 135 | std::exit(1); 136 | } 137 | 138 | return ptr; 139 | } 140 | 141 | inline void close_hts_itr_t(hts_itr_t * f) 142 | { 143 | if (f != nullptr) 144 | tbx_itr_destroy(f); 145 | } 146 | 147 | inline hts_itr_t_ptr open_hts_itr_t(tbx_t * tbx, const char * region) 148 | { 149 | hts_itr_t_ptr ptr(tbx_itr_querys(tbx, region), popvcf::close_hts_itr_t); 150 | 151 | if (ptr == nullptr) 152 | std::cerr << "[popvcf] WARNING: No records found in region " << region << "\n"; 153 | 154 | return ptr; 155 | } 156 | 157 | inline void free_kstring_t(kstring_t * str) 158 | { 159 | if (str->s != NULL) 160 | free(str->s); 161 | } 162 | } // namespace popvcf 163 | 
-------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "decode.hpp" 8 | #include "encode.hpp" 9 | 10 | #include 11 | 12 | namespace popvcf 13 | { 14 | int subcmd_encode(paw::Parser & parser) 15 | { 16 | std::string vcf_fn{"-"}; 17 | std::string input_type{"g"}; 18 | std::string output_fn{"-"}; 19 | std::string output_mode{"w"}; 20 | std::string output_type{"v"}; 21 | int output_compress_level{-1}; 22 | int compression_threads{1}; 23 | 24 | try 25 | { 26 | parser.parse_positional_argument(vcf_fn, 27 | "VCF", 28 | "Encode this VCF (or VCF.gz). If not set, read VCF from standard input."); 29 | 30 | parser.parse_option(compression_threads, 31 | '@', 32 | "threads", 33 | "Number of output file compression threads (only used if output type is \"z\").", 34 | "NUM"); 35 | 36 | parser.parse_option(input_type, 37 | 'I', 38 | "input-type", 39 | "Input type. v uncompressed VCF, z bgzipped VCF, g guess based on filename.", 40 | "v|z|g"); 41 | 42 | parser.parse_option(output_fn, 43 | 'o', 44 | "output", 45 | "Output will be written to this path. If '-', then write instead to standard output.", 46 | "output.vcf[.gz]"); 47 | 48 | parser.parse_option(output_compress_level, 'l', "output-compress-level", "Output file compression level.", "LEVEL"); 49 | 50 | parser.parse_option(output_type, 'O', "output-type", "Output type. 
v uncompressed VCF, z bgzipped VCF.", "v|z"); 51 | parser.finalize(); 52 | } 53 | catch (paw::exception::missing_positional_argument &) 54 | { 55 | output_fn = "-"; 56 | } 57 | 58 | if (output_compress_level >= 0) 59 | output_mode += std::to_string(std::min(9, output_compress_level)); 60 | 61 | long const n = vcf_fn.size(); 62 | 63 | if (n > 3 && vcf_fn[n - 2] == 'g' && vcf_fn[n - 1] == 'z') 64 | input_type = "z"; 65 | 66 | encode_file(vcf_fn, input_type == "z", output_fn, output_mode, output_type == "z", compression_threads); 67 | return 0; 68 | } 69 | 70 | int subcmd_decode(paw::Parser & parser) 71 | { 72 | std::string popvcf_fn{}; 73 | std::string input_type{"g"}; 74 | std::string region{}; 75 | 76 | try 77 | { 78 | parser.parse_option(input_type, 79 | 'I', 80 | "input-type", 81 | "Input type. v uncompressed VCF, z bgzipped VCF, g guess based on filename.", 82 | "v|z|g"); 83 | parser.parse_option(region, 'r', "region", "Fetch region/interval to decode. Requires .tbi index.", "chrN:A-B"); 84 | parser.parse_positional_argument(popvcf_fn, "popVCF", "Decode this popVCF. 
Use '-' for standard input."); 85 | parser.finalize(); 86 | } 87 | catch (paw::exception::missing_positional_argument &) 88 | { 89 | popvcf_fn = "-"; 90 | } 91 | 92 | long const n = popvcf_fn.size(); 93 | 94 | if (input_type == "g" && n > 3 && popvcf_fn[n - 2] == 'g' && popvcf_fn[n - 1] == 'z') 95 | input_type = "z"; 96 | 97 | if (region.empty()) 98 | decode_file(popvcf_fn, input_type == "z"); 99 | else 100 | decode_region(popvcf_fn, region); 101 | 102 | return 0; 103 | } 104 | 105 | } // namespace popvcf 106 | 107 | int main(int argc, char ** argv) 108 | { 109 | #ifndef NDEBUG 110 | std::ios_base::sync_with_stdio(false); 111 | #endif // NDEBUG 112 | paw::Parser parser(argc, argv); 113 | parser.set_name("popVCF"); 114 | parser.set_version(popvcf_VERSION_MAJOR, popvcf_VERSION_MINOR, popvcf_VERSION_PATCH); 115 | int ret{0}; 116 | 117 | try 118 | { 119 | std::string subcmd{}; 120 | 121 | parser.add_subcommand("encode", "Encode a VCF into a popVCF."); 122 | parser.add_subcommand("decode", "Decode a popVCF into a VCF."); 123 | 124 | parser.parse_subcommand(subcmd); 125 | 126 | if (subcmd == "encode") 127 | { 128 | ret = popvcf::subcmd_encode(parser); 129 | } 130 | else if (subcmd == "decode") 131 | { 132 | ret = popvcf::subcmd_decode(parser); 133 | } 134 | else if (subcmd.size() == 0) 135 | { 136 | parser.finalize(); 137 | ret = 0; 138 | } 139 | else 140 | { 141 | parser.finalize(); 142 | ret = 1; 143 | } 144 | } 145 | catch (paw::exception::help const & e) 146 | { 147 | std::cout << e.what(); 148 | return 0; 149 | } 150 | catch (std::exception const & e) 151 | { 152 | std::cerr << e.what(); 153 | return 1; 154 | } 155 | 156 | return ret; 157 | } 158 | -------------------------------------------------------------------------------- /src/sequence_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence_utils.hpp" 2 | 3 | #include // std::find 4 | #include // int32_t 5 | #include // std::string 6 | #include // 
std::string_view 7 | #include // std::vector 8 | 9 | namespace popvcf 10 | { 11 | template 12 | std::vector split_string(Tstring const & str, char const delimiter) 13 | { 14 | std::vector output; 15 | std::string_view strv(str); 16 | auto first = strv.cbegin(); 17 | 18 | while (first != strv.cend()) 19 | { 20 | auto const second = std::find(first, strv.cend(), delimiter); 21 | 22 | if (first != second) 23 | { 24 | std::size_t const pos = std::distance(strv.cbegin(), first); 25 | output.emplace_back(strv.substr(pos, second - first)); 26 | } 27 | 28 | if (second == strv.cend()) 29 | break; 30 | 31 | first = std::next(second); 32 | } 33 | 34 | return output; 35 | } 36 | 37 | template std::vector split_string(std::string const & str, char const delimiter); 38 | template std::vector split_string(std::string_view const & str, char const delimiter); 39 | 40 | } // namespace popvcf 41 | -------------------------------------------------------------------------------- /src/sequence_utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace popvcf 12 | { 13 | uint32_t constexpr CHAR_SET_SIZE = 69; 14 | uint32_t constexpr CHAR_SET_SIZE_2BYTES = CHAR_SET_SIZE * CHAR_SET_SIZE; 15 | char constexpr CHAR_SET_MIN = ':'; 16 | 17 | long constexpr ENC_BUFFER_SIZE{4 * 65536}; //!< Buffer size of arrays when encoding 18 | long constexpr DEC_BUFFER_SIZE{8 * 65536}; //!< Buffer size of arrays when decoding 19 | 20 | //! Data type of an encoding array buffer 21 | using Tenc_array_buf = std::array; 22 | 23 | //! 
Data type of a decoding array buffer 24 | using Tdec_array_buf = std::array; 25 | 26 | inline char int_to_ascii(uint32_t in) 27 | { 28 | assert(in < CHAR_SET_SIZE); 29 | 30 | return CHAR_SET_MIN + in; 31 | } 32 | 33 | inline uint32_t ascii_to_int(char in) 34 | { 35 | assert(in >= CHAR_SET_MIN); 36 | 37 | return static_cast(in) - CHAR_SET_MIN; 38 | } 39 | 40 | inline std::string int_to_ascii_string(uint32_t in) 41 | { 42 | std::string str; 43 | 44 | while (in >= CHAR_SET_SIZE) 45 | { 46 | uint32_t rem = in % CHAR_SET_SIZE; 47 | in = in / CHAR_SET_SIZE; 48 | str.push_back(int_to_ascii(rem)); 49 | } 50 | 51 | assert(in < CHAR_SET_SIZE); 52 | str.push_back(int_to_ascii(in)); 53 | return str; 54 | } 55 | 56 | inline uint32_t ascii_string_view_to_int(std::string_view in) 57 | { 58 | uint32_t const in_size = in.size(); 59 | assert(in_size > 0); 60 | uint32_t out{0}; 61 | uint32_t pow{1}; 62 | 63 | for (uint32_t c{0}; c < in_size; ++c) 64 | { 65 | out += pow * ascii_to_int(in[c]); 66 | pow *= CHAR_SET_SIZE; 67 | } 68 | 69 | return out; 70 | } 71 | 72 | inline uint32_t ascii_cstring_to_int(char const * b, char const * e) 73 | { 74 | uint32_t out{ascii_to_int(*b)}; 75 | ++b; 76 | uint32_t pow{CHAR_SET_SIZE}; 77 | 78 | while (b != e) 79 | { 80 | out += pow * ascii_to_int(*b); 81 | pow *= CHAR_SET_SIZE; 82 | ++b; 83 | } 84 | 85 | return out; 86 | } 87 | 88 | template 89 | inline void to_chars(Tint char_val, Tbuffer_out & buffer_out) 90 | { 91 | std::size_t constexpr ARR_SIZE{6}; 92 | std::array a; 93 | std::size_t i{0}; 94 | 95 | while (char_val >= CHAR_SET_SIZE) 96 | { 97 | Tint rem = char_val % CHAR_SET_SIZE; 98 | char_val = char_val / CHAR_SET_SIZE; 99 | assert(i < ARR_SIZE); 100 | a[i++] = int_to_ascii(rem); 101 | } 102 | 103 | assert(char_val < CHAR_SET_SIZE); 104 | assert(i < ARR_SIZE); 105 | a[i++] = int_to_ascii(char_val); 106 | buffer_out.insert(buffer_out.end(), a.data(), a.data() + i); 107 | } 108 | 109 | template 110 | std::vector split_string(Tstring const & str, 
char const delimiter); 111 | 112 | template 113 | long get_vcf_pos(Tit begin, Tit end) 114 | { 115 | auto find_it1 = std::find(begin, end, '\t'); 116 | auto find_it2 = std::find(find_it1 + 1, end, '\t'); 117 | long vcf_pos{0}; 118 | std::from_chars(find_it1 + 1, find_it2, vcf_pos); 119 | return vcf_pos; 120 | } 121 | 122 | template 123 | inline void resize_input_buffer(Tbuffer_in & buffer_in, std::size_t const new_size) 124 | { 125 | buffer_in.resize(new_size); 126 | } 127 | 128 | template <> 129 | inline void resize_input_buffer(Tdec_array_buf & /*buffer_in*/, std::size_t const /*new_size*/) 130 | { 131 | // Do nothing. Arrays are not resized 132 | } 133 | 134 | template <> 135 | inline void resize_input_buffer(Tenc_array_buf & /*buffer_in*/, std::size_t const /*new_size*/) 136 | { 137 | // Do nothing. Arrays are not resized 138 | } 139 | 140 | } // namespace popvcf 141 | -------------------------------------------------------------------------------- /test/create_test_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | n=100000 4 | echo "##fileformat=VCFv4.2" 5 | echo "##contig=" 6 | echo "##contig=" 7 | echo "##FORMAT=" 8 | echo "##FORMAT=" 9 | echo "##FORMAT=" 10 | echo -e -n "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT" 11 | 12 | awk -v n=${n} 'BEGIN{ 13 | for (i = 1; i <= n; i++){ 14 | printf "\t%08d", i 15 | } 16 | }' 17 | 18 | awk -v n=${n} -v n_alts=3 'BEGIN{ 19 | alt="AC" 20 | printf "\nchr1\t"n+2 21 | printf "\t.\tA\tA" 22 | for (a = 1; a <= n_alts; a++){ 23 | alt=alt"C" 24 | printf ","alt 25 | } 26 | printf "\t0\t.\t.\tGT:AD:PL" 27 | for (i = 1; i <= n; i++){ 28 | printf "\t0/0:"n_alts*10 29 | for (a = 1; a <= n_alts; a++){ 30 | printf ","a % 10 31 | } 32 | printf ":0" 33 | for (a = 2; a <= n_alts + 1; a++){ 34 | for (b = 1; b <= a; b++){ 35 | printf ","a+b-2 36 | } 37 | } 38 | } 39 | printf "\n"}' 40 | 41 | awk -v n=${n} -v n_alts=3 'BEGIN{ 42 | alt="AC" 43 | printf 
"chr1\t"n+2 44 | printf "\t.\tA\tA" 45 | for (a = 1; a <= n_alts; a++){ 46 | alt=alt"C" 47 | printf ","alt 48 | } 49 | printf "\t0\t.\t.\tGT:AD:PL" 50 | for (i = 1; i <= n; i++){ 51 | printf "\t0/0:"n_alts*10 52 | for (a = 1; a <= n_alts; a++){ 53 | printf ","a % 10 54 | } 55 | printf ":0" 56 | for (a = 2; a <= n_alts + 1; a++){ 57 | for (b = 1; b <= a; b++){ 58 | printf ","a+b-2 59 | } 60 | } 61 | } 62 | printf "\n"}' 63 | 64 | awk -v n=${n} -v n_alts=3 'BEGIN{ 65 | alt="AC" 66 | printf "chr1\t"n+2 67 | printf "\t.\tA\tA" 68 | for (a = 1; a <= n_alts; a++){ 69 | alt=alt"C" 70 | printf ","alt 71 | } 72 | printf "\t0\t.\t.\tGT:AD:PL" 73 | for (i = 1; i <= n; i++){ 74 | printf "\t0/0:"n_alts*10 75 | for (a = 1; a <= n_alts; a++){ 76 | printf ","a % 10 77 | } 78 | printf ":0" 79 | for (a = 2; a <= n_alts + 1; a++){ 80 | for (b = 1; b <= a; b++){ 81 | printf ","a+b-2 82 | } 83 | } 84 | } 85 | printf "\n"}' 86 | 87 | 88 | awk -v n=${n} -v n_alts=7 'BEGIN{ 89 | alt="GT" 90 | printf "chr1\t"n+3 91 | printf "\t.\tG\tG" 92 | for (a = 1; a <= n_alts; a++){ 93 | alt=alt"T" 94 | printf ","alt 95 | } 96 | printf "\t0\t.\t.\tGT:AD:PL" 97 | for (i = 1; i <= n; i++){ 98 | printf "\t0/0:"n_alts*10 99 | for (a = 1; a <= n_alts; a++){ 100 | printf ","(a+i) % 10 101 | } 102 | printf ":0" 103 | for (a = 2; a <= n_alts + 1; a++){ 104 | for (b = 1; b <= a; b++){ 105 | printf ","a+i+b-2 106 | } 107 | } 108 | } 109 | printf "\n"}' 110 | 111 | awk -v n=${n} 'BEGIN{ 112 | printf "chr2\t9999\t.\tGTTTTTTT\tG\t0\t.\t.\tGT" 113 | for (i = 1; i <= n; i++){ 114 | printf "\t0/0" 115 | } 116 | printf "\n"}' 117 | 118 | awk -v n=${n} 'BEGIN{ 119 | printf "chr2\t10000\t.\tGTTTTTTT\tG\t0\t.\t.\tGT" 120 | for (i = 1; i <= n; i++){ 121 | printf "\t0/0" 122 | } 123 | printf "\n"}' 124 | 125 | awk -v n=${n} 'BEGIN{ 126 | printf "chr2\t10001\t.\tGTTTTTTT\tG\t0\t.\t.\tGT" 127 | for (i = 1; i <= n; i++){ 128 | printf "\t0/0" 129 | } 130 | printf "\n"}' 131 | 132 | awk -v n=${n} 'BEGIN{ 133 | printf 
"chr2\t1000000\t.\tGTTTTTTT\tG\t0\t.\t.\tGT" 134 | for (i = 1; i <= n; i++){ 135 | printf "\t0/0" 136 | } 137 | printf "\n"}' 138 | --------------------------------------------------------------------------------