├── .gitignore ├── CITATION.cff ├── CMakeLists.txt ├── CMakeModules ├── ConfigureCompilerClang.cmake └── ConfigureCompilerGcc.cmake ├── Dockerfile ├── LICENSE ├── README.md ├── data ├── SARS-CoV2 │ ├── SARS-CoV2.1k.fa.gz │ └── reads.fastq.gz ├── reads.fastq └── yeast.fasta ├── include ├── common │ ├── common.hpp │ └── seqidx.hpp ├── extender │ ├── extend_reads_dispatcher.hpp │ ├── extender_klib.hpp │ └── extender_ksw2.hpp └── ms │ ├── ms_pointers.hpp │ ├── ms_rle_string.hpp │ └── thresholds_ds.hpp ├── pipeline └── moni.in ├── src ├── CMakeLists.txt ├── build_seqidx.cpp ├── compress_dictionary.cpp ├── extend_klib.cpp ├── extend_ksw2.cpp ├── matching_statistics.cpp ├── mems.cpp └── rlebwt_ms_build.cpp ├── thirdparty └── CMakeLists.txt ├── utils.md └── utils ├── CMakeLists.txt └── split_fa.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # Folders 35 | build/* 36 | debug/* 37 | data/* 38 | !data/yeast.fasta -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | 2 | cff-version: 1.2.0 3 | message: "If you use this software, please cite it as below." 
4 | authors: 5 | - family-names: "Rossi" 6 | given-names: "Massimiliano" 7 | orcid: "https://orcid.org/0000-0002-3012-1394" 8 | title: "MONI: A Pangenomic Index for Finding Maximal Exact Matches" 9 | url: "https://github.com/maxrossi91/moni" 10 | preferred-citation: 11 | type: journal-paper 12 | authors: 13 | - family-names: "Rossi" 14 | given-names: "Massimiliano" 15 | orcid: "https://orcid.org/0000-0002-3012-1394" 16 | - family-names: "Oliva" 17 | given-names: "Marco" 18 | orcid: "https://orcid.org/0000-0003-0525-3114" 19 | - family-names: "Langmead" 20 | given-names: "Ben" 21 | orcid: "https://orcid.org/0000-0003-2437-1976" 22 | - family-names: "Gagie" 23 | given-names: "Travis" 24 | orcid: "https://orcid.org/0000-0003-3689-327X" 25 | - family-names: "Boucher" 26 | given-names: "Christina" 27 | orcid: "https://orcid.org/0000-0001-9509-9725" 28 | doi: 10.1089/cmb.2021.0290 29 | journal: "Journal of Computational Biology" 30 | start: 169 # First page number 31 | end: 187 # Last page number 32 | title: "MONI: A Pangenomic Index for Finding Maximal Exact Matches" 33 | year: 2022 34 | volume: 29 35 | number: 2 -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.15) 2 | set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules") 3 | 4 | # Set a default build type if none was specified 5 | # ------------------------------------------------------------------------------ 6 | if(NOT CMAKE_BUILD_TYPE) 7 | message(STATUS "Setting build type to 'Release' as none was specified.") 8 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build."
FORCE) 9 | endif() 10 | 11 | message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}") 12 | # About this project 13 | # ------------------------------------------------------------------------------ 14 | project(moni) 15 | set(VERSION_MAJOR "0") 16 | set(VERSION_MINOR "2") 17 | set(VERSION_PATCH "2") 18 | set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") 19 | message("version: ${VERSION}") 20 | 21 | set(PROJECT_VERSION ${VERSION}) 22 | 23 | # Set environment 24 | # ------------------------------------------------------------------------------ 25 | 26 | find_package(Git) 27 | if(GIT_FOUND) 28 | message("git found: ${GIT_EXECUTABLE}") 29 | else() 30 | message(WARNING "git not found. Cloning of submodules will not work.") 31 | endif() 32 | 33 | 34 | 35 | # Configure thirdparty 36 | # ------------------------------------------------------------------------------ 37 | set(CMAKE_INSTALL_INCLUDEDIR "include") # This is a hack because include(GNUInstallDirs) doesn't work 38 | 39 | add_subdirectory(thirdparty) 40 | 41 | 42 | # Configure the compiler with the appropriate flags 43 | # ------------------------------------------------------------------------------ 44 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") 45 | # using Clang 46 | include(ConfigureCompilerClang) 47 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 48 | # using GCC 49 | include(ConfigureCompilerGcc) 50 | else() 51 | message(FATAL_ERROR "Only the compiler gcc and clang are supported") 52 | endif() 53 | 54 | 55 | add_subdirectory(src) 56 | add_subdirectory(utils) 57 | 58 | # Configure pipeline for build folder 59 | set(USE_INSTALL_PATH False) 60 | configure_file(${PROJECT_SOURCE_DIR}/pipeline/moni.in ${PROJECT_BINARY_DIR}/moni @ONLY) 61 | 62 | # Configure pipeline for install folder 63 | set(USE_INSTALL_PATH True) 64 | configure_file(${PROJECT_SOURCE_DIR}/pipeline/moni.in
${PROJECT_BINARY_DIR}/moni.install @ONLY) 65 | 66 | 67 | install(TARGETS ms mems rlebwt_ms_build extend_ksw2 compress_dictionary build_seqidx TYPE RUNTIME) 68 | install(TARGETS SlpEncBuild pfp_thresholds pfp_thresholds64 TYPE RUNTIME) 69 | install(PROGRAMS ${PROJECT_BINARY_DIR}/moni.install RENAME moni TYPE BIN) 70 | # install(TARGETS ms rlebwt_ms_build extend_ksw2 DESTINATION bin) 71 | # install(PROGRAMS ${PROJECT_SOURCE_DIR}/pipeline/moni DESTINATION bin) 72 | 73 | 74 | # Configure cpack variables 75 | # ------------------------------------------------------------------------------ 76 | 77 | set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR}) 78 | set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR}) 79 | set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH}) 80 | set(CPACK_PACKAGE_VERSION "${VERSION}") 81 | 82 | include(InstallRequiredSystemLibraries) 83 | set(CPACK_GENERATOR "STGZ;TGZ;DEB") 84 | set(CPACK_SOURCE_GENERATOR "TGZ") 85 | set(CPACK_PACKAGE_VENDOR "University of Florida") 86 | set(CPACK_PACKAGE_CONTACT "maxrossi91@gmail.com") 87 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "MONI - Pangenomic index for finding MEMs") 88 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") 89 | set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md") 90 | set(CPACK_PACKAGE_NAME "${CMAKE_PROJECT_NAME}") 91 | set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-sources") 92 | 93 | set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Massimiliano Rossi") 94 | set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) 95 | set(CPACK_COMPONENTS_GROUPING ALL_COMPONENTS_IN_ONE) # Group all components 96 | # set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.3.1-6), libc6 (< 2.4)") 97 | set (CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) 98 | set(CPACK_DEB_COMPONENT_INSTALL YES) 99 | include(CPack) -------------------------------------------------------------------------------- /CMakeModules/ConfigureCompilerClang.cmake: -------------------------------------------------------------------------------- 1
| # ############################################################################## 2 | # Compiler configuration 3 | # @author Massimiliano Rossi 4 | # ############################################################################## 5 | 6 | # Add the basic compiler options 7 | add_compile_options("-std=c++11") 8 | # add_compile_options("-Werror") 9 | add_compile_options("-Wall") 10 | add_compile_options("-Wextra") 11 | add_compile_options("-Wcomment") 12 | add_compile_options("-Wformat=2") 13 | add_compile_options("-Wnonnull") 14 | add_compile_options("-Winit-self") 15 | add_compile_options("-Wmain") 16 | add_compile_options("-Wmissing-braces") 17 | add_compile_options("-Wmissing-include-dirs") 18 | add_compile_options("-Wparentheses") 19 | add_compile_options("-Wsequence-point") 20 | add_compile_options("-Wreturn-type") 21 | add_compile_options("-Wdate-time") 22 | add_compile_options("-Wswitch") 23 | add_compile_options("-Wswitch-default") 24 | add_compile_options("-Wswitch-enum") 25 | add_compile_options("-Wunused-function") 26 | add_compile_options("-Wunused-label") 27 | add_compile_options("-Wunused-local-typedefs") 28 | add_compile_options("-Wunused-parameter") 29 | add_compile_options("-Wunused-variable") 30 | add_compile_options("-Wunused-value") 31 | add_compile_options("-Wunused") 32 | add_compile_options("-Wuninitialized") 33 | add_compile_options("-Wunknown-pragmas") 34 | add_compile_options("-Wstrict-aliasing") 35 | add_compile_options("-Wstrict-overflow=5") 36 | add_compile_options("-Warray-bounds") 37 | add_compile_options("-Wundef") 38 | add_compile_options("-Wendif-labels") 39 | add_compile_options("-Wshadow") 40 | add_compile_options("-Wpointer-arith") 41 | add_compile_options("-Wtype-limits") 42 | add_compile_options("-Wcast-qual") 43 | add_compile_options("-Wwrite-strings") 44 | add_compile_options("-Wconversion") 45 | add_compile_options("-Wenum-compare") 46 | add_compile_options("-Wsign-compare") 47 | add_compile_options("-Waddress") 48 | 
add_compile_options("-Wattributes") 49 | add_compile_options("-Wbuiltin-macro-redefined") 50 | add_compile_options("-Wmissing-declarations") 51 | add_compile_options("-Wmissing-field-initializers") 52 | add_compile_options("-Wdeprecated") 53 | add_compile_options("-Wdeprecated-declarations") 54 | add_compile_options("-Woverflow") 55 | add_compile_options("-Wpacked") 56 | add_compile_options("-Winline") 57 | add_compile_options("-Wint-to-pointer-cast") 58 | add_compile_options("-Winvalid-pch") 59 | add_compile_options("-Wno-long-long") 60 | add_compile_options("-Wno-variadic-macros") 61 | add_compile_options("-Wvarargs") 62 | add_compile_options("-Wvla") 63 | add_compile_options("-Wvolatile-register-var") 64 | add_compile_options("-Wdisabled-optimization") 65 | add_compile_options("-Wstack-protector") 66 | add_compile_options("-Woverlength-strings") 67 | add_compile_options("-fvisibility=hidden") 68 | add_compile_options("-Wc++11-compat") 69 | add_compile_options("-Wconversion-null") 70 | add_compile_options("-Winherited-variadic-ctor") 71 | add_compile_options("-Winvalid-offsetof") 72 | add_compile_options("-pedantic") 73 | add_compile_options("-fno-gnu-keywords") 74 | add_compile_options("-Wctor-dtor-privacy") 75 | add_compile_options("-Wdelete-non-virtual-dtor") 76 | add_compile_options("-Wnarrowing") 77 | add_compile_options("-Wnon-virtual-dtor") 78 | add_compile_options("-Wreorder") 79 | add_compile_options("-Weffc++") 80 | add_compile_options("-Wold-style-cast") 81 | add_compile_options("-Wsign-promo") 82 | add_compile_options("-Wchar-subscripts") 83 | add_compile_options("-Wno-ignored-qualifiers") 84 | add_compile_options("-Wuninitialized") 85 | add_compile_options("-Wdiv-by-zero") 86 | add_compile_options("-Wfloat-equal") 87 | add_compile_options("-Wcast-align") 88 | add_compile_options("-Wempty-body") 89 | add_compile_options("-Wsizeof-pointer-memaccess") 90 | add_compile_options("-Wmultichar") 91 | add_compile_options("-fPIC") 92 | 93 | 94 | # Add the 
basic compiler options for debug version 95 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3") 96 | # Add the basic compiler options for release version 97 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ansi -march=native -funroll-loops -O3 -DNDEBUG") 98 | -------------------------------------------------------------------------------- /CMakeModules/ConfigureCompilerGcc.cmake: -------------------------------------------------------------------------------- 1 | # ############################################################################## 2 | # Compiler configuration 3 | # @author Massimiliano Rossi 4 | # ############################################################################## 5 | 6 | # Add the basic compiler options 7 | add_compile_options("-std=c++11") 8 | # add_compile_options("-Werror") 9 | add_compile_options("-Wall") 10 | add_compile_options("-Wextra") 11 | add_compile_options("-Wcomment") 12 | add_compile_options("-Wdouble-promotion") 13 | add_compile_options("-Wformat=2") 14 | add_compile_options("-Wnonnull") 15 | add_compile_options("-Winit-self") 16 | add_compile_options("-Wmain") 17 | add_compile_options("-Wmissing-braces") 18 | add_compile_options("-Wmissing-include-dirs") 19 | add_compile_options("-Wparentheses") 20 | add_compile_options("-Wsequence-point") 21 | add_compile_options("-Wreturn-local-addr") 22 | add_compile_options("-Wreturn-type") 23 | add_compile_options("-Wswitch") 24 | add_compile_options("-Wswitch-default") 25 | add_compile_options("-Wswitch-enum") 26 | add_compile_options("-Wunused-but-set-parameter") 27 | add_compile_options("-Wunused-but-set-variable") 28 | add_compile_options("-Wunused-function") 29 | add_compile_options("-Wunused-label") 30 | add_compile_options("-Wunused-local-typedefs") 31 | add_compile_options("-Wunused-parameter") 32 | add_compile_options("-Wunused-variable") 33 | add_compile_options("-Wunused-value") 34 | add_compile_options("-Wunused") 35 | 
add_compile_options("-Wuninitialized") 36 | add_compile_options("-Wunknown-pragmas") 37 | add_compile_options("-Wstrict-aliasing") 38 | add_compile_options("-Wstrict-overflow=5") 39 | add_compile_options("-Warray-bounds") 40 | add_compile_options("-Wundef") 41 | add_compile_options("-Wendif-labels") 42 | add_compile_options("-Wshadow") 43 | add_compile_options("-Wfree-nonheap-object") 44 | add_compile_options("-Wunsafe-loop-optimizations") 45 | add_compile_options("-Wpointer-arith") 46 | add_compile_options("-Wtype-limits") 47 | add_compile_options("-Wcast-qual") 48 | add_compile_options("-Wwrite-strings") 49 | add_compile_options("-Wclobbered") 50 | add_compile_options("-Wconversion") 51 | add_compile_options("-Wenum-compare") 52 | add_compile_options("-Wsign-compare") 53 | add_compile_options("-Wsign-conversion") 54 | add_compile_options("-Waddress") 55 | add_compile_options("-Wlogical-op") 56 | add_compile_options("-Wno-aggressive-loop-optimizations") 57 | add_compile_options("-Wattributes") 58 | add_compile_options("-Wbuiltin-macro-redefined") 59 | add_compile_options("-Wmissing-declarations") 60 | add_compile_options("-Wmissing-field-initializers") 61 | add_compile_options("-Wdeprecated") 62 | add_compile_options("-Wdeprecated-declarations") 63 | add_compile_options("-Woverflow") 64 | add_compile_options("-Wpacked") 65 | add_compile_options("-Wno-packed-bitfield-compat") 66 | add_compile_options("-Winline") 67 | add_compile_options("-Wint-to-pointer-cast") 68 | add_compile_options("-Winvalid-pch") 69 | add_compile_options("-Wno-long-long") 70 | add_compile_options("-Wno-variadic-macros") 71 | add_compile_options("-Wvarargs") 72 | add_compile_options("-Wvector-operation-performance") 73 | add_compile_options("-Wvla") 74 | add_compile_options("-Wvolatile-register-var") 75 | add_compile_options("-Wdisabled-optimization") 76 | add_compile_options("-Wstack-protector") 77 | add_compile_options("-Woverlength-strings") 78 | add_compile_options("-fvisibility=hidden") 
79 | add_compile_options("-Wc++11-compat") 80 | add_compile_options("-Wconversion-null") 81 | add_compile_options("-Wuseless-cast") 82 | add_compile_options("-Winherited-variadic-ctor") 83 | add_compile_options("-Winvalid-offsetof") 84 | add_compile_options("-Wvirtual-move-assign") 85 | add_compile_options("-pedantic") 86 | add_compile_options("-fno-gnu-keywords") 87 | add_compile_options("-foptional-diags") 88 | add_compile_options("-Wctor-dtor-privacy") 89 | add_compile_options("-Wdelete-non-virtual-dtor") 90 | add_compile_options("-Wliteral-suffix") 91 | add_compile_options("-Wnarrowing") 92 | add_compile_options("-Wnon-virtual-dtor") 93 | add_compile_options("-Wreorder") 94 | add_compile_options("-Weffc++") 95 | add_compile_options("-fno-ext-numeric-literals") 96 | add_compile_options("-Wnon-template-friend") 97 | add_compile_options("-Wold-style-cast") 98 | add_compile_options("-Wpmf-conversions") 99 | add_compile_options("-Wsign-promo") 100 | add_compile_options("-Wchar-subscripts") 101 | add_compile_options("-Wno-ignored-qualifiers") 102 | add_compile_options("-Wmaybe-uninitialized") 103 | add_compile_options("-Wdiv-by-zero") 104 | add_compile_options("-Wtrampolines") 105 | add_compile_options("-Wfloat-equal") 106 | add_compile_options("-Wcast-align") 107 | add_compile_options("-Wempty-body") 108 | add_compile_options("-Wsizeof-pointer-memaccess") 109 | add_compile_options("-Wmultichar") 110 | add_compile_options("-Wnormalized=nfc") 111 | add_compile_options("-Wnoexcept") 112 | add_compile_options("-Wstrict-null-sentinel") 113 | 114 | # Add the basic compiler options for debug version 115 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3") 116 | # Add the basic compiler options for release version 117 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ansi -march=native -funroll-loops -O3 -DNDEBUG") 118 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:latest as builder 2 | 3 | WORKDIR /build 4 | 5 | 6 | RUN apt-get update -qq && \ 7 | apt-get install -y zlib1g-dev \ 8 | git \ 9 | cmake \ 10 | build-essential \ 11 | python3 \ 12 | gcc-9 \ 13 | g++-9 \ 14 | && \ 15 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9 && \ 16 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9 17 | 18 | COPY . /workspace 19 | RUN cd /workspace; rm -rf build; mkdir build; cd build; cmake ..; make -j 8; make install; 20 | 21 | # # Cleanup cmake and git 22 | # RUN apt remove -y cmake git && apt autoremove -y 23 | 24 | FROM ubuntu:latest 25 | 26 | LABEL org.opencontainers.image.authors="maxrossi91@gmail.com" 27 | RUN apt-get update -qq && \ 28 | apt-get install -y zlib1g-dev \ 29 | python3 30 | 31 | COPY --from=builder /usr/local/bin /bin 32 | CMD ["/bin/moni"] 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases) 2 | [![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_0.2.2_amd64.deb) 3 | [![Docker Pulls](https://badgen.net/docker/pulls/maxrossi91/moni?icon=docker&label=pulls)](https://hub.docker.com/r/maxrossi91/moni/) 4 | [![Docker Image Size](https://badgen.net/docker/size/maxrossi91/moni?icon=docker&label=image%20size)](https://hub.docker.com/r/maxrossi91/moni/) 5 | [![Bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/moni/README.html) 6 | 7 | # MONI 8 | ```console 9 | __ __ ____ _ _ _____ 10 | | \/ |/ __ \| \ | |_ _| 11 | | \ / | | | | \| | | | 12 | | |\/| | | | | . 
` | | | 13 | | | | | |__| | |\ |_| |_ 14 | |_| |_|\____/|_| \_|_____| 15 | ver 0.2.2 16 | ``` 17 | A Pangenomics Index for Finding MEMs. 18 | 19 | MONI index uses the prefix-free parsing of the text [2][3] to build the Burrows-Wheeler Transform (BWT) of the reference genomes, the suffix array (SA) samples at the beginning and at the end of each run of the BWT, and the threshold positions of [1]. 20 | 21 | ## How to get MONI 22 | 23 | ### Docker 24 | 25 | MONI is available on `docker`: 26 | 27 | ```console 28 | docker pull maxrossi91/moni:v0.2.2 29 | docker run maxrossi91/moni:v0.2.2 moni -h 30 | ``` 31 | if using `singularity`: 32 | ```console 33 | singularity pull moni_sif docker://maxrossi91/moni:v0.2.2 34 | ./moni_sif moni --help 35 | ``` 36 | 37 | ### Install Packages 38 | 39 | We provide MONI on a `.deb` package: 40 | ```console 41 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2_amd64.deb 42 | sudo dpkg -i moni_v0.2.2_amd64.deb 43 | moni -h 44 | ``` 45 | We provide MONI on a linux `.sh` installer: 46 | ```console 47 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2-Linux.sh 48 | chmod +x moni_v0.2.2-Linux.sh 49 | ./moni_v0.2.2-Linux.sh 50 | moni -h 51 | ``` 52 | We provide MONI on a pre-compiled `.tar.gz`: 53 | ```console 54 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2-Linux.tar.gz 55 | tar -xzvf moni_v0.2.2-Linux.tar.gz 56 | moni_v0.2.2-Linux/bin/moni -h 57 | ``` 58 | 59 | ### Compile and install 60 | #### Install prerequisite packages 61 | 62 | ```console 63 | apt-get update 64 | apt-get install -y build-essential cmake git python3 zlib1g-dev 65 | ``` 66 | 67 | #### Download 68 | 69 | ```console 70 | git clone https://github.com/maxrossi91/moni 71 | ``` 72 | #### Compile 73 | 74 | ```console 75 | cd moni 76 | mkdir build 77 | cd build; cmake -DCMAKE_INSTALL_PREFIX=<path> .. 78 | make 79 | ``` 80 | 81 | Replace `<path>` with your preferred install path.
If not specified, the install prefix defaults to `/usr/local`, so the executables are installed in `/usr/local/bin`. 82 | 83 | #### Install 84 | 85 | ```console 86 | make install 87 | ``` 88 | 89 | ### Construction of the index: 90 | ``` 91 | usage: moni build [-h] -r REFERENCE [-w WSIZE] [-p MOD] [-t THREADS] [-k] [-v] 92 | [-f] [--moni-ms] [--spumoni] 93 | -h, --help show this help message and exit 94 | -r REFERENCE, --reference REFERENCE 95 | reference file name (default: None) 96 | -o OUTPUT, --output OUTPUT 97 | output directory path (default: same as reference) 98 | -w WSIZE, --wsize WSIZE 99 | sliding window size (default: 10) 100 | -p MOD, --mod MOD hash modulus (default: 100) 101 | -t THREADS, --threads THREADS 102 | number of helper threads (default: 0) 103 | -k keep temporary files (default: False) 104 | -v verbose (default: False) 105 | -f read fasta (default: False) 106 | -g GRAMMAR, --grammar GRAMMAR 107 | select the grammar [plain, shaped] (default: plain) 108 | 109 | ``` 110 | 111 | 112 | ### Computing the matching statistics with MONI: 113 | ``` 114 | usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] 115 | -h, --help show this help message and exit 116 | -i INDEX, --index INDEX 117 | reference index base name (default: None) 118 | -p PATTERN, --pattern PATTERN 119 | the input query (default: None) 120 | -o OUTPUT, --output OUTPUT 121 | output directory path (default: .) 122 | -t THREADS, --threads THREADS 123 | number of helper threads (default: 1) 124 | -g GRAMMAR, --grammar GRAMMAR 125 | select the grammar [plain, shaped] (default: plain) 126 | ``` 127 | 128 | ### Computing the MEMs with MONI: 129 | ``` 130 | usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-s] [-t THREADS] 131 | -h, --help show this help message and exit 132 | -i INDEX, --index INDEX 133 | reference index base name (default: None) 134 | -p PATTERN, --pattern PATTERN 135 | the input query (default: None) 136 | -o OUTPUT, --output OUTPUT 137 | output directory path (default: .)
138 | -e, --extended-output 139 | output MEM occurrence in the reference (default: False) 140 | -s, --sam-output 141 | output MEM in a SAM formatted file. (default: False) 142 | -t THREADS, --threads THREADS 143 | number of helper threads (default: 1) 144 | -g GRAMMAR, --grammar GRAMMAR 145 | select the grammar [plain, shaped] (default: plain) 146 | ``` 147 | 148 | ### Computing the MEM extension with MONI and ksw2: 149 | ``` 150 | usage: moni extend [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] [-b BATCH] [-g GRAMMAR] [-L EXTL] [-A SMATCH] [-B SMISMATCH] [-O GAPO] [-E GAPE] 151 | 152 | optional arguments: 153 | -h, --help show this help message and exit 154 | -i INDEX, --index INDEX 155 | reference index folder (default: None) 156 | -p PATTERN, --pattern PATTERN 157 | the input query (default: None) 158 | -o OUTPUT, --output OUTPUT 159 | output directory path (default: .) 160 | -t THREADS, --threads THREADS 161 | number of helper threads (default: 1) 162 | -b BATCH, --batch BATCH 163 | number of reads per thread batch (default: 100) 164 | -g GRAMMAR, --grammar GRAMMAR 165 | select the grammar [plain, shaped] (default: plain) 166 | -L EXTL, --extl EXTL length of reference substring for extension (default: 100) 167 | -A SMATCH, --smatch SMATCH 168 | match score value (default: 2) 169 | -B SMISMATCH, --smismatch SMISMATCH 170 | mismatch penalty value (default: 4) 171 | -O GAPO, --gapo GAPO coma separated gap open penalty values (default: 4,13) 172 | -E GAPE, --gape GAPE coma separated gap extension penalty values (default: 2,1) 173 | ``` 174 | 175 | # Example 176 | 177 | ##### Build the index for `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder 178 | ```console 179 | moni build -r data/SARS-CoV2/SARS-CoV2.1k.fa.gz -o sars-cov2 -f 180 | ``` 181 | It produces three files `sars-cov2.plain.slp`, `sars-cov2.thrbv.ms`, and `sars-cov2.idx` in the current folder which contain the grammar, the rlbwt and the thresholds, and the starting position and name of each fasta 
sequence in the reference file respectively. 182 | 183 | ##### Compute the matching statistics of `reads.fastq.gz` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder 184 | ```console 185 | moni ms -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads 186 | ``` 187 | It produces two output files `reads.lengths` and `reads.pointers` in the current folder which store the lengths and the positions of the matching statistics of the reads against the reference in a fasta-like format. 188 | 189 | ##### Compute the MEMs of `reads.fastq.gz` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder 190 | ```console 191 | moni mems -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads 192 | ``` 193 | It produces one output file `reads.mems` in the current folder which stores the MEMs reported as pairs of positions and lengths in a fasta-like format. 194 | 195 | ##### Compute the MEM extension of `reads.fastq.gz` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder 196 | ```console 197 | moni extend -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads 198 | ``` 199 | It produces one output file `reads.sam` in the current folder which stores the information of the MEM extensions in SAM format.
200 | # External resources 201 | 202 | * [Big-BWT](https://github.com/alshai/Big-BWT.git) 203 | * [gSACA-K](https://github.com/felipelouza/gsa-is.git) 204 | * [malloc_count](https://github.com/bingmann/malloc_count) 205 | * [sdsl-lite](https://github.com/simongog/sdsl-lite) 206 | * [Divsufsort](https://github.com/simongog/libdivsufsort.git) 207 | * [klib](https://github.com/attractivechaos/klib) 208 | * [ksw2](https://github.com/lh3/ksw2) 209 | * [r-index](https://github.com/maxrossi91/r-index.git) 210 | * [pfp-thresholds](https://github.com/maxrossi91/pfp-thresholds.git) 211 | * [bigrepair](https://gitlab.com/manzai/bigrepair.git) 212 | * [shaped_slp](https://github.com/koeppl/ShapedSlp.git) 213 | 215 | 216 | # Citation 217 | 218 | Please, if you use this tool in an academic setting cite the following papers: 219 | 220 | @article{RossiOLGB21, 221 | author = { Massimiliano Rossi and 222 | Marco Oliva and 223 | Ben Langmead and 224 | Travis Gagie and 225 | Christina Boucher}, 226 | title = {MONI: A Pangenomics Index for Finding Maximal Exact Matches}, 227 | booktitle = {Research in Computational Molecular Biology - 25th Annual 228 | International Conference, {RECOMB} 2021, Padova, Italy}, 229 | journal = {Journal of Computational Biology}, 230 | volume = {29}, 231 | number = {2}, 232 | pages = {169--187}, 233 | year = {2022}, 234 | publisher = {Mary Ann Liebert, Inc., publishers 140 Huguenot Street, 3rd Floor New~…} 235 | } 236 | 237 | 238 | # Authors 239 | 240 | ### Theoretical results: 241 | 242 | * Christina Boucher 243 | * Travis Gagie 244 | * Ben Langmead 245 | * Massimiliano Rossi 246 | 247 | ### Implementation: 248 | 249 | * [Massimiliano Rossi](https://github.com/maxrossi91) 250 | 251 | ### Experiments 252 | 253 | * [Marco Oliva](https://github.com/marco-oliva) 254 | * [Massimiliano Rossi](https://github.com/maxrossi91) 255 | 256 | # Why "MONI"? 257 | 258 | **Moni** is the Finnish word for *multi*. 
259 | 260 | # References 261 | 262 | [1] Hideo Bannai, Travis Gagie, and Tomohiro I, *"Refining the r-index"*, Theoretical Computer Science, 812 (2020), pp. 96–108 263 | 264 | [2] Christina Boucher, Travis Gagie, Alan Kuhnle and Giovanni Manzini, *"Prefix-Free Parsing for Building Big BWTs"*, In Proc. of the 18th International Workshop on Algorithms in Bioinformatics (WABI 2018). 265 | 266 | [3] Christina Boucher, Travis Gagie, Alan Kuhnle, Ben Langmead, Giovanni Manzini, and Taher Mun. *"Prefix-free parsing for building big BWTs."*, Algorithms for Molecular Biology 14, no. 1 (2019): 13. -------------------------------------------------------------------------------- /data/SARS-CoV2/SARS-CoV2.1k.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrossi91/moni/7f3f954f5bc6174457ea374acee1f2dcca61527b/data/SARS-CoV2/SARS-CoV2.1k.fa.gz -------------------------------------------------------------------------------- /data/SARS-CoV2/reads.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrossi91/moni/7f3f954f5bc6174457ea374acee1f2dcca61527b/data/SARS-CoV2/reads.fastq.gz -------------------------------------------------------------------------------- /include/common/common.hpp: -------------------------------------------------------------------------------- 1 | /* pfp-ds - prefix free parsing data structures 2 | Copyright (C) 2020 Massimiliano Rossi 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see http://www.gnu.org/licenses/ . 16 | */ 17 | /*! 18 | \file common.hpp 19 | \brief common.hpp contains common features. 20 | \author Massimiliano Rossi 21 | \date 12/03/2020 22 | */ 23 | 24 | #ifndef _COMMON_HH 25 | #define _COMMON_HH 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #include // for mmap 36 | #include 37 | #include 38 | #include 39 | 40 | #include // std::stringstream 41 | 42 | #include // std::vector 43 | 44 | #include // high_resolution_clock 45 | 46 | #include // serialize and load 47 | #include // enable_if_t and is_fundamental 48 | 49 | //**************************** From Big-BWT *********************************** 50 | // special symbols used by the construction algorithm: 51 | // they cannot appear in the input file 52 | // the 0 symbol is used in the final BWT file as the EOF char 53 | 54 | #define Dollar 2 // special char for the parsing algorithm, must be the highest special char 55 | #define EndOfWord 1 // word delimiter for the plain dictionary file 56 | #define EndOfDict 0 // end of dictionary delimiter 57 | //****************************************************************************** 58 | 59 | #define THRBYTES 5 // The number of bytes for the thresholds 60 | #define SSABYTES 5 // The number of bytes for the thresholds 61 | 62 | std::string NowTime(); 63 | void _internal_messageInfo(const std::string message); 64 | void _internal_messageWarning( const std::string file, const unsigned int line, const std::string message); 65 | void _internal_messageError( const std::string file, const unsigned int line,const std::string message); 66 | 67 | 68 | std::string NowTime() 69 | { 70 | struct timeval tv; 71 | gettimeofday(&tv, 0); 72 | char buffer[100]; 73 | tm r; 74 | strftime(buffer, sizeof(buffer), "%X", 
localtime_r(&tv.tv_sec, &r)); 75 | char result[100]; 76 | snprintf(result, 100, "%s"/*.%06ld"*/, buffer/*, (long)tv.tv_usec*/); 77 | return result; 78 | } 79 | 80 | 81 | template 82 | inline void _internal_message_helper(std::stringstream &ss, T const &first) { ss << first; } 83 | template 84 | inline void _internal_message_helper(std::stringstream &ss, T const &first, const Args&... args) { ss << first << " "; _internal_message_helper(ss,args...); } 85 | template 86 | inline std::string _internal_message(T const &first, const Args&... args) { std::stringstream ss; _internal_message_helper(ss,first,args...); return ss.str(); } 87 | 88 | 89 | void _internal_messageInfo(const std::string message) 90 | { 91 | std::cout << "[INFO] " << NowTime() << " - " << "Message: " << message << std::endl; 92 | } 93 | 94 | void _internal_messageWarning( const std::string file, const unsigned int line, 95 | const std::string message) 96 | { 97 | std::cout << "[WARNING] " << NowTime() << " - " 98 | << "File: " << file << '\n' 99 | << "Line: " << line << '\n' 100 | << "Message: " << message << std::endl; 101 | } 102 | 103 | void _internal_messageError( const std::string file, const unsigned int line, 104 | const std::string message) 105 | { 106 | std::cerr << "[ERROR] " << NowTime() << " - " 107 | << "File: " << file << '\n' 108 | << "Line: " << line << '\n' 109 | << "Message: " << message << std::endl; 110 | assert( false ); 111 | exit( 1 ); 112 | } 113 | 114 | 115 | 116 | #define info( args... ) \ 117 | _internal_messageInfo( _internal_message(args) ) 118 | 119 | #ifdef VERBOSE 120 | #define verbose( args... ) \ 121 | _internal_messageInfo( _internal_message(args) ) 122 | #else 123 | #define verbose( args... ) 124 | #endif 125 | 126 | #define warning( args... ) \ 127 | _internal_messageWarning( __FILE__, __LINE__, _internal_message(args) ) 128 | 129 | #define error( args... 
) \ 130 | _internal_messageError( __FILE__, __LINE__, _internal_message(args) ) 131 | 132 | 133 | // converts elemens in csv format 134 | template 135 | inline void csv_helper(std::stringstream &ss, T const &first){ss << first;} 136 | template 137 | inline void csv_helper(std::stringstream &ss, T const &first, const Args &... args){ ss << first << ", "; csv_helper(ss, args...);} 138 | template 139 | inline std::string csv(T const &first, const Args &... args){std::stringstream ss;csv_helper(ss, first, args...); return ss.str();} 140 | 141 | //*********************** File I/O ********************************************* 142 | template 143 | void map_file(const char *filename, T*& ptr, size_t& length){ 144 | struct stat filestat; 145 | int fd; 146 | 147 | if ((fd = open(filename, O_RDONLY)) < 0) 148 | error("open() file " + std::string(filename) + " failed" ); 149 | 150 | if (fstat(fd, &filestat) < 0) 151 | error("stat() file " + std::string(filename) + " failed" ); 152 | 153 | if(filestat.st_size % sizeof(T) != 0) 154 | error("invilid file " + std::string(filename)); 155 | 156 | length = filestat.st_size / sizeof(T); 157 | 158 | if ((ptr = mmap(NULL, filestat.st_size, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) 159 | error("mmap() file " + std::string(filename) + " failed"); 160 | } 161 | 162 | template 163 | void read_file(const char *filename, T*& ptr, size_t& length){ 164 | struct stat filestat; 165 | FILE* fd; 166 | 167 | if ((fd = fopen(filename, "r")) == nullptr) 168 | error("open() file " + std::string(filename) + " failed" ); 169 | 170 | int fn = fileno(fd); 171 | if (fstat(fn, &filestat) < 0) 172 | error("stat() file " + std::string(filename) + " failed" ); 173 | 174 | if(filestat.st_size % sizeof(T) != 0) 175 | error("invilid file " + std::string(filename)); 176 | 177 | length = filestat.st_size / sizeof(T); 178 | ptr = new T[length]; 179 | 180 | if ((fread(ptr, sizeof(T), length, fd)) != length) 181 | error("fread() file " + std::string(filename) + " 
failed"); 182 | 183 | fclose(fd); 184 | } 185 | 186 | template 187 | void read_file(const char *filename, std::vector& ptr){ 188 | struct stat filestat; 189 | FILE* fd; 190 | 191 | if ((fd = fopen(filename, "r")) == nullptr) 192 | error("open() file " + std::string(filename) + " failed" ); 193 | 194 | int fn = fileno(fd); 195 | if (fstat(fn, &filestat) < 0) 196 | error("stat() file " + std::string(filename) + " failed" ); 197 | 198 | if(filestat.st_size % sizeof(T) != 0) 199 | error("invilid file " + std::string(filename)); 200 | 201 | size_t length = filestat.st_size / sizeof(T); 202 | ptr.resize(length); 203 | 204 | if ((fread(&ptr[0], sizeof(T), length, fd)) != length) 205 | error("fread() file " + std::string(filename) + " failed"); 206 | 207 | fclose(fd); 208 | } 209 | 210 | void read_file(const char *filename, std::string &ptr) 211 | { 212 | struct stat filestat; 213 | FILE *fd; 214 | 215 | if ((fd = fopen(filename, "r")) == nullptr) 216 | error("open() file " + std::string(filename) + " failed"); 217 | 218 | int fn = fileno(fd); 219 | if (fstat(fn, &filestat) < 0) 220 | error("stat() file " + std::string(filename) + " failed"); 221 | 222 | if (filestat.st_size % sizeof(char) != 0) 223 | error("invilid file " + std::string(filename)); 224 | 225 | size_t length = filestat.st_size / sizeof(char); 226 | ptr.resize(length); 227 | 228 | if ((fread(&ptr[0], sizeof(char), length, fd)) != length) 229 | error("fread() file " + std::string(filename) + " failed"); 230 | 231 | fclose(fd); 232 | } 233 | 234 | template 235 | void read_fasta_file(const char *filename, std::vector& v){ 236 | FILE* fd; 237 | 238 | if ((fd = fopen(filename, "r")) == nullptr) 239 | error("open() file " + std::string(filename) + " failed" ); 240 | 241 | v.clear(); 242 | 243 | char c; 244 | while (fread( &c, sizeof(char), 1,fd) == 1) { 245 | if(c == '>'){ 246 | while(fread( &c, sizeof(char), 1,fd) == 1 && c != '\n'); 247 | }else{ 248 | v.push_back(c); 249 | while(fread( &c, sizeof(char), 1,fd) == 
1 && c!= '\n') v.push_back(c); 250 | } 251 | } 252 | fclose(fd); 253 | } 254 | 255 | template 256 | void write_file(const char *filename, std::vector &ptr) 257 | { 258 | struct stat filestat; 259 | FILE *fd; 260 | 261 | if ((fd = fopen(filename, "w")) == nullptr) 262 | error("open() file " + std::string(filename) + " failed"); 263 | 264 | size_t length = ptr.size(); 265 | if ((fwrite(&ptr[0], sizeof(T), length, fd)) != length) 266 | error("fwrite() file " + std::string(filename) + " failed"); 267 | 268 | fclose(fd); 269 | } 270 | 271 | //*********************** Time resources *************************************** 272 | 273 | /*! 274 | * op the operation that we want to measure 275 | */ 276 | #define _elapsed_time(op) \ 277 | ({ \ 278 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); \ 279 | op; \ 280 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); \ 281 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); \ 282 | std::chrono::duration>(t_insert_end - t_insert_start).count(); \ 283 | }) 284 | 285 | 286 | 287 | //********** begin my serialize edit from sdsl ******************** 288 | // Those are wrapper around most of the serialization functions of sdsl 289 | 290 | 291 | template 292 | uint64_t 293 | my_serialize_array(const T* p, const size_type size, std::ostream &out, typename std::enable_if::value>::type * = 0) 294 | { 295 | size_t written_bytes = 0; 296 | if (size > 0) 297 | { 298 | 299 | size_type idx = 0; 300 | while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (size)) 301 | { 302 | out.write((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T)); 303 | written_bytes += sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T); 304 | p += sdsl::conf::SDSL_BLOCK_SIZE; 305 | idx += sdsl::conf::SDSL_BLOCK_SIZE; 306 | } 307 | out.write((char *)p, ((size) - idx) * sizeof(T)); 308 | written_bytes += ((size) - idx) * sizeof(T); 309 | 
310 | } 311 | return written_bytes; 312 | } 313 | 314 | //! Serialize each element of an std::vector 315 | /*! 316 | * \param vec The vector which should be serialized. 317 | * \param out Output stream to which should be written. 318 | * \param v Structure tree node. Note: If all elements have the same 319 | * structure, then it is tried to combine all elements (i.e. 320 | * make one node w with size set to the cumulative sum of all 321 | * sizes of the children) 322 | */ 323 | // specialization for fundamental types 324 | template 325 | uint64_t 326 | my_serialize_vector(const std::vector &vec, std::ostream &out, sdsl::structure_tree_node *v, std::string name, typename std::enable_if::value>::type * = 0) 327 | { 328 | if (vec.size() > 0) 329 | { 330 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, "std::vector<" + sdsl::util::class_name(vec[0]) + ">"); 331 | size_t written_bytes = 0; 332 | 333 | const T *p = &vec[0]; 334 | typename std::vector::size_type idx = 0; 335 | while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (vec.size())) 336 | { 337 | out.write((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T)); 338 | written_bytes += sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T); 339 | p += sdsl::conf::SDSL_BLOCK_SIZE; 340 | idx += sdsl::conf::SDSL_BLOCK_SIZE; 341 | } 342 | out.write((char *)p, ((vec.size()) - idx) * sizeof(T)); 343 | written_bytes += ((vec.size()) - idx) * sizeof(T); 344 | 345 | sdsl::structure_tree::add_size(child, written_bytes); 346 | return written_bytes; 347 | } 348 | else 349 | { 350 | return 0; 351 | } 352 | } 353 | 354 | template 355 | uint64_t 356 | my_serialize(const std::vector &x, 357 | std::ostream &out, sdsl::structure_tree_node *v = nullptr, 358 | std::string name = "", typename std::enable_if::value>::type * = 0) 359 | { 360 | return sdsl::serialize(x.size(), out, v, name) + my_serialize_vector(x, out, v, name); 361 | } 362 | 363 | 364 | /** 365 | * @brief Load an array of size elements into p. 
p should be preallocated. 366 | * 367 | * \tparam T 368 | * \tparam size_type 369 | * @param p 370 | * @param size 371 | * @param in 372 | */ 373 | template 374 | void my_load_array(T *p, const size_type size, std::istream &in, typename std::enable_if::value>::type * = 0) 375 | { 376 | size_type idx = 0; 377 | while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (size)) 378 | { 379 | in.read((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T)); 380 | p += sdsl::conf::SDSL_BLOCK_SIZE; 381 | idx += sdsl::conf::SDSL_BLOCK_SIZE; 382 | } 383 | in.read((char *)p, ((size) - idx) * sizeof(T)); 384 | } 385 | 386 | //! Load all elements of a vector from a input stream 387 | /*! \param vec Vector whose elements should be loaded. 388 | * \param in Input stream. 389 | * \par Note 390 | * The vector has to be resized prior the loading 391 | * of its elements. 392 | */ 393 | template 394 | void my_load_vector(std::vector &vec, std::istream &in, typename std::enable_if::value>::type * = 0) 395 | { 396 | T *p = &vec[0]; 397 | typename std::vector::size_type idx = 0; 398 | while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (vec.size())) 399 | { 400 | in.read((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T)); 401 | p += sdsl::conf::SDSL_BLOCK_SIZE; 402 | idx += sdsl::conf::SDSL_BLOCK_SIZE; 403 | } 404 | in.read((char *)p, ((vec.size()) - idx) * sizeof(T)); 405 | } 406 | 407 | template 408 | void my_load(std::vector &x, std::istream &in, typename std::enable_if::value>::type * = 0) 409 | { 410 | typename std::vector::size_type size; 411 | sdsl::load(size, in); 412 | x.resize(size); 413 | my_load_vector(x, in); 414 | } 415 | 416 | 417 | 418 | 419 | #endif /* end of include guard: _COMMON_HH */ 420 | -------------------------------------------------------------------------------- /include/common/seqidx.hpp: -------------------------------------------------------------------------------- 1 | /* seqidx - an index fo the sequence names in a fasta file 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This 
program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file seqidx.cpp 16 | \brief seqidx.cpp an index fo the sequence names in a fasta file. 17 | \author Massimiliano Rossi 18 | \date 07/08/2021 19 | */ 20 | #ifndef _SEQIDX_HH 21 | #define _SEQIDX_HH 22 | 23 | #include 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | // KSEQ_DECLARE(gzFile); 31 | 32 | class seqidx 33 | { 34 | public: 35 | seqidx() 36 | { 37 | u = 0; 38 | } 39 | /** 40 | * @brief Construct a new seqidx object 41 | * 42 | * @param filename filepath of the fasta/q file 43 | */ 44 | seqidx(std::string filename) 45 | { 46 | gzFile fp(gzopen(filename.c_str(), "r")); 47 | if (fp == nullptr) 48 | error("gzopen() file " + std::string(filename) + " failed"); 49 | 50 | kseq_t *seq = kseq_init(fp); 51 | 52 | std::vector onset(1,0); 53 | u = 0; 54 | 55 | while (kseq_read(seq) >= 0) 56 | { 57 | u += seq->seq.l; 58 | names.push_back(std::string(seq->name.s)); 59 | onset.push_back(u); 60 | } 61 | 62 | kseq_destroy(seq); 63 | gzclose(fp); 64 | 65 | sdsl::sd_vector_builder builder(u, onset.size()); 66 | for (auto idx : onset) 67 | builder.set(idx); 68 | 69 | starts = sdsl::sd_vector<>(builder); 70 | rank1 = sdsl::sd_vector<>::rank_1_type(&starts); 71 | select1 = sdsl::sd_vector<>::select_1_type(&starts); 72 | } 73 | 74 | /** 75 | * @brief Construct a new seqidx object from onset, list of sequence names 
and total length 76 | * 77 | * @param onset the popsitions 78 | * @param names_ 79 | * @param l 80 | */ 81 | seqidx(const std::vector& onset, const std::vector& names_, const size_t l) 82 | { 83 | assert(onset.size() == names_.size()); 84 | assert(onset[0] == 0); 85 | assert(onset.back() < l); 86 | assert(std::is_sorted(onset.begin(), onset.end())); 87 | 88 | u = l; 89 | names = std::vector(names_); 90 | 91 | 92 | sdsl::sd_vector_builder builder(u, onset.size()); 93 | for (auto idx : onset) 94 | builder.set(idx); 95 | 96 | builder.set(u); 97 | 98 | starts = sdsl::sd_vector<>(builder); 99 | rank1 = sdsl::sd_vector<>::rank_1_type(&starts); 100 | select1 = sdsl::sd_vector<>::select_1_type(&starts); 101 | } 102 | 103 | 104 | 105 | /** 106 | * @brief Return the length of the i-th sequence 107 | * 108 | * @param i 109 | * @return size_t 110 | */ 111 | inline size_t length(const size_t i) 112 | { 113 | assert(i < names.size()); 114 | return select1(i+2) - select1(i+1); 115 | // return select1(i+1) - select1(i); 116 | } 117 | 118 | /** 119 | * @brief return the name of the sequence pos belongs. 120 | * 121 | * @param pos the position in the set of sequences. 122 | * @return std::string the name of the sequence pos belongs. 123 | */ 124 | inline std::string operator[](const size_t pos) 125 | { 126 | return names[rank1(pos + 1)-1]; // pos+1 becausethe rank counts the 1s before 127 | } 128 | 129 | /** 130 | * @brief return the name of the sequence pos belongs, and its offset. 131 | * 132 | * @param pos the position in the set of sequences. 133 | * @return std::pair the name of the sequence pos belongs and its offset. 134 | */ 135 | inline std::pair index(const size_t pos) 136 | { 137 | size_t rank = rank1(pos + 1); 138 | size_t start = select1(rank); 139 | return std::make_pair(names[rank-1],pos - start); // pos+1 becausethe rank counts the 1s before 140 | } 141 | 142 | /** 143 | * @brief Check if the substring [pos.pos+len-1] does not span two sequences. 
144 | * 145 | * @param pos the position of the substring. 146 | * @param len the length of the substring. 147 | * @return true if the substring does not span two sequences. 148 | * @return false if the substring spans two sequences. 149 | */ 150 | inline bool valid(size_t pos, size_t len) 151 | { 152 | return (pos + len <= select1(rank1(pos + 1)+1)); // pos+1 becausethe rank counts the 1s before 153 | } 154 | 155 | /** 156 | * @brief return the SAM header description of the reference file 157 | * 158 | * @return std::string 159 | */ 160 | std::string to_sam() 161 | { 162 | std::string res = ""; 163 | for (size_t i = 0; i < names.size(); ++i) 164 | res += "@SQ\tSN:" + names[i] + "\tLN:" + std::to_string(length(i)) + "\n"; 165 | return res; 166 | } 167 | 168 | size_t serialize(std::ostream &out) 169 | { 170 | 171 | size_t w_bytes = 0; 172 | 173 | out.write((char *)&u, sizeof(u)); 174 | 175 | w_bytes += sizeof(u); 176 | 177 | if (u == 0) 178 | return w_bytes; 179 | 180 | w_bytes += starts.serialize(out); 181 | w_bytes += sdsl::serialize(names.size(), out); 182 | for(size_t i = 0; i < names.size(); ++i) 183 | { 184 | w_bytes += sdsl::serialize(names[i].size(), out); 185 | w_bytes = my_serialize_array(names[i].data(), names[i].size(), out); 186 | } 187 | return w_bytes; 188 | } 189 | 190 | void load(std::istream &in) 191 | { 192 | 193 | in.read((char *)&u, sizeof(u)); 194 | 195 | if (u == 0) 196 | return; 197 | 198 | starts.load(in); 199 | rank1 = sdsl::sd_vector<>::rank_1_type(&starts); 200 | select1 = sdsl::sd_vector<>::select_1_type(&starts); 201 | 202 | std::vector::size_type names_size; 203 | sdsl::load(names_size, in); 204 | names.resize(names_size); 205 | for (size_t i = 0; i < names.size(); ++i) 206 | { 207 | std::string::size_type string_size; 208 | sdsl::load(string_size, in); 209 | names[i].resize(string_size); 210 | my_load_array(&names[i][0], names[i].size(), in); 211 | } 212 | } 213 | 214 | std::string get_file_extension() const 215 | { 216 | return 
".idx"; 217 | } 218 | 219 | protected: 220 | size_t u; 221 | 222 | sdsl::sd_vector<> starts; 223 | sdsl::sd_vector<>::rank_1_type rank1; 224 | sdsl::sd_vector<>::select_1_type select1; 225 | 226 | std::vector names; 227 | 228 | }; 229 | 230 | #endif /* end of include guard: _SEQIDX_HH */ 231 | -------------------------------------------------------------------------------- /include/extender/extend_reads_dispatcher.hpp: -------------------------------------------------------------------------------- 1 | /* extender_reads_dispatcher - Dispatches the reads in single and multithread. 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file extender_reads_dispatcher.cpp 16 | \brief extender_reads_dispatcher.cpp Dispatches the reads in single and multithread. 
17 | \author Massimiliano Rossi 18 | \date 29/04/2021 19 | */ 20 | 21 | #ifndef _READS_DISPATCHER_HH 22 | #define _READS_DISPATCHER_HH 23 | 24 | extern "C"{ 25 | #include 26 | } 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | //////////////////////////////////////////////////////////////////////////////// 33 | /// kseq extra 34 | //////////////////////////////////////////////////////////////////////////////// 35 | 36 | static inline size_t ks_tell(kseq_t *seq) 37 | { 38 | return gztell(seq->f->f) - seq->f->end + seq->f->begin; 39 | } 40 | 41 | void copy_kstring_t(kstring_t &l, kstring_t &r) 42 | { 43 | l.l = r.l; 44 | l.m = r.m; 45 | l.s = (char *)malloc(l.m); 46 | for (size_t i = 0; i < r.m; ++i) 47 | l.s[i] = r.s[i]; 48 | } 49 | void copy_kseq_t(kseq_t *l, kseq_t *r) 50 | { 51 | copy_kstring_t(l->name, r->name); 52 | copy_kstring_t(l->comment, r->comment); 53 | copy_kstring_t(l->seq, r->seq); 54 | copy_kstring_t(l->qual, r->qual); 55 | l->last_char = r->last_char; 56 | } 57 | //////////////////////////////////////////////////////////////////////////////// 58 | 59 | //////////////////////////////////////////////////////////////////////////////// 60 | /// xerror extra (conditions) 61 | //////////////////////////////////////////////////////////////////////////////// 62 | 63 | #ifndef Thread_error_wait 64 | #define Thread_error_wait 5 65 | #endif 66 | 67 | // cond 68 | int xpthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr, int linea, const char *file) 69 | { 70 | int e = pthread_cond_init(cond, attr); 71 | if (e != 0) 72 | { 73 | xperror(e, "Error in pthread_cond_init"); 74 | fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file); 75 | sleep(Thread_error_wait); // do not kill immediately other threads 76 | exit(1); 77 | } 78 | return e; 79 | } 80 | 81 | int xpthread_cond_destroy(pthread_cond_t *cond, int linea, const char *file) 82 | { 83 | int e = pthread_cond_destroy(cond); 84 | if (e != 0) 85 | { 86 | xperror(e, 
"Error in pthread_cond_destroy"); 87 | fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file); 88 | sleep(Thread_error_wait); // do not kill immediately other threads 89 | exit(1); 90 | } 91 | return e; 92 | } 93 | 94 | int xpthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex, int linea, const char *file) 95 | { 96 | int e = pthread_cond_wait(cond, mutex); 97 | if (e != 0) 98 | { 99 | xperror(e, "Error in pthread_cond_lock"); 100 | fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file); 101 | sleep(Thread_error_wait); // do not kill immediately other threads 102 | exit(1); 103 | } 104 | return e; 105 | } 106 | 107 | int xpthread_cond_signal(pthread_cond_t *cond, int linea, const char *file) 108 | { 109 | int e = pthread_cond_signal(cond); 110 | if (e != 0) 111 | { 112 | xperror(e, "Error in pthread_cond_unlock"); 113 | fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file); 114 | sleep(Thread_error_wait); // do not kill immediately other threads 115 | exit(1); 116 | } 117 | return e; 118 | } 119 | //////////////////////////////////////////////////////////////////////////////// 120 | 121 | //////////////////////////////////////////////////////////////////////////////// 122 | /// Parallel computation 123 | //////////////////////////////////////////////////////////////////////////////// 124 | 125 | // This should be done using buffering. 126 | size_t next_start_fastq(gzFile fp) 127 | { 128 | int c; 129 | // Special case when we arr at the beginning of the file. 
130 | if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@') 131 | return 0; 132 | 133 | // Strart from the previous character 134 | gzseek(fp, -1, SEEK_CUR); 135 | 136 | std::vector> window; 137 | // Find the first new line 138 | for (size_t i = 0; i < 4; ++i) 139 | { 140 | while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n')) 141 | { 142 | } 143 | if (c == EOF) 144 | return gztell(fp); 145 | if ((c = gzgetc(fp)) == EOF) 146 | return gztell(fp); 147 | window.push_back(std::make_pair(c, gztell(fp) - 1)); 148 | } 149 | 150 | for (size_t i = 0; i < 2; ++i) 151 | { 152 | if (window[i].first == '@' && window[i + 2].first == '+') 153 | return window[i].second; 154 | if (window[i].first == '+' && window[i + 2].first == '@') 155 | return window[i + 2].second; 156 | } 157 | 158 | return gztell(fp); 159 | } 160 | 161 | // test if the file is gzipped 162 | static inline bool is_gzipped(std::string filename) 163 | { 164 | FILE *fp = fopen(filename.c_str(), "rb"); 165 | if (fp == NULL) 166 | error("Opening file " + filename); 167 | int byte1 = 0, byte2 = 0; 168 | fread(&byte1, sizeof(char), 1, fp); 169 | fread(&byte2, sizeof(char), 1, fp); 170 | fclose(fp); 171 | return (byte1 == 0x1f && byte2 == 0x8b); 172 | } 173 | 174 | // Return the length of the file 175 | // Assumes that the file is not compressed 176 | static inline size_t get_file_size(std::string filename) 177 | { 178 | if (is_gzipped(filename)) 179 | { 180 | std::cerr << "The input is gzipped!" 
<< std::endl; 181 | return -1; 182 | } 183 | FILE *fp = fopen(filename.c_str(), "r"); 184 | fseek(fp, 0L, SEEK_END); 185 | size_t size = ftell(fp); 186 | fclose(fp); 187 | return size; 188 | } 189 | 190 | std::vector split_fastq(std::string filename, size_t n_threads) 191 | { 192 | //Precondition: the file is not gzipped 193 | // scan file for start positions and execute threads 194 | size_t size = get_file_size(filename); 195 | 196 | gzFile fp = gzopen(filename.c_str(), "r"); 197 | if (fp == Z_NULL) 198 | { 199 | throw new std::runtime_error("Cannot open input file " + filename); 200 | } 201 | 202 | std::vector starts(n_threads + 1); 203 | for (int i = 0; i < n_threads; ++i) 204 | { 205 | size_t start = (size_t)((size * i) / n_threads); 206 | gzseek(fp, start, SEEK_SET); 207 | starts[i] = next_start_fastq(fp); 208 | } 209 | starts[n_threads] = size; 210 | gzclose(fp); 211 | return starts; 212 | } 213 | 214 | inline char complement(const char n) 215 | { 216 | switch (n) 217 | { 218 | case 'A': 219 | return 'T'; 220 | case 'T': 221 | return 'A'; 222 | case 'G': 223 | return 'C'; 224 | case 'C': 225 | return 'G'; 226 | default: 227 | return n; 228 | } 229 | } 230 | 231 | //////////////////////////////////////////////////////////////////////////////// 232 | 233 | //////////////////////////////////////////////////////////////////////////////// 234 | /// Merge SAMs 235 | //////////////////////////////////////////////////////////////////////////////// 236 | 237 | 238 | // Merges te file in filename in the file pointed by fp 239 | void append_file(const std::string filename, FILE *fp){ 240 | const size_t buff_size = 16384; 241 | 242 | uint8_t buff[buff_size]; 243 | size_t size = 0; 244 | 245 | struct stat filestat; 246 | FILE *fd; 247 | 248 | if ((fd = fopen(filename.c_str(), "r")) == nullptr) 249 | error("open() file " + std::string(filename) + " failed"); 250 | 251 | // int fn = fileno(fd); 252 | // if (fstat(fn, &filestat) < 0) 253 | // error("stat() file " + 
std::string(filename) + " failed"); 254 | 255 | // size_t length = filestat.st_size; 256 | size_t length = 0; 257 | 258 | while((length = fread(buff, sizeof(uint8_t), buff_size, fd)) == buff_size) 259 | if ((fwrite(buff, sizeof(uint8_t), buff_size, fp)) != buff_size) 260 | error("fwrite() file " + std::string(filename) + " failed"); 261 | 262 | assert(length < buff_size); 263 | if(length > 0) 264 | if ((fwrite(buff, sizeof(uint8_t), length, fp)) != length) 265 | error("fwrite() file " + std::string(filename) + " failed"); 266 | 267 | 268 | fclose(fd); 269 | } 270 | 271 | //////////////////////////////////////////////////////////////////////////////// 272 | 273 | //////////////////////////////////////////////////////////////////////////////// 274 | /// Multithreads workers 275 | //////////////////////////////////////////////////////////////////////////////// 276 | 277 | pthread_mutex_t mutex_reads_dispatcher; 278 | pthread_cond_t cond_reads_dispatcher; 279 | // Critical variables 280 | size_t n_active_threads = 0; 281 | // std::vector active_threads; 282 | 283 | template 284 | struct mt_param_t 285 | { 286 | // Parameters 287 | extender_t *extender; 288 | std::string pattern_filename; 289 | std::string sam_filename; 290 | size_t start; 291 | size_t end; 292 | size_t wk_id; 293 | // Return values 294 | size_t n_reads; 295 | size_t n_extended_reads; 296 | }; 297 | 298 | template 299 | void *mt_extend_worker(void *param) 300 | { 301 | mt_param_t *p = (mt_param_t *)param; 302 | size_t n_reads = 0; 303 | size_t n_extended_reads = 0; 304 | 305 | FILE *sam_fd; 306 | gzFile fp; 307 | 308 | if ((sam_fd = fopen(p->sam_filename.c_str(), "w")) == nullptr) 309 | error("open() file " + p->sam_filename + " failed"); 310 | 311 | if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL) 312 | error("open() file " + p->pattern_filename + " failed"); 313 | 314 | gzseek(fp, p->start, SEEK_SET); 315 | 316 | kseq_t rev; 317 | int l; 318 | 319 | kseq_t *seq = kseq_init(fp); 320 | 
while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0)) 321 | { 322 | 323 | bool fwd_extend = p->extender->extend(seq, sam_fd, 0); 324 | 325 | //copy seq 326 | copy_kseq_t(&rev, seq); 327 | 328 | for (size_t i = 0; i < seq->seq.l; ++i) 329 | rev.seq.s[i] = complement(seq->seq.s[seq->seq.l - i - 1]); 330 | 331 | if (rev.seq.m > rev.seq.l) 332 | rev.seq.s[rev.seq.l] = 0; 333 | 334 | bool rev_extend = p->extender->extend(&rev, sam_fd, 1); 335 | 336 | if (fwd_extend or rev_extend) 337 | n_extended_reads++; 338 | n_reads++; 339 | 340 | free(rev.name.s); 341 | free(rev.comment.s); 342 | free(rev.seq.s); 343 | free(rev.qual.s); 344 | } 345 | 346 | verbose("Number of extended reads block ", p->wk_id, " : ", n_extended_reads, "/", n_reads); 347 | p->n_reads = n_reads; 348 | p->n_extended_reads = n_extended_reads; 349 | kseq_destroy(seq); 350 | gzclose(fp); 351 | fclose(sam_fd); 352 | 353 | // Update the number of active threads 354 | xpthread_mutex_lock(&mutex_reads_dispatcher, __LINE__, __FILE__); 355 | { 356 | --n_active_threads; 357 | xpthread_cond_signal(&cond_reads_dispatcher, __LINE__, __FILE__); 358 | } 359 | xpthread_mutex_unlock(&mutex_reads_dispatcher, __LINE__, __FILE__); 360 | 361 | return NULL; 362 | } 363 | 364 | template 365 | size_t mt_extend(extender_t *extender, std::string pattern_filename, std::string sam_filename, size_t n_threads, size_t k) 366 | { 367 | xpthread_mutex_init(&mutex_reads_dispatcher, NULL, __LINE__, __FILE__); 368 | xpthread_cond_init(&cond_reads_dispatcher, NULL, __LINE__, __FILE__); 369 | 370 | // active_threads = std::vector(n_threads, false); 371 | pthread_t t[k * n_threads] = {0}; 372 | mt_param_t params[k * n_threads]; 373 | std::vector starts = split_fastq(pattern_filename, k * n_threads); 374 | for (size_t i = 0; i < k * n_threads; ++i) 375 | { 376 | // Get the number of active threads 377 | xpthread_mutex_lock(&mutex_reads_dispatcher, __LINE__, __FILE__); 378 | { 379 | while(n_active_threads >= n_threads) 380 | 
xpthread_cond_wait(&cond_reads_dispatcher, &mutex_reads_dispatcher, __LINE__, __FILE__); 381 | assert(n_active_threads < n_threads); 382 | // Create a new thread 383 | params[i].extender = extender; 384 | params[i].pattern_filename = pattern_filename; 385 | params[i].sam_filename = sam_filename + "_" + std::to_string(i) + ".sam"; 386 | params[i].start = starts[i]; 387 | params[i].end = starts[i + 1]; 388 | params[i].wk_id = i; 389 | xpthread_create(&t[i], NULL, &mt_extend_worker, ¶ms[i], __LINE__, __FILE__); 390 | // Update the number of active threads 391 | ++n_active_threads; 392 | } 393 | xpthread_mutex_unlock(&mutex_reads_dispatcher, __LINE__, __FILE__); 394 | } 395 | 396 | size_t tot_reads = 0; 397 | size_t tot_extended_reads = 0; 398 | 399 | for (size_t i = 0; i < k * n_threads; ++i) 400 | { 401 | xpthread_join(t[i], NULL, __LINE__, __FILE__); 402 | } 403 | 404 | // sleep(5); 405 | verbose("Merging temporary SAM files"); 406 | 407 | FILE *fd; 408 | 409 | if ((fd = fopen(std::string(sam_filename + ".sam").c_str(), "w")) == nullptr) 410 | error("open() file " + std::string(sam_filename + ".sam") + " failed"); 411 | 412 | fprintf(fd, "%s", extender->to_sam().c_str()); 413 | 414 | for (size_t i = 0; i < k * n_threads; ++i) 415 | { 416 | tot_reads += params[i].n_reads; 417 | tot_extended_reads += params[i].n_extended_reads; 418 | 419 | append_file(params[i].sam_filename, fd); 420 | if (std::remove(params[i].sam_filename.c_str()) != 0) 421 | error("remove() file " + params[i].sam_filename + " failed"); 422 | } 423 | 424 | xpthread_mutex_destroy(&mutex_reads_dispatcher, __LINE__, __FILE__); 425 | xpthread_cond_destroy(&cond_reads_dispatcher, __LINE__, __FILE__); 426 | 427 | verbose("Number of extended reads: ", tot_extended_reads, "/", tot_reads); 428 | return tot_extended_reads; 429 | } 430 | 431 | //////////////////////////////////////////////////////////////////////////////// 432 | /// Single Thread 433 | 
//////////////////////////////////////////////////////////////////////////////// 434 | template 435 | size_t st_extend(extender_t *extender, std::string pattern_filename, std::string sam_filename) 436 | { 437 | size_t n_reads = 0; 438 | size_t n_extended_reads = 0; 439 | kseq_t rev; 440 | int l; 441 | FILE *sam_fd; 442 | 443 | sam_filename += ".sam"; 444 | 445 | if ((sam_fd = fopen(sam_filename.c_str(), "w")) == nullptr) 446 | error("open() file " + sam_filename + " failed"); 447 | 448 | fprintf(sam_fd, "%s", extender->to_sam().c_str()); 449 | 450 | gzFile fp = gzopen(pattern_filename.c_str(), "r"); 451 | kseq_t *seq = kseq_init(fp); 452 | while ((l = kseq_read(seq)) >= 0) 453 | { 454 | 455 | bool fwd_extend = extender->extend(seq, sam_fd, 0); 456 | 457 | //copy seq 458 | copy_kseq_t(&rev, seq); 459 | 460 | for (size_t i = 0; i < seq->seq.l; ++i) 461 | rev.seq.s[i] = complement(seq->seq.s[seq->seq.l - i - 1]); 462 | 463 | if (rev.seq.m > rev.seq.l) 464 | rev.seq.s[rev.seq.l] = 0; 465 | 466 | bool rev_extend = extender->extend(&rev, sam_fd, 1); 467 | 468 | if (fwd_extend or rev_extend) 469 | n_extended_reads++; 470 | n_reads++; 471 | 472 | free(rev.name.s); 473 | free(rev.comment.s); 474 | free(rev.seq.s); 475 | free(rev.qual.s); 476 | } 477 | 478 | verbose("Number of extended reads: ", n_extended_reads, "/", n_reads); 479 | kseq_destroy(seq); 480 | gzclose(fp); 481 | fclose(sam_fd); 482 | 483 | // sleep(5); 484 | 485 | return n_extended_reads; 486 | } 487 | 488 | #endif /* end of include guard: _READS_DISPATCHER_HH */ 489 | -------------------------------------------------------------------------------- /include/extender/extender_klib.hpp: -------------------------------------------------------------------------------- 1 | /* extender_klib - Extend the MEMs of the reads to the reference using the klib library for SW 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU 
General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file extender_klib.cpp 16 | \brief extender_klib.cpp Extend the MEMs of the reads to the reference using the klib library for SW 17 | \author Massimiliano Rossi 18 | \date 13/07/2020 19 | */ 20 | 21 | #ifndef _EXTENDER_KLIB_HH 22 | #define _EXTENDER_KLIB_HH 23 | 24 | #include 25 | 26 | #include 27 | 28 | #include 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include 39 | #include 40 | 41 | #include 42 | #include 43 | 44 | //////////////////////////////////////////////////////////////////////////////// 45 | /// SLP definitions 46 | //////////////////////////////////////////////////////////////////////////////// 47 | 48 | using SelSd = SelectSdvec<>; 49 | using DagcSd = DirectAccessibleGammaCode; 50 | using Fblc = FixedBitLenCode<>; 51 | 52 | using shaped_slp_t = SelfShapedSlp; 53 | using plain_slp_t = PlainSlp; 54 | 55 | template 56 | std::string get_slp_file_extension() 57 | { 58 | return std::string(".slp"); 59 | } 60 | 61 | template <> 62 | std::string get_slp_file_extension() 63 | { 64 | return std::string(".slp"); 65 | } 66 | 67 | template <> 68 | std::string get_slp_file_extension() 69 | { 70 | return std::string(".plain.slp"); 71 | } 72 | //////////////////////////////////////////////////////////////////////////////// 73 | 74 | template 75 | class extender 76 | { 77 | public: 78 | extender(std::string filename, 79 | size_t min_len_ = 50, 80 | 
bool forward_only_ = true) : min_len(min_len_), 81 | forward_only(forward_only_) 82 | { 83 | verbose("Loading the matching statistics index"); 84 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 85 | 86 | std::string filename_ms = filename + ms.get_file_extension(); 87 | 88 | ifstream fs_ms(filename_ms); 89 | ms.load(fs_ms); 90 | fs_ms.close(); 91 | 92 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 93 | 94 | verbose("Matching statistics index construction complete"); 95 | verbose("Memory peak: ", malloc_count_peak()); 96 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 97 | 98 | verbose("Loading random access"); 99 | t_insert_start = std::chrono::high_resolution_clock::now(); 100 | 101 | std::string filename_slp = filename + get_slp_file_extension(); 102 | 103 | ifstream fs(filename_slp); 104 | ra.load(fs); 105 | fs.close(); 106 | 107 | n = ra.getLen(); 108 | 109 | t_insert_end = std::chrono::high_resolution_clock::now(); 110 | 111 | verbose("Matching statistics index loading complete"); 112 | verbose("Memory peak: ", malloc_count_peak()); 113 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 114 | 115 | std::string filename_idx = filename + idx.get_file_extension(); 116 | verbose("Loading fasta index file: " + filename_idx); 117 | t_insert_start = std::chrono::high_resolution_clock::now(); 118 | 119 | 120 | ifstream fs_idx(filename_idx); 121 | idx.load(fs_idx); 122 | fs_idx.close(); 123 | 124 | t_insert_end = std::chrono::high_resolution_clock::now(); 125 | 126 | verbose("Fasta index loading complete"); 127 | verbose("Memory peak: ", malloc_count_peak()); 128 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 129 | 130 | 131 | verbose("Initialize the local aligner"); 132 | t_insert_start = 
std::chrono::high_resolution_clock::now(); 133 | 134 | if (minsc > 0xffff) 135 | minsc = 0xffff; 136 | xtra |= KSW_XSUBO | minsc; 137 | // initialize scoring matrix 138 | for (i = k = 0; i < 4; ++i) 139 | { 140 | for (j = 0; j < 4; ++j) 141 | mat[k++] = i == j ? sa : -sb; 142 | mat[k++] = 0; // ambiguous base 143 | } 144 | for (j = 0; j < 5; ++j) 145 | mat[k++] = 0; 146 | 147 | t_insert_end = std::chrono::high_resolution_clock::now(); 148 | 149 | verbose("Local aligner initialization complete"); 150 | verbose("Memory peak: ", malloc_count_peak()); 151 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 152 | 153 | verbose("Minimum MEM length: ", min_len); 154 | } 155 | 156 | bool extend(kseq_t *read, FILE *out, uint8_t strand) 157 | { 158 | size_t mem_pos = 0; 159 | size_t mem_len = 0; 160 | size_t mem_idx = 0; 161 | 162 | bool extended = false; 163 | 164 | auto pointers = ms.query(read->seq.s, read->seq.l); 165 | std::vector lengths(pointers.size()); 166 | size_t l = 0; 167 | for (size_t i = 0; i < pointers.size(); ++i) 168 | { 169 | size_t pos = pointers[i]; 170 | while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l)) 171 | ++l; 172 | 173 | lengths[i] = l; 174 | l = (l == 0 ? 0 : (l - 1)); 175 | 176 | // Update MEM 177 | if (lengths[i] > mem_len) 178 | { 179 | mem_len = lengths[i]; 180 | mem_pos = pointers[i]; 181 | mem_idx = i; 182 | } 183 | } 184 | 185 | // Align the read 186 | if (mem_len >= min_len) 187 | { 188 | char *str = (char *)malloc(400); 189 | 190 | int32_t maskLen = read->seq.l / 2; 191 | maskLen = maskLen < 15 ? 15 : maskLen; 192 | 193 | // Extract the context from the reference 194 | size_t left_occ = (mem_pos > 100 ? mem_pos - 100 : 0); 195 | size_t len = mem_len + 100 + (mem_pos > 100 ? 
100 : 100 - mem_pos); 196 | ra.expandSubstr(left_occ, len, str); 197 | 198 | size_t min_score = 20 + 8 * log(read->seq.l); 199 | 200 | uint8_t *seq = (uint8_t *)malloc(read->seq.l); 201 | // Convert A,C,G,T,N into 0,1,2,3,4 202 | for (i = 0; i < (int)read->seq.l; ++i) 203 | seq[i] = seq_nt4_table[(int)read->seq.s[i]]; 204 | // for (i = 0; i < (int)read->seq.l; ++i) 205 | // read->seq.s[i] = seq_nt4_table[(int)read->seq.s[i]]; 206 | 207 | for (i = 0; i < (int)len; ++i) 208 | str[i] = seq_nt4_table[(int)str[i]]; 209 | 210 | int score; 211 | 212 | kswq_t *q = 0; 213 | kswr_t r; 214 | 215 | r = ksw_align(read->seq.l, (uint8_t *)seq, len, (uint8_t *)str, 5, mat, gapo, gape, xtra, &q); 216 | // score = ksw_global(read->seq.l, (uint8_t *)read->seq.s, len, (uint8_t *)str, 5, mat, gapo, gape, w, &n_cigar, &cigar); 217 | 218 | int n_cigar; 219 | uint32_t *cigar; 220 | 221 | size_t new_seq_len = r.qe - r.qb; 222 | size_t new_ref_len = r.te - r.tb; 223 | uint8_t *new_seq = (uint8_t *)(seq + r.qb); 224 | // uint8_t *new_seq = (uint8_t *)(read->seq.s + r.qb); 225 | uint8_t *new_ref = (uint8_t *)(str + r.tb); 226 | 227 | score = ksw_global(new_seq_len, (uint8_t *)new_seq, new_ref_len, new_ref, 5, mat, gapo, gape, w, &n_cigar, &cigar); 228 | 229 | std::string cig; 230 | 231 | // for(size_t i = 0; i < n_cigar; ++i) 232 | // { 233 | // // for (i = 0; i < ez->n_cigar; ++i) 234 | // // printf("%d%c", ez->cigar[i] >> 4, "MID"[ez->cigar[i] & 0xf]); 235 | // cig += std::to_string(cigar[i] >> 4) + "MID"[cigar[i] & 0xf]; 236 | // } 237 | 238 | size_t mismatch = mark_mismatch(r.tb, r.qb, r.qe, (int8_t *)str, (int8_t *)seq, read->seq.l, &cigar, &n_cigar); 239 | for (c = 0; c < (n_cigar); ++c) 240 | { 241 | char letter = cigar_int_to_op(cigar[c]); 242 | uint32_t length = cigar_int_to_len(cigar[c]); 243 | // fprintf(out, "%lu%c", (unsigned long)length, letter); 244 | cig += std::to_string((unsigned long)length) + letter; 245 | } 246 | 247 | // if(r.score > 0) 248 | // 
printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", "human", r.tb, r.te + 1, read->name.s, r.qb, r.qe + 1, r.score, r.score2, r.te2); 249 | // // std::cout << "\rCurrent score... "<< r.score << std::flush; 250 | 251 | // // Declares a default Aligner 252 | // StripedSmithWaterman::Aligner aligner; 253 | // // Declares a default filter 254 | // StripedSmithWaterman::Filter filter; 255 | // // StripedSmithWaterman::Filter filter(true, true, min_score, 32767); 256 | // // Declares an alignment that stores the result 257 | // StripedSmithWaterman::Alignment alignment; 258 | // // Aligns the query to the ref 259 | // aligner.Align(read->seq.s, str, len, filter, &alignment, maskLen); 260 | 261 | // // Update alignment method 262 | r.tb += left_occ; 263 | r.te += left_occ; 264 | r.te2 += left_occ; 265 | 266 | if (r.score >= min_score) 267 | { 268 | ssw_write_sam(r, idx[r.tb].c_str(), read, strand, out, cig, mismatch); 269 | extended = true; 270 | } 271 | 272 | // extended_reads++; 273 | free(cigar); 274 | free(q); 275 | delete str; 276 | delete seq; 277 | } 278 | return extended; 279 | } 280 | 281 | size_t get_extended_reads() 282 | { 283 | return extended_reads; 284 | } 285 | 286 | // Adapted from SSW 287 | static void ssw_write_sam(kswr_t &a, 288 | const char *ref_seq_name, 289 | const kseq_t *read, 290 | int8_t strand, 291 | FILE *out, 292 | std::string cigar, 293 | size_t mismatches) // 0: forward aligned ; 1: reverse complement aligned 294 | { 295 | // Sam format output 296 | fprintf(out, "%s\t", read->name.s); 297 | if (a.score == 0) 298 | fprintf(out, "4\t*\t0\t255\t*\t*\t0\t0\t*\t*\n"); 299 | else 300 | { 301 | int32_t c, p; 302 | uint32_t mapq = -4.343 * log(1 - (double)abs(a.score - a.score2) / (double)a.score); 303 | mapq = (uint32_t)(mapq + 4.99); 304 | mapq = mapq < 254 ? mapq : 254; 305 | if (strand) 306 | fprintf(out, "16\t"); 307 | else 308 | fprintf(out, "0\t"); 309 | // TODO: Find the correct reference name. 
310 | fprintf(out, "%s\t%d\t%d\t", ref_seq_name, a.tb + 1, mapq); 311 | // size_t mismatch = mark_mismatch(a.tb, a.qb, a.qe, (int8_t*)ref, (int8_t*)read_, read->seq.l, cigar, cigarLen); 312 | // for (c = 0; c < (*cigarLen); ++c) 313 | // { 314 | // char letter = cigar_int_to_op((*cigar)[c]); 315 | // uint32_t length = cigar_int_to_len((*cigar)[c]); 316 | // fprintf(out, "%lu%c", (unsigned long)length, letter); 317 | // } 318 | // fprintf(out, "\t*\t"); 319 | // fprintf(out, "%s", a.cigar_string.c_str()); 320 | fprintf(out, "%s", cigar.c_str()); 321 | fprintf(out, "\t*\t0\t0\t"); 322 | fprintf(out, "%s", read->seq.s); 323 | fprintf(out, "\t"); 324 | if (read->qual.s && strand) 325 | { 326 | for (p = read->qual.l - 1; p >= 0; --p) 327 | fprintf(out, "%c", read->qual.s[p]); 328 | } 329 | else if (read->qual.s) 330 | fprintf(out, "%s", read->qual.s); 331 | else 332 | fprintf(out, "*"); 333 | fprintf(out, "\tAS:i:%d", a.score); 334 | fprintf(out, "\tNM:i:%d\t", mismatches); 335 | // fprintf(out, "\tNM:i:%d\t", a.mismatches); 336 | if (a.score2 > 0) 337 | fprintf(out, "ZS:i:%d\n", a.score2); 338 | else 339 | fprintf(out, "\n"); 340 | } 341 | } 342 | 343 | std::string to_sam() 344 | { 345 | std::string res = "@HD\tVN:1.6\tSO:unknown\n"; 346 | res += idx.to_sam(); 347 | res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n"; 348 | return res; 349 | } 350 | 351 | protected: 352 | ms_pointers<> ms; 353 | slp_t ra; 354 | seqidx idx; 355 | 356 | size_t min_len = 0; 357 | size_t extended_reads = 0; 358 | size_t n = 0; 359 | 360 | unsigned char seq_nt4_table[256] = { 361 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 362 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 363 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 364 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 365 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 366 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 367 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 368 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
369 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 370 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 371 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 372 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 373 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 374 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 375 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 376 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 377 | 378 | int c, sa = 2, sb = 2, i, j, k, max_rseq = 0; 379 | int w = 4000; 380 | int8_t mat[25]; 381 | int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; 382 | uint8_t *rseq = 0; 383 | 384 | bool forward_only; 385 | }; 386 | 387 | #endif /* end of include guard: _EXTENDER_KLIB_HH */ 388 | -------------------------------------------------------------------------------- /include/ms/ms_pointers.hpp: -------------------------------------------------------------------------------- 1 | /* ms_pointers - Computes the matching statistics pointers from BWT and Thresholds 2 | Copyright (C) 2020 Massimiliano Rossi 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see http://www.gnu.org/licenses/ . 16 | */ 17 | /*! 18 | \file ms_pointers.hpp 19 | \brief ms_pointers.hpp Computes the matching statistics pointers from BWT and Thresholds. 
20 | \author Massimiliano Rossi 21 | \date 09/07/2020 22 | */ 23 | 24 | #ifndef _MS_POINTERS_HH 25 | #define _MS_POINTERS_HH 26 | 27 | #include 28 | 29 | #include 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #include 37 | #include 38 | 39 | template > 42 | class ms_pointers : ri::r_index 43 | { 44 | public: 45 | thresholds_t thresholds; 46 | 47 | // std::vector samples_start; 48 | int_vector<> samples_start; 49 | // int_vector<> samples_end; 50 | // std::vector samples_last; 51 | 52 | // static const uchar TERMINATOR = 1; 53 | // bool sais = true; 54 | // /* 55 | // * sparse RLBWT: r (log sigma + (1+epsilon) * log (n/r)) (1+o(1)) bits 56 | // */ 57 | // //F column of the BWT (vector of 256 elements) 58 | // std::vector F; 59 | // //L column of the BWT, run-length compressed 60 | // rle_string_t bwt; 61 | // ulint terminator_position = 0; 62 | // ulint r = 0; //number of BWT runs 63 | 64 | typedef size_t size_type; 65 | 66 | ms_pointers() {} 67 | 68 | ms_pointers(std::string filename, bool rle = false) : ri::r_index() 69 | { 70 | verbose("Building the r-index from BWT"); 71 | 72 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 73 | 74 | std::string bwt_fname = filename + ".bwt"; 75 | 76 | verbose("RLE encoding BWT and computing SA samples"); 77 | 78 | if (rle) 79 | { 80 | std::string bwt_heads_fname = bwt_fname + ".heads"; 81 | std::ifstream ifs_heads(bwt_heads_fname); 82 | std::string bwt_len_fname = bwt_fname + ".len"; 83 | std::ifstream ifs_len(bwt_len_fname); 84 | this->bwt = rle_string_t(ifs_heads, ifs_len); 85 | 86 | ifs_heads.seekg(0); 87 | ifs_len.seekg(0); 88 | this->build_F_(ifs_heads, ifs_len); 89 | } 90 | else 91 | { 92 | std::ifstream ifs(bwt_fname); 93 | this->bwt = rle_string_t(ifs); 94 | 95 | ifs.seekg(0); 96 | this->build_F(ifs); 97 | } 98 | // std::string istring; 99 | // read_file(bwt_fname.c_str(), istring); 100 | // for(size_t i = 0; i < istring.size(); ++i) 101 | // 
if(istring[i]==0) 102 | // istring[i] = TERMINATOR; 103 | // this->bwt = rle_string_t(istring); 104 | 105 | this->r = this->bwt.number_of_runs(); 106 | ri::ulint n = this->bwt.size(); 107 | int log_r = bitsize(uint64_t(this->r)); 108 | int log_n = bitsize(uint64_t(this->bwt.size())); 109 | 110 | verbose("Number of BWT equal-letter runs: r = ", this->r); 111 | verbose("Rate n/r = ", double(this->bwt.size()) / this->r); 112 | verbose("log2(r) = ", log2(double(this->r))); 113 | verbose("log2(n/r) = ", log2(double(this->bwt.size()) / this->r)); 114 | 115 | // this->build_F(istring); 116 | // istring.clear(); 117 | // istring.shrink_to_fit(); 118 | 119 | read_samples(filename + ".ssa", this->r, n, samples_start); 120 | read_samples(filename + ".esa", this->r, n, this->samples_last); 121 | 122 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 123 | 124 | verbose("R-index construction complete"); 125 | verbose("Memory peak: ", malloc_count_peak()); 126 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 127 | 128 | verbose("Reading thresholds from file"); 129 | 130 | t_insert_start = std::chrono::high_resolution_clock::now(); 131 | 132 | thresholds = thresholds_t(filename,&this->bwt); 133 | 134 | // std::string tmp_filename = filename + std::string(".thr_pos"); 135 | 136 | // struct stat filestat; 137 | // FILE *fd; 138 | 139 | // if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 140 | // error("open() file " + tmp_filename + " failed"); 141 | 142 | // int fn = fileno(fd); 143 | // if (fstat(fn, &filestat) < 0) 144 | // error("stat() file " + tmp_filename + " failed"); 145 | 146 | // if (filestat.st_size % THRBYTES != 0) 147 | // error("invilid file " + tmp_filename); 148 | 149 | // size_t length = filestat.st_size / THRBYTES; 150 | // thresholds.resize(length); 151 | 152 | // for (size_t i = 0; i < length; ++i) 153 | // if ((fread(&thresholds[i], THRBYTES, 1, fd)) 
!= 1) 154 | // error("fread() file " + tmp_filename + " failed"); 155 | 156 | // fclose(fd); 157 | 158 | t_insert_end = std::chrono::high_resolution_clock::now(); 159 | 160 | verbose("Memory peak: ", malloc_count_peak()); 161 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 162 | } 163 | 164 | void read_samples(std::string filename, ulint r, ulint n, int_vector<> &samples) 165 | { 166 | int log_n = bitsize(uint64_t(n)); 167 | 168 | struct stat filestat; 169 | FILE *fd; 170 | 171 | if ((fd = fopen(filename.c_str(), "r")) == nullptr) 172 | error("open() file " + filename + " failed"); 173 | 174 | int fn = fileno(fd); 175 | if (fstat(fn, &filestat) < 0) 176 | error("stat() file " + filename + " failed"); 177 | 178 | if (filestat.st_size % SSABYTES != 0) 179 | error("invilid file " + filename); 180 | 181 | size_t length = filestat.st_size / (2 * SSABYTES); 182 | //Check that the length of the file is 2*r elements of 5 bytes 183 | assert(length == r); 184 | 185 | // Create the vector 186 | samples = int_vector<>(r, 0, log_n); 187 | 188 | // Read the vector 189 | uint64_t left = 0; 190 | uint64_t right = 0; 191 | size_t i = 0; 192 | while (fread((char *)&left, SSABYTES, 1, fd) && fread((char *)&right, SSABYTES, 1, fd)) 193 | { 194 | ulint val = (right ? 
right - 1 : n - 1); 195 | assert(bitsize(uint64_t(val)) <= log_n); 196 | samples[i++] = val; 197 | } 198 | 199 | fclose(fd); 200 | } 201 | 202 | vector build_F_(std::ifstream &heads, std::ifstream &lengths) 203 | { 204 | heads.clear(); 205 | heads.seekg(0); 206 | lengths.clear(); 207 | lengths.seekg(0); 208 | 209 | this->F = vector(256, 0); 210 | int c; 211 | ulint i = 0; 212 | while ((c = heads.get()) != EOF) 213 | { 214 | size_t length = 0; 215 | lengths.read((char *)&length, 5); 216 | if (c > TERMINATOR) 217 | this->F[c] += length; 218 | else 219 | { 220 | this->F[TERMINATOR] += length; 221 | this->terminator_position = i; 222 | } 223 | i++; 224 | } 225 | for (ulint i = 255; i > 0; --i) 226 | this->F[i] = this->F[i - 1]; 227 | this->F[0] = 0; 228 | for (ulint i = 1; i < 256; ++i) 229 | this->F[i] += this->F[i - 1]; 230 | return this->F; 231 | } 232 | 233 | // Computes the matching statistics pointers for the given pattern 234 | std::vector query(const std::vector &pattern) 235 | { 236 | size_t m = pattern.size(); 237 | 238 | return _query(pattern.data(), m); 239 | } 240 | 241 | std::vector query(const char* pattern, const size_t m) 242 | { 243 | return _query(pattern, m); 244 | } 245 | 246 | void print_stats() 247 | { 248 | sdsl::nullstream ns; 249 | 250 | verbose("Memory consumption (bytes)."); 251 | verbose(" terminator_position: ", sizeof(this->terminator_position)); 252 | verbose(" F: ", my_serialize(this->F, ns)); 253 | verbose(" bwt: ", this->bwt.serialize(ns)); 254 | verbose(" samples_last: ", this->samples_last.serialize(ns)); 255 | verbose(" thresholds: ", thresholds.serialize(ns)); 256 | verbose(" samples_start: ", samples_start.serialize(ns)); 257 | } 258 | 259 | /* 260 | * \param i position in the BWT 261 | * \param c character 262 | * \return lexicographic rank of cw in bwt 263 | */ 264 | ulint LF(ri::ulint i, ri::uchar c) 265 | { 266 | // //if character does not appear in the text, return empty pair 267 | // if ((c == 255 and this->F[c] == 
this->bwt_size()) || this->F[c] >= this->F[c + 1]) 268 | // return {1, 0}; 269 | //number of c before the interval 270 | ri::ulint c_before = this->bwt.rank(i, c); 271 | // number of c inside the interval rn 272 | ri::ulint l = this->F[c] + c_before; 273 | return l; 274 | } 275 | 276 | /* serialize the structure to the ostream 277 | * \param out the ostream 278 | */ 279 | size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 280 | { 281 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 282 | size_type written_bytes = 0; 283 | 284 | out.write((char *)&this->terminator_position, sizeof(this->terminator_position)); 285 | written_bytes += sizeof(this->terminator_position); 286 | written_bytes += my_serialize(this->F, out, child, "F"); 287 | written_bytes += this->bwt.serialize(out); 288 | written_bytes += this->samples_last.serialize(out); 289 | 290 | written_bytes += thresholds.serialize(out, child, "thresholds"); 291 | // written_bytes += my_serialize(thresholds, out, child, "thresholds"); 292 | // written_bytes += my_serialize(samples_start, out, child, "samples_start"); 293 | written_bytes += samples_start.serialize(out, child, "samples_start"); 294 | 295 | sdsl::structure_tree::add_size(child, written_bytes); 296 | return written_bytes; 297 | } 298 | 299 | std::string get_file_extension() const 300 | { 301 | return thresholds.get_file_extension() + ".ms"; 302 | } 303 | 304 | /* load the structure from the istream 305 | * \param in the istream 306 | */ 307 | void load(std::istream &in) 308 | { 309 | 310 | in.read((char *)&this->terminator_position, sizeof(this->terminator_position)); 311 | my_load(this->F, in); 312 | this->bwt.load(in); 313 | this->r = this->bwt.number_of_runs(); 314 | this->samples_last.load(in); 315 | 316 | thresholds.load(in,&this->bwt); 317 | // my_load(thresholds, in); 318 | samples_start.load(in); 319 | // 
my_load(samples_start,in); 320 | } 321 | 322 | // // From r-index 323 | // ulint get_last_run_sample() 324 | // { 325 | // return (samples_last[r - 1] + 1) % bwt.size(); 326 | // } 327 | 328 | protected: 329 | // Computes the matching statistics pointers for the given pattern 330 | template 331 | std::vector _query(const string_t &pattern, const size_t m) 332 | { 333 | 334 | std::vector ms_pointers(m); 335 | 336 | // Start with the empty string 337 | auto pos = this->bwt_size() - 1; 338 | auto sample = this->get_last_run_sample(); 339 | 340 | for (size_t i = 0; i < m; ++i) 341 | { 342 | auto c = pattern[m - i - 1]; 343 | 344 | if (this->bwt.number_of_letter(c) == 0) 345 | { 346 | sample = 0; 347 | } 348 | else if (pos < this->bwt.size() && this->bwt[pos] == c) 349 | { 350 | sample--; 351 | } 352 | else 353 | { 354 | // Get threshold 355 | ri::ulint rnk = this->bwt.rank(pos, c); 356 | size_t thr = this->bwt.size() + 1; 357 | 358 | ulint next_pos = pos; 359 | 360 | // if (rnk < (this->F[c] - this->F[c-1]) // I can use F to compute it 361 | if (rnk < this->bwt.number_of_letter(c)) 362 | { 363 | // j is the first position of the next run of c's 364 | ri::ulint j = this->bwt.select(rnk, c); 365 | ri::ulint run_of_j = this->bwt.run_of_position(j); 366 | 367 | thr = thresholds[run_of_j]; // If it is the first run thr = 0 368 | 369 | // Here we should use Phi_inv that is not implemented yet 370 | // sample = this->Phi(this->samples_last[run_of_j - 1]) - 1; 371 | sample = samples_start[run_of_j]; 372 | 373 | next_pos = j; 374 | } 375 | 376 | if (pos < thr) 377 | { 378 | 379 | rnk--; 380 | ri::ulint j = this->bwt.select(rnk, c); 381 | ri::ulint run_of_j = this->bwt.run_of_position(j); 382 | sample = this->samples_last[run_of_j]; 383 | 384 | next_pos = j; 385 | } 386 | 387 | pos = next_pos; 388 | } 389 | 390 | ms_pointers[m - i - 1] = sample; 391 | 392 | // Perform one backward step 393 | pos = LF(pos, c); 394 | } 395 | 396 | return ms_pointers; 397 | } 398 | // // From 
r-index 399 | // vector build_F(std::ifstream &ifs) 400 | // { 401 | // ifs.clear(); 402 | // ifs.seekg(0); 403 | // F = vector(256, 0); 404 | // uchar c; 405 | // ulint i = 0; 406 | // while (ifs >> c) 407 | // { 408 | // if (c > TERMINATOR) 409 | // F[c]++; 410 | // else 411 | // { 412 | // F[TERMINATOR]++; 413 | // terminator_position = i; 414 | // } 415 | // i++; 416 | // } 417 | // for (ulint i = 255; i > 0; --i) 418 | // F[i] = F[i - 1]; 419 | // F[0] = 0; 420 | // for (ulint i = 1; i < 256; ++i) 421 | // F[i] += F[i - 1]; 422 | // return F; 423 | // } 424 | 425 | // // From r-index 426 | // vector> &read_run_starts(std::string fname, ulint n, vector> &ssa) 427 | // { 428 | // ssa.clear(); 429 | // std::ifstream ifs(fname); 430 | // uint64_t x = 0; 431 | // uint64_t y = 0; 432 | // uint64_t i = 0; 433 | // while (ifs.read((char *)&x, 5) && ifs.read((char *)&y, 5)) 434 | // { 435 | // ssa.push_back(pair(y ? y - 1 : n - 1, i)); 436 | // i++; 437 | // } 438 | // return ssa; 439 | // } 440 | 441 | // // From r-index 442 | // vector &read_run_ends(std::string fname, ulint n, vector &esa) 443 | // { 444 | // esa.clear(); 445 | // std::ifstream ifs(fname); 446 | // uint64_t x = 0; 447 | // uint64_t y = 0; 448 | // while (ifs.read((char *)&x, 5) && ifs.read((char *)&y, 5)) 449 | // { 450 | // esa.push_back(y ? 
y - 1 : n - 1); 451 | // } 452 | // return esa; 453 | // } 454 | }; 455 | 456 | // Computes the matching statistics pointers for the given pattern 457 | template <> 458 | template 459 | std::vector ms_pointers>::_query(const string_t &pattern, const size_t m) 460 | { 461 | 462 | std::vector ms_pointers(m); 463 | 464 | // Start with the empty string 465 | auto pos = this->bwt_size() - 1; 466 | auto sample = this->get_last_run_sample(); 467 | 468 | for (size_t i = 0; i < m; ++i) 469 | { 470 | auto c = pattern[m - i - 1]; 471 | const auto n_c = this->bwt.number_of_letter(c); 472 | if (n_c == 0) 473 | { 474 | sample = 0; 475 | // Perform one backward step 476 | pos = LF(pos, c); 477 | } 478 | else if (pos < this->bwt.size() && this->bwt[pos] == c) 479 | { 480 | sample--; 481 | // Perform one backward step 482 | pos = LF(pos, c); 483 | } 484 | else 485 | { 486 | // Get threshold 487 | ri::ulint run_of_pos = this->bwt.run_of_position(pos); 488 | auto rnk_c = this->bwt.run_and_head_rank(run_of_pos, c); 489 | size_t thr_c = thresholds.rank(pos + 1, c); // +1 because the rank count the thresiold in pos 490 | 491 | if (rnk_c.first > thr_c) 492 | { 493 | // Jump up 494 | size_t run_of_j = this->bwt.run_head_select(rnk_c.first, c); 495 | sample = samples_last[run_of_j]; 496 | // Perform one backward step 497 | pos = this->F[c] + rnk_c.second - 1; 498 | } 499 | else 500 | { 501 | // Jump down 502 | size_t run_of_j = this->bwt.run_head_select(rnk_c.first + 1, c); 503 | sample = samples_start[run_of_j]; 504 | // Perform one backward step 505 | pos = this->F[c] + rnk_c.second; 506 | } 507 | } 508 | // Store the sample 509 | ms_pointers[m - i - 1] = sample; 510 | } 511 | 512 | return ms_pointers; 513 | } 514 | 515 | #endif /* end of include guard: _MS_POINTERS_HH */ 516 | -------------------------------------------------------------------------------- /include/ms/ms_rle_string.hpp: -------------------------------------------------------------------------------- 1 | /* ms_rle_string 
- Extension of the r-index rle_string to compute matching statistics 2 | Copyright (C) 2020 Massimiliano Rossi 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see http://www.gnu.org/licenses/ . 16 | */ 17 | /*! 18 | \file ms_rle_string.hpp 19 | \brief ms_rle_string.hpp Extension of the r-index rle_string to compute matching statistics. 20 | \author Massimiliano Rossi 21 | \date 10/07/2020 22 | */ 23 | 24 | #ifndef _MS_RLE_STRING_HH 25 | #define _MS_RLE_STRING_HH 26 | 27 | #include 28 | 29 | #include 30 | 31 | template < 32 | class sparse_bitvector_t = ri::sparse_sd_vector, //predecessor structure storing run length 33 | class string_t = ri::huff_string //run heads 34 | > 35 | class ms_rle_string : public ri::rle_string 36 | { 37 | public: 38 | ms_rle_string() : ri::rle_string() 39 | { 40 | //NtD 41 | } 42 | 43 | /* 44 | * constructor: build structure on the input string 45 | * \param input the input string without 0x0 bytes in it. 46 | * \param B block size. 
The main sparse bitvector has R/B bits set (R being number of runs) 47 | * 48 | */ 49 | ms_rle_string(string &input, ulint B = 2) : ri::rle_string(input, B) 50 | { 51 | // NtD 52 | } 53 | 54 | ms_rle_string(std::ifstream &ifs, ulint B = 2) : ri::rle_string(ifs, B) 55 | { 56 | } 57 | 58 | // Construction from run-length encoded BWT 59 | ms_rle_string(std::ifstream &heads, std::ifstream &lengths, ulint B = 2) 60 | { 61 | // build_rlbwt(heads,lengths,B); 62 | heads.clear(); 63 | heads.seekg(0); 64 | lengths.clear(); 65 | lengths.seekg(0); 66 | // assert(not contains0(input)); // We're hacking the 0 away :) 67 | this->B = B; 68 | // n = input.size(); 69 | auto runs_per_letter_bv = vector>(256); 70 | //runs in main bitvector 71 | vector runs_bv; 72 | 73 | // Reads the run heads 74 | string run_heads_s; 75 | heads.seekg(0, heads.end); 76 | run_heads_s.resize(heads.tellg()); 77 | heads.seekg(0, heads.beg); 78 | heads.read(&run_heads_s[0], run_heads_s.size()); 79 | 80 | size_t pos = 0; 81 | this->n = 0; 82 | this->R = run_heads_s.size(); 83 | // Compute runs_bv and runs_per_letter_bv 84 | for (size_t i = 0; i < run_heads_s.size(); ++i) 85 | { 86 | size_t length; 87 | lengths.read((char *)&length, 5); 88 | if (run_heads_s[i] <= TERMINATOR) // change 0 to 1 89 | run_heads_s[i] = TERMINATOR; 90 | 91 | std::fill_n(std::back_inserter(runs_bv), length - 1, false); 92 | runs_bv.push_back(i % B == B - 1); 93 | 94 | std::fill_n(std::back_inserter(runs_per_letter_bv[run_heads_s[i]]), length - 1, false); 95 | runs_per_letter_bv[run_heads_s[i]].push_back(true); 96 | 97 | this->n += length; 98 | } 99 | // runs_bv.push_back(false); 100 | 101 | //now compact structures 102 | assert(runs_bv.size() == this->n); 103 | ulint t = 0; 104 | for (ulint i = 0; i < 256; ++i) 105 | t += runs_per_letter_bv[i].size(); 106 | assert(t == this->n); 107 | this->runs = sparse_bitvector_t(runs_bv); 108 | //a fast direct array: char -> bitvector. 
109 | this->runs_per_letter = vector(256); 110 | for (ulint i = 0; i < 256; ++i) 111 | this->runs_per_letter[i] = sparse_bitvector_t(runs_per_letter_bv[i]); 112 | this->run_heads = string_t(run_heads_s); 113 | assert(this->run_heads.size() == this->R); 114 | } 115 | 116 | size_t number_of_runs_of_letter(uint8_t c) 117 | { 118 | return this->runs_per_letter[c].number_of_1(); 119 | } 120 | 121 | size_t number_of_letter(uint8_t c) 122 | { 123 | return this->runs_per_letter[c].size(); 124 | } 125 | 126 | // i-th run head 127 | uint8_t head_of(const size_t i) 128 | { 129 | assert(iR); 130 | return this->run_heads[i]; 131 | } 132 | 133 | // rank in chracters of the i-th run head 134 | // i.e., the number of characters c before the first character of the run. 135 | size_t head_rank(const size_t i, const uint8_t c) 136 | { 137 | assert(i < this->R); 138 | size_t j = this->run_heads.rank(i, c); 139 | if(j < 1) 140 | return j; 141 | assert(j<=i); 142 | return this->runs_per_letter[c].select(j-1) + 1; // j-1 because the select is 0 based 143 | } 144 | // number of runs of character c in in position i 145 | size_t run_head_rank(const size_t i, const uint8_t c) 146 | { 147 | assert(i < this->R); 148 | size_t j = this->run_heads.rank(i, c); 149 | return j; 150 | } 151 | 152 | inline std::pair run_and_head_rank(const size_t i, const uint8_t c) 153 | { 154 | assert(i < this->R); 155 | const size_t j = this->run_heads.rank(i, c); 156 | if( j < 1) 157 | return make_pair(j,j); 158 | const size_t k = this->runs_per_letter[c].select(j - 1) + 1; // j-1 because the select is 0 based 159 | return make_pair(j, k); 160 | } 161 | 162 | // Select the i-th run of c 163 | size_t run_head_select(const size_t i, const uint8_t c) 164 | { 165 | assert(i < this->R and i > 0); 166 | return this->run_heads.select(i - 1, c); 167 | } 168 | 169 | /* serialize the structure to the ostream 170 | * \param out the ostream 171 | */ 172 | ulint serialize(std::ostream &out) 173 | { 174 | return 
ri::rle_string::serialize(out); 175 | } 176 | 177 | /* load the structure from the istream 178 | * \param in the istream 179 | */ 180 | void load(std::istream &in) 181 | { 182 | ri::rle_string::load(in); 183 | } 184 | 185 | protected: 186 | void build_rlbwt(std::ifstream &heads, std::ifstream &lengths, ulint B) 187 | { 188 | heads.clear(); 189 | heads.seekg(0); 190 | lengths.clear(); 191 | lengths.seekg(0); 192 | // assert(not contains0(input)); // We're hacking the 0 away :) 193 | this->B = B; 194 | // n = input.size(); 195 | auto runs_per_letter_bv = vector>(256); 196 | //runs in main bitvector 197 | vector runs_bv; 198 | 199 | // Reads the run heads 200 | string run_heads_s; 201 | heads.seekg(0, heads.end); 202 | run_heads_s.resize(heads.tellg()); 203 | heads.seekg(0, heads.beg); 204 | heads.read(&run_heads_s[0], run_heads_s.size()); 205 | 206 | size_t pos = 0; 207 | this->n = 0; 208 | this->R = run_heads_s.size(); 209 | // Compute runs_bv and runs_per_letter_bv 210 | for (size_t i = 0; i < run_heads_s.size(); ++i) 211 | { 212 | size_t length = 0; 213 | lengths.read((char *)&length, 5); 214 | if (run_heads_s[i] <= TERMINATOR) // change 0 to 1 215 | run_heads_s[i] = TERMINATOR; 216 | 217 | std::fill_n(std::back_inserter(runs_bv), length - 1, false); 218 | runs_bv.push_back(i % B == B - 1); 219 | 220 | std::fill_n(std::back_inserter(runs_per_letter_bv[run_heads_s[i]]), length - 1, false); 221 | runs_per_letter_bv[run_heads_s[i]].push_back(true); 222 | 223 | this->n += length; 224 | } 225 | // runs_bv.push_back(false); 226 | 227 | //now compact structures 228 | assert(runs_bv.size() == this->n); 229 | ulint t = 0; 230 | for (ulint i = 0; i < 256; ++i) 231 | t += runs_per_letter_bv[i].size(); 232 | assert(t == this->n); 233 | this->runs = sparse_bitvector_t(runs_bv); 234 | //a fast direct array: char -> bitvector. 
235 | this->runs_per_letter = vector(256); 236 | for (ulint i = 0; i < 256; ++i) 237 | this->runs_per_letter[i] = sparse_bitvector_t(runs_per_letter_bv[i]); 238 | this->run_heads = string_t(run_heads_s); 239 | assert(this->run_heads.size() == this->R); 240 | } 241 | private: 242 | }; 243 | 244 | // Construction from run-length encoded BWT specialization for sparse_sd_vector 245 | template <> 246 | ms_rle_string::ms_rle_string(std::ifstream &heads, std::ifstream &lengths, ulint B) 247 | { 248 | heads.clear(); 249 | heads.seekg(0); 250 | lengths.clear(); 251 | lengths.seekg(0); 252 | // assert(not contains0(input)); // We're hacking the 0 away :) 253 | this->B = B; 254 | // n = input.size(); 255 | 256 | // Reads the run heads 257 | string run_heads_s; 258 | heads.seekg(0, heads.end); 259 | run_heads_s.resize(heads.tellg()); 260 | heads.seekg(0, heads.beg); 261 | heads.read(&run_heads_s[0], run_heads_s.size()); 262 | 263 | size_t pos = 0; 264 | this->n = 0; 265 | this->R = run_heads_s.size(); 266 | 267 | auto runs_per_letter_bv = vector> (256); 268 | auto runs_per_letter_bv_i = vector (256,0); 269 | //runs in main bitvector 270 | vector runs_bv_onset; 271 | size_t runs_bv_i = 0; 272 | // Compute runs_bv and runs_per_letter_bv 273 | for (size_t i = 0; i < run_heads_s.size(); ++i) 274 | { 275 | size_t length = 0; 276 | lengths.read((char *)&length, 5); 277 | 278 | uint8_t curr_ch = unsigned(run_heads_s[i]); 279 | if (curr_ch <= TERMINATOR) { // change 0 to 1 280 | run_heads_s[i] = TERMINATOR; 281 | curr_ch = TERMINATOR; 282 | } 283 | 284 | if(i % B == B - 1) 285 | runs_bv_onset.push_back(this->n + length - 1); 286 | 287 | assert(length > 0); 288 | runs_per_letter_bv_i[curr_ch] += length; 289 | runs_per_letter_bv[curr_ch].push_back(runs_per_letter_bv_i[curr_ch] - 1); 290 | 291 | this->n += length; 292 | } 293 | // runs_bv.push_back(false); 294 | 295 | //now compact structures 296 | ulint t = 0; 297 | for (ulint i = 0; i < 256; ++i) 298 | t += runs_per_letter_bv_i[i]; 299 
| assert(t == this->n); 300 | this->runs = ri::sparse_sd_vector(runs_bv_onset, this->n); 301 | //a fast direct array: char -> bitvector. 302 | this->runs_per_letter = vector(256); 303 | for (ulint i = 0; i < 256; ++i) 304 | this->runs_per_letter[i] = ri::sparse_sd_vector(runs_per_letter_bv[i],runs_per_letter_bv_i[i]); 305 | this->run_heads = ri::huff_string(run_heads_s); 306 | assert(this->run_heads.size() == this->R); 307 | }; 308 | 309 | typedef ms_rle_string ms_rle_string_sd; 310 | typedef ms_rle_string ms_rle_string_hyb; 311 | 312 | #endif /* end of include guard: _MS_RLE_STRING_HH */ 313 | -------------------------------------------------------------------------------- /include/ms/thresholds_ds.hpp: -------------------------------------------------------------------------------- 1 | /* thresholds_ds - Stores the thresholds in compressed and plain ways 2 | Copyright (C) 2020 Massimiliano Rossi 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see http://www.gnu.org/licenses/ . 16 | */ 17 | /*! 18 | \file thresholds_ds.hpp 19 | \brief thresholds_ds.hpp Stores the thresholds in compressed and plain ways. 
20 | \author Massimiliano Rossi 21 | \date 09/07/2020 22 | */ 23 | 24 | #ifndef _MS_THRESHOLDS_DS_HH 25 | #define _MS_THRESHOLDS_DS_HH 26 | 27 | #include 28 | 29 | #include 30 | 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | template 37 | class thr_plain 38 | { 39 | public: 40 | int_vector<> thresholds; 41 | rle_string_t *bwt; 42 | 43 | typedef size_t size_type; 44 | 45 | thr_plain() 46 | { 47 | bwt=nullptr; 48 | } 49 | 50 | thr_plain(std::string filename, rle_string_t* bwt_):bwt(bwt_) 51 | { 52 | int log_n = bitsize(uint64_t(bwt->size())); 53 | 54 | verbose("Reading thresholds from file"); 55 | 56 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 57 | 58 | std::string tmp_filename = filename + std::string(".thr_pos"); 59 | 60 | struct stat filestat; 61 | FILE *fd; 62 | 63 | if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 64 | error("open() file " + tmp_filename + " failed"); 65 | 66 | int fn = fileno(fd); 67 | if (fstat(fn, &filestat) < 0) 68 | error("stat() file " + tmp_filename + " failed"); 69 | 70 | if (filestat.st_size % THRBYTES != 0) 71 | error("invilid file " + tmp_filename); 72 | 73 | size_t length = filestat.st_size / THRBYTES; 74 | size_t threshold = 0; 75 | 76 | thresholds = int_vector<>(length, 0, log_n); 77 | 78 | for (size_t i = 0; i < length; ++i) 79 | { 80 | size_t threshold = 0; 81 | if ((fread(&threshold, THRBYTES, 1, fd)) != 1) 82 | error("fread() file " + tmp_filename + " failed"); 83 | thresholds[i] = threshold; 84 | } 85 | 86 | fclose(fd); 87 | 88 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 89 | 90 | verbose("Memory peak: ", malloc_count_peak()); 91 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 92 | } 93 | 94 | // Destructor 95 | ~thr_plain() 96 | { 97 | // NtD 98 | } 99 | 100 | // Copy constructor 101 | thr_plain(const thr_plain &other) 102 | 
:thresholds(other.thresholds), 103 | bwt(other.bwt) 104 | { 105 | } 106 | 107 | friend void swap(thr_plain &first, thr_plain &second) // nothrow 108 | { 109 | using std::swap; 110 | 111 | swap(first.thresholds, second.thresholds); 112 | swap(first.bwt, second.bwt); 113 | } 114 | 115 | // Copy assignment 116 | thr_plain &operator=(thr_plain other) 117 | { 118 | swap(*this,other); 119 | 120 | return *this; 121 | } 122 | 123 | // Move constructor 124 | thr_plain(thr_plain &&other) noexcept 125 | : thr_plain() 126 | { 127 | swap(*this, other); 128 | } 129 | 130 | size_t operator[] (size_t& i) 131 | { 132 | assert( i < thresholds.size()); 133 | return thresholds[i]; 134 | } 135 | 136 | /* serialize the structure to the ostream 137 | * \param out the ostream 138 | */ 139 | size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 140 | { 141 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 142 | size_type written_bytes = 0; 143 | 144 | written_bytes += thresholds.serialize(out, child, "thresholds"); 145 | 146 | sdsl::structure_tree::add_size(child, written_bytes); 147 | return written_bytes; 148 | } 149 | 150 | /* load the structure from the istream 151 | * \param in the istream 152 | */ 153 | void load(std::istream &in, rle_string_t *bwt_) 154 | { 155 | thresholds.load(in); 156 | bwt = bwt_; 157 | } 158 | 159 | std::string get_file_extension() const 160 | { 161 | return ".thrp"; 162 | } 163 | }; 164 | 165 | template 166 | class thr_compressed 167 | { 168 | public: 169 | int_vector<> thresholds; 170 | rle_string_t *bwt; 171 | long long min_off; 172 | 173 | typedef size_t size_type; 174 | 175 | thr_compressed() 176 | { 177 | bwt=nullptr; 178 | } 179 | 180 | thr_compressed(std::string filename, rle_string_t* bwt_):bwt(bwt_) 181 | { 182 | int log_n = bitsize(uint64_t(bwt->size())); 183 | size_t n = uint64_t(bwt->size()); 184 | 185 | verbose("Reading thresholds 
from file"); 186 | 187 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 188 | 189 | std::string tmp_filename = filename + std::string(".thr_pos"); 190 | 191 | struct stat filestat; 192 | FILE *fd; 193 | 194 | if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 195 | error("open() file " + tmp_filename + " failed"); 196 | 197 | int fn = fileno(fd); 198 | if (fstat(fn, &filestat) < 0) 199 | error("stat() file " + tmp_filename + " failed"); 200 | 201 | if (filestat.st_size % THRBYTES != 0) 202 | error("invilid file " + tmp_filename); 203 | 204 | size_t length = filestat.st_size / THRBYTES; 205 | 206 | size_t pos = 0; 207 | 208 | long long max_off = 0; 209 | min_off = n; 210 | 211 | for (size_t i = 0; i < length; ++i) 212 | { 213 | size_t threshold = 0; 214 | if ((fread(&threshold, THRBYTES, 1, fd)) != 1) 215 | error("fread() file " + tmp_filename + " failed"); 216 | 217 | long long off = 0; 218 | 219 | if (threshold > 0) 220 | { 221 | uint8_t c = bwt->head_of(i); 222 | size_t pred = bwt->select(bwt->rank(pos - 1, c) - 1, c); 223 | size_t mid_int = (pos - pred + 1) >> 1; 224 | assert(threshold > pred); 225 | 226 | threshold = threshold - pred; 227 | 228 | off = mid_int - threshold; 229 | 230 | max_off = max(max_off, off); 231 | min_off = min(min_off, off); 232 | } 233 | 234 | pos += bwt->run_at(i); 235 | } 236 | 237 | // Rewind the file 238 | fseek(fd,0, SEEK_SET); 239 | pos = 0; 240 | 241 | int log_off = bitsize((size_t)(max_off - min_off + 1)); 242 | 243 | min_off = -min_off; // Shift all the values 244 | thresholds = int_vector<>(length,0,log_off); 245 | for (size_t i = 0; i < length; ++i) 246 | { 247 | size_t threshold = 0; 248 | if ((fread(&threshold, THRBYTES, 1, fd)) != 1) 249 | error("fread() file " + tmp_filename + " failed"); 250 | 251 | long long off = 0; 252 | 253 | if (threshold > 0) 254 | { 255 | uint8_t c = bwt->head_of(i); 256 | size_t pred = bwt->select(bwt->rank(pos - 1, c) - 1, c); 257 | 
size_t mid_int = (pos - pred + 1) >> 1; 258 | assert(threshold > pred); 259 | 260 | threshold = threshold - pred; 261 | 262 | off = mid_int - threshold + min_off; 263 | } 264 | 265 | thresholds[i] = off; 266 | pos += bwt->run_at(i); 267 | } 268 | 269 | fclose(fd); 270 | 271 | 272 | 273 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 274 | 275 | verbose("Memory peak: ", malloc_count_peak()); 276 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 277 | } 278 | 279 | // Destructor 280 | ~thr_compressed() 281 | { 282 | // NtD 283 | } 284 | 285 | // Copy constructor 286 | thr_compressed(const thr_compressed &other) 287 | : thresholds(other.thresholds), 288 | bwt(other.bwt), 289 | min_off(other.min_off) 290 | { 291 | } 292 | 293 | friend void swap(thr_compressed &first, thr_compressed &second) // nothrow 294 | { 295 | using std::swap; 296 | 297 | swap(first.thresholds, second.thresholds); 298 | swap(first.bwt, second.bwt); 299 | swap(first.min_off, second.min_off); 300 | } 301 | 302 | // Copy assignment 303 | thr_compressed &operator=(thr_compressed other) 304 | { 305 | swap(*this, other); 306 | 307 | return *this; 308 | } 309 | 310 | // Move constructor 311 | thr_compressed(thr_compressed &&other) noexcept 312 | : thr_compressed() 313 | { 314 | swap(*this, other); 315 | } 316 | 317 | size_t operator[] (size_t& i) 318 | { 319 | assert( i < thresholds.size()); 320 | 321 | // get mid_interval 322 | uint8_t c = bwt->head_of(i); 323 | size_t rank = bwt->head_rank(i, c); 324 | if(rank == 0) 325 | return 0; 326 | 327 | size_t pred = bwt->select(rank - 1, c); 328 | size_t pos = bwt->select(rank, c); 329 | size_t mid_int = (pos - pred + 1) >> 1; 330 | 331 | size_t thr_i = thresholds[i]; 332 | 333 | return mid_int + min_off - thresholds[i] + pred; 334 | } 335 | 336 | /* serialize the structure to the ostream 337 | * \param out the ostream 338 | */ 339 | size_type 
serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 340 | { 341 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 342 | size_type written_bytes = 0; 343 | 344 | out.write((char *)&min_off, sizeof(min_off)); 345 | written_bytes += sizeof(min_off); 346 | 347 | written_bytes += thresholds.serialize(out, child, "thresholds"); 348 | 349 | sdsl::structure_tree::add_size(child, written_bytes); 350 | return written_bytes; 351 | } 352 | 353 | /* load the structure from the istream 354 | * \param in the istream 355 | */ 356 | void load(std::istream &in, rle_string_t *bwt_) 357 | { 358 | in.read((char *)&min_off, sizeof(min_off)); 359 | thresholds.load(in); 360 | bwt = bwt_; 361 | } 362 | 363 | std::string get_file_extension() const 364 | { 365 | return ".thrc"; 366 | } 367 | }; 368 | 369 | 370 | template 371 | class thr_bv 372 | { 373 | public: 374 | std::vector thresholds_per_letter; 375 | rle_string_t *bwt; 376 | 377 | typedef size_t size_type; 378 | 379 | thr_bv() 380 | { 381 | bwt=nullptr; 382 | } 383 | 384 | thr_bv(std::string filename, rle_string_t* bwt_):bwt(bwt_) 385 | { 386 | int log_n = bitsize(uint64_t(bwt->size())); 387 | size_t n = uint64_t(bwt->size()); 388 | 389 | verbose("Reading thresholds from file"); 390 | 391 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 392 | 393 | std::string tmp_filename = filename + std::string(".thr_pos"); 394 | 395 | struct stat filestat; 396 | FILE *fd; 397 | 398 | if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 399 | error("open() file " + tmp_filename + " failed"); 400 | 401 | int fn = fileno(fd); 402 | if (fstat(fn, &filestat) < 0) 403 | error("stat() file " + tmp_filename + " failed"); 404 | 405 | if (filestat.st_size % THRBYTES != 0) 406 | error("invilid file " + tmp_filename); 407 | 408 | size_t length = filestat.st_size / THRBYTES; 409 | 410 | auto 
thrs_per_letter_bv = vector>(256); 411 | auto thrs_per_letter_bv_i = vector(256, 0); 412 | 413 | for (size_t i = 0; i < length; ++i) 414 | { 415 | size_t threshold = 0; 416 | if ((fread(&threshold, THRBYTES, 1, fd)) != 1) 417 | error("fread() file " + tmp_filename + " failed"); 418 | 419 | long long off = 0; 420 | 421 | uint8_t c = bwt->head_of(i); 422 | if (threshold > 0) 423 | thrs_per_letter_bv[c].push_back(threshold); 424 | thrs_per_letter_bv_i[c] = n; 425 | 426 | } 427 | 428 | thresholds_per_letter = vector(256); 429 | for (ulint i = 0; i < 256; ++i) 430 | thresholds_per_letter[i] = ri::sparse_sd_vector(thrs_per_letter_bv[i], thrs_per_letter_bv_i[i]); 431 | 432 | fclose(fd); 433 | 434 | 435 | 436 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 437 | 438 | verbose("Memory peak: ", malloc_count_peak()); 439 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 440 | } 441 | 442 | // Destructor 443 | ~thr_bv() 444 | { 445 | // NtD 446 | } 447 | 448 | // Copy constructor 449 | thr_bv(const thr_bv &other) 450 | : thresholds_per_letter(other.thresholds_per_letter), 451 | bwt(other.bwt) 452 | { 453 | } 454 | 455 | friend void swap(thr_bv &first, thr_bv &second) // nothrow 456 | { 457 | using std::swap; 458 | 459 | swap(first.thresholds_per_letter, second.thresholds_per_letter); 460 | swap(first.bwt, second.bwt); 461 | } 462 | 463 | // Copy assignment 464 | thr_bv &operator=(thr_bv other) 465 | { 466 | swap(*this, other); 467 | 468 | return *this; 469 | } 470 | 471 | // Move constructor 472 | thr_bv(thr_bv &&other) noexcept 473 | : thr_bv() 474 | { 475 | swap(*this, other); 476 | } 477 | 478 | size_t operator[] (size_t& i) 479 | { 480 | assert(i < bwt->number_of_runs()); 481 | 482 | // get mid_interval 483 | uint8_t c = bwt->head_of(i); 484 | size_t rank = bwt->run_head_rank(i, c); 485 | if(rank == 0) 486 | return 0; 487 | 488 | size_t thr_i = 
thresholds_per_letter[c].select(rank-1); 489 | 490 | return thr_i; 491 | } 492 | 493 | // number of thresholds for the character c before position i 494 | size_t rank(const size_t i, const uint8_t c) 495 | { 496 | return thresholds_per_letter[c].rank(i); // j-1 because the select is 0 based 497 | } 498 | 499 | /* serialize the structure to the ostream 500 | * \param out the ostream 501 | */ 502 | size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const 503 | { 504 | sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this)); 505 | size_type written_bytes = 0; 506 | 507 | for (ulint i = 0; i < 256; ++i) 508 | written_bytes += thresholds_per_letter[i].serialize(out); 509 | 510 | sdsl::structure_tree::add_size(child, written_bytes); 511 | return written_bytes; 512 | } 513 | 514 | /* load the structure from the istream 515 | * \param in the istream 516 | */ 517 | void load(std::istream &in, rle_string_t *bwt_) 518 | { 519 | thresholds_per_letter = vector(256); 520 | for (ulint i = 0; i < 256; ++i) 521 | thresholds_per_letter[i].load(in); 522 | bwt = bwt_; 523 | } 524 | 525 | std::string get_file_extension() const 526 | { 527 | return ".thrbv"; 528 | } 529 | }; 530 | 531 | #endif /* end of include guard: _MS_THRESHOLDS_DS_HH */ 532 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(compress_dictionary compress_dictionary.cpp) 3 | target_link_libraries(compress_dictionary sdsl malloc_count) 4 | target_include_directories(compress_dictionary PUBLIC "../include/common") 5 | 6 | FetchContent_GetProperties(r-index) 7 | FetchContent_GetProperties(shaped_slp) 8 | FetchContent_GetProperties(ssw) 9 | FetchContent_GetProperties(ksw2) 10 | FetchContent_GetProperties(klib) 11 | FetchContent_GetProperties(bigbwt) 12 | 13 | 
set(FOLCA_SOURCE_DIR "${shaped_slp_SOURCE_DIR}/folca")
set(SUX_SOURCE_DIR "${shaped_slp_SOURCE_DIR}/external/sux/sux")

# Imported targets used instead of the raw `pthread` and `z` linker names.
find_package(Threads REQUIRED)
find_package(ZLIB REQUIRED)

# Root of this project's own headers (src/ sits next to include/).
set(moni_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/../include")

# Header directories of shaped_slp shared by every tool that uses the SLP.
set(moni_slp_include_dirs
  "${shaped_slp_SOURCE_DIR}"
  "${FOLCA_SOURCE_DIR}"
  "${SUX_SOURCE_DIR}/function"
  "${SUX_SOURCE_DIR}/support"
)

# The tools below are executables, so their usage requirements are PRIVATE.
# C++17 is requested with target_compile_features so that the flag reaches
# only the C++ sources and not the C file xerrors.c.

add_executable(ms matching_statistics.cpp "${bigbwt_SOURCE_DIR}/xerrors.c")
target_link_libraries(ms
  PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri Threads::Threads
)
target_include_directories(ms
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
    ${moni_slp_include_dirs}
    "${bigbwt_SOURCE_DIR}"
)
target_compile_features(ms PRIVATE cxx_std_17)

add_executable(mems mems.cpp "${bigbwt_SOURCE_DIR}/xerrors.c")
target_link_libraries(mems
  PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri Threads::Threads
)
target_include_directories(mems
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
    ${moni_slp_include_dirs}
    "${bigbwt_SOURCE_DIR}"
)
target_compile_features(mems PRIVATE cxx_std_17)

add_executable(rlebwt_ms_build rlebwt_ms_build.cpp)
target_link_libraries(rlebwt_ms_build
  PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri
)
target_include_directories(rlebwt_ms_build
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
    ${moni_slp_include_dirs}
)
target_compile_features(rlebwt_ms_build PRIVATE cxx_std_17)

add_executable(extend_klib
  extend_klib.cpp
  "${klib_SOURCE_DIR}/ksw.c"
  "${bigbwt_SOURCE_DIR}/xerrors.c"
)
target_link_libraries(extend_klib
  PRIVATE common malloc_count sdsl divsufsort divsufsort64 ri klib ssw Threads::Threads
)
target_include_directories(extend_klib
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
    "${moni_include_dir}/extender"
    ${moni_slp_include_dirs}
    "${ssw_SOURCE_DIR}/src"
    "${klib_SOURCE_DIR}"
    "${bigbwt_SOURCE_DIR}"
)
target_compile_features(extend_klib PRIVATE cxx_std_17)

add_executable(extend_ksw2 extend_ksw2.cpp "${bigbwt_SOURCE_DIR}/xerrors.c")
target_link_libraries(extend_ksw2
  PRIVATE common sdsl malloc_count divsufsort divsufsort64 ri ksw2 Threads::Threads
)
target_include_directories(extend_ksw2
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
    "${ksw2_SOURCE_DIR}"
    "${moni_include_dir}/extender"
    ${moni_slp_include_dirs}
    "${bigbwt_SOURCE_DIR}"
)
target_compile_features(extend_ksw2 PRIVATE cxx_std_17)

add_executable(build_seqidx build_seqidx.cpp)
target_link_libraries(build_seqidx
  PRIVATE common sdsl divsufsort divsufsort64 malloc_count klib ZLIB::ZLIB
)
target_include_directories(build_seqidx
  PRIVATE
    "${moni_include_dir}/ms"
    "${moni_include_dir}/common"
)
target_compile_features(build_seqidx PRIVATE cxx_std_17)

# Keep the compiler on -std=c++17 (as before) rather than -std=gnu++17.
set_target_properties(
  ms mems rlebwt_ms_build extend_klib extend_ksw2 build_seqidx
  PROPERTIES CXX_EXTENSIONS OFF
)

--------------------------------------------------------------------------------
/src/build_seqidx.cpp:
--------------------------------------------------------------------------------
/* build_seqidx - Builds the sequence index for the reference
    Copyright (C) 2020 Massimiliano Rossi
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see http://www.gnu.org/licenses/ .
*/
/*!
15 | \file build_seqidx.cpp 16 | \brief build_seqidx.cpp Builds the sequence index for the reference. 17 | \author Massimiliano Rossi 18 | \date 07/08/2021 19 | */ 20 | 21 | #include 22 | 23 | #define VERBOSE 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | KSEQ_INIT(gzFile, gzread); 32 | 33 | #include 34 | 35 | // #include 36 | // namespace fs = std::filesystem; 37 | #include 38 | 39 | //*********************** Argument options *************************************** 40 | // struct containing command line parameters and other globals 41 | struct Args 42 | { 43 | std::string filename = ""; 44 | std::string outpath = ""; // path where to output the file 45 | }; 46 | 47 | void parseArgs(int argc, char *const argv[], Args &arg) 48 | { 49 | int c; 50 | extern char *optarg; 51 | extern int optind; 52 | 53 | std::string usage("usage: " + std::string(argv[0]) + " infile [-o outpath]\n\n" + 54 | "Computes the .idx file storing the sequence names and starting positions.\n" + 55 | "outpath: [string] - path to where to output the file.\n"); 56 | 57 | std::string sarg; 58 | while ((c = getopt(argc, argv, "o:")) != -1) 59 | { 60 | switch (c) 61 | { 62 | case 'o': 63 | arg.outpath.assign(optarg); 64 | break; 65 | case 'h': 66 | error(usage); 67 | case '?': 68 | error("Unknown option.\n", usage); 69 | exit(1); 70 | } 71 | } 72 | // the only input parameter is the file name 73 | if (argc == optind + 1) 74 | { 75 | arg.filename.assign(argv[optind]); 76 | } 77 | else 78 | { 79 | error("Invalid number of arguments\n", usage); 80 | } 81 | } 82 | 83 | //********** end argument options ******************** 84 | 85 | int main(int argc, char *const argv[]) 86 | { 87 | Args args; 88 | parseArgs(argc, argv, args); 89 | 90 | // Building the sequence idx 91 | 92 | verbose("Building the sequence index"); 93 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 94 | 95 | seqidx idx(args.filename); 96 | 97 | 
std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 98 | 99 | verbose("Sequence index construction complete"); 100 | verbose("Memory peak: ", malloc_count_peak()); 101 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 102 | 103 | 104 | std::string outfile = ""; 105 | if(args.outpath == "") outfile = args.filename; 106 | else outfile = args.outpath + std::string(basename(args.filename.data())); 107 | // else outfile = args.outpath + fs::path(args.filename).filename().string(); 108 | outfile += idx.get_file_extension(); 109 | 110 | std::ofstream out(outfile); 111 | idx.serialize(out); 112 | 113 | t_insert_end = std::chrono::high_resolution_clock::now(); 114 | 115 | verbose("Sequence index serialzation complete"); 116 | verbose("Memory peak: ", malloc_count_peak()); 117 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 118 | 119 | auto mem_peak = malloc_count_peak(); 120 | verbose("Memory peak: ", malloc_count_peak()); 121 | return 0; 122 | } -------------------------------------------------------------------------------- /src/compress_dictionary.cpp: -------------------------------------------------------------------------------- 1 | /* compress_dictionary - Computes the compressed dictionary from prefix-free parse dictionary 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 
11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file compress_dictionary.cpp 16 | \brief compress_dictionary.cpp Computes the compressed dictionary from prefix-free parse dictionary. 17 | \author Massimiliano Rossi 18 | \date 16/09/2020 19 | */ 20 | 21 | #include 22 | 23 | #define VERBOSE 24 | 25 | #include 26 | 27 | #include 28 | 29 | //*********************** Argument options *************************************** 30 | // struct containing command line parameters and other globals 31 | struct Args 32 | { 33 | std::string filename = ""; 34 | size_t w = 10; // sliding window size and its default 35 | }; 36 | 37 | void parseArgs(int argc, char *const argv[], Args &arg) 38 | { 39 | int c; 40 | extern char *optarg; 41 | extern int optind; 42 | 43 | std::string usage("usage: " + std::string(argv[0]) + " infile [-w wsize]\n\n" + 44 | "Computes the pfp data structures of infile, provided that infile.parse, infile.dict, and infile.occ exists.\n" + 45 | " wsize: [integer] - sliding window size (def. 
10)\n"); 46 | 47 | std::string sarg; 48 | while ((c = getopt(argc, argv, "w:smcfl:rhp:t:")) != -1) 49 | { 50 | switch (c) 51 | { 52 | case 'w': 53 | sarg.assign(optarg); 54 | arg.w = stoi(sarg); 55 | break; 56 | case 'h': 57 | error(usage); 58 | case '?': 59 | error("Unknown option.\n", usage); 60 | exit(1); 61 | } 62 | } 63 | // the only input parameter is the file name 64 | if (argc == optind + 1) 65 | { 66 | arg.filename.assign(argv[optind]); 67 | } 68 | else 69 | { 70 | error("Invalid number of arguments\n", usage); 71 | } 72 | } 73 | 74 | //********** end argument options ******************** 75 | 76 | std::string execute_cmd(const char* cmd) { 77 | std::array buffer{}; 78 | std::string output = ""; 79 | 80 | std::string cmd_plus_stderr = std::string(cmd) + " 2>&1"; 81 | FILE* pipe = popen(cmd_plus_stderr.data(), "r"); // Extract stderr as well 82 | if (!pipe) {throw std::runtime_error("popen() failed!");} 83 | 84 | try { 85 | std::size_t bytes; 86 | while ((bytes = fread(buffer.data(), sizeof(char), sizeof(buffer), pipe))) { 87 | output += std::string(buffer.data(), bytes); 88 | } 89 | } catch (...) 
{ 90 | pclose(pipe); 91 | throw std::runtime_error("Error occurred while reading popen() stream."); 92 | } 93 | pclose(pipe); 94 | return output; 95 | } 96 | 97 | int main(int argc, char *const argv[]) 98 | { 99 | 100 | Args args; 101 | parseArgs(argc, argv, args); 102 | 103 | // Building the r-index 104 | 105 | verbose("Compressing the dictionary"); 106 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 107 | 108 | // Open output files 109 | std::string dicz_filename = args.filename + ".dicz"; 110 | std::string dicz_len_filename = args.filename + ".dicz.len"; 111 | 112 | FILE *dicz; 113 | FILE *dicz_len; 114 | 115 | if ((dicz = fopen(dicz_filename.c_str(), "w")) == nullptr) 116 | error("open() file " + std::string(dicz_filename) + " failed"); 117 | 118 | if ((dicz_len = fopen(dicz_len_filename.c_str(), "w")) == nullptr) 119 | error("open() file " + std::string(dicz_len_filename) + " failed"); 120 | 121 | // Open the dictionary 122 | std::string dict_filename = args.filename + ".dict"; 123 | std::vector dict; 124 | read_file(dict_filename.c_str(), dict); 125 | 126 | // Start processing 127 | 128 | 129 | // Generating phrase lengths 130 | verbose("Generating phrase lengths"); 131 | std::vector lengths(1,0); 132 | 133 | // Counting the number of Dollars at the beginning 134 | size_t i = 0, j = 0; 135 | while(dict[i++] == Dollar) 136 | j++; 137 | dict.erase(dict.begin(), dict.begin() + j); 138 | 139 | for(auto chr: dict) 140 | { 141 | // Skip the Dollars 142 | if(chr == EndOfDict) 143 | continue; 144 | 145 | // Hit end of phrase 146 | if(chr == EndOfWord) 147 | lengths.push_back(0); 148 | else 149 | lengths.back()++; 150 | } 151 | 152 | if (lengths.back()==0) 153 | lengths.pop_back(); 154 | 155 | verbose("Found", lengths.size(), " phrases "); 156 | 157 | verbose("Generating phrases"); 158 | uint8_t* ptr = dict.data(); // Beginning of the current phrase 159 | 160 | bool empty_first_phrase = false; 161 | 
for(size_t i = 0; i < lengths.size(); i++) 162 | { 163 | size_t compressed_length = lengths[i] - args.w; 164 | 165 | // special case: starts with a trigger string 166 | if (i==0 && compressed_length == 0) { 167 | ptr += lengths[i] + 1; 168 | empty_first_phrase = true; 169 | continue; 170 | } else if (i > 0 && compressed_length == 0) { 171 | error("encountered a length=0 phrase after removing trigger string, which should not occur."); 172 | } 173 | 174 | if ((fwrite(&compressed_length, 4, 1, dicz_len)) != 1) 175 | error("fwrite() file " + std::string(dicz_len_filename) + " failed"); 176 | 177 | if ((fwrite(ptr, sizeof(uint8_t), compressed_length, dicz)) != compressed_length) 178 | error("fwrite() file " + std::string(dicz_filename) + " failed"); 179 | 180 | ptr += lengths[i] + 1; 181 | } 182 | fclose(dicz); 183 | fclose(dicz_len); 184 | 185 | // re-writes parse file to shift down all the phrase ids by 1 186 | // since we removed the empty beginning phrase 187 | if (empty_first_phrase) { 188 | verbose("alert: found that the first phrase length is 0" 189 | " so we will rewrite *.parse file to generated correct SLP."); 190 | 191 | // read in all the phrase ids in parse 192 | std::string parse_filename = args.filename + ".parse"; 193 | std::vector parse_arr; 194 | read_file(parse_filename.c_str(), parse_arr); 195 | 196 | // make sure first phrase is lowest lexicographically and then remove it 197 | if (parse_arr[0] != 1) 198 | error("parse should being with lowest lexicographic phrase."); 199 | parse_arr.erase(parse_arr.begin()); 200 | 201 | // rename the old parse file as *.parse_with_empty_phrase 202 | std::ostringstream command_stream; 203 | command_stream << "mv " << parse_filename << " " << (args.filename + ".parse_with_empty_phrase"); 204 | auto mv_log = execute_cmd(command_stream.str().c_str()); 205 | 206 | verbose("executed this command: " + command_stream.str()); 207 | 208 | // open new parse file for writing 209 | FILE* new_parse_file; 210 | if 
((new_parse_file = fopen((args.filename + ".parse").c_str(), "w")) == nullptr) 211 | verbose("open() file " + std::string(args.filename + ".parse" + " failed")); 212 | 213 | // iterate through each element of parse and decrement by 1 214 | for (size_t i = 0; i < parse_arr.size(); i++) { 215 | if (parse_arr[i] == 1) 216 | error("issue occurred when creating new parse file."); 217 | parse_arr[i]--; 218 | 219 | // write it out 220 | if ((fwrite(&parse_arr[i], 4, 1, new_parse_file)) != 1) 221 | verbose("fwrite() file " + std::string(args.filename + ".parse") + " failed"); 222 | } 223 | fclose(new_parse_file); 224 | } 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 233 | 234 | verbose("Memory peak: ", malloc_count_peak()); 235 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 236 | 237 | auto mem_peak = malloc_count_peak(); 238 | verbose("Memory peak: ", malloc_count_peak()); 239 | 240 | return 0; 241 | } -------------------------------------------------------------------------------- /src/extend_klib.cpp: -------------------------------------------------------------------------------- 1 | /* extend_klib - Extend the MEMs of the reads to the reference 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. 
If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file extend_klib.cpp 16 | \brief extend_klib.cpp Extend the MEMs of the reads to the reference. 17 | \author Massimiliano Rossi 18 | \date 30/04/2021 19 | */ 20 | 21 | extern "C" { 22 | #include 23 | } 24 | 25 | #include 26 | 27 | #define VERBOSE 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | #include 47 | 48 | #include 49 | 50 | //*********************** Argument options *************************************** 51 | // struct containing command line parameters and other globals 52 | struct Args 53 | { 54 | std::string filename = ""; 55 | std::string patterns = ""; // path to patterns file 56 | size_t l = 25; // minumum MEM length 57 | size_t th = 1; // number of threads 58 | size_t b = 1; // number of batches per thread pool 59 | bool shaped_slp = false; // use shaped slp 60 | }; 61 | 62 | void parseArgs(int argc, char *const argv[], Args &arg) 63 | { 64 | int c; 65 | extern char *optarg; 66 | extern int optind; 67 | 68 | std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" + 69 | "Extends the MEMs of the reads in the pattern against the reference index in infile.\n" + 70 | "shaped_slp: [boolean] - use shaped slp. (def. false)\n" + 71 | " pattens: [string] - path to patterns file.\n" + 72 | " len: [integer] - minimum MEM lengt (def. 25)\n" + 73 | " thread: [integer] - number of threads (def. 1)\n" + 74 | " batch: [integer] - number of batches per therad pool (def. 
1)\n"); 75 | 76 | std::string sarg; 77 | while ((c = getopt(argc, argv, "l:hp:b:t:")) != -1) 78 | { 79 | switch (c) 80 | { 81 | case 'p': 82 | arg.patterns.assign(optarg); 83 | break; 84 | case 'l': 85 | sarg.assign(optarg); 86 | arg.l = stoi(sarg); 87 | break; 88 | case 't': 89 | sarg.assign(optarg); 90 | arg.th = stoi(sarg); 91 | break; 92 | case 'b': 93 | sarg.assign(optarg); 94 | arg.b = stoi(sarg); 95 | break; 96 | case 'q': 97 | arg.shaped_slp = true; 98 | break; 99 | case 'h': 100 | error(usage); 101 | case '?': 102 | error("Unknown option.\n", usage); 103 | exit(1); 104 | } 105 | } 106 | // the only input parameter is the file name 107 | if (argc == optind + 1) 108 | { 109 | arg.filename.assign(argv[optind]); 110 | } 111 | else 112 | { 113 | error("Invalid number of arguments\n", usage); 114 | } 115 | } 116 | 117 | //********** end argument options ******************** 118 | 119 | 120 | template 121 | void dispatcher(Args &args){ 122 | verbose("Construction of the extender"); 123 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 124 | 125 | extender_t extender(args.filename, args.l); 126 | 127 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 128 | verbose("Memory peak: ", malloc_count_peak()); 129 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 130 | 131 | verbose("Processing patterns"); 132 | t_insert_start = std::chrono::high_resolution_clock::now(); 133 | 134 | std::string base_name = basename(args.filename.data()); 135 | std::string sam_filename = args.patterns + "_" + base_name + "_" + std::to_string(args.l); 136 | 137 | if (is_gzipped(args.patterns)) 138 | { 139 | verbose("The input is gzipped - forcing single thread extension."); 140 | args.th = 1; 141 | } 142 | 143 | if (args.th == 1) 144 | st_extend(&extender, args.patterns, sam_filename); 145 | else 146 | mt_extend(&extender, 
args.patterns, sam_filename, args.th, args.b); 147 | 148 | // TODO: Merge the SAM files. 149 | 150 | t_insert_end = std::chrono::high_resolution_clock::now(); 151 | 152 | verbose("Memory peak: ", malloc_count_peak()); 153 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 154 | 155 | auto mem_peak = malloc_count_peak(); 156 | verbose("Memory peak: ", malloc_count_peak()); 157 | 158 | } 159 | 160 | int main(int argc, char *const argv[]) 161 | { 162 | 163 | Args args; 164 | parseArgs(argc, argv, args); 165 | 166 | if (args.shaped_slp) 167 | { 168 | dispatcher>(args); 169 | } 170 | else 171 | { 172 | dispatcher>(args); 173 | } 174 | 175 | return 0; 176 | } -------------------------------------------------------------------------------- /src/extend_ksw2.cpp: -------------------------------------------------------------------------------- 1 | /* extend_ksw2 - Extend the MEMs of the reads to the reference 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file extend_ksw2.cpp 16 | \brief extend_ksw2.cpp Extend the MEMs of the reads to the reference. 
17 | \author Massimiliano Rossi 18 | \date 13/07/2020 19 | */ 20 | 21 | extern "C" { 22 | #include 23 | } 24 | 25 | #include 26 | 27 | #define VERBOSE 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | #include 38 | 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include 46 | 47 | //*********************** Argument options *************************************** 48 | // struct containing command line parameters and other globals 49 | struct Args 50 | { 51 | std::string filename = ""; 52 | std::string patterns = ""; // path to patterns file 53 | std::string output = ""; // output file prefix 54 | size_t l = 25; // minumum MEM length 55 | size_t th = 1; // number of threads 56 | size_t b = 1; // number of batches per thread pool 57 | bool shaped_slp = false; // use shaped slp 58 | size_t ext_len = 100; // Extension length 59 | // size_t top_k = 1; // Report the top_k alignments 60 | 61 | // ksw2 parameters 62 | int8_t smatch = 2; // Match score default 63 | int8_t smismatch = 4; // Mismatch score default 64 | int8_t gapo = 4; // Gap open penalty 65 | int8_t gapo2 = 13; // Gap open penalty 66 | int8_t gape = 2; // Gap extension penalty 67 | int8_t gape2 = 1; // Gap extension penalty 68 | // int end_bonus = 400; // Bonus to add at the extension score to declare the alignment 69 | 70 | // int w = -1; // Band width 71 | // int zdrop = -1; // Zdrop enable 72 | }; 73 | 74 | void parseArgs(int argc, char *const argv[], Args &arg) 75 | { 76 | int c; 77 | extern char *optarg; 78 | extern int optind; 79 | 80 | std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-t threads] [-l len] [-q shaped_slp] [-b batch] [-L ext_l] [-A smatch] [-B smismatc] [-O gapo] [-E gape]\n\n" + 81 | "Extends the MEMs of the reads in the pattern against the reference index in infile.\n" + 82 | "shaped_slp: [boolean] - use shaped slp. (def. 
false)\n" + 83 | " pattens: [string] - path to patterns file.\n" + 84 | " output: [string] - output file prefix.\n" + 85 | " len: [integer] - minimum MEM lengt (def. 25)\n" + 86 | " thread: [integer] - number of threads (def. 1)\n" + 87 | " ext_l: [integer] - length of reference substring for extension (def. " + std::to_string(arg.ext_len) + ")\n" + 88 | " smatch: [integer] - match score value (def. " + std::to_string(arg.smatch) + ")\n" + 89 | " smismatch: [integer] - mismatch penalty value (def. " + std::to_string(arg.smismatch) + ")\n" + 90 | " gapo: [integer] - gap open penalty value (def. " + std::to_string(arg.gapo) + "," + std::to_string(arg.gapo2) + ")\n" + 91 | " gape: [integer] - gap extension penalty value (def. " + std::to_string(arg.gape) + "," + std::to_string(arg.gape2) + ")\n" + 92 | " batch: [integer] - number of batches per therad pool (def. 1)\n"); 93 | 94 | std::string sarg; 95 | char* s; 96 | while ((c = getopt(argc, argv, "l:hp:o:b:t:qA:B:O:E:L:")) != -1) 97 | { 98 | switch (c) 99 | { 100 | case 'p': 101 | arg.patterns.assign(optarg); 102 | break; 103 | case 'o': 104 | arg.output.assign(optarg); 105 | break; 106 | case 'l': 107 | sarg.assign(optarg); 108 | arg.l = stoi(sarg); 109 | break; 110 | case 't': 111 | sarg.assign(optarg); 112 | arg.th = stoi(sarg); 113 | break; 114 | case 'b': 115 | sarg.assign(optarg); 116 | arg.b = stoi(sarg); 117 | break; 118 | case 'L': 119 | sarg.assign(optarg); 120 | arg.ext_len = stoi(sarg); 121 | break; 122 | case 'A': 123 | sarg.assign(optarg); 124 | arg.smatch = stoi(sarg); 125 | break; 126 | case 'B': 127 | sarg.assign(optarg); 128 | arg.smismatch = stoi(sarg); 129 | break; 130 | case 'O': 131 | arg.gapo = arg.gapo2 = strtol(optarg, &s, 10); 132 | if (*s == ',') arg.gapo2 = strtol(s+1, &s, 10); 133 | break; 134 | case 'E': 135 | arg.gape = arg.gape2 = strtol(optarg, &s, 10); 136 | if (*s == ',') arg.gape2 = strtol(s+1, &s, 10); 137 | break; 138 | case 'q': 139 | arg.shaped_slp = true; 140 | break; 141 | 
case 'h': 142 | error(usage); 143 | case '?': 144 | error("Unknown option.\n", usage); 145 | exit(1); 146 | } 147 | } 148 | // the only input parameter is the file name 149 | if (argc == optind + 1) 150 | { 151 | arg.filename.assign(argv[optind]); 152 | } 153 | else 154 | { 155 | error("Invalid number of arguments\n", usage); 156 | } 157 | } 158 | 159 | //********** end argument options ******************** 160 | 161 | 162 | template 163 | typename extender_t::config_t configurer(Args &args){ 164 | typename extender_t::config_t config; 165 | 166 | config.min_len = args.l; // Minimum MEM length 167 | config.ext_len = args.ext_len; // Extension length 168 | 169 | // ksw2 parameters 170 | config.smatch = args.smatch; // Match score default 171 | config.smismatch = args.smismatch; // Mismatch score default 172 | config.gapo = args.gapo; // Gap open penalty 173 | config.gapo2 = args.gapo2; // Gap open penalty 174 | config.gape = args.gape; // Gap extension penalty 175 | config.gape2 = args.gape2; // Gap extension penalty 176 | 177 | return config; 178 | } 179 | 180 | template 181 | void dispatcher(Args &args){ 182 | verbose("Construction of the extender"); 183 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 184 | 185 | 186 | extender_t extender(args.filename, configurer(args)); 187 | 188 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 189 | verbose("Memory peak: ", malloc_count_peak()); 190 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 191 | 192 | verbose("Processing patterns"); 193 | t_insert_start = std::chrono::high_resolution_clock::now(); 194 | 195 | std::string base_name = basename(args.filename.data()); 196 | std::string sam_filename = args.patterns + "_" + base_name + "_" + std::to_string(args.l); 197 | if(args.output != "") 198 | sam_filename = args.output; 199 | 200 | if 
(is_gzipped(args.patterns)) 201 | { 202 | verbose("The input is gzipped - forcing single thread extension."); 203 | args.th = 1; 204 | } 205 | 206 | if (args.th == 1) 207 | st_extend(&extender, args.patterns, sam_filename); 208 | else 209 | mt_extend(&extender, args.patterns, sam_filename, args.th, args.b); 210 | 211 | // TODO: Merge the SAM files. 212 | 213 | t_insert_end = std::chrono::high_resolution_clock::now(); 214 | 215 | verbose("Memory peak: ", malloc_count_peak()); 216 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 217 | 218 | auto mem_peak = malloc_count_peak(); 219 | verbose("Memory peak: ", malloc_count_peak()); 220 | } 221 | 222 | int main(int argc, char *const argv[]) 223 | { 224 | 225 | Args args; 226 | parseArgs(argc, argv, args); 227 | 228 | if (args.shaped_slp) 229 | { 230 | dispatcher>>(args); 231 | } 232 | else 233 | { 234 | dispatcher>>(args); 235 | } 236 | 237 | return 0; 238 | } -------------------------------------------------------------------------------- /src/matching_statistics.cpp: -------------------------------------------------------------------------------- 1 | /* matching_statistics - Computes the matching statistics from BWT and Thresholds 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 
15 | \file matching_statistics.cpp 16 | \brief matching_statistics.cpp Computes the matching statistics from BWT and Thresholds. 17 | \author Massimiliano Rossi 18 | \date 13/07/2020 19 | */ 20 | 21 | extern "C" { 22 | #include 23 | } 24 | 25 | #include 26 | 27 | #define VERBOSE 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | /// kseq extra 45 | //////////////////////////////////////////////////////////////////////////////// 46 | 47 | static inline size_t ks_tell(kseq_t *seq) 48 | { 49 | return gztell(seq->f->f) - seq->f->end + seq->f->begin; 50 | } 51 | 52 | void copy_kstring_t(kstring_t &l, kstring_t &r) 53 | { 54 | l.l = r.l; 55 | l.m = r.m; 56 | l.s = (char *)malloc(l.m); 57 | for (size_t i = 0; i < r.m; ++i) 58 | l.s[i] = r.s[i]; 59 | } 60 | void copy_kseq_t(kseq_t *l, kseq_t *r) 61 | { 62 | copy_kstring_t(l->name, r->name); 63 | copy_kstring_t(l->comment, r->comment); 64 | copy_kstring_t(l->seq, r->seq); 65 | copy_kstring_t(l->qual, r->qual); 66 | l->last_char = r->last_char; 67 | } 68 | 69 | //////////////////////////////////////////////////////////////////////////////// 70 | /// Parallel computation 71 | //////////////////////////////////////////////////////////////////////////////// 72 | 73 | // This should be done using buffering. 74 | size_t next_start_fastq(gzFile fp) 75 | { 76 | int c; 77 | // Special case when we arr at the beginning of the file. 
78 | if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@') 79 | return 0; 80 | 81 | // Strart from the previous character 82 | gzseek(fp, -1, SEEK_CUR); 83 | 84 | std::vector> window; 85 | // Find the first new line 86 | for (size_t i = 0; i < 4; ++i) 87 | { 88 | while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n')) 89 | { 90 | } 91 | if (c == EOF) 92 | return gztell(fp); 93 | if ((c = gzgetc(fp)) == EOF) 94 | return gztell(fp); 95 | window.push_back(std::make_pair(c, gztell(fp) - 1)); 96 | } 97 | 98 | for (size_t i = 0; i < 2; ++i) 99 | { 100 | if (window[i].first == '@' && window[i + 2].first == '+') 101 | return window[i].second; 102 | if (window[i].first == '+' && window[i + 2].first == '@') 103 | return window[i + 2].second; 104 | } 105 | 106 | return gztell(fp); 107 | } 108 | 109 | // test if the file is gzipped 110 | static inline bool is_gzipped(std::string filename) 111 | { 112 | FILE *fp = fopen(filename.c_str(), "rb"); 113 | if(fp == NULL) error("Opening file " + filename); 114 | int byte1 = 0, byte2 = 0; 115 | fread(&byte1, sizeof(char), 1, fp); 116 | fread(&byte2, sizeof(char), 1, fp); 117 | fclose(fp); 118 | return (byte1 == 0x1f && byte2 == 0x8b); 119 | } 120 | 121 | // Return the length of the file 122 | // Assumes that the file is not compressed 123 | static inline size_t get_file_size(std::string filename) 124 | { 125 | if (is_gzipped(filename)) 126 | { 127 | std::cerr << "The input is gzipped!" 
<< std::endl; 128 | return -1; 129 | } 130 | FILE *fp = fopen(filename.c_str(), "r"); 131 | fseek(fp, 0L, SEEK_END); 132 | size_t size = ftell(fp); 133 | fclose(fp); 134 | return size; 135 | } 136 | 137 | std::vector split_fastq(std::string filename, size_t n_threads) 138 | { 139 | //Precondition: the file is not gzipped 140 | // scan file for start positions and execute threads 141 | size_t size = get_file_size(filename); 142 | 143 | gzFile fp = gzopen(filename.c_str(), "r"); 144 | if (fp == Z_NULL) 145 | { 146 | throw std::runtime_error("Cannot open input file " + filename); // throw by value: a thrown pointer leaks and is not caught as std::exception 147 | } 148 | 149 | std::vector starts(n_threads + 1); 150 | for (size_t i = 0; i < n_threads + 1; ++i) 151 | { 152 | size_t start = (size_t)((size * i) / n_threads); 153 | gzseek(fp, start, SEEK_SET); 154 | starts[i] = next_start_fastq(fp); 155 | } 156 | gzclose(fp); 157 | return starts; 158 | } 159 | 160 | //////////////////////////////////////////////////////////////////////////////// 161 | /// SLP definitions 162 | //////////////////////////////////////////////////////////////////////////////// 163 | 164 | using SelSd = SelectSdvec<>; 165 | using DagcSd = DirectAccessibleGammaCode; 166 | using Fblc = FixedBitLenCode<>; 167 | 168 | using shaped_slp_t = SelfShapedSlp; 169 | using plain_slp_t = PlainSlp; 170 | 171 | template 172 | std::string get_slp_file_extension() 173 | { 174 | return std::string(".slp"); 175 | } 176 | 177 | template <> 178 | std::string get_slp_file_extension() 179 | { 180 | return std::string(".slp"); 181 | } 182 | 183 | template <> 184 | std::string get_slp_file_extension() 185 | { 186 | return std::string(".plain.slp"); 187 | } 188 | //////////////////////////////////////////////////////////////////////////////// 189 | 190 | template 191 | class ms_c 192 | { 193 | public: 194 | 195 | ms_c(std::string filename) 196 | { 197 | verbose("Loading the matching statistics index"); 198 | std::chrono::high_resolution_clock::time_point t_insert_start =
std::chrono::high_resolution_clock::now(); 199 | 200 | std::string filename_ms = filename + ms.get_file_extension(); 201 | 202 | ifstream fs_ms(filename_ms); 203 | ms.load(fs_ms); 204 | fs_ms.close(); 205 | 206 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 207 | 208 | verbose("Matching statistics index construction complete"); 209 | verbose("Memory peak: ", malloc_count_peak()); 210 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 211 | 212 | verbose("Loading random access"); 213 | t_insert_start = std::chrono::high_resolution_clock::now(); 214 | 215 | std::string filename_slp = filename + get_slp_file_extension(); 216 | 217 | ifstream fs(filename_slp); 218 | ra.load(fs); 219 | fs.close(); 220 | 221 | n = ra.getLen(); 222 | 223 | t_insert_end = std::chrono::high_resolution_clock::now(); 224 | 225 | verbose("Matching statistics index loading complete"); 226 | verbose("Memory peak: ", malloc_count_peak()); 227 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 228 | } 229 | 230 | // Destructor 231 | ~ms_c() 232 | { 233 | // NtD 234 | } 235 | 236 | // The outfile has the following format. The first size_t integer store the 237 | // length l of the name. Then the following l characters stores the name of 238 | // the read. The following size_t integer store the length l of the query. 239 | // Then the following l size_t integers stores the pointers of the 240 | // matching statistics, and the following l size_t integers stores the lengths 241 | // of the mathcing statistics. 
242 | void matching_statistics(kseq_t *read, FILE* out) 243 | { 244 | auto pointers = ms.query(read->seq.s, read->seq.l); 245 | std::vector lengths(pointers.size()); 246 | size_t l = 0; 247 | for (size_t i = 0; i < pointers.size(); ++i) 248 | { 249 | size_t pos = pointers[i]; 250 | while ((i + l) < read->seq.l && (pos + l) < n && (i < 1 || pos != (pointers[i-1] + 1) ) && read->seq.s[i + l] == ra.charAt(pos + l)) 251 | ++l; 252 | 253 | lengths[i] = l; 254 | l = (l == 0 ? 0 : (l - 1)); 255 | } 256 | 257 | // Original MS computation 258 | // for (size_t i = 0; i < pointers.size(); ++i) 259 | // { 260 | // size_t pos = pointers[i]; 261 | // while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l)) 262 | // ++l; 263 | 264 | // lengths[i] = l; 265 | // l = (l == 0 ? 0 : (l - 1)); 266 | // } 267 | 268 | assert(lengths.size() == pointers.size()); 269 | 270 | size_t h_length = read->name.l; 271 | fwrite(&h_length, sizeof(size_t), 1,out); 272 | fwrite(read->name.s, sizeof(char),h_length,out); 273 | 274 | size_t q_length = pointers.size(); 275 | fwrite(&q_length, sizeof(size_t), 1,out); 276 | fwrite(pointers.data(), sizeof(size_t),q_length,out); 277 | fwrite(lengths.data(), sizeof(size_t),q_length,out); 278 | } 279 | 280 | protected: 281 | ms_pointers<> ms; 282 | slp_t ra; 283 | size_t n = 0; 284 | }; 285 | 286 | 287 | 288 | char complement(char n) 289 | { 290 | switch (n) 291 | { 292 | case 'A': 293 | return 'T'; 294 | case 'T': 295 | return 'A'; 296 | case 'G': 297 | return 'C'; 298 | case 'C': 299 | return 'G'; 300 | default: 301 | return n; 302 | } 303 | } 304 | 305 | template 306 | struct mt_param_t 307 | { 308 | // Parameters 309 | ms_t *ms; 310 | std::string pattern_filename; 311 | std::string out_filename; 312 | size_t start; 313 | size_t end; 314 | size_t wk_id; 315 | }; 316 | 317 | template 318 | void *mt_ms_worker(void *param) 319 | { 320 | mt_param_t *p = (mt_param_t*) param; 321 | size_t n_reads = 0; 322 | size_t n_processed_reads 
= 0; 323 | 324 | FILE *out_fd; 325 | gzFile fp; 326 | 327 | if ((out_fd = fopen(p->out_filename.c_str(), "w")) == nullptr) 328 | error("open() file " + p->out_filename + " failed"); 329 | 330 | if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL) 331 | error("open() file " + p->pattern_filename + " failed"); 332 | 333 | gzseek(fp, p->start, SEEK_SET); 334 | 335 | kseq_t rev; 336 | int l; 337 | 338 | kseq_t *seq = kseq_init(fp); 339 | while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0)) 340 | { 341 | 342 | p->ms->matching_statistics(seq,out_fd); 343 | 344 | } 345 | 346 | kseq_destroy(seq); 347 | gzclose(fp); 348 | fclose(out_fd); 349 | 350 | return NULL; 351 | } 352 | 353 | template 354 | void mt_ms(ms_t *ms, std::string pattern_filename, std::string out_filename, size_t n_threads) 355 | { 356 | pthread_t t[n_threads] = {0}; 357 | mt_param_t params[n_threads]; 358 | std::vector starts = split_fastq(pattern_filename, n_threads); 359 | for(size_t i = 0; i < n_threads; ++i) 360 | { 361 | params[i].ms = ms; 362 | params[i].pattern_filename = pattern_filename; 363 | params[i].out_filename = out_filename + "_" + std::to_string(i) + ".ms.tmp.out"; 364 | params[i].start = starts[i]; 365 | params[i].end = starts[i+1]; 366 | params[i].wk_id = i; 367 | xpthread_create(&t[i], NULL, &mt_ms_worker, ¶ms[i], __LINE__, __FILE__); 368 | } 369 | 370 | for(size_t i = 0; i < n_threads; ++i) 371 | { 372 | xpthread_join(t[i],NULL,__LINE__,__FILE__); 373 | } 374 | 375 | // sleep(5); 376 | 377 | 378 | return; 379 | } 380 | 381 | 382 | //////////////////////////////////////////////////////////////////////////////// 383 | /// Single Thread 384 | //////////////////////////////////////////////////////////////////////////////// 385 | template 386 | size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename) 387 | { 388 | size_t n_reads = 0; 389 | size_t n_processed_reads = 0; 390 | kseq_t rev; 391 | int l; 392 | FILE *out_fd; 393 | 394 | out_filename += 
"_0.ms.tmp.out"; 395 | 396 | if ((out_fd = fopen(out_filename.c_str(), "w")) == nullptr) 397 | error("open() file " + out_filename + " failed"); 398 | 399 | gzFile fp = gzopen(pattern_filename.c_str(), "r"); 400 | kseq_t* seq = kseq_init(fp); 401 | while ((l = kseq_read(seq)) >= 0) 402 | { 403 | 404 | ms->matching_statistics(seq, out_fd); 405 | 406 | } 407 | 408 | kseq_destroy(seq); 409 | gzclose(fp); 410 | fclose(out_fd); 411 | 412 | // sleep(5); 413 | 414 | return n_processed_reads; 415 | } 416 | 417 | 418 | typedef std::pair> pattern_t; 419 | 420 | //*********************** Argument options *************************************** 421 | // struct containing command line parameters and other globals 422 | struct Args 423 | { 424 | std::string filename = ""; 425 | std::string patterns = ""; // path to patterns file 426 | std::string output = ""; // output file prefix 427 | size_t l = 25; // minumum MEM length 428 | size_t th = 1; // number of threads 429 | bool shaped_slp = false; // use shaped slp 430 | }; 431 | 432 | void parseArgs(int argc, char *const argv[], Args &arg) 433 | { 434 | int c; 435 | extern char *optarg; 436 | extern int optind; 437 | 438 | std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" + 439 | "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" + 440 | "shaped_slp: [boolean] - use shaped slp. (def. false)\n" + 441 | " pattens: [string] - path to patterns file.\n" + 442 | " output: [string] - output file prefix.\n" + 443 | " len: [integer] - minimum MEM lengt (def. 25)\n" + 444 | " thread: [integer] - number of threads (def. 
1)\n"); 445 | 446 | std::string sarg; 447 | while ((c = getopt(argc, argv, "l:hp:o:t:")) != -1) 448 | { 449 | switch (c) 450 | { 451 | case 'p': 452 | arg.patterns.assign(optarg); 453 | break; 454 | case 'o': 455 | arg.output.assign(optarg); 456 | break; 457 | case 'l': 458 | sarg.assign(optarg); 459 | arg.l = stoi(sarg); 460 | break; 461 | case 't': 462 | sarg.assign(optarg); 463 | arg.th = stoi(sarg); 464 | break; 465 | case 'q': 466 | arg.shaped_slp = true; 467 | break; 468 | case 'h': 469 | error(usage); 470 | case '?': 471 | error("Unknown option.\n", usage); 472 | exit(1); 473 | } 474 | } 475 | // the only input parameter is the file name 476 | if (argc == optind + 1) 477 | { 478 | arg.filename.assign(argv[optind]); 479 | } 480 | else 481 | { 482 | error("Invalid number of arguments\n", usage); 483 | } 484 | } 485 | 486 | //********** end argument options ******************** 487 | 488 | template 489 | void dispatcher(Args &args) 490 | { 491 | verbose("Construction of the matching statistics data structure"); 492 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 493 | 494 | ms_t ms(args.filename); 495 | 496 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 497 | verbose("Memory peak: ", malloc_count_peak()); 498 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 499 | 500 | verbose("Processing patterns"); 501 | t_insert_start = std::chrono::high_resolution_clock::now(); 502 | 503 | std::string base_name = basename(args.filename.data()); 504 | std::string out_filename = args.patterns + "_" + base_name; 505 | if(args.output != "") 506 | out_filename = args.output; 507 | 508 | if (is_gzipped(args.patterns)) 509 | { 510 | verbose("The input is gzipped - forcing single thread matching statistics."); 511 | args.th = 1; 512 | } 513 | 514 | if (args.th == 1) 515 | st_ms(&ms, args.patterns, out_filename); 516 
| else 517 | mt_ms(&ms, args.patterns, out_filename, args.th); 518 | 519 | // TODO: Merge the SAM files. 520 | 521 | t_insert_end = std::chrono::high_resolution_clock::now(); 522 | 523 | verbose("Memory peak: ", malloc_count_peak()); 524 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 525 | 526 | auto mem_peak = malloc_count_peak(); 527 | verbose("Memory peak: ", malloc_count_peak()); 528 | 529 | verbose("Printing plain output"); 530 | t_insert_start = std::chrono::high_resolution_clock::now(); 531 | 532 | std::ofstream f_pointers(out_filename + ".pointers"); 533 | std::ofstream f_lengths(out_filename + ".lengths"); 534 | 535 | if (!f_pointers.is_open()) 536 | error("open() file " + std::string(out_filename) + ".pointers failed"); 537 | 538 | if (!f_lengths.is_open()) 539 | error("open() file " + std::string(out_filename) + ".lengths failed"); 540 | 541 | size_t n_seq = 0; 542 | for (size_t i = 0; i < args.th; ++i) 543 | { 544 | std::string tmp_filename = out_filename + "_" + std::to_string(i) + ".ms.tmp.out"; 545 | FILE *in_fd; 546 | 547 | if ((in_fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 548 | error("open() file " + tmp_filename + " failed"); 549 | 550 | size_t length = 0; 551 | size_t m = 100; // Reserved size for pointers and lengths 552 | size_t *mem = (size_t *)malloc(m * sizeof(size_t)); 553 | size_t s = 100; // Reserved size for read name 554 | char* rname = (char *)malloc(s * sizeof(char)); 555 | while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0) 556 | { 557 | // Reading read name 558 | if (s < length) 559 | { 560 | // Resize lengths and pointers 561 | s = length; 562 | rname = (char *)realloc(rname, m * sizeof(char)); 563 | } 564 | 565 | if ((fread(rname, sizeof(char), length, in_fd)) != length) 566 | error("fread() file " + std::string(tmp_filename) + " failed"); 567 | 568 | // TODO: Store the fasta headers somewhere 569 | f_pointers << ">" + std::string(rname,length) << endl; 
570 | f_lengths << ">" + std::string(rname,length) << endl; 571 | 572 | // Reading MEMs 573 | if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1) 574 | error("fread() file " + std::string(tmp_filename) + " failed"); 575 | 576 | if (m < length) 577 | { 578 | // Resize lengths and pointers 579 | m = length; 580 | mem = (size_t *)realloc(mem, m * sizeof(size_t)); 581 | } 582 | 583 | if ((fread(mem, sizeof(size_t), length, in_fd)) != length) 584 | error("fread() file " + std::string(tmp_filename) + " failed"); 585 | 586 | // TODO: Store the fasta headers somewhere 587 | // f_pointers << ">" + std::to_string(n_seq) << endl; 588 | for (size_t i = 0; i < length; ++i) 589 | f_pointers << mem[i] << " "; 590 | f_pointers << endl; 591 | 592 | if ((fread(mem, sizeof(size_t), length, in_fd)) != length) 593 | error("fread() file " + std::string(tmp_filename) + " failed"); 594 | 595 | // f_lengths << ">" + std::to_string(n_seq) << endl; 596 | for (size_t i = 0; i < length; ++i) 597 | f_lengths << mem[i] << " "; 598 | f_lengths << endl; 599 | 600 | n_seq++; 601 | } 602 | fclose(in_fd); 603 | if (std::remove(tmp_filename.c_str()) != 0) 604 | error("remove() file " + tmp_filename + " failed"); 605 | } 606 | 607 | f_pointers.close(); 608 | f_lengths.close(); 609 | 610 | t_insert_end = std::chrono::high_resolution_clock::now(); 611 | 612 | verbose("Memory peak: ", malloc_count_peak()); 613 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 614 | 615 | mem_peak = malloc_count_peak(); 616 | verbose("Memory peak: ", malloc_count_peak()); 617 | } 618 | 619 | int main(int argc, char *const argv[]) 620 | { 621 | Args args; 622 | parseArgs(argc, argv, args); 623 | 624 | if (args.shaped_slp) 625 | { 626 | dispatcher>(args); 627 | } 628 | else 629 | { 630 | dispatcher>(args); 631 | } 632 | return 0; 633 | } -------------------------------------------------------------------------------- /src/mems.cpp: 
-------------------------------------------------------------------------------- 1 | /* mems - Computes the MEMs from BWT and Thresholds 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file mems.cpp 16 | \brief mems.cpp Computes the MEMs from BWT and Thresholds. 17 | \author Massimiliano Rossi 18 | \date 13/07/2020 19 | */ 20 | 21 | extern "C" { 22 | #include 23 | } 24 | 25 | #include 26 | 27 | #define VERBOSE 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include 34 | 35 | #include 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #include 44 | 45 | //////////////////////////////////////////////////////////////////////////////// 46 | /// kseq extra 47 | //////////////////////////////////////////////////////////////////////////////// 48 | 49 | static inline size_t ks_tell(kseq_t *seq) 50 | { 51 | return gztell(seq->f->f) - seq->f->end + seq->f->begin; 52 | } 53 | 54 | void copy_kstring_t(kstring_t &l, kstring_t &r) 55 | { 56 | l.l = r.l; 57 | l.m = r.m; 58 | l.s = (char *)malloc(l.m); 59 | for (size_t i = 0; i < r.m; ++i) 60 | l.s[i] = r.s[i]; 61 | } 62 | void copy_kseq_t(kseq_t *l, kseq_t *r) 63 | { 64 | copy_kstring_t(l->name, r->name); 65 | copy_kstring_t(l->comment, r->comment); 66 | copy_kstring_t(l->seq, r->seq); 67 | copy_kstring_t(l->qual, r->qual); 68 | l->last_char 
= r->last_char; 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | /// Parallel computation 73 | //////////////////////////////////////////////////////////////////////////////// 74 | 75 | // This should be done using buffering. 76 | size_t next_start_fastq(gzFile fp) 77 | { 78 | int c; 79 | // Special case when we arr at the beginning of the file. 80 | if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@') 81 | return 0; 82 | 83 | // Strart from the previous character 84 | gzseek(fp, -1, SEEK_CUR); 85 | 86 | std::vector> window; 87 | // Find the first new line 88 | for (size_t i = 0; i < 4; ++i) 89 | { 90 | while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n')) 91 | { 92 | } 93 | if (c == EOF) 94 | return gztell(fp); 95 | if ((c = gzgetc(fp)) == EOF) 96 | return gztell(fp); 97 | window.push_back(std::make_pair(c, gztell(fp) - 1)); 98 | } 99 | 100 | for (size_t i = 0; i < 2; ++i) 101 | { 102 | if (window[i].first == '@' && window[i + 2].first == '+') 103 | return window[i].second; 104 | if (window[i].first == '+' && window[i + 2].first == '@') 105 | return window[i + 2].second; 106 | } 107 | 108 | return gztell(fp); 109 | } 110 | 111 | // test if the file is gzipped 112 | static inline bool is_gzipped(std::string filename) 113 | { 114 | FILE *fp = fopen(filename.c_str(), "rb"); 115 | if(fp == NULL) error("Opening file " + filename); 116 | int byte1 = 0, byte2 = 0; 117 | fread(&byte1, sizeof(char), 1, fp); 118 | fread(&byte2, sizeof(char), 1, fp); 119 | fclose(fp); 120 | return (byte1 == 0x1f && byte2 == 0x8b); 121 | } 122 | 123 | // Return the length of the file 124 | // Assumes that the file is not compressed 125 | static inline size_t get_file_size(std::string filename) 126 | { 127 | if (is_gzipped(filename)) 128 | { 129 | std::cerr << "The input is gzipped!" 
<< std::endl; 130 | return -1; 131 | } 132 | FILE *fp = fopen(filename.c_str(), "r"); 133 | fseek(fp, 0L, SEEK_END); 134 | size_t size = ftell(fp); 135 | fclose(fp); 136 | return size; 137 | } 138 | 139 | std::vector split_fastq(std::string filename, size_t n_threads) 140 | { 141 | //Precondition: the file is not gzipped 142 | // scan file for start positions and execute threads 143 | size_t size = get_file_size(filename); 144 | 145 | gzFile fp = gzopen(filename.c_str(), "r"); 146 | if (fp == Z_NULL) 147 | { 148 | throw new std::runtime_error("Cannot open input file " + filename); 149 | } 150 | 151 | std::vector starts(n_threads + 1); 152 | for (int i = 0; i < n_threads + 1; ++i) 153 | { 154 | size_t start = (size_t)((size * i) / n_threads); 155 | gzseek(fp, start, SEEK_SET); 156 | starts[i] = next_start_fastq(fp); 157 | } 158 | gzclose(fp); 159 | return starts; 160 | } 161 | 162 | //////////////////////////////////////////////////////////////////////////////// 163 | /// SLP definitions 164 | //////////////////////////////////////////////////////////////////////////////// 165 | 166 | using SelSd = SelectSdvec<>; 167 | using DagcSd = DirectAccessibleGammaCode; 168 | using Fblc = FixedBitLenCode<>; 169 | 170 | using shaped_slp_t = SelfShapedSlp; 171 | using plain_slp_t = PlainSlp; 172 | 173 | template 174 | std::string get_slp_file_extension() 175 | { 176 | return std::string(".slp"); 177 | } 178 | 179 | template <> 180 | std::string get_slp_file_extension() 181 | { 182 | return std::string(".slp"); 183 | } 184 | 185 | template <> 186 | std::string get_slp_file_extension() 187 | { 188 | return std::string(".plain.slp"); 189 | } 190 | //////////////////////////////////////////////////////////////////////////////// 191 | 192 | template 193 | class mems_c 194 | { 195 | public: 196 | 197 | mems_c(std::string filename) 198 | { 199 | verbose("Loading the matching statistics index"); 200 | std::chrono::high_resolution_clock::time_point t_insert_start = 
std::chrono::high_resolution_clock::now(); 201 | 202 | std::string filename_ms = filename + ms.get_file_extension(); 203 | 204 | ifstream fs_ms(filename_ms); 205 | ms.load(fs_ms); 206 | fs_ms.close(); 207 | 208 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 209 | 210 | verbose("Matching statistics index construction complete"); 211 | verbose("Memory peak: ", malloc_count_peak()); 212 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 213 | 214 | verbose("Loading random access"); 215 | t_insert_start = std::chrono::high_resolution_clock::now(); 216 | 217 | std::string filename_slp = filename + get_slp_file_extension(); 218 | 219 | ifstream fs(filename_slp); 220 | ra.load(fs); 221 | fs.close(); 222 | 223 | n = ra.getLen(); 224 | 225 | t_insert_end = std::chrono::high_resolution_clock::now(); 226 | 227 | verbose("Matching statistics index loading complete"); 228 | verbose("Memory peak: ", malloc_count_peak()); 229 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 230 | } 231 | 232 | // Destructor 233 | ~mems_c() 234 | { 235 | // NtD 236 | } 237 | 238 | // The outfile has the following format. The first size_t integer store the 239 | // length l of the name. Then the following l characters stores the name of 240 | // the read, the next size_t integer stores the number m of MEMs, and the 241 | // following m size_t pairs of integers stores the positions and lengths of 242 | // the MEMs. 
243 | void maximal_exact_matches(kseq_t *read, FILE* out) 244 | { 245 | auto pointers = ms.query(read->seq.s, read->seq.l); 246 | std::vector lengths(pointers.size()); 247 | std::vector> mems; 248 | 249 | size_t l = 0; 250 | for (size_t i = 0; i < pointers.size(); ++i) 251 | { 252 | size_t pos = pointers[i]; 253 | while ((i + l) < read->seq.l && (pos + l) < n && (i < 1 || pos != (pointers[i-1] + 1) ) && read->seq.s[i + l] == ra.charAt(pos + l)) 254 | ++l; 255 | 256 | lengths[i] = l; 257 | l = (l == 0 ? 0 : (l - 1)); 258 | 259 | if((i == 0) or (lengths[i] >= lengths[i-1])) 260 | mems.push_back(make_tuple(i,lengths[i],pos)); 261 | } 262 | 263 | // Original MS computation 264 | // for (size_t i = 0; i < pointers.size(); ++i) 265 | // { 266 | // size_t pos = pointers[i]; 267 | // while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l)) 268 | // ++l; 269 | 270 | // lengths[i] = l; 271 | // l = (l == 0 ? 0 : (l - 1)); 272 | // } 273 | 274 | assert(lengths.size() == pointers.size()); 275 | 276 | size_t h_length = read->name.l; 277 | fwrite(&h_length, sizeof(size_t), 1,out); 278 | fwrite(read->name.s, sizeof(char),h_length,out); 279 | if(sam_output) 280 | { 281 | size_t s_length = read->seq.l; 282 | fwrite(&s_length, sizeof(size_t), 1,out); 283 | fwrite(read->seq.s, sizeof(char),s_length,out); 284 | fwrite(read->qual.s, sizeof(char),s_length,out); 285 | } 286 | size_t q_length = mems.size(); 287 | fwrite(&q_length, sizeof(size_t), 1,out); 288 | fwrite(mems.data(), sizeof(std::tuple),q_length,out); 289 | } 290 | 291 | protected: 292 | ms_pointers<> ms; 293 | slp_t ra; 294 | size_t n = 0; 295 | }; 296 | 297 | 298 | 299 | char complement(char n) 300 | { 301 | switch (n) 302 | { 303 | case 'A': 304 | return 'T'; 305 | case 'T': 306 | return 'A'; 307 | case 'G': 308 | return 'C'; 309 | case 'C': 310 | return 'G'; 311 | default: 312 | return n; 313 | } 314 | } 315 | 316 | template 317 | struct mt_param_t 318 | { 319 | // Parameters 320 | ms_t 
*ms; 321 | std::string pattern_filename; 322 | std::string out_filename; 323 | size_t start; 324 | size_t end; 325 | size_t wk_id; 326 | }; 327 | 328 | template 329 | void *mt_ms_worker(void *param) 330 | { 331 | mt_param_t *p = (mt_param_t*) param; 332 | size_t n_reads = 0; 333 | size_t n_processed_reads = 0; 334 | 335 | FILE *out_fd; 336 | gzFile fp; 337 | 338 | if ((out_fd = fopen(p->out_filename.c_str(), "w")) == nullptr) 339 | error("open() file " + p->out_filename + " failed"); 340 | 341 | if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL) 342 | error("open() file " + p->pattern_filename + " failed"); 343 | 344 | gzseek(fp, p->start, SEEK_SET); 345 | 346 | kseq_t rev; 347 | int l; 348 | 349 | kseq_t *seq = kseq_init(fp); 350 | while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0)) 351 | { 352 | 353 | p->ms->maximal_exact_matches(seq,out_fd); 354 | 355 | } 356 | 357 | kseq_destroy(seq); 358 | gzclose(fp); 359 | fclose(out_fd); 360 | 361 | return NULL; 362 | } 363 | 364 | template 365 | void mt_ms(ms_t *ms, std::string pattern_filename, std::string out_filename, size_t n_threads) 366 | { 367 | pthread_t t[n_threads] = {0}; 368 | mt_param_t params[n_threads]; 369 | std::vector starts = split_fastq(pattern_filename, n_threads); 370 | for(size_t i = 0; i < n_threads; ++i) 371 | { 372 | params[i].ms = ms; 373 | params[i].pattern_filename = pattern_filename; 374 | params[i].out_filename = out_filename + "_" + std::to_string(i) + ".mems.tmp.out"; 375 | params[i].start = starts[i]; 376 | params[i].end = starts[i+1]; 377 | params[i].wk_id = i; 378 | xpthread_create(&t[i], NULL, &mt_ms_worker, ¶ms[i], __LINE__, __FILE__); 379 | } 380 | 381 | for(size_t i = 0; i < n_threads; ++i) 382 | { 383 | xpthread_join(t[i],NULL,__LINE__,__FILE__); 384 | } 385 | 386 | // sleep(5); 387 | 388 | 389 | return; 390 | } 391 | 392 | 393 | //////////////////////////////////////////////////////////////////////////////// 394 | /// Single Thread 395 | 
//////////////////////////////////////////////////////////////////////////////// 396 | template 397 | size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename) 398 | { 399 | size_t n_reads = 0; 400 | size_t n_processed_reads = 0; 401 | kseq_t rev; 402 | int l; 403 | FILE *out_fd; 404 | 405 | out_filename += "_0.mems.tmp.out"; 406 | 407 | if ((out_fd = fopen(out_filename.c_str(), "w")) == nullptr) 408 | error("open() file " + out_filename + " failed"); 409 | 410 | gzFile fp = gzopen(pattern_filename.c_str(), "r"); 411 | kseq_t* seq = kseq_init(fp); 412 | while ((l = kseq_read(seq)) >= 0) 413 | { 414 | 415 | ms->maximal_exact_matches(seq, out_fd); 416 | 417 | } 418 | 419 | kseq_destroy(seq); 420 | gzclose(fp); 421 | fclose(out_fd); 422 | 423 | // sleep(5); 424 | 425 | return n_processed_reads; 426 | } 427 | 428 | 429 | typedef std::pair> pattern_t; 430 | 431 | //*********************** Argument options *************************************** 432 | // struct containing command line parameters and other globals 433 | struct Args 434 | { 435 | std::string filename = ""; 436 | std::string patterns = ""; // path to patterns file 437 | std::string output = ""; // output file prefix 438 | size_t l = 25; // minumum MEM length 439 | size_t th = 1; // number of threads 440 | bool shaped_slp = false; // use shaped slp 441 | bool extended_output = false; // print one MEM occurrence in the reference 442 | bool sam_output = false; // output MEMs in SAM format 443 | }; 444 | 445 | void parseArgs(int argc, char *const argv[], Args &arg) 446 | { 447 | int c; 448 | extern char *optarg; 449 | extern int optind; 450 | 451 | std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-s sam_output] [-b batch]\n\n" + 452 | "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" + 453 | " shaped_slp: [boolean] - use shaped slp. (def. 
false)\n" + 454 | "extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" + 455 | " sam_output: [boolean] - print output in SAM format. (def. false)\n" + 456 | " pattens: [string] - path to patterns file.\n" + 457 | " output: [string] - output file prefix.\n" + 458 | " len: [integer] - minimum MEM lengt (def. 25)\n" + 459 | " thread: [integer] - number of threads (def. 1)\n"); 460 | 461 | std::string sarg; 462 | while ((c = getopt(argc, argv, "l:hp:o:t:qes")) != -1) 463 | { 464 | switch (c) 465 | { 466 | case 'p': 467 | arg.patterns.assign(optarg); 468 | break; 469 | case 'o': 470 | arg.output.assign(optarg); 471 | break; 472 | case 'l': 473 | sarg.assign(optarg); 474 | arg.l = stoi(sarg); 475 | break; 476 | case 't': 477 | sarg.assign(optarg); 478 | arg.th = stoi(sarg); 479 | break; 480 | case 'q': 481 | arg.shaped_slp = true; 482 | break; 483 | case 'e': 484 | arg.extended_output = true; 485 | break; 486 | case 's': 487 | arg.sam_output = true; 488 | break; 489 | case 'h': 490 | error(usage); 491 | case '?': 492 | error("Unknown option.\n", usage); 493 | exit(1); 494 | } 495 | } 496 | // the only input parameter is the file name 497 | if (argc == optind + 1) 498 | { 499 | arg.filename.assign(argv[optind]); 500 | } 501 | else 502 | { 503 | error("Invalid number of arguments\n", usage); 504 | } 505 | 506 | if (arg.extended_output && arg.sam_output) { 507 | error("Cannot specify both extended_output and sam_output flags.\n", usage); 508 | } 509 | } 510 | 511 | //********** end argument options ******************** 512 | 513 | template 514 | void dispatcher(Args &args) 515 | { 516 | verbose("Construction of the matching statistics data structure"); 517 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 518 | 519 | ms_t ms(args.filename); 520 | 521 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 522 | verbose("Memory peak: ", 
malloc_count_peak()); 523 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 524 | 525 | verbose("Processing patterns"); 526 | t_insert_start = std::chrono::high_resolution_clock::now(); 527 | 528 | std::string base_name = basename(args.filename.data()); 529 | std::string out_filename = args.patterns + "_" + base_name; 530 | if(args.output != "") 531 | out_filename = args.output; 532 | 533 | if (is_gzipped(args.patterns)) 534 | { 535 | verbose("The input is gzipped - forcing single thread matching statistics."); 536 | args.th = 1; 537 | } 538 | 539 | if (args.th == 1) 540 | st_ms(&ms, args.patterns, out_filename); 541 | else 542 | mt_ms(&ms, args.patterns, out_filename, args.th); 543 | 544 | // TODO: Merge the SAM files. 545 | 546 | t_insert_end = std::chrono::high_resolution_clock::now(); 547 | 548 | verbose("Memory peak: ", malloc_count_peak()); 549 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 550 | 551 | auto mem_peak = malloc_count_peak(); 552 | verbose("Memory peak: ", malloc_count_peak()); 553 | 554 | seqidx idx; 555 | 556 | std::string filename_idx = args.filename + idx.get_file_extension(); 557 | verbose("Loading fasta index file: " + filename_idx); 558 | t_insert_start = std::chrono::high_resolution_clock::now(); 559 | 560 | ifstream fs_idx(filename_idx); 561 | idx.load(fs_idx); 562 | fs_idx.close(); 563 | 564 | t_insert_end = std::chrono::high_resolution_clock::now(); 565 | 566 | verbose("Fasta index loading complete"); 567 | verbose("Memory peak: ", malloc_count_peak()); 568 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 569 | 570 | verbose("Printing plain output"); 571 | t_insert_start = std::chrono::high_resolution_clock::now(); 572 | 573 | std::string mems_file_suffix = args.sam_output ? 
".sam" : ".mems"; 574 | std::ofstream f_mems(out_filename + mems_file_suffix); 575 | 576 | if (!f_mems.is_open()) 577 | error("open() file " + std::string(out_filename) + mems_file_suffix + " failed"); 578 | 579 | if(args.sam_output) 580 | { 581 | f_mems << "@HD\tVN:1.6\tSO:unknown\n"; 582 | f_mems << idx.to_sam(); 583 | f_mems << "@PG\tID:moni\tPN:moni\tVN:0.2.2\n"; 584 | } 585 | 586 | size_t n_seq = 0; 587 | for (size_t i = 0; i < args.th; ++i) 588 | { 589 | std::string tmp_filename = out_filename + "_" + std::to_string(i) + ".mems.tmp.out"; 590 | FILE *in_fd; 591 | 592 | if ((in_fd = fopen(tmp_filename.c_str(), "r")) == nullptr) 593 | error("open() file " + tmp_filename + " failed"); 594 | 595 | size_t length = 0; 596 | size_t rname_l = 0; 597 | size_t s_length = 0; 598 | size_t m = 100; // Reserved size for pointers and lengths 599 | std::vector> mem(m); 600 | size_t s = 100; // Reserved size for read name 601 | size_t rseq_l = 100; // Reserved size for seq and qual 602 | char* rname = (char *)malloc(s * sizeof(char)); 603 | char *rseq = (char *)malloc(rseq_l * sizeof(char)); 604 | char *rqual = (char *)malloc(rseq_l * sizeof(char)); 605 | while (!feof(in_fd) and fread(&rname_l, sizeof(size_t), 1, in_fd) > 0) 606 | { 607 | // Reading read name 608 | if (s < rname_l) 609 | { 610 | // Resize lengths and pointers 611 | s = rname_l; 612 | rname = (char *)realloc(rname, s * sizeof(char)); 613 | } 614 | 615 | if ((fread(rname, sizeof(char), rname_l, in_fd)) != rname_l) 616 | error("fread() file " + std::string(tmp_filename) + " failed"); 617 | 618 | // In case of SAM output read also the sequence and quals 619 | if (args.sam_output) 620 | { 621 | if ((fread(&s_length, sizeof(size_t), 1, in_fd)) != 1) 622 | error("fread() file " + std::string(tmp_filename) + " failed"); 623 | if (rseq_l < s_length) 624 | { 625 | // Resize s_lengths and pointers 626 | rseq_l = s_length; 627 | rseq = (char *)realloc(rseq, rseq_l * sizeof(char)); 628 | rqual = (char *)realloc(rqual, 
rseq_l * sizeof(char)); 629 | } 630 | if ((fread(rseq, sizeof(char), s_length, in_fd)) != s_length) 631 | error("fread() file " + std::string(tmp_filename) + " failed"); 632 | if ((fread(rqual, sizeof(char), s_length, in_fd)) != s_length) 633 | error("fread() file " + std::string(tmp_filename) + " failed"); 634 | } 635 | else 636 | { 637 | f_mems << ">" + std::string(rname, rname_l) << endl; 638 | } 639 | 640 | // Reading MEMs 641 | if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1) 642 | error("fread() file " + std::string(tmp_filename) + " failed"); 643 | 644 | if (m < length) 645 | { 646 | // Resize lengths and pointers 647 | m = length; 648 | mem.resize(m); 649 | } 650 | 651 | if ((fread(mem.data(), sizeof(std::tuple), length, in_fd)) != length) 652 | error("fread() file " + std::string(tmp_filename) + " failed"); 653 | 654 | // TODO: Store the fasta headers somewhere 655 | // f_mems << ">" + std::to_string(n_seq) << endl; 656 | if (args.sam_output){ 657 | for (size_t i = 0; i < length; ++i) 658 | { 659 | size_t mem_pos = std::get<0>(mem[i]); 660 | size_t mem_len = std::get<1>(mem[i]); 661 | std::pair pos = idx.index(std::get<2>(mem[i])); 662 | f_mems << std::string(rname,rname_l) + "\t"; 663 | // First MEM is primary, all other MEMs are non primary 664 | f_mems << (i?"256\t":"0\t"); 665 | f_mems << pos.first << "\t" << pos.second + 1<< "\t60\t"; 666 | std::string cigar = ""; 667 | if (mem_pos > 0) cigar += std::to_string(mem_pos) + "S"; 668 | cigar += std::to_string(mem_len) + "M"; 669 | size_t suff_length = s_length - (mem_pos + mem_len); 670 | if (suff_length > 0) cigar += std::to_string(suff_length) + "S"; 671 | f_mems << cigar + "\t" + std::string(rseq, s_length) + "\t" + std::string(rqual, s_length) + "\n"; 672 | } 673 | } else 674 | { 675 | if (args.extended_output){ 676 | for (size_t i = 0; i < length; ++i) 677 | { 678 | std::pair pos = idx.index(std::get<2>(mem[i])); 679 | f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << 
pos.first << "," << pos.second << ") "; 680 | } 681 | } else { 682 | for (size_t i = 0; i < length; ++i) 683 | { 684 | f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") "; 685 | } 686 | 687 | } 688 | f_mems << endl; 689 | } 690 | 691 | n_seq++; 692 | } 693 | fclose(in_fd); 694 | if (std::remove(tmp_filename.c_str()) != 0) 695 | error("remove() file " + tmp_filename + " failed"); 696 | } 697 | 698 | f_mems.close(); 699 | 700 | t_insert_end = std::chrono::high_resolution_clock::now(); 701 | 702 | verbose("Memory peak: ", malloc_count_peak()); 703 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 704 | 705 | mem_peak = malloc_count_peak(); 706 | verbose("Memory peak: ", malloc_count_peak()); 707 | } 708 | 709 | int main(int argc, char *const argv[]) 710 | { 711 | Args args; 712 | parseArgs(argc, argv, args); 713 | 714 | if (args.shaped_slp) 715 | { 716 | if (args.sam_output) 717 | dispatcher>(args); 718 | else 719 | dispatcher>(args); 720 | } 721 | else 722 | { 723 | if(args.sam_output) 724 | dispatcher>(args); 725 | else 726 | dispatcher>(args); 727 | } 728 | return 0; 729 | } -------------------------------------------------------------------------------- /src/rlebwt_ms_build.cpp: -------------------------------------------------------------------------------- 1 | /* rlebwt_ms_build - Build the matching statistics data structure 2 | Copyright (C) 2020 Massimiliano Rossi 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | This program is distributed in the hope that it will be useful, 8 | but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 | GNU General Public License for more details. 
11 | You should have received a copy of the GNU General Public License 12 | along with this program. If not, see http://www.gnu.org/licenses/ . 13 | */ 14 | /*! 15 | \file rlebwt_ms_build.cpp 16 | \brief rlebwt_ms_build.cpp Build the matching statistics data structure. 17 | \author Massimiliano Rossi 18 | \date 13/07/2020 19 | */ 20 | 21 | #include 22 | 23 | #define VERBOSE 24 | 25 | #include 26 | 27 | #include 28 | 29 | #include 30 | 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | //*********************** Argument options *************************************** 38 | // struct containing command line parameters and other globals 39 | struct Args 40 | { 41 | std::string filename = ""; 42 | bool memo = false; // print the memory usage 43 | bool csv = false; // print stats on stderr in csv format 44 | bool rle = false; // outpt RLBWT 45 | }; 46 | 47 | void parseArgs(int argc, char *const argv[], Args &arg) 48 | { 49 | int c; 50 | extern char *optarg; 51 | extern int optind; 52 | 53 | std::string usage("usage: " + std::string(argv[0]) + " infile [-s store] [-m memo] [-c csv] [-p patterns] [-f fasta] [-r rle] [-t threads] [-l len]\n\n" + 54 | "Computes the pfp data structures of infile, provided that infile.parse, infile.dict, and infile.occ exists.\n" + 55 | " memo: [boolean] - print the data structure memory usage. (def. false)\n" + 56 | " rle: [boolean] - output run length encoded BWT. (def. false)\n" + 57 | " csv: [boolean] - print the stats in csv form on strerr. (def. 
false)\n"); 58 | 59 | std::string sarg; 60 | while ((c = getopt(argc, argv, "mcrh")) != -1) 61 | { 62 | switch (c) 63 | { 64 | case 'm': 65 | arg.memo = true; 66 | break; 67 | case 'c': 68 | arg.csv = true; 69 | break; 70 | case 'r': 71 | arg.rle = true; 72 | break; 73 | case 'h': 74 | error(usage); 75 | case '?': 76 | error("Unknown option.\n", usage); 77 | exit(1); 78 | } 79 | } 80 | // the only input parameter is the file name 81 | if (argc == optind + 1) 82 | { 83 | arg.filename.assign(argv[optind]); 84 | } 85 | else 86 | { 87 | error("Invalid number of arguments\n", usage); 88 | } 89 | } 90 | 91 | //********** end argument options ******************** 92 | 93 | int main(int argc, char *const argv[]) 94 | { 95 | using SelSd = SelectSdvec<>; 96 | using DagcSd = DirectAccessibleGammaCode; 97 | 98 | Args args; 99 | parseArgs(argc, argv, args); 100 | 101 | // Building the r-index 102 | 103 | verbose("Building the matching statistics index"); 104 | std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now(); 105 | 106 | ms_pointers<> ms(args.filename, true); 107 | 108 | std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now(); 109 | 110 | verbose("Matching statistics index construction complete"); 111 | verbose("Memory peak: ", malloc_count_peak()); 112 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 113 | 114 | 115 | std::string outfile = args.filename + ms.get_file_extension(); 116 | std::ofstream out(outfile); 117 | ms.serialize(out); 118 | 119 | // size_t ra_size = sdsl::size_in_bytes(ra); 120 | 121 | 122 | t_insert_end = std::chrono::high_resolution_clock::now(); 123 | 124 | verbose("Memory peak: ", malloc_count_peak()); 125 | verbose("Elapsed time (s): ", std::chrono::duration>(t_insert_end - t_insert_start).count()); 126 | 127 | auto mem_peak = malloc_count_peak(); 128 | verbose("Memory peak: ", malloc_count_peak()); 129 
| 130 | size_t space = 0; 131 | if (args.memo) 132 | { 133 | sdsl::nullstream ns; 134 | 135 | size_t ms_size = ms.serialize(ns); 136 | verbose("MS size (bytes): ", ms_size); 137 | } 138 | 139 | if (args.csv) 140 | std::cerr << csv(args.filename.c_str(), time, space, mem_peak) << std::endl; 141 | 142 | return 0; 143 | } -------------------------------------------------------------------------------- /thirdparty/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | 3 | ## Add malloc_count 4 | FetchContent_Declare( 5 | malloc_count 6 | GIT_REPOSITORY https://github.com/bingmann/malloc_count 7 | ) 8 | 9 | FetchContent_GetProperties(malloc_count) 10 | if(NOT malloc_count_POPULATED) 11 | FetchContent_Populate(malloc_count) 12 | 13 | add_library(malloc_count OBJECT ${malloc_count_SOURCE_DIR}/malloc_count.c ${malloc_count_SOURCE_DIR}/malloc_count.h) 14 | target_link_libraries(malloc_count dl) 15 | target_include_directories(malloc_count PUBLIC "${malloc_count_SOURCE_DIR}") 16 | 17 | add_library(memprofile OBJECT ${malloc_count_SOURCE_DIR}/memprofile.h) 18 | target_include_directories(memprofile PUBLIC "${malloc_count_SOURCE_DIR}") 19 | endif() 20 | 21 | # # Add klib 22 | set(KLIB_COMMIT "9a063b33efd841fcc42d4b9f68cb78bb528bf75b") 23 | 24 | FetchContent_Declare( 25 | klib 26 | GIT_REPOSITORY https://github.com/attractivechaos/klib 27 | GIT_TAG ${KLIB_COMMIT} 28 | ) 29 | 30 | FetchContent_GetProperties(klib) 31 | if(NOT klib_POPULATED) 32 | FetchContent_Populate(klib) 33 | 34 | # add_subdirectory(${klib_SOURCE_DIR} ${klib_BINARY_DIR} EXCLUDE_FROM_ALL) 35 | add_library(klib INTERFACE) 36 | 37 | target_include_directories(klib INTERFACE ${klib_SOURCE_DIR}) 38 | endif() 39 | 40 | ## Add Big-BWT 41 | FetchContent_Declare( 42 | bigbwt 43 | GIT_REPOSITORY https://github.com/alshai/Big-BWT.git 44 | ) 45 | 46 | FetchContent_GetProperties(bigbwt) 47 | if(NOT bigbwt_POPULATED) 48 | FetchContent_Populate(bigbwt) 
49 | add_subdirectory(${bigbwt_SOURCE_DIR} ${bigbwt_BINARY_DIR}) 50 | 51 | endif() 52 | 53 | 54 | 55 | ## Add gsacak 56 | FetchContent_Declare( 57 | gsacak 58 | GIT_REPOSITORY https://github.com/felipelouza/gsa-is.git 59 | ) 60 | 61 | FetchContent_GetProperties(gsacak) 62 | if(NOT gsacak_POPULATED) 63 | FetchContent_Populate(gsacak) 64 | add_library(gsacak OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h) 65 | target_include_directories(gsacak PUBLIC "${gsacak_SOURCE_DIR}") 66 | 67 | add_library(gsacak64 OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h) 68 | target_include_directories(gsacak64 PUBLIC "${gsacak_SOURCE_DIR}") 69 | target_compile_options(gsacak64 PUBLIC -DM64) 70 | endif() 71 | 72 | 73 | ## Add sdsl 74 | FetchContent_Declare( 75 | sdsl 76 | GIT_REPOSITORY https://github.com/simongog/sdsl-lite 77 | ) 78 | 79 | FetchContent_GetProperties(sdsl) 80 | if(NOT sdsl_POPULATED) 81 | FetchContent_Populate(sdsl) 82 | 83 | set(GENERATE_DOC OFF CACHE BOOL "Do not generate doxygen for sdsl-lite") 84 | 85 | add_subdirectory(${sdsl_SOURCE_DIR} ${sdsl_BINARY_DIR} EXCLUDE_FROM_ALL) 86 | endif() 87 | 88 | ## Add divsuffsort 89 | FetchContent_Declare( 90 | divsufsort 91 | GIT_REPOSITORY https://github.com/simongog/libdivsufsort.git 92 | GIT_TAG 2.0.1 93 | ) 94 | 95 | FetchContent_GetProperties(divsufsort) 96 | if(NOT divsufsort_POPULATED) 97 | FetchContent_Populate(divsufsort) 98 | 99 | set(BUILD_SHARED_LIBS OFF CACHE BOOL "Do not build a shared library for libdivsufsort") 100 | set(BUILD_EXAMPLES OFF CACHE BOOL "Do not build libdivsufsort example") 101 | set(BUILD_DIVSUFSORT64 ON CACHE BOOL "Build libdivsufsort in 64-bits mode") 102 | 103 | add_subdirectory(${divsufsort_SOURCE_DIR} ${divsufsort_BINARY_DIR} EXCLUDE_FROM_ALL) 104 | 105 | target_include_directories(divsufsort PUBLIC "${divsufsort_BINARY_DIR}/include") 106 | target_include_directories(divsufsort64 PUBLIC "${divsufsort_BINARY_DIR}/include") 107 | endif() 108 | 109 | 110 | 
## Add r-index
FetchContent_Declare(
  r-index
  GIT_REPOSITORY https://github.com/maxrossi91/r-index.git
  )

FetchContent_GetProperties(r-index)
if(NOT r-index_POPULATED)
  FetchContent_Populate(r-index)

  add_subdirectory(${r-index_SOURCE_DIR} ${r-index_BINARY_DIR} EXCLUDE_FROM_ALL)

  # `ri` is a header-only convenience target: it exposes r-index's internal/
  # headers and forwards the klib and z link dependencies to its consumers.
  add_library(ri INTERFACE)
  target_link_libraries(ri INTERFACE klib z)
  target_include_directories(ri INTERFACE "${r-index_SOURCE_DIR}/internal")
endif()

## Add pfp-thresholds
FetchContent_Declare(
  pfp_thresholds
  GIT_REPOSITORY https://github.com/maxrossi91/pfp-thresholds.git
  GIT_TAG        develop
  )

FetchContent_GetProperties(pfp_thresholds)
if(NOT pfp_thresholds_POPULATED)
  FetchContent_Populate(pfp_thresholds)
  add_subdirectory(${pfp_thresholds_SOURCE_DIR} ${pfp_thresholds_BINARY_DIR})
endif()

## Add bigrepair
FetchContent_Declare(
  bigrepair
  GIT_REPOSITORY https://gitlab.com/maxrossi91/bigrepair.git
  # GIT_REPOSITORY https://gitlab.com/manzai/bigrepair.git
  )

FetchContent_GetProperties(bigrepair)
if(NOT bigrepair_POPULATED)
  # Cached before the subdirectory is added so bigrepair's CMakeLists sees it.
  set(DISABLE_PFP ON CACHE BOOL "Build bigrepair without the PFP")
  FetchContent_Populate(bigrepair)
  add_subdirectory(${bigrepair_SOURCE_DIR} ${bigrepair_BINARY_DIR})
endif()

## Add ShapedSlp
FetchContent_Declare(
  shaped_slp
  GIT_REPOSITORY https://github.com/koeppl/ShapedSlp.git
  GIT_TAG        master
  )

FetchContent_GetProperties(shaped_slp)
if(NOT shaped_slp_POPULATED)
  FetchContent_Populate(shaped_slp)
  add_subdirectory(${shaped_slp_SOURCE_DIR} ${shaped_slp_BINARY_DIR})

  # Convenience paths into the ShapedSlp checkout.
  set(FOLCA_SOURCE_DIR ${shaped_slp_SOURCE_DIR}/folca)
  set(SUX_SOURCE_DIR ${shaped_slp_SOURCE_DIR}/external/sux/sux)
endif()

## Add SSW
FetchContent_Declare(
  ssw
  GIT_REPOSITORY https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
  GIT_TAG        master
  )

FetchContent_GetProperties(ssw)
if(NOT ssw_POPULATED)
  FetchContent_Populate(ssw)

  add_library(ssw OBJECT
    "${ssw_SOURCE_DIR}/src/ssw_cpp.cpp"
    "${ssw_SOURCE_DIR}/src/ssw.c")
  target_include_directories(ssw PUBLIC "${ssw_SOURCE_DIR}/src")
endif()


## Add Ksw2
FetchContent_Declare(
  ksw2
  GIT_REPOSITORY https://github.com/lh3/ksw2
  GIT_TAG        master
  )

FetchContent_GetProperties(ksw2)
if(NOT ksw2_POPULATED)
  FetchContent_Populate(ksw2)

  add_library(ksw2 OBJECT
    "${ksw2_SOURCE_DIR}/kalloc.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg2.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extz.c"
    "${ksw2_SOURCE_DIR}/ksw2_extz2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extd.c"
    "${ksw2_SOURCE_DIR}/ksw2_extd2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extf2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_exts2_sse.c")
  # All ksw2 sources above live in the repository root (unlike ssw, which has
  # a src/ directory), so the root is what consumers need on their include
  # path. NOTE(review): confirm ksw2.h / kalloc.h sit next to the .c files.
  target_include_directories(ksw2 PUBLIC "${ksw2_SOURCE_DIR}")
endif()

--------------------------------------------------------------------------------
/utils.md:
--------------------------------------------------------------------------------
# Utils command

# Build the docker image

```console
docker build --platform linux/amd64 --no-cache -t maxrossi91/moni .
7 | ``` 8 | 9 | # Pseudo system test 10 | ```console 11 | docker run --platform linux/amd64 -v `pwd`/data:/data -it maxrossi91/moni bash 12 | 13 | mkdir -p out 14 | moni build -r data/SARS-CoV2/SARS-CoV2.1k.fa.gz -o out/sars-cov2 -f 15 | moni mems -i out/sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o out/reads -s 16 | ``` 17 | -------------------------------------------------------------------------------- /utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | FetchContent_GetProperties(r-index) 3 | 4 | add_executable(split_fa split_fa.cpp) 5 | target_link_libraries(split_fa klib z) 6 | -------------------------------------------------------------------------------- /utils/split_fa.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // split_fa.cpp 3 | // 4 | // Copyright 2020 Marco Oliva. All rights reserved. 5 | // 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | KSEQ_INIT(gzFile, gzread) 15 | 16 | void split_file(std::string& path, std::size_t n_seqs, std::size_t n_blocks) 17 | { 18 | std::size_t seqs_per_block = 0, residual = 0; 19 | if (n_seqs % n_blocks == 0) { seqs_per_block = n_seqs / n_blocks; residual = n_seqs / n_blocks; } 20 | else if (n_blocks == 2) {seqs_per_block = n_seqs / 2; residual = (n_seqs % 2) + (n_seqs / 2);} 21 | else { seqs_per_block = n_seqs / (n_blocks - 1); residual = n_seqs % (n_blocks - 1); } 22 | 23 | std::size_t last_index = path.find_last_of('.'); std::string remove_gz = path.substr(0, last_index); 24 | last_index = remove_gz.find_last_of('.', last_index); 25 | std::string base_name = path.substr(0, last_index); 26 | 27 | int l; 28 | gzFile fp; 29 | kseq_t *seq; 30 | fp = gzopen(path.c_str(), "r"); 31 | seq = kseq_init(fp); 32 | for (std::size_t i = 0; i < n_blocks - 1; i++) 33 | { 34 | std::cout << "\rSplitting sequences... 
" << std::to_string(i + 1) << "/" << n_blocks << " " 35 | << std::to_string((double(i + 1) / double(n_blocks)) * 100) << "%" << std::flush; 36 | 37 | std::string out_path = base_name + "_" + std::to_string(i + 1) + ".fa"; 38 | std::ofstream out_file(out_path); 39 | std::size_t it = 0; 40 | while (it < seqs_per_block) 41 | { 42 | l = kseq_read(seq); 43 | if (seq->seq.l > 0) 44 | { 45 | out_file.put('>'); out_file.write(seq->name.s, seq->name.l); out_file.put('\n'); 46 | out_file.write(seq->seq.s, seq->seq.l); out_file.put('\n'); 47 | } 48 | it++; 49 | } 50 | out_file.close(); 51 | } 52 | // remember to write residual to last file 53 | std::string out_path = base_name + "_" + std::to_string(n_blocks) + ".fasta"; 54 | std::ofstream out_file(out_path); 55 | std::size_t it = 0; 56 | std::cout << "\rSplitting sequences... " << std::to_string(n_blocks) << "/" << n_blocks << " " 57 | << std::to_string((double(n_blocks) / double(n_blocks)) * 100) << "%" << std::flush; 58 | while (it < residual) 59 | { 60 | l = kseq_read(seq); 61 | if (seq->seq.l > 0) 62 | { 63 | out_file.put('>'); out_file.write(seq->name.s, seq->name.l); out_file.put('\n'); 64 | out_file.write(seq->seq.s, seq->seq.l); out_file.put('\n'); 65 | } 66 | it++; 67 | } 68 | out_file.close(); 69 | 70 | 71 | // free 72 | kseq_destroy(seq); 73 | gzclose(fp); 74 | } 75 | 76 | // Count sequences 77 | std::size_t count_seqs(std::string& path) 78 | { 79 | std::size_t sequences_count = 0; 80 | int l; 81 | gzFile fp; 82 | kseq_t *seq; 83 | fp = gzopen(path.c_str(), "r"); 84 | seq = kseq_init(fp); 85 | while ((l = kseq_read(seq)) >= 0) 86 | { 87 | sequences_count++; 88 | } 89 | kseq_destroy(seq); 90 | gzclose(fp); 91 | 92 | return sequences_count; 93 | } 94 | 95 | int main(int argc, char *argv[]) 96 | { 97 | if (argc != 3) { 98 | fprintf(stderr, "Usage: %s \n", argv[0]); 99 | return 1; 100 | } 101 | 102 | std::string path = argv[1]; 103 | std::cout << "In path: " << path << std::endl; 104 | std::size_t n_blocks = 
std::stoi(argv[2]); 105 | std::cout << "Blocks: " << n_blocks << std::endl; 106 | 107 | std::cout << "Reading sequences..."; 108 | std::size_t n_seq = count_seqs(path); 109 | std::cout << " done. N: " << n_seq << std::endl; 110 | 111 | std::cout << "Splitting sequences..."; 112 | split_file(path, n_seq, n_blocks); 113 | std::cout << " done." << std::endl; 114 | 115 | return 0; 116 | } 117 | --------------------------------------------------------------------------------