├── .gitignore
├── CITATION.cff
├── CMakeLists.txt
├── CMakeModules
    ├── ConfigureCompilerClang.cmake
    └── ConfigureCompilerGcc.cmake
├── Dockerfile
├── LICENSE
├── README.md
├── data
    ├── SARS-CoV2
    │   ├── SARS-CoV2.1k.fa.gz
    │   └── reads.fastq.gz
    ├── reads.fastq
    └── yeast.fasta
├── include
    ├── common
    │   ├── common.hpp
    │   └── seqidx.hpp
    ├── extender
    │   ├── extend_reads_dispatcher.hpp
    │   ├── extender_klib.hpp
    │   └── extender_ksw2.hpp
    └── ms
    │   ├── ms_pointers.hpp
    │   ├── ms_rle_string.hpp
    │   └── thresholds_ds.hpp
├── pipeline
    └── moni.in
├── src
    ├── CMakeLists.txt
    ├── build_seqidx.cpp
    ├── compress_dictionary.cpp
    ├── extend_klib.cpp
    ├── extend_ksw2.cpp
    ├── matching_statistics.cpp
    ├── mems.cpp
    └── rlebwt_ms_build.cpp
├── thirdparty
    └── CMakeLists.txt
├── utils.md
└── utils
    ├── CMakeLists.txt
    └── split_fa.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # Folders
35 | build/*
36 | debug/*
37 | data/*
38 | !data/yeast.fasta


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | 
 2 | cff-version: 1.2.0
 3 | message: "If you use this software, please cite it as below."
 4 | authors:
 5 | - family-names: "Rossi"
 6 |   given-names: "Massimiliano"
 7 |   orcid: "https://orcid.org/0000-0002-3012-1394"
 8 | title: "MONI: A Pangenomic Index for Finding Maximal Exact Matches"
 9 | url: "https://github.com/maxrossi91/moni"
10 | preferred-citation:
11 |   type: journal-paper
12 |   authors:
13 |   - family-names: "Rossi"
14 |     given-names: "Massimiliano"
15 |     orcid: "https://orcid.org/0000-0002-3012-1394"
16 |   - family-names: "Oliva"
17 |     given-names: "Marco"
18 |     orcid: "https://orcid.org/0000-0003-0525-3114"
19 |   - family-names: "Langmead"
20 |     given-names: "Ben"
21 |     orcid: "https://orcid.org/0000-0003-2437-1976"
22 |   - family-names: "Gagie"
23 |     given-names: "Travis"
24 |     orcid: "https://orcid.org/0000-0003-3689-327X"
25 |   - family-names: "Boucher"
26 |     given-names: "Christina"
27 |     orcid: "https://orcid.org/0000-0001-9509-9725"
28 |   doi: 10.1089/cmb.2021.0290
29 |   journal: "Journal of Computational Biology"
30 |   start: 169  # First page number
31 |   end: 187 # Last page number
32 |   title: "MONI: A Pangenomic Index for Finding Maximal Exact Matches"
33 |   year: 2022
34 |   volume: 29
35 |   number: 2


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.15)
 2 | set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 3 | 
 4 | # Default to a Release build when no build type was specified
 5 | # ------------------------------------------------------------------------------
 6 | if(NOT CMAKE_BUILD_TYPE)
 7 |   message(STATUS "Setting build type to 'Release' as none was specified.")
 8 |   set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
 9 | endif()
10 | 
11 | # About this project
12 | # ------------------------------------------------------------------------------
13 | project(moni)
14 | message(STATUS "Install directory: ${CMAKE_INSTALL_PREFIX}") # printed after project(), which initializes the default install prefix
15 | set(VERSION_MAJOR "0")
16 | set(VERSION_MINOR "2")
17 | set(VERSION_PATCH "2")
18 | set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
19 | message("version: ${VERSION}")
20 | 
21 | set(PROJECT_VERSION ${VERSION})
22 | 
23 | # Set environment
24 | # ------------------------------------------------------------------------------
25 | 
26 | find_package(Git)
27 | if(GIT_FOUND)
28 |     message("git found: ${GIT_EXECUTABLE}")
29 | else()
30 |     message(WARNING "git not found. Cloning of submodules will not work.")
31 | endif()
32 | 
33 | 
34 | 
35 | # Configure thirdparty
36 | # ------------------------------------------------------------------------------
37 | set(CMAKE_INSTALL_INCLUDEDIR "include") # This is a hack because include(GNUInstallDirs) doesn't work
38 | 
39 | add_subdirectory(thirdparty)
40 | 
41 | 
42 | # Configure the compiler with the appropriate flags
43 | # ------------------------------------------------------------------------------
44 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
45 |   # Clang and AppleClang share the same flag module
46 |   include(ConfigureCompilerClang)
47 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
48 |   # GCC
49 |   include(ConfigureCompilerGcc)
50 | else()
51 |   message(FATAL_ERROR "Only the compiler gcc and clang are supported")
52 | endif()
53 | 
54 | 
55 | add_subdirectory(src)
56 | add_subdirectory(utils)
57 | 
58 | # Configure pipeline for build folder
59 | set(USE_INSTALL_PATH False)
60 | configure_file(${PROJECT_SOURCE_DIR}/pipeline/moni.in ${PROJECT_BINARY_DIR}/moni @ONLY)
61 | 
62 | # Configure pipeline for install folder
63 | set(USE_INSTALL_PATH True)
64 | configure_file(${PROJECT_SOURCE_DIR}/pipeline/moni.in ${PROJECT_BINARY_DIR}/moni.install @ONLY)
65 | 
66 | 
67 | install(TARGETS ms mems rlebwt_ms_build extend_ksw2 compress_dictionary build_seqidx TYPE RUNTIME)
68 | install(TARGETS SlpEncBuild pfp_thresholds pfp_thresholds64 TYPE RUNTIME)
69 | install(PROGRAMS ${PROJECT_BINARY_DIR}/moni.install RENAME moni TYPE BIN)
70 | # install(TARGETS ms rlebwt_ms_build extend_ksw2 DESTINATION bin)
71 | # install(PROGRAMS ${PROJECT_SOURCE_DIR}/pipeline/moni DESTINATION bin)
72 | 
73 | 
74 | # Configure cpack variables
75 | # ------------------------------------------------------------------------------
76 | 
77 | set(CPACK_PACKAGE_VERSION_MAJOR ${VERSION_MAJOR})
78 | set(CPACK_PACKAGE_VERSION_MINOR ${VERSION_MINOR})
79 | set(CPACK_PACKAGE_VERSION_PATCH ${VERSION_PATCH})
80 | set(CPACK_PACKAGE_VERSION "${VERSION}")
81 | 
82 | include(InstallRequiredSystemLibraries)
83 | set(CPACK_GENERATOR "STGZ;TGZ;DEB")
84 | set(CPACK_SOURCE_GENERATOR "TGZ")
85 | set(CPACK_PACKAGE_VENDOR "University of Florida")
86 | set(CPACK_PACKAGE_CONTACT "maxrossi91@gmail.com")
87 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "MONI - Pangenomic index for finding MEMs")
88 | set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
89 | set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
90 | set(CPACK_PACKAGE_NAME "${CMAKE_PROJECT_NAME}")
91 | set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-sources")
92 | 
93 | set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Massimiliano Rossi")
94 | set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) 
95 | set(CPACK_COMPONENTS_GROUPING ALL_COMPONENTS_IN_ONE) # Group all components into a single package
96 | # set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.3.1-6), libc6 (< 2.4)")
97 | set (CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
98 | set(CPACK_DEB_COMPONENT_INSTALL YES)
99 | include(CPack)


--------------------------------------------------------------------------------
/CMakeModules/ConfigureCompilerClang.cmake:
--------------------------------------------------------------------------------
 1 | # ##############################################################################
 2 | # Compiler configuration
 3 | # @author Massimiliano Rossi
 4 | # ##############################################################################
 5 | 
 6 | # Add the basic compiler options
 7 | add_compile_options("-std=c++11")
 8 | # add_compile_options("-Werror")
 9 | add_compile_options("-Wall")
10 | add_compile_options("-Wextra")
11 | add_compile_options("-Wcomment")
12 | add_compile_options("-Wformat=2")
13 | add_compile_options("-Wnonnull")
14 | add_compile_options("-Winit-self")
15 | add_compile_options("-Wmain")
16 | add_compile_options("-Wmissing-braces")
17 | add_compile_options("-Wmissing-include-dirs")
18 | add_compile_options("-Wparentheses")
19 | add_compile_options("-Wsequence-point")
20 | add_compile_options("-Wreturn-type")
21 | add_compile_options("-Wdate-time")
22 | add_compile_options("-Wswitch")
23 | add_compile_options("-Wswitch-default")
24 | add_compile_options("-Wswitch-enum")
25 | add_compile_options("-Wunused-function")
26 | add_compile_options("-Wunused-label")
27 | add_compile_options("-Wunused-local-typedefs")
28 | add_compile_options("-Wunused-parameter")
29 | add_compile_options("-Wunused-variable")
30 | add_compile_options("-Wunused-value")
31 | add_compile_options("-Wunused")
32 | add_compile_options("-Wuninitialized")
33 | add_compile_options("-Wunknown-pragmas")
34 | add_compile_options("-Wstrict-aliasing")
35 | add_compile_options("-Wstrict-overflow=5")
36 | add_compile_options("-Warray-bounds")
37 | add_compile_options("-Wundef")
38 | add_compile_options("-Wendif-labels")
39 | add_compile_options("-Wshadow")
40 | add_compile_options("-Wpointer-arith")
41 | add_compile_options("-Wtype-limits")
42 | add_compile_options("-Wcast-qual")
43 | add_compile_options("-Wwrite-strings")
44 | add_compile_options("-Wconversion")
45 | add_compile_options("-Wenum-compare")
46 | add_compile_options("-Wsign-compare")
47 | add_compile_options("-Waddress")
48 | add_compile_options("-Wattributes")
49 | add_compile_options("-Wbuiltin-macro-redefined")
50 | add_compile_options("-Wmissing-declarations")
51 | add_compile_options("-Wmissing-field-initializers")
52 | add_compile_options("-Wdeprecated")
53 | add_compile_options("-Wdeprecated-declarations")
54 | add_compile_options("-Woverflow")
55 | add_compile_options("-Wpacked")
56 | add_compile_options("-Winline")
57 | add_compile_options("-Wint-to-pointer-cast")
58 | add_compile_options("-Winvalid-pch")
59 | add_compile_options("-Wno-long-long")
60 | add_compile_options("-Wno-variadic-macros")
61 | add_compile_options("-Wvarargs")
62 | add_compile_options("-Wvla")
63 | add_compile_options("-Wvolatile-register-var")
64 | add_compile_options("-Wdisabled-optimization")
65 | add_compile_options("-Wstack-protector")
66 | add_compile_options("-Woverlength-strings")
67 | add_compile_options("-fvisibility=hidden")
68 | add_compile_options("-Wc++11-compat")
69 | add_compile_options("-Wconversion-null")
70 | add_compile_options("-Winherited-variadic-ctor")
71 | add_compile_options("-Winvalid-offsetof")
72 | add_compile_options("-pedantic")
73 | add_compile_options("-fno-gnu-keywords")
74 | add_compile_options("-Wctor-dtor-privacy")
75 | add_compile_options("-Wdelete-non-virtual-dtor")
76 | add_compile_options("-Wnarrowing")
77 | add_compile_options("-Wnon-virtual-dtor")
78 | add_compile_options("-Wreorder")
79 | add_compile_options("-Weffc++")
80 | add_compile_options("-Wold-style-cast")
81 | add_compile_options("-Wsign-promo")
82 | add_compile_options("-Wchar-subscripts")
83 | add_compile_options("-Wno-ignored-qualifiers")
84 | add_compile_options("-Wuninitialized")
85 | add_compile_options("-Wdiv-by-zero")
86 | add_compile_options("-Wfloat-equal")
87 | add_compile_options("-Wcast-align")
88 | add_compile_options("-Wempty-body")
89 | add_compile_options("-Wsizeof-pointer-memaccess")
90 | add_compile_options("-Wmultichar")
91 | add_compile_options("-fPIC")
92 | 
93 | 
94 | # Debug builds: append -ggdb3 to the default debug flags
95 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3")
96 | # Release builds: append optimization flags. NOTE(review): -ansi requests C++98 while -std=c++11 is added above, and -march=native ties the binaries to the build host's CPU -- confirm both are intended
97 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ansi -march=native -funroll-loops -O3 -DNDEBUG")
98 | 


--------------------------------------------------------------------------------
/CMakeModules/ConfigureCompilerGcc.cmake:
--------------------------------------------------------------------------------
  1 | # ##############################################################################
  2 | # Compiler configuration
  3 | # @author Massimiliano Rossi
  4 | # ##############################################################################
  5 | 
  6 | # Add the basic compiler options
  7 | add_compile_options("-std=c++11")
  8 | # add_compile_options("-Werror")
  9 | add_compile_options("-Wall")
 10 | add_compile_options("-Wextra")
 11 | add_compile_options("-Wcomment")
 12 | add_compile_options("-Wdouble-promotion")
 13 | add_compile_options("-Wformat=2")
 14 | add_compile_options("-Wnonnull")
 15 | add_compile_options("-Winit-self")
 16 | add_compile_options("-Wmain")
 17 | add_compile_options("-Wmissing-braces")
 18 | add_compile_options("-Wmissing-include-dirs")
 19 | add_compile_options("-Wparentheses")
 20 | add_compile_options("-Wsequence-point")
 21 | add_compile_options("-Wreturn-local-addr")
 22 | add_compile_options("-Wreturn-type")
 23 | add_compile_options("-Wswitch")
 24 | add_compile_options("-Wswitch-default")
 25 | add_compile_options("-Wswitch-enum")
 26 | add_compile_options("-Wunused-but-set-parameter")
 27 | add_compile_options("-Wunused-but-set-variable")
 28 | add_compile_options("-Wunused-function")
 29 | add_compile_options("-Wunused-label")
 30 | add_compile_options("-Wunused-local-typedefs")
 31 | add_compile_options("-Wunused-parameter")
 32 | add_compile_options("-Wunused-variable")
 33 | add_compile_options("-Wunused-value")
 34 | add_compile_options("-Wunused")
 35 | add_compile_options("-Wuninitialized")
 36 | add_compile_options("-Wunknown-pragmas")
 37 | add_compile_options("-Wstrict-aliasing")
 38 | add_compile_options("-Wstrict-overflow=5")
 39 | add_compile_options("-Warray-bounds")
 40 | add_compile_options("-Wundef")
 41 | add_compile_options("-Wendif-labels")
 42 | add_compile_options("-Wshadow")
 43 | add_compile_options("-Wfree-nonheap-object")
 44 | add_compile_options("-Wunsafe-loop-optimizations")
 45 | add_compile_options("-Wpointer-arith")
 46 | add_compile_options("-Wtype-limits")
 47 | add_compile_options("-Wcast-qual")
 48 | add_compile_options("-Wwrite-strings")
 49 | add_compile_options("-Wclobbered")
 50 | add_compile_options("-Wconversion")
 51 | add_compile_options("-Wenum-compare")
 52 | add_compile_options("-Wsign-compare")
 53 | add_compile_options("-Wsign-conversion")
 54 | add_compile_options("-Waddress")
 55 | add_compile_options("-Wlogical-op")
 56 | add_compile_options("-Wno-aggressive-loop-optimizations")
 57 | add_compile_options("-Wattributes")
 58 | add_compile_options("-Wbuiltin-macro-redefined")
 59 | add_compile_options("-Wmissing-declarations")
 60 | add_compile_options("-Wmissing-field-initializers")
 61 | add_compile_options("-Wdeprecated")
 62 | add_compile_options("-Wdeprecated-declarations")
 63 | add_compile_options("-Woverflow")
 64 | add_compile_options("-Wpacked")
 65 | add_compile_options("-Wno-packed-bitfield-compat")
 66 | add_compile_options("-Winline")
 67 | add_compile_options("-Wint-to-pointer-cast")
 68 | add_compile_options("-Winvalid-pch")
 69 | add_compile_options("-Wno-long-long")
 70 | add_compile_options("-Wno-variadic-macros")
 71 | add_compile_options("-Wvarargs")
 72 | add_compile_options("-Wvector-operation-performance")
 73 | add_compile_options("-Wvla")
 74 | add_compile_options("-Wvolatile-register-var")
 75 | add_compile_options("-Wdisabled-optimization")
 76 | add_compile_options("-Wstack-protector")
 77 | add_compile_options("-Woverlength-strings")
 78 | add_compile_options("-fvisibility=hidden")
 79 | add_compile_options("-Wc++11-compat")
 80 | add_compile_options("-Wconversion-null")
 81 | add_compile_options("-Wuseless-cast")
 82 | add_compile_options("-Winherited-variadic-ctor")
 83 | add_compile_options("-Winvalid-offsetof")
 84 | add_compile_options("-Wvirtual-move-assign")
 85 | add_compile_options("-pedantic")
 86 | add_compile_options("-fno-gnu-keywords")
 87 | add_compile_options("-foptional-diags")
 88 | add_compile_options("-Wctor-dtor-privacy")
 89 | add_compile_options("-Wdelete-non-virtual-dtor")
 90 | add_compile_options("-Wliteral-suffix")
 91 | add_compile_options("-Wnarrowing")
 92 | add_compile_options("-Wnon-virtual-dtor")
 93 | add_compile_options("-Wreorder")
 94 | add_compile_options("-Weffc++")
 95 | add_compile_options("-fno-ext-numeric-literals")
 96 | add_compile_options("-Wnon-template-friend")
 97 | add_compile_options("-Wold-style-cast")
 98 | add_compile_options("-Wpmf-conversions")
 99 | add_compile_options("-Wsign-promo")
100 | add_compile_options("-Wchar-subscripts")
101 | add_compile_options("-Wno-ignored-qualifiers")
102 | add_compile_options("-Wmaybe-uninitialized")
103 | add_compile_options("-Wdiv-by-zero")
104 | add_compile_options("-Wtrampolines")
105 | add_compile_options("-Wfloat-equal")
106 | add_compile_options("-Wcast-align")
107 | add_compile_options("-Wempty-body")
108 | add_compile_options("-Wsizeof-pointer-memaccess")
109 | add_compile_options("-Wmultichar")
110 | add_compile_options("-Wnormalized=nfc")
111 | add_compile_options("-Wnoexcept")
112 | add_compile_options("-Wstrict-null-sentinel")
113 | 
114 | # Debug builds: append -ggdb3 to the default debug flags
115 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb3")
116 | # Release builds: append optimization flags. NOTE(review): -ansi requests C++98 while -std=c++11 is added above, and -march=native ties the binaries to the build host's CPU -- confirm both are intended
117 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ansi -march=native -funroll-loops -O3 -DNDEBUG")
118 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:latest AS builder
 2 | 
 3 | WORKDIR /build
 4 | 
 5 | 
 6 | RUN apt-get update -qq && \
 7 |     apt-get install -y zlib1g-dev \
 8 |                     git \
 9 |                     cmake \
10 |                     build-essential \
11 |                     python3 \
12 |                     gcc-9 \
13 |                     g++-9 \
14 |                     && \
15 |     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9 && \
16 |     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
17 | 
18 | COPY . /workspace
19 | RUN cd /workspace && rm -rf build && mkdir build && cd build && cmake .. && make -j 8 && make install
20 | 
21 | # # Cleanup cmake and git
22 | # RUN apt remove -y cmake git && apt autoremove -y
23 | 
24 | FROM ubuntu:latest
25 | 
26 | LABEL org.opencontainers.image.authors="maxrossi91@gmail.com"
27 | RUN apt-get update -qq && \
28 |     apt-get install -y zlib1g-dev \
29 |     python3
30 | 
31 | COPY --from=builder /usr/local/bin /bin
32 | CMD ["/bin/moni"]
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Release](https://img.shields.io/github/release/maxrossi91/moni.svg)](https://github.com/maxrossi91/moni/releases)
  2 | [![Downloads](https://img.shields.io/github/downloads/maxrossi91/moni/total?logo=github)](https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_0.2.2_amd64.deb)
  3 | [![Docker Pulls](https://badgen.net/docker/pulls/maxrossi91/moni?icon=docker&label=pulls)](https://hub.docker.com/r/maxrossi91/moni/)
  4 | [![Docker Image Size](https://badgen.net/docker/size/maxrossi91/moni?icon=docker&label=image%20size)](https://hub.docker.com/r/maxrossi91/moni/)
  5 | [![Bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/moni/README.html)
  6 | 
  7 | # MONI
  8 | ```console
  9 |                            __  __  ____  _   _ _____
 10 |                           |  \/  |/ __ \| \ | |_   _|
 11 |                           | \  / | |  | |  \| | | |
 12 |                           | |\/| | |  | | . ` | | |
 13 |                           | |  | | |__| | |\  |_| |_
 14 |                           |_|  |_|\____/|_| \_|_____|
 15 |                                             ver 0.2.2
 16 | ```
 17 | A Pangenomics Index for Finding MEMs.
 18 | 
 19 | MONI index uses the prefix-free parsing of the text [2][3] to build the Burrows-Wheeler Transform (BWT) of the reference genomes, the suffix array (SA) samples at the beginning and at the end of each run of the BWT, and the threshold positions of [1]. 
 20 | 
 21 | ## How to get MONI
 22 | 
 23 | ### Docker
 24 | 
 25 | MONI is available on `docker`:
 26 | 
 27 | ```console
 28 | docker pull maxrossi91/moni:v0.2.2
 29 | docker run maxrossi91/moni:v0.2.2 moni -h
 30 | ```
 31 | if using `singularity`:
 32 | ```console
 33 | singularity pull moni_sif docker://maxrossi91/moni:v0.2.2
 34 | ./moni_sif moni --help
 35 | ```
 36 | 
 37 | ### Install Packages
 38 | 
 39 | We provide MONI on a `.deb` package:
 40 | ```console
 41 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2_amd64.deb
 42 | sudo dpkg -i moni_v0.2.2_amd64.deb
 43 | moni -h
 44 | ```
 45 | We provide MONI on a linux `.sh` installer:
 46 | ```console
 47 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2-Linux.sh
 48 | chmod +x moni_v0.2.2-Linux.sh
 49 | ./moni_v0.2.2-Linux.sh
 50 | moni -h
 51 | ```
 52 | We provide MONI on a pre-compiled `.tar.gz`:
 53 | ```console
 54 | wget https://github.com/maxrossi91/moni/releases/download/v0.2.2/moni_v0.2.2-Linux.tar.gz
 55 | tar -xzvf moni_v0.2.2-Linux.tar.gz
 56 | moni_v0.2.2-Linux/bin/moni -h
 57 | ```
 58 | 
 59 | ### Compile and install
 60 | #### Install prerequisite packages
 61 | 
 62 | ```console
 63 | apt-get update
 64 | apt-get install -y build-essential cmake git python3 zlib1g-dev
 65 | ```
 66 | 
 67 | #### Download
 68 | 
 69 | ```console
 70 | git clone https://github.com/maxrossi91/moni
 71 | ```
 72 | #### Compile
 73 | 
 74 | ```console
 75 | cd moni
 76 | mkdir build
 77 | cd build; cmake -DCMAKE_INSTALL_PREFIX=<path/to/install/prefix> ..
 78 | make
 79 | ```
 80 | 
 81 | Replace `<path/to/install/prefix>` with your preferred install path. If not specified, the install prefix defaults to `/usr/local`, so the binaries are installed in `/usr/local/bin`.
 82 | 
 83 | #### Install
 84 | 
 85 | ```console
 86 | make install
 87 | ```
 88 | 
 89 | ### Construction of the index:
 90 | ```
 91 | usage: moni build [-h] -r REFERENCE [-w WSIZE] [-p MOD] [-t THREADS] [-k] [-v]
 92 |                   [-f] [--moni-ms] [--spumoni]
 93 |   -h, --help            show this help message and exit
 94 |   -r REFERENCE, --reference REFERENCE
 95 |                         reference file name (default: None)
 96 |   -o OUTPUT, --output OUTPUT
 97 |                         output directory path (default: same as reference)
 98 |   -w WSIZE, --wsize WSIZE
 99 |                         sliding window size (default: 10)
100 |   -p MOD, --mod MOD     hash modulus (default: 100)
101 |   -t THREADS, --threads THREADS
102 |                         number of helper threads (default: 0)
103 |   -k                    keep temporary files (default: False)
104 |   -v                    verbose (default: False)
105 |   -f                    read fasta (default: False)
106 |   -g GRAMMAR, --grammar GRAMMAR
107 |                         select the grammar [plain, shaped] (default: plain)
108 | 
109 | ```
110 | 
111 | 
112 | ### Computing the matching statistics with MONI:
113 | ```
114 | usage: moni ms [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS]
115 |   -h, --help            show this help message and exit
116 |   -i INDEX, --index INDEX
117 |                         reference index base name (default: None)
118 |   -p PATTERN, --pattern PATTERN
119 |                         the input query (default: None)
120 |   -o OUTPUT, --output OUTPUT
121 |                         output directory path (default: .)
122 |   -t THREADS, --threads THREADS
123 |                         number of helper threads (default: 1)
124 |   -g GRAMMAR, --grammar GRAMMAR
125 |                         select the grammar [plain, shaped] (default: plain)
126 | ```
127 | 
128 | ### Computing the MEMs with MONI:
129 | ```
130 | usage: moni mems [-h] -i INDEX -p PATTERN [-o OUTPUT] [-e] [-s] [-t THREADS]
131 |   -h, --help            show this help message and exit
132 |   -i INDEX, --index INDEX
133 |                         reference index base name (default: None)
134 |   -p PATTERN, --pattern PATTERN
135 |                         the input query (default: None)
136 |   -o OUTPUT, --output OUTPUT
137 |                         output directory path (default: .)
138 |   -e, --extended-output
139 |                         output MEM occurrence in the reference (default: False)
140 |   -s, --sam-output
141 |                         output MEM in a SAM formatted file. (default: False)
142 |   -t THREADS, --threads THREADS
143 |                         number of helper threads (default: 1)
144 |   -g GRAMMAR, --grammar GRAMMAR
145 |                         select the grammar [plain, shaped] (default: plain)
146 | ```
147 | 
148 | ### Computing the MEM extension with MONI and ksw2:
149 | ```
150 | usage: moni extend [-h] -i INDEX -p PATTERN [-o OUTPUT] [-t THREADS] [-b BATCH] [-g GRAMMAR] [-L EXTL] [-A SMATCH] [-B SMISMATCH] [-O GAPO] [-E GAPE]
151 | 
152 | optional arguments:
153 |   -h, --help            show this help message and exit
154 |   -i INDEX, --index INDEX
155 |                         reference index folder (default: None)
156 |   -p PATTERN, --pattern PATTERN
157 |                         the input query (default: None)
158 |   -o OUTPUT, --output OUTPUT
159 |                         output directory path (default: .)
160 |   -t THREADS, --threads THREADS
161 |                         number of helper threads (default: 1)
162 |   -b BATCH, --batch BATCH
163 |                         number of reads per thread batch (default: 100)
164 |   -g GRAMMAR, --grammar GRAMMAR
165 |                         select the grammar [plain, shaped] (default: plain)
166 |   -L EXTL, --extl EXTL  length of reference substring for extension (default: 100)
167 |   -A SMATCH, --smatch SMATCH
168 |                         match score value (default: 2)
169 |   -B SMISMATCH, --smismatch SMISMATCH
170 |                         mismatch penalty value (default: 4)
171 |   -O GAPO, --gapo GAPO  coma separated gap open penalty values (default: 4,13)
172 |   -E GAPE, --gape GAPE  coma separated gap extension penalty values (default: 2,1)
173 | ```
174 | 
175 | # Example
176 | 
177 | ##### Build the index for `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder
178 | ```console
179 | moni build -r data/SARS-CoV2/SARS-CoV2.1k.fa.gz -o sars-cov2 -f
180 | ```
181 | It produces three files `sars-cov2.plain.slp`, `sars-cov2.thrbv.ms`, and `sars-cov2.idx` in the current folder which contain the grammar, the rlbwt and the thresholds, and the starting position and name of each fasta sequence in the reference file respectively.
182 | 
183 | ##### Compute the matching statistics of `reads.fastq.gz ` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder
184 | ```console
185 | moni ms -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads
186 | ```
187 | It produces two output files `reads.lengths` and `reads.pointers` in the current folder which store the lengths and the positions of the matching statistics of the reads against the reference in a fasta-like format.  
188 | 
189 | ##### Compute the MEMs of `reads.fastq.gz ` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder
190 | ```console
191 | moni mems -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads
192 | ```
193 | It produces one output file `reads.mems` in the current folder which stores the MEMs reported as pairs of positions and lengths in a fasta-like format.  
194 | 
195 | ##### Compute the MEM extension of `reads.fastq.gz ` against `SARS-CoV2.1k.fa.gz` in the `data/SARS-CoV2` folder
196 | ```console
197 | moni extend -i sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o reads
198 | ```
199 | It produces one output file `reads.sam` in the current folder which stores the information of the MEM extensions in SAM format.  
200 | # External resources
201 | 
202 | * [Big-BWT](https://github.com/alshai/Big-BWT.git)
203 |     * [gSACA-K](https://github.com/felipelouza/gsa-is.git)
204 |     * [malloc_count](https://github.com/bingmann/malloc_count)
205 | * [sdsl-lite](https://github.com/simongog/sdsl-lite)
206 |     * [Divsufsort](https://github.com/simongog/libdivsufsort.git)
207 | * [klib](https://github.com/attractivechaos/klib)
208 | * [ksw2](https://github.com/lh3/ksw2)
209 | * [r-index](https://github.com/maxrossi91/r-index.git)
210 | * [pfp-thresholds](https://github.com/maxrossi91/pfp-thresholds.git)
211 | * [bigrepair](https://gitlab.com/manzai/bigrepair.git)
212 | * [shaped_slp](https://github.com/koeppl/ShapedSlp.git)
213 | <!-- * [Google Benchmark](https://github.com/google/benchmark.git)
214 |     * [Google Test](https://github.com/google/googletest) -->
215 | 
216 | # Citation 
217 | 
218 | Please, if you use this tool in an academic setting cite the following papers:
219 | 
220 |     @article{RossiOLGB21,
221 |     author      = { Massimiliano Rossi and 
222 |                     Marco Oliva and
223 |                     Ben Langmead and
224 |                     Travis Gagie and
225 |                     Christina Boucher},
226 |     title       = {MONI: A Pangenomics Index for Finding Maximal Exact Matches},
227 |     booktitle   = {Research in Computational Molecular Biology - 25th Annual 
228 |                     International Conference, {RECOMB} 2021, Padova, Italy},
229 |     journal     = {Journal of Computational Biology},
230 |     volume      = {29},
231 |     number      = {2},
232 |     pages       = {169--187},
233 |     year        = {2022},
234 |     publisher   = {Mary Ann Liebert, Inc., publishers 140 Huguenot Street, 3rd Floor New~…}
235 |     }
236 | 
237 | 
238 | # Authors
239 | 
240 | ### Theoretical results:
241 | 
242 | * Christina Boucher
243 | * Travis Gagie
244 | * Ben Langmead
245 | * Massimiliano Rossi
246 | 
247 | ### Implementation:
248 | 
249 | * [Massimiliano Rossi](https://github.com/maxrossi91)
250 | 
251 | ### Experiments
252 | 
253 | * [Marco Oliva](https://github.com/marco-oliva)
254 | * [Massimiliano Rossi](https://github.com/maxrossi91)
255 | 
256 | # Why "MONI"?
257 | 
258 | **Moni** is the Finnish word for *multi*.
259 | 
260 | # References
261 | 
262 | [1] Hideo Bannai, Travis Gagie, and Tomohiro I, *"Refining the r-index"*, Theoretical Computer Science, 812 (2020), pp. 96–108
263 | 
264 | [2] Christina Boucher, Travis Gagie, Alan Kuhnle and Giovanni Manzini, *"Prefix-Free Parsing for Building Big BWTs"*, In Proc. of the 18th International Workshop on Algorithms in Bioinformatics (WABI 2018).
265 | 
266 | [3] Christina Boucher, Travis Gagie, Alan Kuhnle, Ben Langmead, Giovanni Manzini, and Taher Mun. *"Prefix-free parsing for building big BWTs."*, Algorithms for Molecular Biology 14, no. 1 (2019): 13.


--------------------------------------------------------------------------------
/data/SARS-CoV2/SARS-CoV2.1k.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxrossi91/moni/7f3f954f5bc6174457ea374acee1f2dcca61527b/data/SARS-CoV2/SARS-CoV2.1k.fa.gz


--------------------------------------------------------------------------------
/data/SARS-CoV2/reads.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maxrossi91/moni/7f3f954f5bc6174457ea374acee1f2dcca61527b/data/SARS-CoV2/reads.fastq.gz


--------------------------------------------------------------------------------
/include/common/common.hpp:
--------------------------------------------------------------------------------
  1 | /* pfp-ds - prefix free parsing data structures
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU General Public License as published by
  6 |     the Free Software Foundation, either version 3 of the License, or
  7 |     (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU General Public License
 15 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 16 | */
 17 | /*!
 18 |    \file common.hpp
 19 |    \brief common.hpp contains common features.
 20 |    \author Massimiliano Rossi
 21 |    \date 12/03/2020
 22 | */
 23 | 
 24 | #ifndef _COMMON_HH
 25 | #define _COMMON_HH
 26 | 
 27 | #include <iostream>
 28 | #include <cstdlib>
 29 | #include <cstdio>
 30 | #include <ctime>
 31 | #include <assert.h>
 32 | 
 33 | #include <sys/time.h>
 34 | 
 35 | #include <sys/mman.h> // for mmap
 36 | #include <unistd.h>
 37 | #include <sys/stat.h>
 38 |  #include <fcntl.h>
 39 | 
 40 | #include <sstream>      // std::stringstream
 41 | 
 42 | #include <vector>      // std::vector
 43 | 
 44 | #include <chrono>       // high_resolution_clock
 45 | 
 46 | #include <sdsl/io.hpp>  // serialize and load
 47 | #include <type_traits>  // enable_if_t and is_fundamental
 48 | 
 49 | //**************************** From  Big-BWT ***********************************
 50 | // special symbols used by the construction algorithm:
 51 | //   they cannot appear in the input file
 52 | //   the 0 symbol is used in the final BWT file as the EOF char
 53 | 
#define Dollar 2     // special char for the parsing algorithm, must be the highest special char
#define EndOfWord 1  // word delimiter for the plain dictionary file
#define EndOfDict 0  // end of dictionary delimiter
//******************************************************************************

#define THRBYTES 5 // The number of bytes for the thresholds
#define SSABYTES 5 // NOTE(review): original comment was copied from THRBYTES; presumably the number of bytes for the suffix array samples -- confirm
 61 | 
 62 | std::string NowTime();
 63 | void _internal_messageInfo(const std::string message);
 64 | void _internal_messageWarning( const std::string file, const unsigned int line, const std::string message);
 65 | void _internal_messageError( const std::string file, const unsigned int line,const std::string message);
 66 | 
 67 | 
 68 | std::string NowTime()
 69 | {
 70 |     struct timeval tv;
 71 |     gettimeofday(&tv, 0);
 72 |     char buffer[100];
 73 |     tm r;
 74 |     strftime(buffer, sizeof(buffer), "%X", localtime_r(&tv.tv_sec, &r));
 75 |     char result[100];
 76 |     snprintf(result, 100, "%s"/*.%06ld"*/, buffer/*, (long)tv.tv_usec*/);
 77 |     return result;
 78 | }
 79 | 
 80 | 
 81 | template<typename T>
 82 | inline void _internal_message_helper(std::stringstream &ss, T const &first) { ss << first; }
 83 | template<typename T, typename... Args>
 84 | inline void _internal_message_helper(std::stringstream &ss, T const &first, const Args&... args) { ss << first << " "; _internal_message_helper(ss,args...); }
 85 | template<typename T, typename... Args>
 86 | inline std::string _internal_message(T const &first, const Args&... args) { std::stringstream ss; _internal_message_helper(ss,first,args...); return ss.str(); }
 87 | 
 88 | 
 89 | void _internal_messageInfo(const std::string message)
 90 | {
 91 |   std::cout << "[INFO] " << NowTime() << " - " << "Message: " << message << std::endl;
 92 | }
 93 | 
 94 | void _internal_messageWarning( const std::string file, const unsigned int line,
 95 |   const std::string message)
 96 | {
 97 |   std::cout << "[WARNING] " << NowTime() << " - "
 98 |   << "File: " << file << '\n'
 99 |   << "Line: " << line << '\n'
100 |   << "Message: " << message << std::endl;
101 | }
102 | 
103 | void _internal_messageError( const std::string file, const unsigned int line,
104 |   const std::string message)
105 | {
106 |   std::cerr << "[ERROR] " << NowTime() << " - "
107 |   << "File: " << file << '\n'
108 |   << "Line: " << line << '\n'
109 |   << "Message: " << message << std::endl;
110 |   assert( false );
111 |   exit( 1 );
112 | }
113 | 
114 | 
115 | 
// Logging front-ends. Each macro joins its arguments with single spaces via
// _internal_message and forwards the result to the matching printer.
// They use the standard variadic form (... / __VA_ARGS__) instead of the
// GNU-only named form (args...), so they also compile in pedantic mode.

// info(...): always printed.
#define info( ... ) \
    _internal_messageInfo( _internal_message(__VA_ARGS__) )

// verbose(...): printed only when VERBOSE is defined; expands to nothing otherwise.
#ifdef VERBOSE
  #define verbose( ... ) \
      _internal_messageInfo( _internal_message(__VA_ARGS__) )
#else
  #define verbose( ... )
#endif

// warning(...): printed together with the file and line of the call site.
#define warning( ... ) \
    _internal_messageWarning( __FILE__, __LINE__, _internal_message(__VA_ARGS__) )

// error(...): printed together with the file and line of the call site; the
// printer terminates the program.
#define error( ... ) \
    _internal_messageError( __FILE__, __LINE__, _internal_message(__VA_ARGS__) )
131 | 
132 | 
// Converts the arguments to a single line in csv format (", " separated).

// Base case: streams the last element without a trailing separator.
template <typename Head>
inline void csv_helper(std::stringstream &ss, Head const &first)
{
  ss << first;
}

// Recursive case: streams first, the separator, and then the remaining elements.
template <typename Head, typename... Tail>
inline void csv_helper(std::stringstream &ss, Head const &first, const Tail &... args)
{
  ss << first << ", ";
  csv_helper(ss, args...);
}

// Entry point: returns the csv representation of all the arguments.
template <typename Head, typename... Tail>
inline std::string csv(Head const &first, const Tail &... args)
{
  std::stringstream row;
  csv_helper(row, first, args...);
  return row.str();
}
140 | 
141 | //*********************** File I/O *********************************************
142 | template<typename T>
143 | void map_file(const char *filename, T*& ptr, size_t& length){
144 |     struct stat filestat;
145 |     int fd;
146 | 
147 |     if ((fd = open(filename, O_RDONLY)) < 0)
148 |         error("open() file " + std::string(filename) + " failed" );
149 | 
150 |     if (fstat(fd, &filestat) < 0)
151 |         error("stat() file " + std::string(filename) + " failed" );
152 | 
153 |     if(filestat.st_size % sizeof(T) != 0)
154 |         error("invilid file " + std::string(filename));
155 | 
156 |     length = filestat.st_size / sizeof(T);
157 | 
158 |     if ((ptr = mmap(NULL, filestat.st_size, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED)
159 |         error("mmap() file " + std::string(filename) + " failed");
160 | }
161 | 
162 | template<typename T>
163 | void read_file(const char *filename, T*& ptr, size_t& length){
164 |     struct stat filestat;
165 |     FILE* fd;
166 | 
167 |     if ((fd = fopen(filename, "r")) == nullptr)
168 |         error("open() file " + std::string(filename) + " failed" );
169 | 
170 |     int fn = fileno(fd);
171 |     if (fstat(fn, &filestat) < 0)
172 |         error("stat() file " + std::string(filename) + " failed" );
173 | 
174 |     if(filestat.st_size % sizeof(T) != 0)
175 |         error("invilid file " + std::string(filename));
176 | 
177 |     length = filestat.st_size / sizeof(T);
178 |     ptr = new T[length];
179 | 
180 |     if ((fread(ptr, sizeof(T), length, fd)) != length)
181 |         error("fread() file " + std::string(filename) + " failed");
182 | 
183 |     fclose(fd);
184 | }
185 | 
186 | template<typename T>
187 | void read_file(const char *filename, std::vector<T>& ptr){
188 |     struct stat filestat;
189 |     FILE* fd;
190 | 
191 |     if ((fd = fopen(filename, "r")) == nullptr)
192 |         error("open() file " + std::string(filename) + " failed" );
193 | 
194 |     int fn = fileno(fd);
195 |     if (fstat(fn, &filestat) < 0)
196 |         error("stat() file " + std::string(filename) + " failed" );
197 | 
198 |     if(filestat.st_size % sizeof(T) != 0)
199 |         error("invilid file " + std::string(filename));
200 | 
201 |     size_t length = filestat.st_size / sizeof(T);
202 |     ptr.resize(length);
203 | 
204 |     if ((fread(&ptr[0], sizeof(T), length, fd)) != length)
205 |         error("fread() file " + std::string(filename) + " failed");
206 | 
207 |     fclose(fd);
208 | }
209 | 
210 | void read_file(const char *filename, std::string &ptr)
211 | {
212 |   struct stat filestat;
213 |   FILE *fd;
214 | 
215 |   if ((fd = fopen(filename, "r")) == nullptr)
216 |     error("open() file " + std::string(filename) + " failed");
217 | 
218 |   int fn = fileno(fd);
219 |   if (fstat(fn, &filestat) < 0)
220 |     error("stat() file " + std::string(filename) + " failed");
221 | 
222 |   if (filestat.st_size % sizeof(char) != 0)
223 |     error("invilid file " + std::string(filename));
224 | 
225 |   size_t length = filestat.st_size / sizeof(char);
226 |   ptr.resize(length);
227 | 
228 |   if ((fread(&ptr[0], sizeof(char), length, fd)) != length)
229 |     error("fread() file " + std::string(filename) + " failed");
230 | 
231 |   fclose(fd);
232 | }
233 | 
234 | template<typename T>
235 | void read_fasta_file(const char *filename, std::vector<T>& v){
236 |     FILE* fd;
237 | 
238 |     if ((fd = fopen(filename, "r")) == nullptr)
239 |         error("open() file " + std::string(filename) + " failed" );
240 | 
241 |     v.clear();
242 | 
243 |     char c;
244 |     while (fread( &c, sizeof(char), 1,fd) == 1) {
245 |       if(c == '>'){
246 |         while(fread( &c, sizeof(char), 1,fd) == 1 && c != '\n');
247 |       }else{
248 |         v.push_back(c);
249 |         while(fread( &c, sizeof(char), 1,fd) == 1 && c!= '\n') v.push_back(c);
250 |       }
251 |   	}
252 |   	fclose(fd);
253 | }
254 | 
255 | template <typename T>
256 | void write_file(const char *filename, std::vector<T> &ptr)
257 | {
258 |   struct stat filestat;
259 |   FILE *fd;
260 | 
261 |   if ((fd = fopen(filename, "w")) == nullptr)
262 |     error("open() file " + std::string(filename) + " failed");
263 | 
264 |   size_t length = ptr.size(); 
265 |   if ((fwrite(&ptr[0], sizeof(T), length, fd)) != length)
266 |     error("fwrite() file " + std::string(filename) + " failed");
267 | 
268 |   fclose(fd);
269 | }
270 | 
271 | //*********************** Time resources ***************************************
272 | 
/*!
 * \brief Runs an operation and evaluates to the time it took, in seconds.
 *
 * op the operation that we want to measure
 *
 * The macro body is a brace-enclosed block wrapped in parentheses (a compiler
 * extension, not standard C++), whose value is its last expression: the
 * difference between two std::chrono::high_resolution_clock readings taken
 * around op, as a double number of seconds.
 * The same value is also passed to verbose(), which prints it only when
 * VERBOSE is defined.
 * The locals t_insert_start and t_insert_end are visible to op, so op should
 * not use identifiers with those names.
 */
#define _elapsed_time(op)                                                                                               \
  ({                                                                                                                    \
    std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();          \
    op;                                                                                                                 \
    std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();            \
    verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count()); \
    std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count();                                \
  })
284 | 
285 | 
286 | 
287 | //********** begin my serialize edit from sdsl ********************
288 | // Those are wrapper around most of the serialization functions of sdsl
289 | 
290 | 
291 | template <class T, typename size_type>
292 | uint64_t
293 | my_serialize_array(const T* p, const size_type size, std::ostream &out, typename std::enable_if<std::is_fundamental<T>::value>::type * = 0)
294 | {
295 |   size_t written_bytes = 0;
296 |   if (size > 0)
297 |   {
298 | 
299 |     size_type idx = 0;
300 |     while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (size))
301 |     {
302 |       out.write((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T));
303 |       written_bytes += sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T);
304 |       p += sdsl::conf::SDSL_BLOCK_SIZE;
305 |       idx += sdsl::conf::SDSL_BLOCK_SIZE;
306 |     }
307 |     out.write((char *)p, ((size) - idx) * sizeof(T));
308 |     written_bytes += ((size) - idx) * sizeof(T);
309 | 
310 |   }
311 |   return written_bytes;
312 | }
313 | 
314 | //! Serialize each element of an std::vector
315 | /*!
316 |  * \param vec The vector which should be serialized.
317 |  * \param out Output stream to which should be written.
318 |  * \param v   Structure tree node. Note: If all elements have the same
319 |  *            structure, then it is tried to combine all elements (i.e.
320 |  *            make one node w with size set to the cumulative sum of all
321 |  *           sizes of the children)
322 |  */
323 | // specialization for fundamental types
324 | template <class T>
325 | uint64_t
326 | my_serialize_vector(const std::vector<T> &vec, std::ostream &out, sdsl::structure_tree_node *v, std::string name, typename std::enable_if<std::is_fundamental<T>::value>::type * = 0)
327 | {
328 |   if (vec.size() > 0)
329 |   {
330 |     sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, "std::vector<" + sdsl::util::class_name(vec[0]) + ">");
331 |     size_t written_bytes = 0;
332 | 
333 |     const T *p = &vec[0];
334 |     typename std::vector<T>::size_type idx = 0;
335 |     while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (vec.size()))
336 |     {
337 |       out.write((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T));
338 |       written_bytes += sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T);
339 |       p += sdsl::conf::SDSL_BLOCK_SIZE;
340 |       idx += sdsl::conf::SDSL_BLOCK_SIZE;
341 |     }
342 |     out.write((char *)p, ((vec.size()) - idx) * sizeof(T));
343 |     written_bytes += ((vec.size()) - idx) * sizeof(T);
344 | 
345 |     sdsl::structure_tree::add_size(child, written_bytes);
346 |     return written_bytes;
347 |   }
348 |   else
349 |   {
350 |     return 0;
351 |   }
352 | }
353 | 
354 | template <typename X>
355 | uint64_t
356 | my_serialize(const std::vector<X> &x,
357 |              std::ostream &out, sdsl::structure_tree_node *v = nullptr,
358 |              std::string name = "", typename std::enable_if<std::is_fundamental<X>::value>::type * = 0)
359 | {
360 |   return sdsl::serialize(x.size(), out, v, name) + my_serialize_vector(x, out, v, name);
361 | }
362 | 
363 | 
364 | /**
365 |  * @brief Load an array of size elements into p. p should be preallocated.
366 |  * 
367 |  * \tparam T 
368 |  * \tparam size_type 
369 |  * @param p 
370 |  * @param size 
371 |  * @param in 
372 |  */
373 | template <class T, typename size_type>
374 | void my_load_array(T *p, const size_type size, std::istream &in, typename std::enable_if<std::is_fundamental<T>::value>::type * = 0)
375 | {
376 |   size_type idx = 0;
377 |   while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (size))
378 |   {
379 |     in.read((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T));
380 |     p += sdsl::conf::SDSL_BLOCK_SIZE;
381 |     idx += sdsl::conf::SDSL_BLOCK_SIZE;
382 |   }
383 |   in.read((char *)p, ((size) - idx) * sizeof(T));
384 | }
385 | 
386 | //! Load all elements of a vector from a input stream
387 | /*! \param vec  Vector whose elements should be loaded.
388 |  *  \param in   Input stream.
389 |  *  \par Note
390 |  *   The vector has to be resized prior the loading
391 |  *   of its elements.
392 |  */
393 | template <class T>
394 | void my_load_vector(std::vector<T> &vec, std::istream &in, typename std::enable_if<std::is_fundamental<T>::value>::type * = 0)
395 | {
396 |   T *p = &vec[0];
397 |   typename std::vector<T>::size_type idx = 0;
398 |   while (idx + sdsl::conf::SDSL_BLOCK_SIZE < (vec.size()))
399 |   {
400 |     in.read((char *)p, sdsl::conf::SDSL_BLOCK_SIZE * sizeof(T));
401 |     p += sdsl::conf::SDSL_BLOCK_SIZE;
402 |     idx += sdsl::conf::SDSL_BLOCK_SIZE;
403 |   }
404 |   in.read((char *)p, ((vec.size()) - idx) * sizeof(T));
405 | }
406 | 
407 | template <typename X>
408 | void my_load(std::vector<X> &x, std::istream &in, typename std::enable_if<std::is_fundamental<X>::value>::type * = 0)
409 | {
410 |   typename std::vector<X>::size_type size;
411 |   sdsl::load(size, in);
412 |   x.resize(size);
413 |   my_load_vector(x, in);
414 | }
415 | 
416 | 
417 | 
418 | 
419 | #endif /* end of include guard: _COMMON_HH */
420 | 


--------------------------------------------------------------------------------
/include/common/seqidx.hpp:
--------------------------------------------------------------------------------
  1 | /* seqidx - an index of the sequence names in a fasta file
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file seqidx.hpp
 16 |    \brief seqidx.hpp an index of the sequence names in a fasta file.
 17 |    \author Massimiliano Rossi
 18 |    \date 07/08/2021
 19 | */
 20 | #ifndef _SEQIDX_HH
 21 | #define _SEQIDX_HH
 22 | 
 23 | #include <common.hpp>
 24 | 
 25 | #include <sdsl/sd_vector.hpp>
 26 | 
 27 | #include <kseq.h>
 28 | #include <zlib.h>
 29 | 
 30 | // KSEQ_DECLARE(gzFile);
 31 | 
 32 | class seqidx
 33 | {
 34 | public:
 35 |     seqidx()
 36 |     {
 37 |         u = 0;
 38 |     }
 39 |     /**
 40 |      * @brief Construct a new seqidx object
 41 |      * 
 42 |      * @param filename filepath of the fasta/q file
 43 |      */
 44 |     seqidx(std::string filename)
 45 |     {
 46 |         gzFile fp(gzopen(filename.c_str(), "r"));
 47 |         if (fp == nullptr)
 48 |             error("gzopen() file " + std::string(filename) + " failed");
 49 | 
 50 |         kseq_t *seq = kseq_init(fp);
 51 | 
 52 |         std::vector<size_t> onset(1,0);
 53 |         u = 0;
 54 | 
 55 |         while (kseq_read(seq) >= 0)
 56 |         {
 57 |             u += seq->seq.l;
 58 |             names.push_back(std::string(seq->name.s));
 59 |             onset.push_back(u);
 60 |         }
 61 | 
 62 |         kseq_destroy(seq);
 63 |         gzclose(fp);
 64 | 
 65 |         sdsl::sd_vector_builder builder(u, onset.size());
 66 |         for (auto idx : onset)
 67 |             builder.set(idx);
 68 | 
 69 |         starts = sdsl::sd_vector<>(builder);
 70 |         rank1 = sdsl::sd_vector<>::rank_1_type(&starts);
 71 |         select1 = sdsl::sd_vector<>::select_1_type(&starts);
 72 |     }
 73 | 
 74 |     /**
 75 |      * @brief Construct a new seqidx object from onset, list of sequence names and total length
 76 |      * 
 77 |      * @param onset the popsitions
 78 |      * @param names_ 
 79 |      * @param l 
 80 |      */
 81 |     seqidx(const std::vector<size_t>& onset, const std::vector<std::string>& names_, const size_t l)
 82 |     {
 83 |         assert(onset.size() == names_.size());
 84 |         assert(onset[0] == 0);
 85 |         assert(onset.back() < l);
 86 |         assert(std::is_sorted(onset.begin(), onset.end()));
 87 | 
 88 |         u = l;
 89 |         names = std::vector<std::string>(names_);
 90 | 
 91 | 
 92 |         sdsl::sd_vector_builder builder(u, onset.size());
 93 |         for (auto idx : onset)
 94 |             builder.set(idx);
 95 |         
 96 |         builder.set(u);
 97 | 
 98 |         starts = sdsl::sd_vector<>(builder);
 99 |         rank1 = sdsl::sd_vector<>::rank_1_type(&starts);
100 |         select1 = sdsl::sd_vector<>::select_1_type(&starts);
101 |     }
102 | 
103 |     
104 | 
105 |     /**
106 |      * @brief Return the length of the i-th sequence
107 |      * 
108 |      * @param i 
109 |      * @return size_t 
110 |      */
111 |     inline size_t length(const size_t i)
112 |     {
113 |         assert(i < names.size());
114 |         return select1(i+2) - select1(i+1);
115 |         // return select1(i+1) - select1(i);
116 |     }
117 | 
118 |     /**
119 |      * @brief return the name of the sequence pos belongs.
120 |      * 
121 |      * @param pos the position in the set of sequences.
122 |      * @return std::string the name of the sequence pos belongs.
123 |      */
124 |     inline std::string operator[](const size_t pos)
125 |     {
126 |         return names[rank1(pos + 1)-1]; // pos+1 becausethe rank counts the 1s before
127 |     }
128 | 
129 |     /**
130 |      * @brief return the name of the sequence pos belongs, and its offset.
131 |      * 
132 |      * @param pos the position in the set of sequences.
133 |      * @return std::pair<std::string,size_t> the name of the sequence pos belongs and its offset.
134 |      */
135 |     inline std::pair<std::string,size_t> index(const size_t pos)
136 |     {
137 |         size_t rank = rank1(pos + 1);
138 |         size_t start = select1(rank);
139 |         return std::make_pair(names[rank-1],pos - start); // pos+1 becausethe rank counts the 1s before
140 |     }
141 | 
142 |     /**
143 |      * @brief Check if the substring [pos.pos+len-1] does not span two sequences.
144 |      * 
145 |      * @param pos the position of the substring.
146 |      * @param len the length of the substring.
147 |      * @return true if the substring does not span two sequences.
148 |      * @return false if the substring spans two sequences.
149 |      */
150 |     inline bool valid(size_t pos, size_t len)
151 |     {
152 |         return (pos + len <= select1(rank1(pos + 1)+1)); // pos+1 becausethe rank counts the 1s before
153 |     }
154 | 
155 |     /**
156 |      * @brief return the SAM header description of the reference file
157 |      * 
158 |      * @return std::string 
159 |      */
160 |     std::string to_sam()
161 |     {
162 |         std::string res = "";
163 |         for (size_t i = 0; i < names.size(); ++i)
164 |             res += "@SQ\tSN:" + names[i] + "\tLN:" + std::to_string(length(i)) + "\n";
165 |         return res;    
166 |     }
167 | 
168 |     size_t serialize(std::ostream &out)
169 |     {
170 | 
171 |         size_t w_bytes = 0;
172 | 
173 |         out.write((char *)&u, sizeof(u));
174 | 
175 |         w_bytes += sizeof(u);
176 | 
177 |         if (u == 0)
178 |             return w_bytes;
179 | 
180 |         w_bytes += starts.serialize(out);
181 |         w_bytes += sdsl::serialize(names.size(), out);
182 |         for(size_t i = 0; i < names.size(); ++i)
183 |         {
184 |             w_bytes += sdsl::serialize(names[i].size(), out);
185 |             w_bytes = my_serialize_array<char, std::string::size_type>(names[i].data(), names[i].size(), out);
186 |         }
187 |         return w_bytes;
188 |     }
189 | 
190 |     void load(std::istream &in)
191 |     {
192 | 
193 |         in.read((char *)&u, sizeof(u));
194 | 
195 |         if (u == 0)
196 |             return;
197 | 
198 |         starts.load(in);
199 |         rank1 = sdsl::sd_vector<>::rank_1_type(&starts);
200 |         select1 = sdsl::sd_vector<>::select_1_type(&starts);
201 | 
202 |         std::vector<std::string>::size_type names_size;
203 |         sdsl::load(names_size, in);
204 |         names.resize(names_size);
205 |         for (size_t i = 0; i < names.size(); ++i)
206 |         {
207 |             std::string::size_type string_size;
208 |             sdsl::load(string_size, in);
209 |             names[i].resize(string_size);
210 |             my_load_array<char, std::string::size_type>(&names[i][0], names[i].size(), in);
211 |         }
212 |     }
213 | 
214 |     std::string get_file_extension() const
215 |     {
216 |         return  ".idx";
217 |     }
218 | 
219 | protected:
220 |     size_t u;
221 |     
222 |     sdsl::sd_vector<> starts;
223 |     sdsl::sd_vector<>::rank_1_type rank1;
224 |     sdsl::sd_vector<>::select_1_type select1;
225 |     
226 |     std::vector<std::string> names;
227 | 
228 | };
229 | 
230 | #endif /* end of include guard: _SEQIDX_HH */
231 | 


--------------------------------------------------------------------------------
/include/extender/extend_reads_dispatcher.hpp:
--------------------------------------------------------------------------------
  1 | /* extender_reads_dispatcher - Dispatches the reads in single and multithread.
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file extend_reads_dispatcher.hpp
 16 |    \brief extend_reads_dispatcher.hpp Dispatches the reads in single and multithread.
 17 |    \author Massimiliano Rossi
 18 |    \date 29/04/2021
 19 | */
 20 | 
 21 | #ifndef _READS_DISPATCHER_HH
 22 | #define _READS_DISPATCHER_HH
 23 | 
 24 | extern "C"{
 25 | #include <xerrors.h>
 26 | }
 27 | 
 28 | #include <common.hpp>
 29 | #include <kseq.h>
 30 | #include <zlib.h>
 31 | 
 32 | ////////////////////////////////////////////////////////////////////////////////
 33 | /// kseq extra
 34 | ////////////////////////////////////////////////////////////////////////////////
 35 | 
 36 | static inline size_t ks_tell(kseq_t *seq)
 37 | {
 38 |     return gztell(seq->f->f) - seq->f->end + seq->f->begin;
 39 | }
 40 | 
 41 | void copy_kstring_t(kstring_t &l, kstring_t &r)
 42 | {
 43 |     l.l = r.l;
 44 |     l.m = r.m;
 45 |     l.s = (char *)malloc(l.m);
 46 |     for (size_t i = 0; i < r.m; ++i)
 47 |         l.s[i] = r.s[i];
 48 | }
 49 | void copy_kseq_t(kseq_t *l, kseq_t *r)
 50 | {
 51 |     copy_kstring_t(l->name, r->name);
 52 |     copy_kstring_t(l->comment, r->comment);
 53 |     copy_kstring_t(l->seq, r->seq);
 54 |     copy_kstring_t(l->qual, r->qual);
 55 |     l->last_char = r->last_char;
 56 | }
 57 | ////////////////////////////////////////////////////////////////////////////////
 58 | 
 59 | ////////////////////////////////////////////////////////////////////////////////
 60 | /// xerror extra (conditions)
 61 | ////////////////////////////////////////////////////////////////////////////////
 62 | 
// Argument passed to sleep() by the xpthread_cond_* wrappers below before they
// call exit(1) after a failure. Only defined here if not already defined.
#ifndef Thread_error_wait
    #define Thread_error_wait 5
#endif
 66 | 
 67 | // cond
 68 | int xpthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr, int linea, const char *file)
 69 | {
 70 |     int e = pthread_cond_init(cond, attr);
 71 |     if (e != 0)
 72 |     {
 73 |         xperror(e, "Error in pthread_cond_init");
 74 |         fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file);
 75 |         sleep(Thread_error_wait); // do not kill immediately other threads
 76 |         exit(1);
 77 |     }
 78 |     return e;
 79 | }
 80 | 
 81 | int xpthread_cond_destroy(pthread_cond_t *cond, int linea, const char *file)
 82 | {
 83 |     int e = pthread_cond_destroy(cond);
 84 |     if (e != 0)
 85 |     {
 86 |         xperror(e, "Error in pthread_cond_destroy");
 87 |         fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file);
 88 |         sleep(Thread_error_wait); // do not kill immediately other threads
 89 |         exit(1);
 90 |     }
 91 |     return e;
 92 | }
 93 | 
 94 | int xpthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex, int linea, const char *file)
 95 | {
 96 |     int e = pthread_cond_wait(cond, mutex);
 97 |     if (e != 0)
 98 |     {
 99 |         xperror(e, "Error in pthread_cond_lock");
100 |         fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file);
101 |         sleep(Thread_error_wait); // do not kill immediately other threads
102 |         exit(1);
103 |     }
104 |     return e;
105 | }
106 | 
107 | int xpthread_cond_signal(pthread_cond_t *cond, int linea, const char *file)
108 | {
109 |     int e = pthread_cond_signal(cond);
110 |     if (e != 0)
111 |     {
112 |         xperror(e, "Error in pthread_cond_unlock");
113 |         fprintf(stderr, "== %d == Line: %d, File: %s\n", getpid(), linea, file);
114 |         sleep(Thread_error_wait); // do not kill immediately other threads
115 |         exit(1);
116 |     }
117 |     return e;
118 | }
119 | ////////////////////////////////////////////////////////////////////////////////
120 | 
121 | ////////////////////////////////////////////////////////////////////////////////
122 | /// Parallel computation
123 | ////////////////////////////////////////////////////////////////////////////////
124 | 
125 | // This should be done using buffering.
126 | size_t next_start_fastq(gzFile fp)
127 | {
128 |     int c;
129 |     // Special case when we arr at the beginning of the file.
130 |     if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@')
131 |         return 0;
132 | 
133 |     // Strart from the previous character
134 |     gzseek(fp, -1, SEEK_CUR);
135 | 
136 |     std::vector<std::pair<int, size_t>> window;
137 |     // Find the first new line
138 |     for (size_t i = 0; i < 4; ++i)
139 |     {
140 |         while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n'))
141 |         {
142 |         }
143 |         if (c == EOF)
144 |             return gztell(fp);
145 |         if ((c = gzgetc(fp)) == EOF)
146 |             return gztell(fp);
147 |         window.push_back(std::make_pair(c, gztell(fp) - 1));
148 |     }
149 | 
150 |     for (size_t i = 0; i < 2; ++i)
151 |     {
152 |         if (window[i].first == '@' && window[i + 2].first == '+')
153 |             return window[i].second;
154 |         if (window[i].first == '+' && window[i + 2].first == '@')
155 |             return window[i + 2].second;
156 |     }
157 | 
158 |     return gztell(fp);
159 | }
160 | 
161 | // test if the file is gzipped
162 | static inline bool is_gzipped(std::string filename)
163 | {
164 |     FILE *fp = fopen(filename.c_str(), "rb");
165 |     if (fp == NULL)
166 |         error("Opening file " + filename);
167 |     int byte1 = 0, byte2 = 0;
168 |     fread(&byte1, sizeof(char), 1, fp);
169 |     fread(&byte2, sizeof(char), 1, fp);
170 |     fclose(fp);
171 |     return (byte1 == 0x1f && byte2 == 0x8b);
172 | }
173 | 
174 | // Return the length of the file
175 | // Assumes that the file is not compressed
176 | static inline size_t get_file_size(std::string filename)
177 | {
178 |     if (is_gzipped(filename))
179 |     {
180 |         std::cerr << "The input is gzipped!" << std::endl;
181 |         return -1;
182 |     }
183 |     FILE *fp = fopen(filename.c_str(), "r");
184 |     fseek(fp, 0L, SEEK_END);
185 |     size_t size = ftell(fp);
186 |     fclose(fp);
187 |     return size;
188 | }
189 | 
190 | std::vector<size_t> split_fastq(std::string filename, size_t n_threads)
191 | {
192 |     //Precondition: the file is not gzipped
193 |     // scan file for start positions and execute threads
194 |     size_t size = get_file_size(filename);
195 | 
196 |     gzFile fp = gzopen(filename.c_str(), "r");
197 |     if (fp == Z_NULL)
198 |     {
199 |         throw new std::runtime_error("Cannot open input file " + filename);
200 |     }
201 | 
202 |     std::vector<size_t> starts(n_threads + 1);
203 |     for (int i = 0; i < n_threads; ++i)
204 |     {
205 |         size_t start = (size_t)((size * i) / n_threads);
206 |         gzseek(fp, start, SEEK_SET);
207 |         starts[i] = next_start_fastq(fp);
208 |     }
209 |     starts[n_threads] = size;
210 |     gzclose(fp);
211 |     return starts;
212 | }
213 | 
// Watson-Crick complement of an upper-case nucleotide.
// 'A' <-> 'T' and 'C' <-> 'G'; every other character is returned unchanged.
inline char complement(const char n)
{
    if (n == 'A')
        return 'T';
    if (n == 'T')
        return 'A';
    if (n == 'G')
        return 'C';
    if (n == 'C')
        return 'G';
    return n;
}
230 | 
231 | ////////////////////////////////////////////////////////////////////////////////
232 | 
233 | ////////////////////////////////////////////////////////////////////////////////
234 | /// Merge SAMs
235 | ////////////////////////////////////////////////////////////////////////////////
236 | 
237 | 
238 | // Merges te file in filename in the file pointed by fp
239 | void append_file(const std::string filename, FILE *fp){
240 |     const size_t buff_size = 16384;
241 | 
242 |     uint8_t buff[buff_size];
243 |     size_t size = 0;
244 | 
245 |     struct stat filestat;
246 |     FILE *fd;
247 | 
248 |     if ((fd = fopen(filename.c_str(), "r")) == nullptr)
249 |         error("open() file " + std::string(filename) + " failed");
250 | 
251 |     // int fn = fileno(fd);
252 |     // if (fstat(fn, &filestat) < 0)
253 |     //     error("stat() file " + std::string(filename) + " failed");
254 | 
255 |     // size_t length = filestat.st_size;
256 |     size_t length = 0;
257 | 
258 |     while((length = fread(buff, sizeof(uint8_t), buff_size, fd)) == buff_size)
259 |         if ((fwrite(buff, sizeof(uint8_t), buff_size, fp)) != buff_size)
260 |             error("fwrite() file " + std::string(filename) + " failed");
261 |     
262 |     assert(length < buff_size);
263 |     if(length > 0)
264 |         if ((fwrite(buff, sizeof(uint8_t), length, fp)) != length)
265 |             error("fwrite() file " + std::string(filename) + " failed");
266 | 
267 | 
268 |     fclose(fd);
269 | }
270 | 
271 | ////////////////////////////////////////////////////////////////////////////////
272 | 
273 | ////////////////////////////////////////////////////////////////////////////////
274 | /// Multithreads workers
275 | ////////////////////////////////////////////////////////////////////////////////
276 | 
// Mutex held while reading or updating n_active_threads.
pthread_mutex_t mutex_reads_dispatcher;
// Condition variable signalled by a worker after it decrements
// n_active_threads; mt_extend() waits on it before starting another worker.
pthread_cond_t cond_reads_dispatcher;
// Critical variables
// Number of workers currently running: incremented by mt_extend() when it
// creates a thread, decremented by mt_extend_worker() when it finishes.
size_t n_active_threads = 0;
// std::vector<bool> active_threads;
282 | 
// Argument/result record handed to one mt_extend_worker() thread.
// mt_extend() fills the parameters before creating the thread and reads the
// return values after joining it.
template <typename extender_t>
struct mt_param_t
{
    // Parameters
    extender_t *extender;         // extender whose extend() is called for every read
    std::string pattern_filename; // reads file, opened by the worker with gzopen()
    std::string sam_filename;     // per-worker SAM output file, opened with "w"
    size_t start;                 // offset the worker seeks to before reading
    size_t end;                   // the worker stops once ks_tell() reaches this offset
    size_t wk_id;                 // block id, only used in the progress message
    // Return values
    size_t n_reads;               // number of reads processed by the worker
    size_t n_extended_reads;      // reads for which extend() returned true on either strand
};
297 | 
298 | template <typename extender_t>
299 | void *mt_extend_worker(void *param)
300 | {
301 |     mt_param_t<extender_t> *p = (mt_param_t<extender_t> *)param;
302 |     size_t n_reads = 0;
303 |     size_t n_extended_reads = 0;
304 | 
305 |     FILE *sam_fd;
306 |     gzFile fp;
307 | 
308 |     if ((sam_fd = fopen(p->sam_filename.c_str(), "w")) == nullptr)
309 |         error("open() file " + p->sam_filename + " failed");
310 | 
311 |     if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL)
312 |         error("open() file " + p->pattern_filename + " failed");
313 | 
314 |     gzseek(fp, p->start, SEEK_SET);
315 | 
316 |     kseq_t rev;
317 |     int l;
318 | 
319 |     kseq_t *seq = kseq_init(fp);
320 |     while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0))
321 |     {
322 | 
323 |         bool fwd_extend = p->extender->extend(seq, sam_fd, 0);
324 | 
325 |         //copy seq
326 |         copy_kseq_t(&rev, seq);
327 | 
328 |         for (size_t i = 0; i < seq->seq.l; ++i)
329 |             rev.seq.s[i] = complement(seq->seq.s[seq->seq.l - i - 1]);
330 | 
331 |         if (rev.seq.m > rev.seq.l)
332 |             rev.seq.s[rev.seq.l] = 0;
333 | 
334 |         bool rev_extend = p->extender->extend(&rev, sam_fd, 1);
335 | 
336 |         if (fwd_extend or rev_extend)
337 |             n_extended_reads++;
338 |         n_reads++;
339 | 
340 |         free(rev.name.s);
341 |         free(rev.comment.s);
342 |         free(rev.seq.s);
343 |         free(rev.qual.s);
344 |     }
345 | 
346 |     verbose("Number of extended reads block ", p->wk_id, " : ", n_extended_reads, "/", n_reads);
347 |     p->n_reads = n_reads;
348 |     p->n_extended_reads = n_extended_reads;
349 |     kseq_destroy(seq);
350 |     gzclose(fp);
351 |     fclose(sam_fd);
352 | 
353 |     // Update the number of active threads
354 |     xpthread_mutex_lock(&mutex_reads_dispatcher, __LINE__, __FILE__);
355 |     {
356 |         --n_active_threads;
357 |         xpthread_cond_signal(&cond_reads_dispatcher, __LINE__, __FILE__);
358 |     }
359 |     xpthread_mutex_unlock(&mutex_reads_dispatcher, __LINE__, __FILE__);
360 | 
361 |     return NULL;
362 | }
363 | 
364 | template <typename extender_t>
365 | size_t mt_extend(extender_t *extender, std::string pattern_filename, std::string sam_filename, size_t n_threads, size_t k)
366 | {
367 |     xpthread_mutex_init(&mutex_reads_dispatcher, NULL, __LINE__, __FILE__);
368 |     xpthread_cond_init(&cond_reads_dispatcher, NULL, __LINE__, __FILE__);
369 | 
370 |     // active_threads = std::vector<bool>(n_threads, false);
371 |     pthread_t t[k * n_threads] = {0};
372 |     mt_param_t<extender_t> params[k * n_threads];
373 |     std::vector<size_t> starts = split_fastq(pattern_filename, k * n_threads);
374 |     for (size_t i = 0; i < k * n_threads; ++i)
375 |     {
376 |         // Get the number of active threads
377 |         xpthread_mutex_lock(&mutex_reads_dispatcher, __LINE__, __FILE__);
378 |         {
379 |             while(n_active_threads >= n_threads)
380 |                 xpthread_cond_wait(&cond_reads_dispatcher, &mutex_reads_dispatcher, __LINE__, __FILE__);
381 |             assert(n_active_threads < n_threads);
382 |             // Create a new thread
383 |             params[i].extender = extender;
384 |             params[i].pattern_filename = pattern_filename;
385 |             params[i].sam_filename = sam_filename + "_" + std::to_string(i) + ".sam";
386 |             params[i].start = starts[i];
387 |             params[i].end = starts[i + 1];
388 |             params[i].wk_id = i;
389 |             xpthread_create(&t[i], NULL, &mt_extend_worker<extender_t>, &params[i], __LINE__, __FILE__);
390 |             // Update the number of active threads
391 |             ++n_active_threads;
392 |         }
393 |         xpthread_mutex_unlock(&mutex_reads_dispatcher, __LINE__, __FILE__);
394 |     }
395 | 
396 |     size_t tot_reads = 0;
397 |     size_t tot_extended_reads = 0;
398 | 
399 |     for (size_t i = 0; i < k * n_threads; ++i)
400 |     {
401 |         xpthread_join(t[i], NULL, __LINE__, __FILE__);
402 |     }
403 | 
404 |     // sleep(5);
405 |     verbose("Merging temporary SAM files");
406 | 
407 |     FILE *fd;
408 | 
409 |     if ((fd = fopen(std::string(sam_filename + ".sam").c_str(), "w")) == nullptr)
410 |         error("open() file " + std::string(sam_filename + ".sam") + " failed");
411 | 
412 |     fprintf(fd, "%s", extender->to_sam().c_str());
413 | 
414 |     for (size_t i = 0; i < k * n_threads; ++i)
415 |     {
416 |         tot_reads += params[i].n_reads;
417 |         tot_extended_reads += params[i].n_extended_reads;
418 | 
419 |         append_file(params[i].sam_filename, fd);
420 |         if (std::remove(params[i].sam_filename.c_str()) != 0)
421 |             error("remove() file " + params[i].sam_filename + " failed");
422 |     }
423 |     
424 |     xpthread_mutex_destroy(&mutex_reads_dispatcher, __LINE__, __FILE__);
425 |     xpthread_cond_destroy(&cond_reads_dispatcher, __LINE__, __FILE__);
426 | 
427 |     verbose("Number of extended reads: ", tot_extended_reads, "/", tot_reads);
428 |     return tot_extended_reads;
429 | }
430 | 
431 | ////////////////////////////////////////////////////////////////////////////////
432 | /// Single Thread
433 | ////////////////////////////////////////////////////////////////////////////////
434 | template <typename extender_t>
435 | size_t st_extend(extender_t *extender, std::string pattern_filename, std::string sam_filename)
436 | {
437 |     size_t n_reads = 0;
438 |     size_t n_extended_reads = 0;
439 |     kseq_t rev;
440 |     int l;
441 |     FILE *sam_fd;
442 | 
443 |     sam_filename += ".sam";
444 | 
445 |     if ((sam_fd = fopen(sam_filename.c_str(), "w")) == nullptr)
446 |         error("open() file " + sam_filename + " failed");
447 | 
448 |     fprintf(sam_fd, "%s", extender->to_sam().c_str());
449 | 
450 |     gzFile fp = gzopen(pattern_filename.c_str(), "r");
451 |     kseq_t *seq = kseq_init(fp);
452 |     while ((l = kseq_read(seq)) >= 0)
453 |     {
454 | 
455 |         bool fwd_extend = extender->extend(seq, sam_fd, 0);
456 | 
457 |         //copy seq
458 |         copy_kseq_t(&rev, seq);
459 | 
460 |         for (size_t i = 0; i < seq->seq.l; ++i)
461 |             rev.seq.s[i] = complement(seq->seq.s[seq->seq.l - i - 1]);
462 | 
463 |         if (rev.seq.m > rev.seq.l)
464 |             rev.seq.s[rev.seq.l] = 0;
465 | 
466 |         bool rev_extend = extender->extend(&rev, sam_fd, 1);
467 | 
468 |         if (fwd_extend or rev_extend)
469 |             n_extended_reads++;
470 |         n_reads++;
471 | 
472 |         free(rev.name.s);
473 |         free(rev.comment.s);
474 |         free(rev.seq.s);
475 |         free(rev.qual.s);
476 |     }
477 | 
478 |     verbose("Number of extended reads: ", n_extended_reads, "/", n_reads);
479 |     kseq_destroy(seq);
480 |     gzclose(fp);
481 |     fclose(sam_fd);
482 | 
483 |     // sleep(5);
484 | 
485 |     return n_extended_reads;
486 | }
487 | 
488 | #endif /* end of include guard: _READS_DISPATCHER_HH */
489 | 


--------------------------------------------------------------------------------
/include/extender/extender_klib.hpp:
--------------------------------------------------------------------------------
  1 | /* extender_klib - Extend the MEMs of the reads to the reference using the klib library for SW
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file extender_klib.cpp
 16 |    \brief extender_klib.cpp Extend the MEMs of the reads to the reference using the klib library for SW
 17 |    \author Massimiliano Rossi
 18 |    \date 13/07/2020
 19 | */
 20 | 
 21 | #ifndef _EXTENDER_KLIB_HH
 22 | #define _EXTENDER_KLIB_HH
 23 | 
 24 | #include <common.hpp>
 25 | 
 26 | #include <sdsl/io.hpp>
 27 | 
 28 | #include <ms_pointers.hpp>
 29 | 
 30 | #include <malloc_count.h>
 31 | 
 32 | #include <SelfShapedSlp.hpp>
 33 | #include <DirectAccessibleGammaCode.hpp>
 34 | #include <SelectType.hpp>
 35 | #include <PlainSlp.hpp>
 36 | #include <FixedBitLenCode.hpp>
 37 | 
 38 | #include <ksw.h>
 39 | #include <ssw.h>
 40 | 
 41 | #include <libgen.h>
 42 | #include <seqidx.hpp>
 43 | 
 44 | ////////////////////////////////////////////////////////////////////////////////
 45 | /// SLP definitions
 46 | ////////////////////////////////////////////////////////////////////////////////
 47 | 
// Shorthands for the building blocks of the two SLP instantiations below.
using SelSd = SelectSdvec<>;
using DagcSd = DirectAccessibleGammaCode<SelSd>;
using Fblc = FixedBitLenCode<>;

// The two SLP types for which get_slp_file_extension() is specialized.
using shaped_slp_t = SelfShapedSlp<uint32_t, DagcSd, DagcSd, SelSd>;
using plain_slp_t = PlainSlp<uint32_t, Fblc, Fblc>;
 54 | 
 55 | template <typename slp_t>
 56 | std::string get_slp_file_extension()
 57 | {
 58 |     return std::string(".slp");
 59 | }
 60 | 
 61 | template <>
 62 | std::string get_slp_file_extension<shaped_slp_t>()
 63 | {
 64 |     return std::string(".slp");
 65 | }
 66 | 
 67 | template <>
 68 | std::string get_slp_file_extension<plain_slp_t>()
 69 | {
 70 |     return std::string(".plain.slp");
 71 | }
 72 | ////////////////////////////////////////////////////////////////////////////////
 73 | 
 74 | template <typename slp_t>
 75 | class extender
 76 | {
 77 | public:
 78 |     extender(std::string filename,
 79 |             size_t min_len_ = 50,
 80 |             bool forward_only_ = true) : min_len(min_len_),
 81 |                                          forward_only(forward_only_)
 82 |     {
 83 |         verbose("Loading the matching statistics index");
 84 |         std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
 85 | 
 86 |         std::string filename_ms = filename + ms.get_file_extension();
 87 | 
 88 |         ifstream fs_ms(filename_ms);
 89 |         ms.load(fs_ms);
 90 |         fs_ms.close();
 91 | 
 92 |         std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
 93 | 
 94 |         verbose("Matching statistics index construction complete");
 95 |         verbose("Memory peak: ", malloc_count_peak());
 96 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
 97 | 
 98 |         verbose("Loading random access");
 99 |         t_insert_start = std::chrono::high_resolution_clock::now();
100 | 
101 |         std::string filename_slp = filename + get_slp_file_extension<slp_t>();
102 | 
103 |         ifstream fs(filename_slp);
104 |         ra.load(fs);
105 |         fs.close();
106 | 
107 |         n = ra.getLen();
108 | 
109 |         t_insert_end = std::chrono::high_resolution_clock::now();
110 | 
111 |         verbose("Matching statistics index loading complete");
112 |         verbose("Memory peak: ", malloc_count_peak());
113 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
114 | 
115 |         std::string filename_idx = filename + idx.get_file_extension();
116 |         verbose("Loading fasta index file: " + filename_idx);
117 |         t_insert_start = std::chrono::high_resolution_clock::now();
118 | 
119 | 
120 |         ifstream fs_idx(filename_idx);
121 |         idx.load(fs_idx);
122 |         fs_idx.close();
123 | 
124 |         t_insert_end = std::chrono::high_resolution_clock::now();
125 | 
126 |         verbose("Fasta index loading complete");
127 |         verbose("Memory peak: ", malloc_count_peak());
128 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
129 | 
130 | 
131 |         verbose("Initialize the local aligner");
132 |         t_insert_start = std::chrono::high_resolution_clock::now();
133 | 
134 |         if (minsc > 0xffff)
135 |             minsc = 0xffff;
136 |         xtra |= KSW_XSUBO | minsc;
137 |         // initialize scoring matrix
138 |         for (i = k = 0; i < 4; ++i)
139 |         {
140 |             for (j = 0; j < 4; ++j)
141 |                 mat[k++] = i == j ? sa : -sb;
142 |             mat[k++] = 0; // ambiguous base
143 |         }
144 |         for (j = 0; j < 5; ++j)
145 |             mat[k++] = 0;
146 | 
147 |         t_insert_end = std::chrono::high_resolution_clock::now();
148 | 
149 |         verbose("Local aligner initialization complete");
150 |         verbose("Memory peak: ", malloc_count_peak());
151 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
152 | 
153 |         verbose("Minimum MEM length: ", min_len);
154 |     }
155 | 
156 |     bool extend(kseq_t *read, FILE *out, uint8_t strand)
157 |     {
158 |         size_t mem_pos = 0;
159 |         size_t mem_len = 0;
160 |         size_t mem_idx = 0;
161 | 
162 |         bool extended = false;
163 | 
164 |         auto pointers = ms.query(read->seq.s, read->seq.l);
165 |         std::vector<size_t> lengths(pointers.size());
166 |         size_t l = 0;
167 |         for (size_t i = 0; i < pointers.size(); ++i)
168 |         {
169 |             size_t pos = pointers[i];
170 |             while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l))
171 |                 ++l;
172 | 
173 |             lengths[i] = l;
174 |             l = (l == 0 ? 0 : (l - 1));
175 | 
176 |             // Update MEM
177 |             if (lengths[i] > mem_len)
178 |             {
179 |                 mem_len = lengths[i];
180 |                 mem_pos = pointers[i];
181 |                 mem_idx = i;
182 |             }
183 |         }
184 | 
185 |         // Align the read
186 |         if (mem_len >= min_len)
187 |         {
188 |             char *str = (char *)malloc(400);
189 | 
190 |             int32_t maskLen = read->seq.l / 2;
191 |             maskLen = maskLen < 15 ? 15 : maskLen;
192 | 
193 |             // Extract the context from the reference
194 |             size_t left_occ = (mem_pos > 100 ? mem_pos - 100 : 0);
195 |             size_t len = mem_len + 100 + (mem_pos > 100 ? 100 : 100 - mem_pos);
196 |             ra.expandSubstr(left_occ, len, str);
197 | 
198 |             size_t min_score = 20 + 8 * log(read->seq.l);
199 | 
200 |             uint8_t *seq = (uint8_t *)malloc(read->seq.l);
201 |             // Convert A,C,G,T,N into 0,1,2,3,4
202 |             for (i = 0; i < (int)read->seq.l; ++i)
203 |                 seq[i] = seq_nt4_table[(int)read->seq.s[i]];
204 |             // for (i = 0; i < (int)read->seq.l; ++i)
205 |             //   read->seq.s[i] = seq_nt4_table[(int)read->seq.s[i]];
206 | 
207 |             for (i = 0; i < (int)len; ++i)
208 |                 str[i] = seq_nt4_table[(int)str[i]];
209 | 
210 |             int score;
211 | 
212 |             kswq_t *q = 0;
213 |             kswr_t r;
214 | 
215 |             r = ksw_align(read->seq.l, (uint8_t *)seq, len, (uint8_t *)str, 5, mat, gapo, gape, xtra, &q);
216 |             // score = ksw_global(read->seq.l, (uint8_t *)read->seq.s, len, (uint8_t *)str, 5, mat, gapo, gape, w, &n_cigar, &cigar);
217 | 
218 |             int n_cigar;
219 |             uint32_t *cigar;
220 | 
221 |             size_t new_seq_len = r.qe - r.qb;
222 |             size_t new_ref_len = r.te - r.tb;
223 |             uint8_t *new_seq = (uint8_t *)(seq + r.qb);
224 |             // uint8_t *new_seq = (uint8_t *)(read->seq.s + r.qb);
225 |             uint8_t *new_ref = (uint8_t *)(str + r.tb);
226 | 
227 |             score = ksw_global(new_seq_len, (uint8_t *)new_seq, new_ref_len, new_ref, 5, mat, gapo, gape, w, &n_cigar, &cigar);
228 | 
229 |             std::string cig;
230 | 
231 |             // for(size_t i = 0; i < n_cigar; ++i)
232 |             // {
233 |             //   // for (i = 0; i < ez->n_cigar; ++i)
234 |             //   //   printf("%d%c", ez->cigar[i] >> 4, "MID"[ez->cigar[i] & 0xf]);
235 |             //   cig += std::to_string(cigar[i] >> 4) + "MID"[cigar[i] & 0xf];
236 |             // }
237 | 
238 |             size_t mismatch = mark_mismatch(r.tb, r.qb, r.qe, (int8_t *)str, (int8_t *)seq, read->seq.l, &cigar, &n_cigar);
239 |             for (c = 0; c < (n_cigar); ++c)
240 |             {
241 |                 char letter = cigar_int_to_op(cigar[c]);
242 |                 uint32_t length = cigar_int_to_len(cigar[c]);
243 |                 // fprintf(out, "%lu%c", (unsigned long)length, letter);
244 |                 cig += std::to_string((unsigned long)length) + letter;
245 |             }
246 | 
247 |             // if(r.score > 0)
248 |             //   printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", "human", r.tb, r.te + 1, read->name.s, r.qb, r.qe + 1, r.score, r.score2, r.te2);
249 |             //   // std::cout << "\rCurrent score... "<< r.score << std::flush;
250 | 
251 |             // // Declares a default Aligner
252 |             // StripedSmithWaterman::Aligner aligner;
253 |             // // Declares a default filter
254 |             // StripedSmithWaterman::Filter filter;
255 |             // // StripedSmithWaterman::Filter filter(true, true, min_score, 32767);
256 |             // // Declares an alignment that stores the result
257 |             // StripedSmithWaterman::Alignment alignment;
258 |             // // Aligns the query to the ref
259 |             // aligner.Align(read->seq.s, str, len, filter, &alignment, maskLen);
260 | 
261 |             // // Update alignment method
262 |             r.tb += left_occ;
263 |             r.te += left_occ;
264 |             r.te2 += left_occ;
265 | 
266 |             if (r.score >= min_score)
267 |             {
268 |                 ssw_write_sam(r, idx[r.tb].c_str(), read, strand, out, cig, mismatch);
269 |                 extended = true;
270 |             }
271 | 
272 |             // extended_reads++;
273 |             free(cigar);
274 |             free(q);
275 |             delete str;
276 |             delete seq;
277 |         }
278 |         return extended;
279 |     }
280 | 
281 |     size_t get_extended_reads()
282 |     {
283 |         return extended_reads;
284 |     }
285 | 
286 |     // Adapted from SSW
287 |     static void ssw_write_sam(kswr_t &a,
288 |                               const char *ref_seq_name,
289 |                               const kseq_t *read,
290 |                               int8_t strand,
291 |                               FILE *out,
292 |                               std::string cigar,
293 |                               size_t mismatches) // 0: forward aligned ; 1: reverse complement aligned
294 |     {
295 |         // Sam format output
296 |         fprintf(out, "%s\t", read->name.s);
297 |         if (a.score == 0)
298 |             fprintf(out, "4\t*\t0\t255\t*\t*\t0\t0\t*\t*\n");
299 |         else
300 |         {
301 |             int32_t c, p;
302 |             uint32_t mapq = -4.343 * log(1 - (double)abs(a.score - a.score2) / (double)a.score);
303 |             mapq = (uint32_t)(mapq + 4.99);
304 |             mapq = mapq < 254 ? mapq : 254;
305 |             if (strand)
306 |                 fprintf(out, "16\t");
307 |             else
308 |                 fprintf(out, "0\t");
309 |             // TODO: Find the correct reference name.
310 |             fprintf(out, "%s\t%d\t%d\t", ref_seq_name, a.tb + 1, mapq);
311 |             // size_t mismatch = mark_mismatch(a.tb, a.qb, a.qe, (int8_t*)ref, (int8_t*)read_, read->seq.l, cigar, cigarLen);
312 |             // for (c = 0; c < (*cigarLen); ++c)
313 |             // {
314 |             //   char letter = cigar_int_to_op((*cigar)[c]);
315 |             //   uint32_t length = cigar_int_to_len((*cigar)[c]);
316 |             //   fprintf(out, "%lu%c", (unsigned long)length, letter);
317 |             // }
318 |             // fprintf(out, "\t*\t");
319 |             // fprintf(out, "%s", a.cigar_string.c_str());
320 |             fprintf(out, "%s", cigar.c_str());
321 |             fprintf(out, "\t*\t0\t0\t");
322 |             fprintf(out, "%s", read->seq.s);
323 |             fprintf(out, "\t");
324 |             if (read->qual.s && strand)
325 |             {
326 |                 for (p = read->qual.l - 1; p >= 0; --p)
327 |                     fprintf(out, "%c", read->qual.s[p]);
328 |             }
329 |             else if (read->qual.s)
330 |                 fprintf(out, "%s", read->qual.s);
331 |             else
332 |                 fprintf(out, "*");
333 |             fprintf(out, "\tAS:i:%d", a.score);
334 |             fprintf(out, "\tNM:i:%d\t", mismatches);
335 |             // fprintf(out, "\tNM:i:%d\t", a.mismatches);
336 |             if (a.score2 > 0)
337 |                 fprintf(out, "ZS:i:%d\n", a.score2);
338 |             else
339 |                 fprintf(out, "\n");
340 |         }
341 |     }
342 | 
343 |     std::string to_sam()
344 |     {
345 |         std::string res = "@HD\tVN:1.6\tSO:unknown\n";
346 |         res += idx.to_sam();
347 |         res += "@PG\tID:moni\tPN:moni\tVN:0.2.2\n";
348 |         return res; 
349 |     }
350 | 
351 | protected:
352 |     ms_pointers<> ms;
353 |     slp_t ra;
354 |     seqidx idx;
355 | 
356 |     size_t min_len = 0;
357 |     size_t extended_reads = 0;
358 |     size_t n = 0;
359 | 
360 |     unsigned char seq_nt4_table[256] = {
361 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
362 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
363 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
364 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
365 |         4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
366 |         4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
367 |         4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
368 |         4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
369 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
370 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
371 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
372 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
373 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
374 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
375 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
376 |         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
377 | 
378 |     int c, sa = 2, sb = 2, i, j, k, max_rseq = 0;
379 |     int w = 4000;
380 |     int8_t mat[25];
381 |     int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
382 |     uint8_t *rseq = 0;
383 | 
384 |     bool forward_only;
385 | };
386 | 
387 | #endif /* end of include guard: _EXTENDER_KLIB_HH */
388 | 


--------------------------------------------------------------------------------
/include/ms/ms_pointers.hpp:
--------------------------------------------------------------------------------
  1 | /* ms_pointers - Computes the matching statistics pointers from BWT and Thresholds 
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU General Public License as published by
  6 |     the Free Software Foundation, either version 3 of the License, or
  7 |     (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU General Public License
 15 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 16 | */
 17 | /*!
 18 |    \file ms_pointers.hpp
 19 |    \brief ms_pointers.hpp Computes the matching statistics pointers from BWT and Thresholds.
 20 |    \author Massimiliano Rossi
 21 |    \date 09/07/2020
 22 | */
 23 | 
 24 | #ifndef _MS_POINTERS_HH
 25 | #define _MS_POINTERS_HH
 26 | 
 27 | #include <common.hpp>
 28 | 
 29 | #include <malloc_count.h>
 30 | 
 31 | #include <sdsl/rmq_support.hpp>
 32 | #include <sdsl/int_vector.hpp>
 33 | 
 34 | #include <r_index.hpp>
 35 | 
 36 | #include <ms_rle_string.hpp>
 37 | #include <thresholds_ds.hpp>
 38 | 
 39 | template <class sparse_bv_type = ri::sparse_sd_vector,
 40 |           class rle_string_t = ms_rle_string_sd,
 41 |           class thresholds_t = thr_bv<rle_string_t> >
 42 | class ms_pointers : ri::r_index<sparse_bv_type, rle_string_t>
 43 | {
 44 | public:
 45 |     thresholds_t thresholds;
 46 | 
 47 |     // std::vector<ulint> samples_start;
 48 |     int_vector<> samples_start;
 49 |     // int_vector<> samples_end;
 50 |     // std::vector<ulint> samples_last;
 51 | 
 52 |     // static const uchar TERMINATOR = 1;
 53 |     // bool sais = true;
 54 |     // /*
 55 |     //  * sparse RLBWT: r (log sigma + (1+epsilon) * log (n/r)) (1+o(1)) bits
 56 |     //  */
 57 |     // //F column of the BWT (vector of 256 elements)
 58 |     // std::vector<ulint> F;
 59 |     // //L column of the BWT, run-length compressed
 60 |     // rle_string_t bwt;
 61 |     // ulint terminator_position = 0;
 62 |     // ulint r = 0; //number of BWT runs
 63 | 
 64 |     typedef size_t size_type;
 65 | 
    // Default constructor with an empty body: the members are
    // default-initialized and nothing is read from disk.
    ms_pointers() {}
 67 | 
 68 |     ms_pointers(std::string filename, bool rle = false) : ri::r_index<sparse_bv_type, rle_string_t>()
 69 |     {
 70 |         verbose("Building the r-index from BWT");
 71 | 
 72 |         std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
 73 | 
 74 |         std::string bwt_fname = filename + ".bwt";
 75 | 
 76 |         verbose("RLE encoding BWT and computing SA samples");
 77 | 
 78 |         if (rle)
 79 |         {
 80 |             std::string bwt_heads_fname = bwt_fname + ".heads";
 81 |             std::ifstream ifs_heads(bwt_heads_fname);
 82 |             std::string bwt_len_fname = bwt_fname + ".len";
 83 |             std::ifstream ifs_len(bwt_len_fname);
 84 |             this->bwt = rle_string_t(ifs_heads, ifs_len);
 85 | 
 86 |             ifs_heads.seekg(0);
 87 |             ifs_len.seekg(0);
 88 |             this->build_F_(ifs_heads, ifs_len);
 89 |         }
 90 |         else
 91 |         {
 92 |             std::ifstream ifs(bwt_fname);
 93 |             this->bwt = rle_string_t(ifs);
 94 | 
 95 |             ifs.seekg(0);
 96 |             this->build_F(ifs);
 97 |         }
 98 |         // std::string istring;
 99 |         // read_file(bwt_fname.c_str(), istring);
100 |         // for(size_t i = 0; i < istring.size(); ++i)
101 |         //     if(istring[i]==0)
102 |         //         istring[i] = TERMINATOR;
103 |         // this->bwt = rle_string_t(istring);
104 | 
105 |         this->r = this->bwt.number_of_runs();
106 |         ri::ulint n = this->bwt.size();
107 |         int log_r = bitsize(uint64_t(this->r));
108 |         int log_n = bitsize(uint64_t(this->bwt.size()));
109 | 
110 |         verbose("Number of BWT equal-letter runs: r = ", this->r);
111 |         verbose("Rate n/r = ", double(this->bwt.size()) / this->r);
112 |         verbose("log2(r) = ", log2(double(this->r)));
113 |         verbose("log2(n/r) = ", log2(double(this->bwt.size()) / this->r));
114 | 
115 |         // this->build_F(istring);
116 |         // istring.clear();
117 |         // istring.shrink_to_fit();
118 | 
119 |         read_samples(filename + ".ssa", this->r, n, samples_start);
120 |         read_samples(filename + ".esa", this->r, n, this->samples_last);
121 | 
122 |         std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
123 | 
124 |         verbose("R-index construction complete");
125 |         verbose("Memory peak: ", malloc_count_peak());
126 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
127 | 
128 |         verbose("Reading thresholds from file");
129 | 
130 |         t_insert_start = std::chrono::high_resolution_clock::now();
131 | 
132 |         thresholds = thresholds_t(filename,&this->bwt);
133 | 
134 |         // std::string tmp_filename = filename + std::string(".thr_pos");
135 | 
136 |         // struct stat filestat;
137 |         // FILE *fd;
138 | 
139 |         // if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
140 |         //     error("open() file " + tmp_filename + " failed");
141 | 
142 |         // int fn = fileno(fd);
143 |         // if (fstat(fn, &filestat) < 0)
144 |         //     error("stat() file " + tmp_filename + " failed");
145 | 
146 |         // if (filestat.st_size % THRBYTES != 0)
147 |         //     error("invilid file " + tmp_filename);
148 | 
149 |         // size_t length = filestat.st_size / THRBYTES;
150 |         // thresholds.resize(length);
151 | 
152 |         // for (size_t i = 0; i < length; ++i)
153 |         //     if ((fread(&thresholds[i], THRBYTES, 1, fd)) != 1)
154 |         //         error("fread() file " + tmp_filename + " failed");
155 | 
156 |         // fclose(fd);
157 | 
158 |         t_insert_end = std::chrono::high_resolution_clock::now();
159 | 
160 |         verbose("Memory peak: ", malloc_count_peak());
161 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
162 |     }
163 | 
164 |     void read_samples(std::string filename, ulint r, ulint n, int_vector<> &samples)
165 |     {
166 |         int log_n = bitsize(uint64_t(n));
167 | 
168 |         struct stat filestat;
169 |         FILE *fd;
170 | 
171 |         if ((fd = fopen(filename.c_str(), "r")) == nullptr)
172 |             error("open() file " + filename + " failed");
173 | 
174 |         int fn = fileno(fd);
175 |         if (fstat(fn, &filestat) < 0)
176 |             error("stat() file " + filename + " failed");
177 | 
178 |         if (filestat.st_size % SSABYTES != 0)
179 |             error("invilid file " + filename);
180 | 
181 |         size_t length = filestat.st_size / (2 * SSABYTES);
182 |         //Check that the length of the file is 2*r elements of 5 bytes
183 |         assert(length == r);
184 | 
185 |         // Create the vector
186 |         samples = int_vector<>(r, 0, log_n);
187 | 
188 |         // Read the vector
189 |         uint64_t left = 0;
190 |         uint64_t right = 0;
191 |         size_t i = 0;
192 |         while (fread((char *)&left, SSABYTES, 1, fd) && fread((char *)&right, SSABYTES, 1, fd))
193 |         {
194 |             ulint val = (right ? right - 1 : n - 1);
195 |             assert(bitsize(uint64_t(val)) <= log_n);
196 |             samples[i++] = val;
197 |         }
198 | 
199 |         fclose(fd);
200 |     }
201 | 
202 |     vector<ulint> build_F_(std::ifstream &heads, std::ifstream &lengths)
203 |     {
204 |         heads.clear();
205 |         heads.seekg(0);
206 |         lengths.clear();
207 |         lengths.seekg(0);
208 | 
209 |         this->F = vector<ulint>(256, 0);
210 |         int c;
211 |         ulint i = 0;
212 |         while ((c = heads.get()) != EOF)
213 |         {
214 |             size_t length = 0;
215 |             lengths.read((char *)&length, 5);
216 |             if (c > TERMINATOR)
217 |                 this->F[c] += length;
218 |             else
219 |             {
220 |                 this->F[TERMINATOR] += length;
221 |                 this->terminator_position = i;
222 |             }
223 |             i++;
224 |         }
225 |         for (ulint i = 255; i > 0; --i)
226 |             this->F[i] = this->F[i - 1];
227 |         this->F[0] = 0;
228 |         for (ulint i = 1; i < 256; ++i)
229 |             this->F[i] += this->F[i - 1];
230 |         return this->F;
231 |     }
232 | 
233 |     // Computes the matching statistics pointers for the given pattern
234 |     std::vector<size_t> query(const std::vector<uint8_t> &pattern)
235 |     {
236 |         size_t m = pattern.size();
237 | 
238 |         return _query(pattern.data(), m);
239 |     }
240 | 
241 |     std::vector<size_t> query(const char* pattern, const size_t m)
242 |     {
243 |         return _query(pattern, m);
244 |     }
245 | 
246 |     void print_stats()
247 |     {
248 |         sdsl::nullstream ns;
249 | 
250 |         verbose("Memory consumption (bytes).");
251 |         verbose("   terminator_position: ", sizeof(this->terminator_position));
252 |         verbose("                     F: ", my_serialize(this->F, ns));
253 |         verbose("                   bwt: ", this->bwt.serialize(ns));
254 |         verbose("          samples_last: ", this->samples_last.serialize(ns));
255 |         verbose("            thresholds: ", thresholds.serialize(ns));
256 |         verbose("         samples_start: ", samples_start.serialize(ns));
257 |     }
258 | 
259 |     /*
260 |      * \param i position in the BWT
261 |      * \param c character
262 |      * \return lexicographic rank of cw in bwt
263 |      */
264 |     ulint LF(ri::ulint i, ri::uchar c)
265 |     {
266 |         // //if character does not appear in the text, return empty pair
267 |         // if ((c == 255 and this->F[c] == this->bwt_size()) || this->F[c] >= this->F[c + 1])
268 |         //     return {1, 0};
269 |         //number of c before the interval
270 |         ri::ulint c_before = this->bwt.rank(i, c);
271 |         // number of c inside the interval rn
272 |         ri::ulint l = this->F[c] + c_before;
273 |         return l;
274 |     }
275 | 
276 |     /* serialize the structure to the ostream
277 |      * \param out     the ostream
278 |      */
279 |     size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const
280 |     {
281 |         sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));
282 |         size_type written_bytes = 0;
283 | 
284 |         out.write((char *)&this->terminator_position, sizeof(this->terminator_position));
285 |         written_bytes += sizeof(this->terminator_position);
286 |         written_bytes += my_serialize(this->F, out, child, "F");
287 |         written_bytes += this->bwt.serialize(out);
288 |         written_bytes += this->samples_last.serialize(out);
289 | 
290 |         written_bytes += thresholds.serialize(out, child, "thresholds");
291 |         // written_bytes += my_serialize(thresholds, out, child, "thresholds");
292 |         // written_bytes += my_serialize(samples_start, out, child, "samples_start");
293 |         written_bytes += samples_start.serialize(out, child, "samples_start");
294 | 
295 |         sdsl::structure_tree::add_size(child, written_bytes);
296 |         return written_bytes;
297 |     }
298 | 
299 |     std::string get_file_extension() const
300 |     {
301 |         return thresholds.get_file_extension() + ".ms";
302 |     }
303 | 
304 |     /* load the structure from the istream
305 |      * \param in the istream
306 |      */
307 |     void load(std::istream &in)
308 |     {
309 | 
310 |         in.read((char *)&this->terminator_position, sizeof(this->terminator_position));
311 |         my_load(this->F, in);
312 |         this->bwt.load(in);
313 |         this->r = this->bwt.number_of_runs();
314 |         this->samples_last.load(in);
315 | 
316 |         thresholds.load(in,&this->bwt);
317 |         // my_load(thresholds, in);
318 |         samples_start.load(in);
319 |         // my_load(samples_start,in);
320 |     }
321 | 
322 |     // // From r-index
323 |     // ulint get_last_run_sample()
324 |     // {
325 |     //     return (samples_last[r - 1] + 1) % bwt.size();
326 |     // }
327 | 
328 | protected:
329 |     // Computes the matching statistics pointers for the given pattern
330 |     template<typename string_t>
331 |     std::vector<size_t> _query(const string_t &pattern, const size_t m)
332 |     {
333 | 
334 |         std::vector<size_t> ms_pointers(m);
335 | 
336 |         // Start with the empty string
337 |         auto pos = this->bwt_size() - 1;
338 |         auto sample = this->get_last_run_sample();
339 | 
340 |         for (size_t i = 0; i < m; ++i)
341 |         {
342 |             auto c = pattern[m - i - 1];
343 | 
344 |             if (this->bwt.number_of_letter(c) == 0)
345 |             {
346 |                 sample = 0;
347 |             }
348 |             else if (pos < this->bwt.size() && this->bwt[pos] == c)
349 |             {
350 |                 sample--;
351 |             }
352 |             else
353 |             {
354 |                 // Get threshold
355 |                 ri::ulint rnk = this->bwt.rank(pos, c);
356 |                 size_t thr = this->bwt.size() + 1;
357 | 
358 |                 ulint next_pos = pos;
359 | 
360 |                 // if (rnk < (this->F[c] - this->F[c-1]) // I can use F to compute it
361 |                 if (rnk < this->bwt.number_of_letter(c))
362 |                 {
363 |                     // j is the first position of the next run of c's
364 |                     ri::ulint j = this->bwt.select(rnk, c);
365 |                     ri::ulint run_of_j = this->bwt.run_of_position(j);
366 | 
367 |                     thr = thresholds[run_of_j]; // If it is the first run thr = 0
368 | 
369 |                     // Here we should use Phi_inv that is not implemented yet
370 |                     // sample = this->Phi(this->samples_last[run_of_j - 1]) - 1;
371 |                     sample = samples_start[run_of_j];
372 | 
373 |                     next_pos = j;
374 |                 }
375 | 
376 |                 if (pos < thr)
377 |                 {
378 | 
379 |                     rnk--;
380 |                     ri::ulint j = this->bwt.select(rnk, c);
381 |                     ri::ulint run_of_j = this->bwt.run_of_position(j);
382 |                     sample = this->samples_last[run_of_j];
383 | 
384 |                     next_pos = j;
385 |                 }
386 | 
387 |                 pos = next_pos;
388 |             }
389 | 
390 |             ms_pointers[m - i - 1] = sample;
391 | 
392 |             // Perform one backward step
393 |             pos = LF(pos, c);
394 |         }
395 | 
396 |         return ms_pointers;
397 |     }
398 |     // // From r-index
399 |     // vector<ulint> build_F(std::ifstream &ifs)
400 |     // {
401 |     //     ifs.clear();
402 |     //     ifs.seekg(0);
403 |     //     F = vector<ulint>(256, 0);
404 |     //     uchar c;
405 |     //     ulint i = 0;
406 |     //     while (ifs >> c)
407 |     //     {
408 |     //         if (c > TERMINATOR)
409 |     //             F[c]++;
410 |     //         else
411 |     //         {
412 |     //             F[TERMINATOR]++;
413 |     //             terminator_position = i;
414 |     //         }
415 |     //         i++;
416 |     //     }
417 |     //     for (ulint i = 255; i > 0; --i)
418 |     //         F[i] = F[i - 1];
419 |     //     F[0] = 0;
420 |     //     for (ulint i = 1; i < 256; ++i)
421 |     //         F[i] += F[i - 1];
422 |     //     return F;
423 |     // }
424 | 
425 |     // // From r-index
426 |     // vector<pair<ulint, ulint>> &read_run_starts(std::string fname, ulint n, vector<pair<ulint, ulint>> &ssa)
427 |     // {
428 |     //     ssa.clear();
429 |     //     std::ifstream ifs(fname);
430 |     //     uint64_t x = 0;
431 |     //     uint64_t y = 0;
432 |     //     uint64_t i = 0;
433 |     //     while (ifs.read((char *)&x, 5) && ifs.read((char *)&y, 5))
434 |     //     {
435 |     //         ssa.push_back(pair<ulint, ulint>(y ? y - 1 : n - 1, i));
436 |     //         i++;
437 |     //     }
438 |     //     return ssa;
439 |     // }
440 | 
441 |     // // From r-index
442 |     // vector<ulint> &read_run_ends(std::string fname, ulint n, vector<ulint> &esa)
443 |     // {
444 |     //     esa.clear();
445 |     //     std::ifstream ifs(fname);
446 |     //     uint64_t x = 0;
447 |     //     uint64_t y = 0;
448 |     //     while (ifs.read((char *)&x, 5) && ifs.read((char *)&y, 5))
449 |     //     {
450 |     //         esa.push_back(y ? y - 1 : n - 1);
451 |     //     }
452 |     //     return esa;
453 |     // }
454 | };
455 | 
456 | // Computes the matching statistics pointers for the given pattern
457 | template <>
458 | template <typename string_t>
459 | std::vector<size_t> ms_pointers<ri::sparse_sd_vector, ms_rle_string_sd, thr_bv<ms_rle_string_sd>>::_query(const string_t &pattern, const size_t m)
460 | {
461 | 
462 |     std::vector<size_t> ms_pointers(m);
463 | 
464 |     // Start with the empty string
465 |     auto pos = this->bwt_size() - 1;
466 |     auto sample = this->get_last_run_sample();
467 | 
468 |     for (size_t i = 0; i < m; ++i)
469 |     {
470 |         auto c = pattern[m - i - 1];
471 |         const auto n_c = this->bwt.number_of_letter(c);
472 |         if (n_c == 0)
473 |         {
474 |             sample = 0;
475 |             // Perform one backward step
476 |             pos = LF(pos, c);
477 |         }
478 |         else if (pos < this->bwt.size() && this->bwt[pos] == c)
479 |         {
480 |             sample--;
481 |             // Perform one backward step
482 |             pos = LF(pos, c);
483 |         }
484 |         else
485 |         {
486 |             // Get threshold
487 |             ri::ulint run_of_pos = this->bwt.run_of_position(pos);
488 |             auto rnk_c = this->bwt.run_and_head_rank(run_of_pos, c);
489 |             size_t thr_c = thresholds.rank(pos + 1, c); // +1 because the rank count the thresiold in pos
490 | 
491 |             if (rnk_c.first > thr_c)
492 |             {
493 |                 // Jump up
494 |                 size_t run_of_j = this->bwt.run_head_select(rnk_c.first, c);
495 |                 sample = samples_last[run_of_j];
496 |                 // Perform one backward step
497 |                 pos = this->F[c] + rnk_c.second - 1;
498 |             }
499 |             else
500 |             {
501 |                 // Jump down
502 |                 size_t run_of_j = this->bwt.run_head_select(rnk_c.first + 1, c);
503 |                 sample = samples_start[run_of_j];
504 |                 // Perform one backward step
505 |                 pos = this->F[c] + rnk_c.second;
506 |             }
507 |         }
508 |         // Store the sample
509 |         ms_pointers[m - i - 1] = sample;
510 |     }
511 | 
512 |     return ms_pointers;
513 | }
514 | 
515 | #endif /* end of include guard: _MS_POINTERS_HH */
516 | 


--------------------------------------------------------------------------------
/include/ms/ms_rle_string.hpp:
--------------------------------------------------------------------------------
  1 | /* ms_rle_string - Extension of the r-index rle_string to compute matching statistics
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU General Public License as published by
  6 |     the Free Software Foundation, either version 3 of the License, or
  7 |     (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU General Public License
 15 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 16 | */
 17 | /*!
 18 |    \file ms_rle_string.hpp
 19 |    \brief ms_rle_string.hpp Extension of the r-index rle_string to compute matching statistics.
 20 |    \author Massimiliano Rossi
 21 |    \date 10/07/2020
 22 | */
 23 | 
 24 | #ifndef _MS_RLE_STRING_HH
 25 | #define _MS_RLE_STRING_HH
 26 | 
 27 | #include <common.hpp>
 28 | 
 29 | #include <rle_string.hpp>
 30 | 
 31 | template <
 32 |     class sparse_bitvector_t = ri::sparse_sd_vector, //predecessor structure storing run length
 33 |     class string_t = ri::huff_string                 //run heads
 34 |     >
 35 | class ms_rle_string : public ri::rle_string<sparse_bitvector_t, string_t>
 36 | {
 37 | public:
 38 |     ms_rle_string() : ri::rle_string<sparse_bitvector_t, string_t>()
 39 |     {
 40 |         //NtD
 41 |     }
 42 | 
 43 |     /*
 44 |      * constructor: build structure on the input string
 45 |      * \param input the input string without 0x0 bytes in it.
 46 |      * \param B block size. The main sparse bitvector has R/B bits set (R being number of runs)
 47 |      *
 48 |      */
 49 |     ms_rle_string(string &input, ulint B = 2) : ri::rle_string<sparse_bitvector_t, string_t>(input, B)
 50 |     {
 51 |         // NtD
 52 |     }
 53 | 
 54 |     ms_rle_string(std::ifstream &ifs, ulint B = 2) : ri::rle_string<sparse_bitvector_t, string_t>(ifs, B)
 55 |     {
 56 |     }
 57 | 
 58 |     // Construction from run-length encoded BWT
 59 |     ms_rle_string(std::ifstream &heads, std::ifstream &lengths, ulint B = 2)
 60 |     {
 61 |         // build_rlbwt(heads,lengths,B);
 62 |         heads.clear();
 63 |         heads.seekg(0);
 64 |         lengths.clear();
 65 |         lengths.seekg(0);
 66 |         // assert(not contains0(input)); // We're hacking the 0 away :)
 67 |         this->B = B;
 68 |         // n = input.size();
 69 |         auto runs_per_letter_bv = vector<vector<bool>>(256);
 70 |         //runs in main bitvector
 71 |         vector<bool> runs_bv;
 72 | 
 73 |         // Reads the run heads
 74 |         string run_heads_s;
 75 |         heads.seekg(0, heads.end);
 76 |         run_heads_s.resize(heads.tellg());
 77 |         heads.seekg(0, heads.beg);
 78 |         heads.read(&run_heads_s[0], run_heads_s.size());
 79 | 
 80 |         size_t pos = 0;
 81 |         this->n = 0;
 82 |         this->R = run_heads_s.size();
 83 |         // Compute runs_bv and runs_per_letter_bv
 84 |         for (size_t i = 0; i < run_heads_s.size(); ++i)
 85 |         {
 86 |             size_t length;
 87 |             lengths.read((char *)&length, 5);
 88 |             if (run_heads_s[i] <= TERMINATOR) // change 0 to 1
 89 |                 run_heads_s[i] = TERMINATOR;
 90 | 
 91 |             std::fill_n(std::back_inserter(runs_bv), length - 1, false);
 92 |             runs_bv.push_back(i % B == B - 1);
 93 | 
 94 |             std::fill_n(std::back_inserter(runs_per_letter_bv[run_heads_s[i]]), length - 1, false);
 95 |             runs_per_letter_bv[run_heads_s[i]].push_back(true);
 96 | 
 97 |             this->n += length;
 98 |         }
 99 |         // runs_bv.push_back(false);
100 | 
101 |         //now compact structures
102 |         assert(runs_bv.size() == this->n);
103 |         ulint t = 0;
104 |         for (ulint i = 0; i < 256; ++i)
105 |             t += runs_per_letter_bv[i].size();
106 |         assert(t == this->n);
107 |         this->runs = sparse_bitvector_t(runs_bv);
108 |         //a fast direct array: char -> bitvector.
109 |         this->runs_per_letter = vector<sparse_bitvector_t>(256);
110 |         for (ulint i = 0; i < 256; ++i)
111 |             this->runs_per_letter[i] = sparse_bitvector_t(runs_per_letter_bv[i]);
112 |         this->run_heads = string_t(run_heads_s);
113 |         assert(this->run_heads.size() == this->R);
114 |     }
115 | 
116 |     size_t number_of_runs_of_letter(uint8_t c)
117 |     {
118 |         return this->runs_per_letter[c].number_of_1();
119 |     }
120 | 
121 |     size_t number_of_letter(uint8_t c)
122 |     {
123 |         return this->runs_per_letter[c].size();
124 |     }
125 | 
126 |     // i-th run head
127 |     uint8_t head_of(const size_t i)
128 |     {
129 |         assert(i<this->R);
130 |         return this->run_heads[i];
131 |     }
132 | 
133 |     // rank in chracters of the i-th run head
134 |     // i.e., the number of characters c before the first character of the run.
135 |     size_t head_rank(const size_t i, const uint8_t c)
136 |     {
137 |         assert(i < this->R);
138 |         size_t j = this->run_heads.rank(i, c);
139 |         if(j < 1)
140 |             return j;
141 |         assert(j<=i);
142 |         return this->runs_per_letter[c].select(j-1) + 1; // j-1 because the select is 0 based
143 |     }
144 |     // number of runs of character c in in position i
145 |     size_t run_head_rank(const size_t i, const uint8_t c)
146 |     {
147 |         assert(i < this->R);
148 |         size_t j = this->run_heads.rank(i, c);
149 |         return j;
150 |     }
151 | 
152 |     inline std::pair<size_t,size_t> run_and_head_rank(const size_t i, const uint8_t c)
153 |     {
154 |         assert(i < this->R);
155 |         const size_t j = this->run_heads.rank(i, c);
156 |         if( j < 1)
157 |             return make_pair(j,j);
158 |         const size_t k = this->runs_per_letter[c].select(j - 1) + 1; // j-1 because the select is 0 based
159 |         return make_pair(j, k);
160 |     }
161 | 
162 |     // Select the i-th run of c
163 |     size_t run_head_select(const size_t i, const uint8_t c)
164 |     {
165 |         assert(i < this->R and i > 0);
166 |         return this->run_heads.select(i - 1, c);
167 |     }
168 | 
169 |     /* serialize the structure to the ostream
170 |      * \param out     the ostream
171 |      */
172 |     ulint serialize(std::ostream &out)
173 |     {
174 |         return ri::rle_string<sparse_bitvector_t, string_t>::serialize(out);
175 |     }
176 | 
177 |     /* load the structure from the istream
178 |      * \param in the istream
179 |      */
180 |     void load(std::istream &in)
181 |     {
182 |         ri::rle_string<sparse_bitvector_t, string_t>::load(in);
183 |     }
184 | 
185 | protected:
186 |     void build_rlbwt(std::ifstream &heads, std::ifstream &lengths, ulint B) 
187 |     {
188 |         heads.clear();
189 |         heads.seekg(0);
190 |         lengths.clear();
191 |         lengths.seekg(0);
192 |         // assert(not contains0(input)); // We're hacking the 0 away :)
193 |         this->B = B;
194 |         // n = input.size();
195 |         auto runs_per_letter_bv = vector<vector<bool>>(256);
196 |         //runs in main bitvector
197 |         vector<bool> runs_bv;
198 | 
199 |         // Reads the run heads
200 |         string run_heads_s;
201 |         heads.seekg(0, heads.end);
202 |         run_heads_s.resize(heads.tellg());
203 |         heads.seekg(0, heads.beg);
204 |         heads.read(&run_heads_s[0], run_heads_s.size());
205 | 
206 |         size_t pos = 0;
207 |         this->n = 0;
208 |         this->R = run_heads_s.size();
209 |         // Compute runs_bv and runs_per_letter_bv
210 |         for (size_t i = 0; i < run_heads_s.size(); ++i)
211 |         {
212 |             size_t length = 0;
213 |             lengths.read((char *)&length, 5);
214 |             if (run_heads_s[i] <= TERMINATOR) // change 0 to 1
215 |                 run_heads_s[i] = TERMINATOR;
216 | 
217 |             std::fill_n(std::back_inserter(runs_bv), length - 1, false);
218 |             runs_bv.push_back(i % B == B - 1);
219 | 
220 |             std::fill_n(std::back_inserter(runs_per_letter_bv[run_heads_s[i]]), length - 1, false);
221 |             runs_per_letter_bv[run_heads_s[i]].push_back(true);
222 | 
223 |             this->n += length;
224 |         }
225 |         // runs_bv.push_back(false);
226 | 
227 |         //now compact structures
228 |         assert(runs_bv.size() == this->n);
229 |         ulint t = 0;
230 |         for (ulint i = 0; i < 256; ++i)
231 |             t += runs_per_letter_bv[i].size();
232 |         assert(t == this->n);
233 |         this->runs = sparse_bitvector_t(runs_bv);
234 |         //a fast direct array: char -> bitvector.
235 |         this->runs_per_letter = vector<sparse_bitvector_t>(256);
236 |         for (ulint i = 0; i < 256; ++i)
237 |             this->runs_per_letter[i] = sparse_bitvector_t(runs_per_letter_bv[i]);
238 |         this->run_heads = string_t(run_heads_s);
239 |         assert(this->run_heads.size() == this->R);
240 |     }
241 | private:
242 | };
243 | 
244 | // Construction from run-length encoded BWT specialization for sparse_sd_vector
245 | template <>
246 | ms_rle_string<ri::sparse_sd_vector, ri::huff_string>::ms_rle_string(std::ifstream &heads, std::ifstream &lengths, ulint B)
247 | {
248 |     heads.clear();
249 |     heads.seekg(0);
250 |     lengths.clear();
251 |     lengths.seekg(0);
252 |     // assert(not contains0(input)); // We're hacking the 0 away :)
253 |     this->B = B;
254 |     // n = input.size();
255 | 
256 |     // Reads the run heads
257 |     string run_heads_s;
258 |     heads.seekg(0, heads.end);
259 |     run_heads_s.resize(heads.tellg());
260 |     heads.seekg(0, heads.beg);
261 |     heads.read(&run_heads_s[0], run_heads_s.size());
262 | 
263 |     size_t pos = 0;
264 |     this->n = 0;
265 |     this->R = run_heads_s.size();
266 | 
267 |     auto runs_per_letter_bv = vector<vector<size_t>> (256);
268 |     auto runs_per_letter_bv_i = vector<size_t> (256,0);
269 |     //runs in main bitvector
270 |     vector<size_t> runs_bv_onset;
271 |     size_t runs_bv_i = 0;
272 |     // Compute runs_bv and runs_per_letter_bv
273 |     for (size_t i = 0; i < run_heads_s.size(); ++i)
274 |     {
275 |         size_t length = 0;
276 |         lengths.read((char *)&length, 5);
277 | 
278 |         uint8_t curr_ch = unsigned(run_heads_s[i]);
279 |         if (curr_ch <= TERMINATOR) { // change 0 to 1
280 |             run_heads_s[i] = TERMINATOR;
281 |             curr_ch = TERMINATOR;
282 |         }
283 | 
284 |         if(i % B == B - 1)
285 |             runs_bv_onset.push_back(this->n + length - 1);
286 | 
287 |         assert(length > 0);
288 |         runs_per_letter_bv_i[curr_ch] += length;
289 |         runs_per_letter_bv[curr_ch].push_back(runs_per_letter_bv_i[curr_ch] - 1);
290 | 
291 |         this->n += length;
292 |     }
293 |     // runs_bv.push_back(false);
294 | 
295 |     //now compact structures
296 |     ulint t = 0;
297 |     for (ulint i = 0; i < 256; ++i)
298 |         t += runs_per_letter_bv_i[i];
299 |     assert(t == this->n);
300 |     this->runs = ri::sparse_sd_vector(runs_bv_onset, this->n);
301 |     //a fast direct array: char -> bitvector.
302 |     this->runs_per_letter = vector<ri::sparse_sd_vector>(256);
303 |     for (ulint i = 0; i < 256; ++i)
304 |         this->runs_per_letter[i] = ri::sparse_sd_vector(runs_per_letter_bv[i],runs_per_letter_bv_i[i]);
305 |     this->run_heads = ri::huff_string(run_heads_s);
306 |     assert(this->run_heads.size() == this->R);
307 | };
308 | 
309 | typedef ms_rle_string<ri::sparse_sd_vector> ms_rle_string_sd;
310 | typedef ms_rle_string<ri::sparse_hyb_vector> ms_rle_string_hyb;
311 | 
312 | #endif /* end of include guard: _MS_RLE_STRING_HH */
313 | 


--------------------------------------------------------------------------------
/include/ms/thresholds_ds.hpp:
--------------------------------------------------------------------------------
  1 | /* thresholds_ds - Stores the thresholds in compressed and plain ways 
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 | 
  4 |     This program is free software: you can redistribute it and/or modify
  5 |     it under the terms of the GNU General Public License as published by
  6 |     the Free Software Foundation, either version 3 of the License, or
  7 |     (at your option) any later version.
  8 | 
  9 |     This program is distributed in the hope that it will be useful,
 10 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |     GNU General Public License for more details.
 13 | 
 14 |     You should have received a copy of the GNU General Public License
 15 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 16 | */
 17 | /*!
 18 |    \file thresholds_ds.hpp
 19 |    \brief thresholds_ds.hpp Stores the thresholds in compressed and plain ways.
 20 |    \author Massimiliano Rossi
 21 |    \date 09/07/2020
 22 | */
 23 | 
 24 | #ifndef _MS_THRESHOLDS_DS_HH
 25 | #define _MS_THRESHOLDS_DS_HH
 26 | 
 27 | #include <common.hpp>
 28 | 
 29 | #include <malloc_count.h>
 30 | 
 31 | #include <sdsl/rmq_support.hpp>
 32 | #include <sdsl/int_vector.hpp>
 33 | 
 34 | #include <ms_rle_string.hpp>
 35 | 
 36 | template <class rle_string_t = ms_rle_string_sd>
 37 | class thr_plain
 38 | {
 39 | public:
 40 |     int_vector<> thresholds;
 41 |     rle_string_t *bwt;
 42 | 
 43 |     typedef size_t size_type;
 44 | 
 45 |     thr_plain()
 46 |     {
 47 |         bwt=nullptr;
 48 |     }
 49 | 
 50 |     thr_plain(std::string filename, rle_string_t* bwt_):bwt(bwt_)
 51 |     {
 52 |         int log_n = bitsize(uint64_t(bwt->size()));
 53 | 
 54 |         verbose("Reading thresholds from file");
 55 | 
 56 |         std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
 57 | 
 58 |         std::string tmp_filename = filename + std::string(".thr_pos");
 59 | 
 60 |         struct stat filestat;
 61 |         FILE *fd;
 62 | 
 63 |         if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
 64 |             error("open() file " + tmp_filename + " failed");
 65 | 
 66 |         int fn = fileno(fd);
 67 |         if (fstat(fn, &filestat) < 0)
 68 |             error("stat() file " + tmp_filename + " failed");
 69 | 
 70 |         if (filestat.st_size % THRBYTES != 0)
 71 |             error("invilid file " + tmp_filename);
 72 | 
 73 |         size_t length = filestat.st_size / THRBYTES;
 74 |         size_t threshold = 0;
 75 | 
 76 |         thresholds = int_vector<>(length, 0, log_n);
 77 | 
 78 |         for (size_t i = 0; i < length; ++i)
 79 |         {
 80 |             size_t threshold = 0;
 81 |             if ((fread(&threshold, THRBYTES, 1, fd)) != 1)
 82 |                 error("fread() file " + tmp_filename + " failed");
 83 |             thresholds[i] = threshold;
 84 |         }
 85 | 
 86 |         fclose(fd);
 87 | 
 88 |         std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
 89 | 
 90 |         verbose("Memory peak: ", malloc_count_peak());
 91 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
 92 |     }
 93 | 
 94 |     // Destructor
 95 |     ~thr_plain() 
 96 |     {
 97 |        // NtD
 98 |     }
 99 | 
100 |     // Copy constructor
101 |     thr_plain(const thr_plain &other)
102 |         :thresholds(other.thresholds),
103 |         bwt(other.bwt)
104 |     {
105 |     }
106 | 
107 |     friend void swap(thr_plain &first, thr_plain &second) // nothrow
108 |     {
109 |         using std::swap;
110 | 
111 |         swap(first.thresholds, second.thresholds);
112 |         swap(first.bwt, second.bwt);
113 |     }
114 | 
115 |     // Copy assignment
116 |     thr_plain &operator=(thr_plain other) 
117 |     {
118 |         swap(*this,other);
119 |         
120 |         return *this;
121 |     }
122 | 
123 |     // Move constructor
124 |     thr_plain(thr_plain &&other) noexcept
125 |         : thr_plain()
126 |     {
127 |         swap(*this, other);
128 |     }
129 | 
130 |     size_t operator[] (size_t& i)
131 |     {
132 |         assert( i < thresholds.size());
133 |         return thresholds[i];
134 |     }
135 | 
136 |     /* serialize the structure to the ostream
137 |      * \param out     the ostream
138 |      */
139 |     size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const
140 |     {
141 |         sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));
142 |         size_type written_bytes = 0;
143 | 
144 |         written_bytes += thresholds.serialize(out, child, "thresholds");
145 | 
146 |         sdsl::structure_tree::add_size(child, written_bytes);
147 |         return written_bytes;
148 |     }
149 | 
150 |     /* load the structure from the istream
151 |      * \param in the istream
152 |      */
153 |     void load(std::istream &in, rle_string_t *bwt_)
154 |     {
155 |         thresholds.load(in);
156 |         bwt = bwt_;
157 |     }
158 | 
159 |     std::string get_file_extension() const
160 |     {
161 |         return ".thrp"; // fixed extension string reported by thr_plain
162 |     }
163 | };
164 | 
165 | template <class rle_string_t = ms_rle_string_sd>
166 | class thr_compressed
167 | {
168 | public:
169 |     int_vector<> thresholds; // one stored value per threshold read from the file
170 |     rle_string_t *bwt;       // stored as given; never deleted by this class
171 |     long long min_off;       // shift added to every non-zero offset before it is stored
172 | 
173 |     typedef size_t size_type;
174 | 
175 |     // Empty structure; min_off is set too, so copy/move/swap never read an indeterminate value.
176 |     thr_compressed()
177 |         : bwt(nullptr), min_off(0)
178 |     {
179 |     }
180 | 
181 |     /* Builds the structure from filename + ".thr_pos", scanning the file twice: first to
182 |      * find the smallest and largest offset, then to store each offset shifted by the smallest.
183 |      * \param filename prefix of the file holding the THRBYTES-wide thresholds
184 |      * \param bwt_     rle string queried to turn each threshold into an offset
185 |      */
186 |     thr_compressed(std::string filename, rle_string_t* bwt_):bwt(bwt_)
187 |     {
188 |         size_t n = uint64_t(bwt->size());
189 | 
190 |         verbose("Reading thresholds from file");
191 | 
192 |         std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
193 | 
194 |         std::string tmp_filename = filename + std::string(".thr_pos");
195 | 
196 |         struct stat filestat;
197 |         FILE *fd;
198 | 
199 |         if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
200 |             error("open() file " + tmp_filename + " failed");
201 | 
202 |         int fn = fileno(fd);
203 |         if (fstat(fn, &filestat) < 0)
204 |             error("stat() file " + tmp_filename + " failed");
205 | 
206 |         if (filestat.st_size % THRBYTES != 0)
207 |             error("invilid file " + tmp_filename);
208 | 
209 |         size_t length = filestat.st_size / THRBYTES;
210 | 
211 |         size_t pos = 0;
212 | 
213 |         long long max_off = 0;
214 |         min_off = n; // starting value, lowered while scanning
215 | 
216 |         // First scan: find the range [min_off, max_off] of the offsets.
217 |         for (size_t i = 0; i < length; ++i)
218 |         {
219 |             size_t threshold = 0;
220 |             if ((fread(&threshold, THRBYTES, 1, fd)) != 1)
221 |                 error("fread() file " + tmp_filename + " failed");
222 | 
223 |             long long off = 0;
224 | 
225 |             if (threshold > 0)
226 |             {
227 |                 uint8_t c = bwt->head_of(i);
228 |                 size_t pred = bwt->select(bwt->rank(pos - 1, c) - 1, c);
229 |                 size_t mid_int = (pos - pred + 1) >> 1;
230 |                 assert(threshold > pred);
231 | 
232 |                 threshold = threshold - pred;
233 | 
234 |                 off = mid_int - threshold;
235 | 
236 |                 max_off = max(max_off, off);
237 |                 min_off = min(min_off, off);
238 |             }
239 | 
240 |             pos += bwt->run_at(i);
241 |         }
242 | 
243 |         // Rewind the file for the second scan; after a failed seek it would read the wrong data.
244 |         if (fseek(fd, 0, SEEK_SET) != 0)
245 |             error("fseek() file " + tmp_filename + " failed");
246 |         pos = 0;
247 | 
248 |         int log_off = bitsize((size_t)(max_off - min_off + 1));
249 | 
250 |         min_off = -min_off; // Shift all the values
251 |         thresholds = int_vector<>(length,0,log_off);
252 |         for (size_t i = 0; i < length; ++i)
253 |         {
254 |             size_t threshold = 0;
255 |             if ((fread(&threshold, THRBYTES, 1, fd)) != 1)
256 |                 error("fread() file " + tmp_filename + " failed");
257 | 
258 |             long long off = 0;
259 | 
260 |             if (threshold > 0)
261 |             {
262 |                 uint8_t c = bwt->head_of(i);
263 |                 size_t pred = bwt->select(bwt->rank(pos - 1, c) - 1, c);
264 |                 size_t mid_int = (pos - pred + 1) >> 1;
265 |                 assert(threshold > pred);
266 | 
267 |                 threshold = threshold - pred;
268 | 
269 |                 off = mid_int - threshold + min_off;
270 |             }
271 | 
272 |             thresholds[i] = off;
273 |             pos += bwt->run_at(i);
274 |         }
275 | 
276 |         fclose(fd);
277 | 
278 |         std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
279 | 
280 |         verbose("Memory peak: ", malloc_count_peak());
281 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
282 |     }
283 | 
284 |     // Destructor: nothing to release, bwt is not deleted here.
285 |     ~thr_compressed()
286 |     {
287 |     }
288 | 
289 |     // Copy constructor: bwt is copied as a pointer, so both objects refer to the same rle string.
290 |     thr_compressed(const thr_compressed &other)
291 |         : thresholds(other.thresholds),
292 |           bwt(other.bwt),
293 |           min_off(other.min_off)
294 |     {
295 |     }
296 | 
297 |     friend void swap(thr_compressed &first, thr_compressed &second) // nothrow
298 |     {
299 |         using std::swap;
300 |         swap(first.thresholds, second.thresholds);
301 |         swap(first.bwt, second.bwt);
302 |         swap(first.min_off, second.min_off);
303 |     }
304 | 
305 |     // Copy assignment (copy-and-swap: other is already a copy of the right-hand side)
306 |     thr_compressed &operator=(thr_compressed other)
307 |     {
308 |         swap(*this, other);
309 |         return *this;
310 |     }
311 | 
312 |     // Move constructor: start from the empty state, then trade it for other's state.
313 |     thr_compressed(thr_compressed &&other) noexcept
314 |         : thr_compressed()
315 |     {
316 |         swap(*this, other);
317 |     }
318 | 
319 |     // Rebuilds the value for entry i from its stored offset (0 when head_rank() is 0).
320 |     size_t operator[] (size_t& i)
321 |     {
322 |         assert( i < thresholds.size());
323 |         // get mid_interval
324 |         uint8_t c = bwt->head_of(i);
325 |         size_t rank = bwt->head_rank(i, c);
326 |         if(rank == 0)
327 |             return 0;
328 |         // NOTE(review): the constructor derives pred from rank(pos - 1, c); confirm head_rank() yields the same predecessor.
329 |         size_t pred = bwt->select(rank - 1, c);
330 |         size_t pos = bwt->select(rank, c);
331 |         size_t mid_int = (pos - pred + 1) >> 1;
332 |         size_t thr_i = thresholds[i];
333 | 
334 |         // Undo the constructor's encoding: stored = mid_int - (threshold - pred) + min_off.
335 |         return mid_int + min_off - thr_i + pred;
336 |     }
337 | 
338 |     /* serialize the structure to the ostream (min_off first, then thresholds; bwt is not written)
339 |      * \param out     the ostream
340 |      */
341 |     size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const
342 |     {
343 |         sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));
344 |         size_type written_bytes = 0;
345 | 
346 |         out.write((char *)&min_off, sizeof(min_off));
347 |         written_bytes += sizeof(min_off);
348 |         written_bytes += thresholds.serialize(out, child, "thresholds");
349 |         sdsl::structure_tree::add_size(child, written_bytes);
350 |         return written_bytes;
351 |     }
352 | 
353 |     /* load the structure from the istream (same order as serialize; bwt_ is stored, not read)
354 |      * \param in the istream
355 |      */
356 |     void load(std::istream &in, rle_string_t *bwt_)
357 |     {
358 |         in.read((char *)&min_off, sizeof(min_off));
359 |         thresholds.load(in);
360 |         bwt = bwt_;
361 |     }
362 | 
363 |     std::string get_file_extension() const
364 |     {
365 |         return ".thrc";
366 |     }
367 | };
368 | 
369 | 
370 | template <class rle_string_t = ms_rle_string_sd>
371 | class thr_bv
372 | {
373 | public:
374 |     std::vector<ri::sparse_sd_vector> thresholds_per_letter; // 256 bitvectors once built or loaded, indexed by character
375 |     rle_string_t *bwt;                                       // stored as given; never deleted by this class
376 | 
377 |     typedef size_t size_type;
378 | 
379 |     // Empty structure: no bitvectors and no rle string.
380 |     thr_bv()
381 |     {
382 |         bwt=nullptr;
383 |     }
384 | 
385 |     /* Builds the per-character bitvectors from the file filename + ".thr_pos".
386 |      * Every non-zero threshold is added to the bitvector of the character
387 |      * returned by bwt->head_of() for the same index.
388 |      * \param filename prefix of the file holding the THRBYTES-wide thresholds
389 |      * \param bwt_     rle string providing the characters and its size()
390 |      */
391 |     thr_bv(std::string filename, rle_string_t* bwt_):bwt(bwt_)
392 |     {
393 |         size_t n = uint64_t(bwt->size());
394 | 
395 |         verbose("Reading thresholds from file");
396 | 
397 |         std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
398 | 
399 |         std::string tmp_filename = filename + std::string(".thr_pos");
400 | 
401 |         struct stat filestat;
402 |         FILE *fd;
403 | 
404 |         if ((fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
405 |             error("open() file " + tmp_filename + " failed");
406 | 
407 |         int fn = fileno(fd);
408 |         if (fstat(fn, &filestat) < 0)
409 |             error("stat() file " + tmp_filename + " failed");
410 | 
411 |         if (filestat.st_size % THRBYTES != 0)
412 |             error("invilid file " + tmp_filename);
413 | 
414 |         size_t length = filestat.st_size / THRBYTES;
415 | 
416 |         auto thrs_per_letter_bv = std::vector<std::vector<size_t>>(256); // non-zero thresholds collected per character
417 |         auto thrs_per_letter_bv_i = std::vector<size_t>(256, 0);         // n for every character seen, 0 otherwise
418 | 
419 |         for (size_t i = 0; i < length; ++i)
420 |         {
421 |             size_t threshold = 0;
422 |             if ((fread(&threshold, THRBYTES, 1, fd)) != 1)
423 |                 error("fread() file " + tmp_filename + " failed");
424 | 
425 |             uint8_t c = bwt->head_of(i);
426 |             if (threshold > 0)
427 |                 thrs_per_letter_bv[c].push_back(threshold);
428 |             thrs_per_letter_bv_i[c] = n; // presumably the bitvector length passed below - confirm
429 |         }
430 | 
431 |         thresholds_per_letter = std::vector<ri::sparse_sd_vector>(256);
432 |         for (ulint i = 0; i < 256; ++i)
433 |             thresholds_per_letter[i] = ri::sparse_sd_vector(thrs_per_letter_bv[i], thrs_per_letter_bv_i[i]);
434 | 
435 |         fclose(fd);
436 | 
437 |         std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
438 | 
439 |         verbose("Memory peak: ", malloc_count_peak());
440 |         verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
441 |     }
442 | 
443 |     // Destructor: nothing to release, bwt is not deleted here.
444 |     ~thr_bv()
445 |     {
446 |     }
447 | 
448 |     // Copy constructor: bwt is copied as a pointer, so both objects refer to the same rle string.
449 |     thr_bv(const thr_bv &other)
450 |         : thresholds_per_letter(other.thresholds_per_letter),
451 |           bwt(other.bwt)
452 |     {
453 |     }
454 | 
455 |     friend void swap(thr_bv &first, thr_bv &second) // nothrow
456 |     {
457 |         using std::swap;
458 | 
459 |         swap(first.thresholds_per_letter, second.thresholds_per_letter);
460 |         swap(first.bwt, second.bwt);
461 |     }
462 | 
463 |     // Copy assignment (copy-and-swap: other is already a copy of the right-hand side)
464 |     thr_bv &operator=(thr_bv other)
465 |     {
466 |         swap(*this, other);
467 | 
468 |         return *this;
469 |     }
470 | 
471 |     // Move constructor: start from the empty state, then trade it for other's state.
472 |     thr_bv(thr_bv &&other) noexcept
473 |         : thr_bv()
474 |     {
475 |         swap(*this, other);
476 |     }
477 | 
478 |     // Returns the value selected for run i, or 0 when run_head_rank() is 0 for its character.
479 |     size_t operator[] (size_t& i)
480 |     {
481 |         assert(i < bwt->number_of_runs());
482 | 
483 |         // run_head_rank(i, c): presumably the number of runs of c before run i - confirm
484 |         uint8_t c = bwt->head_of(i);
485 |         size_t rank = bwt->run_head_rank(i, c);
486 |         if(rank == 0)
487 |             return 0;
488 | 
489 |         // select() is called with rank-1 on the bitvector of c
490 |         return thresholds_per_letter[c].select(rank-1);
491 |     }
492 | 
493 |     // number of thresholds for the character c before position i 
494 |     size_t rank(const size_t i, const uint8_t c)
495 |     {
496 |         return thresholds_per_letter[c].rank(i);
497 |     }
498 | 
499 |     /* serialize the structure to the ostream (the 256 bitvectors in order; bwt is not written)
500 |      * \param out     the ostream
501 |      */
502 |     size_type serialize(std::ostream &out, sdsl::structure_tree_node *v = nullptr, std::string name = "") // const
503 |     {
504 |         sdsl::structure_tree_node *child = sdsl::structure_tree::add_child(v, name, sdsl::util::class_name(*this));
505 |         size_type written_bytes = 0;
506 |         assert(thresholds_per_letter.size() == 256); // the loop indexes all 256 entries; a default-constructed object has none
507 |         for (ulint i = 0; i < 256; ++i)
508 |             written_bytes += thresholds_per_letter[i].serialize(out);
509 | 
510 |         sdsl::structure_tree::add_size(child, written_bytes);
511 |         return written_bytes;
512 |     }
513 | 
514 |     /* load the structure from the istream (256 bitvectors; bwt_ is stored, not read)
515 |      * \param in the istream
516 |      */
517 |     void load(std::istream &in, rle_string_t *bwt_)
518 |     {
519 |         thresholds_per_letter = std::vector<ri::sparse_sd_vector>(256);
520 |         for (ulint i = 0; i < 256; ++i)
521 |             thresholds_per_letter[i].load(in);
522 |         bwt = bwt_;
523 |     }
524 | 
525 |     std::string get_file_extension() const
526 |     {
527 |         return ".thrbv";
528 |     }
529 | };
530 | 
531 | #endif /* end of include guard: _MS_THRESHOLDS_DS_HH */
532 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | # Dictionary compressor: it needs only the common headers.
 3 | add_executable(compress_dictionary compress_dictionary.cpp)
 4 | target_link_libraries(compress_dictionary PRIVATE sdsl malloc_count)
 5 | target_include_directories(compress_dictionary PRIVATE "../include/common")
 6 | 
 7 | # Expose <name>_SOURCE_DIR for the dependencies declared with FetchContent.
 8 | FetchContent_GetProperties(r-index)
 9 | FetchContent_GetProperties(shaped_slp)
10 | FetchContent_GetProperties(ssw)
11 | FetchContent_GetProperties(ksw2)
12 | FetchContent_GetProperties(klib)
13 | FetchContent_GetProperties(bigbwt)
14 | 
15 | set(FOLCA_SOURCE_DIR ${shaped_slp_SOURCE_DIR}/folca)
16 | set(SUX_SOURCE_DIR ${shaped_slp_SOURCE_DIR}/external/sux/sux)
17 | 
18 | # Header directories of the SLP code, shared by the index tools below.
19 | set(MONI_SLP_INCLUDE_DIRS
20 |     "${shaped_slp_SOURCE_DIR}"
21 |     "${FOLCA_SOURCE_DIR}"
22 |     "${SUX_SOURCE_DIR}/function"
23 |     "${SUX_SOURCE_DIR}/support")
24 | 
25 | # Threads::Threads replaces the bare "pthread" library name.
26 | find_package(Threads REQUIRED)
27 | 
28 | # C++17 is requested through target_compile_features, which only affects C++
29 | # sources; a raw "-std=c++17" option is also handed to xerrors.c and ksw.c.
30 | add_executable(ms matching_statistics.cpp ${bigbwt_SOURCE_DIR}/xerrors.c)
31 | target_link_libraries(ms PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri Threads::Threads)
32 | target_include_directories(ms PRIVATE
33 |     "../include/ms"
34 |     "../include/common"
35 |     ${MONI_SLP_INCLUDE_DIRS}
36 |     "${bigbwt_SOURCE_DIR}")
37 | target_compile_features(ms PRIVATE cxx_std_17)
38 | 
39 | add_executable(mems mems.cpp ${bigbwt_SOURCE_DIR}/xerrors.c)
40 | target_link_libraries(mems PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri Threads::Threads)
41 | target_include_directories(mems PRIVATE
42 |     "../include/ms"
43 |     "../include/common"
44 |     ${MONI_SLP_INCLUDE_DIRS}
45 |     "${bigbwt_SOURCE_DIR}")
46 | target_compile_features(mems PRIVATE cxx_std_17)
47 | 
48 | add_executable(rlebwt_ms_build rlebwt_ms_build.cpp)
49 | target_link_libraries(rlebwt_ms_build PRIVATE common sdsl divsufsort divsufsort64 malloc_count ri)
50 | target_include_directories(rlebwt_ms_build PRIVATE
51 |     "../include/ms"
52 |     "../include/common"
53 |     ${MONI_SLP_INCLUDE_DIRS})
54 | target_compile_features(rlebwt_ms_build PRIVATE cxx_std_17)
55 | 
56 | # Read extenders: they additionally use the extender headers and the klib/ssw or ksw2 aligners.
57 | add_executable(extend_klib extend_klib.cpp ${klib_SOURCE_DIR}/ksw.c ${bigbwt_SOURCE_DIR}/xerrors.c)
58 | target_link_libraries(extend_klib PRIVATE common malloc_count sdsl divsufsort divsufsort64 ri klib ssw Threads::Threads)
59 | target_include_directories(extend_klib PRIVATE
60 |     "../include/ms"
61 |     "../include/common"
62 |     "../include/extender"
63 |     ${MONI_SLP_INCLUDE_DIRS}
64 |     "${ssw_SOURCE_DIR}/src"
65 |     "${klib_SOURCE_DIR}"
66 |     "${bigbwt_SOURCE_DIR}")
67 | target_compile_features(extend_klib PRIVATE cxx_std_17)
68 | 
69 | add_executable(extend_ksw2 extend_ksw2.cpp ${bigbwt_SOURCE_DIR}/xerrors.c)
70 | target_link_libraries(extend_ksw2 PRIVATE common sdsl malloc_count divsufsort divsufsort64 ri ksw2 Threads::Threads)
71 | target_include_directories(extend_ksw2 PRIVATE
72 |     "../include/ms"
73 |     "../include/common"
74 |     "${ksw2_SOURCE_DIR}"
75 |     "../include/extender"
76 |     ${MONI_SLP_INCLUDE_DIRS}
77 |     "${bigbwt_SOURCE_DIR}")
78 | target_compile_features(extend_ksw2 PRIVATE cxx_std_17)
79 | 
80 | # Sequence index builder: links klib and zlib.
81 | add_executable(build_seqidx build_seqidx.cpp)
82 | target_link_libraries(build_seqidx PRIVATE common sdsl divsufsort divsufsort64 malloc_count klib z)
83 | target_include_directories(build_seqidx PRIVATE
84 |     "../include/ms"
85 |     "../include/common")
86 | target_compile_features(build_seqidx PRIVATE cxx_std_17)


--------------------------------------------------------------------------------
/src/build_seqidx.cpp:
--------------------------------------------------------------------------------
  1 | /* build_seqidx - Builds the sequence index for the reference
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file build_seqidx.cpp
 16 |    \brief build_seqidx.cpp Builds the sequence index for the reference.
 17 |    \author Massimiliano Rossi
 18 |    \date 07/08/2021
 19 | */
 20 | 
 21 | #include <iostream>
 22 | 
 23 | #define VERBOSE
 24 | 
 25 | #include <common.hpp>
 26 | 
 27 | #include <malloc_count.h>
 28 | #include <kseq.h>
 29 | #include <zlib.h>
 30 | 
 31 | KSEQ_INIT(gzFile, gzread);
 32 | 
 33 | #include <seqidx.hpp>
 34 | 
 35 | // #include <filesystem>
 36 | // namespace fs = std::filesystem;
 37 | #include <libgen.h>
 38 | 
 39 | //*********************** Argument options ***************************************
 40 | // struct containing command line parameters and other globals
 41 | struct Args
 42 | {
 43 |   std::string filename = ""; // input file (the single positional argument)
 44 |   std::string outpath = ""; // path where to output the file
 45 | };
 46 | 
 47 | // Fills arg from the command line: -o sets outpath, -h reports the usage, and the
 48 | // single remaining argument is the input file. Anything else is reported through error().
 49 | void parseArgs(int argc, char *const argv[], Args &arg)
 50 | {
 51 |   int c;
 52 |   extern char *optarg;
 53 |   extern int optind;
 54 | 
 55 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-o outpath]\n\n" +
 56 |                     "Computes the .idx file storing the sequence names and starting positions.\n" +
 57 |                     "outpath: [string]  - path to where to output the file.\n");
 58 | 
 59 |   // 'h' must be listed here, otherwise getopt() reports -h as '?' and the case below is dead.
 60 |   while ((c = getopt(argc, argv, "o:h")) != -1)
 61 |   {
 62 |     switch (c)
 63 |     {
 64 |     case 'o':
 65 |       arg.outpath.assign(optarg);
 66 |       break;
 67 |     case 'h':
 68 |       error(usage);
 69 |       exit(1); // do not fall through into the unknown-option report
 70 |     case '?':
 71 |       error("Unknown option.\n", usage);
 72 |       exit(1);
 73 |     }
 74 |   }
 75 | 
 76 |   // the only input parameter is the file name
 77 |   if (argc == optind + 1)
 78 |     arg.filename.assign(argv[optind]);
 79 |   else
 80 |     error("Invalid number of arguments\n", usage);
 81 | }
 82 | 
 83 | //********** end argument options ********************
 84 | 
 85 | // Builds the sequence index of the input file and writes it next to the input, or to outpath + basename with -o.
 86 | int main(int argc, char *const argv[])
 87 | {
 88 |   Args args;
 89 |   parseArgs(argc, argv, args);
 90 | 
 91 |   // Building the sequence idx
 92 |   verbose("Building the sequence index");
 93 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
 94 |   seqidx idx(args.filename);
 95 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
 96 | 
 97 |   verbose("Sequence index construction complete");
 98 |   verbose("Memory peak: ", malloc_count_peak());
 99 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
100 | 
101 |   // outpath is used as a plain prefix of the input's base name (no separator is added).
102 |   std::string outfile = "";
103 |   if(args.outpath == "") outfile = args.filename;
104 |   else outfile = args.outpath + std::string(basename(args.filename.data()));
105 |   outfile += idx.get_file_extension();
106 | 
107 |   // Report a file that cannot be created or written instead of silently producing no index.
108 |   std::ofstream out(outfile);
109 |   if (!out)
110 |     error("open() file " + outfile + " failed");
111 |   idx.serialize(out);
112 |   out.close();
113 |   if (!out)
114 |     error("write() file " + outfile + " failed");
115 | 
116 |   t_insert_end = std::chrono::high_resolution_clock::now();
117 |   verbose("Sequence index serialzation complete");
118 |   verbose("Memory peak: ", malloc_count_peak());
119 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
120 |   verbose("Memory peak: ", malloc_count_peak());
121 |   return 0;
122 | }


--------------------------------------------------------------------------------
/src/compress_dictionary.cpp:
--------------------------------------------------------------------------------
  1 | /* compress_dictionary - Computes the compressed dictionary from prefix-free parse dictionary
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file compress_dictionary.cpp
 16 |    \brief compress_dictionary.cpp Computes the compressed dictionary from prefix-free parse dictionary.
 17 |    \author Massimiliano Rossi
 18 |    \date 16/09/2020
 19 | */
 20 | 
 21 | #include <iostream>
 22 | 
 23 | #define VERBOSE
 24 | 
 25 | #include <common.hpp>
 26 | 
 27 | #include <malloc_count.h>
 28 | 
 29 | //*********************** Argument options ***************************************
 30 | // struct containing command line parameters and other globals
 31 | struct Args
 32 | {
 33 |   std::string filename = ""; // prefix of the input files (the single positional argument)
 34 |   size_t w = 10;             // sliding window size and its default
 35 | };
 36 | 
 37 | // Fills arg from the command line: -w sets the window size, -h reports the usage, the last argument is the input prefix.
 38 | void parseArgs(int argc, char *const argv[], Args &arg)
 39 | {
 40 |   int c;
 41 |   extern char *optarg;
 42 |   extern int optind;
 43 | 
 44 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-w wsize]\n\n" +
 45 |                     "Computes the pfp data structures of infile, provided that infile.parse, infile.dict, and infile.occ exists.\n" +
 46 |                     "  wsize: [integer] - sliding window size (def. 10)\n");
 47 | 
 48 |   // NOTE(review): the option string also accepts s, m, c, f, l, r, p and t, which are ignored below.
 49 |   while ((c = getopt(argc, argv, "w:smcfl:rhp:t:")) != -1)
 50 |   {
 51 |     switch (c)
 52 |     {
 53 |     case 'w':
 54 |       // A negative value must be rejected: converting it to size_t would wrap around.
 55 |       if (std::stoll(optarg) < 0)
 56 |         error("Invalid window size " + std::string(optarg) + "\n", usage);
 57 |       arg.w = std::stoull(optarg);
 58 |       break;
 59 |     case 'h':
 60 |       error(usage);
 61 |       exit(1); // do not fall through into the unknown-option report
 62 |     case '?':
 63 |       error("Unknown option.\n", usage);
 64 |       exit(1);
 65 |     }
 66 |   }
 67 |   // the only input parameter is the file name
 68 |   if (argc == optind + 1)
 69 |     arg.filename.assign(argv[optind]);
 70 |   else
 71 |     error("Invalid number of arguments\n", usage);
 72 | }
 73 | 
 74 | //********** end argument options ********************
 75 | 
 76 | // Runs cmd through popen() with stderr merged into stdout and returns what it printed; throws std::runtime_error on failure.
 77 | std::string execute_cmd(const char* cmd) {
 78 |     std::array<char, 256> buffer{};
 79 |     std::string output = "";
 80 |     std::string cmd_plus_stderr = std::string(cmd) + " 2>&1";
 81 |     FILE* pipe = popen(cmd_plus_stderr.data(), "r"); // Extract stderr as well
 82 |     if (!pipe) {throw std::runtime_error("popen() failed!");}
 83 |     try {
 84 |         std::size_t bytes;
 85 |         // buffer.size() is the element count; sizeof(buffer) is the size of the std::array object.
 86 |         while ((bytes = fread(buffer.data(), sizeof(char), buffer.size(), pipe))) {
 87 |             output.append(buffer.data(), bytes);
 88 |         }
 89 |     } catch (...) {
 90 |         // Close the pipe before reporting, so it is not leaked when appending throws.
 91 |         pclose(pipe);
 92 |         throw std::runtime_error("Error occurred while reading popen() stream.");
 93 |     }
 94 |     pclose(pipe);
 95 |     return output;
 96 | }
 96 | 
 97 | // Writes <prefix>.dicz (phrases without their last w characters) and <prefix>.dicz.len
 98 | // (their lengths) from <prefix>.dict. If the first phrase becomes empty it is dropped
 99 | // and <prefix>.parse is rewritten with every phrase id decreased by one.
100 | int main(int argc, char *const argv[])
101 | {
102 |   Args args;
103 |   parseArgs(argc, argv, args);
104 | 
105 |   verbose("Compressing the dictionary");
106 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
107 | 
108 |   // Open output files
109 |   std::string dicz_filename = args.filename + ".dicz";
110 |   std::string dicz_len_filename = args.filename + ".dicz.len";
111 | 
112 |   FILE *dicz;
113 |   FILE *dicz_len;
114 | 
115 |   if ((dicz = fopen(dicz_filename.c_str(), "w")) == nullptr)
116 |     error("open() file " + std::string(dicz_filename) + " failed");
117 | 
118 |   if ((dicz_len = fopen(dicz_len_filename.c_str(), "w")) == nullptr)
119 |     error("open() file " + std::string(dicz_len_filename) + " failed");
120 | 
121 |   // Open the dictionary
122 |   std::string dict_filename = args.filename + ".dict";
123 |   std::vector<uint8_t> dict;
124 |   read_file(dict_filename.c_str(), dict);
125 | 
126 |   // Generating phrase lengths
127 |   verbose("Generating phrase lengths");
128 |   std::vector<size_t> lengths(1,0);
129 | 
130 |   // Counting the number of Dollars at the beginning; the bound keeps the scan
131 |   // inside the buffer when the dictionary is empty or holds only Dollars.
132 |   size_t j = 0;
133 |   while (j < dict.size() && dict[j] == Dollar)
134 |     j++;
135 |   dict.erase(dict.begin(), dict.begin() + j);
136 | 
137 |   for(auto chr: dict)
138 |   {
139 |     // Skip the end-of-dictionary marker
140 |     if(chr == EndOfDict)
141 |       continue;
142 | 
143 |     // Hit end of phrase
144 |     if(chr == EndOfWord)
145 |       lengths.push_back(0);
146 |     else
147 |       lengths.back()++;
148 |   }
149 | 
150 |   if (lengths.back()==0)
151 |     lengths.pop_back();
152 | 
153 |   verbose("Found", lengths.size(), " phrases ");
154 | 
155 |   verbose("Generating phrases");
156 |   uint8_t* ptr = dict.data(); // Beginning of the current phrase
157 | 
158 |   bool empty_first_phrase = false;
159 |   for(size_t i = 0; i < lengths.size(); i++)
160 |   {
161 |     // Without this check the subtraction below wraps around for a short phrase.
162 |     if (lengths[i] < args.w)
163 |       error("encountered a phrase shorter than the window size.");
164 |     size_t compressed_length = lengths[i] - args.w;
165 | 
166 |     // special case: starts with a trigger string
167 |     if (i==0 && compressed_length == 0) {
168 |       ptr += lengths[i] + 1; 
169 |       empty_first_phrase = true;
170 |       continue;
171 |     } else if (i > 0 && compressed_length == 0) {
172 |       error("encountered a length=0 phrase after removing trigger string, which should not occur.");
173 |     }
174 | 
175 |     // Lengths are stored as 4-byte integers: write them from a 32-bit variable so the
176 |     // bytes do not depend on the layout of size_t, and refuse values that do not fit.
177 |     uint32_t stored_length = static_cast<uint32_t>(compressed_length);
178 |     if (stored_length != compressed_length)
179 |       error("phrase length does not fit in 4 bytes.");
180 |     if ((fwrite(&stored_length, sizeof(stored_length), 1, dicz_len)) != 1)
181 |       error("fwrite() file " + std::string(dicz_len_filename) + " failed");
182 | 
183 |     if ((fwrite(ptr, sizeof(uint8_t), compressed_length, dicz)) != compressed_length)
184 |       error("fwrite() file " + std::string(dicz_filename) + " failed");
185 | 
186 |     ptr += lengths[i] + 1;
187 |   }
188 |   fclose(dicz);
189 |   fclose(dicz_len);
190 | 
191 |   // re-writes parse file to shift down all the phrase ids by 1 
192 |   // since we removed the empty beginning phrase
193 |   if (empty_first_phrase) {
194 |     verbose("alert: found that the first phrase length is 0"
195 |             " so we will rewrite *.parse file to generated correct SLP.");
196 | 
197 |     // read in all the phrase ids in parse
198 |     std::string parse_filename = args.filename + ".parse";
199 |     std::vector<uint32_t> parse_arr;
200 |     read_file(parse_filename.c_str(), parse_arr);
201 | 
202 |     // make sure first phrase is lowest lexicographically and then remove it
203 |     if (parse_arr.empty() || parse_arr[0] != 1)
204 |       error("parse should being with lowest lexicographic phrase.");
205 |     parse_arr.erase(parse_arr.begin());
206 | 
207 |     // rename the old parse file as *.parse_with_empty_phrase; rename() is used instead of
208 |     // a shell "mv" so that file names containing spaces or shell characters are handled.
209 |     std::string old_parse_filename = args.filename + ".parse_with_empty_phrase";
210 |     if (rename(parse_filename.c_str(), old_parse_filename.c_str()) != 0)
211 |       verbose("rename() file " + parse_filename + " failed");
212 |     else
213 |       verbose("renamed " + parse_filename + " to " + old_parse_filename);
214 | 
215 |     // open new parse file for writing; a failure is fatal because the loop below writes to it
216 |     FILE* new_parse_file;
217 |     if ((new_parse_file = fopen(parse_filename.c_str(), "w")) == nullptr)
218 |       error("open() file " + parse_filename + " failed");
219 | 
220 |     // iterate through each element of parse and decrement by 1
221 |     for (size_t i = 0; i < parse_arr.size(); i++) {
222 |       if (parse_arr[i] == 1)
223 |         error("issue occurred when creating new parse file.");
224 |       parse_arr[i]--;
225 | 
226 |       // write it out 
227 |       if ((fwrite(&parse_arr[i], sizeof(uint32_t), 1, new_parse_file)) != 1)
228 |         error("fwrite() file " + parse_filename + " failed");
229 |     }
230 |     fclose(new_parse_file);
231 |   }
232 | 
233 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
234 | 
235 |   verbose("Memory peak: ", malloc_count_peak());
236 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
237 | 
238 |   verbose("Memory peak: ", malloc_count_peak());
239 | 
240 |   return 0;
241 | }


--------------------------------------------------------------------------------
/src/extend_klib.cpp:
--------------------------------------------------------------------------------
  1 | /* extend_klib - Extend the MEMs of the reads to the reference
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file extend_klib.cpp
 16 |    \brief extend_klib.cpp Extend the MEMs of the reads to the reference.
 17 |    \author Massimiliano Rossi
 18 |    \date 30/04/2021
 19 | */
 20 | 
 21 | extern "C" {
 22 | #include <xerrors.h>
 23 | }
 24 | 
 25 | #include <iostream>
 26 | 
 27 | #define VERBOSE
 28 | 
 29 | #include <common.hpp>
 30 | 
 31 | #include <sdsl/io.hpp>
 32 | 
 33 | #include <ms_pointers.hpp>
 34 | #include <extender_klib.hpp>
 35 | #include <extend_reads_dispatcher.hpp>
 36 | 
 37 | #include <malloc_count.h>
 38 | 
 39 | #include <SelfShapedSlp.hpp>
 40 | #include <DirectAccessibleGammaCode.hpp>
 41 | #include <SelectType.hpp>
 42 | #include <PlainSlp.hpp>
 43 | #include <FixedBitLenCode.hpp>
 44 | 
 45 | #include <ksw.h>
 46 | #include <ssw.h>
 47 | 
 48 | #include <libgen.h>
 49 | 
 50 | //*********************** Argument options ***************************************
 51 | // struct containing command line parameters and other globals
 52 | struct Args
 53 | {
 54 |   std::string filename = "";
 55 |   std::string patterns = ""; // path to patterns file
 56 |   size_t l = 25;             // minumum MEM length
 57 |   size_t th = 1;             // number of threads
 58 |   size_t b = 1;              // number of batches per thread pool
 59 |   bool shaped_slp = false;   // use shaped slp
 60 | };
 61 | 
 62 | void parseArgs(int argc, char *const argv[], Args &arg)
 63 | {
 64 |   int c;
 65 |   extern char *optarg;
 66 |   extern int optind;
 67 | 
 68 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" +
 69 |                     "Extends the MEMs of the reads in the pattern against the reference index in infile.\n" +
 70 |                     "shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
 71 |                     "   pattens: [string]  - path to patterns file.\n" +
 72 |                     "       len: [integer] - minimum MEM lengt (def. 25)\n" +
 73 |                     "    thread: [integer] - number of threads (def. 1)\n" +
 74 |                     "     batch: [integer] - number of batches per therad pool (def. 1)\n");
 75 | 
 76 |   std::string sarg;
 77 |   while ((c = getopt(argc, argv, "l:hp:b:t:")) != -1)
 78 |   {
 79 |     switch (c)
 80 |     {
 81 |     case 'p':
 82 |       arg.patterns.assign(optarg);
 83 |       break;
 84 |     case 'l':
 85 |       sarg.assign(optarg);
 86 |       arg.l = stoi(sarg);
 87 |       break;
 88 |     case 't':
 89 |       sarg.assign(optarg);
 90 |       arg.th = stoi(sarg);
 91 |       break;
 92 |     case 'b':
 93 |       sarg.assign(optarg);
 94 |       arg.b = stoi(sarg);
 95 |       break;
 96 |     case 'q':
 97 |       arg.shaped_slp = true;
 98 |       break;
 99 |     case 'h':
100 |       error(usage);
101 |     case '?':
102 |       error("Unknown option.\n", usage);
103 |       exit(1);
104 |     }
105 |   }
106 |   // the only input parameter is the file name
107 |   if (argc == optind + 1)
108 |   {
109 |     arg.filename.assign(argv[optind]);
110 |   }
111 |   else
112 |   {
113 |     error("Invalid number of arguments\n", usage);
114 |   }
115 | }
116 | 
117 | //********** end argument options ********************
118 | 
119 | 
120 | template<typename extender_t>
121 | void dispatcher(Args &args){
122 |   verbose("Construction of the extender");
123 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
124 | 
125 |   extender_t extender(args.filename, args.l);
126 | 
127 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
128 |   verbose("Memory peak: ", malloc_count_peak());
129 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
130 | 
131 |   verbose("Processing patterns");
132 |   t_insert_start = std::chrono::high_resolution_clock::now();
133 | 
134 |   std::string base_name = basename(args.filename.data());
135 |   std::string sam_filename = args.patterns + "_" + base_name + "_" + std::to_string(args.l);
136 | 
137 |   if (is_gzipped(args.patterns))
138 |   {
139 |     verbose("The input is gzipped - forcing single thread extension.");
140 |     args.th = 1;
141 |   }
142 | 
143 |   if (args.th == 1)
144 |     st_extend<extender_t>(&extender, args.patterns, sam_filename);
145 |   else
146 |     mt_extend<extender_t>(&extender, args.patterns, sam_filename, args.th, args.b);
147 | 
148 |   // TODO: Merge the SAM files.
149 | 
150 |   t_insert_end = std::chrono::high_resolution_clock::now();
151 | 
152 |   verbose("Memory peak: ", malloc_count_peak());
153 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
154 | 
155 |   auto mem_peak = malloc_count_peak();
156 |   verbose("Memory peak: ", malloc_count_peak());
157 | 
158 | }
159 | 
160 | int main(int argc, char *const argv[])
161 | {
162 | 
163 |   Args args;
164 |   parseArgs(argc, argv, args);
165 | 
166 |   if (args.shaped_slp)
167 |   {
168 |     dispatcher<extender<shaped_slp_t>>(args);
169 |   }
170 |   else
171 |   {
172 |     dispatcher<extender<plain_slp_t>>(args);
173 |   }
174 | 
175 |   return 0;
176 | }


--------------------------------------------------------------------------------
/src/extend_ksw2.cpp:
--------------------------------------------------------------------------------
  1 | /* extend_ksw2 - Extend the MEMs of the reads to the reference
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file extend_ksw2.cpp
 16 |    \brief extend_ksw2.cpp Extend the MEMs of the reads to the reference.
 17 |    \author Massimiliano Rossi
 18 |    \date 13/07/2020
 19 | */
 20 | 
 21 | extern "C" {
 22 | #include <xerrors.h>
 23 | }
 24 | 
 25 | #include <iostream>
 26 | 
 27 | #define VERBOSE
 28 | 
 29 | #include <common.hpp>
 30 | 
 31 | #include <sdsl/io.hpp>
 32 | 
 33 | #include <ms_pointers.hpp>
 34 | #include <extender_ksw2.hpp>
 35 | #include <extend_reads_dispatcher.hpp>
 36 | 
 37 | #include <malloc_count.h>
 38 | 
 39 | #include <SelfShapedSlp.hpp>
 40 | #include <DirectAccessibleGammaCode.hpp>
 41 | #include <SelectType.hpp>
 42 | #include <PlainSlp.hpp>
 43 | #include <FixedBitLenCode.hpp>
 44 | 
 45 | #include <libgen.h>
 46 | 
 47 | //*********************** Argument options ***************************************
 48 | // struct containing command line parameters and other globals
 49 | struct Args
 50 | {
 51 |   std::string filename = "";
 52 |   std::string patterns = ""; // path to patterns file
 53 |   std::string output   = ""; // output file prefix
 54 |   size_t l = 25;             // minumum MEM length
 55 |   size_t th = 1;             // number of threads
 56 |   size_t b = 1;              // number of batches per thread pool
 57 |   bool shaped_slp = false;   // use shaped slp
 58 |   size_t ext_len = 100;      // Extension length
 59 |   // size_t top_k = 1;       // Report the top_k alignments
 60 | 
 61 |   // ksw2 parameters
 62 |   int8_t smatch = 2;      // Match score default
 63 |   int8_t smismatch = 4;   // Mismatch score default
 64 |   int8_t gapo = 4;        // Gap open penalty
 65 |   int8_t gapo2 = 13;      // Gap open penalty
 66 |   int8_t gape = 2;        // Gap extension penalty
 67 |   int8_t gape2 = 1;       // Gap extension penalty
 68 |   // int end_bonus = 400;    // Bonus to add at the extension score to declare the alignment
 69 | 
 70 |   // int w = -1;             // Band width
 71 |   // int zdrop = -1;         // Zdrop enable
 72 | };
 73 | 
 74 | void parseArgs(int argc, char *const argv[], Args &arg)
 75 | {
 76 |   int c;
 77 |   extern char *optarg;
 78 |   extern int optind;
 79 | 
 80 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-t threads] [-l len] [-q shaped_slp] [-b batch] [-L ext_l] [-A smatch] [-B smismatc] [-O gapo] [-E gape]\n\n" +
 81 |                     "Extends the MEMs of the reads in the pattern against the reference index in infile.\n" +
 82 |                     "shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
 83 |                     "   pattens: [string]  - path to patterns file.\n" +
 84 |                     "    output: [string]  - output file prefix.\n" +
 85 |                     "       len: [integer] - minimum MEM lengt (def. 25)\n" +
 86 |                     "    thread: [integer] - number of threads (def. 1)\n" +
 87 |                     "     ext_l: [integer] - length of reference substring for extension (def. " + std::to_string(arg.ext_len) + ")\n" +
 88 |                     "    smatch: [integer] - match score value (def. " + std::to_string(arg.smatch) + ")\n" +
 89 |                     " smismatch: [integer] - mismatch penalty value (def. " + std::to_string(arg.smismatch) + ")\n" +
 90 |                     "      gapo: [integer] - gap open penalty value (def. " + std::to_string(arg.gapo) + "," + std::to_string(arg.gapo2) + ")\n" +
 91 |                     "      gape: [integer] - gap extension penalty value (def. " + std::to_string(arg.gape) + "," + std::to_string(arg.gape2) + ")\n" +
 92 |                     "     batch: [integer] - number of batches per therad pool (def. 1)\n");
 93 | 
 94 |   std::string sarg;
 95 |   char* s;
 96 |   while ((c = getopt(argc, argv, "l:hp:o:b:t:qA:B:O:E:L:")) != -1)
 97 |   {
 98 |     switch (c)
 99 |     {
100 |     case 'p':
101 |       arg.patterns.assign(optarg);
102 |       break;
103 |     case 'o':
104 |       arg.output.assign(optarg);
105 |       break;
106 |     case 'l':
107 |       sarg.assign(optarg);
108 |       arg.l = stoi(sarg);
109 |       break;
110 |     case 't':
111 |       sarg.assign(optarg);
112 |       arg.th = stoi(sarg);
113 |       break;
114 |     case 'b':
115 |       sarg.assign(optarg);
116 |       arg.b = stoi(sarg);
117 |       break;
118 |     case 'L':
119 |       sarg.assign(optarg);
120 |       arg.ext_len = stoi(sarg);
121 |       break;
122 |     case 'A':
123 |       sarg.assign(optarg);
124 |       arg.smatch = stoi(sarg);
125 |       break;
126 |     case 'B':
127 |       sarg.assign(optarg);
128 |       arg.smismatch = stoi(sarg);
129 |       break;
130 |     case 'O':
131 |       arg.gapo = arg.gapo2 = strtol(optarg, &s, 10);
132 |       if (*s == ',') arg.gapo2 = strtol(s+1, &s, 10);
133 |       break;
134 |     case 'E':
135 |       arg.gape = arg.gape2 = strtol(optarg, &s, 10);
136 |       if (*s == ',') arg.gape2 = strtol(s+1, &s, 10);
137 |       break;
138 |     case 'q':
139 |       arg.shaped_slp = true;
140 |       break;
141 |     case 'h':
142 |       error(usage);
143 |     case '?':
144 |       error("Unknown option.\n", usage);
145 |       exit(1);
146 |     }
147 |   }
148 |   // the only input parameter is the file name
149 |   if (argc == optind + 1)
150 |   {
151 |     arg.filename.assign(argv[optind]);
152 |   }
153 |   else
154 |   {
155 |     error("Invalid number of arguments\n", usage);
156 |   }
157 | }
158 | 
159 | //********** end argument options ********************
160 | 
161 | 
162 | template<typename extender_t>
163 | typename extender_t::config_t configurer(Args &args){
164 |   typename extender_t::config_t config;
165 |   
166 |   config.min_len    = args.l;           // Minimum MEM length
167 |   config.ext_len    = args.ext_len;     // Extension length
168 | 
169 |   // ksw2 parameters
170 |   config.smatch     = args.smatch;      // Match score default
171 |   config.smismatch  = args.smismatch;   // Mismatch score default
172 |   config.gapo       = args.gapo;        // Gap open penalty
173 |   config.gapo2      = args.gapo2;       // Gap open penalty
174 |   config.gape       = args.gape;        // Gap extension penalty
175 |   config.gape2      = args.gape2;       // Gap extension penalty
176 | 
177 |   return config;
178 | }
179 | 
180 | template<typename extender_t>
181 | void dispatcher(Args &args){
182 |   verbose("Construction of the extender");
183 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
184 | 
185 | 
186 |   extender_t extender(args.filename, configurer<extender_t>(args));
187 | 
188 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
189 |   verbose("Memory peak: ", malloc_count_peak());
190 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
191 | 
192 |   verbose("Processing patterns");
193 |   t_insert_start = std::chrono::high_resolution_clock::now();
194 | 
195 |   std::string base_name = basename(args.filename.data());
196 |   std::string sam_filename = args.patterns + "_" + base_name + "_" + std::to_string(args.l);
197 |   if(args.output != "")
198 |     sam_filename = args.output;
199 | 
200 |   if (is_gzipped(args.patterns))
201 |   {
202 |     verbose("The input is gzipped - forcing single thread extension.");
203 |     args.th = 1;
204 |   }
205 | 
206 |   if (args.th == 1)
207 |     st_extend<extender_t>(&extender, args.patterns, sam_filename);
208 |   else
209 |     mt_extend<extender_t>(&extender, args.patterns, sam_filename, args.th, args.b);
210 | 
211 |   // TODO: Merge the SAM files.
212 | 
213 |   t_insert_end = std::chrono::high_resolution_clock::now();
214 | 
215 |   verbose("Memory peak: ", malloc_count_peak());
216 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
217 | 
218 |   auto mem_peak = malloc_count_peak();
219 |   verbose("Memory peak: ", malloc_count_peak());
220 | }
221 | 
222 | int main(int argc, char *const argv[])
223 | {
224 | 
225 |   Args args;
226 |   parseArgs(argc, argv, args);
227 | 
228 |   if (args.shaped_slp)
229 |   {
230 |     dispatcher<extender<shaped_slp_t, ms_pointers<>>>(args);
231 |   }
232 |   else
233 |   {
234 |     dispatcher<extender<plain_slp_t, ms_pointers<>>>(args);
235 |   }
236 | 
237 |   return 0;
238 | }


--------------------------------------------------------------------------------
/src/matching_statistics.cpp:
--------------------------------------------------------------------------------
  1 | /* matching_statistics - Computes the matching statistics from BWT and Thresholds
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file matching_statistics.cpp
 16 |    \brief matching_statistics.cpp Computes the matching statistics from BWT and Thresholds.
 17 |    \author Massimiliano Rossi
 18 |    \date 13/07/2020
 19 | */
 20 | 
 21 | extern "C" {
 22 | #include <xerrors.h>
 23 | }
 24 | 
 25 | #include <iostream>
 26 | 
 27 | #define VERBOSE
 28 | 
 29 | #include <common.hpp>
 30 | 
 31 | #include <sdsl/io.hpp>
 32 | 
 33 | #include <ms_pointers.hpp>
 34 | 
 35 | #include <malloc_count.h>
 36 | 
 37 | #include <SelfShapedSlp.hpp>
 38 | #include <DirectAccessibleGammaCode.hpp>
 39 | #include <SelectType.hpp>
 40 | #include <PlainSlp.hpp>
 41 | #include <FixedBitLenCode.hpp>
 42 | 
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | /// kseq extra
 45 | ////////////////////////////////////////////////////////////////////////////////
 46 | 
 47 | static inline size_t ks_tell(kseq_t *seq)
 48 | {
 49 |   return gztell(seq->f->f) - seq->f->end + seq->f->begin;
 50 | }
 51 | 
 52 | void copy_kstring_t(kstring_t &l, kstring_t &r)
 53 | {
 54 |   l.l = r.l;
 55 |   l.m = r.m;
 56 |   l.s = (char *)malloc(l.m);
 57 |   for (size_t i = 0; i < r.m; ++i)
 58 |     l.s[i] = r.s[i];
 59 | }
 60 | void copy_kseq_t(kseq_t *l, kseq_t *r)
 61 | {
 62 |   copy_kstring_t(l->name, r->name);
 63 |   copy_kstring_t(l->comment, r->comment);
 64 |   copy_kstring_t(l->seq, r->seq);
 65 |   copy_kstring_t(l->qual, r->qual);
 66 |   l->last_char = r->last_char;
 67 | }
 68 | 
 69 | ////////////////////////////////////////////////////////////////////////////////
 70 | /// Parallel computation
 71 | ////////////////////////////////////////////////////////////////////////////////
 72 | 
// Find the offset of the next FASTQ record at or after the current position
// of `fp`.
//
// Returns 0 when `fp` is at offset 0 and the first byte is '@'. Otherwise the
// stream is moved one byte back and the first character of each of the next
// four lines (the characters following the next four '\n') is collected
// together with its offset. The offset of an '@' line that has a '+' line two
// lines after it is returned; failing that, the offset of an '@' line that
// has a '+' line two lines before it. If the end of file is reached first, or
// no such pair is found, the current stream offset is returned.
//
// The position of `fp` is modified.
// This should be done using buffering.
size_t next_start_fastq(gzFile fp)
{
  int c;
  // Special case when we are at the beginning of the file.
  if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@')
    return 0;

  // Start from the previous character
  gzseek(fp, -1, SEEK_CUR);

  // (first character of a line, offset of that character)
  std::vector<std::pair<int, size_t>> window;
  // Find the first new line
  for (size_t i = 0; i < 4; ++i)
  {
    // Skip to the end of the current line.
    while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n'))
    {
    }
    if (c == EOF)
      return gztell(fp);
    // Read the first character of the following line.
    if ((c = gzgetc(fp)) == EOF)
      return gztell(fp);
    // gztell() is already past the character just read, hence the -1.
    window.push_back(std::make_pair(c, gztell(fp) - 1));
  }

  // Look for an '@' line and a '+' line that are exactly two lines apart.
  for (size_t i = 0; i < 2; ++i)
  {
    if (window[i].first == '@' && window[i + 2].first == '+')
      return window[i].second;
    if (window[i].first == '+' && window[i + 2].first == '@')
      return window[i + 2].second;
  }

  return gztell(fp);
}
108 | 
109 | // test if the file is gzipped
110 | static inline bool is_gzipped(std::string filename)
111 | {
112 |   FILE *fp = fopen(filename.c_str(), "rb");
113 |   if(fp == NULL) error("Opening file " + filename);
114 |   int byte1 = 0, byte2 = 0;
115 |   fread(&byte1, sizeof(char), 1, fp);
116 |   fread(&byte2, sizeof(char), 1, fp);
117 |   fclose(fp);
118 |   return (byte1 == 0x1f && byte2 == 0x8b);
119 | }
120 | 
121 | // Return the length of the file
122 | // Assumes that the file is not compressed
123 | static inline size_t get_file_size(std::string filename)
124 | {
125 |   if (is_gzipped(filename))
126 |   {
127 |     std::cerr << "The input is gzipped!" << std::endl;
128 |     return -1;
129 |   }
130 |   FILE *fp = fopen(filename.c_str(), "r");
131 |   fseek(fp, 0L, SEEK_END);
132 |   size_t size = ftell(fp);
133 |   fclose(fp);
134 |   return size;
135 | }
136 | 
137 | std::vector<size_t> split_fastq(std::string filename, size_t n_threads)
138 | {
139 |   //Precondition: the file is not gzipped
140 |   // scan file for start positions and execute threads
141 |   size_t size = get_file_size(filename);
142 | 
143 |   gzFile fp = gzopen(filename.c_str(), "r");
144 |   if (fp == Z_NULL)
145 |   {
146 |     throw new std::runtime_error("Cannot open input file " + filename);
147 |   }
148 | 
149 |   std::vector<size_t> starts(n_threads + 1);
150 |   for (int i = 0; i < n_threads + 1; ++i)
151 |   {
152 |     size_t start = (size_t)((size * i) / n_threads);
153 |     gzseek(fp, start, SEEK_SET);
154 |     starts[i] = next_start_fastq(fp);
155 |   }
156 |   gzclose(fp);
157 |   return starts;
158 | }
159 | 
160 | ////////////////////////////////////////////////////////////////////////////////
161 | /// SLP definitions
162 | ////////////////////////////////////////////////////////////////////////////////
163 | 
// Short names for the SLP component types ...
using SelSd = SelectSdvec<>;
using DagcSd = DirectAccessibleGammaCode<SelSd>;
using Fblc = FixedBitLenCode<>;

// ... and the two concrete SLP types this file instantiates its templates
// with (see get_slp_file_extension below for their file extensions).
using shaped_slp_t = SelfShapedSlp<uint32_t, DagcSd, DagcSd, SelSd>;
using plain_slp_t = PlainSlp<uint32_t, Fblc, Fblc>;
170 | 
171 | template <typename slp_t>
172 | std::string get_slp_file_extension()
173 | {
174 |   return std::string(".slp");
175 | }
176 | 
177 | template <>
178 | std::string get_slp_file_extension<shaped_slp_t>()
179 | {
180 |   return std::string(".slp");
181 | }
182 | 
183 | template <>
184 | std::string get_slp_file_extension<plain_slp_t>()
185 | {
186 |   return std::string(".plain.slp");
187 | }
188 | ////////////////////////////////////////////////////////////////////////////////
189 | 
190 | template <typename slp_t>
191 | class ms_c
192 | {
193 | public:
194 | 
195 |   ms_c(std::string filename)
196 |   {
197 |     verbose("Loading the matching statistics index");
198 |     std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
199 | 
200 |     std::string filename_ms = filename + ms.get_file_extension();
201 | 
202 |     ifstream fs_ms(filename_ms);
203 |     ms.load(fs_ms);
204 |     fs_ms.close();
205 | 
206 |     std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
207 | 
208 |     verbose("Matching statistics index construction complete");
209 |     verbose("Memory peak: ", malloc_count_peak());
210 |     verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
211 | 
212 |     verbose("Loading random access");
213 |     t_insert_start = std::chrono::high_resolution_clock::now();
214 | 
215 |     std::string filename_slp = filename + get_slp_file_extension<slp_t>();
216 | 
217 |     ifstream fs(filename_slp);
218 |     ra.load(fs);
219 |     fs.close();
220 | 
221 |     n = ra.getLen();
222 | 
223 |     t_insert_end = std::chrono::high_resolution_clock::now();
224 | 
225 |     verbose("Matching statistics index loading complete");
226 |     verbose("Memory peak: ", malloc_count_peak());
227 |     verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
228 |   }
229 | 
230 |   // Destructor
231 |   ~ms_c() 
232 |   {
233 |       // NtD
234 |   }
235 | 
236 |   // The outfile has the following format. The first size_t integer store the
237 |   // length l of the name. Then the following l characters stores the name of
238 |   // the read. The following size_t integer store the length l of the query. 
239 |   // Then the following l size_t integers stores the pointers of the 
240 |   // matching statistics, and the following l size_t integers stores the lengths
241 |   // of the mathcing statistics.
242 |   void matching_statistics(kseq_t *read, FILE* out)
243 |   {
244 |     auto pointers = ms.query(read->seq.s, read->seq.l);
245 |     std::vector<size_t> lengths(pointers.size());
246 |     size_t l = 0;
247 |     for (size_t i = 0; i < pointers.size(); ++i)
248 |     {
249 |       size_t pos = pointers[i];
250 |       while ((i + l) < read->seq.l && (pos + l) < n && (i < 1 || pos != (pointers[i-1] + 1) ) && read->seq.s[i + l] == ra.charAt(pos + l))
251 |         ++l;
252 | 
253 |       lengths[i] = l;
254 |       l = (l == 0 ? 0 : (l - 1));
255 |     }
256 | 
257 |     // Original MS computation
258 |     // for (size_t i = 0; i < pointers.size(); ++i)
259 |     // {
260 |     //   size_t pos = pointers[i];
261 |     //   while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l))
262 |     //     ++l;
263 | 
264 |     //   lengths[i] = l;
265 |     //   l = (l == 0 ? 0 : (l - 1));
266 |     // }
267 | 
268 |     assert(lengths.size() == pointers.size());
269 | 
270 |     size_t h_length = read->name.l;
271 |     fwrite(&h_length, sizeof(size_t), 1,out);
272 |     fwrite(read->name.s, sizeof(char),h_length,out);
273 | 
274 |     size_t q_length = pointers.size();
275 |     fwrite(&q_length, sizeof(size_t), 1,out);
276 |     fwrite(pointers.data(), sizeof(size_t),q_length,out);
277 |     fwrite(lengths.data(), sizeof(size_t),q_length,out);
278 |   }
279 | 
280 | protected:
281 |   ms_pointers<> ms;
282 |   slp_t ra;
283 |   size_t n = 0;
284 | };
285 | 
286 | 
287 | 
// Complement of an upper-case nucleotide: A <-> T and G <-> C.
// Any other character (including lower-case letters) is returned unchanged.
char complement(char n)
{
  if (n == 'A')
    return 'T';
  if (n == 'T')
    return 'A';
  if (n == 'G')
    return 'C';
  if (n == 'C')
    return 'G';
  return n;
}
304 | 
// Argument bundle handed to one mt_ms_worker thread through the void*
// parameter of the thread entry point.
template <typename ms_t>
struct mt_param_t
{
  // Parameters
  ms_t *ms;                     // matching statistics object the worker calls into
  std::string pattern_filename; // patterns file the worker reads
  std::string out_filename;     // output file written by this worker
  size_t start;                 // offset in the patterns file where the worker starts
  size_t end;                   // offset at which the worker stops reading records
  size_t wk_id;                 // index of the worker
};
316 | 
317 | template <typename ms_t>
318 | void *mt_ms_worker(void *param)
319 | {
320 |   mt_param_t<ms_t> *p = (mt_param_t<ms_t>*) param;
321 |   size_t n_reads = 0;
322 |   size_t n_processed_reads = 0;
323 | 
324 |   FILE *out_fd;
325 |   gzFile fp;
326 | 
327 |   if ((out_fd = fopen(p->out_filename.c_str(), "w")) == nullptr)
328 |     error("open() file " + p->out_filename + " failed");
329 | 
330 |   if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL)
331 |     error("open() file " + p->pattern_filename + " failed");
332 | 
333 |   gzseek(fp, p->start, SEEK_SET);
334 | 
335 |   kseq_t rev;
336 |   int l;
337 | 
338 |   kseq_t *seq = kseq_init(fp);
339 |   while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0))
340 |   {
341 | 
342 |     p->ms->matching_statistics(seq,out_fd);
343 | 
344 |   }
345 | 
346 |   kseq_destroy(seq);
347 |   gzclose(fp);
348 |   fclose(out_fd);
349 | 
350 |   return NULL;
351 | }
352 | 
353 | template <typename ms_t>
354 | void mt_ms(ms_t *ms, std::string pattern_filename, std::string out_filename, size_t n_threads)
355 | {
356 |   pthread_t t[n_threads] = {0};
357 |   mt_param_t<ms_t> params[n_threads];
358 |   std::vector<size_t> starts = split_fastq(pattern_filename, n_threads);
359 |   for(size_t i = 0; i < n_threads; ++i)
360 |   {
361 |     params[i].ms = ms;
362 |     params[i].pattern_filename = pattern_filename;
363 |     params[i].out_filename = out_filename + "_" + std::to_string(i) + ".ms.tmp.out";
364 |     params[i].start = starts[i];
365 |     params[i].end = starts[i+1];
366 |     params[i].wk_id = i;
367 |     xpthread_create(&t[i], NULL, &mt_ms_worker<ms_t>, &params[i], __LINE__, __FILE__);
368 |   }
369 | 
370 |   for(size_t i = 0; i < n_threads; ++i)
371 |   {
372 |     xpthread_join(t[i],NULL,__LINE__,__FILE__);
373 |   }
374 | 
375 |   // sleep(5);
376 | 
377 | 
378 |   return;
379 | }
380 | 
381 | 
382 | ////////////////////////////////////////////////////////////////////////////////
383 | /// Single Thread
384 | ////////////////////////////////////////////////////////////////////////////////
385 | template <typename ms_t>
386 | size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename)
387 | {
388 |   size_t n_reads = 0;
389 |   size_t n_processed_reads = 0;
390 |   kseq_t rev;
391 |   int l;
392 |   FILE *out_fd;
393 | 
394 |   out_filename += "_0.ms.tmp.out";
395 | 
396 |   if ((out_fd = fopen(out_filename.c_str(), "w")) == nullptr)
397 |     error("open() file " + out_filename + " failed");
398 | 
399 |   gzFile fp = gzopen(pattern_filename.c_str(), "r");
400 |   kseq_t* seq = kseq_init(fp);
401 |   while ((l = kseq_read(seq)) >= 0)
402 |   {
403 | 
404 |     ms->matching_statistics(seq, out_fd);
405 | 
406 |   }
407 | 
408 |   kseq_destroy(seq);
409 |   gzclose(fp);
410 |   fclose(out_fd);
411 | 
412 |   // sleep(5);
413 | 
414 |   return n_processed_reads;
415 | }
416 | 
417 | 
// A pattern as a (string, byte vector) pair.
// NOTE(review): presumably (read name, sequence); no use is visible in this part of the file - confirm.
typedef std::pair<std::string, std::vector<uint8_t>> pattern_t;
419 | 
420 | //*********************** Argument options ***************************************
// Command line parameters (and other globals) of matching_statistics.
struct Args
{
  std::string filename{};   // positional argument: the input file name
  std::string patterns{};   // path to the patterns file (-p)
  std::string output{};     // output file prefix (-o)
  size_t l{25};             // minimum MEM length (-l)
  size_t th{1};             // number of threads (-t)
  bool shaped_slp{false};   // use the shaped SLP (-q)
};
431 | 
432 | void parseArgs(int argc, char *const argv[], Args &arg)
433 | {
434 |   int c;
435 |   extern char *optarg;
436 |   extern int optind;
437 | 
438 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-b batch]\n\n" +
439 |                     "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" +
440 |                     "shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
441 |                     "   pattens: [string]  - path to patterns file.\n" +
442 |                     "    output: [string]  - output file prefix.\n" +
443 |                     "       len: [integer] - minimum MEM lengt (def. 25)\n" +
444 |                     "    thread: [integer] - number of threads (def. 1)\n");
445 | 
446 |   std::string sarg;
447 |   while ((c = getopt(argc, argv, "l:hp:o:t:")) != -1)
448 |   {
449 |     switch (c)
450 |     {
451 |     case 'p':
452 |       arg.patterns.assign(optarg);
453 |       break;
454 |     case 'o':
455 |       arg.output.assign(optarg);
456 |       break;
457 |     case 'l':
458 |       sarg.assign(optarg);
459 |       arg.l = stoi(sarg);
460 |       break;
461 |     case 't':
462 |       sarg.assign(optarg);
463 |       arg.th = stoi(sarg);
464 |       break;
465 |     case 'q':
466 |       arg.shaped_slp = true;
467 |       break;
468 |     case 'h':
469 |       error(usage);
470 |     case '?':
471 |       error("Unknown option.\n", usage);
472 |       exit(1);
473 |     }
474 |   }
475 |   // the only input parameter is the file name
476 |   if (argc == optind + 1)
477 |   {
478 |     arg.filename.assign(argv[optind]);
479 |   }
480 |   else
481 |   {
482 |     error("Invalid number of arguments\n", usage);
483 |   }
484 | }
485 | 
486 | //********** end argument options ********************
487 | 
488 | template <typename ms_t>
489 | void dispatcher(Args &args)
490 | {
491 |   verbose("Construction of the matching statistics data structure");
492 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
493 | 
494 |   ms_t ms(args.filename);
495 | 
496 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
497 |   verbose("Memory peak: ", malloc_count_peak());
498 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
499 | 
500 |   verbose("Processing patterns");
501 |   t_insert_start = std::chrono::high_resolution_clock::now();
502 | 
503 |   std::string base_name = basename(args.filename.data());
504 |   std::string out_filename = args.patterns + "_" + base_name;
505 |   if(args.output != "")
506 |     out_filename = args.output;
507 | 
508 |   if (is_gzipped(args.patterns))
509 |   {
510 |     verbose("The input is gzipped - forcing single thread matching statistics.");
511 |     args.th = 1;
512 |   }
513 | 
514 |   if (args.th == 1)
515 |     st_ms<ms_t>(&ms, args.patterns, out_filename);
516 |   else
517 |     mt_ms<ms_t>(&ms, args.patterns, out_filename, args.th);
518 | 
519 |   // TODO: Merge the SAM files.
520 | 
521 |   t_insert_end = std::chrono::high_resolution_clock::now();
522 | 
523 |   verbose("Memory peak: ", malloc_count_peak());
524 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
525 | 
526 |   auto mem_peak = malloc_count_peak();
527 |   verbose("Memory peak: ", malloc_count_peak());
528 | 
529 |   verbose("Printing plain output");
530 |   t_insert_start = std::chrono::high_resolution_clock::now();
531 | 
532 |   std::ofstream f_pointers(out_filename + ".pointers");
533 |   std::ofstream f_lengths(out_filename + ".lengths");
534 | 
535 |   if (!f_pointers.is_open())
536 |     error("open() file " + std::string(out_filename) + ".pointers failed");
537 | 
538 |   if (!f_lengths.is_open())
539 |     error("open() file " + std::string(out_filename) + ".lengths failed");
540 | 
541 |   size_t n_seq = 0;
542 |   for (size_t i = 0; i < args.th; ++i)
543 |   {
544 |     std::string tmp_filename = out_filename + "_" + std::to_string(i) + ".ms.tmp.out";
545 |     FILE *in_fd;
546 | 
547 |     if ((in_fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
548 |       error("open() file " + tmp_filename + " failed");
549 | 
550 |     size_t length = 0;
551 |     size_t m = 100; // Reserved size for pointers and lengths
552 |     size_t *mem = (size_t *)malloc(m * sizeof(size_t));
553 |     size_t s = 100; // Reserved size for read name
554 |     char* rname = (char *)malloc(s * sizeof(char));
555 |     while (!feof(in_fd) and fread(&length, sizeof(size_t), 1, in_fd) > 0)
556 |     {
557 |       // Reading read name
558 |       if (s < length)
559 |       {
560 |         // Resize lengths and pointers
561 |         s = length;
562 |         rname = (char *)realloc(rname, m * sizeof(char));
563 |       }
564 | 
565 |       if ((fread(rname, sizeof(char), length, in_fd)) != length)
566 |         error("fread() file " + std::string(tmp_filename) + " failed");
567 | 
568 |       // TODO: Store the fasta headers somewhere
569 |       f_pointers << ">" + std::string(rname,length) << endl;
570 |       f_lengths << ">" + std::string(rname,length) << endl;
571 | 
572 |       // Reading MEMs
573 |       if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1)
574 |         error("fread() file " + std::string(tmp_filename) + " failed");
575 | 
576 |       if (m < length)
577 |       {
578 |         // Resize lengths and pointers
579 |         m = length;
580 |         mem = (size_t *)realloc(mem, m * sizeof(size_t));
581 |       }
582 | 
583 |       if ((fread(mem, sizeof(size_t), length, in_fd)) != length)
584 |         error("fread() file " + std::string(tmp_filename) + " failed");
585 | 
586 |       // TODO: Store the fasta headers somewhere
587 |       // f_pointers << ">" + std::to_string(n_seq) << endl;
588 |       for (size_t i = 0; i < length; ++i)
589 |         f_pointers << mem[i] << " ";
590 |       f_pointers << endl;
591 | 
592 |       if ((fread(mem, sizeof(size_t), length, in_fd)) != length)
593 |         error("fread() file " + std::string(tmp_filename) + " failed");
594 | 
595 |       // f_lengths << ">" + std::to_string(n_seq) << endl;
596 |       for (size_t i = 0; i < length; ++i)
597 |         f_lengths << mem[i] << " ";
598 |       f_lengths << endl;
599 | 
600 |       n_seq++;
601 |     }
602 |     fclose(in_fd);
603 |     if (std::remove(tmp_filename.c_str()) != 0)
604 |       error("remove() file " + tmp_filename + " failed");
605 |   }
606 | 
607 |   f_pointers.close();
608 |   f_lengths.close();
609 | 
610 |   t_insert_end = std::chrono::high_resolution_clock::now();
611 | 
612 |   verbose("Memory peak: ", malloc_count_peak());
613 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
614 | 
615 |   mem_peak = malloc_count_peak();
616 |   verbose("Memory peak: ", malloc_count_peak());
617 | }
618 | 
619 | int main(int argc, char *const argv[])
620 | {
621 |   Args args;
622 |   parseArgs(argc, argv, args);
623 | 
624 |   if (args.shaped_slp)
625 |   {
626 |     dispatcher<ms_c<shaped_slp_t>>(args);
627 |   }
628 |   else
629 |   {
630 |     dispatcher<ms_c<plain_slp_t>>(args);
631 |   }
632 |   return 0;
633 | }


--------------------------------------------------------------------------------
/src/mems.cpp:
--------------------------------------------------------------------------------
  1 | /* mems - Computes the MEMs from BWT and Thresholds
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file mems.cpp
 16 |    \brief mems.cpp Computes the MEMs from BWT and Thresholds.
 17 |    \author Massimiliano Rossi
 18 |    \date 13/07/2020
 19 | */
 20 | 
 21 | extern "C" {
 22 | #include <xerrors.h>
 23 | }
 24 | 
 25 | #include <iostream>
 26 | 
 27 | #define VERBOSE
 28 | 
 29 | #include <common.hpp>
 30 | 
 31 | #include <sdsl/io.hpp>
 32 | 
 33 | #include <ms_pointers.hpp>
 34 | 
 35 | #include <malloc_count.h>
 36 | 
 37 | #include <SelfShapedSlp.hpp>
 38 | #include <DirectAccessibleGammaCode.hpp>
 39 | #include <SelectType.hpp>
 40 | #include <PlainSlp.hpp>
 41 | #include <FixedBitLenCode.hpp>
 42 | 
 43 | #include <seqidx.hpp>
 44 | 
 45 | ////////////////////////////////////////////////////////////////////////////////
 46 | /// kseq extra
 47 | ////////////////////////////////////////////////////////////////////////////////
 48 | 
 49 | static inline size_t ks_tell(kseq_t *seq)
 50 | {
 51 |   return gztell(seq->f->f) - seq->f->end + seq->f->begin;
 52 | }
 53 | 
 54 | void copy_kstring_t(kstring_t &l, kstring_t &r)
 55 | {
 56 |   l.l = r.l;
 57 |   l.m = r.m;
 58 |   l.s = (char *)malloc(l.m);
 59 |   for (size_t i = 0; i < r.m; ++i)
 60 |     l.s[i] = r.s[i];
 61 | }
 62 | void copy_kseq_t(kseq_t *l, kseq_t *r)
 63 | {
 64 |   copy_kstring_t(l->name, r->name);
 65 |   copy_kstring_t(l->comment, r->comment);
 66 |   copy_kstring_t(l->seq, r->seq);
 67 |   copy_kstring_t(l->qual, r->qual);
 68 |   l->last_char = r->last_char;
 69 | }
 70 | 
 71 | ////////////////////////////////////////////////////////////////////////////////
 72 | /// Parallel computation
 73 | ////////////////////////////////////////////////////////////////////////////////
 74 | 
// This should be done using buffering.
//
// Starting from the current position of `fp`, looks for the offset at which a
// FASTQ record begins and returns it.
//
// The function collects, for up to four consecutive line starts following the
// current position, the first character of the line and its offset. A record
// start is a line beginning with '@' whose second next line begins with '+';
// this disambiguates header lines from quality lines that happen to start
// with '@'.
//
// Returns 0 when called at offset 0 on a file whose first character is '@';
// returns the current gztell() when the end of file is reached while scanning
// or when no '@'/'+' pattern is found in the window.
size_t next_start_fastq(gzFile fp)
{
  int c;
  // Special case when we are at the beginning of the file.
  // Note: gzgetc is only evaluated (and consumes a character) when the
  // position is 0.
  if ((gztell(fp) == 0) && ((c = gzgetc(fp)) != EOF) && c == '@')
    return 0;

  // Start from the previous character
  gzseek(fp, -1, SEEK_CUR);

  // (first character of a line, offset of that character)
  std::vector<std::pair<int, size_t>> window;
  // Find the next four line starts
  for (size_t i = 0; i < 4; ++i)
  {
    // Skip to the end of the current line
    while (((c = gzgetc(fp)) != EOF) && (c != (int)'\n'))
    {
    }
    if (c == EOF)
      return gztell(fp);
    // Read the first character of the following line
    if ((c = gzgetc(fp)) == EOF)
      return gztell(fp);
    // gztell is one past the character just read
    window.push_back(std::make_pair(c, gztell(fp) - 1));
  }

  for (size_t i = 0; i < 2; ++i)
  {
    // '@' at line i and '+' two lines later: line i is a header
    if (window[i].first == '@' && window[i + 2].first == '+')
      return window[i].second;
    // '+' at line i and '@' two lines later: line i + 2 is the next header
    if (window[i].first == '+' && window[i + 2].first == '@')
      return window[i + 2].second;
  }

  return gztell(fp);
}
110 | 
111 | // test if the file is gzipped
112 | static inline bool is_gzipped(std::string filename)
113 | {
114 |   FILE *fp = fopen(filename.c_str(), "rb");
115 |   if(fp == NULL) error("Opening file " + filename);
116 |   int byte1 = 0, byte2 = 0;
117 |   fread(&byte1, sizeof(char), 1, fp);
118 |   fread(&byte2, sizeof(char), 1, fp);
119 |   fclose(fp);
120 |   return (byte1 == 0x1f && byte2 == 0x8b);
121 | }
122 | 
123 | // Return the length of the file
124 | // Assumes that the file is not compressed
125 | static inline size_t get_file_size(std::string filename)
126 | {
127 |   if (is_gzipped(filename))
128 |   {
129 |     std::cerr << "The input is gzipped!" << std::endl;
130 |     return -1;
131 |   }
132 |   FILE *fp = fopen(filename.c_str(), "r");
133 |   fseek(fp, 0L, SEEK_END);
134 |   size_t size = ftell(fp);
135 |   fclose(fp);
136 |   return size;
137 | }
138 | 
139 | std::vector<size_t> split_fastq(std::string filename, size_t n_threads)
140 | {
141 |   //Precondition: the file is not gzipped
142 |   // scan file for start positions and execute threads
143 |   size_t size = get_file_size(filename);
144 | 
145 |   gzFile fp = gzopen(filename.c_str(), "r");
146 |   if (fp == Z_NULL)
147 |   {
148 |     throw new std::runtime_error("Cannot open input file " + filename);
149 |   }
150 | 
151 |   std::vector<size_t> starts(n_threads + 1);
152 |   for (int i = 0; i < n_threads + 1; ++i)
153 |   {
154 |     size_t start = (size_t)((size * i) / n_threads);
155 |     gzseek(fp, start, SEEK_SET);
156 |     starts[i] = next_start_fastq(fp);
157 |   }
158 |   gzclose(fp);
159 |   return starts;
160 | }
161 | 
162 | ////////////////////////////////////////////////////////////////////////////////
163 | /// SLP definitions
164 | ////////////////////////////////////////////////////////////////////////////////
165 | 
// Short names for the template arguments used to instantiate the SLP types
// below.
using SelSd = SelectSdvec<>;
using DagcSd = DirectAccessibleGammaCode<SelSd>;
using Fblc = FixedBitLenCode<>;

// The two SLP types the tool can be instantiated with; main() selects one of
// them according to Args::shaped_slp.
using shaped_slp_t = SelfShapedSlp<uint32_t, DagcSd, DagcSd, SelSd>;
using plain_slp_t = PlainSlp<uint32_t, Fblc, Fblc>;
172 | 
// Extension of the file storing an SLP of type `slp_t`.
// Generic fallback: ".slp".
template <typename slp_t>
std::string get_slp_file_extension()
{
  const std::string extension(".slp");
  return extension;
}
178 | 
179 | template <>
180 | std::string get_slp_file_extension<shaped_slp_t>()
181 | {
182 |   return std::string(".slp");
183 | }
184 | 
185 | template <>
186 | std::string get_slp_file_extension<plain_slp_t>()
187 | {
188 |   return std::string(".plain.slp");
189 | }
190 | ////////////////////////////////////////////////////////////////////////////////
191 | 
192 | template <typename slp_t, bool sam_output = false>
193 | class mems_c
194 | {
195 | public:
196 | 
197 |   mems_c(std::string filename)
198 |   {
199 |     verbose("Loading the matching statistics index");
200 |     std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
201 | 
202 |     std::string filename_ms = filename + ms.get_file_extension();
203 | 
204 |     ifstream fs_ms(filename_ms);
205 |     ms.load(fs_ms);
206 |     fs_ms.close();
207 | 
208 |     std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
209 | 
210 |     verbose("Matching statistics index construction complete");
211 |     verbose("Memory peak: ", malloc_count_peak());
212 |     verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
213 | 
214 |     verbose("Loading random access");
215 |     t_insert_start = std::chrono::high_resolution_clock::now();
216 | 
217 |     std::string filename_slp = filename + get_slp_file_extension<slp_t>();
218 | 
219 |     ifstream fs(filename_slp);
220 |     ra.load(fs);
221 |     fs.close();
222 | 
223 |     n = ra.getLen();
224 | 
225 |     t_insert_end = std::chrono::high_resolution_clock::now();
226 | 
227 |     verbose("Matching statistics index loading complete");
228 |     verbose("Memory peak: ", malloc_count_peak());
229 |     verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
230 |   }
231 | 
232 |   // Destructor
233 |   ~mems_c() 
234 |   {
235 |       // NtD
236 |   }
237 | 
238 |   // The outfile has the following format. The first size_t integer store the
239 |   // length l of the name. Then the following l characters stores the name of
240 |   // the read, the next size_t integer stores the number m of MEMs, and the 
241 |   // following m size_t pairs of integers stores the positions and lengths of 
242 |   // the MEMs.
243 |   void maximal_exact_matches(kseq_t *read, FILE* out)
244 |   {
245 |     auto pointers = ms.query(read->seq.s, read->seq.l);
246 |     std::vector<size_t> lengths(pointers.size());
247 |     std::vector<std::tuple<size_t,size_t,size_t>> mems;
248 | 
249 |     size_t l = 0;
250 |     for (size_t i = 0; i < pointers.size(); ++i)
251 |     {
252 |       size_t pos = pointers[i];
253 |       while ((i + l) < read->seq.l && (pos + l) < n && (i < 1 || pos != (pointers[i-1] + 1) ) && read->seq.s[i + l] == ra.charAt(pos + l))
254 |         ++l;
255 | 
256 |       lengths[i] = l;
257 |       l = (l == 0 ? 0 : (l - 1));
258 |  
259 |       if((i == 0) or (lengths[i] >= lengths[i-1]))
260 |         mems.push_back(make_tuple(i,lengths[i],pos));
261 |     }
262 | 
263 |     // Original MS computation
264 |     // for (size_t i = 0; i < pointers.size(); ++i)
265 |     // {
266 |     //   size_t pos = pointers[i];
267 |     //   while ((i + l) < read->seq.l && (pos + l) < n && read->seq.s[i + l] == ra.charAt(pos + l))
268 |     //     ++l;
269 | 
270 |     //   lengths[i] = l;
271 |     //   l = (l == 0 ? 0 : (l - 1));
272 |     // }
273 | 
274 |     assert(lengths.size() == pointers.size());
275 | 
276 |     size_t h_length = read->name.l;
277 |     fwrite(&h_length, sizeof(size_t), 1,out);
278 |     fwrite(read->name.s, sizeof(char),h_length,out);
279 |     if(sam_output)
280 |     {
281 |       size_t s_length = read->seq.l;
282 |       fwrite(&s_length, sizeof(size_t), 1,out);
283 |       fwrite(read->seq.s, sizeof(char),s_length,out);
284 |       fwrite(read->qual.s, sizeof(char),s_length,out);
285 |     }
286 |     size_t q_length = mems.size();
287 |     fwrite(&q_length, sizeof(size_t), 1,out);
288 |     fwrite(mems.data(), sizeof(std::tuple<size_t,size_t,size_t>),q_length,out);
289 |   }
290 | 
291 | protected:
292 |   ms_pointers<> ms;
293 |   slp_t ra;
294 |   size_t n = 0;
295 | };
296 | 
297 | 
298 | 
// Returns the complement of an upper-case nucleotide (A<->T, C<->G).
// Any other character, lower-case letters included, is returned unchanged.
char complement(char n)
{
  if (n == 'A')
    return 'T';
  if (n == 'T')
    return 'A';
  if (n == 'G')
    return 'C';
  if (n == 'C')
    return 'G';
  return n;
}
315 | 
// Arguments handed to one mt_ms_worker thread; filled in by mt_ms.
template <typename ms_t>
struct mt_param_t
{
  // Parameters
  ms_t *ms;                     // index shared by all the workers
  std::string pattern_filename; // file containing the reads
  std::string out_filename;     // temporary output file of this worker
  size_t start;                 // offset in the pattern file where the worker starts reading
  size_t end;                   // offset at which the worker stops reading new records
  size_t wk_id;                 // worker id (index of the thread)
};
327 | 
328 | template <typename ms_t>
329 | void *mt_ms_worker(void *param)
330 | {
331 |   mt_param_t<ms_t> *p = (mt_param_t<ms_t>*) param;
332 |   size_t n_reads = 0;
333 |   size_t n_processed_reads = 0;
334 | 
335 |   FILE *out_fd;
336 |   gzFile fp;
337 | 
338 |   if ((out_fd = fopen(p->out_filename.c_str(), "w")) == nullptr)
339 |     error("open() file " + p->out_filename + " failed");
340 | 
341 |   if ((fp = gzopen(p->pattern_filename.c_str(), "r")) == Z_NULL)
342 |     error("open() file " + p->pattern_filename + " failed");
343 | 
344 |   gzseek(fp, p->start, SEEK_SET);
345 | 
346 |   kseq_t rev;
347 |   int l;
348 | 
349 |   kseq_t *seq = kseq_init(fp);
350 |   while ((ks_tell(seq) < p->end) && ((l = kseq_read(seq)) >= 0))
351 |   {
352 | 
353 |     p->ms->maximal_exact_matches(seq,out_fd);
354 | 
355 |   }
356 | 
357 |   kseq_destroy(seq);
358 |   gzclose(fp);
359 |   fclose(out_fd);
360 | 
361 |   return NULL;
362 | }
363 | 
364 | template <typename ms_t>
365 | void mt_ms(ms_t *ms, std::string pattern_filename, std::string out_filename, size_t n_threads)
366 | {
367 |   pthread_t t[n_threads] = {0};
368 |   mt_param_t<ms_t> params[n_threads];
369 |   std::vector<size_t> starts = split_fastq(pattern_filename, n_threads);
370 |   for(size_t i = 0; i < n_threads; ++i)
371 |   {
372 |     params[i].ms = ms;
373 |     params[i].pattern_filename = pattern_filename;
374 |     params[i].out_filename = out_filename + "_" + std::to_string(i) + ".mems.tmp.out";
375 |     params[i].start = starts[i];
376 |     params[i].end = starts[i+1];
377 |     params[i].wk_id = i;
378 |     xpthread_create(&t[i], NULL, &mt_ms_worker<ms_t>, &params[i], __LINE__, __FILE__);
379 |   }
380 | 
381 |   for(size_t i = 0; i < n_threads; ++i)
382 |   {
383 |     xpthread_join(t[i],NULL,__LINE__,__FILE__);
384 |   }
385 | 
386 |   // sleep(5);
387 | 
388 | 
389 |   return;
390 | }
391 | 
392 | 
393 | ////////////////////////////////////////////////////////////////////////////////
394 | /// Single Thread
395 | ////////////////////////////////////////////////////////////////////////////////
396 | template <typename ms_t>
397 | size_t st_ms(ms_t *ms, std::string pattern_filename, std::string out_filename)
398 | {
399 |   size_t n_reads = 0;
400 |   size_t n_processed_reads = 0;
401 |   kseq_t rev;
402 |   int l;
403 |   FILE *out_fd;
404 | 
405 |   out_filename += "_0.mems.tmp.out";
406 | 
407 |   if ((out_fd = fopen(out_filename.c_str(), "w")) == nullptr)
408 |     error("open() file " + out_filename + " failed");
409 | 
410 |   gzFile fp = gzopen(pattern_filename.c_str(), "r");
411 |   kseq_t* seq = kseq_init(fp);
412 |   while ((l = kseq_read(seq)) >= 0)
413 |   {
414 | 
415 |     ms->maximal_exact_matches(seq, out_fd);
416 | 
417 |   }
418 | 
419 |   kseq_destroy(seq);
420 |   gzclose(fp);
421 |   fclose(out_fd);
422 | 
423 |   // sleep(5);
424 | 
425 |   return n_processed_reads;
426 | }
427 | 
428 | 
429 | typedef std::pair<std::string, std::vector<uint8_t>> pattern_t;
430 | 
431 | //*********************** Argument options ***************************************
432 | // struct containing command line parameters and other globals
// Command line parameters of the mems tool, filled in by parseArgs.
struct Args
{
  std::string filename = "";    // index file name (the only positional argument)
  std::string patterns = "";    // path to patterns file
  std::string output   = "";    // output file prefix
  size_t l = 25;                // minimum MEM length
                                // NOTE(review): not read by the code visible in this file - confirm
  size_t th = 1;                // number of threads
  bool shaped_slp = false;      // use shaped slp
  bool extended_output = false; // print one MEM occurrence in the reference
  bool sam_output = false;      // output MEMs in SAM format
};
444 | 
445 | void parseArgs(int argc, char *const argv[], Args &arg)
446 | {
447 |   int c;
448 |   extern char *optarg;
449 |   extern int optind;
450 | 
451 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-p patterns] [-o output] [-t threads] [-l len] [-q shaped_slp] [-e extended_output] [-s sam_output] [-b batch]\n\n" +
452 |                     "Copmputes the matching statistics of the reads in the pattern against the reference index in infile.\n" +
453 |                     "     shaped_slp: [boolean] - use shaped slp. (def. false)\n" +
454 |                     "extended_output: [boolean] - print one MEM occurrence in ref. (def. false)\n" +
455 |                     "     sam_output: [boolean] - print output in SAM format. (def. false)\n" +
456 |                     "        pattens: [string]  - path to patterns file.\n" +
457 |                     "         output: [string]  - output file prefix.\n" +
458 |                     "            len: [integer] - minimum MEM lengt (def. 25)\n" +
459 |                     "         thread: [integer] - number of threads (def. 1)\n");
460 | 
461 |   std::string sarg;
462 |   while ((c = getopt(argc, argv, "l:hp:o:t:qes")) != -1)
463 |   {
464 |     switch (c)
465 |     {
466 |     case 'p':
467 |       arg.patterns.assign(optarg);
468 |       break;
469 |     case 'o':
470 |       arg.output.assign(optarg);
471 |       break;
472 |     case 'l':
473 |       sarg.assign(optarg);
474 |       arg.l = stoi(sarg);
475 |       break;
476 |     case 't':
477 |       sarg.assign(optarg);
478 |       arg.th = stoi(sarg);
479 |       break;
480 |     case 'q':
481 |       arg.shaped_slp = true;
482 |       break;
483 |     case 'e':
484 |       arg.extended_output = true;
485 |       break;
486 |     case 's':
487 |       arg.sam_output = true;
488 |       break;
489 |     case 'h':
490 |       error(usage);
491 |     case '?':
492 |       error("Unknown option.\n", usage);
493 |       exit(1);
494 |     }
495 |   }
496 |   // the only input parameter is the file name
497 |   if (argc == optind + 1)
498 |   {
499 |     arg.filename.assign(argv[optind]);
500 |   }
501 |   else
502 |   {
503 |     error("Invalid number of arguments\n", usage);
504 |   }
505 | 
506 |   if (arg.extended_output && arg.sam_output) {
507 |     error("Cannot specify both extended_output and sam_output flags.\n", usage);
508 |   }
509 | }
510 | 
511 | //********** end argument options ********************
512 | 
513 | template <typename ms_t>
514 | void dispatcher(Args &args)
515 | {
516 |   verbose("Construction of the matching statistics data structure");
517 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
518 | 
519 |   ms_t ms(args.filename);
520 | 
521 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
522 |   verbose("Memory peak: ", malloc_count_peak());
523 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
524 | 
525 |   verbose("Processing patterns");
526 |   t_insert_start = std::chrono::high_resolution_clock::now();
527 | 
528 |   std::string base_name = basename(args.filename.data());
529 |   std::string out_filename = args.patterns + "_" + base_name;
530 |   if(args.output != "")
531 |     out_filename = args.output;
532 | 
533 |   if (is_gzipped(args.patterns))
534 |   {
535 |     verbose("The input is gzipped - forcing single thread matching statistics.");
536 |     args.th = 1;
537 |   }
538 | 
539 |   if (args.th == 1)
540 |     st_ms<ms_t>(&ms, args.patterns, out_filename);
541 |   else
542 |     mt_ms<ms_t>(&ms, args.patterns, out_filename, args.th);
543 | 
544 |   // TODO: Merge the SAM files.
545 | 
546 |   t_insert_end = std::chrono::high_resolution_clock::now();
547 | 
548 |   verbose("Memory peak: ", malloc_count_peak());
549 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
550 | 
551 |   auto mem_peak = malloc_count_peak();
552 |   verbose("Memory peak: ", malloc_count_peak());
553 | 
554 |   seqidx idx;
555 | 
556 |   std::string filename_idx = args.filename + idx.get_file_extension();
557 |   verbose("Loading fasta index file: " + filename_idx);
558 |   t_insert_start = std::chrono::high_resolution_clock::now();
559 | 
560 |   ifstream fs_idx(filename_idx);
561 |   idx.load(fs_idx);
562 |   fs_idx.close();
563 | 
564 |   t_insert_end = std::chrono::high_resolution_clock::now();
565 | 
566 |   verbose("Fasta index loading complete");
567 |   verbose("Memory peak: ", malloc_count_peak());
568 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
569 | 
570 |   verbose("Printing plain output");
571 |   t_insert_start = std::chrono::high_resolution_clock::now();
572 | 
573 |   std::string mems_file_suffix = args.sam_output ? ".sam" : ".mems";
574 |   std::ofstream f_mems(out_filename + mems_file_suffix);
575 | 
576 |   if (!f_mems.is_open())
577 |     error("open() file " + std::string(out_filename) + mems_file_suffix + " failed");
578 | 
579 |   if(args.sam_output)
580 |   {
581 |       f_mems << "@HD\tVN:1.6\tSO:unknown\n";
582 |       f_mems << idx.to_sam();
583 |       f_mems << "@PG\tID:moni\tPN:moni\tVN:0.2.2\n";
584 |   }
585 | 
586 |   size_t n_seq = 0;
587 |   for (size_t i = 0; i < args.th; ++i)
588 |   {
589 |     std::string tmp_filename = out_filename + "_" + std::to_string(i) + ".mems.tmp.out";
590 |     FILE *in_fd;
591 | 
592 |     if ((in_fd = fopen(tmp_filename.c_str(), "r")) == nullptr)
593 |       error("open() file " + tmp_filename + " failed");
594 | 
595 |     size_t length = 0;
596 |     size_t rname_l = 0;
597 |     size_t s_length = 0;
598 |     size_t m = 100; // Reserved size for pointers and lengths
599 |     std::vector<std::tuple<size_t,size_t,size_t>> mem(m);
600 |     size_t s = 100; // Reserved size for read name
601 |     size_t rseq_l = 100; // Reserved size for seq and qual
602 |     char* rname = (char *)malloc(s * sizeof(char));
603 |     char *rseq = (char *)malloc(rseq_l * sizeof(char));
604 |     char *rqual = (char *)malloc(rseq_l * sizeof(char));
605 |     while (!feof(in_fd) and fread(&rname_l, sizeof(size_t), 1, in_fd) > 0)
606 |     {
607 |       // Reading read name
608 |       if (s < rname_l)
609 |       {
610 |         // Resize lengths and pointers
611 |         s = rname_l;
612 |         rname = (char *)realloc(rname, s * sizeof(char));
613 |       }
614 | 
615 |       if ((fread(rname, sizeof(char), rname_l, in_fd)) != rname_l)
616 |         error("fread() file " + std::string(tmp_filename) + " failed");
617 |       
618 |       // In case of SAM output read also the sequence and quals
619 |       if (args.sam_output)
620 |       {
621 |         if ((fread(&s_length, sizeof(size_t), 1, in_fd)) != 1)
622 |           error("fread() file " + std::string(tmp_filename) + " failed");
623 |         if (rseq_l < s_length)
624 |         {
625 |           // Resize s_lengths and pointers
626 |           rseq_l = s_length;
627 |           rseq = (char *)realloc(rseq, rseq_l * sizeof(char));
628 |           rqual = (char *)realloc(rqual, rseq_l * sizeof(char));
629 |         }
630 |         if ((fread(rseq, sizeof(char), s_length, in_fd)) != s_length)
631 |           error("fread() file " + std::string(tmp_filename) + " failed");
632 |         if ((fread(rqual, sizeof(char), s_length, in_fd)) != s_length)
633 |           error("fread() file " + std::string(tmp_filename) + " failed");
634 |       }
635 |       else 
636 |       {
637 |         f_mems << ">" + std::string(rname, rname_l) << endl;
638 |       }
639 | 
640 |       // Reading MEMs
641 |       if ((fread(&length, sizeof(size_t), 1, in_fd)) != 1)
642 |         error("fread() file " + std::string(tmp_filename) + " failed");
643 | 
644 |       if (m < length)
645 |       {
646 |         // Resize lengths and pointers
647 |         m = length;
648 |         mem.resize(m);
649 |       }
650 | 
651 |       if ((fread(mem.data(), sizeof(std::tuple<size_t,size_t,size_t>), length, in_fd)) != length)
652 |         error("fread() file " + std::string(tmp_filename) + " failed");
653 | 
654 |       // TODO: Store the fasta headers somewhere
655 |       // f_mems << ">" + std::to_string(n_seq) << endl;
656 |       if (args.sam_output){
657 |         for (size_t i = 0; i < length; ++i)
658 |         {
659 |           size_t mem_pos = std::get<0>(mem[i]);
660 |           size_t mem_len = std::get<1>(mem[i]);
661 |           std::pair<std::string, size_t> pos = idx.index(std::get<2>(mem[i]));
662 |           f_mems << std::string(rname,rname_l) + "\t";
663 |           // First MEM is primary, all other MEMs are non primary
664 |           f_mems << (i?"256\t":"0\t");
665 |           f_mems << pos.first << "\t" << pos.second + 1<< "\t60\t";
666 |           std::string cigar = "";
667 |           if (mem_pos > 0) cigar += std::to_string(mem_pos) + "S";
668 |           cigar += std::to_string(mem_len) + "M";
669 |           size_t suff_length = s_length - (mem_pos + mem_len);
670 |           if (suff_length > 0) cigar += std::to_string(suff_length) + "S";
671 |           f_mems << cigar + "\t" + std::string(rseq, s_length) + "\t" + std::string(rqual, s_length) + "\n";
672 |         }
673 |       } else 
674 |       {
675 |         if (args.extended_output){
676 |           for (size_t i = 0; i < length; ++i)
677 |           {
678 |             std::pair<std::string, size_t> pos = idx.index(std::get<2>(mem[i]));
679 |             f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << "," << pos.first << "," << pos.second << ") ";
680 |           }
681 |         } else {
682 |           for (size_t i = 0; i < length; ++i)
683 |           {
684 |             f_mems << "(" << std::get<0>(mem[i]) << "," << std::get<1>(mem[i]) << ") ";
685 |           }
686 | 
687 |         }
688 |         f_mems << endl;
689 |       }
690 | 
691 |       n_seq++;
692 |     }
693 |     fclose(in_fd);
694 |     if (std::remove(tmp_filename.c_str()) != 0)
695 |       error("remove() file " + tmp_filename + " failed");
696 |   }
697 | 
698 |   f_mems.close();
699 | 
700 |   t_insert_end = std::chrono::high_resolution_clock::now();
701 | 
702 |   verbose("Memory peak: ", malloc_count_peak());
703 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
704 | 
705 |   mem_peak = malloc_count_peak();
706 |   verbose("Memory peak: ", malloc_count_peak());
707 | }
708 | 
709 | int main(int argc, char *const argv[])
710 | {
711 |   Args args;
712 |   parseArgs(argc, argv, args);
713 | 
714 |   if (args.shaped_slp)
715 |   {
716 |     if (args.sam_output)
717 |       dispatcher<mems_c<shaped_slp_t,true>>(args);
718 |     else
719 |       dispatcher<mems_c<shaped_slp_t,false>>(args);
720 |   }
721 |   else
722 |   {
723 |     if(args.sam_output)
724 |       dispatcher<mems_c<plain_slp_t,true>>(args);
725 |     else
726 |       dispatcher<mems_c<plain_slp_t,false>>(args);
727 |   }
728 |   return 0;
729 | }


--------------------------------------------------------------------------------
/src/rlebwt_ms_build.cpp:
--------------------------------------------------------------------------------
  1 | /* rlebwt_ms_build - Build the matching statistics data structure
  2 |     Copyright (C) 2020 Massimiliano Rossi
  3 |     This program is free software: you can redistribute it and/or modify
  4 |     it under the terms of the GNU General Public License as published by
  5 |     the Free Software Foundation, either version 3 of the License, or
  6 |     (at your option) any later version.
  7 |     This program is distributed in the hope that it will be useful,
  8 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
  9 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 10 |     GNU General Public License for more details.
 11 |     You should have received a copy of the GNU General Public License
 12 |     along with this program.  If not, see http://www.gnu.org/licenses/ .
 13 | */
 14 | /*!
 15 |    \file rlebwt_ms_build.cpp
 16 |    \brief rlebwt_ms_build.cpp Build the matching statistics data structure.
 17 |    \author Massimiliano Rossi
 18 |    \date 13/07/2020
 19 | */
 20 | 
 21 | #include <iostream>
 22 | 
 23 | #define VERBOSE
 24 | 
 25 | #include <common.hpp>
 26 | 
 27 | #include <sdsl/io.hpp>
 28 | 
 29 | #include <ms_pointers.hpp>
 30 | 
 31 | #include <malloc_count.h>
 32 | 
 33 | #include <SelfShapedSlp.hpp>
 34 | #include <DirectAccessibleGammaCode.hpp>
 35 | #include <SelectType.hpp>
 36 | 
 37 | //*********************** Argument options ***************************************
 38 | // struct containing command line parameters and other globals
 39 | struct Args
 40 | {
 41 |   std::string filename = "";
 42 |   bool memo = false;         // print the memory usage
 43 |   bool csv = false;          // print stats on stderr in csv format
 44 |   bool rle = false;          // outpt RLBWT
 45 | };
 46 | 
 47 | void parseArgs(int argc, char *const argv[], Args &arg)
 48 | {
 49 |   int c;
 50 |   extern char *optarg;
 51 |   extern int optind;
 52 | 
 53 |   std::string usage("usage: " + std::string(argv[0]) + " infile [-s store] [-m memo] [-c csv] [-p patterns] [-f fasta] [-r rle] [-t threads] [-l len]\n\n" +
 54 |                     "Computes the pfp data structures of infile, provided that infile.parse, infile.dict, and infile.occ exists.\n" +
 55 |                     "   memo: [boolean] - print the data structure memory usage. (def. false)\n" +
 56 |                     "    rle: [boolean] - output run length encoded BWT. (def. false)\n" +
 57 |                     "    csv: [boolean] - print the stats in csv form on strerr. (def. false)\n");
 58 | 
 59 |   std::string sarg;
 60 |   while ((c = getopt(argc, argv, "mcrh")) != -1)
 61 |   {
 62 |     switch (c)
 63 |     {
 64 |     case 'm':
 65 |       arg.memo = true;
 66 |       break;
 67 |     case 'c':
 68 |       arg.csv = true;
 69 |       break;
 70 |     case 'r':
 71 |       arg.rle = true;
 72 |       break;
 73 |     case 'h':
 74 |       error(usage);
 75 |     case '?':
 76 |       error("Unknown option.\n", usage);
 77 |       exit(1);
 78 |     }
 79 |   }
 80 |   // the only input parameter is the file name
 81 |   if (argc == optind + 1)
 82 |   {
 83 |     arg.filename.assign(argv[optind]);
 84 |   }
 85 |   else
 86 |   {
 87 |     error("Invalid number of arguments\n", usage);
 88 |   }
 89 | }
 90 | 
 91 | //********** end argument options ********************
 92 | 
 93 | int main(int argc, char *const argv[])
 94 | {
 95 |   using SelSd = SelectSdvec<>;
 96 |   using DagcSd = DirectAccessibleGammaCode<SelSd>;
 97 | 
 98 |   Args args;
 99 |   parseArgs(argc, argv, args);
100 | 
101 |   // Building the r-index
102 | 
103 |   verbose("Building the matching statistics index");
104 |   std::chrono::high_resolution_clock::time_point t_insert_start = std::chrono::high_resolution_clock::now();
105 | 
106 |   ms_pointers<> ms(args.filename, true);
107 | 
108 |   std::chrono::high_resolution_clock::time_point t_insert_end = std::chrono::high_resolution_clock::now();
109 | 
110 |   verbose("Matching statistics index construction complete");
111 |   verbose("Memory peak: ", malloc_count_peak());
112 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
113 | 
114 | 
115 |   std::string outfile = args.filename + ms.get_file_extension();
116 |   std::ofstream out(outfile);
117 |   ms.serialize(out);
118 | 
119 |   // size_t ra_size = sdsl::size_in_bytes(ra);
120 | 
121 | 
122 |   t_insert_end = std::chrono::high_resolution_clock::now();
123 | 
124 |   verbose("Memory peak: ", malloc_count_peak());
125 |   verbose("Elapsed time (s): ", std::chrono::duration<double, std::ratio<1>>(t_insert_end - t_insert_start).count());
126 | 
127 |   auto mem_peak = malloc_count_peak();
128 |   verbose("Memory peak: ", malloc_count_peak());
129 | 
130 |   size_t space = 0;
131 |   if (args.memo)
132 |   {
133 |     sdsl::nullstream ns;
134 | 
135 |     size_t ms_size = ms.serialize(ns);
136 |     verbose("MS size (bytes): ", ms_size);
137 |   }
138 | 
139 |   if (args.csv)
140 |     std::cerr << csv(args.filename.c_str(), time, space, mem_peak) << std::endl;
141 | 
142 |   return 0;
143 | }


--------------------------------------------------------------------------------
/thirdparty/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | include(FetchContent)
  2 | 
  3 | ## Add malloc_count
  4 | FetchContent_Declare(
  5 |   malloc_count
  6 |   GIT_REPOSITORY https://github.com/bingmann/malloc_count
  7 |   )
  8 |   
  9 | FetchContent_GetProperties(malloc_count)
 10 | if(NOT malloc_count_POPULATED)
 11 |   FetchContent_Populate(malloc_count)
 12 | 
 13 |   add_library(malloc_count OBJECT ${malloc_count_SOURCE_DIR}/malloc_count.c ${malloc_count_SOURCE_DIR}/malloc_count.h)
 14 |   target_link_libraries(malloc_count dl)
 15 |   target_include_directories(malloc_count PUBLIC "${malloc_count_SOURCE_DIR}")
 16 | 
 17 |   add_library(memprofile OBJECT ${malloc_count_SOURCE_DIR}/memprofile.h)
 18 |   target_include_directories(memprofile PUBLIC "${malloc_count_SOURCE_DIR}")
 19 | endif()
 20 | 
 21 | # # Add klib
 22 | set(KLIB_COMMIT "9a063b33efd841fcc42d4b9f68cb78bb528bf75b")
 23 | 
 24 | FetchContent_Declare(
 25 |   klib
 26 |   GIT_REPOSITORY https://github.com/attractivechaos/klib
 27 |   GIT_TAG ${KLIB_COMMIT}
 28 | )
 29 | 
 30 | FetchContent_GetProperties(klib)
 31 | if(NOT klib_POPULATED)
 32 |   FetchContent_Populate(klib)
 33 |   
 34 |   # add_subdirectory(${klib_SOURCE_DIR} ${klib_BINARY_DIR} EXCLUDE_FROM_ALL)
 35 |   add_library(klib INTERFACE)
 36 | 
 37 |   target_include_directories(klib INTERFACE ${klib_SOURCE_DIR})
 38 | endif()
 39 | 
 40 | ## Add Big-BWT
 41 | FetchContent_Declare(
 42 |   bigbwt
 43 |   GIT_REPOSITORY https://github.com/alshai/Big-BWT.git
 44 |   )
 45 |   
 46 | FetchContent_GetProperties(bigbwt)
 47 | if(NOT bigbwt_POPULATED)
 48 |   FetchContent_Populate(bigbwt)
 49 |   add_subdirectory(${bigbwt_SOURCE_DIR} ${bigbwt_BINARY_DIR})
 50 | 
 51 |   endif()
 52 | 
 53 | 
 54 |   
 55 | ## Add gsacak
 56 | FetchContent_Declare(
 57 |   gsacak
 58 |   GIT_REPOSITORY https://github.com/felipelouza/gsa-is.git
 59 | )
 60 | 
 61 | FetchContent_GetProperties(gsacak)
 62 | if(NOT gsacak_POPULATED)
 63 |     FetchContent_Populate(gsacak)
 64 |     add_library(gsacak OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h)
 65 |     target_include_directories(gsacak PUBLIC "${gsacak_SOURCE_DIR}")
 66 |     
 67 |     add_library(gsacak64 OBJECT ${gsacak_SOURCE_DIR}/gsacak.c ${gsacak_SOURCE_DIR}/gsacak.h)
 68 |     target_include_directories(gsacak64 PUBLIC "${gsacak_SOURCE_DIR}")
 69 |     target_compile_options(gsacak64 PUBLIC -DM64)
 70 | endif()
 71 | 
 72 | 
 73 | ## Add sdsl
 74 | FetchContent_Declare(
 75 |   sdsl
 76 |   GIT_REPOSITORY https://github.com/simongog/sdsl-lite
 77 | )
 78 | 
 79 | FetchContent_GetProperties(sdsl)
 80 | if(NOT sdsl_POPULATED)
 81 |   FetchContent_Populate(sdsl)
 82 | 
 83 |   set(GENERATE_DOC OFF CACHE BOOL "Do not generate doxygen for sdsl-lite")
 84 |   
 85 |   add_subdirectory(${sdsl_SOURCE_DIR} ${sdsl_BINARY_DIR} EXCLUDE_FROM_ALL)
 86 | endif()
 87 | 
 88 | ## Add divsuffsort
 89 | FetchContent_Declare(
 90 |   divsufsort
 91 |   GIT_REPOSITORY https://github.com/simongog/libdivsufsort.git
 92 |   GIT_TAG        2.0.1
 93 | )
 94 | 
 95 | FetchContent_GetProperties(divsufsort)
 96 | if(NOT divsufsort_POPULATED)
 97 |   FetchContent_Populate(divsufsort)
 98 | 
 99 |   set(BUILD_SHARED_LIBS OFF CACHE BOOL "Do not build a shared library for libdivsufsort")
100 |   set(BUILD_EXAMPLES OFF CACHE BOOL "Do not build libdivsufsort example")
101 |   set(BUILD_DIVSUFSORT64 ON CACHE BOOL "Build libdivsufsort in 64-bits mode")
102 | 
103 |   add_subdirectory(${divsufsort_SOURCE_DIR} ${divsufsort_BINARY_DIR} EXCLUDE_FROM_ALL)
104 | 
105 |   target_include_directories(divsufsort PUBLIC "${divsufsort_BINARY_DIR}/include")
106 |   target_include_directories(divsufsort64 PUBLIC "${divsufsort_BINARY_DIR}/include")
107 | endif()
108 | 
109 | 
## Add r-index
FetchContent_Declare(
  r-index
  GIT_REPOSITORY https://github.com/maxrossi91/r-index.git
)

FetchContent_GetProperties(r-index)
if(NOT r-index_POPULATED)
  FetchContent_Populate(r-index)

  add_subdirectory("${r-index_SOURCE_DIR}" "${r-index_BINARY_DIR}" EXCLUDE_FROM_ALL)

  # "ri" is an INTERFACE target: it exposes the r-index "internal" directory
  # as an include path and forwards the klib and z link dependencies.
  add_library(ri INTERFACE)
  target_include_directories(ri INTERFACE "${r-index_SOURCE_DIR}/internal")
  target_link_libraries(ri INTERFACE klib z)
endif()
125 | 
## Add pfp-thresholds
# GIT_TAG names "develop" here rather than a fixed commit.
FetchContent_Declare(
  pfp_thresholds
  GIT_REPOSITORY https://github.com/maxrossi91/pfp-thresholds.git
  GIT_TAG        develop
)

FetchContent_GetProperties(pfp_thresholds)
if(NOT pfp_thresholds_POPULATED)
  FetchContent_Populate(pfp_thresholds)
  add_subdirectory("${pfp_thresholds_SOURCE_DIR}" "${pfp_thresholds_BINARY_DIR}")
endif()
139 | 
## Add bigrepair
# Fetched from the maxrossi91 fork on GitLab
# (alternative upstream: https://gitlab.com/manzai/bigrepair.git).
FetchContent_Declare(
  bigrepair
  GIT_REPOSITORY https://gitlab.com/maxrossi91/bigrepair.git
)

FetchContent_GetProperties(bigrepair)
if(NOT bigrepair_POPULATED)
  # Cache entry set before the subproject is configured.
  set(DISABLE_PFP ON CACHE BOOL "Build bigrepair without the PFP")

  FetchContent_Populate(bigrepair)
  add_subdirectory("${bigrepair_SOURCE_DIR}" "${bigrepair_BINARY_DIR}")
endif()
161 | 
## Add ShapedSlp
FetchContent_Declare(
  shaped_slp
  GIT_REPOSITORY https://github.com/koeppl/ShapedSlp.git
  GIT_TAG        master
)

FetchContent_GetProperties(shaped_slp)
if(NOT shaped_slp_POPULATED)
  FetchContent_Populate(shaped_slp)
  add_subdirectory("${shaped_slp_SOURCE_DIR}" "${shaped_slp_BINARY_DIR}")

  # Convenience paths into the fetched tree.
  # NOTE(review): these are plain variables, so they are visible only in this
  # directory scope and below; confirm that the consumers can actually see them.
  set(FOLCA_SOURCE_DIR "${shaped_slp_SOURCE_DIR}/folca")
  set(SUX_SOURCE_DIR "${shaped_slp_SOURCE_DIR}/external/sux/sux")
endif()
176 | 
## Add SSW
FetchContent_Declare(
  ssw
  GIT_REPOSITORY https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
  GIT_TAG        master
)

FetchContent_GetProperties(ssw)
if(NOT ssw_POPULATED)
  FetchContent_Populate(ssw)

  # The C core and its C++ wrapper are compiled into one object library whose
  # headers are exported from the same "src" directory.
  add_library(ssw OBJECT
    "${ssw_SOURCE_DIR}/src/ssw_cpp.cpp"
    "${ssw_SOURCE_DIR}/src/ssw.c"
  )
  target_include_directories(ssw PUBLIC "${ssw_SOURCE_DIR}/src")
endif()
191 | 
192 | 
## Add Ksw2
FetchContent_Declare(
  ksw2
  GIT_REPOSITORY https://github.com/lh3/ksw2
  GIT_TAG        master
)

FetchContent_GetProperties(ksw2)
if(NOT ksw2_POPULATED)
  FetchContent_Populate(ksw2)

  add_library(ksw2 OBJECT
    "${ksw2_SOURCE_DIR}/kalloc.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg2.c"
    "${ksw2_SOURCE_DIR}/ksw2_gg2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extz.c"
    "${ksw2_SOURCE_DIR}/ksw2_extz2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extd.c"
    "${ksw2_SOURCE_DIR}/ksw2_extd2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_extf2_sse.c"
    "${ksw2_SOURCE_DIR}/ksw2_exts2_sse.c"
  )
  # All sources above live in the repository root, and so do the ksw2 headers;
  # the exported include path therefore has to be the root, not a "src"
  # sub-directory.
  target_include_directories(ksw2 PUBLIC "${ksw2_SOURCE_DIR}")
endif()
216 |   


--------------------------------------------------------------------------------
/utils.md:
--------------------------------------------------------------------------------
# Utility commands
 2 | 
 3 | # Build the docker image
 4 | 
 5 | ```console
 6 | docker build --platform linux/amd64 --no-cache -t maxrossi91/moni . 
 7 | ```
 8 | 
 9 | # Pseudo system test
10 | ```console
11 | docker run --platform linux/amd64 -v `pwd`/data:/data  -it maxrossi91/moni bash
12 | 
13 | mkdir -p out
14 | moni build -r data/SARS-CoV2/SARS-CoV2.1k.fa.gz -o out/sars-cov2 -f
15 | moni mems -i out/sars-cov2 -p data/SARS-CoV2/reads.fastq.gz -o out/reads -s
16 | ```
17 | 


--------------------------------------------------------------------------------
/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
# Build the split_fa helper from split_fa.cpp.
# klib supplies the include path for kseq.h; z is zlib, used to read the input.
add_executable(split_fa split_fa.cpp)
target_link_libraries(split_fa PRIVATE klib z)
6 | 


--------------------------------------------------------------------------------
/utils/split_fa.cpp:
--------------------------------------------------------------------------------
  1 | //
  2 | //  split_fa.cpp
  3 | //
  4 | //  Copyright 2020 Marco Oliva. All rights reserved.
  5 | //
  6 | 
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>

#include <zlib.h>
#include <stdio.h>
#include <kseq.h>
 13 | 
 14 | KSEQ_INIT(gzFile, gzread)
 15 | 
 16 | void split_file(std::string& path, std::size_t n_seqs, std::size_t n_blocks)
 17 | {
 18 |     std::size_t seqs_per_block = 0, residual = 0;
 19 |     if (n_seqs % n_blocks == 0) { seqs_per_block = n_seqs / n_blocks; residual = n_seqs / n_blocks; }
 20 |     else if (n_blocks == 2) {seqs_per_block = n_seqs / 2; residual = (n_seqs % 2) + (n_seqs / 2);}
 21 |     else { seqs_per_block = n_seqs / (n_blocks - 1); residual = n_seqs % (n_blocks - 1); }
 22 |     
 23 |     std::size_t last_index = path.find_last_of('.'); std::string remove_gz = path.substr(0, last_index);
 24 |     last_index = remove_gz.find_last_of('.', last_index);
 25 |     std::string base_name = path.substr(0, last_index);
 26 |     
 27 |     int l;
 28 |     gzFile fp;
 29 |     kseq_t *seq;
 30 |     fp = gzopen(path.c_str(), "r");
 31 |     seq = kseq_init(fp);
 32 |     for (std::size_t i = 0; i < n_blocks - 1; i++)
 33 |     {
 34 |         std::cout << "\rSplitting sequences... " << std::to_string(i + 1) << "/" << n_blocks << "  "
 35 |                   << std::to_string((double(i + 1) / double(n_blocks)) * 100) << "%" << std::flush;
 36 |         
 37 |         std::string out_path = base_name + "_" + std::to_string(i + 1) + ".fa";
 38 |         std::ofstream out_file(out_path);
 39 |         std::size_t it = 0;
 40 |         while (it < seqs_per_block)
 41 |         {
 42 |             l = kseq_read(seq);
 43 |             if (seq->seq.l > 0)
 44 |             {
 45 |                 out_file.put('>'); out_file.write(seq->name.s, seq->name.l); out_file.put('\n');
 46 |                 out_file.write(seq->seq.s, seq->seq.l); out_file.put('\n');
 47 |             }
 48 |             it++;
 49 |         }
 50 |         out_file.close();
 51 |     }
 52 |     // remember to write residual to last file
 53 |     std::string out_path = base_name + "_" + std::to_string(n_blocks) + ".fasta";
 54 |     std::ofstream out_file(out_path);
 55 |     std::size_t it = 0;
 56 |     std::cout << "\rSplitting sequences... " << std::to_string(n_blocks) << "/" << n_blocks << "  "
 57 |               << std::to_string((double(n_blocks) / double(n_blocks)) * 100) << "%" << std::flush;
 58 |     while (it < residual)
 59 |     {
 60 |         l = kseq_read(seq);
 61 |         if (seq->seq.l > 0)
 62 |         {
 63 |             out_file.put('>'); out_file.write(seq->name.s, seq->name.l); out_file.put('\n');
 64 |             out_file.write(seq->seq.s, seq->seq.l); out_file.put('\n');
 65 |         }
 66 |         it++;
 67 |     }
 68 |     out_file.close();
 69 |     
 70 |     
 71 |     // free
 72 |     kseq_destroy(seq);
 73 |     gzclose(fp);
 74 | }
 75 | 
 76 | // Count sequences
 77 | std::size_t count_seqs(std::string& path)
 78 | {
 79 |     std::size_t sequences_count = 0;
 80 |     int l;
 81 |     gzFile fp;
 82 |     kseq_t *seq;
 83 |     fp = gzopen(path.c_str(), "r");
 84 |     seq = kseq_init(fp);
 85 |     while ((l = kseq_read(seq)) >= 0)
 86 |     {
 87 |         sequences_count++;
 88 |     }
 89 |     kseq_destroy(seq);
 90 |     gzclose(fp);
 91 |     
 92 |     return sequences_count;
 93 | }
 94 | 
 95 | int main(int argc, char *argv[])
 96 | {
 97 |     if (argc != 3) {
 98 |         fprintf(stderr, "Usage: %s <in.seq> <n blocks>\n", argv[0]);
 99 |         return 1;
100 |     }
101 |     
102 |     std::string path = argv[1];
103 |     std::cout << "In path: " << path << std::endl;
104 |     std::size_t n_blocks = std::stoi(argv[2]);
105 |     std::cout << "Blocks: " << n_blocks << std::endl;
106 |     
107 |     std::cout << "Reading sequences...";
108 |     std::size_t n_seq = count_seqs(path);
109 |     std::cout << " done. N: " <<  n_seq << std::endl;
110 |     
111 |     std::cout << "Splitting sequences...";
112 |     split_file(path, n_seq, n_blocks);
113 |     std::cout << " done." << std::endl;
114 |     
115 |     return 0;
116 | }
117 | 


--------------------------------------------------------------------------------