├── .gitignore ├── .gitmodules ├── .travis.yml ├── AUTHORS ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── cobs ├── CMakeLists.txt ├── construction │ ├── classic_index.cpp │ ├── classic_index.hpp │ ├── compact_index.cpp │ └── compact_index.hpp ├── cortex_file.hpp ├── document_list.cpp ├── document_list.hpp ├── fasta_file.hpp ├── fasta_multifile.cpp ├── fasta_multifile.hpp ├── fastq_file.hpp ├── file │ ├── classic_index_header.cpp │ ├── classic_index_header.hpp │ ├── compact_index_header.cpp │ ├── compact_index_header.hpp │ ├── file_io_exception.hpp │ ├── header.hpp │ ├── kmer_buffer_header.cpp │ └── kmer_buffer_header.hpp ├── kmer.cpp ├── kmer.hpp ├── kmer_buffer.hpp ├── query │ ├── classic_index │ │ ├── mmap_search_file.cpp │ │ ├── mmap_search_file.hpp │ │ ├── search_file.cpp │ │ └── search_file.hpp │ ├── classic_search.cpp │ ├── classic_search.hpp │ ├── compact_index │ │ ├── aio_search_file.cpp │ │ ├── aio_search_file.hpp │ │ ├── mmap_search_file.cpp │ │ ├── mmap_search_file.hpp │ │ ├── search_file.cpp │ │ └── search_file.hpp │ ├── index_file.hpp │ └── search.hpp ├── settings.cpp ├── settings.hpp ├── text_file.hpp └── util │ ├── aio.cpp │ ├── aio.hpp │ ├── calc_signature_size.cpp │ ├── calc_signature_size.hpp │ ├── error_handling.cpp │ ├── error_handling.hpp │ ├── file.hpp │ ├── fs.hpp │ ├── misc.cpp │ ├── misc.hpp │ ├── parallel_for.cpp │ ├── parallel_for.hpp │ ├── process_file_batches.hpp │ ├── query.cpp │ ├── query.hpp │ ├── serialization.hpp │ ├── thread_object_array.hpp │ ├── timer.cpp │ ├── timer.hpp │ ├── zip_stream.cpp │ ├── zip_stream.hpp │ └── zip_stream_fwd.hpp ├── misc ├── format │ ├── analyze-source.pl │ └── uncrustify.cfg ├── mkdocs.sh └── python-wheels │ └── manylinux_x86_64.sh ├── python ├── CMakeLists.txt ├── cobs ├── docs │ ├── Makefile │ ├── _static │ │ └── cobs-index-architecture.png │ ├── cobs_index.rst │ ├── conf.py │ ├── index.rst │ ├── make.bat │ └── tutorial.rst ├── module.cpp ├── notes.md └── tests │ ├── __init__.py │ 
└── test_cobs_index.py ├── setup.py ├── src ├── CMakeLists.txt └── cobs.cpp └── tests ├── CMakeLists.txt ├── classic_index_construction.cpp ├── classic_index_query.cpp ├── compact_index_construction.cpp ├── compact_index_query.cpp ├── cortex_file.cpp ├── data ├── cortex │ ├── document.ctx │ ├── document_sorted.txt │ ├── sample1-k15.ctx │ ├── sample1-k15.txt │ ├── sample1-k19.ctx │ ├── sample1-k19.txt │ ├── sample1-k31.ctx │ └── sample1-k31.txt ├── fasta │ ├── sample1.fasta │ ├── sample2.fasta │ ├── sample3.fasta.gz │ ├── sample4.fasta │ ├── sample5.fasta │ ├── sample6.fasta │ └── sample7.fasta.gz ├── fasta_files.list ├── fasta_multi │ ├── sample1.mfasta │ └── sample2.mfasta ├── fastq │ ├── sample1.fastq │ ├── sample2.fastq.gz │ └── sample3.fastq └── text │ ├── sample1.txt │ └── sample2.txt ├── fasta_file.cpp ├── fasta_multifile.cpp ├── fastq_file.cpp ├── file.cpp ├── parameters.cpp ├── test_util.hpp ├── text_file.cpp └── util.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | cmake-build-debug/ 4 | build/ 5 | cmake-build-release 6 | python/docs/_build 7 | python/docs/_generated 8 | *.cobs_cache 9 | *.cobs_classic 10 | *.cobs_compact 11 | __pycache__ 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extlib/tlx"] 2 | path = extlib/tlx 3 | url = https://github.com/tlx/tlx.git 4 | [submodule "extlib/cli11"] 5 | path = extlib/cli11 6 | url = https://github.com/CLIUtils/CLI11.git 7 | [submodule "extlib/googletest"] 8 | path = extlib/googletest 9 | url = https://github.com/google/googletest.git 10 | [submodule "extlib/xxhash"] 11 | path = extlib/xxhash 12 | url = https://github.com/Cyan4973/xxHash.git 13 | [submodule "python/pybind11"] 14 | path = python/pybind11 15 | url = https://github.com/pybind/pybind11.git 16 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Travis-CI build script for cobs 2 | 3 | language: cpp 4 | 5 | sudo: false 6 | dist: bionic 7 | 8 | addons: 9 | apt: 10 | packages: 11 | - python3 12 | - python3-pip 13 | - python3-setuptools 14 | - python3-sphinx 15 | - lcov 16 | 17 | matrix: 18 | include: 19 | # gcc 7.x, Debug with -O1 20 | - env: CMAKE_CC="gcc" CMAKE_CXX="g++" BUILD_TYPE="Debug" COMPILER_FLAGS="-O1" 21 | 22 | # gcc 7.x, Release 23 | - env: CMAKE_CC="gcc" CMAKE_CXX="g++" BUILD_TYPE="Release" COMPILER_FLAGS="" 24 | 25 | # gcc 7.x, Debug with -O1 and coverage 26 | - env: CMAKE_CC="gcc" CMAKE_CXX="g++" BUILD_TYPE="Debug" COMPILER_FLAGS="-O1" BUILD_COVERAGE="1" 27 | 28 | before_script: 29 | 30 | # print out some version numbers 31 | - $CMAKE_CXX --version 32 | - cmake --version 33 | 34 | # gcov coverage 35 | - if [ -n "$BUILD_COVERAGE" -a -n "$COVERALLS_REPO_TOKEN" ]; then 36 | pip install --user cpp-coveralls; 37 | CMAKE_ARGS="$CMAKE_ARGS -DCOBS_USE_GCOV=ON"; 38 | fi 39 | 40 | # configure 41 | - mkdir build; cd build 42 | - cmake 43 | -DCMAKE_BUILD_TYPE="$BUILD_TYPE" 44 | -DCMAKE_C_COMPILER="$CMAKE_CC" -DCMAKE_CXX_COMPILER="$CMAKE_CXX" 45 | -DCMAKE_C_FLAGS="$COMPILER_FLAGS" -DCMAKE_CXX_FLAGS="$COMPILER_FLAGS" 46 | -DPYTHON_EXECUTABLE="/usr/bin/python3" 47 | $CMAKE_ARGS .. 48 | 49 | script: 50 | # build and run tests 51 | - make -j2 && ctest -V 52 | 53 | # install and test python module 54 | - cd $TRAVIS_BUILD_DIR 55 | - mkdir build-python; cd build-python 56 | - pip3 install --user --verbose .. 
57 | - cd $TRAVIS_BUILD_DIR/python && python3 -m unittest 58 | 59 | after_success: 60 | # upload coverage 61 | - if [ -n "$BUILD_COVERAGE" -a -n "$COVERALLS_REPO_TOKEN" ]; then 62 | cd $TRAVIS_BUILD_DIR; 63 | coveralls --exclude extlib --exclude python --exclude build/CMakeFiles --gcov-options '\-lp'; 64 | fi 65 | 66 | # build docs 67 | - pip3 install --user sphinx-rtd-theme 68 | - cd $TRAVIS_BUILD_DIR/python/docs && make html 69 | - touch _build/html/.nojekyll 70 | 71 | deploy: 72 | provider: pages 73 | local_dir: $TRAVIS_BUILD_DIR/python/docs/_build/html/ 74 | repo: bingmann/cobs-python-docs 75 | skip_cleanup: true 76 | github_token: $GITHUB_TOKEN 77 | keep_history: true 78 | target_branch: master 79 | on: 80 | branch: master 81 | condition: $BUILD_TYPE = Release 82 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Florian Gauger 2 | Timo Bingmann 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # CMakeLists.txt 3 | # 4 | # Copyright (c) 2018 Florian Gauger 5 | # Copyright (c) 2018-2019 Timo Bingmann 6 | # 7 | # All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ################################################################################ 9 | 10 | cmake_minimum_required(VERSION 3.9.2) 11 | cmake_policy(VERSION 3.9.2) 12 | 13 | project(cobs) 14 | 15 | # prohibit in-source builds 16 | if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}") 17 | message(SEND_ERROR "In-source builds are not allowed.") 18 | endif() 19 | 20 | # Set a default build type if none was specified 21 | set(COBS_DEFAULT_BUILD_TYPE "Release") 22 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 23 | message(STATUS "Setting build type to '${COBS_DEFAULT_BUILD_TYPE}' as none was specified.") 24 | set(CMAKE_BUILD_TYPE "${COBS_DEFAULT_BUILD_TYPE}" CACHE 25 | STRING "Choose the type of build." FORCE) 26 | # Set the possible values of build type for cmake-gui 27 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS 28 | "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 29 | endif() 30 | 31 | ################################################################################ 32 | ### Options and Switches 33 | 34 | # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to 35 | # make it prominent in the GUI. 36 | option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF) 37 | 38 | # Override CMAKE_EXECUTABLE_SUFFIX 39 | option(COBS_EXECUTABLE_SUFFIX "Suffix for executables." 40 | "${CMAKE_EXECUTABLE_SUFFIX}") 41 | if(COBS_EXECUTABLE_SUFFIX) 42 | set(CMAKE_EXECUTABLE_SUFFIX "${COBS_EXECUTABLE_SUFFIX}") 43 | endif() 44 | 45 | option(COBS_USE_GCOV 46 | "Compile and run tests with gcov for coverage analysis." 
OFF) 47 | 48 | ################################################################################ 49 | ### Compiler Flags 50 | 51 | # variables to collect compile-time definitions, include dirs, and libraries 52 | set(COBS_DEFINITIONS "") 53 | set(COBS_INCLUDE_DIRS "") 54 | set(COBS_LINK_LIBRARIES "") 55 | 56 | # enable more warnings 57 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wpedantic") 58 | 59 | # use C++17 60 | set(CMAKE_CXX_STANDARD 17) 61 | 62 | # enable warnings (each language extends its own flag variable) 63 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W -Wall -march=native -fPIC") 64 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wall -march=native -fPIC") 65 | 66 | # with run-time STL checks 67 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_DEBUG") 68 | 69 | # with AddressSanitizer 70 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") 71 | 72 | # enable ThreadSanitizer 73 | if(OFF) 74 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread -pie -fPIC") 75 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCOBS_HAVE_THREAD_SANITIZER=1") 76 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -pie -fPIC") 77 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOBS_HAVE_THREAD_SANITIZER=1") 78 | endif() 79 | 80 | message(STATUS "COBS CMAKE_CXX_FLAGS:" ${CMAKE_CXX_FLAGS}) 81 | 82 | ############################################################################### 83 | # enable gcov coverage analysis with gcc 84 | 85 | if(COBS_USE_GCOV) 86 | # find programs 87 | find_program(GENHTML genhtml) 88 | find_program(LCOV lcov) 89 | 90 | if(NOT LCOV OR NOT GENHTML) 91 | message(SEND_ERROR "Coverage analysis requires lcov and genhtml programs.") 92 | else() 93 | message(STATUS "Found lcov: ${LCOV}") 94 | message(STATUS "Found genhtml: ${GENHTML}") 95 | endif() 96 | 97 | # add coverage analysis compile and link flags 98 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -fprofile-arcs -ftest-coverage") 99 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fprofile-arcs -ftest-coverage") 100 | set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -lgcov") 101 | 102 | # add cached variable containing parameters for lcov/genhtml 103 | set(LCOV_FLAGS "" CACHE STRING "parameters for lcov") 104 | set(GENHTML_FLAGS --legend --no-branch-coverage 105 | CACHE STRING "parameters for genhtml") 106 | 107 | # custom target to run before tests 108 | add_custom_target(lcov-reset 109 | COMMAND ${LCOV} -q --directory ${CMAKE_BINARY_DIR} --zerocounters 110 | COMMENT "Resetting code coverage counters") 111 | 112 | # custom lcov target to run tests 113 | add_custom_target(lcov-runtests 114 | COMMAND ${CMAKE_CTEST_COMMAND} \${ARGS} || true 115 | DEPENDS lcov-reset 116 | COMMENT "Running all unit tests") 117 | 118 | # get git version description 119 | execute_process(COMMAND git describe --tags 120 | WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} 121 | OUTPUT_VARIABLE GITDESC 122 | OUTPUT_STRIP_TRAILING_WHITESPACE) 123 | 124 | # command sequence to gather, clean and generate HTML coverage report 125 | add_custom_target(lcov-html 126 | COMMAND ${LCOV} -q --directory . 
--capture --output-file lcov.info 127 | COMMAND ${LCOV} -q --remove lcov.info '/usr/*' '*/extlib/*' ${LCOV_FLAGS} --output-file lcov-clean.info 128 | COMMAND ${GENHTML} -q -o coverage --title "cobs ${GITDESC}" --prefix ${PROJECT_SOURCE_DIR} ${GENHTML_FLAGS} lcov-clean.info 129 | DEPENDS lcov-runtests 130 | COMMENT "Capturing code coverage counters and create HTML coverage report" 131 | WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) 132 | 133 | # top-level target to run tests and generate coverage report 134 | add_custom_target(test-coverage 135 | COMMENT "Generate HTML coverage report " 136 | DEPENDS lcov-html) 137 | 138 | endif(COBS_USE_GCOV) 139 | 140 | ################################################################################ 141 | ### Find Required Libraries 142 | 143 | ### find pthreads ### 144 | 145 | find_package(Threads REQUIRED) 146 | set(COBS_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${COBS_LINK_LIBRARIES}) 147 | if(CMAKE_USE_PTHREADS_INIT) 148 | set(COBS_LINK_LIBRARIES pthread ${COBS_LINK_LIBRARIES}) 149 | endif() 150 | 151 | ### use Google Test ### 152 | 153 | add_subdirectory(extlib/googletest) 154 | 155 | enable_testing() 156 | include(GoogleTest) 157 | 158 | ### use xxHash ### 159 | 160 | add_subdirectory(extlib/xxhash/cmake_unofficial) 161 | set(COBS_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/extlib/xxhash ${COBS_INCLUDE_DIRS}) 162 | set(COBS_LINK_LIBRARIES xxhash ${COBS_LINK_LIBRARIES}) 163 | 164 | ### use ZLIB ### 165 | 166 | find_package(ZLIB REQUIRED) 167 | set(COBS_INCLUDE_DIRS ${ZLIB_INCLUDE_DIRS} ${COBS_INCLUDE_DIRS}) 168 | set(COBS_LINK_LIBRARIES ${ZLIB_LIBRARIES} ${COBS_LINK_LIBRARIES}) 169 | 170 | ### use Boost filesystem ### 171 | 172 | find_package(Boost 1.42.0 COMPONENTS system filesystem) 173 | if(${Boost_FOUND}) 174 | set(COBS_INCLUDE_DIRS ${Boost_INCLUDE_DIRS} ${COBS_INCLUDE_DIRS}) 175 | set(COBS_LINK_LIBRARIES ${Boost_LIBRARIES} ${COBS_LINK_LIBRARIES}) 176 | endif() 177 | 178 | set(COBS_LINK_LIBRARIES stdc++fs ${COBS_LINK_LIBRARIES}) 179 | 180 
| ### use TLX ### 181 | 182 | add_subdirectory(extlib/tlx) 183 | set(COBS_LINK_LIBRARIES tlx ${COBS_LINK_LIBRARIES}) 184 | 185 | ################################################################################ 186 | ### Descend into Subdirectories 187 | 188 | # descend into library source 189 | add_subdirectory(cobs) 190 | 191 | # descend into programs 192 | add_subdirectory(src) 193 | 194 | # descend into tests 195 | add_subdirectory(tests) 196 | 197 | # descend into python 198 | add_subdirectory(python) 199 | 200 | ################################################################################ 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Florian Gauger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include extlib * 2 | recursive-exclude extlib .git 3 | recursive-include python/pybind11 * 4 | recursive-exclude python/pybind11 .git 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Compact Bit-Sliced Signature Index (COBS) 2 | 3 | COBS (COmpact Bit-sliced Signature index) is a cross-over between an inverted index and Bloom filters. Our target application is to index k-mers of DNA samples or q-grams from text documents and process **approximate pattern matching** queries on the corpus with a user-chosen coverage threshold. Query results may contain a number of false positives which decreases exponentially with the query length and the false positive rate of the index determined at construction time. 4 | COBS' compact but simple data structure outperforms other indexes in construction time and query performance with Mantis by Pandey et al. in second place. 5 | However, unlike Mantis and other previous work, COBS does not need the complete index in RAM and is thus designed to scale to larger document sets. 
6 | 7 | ![cobs-architecture](https://user-images.githubusercontent.com/2604907/58323540-91b52100-7e24-11e9-933d-98b9b24ae041.png) 8 | 9 | COBS has two interfaces: ( 10 | [![Build Status](https://travis-ci.org/bingmann/cobs.svg?branch=master)](https://travis-ci.org/bingmann/cobs) 11 | [![Coverage Status](https://coveralls.io/repos/github/bingmann/cobs/badge.svg?branch=master)](https://coveralls.io/github/bingmann/cobs?branch=master) 12 | ) 13 | 14 | - a command line tool in C++ called `cobs` (see below) 15 | - a Python interface to the C++ library [![PyPI version](https://badge.fury.io/py/cobs-index.svg)](https://badge.fury.io/py/cobs-index) (see [bingmann.github.io/cobs-python-docs/](https://bingmann.github.io/cobs-python-docs/)) 16 | 17 | 18 | More information about COBS is presented in [our current research paper](https://arxiv.org/abs/1905.09624): 19 | Timo Bingmann, Phelim Bradley, Florian Gauger, and Zamin Iqbal. 20 | "COBS: a Compact Bit-Sliced Signature Index". 21 | In: *26th International Symposium on String Processing and Information Retrieval (SPIRE)*. pages 285-303. Springer. October 2019. 22 | preprint arXiv:1905.09624. 23 | 24 | If you use COBS in an academic context or publication, please cite our paper 25 | ``` 26 | @InProceedings{bingmann2019cobs, 27 | author = {Timo Bingmann and Phelim Bradley and Florian Gauger and Zamin Iqbal}, 28 | title = {{COBS}: a Compact Bit-Sliced Signature Index}, 29 | booktitle = {26th International Conference on String Processing and Information Retrieval (SPIRE)}, 30 | year = 2019, 31 | series = {LNCS}, 32 | pages = {285--303}, 33 | month = oct, 34 | organization = {Springer}, 35 | note = {preprint arXiv:1905.09624}, 36 | } 37 | ``` 38 | 39 | # Installation and First Steps 40 | 41 | ## Installation 42 | 43 | COBS requires CMake, a C++17 compiler or the Boost.Filesystem library.
44 | 45 | To download and install COBS run: 46 | ``` 47 | git clone --recursive https://github.com/bingmann/cobs.git 48 | mkdir cobs/build 49 | cd cobs/build 50 | cmake .. 51 | make -j4 52 | ``` 53 | and optionally run `make test` to check the build. 54 | 55 | ## Building an Index 56 | 57 | COBS can read FASTA files (`*.fa`, `*.fasta`, `*.fna`, `*.ffn`, `*.faa`, `*.frn`, `*.fa.gz`, `*.fasta.gz`, `*.fna.gz`, `*.ffn.gz`, `*.faa.gz`, `*.frn.gz`), FASTQ files (`*.fq`, `*.fastq`, `*.fq.gz`, `*.fastq.gz`), "Multi-FASTA" and "Multi-FASTQ" files (`*.mfasta`, `*.mfastq`), McCortex files (`*.ctx`), or text files (`*.txt`). 58 | See below for [details on how they are parsed](#file-types-and-how-they-are-parsed). 59 | 60 | You can either recursively scan a directory for all files matching any of these file types, or pass a `*.list` file which lists all paths COBS should index. 61 | 62 | To check the document list to be indexed, run for example 63 | ``` 64 | src/cobs doc-list tests/data/fasta/ 65 | ``` 66 | 67 | To construct a compact COBS index from these seven example documents run 68 | ``` 69 | src/cobs compact-construct tests/data/fasta/ example.cobs_compact 70 | ``` 71 | 72 | Or construct a compact COBS index from a list of documents by running 73 | ``` 74 | src/cobs compact-construct tests/data/fasta_files.list example.cobs_compact 75 | ``` 76 | The paths in the file list can be absolute or relative to the file list's path. 77 | Note that `*.txt` files are read as verbatim text files. 78 | You can force COBS to read a `.txt` file as a file list using `--file-type list`. 79 | 80 | Check `--help` for more options.
81 | 82 | ## Query an Index 83 | 84 | COBS has a simple command line query tool: 85 | ``` 86 | src/cobs query -i example.cobs_compact AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT 87 | ``` 88 | or a fasta file of queries with 89 | ``` 90 | src/cobs query -i example.cobs_compact -f query.fa 91 | ``` 92 | Multiple indices can be queried at once by adding more `-i` parameters. 93 | 94 | ## Python Interface 95 | 96 | COBS also has a Python frontend interface which can be used to construct and query an index. 97 | See https://bingmann.github.io/cobs-python-docs/ for a tutorial. 98 | 99 | # Experimental Results 100 | 101 | In our paper we compare COBS against seven other k-mer indexing software packages. 102 | These are the main results, scaling by number of documents in the index, and in the second diagram shown per document. 103 | 104 | ![cobs-experiments-scaling](https://user-images.githubusercontent.com/2604907/58323544-94b01180-7e24-11e9-8c3a-be998eb790a4.png) 105 | ![cobs-experiments-scaling-per-documents](https://user-images.githubusercontent.com/2604907/58323546-9679d500-7e24-11e9-9fed-636889628050.png) 106 | 107 | # More Details 108 | 109 | ## File Types and How They Are Parsed 110 | 111 | COBS can read FASTA files (`*.fa`, `*.fasta`, `*.fa.gz`, `*.fasta.gz`), FASTQ files (`*.fq`, `*.fastq`, `*.fq.gz`, `*.fastq.gz`), "Multi-FASTA" and "Multi-FASTQ" files (`*.mfasta`, `*.mfastq`), McCortex files (`*.ctx`), or text files (`*.txt`). 112 | Each file type is parsed slightly differently into q-grams or k-mers. 113 | 114 | FASTA files are parsed as one document each. 115 | If a FASTA file contains multiple sequences or reads then they are combined into one document. 116 | Multiple sequences (separated by comments) are NOT concatenated trivially; instead the k-mers are extracted separately from each sequence. 117 | This means there are no erroneous k-mers from the beginning or end of crossing sequences. 118 | All newlines within a sequence are removed.
119 | 120 | The k-mers from DNA sequences are automatically canonicalized (the lexicographically smaller is indexed). 121 | By adding the flag `--no-canonicalize` this process can be skipped. 122 | With canonicalization only ACGT letters are indexed; every other letter is mapped to binary zeros and indexed with the other data. 123 | A warning per FASTA/FASTQ file containing a non-ACGT letter is printed, but processing continues. 124 | With the flag `--no-canonicalize` any letters or text can be indexed. 125 | 126 | FASTQ files are also parsed as one document each. 127 | The quality information is dropped and effectively everything is parsed identically to FASTA files. 128 | 129 | Multi-FASTA or Multi-FASTQ files are parsed as many documents. 130 | Each sequence in the FASTA or FASTQ file is considered a separate document in the COBS index. 131 | Their names are appended with `_###` where ### is the index of the subdocument. 132 | 133 | McCortex files (`*.ctx`) contain a list of k-mers and these k-mers are indexed individually. 134 | The graph information is ignored. 135 | Only k=31 is currently supported. 136 | 137 | Text files (`*.txt`) are parsed as verbatim binary documents. 138 | All q-grams are extracted, including newlines and other whitespace. 139 | 140 | 141 | -------------------------------------------------------------------------------- /cobs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # cobs/CMakeLists.txt 3 | # 4 | # Copyright (c) 2019 Timo Bingmann 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file.
7 | ################################################################################ 8 | 9 | # glob general sources 10 | file(GLOB COBS_SOURCES 11 | RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} 12 | ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/construction/*.[ch]pp 14 | ${CMAKE_CURRENT_SOURCE_DIR}/file/*.[ch]pp 15 | ${CMAKE_CURRENT_SOURCE_DIR}/query/*.[ch]pp 16 | ${CMAKE_CURRENT_SOURCE_DIR}/query/*/*.[ch]pp 17 | ${CMAKE_CURRENT_SOURCE_DIR}/util/*.[ch]pp 18 | ) 19 | 20 | add_library(cobs_static STATIC ${COBS_SOURCES}) 21 | target_compile_definitions(cobs_static PUBLIC ${COBS_DEFINITIONS}) 22 | target_include_directories(cobs_static PUBLIC ${PROJECT_SOURCE_DIR}) 23 | target_include_directories(cobs_static SYSTEM PUBLIC ${COBS_INCLUDE_DIRS}) 24 | target_link_libraries(cobs_static PUBLIC ${COBS_LINK_LIBRARIES}) 25 | 26 | ################################################################################ 27 | -------------------------------------------------------------------------------- /cobs/construction/classic_index.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/construction/classic_index.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_CONSTRUCTION_CLASSIC_INDEX_HEADER 11 | #define COBS_CONSTRUCTION_CLASSIC_INDEX_HEADER 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | /*! 18 | * The classic Inverted Signature Index without the space-saving improvements. 19 | * 20 | * This namespace provides methods for creation of this index. It can either be 21 | * constructed from existing documents or with random dummy data for performance 22 | * testing purposes. 
23 | */ 24 | namespace cobs { 25 | 26 | /*! 27 | * Parameters for classic index construction. 28 | */ 29 | struct ClassicIndexParameters { 30 | //! length of terms / k-mers 31 | unsigned term_size = 31; 32 | //! canonicalization flag for base pairs 33 | uint8_t canonicalize = 1; 34 | //! number of hash functions, provided by user 35 | unsigned num_hashes = 1; 36 | //! false positive rate, provided by user 37 | double false_positive_rate = 0.3; 38 | //! signature size, either provided by user or calculated from 39 | //! false_positive_rate if zero. 40 | uint64_t signature_size = 0; 41 | //! memory to use, in bytes, to create the index 42 | uint64_t mem_bytes = get_memory_size(80); 43 | //! number of threads to use 44 | size_t num_threads = gopt_threads; 45 | //! log prefix (used by compact index construction) 46 | std::string log_prefix; 47 | //! clobber: erase the output directory if it exists, default: false 48 | bool clobber = false; 49 | //! continue in existing output directory, default: false 50 | bool continue_ = false; 51 | //! keep temporary files during construction, default: false 52 | bool keep_temporary = false; 53 | }; 54 | 55 | /*! 56 | * Constructs the index by executing all necessary steps. 57 | * 58 | * First calls cobs::classic_construct_from_documents() to construct multiple 59 | * small indices. Afterwards combines these indices with calls to 60 | * cobs::classic_combine() until only one index remains. 61 | */ 62 | void classic_construct( 63 | const DocumentList& filelist, const fs::path& out_dir, 64 | fs::path tmp_path, ClassicIndexParameters index_params); 65 | 66 | /*! 67 | * Constructs multiple small indices from document files. 68 | */ 69 | void classic_construct_from_documents( 70 | const DocumentList& doc_list, const fs::path& out_dir, 71 | const ClassicIndexParameters& index_params); 72 | 73 | /*! 74 | * Combines multiple indices into one or more bigger indices.
75 | */ 76 | bool classic_combine( 77 | const fs::path& in_dir, const fs::path& out_dir, fs::path& result_file, 78 | uint64_t mem_bytes, size_t num_threads, bool keep_temporary); 79 | 80 | /*! 81 | * Constructs a classic index filled with random data. 82 | */ 83 | void classic_construct_random( 84 | const fs::path& out_file, uint64_t signature_size, 85 | uint64_t num_documents, size_t document_size, 86 | uint64_t num_hashes, size_t seed); 87 | 88 | } // namespace cobs 89 | 90 | #endif // !COBS_CONSTRUCTION_CLASSIC_INDEX_HEADER 91 | 92 | /******************************************************************************/ 93 | -------------------------------------------------------------------------------- /cobs/construction/compact_index.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/construction/compact_index.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_CONSTRUCTION_COMPACT_INDEX_HEADER 10 | #define COBS_CONSTRUCTION_COMPACT_INDEX_HEADER 11 | 12 | #include 13 | #include 14 | 15 | /*! 16 | * The compact Inverted Signature Index with the space-saving improvements. 17 | * This namespace provides methods for creation of this index. It can either be 18 | * constructed from existing documents or with random dummy data for performance 19 | * testing purposes. The index uses different signature sizes to minimize space 20 | * wastage. 21 | */ 22 | namespace cobs { 23 | //! Parameters for compact index construction. 24 | struct CompactIndexParameters { 25 | //! length of terms / k-mers 26 | unsigned term_size = 31; 27 | //! canonicalization flag for base pairs 28 | uint8_t canonicalize = 1; 29 | //! number of hash functions, provided by user 30 | unsigned num_hashes = 1; 31 | //!
false positive rate, provided by user 32 | double false_positive_rate = 0.3; 33 | //! page or block size of filters with a common false positive rate 34 | uint64_t page_size = 0; 35 | //! memory to use, in bytes, to create the index 36 | uint64_t mem_bytes = get_memory_size(80); 37 | //! number of threads to use 38 | size_t num_threads = gopt_threads; 39 | //! clobber: erase the output directory if it exists, default: false 40 | bool clobber = false; 41 | //! continue in existing output directory, default: false 42 | bool continue_ = false; 43 | //! keep temporary files during construction, default: false 44 | bool keep_temporary = false; 45 | }; 46 | 47 | /*! 48 | * Constructs the folders used by the cobs::compact_index::construct. Sorts the 49 | * documents by file size and then splits them into several directories. 50 | */ 51 | void compact_construct( 52 | DocumentList doc_list, const fs::path& index_dir, 53 | fs::path tmp_path, CompactIndexParameters index_params); 54 | 55 | void compact_combine_into_compact( 56 | const fs::path& in_dir, const fs::path& out_file, 57 | uint64_t page_size = get_page_size(), 58 | uint64_t memory = get_memory_size(80), 59 | bool keep_temporary = false); 60 | 61 | } // namespace cobs 62 | 63 | #endif // !COBS_CONSTRUCTION_COMPACT_INDEX_HEADER 64 | 65 | /******************************************************************************/ 66 | -------------------------------------------------------------------------------- /cobs/cortex_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/cortex_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018-2020 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file.
8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_CORTEX_FILE_HEADER 11 | #define COBS_CORTEX_FILE_HEADER 12 | 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | namespace cobs { 30 | //! Reader for McCortex (.ctx) files: parses the header and iterates over the stored k-mers. 31 | class CortexFile 32 | { 33 | public: 34 | CortexFile(std::string path) { 35 | is_.open(path, std::ios::in | std::ios::binary); 36 | die_unless(is_.good()); 37 | read_header(is_, path); 38 | } 39 | //! read one value of Type from the stream as raw bytes (zero-initialized if the read fails) 40 | template <typename Type> 41 | static inline Type cast_advance(std::istream& is) { 42 | Type t{}; 43 | is.read(reinterpret_cast<char*>(&t), sizeof(t)); 44 | return t; 45 | } 46 | //! consume the magic word "CORTEX" from the stream, throws std::invalid_argument on mismatch 47 | static void check_magic_number(std::istream& is, std::string path) { 48 | std::string magic_word = "CORTEX"; 49 | for (size_t i = 0; i < magic_word.size(); i++) { 50 | if (is.get() != magic_word[i]) { 51 | throw std::invalid_argument( 52 | "CortexFile: magic number not found @ " + path); 53 | } 54 | } 55 | } 56 | //! parse the .ctx header (version 6, exactly one color) and record the byte range of the k-mer data 57 | void read_header(std::istream& is, std::string path) { 58 | check_magic_number(is, path); 59 | version_ = cast_advance<uint32_t>(is); 60 | if (version_ != 6) 61 | die("Invalid .ctx file version (" << version_ << "), must be 6"); 62 | 63 | kmer_size_ = cast_advance<uint32_t>(is); 64 | num_words_per_kmer_ = cast_advance<uint32_t>(is); 65 | num_colors_ = cast_advance<uint32_t>(is); 66 | if (num_colors_ != 1) 67 | die("Invalid number of colors (" << num_colors_ << "), must be 1"); 68 | 69 | for (size_t i = 0; i < num_colors_; i++) { 70 | uint32_t mean_read_length = cast_advance<uint32_t>(is); 71 | uint64_t total_length = cast_advance<uint64_t>(is); 72 | tlx::unused(mean_read_length, total_length); 73 | } 74 | for (size_t i = 0; i < num_colors_; i++) { 75 | auto document_name_length = cast_advance<uint32_t>(is); 76 | name_.resize(document_name_length); 77 | is.read(name_.data(), document_name_length); 78 | } 79 | is.ignore(16 * num_colors_); 80 | for (size_t i = 0; i < num_colors_; i++) { 81 | is.ignore(12); 82 | auto
length_graph_name = cast_advance<uint32_t>(is); 83 | is.ignore(length_graph_name); 84 | } 85 | check_magic_number(is, path); 86 | 87 | LOG0 << "CortexFile::read_header()" 88 | << " version_=" << version_ 89 | << " kmer_size_=" << kmer_size_ 90 | << " num_words_per_kmer_=" << num_words_per_kmer_ 91 | << " num_colors_=" << num_colors_; 92 | 93 | pos_data_begin_ = is.tellg(); 94 | is.seekg(0, std::ios::end); 95 | pos_data_end_ = is.tellg(); 96 | } 97 | //! number of k-mer records in the data section (8 bytes per word plus 5 bytes per color each) 98 | size_t num_kmers() const { 99 | return (pos_data_end_ - pos_data_begin_) 100 | / (8 * num_words_per_kmer_ + 5 * num_colors_); 101 | } 102 | //! decode each stored k-mer to text and invoke callback on every substring of length term_size 103 | template <typename Callback> 104 | void process_terms(size_t term_size, Callback callback) { 105 | std::string kmer(kmer_size_, 0); 106 | const size_t kmer_packed_size = (kmer_size_ + 3) / 4; // per instance, must not be static 107 | 108 | // bytes per k-mer from the number of 64-bit words per k-mer 109 | size_t bytes_per_kmer = sizeof(uint64_t) * num_words_per_kmer_; 110 | die_unless(bytes_per_kmer >= kmer_packed_size); 111 | 112 | std::vector<uint8_t> kmer_data(bytes_per_kmer); 113 | 114 | is_.clear(); 115 | is_.seekg(pos_data_begin_); 116 | 117 | size_t r = num_kmers(); 118 | while (r != 0) { 119 | --r; 120 | if (!is_.good()) 121 | die("corrupted .ctx file"); 122 | 123 | // read k-mer data bytes 124 | is_.read(reinterpret_cast<char*>(kmer_data.data()), bytes_per_kmer); 125 | // skip color information 126 | is_.ignore(5 * num_colors_); 127 | 128 | // from KMer::to_string() 129 | kmer.clear(); 130 | for (size_t i = 0; i < kmer_packed_size; ++i) { 131 | if (TLX_UNLIKELY(i == 0 && kmer_size_ % 4 != 0)) { 132 | // fragment of last k-mer 133 | kmer += kmer_byte_to_base_pairs[ 134 | kmer_data[kmer_packed_size - 1 - i]] + (4 - kmer_size_ % 4); 135 | } 136 | else { 137 | kmer += kmer_byte_to_base_pairs[ 138 | kmer_data[kmer_packed_size - 1 - i]]; 139 | } 140 | } 141 | 142 | for (size_t i = 0; i + term_size <= kmer_size_; ++i) { 143 | callback(tlx::string_view(kmer.data() + i, term_size)); 144 | } 145 | } 146 | } 147 | 148 | //!
version number 149 | uint32_t version_; 150 | //! kmer size 151 | uint32_t kmer_size_; 152 | //! number of uint64_t (64 bit words) encoding a kmer 153 | uint32_t num_words_per_kmer_; 154 | //! number of colours 155 | uint32_t num_colors_; 156 | //! document name read from the header 157 | std::string name_; 158 | 159 | private: 160 | std::ifstream is_; 161 | std::istream::pos_type pos_data_begin_, pos_data_end_; 162 | }; 163 | 164 | } // namespace cobs 165 | 166 | #endif // !COBS_CORTEX_FILE_HEADER 167 | 168 | /******************************************************************************/ 169 | -------------------------------------------------------------------------------- /cobs/document_list.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/document_list.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file.
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | namespace cobs { 14 | 15 | FileType StringToFileType(std::string& s) { 16 | tlx::to_lower(&s); 17 | if (s == "any" || s == "*") 18 | return FileType::Any; 19 | if (s == "text" || s == "txt") 20 | return FileType::Text; 21 | if (s == "cortex" || s == "ctx") 22 | return FileType::Cortex; 23 | if (s == "cobs" || s == "cobs_doc") 24 | return FileType::KMerBuffer; 25 | if (s == "fasta") 26 | return FileType::Fasta; 27 | if (s == "fastq") 28 | return FileType::Fastq; 29 | if (s == "list") 30 | return FileType::List; 31 | die("Unknown file type " << s); 32 | } 33 | 34 | } // namespace cobs 35 | 36 | /******************************************************************************/ 37 | -------------------------------------------------------------------------------- /cobs/fasta_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/fasta_file.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FASTA_FILE_HEADER 10 | #define COBS_FASTA_FILE_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | namespace cobs { 30 | 31 | class FastaFile 32 | { 33 | public: 34 | FastaFile(std::string path, bool use_cache = true) : path_(path) { 35 | is_.open(path); 36 | die_unless(is_.good()); 37 | 38 | if (!use_cache || gopt_disable_cache) { 39 | compute_index(); 40 | } 41 | else if (read_cache_file()) { 42 | // all good. 
43 | } 44 | else { 45 | compute_index(); 46 | write_cache_file(); 47 | } 48 | } 49 | 50 | //! return index cache file path 51 | std::string cache_path() const { 52 | return path_ + ".cobs_cache"; 53 | } 54 | 55 | //! read complete FASTA file for sub-documents 56 | void compute_index(std::istream& is) { 57 | LOGC(!gopt_disable_cache) 58 | << "FastaFile: computing index for " << path_; 59 | 60 | std::string line; 61 | size_t sequence_size = 0; 62 | sequence_count_ = 0; 63 | size_ = 0; 64 | 65 | std::getline(is, line); 66 | if (is.eof()) return; 67 | die_unless(is.good()); 68 | 69 | if (line.size() == 0 || (line[0] != '>' && line[0] != ';')) 70 | die("FastaFile: file does not start with > or ; - " << path_); 71 | size_ += line.size() + 1; 72 | 73 | while (std::getline(is, line)) { 74 | size_ += line.size() + 1; 75 | if (line.size() == 0 || line[0] == '>' || line[0] == ';') { 76 | // comment or empty line restart the term buffer 77 | if (sequence_size != 0) { 78 | sequence_size_hist_[sequence_size]++; 79 | sequence_count_++; 80 | } 81 | sequence_size = 0; 82 | continue; 83 | } 84 | sequence_size += line.size(); 85 | } 86 | if (sequence_size != 0) { 87 | sequence_size_hist_[sequence_size]++; 88 | sequence_count_++; 89 | } 90 | } 91 | 92 | //! read complete FASTA file for sub-documents 93 | void compute_index() { 94 | is_.clear(); 95 | is_.seekg(0); 96 | 97 | if (tlx::ends_with(path_, ".gz")) { 98 | zip_istream zis(is_); 99 | return compute_index(zis); 100 | } 101 | else { 102 | return compute_index(is_); 103 | } 104 | } 105 | 106 | //! 
write cache file 107 | void write_cache_file() { 108 | std::ofstream os(cache_path() + ".tmp"); 109 | stream_put_pod(os, size_); 110 | stream_put_pod(os, sequence_count_); 111 | stream_put_pod(os, sequence_size_hist_.size()); 112 | for (auto it = sequence_size_hist_.begin(); 113 | it != sequence_size_hist_.end(); ++it) { 114 | stream_put_pod(os, it->first); 115 | stream_put_pod(os, it->second); 116 | } 117 | fs::rename(cache_path() + ".tmp", 118 | cache_path()); 119 | LOG1 << "FastaFile: saved index as " << cache_path(); 120 | } 121 | 122 | //! read cache file 123 | bool read_cache_file() { 124 | std::ifstream is(cache_path()); 125 | if (!is.good()) return false; 126 | size_t hist_size; 127 | stream_get_pod(is, size_); 128 | stream_get_pod(is, sequence_count_); 129 | stream_get_pod(is, hist_size); 130 | LOG1 << "FastaFile: loading index " << cache_path() 131 | << " [" << sequence_count_ << " subsequences]"; 132 | for (size_t i = 0; i < hist_size; ++i) { 133 | size_t size, count; 134 | stream_get_pod(is, size); 135 | stream_get_pod(is, count); 136 | sequence_size_hist_[size] = count; 137 | } 138 | return is.good() && (is.get() == EOF); 139 | } 140 | 141 | //! return estimated size of a fasta document 142 | size_t size() { 143 | return size_; 144 | } 145 | 146 | //! return number of q-grams in document 147 | size_t num_terms(size_t q) { 148 | size_t total = 0; 149 | for (const auto& p : sequence_size_hist_) { 150 | total += p.second * (p.first < q ? 
0 : p.first - q + 1); 151 | } 152 | return total; 153 | } 154 | 155 | template 156 | void process_terms(std::istream& is, size_t term_size, Callback callback) { 157 | std::string line; 158 | size_t pos = 0; 159 | 160 | while (tlx::appendline(is, line)) { 161 | if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { 162 | // comment or empty line restart the term buffer 163 | line.clear(); 164 | continue; 165 | } 166 | 167 | // process terms continued on next line 168 | for (size_t i = 0; i + term_size <= line.size(); ++i) { 169 | callback(tlx::string_view(line.data() + i, term_size)); 170 | } 171 | if (line.size() > term_size - 1) { 172 | std::copy(line.data() + line.size() - (term_size - 1), 173 | line.data() + line.size(), 174 | line.data()); 175 | line.resize(term_size - 1); 176 | pos = line.size(); 177 | } 178 | else { 179 | pos = 0; 180 | } 181 | } 182 | } 183 | 184 | template 185 | void process_terms(size_t term_size, Callback callback) { 186 | is_.clear(); 187 | is_.seekg(0); 188 | die_unless(is_.good()); 189 | 190 | if (tlx::ends_with(path_, ".gz")) { 191 | zip_istream zis(is_); 192 | return process_terms(zis, term_size, callback); 193 | } 194 | else { 195 | return process_terms(is_, term_size, callback); 196 | } 197 | } 198 | 199 | private: 200 | //! file stream 201 | std::ifstream is_; 202 | //! path 203 | std::string path_; 204 | //! size in bytes 205 | size_t size_; 206 | //! number of sub-sequences 207 | size_t sequence_count_; 208 | //! 
histogram of sub-sequence sizes 209 | std::map sequence_size_hist_; 210 | }; 211 | 212 | } // namespace cobs 213 | 214 | #endif // !COBS_FASTA_FILE_HEADER 215 | 216 | /******************************************************************************/ 217 | -------------------------------------------------------------------------------- /cobs/fasta_multifile.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/fasta_multifile.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | namespace cobs { 14 | 15 | ThreadObjectLRUSet FastaMultifile::lru_set_ { 16 | std::thread::hardware_concurrency()* 4 17 | }; 18 | 19 | FastaIndexCache FastaMultifile::cache_; 20 | 21 | } // namespace cobs 22 | 23 | /******************************************************************************/ 24 | -------------------------------------------------------------------------------- /cobs/fastq_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/fastq_file.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FASTQ_FILE_HEADER 10 | #define COBS_FASTQ_FILE_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | 28 | namespace cobs { 29 | 30 | class FastqFile 31 | { 32 | public: 33 | FastqFile(std::string path, bool use_cache = true) : path_(path) { 34 | is_.open(path); 35 | die_unless(is_.good()); 36 | 37 | if (!use_cache || gopt_disable_cache) { 38 | compute_index(); 39 | } 40 | else if (read_cache_file()) { 41 | // all good. 42 | } 43 | else { 44 | compute_index(); 45 | write_cache_file(); 46 | } 47 | } 48 | 49 | //! return index cache file path 50 | std::string cache_path() const { 51 | return path_ + ".cobs_cache"; 52 | } 53 | 54 | //! read complete FASTQ file for sub-documents 55 | void compute_index(std::istream& is) { 56 | LOGC(!gopt_disable_cache) 57 | << "FastqFile: computing index for " << path_; 58 | 59 | std::string line; 60 | sequence_count_ = 0; 61 | size_ = 0; 62 | 63 | size_t line_num = 0; 64 | while (std::getline(is, line)) { 65 | size_ += line.size() + 1; 66 | 67 | if (line_num % 4 == 0) { 68 | if (line.size() == 0 || line[0] != '@') { 69 | die("FastqFile: line " << line_num << 70 | " does not start with @ - " << path_); 71 | } 72 | } 73 | else if (line_num % 4 == 1) { 74 | // sequence/read line 75 | size_t sequence_size = line.size(); 76 | sequence_size_hist_[sequence_size]++; 77 | sequence_count_++; 78 | } 79 | else if (line_num % 4 == 2) { 80 | if (line.size() == 0 || line[0] != '+') { 81 | die("FastqFile: line " << line_num << 82 | " does not start with + - " << path_); 83 | } 84 | } 85 | else if (line_num % 4 == 3) { 86 | // don't care about quality 87 | } 88 | ++line_num; 89 | } 90 | } 91 | 92 | //! 
read complete FASTQ file for sub-documents 93 | void compute_index() { 94 | is_.clear(); 95 | is_.seekg(0); 96 | 97 | if (tlx::ends_with(path_, ".gz")) { 98 | zip_istream zis(is_); 99 | return compute_index(zis); 100 | } 101 | else { 102 | return compute_index(is_); 103 | } 104 | } 105 | 106 | //! write cache file 107 | void write_cache_file() { 108 | std::ofstream os(cache_path() + ".tmp"); 109 | stream_put_pod(os, size_); 110 | stream_put_pod(os, sequence_count_); 111 | stream_put_pod(os, sequence_size_hist_.size()); 112 | for (auto it = sequence_size_hist_.begin(); 113 | it != sequence_size_hist_.end(); ++it) { 114 | stream_put_pod(os, it->first); 115 | stream_put_pod(os, it->second); 116 | } 117 | fs::rename(cache_path() + ".tmp", 118 | cache_path()); 119 | LOG1 << "FastqFile: saved index as " << cache_path(); 120 | } 121 | 122 | //! read cache file 123 | bool read_cache_file() { 124 | std::ifstream is(cache_path()); 125 | if (!is.good()) return false; 126 | size_t hist_size; 127 | stream_get_pod(is, size_); 128 | stream_get_pod(is, sequence_count_); 129 | stream_get_pod(is, hist_size); 130 | LOG1 << "FastqFile: loading index " << cache_path() 131 | << " [" << sequence_count_ << " subsequences]"; 132 | for (size_t i = 0; i < hist_size; ++i) { 133 | size_t size, count; 134 | stream_get_pod(is, size); 135 | stream_get_pod(is, count); 136 | sequence_size_hist_[size] = count; 137 | } 138 | return is.good() && (is.get() == EOF); 139 | } 140 | 141 | //! return estimated size of a fastq document 142 | size_t size() { 143 | return size_; 144 | } 145 | 146 | //! return number of q-grams in document 147 | size_t num_terms(size_t q) { 148 | size_t total = 0; 149 | for (const auto& p : sequence_size_hist_) { 150 | total += p.second * (p.first < q ? 
0 : p.first - q + 1); 151 | } 152 | return total; 153 | } 154 | 155 | template 156 | void process_terms(std::istream& is, size_t term_size, Callback callback) { 157 | std::string line; 158 | 159 | size_t line_num = 0; 160 | while (std::getline(is, line)) { 161 | if (line_num % 4 == 0) { 162 | if (line.size() == 0 || line[0] != '@') { 163 | die("FastqFile: line " << line_num << 164 | " does not start with @ - " << path_); 165 | } 166 | } 167 | else if (line_num % 4 == 1) { 168 | // process terms in sequence/read line. 169 | for (size_t i = 0; i + term_size <= line.size(); ++i) { 170 | callback(tlx::string_view(line.data() + i, term_size)); 171 | } 172 | } 173 | else if (line_num % 4 == 2) { 174 | if (line.size() == 0 || line[0] != '+') { 175 | die("FastqFile: line " << line_num << 176 | " does not start with + - " << path_); 177 | } 178 | } 179 | else if (line_num % 4 == 3) { 180 | // don't care about quality 181 | } 182 | ++line_num; 183 | } 184 | } 185 | 186 | template 187 | void process_terms(size_t term_size, Callback callback) { 188 | is_.clear(); 189 | is_.seekg(0); 190 | die_unless(is_.good()); 191 | 192 | if (tlx::ends_with(path_, ".gz")) { 193 | zip_istream zis(is_); 194 | return process_terms(zis, term_size, callback); 195 | } 196 | else { 197 | return process_terms(is_, term_size, callback); 198 | } 199 | } 200 | 201 | private: 202 | //! file stream 203 | std::ifstream is_; 204 | //! path 205 | std::string path_; 206 | //! size in bytes 207 | size_t size_; 208 | //! number of sub-sequences 209 | size_t sequence_count_; 210 | //! 
histogram of sub-sequence sizes 211 | std::map sequence_size_hist_; 212 | }; 213 | 214 | } // namespace cobs 215 | 216 | #endif // !COBS_FASTQ_FILE_HEADER 217 | 218 | /******************************************************************************/ 219 | -------------------------------------------------------------------------------- /cobs/file/classic_index_header.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/classic_index_header.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | 12 | namespace cobs { 13 | 14 | const std::string ClassicIndexHeader::magic_word = "CLASSIC_INDEX"; 15 | const uint32_t ClassicIndexHeader::version = 1; 16 | const std::string ClassicIndexHeader::file_extension = ".cobs_classic"; 17 | 18 | uint64_t ClassicIndexHeader::row_bits() const { 19 | return file_names_.size(); 20 | } 21 | 22 | uint64_t ClassicIndexHeader::row_size() const { 23 | return (file_names_.size() + 7) / 8; 24 | } 25 | 26 | void ClassicIndexHeader::serialize(std::ostream& os) const { 27 | serialize_magic_begin(os, magic_word, version); 28 | 29 | stream_put(os, term_size_, canonicalize_, 30 | (uint32_t)file_names_.size(), signature_size_, num_hashes_); 31 | for (const auto& file_name : file_names_) { 32 | os << file_name << std::endl; 33 | } 34 | 35 | serialize_magic_end(os, magic_word); 36 | } 37 | 38 | void ClassicIndexHeader::deserialize(std::istream& is) { 39 | deserialize_magic_begin(is, magic_word, version); 40 | 41 | uint32_t file_names_size; 42 | stream_get(is, term_size_, canonicalize_, 43 | file_names_size, signature_size_, num_hashes_); 44 | file_names_.resize(file_names_size); 45 | for (auto& file_name : file_names_) { 46 | 
std::getline(is, file_name); 47 | } 48 | 49 | deserialize_magic_end(is, magic_word); 50 | } 51 | 52 | void ClassicIndexHeader::write_file(std::ostream& os, 53 | const std::vector& data) { 54 | os.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 55 | serialize(os); 56 | os.write(reinterpret_cast(data.data()), data.size()); 57 | } 58 | 59 | void ClassicIndexHeader::write_file(const fs::path& p, 60 | const std::vector& data) { 61 | if (!p.parent_path().empty()) 62 | fs::create_directories(p.parent_path()); 63 | std::ofstream ofs(p.string(), std::ios::out | std::ios::binary); 64 | write_file(ofs, data); 65 | } 66 | 67 | void ClassicIndexHeader::read_file(std::istream& is, 68 | std::vector& data) { 69 | is.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 70 | deserialize(is); 71 | size_t size = get_stream_size(is); 72 | data.resize(size); 73 | is.read(reinterpret_cast(data.data()), size); 74 | } 75 | 76 | void ClassicIndexHeader::read_file(const fs::path& p, 77 | std::vector& data) { 78 | std::ifstream ifs(p.string(), std::ios::in | std::ios::binary); 79 | read_file(ifs, data); 80 | } 81 | 82 | } // namespace cobs 83 | 84 | /******************************************************************************/ 85 | -------------------------------------------------------------------------------- /cobs/file/classic_index_header.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/classic_index_header.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FILE_CLASSIC_INDEX_HEADER_HEADER 10 | #define COBS_FILE_CLASSIC_INDEX_HEADER_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | 16 | class ClassicIndexHeader 17 | { 18 | public: 19 | //! k-mer or q-gram size = term size 20 | uint32_t term_size_; 21 | //! 0 = don't modify k-mers, 1 = canonicalize 22 | uint8_t canonicalize_; 23 | //! size of Bloom filters in bits = number of rows of matrix 24 | uint64_t signature_size_; 25 | //! number of hashes per term, usually 1 26 | uint64_t num_hashes_; 27 | //! list of document file names 28 | std::vector file_names_; 29 | 30 | public: 31 | static const std::string magic_word; 32 | static const uint32_t version; 33 | static const std::string file_extension; 34 | 35 | ClassicIndexHeader() = default; 36 | 37 | //! number of bits in a row, which is the number of documents 38 | uint64_t row_bits() const; 39 | //! number of bytes in a row, number of documents rounded up to bytes. 
40 | uint64_t row_size() const; 41 | 42 | void serialize(std::ostream& os) const; 43 | void deserialize(std::istream& is); 44 | 45 | void write_file(std::ostream& os, const std::vector& data); 46 | void write_file(const fs::path& p, const std::vector& data); 47 | 48 | void read_file(std::istream& is, std::vector& data); 49 | void read_file(const fs::path& p, std::vector& data); 50 | }; 51 | 52 | } // namespace cobs 53 | 54 | #endif // !COBS_FILE_CLASSIC_INDEX_HEADER_HEADER 55 | 56 | /******************************************************************************/ 57 | -------------------------------------------------------------------------------- /cobs/file/compact_index_header.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/compact_index_header.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | namespace cobs { 12 | 13 | const std::string CompactIndexHeader::magic_word = "COMPACT_INDEX"; 14 | const uint32_t CompactIndexHeader::version = 1; 15 | const std::string CompactIndexHeader::file_extension = ".cobs_compact"; 16 | 17 | CompactIndexHeader::CompactIndexHeader(uint64_t page_size) 18 | : page_size_(page_size) { } 19 | 20 | size_t CompactIndexHeader::padding_size(uint64_t curr_stream_pos) const { 21 | return (page_size_ - ((curr_stream_pos + CompactIndexHeader::magic_word.size()) % page_size_)) % page_size_; 22 | } 23 | 24 | void CompactIndexHeader::serialize(std::ostream& os) const { 25 | serialize_magic_begin(os, magic_word, version); 26 | 27 | stream_put(os, term_size_, canonicalize_, 28 | (uint32_t)parameters_.size(), (uint32_t)file_names_.size(), 29 | page_size_); 30 | os.flush(); 31 | for (const auto& p : parameters_) { 32 | cobs::stream_put(os, p.signature_size, p.num_hashes); 33 | } 34 | for (const auto& file_name : file_names_) { 35 | os << file_name << std::endl; 36 | } 37 | 38 | std::vector padding(padding_size(os.tellp())); 39 | os.write(padding.data(), padding.size()); 40 | 41 | serialize_magic_end(os, magic_word); 42 | } 43 | 44 | void CompactIndexHeader::deserialize(std::istream& is) { 45 | deserialize_magic_begin(is, magic_word, version); 46 | 47 | uint32_t parameters_size; 48 | uint32_t file_names_size; 49 | stream_get(is, term_size_, canonicalize_, 50 | parameters_size, file_names_size, page_size_); 51 | parameters_.resize(parameters_size); 52 | for (auto& p : parameters_) { 53 | stream_get(is, p.signature_size, p.num_hashes); 54 | } 55 | 56 | file_names_.resize(file_names_size); 57 | for (auto& file_name : file_names_) { 58 | std::getline(is, file_name); 59 | } 60 | 61 | StreamPos sp = get_stream_pos(is); 62 | is.seekg(sp.curr_pos + padding_size(sp.curr_pos), std::ios::beg); 63 | 64 | deserialize_magic_end(is, magic_word); 65 
| } 66 | 67 | void CompactIndexHeader::read_file(std::istream& is, 68 | std::vector >& data) { 69 | is.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 70 | deserialize(is); 71 | data.clear(); 72 | data.resize(parameters_.size()); 73 | for (size_t i = 0; i < parameters_.size(); i++) { 74 | size_t data_size = page_size_ * parameters_[i].signature_size; 75 | std::vector d(data_size); 76 | is.read(reinterpret_cast(d.data()), data_size); 77 | data[i] = std::move(d); 78 | } 79 | } 80 | 81 | void CompactIndexHeader::read_file(const fs::path& p, 82 | std::vector >& data) { 83 | std::ifstream ifs(p.string(), std::ios::in | std::ios::binary); 84 | read_file(ifs, data); 85 | } 86 | 87 | } // namespace cobs 88 | 89 | /******************************************************************************/ 90 | -------------------------------------------------------------------------------- /cobs/file/compact_index_header.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/compact_index_header.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FILE_COMPACT_INDEX_HEADER_HEADER 10 | #define COBS_FILE_COMPACT_INDEX_HEADER_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | 16 | class CompactIndexHeader 17 | { 18 | public: 19 | struct parameter { 20 | uint64_t signature_size; 21 | uint64_t num_hashes; 22 | }; 23 | 24 | //! k-mer or q-gram size = term size 25 | uint32_t term_size_; 26 | //! 0 = don't modify k-mers, 1 = canonicalize 27 | uint8_t canonicalize_; 28 | //! parameters of subindices 29 | std::vector parameters_; 30 | //! list of document file names 31 | std::vector file_names_; 32 | //! 
size of each subindex in bytes 33 | uint64_t page_size_; 34 | 35 | size_t padding_size(uint64_t curr_stream_pos) const; 36 | 37 | public: 38 | static const std::string magic_word; 39 | static const uint32_t version; 40 | static const std::string file_extension; 41 | 42 | explicit CompactIndexHeader(uint64_t page_size = 4096); 43 | 44 | void serialize(std::ostream& os) const; 45 | void deserialize(std::istream& is); 46 | 47 | void read_file(std::istream& is, std::vector >& data); 48 | void read_file(const fs::path& p, std::vector >& data); 49 | }; 50 | 51 | } // namespace cobs 52 | 53 | #endif // !COBS_FILE_COMPACT_INDEX_HEADER_HEADER 54 | 55 | /******************************************************************************/ 56 | -------------------------------------------------------------------------------- /cobs/file/file_io_exception.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/file_io_exception.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
/*!
 * Exception signalling a file input/output problem. The message is kept in
 * the object itself so that it can be inspected, and modified, through
 * message(); what() always reflects the current message.
 */
class FileIOException : public std::runtime_error
{
private:
    //! current message, returned by what()
    std::string msg_;

public:
    //! construct with the given message
    explicit FileIOException(const std::string& msg)
        : std::runtime_error(msg), msg_(msg) { }

    //! current message as C string, valid as long as the exception lives and
    //! the message is not modified
    const char * what() const noexcept override {
        return msg_.c_str();
    }

    //! mutable access to the message
    std::string& message() {
        return msg_;
    }

    //! read-only access, usable on exceptions caught by const reference
    const std::string& message() const {
        return msg_;
    }
};
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FILE_HEADER_HEADER 10 | #define COBS_FILE_HEADER_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | namespace cobs { 21 | 22 | static inline 23 | void check_magic_word(std::istream& is, const std::string& magic_word) { 24 | std::vector mw_v(magic_word.size(), ' '); 25 | is.read(mw_v.data(), magic_word.size()); 26 | std::string mw(mw_v.data(), mw_v.size()); 27 | assert_throw(mw == magic_word, "invalid file type"); 28 | assert_throw(is.good(), "input filestream broken"); 29 | } 30 | 31 | static inline 32 | void serialize_magic_begin( 33 | std::ostream& os, const std::string& magic_word, const uint32_t& version) { 34 | os << "COBS:"; 35 | os << magic_word; 36 | stream_put(os, version); 37 | } 38 | 39 | static inline 40 | void serialize_magic_end( 41 | std::ostream& os, const std::string& magic_word) { 42 | os << magic_word; 43 | } 44 | 45 | static inline 46 | void deserialize_magic_begin( 47 | std::istream& is, const std::string& magic_word, const uint32_t& version) { 48 | check_magic_word(is, "COBS:"); 49 | check_magic_word(is, magic_word); 50 | uint32_t v; 51 | stream_get(is, v); 52 | assert_throw(v == version, "invalid file version"); 53 | } 54 | 55 | static inline 56 | void deserialize_magic_end( 57 | std::istream& is, const std::string& magic_word) { 58 | check_magic_word(is, magic_word); 59 | } 60 | 61 | } // namespace cobs 62 | 63 | #endif // !COBS_FILE_HEADER_HEADER 64 | 65 | /******************************************************************************/ 66 | -------------------------------------------------------------------------------- /cobs/file/kmer_buffer_header.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/kmer_buffer_header.cpp 3 | * 4 | * Copyright 
(c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | 12 | namespace cobs { 13 | 14 | const std::string KMerBufferHeader::magic_word = "DOCUMENT"; 15 | const uint32_t KMerBufferHeader::version = 1; 16 | const std::string KMerBufferHeader::file_extension = ".cobs_doc"; 17 | 18 | KMerBufferHeader::KMerBufferHeader(std::string name, uint32_t kmer_size) 19 | : name_(name), kmer_size_(kmer_size) { } 20 | 21 | void KMerBufferHeader::serialize(std::ostream& os) const { 22 | serialize_magic_begin(os, magic_word, version); 23 | 24 | stream_put(os, kmer_size_); 25 | os << name_ << '\0'; 26 | 27 | serialize_magic_end(os, magic_word); 28 | } 29 | 30 | void KMerBufferHeader::deserialize(std::istream& is) { 31 | deserialize_magic_begin(is, magic_word, version); 32 | 33 | stream_get(is, kmer_size_); 34 | std::getline(is, name_, '\0'); 35 | 36 | deserialize_magic_end(is, magic_word); 37 | } 38 | 39 | std::string KMerBufferHeader::name() const { 40 | return name_; 41 | } 42 | 43 | uint32_t KMerBufferHeader::kmer_size() const { 44 | return kmer_size_; 45 | } 46 | 47 | } // namespace cobs 48 | 49 | /******************************************************************************/ 50 | -------------------------------------------------------------------------------- /cobs/file/kmer_buffer_header.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/file/kmer_buffer_header.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_FILE_KMER_BUFFER_HEADER_HEADER 10 | #define COBS_FILE_KMER_BUFFER_HEADER_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | /** Header record of a k-mer buffer file: a name and a k-mer size, with stream serialization and deserialization. */ 16 | class KMerBufferHeader 17 | { 18 | private: 19 | std::string name_; 20 | uint32_t kmer_size_; 21 | 22 | public: 23 | static const std::string magic_word; 24 | static const uint32_t version; 25 | static const std::string file_extension; 26 | /* NOTE(review): the defaulted constructor leaves kmer_size_ uninitialized until deserialize() fills it. */ 27 | KMerBufferHeader() = default; 28 | KMerBufferHeader(std::string name, uint32_t kmer_size); 29 | /* Stream round trip of the header fields. */ 30 | void serialize(std::ostream& os) const; 31 | void deserialize(std::istream& is); 32 | /* Accessors for the stored fields. */ 33 | std::string name() const; 34 | uint32_t kmer_size() const; 35 | }; 36 | 37 | } // namespace cobs 38 | 39 | #endif // !COBS_FILE_KMER_BUFFER_HEADER_HEADER 40 | 41 | /******************************************************************************/ 42 | -------------------------------------------------------------------------------- /cobs/kmer.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/kmer.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_KMER_HEADER 11 | #define COBS_KMER_HEADER 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | namespace cobs { 25 | /** Combines four chars into one uint32_t as (c1 << 24) | (c2 << 16) | (c3 << 8) | c4; init() uses it to build the lookup key for a partially filled group. */ 26 | static inline 27 | uint32_t chars_to_int(char c1, char c2, char c3, char c4) { 28 | return uint32_t(0) | (c1 << 24) | (c2 << 16) | (c3 << 8) | c4; 29 | } 30 | /** Lookup tables defined elsewhere: byte -> base pair text, packed four-char key -> byte, and a per-byte table used by mirror(). */ 31 | extern const char* kmer_byte_to_base_pairs[256]; 32 | extern const std::unordered_map kmer_bps_to_uint8_t; 33 | extern uint8_t kmer_mirror_pairs[256]; 34 | /** K-mer of N base pairs stored in (N + 3) / 4 bytes, two bits per base pair (see at()). NOTE(review): the template parameter list and the std::array arguments are missing from this listing. */ 35 | template 36 | class KMer : public std::array 37 | { 38 | public: 39 | static const size_t size = (N + 3) / 4; 40 | 41 | using Super = std::array; 42 | using Super::data; 43 | 44 | public: 45 | KMer() { 46 | static_assert(sizeof(KMer) == KMer::size); 47 | } 48 | 49 | explicit KMer(const char* chars) { 50 | static_assert(sizeof(KMer) == KMer::size); 51 | init(chars); 52 | } 53 | /** Encodes N characters from chars. Groups of four are looked up in kmer_bps_to_uint8_t, with the last four characters going to data()[0]. When N is not a multiple of 4, the leading N % 4 characters are padded with 'A' and stored in data()[size - 1]. NOTE(review): full groups are read through a uint32_t* cast, so the key depends on host byte order and tolerates unaligned access - confirm. */ 54 | void init(const char* chars) { 55 | for (int i = N - 4; i >= -3; i -= 4) { 56 | if (i >= 0) { 57 | data()[(N - (i + 4)) / 4] = 58 | kmer_bps_to_uint8_t.at(*((uint32_t*)(chars + i))); 59 | } 60 | else { 61 | char c2 = i < -1 ? 'A' : chars[i + 1]; 62 | char c3 = i < -2 ? 
'A' : chars[i + 2]; 63 | data()[size - 1] = kmer_bps_to_uint8_t.at( 64 | chars_to_int(chars[i + 3], c3, c2, 'A')); 65 | } 66 | } 67 | } 68 | /** Returns the k-mer as text by appending the table strings for data()[size - 1] down to data()[0]; for the first byte the leading 4 - N % 4 characters are skipped when N % 4 != 0. */ 69 | std::string string() const { 70 | std::string result; 71 | result.reserve(N); 72 | for (size_t i = 0; i < size; ++i) { 73 | if (TLX_UNLIKELY(i == 0 && N % 4 != 0)) { 74 | result += kmer_byte_to_base_pairs[ 75 | data()[size - 1 - i]] + (4 - N % 4); 76 | } 77 | else { 78 | result += kmer_byte_to_base_pairs[ 79 | data()[size - 1 - i]]; 80 | } 81 | } 82 | return result; 83 | } 84 | /** Same conversion as string(), but written into *out (cleared first); returns a reference to *out. */ 85 | std::string& to_string(std::string* out) const { 86 | out->clear(); 87 | for (size_t i = 0; i < size; ++i) { 88 | if (TLX_UNLIKELY(i == 0 && N % 4 != 0)) { 89 | *out += kmer_byte_to_base_pairs[ 90 | data()[size - 1 - i]] + (4 - N % 4); 91 | } 92 | else { 93 | *out += kmer_byte_to_base_pairs[ 94 | data()[size - 1 - i]]; 95 | } 96 | } 97 | return *out; 98 | } 99 | /** Writes string() to ostream. */ 100 | void print(std::ostream& ostream) const { 101 | ostream << string(); 102 | } 103 | /** Runtime-sized variant of init(): encodes kmer_size characters from chars into kmer_data, which must hold data_size(kmer_size) bytes. */ 104 | static void init(const char* chars, char* kmer_data, uint32_t kmer_size) { 105 | int kmer_size_int = kmer_size; 106 | for (int i = kmer_size_int - 4; i >= -3; i -= 4) { 107 | if (i >= 0) { 108 | kmer_data[(kmer_size_int - (i + 4)) / 4] = 109 | kmer_bps_to_uint8_t.at(*((uint32_t*)(chars + i))); 110 | } 111 | else { 112 | char c2 = i < -1 ? 'A' : chars[i + 1]; 113 | char c3 = i < -2 ? 
'A' : chars[i + 2]; 114 | kmer_data[data_size(kmer_size) - 1] = 115 | kmer_bps_to_uint8_t.at(chars_to_int(chars[i + 3], c3, c2, 'A')); 116 | } 117 | } 118 | } 119 | /** Number of bytes needed to store kmer_size base pairs (four per byte, rounded up). */ 120 | static uint32_t data_size(uint32_t kmer_size) { 121 | return (kmer_size + 3) / 4; 122 | } 123 | /** Orders k-mers by comparing their bytes from the last array element down to the first. */ 124 | bool operator < (const KMer& b) const { 125 | return std::lexicographical_compare( 126 | this->rbegin(), this->rend(), b.rbegin(), b.rend()); 127 | } 128 | /** Overwrites every byte with output of rng(): in steps of four bytes while at least four remain, then byte by byte. NOTE(review): the cast target type is missing from this listing. */ 129 | template 130 | void fill_random(RandomGenerator& rng) { 131 | size_t i = 0; 132 | for ( ; i + 3 < size; i += 4) { 133 | *reinterpret_cast(data() + i) = rng(); 134 | } 135 | for ( ; i < size; ++i) { 136 | data()[i] = rng(); 137 | } 138 | } 139 | /* NOTE(review): when N % 4 == 0 the offset added in at() is 4 rather than 0, which shifts the lookup by one whole byte - confirm whether such N are ever instantiated. */ 140 | //! return 0 (A), 1 (C), 2 (G), or 3 (T) letter at index 141 | uint8_t at(size_t index) const { 142 | assert(index < N); 143 | // skip unused bits at the end 144 | index += (4 - N % 4); 145 | return (data()[size - 1 - index / 4] >> (6 - 2 * (index % 4))) & 0x3; 146 | } 147 | /** Walks inwards from both ends while base i equals the complement (3 - value) of base r, then calls mirror() if base i is greater than the complement of base r. */ 148 | void canonicalize() { 149 | // base pair mirror_map map. A -> T, C -> G, G -> C, T -> A. 
150 | // static const uint8_t mirror_map[4] = { 3, 2, 1, 0 }; 151 | 152 | size_t i = 0, r = N - 1; 153 | while (at(i) == (3 - at(r)) && i < N / 2) 154 | ++i, --r; 155 | 156 | if (at(i) > (3 - at(r))) 157 | mirror(); 158 | } 159 | /** Rebuilds the byte array in reverse order through kmer_mirror_pairs, re-aligning across byte boundaries because the last byte holds only N % 4 base pairs. NOTE(review): the shift amounts appear to assume N % 4 != 0 - confirm the behaviour for multiples of 4. */ 160 | void mirror() { 161 | // reverse kmer base pairs into a buffer and copy back 162 | std::array buffer; 163 | 164 | // last byte contains only (N % 4) base pairs 165 | uint8_t overflow = 166 | static_cast(data()[size - 1]) << (2 * (4 - N % 4)); 167 | for (size_t i = 1; i < size; i++) { 168 | uint8_t bp = static_cast(data()[size - 1 - i]); 169 | overflow |= bp >> (2 * (N % 4)); 170 | buffer[i - 1] = kmer_mirror_pairs[overflow]; 171 | overflow = bp << (2 * (4 - N % 4)); 172 | } 173 | buffer[size - 1] = kmer_mirror_pairs[overflow]; 174 | 175 | std::copy(buffer.begin(), buffer.end(), this->begin()); 176 | } 177 | }; 178 | 179 | } // namespace cobs 180 | 181 | #endif // !COBS_KMER_HEADER 182 | 183 | /******************************************************************************/ 184 | -------------------------------------------------------------------------------- /cobs/kmer_buffer.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/kmer_buffer.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_KMER_BUFFER_HEADER 11 | #define COBS_KMER_BUFFER_HEADER 12 | 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | namespace cobs { 23 | /** Growable list of k-mers (m_data) with sorting and binary (de)serialization behind a KMerBufferHeader. NOTE(review): template parameter lists are missing from this listing. */ 24 | template 25 | class KMerBuffer 26 | { 27 | private: 28 | std::vector > m_data; 29 | 30 | public: 31 | void print(std::ostream& ostream) const; 32 | 33 | std::vector >& data() { 34 | return m_data; 35 | } 36 | const std::vector >& data() const { 37 | return m_data; 38 | } 39 | 40 | const KMer& operator [] (size_t i) const { 41 | return m_data[i]; 42 | } 43 | 44 | size_t num_kmers() const { 45 | return m_data.size(); 46 | } 47 | 48 | void sort_kmers() { 49 | std::sort(m_data.begin(), m_data.end()); 50 | } 51 | /** Enables stream exceptions on os, writes a KMerBufferHeader built from name and N, then the raw bytes of all k-mers. */ 52 | void serialize(std::ostream& os, const std::string& name) const { 53 | os.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 54 | KMerBufferHeader sh(name, N); 55 | sh.serialize(os); 56 | os.write(reinterpret_cast(m_data.data()), 57 | KMer::size* m_data.size()); 58 | } 59 | /** Creates the parent directories of p, opens p as a binary output file and serializes into it. */ 60 | void serialize(const fs::path& p, const std::string& name) const { 61 | fs::create_directories(p.parent_path()); 62 | std::ofstream ofs(p.string(), std::ios::out | std::ios::binary); 63 | serialize(ofs, name); 64 | } 65 | /** Enables stream exceptions, reads the header into h and requires its k-mer size to equal N, then resizes m_data to size / KMer::size entries and reads size bytes, where size comes from get_stream_size(is). NOTE(review): if size is not a multiple of the per-k-mer byte count the read is longer than the resized buffer - confirm what get_stream_size() returns and whether a check is needed. */ 66 | void deserialize(std::istream& is, KMerBufferHeader& h) { 67 | is.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 68 | h.deserialize(is); 69 | die_unless(N == h.kmer_size()); 70 | 71 | size_t size = get_stream_size(is); 72 | m_data.resize(size / KMer::size); 73 | is.read(reinterpret_cast(m_data.data()), size); 74 | } 75 | /** Opens p as a binary input file and deserializes from it. */ 76 | void deserialize(const fs::path& p, KMerBufferHeader& h) { 77 | std::ifstream ifs(p.string(), std::ios::in | std::ios::binary); 78 | deserialize(ifs, h); 79 | } 80 | }; 81 | /** Streams every stored k-mer to ostream, each followed by std::endl. */ 82 | template 83 | void KMerBuffer::print(std::ostream& ostream) const { 84 | for (size_t i = 0; i < m_data.size(); i++) { 85 | 
ostream << m_data[i] << std::endl; 86 | } 87 | } 88 | 89 | } // namespace cobs 90 | 91 | #endif // !COBS_KMER_BUFFER_HEADER 92 | 93 | /******************************************************************************/ 94 | -------------------------------------------------------------------------------- /cobs/query/classic_index/mmap_search_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/classic_index/mmap_search_file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | /** Lets the base class read the header, maps the file with initialize_mmap and points data_ at offset stream_pos_.curr_pos of the mapping (presumably the first byte after the header - confirm). */ 17 | ClassicIndexMMapSearchFile::ClassicIndexMMapSearchFile(const fs::path& path) 18 | : ClassicIndexSearchFile(path) { 19 | handle_ = initialize_mmap(path); 20 | data_ = handle_.data + stream_pos_.curr_pos; 21 | } 22 | /** Releases the mapping through destroy_mmap. */ 23 | ClassicIndexMMapSearchFile::~ClassicIndexMMapSearchFile() { 24 | destroy_mmap(handle_); 25 | } 26 | /** For each hash i, copies size bytes starting begin bytes into row (hashes[i] % signature_size_) of the mapped data to rows + i * buffer_size. Dies unless begin + size fits inside one row. */ 27 | void ClassicIndexMMapSearchFile::read_from_disk( 28 | const std::vector& hashes, uint8_t* rows, 29 | size_t begin, size_t size, size_t buffer_size) 30 | { 31 | die_unless(begin + size <= header_.row_size()); 32 | for (size_t i = 0; i < hashes.size(); i++) { 33 | auto data_8 = 34 | data_ + begin 35 | + (hashes[i] % header_.signature_size_) * header_.row_size(); 36 | auto rows_8 = rows + i * buffer_size; 37 | // std::memcpy(rows_8, data_8, size); 38 | std::copy(data_8, data_8 + size, rows_8); 39 | } 40 | } 41 | 42 | } // namespace cobs 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /cobs/query/classic_index/mmap_search_file.hpp: 
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/classic_index/mmap_search_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_QUERY_CLASSIC_INDEX_MMAP_SEARCH_FILE_HEADER 10 | #define COBS_QUERY_CLASSIC_INDEX_MMAP_SEARCH_FILE_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | /** Classic index search file that serves rows out of a memory mapping of the index file. NOTE(review): the class declares a destructor but no copy/move operations, so an implicit copy would share the same MMapHandle - confirm instances are never copied. */ 16 | class ClassicIndexMMapSearchFile : public ClassicIndexSearchFile 17 | { 18 | private: 19 | MMapHandle handle_; 20 | uint8_t* data_; 21 | 22 | protected: 23 | void read_from_disk(const std::vector& hashes, uint8_t* rows, 24 | size_t begin, size_t size, size_t buffer_size) override; 25 | 26 | public: 27 | explicit ClassicIndexMMapSearchFile(const fs::path& path); 28 | ~ClassicIndexMMapSearchFile(); 29 | }; 30 | 31 | } // namespace cobs 32 | 33 | #endif // !COBS_QUERY_CLASSIC_INDEX_MMAP_SEARCH_FILE_HEADER 34 | 35 | /******************************************************************************/ 36 | -------------------------------------------------------------------------------- /cobs/query/classic_index/search_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/classic_index/search_file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | namespace cobs { 14 | /** Reads the index header from path through deserialize_header and records the stream position reported by get_stream_pos in stream_pos_. */ 15 | ClassicIndexSearchFile::ClassicIndexSearchFile(const fs::path& path) { 16 | std::ifstream ifs; 17 | header_ = deserialize_header(ifs, path); 18 | stream_pos_ = get_stream_pos(ifs); 19 | } 20 | /** Returns 8 * header_.row_size(); presumably one counter per bit of a row - confirm against the callers. */ 21 | uint64_t ClassicIndexSearchFile::counts_size() const { 22 | return 8 * header_.row_size(); 23 | } 24 | 25 | } // namespace cobs 26 | 27 | /******************************************************************************/ 28 | -------------------------------------------------------------------------------- /cobs/query/classic_index/search_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/classic_index/search_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_QUERY_CLASSIC_INDEX_SEARCH_FILE_HEADER 10 | #define COBS_QUERY_CLASSIC_INDEX_SEARCH_FILE_HEADER 11 | 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | /** Base for classic index readers: owns the ClassicIndexHeader and answers the IndexSearchFile queries from it; page_size() is fixed at 1. read_from_disk() is left to subclasses. */ 17 | class ClassicIndexSearchFile : public IndexSearchFile 18 | { 19 | protected: 20 | explicit ClassicIndexSearchFile(const fs::path& path); 21 | 22 | uint32_t term_size() const final { return header_.term_size_; } 23 | uint8_t canonicalize() const final { return header_.canonicalize_; } 24 | uint64_t num_hashes() const final { return header_.num_hashes_; } 25 | uint64_t row_size() const final { return header_.row_size(); } 26 | uint64_t page_size() const final { return 1; } 27 | uint64_t counts_size() const final; 28 | const std::vector& file_names() const override { 29 | return header_.file_names_; 30 | } 31 | 32 | ClassicIndexHeader header_; 33 | 34 | public: 35 | virtual ~ClassicIndexSearchFile() = default; 36 | }; 37 | 38 | } // namespace cobs 39 | 40 | #endif // !COBS_QUERY_CLASSIC_INDEX_SEARCH_FILE_HEADER 41 | 42 | /******************************************************************************/ 43 | -------------------------------------------------------------------------------- /cobs/query/classic_search.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/classic_search.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_QUERY_CLASSIC_SEARCH_HEADER 11 | #define COBS_QUERY_CLASSIC_SEARCH_HEADER 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | namespace cobs { 18 | /** Search implementation that answers queries from the index file objects held in index_files_. NOTE(review): the single-argument constructors are not explicit, and the shared_ptr/vector template arguments are missing from this listing. */ 19 | class ClassicSearch : public Search 20 | { 21 | public: 22 | //! method to try to auto-detect and load IndexSearchFile 23 | ClassicSearch(std::string path); 24 | 25 | ClassicSearch(std::shared_ptr index); 26 | 27 | ClassicSearch(std::vector > indices); 28 | 29 | void search( 30 | const std::string& query, 31 | std::vector& result, 32 | double threshold = 0.0, size_t num_results = 0) final; 33 | 34 | protected: 35 | //! reference to index file query object to retrieve data 36 | std::vector > index_files_; 37 | }; 38 | 39 | /*----------------------------------------------------------------------------*/ 40 | // hacky variables to disable expansion table variants for better testing 41 | 42 | //! disable 8-bit expansion table 43 | extern bool classic_search_disable_8bit; 44 | //! disable 16-bit expansion table 45 | extern bool classic_search_disable_16bit; 46 | //! disable 32-bit expansion table 47 | extern bool classic_search_disable_32bit; 48 | 49 | //! 
disable SSE2 versions of expansion 50 | extern bool classic_search_disable_sse2; 51 | 52 | /*----------------------------------------------------------------------------*/ 53 | 54 | } // namespace cobs 55 | 56 | #endif // !COBS_QUERY_CLASSIC_SEARCH_HEADER 57 | 58 | /******************************************************************************/ 59 | -------------------------------------------------------------------------------- /cobs/query/compact_index/aio_search_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/aio_search_file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | namespace cobs { 22 | 23 | CompactIndexAioSearchFile::CompactIndexAioSearchFile(const fs::path& path) 24 | : CompactIndexSearchFile(path), m_max_nr_ios(65536 * header_.parameters_.size()), 25 | m_iocbs(m_max_nr_ios), m_iocbpp(m_max_nr_ios), m_io_events(m_max_nr_ios) 26 | { 27 | // todo use sysctl to check max-nr-io 28 | assert_exit(header_.page_size_ % cobs::get_page_size() == 0, 29 | "page size needs to be divisible by 4096 " 30 | "so the index can be opened with O_DIRECT"); 31 | 32 | m_offsets.resize(header_.parameters_.size()); 33 | m_offsets[0] = stream_pos_.curr_pos; 34 | for (size_t i = 1; i < header_.parameters_.size(); i++) { 35 | m_offsets[i] = m_offsets[i - 1] + header_.page_size_ * header_.parameters_[i - 1].signature_size; 36 | } 37 | 38 | m_fd = open_file(path, O_RDONLY | O_DIRECT); 39 | if (io_setup(m_max_nr_ios, &m_ctx) < 0) { 40 | exit_error_errno("io_setup error"); 41 | } 42 | 43 | for (size_t i = 0; i < 
m_iocbs.size(); i++) { 44 | m_iocbs[i].aio_fildes = m_fd; 45 | m_iocbs[i].aio_lio_opcode = IOCB_CMD_PREAD; 46 | m_iocbs[i].aio_nbytes = header_.page_size_; 47 | m_iocbpp[i] = m_iocbs.data() + i; 48 | } 49 | } 50 | 51 | CompactIndexAioSearchFile::~CompactIndexAioSearchFile() { 52 | close_file(m_fd); 53 | if (io_destroy(m_ctx) < 0) { 54 | exit_error_errno("io_destroy error"); 55 | } 56 | } 57 | 58 | void CompactIndexAioSearchFile::read_from_disk( 59 | const std::vector& hashes, uint8_t* rows, 60 | size_t begin, size_t size, size_t buffer_size) 61 | { 62 | tlx::unused(begin, size, buffer_size); 63 | 64 | int64_t num_requests = header_.parameters_.size() * hashes.size(); 65 | 66 | #pragma omp parallel for collapse(2) 67 | for (size_t i = 0; i < header_.parameters_.size(); i++) { 68 | for (size_t j = 0; j < hashes.size(); j++) { 69 | uint64_t index = i + j * header_.parameters_.size(); 70 | uint64_t hash = hashes[j] % header_.parameters_[i].signature_size; 71 | // todo rows does not need to be reallocated each time 72 | m_iocbs[index].aio_buf = (uint64_t)rows + index * header_.page_size_; 73 | m_iocbs[index].aio_offset = m_offsets[i] + hash * header_.page_size_; 74 | } 75 | } 76 | 77 | int ret = io_submit(m_ctx, num_requests, m_iocbpp.data()); 78 | if (ret != num_requests) { 79 | if (ret < 0) { 80 | perror("io_submit error"); 81 | } 82 | else { 83 | fprintf(stderr, "could not sumbit IOs"); 84 | } 85 | exit_error_errno("io_submit error"); 86 | } 87 | 88 | if (io_getevents(m_ctx, num_requests, num_requests, m_io_events.data(), nullptr) < num_requests) { 89 | exit_error_errno("io_getevents error"); 90 | } 91 | 92 | for (int i = 0; i < num_requests; i++) { 93 | if (m_io_events[i].res != (int64_t)header_.page_size_) { 94 | std::cout << i << " " << std::strerror(-m_io_events[i].res) << std::endl; 95 | } 96 | } 97 | } 98 | 99 | } // namespace cobs 100 | 101 | /******************************************************************************/ 102 | 
-------------------------------------------------------------------------------- /cobs/query/compact_index/aio_search_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/aio_search_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_QUERY_COMPACT_INDEX_AIO_SEARCH_FILE_HEADER 10 | #define COBS_QUERY_COMPACT_INDEX_AIO_SEARCH_FILE_HEADER 11 | 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | /** Compact index search file that reads pages through an AIO context (m_ctx) on a file descriptor (m_fd), using request arrays sized by m_max_nr_ios. NOTE(review): it owns a descriptor and an AIO context but declares no copy/move operations - confirm instances are never copied. */ 17 | class CompactIndexAioSearchFile : public CompactIndexSearchFile 18 | { 19 | private: 20 | uint64_t m_max_nr_ios; 21 | int m_fd; 22 | aio_context_t m_ctx = 0; 23 | std::vector m_offsets; 24 | std::vector m_iocbs; 25 | std::vector m_iocbpp; 26 | std::vector m_io_events; 27 | 28 | protected: 29 | void read_from_disk(const std::vector& hashes, uint8_t* rows, 30 | size_t begin, size_t size, size_t buffer_size) override; 31 | 32 | public: 33 | explicit CompactIndexAioSearchFile(const fs::path& path); 34 | ~CompactIndexAioSearchFile(); 35 | }; 36 | 37 | } // namespace cobs 38 | 39 | #endif // !COBS_QUERY_COMPACT_INDEX_AIO_SEARCH_FILE_HEADER 40 | 41 | /******************************************************************************/ 42 | -------------------------------------------------------------------------------- /cobs/query/compact_index/mmap_search_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/mmap_search_file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | /** Maps the index file and records, for every parameter block, a pointer to its first page: block 0 starts at offset stream_pos_.curr_pos, and each later block starts page_size_ * signature_size bytes after the previous one. */ 17 | CompactIndexMMapSearchFile::CompactIndexMMapSearchFile(const fs::path& path) 18 | : CompactIndexSearchFile(path) 19 | { 20 | data_.resize(header_.parameters_.size()); 21 | handle_ = initialize_mmap(path); 22 | data_[0] = handle_.data + stream_pos_.curr_pos; 23 | for (size_t i = 1; i < header_.parameters_.size(); i++) { 24 | data_[i] = 25 | data_[i - 1] 26 | + header_.page_size_ * header_.parameters_[i - 1].signature_size; 27 | } 28 | } 29 | /** Releases the mapping through destroy_mmap. */ 30 | CompactIndexMMapSearchFile::~CompactIndexMMapSearchFile() { 31 | destroy_mmap(handle_); 32 | } 33 | /** For each hash i and each parameter block p in [begin / page_size, ceil((begin + size) / page_size)), copies page (hashes[i] % signature_size of p) to rows + i * buffer_size + j * page_size, where j counts blocks from the first one copied. Dies unless begin is page-aligned and the range stays inside a row. */ 34 | void CompactIndexMMapSearchFile::read_from_disk( 35 | const std::vector& hashes, uint8_t* rows, 36 | size_t begin, size_t size, size_t buffer_size) 37 | { 38 | size_t page_size = header_.page_size_; 39 | 40 | die_unless(begin + size <= row_size()); 41 | die_unless(begin % page_size == 0); 42 | size_t begin_page = begin / page_size; 43 | size_t end_page = tlx::div_ceil(begin + size, page_size); 44 | die_unless(end_page <= header_.parameters_.size()); 45 | 46 | LOG0 << "mmap::read_from_disk()" 47 | << " page_size=" << page_size 48 | << " hashes.size=" << hashes.size() 49 | << " begin=" << begin 50 | << " size=" << size 51 | << " buffer_size=" << buffer_size 52 | << " begin_page=" << begin_page 53 | << " end_page=" << end_page; 54 | 55 | for (size_t i = 0; i < hashes.size(); i++) { 56 | size_t j = 0; 57 | for (size_t p = begin_page; p < end_page; ++p, ++j) { 58 | uint64_t hash = hashes[i] % header_.parameters_[p].signature_size; 59 | uint8_t* data_8 = data_[p] + hash * page_size; 60 | uint8_t* rows_8 = 61 | reinterpret_cast(rows) + i * buffer_size + j * page_size; 62 | // die_unless(rows_8 + page_size <= rows + size * hashes.size()); 63 | // std::memcpy(rows_8, data_8, page_size); 64 | std::copy(data_8, data_8 + page_size, rows_8); 65 | } 66 
| } 67 | } 68 | 69 | } // namespace cobs 70 | 71 | /******************************************************************************/ 72 | -------------------------------------------------------------------------------- /cobs/query/compact_index/mmap_search_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/mmap_search_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_QUERY_COMPACT_INDEX_MMAP_SEARCH_FILE_HEADER 10 | #define COBS_QUERY_COMPACT_INDEX_MMAP_SEARCH_FILE_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | /** Compact index search file that serves pages out of a memory mapping; data_ holds one start pointer per parameter block. NOTE(review): it declares a destructor but no copy/move operations - confirm instances are never copied. */ 16 | class CompactIndexMMapSearchFile : public CompactIndexSearchFile 17 | { 18 | private: 19 | MMapHandle handle_; 20 | std::vector data_; 21 | 22 | protected: 23 | void read_from_disk(const std::vector& hashes, uint8_t* rows, 24 | size_t begin, size_t size, size_t buffer_size) override; 25 | 26 | public: 27 | explicit CompactIndexMMapSearchFile(const fs::path& path); 28 | ~CompactIndexMMapSearchFile(); 29 | }; 30 | 31 | } // namespace cobs 32 | 33 | #endif // !COBS_QUERY_COMPACT_INDEX_MMAP_SEARCH_FILE_HEADER 34 | 35 | /******************************************************************************/ 36 | -------------------------------------------------------------------------------- /cobs/query/compact_index/search_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/search_file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | namespace cobs { 16 | /** Reads the compact index header from path, records the stream position, and derives row_size_ (page size times number of parameter blocks) and num_hashes_. Dies if the header has no parameter blocks or if the blocks disagree on num_hashes. */ 17 | CompactIndexSearchFile::CompactIndexSearchFile(const fs::path& path) { 18 | std::ifstream ifs; 19 | header_ = deserialize_header(ifs, path); 20 | stream_pos_ = get_stream_pos(ifs); 21 | /* parameters_[0] is read below, so an empty parameter list must be rejected first */ die_unless(!header_.parameters_.empty()); 22 | // todo assertions that all the data in the Header is correct 23 | row_size_ = header_.page_size_ * header_.parameters_.size(); 24 | num_hashes_ = header_.parameters_[0].num_hashes; 25 | for (const auto& p : header_.parameters_) { 26 | die_unless(num_hashes_ == p.num_hashes); 27 | } 28 | } 29 | /** Returns 8 * number of parameter blocks * page size, i.e. 8 * row_size_ as computed in the constructor. */ 30 | uint64_t CompactIndexSearchFile::counts_size() const { 31 | return 8 * header_.parameters_.size() * header_.page_size_; 32 | } 33 | 34 | } // namespace cobs 35 | 36 | /******************************************************************************/ 37 | -------------------------------------------------------------------------------- /cobs/query/compact_index/search_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/compact_index/search_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_QUERY_COMPACT_INDEX_SEARCH_FILE_HEADER 10 | #define COBS_QUERY_COMPACT_INDEX_SEARCH_FILE_HEADER 11 | 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | /** Base for compact index readers: owns the CompactIndexHeader and answers the IndexSearchFile queries from it and from the cached num_hashes_ / row_size_ values. read_from_disk() is left to subclasses. */ 17 | class CompactIndexSearchFile : public IndexSearchFile 18 | { 19 | protected: 20 | size_t num_hashes_; 21 | size_t row_size_; 22 | explicit CompactIndexSearchFile(const fs::path& path); 23 | 24 | uint32_t term_size() const final { return header_.term_size_; } 25 | uint8_t canonicalize() const final { return header_.canonicalize_; } 26 | uint64_t num_hashes() const final { return num_hashes_; } 27 | uint64_t page_size() const final { return header_.page_size_; } 28 | uint64_t row_size() const final { return row_size_; } 29 | uint64_t counts_size() const final; 30 | 31 | const std::vector& file_names() const final { 32 | return header_.file_names_; 33 | } 34 | 35 | CompactIndexHeader header_; 36 | 37 | public: 38 | virtual ~CompactIndexSearchFile() = default; 39 | }; 40 | 41 | } // namespace cobs 42 | 43 | #endif // !COBS_QUERY_COMPACT_INDEX_SEARCH_FILE_HEADER 44 | 45 | /******************************************************************************/ 46 | -------------------------------------------------------------------------------- /cobs/query/index_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/index_file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_QUERY_INDEX_FILE_HEADER 11 | #define COBS_QUERY_INDEX_FILE_HEADER 12 | 13 | #include 14 | 15 | #include 16 | 17 | namespace cobs { 18 | /** Abstract interface of an opened index file: exposes the index geometry (term size, row/page size, number of hashes, file names) and reads the rows selected by a list of hashes into a caller-provided buffer. */ 19 | class IndexSearchFile 20 | { 21 | public: 22 | StreamPos stream_pos_; 23 | /* polymorphic base: a virtual destructor makes destruction through an IndexSearchFile pointer well defined */ virtual ~IndexSearchFile() = default; 24 | virtual void read_from_disk( 25 | const std::vector& hashes, uint8_t* rows, 26 | size_t begin, size_t size, size_t buffer_size) = 0; 27 | 28 | virtual uint32_t term_size() const = 0; 29 | virtual uint8_t canonicalize() const = 0; 30 | virtual uint64_t row_size() const = 0; 31 | virtual uint64_t page_size() const = 0; 32 | virtual uint64_t num_hashes() const = 0; 33 | virtual uint64_t counts_size() const = 0; 34 | virtual const std::vector& file_names() const = 0; 35 | }; 36 | 37 | } // namespace cobs 38 | 39 | #endif // !COBS_QUERY_INDEX_FILE_HEADER 40 | 41 | /******************************************************************************/ 42 | -------------------------------------------------------------------------------- /cobs/query/search.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/query/search.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 8 | ******************************************************************************/ 9 | 10 | #ifndef COBS_QUERY_SEARCH_HEADER 11 | #define COBS_QUERY_SEARCH_HEADER 12 | 13 | #include 14 | 15 | namespace cobs { 16 | /** One search hit: a non-owning document name pointer and its score. NOTE(review): the defaulted constructor leaves both members uninitialized. */ 17 | struct SearchResult { 18 | //! string reference to document name 19 | const char* doc_name; 20 | //! 
score (number of matched k-mers) 21 | uint32_t score; 22 | 23 | SearchResult() = default; 24 | 25 | SearchResult(const char* doc_name, uint32_t score) 26 | : doc_name(doc_name), score(score) { } 27 | }; 28 | /** Abstract query interface: search() fills result for a query string, with an optional threshold and result limit; timer_ collects timings of the query phases. */ 29 | class Search 30 | { 31 | public: 32 | virtual ~Search() = default; 33 | 34 | //! Returns timer_ 35 | Timer& timer() { return timer_; } 36 | //! Returns timer_ 37 | const Timer& timer() const { return timer_; } 38 | 39 | virtual void search( 40 | const std::string& query, 41 | std::vector& result, 42 | double threshold = 0.0, size_t num_results = 0) = 0; 43 | 44 | public: 45 | //! timer of different query phases 46 | Timer timer_; 47 | }; 48 | 49 | } // namespace cobs 50 | 51 | #endif // !COBS_QUERY_SEARCH_HEADER 52 | 53 | /******************************************************************************/ 54 | -------------------------------------------------------------------------------- /cobs/settings.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/settings.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | namespace cobs { 14 | 15 | size_t gopt_threads = std::thread::hardware_concurrency(); 16 | 17 | bool gopt_load_complete_index = false; 18 | 19 | bool gopt_disable_cache = false; 20 | 21 | } // namespace cobs 22 | 23 | /******************************************************************************/ 24 | -------------------------------------------------------------------------------- /cobs/settings.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/settings.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_SETTINGS_HEADER 10 | #define COBS_SETTINGS_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | 16 | //! run COBS using parallel threads, default: all cores 17 | extern size_t gopt_threads; 18 | 19 | //! whether to load the complete index to RAM for queries. 20 | extern bool gopt_load_complete_index; 21 | 22 | //! whether to disable FastA/FastQ cache files globally. 23 | extern bool gopt_disable_cache; 24 | 25 | } // namespace cobs 26 | 27 | #endif // !COBS_SETTINGS_HEADER 28 | 29 | /******************************************************************************/ 30 | -------------------------------------------------------------------------------- /cobs/text_file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/text_file.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_TEXT_FILE_HEADER 10 | #define COBS_TEXT_FILE_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | 22 | namespace cobs { 23 | 24 | class TextFile 25 | { 26 | public: 27 | TextFile(std::string path) : path_(path) { 28 | is_.open(path); 29 | die_unless(is_.good()); 30 | } 31 | 32 | //! return size of a text document 33 | size_t size() { 34 | is_.clear(); 35 | is_.seekg(0, std::ios::end); 36 | return is_.tellg(); 37 | } 38 | 39 | //! return number of q-grams in document 40 | size_t num_terms(size_t q) { 41 | size_t n = size(); 42 | return n < q ? 0 : n - q + 1; 43 | } 44 | 45 | template 46 | void process_terms(size_t term_size, Callback callback) { 47 | is_.clear(); 48 | is_.seekg(0); 49 | 50 | char buffer[64 * 1024]; 51 | size_t pos = 0; 52 | 53 | while (!is_.eof()) { 54 | is_.read(buffer + pos, sizeof(buffer) - pos); 55 | size_t wb = is_.gcount(); 56 | 57 | for (size_t i = 0; i + term_size <= pos + wb; ++i) { 58 | callback(tlx::string_view(buffer + i, term_size)); 59 | } 60 | 61 | if (wb + 1 < term_size) 62 | break; 63 | 64 | std::copy(buffer + wb - term_size + 1, buffer + wb, 65 | buffer); 66 | pos = term_size - 1; 67 | } 68 | } 69 | 70 | private: 71 | //! file stream 72 | std::ifstream is_; 73 | //! path 74 | std::string path_; 75 | }; 76 | 77 | } // namespace cobs 78 | 79 | #endif // !COBS_TEXT_FILE_HEADER 80 | 81 | /******************************************************************************/ 82 | -------------------------------------------------------------------------------- /cobs/util/aio.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/aio.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. 
Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #define _GNU_SOURCE /* syscall() is not POSIX */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace cobs { 24 | 25 | int io_setup(unsigned nr, aio_context_t* ctxp) { 26 | return syscall(__NR_io_setup, nr, ctxp); 27 | } 28 | 29 | int io_destroy(aio_context_t ctx) { 30 | return syscall(__NR_io_destroy, ctx); 31 | } 32 | 33 | int io_submit(aio_context_t ctx, long nr, iocb** iocbpp) { 34 | return syscall(__NR_io_submit, ctx, nr, iocbpp); 35 | } 36 | 37 | int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event* events, timespec* timeout) { 38 | return syscall(__NR_io_getevents, ctx, min_nr, max_nr, events, timeout); 39 | } 40 | 41 | } // namespace cobs 42 | 43 | /******************************************************************************/ 44 | -------------------------------------------------------------------------------- /cobs/util/aio.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/aio.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_AIO_HEADER 10 | #define COBS_UTIL_AIO_HEADER 11 | 12 | #include 13 | #include 14 | 15 | namespace cobs { 16 | 17 | int io_setup(unsigned nr, aio_context_t* ctxp); 18 | int io_destroy(aio_context_t ctx); 19 | int io_submit(aio_context_t ctx, long nr, iocb** iocbpp); 20 | int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event* events, timespec* timeout); 21 | 22 | } // namespace cobs 23 | 24 | #endif // !COBS_UTIL_AIO_HEADER 25 | 26 | /******************************************************************************/ 27 | -------------------------------------------------------------------------------- /cobs/util/calc_signature_size.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/calc_signature_size.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | namespace cobs { 16 | 17 | double calc_signature_size_ratio(double num_hashes, 18 | double false_positive_rate) { 19 | double denominator = 20 | std::log(1 - std::pow(false_positive_rate, 1 / num_hashes)); 21 | double result = -num_hashes / denominator; 22 | die_unless(result > 0); 23 | return result; 24 | } 25 | 26 | uint64_t calc_signature_size(uint64_t num_elements, double num_hashes, 27 | double false_positive_rate) { 28 | double signature_size_ratio = calc_signature_size_ratio( 29 | num_hashes, false_positive_rate); 30 | double result = std::ceil(num_elements * signature_size_ratio); 31 | die_unless(result >= 0); 32 | die_unless(result <= UINT64_MAX); 33 | return (uint64_t)result; 34 | } 35 | 36 | double calc_average_set_bit_ratio(uint64_t signature_size, double num_hashes, 37 | double false_positive_rate) { 38 | double num_elements = 39 | signature_size / 40 | calc_signature_size_ratio(num_hashes, false_positive_rate); 41 | double result = 42 | 1 - std::pow(1 - 1 / (double)signature_size, num_hashes * num_elements); 43 | die_unless(result >= 0); 44 | die_unless(result <= 1); 45 | return result; 46 | } 47 | 48 | } // namespace cobs 49 | 50 | /******************************************************************************/ 51 | -------------------------------------------------------------------------------- /cobs/util/calc_signature_size.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/calc_signature_size.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_CALC_SIGNATURE_SIZE_HEADER 10 | #define COBS_UTIL_CALC_SIGNATURE_SIZE_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | 16 | //! calculate the size ratio of a Bloom filter with k hash functions and given 17 | //! fpr. 18 | double calc_signature_size_ratio(double num_hashes, 19 | double false_positive_rate); 20 | 21 | //! calculate the number of cells in a Bloom filter with k hash functions into 22 | //! which num_elements are inserted such that it has expected given fpr. 23 | uint64_t calc_signature_size(uint64_t num_elements, double num_hashes, 24 | double false_positive_rate); 25 | 26 | //! calculate expected probability of a bit in the Bloom filter to be one 27 | double calc_average_set_bit_ratio(uint64_t signature_size, double num_hashes, 28 | double false_positive_rate); 29 | 30 | } // namespace cobs 31 | 32 | #endif // !COBS_UTIL_CALC_SIGNATURE_SIZE_HEADER 33 | 34 | /******************************************************************************/ 35 | -------------------------------------------------------------------------------- /cobs/util/error_handling.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/error_handling.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace cobs { 14 | 15 | void print_errno(const std::string& msg) { 16 | std::cerr << msg + ": " << std::strerror(errno) << std::endl; 17 | } 18 | 19 | void exit_error(const std::string& msg) { 20 | std::cerr << msg << std::endl; 21 | std::exit(EXIT_FAILURE); 22 | } 23 | 24 | void assert_exit(bool cond, const std::string& msg) { 25 | if (!cond) { 26 | exit_error(msg); 27 | } 28 | } 29 | 30 | void exit_error_errno(const std::string& msg) { 31 | exit_error(msg + ": " + std::strerror(errno)); 32 | } 33 | 34 | } // namespace cobs 35 | 36 | /******************************************************************************/ 37 | -------------------------------------------------------------------------------- /cobs/util/error_handling.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/error_handling.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_ERROR_HANDLING_HEADER 10 | #define COBS_UTIL_ERROR_HANDLING_HEADER 11 | 12 | #include 13 | 14 | namespace cobs { 15 | 16 | void print_errno(const std::string& msg); 17 | void exit_error(const std::string& msg); 18 | void assert_exit(bool cond, const std::string& msg); 19 | void exit_error_errno(const std::string& msg); 20 | template 21 | void assert_throw(bool cond, const std::string& msg); 22 | 23 | } // namespace cobs 24 | 25 | namespace cobs { 26 | 27 | template 28 | void assert_throw(bool cond, const std::string& msg) { 29 | if (!cond) { 30 | throw E(msg); 31 | } 32 | } 33 | 34 | } // namespace cobs 35 | 36 | #endif // !COBS_UTIL_ERROR_HANDLING_HEADER 37 | 38 | /******************************************************************************/ 39 | -------------------------------------------------------------------------------- /cobs/util/file.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/file.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_FILE_HEADER 10 | #define COBS_UTIL_FILE_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include 19 | 20 | namespace cobs { 21 | 22 | template 23 | void serialize_header(std::ofstream& ofs, const fs::path& p, const Header& h) { 24 | ofs.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 25 | ofs.open(p.string(), std::ios::out | std::ios::binary); 26 | die_unless(ofs.good()); 27 | h.serialize(ofs); 28 | } 29 | 30 | template 31 | void serialize_header(const fs::path& p, const Header& h) { 32 | std::ofstream ofs; 33 | serialize_header
(ofs, p, h); 34 | } 35 | 36 | template 37 | Header deserialize_header(std::ifstream& ifs, const fs::path& p) { 38 | ifs.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 39 | ifs.open(p.string(), std::ios::in | std::ios::binary); 40 | die_unless(ifs.good()); 41 | Header h; 42 | h.deserialize(ifs); 43 | return h; 44 | } 45 | 46 | template 47 | Header deserialize_header(const fs::path& p) { 48 | std::ifstream ifs; 49 | return deserialize_header
(ifs, p); 50 | } 51 | 52 | //! check if file has given header 53 | template 54 | bool file_has_header(const fs::path& p) { 55 | if (!fs::is_regular_file(p)) 56 | return false; 57 | 58 | try { 59 | deserialize_header
(p); 60 | } 61 | catch (...) { 62 | return false; 63 | } 64 | 65 | return true; 66 | } 67 | 68 | //! for a path, return the base file name without any extensions 69 | static inline 70 | std::string base_name(const fs::path& p) { 71 | std::string result = p.filename().string(); 72 | std::string::size_type pos = result.find('.'); 73 | if (pos == std::string::npos) 74 | return result; 75 | return result.substr(0, pos); 76 | } 77 | 78 | } // namespace cobs 79 | 80 | #endif // !COBS_UTIL_FILE_HEADER 81 | 82 | /******************************************************************************/ 83 | -------------------------------------------------------------------------------- /cobs/util/fs.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/fs.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_FS_HEADER 10 | #define COBS_UTIL_FS_HEADER 11 | 12 | #if __cplusplus >= 201703L 13 | 14 | #include 15 | 16 | namespace cobs { 17 | 18 | namespace fs = std::experimental::filesystem; 19 | using std::error_code; 20 | 21 | } // namespace cobs 22 | 23 | #else 24 | 25 | #include 26 | 27 | namespace cobs { 28 | 29 | namespace fs = boost::filesystem; 30 | using boost::system::error_code; 31 | 32 | } // namespace cobs 33 | 34 | #endif 35 | 36 | #endif // !COBS_UTIL_FS_HEADER 37 | 38 | /******************************************************************************/ 39 | -------------------------------------------------------------------------------- /cobs/util/misc.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/misc.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | namespace cobs { 16 | 17 | uint64_t get_page_size() { 18 | int page_size = getpagesize(); 19 | die_unless(page_size > 0); 20 | die_unless(page_size == 4096); // todo check for experiments 21 | return (uint64_t)page_size; 22 | } 23 | 24 | uint64_t get_memory_size() { 25 | return sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); 26 | } 27 | 28 | uint64_t get_memory_size(size_t percentage) { 29 | return get_memory_size() * percentage / 100; 30 | } 31 | 32 | std::string random_sequence(size_t size, size_t seed) { 33 | std::default_random_engine rng(seed); 34 | return random_sequence_rng(size, rng); 35 | } 36 | 37 | } // namespace cobs 38 | 39 | /******************************************************************************/ 40 | -------------------------------------------------------------------------------- /cobs/util/misc.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/misc.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_MISC_HEADER 10 | #define COBS_UTIL_MISC_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #include 21 | 22 | namespace cobs { 23 | 24 | uint64_t get_page_size(); 25 | 26 | uint64_t get_memory_size(); 27 | 28 | uint64_t get_memory_size(size_t percentage); 29 | 30 | template 31 | std::string random_sequence_rng(size_t size, RandomGenerator& rng) { 32 | static const std::array basepairs = { 'A', 'C', 'G', 'T' }; 33 | std::string result; 34 | for (size_t i = 0; i < size; i++) { 35 | result += basepairs[rng() % 4]; 36 | } 37 | return result; 38 | } 39 | 40 | std::string random_sequence(size_t size, size_t seed); 41 | 42 | template 43 | T * allocate_aligned(uint64_t size, size_t alignment) { 44 | T* ptr; 45 | int r = posix_memalign(reinterpret_cast(&ptr), alignment, sizeof(T) * size); 46 | if (r != 0) 47 | throw std::runtime_error("Out of memory"); 48 | std::fill(ptr, ptr + size, 0); 49 | return ptr; 50 | } 51 | 52 | static inline 53 | void deallocate_aligned(void* ptr) { 54 | free(ptr); 55 | } 56 | 57 | static inline 58 | std::string pad_index(unsigned index, int size = 6) { 59 | return tlx::ssprintf("%0*u", size, index); 60 | } 61 | 62 | /*! 63 | * Constructs the hash used by the signatures. 
64 | */ 65 | template 66 | void process_hashes(const void* input, size_t size, uint64_t signature_size, 67 | uint64_t num_hashes, Callback callback) { 68 | for (unsigned int i = 0; i < num_hashes; i++) { 69 | uint64_t hash = XXH64(input, size, i); 70 | callback(hash % signature_size); 71 | } 72 | } 73 | 74 | } // namespace cobs 75 | 76 | #endif // !COBS_UTIL_MISC_HEADER 77 | 78 | /******************************************************************************/ 79 | -------------------------------------------------------------------------------- /cobs/util/parallel_for.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/parallel_for.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | namespace cobs { 12 | 13 | //! thread pool singleton 14 | std::unique_ptr g_thread_pool; 15 | 16 | } // namespace cobs 17 | 18 | /******************************************************************************/ 19 | -------------------------------------------------------------------------------- /cobs/util/parallel_for.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/parallel_for.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_PARALLEL_FOR_HEADER 10 | #define COBS_UTIL_PARALLEL_FOR_HEADER 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace cobs { 19 | 20 | //! 
thread pool singleton 21 | extern std::unique_ptr g_thread_pool; 22 | 23 | //! run a functor in parallel 24 | template 25 | void parallel_for(size_t begin, size_t end, size_t num_threads, 26 | Functor functor) { 27 | if (num_threads <= 1) { 28 | for (size_t i = begin; i < end; ++i) { 29 | functor(i); 30 | } 31 | } 32 | else { 33 | if (!g_thread_pool) 34 | g_thread_pool = std::make_unique(); 35 | 36 | tlx::Semaphore sem; 37 | std::atomic counter { begin }; 38 | std::exception_ptr eptr; 39 | // enqueue threads for work 40 | for (size_t t = 0; t < num_threads; ++t) { 41 | g_thread_pool->enqueue( 42 | [&]() { 43 | try { 44 | size_t i; 45 | while ((i = counter++) < end) { 46 | functor(i); 47 | } 48 | } 49 | catch (...) { 50 | // capture exception 51 | eptr = std::current_exception(); 52 | } 53 | // done, raise semaphore 54 | sem.signal(); 55 | }); 56 | } 57 | // wait for all num_threads to finish 58 | sem.wait(num_threads); 59 | // rethrow exception 60 | if (eptr) 61 | std::rethrow_exception(eptr); 62 | } 63 | } 64 | 65 | } // namespace cobs 66 | 67 | #endif // !COBS_UTIL_PARALLEL_FOR_HEADER 68 | 69 | /******************************************************************************/ 70 | -------------------------------------------------------------------------------- /cobs/util/process_file_batches.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/process_file_batches.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_PROCESS_FILE_BATCHES_HEADER 10 | #define COBS_UTIL_PROCESS_FILE_BATCHES_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | namespace cobs { 27 | 28 | template 29 | void get_sorted_file_names(const fs::path& in_dir, 30 | std::vector* paths, 31 | Callback callback) { 32 | fs::recursive_directory_iterator it(in_dir), end; 33 | while (it != end) { 34 | if (callback(*it)) { 35 | paths->emplace_back(*it); 36 | } 37 | ++it; 38 | } 39 | std::sort(paths->begin(), paths->end()); 40 | } 41 | 42 | template 43 | size_t process_file_batches(const fs::path& in_dir, const fs::path& out_dir, 44 | size_t batch_size, Selector selector, 45 | Callback callback) { 46 | std::vector sorted_paths; 47 | get_sorted_file_names( 48 | in_dir, &sorted_paths, [](const fs::path&) { return true; }); 49 | fs::create_directories(out_dir); 50 | 51 | struct Batch { 52 | std::vector files; 53 | std::string out_file; 54 | }; 55 | std::vector batch_list; 56 | 57 | std::string first_filename, last_filename; 58 | 59 | std::vector paths; 60 | for (size_t i = 0; i < sorted_paths.size(); i++) { 61 | if (selector(sorted_paths[i])) { 62 | std::string filename = cobs::base_name(sorted_paths[i]); 63 | if (first_filename.empty()) { 64 | first_filename = filename; 65 | } 66 | last_filename = filename; 67 | paths.push_back(sorted_paths[i]); 68 | } 69 | if (paths.size() == batch_size || 70 | (!paths.empty() && i + 1 == sorted_paths.size())) 71 | { 72 | std::string out_file = 73 | pad_index(batch_list.size()) + '_' + 74 | '[' + first_filename + '-' + last_filename + ']'; 75 | 76 | batch_list.emplace_back(Batch { std::move(paths), out_file }); 77 | 78 | paths.clear(); 79 | first_filename.clear(); 80 | } 81 | } 82 | 83 | parallel_for( 84 | 0, batch_list.size(), gopt_threads, 85 | 
[&](size_t i) { 86 | callback(batch_list[i].files, batch_list[i].out_file); 87 | }); 88 | 89 | return batch_list.size(); 90 | } 91 | 92 | } // namespace cobs 93 | 94 | #endif // !COBS_UTIL_PROCESS_FILE_BATCHES_HEADER 95 | 96 | /******************************************************************************/ 97 | -------------------------------------------------------------------------------- /cobs/util/query.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/query.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_QUERY_HEADER 10 | #define COBS_UTIL_QUERY_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | namespace cobs { 22 | 23 | int open_file(const fs::path& path, int flags); 24 | void close_file(int fd); 25 | 26 | struct MMapHandle { 27 | int fd; 28 | uint8_t* data; 29 | uint64_t size; 30 | }; 31 | 32 | MMapHandle initialize_mmap(const fs::path& path); 33 | void destroy_mmap(MMapHandle& handle); 34 | 35 | //! Canonicalize a k-mer. Given an input k-mer of length size, checks if should 36 | //! be canonicalized into its reverse complement. If any letter other than ACGT 37 | //! occurs, the letter is replaced with a binary zero, and the function returns 38 | //! false, indicating an invalid input. The input k-mer is always written to the 39 | //! output buffer, replacing letters with zeros, or with the reverse 40 | //! complement. The output pointer must point to a memory area of size 41 | //! bytes. The output is not returned null-terminated!. 
42 | bool canonicalize_kmer(const char* input, char* output, size_t size); 43 | 44 | } // namespace cobs 45 | 46 | #endif // !COBS_UTIL_QUERY_HEADER 47 | 48 | /******************************************************************************/ 49 | -------------------------------------------------------------------------------- /cobs/util/serialization.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/serialization.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_SERIALIZATION_HEADER 10 | #define COBS_UTIL_SERIALIZATION_HEADER 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | namespace cobs { 20 | 21 | struct StreamPos { 22 | uint64_t curr_pos; 23 | uint64_t end_pos; 24 | uint64_t size() const { return end_pos - curr_pos; } 25 | }; 26 | 27 | //! return StreamPos object 28 | static inline 29 | StreamPos get_stream_pos(std::istream& is) { 30 | std::streamoff curr_pos = is.tellg(); 31 | is.seekg(0, std::ios::end); 32 | std::streamoff end_pos = is.tellg(); 33 | is.seekg(curr_pos, std::ios::beg); 34 | die_unless(is.good()); 35 | die_unless(curr_pos >= 0); 36 | die_unless(end_pos >= 0); 37 | die_unless(end_pos >= curr_pos); 38 | return StreamPos { (uint64_t)curr_pos, (uint64_t)end_pos }; 39 | } 40 | 41 | //! return remaining bytes in stream and rewind to the current position. 
42 | static inline 43 | size_t get_stream_size(std::istream& is) { 44 | std::streamoff curr_pos = is.tellg(); 45 | is.seekg(0, std::ios::end); 46 | std::streamoff end_pos = is.tellg(); 47 | is.seekg(curr_pos, std::ios::beg); 48 | die_unless(is.good()); 49 | die_unless(curr_pos >= 0); 50 | die_unless(end_pos >= 0); 51 | die_unless(end_pos >= curr_pos); 52 | return (uint64_t)end_pos - (uint64_t)curr_pos; 53 | } 54 | 55 | //! read complete file of PODs 56 | template 57 | void read_complete_file(const fs::path& path, std::vector& v) { 58 | std::ifstream is(path.string(), std::ios::in | std::ios::binary); 59 | is.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit); 60 | StreamPos sp = get_stream_pos(is); 61 | die_unless(sp.end_pos % sizeof(T) == 0); 62 | v.resize(sp.end_pos / sizeof(T)); 63 | is.read(reinterpret_cast(v.data()), sp.end_pos); 64 | } 65 | 66 | /******************************************************************************/ 67 | 68 | //! append a POD to an ostream 69 | template 70 | void stream_put_pod(std::ostream& os, const T& t) { 71 | static_assert(std::is_pod::value, "T must be POD"); 72 | os.write(reinterpret_cast(&t), sizeof(T)); 73 | } 74 | 75 | //! append a list of PODs to an ostream 76 | static inline 77 | void stream_put(std::ostream& /* os */) { } 78 | 79 | //! append a list of PODs to an ostream 80 | template 81 | void stream_put(std::ostream& os, const T& t, const Args& ... args) { 82 | stream_put_pod(os, t); 83 | stream_put(os, args...); 84 | } 85 | 86 | //! read a POD from an istream 87 | template 88 | void stream_get_pod(std::istream& is, T& t) { 89 | static_assert(std::is_pod::value, "T must be POD"); 90 | is.read(reinterpret_cast(&t), sizeof(T)); 91 | } 92 | 93 | //! read a list of PODs from an istream 94 | static inline 95 | void stream_get(std::istream& /* is */) { } 96 | 97 | //! read a list of PODs from an istream 98 | template 99 | void stream_get(std::istream& is, T& t, Args& ... 
args) { 100 | stream_get_pod(is, t); 101 | stream_get(is, args...); 102 | } 103 | 104 | } // namespace cobs 105 | 106 | #endif // !COBS_UTIL_SERIALIZATION_HEADER 107 | 108 | /******************************************************************************/ 109 | -------------------------------------------------------------------------------- /cobs/util/thread_object_array.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/thread_object_array.hpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_THREAD_OBJECT_ARRAY_HEADER 10 | #define COBS_UTIL_THREAD_OBJECT_ARRAY_HEADER 11 | 12 | #include 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace cobs { 22 | 23 | /*! 24 | * LRU cache to free objects in ThreadObjectArray when they are not used often 25 | * enough. 26 | */ 27 | template 28 | class ThreadObjectLRUSet 29 | { 30 | public: 31 | ThreadObjectLRUSet(size_t limit) 32 | : limit_(limit) { } 33 | 34 | tlx::LruCacheSet > lru_set_; 35 | std::mutex mutex_; 36 | size_t limit_; 37 | 38 | void put(const std::shared_ptr& ptr) { 39 | while (lru_set_.size() + 1 > limit_) { 40 | // pop shared pointer and thus release reference 41 | lru_set_.pop(); 42 | } 43 | lru_set_.put(ptr); 44 | } 45 | 46 | void touch(const std::shared_ptr& ptr) { 47 | lru_set_.touch(ptr); 48 | } 49 | }; 50 | 51 | /*! 52 | * Array of objects each local to the current thread. 53 | */ 54 | template 55 | class ThreadObjectArray 56 | { 57 | public: 58 | ThreadObjectArray(ThreadObjectLRUSet& lru_set) 59 | : lru_set_(lru_set) { } 60 | 61 | //! 
get the thread local object, if released via LRU, return a new one 62 | template 63 | std::shared_ptr get(Args&& ... args) { 64 | std::unique_lock lock(lru_set_.mutex_); 65 | std::thread::id id = std::this_thread::get_id(); 66 | // try to acquire shared ptr from array 67 | std::shared_ptr p = map_[id].lock(); 68 | if (p) { 69 | lru_set_.touch(p); 70 | return p; 71 | } 72 | // otherwise acquire a token from the LRU set and construct a new object 73 | p = std::make_shared(std::forward(args) ...); 74 | map_[id] = p; 75 | lru_set_.put(p); 76 | return p; 77 | } 78 | 79 | private: 80 | std::unordered_map > map_; 81 | ThreadObjectLRUSet& lru_set_; 82 | }; 83 | 84 | template 85 | using ThreadObjectArrayPtr = std::shared_ptr >; 86 | 87 | } // namespace cobs 88 | 89 | #endif // !COBS_UTIL_THREAD_OBJECT_ARRAY_HEADER 90 | 91 | /******************************************************************************/ 92 | -------------------------------------------------------------------------------- /cobs/util/timer.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/timer.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | namespace cobs { 20 | 21 | static std::mutex s_timer_add_mutex; 22 | 23 | Timer::Entry& Timer::find_or_create(const char* name) { 24 | uint32_t h = tlx::hash_djb2(name); 25 | for (size_t i = 0; i < timers_.size(); ++i) { 26 | if (timers_[i].hash == h && strcmp(timers_[i].name, name) == 0) 27 | return timers_[i]; 28 | } 29 | Entry new_entry; 30 | new_entry.hash = h; 31 | new_entry.name = name; 32 | new_entry.duration = std::chrono::duration::zero(); 33 | timers_.emplace_back(new_entry); 34 | return timers_.back(); 35 | } 36 | 37 | void Timer::active(const char* timer) { 38 | die_unless(timer); 39 | // yes, compare string pointers, not contents 40 | if (running_ == timer) { 41 | LOG1 << "Timer: starting same timer twice, maybe multi-threading?"; 42 | } 43 | stop(); 44 | running_ = timer; 45 | } 46 | 47 | void Timer::stop() { 48 | auto new_time_point = std::chrono::high_resolution_clock::now(); 49 | if (running_) { 50 | Entry& e = find_or_create(running_); 51 | e.duration += new_time_point - time_point_; 52 | total_duration_ += new_time_point - time_point_; 53 | } 54 | time_point_ = new_time_point; 55 | running_ = nullptr; 56 | } 57 | 58 | void Timer::reset() { 59 | timers_.clear(); 60 | total_duration_ = std::chrono::duration::zero(); 61 | } 62 | 63 | double Timer::get(const char* name) { 64 | return find_or_create(name).duration.count(); 65 | } 66 | 67 | Timer& Timer::operator += (const Timer& b) { 68 | std::unique_lock lock(s_timer_add_mutex); 69 | for (const Entry& t : b.timers_) { 70 | Entry& e = find_or_create(t.name); 71 | e.duration += t.duration; 72 | } 73 | total_duration_ += b.total_duration_; 74 | return *this; 75 | } 76 | 77 | void Timer::print(const char* info, std::ostream& os) const { 78 | die_unless(!running_); 79 | 80 | os << "TIMER info=" << info; 
81 | for (const Entry& timer : timers_) { 82 | os << ' ' << timer.name << '=' << timer.duration.count(); 83 | } 84 | os << " total=" << total_duration_.count() << std::endl; 85 | } 86 | 87 | void Timer::print(const char* info) const { 88 | return print(info, std::cerr); 89 | } 90 | 91 | } // namespace cobs 92 | 93 | /******************************************************************************/ 94 | -------------------------------------------------------------------------------- /cobs/util/timer.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/timer.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_UTIL_TIMER_HEADER 10 | #define COBS_UTIL_TIMER_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace cobs { 18 | 19 | class Timer 20 | { 21 | private: 22 | //! timer entry 23 | struct Entry { 24 | uint32_t hash; 25 | const char* name; 26 | std::chrono::duration duration; 27 | }; 28 | 29 | //! array of timers 30 | std::vector timers_; 31 | 32 | //! total duration 33 | std::chrono::duration total_duration_ = 34 | std::chrono::duration::zero(); 35 | 36 | //! currently running timer name 37 | const char* running_ = nullptr; 38 | //! start of currently running timer name 39 | std::chrono::time_point time_point_; 40 | 41 | Entry& find_or_create(const char* name); 42 | 43 | public: 44 | Timer() = default; 45 | void active(const char* timer); 46 | void stop(); 47 | void reset(); 48 | double get(const char* timer); 49 | void print(const char* info, std::ostream& os) const; 50 | void print(const char* info) const; 51 | 52 | //! add all timers from another, internally holds a global lock, because 53 | //! 
this is used to add thread values 54 | Timer& operator += (const Timer& b); 55 | }; 56 | 57 | } // namespace cobs 58 | 59 | #endif // !COBS_UTIL_TIMER_HEADER 60 | 61 | /******************************************************************************/ 62 | -------------------------------------------------------------------------------- /cobs/util/zip_stream_fwd.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * cobs/util/zip_stream_fwd.hpp 3 | * 4 | * 5 | * All rights reserved. Published under the MIT License in the LICENSE file. 6 | ******************************************************************************/ 7 | 8 | #ifndef COBS_UTIL_ZIP_STREAM_FWD_HEADER 9 | #define COBS_UTIL_ZIP_STREAM_FWD_HEADER 10 | 11 | #include 12 | 13 | namespace cobs { 14 | 15 | template > 17 | class basic_zip_ostream; 18 | 19 | template > 21 | class basic_zip_istream; 22 | 23 | //! A typedef for basic_zip_ostream 24 | using zip_ostream = basic_zip_ostream; 25 | //! A typedef for basic_zip_istream 26 | using zip_istream = basic_zip_istream; 27 | 28 | } // namespace cobs 29 | 30 | #endif // !COBS_UTIL_ZIP_STREAM_FWD_HEADER 31 | 32 | /******************************************************************************/ 33 | -------------------------------------------------------------------------------- /misc/mkdocs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | ################################################################################ 3 | # misc/mkdocs.sh 4 | # 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ################################################################################ 8 | 9 | # Script to build and install python module and then to rebuild docs 10 | 11 | set -e 12 | 13 | pushd build/python 14 | make -j8 15 | cp \ 16 | cobs_index.cpython-36m-x86_64-linux-gnu.so \ 17 | ~/.local/lib64/python3.6/site-packages/ 18 | popd 19 | 20 | pushd python/docs 21 | rm -rf _build _generated 22 | make html 23 | popd 24 | 25 | ################################################################################ 26 | -------------------------------------------------------------------------------- /misc/python-wheels/manylinux_x86_64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | ################################################################################ 3 | # misc/python-wheels/manylinux_x86_64.sh 4 | # 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file. 7 | ################################################################################ 8 | 9 | # run script inside a docker with 10 | # 11 | # docker pull quay.io/pypa/manylinux2010_x86_64 12 | # docker run --rm -it -v /home/tb/cobs:/src quay.io/pypa/manylinux2010_x86_64 13 | 14 | set -e 15 | 16 | yum update -y 17 | yum install -y wget zlib-devel 18 | 19 | # build newer cmake 20 | rm -rf /tmp/cmake 21 | mkdir /tmp/cmake 22 | cd /tmp/cmake 23 | 24 | wget https://cmake.org/files/v3.9/cmake-3.9.2.tar.gz 25 | tar -zxf cmake-3.9.2.tar.gz 26 | cd cmake-3.9.2 27 | 28 | ./bootstrap --prefix=/usr/local 29 | make -j4 30 | make install 31 | 32 | # build many wheels, one per interpreter found in /opt/python (expansions quoted against word splitting) 33 | rm -rf /src/build 34 | mkdir -p /src/dist 35 | for p in /opt/python/*; do 36 | "$p/bin/pip" wheel --verbose /src -w /src/dist 37 | done 38 | 39 | # repair wheels and remove the unrepaired originals 40 | for w in /src/dist/cobs_index-*whl; do 41 | auditwheel repair "$w" -w /src/dist/ 42 | rm -f "$w" 43 | done 44 | 45 | ################################################################################ 46 | 
-------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # python/CMakeLists.txt 3 | # 4 | # Copyright (c) 2019 Timo Bingmann 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file. 7 | ################################################################################ 8 | 9 | add_subdirectory(pybind11) 10 | pybind11_add_module(cobs_python module.cpp) 11 | 12 | # rename cobs_python target output to cobs_index 13 | set_target_properties(cobs_python PROPERTIES OUTPUT_NAME cobs_index) 14 | target_link_libraries(cobs_python PRIVATE cobs_static) 15 | 16 | ################################################################################ 17 | -------------------------------------------------------------------------------- /python/cobs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import cobs_index, subprocess, sys, os 3 | path = os.path.abspath(os.path.dirname(cobs_index.__file__)) + "/cobs.bin" 4 | subprocess.call([path] + sys.argv[1:]) 5 | -------------------------------------------------------------------------------- /python/docs/_static/cobs-index-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/python/docs/_static/cobs-index-architecture.png -------------------------------------------------------------------------------- /python/docs/cobs_index.rst: -------------------------------------------------------------------------------- 1 | .. 
automodule:: cobs_index 2 | -------------------------------------------------------------------------------- /python/docs/index.rst: -------------------------------------------------------------------------------- 1 | ========================================== 2 | COBS: A Compact Bit-Sliced Signature Index 3 | ========================================== 4 | 5 | COBS (COmpact Bit-sliced Signature index) is a cross-over between an inverted 6 | index and Bloom filters. Our target application is to index k-mers of DNA 7 | samples or q-grams from text documents and process **approximate pattern 8 | matching** queries on the corpus with a user-chosen coverage threshold. Query 9 | results may contain a number of false positives which decreases exponentially 10 | with the query length and the false positive rate of the index determined at 11 | construction time. COBS' compact but simple data structure outperforms other 12 | indexes in construction time and query performance with Mantis by Pandey et 13 | al. in second place. However, unlike Mantis and other previous work, COBS does 14 | not need the complete index in RAM and is thus designed to scale to larger 15 | document sets. 16 | 17 | .. image:: _static/cobs-index-architecture.png 18 | 19 | More information about COBS is presented in `our current research paper <https://arxiv.org/abs/1905.09624>`_: 20 | Timo Bingmann, Phelim Bradley, Florian Gauger, and Zamin Iqbal. 21 | "COBS: a Compact Bit-Sliced Signature Index". 22 | In: *26th International Symposium on String Processing and Information Retrieval (SPIRE)*. pages 285-303. Springer. October 2019. 23 | preprint arXiv:1905.09624. 24 | 25 | :ref:`See the tutorial page <tutorial>` on how to use COBS in Python scripts. 26 | 27 | Table of Contents 28 | ================= 29 | 30 | .. 
toctree:: 31 | :maxdepth: 2 32 | 33 | tutorial 34 | cobs_index 35 | -------------------------------------------------------------------------------- /python/docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst; mode: flyspell; ispell-local-dictionary: "en_US"; coding: utf-8 -*- 2 | .. _tutorial: 3 | .. currentmodule:: cobs_index 4 | 5 | ================================== 6 | Tutorial for COBS Python Interface 7 | ================================== 8 | 9 | Installation 10 | ------------ 11 | 12 | Installation of COBS with the Python interface is easy using pip. The `package name 13 | on PyPI <https://pypi.org/project/cobs_index/>`_ is ``cobs_index`` and you need cmake 14 | and a recent C++11 compiler to build the C++ library source. 15 | 16 | .. code-block:: bash 17 | 18 | $ pip install --user cobs_index 19 | 20 | Document Lists 21 | -------------- 22 | 23 | COBS can read and create an index from the following document types: 24 | 25 | - FastA (``.fasta``, ``.fa``, ``.fasta.gz``, ``.fa.gz``) 26 | - FastQ (``.fastq``, ``.fq``, ``.fastq.gz``, ``.fq.gz``) 27 | - McCortex (``.ctx``, ``.cortex``) 28 | - text files (``.txt``) 29 | - MultiFastA (``.mfasta``) 30 | 31 | The document types are identified by extension and compressed ``.gz`` files are 32 | handled transparently. The set of k-mers extracted from each file type is 33 | handled slightly differently: for FastA files each continuous subsequence is 34 | broken into k-mers individually, while McCortex files explicitly list all 35 | k-mers, and for text files the entire continuous file is broken into 36 | k-mers. Each document creates one entry in the index, except for MultiFastA where 37 | each subsequence is considered an individual document. 38 | 39 | COBS usually scans a directory and creates an index containing all documents it 40 | finds. For more fine-grained control, document lists are represented using 41 | :class:`DocumentList` objects. 
DocumentLists can be created empty or by scanning 42 | a directory; files can be added, and they contain :class:`DocumentEntry` objects 43 | which can be iterated over. 44 | 45 | .. code-block:: python 46 | 47 | import cobs_index as cobs 48 | 49 | doclist1 = cobs.DocumentList("/path/to/documents") 50 | print("doclist1: ({} entries)".format(len(doclist1))) 51 | for i, d in enumerate(doclist1): 52 | print("doc[{}] name {} size {}".format(i, d.name, d.size)) 53 | 54 | doclist2 = cobs.DocumentList() 55 | doclist2.add("/path/to/single/document.fa") 56 | doclist2.add_recursive("/path/to/documents", cobs.FileType.Fasta) 57 | print("doclist2: ({} entries)".format(len(doclist2))) 58 | for i, d in enumerate(doclist2): 59 | print("doc[{}] name {} size {}".format(i, d.name, d.size)) 60 | 61 | Index Construction 62 | ------------------ 63 | 64 | Compact indices are constructed using the functions :func:`compact_construct` or 65 | :func:`compact_construct_list`. The former scans a directory for documents and 66 | constructs an index from them, while the latter takes an explicit 67 | :class:`DocumentList`. Note that the output index file *must* end with 68 | ``.cobs_compact``. 69 | 70 | .. code-block:: python 71 | 72 | cobs.compact_construct("/path/to/documents", "my_index.cobs_compact") 73 | 74 | Parameters for index construction may be passed using a 75 | :class:`CompactIndexParameters` object. See the class documentation for a 76 | complete list of parameters. The default parameters are a reasonable choice for 77 | most DNA k-mer applications. 78 | 79 | .. 
code-block:: python 80 | 81 | import cobs_index as cobs 82 | 83 | p = cobs.CompactIndexParameters() 84 | p.term_size = 31 # k-mer size 85 | p.clobber = True # overwrite output and temporary files 86 | p.false_positive_rate = 0.4 # higher false positive rate -> smaller index 87 | 88 | cobs.compact_construct("/path/to/documents", "my_index.cobs_compact", index_params=p) 89 | 90 | Besides compact indices, COBS also constructs and supports "classic" 91 | indices. These are, however, usually not used in practice and thus not 92 | discussed further here. 93 | 94 | Querying an Index 95 | ----------------- 96 | 97 | To query an index, first load it using a :class:`Search` object. This step 98 | detects the type of index, reads the metadata, and opens the entire file using 99 | ``mmap``. 100 | 101 | Querying is performed with the :meth:`Search.search` method. This method returns 102 | **a list containing pairs**: ``(#occurrences, document name)``. 103 | 104 | .. code-block:: python 105 | 106 | import cobs_index as cobs 107 | 108 | s = cobs.Search("out.cobs_compact") 109 | r = s.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT") 110 | print(r) 111 | # output: [(20, 'sample1'), (16, 'sample2'), ...] 112 | 113 | With the default search parameters **all document scores** are returned. For 114 | large corpora, creating this Python list is a substantial overhead, so the 115 | result set should be limited using a) the ``threshold`` parameter or b) the 116 | ``num_results`` parameter. The threshold determines the fraction of k-mers in the 117 | query a document must reach to be included in the result, while ``num_results`` 118 | simply limits the list size to a given number. 
119 | -------------------------------------------------------------------------------- /python/notes.md: -------------------------------------------------------------------------------- 1 | ## Run Tests with setuptools 2 | 3 | python3 setup.py test 4 | 5 | ## Build and Install Locally 6 | 7 | python3 -m pip install --user --verbose . 8 | 9 | ## Build sdist and upload to PyPI-Test 10 | 11 | python3 setup.py sdist 12 | python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* 13 | 14 | ## Build wheels and upload to PyPI-Test 15 | 16 | docker pull quay.io/pypa/manylinux2010_x86_64 17 | docker run --rm -it -v /home/tb/cobs:/src quay.io/pypa/manylinux2010_x86_64 18 | 19 | and run 20 | /src/misc/python-wheels/manylinux_x86_64.sh 21 | 22 | python3 -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* 23 | 24 | ## Test distributed inside a virtualenv 25 | 26 | virtualenv a 27 | cd a 28 | source bin/activate 29 | 30 | python3 -m pip install --index-url https://test.pypi.org/simple/ --no-deps cobs_index 31 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /python/tests/test_cobs_index.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | import cobs_index as cobs 5 | 6 | mydir = os.path.dirname(os.path.realpath(__file__)) 7 | datadir = os.path.realpath(mydir + "/../../tests/data") 8 | 9 | cobs.disable_cache() 10 | 11 | class MainTest(unittest.TestCase): 12 | # read a directory containing FastA files 13 | def test_doc_list(self): 14 | l1 = cobs.DocumentList(datadir + "/fasta") 15 | self.assertEqual(l1.size(), 7) 16 | 17 | l2 = cobs.DocumentList() 18 | l2.add_recursive(datadir + "/fasta") 19 | self.assertEqual(l2.size(), 7) 20 | 21 | # 
construct classic index and run queries 22 | def test_classic_construct_query(self): 23 | index_file = datadir + "/python_test.cobs_classic" 24 | 25 | # construct classic index 26 | p = cobs.ClassicIndexParameters() 27 | p.clobber = True 28 | cobs.classic_construct( 29 | input=datadir + "/fasta", 30 | out_file=index_file, 31 | index_params=p) 32 | self.assertTrue(os.path.isfile(index_file)) 33 | 34 | # run queries 35 | s = cobs.Search(index_file) 36 | r = s.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT") 37 | #print(r) 38 | self.assertEqual(len(r), 7) 39 | self.assertEqual(r[0].doc_name, "sample1") 40 | self.assertEqual(r[0].score, 20) 41 | 42 | # construct compact index and run queries 43 | def test_compact_construct_query(self): 44 | index_file = datadir + "/python_test.cobs_compact" 45 | 46 | # construct compact index 47 | p = cobs.CompactIndexParameters() 48 | p.clobber = True 49 | cobs.compact_construct( 50 | input=datadir + "/fasta", 51 | out_file=index_file, 52 | index_params=p) 53 | self.assertTrue(os.path.isfile(index_file)) 54 | 55 | # run queries 56 | s = cobs.Search(index_file) 57 | r = s.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT") 58 | #print(r) 59 | self.assertEqual(len(r), 7) 60 | self.assertEqual(r[0].doc_name, "sample1") 61 | self.assertEqual(r[0].score, 20) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import platform 5 | import subprocess 6 | import unittest 7 | 8 | from setuptools import setup, Extension 9 | from setuptools.command.build_ext import build_ext 10 | from distutils.version import LooseVersion 11 | 12 | class CMakeExtension(Extension): 13 | def __init__(self, name, sourcedir=''): 14 | Extension.__init__(self, name, sources=[]) 15 | self.sourcedir = 
os.path.abspath(sourcedir) 16 | 17 | class CMakeBuild(build_ext): 18 | def run(self): 19 | try: 20 | out = subprocess.check_output(['cmake', '--version']) 21 | except OSError: 22 | raise RuntimeError("CMake must be installed to build the following extensions: " + 23 | ", ".join(e.name for e in self.extensions)) 24 | 25 | if platform.system() == "Windows": 26 | cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) 27 | if cmake_version < '3.1.0': 28 | raise RuntimeError("CMake >= 3.1.0 is required on Windows") 29 | 30 | for ext in self.extensions: 31 | self.build_extension(ext) 32 | 33 | def build_extension(self, ext): 34 | extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) 35 | cmake_args = ['-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=' + extdir, 36 | '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, 37 | '-DCOBS_EXECUTABLE_SUFFIX=.bin', 38 | '-DPYTHON_EXECUTABLE=' + sys.executable] 39 | 40 | cfg = 'Debug' if self.debug else 'Release' 41 | build_args = ['--config', cfg] 42 | 43 | if platform.system() == "Windows": 44 | cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)] 45 | if sys.maxsize > 2**32: 46 | cmake_args += ['-A', 'x64'] 47 | build_args += ['--', '/m'] 48 | else: 49 | cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] 50 | build_args += ['--', '-j2'] 51 | 52 | env = os.environ.copy() 53 | env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format( 54 | env.get('CXXFLAGS', ''), self.distribution.get_version()) 55 | 56 | if not os.path.exists(self.build_temp): 57 | os.makedirs(self.build_temp) 58 | subprocess.check_call( 59 | ['cmake', ext.sourcedir] + cmake_args, 60 | cwd=self.build_temp, env=env) 61 | subprocess.check_call( 62 | ['cmake', '--build', '.', '--target', 'cobs'] + build_args, 63 | cwd=self.build_temp) 64 | subprocess.check_call( 65 | ['cmake', '--build', '.', '--target', 'cobs_python'] + build_args, 66 | cwd=self.build_temp) 67 | subprocess.check_call(['strip', extdir + 
"/cobs.bin"]) 68 | 69 | def test_suite(): 70 | test_loader = unittest.TestLoader() 71 | test_suite = test_loader.discover('python/tests', pattern='test_*.py') 72 | return test_suite 73 | 74 | with open("README.md", "r") as fh: 75 | long_description = fh.read() 76 | 77 | if __name__ == '__main__': 78 | setup( 79 | name='cobs_index', 80 | version='0.1.2', 81 | description='Compact Bit-Sliced Signature Index (COBS)', 82 | long_description=long_description, 83 | long_description_content_type="text/markdown", 84 | url="https://panthema.net/cobs", 85 | author='Timo Bingmann', 86 | author_email='tbdev@panthema.net', 87 | ext_modules=[CMakeExtension('cobs_index')], 88 | cmdclass=dict(build_ext=CMakeBuild), 89 | zip_safe=False, 90 | # use MANIFEST.in for extra source files 91 | include_package_data=True, 92 | # include cobs wrapper script 93 | scripts=["python/cobs"], 94 | # find test suite 95 | test_suite='setup.test_suite', 96 | classifiers=[ 97 | "Development Status :: 4 - Beta", 98 | "Topic :: Scientific/Engineering :: Bio-Informatics", 99 | "Topic :: Text Processing :: Indexing" 100 | ], 101 | ) 102 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # src/CMakeLists.txt 3 | # 4 | # Copyright (c) 2019 Timo Bingmann 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ################################################################################ 8 | 9 | # glob programs 10 | file(GLOB_RECURSE EXECUTABLE_FILES 11 | "${PROJECT_SOURCE_DIR}/src/*.cpp") 12 | 13 | foreach(target ${EXECUTABLE_FILES}) 14 | get_filename_component(target_name ${target} NAME_WE) 15 | add_executable(${target_name} ${target}) 16 | target_link_libraries(${target_name} cobs_static) 17 | endforeach(target) 18 | 19 | ################################################################################ 20 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # tests/CMakeLists.txt 3 | # 4 | # Copyright (c) 2019 Timo Bingmann 5 | # 6 | # All rights reserved. Published under the MIT License in the LICENSE file. 7 | ################################################################################ 8 | 9 | # copy resources 10 | file(COPY data DESTINATION .) 
11 | 12 | # glob tests 13 | file(GLOB_RECURSE TEST_FILES 14 | "${PROJECT_SOURCE_DIR}/tests/*.hpp" "${PROJECT_SOURCE_DIR}/tests/*.cpp") 15 | 16 | # compile into one program 17 | add_executable(cobs_tests ${TEST_FILES} ${SOURCE_FILES}) 18 | target_compile_definitions(cobs_tests PRIVATE cobs_test) 19 | target_link_libraries(cobs_tests cobs_static gtest gtest_main) 20 | gtest_add_tests(cobs_tests "" ${TEST_FILES}) 21 | 22 | ################################################################################ 23 | -------------------------------------------------------------------------------- /tests/classic_index_construction.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/classic_index_construction.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include "test_util.hpp" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace fs = cobs::fs; 17 | 18 | static fs::path base_dir = "data/classic_index_construction"; 19 | static fs::path input_dir = base_dir / "input"; 20 | static fs::path index_dir = base_dir / "index"; 21 | static fs::path index_file = base_dir / "index.cobs_classic"; 22 | static fs::path tmp_path = base_dir / "tmp"; 23 | 24 | class classic_index_construction : public ::testing::Test 25 | { 26 | protected: 27 | void SetUp() final { 28 | cobs::error_code ec; 29 | fs::remove_all(base_dir, ec); 30 | } 31 | void TearDown() final { 32 | cobs::error_code ec; 33 | fs::remove_all(base_dir, ec); 34 | } 35 | }; 36 | 37 | TEST_F(classic_index_construction, deserialization) { 38 | // generate 39 | std::string query = cobs::random_sequence(10000, 1); 40 | auto documents = generate_documents_all(query, /* num_documents */ 33); 41 | 
generate_test_case(documents, input_dir.string()); 42 | 43 | // get file names 44 | std::vector paths; 45 | std::copy_if(fs::recursive_directory_iterator(input_dir), 46 | fs::recursive_directory_iterator(), 47 | std::back_inserter(paths), 48 | [](const auto& p) { 49 | return cobs::file_has_header(p); 50 | }); 51 | std::sort(paths.begin(), paths.end()); 52 | 53 | // construct classic index 54 | cobs::ClassicIndexParameters index_params; 55 | index_params.num_hashes = 3; 56 | index_params.false_positive_rate = 0.1; 57 | 58 | cobs::classic_construct( 59 | cobs::DocumentList(input_dir), index_file, tmp_path, index_params); 60 | 61 | // read classic index and check header fields 62 | std::vector data; 63 | cobs::ClassicIndexHeader h; 64 | h.read_file(index_file, data); 65 | ASSERT_EQ(h.file_names_.size(), 33u); 66 | ASSERT_EQ(h.num_hashes_, 3u); 67 | ASSERT_EQ(h.file_names_.size(), paths.size()); 68 | for (size_t i = 0; i < h.file_names_.size(); i++) { 69 | ASSERT_EQ(h.file_names_[i], cobs::base_name(paths[i])); 70 | } 71 | 72 | // check ratio of zeros/ones 73 | std::map num_ones; 74 | for (size_t j = 0; j < h.signature_size_; j++) { 75 | for (size_t k = 0; k < h.row_size(); k++) { 76 | uint8_t d = data[j * h.row_size() + k]; 77 | for (size_t o = 0; o < 8; o++) { 78 | size_t file_names_index = k * 8 + o; 79 | if (file_names_index < h.file_names_.size()) { 80 | std::string file_name = h.file_names_[file_names_index]; 81 | num_ones[file_name] += (d & (1 << o)) >> o; 82 | } 83 | } 84 | } 85 | } 86 | 87 | double set_bit_ratio = 88 | cobs::calc_average_set_bit_ratio(h.signature_size_, 3, 0.1); 89 | double num_ones_average = set_bit_ratio * h.signature_size_; 90 | for (auto& no : num_ones) { 91 | ASSERT_LE(no.second, num_ones_average * 1.01); 92 | } 93 | } 94 | 95 | TEST_F(classic_index_construction, combine) { 96 | using cobs::pad_index; 97 | fs::create_directories(index_dir); 98 | // generate 10 individual sets of documents and construct indices 99 | using DocumentSet = 
std::vector >; 100 | std::vector doc_sets; 101 | for (size_t i = 0; i < 10; ++i) { 102 | std::string query = cobs::random_sequence(10000, /* seed */ i + 1); 103 | auto documents = generate_documents_all( 104 | query, /* num_documents */ 3, /* num_terms */ 100); 105 | generate_test_case( 106 | documents, /* prefix */ "set_" + pad_index(i) + "_", 107 | input_dir / pad_index(i)); 108 | doc_sets.emplace_back(std::move(documents)); 109 | 110 | // construct classic index 111 | cobs::ClassicIndexParameters index_params; 112 | index_params.num_hashes = 3; 113 | index_params.false_positive_rate = 0.1; 114 | 115 | cobs::classic_construct( 116 | cobs::DocumentList(input_dir / pad_index(i)), 117 | index_dir / (pad_index(i) + ".cobs_classic"), 118 | tmp_path, index_params); 119 | } 120 | 121 | fs::path result_file; 122 | cobs::classic_combine( 123 | index_dir, index_file, result_file, 124 | /* mem_bytes */ 128 * 1024 * 1024, /* num_threads */ 4, 125 | /* keep_temporary */ false); 126 | 127 | // check result by querying for document terms 128 | cobs::ClassicSearch s_base( 129 | std::make_shared(result_file)); 130 | 131 | std::vector result; 132 | 133 | for (size_t ds = 0; ds < 10; ++ds) { 134 | for (size_t d = 0; d < doc_sets[ds].size(); ++d) { 135 | for (size_t i = 0; i < doc_sets[ds][d].num_kmers(); ++i) { 136 | std::string doc_match = 137 | "set_" + pad_index(ds) + "_document_" + pad_index(d); 138 | std::string kmer = doc_sets[ds][d][i].string(); 139 | LOG0 << kmer; 140 | 141 | s_base.search(kmer, result); 142 | 143 | bool found = false; 144 | for (auto& r : result) { 145 | if (r.doc_name == doc_match && r.score > 0) 146 | found = true; 147 | } 148 | ASSERT_TRUE(found); 149 | } 150 | } 151 | } 152 | } 153 | 154 | /******************************************************************************/ 155 | -------------------------------------------------------------------------------- /tests/compact_index_construction.cpp: 
// NOTE(review): in this copy of the file the text between '<' and '>' appears
// to have been stripped (include targets, template arguments). The code below
// is kept token-for-token as found; restore the arguments from version control
// before compiling.

namespace fs = cobs::fs;

// Scratch locations used by the tests in this file. Everything lives below
// base_dir, which the fixture deletes before and after every test.
static fs::path base_dir = "data/compact_index_construction";
static fs::path input_dir = base_dir / "input";
static fs::path index_file = base_dir / "index.cobs_compact";
static fs::path tmp_path = base_dir / "tmp";
// Sub-directory of tmp_path that the deserialization test scans for partial
// classic indexes.
static fs::path cobs_2_dir = tmp_path / cobs::pad_index(2);

// Sequence from which the test documents are generated
// (length 100000, second argument 1 -- presumably the seed; TODO confirm).
static std::string query = cobs::random_sequence(100000, 1);

/*!
 * Test fixture: removes base_dir recursively before and after each test so
 * that every test starts from, and leaves behind, a clean directory.
 */
class compact_index_construction : public ::testing::Test
{
protected:
    void SetUp() final {
        // error_code overload: ec is never inspected, so a failed removal
        // is silently ignored.
        cobs::error_code ec;
        fs::remove_all(base_dir, ec);
    }
    void TearDown() final {
        cobs::error_code ec;
        fs::remove_all(base_dir, ec);
    }
};

/*!
 * Builds a compact index over 200 generated documents with page_size = 16 and
 * checks that the stream position after deserializing the header is a multiple
 * of the page size.
 */
TEST_F(compact_index_construction, padding) {
    // generate
    auto documents = generate_documents_all(query, /* num_documents */ 200);
    generate_test_case(documents, input_dir.string());

    // construct compact index
    cobs::CompactIndexParameters index_params;
    index_params.num_hashes = 3;
    index_params.false_positive_rate = 0.1;
    index_params.page_size = 16;

    cobs::compact_construct(
        cobs::DocumentList(input_dir), index_file, tmp_path, index_params);

    // read compact index header, check page_size alignment of data
    std::ifstream ifs;
    cobs::deserialize_header(ifs, index_file);
    cobs::StreamPos sp = cobs::get_stream_pos(ifs);
    ASSERT_EQ(sp.curr_pos % index_params.page_size, 0U);
}

/*!
 * Builds a compact index with page_size = 2 (temporary files kept), reads it
 * back and checks, in order:
 *  - the header's file names, number of parameter sets and number of data
 *    blocks,
 *  - signature_size / num_hashes of each parameter set,
 *  - an upper bound on the number of set bits per document,
 *  - that the data blocks equal the partial classic indexes found under
 *    cobs_2_dir (all but the last one).
 */
TEST_F(compact_index_construction, deserialization) {
    // generate
    auto documents = generate_documents_all(query);
    generate_test_case(documents, input_dir.string());

    // get file names
    cobs::DocumentList doc_list(input_dir, cobs::FileType::Any);
    doc_list.sort_by_size();

    // Re-sort every run of 16 consecutive entries (the last run may be
    // shorter) so that paths[] can be compared with h.file_names_ below.
    // NOTE(review): 16 presumably equals page_size (2 bytes) * 8 documents
    // per byte -- confirm against compact_construct.
    std::vector paths = doc_list.list();
    for (size_t i = 0; i < documents.size(); i += 2 * 8) {
        size_t middle_index = std::min(i + 16, paths.size());
        std::sort(paths.begin() + i, paths.begin() + middle_index);
    }

    // construct compact index
    cobs::CompactIndexParameters index_params;
    index_params.num_hashes = 3;
    index_params.false_positive_rate = 0.1;
    index_params.page_size = 2;
    // keep the temporary files: they are read back from cobs_2_dir below
    index_params.keep_temporary = true;

    cobs::compact_construct(
        cobs::DocumentList(input_dir), index_file, tmp_path, index_params);

    // read compact index header and check fields
    std::vector > data;
    cobs::CompactIndexHeader h;
    h.read_file(index_file, data);
    ASSERT_EQ(h.file_names_.size(), 33U);
    ASSERT_EQ(h.parameters_.size(), 3U);
    ASSERT_EQ(data.size(), 3U);
    for (size_t i = 0; i < h.file_names_.size(); i++) {
        ASSERT_EQ(h.file_names_[i], cobs::base_name(paths[i].path_));
    }

    // check compact index parameters
    std::vector document_sizes;
    // NOTE(review): `parameters` is declared but never used in this test.
    std::vector parameters;
    for (const fs::path& p : fs::recursive_directory_iterator(input_dir)) {
        // TODO: this test does nothing, because DocumentHeader should be below!
        if (cobs::file_has_header(p)) {
            std::cout << "doc: " << p.string() << std::endl;
            document_sizes.push_back(fs::file_size(p));
        }
    }

    // Only every 8th size in ascending order (i % 8 == 7) is checked, against
    // parameter set i / 8.
    std::sort(document_sizes.begin(), document_sizes.end());
    for (size_t i = 0; i < document_sizes.size(); i++) {
        if (i % 8 == 7) {
            uint64_t signature_size = cobs::calc_signature_size(
                document_sizes[i] / 8, index_params.num_hashes, 0.1);
            ASSERT_EQ(h.parameters_[i / 8].signature_size, signature_size);
            ASSERT_EQ(h.parameters_[i / 8].num_hashes, index_params.num_hashes);
        }
    }

    // check ratio of ones and zeros
    // num_ones[i][file_name] counts the set bits belonging to that document in
    // data block i.
    std::vector > num_ones(h.parameters_.size());
    for (size_t i = 0; i < h.parameters_.size(); i++) {
        for (size_t j = 0; j < h.parameters_[i].signature_size; j++) {
            for (size_t k = 0; k < h.page_size_; k++) {
                // byte k of row j in block i
                uint8_t d = data[i][j * h.page_size_ + k];
                for (size_t o = 0; o < 8; o++) {
                    // bit o of that byte is attributed to this document index
                    size_t file_names_index = i * h.page_size_ * 8 + k * 8 + o;
                    if (file_names_index < h.file_names_.size()) {
                        std::string file_name = h.file_names_[file_names_index];
                        // add 1 if bit o is set, 0 otherwise
                        num_ones[i][file_name] += (d & (1 << o)) >> o;
                    }
                }
            }
        }
    }

    // Each document's set-bit count must not exceed the value derived from
    // calc_average_set_bit_ratio by more than 2%.
    for (size_t i = 0; i < h.parameters_.size(); i++) {
        double set_bit_ratio = cobs::calc_average_set_bit_ratio(
            h.parameters_[i].signature_size, 3, 0.1);
        double num_ones_average = set_bit_ratio * h.parameters_[i].signature_size;
        for (auto& no : num_ones[i]) {
            ASSERT_LE(no.second, num_ones_average * 1.02);
        }
    }

    // check content of compact index against partial classic indexes
    std::vector > indices;
    for (auto& p : fs::directory_iterator(cobs_2_dir)) {
        if (fs::is_directory(p)) {
            for (const fs::path& cobs_p : fs::directory_iterator(p)) {
                if (cobs::file_has_header(cobs_p)) {
                    cobs::ClassicIndexHeader cih;
                    std::vector data;
                    cih.read_file(cobs_p, data);
                    indices.push_back(data);
                }
            }
        }
    }

    // Order the partial indexes by payload size before pairing them with the
    // data blocks by position.
    std::sort(indices.begin(), indices.end(),
              [](auto& i1, auto& i2) {
                  return i1.size() < i2.size();
              });

    ASSERT_EQ(indices.size(), data.size());
    // NOTE(review): the loop stops at size() - 1, so the last (largest) index
    // is never compared; the reason is not visible in this file -- confirm
    // whether that is intentional.
    for (size_t i = 0; i < indices.size() - 1; i++) {
        ASSERT_EQ(indices[i].size(), data[i].size());
        for (size_t j = 0; j < indices[i].size(); j++) {
            ASSERT_EQ(indices[i].data()[j], data[i][j]);
        }
    }
}
8 | ******************************************************************************/ 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | namespace fs = cobs::fs; 16 | 17 | static fs::path in_dir = "data/cortex/"; 18 | static fs::path in_cortex = in_dir / "document.ctx"; 19 | static fs::path compare_document = in_dir / "document_sorted.txt"; 20 | 21 | TEST(cortex, process_kmers) { 22 | cobs::CortexFile ctx(in_cortex); 23 | 24 | // check contents 25 | die_unequal(ctx.version_, 6u); 26 | die_unequal(ctx.kmer_size_, 31u); 27 | die_unequal(ctx.num_words_per_kmer_, 1u); 28 | die_unequal(ctx.num_colors_, 1u); 29 | die_unequal(ctx.name_, "DRR030535"); 30 | 31 | die_unequal(ctx.num_kmers(), 24158u); 32 | 33 | // process kmers 34 | std::vector kmer_list; 35 | ctx.process_terms( 36 | 31, 37 | [&](const tlx::string_view& v) { 38 | kmer_list.emplace_back(v.to_string()); 39 | }); 40 | 41 | die_unequal(ctx.num_kmers(), kmer_list.size()); 42 | std::sort(kmer_list.begin(), kmer_list.end()); 43 | 44 | // compare with ground truth 45 | std::ifstream ifs(compare_document); 46 | std::string line; 47 | size_t i = 0; 48 | while (std::getline(ifs, line)) { 49 | ASSERT_EQ(line, kmer_list[i]); 50 | i++; 51 | } 52 | } 53 | 54 | TEST(cortex, sample1) { 55 | std::string line; 56 | 57 | cobs::CortexFile ctx31(in_dir / "sample1-k31.ctx"); 58 | std::ifstream txt31(in_dir / "sample1-k31.txt"); 59 | ctx31.process_terms( 60 | 31, 61 | [&](const tlx::string_view& v) { 62 | ASSERT_TRUE(std::getline(txt31, line)); 63 | ASSERT_EQ(line, v.to_string()); 64 | }); 65 | ASSERT_FALSE(std::getline(txt31, line)); 66 | 67 | cobs::CortexFile ctx19(in_dir / "sample1-k19.ctx"); 68 | std::ifstream txt19(in_dir / "sample1-k19.txt"); 69 | ctx19.process_terms( 70 | 19, 71 | [&](const tlx::string_view& v) { 72 | ASSERT_TRUE(std::getline(txt19, line)); 73 | ASSERT_EQ(line, v.to_string()); 74 | }); 75 | ASSERT_FALSE(std::getline(txt31, line)); 76 | 77 | cobs::CortexFile ctx15(in_dir / "sample1-k15.ctx"); 78 | 
std::ifstream txt15(in_dir / "sample1-k15.txt"); 79 | ctx15.process_terms( 80 | 15, 81 | [&](const tlx::string_view& v) { 82 | ASSERT_TRUE(std::getline(txt15, line)); 83 | ASSERT_EQ(line, v.to_string()); 84 | }); 85 | ASSERT_FALSE(std::getline(txt31, line)); 86 | } 87 | 88 | /******************************************************************************/ 89 | -------------------------------------------------------------------------------- /tests/data/cortex/document.ctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/cortex/document.ctx -------------------------------------------------------------------------------- /tests/data/cortex/sample1-k15.ctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/cortex/sample1-k15.ctx -------------------------------------------------------------------------------- /tests/data/cortex/sample1-k19.ctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/cortex/sample1-k19.ctx -------------------------------------------------------------------------------- /tests/data/cortex/sample1-k31.ctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/cortex/sample1-k31.ctx -------------------------------------------------------------------------------- /tests/data/fasta/sample1.fasta: -------------------------------------------------------------------------------- 1 | >chr5 2 | AGAGCTCTCCTCACAGTTTCTCACGGCATCTGTTTCCAAACAAAGGAAAG 3 | GGTTTGGACTAGCCTGTCAATAACTAACGGTATCATTCAGAGAGAAATGT 4 | 
CACCCGCCAGCCCTCACTCCTGCCTAACTAGAGTCCTGAGTGAGGATTCT 5 | AACACAACGGGGCTCCATCCCTCAGCAATGCCACTCAAATTAAAACACAT 6 | GATGTGCCAAAAGAAAAAAAGAGCAGGAAAAACGGGACACAGGGCGGTCA 7 | GACACTGGAAAGTGAATCCGTGGTAAGAACACACCGGTTCTAACACCTAG 8 | TGAGGCTTGGGCGTTGGAAAGTGAATCCGTGGTAAAAATGCAGGTCGGTC 9 | TAATACTTAGAACCTTAACCAATGAACAGGAAAGCAACTTTAAAAGCTAA 10 | TGGACGCCTTGACCCAGGAGGTAGAGATTGCAGTAAGCCAAGATTGTGCT 11 | ACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAAAATA 12 | ATGGACGCTCCCAAAATCCACCTGTTTAAACTCAATCATGTGCCATTTTT 13 | TAGGACAAATCCTGGATATCGACCAAACAAACTCACATTTGAAAGTCGCA 14 | CATAAATAACATACAATTAGGCTGTAGTCCAAACCCATAAAGACCAGGTC 15 | AGAAAAGTAAAGTGACCTTGGGCAAAACTATTTGGTAAGTATCATCCCAG 16 | GTTTCTGCAGCTCCAGAGAGTAAGAGACAGAGAGAGGAGAGGACTTAGAT 17 | GCACCAGGCAGGGCTCTCAATATTTTTACTACACTCTGTGGATGAAGATC 18 | CCAGGTATCTGTGTCACTTTGTAAAACCTACAGGTCCCACTATCCACCTA 19 | GGAAACGGAACTGTGCCAGTCCCCTGCAGGTGGTACCTGGAAGACAATGG 20 | CAGGGAGTGTGGTCTGGTAGTACCCGTCCTGGTCGGCTTCTGGCTCTGTC 21 | TCTTTGACCCAGTCTTTCTTGTCTGTCTCCAGCGCTTTCCGCAGCCAGGC 22 | GATGATGTTTGACTAAAGAAAGTTCAGCATCATGGAGAAGAGAAGAGAGA 23 | AGAGCAGGTGCCTGAGGGTCATGCCAATTACCTGGTCAGTTGCATGCACA 24 | AGCTCCACAAACCAGACACAGGGCACCAGCGTCCTCTTTCTCAAATGAGG 25 | ACTCCAGAGCAGGGACCCAGGAGCCACCCCCATGGGAGGCAGTGACCACA 26 | GCAGCCCGCAGGGGTTTAACTGGAACCCTTGGTACAGATAAAGAGGGAGA 27 | AGAAACTAGATGCCCTTTTGGCTGCACCTCATGCTGGAGGACAAGGCCGC 28 | TGAATCCCAGGTGCACAGAGAATGAGGCCGACCAGGTCCGGTCAGGCCTG 29 | TTCCCATGCCCAGACTCAGCAACCACAGCAGCTCACGTCCCAAGAGAAGA 30 | CAGCGCTCCTTCCCCAGCTCCTGAGGACACCTCGGCTTCTGGTCATAATG 31 | CTGCCTTGCTAGGTGGACGCACCCTTGTCTAAGAAGCAGACCCTGCTGAC 32 | CTCCTCTTCTCACTCTGAGACCACCTGGTGGTCACCAGGCTGCAAATGGA 33 | CTCGCCCTTTGGTCCTTGACGGACCCCCCCAACCATGGGACTCAGAGCCG 34 | AGAGCCTGGCAGGTGCTGCCACACACGGTTCAGACGTTCTGGGCCCTGGC 35 | CAGCCTGCCGAGTAGGGCACAGGCTCACGTCTCCAGCAGAGGACAAGCTT 36 | CAGATCCAGGCCCAGAGAAGGGGCCATTCTCCTCTTCTAGCAAACTGACC 37 | TGTCAAGGCCTGGAAGCAGAGCCCGGGCAACTGAGCAATGCATCCTAGGA 38 | AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT 39 | 
AGTTACTCATATAACCAGCACGGTGGTCTGTGGCACGCAGGCCAGCCCTA 40 | CTCACAGTGAGCGTGGACATGTACGTGTCAAGCAGCTCAGAGACCACGTG 41 | TGGAGAAAGCAATGGCTCCAGGGTGCCGACATCCACTTCCGGGGCCAGCT 42 | CCACGTTCCTCATCATCTCAGTACTATGGGGGAACCAGGAATGTGAGTGA 43 | CAAACCACGACAGACACAGAAGAGCATGCCCAAGTGTGCCGAGACTGTTT 44 | CACTTGACACATGAATGTCCACCACCCTGGAGACTTCCTGGTCACAGCCG 45 | GCCCACAGCTCAGGGGAAAGTGCAGGGCAGGGGCCGAGGGCTTTGGGGTC 46 | TCTGTGGCATCACCCCACTGGCTCCCAGACTGCGTGAGTCCCTCCAACCG 47 | GGGAAAGTGCAGGGCAGGGGCCGAGGGCTTTGGGGTCTCTGTGGCATCAC 48 | CCACTGGCTCCGAGACCACTTGAGTCCCTCCAACCCCAGCTTCACCCTGC 49 | TGCGCCAGCTTCATGAGGTGCTGGGGCTCAAATGTGGAAGAGCCTTTCCT 50 | GTGGGGAAGACGTGTGTTCTATTCACTGCACCTGCAGAGCCGAGGCAGGG 51 | CCTGGCAAACAGGAGGCGCTGAATAAACAAACGTACTCAGTCAAAACGAC 52 | AAGCACAGCCGCCCCCACAGCCACGGGCGCCACGTCCACTGACTCAACCA 53 | AACACAAACTGAAAGTATTTGGGAAAAACAACAATAAAAAATAATAAAAA 54 | TTTCAAAAAATATAGAACAACTATTTCCATAGCATTCACATTGTATTTAG 55 | TCTTATAAGTAATCTAGAAATGATTTAGAGCATGAAGGAGAATGTACGCA 56 | GGTTACGTGCAAACACTATACCCTTTTATAAAGGGGACTTGGGCTACGGA 57 | GGATTTTGATAGCCAAGGGGGGTCCTGGAATCCCTCCCCATTGAGTATGG 58 | AGGGGCAGCTGTGCTGTCCTTTGGAAACAAGCCACTGCAGTCCACATCAA 59 | TATGCTCTGCAGACGGTGATGACTGGAACGTACAATACAGGACATTACTG 60 | ATAAAAACGCATCCGCAAGAACAGGCAAAACTACGTGAGGTCTACTGTCC 61 | AGAAGACAGGAAACACATGCTACATTTCAGGAAAAGTTTTCATTAGAAAG 62 | TTATTATCAGCCGGGCATGGTGGCTCACGCCTGTAATCCCAGCACTTTGG 63 | GAAGCCGAGACGGGCGGATCATGAGGTCAGGAGATTGAGACCATCCTGGC 64 | TAACACAGTGAAACCCCATCTCTATTAAAAATACAAAAAAATTAGCGGGG 65 | -------------------------------------------------------------------------------- /tests/data/fasta/sample2.fasta: -------------------------------------------------------------------------------- 1 | >sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD 4 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL 5 | 
EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD 6 | SFRKIYTDLGWKFTPL -------------------------------------------------------------------------------- /tests/data/fasta/sample3.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/fasta/sample3.fasta.gz -------------------------------------------------------------------------------- /tests/data/fasta/sample4.fasta: -------------------------------------------------------------------------------- 1 | >sp|Q197F8|002R_IIV3 Uncharacterized protein 002R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-002R PE=4 SV=1 2 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL 3 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT 4 | FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD 5 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET 6 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY 7 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS 8 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI 9 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC -------------------------------------------------------------------------------- /tests/data/fasta/sample5.fasta: -------------------------------------------------------------------------------- 1 | >sp|Q197F7|003L_IIV3 Uncharacterized protein 003L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-003L PE=4 SV=1 2 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT 3 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS 4 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI -------------------------------------------------------------------------------- /tests/data/fasta/sample6.fasta: -------------------------------------------------------------------------------- 1 | >sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3R OS=Frog virus 3 (isolate Goorha) OX=654924 
GN=FV3-003R PE=3 SV=1 2 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD 3 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI 4 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ 5 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD 6 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR 7 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA 8 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR 9 | KAKIQEMFDNMVSRMVTS -------------------------------------------------------------------------------- /tests/data/fasta/sample7.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/fasta/sample7.fasta.gz -------------------------------------------------------------------------------- /tests/data/fasta_files.list: -------------------------------------------------------------------------------- 1 | fasta/sample1.fasta 2 | fasta/sample2.fasta 3 | fasta/sample3.fasta.gz 4 | fasta/sample6.fasta -------------------------------------------------------------------------------- /tests/data/fasta_multi/sample1.mfasta: -------------------------------------------------------------------------------- 1 | >chr5 2 | AGAGCTCTCCTCACAGTTTCTCACGGCATCTGTTTCCAAACAAAGGAAAG 3 | GGTTTGGACTAGCCTGTCAATAACTAACGGTATCATTCAGAGAGAAATGT 4 | CACCCGCCAGCCCTCACTCCTGCCTAACTAGAGTCCTGAGTGAGGATTCT 5 | AACACAACGGGGCTCCATCCCTCAGCAATGCCACTCAAATTAAAACACAT 6 | GATGTGCCAAAAGAAAAAAAGAGCAGGAAAAACGGGACACAGGGCGGTCA 7 | GACACTGGAAAGTGAATCCGTGGTAAGAACACACCGGTTCTAACACCTAG 8 | TGAGGCTTGGGCGTTGGAAAGTGAATCCGTGGTAAAAATGCAGGTCGGTC 9 | TAATACTTAGAACCTTAACCAATGAACAGGAAAGCAACTTTAAAAGCTAA 10 | TGGACGCCTTGACCCAGGAGGTAGAGATTGCAGTAAGCCAAGATTGTGCT 11 | ACCGCACTCCAGCCTGGGCAACAGAGTGAGACTCCGTCTCAAAAAAAATA 12 | ATGGACGCTCCCAAAATCCACCTGTTTAAACTCAATCATGTGCCATTTTT 13 | TAGGACAAATCCTGGATATCGACCAAACAAACTCACATTTGAAAGTCGCA 14 | 
CATAAATAACATACAATTAGGCTGTAGTCCAAACCCATAAAGACCAGGTC 15 | AGAAAAGTAAAGTGACCTTGGGCAAAACTATTTGGTAAGTATCATCCCAG 16 | GTTTCTGCAGCTCCAGAGAGTAAGAGACAGAGAGAGGAGAGGACTTAGAT 17 | GCACCAGGCAGGGCTCTCAATATTTTTACTACACTCTGTGGATGAAGATC 18 | CCAGGTATCTGTGTCACTTTGTAAAACCTACAGGTCCCACTATCCACCTA 19 | GGAAACGGAACTGTGCCAGTCCCCTGCAGGTGGTACCTGGAAGACAATGG 20 | CAGGGAGTGTGGTCTGGTAGTACCCGTCCTGGTCGGCTTCTGGCTCTGTC 21 | TCTTTGACCCAGTCTTTCTTGTCTGTCTCCAGCGCTTTCCGCAGCCAGGC 22 | GATGATGTTTGACTAAAGAAAGTTCAGCATCATGGAGAAGAGAAGAGAGA 23 | AGAGCAGGTGCCTGAGGGTCATGCCAATTACCTGGTCAGTTGCATGCACA 24 | AGCTCCACAAACCAGACACAGGGCACCAGCGTCCTCTTTCTCAAATGAGG 25 | ACTCCAGAGCAGGGACCCAGGAGCCACCCCCATGGGAGGCAGTGACCACA 26 | GCAGCCCGCAGGGGTTTAACTGGAACCCTTGGTACAGATAAAGAGGGAGA 27 | AGAAACTAGATGCCCTTTTGGCTGCACCTCATGCTGGAGGACAAGGCCGC 28 | TGAATCCCAGGTGCACAGAGAATGAGGCCGACCAGGTCCGGTCAGGCCTG 29 | TTCCCATGCCCAGACTCAGCAACCACAGCAGCTCACGTCCCAAGAGAAGA 30 | CAGCGCTCCTTCCCCAGCTCCTGAGGACACCTCGGCTTCTGGTCATAATG 31 | CTGCCTTGCTAGGTGGACGCACCCTTGTCTAAGAAGCAGACCCTGCTGAC 32 | CTCCTCTTCTCACTCTGAGACCACCTGGTGGTCACCAGGCTGCAAATGGA 33 | CTCGCCCTTTGGTCCTTGACGGACCCCCCCAACCATGGGACTCAGAGCCG 34 | AGAGCCTGGCAGGTGCTGCCACACACGGTTCAGACGTTCTGGGCCCTGGC 35 | CAGCCTGCCGAGTAGGGCACAGGCTCACGTCTCCAGCAGAGGACAAGCTT 36 | CAGATCCAGGCCCAGAGAAGGGGCCATTCTCCTCTTCTAGCAAACTGACC 37 | TGTCAAGGCCTGGAAGCAGAGCCCGGGCAACTGAGCAATGCATCCTAGGA 38 | AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT 39 | AGTTACTCATATAACCAGCACGGTGGTCTGTGGCACGCAGGCCAGCCCTA 40 | CTCACAGTGAGCGTGGACATGTACGTGTCAAGCAGCTCAGAGACCACGTG 41 | TGGAGAAAGCAATGGCTCCAGGGTGCCGACATCCACTTCCGGGGCCAGCT 42 | CCACGTTCCTCATCATCTCAGTACTATGGGGGAACCAGGAATGTGAGTGA 43 | CAAACCACGACAGACACAGAAGAGCATGCCCAAGTGTGCCGAGACTGTTT 44 | CACTTGACACATGAATGTCCACCACCCTGGAGACTTCCTGGTCACAGCCG 45 | GCCCACAGCTCAGGGGAAAGTGCAGGGCAGGGGCCGAGGGCTTTGGGGTC 46 | TCTGTGGCATCACCCCACTGGCTCCCAGACTGCGTGAGTCCCTCCAACCG 47 | GGGAAAGTGCAGGGCAGGGGCCGAGGGCTTTGGGGTCTCTGTGGCATCAC 48 | CCACTGGCTCCGAGACCACTTGAGTCCCTCCAACCCCAGCTTCACCCTGC 49 | 
TGCGCCAGCTTCATGAGGTGCTGGGGCTCAAATGTGGAAGAGCCTTTCCT 50 | GTGGGGAAGACGTGTGTTCTATTCACTGCACCTGCAGAGCCGAGGCAGGG 51 | CCTGGCAAACAGGAGGCGCTGAATAAACAAACGTACTCAGTCAAAACGAC 52 | AAGCACAGCCGCCCCCACAGCCACGGGCGCCACGTCCACTGACTCAACCA 53 | AACACAAACTGAAAGTATTTGGGAAAAACAACAATAAAAAATAATAAAAA 54 | TTTCAAAAAATATAGAACAACTATTTCCATAGCATTCACATTGTATTTAG 55 | TCTTATAAGTAATCTAGAAATGATTTAGAGCATGAAGGAGAATGTACGCA 56 | GGTTACGTGCAAACACTATACCCTTTTATAAAGGGGACTTGGGCTACGGA 57 | GGATTTTGATAGCCAAGGGGGGTCCTGGAATCCCTCCCCATTGAGTATGG 58 | AGGGGCAGCTGTGCTGTCCTTTGGAAACAAGCCACTGCAGTCCACATCAA 59 | TATGCTCTGCAGACGGTGATGACTGGAACGTACAATACAGGACATTACTG 60 | ATAAAAACGCATCCGCAAGAACAGGCAAAACTACGTGAGGTCTACTGTCC 61 | AGAAGACAGGAAACACATGCTACATTTCAGGAAAAGTTTTCATTAGAAAG 62 | TTATTATCAGCCGGGCATGGTGGCTCACGCCTGTAATCCCAGCACTTTGG 63 | GAAGCCGAGACGGGCGGATCATGAGGTCAGGAGATTGAGACCATCCTGGC 64 | TAACACAGTGAAACCCCATCTCTATTAAAAATACAAAAAAATTAGCGGGG 65 | -------------------------------------------------------------------------------- /tests/data/fasta_multi/sample2.mfasta: -------------------------------------------------------------------------------- 1 | >sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD 4 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL 5 | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD 6 | SFRKIYTDLGWKFTPL 7 | >sp|Q6GZX3|002L_FRG3G Uncharacterized protein 002L OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-002L PE=4 SV=1 8 | MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR 9 | IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL 10 | AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC 11 | KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML 12 | DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK 13 | VMFFVAGAVLVAILISTVRW 14 | 
>sp|Q197F8|002R_IIV3 Uncharacterized protein 002R OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-002R PE=4 SV=1 15 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL 16 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT 17 | FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD 18 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET 19 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY 20 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS 21 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI 22 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC 23 | >sp|Q197F7|003L_IIV3 Uncharacterized protein 003L OS=Invertebrate iridescent virus 3 OX=345201 GN=IIV3-003L PE=4 SV=1 24 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT 25 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS 26 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI 27 | >sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-003R PE=3 SV=1 28 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD 29 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI 30 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ 31 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD 32 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR 33 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA 34 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR 35 | KAKIQEMFDNMVSRMVTS -------------------------------------------------------------------------------- /tests/data/fastq/sample1.fastq: -------------------------------------------------------------------------------- 1 | @SRR364003.1 1/1 2 | TGGAGAGGAAGAAGGAAAGTACACTGGCCAATACTCTGAAAATGACTGACTTGGGTACATGTGGTCACAGACGGCTCCCGAAACGAAGGTAACATGCGC 3 | + 4 | 7=*4@@@A>5D5<-<>7@<@@##################################################################################### 9 | @SRR364003.3 3/1 10 | 
TTACTGCAGGCCACCTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAAGATCACACCTTAACGTTCCCACTAACCTTATCATCTCGCCACTCTGAC 11 | + 12 | )14//1(/9:-7;)22529AC>9A?9A<@###################################################################### 13 | @SRR364003.4 4/1 14 | TATTTTTGACCTAGTCAGTTTTTAAACAATGTGCATTTGCAGGAGGTAATGAAAAGAGACGCAATAACATGATTTTACTGATTCAGTATCGGAATGCTG 15 | + 16 | FFEFFF/FFF@DFF@E;@BEEE=EACD@B,@.>=??94BA87>*917823=A???############################################################ 21 | @SRR364003.6 6/1 22 | GAGGGTCTCTGGGTGCAAGGCCTGGATGACTTCCCTCTGGGTGGGGGGGCAGTCGCCTCTCAGGTTCCAGCTTCTGTGTACGAGGGAAAACAACAGTAT 23 | + 24 | ################################################################################################### 25 | @SRR364003.7 7/1 26 | TCATGTAGTGAGGGGGTAATCAGAGTGAAGGAGGGATTTCGAGCAGTCATAAGGGAAACGTCTGATAGAACAGGAAGATATTAAGCACTAGCGAGGCCA 27 | + 28 | 8D>D############################################################################################### 29 | @SRR364003.8 8/1 30 | GGAAACCTTAAATACTGTTCAGAAAGAATATATCTTCAATCAAGGCTCTTGTGCAGCGTACACCCAAAAATGAAGCTTTTTGGGTTAGGGGCAAGACGA 31 | + 32 | GGGGDEDGGGEGEGG=GBGGGD@GGGGEFGD=DD8EE@E=@D@D####################################################### 33 | @SRR364003.9 9/1 34 | CATTGCTTGGCCTGCGCACACCTGCCAACTCCACCATTGTAACGTCGGAATGGTACAAAATGTTTCTGTACAGAGACACGTTTCAGATACTTCAGTGCG 35 | + 36 | FFFBD7BEED8DBDFFFCFCC3@B,)>@:6+'80B@B@B############################################################ 37 | @SRR364003.10 10/1 38 | TTACTGCAGGCCACCTACTGATTGACATAATTGGAACGTCAACACTAACAATAGGGCACAATCGACTTCCACTCAACGCGATCAGTGTCACAACTATAC 39 | + 40 | 4576/1)/,A<7=A;@7A################################################################################# 41 | @SRR364003.11 11/1 42 | CCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCATTGACGCCTGCCTGATCCTCCACATCCCACACGTACTATTACTGGCCACGCCATAATCA 43 | + 44 | ;7109/-/152<9:<2<;?######################################################################### 45 | @SRR364003.12 12/1 46 | 
GAAGATCTGGCCCTGATGGAGGATCAGAGGGGGCAGGTTGGGGGAAAGCCCGGTTTTCAGCCTCTTGTTATAGGGACCACGTTGAGAGTGAGGGTGGAG 47 | + 48 | EEEE=DD=DDDD>@>:=@@################################################################################ 49 | @SRR364003.13 13/1 50 | CACCCCAAATTCTGTAGCATCTTCAAAGTCATCTCCATCGCTATCACTAACCATATGCACATCGTGGATTTGTTCTCCCGACTGTTTCCGGGGCTGTGC 51 | + 52 | :6;E7@47><:>>::::9CC@CA>887*8A=D?############################################################# 57 | @SRR364003.15 15/1 58 | GACAATGTAAATAAACATCTCCCAAAGTAGAGAAAAAAACAAAATTTTAAGCCAAGAAGAAACAAAAATACTCAAGTTACAACCACACAACACCTTTTG 59 | + 60 | CEDHF@CE?EE@C=FG=GFECEEDDD@BE>;BBA@;<:@EEEE?####################################################### 61 | @SRR364003.16 16/1 62 | CCCGGGCCATGTCTCTTTGGTATAAGATTCTGAGCAGGGGAGGGAGAGGGACGGGGAGGCAGAGAGGGAAGACAAAGGACCTGAACACACCGCGTCGGC 63 | + 64 | ################################################################################################### 65 | -------------------------------------------------------------------------------- /tests/data/fastq/sample2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bingmann/cobs/2fbb044bd643e8254a20ccda8187fcefdd0e167a/tests/data/fastq/sample2.fastq.gz -------------------------------------------------------------------------------- /tests/data/fastq/sample3.fastq: -------------------------------------------------------------------------------- 1 | @SRR1024122.1 DBRHHJN1:224:C06H5ACXX:2:1101:1094:1025/1 2 | AGGNGATGCAGGCAATCAGGGAGGACAGCAGGCTGGGGGCGGCTGCTGTTGAGTCTGTTTGTTACGCTCGCCGTA 3 | + 4 | BBC#4ABDHHHHHIIHHGHHJFGIEGGHHIIJGGIJJJJFFDBDD@CDDCC>CAAADCDDCBD>:?;?8@;BBDB 5 | @SRR1024122.2 DBRHHJN1:224:C06H5ACXX:2:1101:1136:1046/1 6 | GAGNGTGGGCAGTCGGTGATGGGAGAAATAATAAGCTGAAGAAAAGCAACAATGTAGGGCCAAATCACTTTTAAA 7 | + 8 | ?:?#4ADDHHFHHJIJGIEHHIJ?FCGHHIIGGHIHHIGGHFHJJJHJJJJJFHHGHJJHIEFHEHFFEFFFECE 9 | @SRR1024122.3 DBRHHJN1:224:C06H5ACXX:2:1101:1133:1244/1 10 | 
AAGNGGTTACAGTGAGCCCCTTTAGCCTTGCAGACGACTTAAGTGTTAACCAGCTAAGGTTACGCTCGCCGTAGA 11 | + 12 | CCC#4ADDHHDHHJGIIGHJCFHCHBCHII?EDBFGGHFGDHGIIGIDEDFEHIHGGJIJJGHID?-=?=>@>B? 13 | @SRR1024122.4 DBRHHJN1:224:C06H5ACXX:2:1101:1363:1008/1 14 | NCAAGGTTACGCTCGCCGTAGAGGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCTCGTATGCCGTCT 15 | + 16 | #1BDFFFFHHHHHJJJJJJJJIJJJJJJJJJJJJJJJJJJJJJJJJJJHHHHHFFFFFFEECEEDDDDCCDDDDD 17 | @SRR1024122.5 DBRHHJN1:224:C06H5ACXX:2:1101:1304:1098/1 18 | AAGGGAACCTCGGGCCGATCGCACGCCCCCCGTGGCGGCGACGACCCATTGTTACGCTCGCCGTAGAGGATCGGA 19 | + 20 | @@CFFDFFHHFHHJJJJJIIIJIHGIIJJJHFFFDBDDBB@DD@BDDBCCDEDCABDDDDBDDBDDBDDADDDDB 21 | @SRR1024122.6 DBRHHJN1:224:C06H5ACXX:2:1101:1343:1115/1 22 | AGGGGGTGGTAGTTAATGTTACGCTCGCCGTAGAGGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGTC 23 | + 24 | @CCFFFDDHFHHHHHIJJJJJHIJIJJJJJIIJJJJIIHHHFFFCECEECDDDDDDCDDDDDDDDDDDDDDDDDD 25 | @SRR1024122.7 DBRHHJN1:224:C06H5ACXX:2:1101:1438:1130/1 26 | GGAGGCTGAAGAATTATTTTCCAACTTTGACTTGGACTATAATTCTGGAAAGATGATCTATATACACACGTTACG 27 | + 28 | BCBFFFDFHHHGHJJIJJJJHGIIIIJJJHJJJJJEHHHIGHIJGIJIHHJJIIIHHIIHJIIIIIJJJJJIJGI 29 | @SRR1024122.8 DBRHHJN1:224:C06H5ACXX:2:1101:1496:1145/1 30 | TAAGGGGCAGTCACGTAGCAGAGCAGCTCCTCGCTGCGATCTATTGAAAGTGTTACGCTCGCCGTAGAGGATCGG 31 | + 32 | @@@DFFFFFHHHFIIJIJGIJIEFHG?EHHIHHIJGIIHHGHHHIHIHHIHCHIIEHEGBDBCDB=B?C@ACC?B 33 | @SRR1024122.9 DBRHHJN1:224:C06H5ACXX:2:1101:1410:1163/1 34 | GGGAGGCCAGGCAGGGAGGGGGCAAGATGGGGTCACAGAAAGAACATGAGTGTGTTACGCTCGCCGTAGAGGATC 35 | + 36 | CCCFFFFFHHHGGIJJFHJJJJJJJJEIJJJJFH6CHIGHHHHFFFFFDEEEEEDDDDDDDDDDDDDDDDDDDDC 37 | @SRR1024122.10 DBRHHJN1:224:C06H5ACXX:2:1101:1491:1166/1 38 | GGGAGAGAGCGAGACTCTGTCTCAAAAAAAAAGTATATATATATACACGTATATATGTGTGTGTGTGTTACGCTC 39 | + 40 | @@@BDAD=DHHHHC@FFHIFEGFGIIGEHIHII@@GGHIIEGIID@DEC=CHECBDFE>?AABEECCCCCC@B88 41 | @SRR1024122.11 DBRHHJN1:224:C06H5ACXX:2:1101:1417:1214/1 42 | AGGAGCAAAGCATCGCGAAGGCCCGCGGCGGGTGTTGATGCGATGTGATTTCTGCCCAGTGCTCTGAATGTGTTA 43 | + 44 | 
CCCFFFFFHHHGHIJJJJJJJJJIIIJJJHDD7@:BDCDDDDBBDDDCCDECDACDDDDDDCDDDDCDDDCDDDC 45 | @SRR1024122.12 DBRHHJN1:224:C06H5ACXX:2:1101:1434:1244/1 46 | GTAAGAAAGCCATGCATGCGCATTTGTATGTTACGCTCGCCGTAGAGGATCGGAAGAGCACACGTCTGAACTCCA 47 | + 48 | CCCFFFFFHHHHHJJJJJIJJJJJJJJJJJJIJIJJJJJJJJJJJJJJIJJJHHHFDFFEEEDDDDDDDDDDDDD 49 | @SRR1024122.13 DBRHHJN1:224:C06H5ACXX:2:1101:1541:1011/1 50 | NTAGGGTTACGCTCGCCGTAGAGGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCTCGTATGCCGTCT 51 | + 52 | #1:BDDDDHFFHFIIGEHIII=GHHGIIGHIIBGHGCCHIGGGI@GFHHAHF;CCEEEACCCCBBB@CC>CB8;@ 53 | @SRR1024122.14 DBRHHJN1:224:C06H5ACXX:2:1101:1690:1051/1 54 | GGTAGCGCCTCGCCCCATGCCGTGGACGGCAACACTGTGGAGCTGAAGCGGGCGGTGTCCCGGGAGGATTCGGCG 55 | + 56 | CCCFFFFFHHHHHJJJJJJJJJIIJIIIIIJJJJJJJJJJHHHHHFFFFDDDBDD9@;?B@BB@BBD@?CBBDDB 77 | -------------------------------------------------------------------------------- /tests/data/text/sample1.txt: -------------------------------------------------------------------------------- 1 | Hello, this is the first sample text. 2 | Hello, this is the first sample text. 3 | -------------------------------------------------------------------------------- /tests/data/text/sample2.txt: -------------------------------------------------------------------------------- 1 | Hello, this is the second sample text. 2 | Hello, this is the second sample text. 3 | -------------------------------------------------------------------------------- /tests/fasta_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/fasta_file.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
// NOTE(review): in this copy of the file the text between '<' and '>' appears
// to have been stripped (include targets, template arguments). The code below
// is kept token-for-token as found; restore the arguments from version control
// before compiling.

namespace fs = cobs::fs;

// input_dir holds the FASTA fixtures; the index and temporary files are
// written below base_dir, which the fixture deletes around every test.
static fs::path input_dir = "data/fasta/";
static fs::path base_dir = "data/fasta_index";
static fs::path index_path = base_dir / "index.cobs_classic";
static fs::path tmp_path = base_dir / "tmp";

/*!
 * Test fixture: removes base_dir recursively before and after each test.
 */
class fasta : public ::testing::Test
{
protected:
    void SetUp() final {
        // error_code overload: ec is never inspected, so a failed removal
        // is silently ignored.
        cobs::error_code ec;
        fs::remove_all(base_dir, ec);
    }
    void TearDown() final {
        cobs::error_code ec;
        fs::remove_all(base_dir, ec);
    }
};

/*!
 * Checks the reported sizes of a plain and a gzip-compressed FASTA file and
 * that process_terms() invokes its callback exactly num_terms(31) times.
 */
TEST_F(fasta, process_kmers) {
    cobs::FastaFile fasta1(input_dir / "sample1.fasta");
    die_unequal(fasta1.size(), 3219u);

    cobs::FastaFile fasta7(input_dir / "sample7.fasta.gz");
    die_unequal(fasta7.size(), 1659u);

    // NOTE(review): presumably 15 sequences of 76 characters, each yielding
    // 76 - 31 + 1 terms -- confirm against the fixture.
    size_t nterms = fasta7.num_terms(31);
    die_unequal(nterms, 15u * (76 - 31 + 1));

    // count the callbacks and compare with the announced number of terms
    size_t check = 0;
    fasta7.process_terms(
        31, [&](const tlx::string_view& s) {
            LOG0 << s.to_string();
            check++;
        });
    die_unequal(nterms, check);
}

/*!
 * Builds a classic index over all documents in input_dir and queries it with
 * every 31-character term of every document: each query must return one
 * result per document (7), and the document the term came from must score at
 * least 1.
 */
TEST_F(fasta, document_list) {
    // presumably read by the LOG / sLOG macros below to switch their output
    // on or off -- TODO confirm
    static constexpr bool debug = false;

    cobs::DocumentList doc_list(input_dir);

    die_unequal(doc_list.list().size(), 7u);

    // construct classic index
    cobs::ClassicIndexParameters index_params;
    index_params.num_hashes = 3;
    index_params.false_positive_rate = 0.1;
    index_params.canonicalize = 0;

    cobs::classic_construct(
        cobs::DocumentList(input_dir), index_path, tmp_path, index_params);
    cobs::ClassicSearch s_base(
        std::make_shared(index_path));

    // run queries for each kmer in the documents
    for (const cobs::DocumentEntry& de : doc_list.list()) {
        LOG << de.name_;
        de.process_terms(
            /* term_size */ 31,
            [&](const tlx::string_view& term) {
                std::string query = term.to_string();

                std::vector result;
                s_base.search(query, result);
                ASSERT_EQ(7u, result.size());

                for (size_t i = 0; i < result.size(); ++i) {
                    sLOG << result[i].score << result[i].doc_name;

                    // only the originating document is checked; scores of
                    // the other documents are not constrained
                    if (result[i].doc_name == de.name_) {
                        ASSERT_GE(result[i].score, 1u);
                    }
                }
            });
    }
}

/*!
 * Constructs a DocumentList from the list file data/fasta_files.list and
 * checks that it contains four entries.
 */
TEST_F(fasta, listfile) {
    cobs::DocumentList doc_list("data/fasta_files.list");
    die_unequal(doc_list.list().size(), 4u);
}
7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace fs = cobs::fs; 17 | 18 | static fs::path input_dir = "data/fasta_multi/"; 19 | static fs::path base_dir = "data/fasta_multi_index"; 20 | static fs::path index_path = base_dir / "index.cobs_classic"; 21 | static fs::path tmp_path = base_dir / "tmp"; 22 | 23 | class fasta_multi : public ::testing::Test 24 | { 25 | protected: 26 | void SetUp() final { 27 | cobs::error_code ec; 28 | fs::remove_all(base_dir, ec); 29 | } 30 | void TearDown() final { 31 | cobs::error_code ec; 32 | fs::remove_all(base_dir, ec); 33 | } 34 | }; 35 | 36 | TEST_F(fasta_multi, process_kmers1) { 37 | cobs::FastaMultifile fasta_multi(input_dir / "sample1.mfasta"); 38 | 39 | die_unequal(fasta_multi.num_documents(), 1u); 40 | } 41 | 42 | TEST_F(fasta_multi, process_kmers2) { 43 | cobs::FastaMultifile fasta_multi(input_dir / "sample2.mfasta"); 44 | 45 | die_unequal(fasta_multi.num_documents(), 5u); 46 | 47 | die_unequal(fasta_multi.size(0), 256u); 48 | die_unequal(fasta_multi.size(4), 438u); 49 | 50 | size_t count = 0; 51 | fasta_multi.process_terms(0, 31, [&](const tlx::string_view&) { ++count; }); 52 | die_unequal(fasta_multi.size(0) - 30, count); 53 | } 54 | 55 | TEST_F(fasta_multi, document_list) { 56 | static constexpr bool debug = false; 57 | 58 | cobs::DocumentList doc_list(input_dir); 59 | 60 | die_unequal(doc_list.list().size(), 6u); 61 | 62 | // construct classic index 63 | cobs::ClassicIndexParameters index_params; 64 | index_params.num_hashes = 3; 65 | index_params.false_positive_rate = 0.1; 66 | index_params.canonicalize = 0; 67 | 68 | cobs::classic_construct( 69 | cobs::DocumentList(input_dir), index_path, tmp_path, index_params); 70 | cobs::ClassicSearch s_base( 71 | std::make_shared(index_path)); 72 | 73 | // run queries for each kmer in the documents 74 | for (const cobs::DocumentEntry& de : 
doc_list.list()) { 75 | LOG << de.name_; 76 | de.process_terms( 77 | /* term_size */ 31, 78 | [&](const tlx::string_view& term) { 79 | std::string query = term.to_string(); 80 | 81 | std::vector result; 82 | s_base.search(query, result); 83 | ASSERT_EQ(6u, result.size()); 84 | 85 | for (size_t i = 0; i < result.size(); ++i) { 86 | sLOG << result[i].score << result[i].doc_name; 87 | 88 | if (result[i].doc_name == de.name_) { 89 | ASSERT_GE(result[i].score, 1u); 90 | } 91 | } 92 | }); 93 | } 94 | } 95 | 96 | /******************************************************************************/ 97 | -------------------------------------------------------------------------------- /tests/fastq_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/fastq_file.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace fs = cobs::fs; 17 | 18 | static fs::path input_dir = "data/fastq/"; 19 | static fs::path base_dir = "data/fastq_index"; 20 | static fs::path index_path = base_dir / "index.cobs_classic"; 21 | static fs::path tmp_path = base_dir / "tmp"; 22 | 23 | class fastq : public ::testing::Test 24 | { 25 | protected: 26 | void SetUp() final { 27 | cobs::error_code ec; 28 | fs::remove_all(base_dir, ec); 29 | } 30 | void TearDown() final { 31 | cobs::error_code ec; 32 | fs::remove_all(base_dir, ec); 33 | } 34 | }; 35 | 36 | TEST_F(fastq, process_kmers) { 37 | cobs::FastqFile fastq1(input_dir / "sample1.fastq"); 38 | die_unequal(fastq1.size(), 3518u); 39 | 40 | cobs::FastqFile fastq2(input_dir / "sample2.fastq.gz"); 41 | die_unequal(fastq2.size(), 3001u); 42 | 43 | size_t nterms = fastq2.num_terms(31); 44 | // die_unequal(nterms, 17u * (65 - 31 + 1)); 45 | 46 | size_t check = 0; 47 | fastq2.process_terms( 48 | 31, [&](const tlx::string_view& s) { 49 | LOG0 << s.to_string(); 50 | check++; 51 | }); 52 | die_unequal(nterms, check); 53 | } 54 | 55 | TEST_F(fastq, document_list) { 56 | static constexpr bool debug = false; 57 | 58 | cobs::DocumentList doc_list(input_dir); 59 | 60 | die_unequal(doc_list.list().size(), 3u); 61 | 62 | // construct classic index 63 | cobs::ClassicIndexParameters index_params; 64 | index_params.num_hashes = 3; 65 | index_params.false_positive_rate = 0.1; 66 | index_params.canonicalize = 0; 67 | 68 | cobs::classic_construct( 69 | cobs::DocumentList(input_dir), index_path, tmp_path, index_params); 70 | cobs::ClassicSearch s_base( 71 | std::make_shared(index_path)); 72 | 73 | // run queries for each kmer in the documents 74 | for (const cobs::DocumentEntry& de : doc_list.list()) { 75 | LOG << de.name_; 76 | de.process_terms( 77 | /* term_size */ 31, 78 | [&](const 
tlx::string_view& term) { 79 | std::string query = term.to_string(); 80 | 81 | std::vector result; 82 | s_base.search(query, result); 83 | ASSERT_EQ(3u, result.size()); 84 | 85 | for (size_t i = 0; i < result.size(); ++i) { 86 | sLOG << result[i].score << result[i].doc_name; 87 | 88 | if (result[i].doc_name == de.name_) { 89 | ASSERT_GE(result[i].score, 1u); 90 | } 91 | } 92 | }); 93 | } 94 | } 95 | 96 | /******************************************************************************/ 97 | -------------------------------------------------------------------------------- /tests/file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/file.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace fs = cobs::fs; 20 | 21 | TEST(file, base_name) { 22 | fs::path out_path("data/out/file/classic_index.cobs_doc"); 23 | ASSERT_EQ("classic_index", cobs::base_name(out_path)); 24 | } 25 | 26 | TEST(file, document) { 27 | std::stringstream buffer; 28 | 29 | cobs::KMerBuffer<31> doc_out; 30 | doc_out.serialize(buffer, "document"); 31 | 32 | cobs::KMerBufferHeader hdoc; 33 | cobs::KMerBuffer<31> doc_in; 34 | doc_in.deserialize(buffer, hdoc); 35 | } 36 | 37 | TEST(file, classic_index_header) { 38 | std::stringstream buffer; 39 | 40 | // write classic index header 41 | std::vector file_names = { "n1", "n2", "n3", "n4" }; 42 | cobs::ClassicIndexHeader h_out; 43 | h_out.term_size_ = 31; 44 | h_out.canonicalize_ = 1; 45 | h_out.signature_size_ = 321; 46 | h_out.num_hashes_ = 21; 47 | h_out.file_names_ = file_names; 48 | h_out.serialize(buffer); 49 | 50 | // read 
classic index header 51 | cobs::ClassicIndexHeader h_in; 52 | h_in.deserialize(buffer); 53 | 54 | // compare results 55 | ASSERT_EQ(h_out.term_size_, h_in.term_size_); 56 | ASSERT_EQ(h_out.canonicalize_, h_in.canonicalize_); 57 | ASSERT_EQ(h_out.signature_size_, h_in.signature_size_); 58 | ASSERT_EQ(h_out.row_size(), h_in.row_size()); 59 | ASSERT_EQ(h_out.num_hashes_, h_in.num_hashes_); 60 | ASSERT_EQ(file_names, h_in.file_names_); 61 | } 62 | 63 | TEST(file, classic_index) { 64 | std::stringstream buffer; 65 | 66 | // write classic index file 67 | std::vector file_names = { "n1", "n2", "n3", "n4" }; 68 | cobs::ClassicIndexHeader h_out; 69 | h_out.term_size_ = 31; 70 | h_out.canonicalize_ = 1; 71 | h_out.signature_size_ = 123; 72 | h_out.num_hashes_ = 12; 73 | h_out.file_names_ = file_names; 74 | std::vector v_out(h_out.row_size() * h_out.signature_size_, 7); 75 | h_out.write_file(buffer, v_out); 76 | 77 | // read classic index file 78 | cobs::ClassicIndexHeader h_in; 79 | std::vector v_in; 80 | h_in.read_file(buffer, v_in); 81 | 82 | // compare results 83 | ASSERT_EQ(h_out.term_size_, h_in.term_size_); 84 | ASSERT_EQ(h_out.canonicalize_, h_in.canonicalize_); 85 | ASSERT_EQ(h_out.signature_size_, h_in.signature_size_); 86 | ASSERT_EQ(h_out.row_size(), h_in.row_size()); 87 | ASSERT_EQ(h_out.num_hashes_, h_in.num_hashes_); 88 | ASSERT_EQ(v_out, v_in); 89 | ASSERT_EQ(file_names, h_in.file_names_); 90 | } 91 | 92 | TEST(file, compact_index_header_values) { 93 | std::stringstream buffer; 94 | 95 | // write compact file header 96 | std::vector parameters = { 97 | { 100, 1 }, 98 | { 200, 1 }, 99 | { 3000, 1 }, 100 | }; 101 | std::vector file_names = { "file_1", "file_2", "file_3" }; 102 | cobs::CompactIndexHeader h_out; 103 | h_out.term_size_ = 31; 104 | h_out.canonicalize_ = 1; 105 | h_out.parameters_ = parameters; 106 | h_out.file_names_ = file_names; 107 | h_out.page_size_ = 4096; 108 | h_out.serialize(buffer); 109 | 110 | // read compact file header 111 | 
cobs::CompactIndexHeader h_in; 112 | h_in.deserialize(buffer); 113 | 114 | // compare results 115 | for (size_t i = 0; i < parameters.size(); i++) { 116 | ASSERT_EQ(parameters[i].num_hashes, h_in.parameters_[i].num_hashes); 117 | ASSERT_EQ(parameters[i].signature_size, h_in.parameters_[i].signature_size); 118 | } 119 | ASSERT_EQ(file_names, h_in.file_names_); 120 | } 121 | 122 | TEST(file, compact_index_header_padding) { 123 | std::stringstream buffer; 124 | 125 | // write compact file header 126 | std::vector parameters = { }; 127 | std::vector file_names = { }; 128 | uint64_t page_size = 4096; 129 | cobs::CompactIndexHeader h_out; 130 | h_out.term_size_ = 31; 131 | h_out.canonicalize_ = 1; 132 | h_out.parameters_ = parameters; 133 | h_out.file_names_ = file_names; 134 | h_out.page_size_ = page_size; 135 | h_out.serialize(buffer); 136 | 137 | // read compact file header 138 | cobs::CompactIndexHeader h_in; 139 | h_in.deserialize(buffer); 140 | cobs::StreamPos sp = cobs::get_stream_pos(buffer); 141 | ASSERT_EQ(sp.curr_pos % page_size, 0U); 142 | } 143 | 144 | /******************************************************************************/ 145 | -------------------------------------------------------------------------------- /tests/parameters.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/parameters.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 
7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | static std::string query = cobs::random_sequence(10000, 1); 20 | 21 | static std::unordered_map basepairs = { 22 | { 'A', 'T' }, { 'C', 'G' }, { 'G', 'C' }, { 'T', 'A' } 23 | }; 24 | 25 | size_t get_num_positives(uint64_t num_elements, uint64_t num_hashes, double false_positive_rate, size_t num_tests) { 26 | uint64_t signature_size = cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate); 27 | 28 | std::vector signature(signature_size); 29 | std::srand(1); 30 | for (size_t i = 0; i < num_hashes * num_elements; i++) { 31 | signature[std::rand() % signature.size()] = true; 32 | } 33 | 34 | size_t num_positives = 0; 35 | for (size_t i = 0; i < num_tests; i++) { 36 | size_t num_hits = 0; 37 | for (size_t j = 0; j < num_hashes; j++) { 38 | num_hits += signature[std::rand() % signature.size()] ? 1 : 0; 39 | } 40 | num_positives += num_hits == num_hashes ? 1 : 0; 41 | } 42 | return num_positives; 43 | } 44 | 45 | size_t get_num_positives_hash(uint64_t num_hashes, double false_positive_rate, size_t num_tests) { 46 | std::string query = cobs::random_sequence(10000, 1); 47 | uint64_t num_elements = query.size() - 30; 48 | uint64_t signature_size = cobs::calc_signature_size(num_elements, num_hashes, false_positive_rate); 49 | 50 | std::vector signature(signature_size); 51 | cobs::KMer<31> k; 52 | for (size_t i = 0; i < num_elements; i++) { 53 | cobs::process_hashes( 54 | query.data() + i, 31, signature_size, num_hashes, 55 | [&](size_t index) { 56 | signature[index] = true; 57 | }); 58 | } 59 | 60 | std::srand(1); 61 | size_t num_positives = 0; 62 | for (size_t i = 0; i < num_tests; i++) { 63 | size_t num_hits = 0; 64 | for (size_t j = 0; j < num_hashes; j++) { 65 | num_hits += signature[std::rand() % signature.size()] ? 
1 : 0; 66 | } 67 | num_positives += num_hits == num_hashes ? 1 : 0; 68 | } 69 | return num_positives; 70 | } 71 | 72 | void assert_between(size_t num, size_t min, size_t max) { 73 | ASSERT_GE(num, min); 74 | ASSERT_LE(num, max); 75 | } 76 | 77 | TEST(parameters, false_positive) { 78 | size_t num_positives = get_num_positives(100000, 1, 0.3, 100000); 79 | assert_between(num_positives, 29000, 31000); 80 | num_positives = get_num_positives(100000, 2, 0.3, 100000); 81 | assert_between(num_positives, 29000, 31000); 82 | num_positives = get_num_positives(100000, 3, 0.3, 100000); 83 | assert_between(num_positives, 29000, 31000); 84 | num_positives = get_num_positives(100000, 1, 0.1, 100000); 85 | assert_between(num_positives, 9800, 10200); 86 | num_positives = get_num_positives(100000, 2, 0.1, 100000); 87 | assert_between(num_positives, 9800, 10200); 88 | num_positives = get_num_positives(100000, 3, 0.1, 100000); 89 | assert_between(num_positives, 9800, 10200); 90 | } 91 | 92 | TEST(parameters, false_positive_hash) { 93 | size_t num_positives = get_num_positives_hash(1, 0.3, 100000); 94 | assert_between(num_positives, 29000, 31000); 95 | num_positives = get_num_positives_hash(2, 0.3, 100000); 96 | assert_between(num_positives, 29000, 31000); 97 | num_positives = get_num_positives_hash(3, 0.3, 100000); 98 | assert_between(num_positives, 29000, 31000); 99 | num_positives = get_num_positives_hash(1, 0.1, 100000); 100 | assert_between(num_positives, 9800, 10200); 101 | num_positives = get_num_positives_hash(2, 0.1, 100000); 102 | assert_between(num_positives, 9800, 10200); 103 | num_positives = get_num_positives_hash(3, 0.1, 100000); 104 | assert_between(num_positives, 9800, 10200); 105 | } 106 | 107 | TEST(parameters, canonical) { 108 | char kmer_buffer[31]; 109 | for (size_t i = 0; i < query.size() - 31; i++) { 110 | char* kmer_8 = query.data() + i; 111 | bool good = cobs::canonicalize_kmer(kmer_8, kmer_buffer, 31); 112 | die_unless(good); 113 | 114 | std::string 
kmer_result(kmer_buffer, 31); 115 | std::string kmer_original(kmer_8, 31); 116 | std::string kmer_complement(31, 'X'); 117 | for (size_t j = 0; j < 31; j++) { 118 | kmer_complement[j] = basepairs[kmer_original[30 - j]]; 119 | } 120 | ASSERT_EQ(kmer_result, std::min(kmer_original, kmer_complement)); 121 | } 122 | } 123 | 124 | /******************************************************************************/ 125 | -------------------------------------------------------------------------------- /tests/test_util.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/test_util.hpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #ifndef COBS_TESTS_TEST_UTIL_HEADER 10 | #define COBS_TESTS_TEST_UTIL_HEADER 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | static inline 26 | void assert_equals_files(const std::string& f1, const std::string& f2) { 27 | std::ifstream ifs1(f1, std::ios::in | std::ios::binary); 28 | std::ifstream ifs2(f2, std::ios::in | std::ios::binary); 29 | std::istream_iterator start1(ifs1); 30 | std::istream_iterator start2(ifs2); 31 | std::istream_iterator end; 32 | std::vector v1(start1, end); 33 | std::vector v2(start2, end); 34 | 35 | ASSERT_EQ(v1.size(), v2.size()); 36 | for (size_t i = 0; i < v1.size(); i++) { 37 | ASSERT_EQ(v1[i], v2[i]); 38 | } 39 | } 40 | 41 | //! 
Generate documents from a (random) query sequence 42 | static inline 43 | std::vector > 44 | generate_documents_all(const std::string& query, 45 | size_t num_documents = 33, size_t num_terms = 1000000) { 46 | std::vector > documents(num_documents); 47 | cobs::KMer<31> k; 48 | char kmer_buffer[32]; 49 | for (size_t i = 0; i < num_terms && i < query.size() - 31; i++) { 50 | bool good = cobs::canonicalize_kmer(query.data() + i, kmer_buffer, 31); 51 | die_unless(good); 52 | kmer_buffer[31] = 0; 53 | 54 | k.init(kmer_buffer); 55 | for (size_t j = 0; j < documents.size(); j++) { 56 | if (j % (i % (documents.size() - 1) + 1) == 0) { 57 | documents[j].data().push_back(k); 58 | } 59 | } 60 | } 61 | return documents; 62 | } 63 | 64 | //! Generate documents from a (random) query sequence with each query term 65 | //! contained in exactly one document. 66 | static inline 67 | std::vector > 68 | generate_documents_one(const std::string& query, size_t num_documents = 33) { 69 | std::vector > documents(num_documents); 70 | cobs::KMer<31> k; 71 | char kmer_buffer[32]; 72 | 73 | bool good = cobs::canonicalize_kmer(query.data(), kmer_buffer, 31); 74 | die_unless(good); 75 | kmer_buffer[31] = 0; 76 | 77 | k.init(kmer_buffer); 78 | for (size_t i = 0; i < documents.size(); i++) { 79 | for (size_t j = 0; j < i * 10 + 1; j++) { 80 | documents[i].data().push_back(k); 81 | } 82 | } 83 | return documents; 84 | } 85 | 86 | static inline 87 | void generate_test_case(std::vector > documents, 88 | std::string prefix, 89 | const std::string& out_dir) { 90 | for (size_t i = 0; i < documents.size(); i++) { 91 | std::string file_name = prefix + "document_" + cobs::pad_index(i); 92 | documents[i].serialize( 93 | out_dir + "/" + file_name + cobs::KMerBufferHeader::file_extension, 94 | file_name); 95 | } 96 | } 97 | 98 | static inline 99 | void generate_test_case(std::vector > documents, 100 | const std::string& out_dir) { 101 | return generate_test_case(documents, "", out_dir); 102 | } 103 | 104 | 
#endif // !COBS_TESTS_TEST_UTIL_HEADER 105 | 106 | /******************************************************************************/ 107 | -------------------------------------------------------------------------------- /tests/text_file.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/text_file.cpp 3 | * 4 | * Copyright (c) 2019 Timo Bingmann 5 | * 6 | * All rights reserved. Published under the MIT License in the LICENSE file. 7 | ******************************************************************************/ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace fs = cobs::fs; 14 | 15 | static fs::path in_dir = "data/text/"; 16 | 17 | TEST(text, process_txt1) { 18 | cobs::TextFile text(in_dir / "sample1.txt"); 19 | 20 | die_unequal(text.size(), 76u); 21 | 22 | size_t count = 0; 23 | text.process_terms(31, [&](const tlx::string_view&) { ++count; }); 24 | die_unequal(text.size() - 30, count); 25 | } 26 | 27 | TEST(text, document_list) { 28 | cobs::DocumentList doc_list(in_dir); 29 | 30 | die_unequal(doc_list.list().size(), 2u); 31 | } 32 | 33 | /******************************************************************************/ 34 | -------------------------------------------------------------------------------- /tests/util.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * tests/util.cpp 3 | * 4 | * Copyright (c) 2018 Florian Gauger 5 | * Copyright (c) 2018 Timo Bingmann 6 | * 7 | * All rights reserved. Published under the MIT License in the LICENSE file. 
8 | ******************************************************************************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | void is_aligned(void* ptr, size_t alignment) { 16 | ASSERT_EQ((uintptr_t)ptr % alignment, 0); 17 | } 18 | 19 | TEST(util, allocate_aligned) { 20 | auto ptr1 = cobs::allocate_aligned(10, cobs::get_page_size()); 21 | is_aligned(ptr1, cobs::get_page_size()); 22 | auto ptr2 = cobs::allocate_aligned(1337, 16); 23 | is_aligned(ptr2, 16); 24 | cobs::deallocate_aligned(ptr1); 25 | cobs::deallocate_aligned(ptr2); 26 | } 27 | 28 | void test_kmer(const char* kmer_data, 29 | const char* kmer_correct, bool is_good) { 30 | char kmer_buffer[31]; 31 | bool good = cobs::canonicalize_kmer(kmer_data, kmer_buffer, 31); 32 | 33 | die_unequal(std::string(kmer_buffer, 31), 34 | std::string(kmer_correct, 31)); 35 | die_unequal(good, is_good); 36 | } 37 | 38 | TEST(util, kmer_canonicalize) { 39 | // one already canonical one 40 | test_kmer("AGGAAAGTCTTTTACGCTGGGGTAAGAGTGA", 41 | "AGGAAAGTCTTTTACGCTGGGGTAAGAGTGA", true); 42 | // two k-mers which need to be flipped 43 | test_kmer("TGGAAAGTCTTTTACGCTGGGGTAAGAGTGA", 44 | "TCACTCTTACCCCAGCGTAAAAGACTTTCCA", true); 45 | test_kmer("TTTTTTGTCTTTTACGCTGGGGTTTAAAAAA", 46 | "TTTTTTAAACCCCAGCGTAAAAGACAAAAAA", true); 47 | // special case, lexicographically smaller until center 48 | test_kmer("AAAAAAAAAAAAAAAATTTTTTTTTTTTTTT", 49 | "AAAAAAAAAAAAAAAATTTTTTTTTTTTTTT", true); 50 | 51 | // one kmer already canonical but containing invalid letters 52 | test_kmer("AGGAAAGTCTTTTACGCTGGGXXXAGAGTGA", 53 | "AGGAAAGTCTTTTACGCTGGG\0\0\0AGAGTGA", false); 54 | // one k-mer needing flipping containing invalid letters 55 | test_kmer("TGGAAAGTCTTTTACGCTGGGXXXAGAGTGA", 56 | "TCACTCT\0\0\0CCCAGCGTAAAAGACTTTCCA", false); 57 | // one kmer containing the invalid letter at the center 58 | test_kmer("AAAAAAAAAAAAAAAXTTTTTTTTTTTTTTT", 59 | "AAAAAAAAAAAAAAA\0TTTTTTTTTTTTTTT", false); 60 | } 61 | 62 | 
/******************************************************************************/ 63 | --------------------------------------------------------------------------------