├── .clang-format ├── .github └── workflows │ └── cmake.yml ├── .gitignore ├── .gitmodules ├── .hgignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── experiments_flags ├── experiments_main.cpp ├── legacy ├── align_fasta.cpp ├── cross_comp.cpp ├── dists_pairwise.cpp ├── dtw.cpp ├── long_seqs.cpp ├── tensor.hpp ├── tensor_disc.hpp ├── tensor_slide.hpp ├── tensor_slide2.hpp ├── test_tensor_disc.cpp ├── test_typeinfo.cpp ├── vectool.cpp └── vectool.hpp ├── paper_gen.py ├── phylogeny ├── upgma.cpp └── upgma.hpp ├── pyproject.toml ├── python ├── init_numba_env.sh └── lib │ ├── __init__.py │ ├── base.py │ ├── cds.py │ ├── sequence.py │ ├── tensor_embedding.py │ ├── tensor_sketch.py │ ├── tensor_sketch_gpu.py │ └── util.py ├── sequence ├── alphabets.cpp ├── alphabets.hpp ├── fasta_io.hpp └── sequence_generator.hpp ├── sequence_generator_main.cpp ├── sketch ├── dim_reduce.hpp ├── edit_distance.hpp ├── hash_base.cpp ├── hash_base.hpp ├── hash_min.hpp ├── hash_ordered.hpp ├── hash_weighted.hpp ├── sketch_base.hpp ├── tensor.hpp ├── tensor_block.hpp ├── tensor_embedding.hpp ├── tensor_slide.hpp └── tensor_slide_flat.hpp ├── sketch_main.cpp ├── tests ├── phylogeny │ ├── data.txt │ └── test_upgma.cpp ├── sketch │ ├── test_hash_base.cpp │ ├── test_min_hash.cpp │ ├── test_ordered_min_hash.cpp │ ├── test_tensor.cpp │ ├── test_tensor_block.cpp │ ├── test_tensor_slide.cpp │ └── test_weighted_min_hash.cpp └── util │ ├── test_multivec.cpp │ └── test_spearman.cpp ├── third_party └── murmur_hash │ ├── CMakeLists.txt │ ├── murmur_hash3.cpp │ └── murmur_hash3.hpp └── util ├── multivec.hpp ├── progress.cpp ├── progress.hpp ├── spearman.hpp ├── timer.cpp ├── timer.hpp ├── transformer.hpp ├── utils.cpp └── utils.hpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: DontAlign 9 | AlignOperands: false 10 | AlignTrailingComments: false 11 | AllowAllParametersOfDeclarationOnNextLine: false 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Inline 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: true 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: false 22 | BraceWrapping: 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterStruct: false 27 | AfterUnion: false 28 | AfterExternBlock: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | SplitEmptyFunction: false 33 | BreakBeforeBinaryOperators: All 34 | BreakBeforeBraces: Attach 35 | BreakBeforeInheritanceComma: false 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | BreakConstructorInitializers: BeforeColon 39 | BreakStringLiterals: true 40 | ColumnLimit: 100 41 | CompactNamespaces: false 42 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 43 | ConstructorInitializerIndentWidth: 4 44 | ContinuationIndentWidth: 8 45 | Cpp11BracedListStyle: false 46 | DerivePointerAlignment: false 47 | DisableFormat: false 48 | ExperimentalAutoDetectBinPacking: false 49 | FixNamespaceComments: true 50 | ForEachMacros: 51 | - foreach 52 | - Q_FOREACH 
53 | - BOOST_FOREACH 54 | IndentCaseLabels: true 55 | IndentPPDirectives: None 56 | IndentWidth: 4 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: false 59 | MaxEmptyLinesToKeep: 2 60 | NamespaceIndentation: None 61 | PenaltyBreakAssignment: 2 62 | PenaltyBreakBeforeFirstCallParameter: 70 63 | PenaltyBreakComment: 300 64 | PenaltyBreakFirstLessLess: 120 65 | PenaltyBreakString: 1000 66 | PenaltyBreakTemplateDeclaration: 1 67 | PenaltyExcessCharacter: 5000 68 | PenaltyReturnTypeOnItsOwnLine: 60 69 | PointerAlignment: Right 70 | RawStringFormats: 71 | - Delimiters: 72 | - 'pb' 73 | Language: TextProto 74 | BasedOnStyle: LLVM 75 | ReflowComments: true 76 | SortIncludes: true 77 | SortUsingDeclarations: false 78 | SpaceAfterCStyleCast: false 79 | SpaceAfterTemplateKeyword: true 80 | SpaceBeforeAssignmentOperators: true 81 | SpaceBeforeCpp11BracedList: true 82 | SpaceBeforeCtorInitializerColon: true 83 | SpaceBeforeInheritanceColon: true 84 | SpaceBeforeParens: ControlStatements 85 | SpaceBeforeRangeBasedForLoopColon: true 86 | SpaceInEmptyParentheses: false 87 | SpacesBeforeTrailingComments: 1 88 | SpacesInAngles: false 89 | SpacesInCStyleCastParentheses: false 90 | SpacesInContainerLiterals: true 91 | SpacesInParentheses: false 92 | SpacesInSquareBrackets: false 93 | Standard: Cpp11 94 | TabWidth: 4 95 | UseTab: Never 96 | ... 97 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | # The CMake configure and build commands are platform agnostic and should work equally 8 | # well on Windows or Mac. You can convert this to a matrix build if you need 9 | # cross-platform coverage. 10 | # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | build_type: [Debug, Release] 16 | compiler: [g++-8] 17 | include: 18 | - compiler: g++-8 19 | cxx: g++-8 20 | cc: gcc-8 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - name: checkout submodules 26 | run: git submodule update --init --recursive 27 | 28 | - name: create build dir 29 | run: mkdir ${{runner.workspace}}/build 30 | 31 | - name: Configure CMake 32 | working-directory: ${{runner.workspace}}/build 33 | shell: bash 34 | run: | 35 | export CC=$(which ${{ matrix.cc }}) 36 | export CXX=$(which ${{ matrix.cxx }}) 37 | cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 38 | 39 | - name: Build 40 | working-directory: ${{runner.workspace}}/build 41 | shell: bash 42 | # Execute the build. You can specify a specific target with "--target " 43 | run: make -j 44 | 45 | - name: Test 46 | working-directory: ${{runner.workspace}}/build 47 | shell: bash 48 | # Execute tests defined by the CMake configuration. 
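      # (CMakeLists.txt also registers these tests with CTest via gtest_discover_tests, so running `ctest` from the build directory should work as well; this workflow simply invokes the gtest binary directly.)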
49 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 50 | run: ./tests 51 | 52 | build-macos: 53 | 54 | runs-on: macos-latest 55 | 56 | strategy: 57 | matrix: 58 | build_type: [Debug, Release] 59 | 60 | steps: 61 | - uses: actions/checkout@v2 62 | 63 | - name: checkout submodules 64 | run: git submodule update --init --recursive 65 | 66 | - name: install dependencies 67 | run: brew install libomp 68 | 69 | - name: create build dir 70 | run: mkdir ${{runner.workspace}}/build 71 | 72 | - name: Configure CMake 73 | working-directory: ${{runner.workspace}}/build 74 | shell: bash 75 | run: | 76 | cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 77 | 78 | - name: Build 79 | working-directory: ${{runner.workspace}}/build 80 | shell: bash 81 | # Execute the build. You can specify a specific target with "--target " 82 | run: make -j 83 | 84 | - name: Test 85 | working-directory: ${{runner.workspace}}/build 86 | shell: bash 87 | # Execute tests defined by the CMake configuration. 88 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 89 | run: ./tests 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cmake-build-debug 2 | cmake-build-release 3 | .idea 4 | build 5 | compile_commands.json 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "googletest"] 2 | path = googletest 3 | url = https://github.com/google/googletest 4 | [submodule "third_party/googletest"] 5 | path = third_party/googletest 6 | url = https://github.com/google/googletest 7 | [submodule "third_party/gflags"] 8 | path = third_party/gflags 9 | url = https://github.com/gflags/gflags 10 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | third_party 2 | legacy 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/psf/black 8 | rev: 20.8b1 9 | hooks: 10 | - id: black 11 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(sequence_sketching) 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1) 5 | find_package(OpenMP REQUIRED) 6 | 7 | include_directories(.) 
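# Headers are included relative to the repository root, e.g. #include "util/utils.hpp" or #include "sketch/tensor_slide.hpp".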
8 | 9 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 10 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 11 | 12 | # Google Flags Library 13 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags EXCLUDE_FROM_ALL) 14 | 15 | # Murmur 16 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/murmur_hash EXCLUDE_FROM_ALL) 17 | 18 | file(GLOB util_files "util/*.cpp") 19 | add_library(util ${util_files}) 20 | target_link_libraries(util gflags OpenMP::OpenMP_CXX) 21 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 22 | target_link_libraries(util stdc++fs) 23 | endif() 24 | 25 | file(GLOB sequence_files "sequence/*.cpp") 26 | add_library(sequence ${sequence_files}) 27 | target_link_libraries(sequence gflags OpenMP::OpenMP_CXX) 28 | 29 | file(GLOB sketch_files "sketch/*.cpp") 30 | add_library(sketch_lib ${sketch_files}) 31 | target_link_libraries(sketch_lib murmur_lib OpenMP::OpenMP_CXX) 32 | 33 | file(GLOB phylogeny_files "phylogeny/*.cpp") 34 | add_library(phylogeny_lib ${phylogeny_files}) 35 | 36 | add_executable(experiments experiments_main.cpp ) 37 | target_link_libraries(experiments sequence util sketch_lib) 38 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 39 | target_link_libraries(experiments stdc++fs) 40 | endif() 41 | 42 | add_executable(sketch sketch_main.cpp) 43 | target_link_libraries(sketch sequence util sketch_lib) 44 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 45 | target_link_libraries(sketch stdc++fs) 46 | endif() 47 | 48 | add_executable(seqgen sequence_generator_main.cpp) 49 | target_link_libraries(seqgen sequence util sketch_lib) 50 | 51 | # TESTS 52 | string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Werror -Wfatal-errors -msse4") 53 | 54 | 55 | enable_testing() 56 | include(GoogleTest) 57 | 58 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest EXCLUDE_FROM_ALL) 59 | target_compile_options(gtest_main PRIVATE -w) 60 | target_compile_options(gtest PRIVATE -w) 61 | 62 | file(GLOB test_files "tests/**/*.cpp") 63 | 64 | add_executable(tests ${test_files}) 65 | target_link_libraries(tests gtest_main gtest gmock util sketch_lib phylogeny_lib) 66 | target_include_directories(tests PRIVATE "include") 67 | 68 | gtest_discover_tests(tests) 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ratschlab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Similarity Estimation via Tensor Sketching 2 | This repository contains the reference implementation for the Tensor Sketching method, which can be used to estimate sequence similarity without needing to align the sequences. 3 | 4 | The method is described in the paper by Amir Joudaki et al. [`Fast Alignment-Free Similarity Estimation By Tensor 5 | Sketching`](https://www.biorxiv.org/content/10.1101/2020.11.13.381814v5.full). 6 | 7 | ## Download and build 8 | ``` 9 | git clone https://github.com/ratschlab/Project2020-seq-tensor-sketching 10 | cd Project2020-seq-tensor-sketching 11 | git submodule update --init -- 12 | mkdir build; cd build 13 | cmake .. 14 | make -j 15 | ``` 16 | 17 | ## Run 18 | The `sketch` binary expects as input a directory containing fasta files (with extension `.fa`, `.fasta` or `.fna`), 19 | each fasta file containing a single sequence: 20 | ```bash 21 | ./sketch -i /tmp/ -o /tmp/sketch_triangle 22 | ``` 23 | 24 | The output file will contain the number of sequences on the first line and the pairwise distances between each 25 | sequence on the following lines, e.g.: 26 | ``` 27 | 4 28 | test2.fa 29 | test3.fa 0.28125 30 | test4.fa 1.06314 0.915816 31 | test1.fa 0 0.28125 1.06314 32 | ``` 33 | For example, the distance between test1.fa and test2.fa is 0 (the lower the distance, the more similar the sequences). 34 | 35 | ### Flags 36 | To see all available flags, run: 37 | ``` 38 | ./sketch --help 39 | ``` 40 | Here are the most important flags: 41 | 42 | `-m`, `--sketch_method`: the sketching method to use; can be one of `MH, WMH, OMH, TS, TSB or TSS`, which correspond to 43 | min-hash, weighted-min-hash, ordered-min-hash, tensor-sketch, tensor-block and tensor-slide-sketch, respectively. 44 | 45 | `-k`, `--kmer_length`: the length of the k-mer used in the sketching method (default=3) 46 | 47 | `--embed_dim`: the dimension of the embedded space used in all sketching methods (default=4) 48 | 49 | `-t, --tuple-length`: the ordered tuple length, not used in Min-hash and Weighted-min-hash (default=3) 50 | 51 | `--block_size`: only consider tuples made out of block-size continuous characters for Tensor sketch (default=1) 52 | 53 | `-w, --window_size`: the size of the sliding window in Tensor Slide Sketch (default=32) 54 | 55 | `--max_len`: the maximum accepted sequence length for Ordered and Weighted min-hash (default=32) 56 | 57 | `-s, --stride`: the shift step for the sliding window (default=8) 58 | ## Contributing 59 | 60 | - The python code in the repository is formatted using [black](https://github.com/psf/black). 61 | To enable the pre-commit hook, install [pre-commit](https://pre-commit.com/) 62 | with `pip` or your package manager (Arch: `python-pre-commit`) and run 63 | `pre-commit install` from the repository root. All python code will now automatically be formatted 64 | on each commit.
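Combining the flags documented above, an end-to-end tensor-slide-sketch run could look like the example below. The input/output paths and parameter values are placeholders rather than recommended settings; only the flag names are taken from the list above.
```bash
# Sketch every fasta file in a directory with tensor-slide-sketch (TSS).
# Paths and numeric values are illustrative only.
./sketch -i /path/to/fasta_dir -o /tmp/sketch_triangle -m TSS -k 4 --embed_dim 16 -t 3 -w 100 -s 10
```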
65 | -------------------------------------------------------------------------------- /experiments_flags: -------------------------------------------------------------------------------- 1 | # sequence generation 2 | --alphabet_size=4 3 | --seq_len=10000 4 | --num_seqs=1000 5 | --group_size=2 6 | --phylogeny_shape=path 7 | --fix_len=true 8 | --max_mutation_rate=1.0 9 | --min_mutation_rate=0.0 10 | # sketching methods 11 | --embed_dim=16 12 | --kmer_size=4 13 | --tuple_length=3 14 | --block_size=2 15 | --window_size=2000 16 | --stride=1000 17 | --tss_dim=4 18 | --max_len=1000 19 | # change this to uniform for final experiments 20 | --hash_alg=murmur 21 | --transform=none 22 | --num_bins=256 23 | # execution & I/O 24 | --num_threads=0 25 | --o=/tmp/ts 26 | -------------------------------------------------------------------------------- /legacy/align_fasta.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | alphabet_size = 5; 17 | original_alphabet_size = alphabet_size; 18 | alphabet_size = int_pow(alphabet_size, kmer_size); 19 | } 20 | 21 | void override_post() override { 22 | // tensor_slide_params.alphabet_size = original_alphabet_size; 23 | // tensor_slide_params.tup_len = 2; 24 | tensor_slide_params.embed_dim = 50; 25 | tensor_slide_params.num_bins = 250; 26 | } 27 | }; 28 | 29 | struct HGModule { 30 | Vec3D dists; 31 | std::ifstream infile; 32 | 33 | std::map chr2int = { { 'a', 1 }, { 'c', 2 }, { 'g', 3 }, { 't', 4 }, { 'n', 0 }, 34 | { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; 35 | std::map chr2int_mask 36 | = { { 'a', -1 }, { 'c', -2 }, { 'g', -3 }, { 't', -4 }, { 'n', 0 }, 37 | { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; 38 | 39 | BasicModules basicModules; 40 | KmerModule kmerModules; 41 | 42 | void parse(int argc, char **argv) { 43 | basicModules.parse(argc, argv); 44 | basicModules.alphabet_size = 5; 45 | basicModules.models_init(); 46 | kmerModules.parse(argc, argv); 47 | kmerModules.models_init(); 48 | } 49 | 50 | 51 | string read_first() { 52 | string hg_file = "data/sub2.fa"; 53 | infile = std::ifstream(hg_file); 54 | string line; 55 | std::getline(infile, line); 56 | return line; 57 | } 58 | 59 | template 60 | string read_next_seq(std::vector &seq, std::vector mask) { 61 | seq.clear(); 62 | string line; 63 | while (std::getline(infile, line)) { 64 | if (line[0] == '>') { 65 | return line; 66 | } else { 67 | for (char c : line) { 68 | seq.push_back(chr2int[c]); 69 | mask.push_back((chr2int[c] > 0)); 70 | } 71 | } 72 | } 73 | return ""; 74 | } 75 | 76 | void compute_sketches() { 77 | Vec2D slide_sketch; 78 | std::vector seq, kmer_seq; 79 | std::vector mask; 80 | string name = read_first(), next_name; 81 | while (not name.empty()) { 82 | next_name = read_next_seq(seq, mask); 83 | seq2kmer(seq, kmer_seq, basicModules.kmer_size, basicModules.alphabet_size); 84 | tensor_slide_sketch(kmer_seq, slide_sketch, kmerModules.tensor_slide_params); 85 | save_output(name, slide_sketch); 86 | name = next_name; 87 | } 88 | } 89 | 90 | 91 | void save_output(string seq_name, const Vec2D &sketch) { 92 | std::ofstream fo; 93 | seq_name = string("data/sketch_") + seq_name.substr(1) + "_" + 
std::to_string(sketch.size()) 94 | + "_" + std::to_string(sketch[0].size()) + ".txt"; 95 | fo.open(seq_name); 96 | // fo << sketch.size() << ", " << sketch[0].size() << "\n"; 97 | for (int m = 0; m < sketch.size(); m++) { 98 | for (int i = 0; i < sketch[m].size(); i++) { 99 | fo << sketch[m][i] << ","; 100 | } 101 | fo << "\n"; 102 | } 103 | fo.close(); 104 | } 105 | }; 106 | 107 | int main(int argc, char *argv[]) { 108 | HGModule experiment; 109 | experiment.parse(argc, argv); 110 | experiment.compute_sketches(); 111 | // experiment.save_output(); 112 | } 113 | -------------------------------------------------------------------------------- /legacy/cross_comp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModule { 13 | int original_alphabet_size {}; 14 | 15 | void override_module_params() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | }; 20 | 21 | template 22 | struct SeqGenModule { 23 | Vec2D seqs; 24 | std::vector seq_names; 25 | string test_id; 26 | Vec2D kmer_seqs; 27 | Vec2D mh_sketch; 28 | Vec2D wmh_sketch; 29 | Vec2D omh_sketch; 30 | Vec2D ten_sketch; 31 | Vec3D slide_sketch; 32 | Vec3D dists; 33 | 34 | BasicModule basicModules; 35 | KmerModule kmerModules; 36 | string output; 37 | 38 | void parse(int argc, char **argv) { 39 | basicModules.parse(argc, argv); 40 | basicModules.models_init(); 41 | kmerModules.parse(argc, argv); 42 | kmerModules.models_init(); 43 | output = basicModules.directory + basicModules.output; 44 | } 45 | 46 | void write_fasta(Vec2D &seq_vec, bool Abc = false) { 47 | std::ofstream fo; 48 | fo.open(output + "seqs.fa"); 49 | test_id = "#" + std::to_string(random()); 50 | fo << test_id << "\n"; 51 | for (int si = 0; si < seq_vec.size(); si++) { 52 | fo << "> " << si << "\n"; 53 | for (int i = 0; i < seq_vec[i].size(); i++) { 54 | if (Abc) { 55 | fo << (char)(seq_vec[si][i] + (int)'A'); 56 | } else { 57 | fo << seq_vec[si][i] << ","; 58 | } 59 | } 60 | fo << "\n\n"; 61 | } 62 | fo.close(); 63 | } 64 | 65 | void read_fasta(Vec2D &seq_vec) { 66 | seq_vec.clear(); 67 | string file = (output + "/seqs.fa"); 68 | std::ifstream infile = std::ifstream(file); 69 | string line; 70 | 71 | std::getline(infile, line); 72 | if (line[0] == '#') { 73 | test_id = line; 74 | std::getline(infile, line); 75 | } 76 | while (line[0] != '>') { 77 | std::cout << line << "\n"; 78 | std::getline(infile, line); 79 | } 80 | string name = line; 81 | std::vector seq; 82 | while (std::getline(infile, line)) { 83 | if (line[0] == '>') { 84 | seq_vec.push_back(seq); 85 | seq_names.push_back(name); 86 | seq.clear(); 87 | name = line; 88 | } else if (!line.empty()) { 89 | for (char c : line) { 90 | int ic = c - (int)'A'; 91 | seq.push_back(ic); 92 | } 93 | } 94 | } 95 | } 96 | 97 | 98 | void generate_sequences() { 99 | if (basicModules.mutation_pattern == "pairs") { 100 | basicModules.seq_gen.genseqs_pairs(seqs); 101 | } else if (basicModules.mutation_pattern == "linear") { 102 | basicModules.seq_gen.genseqs_linear(seqs); 103 | } else if (basicModules.mutation_pattern == "tree") { 104 | basicModules.seq_gen.genseqs_tree(seqs, basicModules.sequence_seeds); 105 | } else { 106 | std::cerr << " mutation pattern `" << basicModules.mutation_pattern 107 | << "` is 
not valid\n"; 108 | exit(1); 109 | } 110 | write_fasta(seqs); 111 | } 112 | 113 | void compute_sketches() { 114 | int num_seqs = seqs.size(); 115 | kmer_seqs.resize(num_seqs); 116 | wmh_sketch.resize(num_seqs); 117 | mh_sketch.resize(num_seqs); 118 | omh_sketch.resize(num_seqs); 119 | ten_sketch.resize(num_seqs); 120 | slide_sketch.resize(num_seqs); 121 | for (int si = 0; si < num_seqs; si++) { 122 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 123 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 124 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 125 | ordered_minhash_flat(seqs[si], omh_sketch[si], basicModules.omh_params); 126 | tensor_sketch(seqs[si], ten_sketch[si], basicModules.tensor_params); 127 | tensor_slide_sketch(seqs[si], slide_sketch[si], basicModules.tensor_slide_params); 128 | } 129 | } 130 | void compute_pairwise_dists() { 131 | int num_seqs = seqs.size(); 132 | if (basicModules.mutation_pattern == "pairs") { 133 | dists = new3D(8, num_seqs, 1, -1); 134 | for (int i = 0; i < seqs.size(); i += 2) { 135 | int j = i + 1; 136 | dists[0][i][0] = edit_distance(seqs[i], seqs[j]); 137 | dists[1][i][0] = hamming_dist(mh_sketch[i], mh_sketch[j]); 138 | dists[2][i][0] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 139 | dists[3][i][0] = hamming_dist(omh_sketch[i], omh_sketch[j]); 140 | dists[4][i][0] = l1_dist(ten_sketch[i], ten_sketch[j]); 141 | dists[5][i][0] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 142 | } 143 | } else { 144 | dists = new3D(8, num_seqs, num_seqs, 0); 145 | for (int i = 0; i < seqs.size(); i++) { 146 | for (int j = i + 1; j < seqs.size(); j++) { 147 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 148 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 149 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 150 | dists[3][i][j] = hamming_dist(omh_sketch[i], omh_sketch[j]); 151 | dists[4][i][j] = l1_dist(ten_sketch[i], ten_sketch[j]); 152 | dists[5][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 153 | } 154 | } 155 | } 156 | } 157 | 158 | void save_output() { 159 | std::vector method_names 160 | = { "ED", "MH", "WMH", "OMH", "TenSketch", "TenSlide", "Ten2", "Ten2Slide" }; 161 | std::ofstream fo; 162 | 163 | fo.open(output + "conf.csv"); 164 | fo << basicModules.config(); 165 | fo.close(); 166 | 167 | int num_seqs = seqs.size(); 168 | for (int m = 0; m < 6; m++) { 169 | fo.open(output + "dists/" + method_names[m] + ".txt"); 170 | assert(fo.is_open()); 171 | if (basicModules.mutation_pattern == "pairs") { 172 | for (int i = 0; i < num_seqs; i += 2) { 173 | int j = i + 1; 174 | fo << i << ", " << j << ", " << dists[m][i][0] << "\n"; 175 | } 176 | } else { 177 | for (int i = 0; i < num_seqs; i++) { 178 | for (int j = i + 1; j < seqs.size(); j++) { 179 | fo << i << ", " << j << ", " << dists[m][i][j] << "\n"; 180 | } 181 | } 182 | } 183 | fo.close(); 184 | } 185 | 186 | fo.open(output + "sketches/mh.txt"); 187 | assert(fo.is_open()); 188 | for (int si = 0; si < num_seqs; si++) { 189 | fo << ">> seq " << si << "\n"; 190 | for (const auto &e : mh_sketch[si]) { 191 | fo << e << ", "; 192 | } 193 | fo << "\n"; 194 | } 195 | fo.close(); 196 | 197 | fo.open(output + "sketches/wmh.txt"); 198 | assert(fo.is_open()); 199 | for (int si = 0; si < num_seqs; si++) { 200 | fo << ">> seq " << si << "\n"; 201 | for (const auto &e : wmh_sketch[si]) { 202 | fo << e << ", "; 203 | } 204 | fo << "\n"; 205 | } 206 | fo.close(); 207 | 208 | fo.open(output + 
"sketches/omh.txt"); 209 | assert(fo.is_open()); 210 | for (int si = 0; si < num_seqs; si++) { 211 | fo << ">> seq " << si << "\n"; 212 | for (const auto &e : omh_sketch[si]) { 213 | fo << e << ", "; 214 | } 215 | fo << "\n"; 216 | } 217 | fo.close(); 218 | 219 | fo.open(output + "sketches/ten.txt"); 220 | assert(fo.is_open()); 221 | for (int si = 0; si < seqs.size(); si++) { 222 | fo << ">> seq " << si << "\n"; 223 | for (const auto &e : ten_sketch[si]) { 224 | fo << e << ", "; 225 | } 226 | fo << "\n"; 227 | } 228 | fo.close(); 229 | 230 | fo.open(output + "sketches/ten_slide.txt"); 231 | for (int si = 0; si < seqs.size(); si++) { 232 | auto &sk = slide_sketch[si]; 233 | for (int dim = 0; dim < sk.size(); dim++) { 234 | fo << ">> seq: " << si << ", dim: " << dim << "\n"; 235 | for (auto &item : sk[dim]) 236 | fo << item << ", "; 237 | fo << "\n"; 238 | } 239 | fo << "\n"; 240 | } 241 | fo.close(); 242 | } 243 | }; 244 | 245 | int main(int argc, char *argv[]) { 246 | SeqGenModule experiment; 247 | experiment.parse(argc, argv); 248 | if (experiment.basicModules.show_help) { 249 | std::cout << experiment.basicModules.description(); 250 | } else { 251 | experiment.generate_sequences(); 252 | experiment.compute_sketches(); 253 | experiment.compute_pairwise_dists(); 254 | experiment.save_output(); 255 | } 256 | return 0; 257 | } 258 | -------------------------------------------------------------------------------- /legacy/dists_pairwise.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | tensor_slide_params.alphabet_size = original_alphabet_size; 22 | tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct TestModule1 { 27 | Vec2D seqs; 28 | std::vector seq_names; 29 | string test_id; 30 | Vec2D kmer_seqs; 31 | Vec2D wmh_sketch; 32 | Vec2D mh_sketch; 33 | Vec3D omh_sketch; 34 | Vec2D ten_sketch; 35 | Vec3D slide_sketch; 36 | Vec2D ten_new_sketch; 37 | Vec3D ten_new_slide_sketch; 38 | Vec3D dists; 39 | 40 | BasicModules basicModules; 41 | KmerModule kmerModules; 42 | NewModules newModules; 43 | 44 | void parse(int argc, char **argv) { 45 | basicModules.parse(argc, argv); 46 | basicModules.models_init(); 47 | kmerModules.parse(argc, argv); 48 | kmerModules.models_init(); 49 | newModules.parse(argc, argv); 50 | newModules.model_init(); 51 | } 52 | 53 | 54 | template 55 | void write_fasta(Vec2D &seq_vec) { 56 | std::ofstream fo; 57 | fo.open(out_path + "/seqs.fa"); 58 | test_id = "#" + std::to_string(random()); 59 | fo << test_id << "\n"; 60 | for (int si = 0; si < seq_vec.size(); si++) { 61 | fo << "> " << si << "\n"; 62 | for (int i = 0; i < seq_vec[i].size(); i++) { 63 | fo << (char)(seq_vec[si][i] + (int)'A'); 64 | } 65 | fo << "\n\n"; 66 | } 67 | fo.close(); 68 | } 69 | 70 | template 71 | void read_fasta(Vec2D &seq_vec) { 72 | seq_vec.clear(); 73 | string file = (out_path + "/seqs.fa"); 74 | std::ifstream infile = std::ifstream(file); 75 | string line; 76 | 77 | std::getline(infile, line); 78 | if (line[0] == '#') { 79 | test_id = line; 80 | std::getline(infile, line); 81 | } 82 | while 
(line[0] != '>') { 83 | std::cout << line << "\n"; 84 | std::getline(infile, line); 85 | } 86 | string name = line; 87 | std::vector seq; 88 | while (std::getline(infile, line)) { 89 | if (line[0] == '>') { 90 | seq_vec.push_back(seq); 91 | seq_names.push_back(name); 92 | seq.clear(); 93 | name = line; 94 | } else if (!line.empty()) { 95 | for (char c : line) { 96 | int ic = c - (int)'A'; 97 | seq.push_back(ic); 98 | } 99 | } 100 | } 101 | } 102 | 103 | void generate_sequences() { 104 | basicModules.seq_gen.gen_seqs(seqs); 105 | write_fasta(seqs); 106 | // read_fasta(seqs); 107 | } 108 | 109 | void compute_sketches() { 110 | int num_seqs = seqs.size(); 111 | kmer_seqs.resize(num_seqs); 112 | wmh_sketch.resize(num_seqs); 113 | mh_sketch.resize(num_seqs); 114 | omh_sketch.resize(num_seqs); 115 | ten_sketch.resize(num_seqs); 116 | slide_sketch.resize(num_seqs); 117 | ten_new_sketch.resize(num_seqs); 118 | ten_new_slide_sketch.resize(num_seqs); 119 | for (int si = 0; si < num_seqs; si++) { 120 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 121 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 122 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 123 | ordered_minhash(kmer_seqs[si], omh_sketch[si], kmerModules.omh_params); 124 | tensor_sketch(kmer_seqs[si], ten_sketch[si], kmerModules.tensor_params); 125 | tensor_slide_sketch(seqs[si], slide_sketch[si], kmerModules.tensor_slide_params); 126 | 127 | tensor2_sketch(kmer_seqs[si], ten_new_sketch[si], newModules.ten_2_params); 128 | tensor2_slide_sketch(kmer_seqs[si], ten_new_slide_sketch[si], 129 | newModules.ten_2_slide_params); 130 | } 131 | std::ofstream fo; 132 | fo.open(out_path + "/sketches_Ten2.txt"); 133 | fo << test_id << "\n"; 134 | for (int si = 0; si < num_seqs; si++) { 135 | for (int i = 0; i < ten_new_sketch[si].size(); i++) { 136 | fo << ten_new_sketch[si][i]; 137 | } 138 | fo << "\n"; 139 | } 140 | fo.close(); 141 | fo.open(out_path + "/sketches_Ten2_slide.txt"); 142 | fo << test_id << "\n"; 143 | for (int si = 0; si < num_seqs; si++) { 144 | fo << ">> " << si << "\n"; 145 | for (int i = 0; i < ten_new_slide_sketch[si].size(); i++) { 146 | for (int j = 0; j < ten_new_slide_sketch[si][i].size(); j++) 147 | fo << ten_new_slide_sketch[si][i][j] << ", "; 148 | fo << "\n"; 149 | } 150 | fo << "\n"; 151 | } 152 | fo.close(); 153 | } 154 | void compute_dists() { 155 | std::ofstream fo; 156 | int num_seqs = seqs.size(); 157 | dists = new3D(8, num_seqs, num_seqs, 0); 158 | for (int i = 0; i < seqs.size(); i++) { 159 | for (int j = i + 1; j < seqs.size(); j++) { 160 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 161 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 162 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 163 | dists[3][i][j] = hamming_dist2D(omh_sketch[i], omh_sketch[j]); 164 | dists[4][i][j] = l2_sq_dist(ten_sketch[i], ten_sketch[j]); 165 | dists[5][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 166 | dists[6][i][j] = l2_sq_dist(ten_new_sketch[i], ten_new_sketch[j]); 167 | dists[7][i][j] = l1_dist2D_minlen(ten_new_slide_sketch[i], ten_new_slide_sketch[j]); 168 | // dists[6][i][j] = cosine_sim(ten_new_sketch[i], ten_new_sketch[j]); 169 | // dists[6][i][j] = l1_dist(ten_new_sketch[i], ten_new_sketch[j]); 170 | } 171 | } 172 | std::vector method_names 173 | = { "ED", "MH", "WMH", "OMH", "TenSketch", "TenSlide", "Ten2", "Ten2Slide" }; 174 | for (int m = 0; m < 8; m++) { 175 | fo.open(out_path + "/dists_" + 
method_names[m] + ".txt"); 176 | fo << test_id << "\n"; 177 | for (int i = 0; i < num_seqs; i++) { 178 | for (int j = i + 1; j < num_seqs; j++) { 179 | fo << i << ", " << j << ", " << dists[m][i][j] << "\n"; 180 | } 181 | } 182 | fo.close(); 183 | } 184 | } 185 | 186 | void save_output() { 187 | std::ofstream fo; 188 | // fo.open("output.txt"); 189 | fo.open(out_path + "/matlab_output.txt"); 190 | for (int i = 0; i < seqs.size(); i++) { 191 | for (int j = i + 1; j < seqs.size(); j++) { 192 | fo << dists[0][i][j] << ", " << dists[1][i][j] << ", " << dists[2][i][j] << ", " 193 | << dists[3][i][j] << ", " << dists[4][i][j] << ", " << dists[5][i][j] << ", " 194 | << dists[6][i][j] << ", " << dists[7][i][j] << "\n"; 195 | } 196 | } 197 | fo.close(); 198 | } 199 | }; 200 | 201 | int main(int argc, char *argv[]) { 202 | TestModule1 experiment; 203 | experiment.parse(argc, argv); 204 | experiment.generate_sequences(); 205 | experiment.compute_sketches(); 206 | experiment.compute_dists(); 207 | experiment.save_output(); 208 | } 209 | -------------------------------------------------------------------------------- /legacy/dtw.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Amir Joudaki on 6/16/20. 3 | // 4 | 5 | #ifndef SEQUENCE_SKETCHING_DTW_H 6 | #define SEQUENCE_SKETCHING_DTW_H 7 | 8 | #include 9 | // TODO test skeching with l1 over vectors 10 | 11 | template 12 | void dtw(std::vector &a, const std::vector &b) { 13 | int n = a.size(); 14 | int m = b.size(); 15 | std::vector> DTW(n + 1, std::vector(m, 0)); 16 | 17 | for (int i = 1; i <= n; i++) { 18 | for (int j = 1; j <= m; j++) { 19 | T cost = std::abs(a[i] - b[j]); 20 | DTW[i][j] = cost + std::min({ DTW[i - 1][j], DTW[i][j - 1], DTW[i - 1][j - 1] }); 21 | } 22 | } 23 | return DTW[n][m]; 24 | } 25 | 26 | #endif // SEQUENCE_SKETCHING_DTW_H 27 | -------------------------------------------------------------------------------- /legacy/long_seqs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | // tensor_slide_params.alphabet_size = original_alphabet_size; 22 | // tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct TestModule1 { 27 | Vec2D seqs; 28 | Vec2D kmer_seqs; 29 | Vec2D wmh_sketch; 30 | Vec2D mh_sketch; 31 | Vec3D omh_sketch; 32 | Vec2D ten_sketch; 33 | Vec3D slide_sketch; 34 | Vec3D dists; 35 | 36 | BasicModules basicModules; 37 | KmerModule kmerModules; 38 | 39 | void parse(int argc, char **argv) { 40 | basicModules.parse(argc, argv); 41 | basicModules.models_init(); 42 | kmerModules.parse(argc, argv); 43 | kmerModules.models_init(); 44 | } 45 | 46 | void generate_sequences() { basicModules.seq_gen.gen_seqs(seqs); } 47 | 48 | void compute_sketches() { 49 | int num_seqs = seqs.size(); 50 | kmer_seqs.resize(num_seqs); 51 | slide_sketch.resize(num_seqs); 52 | for (int si = 0; si < num_seqs; si++) { 53 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 54 | tensor_slide_sketch(kmer_seqs[si], slide_sketch[si], 
kmerModules.tensor_slide_params); 55 | } 56 | } 57 | void compute_dists() { 58 | int num_seqs = seqs.size(); 59 | dists = new3D(2, num_seqs, num_seqs, 0); 60 | for (int i = 0; i < seqs.size(); i++) { 61 | for (int j = i + 1; j < seqs.size(); j++) { 62 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 63 | dists[1][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 64 | } 65 | } 66 | } 67 | 68 | void save_output() { 69 | std::ofstream fo; 70 | fo.open("long_seq_output.txt"); 71 | for (int i = 0; i < seqs.size(); i++) { 72 | for (int j = i + 1; j < seqs.size(); j++) { 73 | fo << dists[0][i][j] << ", " << dists[1][i][j] << "\n"; 74 | } 75 | } 76 | fo.close(); 77 | } 78 | }; 79 | 80 | int main(int argc, char *argv[]) { 81 | TestModule1 experiment; 82 | experiment.parse(argc, argv); 83 | experiment.generate_sequences(); 84 | experiment.compute_sketches(); 85 | experiment.compute_dists(); 86 | experiment.save_output(); 87 | } 88 | -------------------------------------------------------------------------------- /legacy/tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "util/multivec.hpp" 9 | 10 | namespace ts { // ts = Tensor Sketch 11 | 12 | 13 | /** 14 | * Computes tensor sketches for a given sequence as described in 15 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 16 | * @tparam set_type the type of the characters in sketched sequences 17 | * @tparam sketch_type the type of elements in the sketch 18 | */ 19 | template 20 | class Tensor { 21 | public: 22 | /** 23 | * @param alphabet_size the size of the alphabet over which the sequences to be sketched are 24 | * defined 25 | * @param sketch_count number of different sketches to compute 26 | * @param embedded_dim the dimension of the embedded (sketched) space, denoted by D in the paper 27 | * @param num_bins number of bins for discretization of the sketches. 
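 *        A value of 0 disables discretization and the raw (un-binned) value is stored directly.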
28 | * @param tup_len the length of the subsequences considered for sketching, denoted by t in the 29 | * paper 30 | */ 31 | Tensor(set_type alphabet_size, 32 | size_t sketch_count, 33 | size_t embedded_dim, 34 | size_t num_bins, 35 | size_t tup_len) 36 | : sketch_count(sketch_count), 37 | alphabet_size(alphabet_size), 38 | embedded_dim(embedded_dim), 39 | num_bins(num_bins), 40 | tup_len(tup_len) { 41 | rand_init(); 42 | } 43 | 44 | std::vector compute(const std::vector &sequence) { 45 | Timer::start("tensor_sketch"); 46 | 47 | std::vector sketch(sketch_count, 0); 48 | for (size_t m = 0; m < sketch_count; m++) { 49 | auto cnt = new2D(tup_len + 1, embedded_dim, sketch_type(0)); 50 | cnt[0][0] = 1; // base case 51 | for (size_t i = 0; i < sequence.size(); i++) { 52 | for (int32_t t = (int32_t)tup_len - 1; t >= 0; t--) { 53 | auto pi = hashes[m][t][sequence[i]]; 54 | for (size_t p = 0; p < embedded_dim; p++) { 55 | auto shift = (p + pi) % embedded_dim; 56 | cnt[t + 1][shift] += cnt[t][p]; 57 | } 58 | } 59 | } 60 | const auto &top_cnt = cnt[tup_len]; // this is T^p 61 | auto prod = std::inner_product(s[m].begin(), s[m].end(), top_cnt.begin(), 0.0); 62 | prod /= l1(top_cnt); // this is the total no of sequences 63 | if (num_bins == 0) { 64 | sketch[m] = prod; 65 | } else { 66 | sketch_type bin = std::upper_bound(bins.begin(), bins.begin() + num_bins, prod) 67 | - bins.begin(); 68 | sketch[m] = bin; 69 | } 70 | } 71 | Timer::stop(); 72 | 73 | return sketch; 74 | } 75 | 76 | protected: 77 | void rand_init() { 78 | std::random_device rd; 79 | std::mt19937 gen(rd()); 80 | std::uniform_int_distribution unif_hash(0, embedded_dim - 1); 81 | 82 | hashes = new3D(sketch_count, tup_len, alphabet_size); 83 | s = new2D(sketch_count, embedded_dim); 84 | for (size_t m = 0; m < sketch_count; m++) { 85 | for (size_t t = 0; t < tup_len; t++) { 86 | for (size_t c = 0; c < alphabet_size; c++) { 87 | hashes[m][t][c] = unif_hash(gen); 88 | } 89 | } 90 | for (size_t p = 0; p < embedded_dim; p++) { 91 | s[m][p] = (p % 2 == 0) ? 1 : -1; // use oddity of p to assign (-1) or (1) 92 | } 93 | } 94 | bins = std::vector(num_bins); 95 | for (size_t b = 0; b < num_bins; b++) { 96 | bins[b] = std::tan(M_PI * ((b + .5) / num_bins - .5)); 97 | } 98 | bins.push_back(std::numeric_limits::max()); 99 | bins.insert(bins.begin(), std::numeric_limits::lowest()); 100 | } 101 | 102 | protected: 103 | size_t sketch_count; 104 | set_type alphabet_size; 105 | size_t embedded_dim; 106 | size_t num_bins; 107 | size_t tup_len; 108 | 109 | /** 110 | * Denotes the hash functions h1,....hD:A->{1....D}. 111 | */ 112 | Vec3D hashes; 113 | /** 114 | * Sign function, corresponds to s1,s2,...st:A->{-1,1} in the paper. The first index denotes the 115 | * sketch count, the second index the embedded dimension 116 | */ 117 | //TODO: figure out why second index is not the tuple and why there is no 3rd index. 118 | Vec2D s; 119 | 120 | /** Bins the possible values of a sketch into #num_bins integer values */ 121 | std::vector bins; 122 | }; 123 | 124 | } // namespace ts 125 | -------------------------------------------------------------------------------- /legacy/tensor_slide.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tensor.hpp" 4 | 5 | namespace ts { // ts = Tensor Sketch 6 | 7 | /** 8 | * Computes tensor slide sketches for a given sequence. 9 | * @tparam sketch_type the type of elements in the sequences to be sketched. 
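 * A window of win_len characters slides over the input with the given stride; compute() appends
 * one value per window position to each of the sketch_count rows of the output.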
10 | */ 11 | template 12 | class TensorSlideOld : public Tensor { 13 | public: 14 | /** 15 | * @param set_size the number of elements in S, 16 | * @param sketch_dim the number of components (elements) in the sketch vector. 17 | */ 18 | TensorSlideOld(sketch_type set_size, 19 | size_t sketch_dim, 20 | size_t num_phases, 21 | size_t num_bins, 22 | size_t tup_len, 23 | size_t win_len, 24 | size_t stride, 25 | size_t offset) 26 | : Tensor(set_size, sketch_dim, num_phases, num_bins, tup_len), 27 | win_len(win_len), 28 | stride(stride), 29 | offset(offset) { 30 | this->rand_init(); 31 | } 32 | 33 | void compute(const Seq &seq, Vec2D &sketch) { 34 | Timer::start("tensor_slide_sketch"); 35 | sketch = Vec2D(this->sketch_count, std::vector()); 36 | for (size_t m = 0; m < this->sketch_count; m++) { 37 | auto cnt = new3D(this->tup_len, this->tup_len, this->embedded_dim, 0); 38 | for (size_t i = 0; i < seq.size(); i++) { 39 | if (i >= win_len) { 40 | size_t j = i - win_len; 41 | for (size_t t = 0; t < this->tup_len; t++) { 42 | auto pj = this->hashes[m][t][seq[j]]; 43 | cnt[t][t][pj]--; 44 | for (int t2 = t - 1; t2 >= 0; t2--) { 45 | auto pj = this->hashes[m][t2][seq[j]]; 46 | for (size_t p = 0; p < this->embedded_dim; p++) { 47 | auto shift = (p + pj) % this->embedded_dim; 48 | cnt[t2][t][shift] -= cnt[t2 + 1][t][p]; 49 | } 50 | } 51 | } 52 | } 53 | 54 | for (size_t t = 0; t < this->tup_len; t++) { 55 | for (size_t t2 = this->tup_len - 1; t2 > t; t2--) { 56 | auto pi = this->hashes[m][t2][seq[i]]; 57 | for (size_t p = 0; p < this->embedded_dim; p++) { 58 | auto shift = (p + pi) % this->embedded_dim; 59 | cnt[t][t2][shift] += cnt[t][t2 - 1][p]; 60 | } 61 | } 62 | auto pi = this->hashes[m][t][seq[i]]; 63 | cnt[t][t][pi]++; 64 | } 65 | if (sketch_now(i, seq.size(), stride, offset)) { 66 | const auto &top_cnt = cnt[0][this->tup_len - 1]; 67 | auto prod = std::inner_product(this->s[m].begin(), this->s[m].end(), 68 | top_cnt.begin(), (double)0); 69 | prod = prod / l1(top_cnt); 70 | // int exp; 71 | // frexp(prod, &exp); 72 | // embedding[m].push_back(exp * sgn(prod)); 73 | sketch_type bin = std::upper_bound(this->bins.begin(), 74 | this->bins.begin() + this->num_bins, prod) 75 | - this->bins.begin(); 76 | sketch[m].push_back(bin); 77 | } 78 | } 79 | } 80 | Timer::stop(); 81 | } 82 | 83 | private: 84 | size_t win_len; 85 | size_t stride; 86 | size_t offset; 87 | }; 88 | 89 | } // namespace ts 90 | -------------------------------------------------------------------------------- /legacy/tensor_slide2.hpp: -------------------------------------------------------------------------------- 1 | //// 2 | //// Created by Amir Joudaki on 6/19/20. 
3 | //// 4 | // 5 | //#ifndef SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 6 | //#define SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 7 | // 8 | //#include "sketch/tensor_slide.hpp" 9 | // 10 | // namespace ts { // ts = Tensor Sketch 11 | // 12 | // 13 | // template 14 | // void tensor_sketch_slide2(const std::vector> &seq2D, Vec2D &embedding, const 15 | // TensorSlideParams ¶ms) { 16 | // assert(seq2D.size() == params.embed_dim); 17 | // embedding = Vec2D(params.embed_dim, std::vector()); 18 | // for (int m = 0; m < params.embed_dim; m++) { 19 | // const auto &seq = seq2D[m]; 20 | // auto cnt = new3D(params.tup_len, params.tup_len, params.num_phases, 0); 21 | // for (int i = 0; i < seq.size(); i++) { 22 | // int j = i - params.win_len; 23 | // if (j >= 0) { 24 | // for (int t = 0; t < params.tup_len; t++) { 25 | // auto pj = params.iphases[m][t][seq[j]]; 26 | // cnt[t][t][pj]--; 27 | // for (int t2 = t - 1; t2 >= 0; t2--) { 28 | // auto pj = params.iphases[m][t2][seq[j]]; 29 | // for (int p = 0; p < params.num_phases; p++) { 30 | // auto shift = (p + pj) % params.num_phases; 31 | // cnt[t2][t][shift] -= cnt[t2 + 1][t][p]; 32 | // } 33 | // } 34 | // } 35 | // } 36 | // 37 | // for (int t = 0; t < params.tup_len; t++) { 38 | // for (int t2 = params.tup_len - 1; t2 > t; t2--) { 39 | // auto pi = params.iphases[m][t2][seq[i]]; 40 | // for (int p = 0; p < params.num_phases; p++) { 41 | // auto shift = (p + pi) % params.num_phases; 42 | // cnt[t][t2][shift] += cnt[t][t2 - 1][p]; 43 | // } 44 | // } 45 | // auto pi = params.iphases[m][t][seq[i]]; 46 | // cnt[t][t][pi]++; 47 | // } 48 | // const auto &top_cnt = cnt[0][params.tup_len - 1]; 49 | // auto prod = std::inner_product(params.icdf[m].begin(), params.icdf[m].end(), 50 | // top_cnt.begin(), (double) 0); auto norm = l1(top_cnt); prod = prod / norm; 51 | // embed_type bin = std::upper_bound(params.bins.begin(), params.bins.begin() + 52 | // params.num_bins, prod) - params.bins.begin(); if ((i + 1) % params.stride == 0 or 53 | // i == (seq.size() - 1)) { 54 | // if (norm != 0) 55 | // embedding[m].push_back(bin); 56 | // else 57 | // embedding[m].push_back(params.num_bins / 2); 58 | // } 59 | // } 60 | // } 61 | // } 62 | // 63 | //} 64 | // 65 | //#endif//SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 66 | -------------------------------------------------------------------------------- /legacy/test_tensor_disc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | tensor_slide_params.alphabet_size = original_alphabet_size; 22 | tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct DiscModules : public BasicModules { 27 | std::vector dims = { 16, 64, 256 }; 28 | std::vector win2stride = { 2, 2, 2 }; 29 | std::vector tup_lens = { 4, 4, 4 }; 30 | std::vector num_phases = { 5, 5, 5 }; 31 | std::vector strid2dim = { 2, 2, 2 }; 32 | 33 | void override_post() override {} 34 | 35 | TensorSlideParams layer0() { 36 | TensorSlideParams tensorParams; 37 | init_tensor_slide_params(tensorParams); 38 | tensorParams.tup_len = tup_lens[0]; 39 | tensorParams.win_len = dims[0]; 40 | 
tensorParams.stride = dims[0] / strid2dim[0]; 41 | tensorParams.embed_dim = dims[0]; 42 | tensorParams.num_phases = num_phases[0]; 43 | return tensorParams; 44 | } 45 | 46 | std::vector layers(int l) { 47 | assert(0 < l and l <= 2); 48 | std::vector param_vec; 49 | for (int i = 0; i < dims[l - 1]; i++) { 50 | auto params = layer0(); 51 | params.tup_len = tup_lens[l]; 52 | params.alphabet_size = num_phases[l]; 53 | params.num_phases = num_phases[l]; 54 | params.embed_dim = dims[l] / dims[l - 1]; 55 | params.stride = (dims[l] / dims[l - 1]); 56 | params.win_len = params.stride * win2stride[l]; 57 | param_vec.push_back(params); 58 | } 59 | return param_vec; 60 | } 61 | }; 62 | 63 | struct TestModule1 { 64 | Vec2D seqs; 65 | Vec2D kmer_seqs; 66 | Vec2D wmh_sketch; 67 | Vec2D mh_sketch; 68 | Vec3D omh_sketch; 69 | Vec2D ten_sketch; 70 | Vec3D ten_disc_sketch; 71 | Vec4D slide_disc_sketch1; 72 | Vec4D slide_disc_sketch2; 73 | Vec4D slide_disc_sketch3; 74 | Vec2D slide_disc_flat; 75 | Vec3D slide_sketch; 76 | Vec3D dists; 77 | 78 | BasicModules basicModules; 79 | KmerModule kmerModules; 80 | DiscModules discModules; 81 | 82 | void parse(int argc, char **argv) { 83 | basicModules.parse(argc, argv); 84 | basicModules.models_init(); 85 | kmerModules.parse(argc, argv); 86 | kmerModules.models_init(); 87 | discModules.parse(argc, argv); 88 | discModules.models_init(); 89 | } 90 | 91 | void generate_sequences() { basicModules.seq_gen.gen_seqs(seqs); } 92 | 93 | void compute_sketches() { 94 | int num_seqs = seqs.size(); 95 | kmer_seqs.resize(num_seqs); 96 | wmh_sketch.resize(num_seqs); 97 | mh_sketch.resize(num_seqs); 98 | omh_sketch.resize(num_seqs); 99 | ten_sketch.resize(num_seqs); 100 | ten_disc_sketch.resize(num_seqs); 101 | slide_disc_sketch1.resize(num_seqs); 102 | slide_disc_sketch2.resize(num_seqs); 103 | slide_disc_sketch3.resize(num_seqs); 104 | slide_disc_flat.resize(num_seqs); 105 | slide_sketch.resize(num_seqs); 106 | auto lay0 = discModules.layer0(); 107 | lay0.rand_init(); 108 | auto lay1 = discModules.layers(1); 109 | for (auto &l : lay1) 110 | l.rand_init(); 111 | auto lay2 = discModules.layers(2); 112 | for (auto &l : lay2) 113 | l.rand_init(); 114 | for (int si = 0; si < num_seqs; si++) { 115 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 116 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 117 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 118 | ordered_minhash(kmer_seqs[si], omh_sketch[si], kmerModules.omh_params); 119 | // tensor_sketch(seqs[si], ten_sketch[si], longseqModule.tensor_params); 120 | // tensor_disc_sketch(seqs[si], ten_disc_sketch[si], 121 | // discModules.tensor_params); Vec3D in, out; 122 | slide_disc_sketch1[si] = tensor_disc_slide(seqs[si], lay0); 123 | slide_disc_sketch2[si] 124 | = tensor_disc_slide3(slide_disc_sketch1[si], lay1); 125 | slide_disc_sketch3[si] 126 | = tensor_disc_slide3(slide_disc_sketch2[si], lay2); 127 | slide_disc_flat[si] = squeeze_tensor(slide_disc_sketch3[si]); 128 | tensor_slide_sketch(seqs[si], slide_sketch[si], kmerModules.tensor_slide_params); 129 | } 130 | } 131 | void compute_dists() { 132 | int num_seqs = seqs.size(); 133 | dists = new3D(7, num_seqs, num_seqs, 0); 134 | for (int i = 0; i < seqs.size(); i++) { 135 | for (int j = i + 1; j < seqs.size(); j++) { 136 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 137 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 138 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 139 | 
dists[3][i][j] = hamming_dist2D(omh_sketch[i], omh_sketch[j]); 140 | dists[4][i][j] = l1_dist(ten_sketch[i], ten_sketch[j]); 141 | dists[5][i][j] = l1_dist(slide_disc_flat[i], slide_disc_flat[j]); 142 | dists[6][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 143 | } 144 | } 145 | } 146 | 147 | void save_output() { 148 | std::ofstream fo; 149 | fo.open("output.txt"); 150 | for (int i = 0; i < seqs.size(); i++) { 151 | for (int j = i + 1; j < seqs.size(); j++) { 152 | fo << dists[0][i][j] << ", " << dists[1][i][j] << ", " << dists[2][i][j] << ", " 153 | << dists[3][i][j] << ", " << dists[4][i][j] << ", " << dists[5][i][j] << ", " 154 | << dists[6][i][j] << "\n"; 155 | } 156 | } 157 | fo.close(); 158 | } 159 | }; 160 | 161 | int main(int argc, char *argv[]) { 162 | TestModule1 experiment; 163 | experiment.parse(argc, argv); 164 | experiment.generate_sequences(); 165 | experiment.compute_sketches(); 166 | experiment.compute_dists(); 167 | experiment.save_output(); 168 | } 169 | -------------------------------------------------------------------------------- /legacy/test_typeinfo.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Amir Joudaki on 6/18/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | // using std::literals; 10 | 11 | 12 | struct Base {}; // non-polymorphic 13 | struct Derived : Base {}; 14 | 15 | struct Base2 { 16 | virtual void foo() {} 17 | }; // polymorphic 18 | struct Derived2 : Base2 {}; 19 | 20 | 21 | // requires std::is_integral::value 22 | template 23 | requires std::is_integral_v struct foo { 24 | T val; 25 | }; 26 | 27 | int main() { 28 | foo a = { 11.5 }; 29 | std::cout << (a.val) << "\n"; 30 | int myint = 50; 31 | std::string mystr = "string"; 32 | double *mydoubleptr = nullptr; 33 | 34 | std::cout << "myint has type: " << typeid(myint).name() << '\n' 35 | << "mystr has type: " << typeid(mystr).name() << '\n' 36 | << "mydoubleptr has type: " << typeid(mydoubleptr).name() << '\n'; 37 | 38 | // std::cout << myint is a glvalue expression of polymorphic type; it is evaluated 39 | const std::type_info &r1 = typeid(std::cout << myint); 40 | std::cout << '\n' << "std::cout< 6 | #include 7 | #include 8 | #include 9 | 10 | #include "vectool.hpp" 11 | 12 | using namespace ts; 13 | 14 | int main() { 15 | using std::cout; 16 | using std::vector; 17 | 18 | // we want dims[0]xdim[1] matrix interface 19 | vector dims = { 3, 2 }, data(6, 0); 20 | MultiView mat(dims, data); 21 | for (int i = 0; i < mat.size(1); i++) { 22 | for (int j = 0; j < mat.size(0); j++) { 23 | mat[i][j] = i * 10 + j; // elements can be modified with op[][] syntax 24 | } 25 | } 26 | cout << "mat = \n" << mat << "\n"; // ostream&<< is overloaded 27 | mat[0][1]++; // scalar elements can be modified/accessed 28 | mat = 2; // assign scalar init_tensor_slide_params all matrix 29 | mat += 5; // add 5 init_tensor_slide_params all elements (sam for -= and *=) 30 | mat[0] *= 2; // multiply the first row mat[0][:] by two 31 | 32 | // MultiVec has an internal storage 33 | MultiVec mv({ 3, 2 }, 10), mv2({ 3, 2 }, 1); 34 | mv[0] += mv2[1]; // partial assignment 35 | cout << "mv2 = \n" << mv2 << "\n"; 36 | 37 | std::default_random_engine eng; 38 | std::uniform_int_distribution unif(0, 10); 39 | Multistd::vector T({ 3, 2, 2 }, 0); // tensor with types for index and value 40 | for (auto it = T.begin(); it != T.end(); it++) { // an iterator over all elements 41 | *it = unif(eng); 42 | } 43 | cout << "Tensor = \n" << T << "\n"; 44 | /* 
45 | the output will be 46 | 47 | mat = 48 | (d1=0) 0 1 2 49 | (d1=1) 10 11 12 50 | 51 | mv2 = 52 | (d1=0) 1 1 1 53 | (d1=1) 1 1 1 54 | 55 | Tensor = 56 | (d2=0) 57 | (d1=0) 1 5 0 58 | (d1=1) 2 0 8 59 | 60 | (d2=1) 61 | (d1=0) 2 2 10 62 | (d1=1) 8 2 8 63 | 64 | */ 65 | } 66 | -------------------------------------------------------------------------------- /phylogeny/upgma.cpp: -------------------------------------------------------------------------------- 1 | #include "upgma.hpp" 2 | 3 | #include 4 | #include 5 | 6 | namespace ts { 7 | 8 | Tree upgma(const std::vector> &dist_mat) { 9 | if (dist_mat.empty()) { 10 | return {}; 11 | } 12 | 13 | Tree result(2 * dist_mat.size() - 1); 14 | // {nodeId, nodeCount} pairs of all the cluster roots 15 | std::unordered_map roots; 16 | std::unordered_map> D; 17 | for (uint32_t i = 0; i < dist_mat.size(); ++i) { 18 | roots.insert({ i, 1 }); 19 | result[i] = { 0, NO_CHILD, NO_CHILD }; 20 | for (uint32_t j = 0; j < dist_mat.size(); ++j) { 21 | D[i][j] = dist_mat[i][j]; 22 | } 23 | } 24 | for (uint32_t step = 0; step < dist_mat.size() - 1; ++step) { 25 | double minDist = std::numeric_limits::max(); 26 | uint32_t min_i, min_j; 27 | for (const auto &root1 : roots) { 28 | for (const auto &root2 : roots) { 29 | if (root1.first == root2.first) { 30 | continue; 31 | } 32 | double currentDist = D[root1.first][root2.first]; 33 | if (currentDist < minDist) { 34 | minDist = currentDist; 35 | min_i = root1.first; 36 | min_j = root2.first; 37 | } 38 | } 39 | } 40 | uint32_t new_node = dist_mat.size() + step; 41 | 42 | result[new_node] = { minDist / 2., min_i, min_j }; 43 | // update D 44 | for (const auto &root : roots) { 45 | D[new_node][root.first] 46 | = (D[min_i][root.first] * roots[min_i] + D[min_j][root.first] * roots[min_j]) 47 | / (roots[min_i] + roots[min_j]); 48 | D[root.first][new_node] = D[new_node][root.first]; 49 | } 50 | D.erase(min_i); 51 | D.erase(min_j); 52 | for (auto &row : D) { 53 | row.second.erase(min_i); 54 | row.second.erase(min_j); 55 | } 56 | roots[new_node] = roots[min_i] + roots[min_j]; 57 | roots.erase(min_i); 58 | roots.erase(min_j); 59 | } 60 | return result; 61 | } 62 | 63 | } // namespace ts 64 | -------------------------------------------------------------------------------- /phylogeny/upgma.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace ts { 9 | 10 | /** 11 | * Node in the phylogeny node. 12 | */ 13 | struct Node { 14 | double age; 15 | uint32_t left, right; 16 | }; 17 | 18 | constexpr uint32_t NO_CHILD = std::numeric_limits::max(); 19 | 20 | /** 21 | * To avoid dynamic memory allocation, the tree is represented as a map from node id to the actual 22 | * Node. The root is at index size()-1 23 | */ 24 | using Tree = std::vector; 25 | 26 | /** 27 | * Runs UPGMA (Unweighted Pair Group Method with Arithmetic Mean Algorithm) on the given distance 28 | * matrix and returns the reconstructed phylogeny graph as a (parent->children) map. 
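 * For a distance matrix over n sequences the returned Tree holds 2*n-1 nodes: indices 0..n-1
 * are the input leaves (age 0, no children), every merge step appends one internal node, and
 * the root ends up at the last index, result.size()-1.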
29 | */ 30 | Tree upgma(const std::vector> &dist_mat); 31 | 32 | } // namespace ts 33 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | # Line length 100 3 | line-length = 100 4 | # Keep ' instead of converting to " 5 | skip-string-normalization = true 6 | -------------------------------------------------------------------------------- /python/init_numba_env.sh: -------------------------------------------------------------------------------- 1 | # For colour output; also supports jupyter_nb. 2 | export NUMBA_COLOR_SCHEME=dark_bg 3 | 4 | # Level 2 is a bit faster for compilation itself. 5 | # Level 3 is faster??? for running CUDA kernels. 6 | export NUMBA_OPT=2 7 | 8 | # Disable jit entirely for debugging purposes. 9 | #export NUMBA_DISABLE_JIT=1 10 | 11 | # Options for caching 12 | #export NUMBA_DEBUG_CACHE=1 13 | -------------------------------------------------------------------------------- /python/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ratschlab/Project2020-seq-tensor-sketching/20b19ddd19751840d33af97abe314d29b34dc0d4/python/lib/__init__.py -------------------------------------------------------------------------------- /python/lib/base.py: -------------------------------------------------------------------------------- 1 | # Contains sketch base classes and helper methods 2 | 3 | import random 4 | 5 | import numpy as np 6 | import numba as nb 7 | from numba import njit 8 | from numba.experimental import jitclass 9 | from numba.typed import List 10 | 11 | from lib.sequence import * 12 | 13 | # A SketchedSequence contains a sequence and its sketch. 14 | # The sketch must be a 1D array of float32s. 15 | @jitclass([('seq', Sequence_type), ('sketch', nb.float32[::1])]) 16 | class SketchedSequence: 17 | def __init__(self, seq: Sequence, sketch): 18 | self.seq = seq 19 | self.sketch = sketch 20 | 21 | 22 | SketchedSequence_type = SketchedSequence.class_type.instance_type 23 | 24 | # Compute the Euclidean distance between two sketched sequences. 25 | @njit 26 | def dist(ss1: np.ndarray, ss2: np.ndarray) -> np.float32: 27 | return np.linalg.norm(ss1.sketch - ss2.sketch) 28 | 29 | 30 | # Return a sorted list of (dist, seq1, seq2). 31 | @njit 32 | def pairwise_dists( 33 | seqs: list[SketchedSequence], 34 | ) -> list[tuple[np.float32, SketchedSequence, SketchedSequence]]: 35 | d = [] 36 | for j in range(len(seqs)): 37 | for i in range(j): 38 | d.append((dist(seqs[i], seqs[j]), seqs[i], seqs[j])) 39 | d.sort(key=lambda tup: tup[0]) 40 | return d 41 | 42 | 43 | sketchparams_spec = [ 44 | ('A', nb.int32), 45 | ('t', nb.int32), 46 | ('D', nb.int32), 47 | ('normalize', nb.bool_), 48 | ('L', nb.int32), 49 | ('DL', nb.int32), 50 | ] 51 | 52 | 53 | @jitclass(sketchparams_spec) 54 | class SketchParams: 55 | def __init__(self, A, t, D, normalize=True, L=1): 56 | # Alphabet size 57 | self.A = A 58 | # Tensor Sketch tuple size 59 | self.t = t 60 | # Tensor Sketch embed dimension 61 | self.D = D 62 | # Return frequencies instead of counts 63 | self.normalize = normalize 64 | 65 | # GPU Sketch 66 | # Amount of work per thread, must divide D. 67 | # Spawn t*(D/L) instead of t*D threads when this is > 1. 
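        # Illustrative example (the values are assumptions, not project defaults):
        # with D=96 and L=4, DL=24, so the GPU kernel launches t*24 threads per
        # sequence and each thread updates 4 consecutive sketch indices.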
68 | self.L = L 69 | assert D % L == 0 70 | self.DL = D // L 71 | 72 | 73 | SketchParams_type = SketchParams.class_type.instance_type 74 | 75 | 76 | # NOTE: Sketchers are not always jitted, since e.g. CUDA invocations do not support this. 77 | class Sketcher: 78 | def __init__(self, params: SketchParams): 79 | self.A = params.A 80 | self.t = params.t 81 | self.D = params.D 82 | self.normalize = params.normalize 83 | self.L = params.L 84 | self.DL = params.DL 85 | 86 | # [Optional] sketch a single sequence for all t' <= t. 87 | def _full_sketch(self, seq: Sequence): 88 | pass 89 | 90 | # Sketch a single sequence. 91 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 92 | pass 93 | 94 | # Sketch a list of sequences. 95 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 96 | pass 97 | -------------------------------------------------------------------------------- /python/lib/cds.py: -------------------------------------------------------------------------------- 1 | # Helper functions to parse the .CDS files in the homology dataset. 2 | 3 | import os 4 | import json 5 | import seaborn as sns 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from pathlib import Path 10 | from collections import defaultdict 11 | 12 | # Print: min, mean (std), median, max, 5 lowest + 5 largest values with counts, 5 most common values 13 | def _print_stats(name, data): 14 | name = name.upper() 15 | minval = min(*data) 16 | mean = np.mean(data) 17 | std = np.std(data) 18 | maxval = max(*data) 19 | sumval = sum(data) 20 | 21 | counts = defaultdict(int) 22 | for x in data: 23 | counts[x] += 1 24 | 25 | counts = list(counts.items()) 26 | 27 | # Sort by value, get low and high 5. 28 | counts = sorted(counts, key=lambda x: x[0]) 29 | if len(counts) <= 10: 30 | lowhigh = counts 31 | else: 32 | lowhigh = counts[:5] + ['...'] + counts[-5:] 33 | 34 | # Sort by count, get the 5 most frequent values. 35 | counts = sorted(counts, key=lambda x: x[1]) 36 | maxcount = reversed(counts[-5:]) 37 | 38 | print(name) 39 | print(f'{minval: 6} <= {mean: 8.1f}=μ ({std: 8.1f}=σ) <= {maxval: 6}; sum={sumval}') 40 | print('Low/high vals:', *lowhigh) 41 | print('Frequent vals:', *maxcount) 42 | 43 | if len(data) <= 20: 44 | print(*data) 45 | print() 46 | 47 | 48 | def exon_stats(fasta_paths, sequences): 49 | exon_lengths = [] 50 | num_exons = [] 51 | total_exon_lengths = [] 52 | 53 | id_to_exons = dict() 54 | for f in fasta_paths: 55 | data = json.loads(f.with_suffix('.CDS.json').read_text()) 56 | id_to_exons |= data 57 | 58 | print(len(id_to_exons)) 59 | 60 | i = 0 61 | for s in sequences: 62 | exons = id_to_exons[s.metadata['tid']] 63 | total_exon_length = 0 64 | 65 | num_exons.append(len(exons)) 66 | 67 | for exon in exons: 68 | l = exon['end'] - exon['start'] 69 | exon_lengths.append(l) 70 | total_exon_length += l 71 | 72 | _print_stats('Exon length', exon_lengths) 73 | _print_stats('Exons per gene', num_exons) 74 | _print_stats('Total exon length per gene', total_exon_lengths) 75 | 76 | sns.displot(exon_lengths) 77 | plt.show() 78 | -------------------------------------------------------------------------------- /python/lib/sequence.py: -------------------------------------------------------------------------------- 1 | # Classes for Sequence and FastaFile. 
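# Note: only the upper-case bases A/C/G/T are mapped to 0..3 below; every other
# byte, including soft-masked lower-case characters, maps to -1 and is dropped
# by Sequence.remap.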
2 | 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import numba as nb 7 | from numba import njit, types, typed 8 | from numba.experimental import jitclass 9 | 10 | # Map from sequence characters to internal integer representation. 11 | _char_map: dict[str, int] = {'A': 0, 'C': 1, 'G': 2, 'T': 3} 12 | 13 | 14 | # Given the char_map above, returns an array of length 256 mapping bytes to 15 | # internal integers. -1 signals unknown bytes. 16 | def _compute_char_list() -> np.ndarray: 17 | char_list = np.full(256, -1, np.int8) 18 | for k in _char_map: 19 | char_list[ord(k)] = _char_map[k] 20 | return char_list 21 | 22 | 23 | # Map 256 bytes to integers; built from the map above. 24 | # -1 signals unknown bytes. 25 | _char_list: np.ndarray = _compute_char_list() 26 | 27 | 28 | # Class that contains a single sequence. The full_seq member contains the 29 | # original byte-representation of the sequence as read from the Fasta file. The 30 | # seq member contains the processed internal int8 representation. 31 | @jitclass( 32 | [ 33 | ('id', types.unicode_type), 34 | ('metadata', types.DictType(types.unicode_type, types.unicode_type)), 35 | # C-layout 1-dimensional arrays. 36 | ('full_seq', nb.byte[::1]), 37 | ('seq', nb.int8[::1]), 38 | ] 39 | ) 40 | class Sequence: 41 | 42 | # Given an ID of the form key:value|otherkey:othervalue, parse it. 43 | @staticmethod 44 | def id_to_map(id): 45 | data = typed.Dict() 46 | for kv in id.split('|'): 47 | k, v = kv.split(':') 48 | data[k] = v 49 | return data 50 | 51 | # Remap characters by char_map. Removes other (lower case) characters. 52 | @staticmethod 53 | def remap(s): 54 | return np.array([_char_list[c] for c in s if _char_list[c] != -1], dtype=np.int8) 55 | 56 | @staticmethod 57 | def reverse_complement(seq: np.ndarray): 58 | seqr = np.flip(seq) 59 | return np.array([(c ^ 3) for c in seqr], dtype=np.int8) 60 | 61 | def __init__(self, id: str, s: bytes): 62 | # String: header/name/id of this sequence in the Fasta file. 63 | self.id = id 64 | # Metadata encoded in the header. 65 | self.metadata = self.id_to_map(id) 66 | # The original sequence. 67 | self.full_seq = np.array([c for c in s], dtype=nb.byte) 68 | # The sequence with masked repeats (lower case characters) removed, and mapped to integers. 69 | self.seq = self.remap(s) 70 | if 'strand' in self.metadata and self.metadata['strand'] == '-': 71 | self.seq = self.reverse_complement(self.seq) 72 | 73 | def len(self): 74 | return len(self.seq) 75 | 76 | 77 | Sequence_type = Sequence.class_type.instance_type 78 | 79 | 80 | class FastaFile: 81 | def __init__(self, path): 82 | # The Path to the current file. 83 | self.path = Path(path) 84 | # The name of the current file. 85 | self.name = self.path.name 86 | # A list of Sequence objects in this file. 87 | self.seqs = [] 88 | 89 | self.read() 90 | 91 | def read(self): 92 | header = None 93 | seq = [] 94 | 95 | def flush(): 96 | nonlocal header, seq 97 | if header is None: 98 | return 99 | assert seq 100 | sequence = Sequence(header, b''.join(seq)) 101 | self.seqs.append(sequence) 102 | header = None 103 | seq = [] 104 | 105 | # Sequences are read in binary mode; ids are decoded as ascii. 106 | with self.path.open('br') as f: 107 | for line in f: 108 | if line[0] == ord('>'): 109 | flush() 110 | header = line[1:].decode('ascii').strip() 111 | else: 112 | seq.append(line) 113 | flush() 114 | 115 | 116 | # Contains a map from ids to sequences, constructed from a list of FastaFiles. 
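# A minimal usage sketch (the glob pattern and the id below are hypothetical):
#
#   files = [FastaFile(p) for p in Path('data').glob('*.fa')]
#   seqs = SequenceDict(files)
#   s = seqs['tid:ENST00000000001|strand:+']  # None if the id is unknown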
117 | class SequenceDict: 118 | def __init__(self, fastafiles): 119 | self.by_id = dict() 120 | for file in fastafiles: 121 | for seq in file.seqs: 122 | self.by_id[seq.id] = seq 123 | 124 | # Returns None if key not found. 125 | def __getitem__(self, key): 126 | return self.by_id.get(key) 127 | -------------------------------------------------------------------------------- /python/lib/tensor_embedding.py: -------------------------------------------------------------------------------- 1 | # TENSOR EMBEDDING 2 | 3 | from lib.base import * 4 | 5 | # a_1...a_t is mapped to index A^{t-1} a_1 + ... + A * a_{t-1} + 1 * a_t 6 | @jitclass(sketchparams_spec + [('pow', nb.int32[:])]) 7 | class TE(Sketcher): 8 | # https://github.com/numba/numba/issues/1694 9 | __init__Sketcher = Sketcher.__init__ 10 | 11 | def __init__(self, params): 12 | self.__init__Sketcher(params) 13 | 14 | self.pow = np.zeros(self.t + 1, np.int32) 15 | self.pow[0] = 1 16 | for i in range(1, self.t + 1): 17 | self.pow[i] = self.A * self.pow[i - 1] 18 | 19 | # NOTE: The sketch is stored as float64 here so counting won't overflow. 20 | def _empty_tensor(self): 21 | Ts = List() 22 | for l in self.pow: 23 | Ts.append(np.zeros(l, np.float64)) 24 | return Ts 25 | 26 | # Return the sketch for the concatenation of two sequences. 27 | # TODO: Optimize this to modify Tr in place. 28 | def _join(self, Tl, Tr): 29 | Ts = self._empty_tensor() 30 | for tr in range(self.t + 1): 31 | for tl in range(self.t + 1 - tr): 32 | Ts[tl + tr] += np.kron(Tl[tl], Tr[tr]) 33 | return Ts 34 | 35 | # Returns the raw 1D count sketches for all tuple sizes up to t. 36 | # NOTE: This returns counts, not frequencies. 37 | def _full_sketch(self, seq: Sequence): 38 | Ts = self._empty_tensor() 39 | 40 | Ts[0][0] = 1 41 | 42 | # sketch 43 | for c in seq.seq: 44 | assert 0 <= c and c < self.A 45 | for i in range(self.t - 1, -1, -1): 46 | for j in range(len(Ts[i])): 47 | Ts[i + 1][self.A * j + c] += Ts[i][j] 48 | return Ts 49 | 50 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 51 | full_sketch = self._full_sketch(seq) 52 | if self.normalize: 53 | # Normalization factor. 54 | n = seq.len() 55 | nct = nb.float64(1) 56 | for i in range(self.t): 57 | nct = nct * (n - i) / (i + 1) 58 | full_sketch[self.t] /= nct 59 | sketch = np.array([x for x in full_sketch[self.t]], dtype=nb.float32) 60 | return SketchedSequence(seq, sketch) 61 | 62 | # Returns the sketch for the given t as frequencies. 
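    # Illustrative example: with A=4 and t=2 the sketch has 4^2 = 16 entries, and
    # every (not necessarily contiguous) occurrence of the pair (1, 3) in a
    # sequence adds 1 to entry 4*1 + 3 = 7 of the unnormalized counts.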
63 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 64 | return [self.sketch_one(seq) for seq in seqs] 65 | -------------------------------------------------------------------------------- /python/lib/tensor_sketch.py: -------------------------------------------------------------------------------- 1 | # TENSOR SKETCH 2 | 3 | from lib.base import * 4 | 5 | 6 | @jitclass(sketchparams_spec + [('hashes', nb.int32[:, :]), ('signs', nb.float32[:, :])]) 7 | class TS(Sketcher): 8 | __init__Sketcher = Sketcher.__init__ 9 | 10 | def __init__(self, params): 11 | self.__init__Sketcher(params) 12 | 13 | random.seed(31415) 14 | # An A*t array of random integers in [0, D) 15 | self.hashes = np.empty((self.A, self.t), dtype=np.int32) 16 | # An A*t array of random +-1 17 | self.signs = np.empty((self.A, self.t), dtype=np.float32) 18 | for c in range(self.A): 19 | for k in range(self.t): 20 | self.hashes[c][k] = random.randrange(0, self.D) 21 | self.signs[c][k] = random.randrange(-1, 2, 2) 22 | 23 | def _full_sketch(self, seq): 24 | # NOTE: The sketch is stored as float64 here so counting won't overflow. 25 | T = np.zeros((self.t + 1, self.D), dtype=np.float64) 26 | T[0][0] = 1 27 | 28 | for c in seq.seq: 29 | for k in range(self.t - 1, -1, -1): 30 | h = self.hashes[c][k] 31 | s = self.signs[c][k] 32 | for l in range(self.D): 33 | r = l + h if l + h < self.D else l + h - self.D 34 | T[k + 1][l] += s * T[k][r] 35 | 36 | return T 37 | 38 | def _normalize(self, seq, T): 39 | if self.normalize: 40 | # Normalization factor. 41 | n = seq.len() 42 | nct = nb.float64(1) 43 | for i in range(self.t): 44 | nct = nct * (n - i) / (i + 1) 45 | T /= nct 46 | return T 47 | 48 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 49 | full_sketch = self._full_sketch(seq) 50 | 51 | self._normalize(seq, full_sketch[self.t]) 52 | 53 | sketch = np.array([x for x in full_sketch[self.t]], dtype=nb.float32) 54 | return SketchedSequence(seq, sketch) 55 | 56 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 57 | return [self.sketch_one(seq) for seq in seqs] 58 | -------------------------------------------------------------------------------- /python/lib/tensor_sketch_gpu.py: -------------------------------------------------------------------------------- 1 | # GPU TENSOR SKETCH 2 | 3 | from numba import cuda 4 | 5 | from lib.base import * 6 | from lib.tensor_sketch import TS 7 | 8 | # CUDA kernel to sketch a list of sequences. 9 | # A, t, D, L (int32): parameters as usual. 10 | # global_hashes (int32[:, :]): A*t device array of hashes. 11 | # global_signs (float32[:, :]): A*t device array of signs. Note that these are 12 | # floats to avoid additional (slow) int32->float32 conversions. 13 | # seq (int8[:]): concatenation of the sequences to sketch. 14 | # starts (int32[:]): the start positions of the subsequences in seq. 15 | # T: (float32[:, :]): n*D device array for the output, given n input sequences. 16 | @cuda.jit(fastmath=True) 17 | def _gpu_sketch(A, t, D, L, hashes, signs, seq, starts, T): 18 | seqid = cuda.blockIdx.x 19 | start = starts[seqid] 20 | end = starts[seqid + 1] 21 | 22 | l = cuda.threadIdx.x 23 | k = cuda.threadIdx.y 24 | assert k < t 25 | assert l < D // L 26 | 27 | # We use a 2*(t+1)*D tensor consisting of two 'planes'. 28 | # At each step, one plane is the input, and one is the output. Which is indicated by `j` further down. 29 | plane = (t + 1) * D 30 | threads = t * D // L 31 | 32 | # Slice the shared memory into local shared memory arrays. 
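    # The dynamically sized shared buffer (allocated with
    # 4*(threads + 2*(t+1)*D + 2*A*t) bytes at launch) is laid out as:
    #   [two (t+1)*D float32 planes | `threads` int32 chars | A*t float32 signs | A*t int32 hashes]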
33 | # Note the different types per view. 34 | 35 | # NOTE: Tin has a variable offset of k*D to save a bit on further computations. 36 | Tin = cuda.shared.array(shape=0, dtype=nb.float32)[k * D : 2 * plane] 37 | local_seq = cuda.shared.array(shape=0, dtype=nb.int32)[2 * plane : 2 * plane + threads] 38 | 39 | local_signs = cuda.shared.array(shape=0, dtype=nb.float32)[ 40 | 2 * plane + threads : 2 * plane + threads + A * t 41 | ] 42 | local_hashes = cuda.shared.array(shape=0, dtype=nb.int32)[ 43 | 2 * plane + threads + A * t : 2 * plane + threads + 2 * A * t 44 | ] 45 | 46 | # Copy the device memory hashes/signs to shared memory. 47 | if l < A: 48 | local_hashes[l * t + k] = hashes[l][k] 49 | local_signs[l * t + k] = signs[l][k] 50 | 51 | # Initialize the tensors to 0. 52 | for ll in range(l, D, D // L): 53 | Tin[0 * plane + 0 * D + ll] = 0 54 | Tin[0 * plane + (0 + 1) * D + ll] = 0 55 | Tin[1 * plane + 0 * D + ll] = 0 56 | Tin[1 * plane + (0 + 1) * D + ll] = 0 57 | 58 | cuda.syncthreads() 59 | 60 | # Initialize the 0-element of the tensor to 1. 61 | if k == 0: 62 | Tin[0] = 1 63 | Tin[plane] = 1 64 | 65 | cuda.syncthreads() 66 | 67 | # The offset for the plane we're currently reading from. The write offset 68 | # is the other plane: `plane-read_plane`. 69 | read_plane = 0 70 | 71 | # Loop over characters in the sequence. 72 | tid = l + k * D // L 73 | for i in range((end - start) // threads): 74 | # Read `threads` characters from `seq` and store them in `local_seq` in shared memory. 75 | idx = start + i * threads + tid 76 | local_seq[tid] = seq[idx] 77 | cuda.syncthreads() 78 | 79 | # Process the fetched characters. 80 | for c in local_seq: 81 | h = local_hashes[c * t + k] 82 | s = local_signs[c * t + k] 83 | write_plane = plane - read_plane 84 | # Process L consecutive indices (of the D in total). 85 | # 0 <= l < D/L, so this covers all of [0, D). 86 | for ll in range(L * l, L * (l + 1)): 87 | # Compute the shifted target index, avoiding a modulo operation. 88 | r = ll + h 89 | r -= D if r >= D else 0 90 | # Write to output tensor. 91 | Tin[write_plane + D + ll] = Tin[read_plane + D + ll] + s * Tin[read_plane + r] 92 | 93 | # After this thread has processed the current character `c`, swap the active plane and wait for other threads. 94 | read_plane = write_plane 95 | cuda.syncthreads() 96 | 97 | # Process the remaining characters. We don't do synchronous prefetching to 98 | # shared memory here, because this only covers the last few characters of 99 | # the sequence. 100 | # TODO: If sequences are short, it may actually be beneficial to still do this. 101 | for idx in range(start + (end - start) // threads * threads, end): 102 | c = seq[idx] 103 | # Same code as above. 104 | h = local_hashes[c * t + k] 105 | s = local_signs[c * t + k] 106 | write_plane = plane - read_plane 107 | for ll in range(L * l, L * (l + 1)): 108 | r = ll + h 109 | r -= D if r >= D else 0 110 | Tin[write_plane + D + ll] = Tin[read_plane + D + ll] + s * Tin[read_plane + r] 111 | 112 | read_plane = write_plane 113 | cuda.syncthreads() 114 | 115 | # Copy to result. 116 | for ll in range(l, D, D // L): 117 | T[seqid][k][ll] = Tin[read_plane + ll] 118 | T[seqid][k + 1][ll] = Tin[read_plane + D + ll] 119 | 120 | 121 | class GTS(Sketcher): 122 | def __init__(self, params): 123 | super().__init__(params) 124 | 125 | # Use the jitclass TS to copy hashes and signs parameters. 126 | # This is needed, because calling random returns different random 127 | # numbers inside and outside of jitted functions. 
128 | # Ideally we'd inherit from TS, but inheriting from jitted classes is 129 | # not possible. 130 | self.ts = TS(params) 131 | self.hashes = np.array(self.ts.hashes, dtype=np.int32) 132 | self.signs = np.array(self.ts.signs, dtype=np.float32) 133 | 134 | self.d_hashes = cuda.to_device(self.hashes) 135 | self.d_signs = cuda.to_device(self.signs) 136 | 137 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 138 | assert isinstance(seqs, List) 139 | assert len(seqs) > 0 140 | assert isinstance(seqs[0], Sequence) 141 | 142 | # TODO: Add normalization to the GPU sketch method. 143 | for seq in seqs: 144 | assert ( 145 | seq.len() ** self.t < 10 ** 38 146 | ), "Counts may overflow! Lower t or shorten the sequence." 147 | 148 | # Sort by decreasing length 149 | seqs = sorted(seqs, key=lambda s: len(s.seq), reverse=True) 150 | 151 | # Put all operations on a stream, so that the python code runs asynchronously of the GPU code. 152 | stream = cuda.stream() 153 | 154 | # Launch one thread block per sequence. 155 | blocks = len(seqs) 156 | 157 | # Convert the input sequences to a single list of characters and the corresponding start indices. 158 | raw_seqs = [seq.seq for seq in seqs] 159 | raw_seq = np.concatenate(raw_seqs) 160 | starts = np.array( 161 | np.cumsum(np.array([0] + [len(seq) for seq in raw_seqs]), dtype=np.int32), 162 | dtype=np.int32, 163 | ) 164 | 165 | # Copy data from host to device. 166 | d_raw_seq = cuda.to_device(raw_seq, stream=stream) 167 | d_starts = cuda.to_device(starts, stream=stream) 168 | d_T = cuda.device_array((blocks, self.t + 1, self.D), dtype=np.float32, stream=stream) 169 | 170 | threads = self.t * self.D // self.L 171 | 172 | # Make sure we have enough threads to initialize self.hashes and 173 | # self.signs by a single synchronous copy. 174 | assert self.DL >= self.A 175 | 176 | # One thread per (l, k) <= (D/L, t) 177 | _gpu_sketch[ 178 | (blocks, 1), 179 | (self.DL, self.t), 180 | stream, 181 | 4 * (threads + 2 * (self.t + 1) * self.D + 2 * self.A * self.t), 182 | ]( 183 | np.int32(self.A), 184 | np.int32(self.t), 185 | np.int32(self.D), 186 | np.int32(self.L), 187 | self.d_hashes, 188 | self.d_signs, 189 | d_raw_seq, 190 | d_starts, 191 | d_T, 192 | ) 193 | 194 | T = d_T.copy_to_host(stream=stream) 195 | 196 | # Only return the length t sketch 197 | sketched_seqs = List() 198 | for seq, sketch in zip(seqs, T): 199 | self.ts._normalize(seq, sketch[self.t]) 200 | sketched_seqs.append(SketchedSequence(seq, sketch[self.t])) 201 | 202 | return sketched_seqs 203 | 204 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 205 | return self.sketch(List([seq]))[0] 206 | -------------------------------------------------------------------------------- /python/lib/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | # Time the duration of the given lambda and print it. 
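# Example (the sketcher call is hypothetical):
#   sketches = timeit(lambda: sketcher.sketch(seqs), 'sketch')
# prints "Duration [sketch]: 0.1234" and returns the value of the lambda.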
4 | def timeit(f, name=''): 5 | s = time.time() 6 | ret = f() 7 | e = time.time() 8 | print(f'Duration [{name}]: {e-s:4.4f}') 9 | return ret 10 | -------------------------------------------------------------------------------- /sequence/alphabets.cpp: -------------------------------------------------------------------------------- 1 | #include "alphabets.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace ts { 8 | 9 | template 10 | class log2 { 11 | static constexpr size_t _log2(size_t x) { 12 | if (x < 2) { 13 | return 0; 14 | } else { 15 | return _log2(x >> 1) + 1; 16 | } 17 | } 18 | 19 | public: 20 | static constexpr size_t value = _log2(n); 21 | }; 22 | 23 | constexpr uint8_t alphabet_size_dna = 5; 24 | constexpr char alphabet_dna[] = "ACGTN"; 25 | constexpr uint8_t bits_per_char_dna = 3; 26 | constexpr uint8_t char2int_tab_dna[128] // A=1,C=2,G=3,T=4,N=0,invalid=5 27 | = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 0=25 28 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 26-50 29 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, // 51-75 30 | 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, // 76-100 31 | 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }; // 100-127 32 | 33 | inline uint32_t char2int_dna(uint8_t c) { 34 | return char2int_tab_dna[c]; 35 | } 36 | 37 | constexpr uint8_t alphabet_size_dna4 = 4; 38 | constexpr char alphabet_dna4[] = "ACGTN"; 39 | constexpr uint8_t bits_per_char_dna4 = 2; 40 | constexpr uint8_t char2int_tab_dna4[128] // A=0,C=1,G=2,T=3, invalid=5 41 | = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 0=25 42 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 26-50 43 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, // 51-75 44 | 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, // 76-100 45 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }; // 100-127 46 | 47 | inline uint32_t char2int_dna4(uint8_t c) { 48 | return char2int_tab_dna4[c]; 49 | } 50 | 51 | 52 | constexpr char alphabet_protein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; 53 | constexpr uint8_t alphabet_size_protein = sizeof(alphabet_protein) - 1; 54 | constexpr uint8_t bits_per_char_protein = log2::value + 1; 55 | constexpr uint8_t char2int_tab_protein[128] 56 | = { 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 57 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 58 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 59 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 60 | 25, 23, 24, 25, 25, 25, 25, 25, 25, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 61 | 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, 25, 25, 25, 25, 25 }; 62 | 63 | inline uint32_t char2int_protein(uint8_t c) { 64 | return char2int_tab_protein[c]; 65 | } 66 | 67 | enum class AlphabetType { DNA4, DNA5, Protein }; 68 | 69 | AlphabetType from_string(std::string str) { 70 | std::transform(str.begin(), str.end(), str.begin(), 71 | [](unsigned char c) { return std::tolower(c); }); 72 | if (str == "dna4") { 73 | return AlphabetType::DNA4; 74 | } else if (str == "dna5") { 75 | return AlphabetType::DNA5; 76 | } else if (str == "protein") { 77 | return AlphabetType::Protein; 78 | } 
else { 79 | throw std::logic_error("Invalid alphabet type"); 80 | } 81 | } 82 | 83 | std::function char2int; 84 | const char *alphabet; 85 | uint8_t alphabet_size; 86 | uint8_t bits_per_char; 87 | 88 | void init_alphabet(const std::string &alphabet_str) { 89 | switch (from_string(alphabet_str)) { 90 | case AlphabetType::DNA5: 91 | char2int = char2int_dna; 92 | alphabet = alphabet_dna; 93 | alphabet_size = alphabet_size_dna; 94 | bits_per_char = bits_per_char_dna; 95 | return; 96 | case AlphabetType::DNA4: 97 | char2int = char2int_dna4; 98 | alphabet = alphabet_dna4; 99 | alphabet_size = alphabet_size_dna4; 100 | bits_per_char = bits_per_char_dna4; 101 | return; 102 | case AlphabetType::Protein: 103 | char2int = char2int_protein; 104 | alphabet = alphabet_protein; 105 | alphabet_size = alphabet_size_protein; 106 | bits_per_char = bits_per_char_protein; 107 | return; 108 | default: 109 | std::cerr << "Invalid alphabet type: " << alphabet_str << std::endl; 110 | std::exit(1); 111 | } 112 | } 113 | 114 | } // namespace ts 115 | -------------------------------------------------------------------------------- /sequence/alphabets.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Defines the alphabets that TensorSketch can operate on 9 | namespace ts { 10 | 11 | extern std::function char2int; 12 | extern const char *alphabet; 13 | extern uint8_t alphabet_size; 14 | extern uint8_t bits_per_char; 15 | 16 | void init_alphabet(const std::string &alphabet_str); 17 | 18 | } // namespace ts 19 | -------------------------------------------------------------------------------- /sequence/fasta_io.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sequence/alphabets.hpp" 4 | #include "util/utils.hpp" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace ts { // ts = Tensor Sketch 14 | 15 | /** 16 | * Represents the contents of a single Fasta file. 17 | * All the sequences in a file should be treated as a single assembly and should be sketched as a 18 | * whole. 19 | */ 20 | template 21 | struct FastaFile { 22 | /** The name of the file. */ 23 | std::string filename; 24 | /** The leading comment before each sequence. Always has the same length as sequences. */ 25 | std::vector comments; 26 | /** The sequences in the file. */ 27 | std::vector> sequences; 28 | }; 29 | 30 | /** 31 | * Reads a fasta file and returns its contents. 32 | * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t 33 | */ 34 | template 35 | FastaFile read_fasta(const std::string &file_name, const std::string &input_format) { 36 | FastaFile f; 37 | 38 | if (!std::filesystem::exists(file_name)) { 39 | std::cerr << "Input file does not exist: " << file_name << std::endl; 40 | std::exit(1); 41 | } 42 | 43 | std::ifstream infile(file_name); 44 | if (!infile.is_open()) { 45 | std::cout << "Could not open " + file_name << std::endl; 46 | std::exit(1); 47 | } 48 | 49 | f.filename = std::filesystem::path(file_name).filename(); 50 | 51 | std::string line; 52 | std::vector seq; 53 | while (std::getline(infile, line)) { 54 | if (line[0] == '>') { 55 | if (!seq.empty()) { 56 | f.sequences.push_back(std::move(seq)); 57 | seq.clear(); 58 | } 59 | // Drop the leading '>'. 
60 | f.comments.emplace_back(line.begin() + 1, line.end()); 61 | } else if (!line.empty()) { 62 | if (input_format == "fasta") { 63 | for (char c : line) { 64 | seq.push_back(char2int(c)); 65 | } 66 | } else if (input_format == "csv") { 67 | std::stringstream ss(line); 68 | std::string item; 69 | while (std::getline(ss, item, ',')) { 70 | seq.push_back(std::stoi(item, 0, 16)); 71 | } 72 | f.comments.push_back("seq" + std::to_string(f.sequences.size())); 73 | f.sequences.push_back(std::move(seq)); 74 | seq.clear(); 75 | } else { 76 | std::cerr << "Invalid input foramt: " << input_format << std::endl; 77 | exit(1); 78 | } 79 | } 80 | } 81 | if (!seq.empty()) { 82 | f.sequences.push_back(std::move(seq)); 83 | seq.clear(); 84 | } 85 | if(f.sequences.size() != f.comments.size()) { 86 | std::cerr << "Invalid fasta file: " << file_name << std::endl; 87 | std::exit(1); 88 | } 89 | return f; 90 | } 91 | 92 | /** 93 | * Reads all .fasta and .fna files in the given directory and returns them. 94 | * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t 95 | */ 96 | template 97 | std::vector> read_directory(const std::string &directory_name) { 98 | if (!std::filesystem::exists(directory_name)) { 99 | std::cerr << "Input directory does not exist: " << directory_name << std::endl; 100 | std::exit(1); 101 | } 102 | std::vector> files; 103 | 104 | // Handle the case where the argument is a single file as well. 105 | if (std::filesystem::is_regular_file(directory_name)) { 106 | files.emplace_back(read_fasta(directory_name, "fasta")); 107 | } else { 108 | for (const auto &f : std::filesystem::directory_iterator(directory_name)) { 109 | const std::filesystem::path ext = f.path().extension(); 110 | if (ext == ".fa" || ext == ".fna" || ext == ".fasta") { 111 | files.emplace_back(read_fasta(f.path(), "fasta")); 112 | } 113 | } 114 | } 115 | 116 | return files; 117 | } 118 | 119 | template 120 | void write_fasta(const std::string &file_name, const Vec2D &sequences, bool Abc = false) { 121 | std::ofstream fo(file_name); 122 | fo << "#" + std::to_string(random()) << std::endl; 123 | fo << "# " << flag_values(' ') << std::endl; 124 | for (uint32_t si = 0; si < sequences.size(); si++) { 125 | fo << ">s" << si << "\n"; 126 | auto &seq = sequences[si]; 127 | for (auto &c : seq) { 128 | if (Abc) { 129 | fo << (char)(c + (int)'A'); 130 | } else { 131 | fo << c << ","; 132 | } 133 | } 134 | fo << "\n\n"; 135 | } 136 | } 137 | 138 | } // namespace ts 139 | -------------------------------------------------------------------------------- /sequence/sequence_generator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | class SeqGen { 15 | public: 16 | SeqGen(uint8_t alphabet_size, 17 | bool fix_len, 18 | uint32_t num_seqs, 19 | uint32_t seq_len, 20 | uint32_t group_size, 21 | double max_mutation_rate, 22 | double min_mutation_rate, 23 | std::string phylogeny_shape) 24 | : alphabet_size(alphabet_size), 25 | fix_len(fix_len), 26 | num_seqs(num_seqs), 27 | seq_len(seq_len), 28 | group_size(group_size), 29 | max_mutation_rate(max_mutation_rate), 30 | min_mutation_rate(min_mutation_rate), 31 | phylogeny_shape(std::move(phylogeny_shape)) { 32 | assert(group_size >= 2 && "group size<=1 leads to completely independent sequences"); 33 | } 34 | 35 | /** 36 | * Generate sequences 
divided into independent groups of size `group_size`, and store 37 | * ingroup_pairs within each group in `ingroup_pairs` 38 | * @tparam T character type 39 | * @tparam C index type 40 | * @param seqs generated sequences 41 | * @param pairs sequence ingroup_pairs within each group 42 | */ 43 | template 44 | Vec2D generate_seqs() { 45 | if (phylogeny_shape == "pair") { // shape=path: implemented as path & group_size=2 46 | phylogeny_shape = "path"; 47 | group_size = 2; 48 | } 49 | Vec2D seqs; 50 | seqs.reserve(num_seqs); 51 | while (seqs.size() < num_seqs) { 52 | Vec2D group; 53 | 54 | // tree-like: g1->g2, add g2 to pool, g1->g3, g2->g4, add g3, g4 to pool 55 | if (phylogeny_shape == "tree") { 56 | group = Vec2D(1); 57 | random_sequence(group[0], seq_len); 58 | Vec2D children; 59 | while (group.size() < group_size) { 60 | for (auto &seq : group) { 61 | std::vector ch; 62 | mutate(seq, ch); 63 | children.push_back(seq); 64 | children.push_back(ch); 65 | } 66 | std::swap(group, children); 67 | children.clear(); 68 | } 69 | } else if (phylogeny_shape == "path") { // path-like: g0->g1->g2->g3->... 70 | group = Vec2D(group_size); 71 | random_sequence(group[0], seq_len); 72 | for (size_t i = 0; i < group_size - 1; i++) { 73 | mutate(group[i], group[i + 1]); 74 | } 75 | } else if (phylogeny_shape == "star") { // star-like: g0->g1, g0->g2,g0->g3 ... 76 | group = Vec2D(1); 77 | random_sequence(group[0], seq_len); 78 | for (size_t i = 1; i < group_size; i++) { 79 | mutate(group[0], group[i]); 80 | } 81 | } 82 | 83 | group.resize(group_size); 84 | seqs.insert(seqs.end(), group.begin(), group.end()); 85 | if (seqs.size() > num_seqs) { 86 | seqs.resize(num_seqs); 87 | } 88 | } 89 | return seqs; 90 | } 91 | 92 | template 93 | void ingroup_pairs(std::vector> &pairs) { 94 | for (size_t go = 0; go < num_seqs; go += group_size) { // group-offset 95 | for (size_t i = 0; i < group_size && go + i < num_seqs; i++) { // group-member i 96 | for (size_t j = i + 1; j < group_size && go + j < num_seqs; j++) { // group-member j 97 | pairs.push_back({ go + i, go + j }); 98 | } 99 | } 100 | } 101 | } 102 | 103 | 104 | private: 105 | template 106 | void mutate(const std::vector &ref, std::vector &seq) { 107 | std::uniform_real_distribution unif(min_mutation_rate, max_mutation_rate); 108 | mutate(ref, seq, unif(gen)); 109 | if (fix_len) 110 | make_fix_len(seq); 111 | } 112 | 113 | /** 114 | * Mutate seq from ref, by mutating each position with the probability = `rate` 115 | * @tparam T element type in the sequence 116 | * @param ref 117 | * @param seq mutated sequence 118 | * @param rate probability of mutation at each index 119 | */ 120 | template 121 | void mutate(const std::vector &ref, std::vector &seq, double rate) { 122 | assert((rate >= 0.0) && (rate <= 1.0) && " rate must be strictly in the range [0,1]"); 123 | // probabilities for each index position: no mutation, insert, delete, substitute 124 | std::discrete_distribution mut { 1 - rate, rate / 3, rate / 3, rate / 3 }; 125 | // the range chosen such that (sub_char+ref % alphabet_size) will different from ref 126 | std::uniform_int_distribution sub_char(1, alphabet_size - 1); 127 | // random character from the alphabet 128 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 129 | for (size_t i = 0; i < ref.size(); i++) { 130 | switch (mut(gen)) { 131 | case 0: { // no mutation 132 | seq.push_back(ref[i]); 133 | break; 134 | } 135 | case 1: { // insert 136 | seq.push_back(rand_char(gen)); 137 | i--; // init_tensor_slide_params negate the increment 
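                        // (the decrement cancels the loop's i++, so ref[i] is
                        // considered again after the inserted random character)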
138 | break; 139 | } 140 | case 2: { // delete 141 | break; 142 | } 143 | case 3: { // substitute 144 | seq.push_back((sub_char(gen) + ref[i]) % alphabet_size); 145 | break; 146 | } 147 | } 148 | } 149 | } 150 | 151 | 152 | template 153 | void make_fix_len(std::vector &seq) { 154 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 155 | if (seq.size() > seq_len) { 156 | seq = std::vector(seq.begin(), seq.end()); 157 | } else if (seq.size() < seq_len) { 158 | while (seq.size() < seq_len) { 159 | seq.push_back(rand_char(gen)); 160 | } 161 | } 162 | } 163 | 164 | /** 165 | * Generate a random sequence of length `len` 166 | * @tparam T 167 | * @param seq : the result will be stored in `seq` 168 | * @param len : length of the random sequence 169 | */ 170 | template 171 | void random_sequence(std::vector &seq, size_t len) { 172 | seq.resize(len); 173 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 174 | for (uint32_t i = 0; i < len; i++) { 175 | seq[i] = rand_char(gen); 176 | } 177 | } 178 | 179 | 180 | private: 181 | std::mt19937 gen = std::mt19937(341234); 182 | 183 | uint8_t alphabet_size; 184 | bool fix_len; 185 | uint32_t num_seqs; 186 | uint32_t seq_len; 187 | uint32_t group_size; 188 | double max_mutation_rate; 189 | double min_mutation_rate; 190 | std::string phylogeny_shape; 191 | }; 192 | 193 | } // namespace ts 194 | -------------------------------------------------------------------------------- /sequence_generator_main.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence/sequence_generator.hpp" 2 | #include "sequence/fasta_io.hpp" 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | DEFINE_int32(alphabet_size, 4, "Size of the alphabet for generated sequences"); 11 | DEFINE_int32(A, 4, "Short hand for --alphabet_size"); 12 | 13 | DEFINE_bool(fix_len, false, "Force generated sequence length to be equal"); 14 | DEFINE_bool(F, false, "Short hand for --fix_len"); 15 | 16 | 17 | DEFINE_uint32(num_seqs, 200, "Number of sequences to be generated"); 18 | DEFINE_uint32(N, 200, "Short hand for --num_seqs"); 19 | 20 | DEFINE_uint32(seq_len, 256, "The length of sequence to be generated"); 21 | DEFINE_uint32(L, 256, "Short hand for --seq_len"); 22 | 23 | DEFINE_uint32(group_size, 2, "The number of sequences in each group"); 24 | DEFINE_uint32(G, 2, "Short hand for --group_size"); 25 | 26 | DEFINE_double(max_mutation_rate, 0.3, "Maximum rate of point mutation for sequence generation"); 27 | DEFINE_double(R, 0.3, "Short hand for --max_mutation_rate"); 28 | 29 | DEFINE_double(min_mutation_rate, 0.0, "min rate for sequence mutation for sequence generation"); 30 | DEFINE_double(r, 0.00, "Short hand for --min_mutation_rate"); 31 | 32 | 33 | DEFINE_string(output_dir, "/tmp/", "File name where the generated sequence should be written"); 34 | DEFINE_string(o, "./seqs.fa", "Short hand for --output"); 35 | 36 | 37 | static bool validatePhylogenyShape(const char *flagname, const std::string &value) { 38 | if (value == "path" || value == "tree" || value == "star" || value == "pair") 39 | return true; 40 | printf("Invalid value for --%s: %s\n", flagname, value.c_str()); 41 | return false; 42 | } 43 | DEFINE_string(phylogeny_shape, 44 | "path", 45 | "shape of the phylogeny can be 'path', 'tree', 'star', or 'pair'"); 46 | DEFINE_validator(phylogeny_shape, &validatePhylogenyShape); 47 | 48 | 49 | 50 | void adjust_short_names() { 51 | if (!gflags::GetCommandLineFlagInfoOrDie("A").is_default) { 52 
| FLAGS_alphabet_size = FLAGS_A; 53 | } 54 | if (!gflags::GetCommandLineFlagInfoOrDie("N").is_default) { 55 | FLAGS_num_seqs = FLAGS_N; 56 | } 57 | if (!gflags::GetCommandLineFlagInfoOrDie("L").is_default) { 58 | FLAGS_seq_len = FLAGS_L; 59 | } 60 | if (!gflags::GetCommandLineFlagInfoOrDie("R").is_default) { 61 | FLAGS_max_mutation_rate = FLAGS_R; 62 | } 63 | if (!gflags::GetCommandLineFlagInfoOrDie("r").is_default) { 64 | FLAGS_min_mutation_rate = FLAGS_r; 65 | } 66 | if (!gflags::GetCommandLineFlagInfoOrDie("o").is_default) { 67 | FLAGS_output_dir = FLAGS_o; 68 | } 69 | if (!gflags::GetCommandLineFlagInfoOrDie("G").is_default) { 70 | FLAGS_group_size = FLAGS_G; 71 | } 72 | } 73 | 74 | int main(int argc, char *argv[]) { 75 | gflags::ParseCommandLineFlags(&argc, &argv, true); 76 | adjust_short_names(); 77 | 78 | ts::Vec2D seqs; 79 | std::vector seq_names; 80 | std::string test_id; 81 | 82 | ts::SeqGen seq_gen(FLAGS_alphabet_size, FLAGS_fix_len, FLAGS_num_seqs, FLAGS_seq_len, 83 | FLAGS_group_size, FLAGS_max_mutation_rate, FLAGS_min_mutation_rate, FLAGS_phylogeny_shape); 84 | 85 | seqs = seq_gen.generate_seqs(); 86 | ts::write_fasta(std::filesystem::path(FLAGS_output_dir) / "seqs.fa", seqs); 87 | } 88 | -------------------------------------------------------------------------------- /sketch/dim_reduce.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | namespace ts { 11 | 12 | class Int32Flattener { 13 | public: 14 | using sketch_type = std::vector; 15 | 16 | Int32Flattener(uint32_t flat_dim, uint32_t sketch_dim, uint32_t max_len, uint32_t seed) 17 | : flat_dim(flat_dim), sketch_dim(sketch_dim), max_len(max_len) { 18 | std::mt19937 gen(seed); 19 | std::uniform_int_distribution distribution; 20 | rand_proj = new2D(flat_dim, this->max_len * sketch_dim * 2); 21 | for (auto &v : rand_proj) { 22 | for (auto &e : v) { 23 | e = distribution(gen); 24 | } 25 | } 26 | } 27 | 28 | std::vector flatten(const Vec2D &sketch) { 29 | Timer timer("Int32Flattener"); 30 | assert(rand_proj.size() == flat_dim); 31 | std::vector v(flat_dim, 0); 32 | for (uint32_t s1 = 0; s1 < flat_dim; s1++) { 33 | for (uint32_t s2 = 0; s2 < 32; s2++) { // iterate over 32 bits 34 | size_t j = s1 % sketch_dim; 35 | double val = 0; 36 | for (size_t i = 0; i < sketch.size(); i++) { 37 | auto bit = rand_proj[s1][i * sketch_dim + j] >> s2; // random bit 38 | val += (bit & 1) ? 
sketch[i][j] : -sketch[i][j]; 39 | } 40 | v[s1] = (v[s1] << 1) + std::signbit(val); // insert sgn(val) into v[s1] 41 | } 42 | } 43 | return v; 44 | } 45 | 46 | static double dist(const std::vector &v1, const std::vector &v2) { 47 | Timer timer("Int32Flattener_dist"); 48 | assert(v1.size() == v2.size()); 49 | std::vector d(v1.size()); 50 | double val = 0; 51 | for (size_t i = 0; i < d.size(); i++) { 52 | val += __builtin_popcount(v1[i] ^ v2[i]); 53 | } 54 | return val; 55 | } 56 | 57 | private: 58 | uint32_t flat_dim; 59 | uint32_t sketch_dim; 60 | uint32_t max_len; 61 | Vec2D rand_proj; 62 | }; 63 | 64 | 65 | class DoubleFlattener { 66 | public: 67 | using sketch_type = std::vector; 68 | 69 | DoubleFlattener(uint32_t output_dim, 70 | uint32_t input_dim, 71 | uint32_t input_max_len, 72 | uint32_t seed) 73 | : flat_dim(output_dim), sketch_dim(input_dim), max_len(input_max_len) { 74 | std::mt19937 gen(seed); 75 | std::cauchy_distribution distribution(0, 1.0); 76 | rand_proj = new2D(this->flat_dim, this->max_len * input_dim * 2); 77 | for (auto &v : rand_proj) { 78 | for (double &e : v) { 79 | e = distribution(gen); 80 | } 81 | } 82 | } 83 | 84 | std::vector flatten(const Vec2D &sketch) { 85 | Timer timer("DoubleFlattener"); 86 | assert(rand_proj.size() == flat_dim); 87 | std::vector v(this->flat_dim, 0); 88 | for (size_t s = 0; s < this->flat_dim; s++) { 89 | size_t j = s % this->sketch_dim; 90 | for (size_t i = 0; i < sketch.size(); i++) { 91 | v[s] += rand_proj[s][i * this->sketch_dim + j] * sketch[i][j]; 92 | } 93 | v[s] /= (double)(sketch.size() 94 | * sketch[0] 95 | .size()); // divide by number of elements to compute the mean 96 | } 97 | 98 | return v; 99 | } 100 | 101 | static double dist(const std::vector &v1, const std::vector &v2) { 102 | Timer timer("DoubleFlattener_dist"); 103 | assert(v1.size() == v2.size()); 104 | std::vector d(v1.size()); 105 | for (size_t i = 0; i < d.size(); i++) { 106 | d[i] = abs(v1[i] - v2[i]); 107 | } 108 | std::sort(d.begin(), d.end()); 109 | return d[d.size() / 2]; // return the median 110 | } 111 | 112 | private: 113 | uint32_t flat_dim; 114 | uint32_t sketch_dim; 115 | uint32_t max_len; 116 | Vec2D rand_proj; 117 | }; 118 | 119 | } // namespace ts 120 | -------------------------------------------------------------------------------- /sketch/edit_distance.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sketch/sketch_base.hpp" 4 | #include "util/multivec.hpp" 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | template 15 | class EditDistance : public SketchBase *, false> { 16 | public: 17 | explicit EditDistance(const std::string &name = "ED") 18 | : SketchBase *, false>(name) { 19 | init(); 20 | } 21 | 22 | void init() {} 23 | 24 | const std::vector *compute(const std::vector &seq) { return &seq; } 25 | 26 | static double dist(const std::vector *a, const std::vector *b) { 27 | Timer timer("edit_dist"); 28 | return edit_distance(*a, *b); 29 | } 30 | }; 31 | 32 | } // namespace ts 33 | -------------------------------------------------------------------------------- /sketch/hash_base.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | 3 | namespace ts { 4 | 5 | HashAlgorithm parse_hash_algorithm(const std::string &name) { 6 | if (name == "uniform") { 7 | return HashAlgorithm::uniform; 8 | } 9 | if (name 
== "crc32") { 10 | return HashAlgorithm::crc32; 11 | } 12 | if (name == "murmur") { 13 | return HashAlgorithm::murmur; 14 | } 15 | assert(false); 16 | } 17 | 18 | } // namespace ts 19 | -------------------------------------------------------------------------------- /sketch/hash_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sketch/sketch_base.hpp" 4 | #include "util/timer.hpp" 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace ts { 16 | 17 | enum class HashAlgorithm { uniform, crc32, murmur }; 18 | 19 | HashAlgorithm parse_hash_algorithm(const std::string &name); 20 | 21 | /** 22 | * @tparam T the type of elements in the hash. 23 | */ 24 | template 25 | class HashBase : public SketchBase, true> { 26 | public: 27 | HashBase(T set_size, 28 | size_t sketch_dim, 29 | size_t hash_size, 30 | HashAlgorithm hash_algorithm, 31 | uint32_t seed, 32 | const std::string &name = "HashBase", 33 | size_t kmer_size = 1) 34 | : SketchBase, true>(name, kmer_size), 35 | set_size(set_size), 36 | sketch_dim(sketch_dim), 37 | hash_size(2 * hash_size), 38 | hash_algorithm(hash_algorithm), 39 | rand(0, this->hash_size - 1), 40 | rng(seed) { 41 | init(); 42 | } 43 | 44 | void init() { 45 | hash_seed = rand(rng); 46 | hash_seed2 = rand(rng); 47 | hashes.assign(sketch_dim, {}); 48 | hash_values.assign(sketch_dim, {}); 49 | } 50 | 51 | void set_hashes_for_testing(const std::vector> &h) { hashes = h; } 52 | 53 | protected: 54 | T set_size; 55 | size_t sketch_dim; 56 | size_t hash_size; 57 | 58 | /** 59 | * Returns the hash value for the given #key of the #index-th hash function. 60 | */ 61 | T hash(uint64_t index, uint64_t key) { 62 | switch (hash_algorithm) { 63 | case HashAlgorithm::uniform: { 64 | T val; 65 | // TODO multiple read Semaphore instead of critical 66 | #pragma omp critical 67 | { 68 | auto [it, inserted] = hashes[index].insert({ key, -1 }); 69 | if (!inserted) { 70 | val = it->second; 71 | } else { 72 | do { 73 | val = rand(rng); 74 | } while (!hash_values[index].insert(val).second); 75 | it->second = val; 76 | } 77 | } 78 | assert(val >= 0 && val < hash_size 79 | && " Hash values are not in [0,set_size-1] range"); 80 | return val; 81 | } 82 | case HashAlgorithm::crc32: { 83 | uint32_t val = _mm_crc32_u32(hash_seed, (uint32_t)key); 84 | val = _mm_crc32_u32((uint32_t)val, (uint32_t)index); 85 | if constexpr (sizeof(T) <= 4) { 86 | return static_cast(val); 87 | } else if constexpr (sizeof(T) <=8) { 88 | uint32_t val2 = _mm_crc32_u32(hash_seed2, (uint32_t)key); 89 | val2 = _mm_crc32_u32((uint32_t)val2, (uint32_t)index); 90 | return (val << 4) | val2; 91 | } 92 | } 93 | case HashAlgorithm::murmur: { 94 | uint64_t to_hash[] = { index, key }; 95 | uint8_t result[16]; 96 | MurmurHash3_x86_128(to_hash, 16, hash_seed, result); 97 | T v = *((T *)result); 98 | return v; 99 | } 100 | default: 101 | return -1; 102 | } 103 | } 104 | 105 | private: 106 | HashAlgorithm hash_algorithm; 107 | 108 | /** Contains the sketch_dim permutations (hashes) that are used to compute the min-hash */ 109 | std::vector> hashes; 110 | /** Contains the values used so far for each on-demand permutation */ 111 | std::vector> hash_values; 112 | std::uniform_int_distribution rand; 113 | std::mt19937 rng; 114 | uint32_t hash_seed; 115 | uint32_t hash_seed2; 116 | }; 117 | 118 | 119 | } // namespace ts 120 | 
-------------------------------------------------------------------------------- /sketch/hash_min.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Implements min-hash-based sketching, as described in https://en.wikipedia.org/wiki/MinHash. 16 | * Given a set S, and a sequence s=s1...sn with elements from S, this class computes a vector 17 | * {hmin_1(s), hmin_2(s), ..., hmin_sketch_size(s)}, where hmin_k(s)=s_i, such that h_k(s_i) is the 18 | * smallest of h_k(s_1), h_k(s_2), ..., h_k(s_n) and h_k:S->{1..#set_size} is a random permuation of 19 | * the elements in S. 20 | * This class assumes that S= {0,1,2....,#set_size}. 21 | * @tparam T the type of S's elements. 22 | */ 23 | template 24 | class MinHash : public HashBase { 25 | public: 26 | /** 27 | * Constructs a min-hasher for the given alphabet size which constructs sketches of the set size 28 | * and sketch dimension. 29 | * @param set_size the number of elements in S, 30 | * @param sketch_dim the number of components (elements) in the sketch vector. 31 | * @param seed the seed to initialize the random number generator used for the random hash 32 | * functions. 33 | */ 34 | MinHash(T set_size, 35 | size_t sketch_dim, 36 | HashAlgorithm hash_algorithm, 37 | uint32_t seed, 38 | const std::string &name = "MH", 39 | size_t kmer_size = 1) 40 | : HashBase(set_size, sketch_dim, set_size, hash_algorithm, seed, name, kmer_size) {} 41 | 42 | /** 43 | * Computes the min-hash sketch for the given kmers. 44 | * @param kmers kmers extracted from a sequence 45 | * @return the min-hash sketch of #kmers 46 | */ 47 | std::vector compute(const std::vector &kmers) { 48 | Timer timer("minhash"); 49 | std::vector sketch(this->sketch_dim); 50 | if (kmers.empty()) { 51 | return sketch; 52 | } 53 | 54 | for (size_t si = 0; si < this->sketch_dim; si++) { 55 | T min_char = T(0); 56 | T min_rank = std::numeric_limits::max(); 57 | for (auto s : kmers) { 58 | T hash = this->hash(si, s); 59 | if (hash < min_rank) { 60 | min_rank = hash; 61 | min_char = s; 62 | } 63 | } 64 | sketch[si] = min_char; 65 | } 66 | return sketch; 67 | } 68 | 69 | /** 70 | * Computes the min-hash sketch for the given sequence. 
71 | * @param sequence the sequence to compute the min-hash for 72 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 73 | * hashed 74 | * @param number of characters in the alphabet over which sequence is defined 75 | * @return the min-hash sketch of sequence 76 | * @tparam C the type of characters in the sequence 77 | */ 78 | template 79 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 80 | std::vector kmers = seq2kmer(sequence, k, alphabet_size); 81 | return compute(kmers); 82 | } 83 | 84 | static T dist(const std::vector &a, const std::vector &b) { 85 | Timer timer("minhash_dist"); 86 | return hamming_dist(a, b); 87 | } 88 | }; 89 | } // namespace ts 90 | -------------------------------------------------------------------------------- /sketch/hash_ordered.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace ts { // ts = Tensor Sketch 12 | 13 | /** 14 | * Naive implementation of the Ordered MinHash sketching method described in: 15 | * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6612865/ 16 | * 17 | * @tparam T the type of element in the sequences to be sketched 18 | */ 19 | template 20 | class OrderedMinHash : public HashBase { 21 | public: 22 | /** 23 | * @param set_size the number of elements in S 24 | * @param sketch_dim the number of components (elements) in the sketch vector. 25 | * @param max_len maximum sequence length to be hashed. 26 | * @param tup_len the sketching will select the tup_len lowest values for each hash function 27 | * @param seed the seed to initialize the random number generator used for the random hash 28 | * functions. 29 | */ 30 | OrderedMinHash(T set_size, 31 | size_t sketch_dim, 32 | size_t max_len, 33 | size_t tup_len, 34 | HashAlgorithm hash_algorithm, 35 | uint32_t seed, 36 | const std::string &name = "OMH", 37 | size_t kmer_size = 1) 38 | : HashBase(set_size, sketch_dim, set_size * max_len, hash_algorithm, seed, name, kmer_size), 39 | max_len(max_len), 40 | tup_len(tup_len) {} 41 | 42 | Vec2D compute_2d(const std::vector &kmers) { 43 | Vec2D sketch(this->sketch_dim); 44 | if (kmers.size() < tup_len) { 45 | throw std::invalid_argument("Sequence of kmers must be longer than tuple length"); 46 | } 47 | for (size_t pi = 0; pi < this->sketch_dim; pi++) { 48 | std::unordered_map counts; 49 | std::vector> ranks; 50 | for (size_t i = 0; i < kmers.size(); i++) { 51 | auto s = kmers[i]; 52 | ranks.push_back({ this->hash(pi, s + this->set_size * counts[s]), i }); 53 | counts[s]++; 54 | #ifndef NDEBUG 55 | assert(counts[s] != 0); // no overflow 56 | if (counts[s] > max_len) { 57 | throw std::invalid_argument("Kmer " + std::to_string(s) + " repeats more than " 58 | + std::to_string(max_len) 59 | + " times. 
Set --max_len to a higher value."); 60 | } 61 | #endif 62 | } 63 | std::sort(ranks.begin(), ranks.end()); 64 | std::vector tup; 65 | for (auto pair = ranks.begin(); pair != ranks.end() && pair != ranks.begin() + tup_len; 66 | pair++) { 67 | tup.push_back(pair->second); 68 | } 69 | std::sort(tup.begin(), tup.end()); // sort indices of kmers 70 | for (auto idx : tup) 71 | sketch[pi].push_back(kmers[idx]); 72 | } 73 | return sketch; 74 | } 75 | 76 | std::vector compute(const std::vector &kmers) { 77 | Timer timer("ordered_minhash"); 78 | std::vector sketch; 79 | 80 | Vec2D sketch2D = compute_2d(kmers); 81 | for (const auto &tuple : sketch2D) { 82 | T sum = 0; 83 | for (const auto &item : tuple) { 84 | sum = sum * this->set_size + item; // TODO: deal with overflows 85 | } 86 | sketch.push_back(sum); 87 | } 88 | 89 | return sketch; 90 | } 91 | 92 | /** 93 | * Computes the ordered min-hash sketch for the given sequence. 94 | * @param sequence the sequence to compute the ordered min-hash for 95 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 96 | * hashed 97 | * @param number of characters in the alphabet over which sequence is defined 98 | * @return the ordered min-hash sketch of sequence 99 | * @tparam C the type of characters in the sequence 100 | */ 101 | template 102 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 103 | return compute(seq2kmer(sequence, k, alphabet_size)); 104 | } 105 | 106 | static T dist(const std::vector &a, const std::vector &b) { 107 | Timer timer("ordered_minhash_dist"); 108 | return hamming_dist(a, b); 109 | } 110 | 111 | private: 112 | size_t max_len; 113 | size_t tup_len; 114 | }; 115 | 116 | } // namespace ts 117 | -------------------------------------------------------------------------------- /sketch/hash_weighted.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Naive implementation of weighted min-hash sketching. For more efficient implementations, see 16 | * https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf and 17 | * https://www.microsoft.com/en-us/research/wp-content/uploads/2010/06/ConsistentWeightedSampling2.pdf 18 | * 19 | * Given a set S, and a sequence s=s1...sn with elements from S, this class computes a vector 20 | * {hmin_1(s), hmin_2(s), ..., hmin_sketch_size(s)}, where hmin_k(s)=s_i, such that h_k(s_i, #s_i) 21 | * is the smallest of h_k(s_1, 1..#s_1), h_k(s_2, 1..#s_2), ..., h_k(s_n, 1..#s_n) and 22 | * h_k:Sx{1..n} -> {1..#set_size} is a random permuation of the elements in S and #s_i denotes the 23 | * number of occurences of s_i in the sequence s. 24 | * @tparam T the type of S's elements 25 | */ 26 | template 27 | class WeightedMinHash : public HashBase { 28 | public: 29 | /** 30 | * Constructs a weighted min-hasher for the given alphabet size which constructs sketches of the 31 | * given set size, dimension and maximum length. 32 | * @param set_size the number of elements in S, 33 | * @param sketch_dim the number of components (elements) in the sketch vector. 34 | * @param max_len maximum sequence length to be hashed. 35 | * @param seed the seed to initialize the random number generator used for the random hash 36 | * functions. 
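     * For an illustrative setting (not project defaults): with set_size=16
     * (2-mers over DNA4) and max_len=128, each of the sketch_dim hash functions
     * ranks up to 16*128 = 2048 distinct (k-mer, occurrence-index) keys.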
37 | */ 38 | WeightedMinHash(T set_size, 39 | size_t sketch_dim, 40 | size_t max_len, 41 | HashAlgorithm hash_algorithm, 42 | uint32_t seed, 43 | const std::string &name = "WMH", 44 | size_t kmer_size = 1) 45 | : HashBase(set_size, sketch_dim, max_len * set_size, hash_algorithm, seed, name, kmer_size), 46 | max_len(max_len) {} 47 | 48 | std::vector compute(const std::vector &kmers) { 49 | Timer timer("weighted_minhash"); 50 | std::vector sketch = std::vector(this->sketch_dim); 51 | if (kmers.empty()) { 52 | return sketch; 53 | } 54 | 55 | for (size_t si = 0; si < this->sketch_dim; si++) { 56 | T min_char = T(0); 57 | T min_rank = std::numeric_limits::max(); 58 | std::unordered_map cnts; 59 | for (const auto s : kmers) { 60 | T r = this->hash(si, s + cnts[s] * this->set_size); 61 | cnts[s]++; 62 | #ifndef NDEBUG 63 | assert(cnts[s] != 0); // no overflow 64 | if (cnts[s] > max_len) { 65 | throw std::invalid_argument("Kmer " + std::to_string(s) + " repeats more than " 66 | + std::to_string(max_len) 67 | + " times. Set --max_len to a higher value."); 68 | } 69 | #endif 70 | 71 | if (r < min_rank) { 72 | min_rank = r; 73 | min_char = s; 74 | } 75 | } 76 | sketch[si] = min_char; 77 | } 78 | return sketch; 79 | } 80 | 81 | /** 82 | * Computes the ordered min-hash sketch for the given sequence. 83 | * @param sequence the sequence to compute the ordered min-hash for 84 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 85 | * hashed 86 | * @param number of characters in the alphabet over which sequence is defined 87 | * @return the ordered min-hash sketch of #sequence 88 | * @tparam C the type of characters in #sequence 89 | */ 90 | template 91 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 92 | std::vector kmers = seq2kmer(sequence, k, alphabet_size); 93 | std::vector sketch = compute(kmers); 94 | return sketch; 95 | } 96 | 97 | static T dist(const std::vector &a, const std::vector &b) { 98 | Timer timer("weighted_minhash_dist"); 99 | return hamming_dist(a, b); 100 | } 101 | 102 | private: 103 | size_t max_len; 104 | }; 105 | 106 | } // namespace ts 107 | -------------------------------------------------------------------------------- /sketch/sketch_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | namespace ts { 7 | 8 | /** 9 | * A base class for sketch algorithms. 10 | * 11 | * @tparam SketchType the type returned by the compute() function. 12 | * @tparam KmerInput is true when the hash algorithm first should convert the sequence to kmers. 13 | * */ 14 | template 15 | class SketchBase { 16 | public: 17 | // The type that the compute function returns. 18 | using sketch_type = SketchType; 19 | 20 | // Whether the compute function takes a list of kmers. 21 | constexpr static bool kmer_input = KmerInput; 22 | 23 | // Whether transformations should be applied to the sketch output of this algorithm. 24 | constexpr static bool transform_sketches = false; 25 | 26 | // The name of the sketching algorithm. 27 | const std::string name; 28 | 29 | // If kmer_input=true, value of kmer_size 30 | size_t kmer_size = 1; 31 | 32 | explicit SketchBase(std::string name, size_t kmer_size = 1) : name(std::move(name)), kmer_size(kmer_size) {} 33 | 34 | // Must be overridden by implementations. 35 | // Calling it will initialize the random hashes, overwriting any previous hash functions. 
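// (Sketches without random state, e.g. TensorEmbedding, can override this with a no-op; see tensor_embedding.hpp.)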
36 | // May be called multiple times on the same object to reset the state before running it on a new 37 | // set of sequences. 38 | void init() { static_assert(!sizeof(SketchType *), "Sketch type should implement init()."); } 39 | }; 40 | 41 | } // namespace ts 42 | -------------------------------------------------------------------------------- /sketch/tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sketch//sketch_base.hpp" 6 | #include "util/multivec.hpp" 7 | #include "util/timer.hpp" 8 | #include "util/utils.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace ts { // ts = Tensor Sketch 15 | 16 | /** 17 | * Computes tensor sketches for a given sequence as described in 18 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 19 | * @tparam seq_type the type of elements in the sequences to be sketched. 20 | */ 21 | template 22 | class Tensor : public SketchBase, false> { 23 | public: 24 | // Tensor sketch output should be transformed if the command line flag is set. 25 | constexpr static bool transform_sketches = false; 26 | 27 | /** 28 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 29 | * defined (e.g. 4 for DNA) 30 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 31 | * @param subsequence_len the length of the subsequences considered for sketching, denoted by t 32 | * in the paper 33 | * @param seed the seed to initialize the random number generator used for the random hash 34 | * functions. 35 | */ 36 | Tensor(seq_type alphabet_size, 37 | size_t sketch_dim, 38 | size_t subsequence_len, 39 | uint32_t seed, 40 | const std::string &name = "TS") 41 | : SketchBase, false>(name), 42 | alphabet_size(alphabet_size), 43 | sketch_dim(sketch_dim), 44 | subsequence_len(subsequence_len), 45 | rng(seed) { 46 | init(); 47 | } 48 | 49 | void init() { 50 | hashes = new2D(subsequence_len, alphabet_size); 51 | signs = new2D(subsequence_len, alphabet_size); 52 | 53 | std::uniform_int_distribution rand_hash2(0, sketch_dim - 1); 54 | std::uniform_int_distribution rand_bool(0, 1); 55 | 56 | for (size_t h = 0; h < subsequence_len; h++) { 57 | for (size_t c = 0; c < alphabet_size; c++) { 58 | hashes[h][c] = rand_hash2(rng); 59 | signs[h][c] = rand_bool(rng); 60 | } 61 | } 62 | } 63 | 64 | /** 65 | * Computes the sketch of the given sequence. 66 | * @param seq the sequence to be sketched 67 | * @return an array of size #sketch_dim containing the sequence's sketch 68 | */ 69 | std::vector compute(const std::vector &seq) { 70 | Timer timer("tensor_sketch"); 71 | // Tp corresponds to T+, Tm to T- in the paper; Tp[0], Tm[0] are sentinels and contain the 72 | // initial condition for empty strings; Tp[p], Tm[p] represent the partial sketch when 73 | // considering hashes h1...hp, over the prefix x1...xi. The final result is then 74 | // Tp[t]-Tm[t], where t is #sequence_len 75 | auto Tp = new2D(subsequence_len + 1, sketch_dim, 0); 76 | auto Tm = new2D(subsequence_len + 1, sketch_dim, 0); 77 | 78 | // the initial condition states that the sketch for the empty string is (1,0,..) 
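// (Tp[0][0] = 1 below encodes that initial condition.) The subsequent loop implements the
// recurrence via shift_sum_inplace: with z = p / (i + 1), r = hashes[p-1][c] and
// s = signs[p-1][c],
//   Tp[p] <- (1 - z) * Tp[p] + z * shift_r(Tp[p-1]),   and likewise for Tm[p],
// with the roles of Tp[p-1] and Tm[p-1] swapped whenever signs[p-1][c] is false.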
79 | Tp[0][0] = 1; 80 | for (uint32_t i = 0; i < seq.size(); i++) { 81 | const seq_type c = seq[i]; 82 | if (c < 0 or c >= alphabet_size) { 83 | continue; 84 | } 85 | // must traverse in reverse order, to avoid overwriting the values of Tp and Tm before 86 | // they are used in the recurrence 87 | for (uint32_t p = std::min(i + 1, (uint32_t)subsequence_len); p >= 1; --p) { 88 | const double z = p / (i + 1.0); // probability that the last index is i 89 | const seq_type r = hashes[p - 1][c]; 90 | const bool s = signs[p - 1][c]; 91 | if (s) { 92 | this->shift_sum_inplace(Tp[p], Tp[p - 1], r, z); 93 | this->shift_sum_inplace(Tm[p], Tm[p - 1], r, z); 94 | } else { 95 | this->shift_sum_inplace(Tp[p], Tm[p - 1], r, z); 96 | this->shift_sum_inplace(Tm[p], Tp[p - 1], r, z); 97 | } 98 | } 99 | } 100 | std::vector sketch(sketch_dim, 0); 101 | for (uint32_t m = 0; m < sketch_dim; m++) { 102 | sketch[m] = Tp[subsequence_len][m] - Tm[subsequence_len][m]; 103 | } 104 | 105 | return sketch; 106 | } 107 | 108 | /** Sets the hash and sign functions to predetermined values for testing */ 109 | void set_hashes_for_testing(const Vec2D &h, const Vec2D &s) { 110 | hashes = h; 111 | signs = s; 112 | } 113 | 114 | static double dist(const std::vector &a, const std::vector &b) { 115 | Timer timer("tensor_sketch_dist"); 116 | return l2_dist(a, b); 117 | } 118 | 119 | protected: 120 | /** Computes (1-z)*a + z*b_shift */ 121 | void shift_sum_inplace(std::vector &a, 122 | const std::vector &b, 123 | seq_type shift, 124 | double z) { 125 | assert(a.size() == b.size()); 126 | size_t len = a.size(); 127 | for (uint32_t i = 0; i < len; i++) { 128 | a[i] = (1 - z) * a[i] + z * b[(len + i - shift) % len]; 129 | assert(a[i] <= 1 + 1e-5 && a[i] >= -1e-5); 130 | } 131 | } 132 | 133 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 4 for DNA */ 134 | seq_type alphabet_size; 135 | /** Number of elements in the sketch, denoted by D in the paper */ 136 | uint32_t sketch_dim; 137 | /** The length of the subsequences considered for sketching, denoted by t in the paper */ 138 | uint8_t subsequence_len; 139 | 140 | /** 141 | * Denotes the hash functions h1,....ht:A->{1....D}, where t is #subsequence_len and D is 142 | * #sketch_dim 143 | */ 144 | Vec2D hashes; 145 | 146 | /** The sign functions s1...st:A->{-1,1} */ 147 | Vec2D signs; 148 | 149 | std::mt19937 rng; 150 | }; 151 | 152 | } // namespace ts 153 | -------------------------------------------------------------------------------- /sketch/tensor_block.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sketch//sketch_base.hpp" 6 | #include "util/multivec.hpp" 7 | #include "util/timer.hpp" 8 | #include "util/utils.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace ts { // ts = Tensor Sketch 16 | 17 | /** 18 | * Computes tensor sketches for a given sequence as described in 19 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1, with the additional limitation that 20 | * the subsequences must be made of continuous blocks of a certain size. The adaptation of the 21 | * recurrence formula for this case is at https://go.grlab.org/tensor_block. 22 | * For block_size=1, the normal the #TensorBlock sketch is identical with #Tensor sketch. 23 | * @tparam seq_type the type of elements in the sequences to be sketched. 
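*
* Illustrative usage (a sketch only; parameter values are hypothetical, and block_size must
* divide subsequence_len, as asserted in the constructor):
*   TensorBlock<uint8_t> tsb(/*alphabet_size=*/4, /*sketch_dim=*/16,
*                            /*subsequence_len=*/6, /*block_size=*/2, /*seed=*/31415);
*   std::vector<double> sketch = tsb.compute(encoded_dna); // values in [0, alphabet_size)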
24 | */ 25 | template 26 | class TensorBlock : public SketchBase, false> { 27 | public: 28 | // Tensor sketch output should be transformed if the command line flag is set. 29 | constexpr static bool transform_sketches = false; 30 | 31 | /** 32 | * @param block_size only subsequences formed out of block_size continuous elements are sketched 33 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 34 | * defined (e.g. 4 for DNA, 20 for protein, etc.) 35 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 36 | * @param subsequence_len the length of the subsequences considered for sketching, denoted by t 37 | * in the paper 38 | * @param seed the seed to initialize the random number generator used for the random hash 39 | * functions. 40 | */ 41 | TensorBlock(seq_type alphabet_size, 42 | size_t sketch_dim, 43 | size_t subsequence_len, 44 | uint8_t block_size, 45 | uint32_t seed, 46 | const std::string &name = "TSB") 47 | : SketchBase, false>(name), 48 | block_size(block_size), 49 | alphabet_size(alphabet_size), 50 | sketch_dim(sketch_dim), 51 | subsequence_len(subsequence_len), 52 | rng(seed) { 53 | assert(block_size > 0 && subsequence_len > 0 && subsequence_len % block_size == 0); 54 | init(); 55 | } 56 | 57 | void init() { 58 | hashes = new2D(subsequence_len, alphabet_size); 59 | signs = new2D(subsequence_len, alphabet_size); 60 | 61 | std::uniform_int_distribution rand_hash2(0, sketch_dim - 1); 62 | std::uniform_int_distribution rand_bool(0, 1); 63 | 64 | for (size_t h = 0; h < subsequence_len; h++) { 65 | for (size_t c = 0; c < alphabet_size; c++) { 66 | hashes[h][c] = rand_hash2(rng); 67 | signs[h][c] = rand_bool(rng); 68 | } 69 | } 70 | } 71 | 72 | /** 73 | * Computes the sketch of the given sequence. 74 | * @param seq the sequence to be sketched 75 | * @return an array of size #sketch_dim containing the sequence's sketch 76 | */ 77 | std::vector compute(const std::vector &seq) { 78 | Timer timer("tensor_sketch"); 79 | // Tp corresponds to T+, Tm to T- in the paper; Tp[0], Tm[0] are sentinels and contain the 80 | // initial condition for empty strings; Tp[p], Tm[p] at step i represent the partial sketch 81 | // when considering hashes h1...hp, over the prefix x1...xi. The final result is then 82 | // Tp[t]-Tm[t], where t is #sequence_len 83 | // since the recurrence formula references the T_[1:N-k], i.e. the element situated 84 | // k=block_size positions behind, we need to always keep the last block_size Tp and Tm 85 | // matrices. At each iteration we create a new pair of Tp and Tm and then discard the oldest 86 | // Tp/Tn pair. 87 | // TODO(ddanciu): use a circular queue on top of vector instead 88 | std::deque> Tp; 89 | std::deque> Tm; 90 | 91 | // The number of blocks. 92 | uint32_t m = subsequence_len / block_size; 93 | 94 | for (uint32_t i = 0; i < block_size; ++i) { 95 | Tp.push_back(new2D(m + 1, sketch_dim, 0)); 96 | Tm.push_back(new2D(m + 1, sketch_dim, 0)); 97 | // the initial condition states that the sketch for the empty string is (1,0,..) 
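// (Tp.back()[0][0] = 1 below seeds each of the block_size buffered Tp matrices with that
// empty-string state. In the main loop, Tp[0]/Tm[0], i.e. the state from block_size steps
// ago, supply the T[bc-1] terms of the recurrence, while Tp.back()/Tm.back(), the previous
// time step, carry the (1 - z) * T[bc] term.)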
98 | Tp.back()[0][0] = 1; 99 | } 100 | 101 | // the are the "new" Tp and Tm, computed at every iteration and appended to Tp and Tm 102 | auto nTp = new2D(m + 1, sketch_dim, 0); 103 | auto nTm = new2D(m + 1, sketch_dim, 0); 104 | for (uint32_t i = block_size - 1; i < seq.size(); i++) { 105 | uint32_t block_count = std::min(m, (i + 1) / block_size); 106 | // must traverse in reverse order, to avoid overwriting the values of Tp and Tm before 107 | // they are used in the recurrence 108 | // p must be a multiple of block_size 109 | for (uint32_t bc = block_count; bc > 0; bc--) { 110 | uint32_t p = bc * block_size; 111 | double z = bc / (i + 1.0 - p + bc); // probability that the last index is i 112 | seq_type r = 0; 113 | bool s = true; 114 | for (uint32_t j = 0; j < block_size; ++j) { 115 | r += hashes[p - j - 1][seq[i - j]]; 116 | s = s == signs[p - j - 1][seq[i - j]]; 117 | } 118 | r %= sketch_dim; 119 | if (s) { 120 | nTp[bc] = this->shift_sum(Tp.back()[bc], Tp[0][bc - 1], r, z); 121 | nTm[bc] = this->shift_sum(Tm.back()[bc], Tm[0][bc - 1], r, z); 122 | } else { 123 | nTp[bc] = this->shift_sum(Tp.back()[bc], Tm[0][bc - 1], r, z); 124 | nTm[bc] = this->shift_sum(Tm.back()[bc], Tp[0][bc - 1], r, z); 125 | } 126 | } 127 | nTp[0][0] = 1; 128 | Tp.push_back(std::move(nTp)); 129 | Tm.push_back(std::move(nTm)); 130 | nTp = std::move(Tp.front()); 131 | nTm = std::move(Tm.front()); 132 | for (uint32_t j = 0; j < m + 1; ++j) { 133 | std::fill(nTp[j].begin(), nTp[j].end(), 0); 134 | std::fill(nTm[j].begin(), nTm[j].end(), 0); 135 | } 136 | Tp.pop_front(); 137 | Tm.pop_front(); 138 | } 139 | std::vector sketch(sketch_dim, 0); 140 | for (uint32_t l = 0; l < sketch_dim; l++) { 141 | sketch[l] = Tp.back()[m][l] - Tm.back()[m][l]; 142 | } 143 | 144 | return sketch; 145 | } 146 | 147 | /** Sets the hash and sign functions to predetermined values for testing */ 148 | void set_hashes_for_testing(const Vec2D &h, const Vec2D &s) { 149 | hashes = h; 150 | signs = s; 151 | } 152 | 153 | static double dist(const std::vector &a, const std::vector &b) { 154 | Timer timer("tensor_sketch_dist"); 155 | return l2_dist(a, b); 156 | } 157 | 158 | protected: 159 | /** Computes (1-z)*a + z*b_shift */ 160 | inline std::vector shift_sum(const std::vector &a, 161 | const std::vector &b, 162 | seq_type shift, 163 | double z) { 164 | assert(a.size() == b.size()); 165 | size_t len = a.size(); 166 | std::vector result(a.size()); 167 | for (uint32_t i = 0; i < a.size(); i++) { 168 | result[i] = (1 - z) * a[i] + z * b[(len + i - shift) % len]; 169 | assert(result[i] <= 1 + 1e-5 && result[i] >= -1e-5); 170 | } 171 | return result; 172 | } 173 | 174 | /** The size of the block each subsequence is made of. Must be a divisor of #subsequence_len. 175 | * A block size of 1 means that the class is operating on a character basis. */ 176 | uint8_t block_size; 177 | 178 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 
4 for DNA */ 179 | seq_type alphabet_size; 180 | /** Number of elements in the sketch, denoted by D in the paper */ 181 | uint32_t sketch_dim; 182 | /** The length of the subsequences considered for sketching, denoted by t in the paper */ 183 | uint8_t subsequence_len; 184 | 185 | /** 186 | * Denotes the hash functions h1,....ht:A->{1....D}, where t is #subsequence_len and D is 187 | * #sketch_dim 188 | */ 189 | Vec2D hashes; 190 | 191 | /** The sign functions s1...st:A->{-1,1} */ 192 | Vec2D signs; 193 | 194 | std::mt19937 rng; 195 | }; 196 | 197 | } // namespace ts 198 | -------------------------------------------------------------------------------- /sketch/tensor_embedding.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sequence/alphabets.hpp" 6 | #include "sketch//sketch_base.hpp" 7 | #include "util/multivec.hpp" 8 | #include "util/timer.hpp" 9 | #include "util/utils.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace ts { // ts = Tensor Sketch 17 | 18 | /** 19 | * Computes the tensor of subsequence counts for a given sequence as described in 20 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1. 21 | * In contrast with TensorSketch, this class does not do any sketching and preserves the full 22 | * tensor. 23 | * @tparam seq_type the type of elements in the sequences to be sketched. 24 | */ 25 | template 26 | class TensorEmbedding : public SketchBase, false> { 27 | public: 28 | /** 29 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 30 | * defined (e.g. 4 for DNA) 31 | * @param normalize when true the counts will be normalized to relative frequencies with sum 1. 32 | */ 33 | TensorEmbedding(seq_type alphabet_size, 34 | uint32_t t, 35 | const std::string &name = "Tensor", 36 | bool normalize = true) 37 | : SketchBase, false>(name), 38 | alphabet_size(alphabet_size), 39 | t(t), 40 | normalize(normalize) {} 41 | 42 | void init() {} 43 | 44 | /** 45 | * Computes the sketch of the given sequence. 46 | * @param seq the sequence to be sketched 47 | * @return an array of size alphabet_size^t containing the sequence's sketch 48 | */ 49 | std::vector compute(const std::vector &seq) { 50 | // ts[i] contains the counts for subsequences of length i. 51 | Vec2D ts(t + 1); 52 | { 53 | uint32_t num_tmers = 1; 54 | for (auto &t : ts) { 55 | t.resize(num_tmers); 56 | num_tmers *= alphabet_size; 57 | } 58 | } 59 | 60 | // The base case is the one empty sequence. 61 | ts[0][0] = 1; 62 | 63 | // The number of successfully read nucleotides. 64 | int32_t length = 0; 65 | for (auto s : seq) { 66 | // TODO(ragnar): Figure out a nice way to deal with uncertain reads. 
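// Characters outside [0, alphabet_size) are skipped. For valid characters, the update
// below extends every already-counted subsequence of length i by this character:
// the i-mer encoded by index j becomes the (i+1)-mer encoded by alphabet_size * j + s.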
67 | if (s < 0 || s >= alphabet_size) 68 | continue; 69 | length += 1; 70 | for (int32_t i = static_cast(t) - 1; i >= 0; --i) { 71 | for (size_t j = 0; j < ts[i].size(); ++j) { 72 | ts[i + 1][alphabet_size * j + s] += ts[i][j]; 73 | } 74 | } 75 | } 76 | 77 | if (normalize) { 78 | double nchooset = 1; 79 | for (uint32_t i = 0; i < t; ++i) { 80 | nchooset = nchooset * (length - i) / (i + 1); 81 | } 82 | 83 | for (auto &c : ts.back()) { 84 | c /= nchooset; 85 | } 86 | } 87 | 88 | return std::move(ts.back()); 89 | } 90 | 91 | static double dist(const std::vector &a, const std::vector &b) { 92 | Timer timer("full_tensor_dist"); 93 | return l2_dist(a, b); 94 | } 95 | 96 | protected: 97 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 4 for DNA. */ 98 | const seq_type alphabet_size; 99 | 100 | /** The length of the subsequences considered for sketching. */ 101 | const uint32_t t; 102 | 103 | /** Whether to normalize the counts to relative frequencies with sum 1. */ 104 | const bool normalize; 105 | }; 106 | 107 | } // namespace ts 108 | -------------------------------------------------------------------------------- /sketch/tensor_slide.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tensor.hpp" 4 | 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | #include 9 | 10 | namespace ts { 11 | /** 12 | * Computes sliding tensor sketches for a given sequence as described in 13 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 14 | * @tparam seq_type the type of elements in the sequences to be sketched. 15 | */ 16 | template 17 | class TensorSlide : public Tensor { 18 | public: 19 | using sketch_type = Vec2D; 20 | 21 | /** 22 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 23 | * defined (e.g. 4 for DNA) 24 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 25 | * @param tup_len the length of the subsequences considered for sketching, denoted by t 26 | * in the paper 27 | * @param win_len sliding sketches are computed for substrings of size win_len 28 | * @param stride sliding sketches are computed every stride characters 29 | * @param seed the seed to initialize the random number generator used for the random hash 30 | * functions. 31 | * @param name the name of the algorithm in the output 32 | */ 33 | TensorSlide(seq_type alphabet_size, 34 | size_t sketch_dim, 35 | size_t tup_len, 36 | size_t win_len, 37 | size_t stride, 38 | uint32_t seed, 39 | const std::string &name = "TSS") 40 | : Tensor(alphabet_size, sketch_dim, tup_len, seed, name), 41 | win_len(win_len), 42 | stride(stride) { 43 | assert(stride <= win_len && "Stride cannot be larger than the window length"); 44 | assert(tup_len <= win_len && "Tuple length (t) cannot be larger than the window length"); 45 | } 46 | 47 | /** 48 | * Computes sliding sketches for the given sequence. 49 | * A sketch is computed every #stride characters on substrings of length #window. 
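* For example, with win_len = 32 and stride = 8 (the defaults used in sketch_main.cpp), a
* sketch is emitted after every 8th character, each covering at most the last 32 characters.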
50 | * @return seq.size()/stride sketches of size #sketch_dim 51 | */ 52 | Vec2D compute(const std::vector &seq) { 53 | Timer timer("tensor_slide_sketch"); 54 | Vec2D sketches; 55 | if (seq.size() < this->subsequence_len) { 56 | return new2D(seq.size() / this->stride, this->sketch_dim, double(0)); 57 | } 58 | auto &hashes = this->hashes; 59 | auto &signs = this->signs; 60 | auto tup_len = this->subsequence_len; 61 | // first index: p; second index: q; third index: r 62 | // p,q go from 1 to tup_len; p==0 and p==tup_len+1 are sentinels for termination condition 63 | auto T1 = new3D(tup_len + 2, tup_len + 1, this->sketch_dim, 0); 64 | auto T2 = new3D(tup_len + 2, tup_len + 1, this->sketch_dim, 0); 65 | 66 | for (uint32_t p = 0; p <= tup_len; p++) { 67 | T1[p + 1][p][0] = 1; 68 | } 69 | 70 | // T[p][q] at step i represents the sketch for seq[i-w+1]...seq[i] when only using hash 71 | // functions 1<=p,p+1,...q<=t, where t is the sketch size 72 | for (uint32_t i = 0; i < seq.size(); i++) { 73 | for (uint32_t p = 1; p <= tup_len; p++) { 74 | // q-p must be smaller than i, hence the min in the condition 75 | for (uint32_t q = std::min(p + i, (uint32_t)tup_len); q >= p; q--) { 76 | double z = (double)(q - p + 1) / std::min(i + 1, win_len + 1); 77 | auto r = hashes[q - 1][seq[i]]; 78 | bool s = signs[q - 1][seq[i]]; 79 | if (s) { 80 | this->shift_sum_inplace(T1[p][q], T1[p][q - 1], r, z); 81 | this->shift_sum_inplace(T2[p][q], T2[p][q - 1], r, z); 82 | } else { 83 | this->shift_sum_inplace(T1[p][q], T2[p][q - 1], r, z); 84 | this->shift_sum_inplace(T2[p][q], T1[p][q - 1], r, z); 85 | } 86 | } 87 | } 88 | 89 | if (i >= win_len) { // only start deleting from front after reaching #win_len 90 | uint32_t ws = i - win_len; // the element to be removed from the sketch 91 | for (uint32_t diff = 0; diff < tup_len; ++diff) { 92 | for (uint32_t p = 1; p <= tup_len - diff; p++) { 93 | auto r = hashes[p - 1][seq[ws]]; 94 | bool s = signs[p - 1][seq[ws]]; 95 | uint32_t q = p + diff; 96 | // this computes t/(w-t); in our case t (the tuple length) is diff+1 97 | double z = (double)(diff + 1) / (win_len - diff); 98 | if (s) { 99 | this->shift_sum_inplace(T1[p][q], T1[p + 1][q], r, -z); 100 | this->shift_sum_inplace(T2[p][q], T2[p + 1][q], r, -z); 101 | } else { 102 | this->shift_sum_inplace(T1[p][q], T2[p + 1][q], r, -z); 103 | this->shift_sum_inplace(T2[p][q], T1[p + 1][q], r, -z); 104 | } 105 | } 106 | } 107 | } 108 | 109 | if ((i + 1) % stride == 0) { // save a sketch every stride times 110 | sketches.push_back(diff(T1[1].back(), T2[1].back())); 111 | } 112 | } 113 | return sketches; 114 | } 115 | 116 | double dist(const Vec2D &a, const Vec2D &b) { 117 | Timer timer("tensor_slide_sketch_dist"); 118 | return l2_dist2D_minlen(a, b); 119 | } 120 | 121 | 122 | private: 123 | std::vector diff(const std::vector &a, const std::vector &b) { 124 | assert(a.size() == b.size()); 125 | std::vector result(a.size()); 126 | for (uint32_t i = 0; i < result.size(); ++i) { 127 | result[i] = a[i] - b[i]; 128 | } 129 | return result; 130 | } 131 | 132 | uint32_t win_len; 133 | uint32_t stride; 134 | }; 135 | 136 | } // namespace ts 137 | -------------------------------------------------------------------------------- /sketch/tensor_slide_flat.hpp: -------------------------------------------------------------------------------- 1 | #include "sketch/tensor_slide.hpp" 2 | 3 | namespace ts { 4 | 5 | /** 6 | * A wrapper class around TensorSlide that flattens the output of TensorSlide::compute() from a 2D 7 | * vector to a 1D vector. 
The Flattener type must have a .flatten method that does the conversion. 8 | * Typically used with a class form sketch/dim_reduce.h. 9 | * 10 | * @tparam seq_type the type of elements in the sequences to be sketched. 11 | * @tparam Flattener: one of the classes from util/dim_reduce.h. 12 | */ 13 | template 14 | class TensorSlideFlat : public TensorSlide { 15 | Flattener flattener; 16 | 17 | public: 18 | using sketch_type = typename Flattener::sketch_type; 19 | 20 | /** 21 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 22 | * defined (e.g. 4 for DNA) 23 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 24 | * @param tup_len the length of the subsequences considered for sketching, denoted by t 25 | * in the paper 26 | * @param win_len sliding sketches are computed for substrings of size win_len 27 | * @param stride sliding sketches are computed every stride characters 28 | * @param flattener the object to use to flatten the compute output. 29 | * @param seed the seed to initialize the random number generator used for the random hash 30 | * functions. 31 | */ 32 | TensorSlideFlat(seq_type alphabet_size, 33 | size_t sketch_dim, 34 | size_t tup_len, 35 | size_t win_len, 36 | size_t stride, 37 | Flattener flattener, 38 | uint32_t seed, 39 | const std::string &name = "TSS") 40 | : TensorSlide(alphabet_size, sketch_dim, tup_len, win_len, stride, seed, name), 41 | flattener(flattener) {} 42 | 43 | 44 | /** 45 | * Computes sliding sketches for the given sequence. 46 | * A sketch is computed every #stride characters on substrings of length #window. 47 | * @return seq.size()/stride sketches of size #sketch_dim 48 | */ 49 | sketch_type compute(const std::vector &seq) { 50 | return flattener.flatten(TensorSlide::compute(seq)); 51 | } 52 | 53 | static double dist(const sketch_type &a, const sketch_type &b) { return Flattener::dist(a, b); } 54 | }; 55 | 56 | } // namespace ts 57 | -------------------------------------------------------------------------------- /sketch_main.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence/alphabets.hpp" 2 | #include "sequence/fasta_io.hpp" 3 | #include "sketch/edit_distance.hpp" 4 | #include "sketch/hash_base.hpp" 5 | #include "sketch/hash_min.hpp" 6 | #include "sketch/hash_ordered.hpp" 7 | #include "sketch/hash_weighted.hpp" 8 | #include "sketch/tensor.hpp" 9 | #include "sketch/tensor_block.hpp" 10 | #include "sketch/tensor_embedding.hpp" 11 | #include "sketch/tensor_slide.hpp" 12 | #include "util/multivec.hpp" 13 | #include "util/progress.hpp" 14 | #include "util/utils.hpp" 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace ts; 26 | 27 | DEFINE_string(alphabet, 28 | "dna4", 29 | "The alphabet over which sequences are defined (dna4, dna5, protein)"); 30 | 31 | DEFINE_string(sketch_method, "TSS", "The sketching method to use: MH, WMH, OMH, TS, TSB or TSS"); 32 | DEFINE_string(m, "TSS", "Short hand for --sketch_method"); 33 | 34 | DEFINE_uint32(kmer_length, 1, "The kmer length for: MH, WMH, OMH"); 35 | DEFINE_uint32(k, 3, "Short hand for --kmer_length"); 36 | 37 | DEFINE_string(o, "", "Output file, containing the sketches for each sequence"); 38 | 39 | DEFINE_string(i, 40 | "", 41 | "Input file or directory, containing the sequences to be sketched in .fa format"); 42 | 43 | DEFINE_int32(embed_dim, 4, "Embedding dimension, used for all sketching 
methods"); 44 | 45 | DEFINE_int32(tuple_length, 46 | 3, 47 | "Ordered tuple length, used in ordered MinHash and Tensor-based sketches"); 48 | DEFINE_int32(t, 3, "Short hand for --tuple_length"); 49 | 50 | static bool ValidateBlockSize(const char *flagname, int32_t value) { 51 | if (FLAGS_tuple_length % value == 0 || FLAGS_t % value == 0) { 52 | return true; 53 | } 54 | printf("Invalid value for --%s: %d. Must be a divisor of --tuple_len\n", flagname, value); 55 | return false; 56 | } 57 | DEFINE_int32(block_size, 58 | 1, 59 | "Only consider tuples made out of block-size continuous characters for Tensor sketch"); 60 | DEFINE_validator(block_size, &ValidateBlockSize); 61 | 62 | DEFINE_int32(window_size, 32, "Window length: the size of sliding window in Tensor Slide Sketch"); 63 | DEFINE_int32(w, 32, "Short hand for --window_size"); 64 | 65 | DEFINE_int32(max_len, 32, "The maximum accepted sequence length for Ordered and Weighted min-hash"); 66 | 67 | DEFINE_int32(stride, 8, "Stride for sliding window: shift step for sliding window"); 68 | DEFINE_int32(s, 8, "Short hand for --stride"); 69 | 70 | static bool ValidateInput(const char * /*unused*/, const std::string &value) { 71 | if (!value.empty()) { 72 | return true; 73 | } 74 | std::cerr << "Please specify a fasta input file using '-i '" << std::endl; 75 | return false; 76 | } 77 | DEFINE_validator(i, &ValidateInput); 78 | 79 | static bool ValidateOutput(const char * /*unused*/, const std::string &value) { 80 | if (value.empty()) { 81 | FLAGS_o = FLAGS_i + "." + FLAGS_sketch_method; 82 | } 83 | return true; 84 | } 85 | DEFINE_validator(o, &ValidateOutput); 86 | 87 | void adjust_short_names() { 88 | if (!gflags::GetCommandLineFlagInfoOrDie("m").is_default) { 89 | FLAGS_sketch_method = FLAGS_m; 90 | } 91 | if (!gflags::GetCommandLineFlagInfoOrDie("k").is_default) { 92 | FLAGS_kmer_length = FLAGS_k; 93 | } 94 | if (!gflags::GetCommandLineFlagInfoOrDie("w").is_default) { 95 | FLAGS_window_size = FLAGS_w; 96 | } 97 | if (!gflags::GetCommandLineFlagInfoOrDie("s").is_default) { 98 | FLAGS_stride = FLAGS_s; 99 | } 100 | if (!gflags::GetCommandLineFlagInfoOrDie("t").is_default) { 101 | FLAGS_tuple_length = FLAGS_t; 102 | } 103 | } 104 | 105 | // Some global constant types. 106 | using seq_type = uint8_t; 107 | 108 | // Run the given sketch method on input specified by the command line arguments, and write a 109 | // triangular distance matrix to the output file. 110 | template 111 | void run_triangle(SketchAlgorithm &algorithm) { 112 | std::cerr << "Reading input .." << std::endl; 113 | std::vector> files = read_directory(FLAGS_i); 114 | std::cerr << "Read " << files.size() << " files" << std::endl; 115 | 116 | const size_t n = files.size(); 117 | 118 | std::vector sketches(n); 119 | 120 | std::cerr << "Sketching .." << std::endl; 121 | progress_bar::init(n); 122 | #pragma omp parallel for default(shared) 123 | for (size_t i = 0; i < n; ++i) { 124 | assert(files[i].sequences.size() == 1 125 | && "Each input file must contain exactly one sequence!"); 126 | if constexpr (SketchAlgorithm::kmer_input) { 127 | sketches[i] 128 | = algorithm.compute(files[i].sequences[0], FLAGS_kmer_length, alphabet_size); 129 | } else { 130 | sketches[i] = algorithm.compute(files[i].sequences[0]); 131 | } 132 | progress_bar::iter(); 133 | } 134 | 135 | std::cerr << "Computing all pairwise distances .." 
<< std::endl; 136 | 137 | std::vector> pairs; 138 | for (size_t i = 0; i < n; ++i) 139 | for (size_t j = 0; j < i; ++j) 140 | pairs.emplace_back(i, j); 141 | 142 | std::vector> distances(n); 143 | for (size_t i = 0; i < n; ++i) 144 | distances[i].resize(i); 145 | 146 | progress_bar::init(n * (n - 1) / 2); 147 | #pragma omp parallel for default(shared) 148 | for (auto it = pairs.begin(); it < pairs.end(); ++it) { // NOLINT 149 | auto [i, j] = *it; 150 | distances[i][j] = algorithm.dist(sketches[i], sketches[j]); 151 | progress_bar::iter(); 152 | } 153 | 154 | std::cerr << "Writing distances triangle to " << FLAGS_o << " .." << std::endl; 155 | std::filesystem::path ofile = std::filesystem::absolute(std::filesystem::path(FLAGS_o)); 156 | 157 | write_output_meta(); 158 | std::ofstream fo(ofile); 159 | if (!fo.is_open()) { 160 | std::cerr << "Could not open " << FLAGS_o << " for writing." << std::endl; 161 | std::exit(1); 162 | } 163 | 164 | // MASH adds an extra tab before the number of lines, so mirror that. 165 | fo << "\t" << n << '\n'; 166 | for (size_t i = 0; i < n; ++i) { 167 | fo << files[i].filename; 168 | for (size_t j = 0; j < i; ++j) 169 | fo << '\t' << distances[i][j]; 170 | fo << '\n'; 171 | } 172 | fo.close(); 173 | }; 174 | 175 | // Runs function f on the sketch method specified by the command line options. 176 | template 177 | void run_function_on_algorithm(F f) { 178 | using kmer_type = uint64_t; 179 | 180 | auto kmer_word_size = int_pow(alphabet_size, FLAGS_kmer_length); 181 | 182 | std::random_device rd; 183 | if (FLAGS_sketch_method == "MH") { 184 | f(MinHash(kmer_word_size, FLAGS_embed_dim, HashAlgorithm::murmur, rd())); 185 | return; 186 | } 187 | if (FLAGS_sketch_method == "WMH") { 188 | f(WeightedMinHash(kmer_word_size, FLAGS_embed_dim, FLAGS_max_len, 189 | HashAlgorithm::murmur, rd())); 190 | return; 191 | } 192 | if (FLAGS_sketch_method == "OMH") { 193 | f(OrderedMinHash(kmer_word_size, FLAGS_embed_dim, FLAGS_max_len, 194 | FLAGS_tuple_length, HashAlgorithm::murmur, rd())); 195 | return; 196 | } 197 | if (FLAGS_sketch_method == "ED") { 198 | f(EditDistance()); 199 | return; 200 | } 201 | if (FLAGS_sketch_method == "TE") { 202 | f(TensorEmbedding(alphabet_size, FLAGS_tuple_length, "TensorEmbedding")); 203 | return; 204 | } 205 | if (FLAGS_sketch_method == "TS") { 206 | f(Tensor(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, rd())); 207 | return; 208 | } 209 | if (FLAGS_sketch_method == "TSB") { 210 | f(TensorBlock(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, 211 | FLAGS_block_size, rd())); 212 | return; 213 | } 214 | if (FLAGS_sketch_method == "TSS") { 215 | f(TensorSlide(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, 216 | FLAGS_window_size, FLAGS_stride, rd())); 217 | return; 218 | } 219 | std::cerr << "Unknown sketch method: " << FLAGS_sketch_method << "\n"; 220 | } 221 | 222 | 223 | int main(int argc, char *argv[]) { 224 | gflags::ParseCommandLineFlags(&argc, &argv, true); 225 | adjust_short_names(); 226 | 227 | init_alphabet(FLAGS_alphabet); 228 | 229 | if (std::pow(alphabet_size, FLAGS_kmer_length) > (double)std::numeric_limits::max()) { 230 | std::cerr << "Kmer size is too large to fit in 64 bits " << std::endl; 231 | std::exit(1); 232 | } 233 | 234 | run_function_on_algorithm([](auto x) { run_triangle(x); }); 235 | } 236 | -------------------------------------------------------------------------------- /tests/phylogeny/data.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ratschlab/Project2020-seq-tensor-sketching/20b19ddd19751840d33af97abe314d29b34dc0d4/tests/phylogeny/data.txt -------------------------------------------------------------------------------- /tests/phylogeny/test_upgma.cpp: -------------------------------------------------------------------------------- 1 | #include "phylogeny/upgma.hpp" 2 | 3 | #include 4 | 5 | namespace { 6 | 7 | using namespace ts; 8 | 9 | TEST(upgma, empty) { 10 | ASSERT_TRUE(upgma({}).empty()); 11 | } 12 | 13 | TEST(upgma, one) { 14 | Tree graph = upgma({ { 11.5 } }); 15 | ASSERT_TRUE(graph.size() == 1); 16 | ASSERT_TRUE(graph[0].age == 0); 17 | ASSERT_TRUE(graph[0].left == NO_CHILD); 18 | ASSERT_TRUE(graph[0].right == NO_CHILD); 19 | } 20 | 21 | TEST(upgma, some_values) { 22 | std::vector> dist_mat = { { 0, 17, 21, 31, 23 }, 23 | { 17, 0, 30, 34, 21 }, 24 | { 21, 30, 0, 28, 39 }, 25 | { 31, 34, 28, 0, 43 }, 26 | { 23, 21, 39, 43, 0 } }; 27 | Tree graph = upgma(dist_mat); 28 | ASSERT_EQ(9, graph.size()); 29 | ASSERT_EQ(graph[8].age, 16.5); 30 | ASSERT_EQ(graph[7].age, 14); 31 | ASSERT_EQ(graph[6].age, 11); 32 | ASSERT_EQ(graph[5].age, 8.5); 33 | for (uint32_t i = 0; i < 5; ++i) { 34 | ASSERT_EQ(graph[i].age, 0); 35 | } 36 | } 37 | 38 | } // namespace 39 | -------------------------------------------------------------------------------- /tests/sketch/test_hash_base.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | constexpr uint8_t SKETCH_DIM = 3; 15 | constexpr uint8_t SET_SIZE = 4 * 4; 16 | constexpr uint32_t MAX_LEN = 3; 17 | constexpr uint8_t HASH_SIZE = MAX_LEN * SET_SIZE; 18 | 19 | class Hash : public HashBase, public testing::TestWithParam { 20 | public: 21 | Hash() 22 | : HashBase(SET_SIZE, 23 | SKETCH_DIM, 24 | HASH_SIZE, 25 | HashAlgorithm::uniform, 26 | /*seed=*/31415) {} 27 | }; 28 | 29 | // test that the uniform hash function is bijective, i.e. it is in effect a permutation: 30 | TEST_F(Hash, HashesDistinct) { 31 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 32 | std::unordered_set seen(SKETCH_DIM); 33 | for (uint32_t i = 0; i < hash_size; ++i) { 34 | uint8_t v = this->hash(s, i); 35 | ASSERT_FALSE(seen.find(v) != seen.end()); 36 | seen.insert(v); 37 | } 38 | ASSERT_EQ(hash_size, seen.size()); 39 | } 40 | } 41 | 42 | class Hash2 : public HashBase, public testing::TestWithParam { 43 | public: 44 | Hash2() 45 | : HashBase(SET_SIZE, 46 | SKETCH_DIM, 47 | HASH_SIZE, 48 | GetParam(), 49 | /*seed=*/31415) {} 50 | }; 51 | 52 | // test that the hash values are consistent - i.e. 
asking for the same value returns the same result 53 | TEST_P(Hash2, HashesConsistent) { 54 | std::vector> hashes(SKETCH_DIM); 55 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 56 | for (uint32_t i = 0; i < hash_size; ++i) { 57 | uint8_t v = this->hash(s, i); 58 | ASSERT_FALSE(hashes[s].find(i) != hashes[s].end()); 59 | hashes[s][i] = v; 60 | } 61 | ASSERT_EQ(hash_size, hashes[s].size()); 62 | } 63 | 64 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 65 | for (uint32_t i = 0; i < hash_size; ++i) { 66 | uint8_t v = this->hash(s, i); 67 | ASSERT_EQ(v, hashes[s][i]); 68 | } 69 | } 70 | } 71 | 72 | INSTANTIATE_TEST_SUITE_P(Method, 73 | Hash2, 74 | ::testing::Values(HashAlgorithm::uniform, 75 | HashAlgorithm::crc32, 76 | HashAlgorithm::murmur)); 77 | 78 | } // namespace 79 | -------------------------------------------------------------------------------- /tests/sketch/test_min_hash.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | #include "sketch/hash_min.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | TEST(MinHash, Empty) { 15 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 16 | std::vector sketch = under_test.compute(std::vector()); 17 | ASSERT_THAT(sketch, ElementsAre(0, 0, 0)); 18 | } 19 | 20 | TEST(MinHash, Repeat) { 21 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 22 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 23 | std::vector sketch1 = under_test.compute(sequence); 24 | std::vector sketch2 = under_test.compute(sequence); 25 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 26 | } 27 | 28 | TEST(MinHash, Permute) { 29 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 30 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 31 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 32 | std::vector sketch1 = under_test.compute(sequence1); 33 | std::vector sketch2 = under_test.compute(sequence2); 34 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 35 | } 36 | 37 | TEST(MinHash, PermuteAndRepeat) { 38 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 39 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 40 | std::vector sequence2 = { 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 }; 41 | std::vector sketch1 = under_test.compute(sequence1); 42 | std::vector sketch2 = under_test.compute(sequence2); 43 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 44 | } 45 | 46 | std::vector> hash_init(uint32_t set_sz, uint32_t sketch_size) { 47 | std::vector> hashes(sketch_size); 48 | for (size_t m = 0; m < sketch_size; m++) { 49 | for (uint32_t v = 0; v < set_sz; ++v) { 50 | hashes[m][v] = v; 51 | } 52 | } 53 | return hashes; 54 | } 55 | 56 | TEST(MinHash, PresetHash) { 57 | MinHash under_test(4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 58 | under_test.set_hashes_for_testing(hash_init(4 * 4, 3)); 59 | for (uint32_t i = 0; i < 4 * 4; ++i) { 60 | std::vector sequence(4 * 4 - i); 61 | std::iota(sequence.begin(), sequence.end(), i); 62 | std::vector sketch = under_test.compute(sequence); 63 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 64 | } 65 | } 66 | 67 | } // namespace 68 | -------------------------------------------------------------------------------- /tests/sketch/test_ordered_min_hash.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_ordered.hpp" 2 | 3 | #include 4 
| #include 5 | 6 | #include 7 | 8 | namespace { 9 | 10 | using namespace ts; 11 | using namespace ::testing; 12 | 13 | constexpr uint32_t alphabet_size = 4; 14 | const uint32_t set_size = int_pow(alphabet_size, 3); // k-mers of length 3 15 | constexpr uint32_t sketch_dim = 2; 16 | constexpr uint32_t tuple_length = 3; 17 | constexpr uint32_t max_sequence_len = 200; 18 | 19 | TEST(OrderedMinHash, Empty) { 20 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 21 | HashAlgorithm::uniform, /*seed=*/31415); 22 | ASSERT_THROW(under_test.compute(std::vector()), std::invalid_argument); 23 | } 24 | 25 | TEST(OrderedMinHash, Repeat) { 26 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 27 | HashAlgorithm::uniform, /*seed=*/31415); 28 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 29 | Vec2D sketch1 = under_test.compute_2d(sequence); 30 | Vec2D sketch2 = under_test.compute_2d(sequence); 31 | ASSERT_EQ(sketch_dim, sketch1.size()); 32 | ASSERT_EQ(sketch_dim, sketch2.size()); 33 | for (uint32_t i = 0; i < sketch_dim; ++i) { 34 | ASSERT_THAT(sketch1[i], ElementsAreArray(sketch2[i])); 35 | } 36 | } 37 | 38 | TEST(OrderedMinHash, ReverseOrder) { 39 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 40 | HashAlgorithm::uniform, /*seed=*/31415); 41 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 42 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 43 | Vec2D sketch1 = under_test.compute_2d(sequence1); 44 | Vec2D sketch2 = under_test.compute_2d(sequence2); 45 | ASSERT_EQ(sketch_dim, sketch1.size()); 46 | ASSERT_EQ(sketch_dim, sketch2.size()); 47 | for (uint32_t i = 0; i < sketch_dim; ++i) { 48 | std::reverse(sketch1[i].begin(), sketch1[i].end()); // reversed order of appearance 49 | ASSERT_THAT(sketch1[i], ElementsAreArray(sketch2[i])); 50 | } 51 | } 52 | 53 | std::vector> 54 | hash_init(uint32_t set_sz, uint32_t sketch_size, uint32_t max_seq_len) { 55 | std::vector> hashes(sketch_size); 56 | for (size_t m = 0; m < sketch_size; m++) { 57 | for (uint32_t v = 0; v < set_sz * max_seq_len; ++v) { 58 | hashes[m][v] = v; 59 | } 60 | } 61 | return hashes; 62 | } 63 | 64 | TEST(OrderedMinHash, PresetHash) { 65 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 66 | HashAlgorithm::uniform, /*seed=*/31415); 67 | under_test.set_hashes_for_testing(hash_init(set_size, sketch_dim, max_sequence_len)); 68 | for (uint32_t i = 0; i < set_size - tuple_length; ++i) { 69 | std::vector sequence(set_size - i); 70 | std::iota(sequence.begin(), sequence.end(), i); 71 | Vec2D sketch = under_test.compute_2d(sequence); 72 | for (uint32_t s = 0; s < sketch_dim; ++s) { 73 | ASSERT_THAT(sketch[s], ElementsAreArray({ i, i + 1, i + 2 })); 74 | } 75 | } 76 | } 77 | 78 | TEST(OrderedMinHash, PresetHashRepeat) { 79 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 80 | HashAlgorithm::uniform, /*seed=*/ 31415); 81 | under_test.set_hashes_for_testing(hash_init(set_size, sketch_dim, max_sequence_len)); 82 | for (uint32_t i = 0; i < set_size - tuple_length; ++i) { 83 | std::vector sequence(2 * (set_size - i)); 84 | std::iota(sequence.begin(), sequence.begin() + sequence.size() / 2, i); 85 | std::iota(sequence.begin() + sequence.size() / 2, sequence.end(), i); 86 | Vec2D sketch = under_test.compute_2d(sequence); 87 | for (uint32_t s = 0; s < sketch_dim; ++s) { 88 | ASSERT_THAT(sketch[s], ElementsAreArray({ i, i + 1, i + 2 })); 89 | } 90 | } 91 | } 92 | 93 | #ifndef NDEBUG 94 | TEST(OrderedMinhash, 
SequenceTooLong) { 95 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 96 | HashAlgorithm::uniform, /*seed=*/ 31415); 97 | std::vector sequence(max_sequence_len + 1); 98 | ASSERT_THROW(under_test.compute(sequence), std::invalid_argument); 99 | } 100 | #endif 101 | 102 | } // namespace 103 | -------------------------------------------------------------------------------- /tests/sketch/test_tensor.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/tensor.hpp" 2 | #include "util/utils.hpp" 3 | 4 | #include 5 | #include 6 | 7 | namespace { 8 | 9 | using namespace ts; 10 | using namespace ::testing; 11 | 12 | constexpr uint8_t alphabet_size = 4; 13 | const uint32_t set_size = int_pow(alphabet_size, 3); // k-mers of length 3 14 | constexpr uint32_t sketch_dim = 2; 15 | constexpr uint32_t tuple_length = 3; 16 | 17 | template 18 | void rand_init(uint32_t sketch_size, Vec2D *hashes, Vec2D *signs) { 19 | std::mt19937 gen(3412343); 20 | std::uniform_int_distribution rand_hash2(0, sketch_size - 1); 21 | std::uniform_int_distribution rand_bool(0, 1); 22 | 23 | for (size_t h = 0; h < hashes->size(); h++) { 24 | for (size_t c = 0; c < alphabet_size; c++) { 25 | (*hashes)[h][c] = rand_hash2(gen); 26 | (*signs)[h][c] = rand_bool(gen); 27 | } 28 | } 29 | } 30 | 31 | TEST(Tensor, Empty) { 32 | Tensor under_test(alphabet_size, sketch_dim, tuple_length, /*seed=*/31415); 33 | std::vector sketch = under_test.compute(std::vector()); 34 | ASSERT_EQ(sketch.size(), sketch_dim); 35 | ASSERT_THAT(sketch, ElementsAre(0, 0)); 36 | } 37 | 38 | /** The sequence has one char, which is shorter than the tuple length, so the sketch will be 0 */ 39 | TEST(Tensor, OneChar) { 40 | Tensor under_test(alphabet_size, sketch_dim, tuple_length, /*seed=*/31415); 41 | for (uint8_t c = 0; c < alphabet_size; ++c) { 42 | std::vector sketch = under_test.compute({ c }); 43 | ASSERT_THAT(sketch, ElementsAre(0, 0)); 44 | } 45 | } 46 | 47 | /** The sequence has one char, the tuple length is 1, so we should have a value of +/-1 on position 48 | * h(seq[0]) */ 49 | TEST(Tensor, OneCharTuple1) { 50 | constexpr uint32_t tuple_len = 1; 51 | Tensor under_test(alphabet_size, sketch_dim, tuple_len, /*seed=*/31415); 52 | 53 | Vec2D hashes = new2D(tuple_len, alphabet_size); 54 | Vec2D signs = new2D(tuple_len, alphabet_size); 55 | rand_init(sketch_dim, &hashes, &signs); 56 | under_test.set_hashes_for_testing(hashes, signs); 57 | 58 | for (uint8_t c = 0; c < alphabet_size; ++c) { 59 | std::vector sketch = under_test.compute({ c }); 60 | for (uint32_t i = 0; i < sketch_dim; ++i) { 61 | int8_t sign = signs[0][c] ? 
1 : -1; 62 | ASSERT_EQ(sketch[i] * sign, hashes[0][c] % sketch_dim == i) << "Char: " << (int)c; 63 | } 64 | } 65 | } 66 | 67 | /** 68 | * The size of the sequence equals the size of the tuple, so the sketch will be 1 in one position 69 | * (position H(x)), and 0 in all the other positions 70 | */ 71 | TEST(Tensor, FullStringDistinctChars) { 72 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 73 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 74 | Tensor under_test(tuple_len, sketch_dimension, tuple_len, /*seed=*/31415); 75 | std::vector sequence(tuple_len); 76 | std::iota(sequence.begin(), sequence.end(), 0U); 77 | std::vector sketch = under_test.compute(sequence); 78 | ASSERT_EQ(sketch.size(), sketch_dimension); 79 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 80 | ASSERT_TRUE(std::abs(sketch[i]) == 0 || std::abs(sketch[i]) == 1); 81 | } 82 | ASSERT_EQ(1, std::abs(std::accumulate(sketch.begin(), sketch.end(), 0))) 83 | << "D=" << sketch_dimension << " t=" << tuple_len; 84 | } 85 | } 86 | } 87 | 88 | /** 89 | * The size of the sequence equals the size of the tuple, so the sketch will be 1 or -1 in one 90 | * position (position H(x)), and 0 in all the other positions. 91 | */ 92 | TEST(Tensor, FullStringRandomChars) { 93 | std::mt19937 gen(1234567); 94 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 95 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 96 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 97 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 98 | 99 | Vec2D hashes = new2D(tuple_len, alphabet_size); 100 | Vec2D signs = new2D(tuple_len, alphabet_size); 101 | rand_init(sketch_dim, &hashes, &signs); 102 | under_test.set_hashes_for_testing(hashes, signs); 103 | 104 | std::vector sequence(tuple_len); 105 | for (uint8_t &c : sequence) { 106 | c = rand_char(gen); 107 | } 108 | std::vector sketch = under_test.compute(sequence); 109 | 110 | uint32_t pos = 0; // the position where the sketch must be one 111 | int8_t s = 1; // the sign of the sketch 112 | for (uint32_t i = 0; i < sequence.size(); ++i) { 113 | pos += hashes[i][sequence[i]]; 114 | s *= signs[i][sequence[i]] ? 1 : -1; 115 | } 116 | pos %= sketch_dimension; 117 | 118 | ASSERT_EQ(sketch.size(), sketch_dimension); 119 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 120 | ASSERT_EQ(i == pos ? s : 0, sketch[i]); 121 | } 122 | } 123 | } 124 | } 125 | 126 | /** 127 | * If a sequence contains identical characters, its sketch will be +/-1 in one position and 0 in all 128 | * others, because all subsequences of length t are identical. 
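* (For instance, for "AAAA" and t = 2, all C(4,2) = 6 length-2 subsequences equal "AA", so the
* entire normalized mass falls into the single bucket determined by the hashes of "AA",
* with a single sign.)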
129 | */ 130 | TEST(Tensor, SameChars) { 131 | std::mt19937 gen(342111); 132 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 133 | std::uniform_int_distribution rand_seq_len(0, 100); 134 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 135 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 136 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 137 | uint8_t sequence_length = tuple_len + rand_seq_len(gen); 138 | std::vector sequence(sequence_length, rand_char(gen)); 139 | std::vector sketch = under_test.compute(sequence); 140 | ASSERT_EQ(sketch.size(), sketch_dimension); 141 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 142 | ASSERT_TRUE(std::abs(sketch[i]) == 0 || std::abs(sketch[i]) == 1); 143 | } 144 | ASSERT_EQ(1, std::abs(std::accumulate(sketch.begin(), sketch.end(), 0))) 145 | << "Dim=" << sketch_dimension << " t=" << tuple_len; 146 | } 147 | } 148 | } 149 | 150 | /** 151 | * If a sequence contains distinct characters, then the tensor sketch for t=1 will contain multiples 152 | * of (1/alphabet_size), because T(a)=1/alphabet_size for all characters a. 153 | */ 154 | TEST(Tensor, DistinctCharsTuple1) { 155 | std::mt19937 gen(321567); 156 | constexpr uint8_t tuple_len = 1; 157 | std::vector sequence(alphabet_size); 158 | std::iota(sequence.begin(), sequence.end(), 0); 159 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 160 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 161 | 162 | std::vector sketch = under_test.compute(sequence); 163 | ASSERT_EQ(sketch.size(), sketch_dimension); 164 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 165 | double factor = sketch[i] / (1. / alphabet_size); 166 | ASSERT_NEAR(factor, std::round(factor), 1e-3); 167 | } 168 | } 169 | } 170 | 171 | /** 172 | * If a sequence of length seq_len contains distinct characters, then the tensor sketch for 173 | * t=seq_len-1 will contain multiples of (1/seq_len), because T(a)=1/seq_len for all the seq_len 174 | * subsequences of length seq_len-1. 175 | */ 176 | TEST(Tensor, DistinctCharsTupleTMinus1) { 177 | std::mt19937 gen(321567); 178 | for (uint32_t tuple_len = 1; tuple_len < 10; ++tuple_len) { 179 | const uint8_t alphabet_size = tuple_len + 1; 180 | std::vector sequence(alphabet_size); 181 | std::iota(sequence.begin(), sequence.end(), 0); 182 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 183 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 184 | 185 | std::vector sketch = under_test.compute(sequence); 186 | ASSERT_EQ(sketch.size(), sketch_dimension); 187 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 188 | double factor = sketch[i] / (1. 
/ alphabet_size); 189 | ASSERT_NEAR(factor, std::round(factor), 1e-3); 190 | } 191 | } 192 | } 193 | } 194 | 195 | } // namespace 196 | -------------------------------------------------------------------------------- /tests/sketch/test_weighted_min_hash.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "sketch/hash_weighted.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | TEST(WeightedMinHash, Empty) { 15 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 16 | std::vector sketch = under_test.compute(std::vector()); 17 | ASSERT_THAT(sketch, ElementsAre(0, 0, 0)); 18 | } 19 | 20 | TEST(WeightedMinHash, Repeat) { 21 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 22 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 23 | std::vector sketch1 = under_test.compute(sequence); 24 | std::vector sketch2 = under_test.compute(sequence); 25 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 26 | } 27 | 28 | TEST(WeightedMinHash, Permute) { 29 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 30 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 31 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 32 | std::vector sketch1 = under_test.compute(sequence1); 33 | std::vector sketch2 = under_test.compute(sequence2); 34 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 35 | } 36 | 37 | std::vector> 38 | hash_init(uint32_t set_sz, uint32_t sketch_size, uint32_t max_seq_len) { 39 | std::vector> hashes(sketch_size); 40 | for (size_t m = 0; m < sketch_size; m++) { 41 | for (uint32_t v = 0; v < set_sz * max_seq_len; ++v) { 42 | hashes[m][v] = v; 43 | } 44 | } 45 | return hashes; 46 | } 47 | 48 | TEST(WeightedMinHash, PresetHash) { 49 | WeightedMinHash under_test(4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 50 | under_test.set_hashes_for_testing(hash_init(4 * 4, 3, 100)); 51 | for (uint32_t i = 0; i < 4 * 4; ++i) { 52 | std::vector sequence(4 * 4 - i); 53 | std::iota(sequence.begin(), sequence.end(), i); 54 | std::vector sketch = under_test.compute(sequence); 55 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 56 | } 57 | } 58 | 59 | TEST(WeightedMinHash, PresetHashRepeat) { 60 | constexpr uint32_t set_size = 4 * 4; // corresponds to k-mers of length 2 over the DNA alphabet 61 | WeightedMinHash under_test(set_size, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 62 | under_test.set_hashes_for_testing(hash_init(set_size, 3, 100)); 63 | for (uint32_t i = 0; i < set_size; ++i) { 64 | std::vector sequence(2 * (set_size - i)); 65 | std::iota(sequence.begin(), sequence.begin() + sequence.size() / 2, i); 66 | std::iota(sequence.begin() + sequence.size() / 2, sequence.end(), i); 67 | std::vector sketch = under_test.compute(sequence); 68 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 69 | } 70 | } 71 | 72 | #ifndef NDEBUG 73 | TEST(WeightedMinhash, SequenceTooLong) { 74 | constexpr uint32_t set_size = 4 * 4; // corresponds to k-mers of length 2 over the DNA alphabet 75 | WeightedMinHash under_test(set_size, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 76 | std::vector sequence(100 + 1); 77 | ASSERT_THROW(under_test.compute(sequence), std::invalid_argument); 78 | } 79 | #endif 80 | 81 | } // namespace 82 | -------------------------------------------------------------------------------- /tests/util/test_multivec.cpp: 
-------------------------------------------------------------------------------- 1 | #include "util/utils.hpp" 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace { 8 | template 9 | class Pow : public ::testing::Test {}; 10 | 11 | typedef ::testing::Types PowTypes; 12 | 13 | TYPED_TEST_SUITE(Pow, PowTypes); 14 | 15 | TYPED_TEST(Pow, Zero) { 16 | std::mt19937 rng(123457); 17 | std::uniform_int_distribution dist(0, 10000); 18 | 19 | for (uint32_t i = 0; i < 10; ++i) { 20 | EXPECT_EQ(1, ts::int_pow(dist(rng), 0)); 21 | } 22 | } 23 | 24 | TYPED_TEST(Pow, Random) { 25 | std::mt19937 rng(123457); 26 | std::uniform_int_distribution dist(0, 10); 27 | std::uniform_int_distribution pow_dist(0, 5); 28 | 29 | for (uint32_t i = 0; i < 10; ++i) { 30 | TypeParam base = pow_dist(rng); 31 | TypeParam exp = dist(rng); 32 | EXPECT_EQ(std::pow(base, exp), ts::int_pow(base, exp)); 33 | } 34 | } 35 | 36 | } // namespace 37 | -------------------------------------------------------------------------------- /tests/util/test_spearman.cpp: -------------------------------------------------------------------------------- 1 | #include "util/spearman.hpp" 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace { 8 | 9 | TEST(Spearman, Identical) { 10 | std::mt19937 rng(123457); 11 | std::uniform_int_distribution dist(0, 10000); 12 | std::uniform_int_distribution sz_dist(5, 10); 13 | for (uint32_t trial = 0; trial < 10; ++trial) { 14 | size_t size = sz_dist(rng); 15 | std::vector a; 16 | std::vector b; 17 | for (uint32_t i = 0; i < size; ++i) { 18 | size_t v = dist(rng); 19 | a.push_back(v); 20 | b.push_back(v); 21 | } 22 | ASSERT_EQ(1, spearman(a, b)); 23 | } 24 | } 25 | 26 | TEST(Spearman, Linear) { 27 | std::mt19937 rng(123457); 28 | std::uniform_real_distribution<> dist(1, 10000); 29 | std::uniform_int_distribution<> sz_dist(5, 10); 30 | double coef = dist(rng); 31 | for (uint32_t trial = 0; trial < 10; ++trial) { 32 | size_t size = sz_dist(rng); 33 | std::vector a; 34 | std::vector b; 35 | for (uint32_t i = 0; i < size; ++i) { 36 | size_t v = dist(rng); 37 | a.push_back(v); 38 | b.push_back(v * coef); 39 | } 40 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 41 | } 42 | } 43 | 44 | TEST(Spearman, LinearInverse) { 45 | size_t size = 10; 46 | std::vector a(size); 47 | std::vector b(size); 48 | for (uint32_t i = 0; i < size; ++i) { 49 | a[i] = 2 * i + 5; 50 | b[size - i - 1] = 2 * i + 5; 51 | } 52 | ASSERT_EQ(-1, spearman(a, b)); 53 | } 54 | 55 | TEST(Spearman, Quadratic) { 56 | std::mt19937 rng(123457); 57 | std::uniform_real_distribution<> dist(1, 10000); 58 | std::uniform_int_distribution<> sz_dist(5, 10); 59 | double coef = dist(rng); 60 | for (uint32_t trial = 0; trial < 10; ++trial) { 61 | size_t size = sz_dist(rng); 62 | std::vector a; 63 | std::vector b; 64 | for (uint32_t i = 0; i < size; ++i) { 65 | size_t v = dist(rng); 66 | a.push_back(v); 67 | b.push_back(v * v); 68 | } 69 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 70 | } 71 | } 72 | 73 | TEST(Spearman, QuadraticInverse) { 74 | size_t size = 10; 75 | std::vector a(size); 76 | std::vector b(size); 77 | for (uint32_t i = 0; i < size; ++i) { 78 | a[i] = 2 * i * i + 5; 79 | b[size - i - 1] = 2 * i * i + 5; 80 | } 81 | ASSERT_EQ(-1, spearman(a, b)); 82 | } 83 | 84 | TEST(Spearman, AllIdentical) { 85 | size_t size = 10; 86 | std::vector a(size); 87 | std::vector b(size); 88 | for (uint32_t i = 0; i < size; ++i) { 89 | a[i] = 2 * i * i + 5; 90 | b[i] = 2 * i * i + 5; 91 | } 92 | ASSERT_EQ(1, spearman(a, b)); 93 
| } 94 | 95 | TEST(Spearman, SomeValues) { 96 | std::vector a = { 35, 23, 47, 17, 10, 43, 9, 6, 28 }; 97 | std::vector b = { 30, 33, 45, 23, 8, 49, 12, 4, 31 }; 98 | ASSERT_EQ(0.9, spearman(a, b)); 99 | } 100 | 101 | TEST(Spearman, LinearRepeats) { 102 | std::mt19937 rng(123457); 103 | std::uniform_real_distribution<> dist(1, 10000); 104 | std::uniform_int_distribution<> sz_dist(5, 10); 105 | double coef = dist(rng); 106 | for (uint32_t trial = 0; trial < 10; ++trial) { 107 | size_t size = sz_dist(rng); 108 | std::vector a; 109 | std::vector b; 110 | for (uint32_t i = 0; i < size; ++i) { 111 | size_t v = dist(rng); 112 | a.push_back(v); 113 | a.push_back(v); 114 | b.push_back(v * coef); 115 | b.push_back(v * coef); 116 | } 117 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 118 | } 119 | } 120 | 121 | TEST(Spearman, Rankify) { 122 | std::vector a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 }; 123 | std::vector expected_ranks { 1.5, 1.5, 3.5, 3.5, 5.5, 5.5, 7, 8.5, 8.5 }; 124 | ASSERT_EQ(expected_ranks, rankify(a)); 125 | } 126 | 127 | TEST(Spearman, SomeValuesRepeats) { 128 | std::vector a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 }; 129 | std::vector b = { 7, 8, 8, 19, 19, 3, 3, 5, 9 }; 130 | 131 | ASSERT_NEAR(-0.19314, spearman(a, b), 1e-5); 132 | } 133 | 134 | 135 | } // namespace 136 | -------------------------------------------------------------------------------- /third_party/murmur_hash/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(murmur_hash) 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1) 5 | 6 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 7 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 8 | 9 | 10 | file(GLOB murmur_files "*.cpp") 11 | add_library(murmur_lib ${murmur_files}) 12 | target_include_directories(murmur_lib INTERFACE .) 13 | -------------------------------------------------------------------------------- /third_party/murmur_hash/murmur_hash3.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #include <cstdint> 6 | 7 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 8 | 9 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 10 | 11 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 12 | -------------------------------------------------------------------------------- /util/multivec.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/transformer.hpp" 4 | 5 | #include <cstddef> 6 | #include <type_traits> 7 | #include <vector> 8 | 9 | namespace ts { // ts = Tensor Sketch 10 | 11 | template <typename T> 12 | using is_u_integral = typename std::enable_if<std::is_unsigned<T>::value>::type; 13 | 14 | template <typename T> 15 | using Vec2D = std::vector<std::vector<T>>; 16 | 17 | template <typename T> 18 | using Vec3D = std::vector<Vec2D<T>>; 19 | 20 | template <typename T> 21 | using Vec4D = std::vector<Vec3D<T>>; 22 | 23 | 24 | template <typename T> 25 | auto new2D(size_t d1, size_t d2, T val = 0) { 26 | return Vec2D<T>(d1, std::vector<T>(d2, val)); 27 | } 28 | template <typename T> 29 | auto new3D(size_t d1, size_t d2, size_t d3, T val = 0) { 30 | return Vec3D<T>(d1, new2D(d2, d3, val)); 31 | } 32 | 33 | template <typename T> 34 | void apply(std::vector<T> &vec, const transformer<T> &tr) { 35 | for (auto &v : vec) { 36 | v = tr.transform(v); 37 | } 38 | } 39 | 40 | template <typename T> 41 | void apply(Vec2D<T> &vec2D, const transformer<T> &tr) { 42 | for (auto &vec : vec2D) { 43 | apply(vec, tr); 44 | } 45 | } 46 | 47 | template <typename T> 48 | void apply(Vec3D<T> &vec3D, const transformer<T> &tr) { 49 | for (auto &vec2D : vec3D) { 50 | apply(vec2D, tr); 51 | } 52 | } 53 | 54 | } // namespace ts 55 | -------------------------------------------------------------------------------- /util/progress.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by amir on 1/9/21. 3 | // 4 | #include "util/progress.hpp" 5 | #include <iomanip> 6 | #include <iostream> 7 | 8 | namespace ts { 9 | 10 | size_t progress_bar::it; 11 | size_t progress_bar::total; 12 | size_t progress_bar::bar_len; 13 | size_t progress_bar::bar_step; 14 | 15 | void progress_bar::init(size_t total_iterations, size_t len) { 16 | progress_bar::it = 0; 17 | progress_bar::total = total_iterations; 18 | progress_bar::bar_len = len; 19 | progress_bar::bar_step = 0; 20 | } 21 | 22 | void progress_bar::iter() { 23 | #pragma omp critical 24 | { 25 | ++it; 26 | auto step = (it * bar_len) / total; 27 | while (step > bar_step) { 28 | if (bar_step > 0) 29 | std::cerr << "\b\b\b\b"; 30 | ++bar_step; 31 | std::cerr << "#" << std::setw(3) << (int)(100.0 * it / total) << "%" << std::flush; 32 | } 33 | if (it == total) { 34 | std::cerr << "\033[2K\r" << std::flush; 35 | } 36 | } 37 | } 38 | 39 | } // namespace ts 40 | -------------------------------------------------------------------------------- /util/progress.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by amir on 18/12/2020.
3 | // 4 | #pragma once 5 | #include <cstddef> 6 | 7 | namespace ts { 8 | 9 | 10 | struct progress_bar { 11 | static size_t it; 12 | static size_t total; 13 | static size_t bar_len; 14 | static size_t bar_step; 15 | 16 | static void init(size_t total_iterations, size_t bar_len = 50); 17 | static void iter(); 18 | }; 19 | 20 | 21 | } // namespace ts 22 | -------------------------------------------------------------------------------- /util/spearman.hpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <cassert> 3 | #include <cmath> 4 | #include <cstddef> 5 | #include <vector> 6 | 7 | // Returns the 1-based fractional ranks of the observations in v (ties receive the average rank) 8 | template <typename T> 9 | std::vector<double> rankify(const std::vector<T> &v) { 10 | std::vector<T> sorted = v; 11 | std::sort(begin(sorted), end(sorted)); 12 | 13 | std::vector<double> result(v.size()); 14 | 15 | for (size_t i = 0; i < v.size(); i++) { 16 | const auto lb = std::lower_bound(std::begin(sorted), std::end(sorted), v[i]); 17 | const auto ub = std::upper_bound(std::begin(sorted), std::end(sorted), v[i]); 18 | const size_t r = 1 + (lb - std::begin(sorted)), s = ub - lb; 19 | 20 | // Use Fractional Rank formula fractional_rank = r + (s-1)/2 21 | result[i] = r + (s - 1) * 0.5; 22 | } 23 | 24 | return result; 25 | } 26 | 27 | /* Compute the Pearson correlation coefficient of a and b */ 28 | template <typename T> 29 | double pearson(const std::vector<T> &a, const std::vector<T> &b) { 30 | assert(a.size() == b.size()); 31 | T sum_a = 0, sum_b = 0, sum_ab = 0; 32 | T square_sum_a = 0, square_sum_b = 0; 33 | 34 | for (size_t i = 0; i < a.size(); i++) { 35 | sum_a = sum_a + a[i]; 36 | sum_b = sum_b + b[i]; 37 | sum_ab = sum_ab + a[i] * b[i]; 38 | square_sum_a = square_sum_a + a[i] * a[i]; 39 | square_sum_b = square_sum_b + b[i] * b[i]; 40 | } 41 | 42 | // compute (scaled) variances 43 | T var_a = a.size() * square_sum_a - sum_a * sum_a; 44 | T var_b = a.size() * square_sum_b - sum_b * sum_b; 45 | // treat degenerate cases 46 | if (var_a == 0 && var_b == 0) { 47 | return 1; 48 | } 49 | if (var_a == 0 || var_b == 0) { 50 | return 0; 51 | } 52 | 53 | return (a.size() * sum_ab - sum_a * sum_b) / std::sqrt(var_a * var_b); 54 | } 55 | 56 | template <typename T> 57 | double spearman(const std::vector<T> &a, const std::vector<T> &b) { 58 | std::vector<double> rank1 = rankify(a); 59 | std::vector<double> rank2 = rankify(b); 60 | return pearson(rank1, rank2); 61 | } 62 | -------------------------------------------------------------------------------- /util/timer.cpp: -------------------------------------------------------------------------------- 1 | #include <map> 2 | #include <omp.h> 3 | #include "timer.hpp" 4 | 5 | namespace ts { 6 | 7 | using namespace std::chrono; 8 | 9 | std::vector<std::map<std::string, nanoseconds>> Timer::durations_vec 10 | = std::vector<std::map<std::string, nanoseconds>>(100); 11 | std::vector<std::map<std::string, size_t>> Timer::counts_vec 12 | = std::vector<std::map<std::string, size_t>>(100); 13 | 14 | 15 | void Timer::add_duration(const std::string &func_name, nanoseconds dur) { 16 | int tid = omp_get_thread_num(); 17 | auto &durations = durations_vec[tid]; 18 | 19 | if (durations.find(func_name) == durations.end()) { // doesn't contain `func_name` 20 | durations[func_name] = dur; 21 | counts_vec[tid][func_name] = 1; 22 | } else { 23 | durations[func_name] += dur; 24 | counts_vec[tid][func_name]++; 25 | } 26 | } 27 | 28 | 29 | std::string Timer::summary() { 30 | std::map<std::string, std::string> trans = { 31 | { "edit_distance", "ED" }, 32 | { "minhash", "MH" }, 33 | { "weighted_minhash", "WMH" }, 34 | { "ordered_minhash", "OMH" }, 35 | { "tensor_sketch", "TS" }, 36 | { "tensor_slide_sketch", "TSS" }, 37 | { "Int32Flattener", "I32FLAT" }, 38 | { "DoubleFlattener", "FLAT" }, 39 |
{"seq2kmer", "S2K"} 40 | }; 41 | std::map total_counts; 42 | for (auto &counts : counts_vec) { 43 | for (auto const &[arg_name, arg_count] : counts) { 44 | if (total_counts.find(arg_name) != total_counts.end()) 45 | total_counts[arg_name] += arg_count; 46 | else 47 | total_counts[arg_name] = arg_count; 48 | } 49 | } 50 | std::map acc; 51 | for (auto &durations : Timer::durations_vec) { 52 | for (auto const &[arg_name, arg] : durations) { 53 | if (acc.find(arg_name) != acc.end()) { 54 | acc[arg_name] += arg.count(); 55 | } else { 56 | acc[arg_name] = arg.count(); 57 | } 58 | } 59 | } 60 | 61 | std::string str = "long name,short name, time, time sketch, time dist\n"; 62 | 63 | for (auto const &[arg_name, arg] : acc) { 64 | double sk_time = (double)arg, dist_time; 65 | if (arg_name.find("hash") != std::string::npos && // contains *hash* 66 | arg_name.find("dist") == std::string::npos) { // doesn't contain *dist* 67 | sk_time += acc["seq2kmer"]; // add kmer computation time to MH* methods 68 | } 69 | if (arg_name == "edit_distance") { 70 | sk_time = sk_time /1e6/total_counts[arg_name]; 71 | str += arg_name + "," + trans[arg_name] + "," + std::to_string(sk_time) + ",0,0\n"; 72 | } else if (arg_name.find("dist") == std::string::npos && arg_name!="seq2kmer") { 73 | sk_time = sk_time /1e6/total_counts[arg_name] ; // mean sketching time (ms) 74 | dist_time = acc[arg_name + "_dist"]/1e6/total_counts[arg_name + "_dist"]; // mean distance computation time (ms) 75 | str += arg_name + "," + trans[arg_name] + 76 | "," + std::to_string(sk_time + dist_time) + 77 | "," + std::to_string(sk_time) + 78 | "," + std::to_string(dist_time) + '\n'; 79 | } 80 | } 81 | return str; 82 | } 83 | 84 | } // namespace ts 85 | -------------------------------------------------------------------------------- /util/timer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | namespace ts { // ts = Tensor Sketch 12 | 13 | using namespace std::chrono; 14 | 15 | 16 | class Timer { 17 | public: 18 | Timer(std::string name) : 19 | name(std::move(name)), 20 | birth(high_resolution_clock::now()){} 21 | 22 | Timer(const Timer &tt) : 23 | name(tt.name), 24 | birth(high_resolution_clock::now()){} 25 | ~Timer() { 26 | auto dur = high_resolution_clock::now() - birth; 27 | Timer::add_duration(name, dur); 28 | } 29 | 30 | static std::string summary(); 31 | 32 | private: 33 | static void add_duration(const std::string &func_name, std::chrono::nanoseconds dur); 34 | 35 | 36 | std::string name; 37 | high_resolution_clock::time_point birth; 38 | 39 | static std::vector> durations_vec; 40 | static std::vector> counts_vec; 41 | }; 42 | 43 | } // namespace ts 44 | -------------------------------------------------------------------------------- /util/transformer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace ts { 8 | 9 | template 10 | class transformer { 11 | public: 12 | virtual T transform(T val) const = 0; 13 | }; 14 | 15 | template 16 | class discretize : public transformer { 17 | public: 18 | explicit discretize(size_t num_bins) : num_bins(num_bins) { 19 | bins = std::vector(num_bins); 20 | for (size_t b = 0; b < num_bins; b++) { 21 | bins[b] = std::tan(M_PI * (((double)b + .5) / num_bins - .5)); 22 | } 23 | bins.push_back(std::numeric_limits::max()); 24 | bins.insert(bins.begin(), 
-std::numeric_limits<T>::max()); 25 | } 26 | 27 | 28 | // bin edges used to discretize the sketch output 29 | std::vector<T> bins; 30 | 31 | T transform(T val) const override { 32 | return std::upper_bound(bins.begin(), bins.end(), val) - bins.begin(); 33 | } 34 | 35 | private: 36 | /** number of bins used to discretize the output */ 37 | size_t num_bins; 38 | }; 39 | 40 | template <class T> 41 | class atan_scaler : public transformer<T> { 42 | public: 43 | T transform(T val) const override { return std::atan(val); } 44 | }; 45 | 46 | } // namespace ts 47 | -------------------------------------------------------------------------------- /util/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.hpp" 2 | 3 | #include <cmath> 4 | #include <filesystem> 5 | #include <fstream> 6 | #include <gflags/gflags.h> 7 | #include <numeric> 8 | 9 | namespace ts { 10 | 11 | std::string flag_values(char delimiter, bool skip_empty, bool include_flagfile) { 12 | const std::string short_letters = "fkmstw"; 13 | std::vector<gflags::CommandLineFlagInfo> flags; 14 | gflags::GetAllFlags(&flags); 15 | std::string result; 16 | for (const auto &flag : flags) { 17 | if (skip_empty && flag.current_value.empty()) 18 | continue; 19 | if (!include_flagfile && flag.name == "flagfile") 20 | continue; 21 | // Exclude short name flags. 22 | if (flag.name.size() == 1 && short_letters.find(flag.name[0]) != std::string::npos) 23 | continue; 24 | result += "--" + flag.name + "=" + flag.current_value + delimiter; 25 | } 26 | return result; 27 | } 28 | 29 | void write_output_meta() { 30 | std::string output_path; 31 | if (!gflags::GetCommandLineOption("o", &output_path)) 32 | return; 33 | 34 | std::string meta_path = output_path + ".meta"; 35 | std::ofstream meta(meta_path); 36 | meta << "#!/bin/sh\n"; 37 | meta << "cd " << std::filesystem::current_path() << "\n"; 38 | meta << gflags::GetArgv0() << " " << flag_values(' ', true, false) << "\n"; 39 | 40 | std::filesystem::permissions(meta_path, std::filesystem::perms::owner_exec, 41 | std::filesystem::perm_options::add); 42 | } 43 | 44 | std::pair<double, double> avg_stddev(const std::vector<double> &v) { 45 | if (v.empty()) 46 | return { 0, 0 }; 47 | const double sum = std::accumulate(begin(v), end(v), 0.0); 48 | const double avg = sum / v.size(); 49 | 50 | double var = 0; 51 | for (const auto &x : v) 52 | var += (x - avg) * (x - avg); 53 | 54 | return { avg, sqrt(var / v.size()) }; 55 | } 56 | 57 | double median(const std::vector<double> &v) { 58 | assert(!v.empty()); 59 | if (v.size() % 2) 60 | return v[v.size() / 2]; 61 | return (v[v.size() / 2 - 1] + v[v.size() / 2]) / 2; 62 | } 63 | 64 | } // namespace ts 65 | -------------------------------------------------------------------------------- /util/utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/multivec.hpp" 4 | #include "util/timer.hpp" 5 | 6 | #include <algorithm> 7 | #include <cassert> 8 | #include <cmath> 9 | #include <tuple> 10 | #include <vector> 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Extracts k-mers from a sequence. Each k-mer is treated as a number in base alphabet_size and 16 | * converted to decimal; the sequence s1...sk is converted to s1 + s2*S + ... + sk*S^(k-1), where 17 | * S is the alphabet size, k is the k-mer size, and the first character is the least-significant digit.
18 | * @tparam chr types of elements in the sequence 19 | * @tparam kmer type that stores a kmer 20 | * @param seq the sequence to extract kmers from 21 | * @param kmer_size number of characters in a kmer 22 | * @param alphabet_size size of the alphabet 23 | * @return the extracted kmers, as integers converted from base #alphabet_size 24 | */ 25 | template <class chr, class kmer> 26 | std::vector<kmer> seq2kmer(const std::vector<chr> &seq, uint8_t kmer_size, uint8_t alphabet_size) { 27 | Timer timer("seq2kmer"); 28 | if (seq.size() < (size_t)kmer_size) { 29 | return std::vector<kmer>(); 30 | } 31 | 32 | std::vector<kmer> result(seq.size() - kmer_size + 1, 0); 33 | 34 | kmer c = 1; 35 | for (uint8_t i = 0; i < kmer_size; i++) { 36 | result[0] += c * seq[i]; 37 | c *= alphabet_size; 38 | } 39 | c /= alphabet_size; 40 | 41 | for (size_t i = 0; i < result.size() - 1; i++) { 42 | kmer base = result[i] - seq[i]; 43 | assert(base % alphabet_size == 0); 44 | result[i + 1] = base / alphabet_size + seq[i + kmer_size] * c; 45 | } 46 | return result; 47 | } 48 | 49 | template <class T> 50 | T l1_dist(const std::vector<T> &a, const std::vector<T> &b) { 51 | assert(a.size() == b.size()); 52 | T res = 0; 53 | for (size_t i = 0; i < a.size(); i++) { 54 | auto el = std::abs(a[i] - b[i]); 55 | res += el; 56 | } 57 | return res; 58 | } 59 | 60 | 61 | template <class T> 62 | T l2_dist(const std::vector<T> &a, const std::vector<T> &b) { 63 | assert(a.size() == b.size()); 64 | T res = 0; 65 | for (size_t i = 0; i < a.size(); i++) { 66 | auto el = std::abs(a[i] - b[i]); 67 | res += el * el; 68 | } 69 | return res; 70 | } 71 | 72 | 73 | template <class T> 74 | T l1_dist2D_minlen(const Vec2D<T> &a, const Vec2D<T> &b) { 75 | auto len = std::min(a.size(), b.size()); 76 | T val = 0; 77 | for (size_t i = 0; i < len; i++) { 78 | for (size_t j = 0; j < a[i].size() and j < b[i].size(); j++) { 79 | auto el = std::abs(a[i][j] - b[i][j]); 80 | val += el; 81 | } 82 | } 83 | return val; 84 | } 85 | 86 | template <class T> 87 | T l2_dist2D_minlen(const Vec2D<T> &a, const Vec2D<T> &b) { 88 | auto len = std::min(a.size(), b.size()); 89 | T val = 0; 90 | for (size_t i = 0; i < len; i++) { 91 | for (size_t j = 0; j < a[i].size() and j < b[i].size(); j++) { 92 | auto el = (a[i][j] - b[i][j]); 93 | val += el * el; 94 | } 95 | } 96 | return val; 97 | } 98 | 99 | 100 | template <class T> 101 | T hamming_dist(const std::vector<T> &a, const std::vector<T> &b) { 102 | assert(a.size() == b.size()); 103 | T diff = 0; 104 | for (size_t i = 0; i < a.size(); i++) { 105 | if (a[i] != b[i]) { 106 | diff++; 107 | } 108 | } 109 | return diff; 110 | } 111 | 112 | template <class T> 113 | int lcs(const std::vector<T> &s1, const std::vector<T> &s2) { 114 | size_t m = s1.size(); 115 | size_t n = s2.size(); 116 | // int L[m + 1][n + 1]; 117 | Vec2D<int> L(m + 1, std::vector<int>(n + 1, 0)); 118 | for (size_t i = 0; i <= m; i++) { 119 | for (size_t j = 0; j <= n; j++) { 120 | if (i == 0 || j == 0) { 121 | L[i][j] = 0; 122 | } else if (s1[i - 1] == s2[j - 1]) { 123 | L[i][j] = L[i - 1][j - 1] + 1; 124 | } else { 125 | L[i][j] = std::max(L[i - 1][j], L[i][j - 1]); 126 | } 127 | } 128 | } 129 | return L[m][n]; 130 | } 131 | 132 | template <class T> 133 | size_t lcs_distance(const std::vector<T> &s1, const std::vector<T> &s2) { 134 | return s1.size() + s2.size() - 2 * lcs(s1, s2); 135 | } 136 | 137 | template <class T> 138 | size_t edit_distance(const std::vector<T> &s1, const std::vector<T> &s2) { 139 | Timer timer("edit_distance"); 140 | const size_t m(s1.size()); 141 | const size_t n(s2.size()); 142 | 143 | if (m == 0) 144 | return n; 145 | if (n == 0) 146 | return m; 147 | 148 | auto costs = std::vector<size_t>(n + 1); 149 | 150 | for 
(size_t k = 0; k <= n; k++) 151 | costs[k] = k; 152 | 153 | size_t i = 0; 154 | for (auto it1 = s1.begin(); it1 != s1.end(); ++it1, ++i) { 155 | costs[0] = i + 1; 156 | size_t corner = i; 157 | 158 | size_t j = 0; 159 | for (auto it2 = s2.begin(); it2 != s2.end(); ++it2, ++j) { 160 | size_t upper = costs[j + 1]; 161 | if (*it1 == *it2) { 162 | costs[j + 1] = corner; 163 | } else { 164 | size_t t(upper < corner ? upper : corner); 165 | costs[j + 1] = (costs[j] < t ? costs[j] : t) + 1; 166 | } 167 | 168 | corner = upper; 169 | } 170 | } 171 | 172 | size_t result = costs[n]; 173 | 174 | return result; 175 | } 176 | 177 | template <class T, class = is_u_integral<T>> 178 | T int_pow(T x, T pow) { 179 | T result = 1; 180 | for (;;) { 181 | if (pow & 1) 182 | result *= x; 183 | pow >>= 1; 184 | if (!pow) 185 | break; 186 | x *= x; 187 | } 188 | 189 | return result; 190 | } 191 | 192 | std::string 193 | flag_values(char delimiter = ' ', bool skip_empty = false, bool include_flagfile = true); 194 | 195 | // If the -o output flag is set, this writes a small shell script <output>.meta containing the 196 | // command line used to generate the output. 197 | void write_output_meta(); 198 | 199 | // A simple wrapper around std::apply that applies a given lambda on each element of a tuple. 200 | template <class F, class T> 201 | void apply_tuple(F &&f, T &tuple_t) { 202 | std::apply([&](auto &...t) { (f(t), ...); }, tuple_t); 203 | } 204 | 205 | 206 | // A simple wrapper around std::apply that applies f on pairs of elements of two tuples. 207 | template <class F, class T, class U> 208 | void apply_tuple(F &&f, T &tuple_t, U &tuple_u) { 209 | std::apply([&](auto &...t) { std::apply([&](auto &...u) { (f(t, u), ...); }, tuple_u); }, 210 | tuple_t); 211 | } 212 | 213 | 214 | std::pair<double, double> avg_stddev(const std::vector<double> &v); 215 | 216 | // v must be sorted. 217 | double median(const std::vector<double> &v); 218 | 219 | } // namespace ts 220 | --------------------------------------------------------------------------------
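
The fractional-rank and correlation helpers in util/spearman.hpp can be exercised directly. The sketch below is a minimal, hypothetical driver (not part of the repository) that assumes it is compiled with the repository root on the include path; the expected numbers come straight from tests/util/test_spearman.cpp (Rankify and SomeValues).

#include "util/spearman.hpp"

#include <iostream>
#include <vector>

int main() {
    // Ties share the average of the positions they occupy:
    // {1,1,2,2,3,3,4,5,5} -> {1.5, 1.5, 3.5, 3.5, 5.5, 5.5, 7, 8.5, 8.5}.
    std::vector<double> a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 };
    for (double rank : rankify(a))
        std::cout << rank << ' ';
    std::cout << '\n';

    // Spearman correlation = Pearson correlation of the two rank vectors.
    std::vector<double> x = { 35, 23, 47, 17, 10, 43, 9, 6, 28 };
    std::vector<double> y = { 30, 33, 45, 23, 8, 49, 12, 4, 31 };
    std::cout << spearman(x, y) << '\n'; // 0.9
}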
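
util/multivec.hpp builds nested vectors (Vec2D/Vec3D) and applies a transformer element-wise. A small hypothetical usage sketch, assuming the template signatures shown in util/multivec.hpp and util/transformer.hpp above:

#include "util/multivec.hpp" // also pulls in util/transformer.hpp

#include <iostream>

int main() {
    // A 2 x 3 matrix of doubles, every entry initialised to 1.5.
    ts::Vec2D<double> m = ts::new2D<double>(2, 3, 1.5);

    // atan_scaler squashes unbounded sketch values into (-pi/2, pi/2);
    // apply() walks the nested vectors and transforms each entry in place.
    ts::atan_scaler<double> squash;
    ts::apply(m, squash);

    std::cout << m[0][0] << '\n'; // atan(1.5) ~= 0.9828
}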
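
util/progress.* and util/timer.* are meant to be used together in the experiment loops: progress_bar is a static, OpenMP-safe counter, and Timer is an RAII guard whose per-thread durations feed Timer::summary(). A hypothetical sketch; sketch_one_sequence and compare_to_reference are placeholders, and the file must be compiled with OpenMP and linked against progress.cpp and timer.cpp.

#include "util/progress.hpp"
#include "util/timer.hpp"

#include <iostream>

// Placeholders for one unit of work (not part of the repository).
static void sketch_one_sequence(int) {}
static void compare_to_reference(int) {}

int main() {
    const int n = 1000;
    ts::progress_bar::init(n); // default bar length of 50 characters

#pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        { ts::Timer timer("tensor_sketch"); sketch_one_sequence(i); }       // sketching time
        { ts::Timer timer("tensor_sketch_dist"); compare_to_reference(i); } // distance time
        ts::progress_bar::iter(); // guarded by '#pragma omp critical' internally
    }

    // One CSV row per sketch method, pairing "<name>" with "<name>_dist" timings.
    std::cout << ts::Timer::summary();
}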
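
discretize in util/transformer.hpp places its num_bins cut points at tan(pi*((b+0.5)/num_bins - 0.5)), so bins are narrow near zero and wide in the tails, and pads them with -max/+max sentinels; transform() then returns the index of the first edge above the value. A small sketch of the resulting mapping (edge values are approximate):

#include <algorithm> // transformer.hpp relies on std::upper_bound being visible

#include "util/transformer.hpp"

#include <iostream>

int main() {
    // num_bins = 4 gives edges ~ -2.414, -0.414, 0.414, 2.414 plus the two sentinels.
    ts::discretize<double> disc(4);
    for (double v : { -3.0, 0.0, 0.5, 10.0 })
        std::cout << disc.transform(v) << ' '; // prints: 1 3 4 5
    std::cout << '\n';
}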
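
The k-mer packing in ts::seq2kmer treats the first character of each window as the least-significant base-S digit and reuses the previous value to compute the next one in O(1); the 4*4 "k-mers of length 2 over the DNA alphabet" mentioned in the WeightedMinHash tests are exactly this encoding. A hypothetical sketch of the encoding for DNA 2-mers, assuming the <chr, kmer> template-parameter order documented above and linking against util/timer.cpp because of the internal Timer guard:

#include "util/utils.hpp"

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // "ACGT" with A,C,G,T encoded as 0,1,2,3 over an alphabet of size 4.
    std::vector<uint8_t> seq = { 0, 1, 2, 3 };

    // 2-mers: "AC" -> 0 + 1*4 = 4, "CG" -> 1 + 2*4 = 9, "GT" -> 2 + 3*4 = 14.
    std::vector<uint64_t> kmers =
            ts::seq2kmer<uint8_t, uint64_t>(seq, /*kmer_size=*/2, /*alphabet_size=*/4);

    for (uint64_t k : kmers)
        std::cout << k << ' '; // prints: 4 9 14
    std::cout << '\n';
}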
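
ts::edit_distance keeps only a single DP row (costs) of size n+1, and ts::lcs_distance is defined as |s1| + |s2| - 2*LCS(s1, s2). A hypothetical check on the classic kitten/sitting pair, again assuming util/timer.cpp is linked for the Timer guard:

#include "util/utils.hpp"

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::string s = "kitten", t = "sitting";
    std::vector<char> a(s.begin(), s.end()), b(t.begin(), t.end());

    std::cout << ts::edit_distance(a, b) << '\n'; // 3 (substitute k->s, e->i, insert g)
    std::cout << ts::lcs_distance(a, b) << '\n';  // 5 (LCS "ittn" has length 4: 6 + 7 - 2*4)
}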
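
ts::int_pow is square-and-multiply on unsigned integers (enforced by the is_u_integral constraint): the exponent is consumed bit by bit while the base is squared, so the loop runs O(log pow) times. A short usage sketch:

#include "util/utils.hpp"

#include <cstdint>
#include <iostream>

int main() {
    std::cout << ts::int_pow<uint64_t>(3, 10) << '\n'; // 59049
    std::cout << ts::int_pow<uint64_t>(2, 63) << '\n'; // 9223372036854775808
}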