├── .clang-format ├── .github └── workflows │ └── cmake.yml ├── .gitignore ├── .gitmodules ├── .hgignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── experiments_flags ├── experiments_main.cpp ├── legacy ├── align_fasta.cpp ├── cross_comp.cpp ├── dists_pairwise.cpp ├── dtw.cpp ├── long_seqs.cpp ├── tensor.hpp ├── tensor_disc.hpp ├── tensor_slide.hpp ├── tensor_slide2.hpp ├── test_tensor_disc.cpp ├── test_typeinfo.cpp ├── vectool.cpp └── vectool.hpp ├── paper_gen.py ├── phylogeny ├── upgma.cpp └── upgma.hpp ├── pyproject.toml ├── python ├── init_numba_env.sh └── lib │ ├── __init__.py │ ├── base.py │ ├── cds.py │ ├── sequence.py │ ├── tensor_embedding.py │ ├── tensor_sketch.py │ ├── tensor_sketch_gpu.py │ └── util.py ├── sequence ├── alphabets.cpp ├── alphabets.hpp ├── fasta_io.hpp └── sequence_generator.hpp ├── sequence_generator_main.cpp ├── sketch ├── dim_reduce.hpp ├── edit_distance.hpp ├── hash_base.cpp ├── hash_base.hpp ├── hash_min.hpp ├── hash_ordered.hpp ├── hash_weighted.hpp ├── sketch_base.hpp ├── tensor.hpp ├── tensor_block.hpp ├── tensor_embedding.hpp ├── tensor_slide.hpp └── tensor_slide_flat.hpp ├── sketch_main.cpp ├── tests ├── phylogeny │ ├── data.txt │ └── test_upgma.cpp ├── sketch │ ├── test_hash_base.cpp │ ├── test_min_hash.cpp │ ├── test_ordered_min_hash.cpp │ ├── test_tensor.cpp │ ├── test_tensor_block.cpp │ ├── test_tensor_slide.cpp │ └── test_weighted_min_hash.cpp └── util │ ├── test_multivec.cpp │ └── test_spearman.cpp ├── third_party └── murmur_hash │ ├── CMakeLists.txt │ ├── murmur_hash3.cpp │ └── murmur_hash3.hpp └── util ├── multivec.hpp ├── progress.cpp ├── progress.hpp ├── spearman.hpp ├── timer.cpp ├── timer.hpp ├── transformer.hpp ├── utils.cpp └── utils.hpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: DontAlign 9 | AlignOperands: false 10 | AlignTrailingComments: false 11 | AllowAllParametersOfDeclarationOnNextLine: false 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Inline 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: true 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: false 22 | BraceWrapping: 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterStruct: false 27 | AfterUnion: false 28 | AfterExternBlock: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | SplitEmptyFunction: false 33 | BreakBeforeBinaryOperators: All 34 | BreakBeforeBraces: Attach 35 | BreakBeforeInheritanceComma: false 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | BreakConstructorInitializers: BeforeColon 39 | BreakStringLiterals: true 40 | ColumnLimit: 100 41 | CompactNamespaces: false 42 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 43 | ConstructorInitializerIndentWidth: 4 44 | ContinuationIndentWidth: 8 45 | Cpp11BracedListStyle: false 46 | DerivePointerAlignment: false 47 | DisableFormat: false 48 | ExperimentalAutoDetectBinPacking: false 49 | FixNamespaceComments: true 50 | ForEachMacros: 51 | - foreach 52 | - Q_FOREACH 
53 | - BOOST_FOREACH 54 | IndentCaseLabels: true 55 | IndentPPDirectives: None 56 | IndentWidth: 4 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: false 59 | MaxEmptyLinesToKeep: 2 60 | NamespaceIndentation: None 61 | PenaltyBreakAssignment: 2 62 | PenaltyBreakBeforeFirstCallParameter: 70 63 | PenaltyBreakComment: 300 64 | PenaltyBreakFirstLessLess: 120 65 | PenaltyBreakString: 1000 66 | PenaltyBreakTemplateDeclaration: 1 67 | PenaltyExcessCharacter: 5000 68 | PenaltyReturnTypeOnItsOwnLine: 60 69 | PointerAlignment: Right 70 | RawStringFormats: 71 | - Delimiters: 72 | - 'pb' 73 | Language: TextProto 74 | BasedOnStyle: LLVM 75 | ReflowComments: true 76 | SortIncludes: true 77 | SortUsingDeclarations: false 78 | SpaceAfterCStyleCast: false 79 | SpaceAfterTemplateKeyword: true 80 | SpaceBeforeAssignmentOperators: true 81 | SpaceBeforeCpp11BracedList: true 82 | SpaceBeforeCtorInitializerColon: true 83 | SpaceBeforeInheritanceColon: true 84 | SpaceBeforeParens: ControlStatements 85 | SpaceBeforeRangeBasedForLoopColon: true 86 | SpaceInEmptyParentheses: false 87 | SpacesBeforeTrailingComments: 1 88 | SpacesInAngles: false 89 | SpacesInCStyleCastParentheses: false 90 | SpacesInContainerLiterals: true 91 | SpacesInParentheses: false 92 | SpacesInSquareBrackets: false 93 | Standard: Cpp11 94 | TabWidth: 4 95 | UseTab: Never 96 | ... 97 | -------------------------------------------------------------------------------- /.github/workflows/cmake.yml: -------------------------------------------------------------------------------- 1 | name: CMake 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | # The CMake configure and build commands are platform agnostic and should work equally 8 | # well on Windows or Mac. You can convert this to a matrix build if you need 9 | # cross-platform coverage. 10 | # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | build_type: [Debug, Release] 16 | compiler: [g++-8] 17 | include: 18 | - compiler: g++-8 19 | cxx: g++-8 20 | cc: gcc-8 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - name: checkout submodules 26 | run: git submodule update --init --recursive 27 | 28 | - name: create build dir 29 | run: mkdir ${{runner.workspace}}/build 30 | 31 | - name: Configure CMake 32 | working-directory: ${{runner.workspace}}/build 33 | shell: bash 34 | run: | 35 | export CC=$(which ${{ matrix.cc }}) 36 | export CXX=$(which ${{ matrix.cxx }}) 37 | cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 38 | 39 | - name: Build 40 | working-directory: ${{runner.workspace}}/build 41 | shell: bash 42 | # Execute the build. You can specify a specific target with "--target " 43 | run: make -j 44 | 45 | - name: Test 46 | working-directory: ${{runner.workspace}}/build 47 | shell: bash 48 | # Execute tests defined by the CMake configuration. 
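      # (CMakeLists.txt also registers these tests with CTest via gtest_discover_tests, so running `ctest` from the build directory should work as well; this workflow simply invokes the gtest binary directly.)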
49 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 50 | run: ./tests 51 | 52 | build-macos: 53 | 54 | runs-on: macos-latest 55 | 56 | strategy: 57 | matrix: 58 | build_type: [Debug, Release] 59 | 60 | steps: 61 | - uses: actions/checkout@v2 62 | 63 | - name: checkout submodules 64 | run: git submodule update --init --recursive 65 | 66 | - name: install dependencies 67 | run: brew install libomp 68 | 69 | - name: create build dir 70 | run: mkdir ${{runner.workspace}}/build 71 | 72 | - name: Configure CMake 73 | working-directory: ${{runner.workspace}}/build 74 | shell: bash 75 | run: | 76 | cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 77 | 78 | - name: Build 79 | working-directory: ${{runner.workspace}}/build 80 | shell: bash 81 | # Execute the build. You can specify a specific target with "--target " 82 | run: make -j 83 | 84 | - name: Test 85 | working-directory: ${{runner.workspace}}/build 86 | shell: bash 87 | # Execute tests defined by the CMake configuration. 88 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 89 | run: ./tests 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cmake-build-debug 2 | cmake-build-release 3 | .idea 4 | build 5 | compile_commands.json 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "googletest"] 2 | path = googletest 3 | url = https://github.com/google/googletest 4 | [submodule "third_party/googletest"] 5 | path = third_party/googletest 6 | url = https://github.com/google/googletest 7 | [submodule "third_party/gflags"] 8 | path = third_party/gflags 9 | url = https://github.com/gflags/gflags 10 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | third_party 2 | legacy 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/psf/black 8 | rev: 20.8b1 9 | hooks: 10 | - id: black 11 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(sequence_sketching) 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1) 5 | find_package(OpenMP REQUIRED) 6 | 7 | include_directories(.) 
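# Headers are included relative to the repository root, e.g. #include "util/utils.hpp" or #include "sketch/tensor_slide.hpp".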
8 | 9 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 10 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 11 | 12 | # Google Flags Library 13 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/gflags EXCLUDE_FROM_ALL) 14 | 15 | # Murmur 16 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/murmur_hash EXCLUDE_FROM_ALL) 17 | 18 | file(GLOB util_files "util/*.cpp") 19 | add_library(util ${util_files}) 20 | target_link_libraries(util gflags OpenMP::OpenMP_CXX) 21 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 22 | target_link_libraries(util stdc++fs) 23 | endif() 24 | 25 | file(GLOB sequence_files "sequence/*.cpp") 26 | add_library(sequence ${sequence_files}) 27 | target_link_libraries(sequence gflags OpenMP::OpenMP_CXX) 28 | 29 | file(GLOB sketch_files "sketch/*.cpp") 30 | add_library(sketch_lib ${sketch_files}) 31 | target_link_libraries(sketch_lib murmur_lib OpenMP::OpenMP_CXX) 32 | 33 | file(GLOB phylogeny_files "phylogeny/*.cpp") 34 | add_library(phylogeny_lib ${phylogeny_files}) 35 | 36 | add_executable(experiments experiments_main.cpp ) 37 | target_link_libraries(experiments sequence util sketch_lib) 38 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 39 | target_link_libraries(experiments stdc++fs) 40 | endif() 41 | 42 | add_executable(sketch sketch_main.cpp) 43 | target_link_libraries(sketch sequence util sketch_lib) 44 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 45 | target_link_libraries(sketch stdc++fs) 46 | endif() 47 | 48 | add_executable(seqgen sequence_generator_main.cpp) 49 | target_link_libraries(seqgen sequence util sketch_lib) 50 | 51 | # TESTS 52 | string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Werror -Wfatal-errors -msse4") 53 | 54 | 55 | enable_testing() 56 | include(GoogleTest) 57 | 58 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest EXCLUDE_FROM_ALL) 59 | target_compile_options(gtest_main PRIVATE -w) 60 | target_compile_options(gtest PRIVATE -w) 61 | 62 | file(GLOB test_files "tests/**/*.cpp") 63 | 64 | add_executable(tests ${test_files}) 65 | target_link_libraries(tests gtest_main gtest gmock util sketch_lib phylogeny_lib) 66 | target_include_directories(tests PRIVATE "include") 67 | 68 | gtest_discover_tests(tests) 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ratschlab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Similarity Estimation via Tensor Sketching 2 | This repository contains the reference implementation for the Tensor Sketching method, which can be used to estimate sequence similarity without needing to align the sequences. 3 | 4 | The method is described in the paper by Amir Joudaki et al. [`Fast Alignment-Free Similarity Estimation By Tensor 5 | Sketching`](https://www.biorxiv.org/content/10.1101/2020.11.13.381814v5.full). 6 | 7 | ## Download and build 8 | ``` 9 | git clone https://github.com/ratschlab/Project2020-seq-tensor-sketching 10 | cd Project2020-seq-tensor-sketching 11 | git submodule update --init -- 12 | mkdir build; cd build 13 | cmake .. 14 | make -j 15 | ``` 16 | 17 | ## Run 18 | The `sketch` binary expects as input a directory containing fasta files (with extension `.fa`, `.fasta` or `.fna`), 19 | each fasta file containing a single sequence: 20 | ```bash 21 | ./sketch -i /tmp/ -o /tmp/sketch_triangle 22 | ``` 23 | 24 | The output file will contain the number of sequences on the first line and the pairwise distances between each 25 | sequence on the following lines, e.g.: 26 | ``` 27 | 4 28 | test2.fa 29 | test3.fa 0.28125 30 | test4.fa 1.06314 0.915816 31 | test1.fa 0 0.28125 1.06314 32 | ``` 33 | For example, the distance between test1.fa and test2.fa is 0 (the lower the distance, the more similar the sequences). 34 | 35 | ### Flags 36 | To see all available flags, run: 37 | ``` 38 | ./sketch --help 39 | ``` 40 | Here are the most important flags: 41 | 42 | `-m`, `--sketch_method`: the sketching method to use; can be one of `MH, WMH, OMH, TS, TSB or TSS`, which correspond to 43 | min-hash, weighted-min-hash, ordered-min-hash, tensor-sketch, tensor-block and tensor-slide-sketch, respectively. 44 | 45 | `-k`, `--kmer_length`: the length of the k-mer used in the sketching method (default=3) 46 | 47 | `--embed_dim`: the dimension of the embedded space used in all sketching methods (default=4) 48 | 49 | `-t, --tuple-length`: the ordered tuple length, not used in Min-hash and Weighted-min-hash (default=3) 50 | 51 | `--block_size`: only consider tuples made out of block-size continuous characters for Tensor sketch (default=1) 52 | 53 | `-w, --window_size`: the size of the sliding window in Tensor Slide Sketch (default=32) 54 | 55 | `--max_len`: the maximum accepted sequence length for Ordered and Weighted min-hash (default=32) 56 | 57 | `-s, --stride`: the shift step for the sliding window (default=8) 58 | ## Contributing 59 | 60 | - The python code in the repository is formatted using [black](https://github.com/psf/black). 61 | To enable the pre-commit hook, install [pre-commit](https://pre-commit.com/) 62 | with `pip` or your package manager (Arch: `python-pre-commit`) and run 63 | `pre-commit install` from the repository root. All python code will now automatically be formatted 64 | on each commit.
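Combining the flags documented above, an end-to-end tensor-slide-sketch run could look like the example below. The input/output paths and parameter values are placeholders rather than recommended settings; only the flag names are taken from the list above.
```bash
# Sketch every fasta file in a directory with tensor-slide-sketch (TSS).
# Paths and numeric values are illustrative only.
./sketch -i /path/to/fasta_dir -o /tmp/sketch_triangle -m TSS -k 4 --embed_dim 16 -t 3 -w 100 -s 10
```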
65 | -------------------------------------------------------------------------------- /experiments_flags: -------------------------------------------------------------------------------- 1 | # sequence generation 2 | --alphabet_size=4 3 | --seq_len=10000 4 | --num_seqs=1000 5 | --group_size=2 6 | --phylogeny_shape=path 7 | --fix_len=true 8 | --max_mutation_rate=1.0 9 | --min_mutation_rate=0.0 10 | # sketching methods 11 | --embed_dim=16 12 | --kmer_size=4 13 | --tuple_length=3 14 | --block_size=2 15 | --window_size=2000 16 | --stride=1000 17 | --tss_dim=4 18 | --max_len=1000 19 | # change this to uniform for final experiments 20 | --hash_alg=murmur 21 | --transform=none 22 | --num_bins=256 23 | # execution & I/O 24 | --num_threads=0 25 | --o=/tmp/ts 26 | -------------------------------------------------------------------------------- /legacy/align_fasta.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | alphabet_size = 5; 17 | original_alphabet_size = alphabet_size; 18 | alphabet_size = int_pow(alphabet_size, kmer_size); 19 | } 20 | 21 | void override_post() override { 22 | // tensor_slide_params.alphabet_size = original_alphabet_size; 23 | // tensor_slide_params.tup_len = 2; 24 | tensor_slide_params.embed_dim = 50; 25 | tensor_slide_params.num_bins = 250; 26 | } 27 | }; 28 | 29 | struct HGModule { 30 | Vec3D dists; 31 | std::ifstream infile; 32 | 33 | std::map chr2int = { { 'a', 1 }, { 'c', 2 }, { 'g', 3 }, { 't', 4 }, { 'n', 0 }, 34 | { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; 35 | std::map chr2int_mask 36 | = { { 'a', -1 }, { 'c', -2 }, { 'g', -3 }, { 't', -4 }, { 'n', 0 }, 37 | { 'A', 1 }, { 'C', 2 }, { 'G', 3 }, { 'T', 4 }, { 'N', 0 } }; 38 | 39 | BasicModules basicModules; 40 | KmerModule kmerModules; 41 | 42 | void parse(int argc, char **argv) { 43 | basicModules.parse(argc, argv); 44 | basicModules.alphabet_size = 5; 45 | basicModules.models_init(); 46 | kmerModules.parse(argc, argv); 47 | kmerModules.models_init(); 48 | } 49 | 50 | 51 | string read_first() { 52 | string hg_file = "data/sub2.fa"; 53 | infile = std::ifstream(hg_file); 54 | string line; 55 | std::getline(infile, line); 56 | return line; 57 | } 58 | 59 | template 60 | string read_next_seq(std::vector &seq, std::vector mask) { 61 | seq.clear(); 62 | string line; 63 | while (std::getline(infile, line)) { 64 | if (line[0] == '>') { 65 | return line; 66 | } else { 67 | for (char c : line) { 68 | seq.push_back(chr2int[c]); 69 | mask.push_back((chr2int[c] > 0)); 70 | } 71 | } 72 | } 73 | return ""; 74 | } 75 | 76 | void compute_sketches() { 77 | Vec2D slide_sketch; 78 | std::vector seq, kmer_seq; 79 | std::vector mask; 80 | string name = read_first(), next_name; 81 | while (not name.empty()) { 82 | next_name = read_next_seq(seq, mask); 83 | seq2kmer(seq, kmer_seq, basicModules.kmer_size, basicModules.alphabet_size); 84 | tensor_slide_sketch(kmer_seq, slide_sketch, kmerModules.tensor_slide_params); 85 | save_output(name, slide_sketch); 86 | name = next_name; 87 | } 88 | } 89 | 90 | 91 | void save_output(string seq_name, const Vec2D &sketch) { 92 | std::ofstream fo; 93 | seq_name = string("data/sketch_") + seq_name.substr(1) + "_" + 
std::to_string(sketch.size()) 94 | + "_" + std::to_string(sketch[0].size()) + ".txt"; 95 | fo.open(seq_name); 96 | // fo << sketch.size() << ", " << sketch[0].size() << "\n"; 97 | for (int m = 0; m < sketch.size(); m++) { 98 | for (int i = 0; i < sketch[m].size(); i++) { 99 | fo << sketch[m][i] << ","; 100 | } 101 | fo << "\n"; 102 | } 103 | fo.close(); 104 | } 105 | }; 106 | 107 | int main(int argc, char *argv[]) { 108 | HGModule experiment; 109 | experiment.parse(argc, argv); 110 | experiment.compute_sketches(); 111 | // experiment.save_output(); 112 | } 113 | -------------------------------------------------------------------------------- /legacy/cross_comp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModule { 13 | int original_alphabet_size {}; 14 | 15 | void override_module_params() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | }; 20 | 21 | template 22 | struct SeqGenModule { 23 | Vec2D seqs; 24 | std::vector seq_names; 25 | string test_id; 26 | Vec2D kmer_seqs; 27 | Vec2D mh_sketch; 28 | Vec2D wmh_sketch; 29 | Vec2D omh_sketch; 30 | Vec2D ten_sketch; 31 | Vec3D slide_sketch; 32 | Vec3D dists; 33 | 34 | BasicModule basicModules; 35 | KmerModule kmerModules; 36 | string output; 37 | 38 | void parse(int argc, char **argv) { 39 | basicModules.parse(argc, argv); 40 | basicModules.models_init(); 41 | kmerModules.parse(argc, argv); 42 | kmerModules.models_init(); 43 | output = basicModules.directory + basicModules.output; 44 | } 45 | 46 | void write_fasta(Vec2D &seq_vec, bool Abc = false) { 47 | std::ofstream fo; 48 | fo.open(output + "seqs.fa"); 49 | test_id = "#" + std::to_string(random()); 50 | fo << test_id << "\n"; 51 | for (int si = 0; si < seq_vec.size(); si++) { 52 | fo << "> " << si << "\n"; 53 | for (int i = 0; i < seq_vec[i].size(); i++) { 54 | if (Abc) { 55 | fo << (char)(seq_vec[si][i] + (int)'A'); 56 | } else { 57 | fo << seq_vec[si][i] << ","; 58 | } 59 | } 60 | fo << "\n\n"; 61 | } 62 | fo.close(); 63 | } 64 | 65 | void read_fasta(Vec2D &seq_vec) { 66 | seq_vec.clear(); 67 | string file = (output + "/seqs.fa"); 68 | std::ifstream infile = std::ifstream(file); 69 | string line; 70 | 71 | std::getline(infile, line); 72 | if (line[0] == '#') { 73 | test_id = line; 74 | std::getline(infile, line); 75 | } 76 | while (line[0] != '>') { 77 | std::cout << line << "\n"; 78 | std::getline(infile, line); 79 | } 80 | string name = line; 81 | std::vector seq; 82 | while (std::getline(infile, line)) { 83 | if (line[0] == '>') { 84 | seq_vec.push_back(seq); 85 | seq_names.push_back(name); 86 | seq.clear(); 87 | name = line; 88 | } else if (!line.empty()) { 89 | for (char c : line) { 90 | int ic = c - (int)'A'; 91 | seq.push_back(ic); 92 | } 93 | } 94 | } 95 | } 96 | 97 | 98 | void generate_sequences() { 99 | if (basicModules.mutation_pattern == "pairs") { 100 | basicModules.seq_gen.genseqs_pairs(seqs); 101 | } else if (basicModules.mutation_pattern == "linear") { 102 | basicModules.seq_gen.genseqs_linear(seqs); 103 | } else if (basicModules.mutation_pattern == "tree") { 104 | basicModules.seq_gen.genseqs_tree(seqs, basicModules.sequence_seeds); 105 | } else { 106 | std::cerr << " mutation pattern `" << basicModules.mutation_pattern 107 | << "` is 
not valid\n"; 108 | exit(1); 109 | } 110 | write_fasta(seqs); 111 | } 112 | 113 | void compute_sketches() { 114 | int num_seqs = seqs.size(); 115 | kmer_seqs.resize(num_seqs); 116 | wmh_sketch.resize(num_seqs); 117 | mh_sketch.resize(num_seqs); 118 | omh_sketch.resize(num_seqs); 119 | ten_sketch.resize(num_seqs); 120 | slide_sketch.resize(num_seqs); 121 | for (int si = 0; si < num_seqs; si++) { 122 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 123 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 124 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 125 | ordered_minhash_flat(seqs[si], omh_sketch[si], basicModules.omh_params); 126 | tensor_sketch(seqs[si], ten_sketch[si], basicModules.tensor_params); 127 | tensor_slide_sketch(seqs[si], slide_sketch[si], basicModules.tensor_slide_params); 128 | } 129 | } 130 | void compute_pairwise_dists() { 131 | int num_seqs = seqs.size(); 132 | if (basicModules.mutation_pattern == "pairs") { 133 | dists = new3D(8, num_seqs, 1, -1); 134 | for (int i = 0; i < seqs.size(); i += 2) { 135 | int j = i + 1; 136 | dists[0][i][0] = edit_distance(seqs[i], seqs[j]); 137 | dists[1][i][0] = hamming_dist(mh_sketch[i], mh_sketch[j]); 138 | dists[2][i][0] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 139 | dists[3][i][0] = hamming_dist(omh_sketch[i], omh_sketch[j]); 140 | dists[4][i][0] = l1_dist(ten_sketch[i], ten_sketch[j]); 141 | dists[5][i][0] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 142 | } 143 | } else { 144 | dists = new3D(8, num_seqs, num_seqs, 0); 145 | for (int i = 0; i < seqs.size(); i++) { 146 | for (int j = i + 1; j < seqs.size(); j++) { 147 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 148 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 149 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 150 | dists[3][i][j] = hamming_dist(omh_sketch[i], omh_sketch[j]); 151 | dists[4][i][j] = l1_dist(ten_sketch[i], ten_sketch[j]); 152 | dists[5][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 153 | } 154 | } 155 | } 156 | } 157 | 158 | void save_output() { 159 | std::vector method_names 160 | = { "ED", "MH", "WMH", "OMH", "TenSketch", "TenSlide", "Ten2", "Ten2Slide" }; 161 | std::ofstream fo; 162 | 163 | fo.open(output + "conf.csv"); 164 | fo << basicModules.config(); 165 | fo.close(); 166 | 167 | int num_seqs = seqs.size(); 168 | for (int m = 0; m < 6; m++) { 169 | fo.open(output + "dists/" + method_names[m] + ".txt"); 170 | assert(fo.is_open()); 171 | if (basicModules.mutation_pattern == "pairs") { 172 | for (int i = 0; i < num_seqs; i += 2) { 173 | int j = i + 1; 174 | fo << i << ", " << j << ", " << dists[m][i][0] << "\n"; 175 | } 176 | } else { 177 | for (int i = 0; i < num_seqs; i++) { 178 | for (int j = i + 1; j < seqs.size(); j++) { 179 | fo << i << ", " << j << ", " << dists[m][i][j] << "\n"; 180 | } 181 | } 182 | } 183 | fo.close(); 184 | } 185 | 186 | fo.open(output + "sketches/mh.txt"); 187 | assert(fo.is_open()); 188 | for (int si = 0; si < num_seqs; si++) { 189 | fo << ">> seq " << si << "\n"; 190 | for (const auto &e : mh_sketch[si]) { 191 | fo << e << ", "; 192 | } 193 | fo << "\n"; 194 | } 195 | fo.close(); 196 | 197 | fo.open(output + "sketches/wmh.txt"); 198 | assert(fo.is_open()); 199 | for (int si = 0; si < num_seqs; si++) { 200 | fo << ">> seq " << si << "\n"; 201 | for (const auto &e : wmh_sketch[si]) { 202 | fo << e << ", "; 203 | } 204 | fo << "\n"; 205 | } 206 | fo.close(); 207 | 208 | fo.open(output + 
"sketches/omh.txt"); 209 | assert(fo.is_open()); 210 | for (int si = 0; si < num_seqs; si++) { 211 | fo << ">> seq " << si << "\n"; 212 | for (const auto &e : omh_sketch[si]) { 213 | fo << e << ", "; 214 | } 215 | fo << "\n"; 216 | } 217 | fo.close(); 218 | 219 | fo.open(output + "sketches/ten.txt"); 220 | assert(fo.is_open()); 221 | for (int si = 0; si < seqs.size(); si++) { 222 | fo << ">> seq " << si << "\n"; 223 | for (const auto &e : ten_sketch[si]) { 224 | fo << e << ", "; 225 | } 226 | fo << "\n"; 227 | } 228 | fo.close(); 229 | 230 | fo.open(output + "sketches/ten_slide.txt"); 231 | for (int si = 0; si < seqs.size(); si++) { 232 | auto &sk = slide_sketch[si]; 233 | for (int dim = 0; dim < sk.size(); dim++) { 234 | fo << ">> seq: " << si << ", dim: " << dim << "\n"; 235 | for (auto &item : sk[dim]) 236 | fo << item << ", "; 237 | fo << "\n"; 238 | } 239 | fo << "\n"; 240 | } 241 | fo.close(); 242 | } 243 | }; 244 | 245 | int main(int argc, char *argv[]) { 246 | SeqGenModule experiment; 247 | experiment.parse(argc, argv); 248 | if (experiment.basicModules.show_help) { 249 | std::cout << experiment.basicModules.description(); 250 | } else { 251 | experiment.generate_sequences(); 252 | experiment.compute_sketches(); 253 | experiment.compute_pairwise_dists(); 254 | experiment.save_output(); 255 | } 256 | return 0; 257 | } 258 | -------------------------------------------------------------------------------- /legacy/dists_pairwise.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | tensor_slide_params.alphabet_size = original_alphabet_size; 22 | tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct TestModule1 { 27 | Vec2D seqs; 28 | std::vector seq_names; 29 | string test_id; 30 | Vec2D kmer_seqs; 31 | Vec2D wmh_sketch; 32 | Vec2D mh_sketch; 33 | Vec3D omh_sketch; 34 | Vec2D ten_sketch; 35 | Vec3D slide_sketch; 36 | Vec2D ten_new_sketch; 37 | Vec3D ten_new_slide_sketch; 38 | Vec3D dists; 39 | 40 | BasicModules basicModules; 41 | KmerModule kmerModules; 42 | NewModules newModules; 43 | 44 | void parse(int argc, char **argv) { 45 | basicModules.parse(argc, argv); 46 | basicModules.models_init(); 47 | kmerModules.parse(argc, argv); 48 | kmerModules.models_init(); 49 | newModules.parse(argc, argv); 50 | newModules.model_init(); 51 | } 52 | 53 | 54 | template 55 | void write_fasta(Vec2D &seq_vec) { 56 | std::ofstream fo; 57 | fo.open(out_path + "/seqs.fa"); 58 | test_id = "#" + std::to_string(random()); 59 | fo << test_id << "\n"; 60 | for (int si = 0; si < seq_vec.size(); si++) { 61 | fo << "> " << si << "\n"; 62 | for (int i = 0; i < seq_vec[i].size(); i++) { 63 | fo << (char)(seq_vec[si][i] + (int)'A'); 64 | } 65 | fo << "\n\n"; 66 | } 67 | fo.close(); 68 | } 69 | 70 | template 71 | void read_fasta(Vec2D &seq_vec) { 72 | seq_vec.clear(); 73 | string file = (out_path + "/seqs.fa"); 74 | std::ifstream infile = std::ifstream(file); 75 | string line; 76 | 77 | std::getline(infile, line); 78 | if (line[0] == '#') { 79 | test_id = line; 80 | std::getline(infile, line); 81 | } 82 | while 
(line[0] != '>') { 83 | std::cout << line << "\n"; 84 | std::getline(infile, line); 85 | } 86 | string name = line; 87 | std::vector seq; 88 | while (std::getline(infile, line)) { 89 | if (line[0] == '>') { 90 | seq_vec.push_back(seq); 91 | seq_names.push_back(name); 92 | seq.clear(); 93 | name = line; 94 | } else if (!line.empty()) { 95 | for (char c : line) { 96 | int ic = c - (int)'A'; 97 | seq.push_back(ic); 98 | } 99 | } 100 | } 101 | } 102 | 103 | void generate_sequences() { 104 | basicModules.seq_gen.gen_seqs(seqs); 105 | write_fasta(seqs); 106 | // read_fasta(seqs); 107 | } 108 | 109 | void compute_sketches() { 110 | int num_seqs = seqs.size(); 111 | kmer_seqs.resize(num_seqs); 112 | wmh_sketch.resize(num_seqs); 113 | mh_sketch.resize(num_seqs); 114 | omh_sketch.resize(num_seqs); 115 | ten_sketch.resize(num_seqs); 116 | slide_sketch.resize(num_seqs); 117 | ten_new_sketch.resize(num_seqs); 118 | ten_new_slide_sketch.resize(num_seqs); 119 | for (int si = 0; si < num_seqs; si++) { 120 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 121 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 122 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 123 | ordered_minhash(kmer_seqs[si], omh_sketch[si], kmerModules.omh_params); 124 | tensor_sketch(kmer_seqs[si], ten_sketch[si], kmerModules.tensor_params); 125 | tensor_slide_sketch(seqs[si], slide_sketch[si], kmerModules.tensor_slide_params); 126 | 127 | tensor2_sketch(kmer_seqs[si], ten_new_sketch[si], newModules.ten_2_params); 128 | tensor2_slide_sketch(kmer_seqs[si], ten_new_slide_sketch[si], 129 | newModules.ten_2_slide_params); 130 | } 131 | std::ofstream fo; 132 | fo.open(out_path + "/sketches_Ten2.txt"); 133 | fo << test_id << "\n"; 134 | for (int si = 0; si < num_seqs; si++) { 135 | for (int i = 0; i < ten_new_sketch[si].size(); i++) { 136 | fo << ten_new_sketch[si][i]; 137 | } 138 | fo << "\n"; 139 | } 140 | fo.close(); 141 | fo.open(out_path + "/sketches_Ten2_slide.txt"); 142 | fo << test_id << "\n"; 143 | for (int si = 0; si < num_seqs; si++) { 144 | fo << ">> " << si << "\n"; 145 | for (int i = 0; i < ten_new_slide_sketch[si].size(); i++) { 146 | for (int j = 0; j < ten_new_slide_sketch[si][i].size(); j++) 147 | fo << ten_new_slide_sketch[si][i][j] << ", "; 148 | fo << "\n"; 149 | } 150 | fo << "\n"; 151 | } 152 | fo.close(); 153 | } 154 | void compute_dists() { 155 | std::ofstream fo; 156 | int num_seqs = seqs.size(); 157 | dists = new3D(8, num_seqs, num_seqs, 0); 158 | for (int i = 0; i < seqs.size(); i++) { 159 | for (int j = i + 1; j < seqs.size(); j++) { 160 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 161 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 162 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 163 | dists[3][i][j] = hamming_dist2D(omh_sketch[i], omh_sketch[j]); 164 | dists[4][i][j] = l2_sq_dist(ten_sketch[i], ten_sketch[j]); 165 | dists[5][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 166 | dists[6][i][j] = l2_sq_dist(ten_new_sketch[i], ten_new_sketch[j]); 167 | dists[7][i][j] = l1_dist2D_minlen(ten_new_slide_sketch[i], ten_new_slide_sketch[j]); 168 | // dists[6][i][j] = cosine_sim(ten_new_sketch[i], ten_new_sketch[j]); 169 | // dists[6][i][j] = l1_dist(ten_new_sketch[i], ten_new_sketch[j]); 170 | } 171 | } 172 | std::vector method_names 173 | = { "ED", "MH", "WMH", "OMH", "TenSketch", "TenSlide", "Ten2", "Ten2Slide" }; 174 | for (int m = 0; m < 8; m++) { 175 | fo.open(out_path + "/dists_" + 
method_names[m] + ".txt"); 176 | fo << test_id << "\n"; 177 | for (int i = 0; i < num_seqs; i++) { 178 | for (int j = i + 1; j < num_seqs; j++) { 179 | fo << i << ", " << j << ", " << dists[m][i][j] << "\n"; 180 | } 181 | } 182 | fo.close(); 183 | } 184 | } 185 | 186 | void save_output() { 187 | std::ofstream fo; 188 | // fo.open("output.txt"); 189 | fo.open(out_path + "/matlab_output.txt"); 190 | for (int i = 0; i < seqs.size(); i++) { 191 | for (int j = i + 1; j < seqs.size(); j++) { 192 | fo << dists[0][i][j] << ", " << dists[1][i][j] << ", " << dists[2][i][j] << ", " 193 | << dists[3][i][j] << ", " << dists[4][i][j] << ", " << dists[5][i][j] << ", " 194 | << dists[6][i][j] << ", " << dists[7][i][j] << "\n"; 195 | } 196 | } 197 | fo.close(); 198 | } 199 | }; 200 | 201 | int main(int argc, char *argv[]) { 202 | TestModule1 experiment; 203 | experiment.parse(argc, argv); 204 | experiment.generate_sequences(); 205 | experiment.compute_sketches(); 206 | experiment.compute_dists(); 207 | experiment.save_output(); 208 | } 209 | -------------------------------------------------------------------------------- /legacy/dtw.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Amir Joudaki on 6/16/20. 3 | // 4 | 5 | #ifndef SEQUENCE_SKETCHING_DTW_H 6 | #define SEQUENCE_SKETCHING_DTW_H 7 | 8 | #include 9 | // TODO test skeching with l1 over vectors 10 | 11 | template 12 | void dtw(std::vector &a, const std::vector &b) { 13 | int n = a.size(); 14 | int m = b.size(); 15 | std::vector> DTW(n + 1, std::vector(m, 0)); 16 | 17 | for (int i = 1; i <= n; i++) { 18 | for (int j = 1; j <= m; j++) { 19 | T cost = std::abs(a[i] - b[j]); 20 | DTW[i][j] = cost + std::min({ DTW[i - 1][j], DTW[i][j - 1], DTW[i - 1][j - 1] }); 21 | } 22 | } 23 | return DTW[n][m]; 24 | } 25 | 26 | #endif // SEQUENCE_SKETCHING_DTW_H 27 | -------------------------------------------------------------------------------- /legacy/long_seqs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | // tensor_slide_params.alphabet_size = original_alphabet_size; 22 | // tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct TestModule1 { 27 | Vec2D seqs; 28 | Vec2D kmer_seqs; 29 | Vec2D wmh_sketch; 30 | Vec2D mh_sketch; 31 | Vec3D omh_sketch; 32 | Vec2D ten_sketch; 33 | Vec3D slide_sketch; 34 | Vec3D dists; 35 | 36 | BasicModules basicModules; 37 | KmerModule kmerModules; 38 | 39 | void parse(int argc, char **argv) { 40 | basicModules.parse(argc, argv); 41 | basicModules.models_init(); 42 | kmerModules.parse(argc, argv); 43 | kmerModules.models_init(); 44 | } 45 | 46 | void generate_sequences() { basicModules.seq_gen.gen_seqs(seqs); } 47 | 48 | void compute_sketches() { 49 | int num_seqs = seqs.size(); 50 | kmer_seqs.resize(num_seqs); 51 | slide_sketch.resize(num_seqs); 52 | for (int si = 0; si < num_seqs; si++) { 53 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 54 | tensor_slide_sketch(kmer_seqs[si], slide_sketch[si], 
kmerModules.tensor_slide_params); 55 | } 56 | } 57 | void compute_dists() { 58 | int num_seqs = seqs.size(); 59 | dists = new3D(2, num_seqs, num_seqs, 0); 60 | for (int i = 0; i < seqs.size(); i++) { 61 | for (int j = i + 1; j < seqs.size(); j++) { 62 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 63 | dists[1][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 64 | } 65 | } 66 | } 67 | 68 | void save_output() { 69 | std::ofstream fo; 70 | fo.open("long_seq_output.txt"); 71 | for (int i = 0; i < seqs.size(); i++) { 72 | for (int j = i + 1; j < seqs.size(); j++) { 73 | fo << dists[0][i][j] << ", " << dists[1][i][j] << "\n"; 74 | } 75 | } 76 | fo.close(); 77 | } 78 | }; 79 | 80 | int main(int argc, char *argv[]) { 81 | TestModule1 experiment; 82 | experiment.parse(argc, argv); 83 | experiment.generate_sequences(); 84 | experiment.compute_sketches(); 85 | experiment.compute_dists(); 86 | experiment.save_output(); 87 | } 88 | -------------------------------------------------------------------------------- /legacy/tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "util/multivec.hpp" 9 | 10 | namespace ts { // ts = Tensor Sketch 11 | 12 | 13 | /** 14 | * Computes tensor sketches for a given sequence as described in 15 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 16 | * @tparam set_type the type of the characters in sketched sequences 17 | * @tparam sketch_type the type of elements in the sketch 18 | */ 19 | template 20 | class Tensor { 21 | public: 22 | /** 23 | * @param alphabet_size the size of the alphabet over which the sequences to be sketched are 24 | * defined 25 | * @param sketch_count number of different sketches to compute 26 | * @param embedded_dim the dimension of the embedded (sketched) space, denoted by D in the paper 27 | * @param num_bins number of bins for discretization of the sketches. 
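 *        A value of 0 disables discretization and the raw (un-binned) value is stored directly.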
28 | * @param tup_len the length of the subsequences considered for sketching, denoted by t in the 29 | * paper 30 | */ 31 | Tensor(set_type alphabet_size, 32 | size_t sketch_count, 33 | size_t embedded_dim, 34 | size_t num_bins, 35 | size_t tup_len) 36 | : sketch_count(sketch_count), 37 | alphabet_size(alphabet_size), 38 | embedded_dim(embedded_dim), 39 | num_bins(num_bins), 40 | tup_len(tup_len) { 41 | rand_init(); 42 | } 43 | 44 | std::vector compute(const std::vector &sequence) { 45 | Timer::start("tensor_sketch"); 46 | 47 | std::vector sketch(sketch_count, 0); 48 | for (size_t m = 0; m < sketch_count; m++) { 49 | auto cnt = new2D(tup_len + 1, embedded_dim, sketch_type(0)); 50 | cnt[0][0] = 1; // base case 51 | for (size_t i = 0; i < sequence.size(); i++) { 52 | for (int32_t t = (int32_t)tup_len - 1; t >= 0; t--) { 53 | auto pi = hashes[m][t][sequence[i]]; 54 | for (size_t p = 0; p < embedded_dim; p++) { 55 | auto shift = (p + pi) % embedded_dim; 56 | cnt[t + 1][shift] += cnt[t][p]; 57 | } 58 | } 59 | } 60 | const auto &top_cnt = cnt[tup_len]; // this is T^p 61 | auto prod = std::inner_product(s[m].begin(), s[m].end(), top_cnt.begin(), 0.0); 62 | prod /= l1(top_cnt); // this is the total no of sequences 63 | if (num_bins == 0) { 64 | sketch[m] = prod; 65 | } else { 66 | sketch_type bin = std::upper_bound(bins.begin(), bins.begin() + num_bins, prod) 67 | - bins.begin(); 68 | sketch[m] = bin; 69 | } 70 | } 71 | Timer::stop(); 72 | 73 | return sketch; 74 | } 75 | 76 | protected: 77 | void rand_init() { 78 | std::random_device rd; 79 | std::mt19937 gen(rd()); 80 | std::uniform_int_distribution unif_hash(0, embedded_dim - 1); 81 | 82 | hashes = new3D(sketch_count, tup_len, alphabet_size); 83 | s = new2D(sketch_count, embedded_dim); 84 | for (size_t m = 0; m < sketch_count; m++) { 85 | for (size_t t = 0; t < tup_len; t++) { 86 | for (size_t c = 0; c < alphabet_size; c++) { 87 | hashes[m][t][c] = unif_hash(gen); 88 | } 89 | } 90 | for (size_t p = 0; p < embedded_dim; p++) { 91 | s[m][p] = (p % 2 == 0) ? 1 : -1; // use oddity of p to assign (-1) or (1) 92 | } 93 | } 94 | bins = std::vector(num_bins); 95 | for (size_t b = 0; b < num_bins; b++) { 96 | bins[b] = std::tan(M_PI * ((b + .5) / num_bins - .5)); 97 | } 98 | bins.push_back(std::numeric_limits::max()); 99 | bins.insert(bins.begin(), std::numeric_limits::lowest()); 100 | } 101 | 102 | protected: 103 | size_t sketch_count; 104 | set_type alphabet_size; 105 | size_t embedded_dim; 106 | size_t num_bins; 107 | size_t tup_len; 108 | 109 | /** 110 | * Denotes the hash functions h1,....hD:A->{1....D}. 111 | */ 112 | Vec3D hashes; 113 | /** 114 | * Sign function, corresponds to s1,s2,...st:A->{-1,1} in the paper. The first index denotes the 115 | * sketch count, the second index the embedded dimension 116 | */ 117 | //TODO: figure out why second index is not the tuple and why there is no 3rd index. 118 | Vec2D s; 119 | 120 | /** Bins the possible values of a sketch into #num_bins integer values */ 121 | std::vector bins; 122 | }; 123 | 124 | } // namespace ts 125 | -------------------------------------------------------------------------------- /legacy/tensor_slide.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tensor.hpp" 4 | 5 | namespace ts { // ts = Tensor Sketch 6 | 7 | /** 8 | * Computes tensor slide sketches for a given sequence. 9 | * @tparam sketch_type the type of elements in the sequences to be sketched. 
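 * A window of win_len characters slides over the input with the given stride; compute() appends
 * one value per window position to each of the sketch_count rows of the output.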
10 | */ 11 | template 12 | class TensorSlideOld : public Tensor { 13 | public: 14 | /** 15 | * @param set_size the number of elements in S, 16 | * @param sketch_dim the number of components (elements) in the sketch vector. 17 | */ 18 | TensorSlideOld(sketch_type set_size, 19 | size_t sketch_dim, 20 | size_t num_phases, 21 | size_t num_bins, 22 | size_t tup_len, 23 | size_t win_len, 24 | size_t stride, 25 | size_t offset) 26 | : Tensor(set_size, sketch_dim, num_phases, num_bins, tup_len), 27 | win_len(win_len), 28 | stride(stride), 29 | offset(offset) { 30 | this->rand_init(); 31 | } 32 | 33 | void compute(const Seq &seq, Vec2D &sketch) { 34 | Timer::start("tensor_slide_sketch"); 35 | sketch = Vec2D(this->sketch_count, std::vector()); 36 | for (size_t m = 0; m < this->sketch_count; m++) { 37 | auto cnt = new3D(this->tup_len, this->tup_len, this->embedded_dim, 0); 38 | for (size_t i = 0; i < seq.size(); i++) { 39 | if (i >= win_len) { 40 | size_t j = i - win_len; 41 | for (size_t t = 0; t < this->tup_len; t++) { 42 | auto pj = this->hashes[m][t][seq[j]]; 43 | cnt[t][t][pj]--; 44 | for (int t2 = t - 1; t2 >= 0; t2--) { 45 | auto pj = this->hashes[m][t2][seq[j]]; 46 | for (size_t p = 0; p < this->embedded_dim; p++) { 47 | auto shift = (p + pj) % this->embedded_dim; 48 | cnt[t2][t][shift] -= cnt[t2 + 1][t][p]; 49 | } 50 | } 51 | } 52 | } 53 | 54 | for (size_t t = 0; t < this->tup_len; t++) { 55 | for (size_t t2 = this->tup_len - 1; t2 > t; t2--) { 56 | auto pi = this->hashes[m][t2][seq[i]]; 57 | for (size_t p = 0; p < this->embedded_dim; p++) { 58 | auto shift = (p + pi) % this->embedded_dim; 59 | cnt[t][t2][shift] += cnt[t][t2 - 1][p]; 60 | } 61 | } 62 | auto pi = this->hashes[m][t][seq[i]]; 63 | cnt[t][t][pi]++; 64 | } 65 | if (sketch_now(i, seq.size(), stride, offset)) { 66 | const auto &top_cnt = cnt[0][this->tup_len - 1]; 67 | auto prod = std::inner_product(this->s[m].begin(), this->s[m].end(), 68 | top_cnt.begin(), (double)0); 69 | prod = prod / l1(top_cnt); 70 | // int exp; 71 | // frexp(prod, &exp); 72 | // embedding[m].push_back(exp * sgn(prod)); 73 | sketch_type bin = std::upper_bound(this->bins.begin(), 74 | this->bins.begin() + this->num_bins, prod) 75 | - this->bins.begin(); 76 | sketch[m].push_back(bin); 77 | } 78 | } 79 | } 80 | Timer::stop(); 81 | } 82 | 83 | private: 84 | size_t win_len; 85 | size_t stride; 86 | size_t offset; 87 | }; 88 | 89 | } // namespace ts 90 | -------------------------------------------------------------------------------- /legacy/tensor_slide2.hpp: -------------------------------------------------------------------------------- 1 | //// 2 | //// Created by Amir Joudaki on 6/19/20. 
3 | //// 4 | // 5 | //#ifndef SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 6 | //#define SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 7 | // 8 | //#include "sketch/tensor_slide.hpp" 9 | // 10 | // namespace ts { // ts = Tensor Sketch 11 | // 12 | // 13 | // template 14 | // void tensor_sketch_slide2(const std::vector> &seq2D, Vec2D &embedding, const 15 | // TensorSlideParams ¶ms) { 16 | // assert(seq2D.size() == params.embed_dim); 17 | // embedding = Vec2D(params.embed_dim, std::vector()); 18 | // for (int m = 0; m < params.embed_dim; m++) { 19 | // const auto &seq = seq2D[m]; 20 | // auto cnt = new3D(params.tup_len, params.tup_len, params.num_phases, 0); 21 | // for (int i = 0; i < seq.size(); i++) { 22 | // int j = i - params.win_len; 23 | // if (j >= 0) { 24 | // for (int t = 0; t < params.tup_len; t++) { 25 | // auto pj = params.iphases[m][t][seq[j]]; 26 | // cnt[t][t][pj]--; 27 | // for (int t2 = t - 1; t2 >= 0; t2--) { 28 | // auto pj = params.iphases[m][t2][seq[j]]; 29 | // for (int p = 0; p < params.num_phases; p++) { 30 | // auto shift = (p + pj) % params.num_phases; 31 | // cnt[t2][t][shift] -= cnt[t2 + 1][t][p]; 32 | // } 33 | // } 34 | // } 35 | // } 36 | // 37 | // for (int t = 0; t < params.tup_len; t++) { 38 | // for (int t2 = params.tup_len - 1; t2 > t; t2--) { 39 | // auto pi = params.iphases[m][t2][seq[i]]; 40 | // for (int p = 0; p < params.num_phases; p++) { 41 | // auto shift = (p + pi) % params.num_phases; 42 | // cnt[t][t2][shift] += cnt[t][t2 - 1][p]; 43 | // } 44 | // } 45 | // auto pi = params.iphases[m][t][seq[i]]; 46 | // cnt[t][t][pi]++; 47 | // } 48 | // const auto &top_cnt = cnt[0][params.tup_len - 1]; 49 | // auto prod = std::inner_product(params.icdf[m].begin(), params.icdf[m].end(), 50 | // top_cnt.begin(), (double) 0); auto norm = l1(top_cnt); prod = prod / norm; 51 | // embed_type bin = std::upper_bound(params.bins.begin(), params.bins.begin() + 52 | // params.num_bins, prod) - params.bins.begin(); if ((i + 1) % params.stride == 0 or 53 | // i == (seq.size() - 1)) { 54 | // if (norm != 0) 55 | // embedding[m].push_back(bin); 56 | // else 57 | // embedding[m].push_back(params.num_bins / 2); 58 | // } 59 | // } 60 | // } 61 | // } 62 | // 63 | //} 64 | // 65 | //#endif//SEQUENCE_SKETCHING_TENSOR_SLIDE2_H 66 | -------------------------------------------------------------------------------- /legacy/test_tensor_disc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | #include "util/modules.hpp" 6 | #include "util/seqgen.hpp" 7 | #include "util/utils.hpp" 8 | 9 | using namespace ts; 10 | using namespace BasicTypes; 11 | 12 | struct KmerModule : public BasicModules { 13 | int original_alphabet_size {}; 14 | 15 | void override_pre() override { 16 | original_alphabet_size = alphabet_size; 17 | alphabet_size = int_pow(alphabet_size, kmer_size); 18 | } 19 | 20 | void override_post() override { 21 | tensor_slide_params.alphabet_size = original_alphabet_size; 22 | tensor_slide_params.tup_len = 2; 23 | } 24 | }; 25 | 26 | struct DiscModules : public BasicModules { 27 | std::vector dims = { 16, 64, 256 }; 28 | std::vector win2stride = { 2, 2, 2 }; 29 | std::vector tup_lens = { 4, 4, 4 }; 30 | std::vector num_phases = { 5, 5, 5 }; 31 | std::vector strid2dim = { 2, 2, 2 }; 32 | 33 | void override_post() override {} 34 | 35 | TensorSlideParams layer0() { 36 | TensorSlideParams tensorParams; 37 | init_tensor_slide_params(tensorParams); 38 | tensorParams.tup_len = tup_lens[0]; 39 | tensorParams.win_len = dims[0]; 40 | 
tensorParams.stride = dims[0] / strid2dim[0]; 41 | tensorParams.embed_dim = dims[0]; 42 | tensorParams.num_phases = num_phases[0]; 43 | return tensorParams; 44 | } 45 | 46 | std::vector layers(int l) { 47 | assert(0 < l and l <= 2); 48 | std::vector param_vec; 49 | for (int i = 0; i < dims[l - 1]; i++) { 50 | auto params = layer0(); 51 | params.tup_len = tup_lens[l]; 52 | params.alphabet_size = num_phases[l]; 53 | params.num_phases = num_phases[l]; 54 | params.embed_dim = dims[l] / dims[l - 1]; 55 | params.stride = (dims[l] / dims[l - 1]); 56 | params.win_len = params.stride * win2stride[l]; 57 | param_vec.push_back(params); 58 | } 59 | return param_vec; 60 | } 61 | }; 62 | 63 | struct TestModule1 { 64 | Vec2D seqs; 65 | Vec2D kmer_seqs; 66 | Vec2D wmh_sketch; 67 | Vec2D mh_sketch; 68 | Vec3D omh_sketch; 69 | Vec2D ten_sketch; 70 | Vec3D ten_disc_sketch; 71 | Vec4D slide_disc_sketch1; 72 | Vec4D slide_disc_sketch2; 73 | Vec4D slide_disc_sketch3; 74 | Vec2D slide_disc_flat; 75 | Vec3D slide_sketch; 76 | Vec3D dists; 77 | 78 | BasicModules basicModules; 79 | KmerModule kmerModules; 80 | DiscModules discModules; 81 | 82 | void parse(int argc, char **argv) { 83 | basicModules.parse(argc, argv); 84 | basicModules.models_init(); 85 | kmerModules.parse(argc, argv); 86 | kmerModules.models_init(); 87 | discModules.parse(argc, argv); 88 | discModules.models_init(); 89 | } 90 | 91 | void generate_sequences() { basicModules.seq_gen.gen_seqs(seqs); } 92 | 93 | void compute_sketches() { 94 | int num_seqs = seqs.size(); 95 | kmer_seqs.resize(num_seqs); 96 | wmh_sketch.resize(num_seqs); 97 | mh_sketch.resize(num_seqs); 98 | omh_sketch.resize(num_seqs); 99 | ten_sketch.resize(num_seqs); 100 | ten_disc_sketch.resize(num_seqs); 101 | slide_disc_sketch1.resize(num_seqs); 102 | slide_disc_sketch2.resize(num_seqs); 103 | slide_disc_sketch3.resize(num_seqs); 104 | slide_disc_flat.resize(num_seqs); 105 | slide_sketch.resize(num_seqs); 106 | auto lay0 = discModules.layer0(); 107 | lay0.rand_init(); 108 | auto lay1 = discModules.layers(1); 109 | for (auto &l : lay1) 110 | l.rand_init(); 111 | auto lay2 = discModules.layers(2); 112 | for (auto &l : lay2) 113 | l.rand_init(); 114 | for (int si = 0; si < num_seqs; si++) { 115 | seq2kmer(seqs[si], kmer_seqs[si], basicModules.kmer_size, basicModules.alphabet_size); 116 | minhash(kmer_seqs[si], mh_sketch[si], kmerModules.mh_params); 117 | weighted_minhash(kmer_seqs[si], wmh_sketch[si], kmerModules.wmh_params); 118 | ordered_minhash(kmer_seqs[si], omh_sketch[si], kmerModules.omh_params); 119 | // tensor_sketch(seqs[si], ten_sketch[si], longseqModule.tensor_params); 120 | // tensor_disc_sketch(seqs[si], ten_disc_sketch[si], 121 | // discModules.tensor_params); Vec3D in, out; 122 | slide_disc_sketch1[si] = tensor_disc_slide(seqs[si], lay0); 123 | slide_disc_sketch2[si] 124 | = tensor_disc_slide3(slide_disc_sketch1[si], lay1); 125 | slide_disc_sketch3[si] 126 | = tensor_disc_slide3(slide_disc_sketch2[si], lay2); 127 | slide_disc_flat[si] = squeeze_tensor(slide_disc_sketch3[si]); 128 | tensor_slide_sketch(seqs[si], slide_sketch[si], kmerModules.tensor_slide_params); 129 | } 130 | } 131 | void compute_dists() { 132 | int num_seqs = seqs.size(); 133 | dists = new3D(7, num_seqs, num_seqs, 0); 134 | for (int i = 0; i < seqs.size(); i++) { 135 | for (int j = i + 1; j < seqs.size(); j++) { 136 | dists[0][i][j] = edit_distance(seqs[i], seqs[j]); 137 | dists[1][i][j] = hamming_dist(mh_sketch[i], mh_sketch[j]); 138 | dists[2][i][j] = hamming_dist(wmh_sketch[i], wmh_sketch[j]); 139 | 
dists[3][i][j] = hamming_dist2D(omh_sketch[i], omh_sketch[j]); 140 | dists[4][i][j] = l1_dist(ten_sketch[i], ten_sketch[j]); 141 | dists[5][i][j] = l1_dist(slide_disc_flat[i], slide_disc_flat[j]); 142 | dists[6][i][j] = l1_dist2D_minlen(slide_sketch[i], slide_sketch[j]); 143 | } 144 | } 145 | } 146 | 147 | void save_output() { 148 | std::ofstream fo; 149 | fo.open("output.txt"); 150 | for (int i = 0; i < seqs.size(); i++) { 151 | for (int j = i + 1; j < seqs.size(); j++) { 152 | fo << dists[0][i][j] << ", " << dists[1][i][j] << ", " << dists[2][i][j] << ", " 153 | << dists[3][i][j] << ", " << dists[4][i][j] << ", " << dists[5][i][j] << ", " 154 | << dists[6][i][j] << "\n"; 155 | } 156 | } 157 | fo.close(); 158 | } 159 | }; 160 | 161 | int main(int argc, char *argv[]) { 162 | TestModule1 experiment; 163 | experiment.parse(argc, argv); 164 | experiment.generate_sequences(); 165 | experiment.compute_sketches(); 166 | experiment.compute_dists(); 167 | experiment.save_output(); 168 | } 169 | -------------------------------------------------------------------------------- /legacy/test_typeinfo.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Amir Joudaki on 6/18/20. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | // using std::literals; 10 | 11 | 12 | struct Base {}; // non-polymorphic 13 | struct Derived : Base {}; 14 | 15 | struct Base2 { 16 | virtual void foo() {} 17 | }; // polymorphic 18 | struct Derived2 : Base2 {}; 19 | 20 | 21 | // requires std::is_integral::value 22 | template 23 | requires std::is_integral_v struct foo { 24 | T val; 25 | }; 26 | 27 | int main() { 28 | foo a = { 11.5 }; 29 | std::cout << (a.val) << "\n"; 30 | int myint = 50; 31 | std::string mystr = "string"; 32 | double *mydoubleptr = nullptr; 33 | 34 | std::cout << "myint has type: " << typeid(myint).name() << '\n' 35 | << "mystr has type: " << typeid(mystr).name() << '\n' 36 | << "mydoubleptr has type: " << typeid(mydoubleptr).name() << '\n'; 37 | 38 | // std::cout << myint is a glvalue expression of polymorphic type; it is evaluated 39 | const std::type_info &r1 = typeid(std::cout << myint); 40 | std::cout << '\n' << "std::cout< 6 | #include 7 | #include 8 | #include 9 | 10 | #include "vectool.hpp" 11 | 12 | using namespace ts; 13 | 14 | int main() { 15 | using std::cout; 16 | using std::vector; 17 | 18 | // we want dims[0]xdim[1] matrix interface 19 | vector dims = { 3, 2 }, data(6, 0); 20 | MultiView mat(dims, data); 21 | for (int i = 0; i < mat.size(1); i++) { 22 | for (int j = 0; j < mat.size(0); j++) { 23 | mat[i][j] = i * 10 + j; // elements can be modified with op[][] syntax 24 | } 25 | } 26 | cout << "mat = \n" << mat << "\n"; // ostream&<< is overloaded 27 | mat[0][1]++; // scalar elements can be modified/accessed 28 | mat = 2; // assign scalar init_tensor_slide_params all matrix 29 | mat += 5; // add 5 init_tensor_slide_params all elements (sam for -= and *=) 30 | mat[0] *= 2; // multiply the first row mat[0][:] by two 31 | 32 | // MultiVec has an internal storage 33 | MultiVec mv({ 3, 2 }, 10), mv2({ 3, 2 }, 1); 34 | mv[0] += mv2[1]; // partial assignment 35 | cout << "mv2 = \n" << mv2 << "\n"; 36 | 37 | std::default_random_engine eng; 38 | std::uniform_int_distribution unif(0, 10); 39 | Multistd::vector T({ 3, 2, 2 }, 0); // tensor with types for index and value 40 | for (auto it = T.begin(); it != T.end(); it++) { // an iterator over all elements 41 | *it = unif(eng); 42 | } 43 | cout << "Tensor = \n" << T << "\n"; 44 | /* 
45 | the output will be 46 | 47 | mat = 48 | (d1=0) 0 1 2 49 | (d1=1) 10 11 12 50 | 51 | mv2 = 52 | (d1=0) 1 1 1 53 | (d1=1) 1 1 1 54 | 55 | Tensor = 56 | (d2=0) 57 | (d1=0) 1 5 0 58 | (d1=1) 2 0 8 59 | 60 | (d2=1) 61 | (d1=0) 2 2 10 62 | (d1=1) 8 2 8 63 | 64 | */ 65 | } 66 | -------------------------------------------------------------------------------- /phylogeny/upgma.cpp: -------------------------------------------------------------------------------- 1 | #include "upgma.hpp" 2 | 3 | #include 4 | #include 5 | 6 | namespace ts { 7 | 8 | Tree upgma(const std::vector> &dist_mat) { 9 | if (dist_mat.empty()) { 10 | return {}; 11 | } 12 | 13 | Tree result(2 * dist_mat.size() - 1); 14 | // {nodeId, nodeCount} pairs of all the cluster roots 15 | std::unordered_map roots; 16 | std::unordered_map> D; 17 | for (uint32_t i = 0; i < dist_mat.size(); ++i) { 18 | roots.insert({ i, 1 }); 19 | result[i] = { 0, NO_CHILD, NO_CHILD }; 20 | for (uint32_t j = 0; j < dist_mat.size(); ++j) { 21 | D[i][j] = dist_mat[i][j]; 22 | } 23 | } 24 | for (uint32_t step = 0; step < dist_mat.size() - 1; ++step) { 25 | double minDist = std::numeric_limits::max(); 26 | uint32_t min_i, min_j; 27 | for (const auto &root1 : roots) { 28 | for (const auto &root2 : roots) { 29 | if (root1.first == root2.first) { 30 | continue; 31 | } 32 | double currentDist = D[root1.first][root2.first]; 33 | if (currentDist < minDist) { 34 | minDist = currentDist; 35 | min_i = root1.first; 36 | min_j = root2.first; 37 | } 38 | } 39 | } 40 | uint32_t new_node = dist_mat.size() + step; 41 | 42 | result[new_node] = { minDist / 2., min_i, min_j }; 43 | // update D 44 | for (const auto &root : roots) { 45 | D[new_node][root.first] 46 | = (D[min_i][root.first] * roots[min_i] + D[min_j][root.first] * roots[min_j]) 47 | / (roots[min_i] + roots[min_j]); 48 | D[root.first][new_node] = D[new_node][root.first]; 49 | } 50 | D.erase(min_i); 51 | D.erase(min_j); 52 | for (auto &row : D) { 53 | row.second.erase(min_i); 54 | row.second.erase(min_j); 55 | } 56 | roots[new_node] = roots[min_i] + roots[min_j]; 57 | roots.erase(min_i); 58 | roots.erase(min_j); 59 | } 60 | return result; 61 | } 62 | 63 | } // namespace ts 64 | -------------------------------------------------------------------------------- /phylogeny/upgma.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace ts { 9 | 10 | /** 11 | * Node in the phylogeny node. 12 | */ 13 | struct Node { 14 | double age; 15 | uint32_t left, right; 16 | }; 17 | 18 | constexpr uint32_t NO_CHILD = std::numeric_limits::max(); 19 | 20 | /** 21 | * To avoid dynamic memory allocation, the tree is represented as a map from node id to the actual 22 | * Node. The root is at index size()-1 23 | */ 24 | using Tree = std::vector; 25 | 26 | /** 27 | * Runs UPGMA (Unweighted Pair Group Method with Arithmetic Mean Algorithm) on the given distance 28 | * matrix and returns the reconstructed phylogeny graph as a (parent->children) map. 
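 * For a distance matrix over n sequences the returned Tree holds 2*n-1 nodes: indices 0..n-1
 * are the input leaves (age 0, no children), every merge step appends one internal node, and
 * the root ends up at the last index, result.size()-1.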
29 | */ 30 | Tree upgma(const std::vector> &dist_mat); 31 | 32 | } // namespace ts 33 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | # Line length 100 3 | line-length = 100 4 | # Keep ' instead of converting to " 5 | skip-string-normalization = true 6 | -------------------------------------------------------------------------------- /python/init_numba_env.sh: -------------------------------------------------------------------------------- 1 | # For colour output; also supports jupyter_nb. 2 | export NUMBA_COLOR_SCHEME=dark_bg 3 | 4 | # Level 2 is a bit faster for compilation itself. 5 | # Level 3 is faster??? for running CUDA kernels. 6 | export NUMBA_OPT=2 7 | 8 | # Disable jit entirely for debugging purposes. 9 | #export NUMBA_DISABLE_JIT=1 10 | 11 | # Options for caching 12 | #export NUMBA_DEBUG_CACHE=1 13 | -------------------------------------------------------------------------------- /python/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ratschlab/Project2020-seq-tensor-sketching/20b19ddd19751840d33af97abe314d29b34dc0d4/python/lib/__init__.py -------------------------------------------------------------------------------- /python/lib/base.py: -------------------------------------------------------------------------------- 1 | # Contains sketch base classes and helper methods 2 | 3 | import random 4 | 5 | import numpy as np 6 | import numba as nb 7 | from numba import njit 8 | from numba.experimental import jitclass 9 | from numba.typed import List 10 | 11 | from lib.sequence import * 12 | 13 | # A SketchedSequence contains a sequence and its sketch. 14 | # The sketch must be a 1D array of float32s. 15 | @jitclass([('seq', Sequence_type), ('sketch', nb.float32[::1])]) 16 | class SketchedSequence: 17 | def __init__(self, seq: Sequence, sketch): 18 | self.seq = seq 19 | self.sketch = sketch 20 | 21 | 22 | SketchedSequence_type = SketchedSequence.class_type.instance_type 23 | 24 | # Compute the Euclidean distance between two sketched sequences. 25 | @njit 26 | def dist(ss1: np.ndarray, ss2: np.ndarray) -> np.float32: 27 | return np.linalg.norm(ss1.sketch - ss2.sketch) 28 | 29 | 30 | # Return a sorted list of (dist, seq1, seq2). 31 | @njit 32 | def pairwise_dists( 33 | seqs: list[SketchedSequence], 34 | ) -> list[tuple[np.float32, SketchedSequence, SketchedSequence]]: 35 | d = [] 36 | for j in range(len(seqs)): 37 | for i in range(j): 38 | d.append((dist(seqs[i], seqs[j]), seqs[i], seqs[j])) 39 | d.sort(key=lambda tup: tup[0]) 40 | return d 41 | 42 | 43 | sketchparams_spec = [ 44 | ('A', nb.int32), 45 | ('t', nb.int32), 46 | ('D', nb.int32), 47 | ('normalize', nb.bool_), 48 | ('L', nb.int32), 49 | ('DL', nb.int32), 50 | ] 51 | 52 | 53 | @jitclass(sketchparams_spec) 54 | class SketchParams: 55 | def __init__(self, A, t, D, normalize=True, L=1): 56 | # Alphabet size 57 | self.A = A 58 | # Tensor Sketch tuple size 59 | self.t = t 60 | # Tensor Sketch embed dimension 61 | self.D = D 62 | # Return frequencies instead of counts 63 | self.normalize = normalize 64 | 65 | # GPU Sketch 66 | # Amount of work per thread, must divide D. 67 | # Spawn t*(D/L) instead of t*D threads when this is > 1. 
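        # Illustrative example (the values are assumptions, not project defaults):
        # with D=96 and L=4, DL=24, so the GPU kernel launches t*24 threads per
        # sequence and each thread updates 4 consecutive sketch indices.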
68 | self.L = L 69 | assert D % L == 0 70 | self.DL = D // L 71 | 72 | 73 | SketchParams_type = SketchParams.class_type.instance_type 74 | 75 | 76 | # NOTE: Sketchers are not always jitted, since e.g. CUDA invocations do not support this. 77 | class Sketcher: 78 | def __init__(self, params: SketchParams): 79 | self.A = params.A 80 | self.t = params.t 81 | self.D = params.D 82 | self.normalize = params.normalize 83 | self.L = params.L 84 | self.DL = params.DL 85 | 86 | # [Optional] sketch a single sequence for all t' <= t. 87 | def _full_sketch(self, seq: Sequence): 88 | pass 89 | 90 | # Sketch a single sequence. 91 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 92 | pass 93 | 94 | # Sketch a list of sequences. 95 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 96 | pass 97 | -------------------------------------------------------------------------------- /python/lib/cds.py: -------------------------------------------------------------------------------- 1 | # Helper functions to parse the .CDS files in the homology dataset. 2 | 3 | import os 4 | import json 5 | import seaborn as sns 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | 9 | from pathlib import Path 10 | from collections import defaultdict 11 | 12 | # Print: min, mean (std), median, max, 5 lowest + 5 largest values with counts, 5 most common values 13 | def _print_stats(name, data): 14 | name = name.upper() 15 | minval = min(*data) 16 | mean = np.mean(data) 17 | std = np.std(data) 18 | maxval = max(*data) 19 | sumval = sum(data) 20 | 21 | counts = defaultdict(int) 22 | for x in data: 23 | counts[x] += 1 24 | 25 | counts = list(counts.items()) 26 | 27 | # Sort by value, get low and high 5. 28 | counts = sorted(counts, key=lambda x: x[0]) 29 | if len(counts) <= 10: 30 | lowhigh = counts 31 | else: 32 | lowhigh = counts[:5] + ['...'] + counts[-5:] 33 | 34 | # Sort by count, get the 5 most frequent values. 35 | counts = sorted(counts, key=lambda x: x[1]) 36 | maxcount = reversed(counts[-5:]) 37 | 38 | print(name) 39 | print(f'{minval: 6} <= {mean: 8.1f}=μ ({std: 8.1f}=σ) <= {maxval: 6}; sum={sumval}') 40 | print('Low/high vals:', *lowhigh) 41 | print('Frequent vals:', *maxcount) 42 | 43 | if len(data) <= 20: 44 | print(*data) 45 | print() 46 | 47 | 48 | def exon_stats(fasta_paths, sequences): 49 | exon_lengths = [] 50 | num_exons = [] 51 | total_exon_lengths = [] 52 | 53 | id_to_exons = dict() 54 | for f in fasta_paths: 55 | data = json.loads(f.with_suffix('.CDS.json').read_text()) 56 | id_to_exons |= data 57 | 58 | print(len(id_to_exons)) 59 | 60 | i = 0 61 | for s in sequences: 62 | exons = id_to_exons[s.metadata['tid']] 63 | total_exon_length = 0 64 | 65 | num_exons.append(len(exons)) 66 | 67 | for exon in exons: 68 | l = exon['end'] - exon['start'] 69 | exon_lengths.append(l) 70 | total_exon_length += l 71 | 72 | _print_stats('Exon length', exon_lengths) 73 | _print_stats('Exons per gene', num_exons) 74 | _print_stats('Total exon length per gene', total_exon_lengths) 75 | 76 | sns.displot(exon_lengths) 77 | plt.show() 78 | -------------------------------------------------------------------------------- /python/lib/sequence.py: -------------------------------------------------------------------------------- 1 | # Classes for Sequence and FastaFile. 
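# Note: only the upper-case bases A/C/G/T are mapped to 0..3 below; every other
# byte, including soft-masked lower-case characters, maps to -1 and is dropped
# by Sequence.remap.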
2 | 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import numba as nb 7 | from numba import njit, types, typed 8 | from numba.experimental import jitclass 9 | 10 | # Map from sequence characters to internal integer representation. 11 | _char_map: dict[str, int] = {'A': 0, 'C': 1, 'G': 2, 'T': 3} 12 | 13 | 14 | # Given the char_map above, returns an array of length 256 mapping bytes to 15 | # internal integers. -1 signals unknown bytes. 16 | def _compute_char_list() -> np.ndarray: 17 | char_list = np.full(256, -1, np.int8) 18 | for k in _char_map: 19 | char_list[ord(k)] = _char_map[k] 20 | return char_list 21 | 22 | 23 | # Map 256 bytes to integers; built from the map above. 24 | # -1 signals unknown bytes. 25 | _char_list: np.ndarray = _compute_char_list() 26 | 27 | 28 | # Class that contains a single sequence. The full_seq member contains the 29 | # original byte-representation of the sequence as read from the Fasta file. The 30 | # seq member contains the processed internal int8 representation. 31 | @jitclass( 32 | [ 33 | ('id', types.unicode_type), 34 | ('metadata', types.DictType(types.unicode_type, types.unicode_type)), 35 | # C-layout 1-dimensional arrays. 36 | ('full_seq', nb.byte[::1]), 37 | ('seq', nb.int8[::1]), 38 | ] 39 | ) 40 | class Sequence: 41 | 42 | # Given an ID of the form key:value|otherkey:othervalue, parse it. 43 | @staticmethod 44 | def id_to_map(id): 45 | data = typed.Dict() 46 | for kv in id.split('|'): 47 | k, v = kv.split(':') 48 | data[k] = v 49 | return data 50 | 51 | # Remap characters by char_map. Removes other (lower case) characters. 52 | @staticmethod 53 | def remap(s): 54 | return np.array([_char_list[c] for c in s if _char_list[c] != -1], dtype=np.int8) 55 | 56 | @staticmethod 57 | def reverse_complement(seq: np.ndarray): 58 | seqr = np.flip(seq) 59 | return np.array([(c ^ 3) for c in seqr], dtype=np.int8) 60 | 61 | def __init__(self, id: str, s: bytes): 62 | # String: header/name/id of this sequence in the Fasta file. 63 | self.id = id 64 | # Metadata encoded in the header. 65 | self.metadata = self.id_to_map(id) 66 | # The original sequence. 67 | self.full_seq = np.array([c for c in s], dtype=nb.byte) 68 | # The sequence with masked repeats (lower case characters) removed, and mapped to integers. 69 | self.seq = self.remap(s) 70 | if 'strand' in self.metadata and self.metadata['strand'] == '-': 71 | self.seq = self.reverse_complement(self.seq) 72 | 73 | def len(self): 74 | return len(self.seq) 75 | 76 | 77 | Sequence_type = Sequence.class_type.instance_type 78 | 79 | 80 | class FastaFile: 81 | def __init__(self, path): 82 | # The Path to the current file. 83 | self.path = Path(path) 84 | # The name of the current file. 85 | self.name = self.path.name 86 | # A list of Sequence objects in this file. 87 | self.seqs = [] 88 | 89 | self.read() 90 | 91 | def read(self): 92 | header = None 93 | seq = [] 94 | 95 | def flush(): 96 | nonlocal header, seq 97 | if header is None: 98 | return 99 | assert seq 100 | sequence = Sequence(header, b''.join(seq)) 101 | self.seqs.append(sequence) 102 | header = None 103 | seq = [] 104 | 105 | # Sequences are read in binary mode; ids are decoded as ascii. 106 | with self.path.open('br') as f: 107 | for line in f: 108 | if line[0] == ord('>'): 109 | flush() 110 | header = line[1:].decode('ascii').strip() 111 | else: 112 | seq.append(line) 113 | flush() 114 | 115 | 116 | # Contains a map from ids to sequences, constructed from a list of FastaFiles. 
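# A minimal usage sketch (the glob pattern and the id below are hypothetical):
#
#   files = [FastaFile(p) for p in Path('data').glob('*.fa')]
#   seqs = SequenceDict(files)
#   s = seqs['tid:ENST00000000001|strand:+']  # None if the id is unknown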
117 | class SequenceDict: 118 | def __init__(self, fastafiles): 119 | self.by_id = dict() 120 | for file in fastafiles: 121 | for seq in file.seqs: 122 | self.by_id[seq.id] = seq 123 | 124 | # Returns None if key not found. 125 | def __getitem__(self, key): 126 | return self.by_id.get(key) 127 | -------------------------------------------------------------------------------- /python/lib/tensor_embedding.py: -------------------------------------------------------------------------------- 1 | # TENSOR EMBEDDING 2 | 3 | from lib.base import * 4 | 5 | # a_1...a_t is mapped to index A^{t-1} a_1 + ... + A * a_{t-1} + 1 * a_t 6 | @jitclass(sketchparams_spec + [('pow', nb.int32[:])]) 7 | class TE(Sketcher): 8 | # https://github.com/numba/numba/issues/1694 9 | __init__Sketcher = Sketcher.__init__ 10 | 11 | def __init__(self, params): 12 | self.__init__Sketcher(params) 13 | 14 | self.pow = np.zeros(self.t + 1, np.int32) 15 | self.pow[0] = 1 16 | for i in range(1, self.t + 1): 17 | self.pow[i] = self.A * self.pow[i - 1] 18 | 19 | # NOTE: The sketch is stored as float64 here so counting won't overflow. 20 | def _empty_tensor(self): 21 | Ts = List() 22 | for l in self.pow: 23 | Ts.append(np.zeros(l, np.float64)) 24 | return Ts 25 | 26 | # Return the sketch for the concatenation of two sequences. 27 | # TODO: Optimize this to modify Tr in place. 28 | def _join(self, Tl, Tr): 29 | Ts = self._empty_tensor() 30 | for tr in range(self.t + 1): 31 | for tl in range(self.t + 1 - tr): 32 | Ts[tl + tr] += np.kron(Tl[tl], Tr[tr]) 33 | return Ts 34 | 35 | # Returns the raw 1D count sketches for all tuple sizes up to t. 36 | # NOTE: This returns counts, not frequencies. 37 | def _full_sketch(self, seq: Sequence): 38 | Ts = self._empty_tensor() 39 | 40 | Ts[0][0] = 1 41 | 42 | # sketch 43 | for c in seq.seq: 44 | assert 0 <= c and c < self.A 45 | for i in range(self.t - 1, -1, -1): 46 | for j in range(len(Ts[i])): 47 | Ts[i + 1][self.A * j + c] += Ts[i][j] 48 | return Ts 49 | 50 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 51 | full_sketch = self._full_sketch(seq) 52 | if self.normalize: 53 | # Normalization factor. 54 | n = seq.len() 55 | nct = nb.float64(1) 56 | for i in range(self.t): 57 | nct = nct * (n - i) / (i + 1) 58 | full_sketch[self.t] /= nct 59 | sketch = np.array([x for x in full_sketch[self.t]], dtype=nb.float32) 60 | return SketchedSequence(seq, sketch) 61 | 62 | # Returns the sketch for the given t as frequencies. 
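    # Illustrative example: with A=4 and t=2 the sketch has 4^2 = 16 entries, and
    # every (not necessarily contiguous) occurrence of the pair (1, 3) in a
    # sequence adds 1 to entry 4*1 + 3 = 7 of the unnormalized counts.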
63 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 64 | return [self.sketch_one(seq) for seq in seqs] 65 | -------------------------------------------------------------------------------- /python/lib/tensor_sketch.py: -------------------------------------------------------------------------------- 1 | # TENSOR SKETCH 2 | 3 | from lib.base import * 4 | 5 | 6 | @jitclass(sketchparams_spec + [('hashes', nb.int32[:, :]), ('signs', nb.float32[:, :])]) 7 | class TS(Sketcher): 8 | __init__Sketcher = Sketcher.__init__ 9 | 10 | def __init__(self, params): 11 | self.__init__Sketcher(params) 12 | 13 | random.seed(31415) 14 | # An A*t array of random integers in [0, D) 15 | self.hashes = np.empty((self.A, self.t), dtype=np.int32) 16 | # An A*t array of random +-1 17 | self.signs = np.empty((self.A, self.t), dtype=np.float32) 18 | for c in range(self.A): 19 | for k in range(self.t): 20 | self.hashes[c][k] = random.randrange(0, self.D) 21 | self.signs[c][k] = random.randrange(-1, 2, 2) 22 | 23 | def _full_sketch(self, seq): 24 | # NOTE: The sketch is stored as float64 here so counting won't overflow. 25 | T = np.zeros((self.t + 1, self.D), dtype=np.float64) 26 | T[0][0] = 1 27 | 28 | for c in seq.seq: 29 | for k in range(self.t - 1, -1, -1): 30 | h = self.hashes[c][k] 31 | s = self.signs[c][k] 32 | for l in range(self.D): 33 | r = l + h if l + h < self.D else l + h - self.D 34 | T[k + 1][l] += s * T[k][r] 35 | 36 | return T 37 | 38 | def _normalize(self, seq, T): 39 | if self.normalize: 40 | # Normalization factor. 41 | n = seq.len() 42 | nct = nb.float64(1) 43 | for i in range(self.t): 44 | nct = nct * (n - i) / (i + 1) 45 | T /= nct 46 | return T 47 | 48 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 49 | full_sketch = self._full_sketch(seq) 50 | 51 | self._normalize(seq, full_sketch[self.t]) 52 | 53 | sketch = np.array([x for x in full_sketch[self.t]], dtype=nb.float32) 54 | return SketchedSequence(seq, sketch) 55 | 56 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 57 | return [self.sketch_one(seq) for seq in seqs] 58 | -------------------------------------------------------------------------------- /python/lib/tensor_sketch_gpu.py: -------------------------------------------------------------------------------- 1 | # GPU TENSOR SKETCH 2 | 3 | from numba import cuda 4 | 5 | from lib.base import * 6 | from lib.tensor_sketch import TS 7 | 8 | # CUDA kernel to sketch a list of sequences. 9 | # A, t, D, L (int32): parameters as usual. 10 | # global_hashes (int32[:, :]): A*t device array of hashes. 11 | # global_signs (float32[:, :]): A*t device array of signs. Note that these are 12 | # floats to avoid additional (slow) int32->float32 conversions. 13 | # seq (int8[:]): concatenation of the sequences to sketch. 14 | # starts (int32[:]): the start positions of the subsequences in seq. 15 | # T: (float32[:, :]): n*D device array for the output, given n input sequences. 16 | @cuda.jit(fastmath=True) 17 | def _gpu_sketch(A, t, D, L, hashes, signs, seq, starts, T): 18 | seqid = cuda.blockIdx.x 19 | start = starts[seqid] 20 | end = starts[seqid + 1] 21 | 22 | l = cuda.threadIdx.x 23 | k = cuda.threadIdx.y 24 | assert k < t 25 | assert l < D // L 26 | 27 | # We use a 2*(t+1)*D tensor consisting of two 'planes'. 28 | # At each step, one plane is the input, and one is the output. Which is indicated by `j` further down. 29 | plane = (t + 1) * D 30 | threads = t * D // L 31 | 32 | # Slice the shared memory into local shared memory arrays. 
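    # The dynamically sized shared buffer (allocated with
    # 4*(threads + 2*(t+1)*D + 2*A*t) bytes at launch) is laid out as:
    #   [two (t+1)*D float32 planes | `threads` int32 chars | A*t float32 signs | A*t int32 hashes]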
33 | # Note the different types per view. 34 | 35 | # NOTE: Tin has a variable offset of k*D to save a bit on further computations. 36 | Tin = cuda.shared.array(shape=0, dtype=nb.float32)[k * D : 2 * plane] 37 | local_seq = cuda.shared.array(shape=0, dtype=nb.int32)[2 * plane : 2 * plane + threads] 38 | 39 | local_signs = cuda.shared.array(shape=0, dtype=nb.float32)[ 40 | 2 * plane + threads : 2 * plane + threads + A * t 41 | ] 42 | local_hashes = cuda.shared.array(shape=0, dtype=nb.int32)[ 43 | 2 * plane + threads + A * t : 2 * plane + threads + 2 * A * t 44 | ] 45 | 46 | # Copy the device memory hashes/signs to shared memory. 47 | if l < A: 48 | local_hashes[l * t + k] = hashes[l][k] 49 | local_signs[l * t + k] = signs[l][k] 50 | 51 | # Initialize the tensors to 0. 52 | for ll in range(l, D, D // L): 53 | Tin[0 * plane + 0 * D + ll] = 0 54 | Tin[0 * plane + (0 + 1) * D + ll] = 0 55 | Tin[1 * plane + 0 * D + ll] = 0 56 | Tin[1 * plane + (0 + 1) * D + ll] = 0 57 | 58 | cuda.syncthreads() 59 | 60 | # Initialize the 0-element of the tensor to 1. 61 | if k == 0: 62 | Tin[0] = 1 63 | Tin[plane] = 1 64 | 65 | cuda.syncthreads() 66 | 67 | # The offset for the plane we're currently reading from. The write offset 68 | # is the other plane: `plane-read_plane`. 69 | read_plane = 0 70 | 71 | # Loop over characters in the sequence. 72 | tid = l + k * D // L 73 | for i in range((end - start) // threads): 74 | # Read `threads` characters from `seq` and store them in `local_seq` in shared memory. 75 | idx = start + i * threads + tid 76 | local_seq[tid] = seq[idx] 77 | cuda.syncthreads() 78 | 79 | # Process the fetched characters. 80 | for c in local_seq: 81 | h = local_hashes[c * t + k] 82 | s = local_signs[c * t + k] 83 | write_plane = plane - read_plane 84 | # Process L consecutive indices (of the D in total). 85 | # 0 <= l < D/L, so this covers all of [0, D). 86 | for ll in range(L * l, L * (l + 1)): 87 | # Compute the shifted target index, avoiding a modulo operation. 88 | r = ll + h 89 | r -= D if r >= D else 0 90 | # Write to output tensor. 91 | Tin[write_plane + D + ll] = Tin[read_plane + D + ll] + s * Tin[read_plane + r] 92 | 93 | # After this thread has processed the current character `c`, swap the active plane and wait for other threads. 94 | read_plane = write_plane 95 | cuda.syncthreads() 96 | 97 | # Process the remaining characters. We don't do synchronous prefetching to 98 | # shared memory here, because this only covers the last few characters of 99 | # the sequence. 100 | # TODO: If sequences are short, it may actually be beneficial to still do this. 101 | for idx in range(start + (end - start) // threads * threads, end): 102 | c = seq[idx] 103 | # Same code as above. 104 | h = local_hashes[c * t + k] 105 | s = local_signs[c * t + k] 106 | write_plane = plane - read_plane 107 | for ll in range(L * l, L * (l + 1)): 108 | r = ll + h 109 | r -= D if r >= D else 0 110 | Tin[write_plane + D + ll] = Tin[read_plane + D + ll] + s * Tin[read_plane + r] 111 | 112 | read_plane = write_plane 113 | cuda.syncthreads() 114 | 115 | # Copy to result. 116 | for ll in range(l, D, D // L): 117 | T[seqid][k][ll] = Tin[read_plane + ll] 118 | T[seqid][k + 1][ll] = Tin[read_plane + D + ll] 119 | 120 | 121 | class GTS(Sketcher): 122 | def __init__(self, params): 123 | super().__init__(params) 124 | 125 | # Use the jitclass TS to copy hashes and signs parameters. 126 | # This is needed, because calling random returns different random 127 | # numbers inside and outside of jitted functions. 
128 | # Ideally we'd inherit from TS, but inheriting from jitted classes is 129 | # not possible. 130 | self.ts = TS(params) 131 | self.hashes = np.array(self.ts.hashes, dtype=np.int32) 132 | self.signs = np.array(self.ts.signs, dtype=np.float32) 133 | 134 | self.d_hashes = cuda.to_device(self.hashes) 135 | self.d_signs = cuda.to_device(self.signs) 136 | 137 | def sketch(self, seqs: list[Sequence]) -> list[SketchedSequence]: 138 | assert isinstance(seqs, List) 139 | assert len(seqs) > 0 140 | assert isinstance(seqs[0], Sequence) 141 | 142 | # TODO: Add normalization to the GPU sketch method. 143 | for seq in seqs: 144 | assert ( 145 | seq.len() ** self.t < 10 ** 38 146 | ), "Counts may overflow! Lower t or shorten the sequence." 147 | 148 | # Sort by decreasing length 149 | seqs = sorted(seqs, key=lambda s: len(s.seq), reverse=True) 150 | 151 | # Put all operations on a stream, so that the python code runs asynchronously of the GPU code. 152 | stream = cuda.stream() 153 | 154 | # Launch one thread block per sequence. 155 | blocks = len(seqs) 156 | 157 | # Convert the input sequences to a single list of characters and the corresponding start indices. 158 | raw_seqs = [seq.seq for seq in seqs] 159 | raw_seq = np.concatenate(raw_seqs) 160 | starts = np.array( 161 | np.cumsum(np.array([0] + [len(seq) for seq in raw_seqs]), dtype=np.int32), 162 | dtype=np.int32, 163 | ) 164 | 165 | # Copy data from host to device. 166 | d_raw_seq = cuda.to_device(raw_seq, stream=stream) 167 | d_starts = cuda.to_device(starts, stream=stream) 168 | d_T = cuda.device_array((blocks, self.t + 1, self.D), dtype=np.float32, stream=stream) 169 | 170 | threads = self.t * self.D // self.L 171 | 172 | # Make sure we have enough threads to initialize self.hashes and 173 | # self.signs by a single synchronous copy. 174 | assert self.DL >= self.A 175 | 176 | # One thread per (l, k) <= (D/L, t) 177 | _gpu_sketch[ 178 | (blocks, 1), 179 | (self.DL, self.t), 180 | stream, 181 | 4 * (threads + 2 * (self.t + 1) * self.D + 2 * self.A * self.t), 182 | ]( 183 | np.int32(self.A), 184 | np.int32(self.t), 185 | np.int32(self.D), 186 | np.int32(self.L), 187 | self.d_hashes, 188 | self.d_signs, 189 | d_raw_seq, 190 | d_starts, 191 | d_T, 192 | ) 193 | 194 | T = d_T.copy_to_host(stream=stream) 195 | 196 | # Only return the length t sketch 197 | sketched_seqs = List() 198 | for seq, sketch in zip(seqs, T): 199 | self.ts._normalize(seq, sketch[self.t]) 200 | sketched_seqs.append(SketchedSequence(seq, sketch[self.t])) 201 | 202 | return sketched_seqs 203 | 204 | def sketch_one(self, seq: Sequence) -> SketchedSequence: 205 | return self.sketch(List([seq]))[0] 206 | -------------------------------------------------------------------------------- /python/lib/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | # Time the duration of the given lambda and print it. 
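# Example (the sketcher call is hypothetical):
#   sketches = timeit(lambda: sketcher.sketch(seqs), 'sketch')
# prints "Duration [sketch]: 0.1234" and returns the value of the lambda.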
4 | def timeit(f, name=''): 5 | s = time.time() 6 | ret = f() 7 | e = time.time() 8 | print(f'Duration [{name}]: {e-s:4.4f}') 9 | return ret 10 | -------------------------------------------------------------------------------- /sequence/alphabets.cpp: -------------------------------------------------------------------------------- 1 | #include "alphabets.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace ts { 8 | 9 | template 10 | class log2 { 11 | static constexpr size_t _log2(size_t x) { 12 | if (x < 2) { 13 | return 0; 14 | } else { 15 | return _log2(x >> 1) + 1; 16 | } 17 | } 18 | 19 | public: 20 | static constexpr size_t value = _log2(n); 21 | }; 22 | 23 | constexpr uint8_t alphabet_size_dna = 5; 24 | constexpr char alphabet_dna[] = "ACGTN"; 25 | constexpr uint8_t bits_per_char_dna = 3; 26 | constexpr uint8_t char2int_tab_dna[128] // A=1,C=2,G=3,T=4,N=0,invalid=5 27 | = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 0=25 28 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 26-50 29 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, // 51-75 30 | 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, // 76-100 31 | 5, 5, 5, 5, 5, 5, 0, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }; // 100-127 32 | 33 | inline uint32_t char2int_dna(uint8_t c) { 34 | return char2int_tab_dna[c]; 35 | } 36 | 37 | constexpr uint8_t alphabet_size_dna4 = 4; 38 | constexpr char alphabet_dna4[] = "ACGTN"; 39 | constexpr uint8_t bits_per_char_dna4 = 2; 40 | constexpr uint8_t char2int_tab_dna4[128] // A=0,C=1,G=2,T=3, invalid=5 41 | = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 0=25 42 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, // 26-50 43 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, // 51-75 44 | 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 2, // 76-100 45 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }; // 100-127 46 | 47 | inline uint32_t char2int_dna4(uint8_t c) { 48 | return char2int_tab_dna4[c]; 49 | } 50 | 51 | 52 | constexpr char alphabet_protein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; 53 | constexpr uint8_t alphabet_size_protein = sizeof(alphabet_protein) - 1; 54 | constexpr uint8_t bits_per_char_protein = log2::value + 1; 55 | constexpr uint8_t char2int_tab_protein[128] 56 | = { 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 57 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 58 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 59 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 60 | 25, 23, 24, 25, 25, 25, 25, 25, 25, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 61 | 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, 25, 25, 25, 25, 25 }; 62 | 63 | inline uint32_t char2int_protein(uint8_t c) { 64 | return char2int_tab_protein[c]; 65 | } 66 | 67 | enum class AlphabetType { DNA4, DNA5, Protein }; 68 | 69 | AlphabetType from_string(std::string str) { 70 | std::transform(str.begin(), str.end(), str.begin(), 71 | [](unsigned char c) { return std::tolower(c); }); 72 | if (str == "dna4") { 73 | return AlphabetType::DNA4; 74 | } else if (str == "dna5") { 75 | return AlphabetType::DNA5; 76 | } else if (str == "protein") { 77 | return AlphabetType::Protein; 78 | } 
else { 79 | throw std::logic_error("Invalid alphabet type"); 80 | } 81 | } 82 | 83 | std::function char2int; 84 | const char *alphabet; 85 | uint8_t alphabet_size; 86 | uint8_t bits_per_char; 87 | 88 | void init_alphabet(const std::string &alphabet_str) { 89 | switch (from_string(alphabet_str)) { 90 | case AlphabetType::DNA5: 91 | char2int = char2int_dna; 92 | alphabet = alphabet_dna; 93 | alphabet_size = alphabet_size_dna; 94 | bits_per_char = bits_per_char_dna; 95 | return; 96 | case AlphabetType::DNA4: 97 | char2int = char2int_dna4; 98 | alphabet = alphabet_dna4; 99 | alphabet_size = alphabet_size_dna4; 100 | bits_per_char = bits_per_char_dna4; 101 | return; 102 | case AlphabetType::Protein: 103 | char2int = char2int_protein; 104 | alphabet = alphabet_protein; 105 | alphabet_size = alphabet_size_protein; 106 | bits_per_char = bits_per_char_protein; 107 | return; 108 | default: 109 | std::cerr << "Invalid alphabet type: " << alphabet_str << std::endl; 110 | std::exit(1); 111 | } 112 | } 113 | 114 | } // namespace ts 115 | -------------------------------------------------------------------------------- /sequence/alphabets.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Defines the alphabets that TensorSketch can operate on 9 | namespace ts { 10 | 11 | extern std::function char2int; 12 | extern const char *alphabet; 13 | extern uint8_t alphabet_size; 14 | extern uint8_t bits_per_char; 15 | 16 | void init_alphabet(const std::string &alphabet_str); 17 | 18 | } // namespace ts 19 | -------------------------------------------------------------------------------- /sequence/fasta_io.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sequence/alphabets.hpp" 4 | #include "util/utils.hpp" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace ts { // ts = Tensor Sketch 14 | 15 | /** 16 | * Represents the contents of a single Fasta file. 17 | * All the sequences in a file should be treated as a single assembly and should be sketched as a 18 | * whole. 19 | */ 20 | template 21 | struct FastaFile { 22 | /** The name of the file. */ 23 | std::string filename; 24 | /** The leading comment before each sequence. Always has the same length as sequences. */ 25 | std::vector comments; 26 | /** The sequences in the file. */ 27 | std::vector> sequences; 28 | }; 29 | 30 | /** 31 | * Reads a fasta file and returns its contents. 32 | * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t 33 | */ 34 | template 35 | FastaFile read_fasta(const std::string &file_name, const std::string &input_format) { 36 | FastaFile f; 37 | 38 | if (!std::filesystem::exists(file_name)) { 39 | std::cerr << "Input file does not exist: " << file_name << std::endl; 40 | std::exit(1); 41 | } 42 | 43 | std::ifstream infile(file_name); 44 | if (!infile.is_open()) { 45 | std::cout << "Could not open " + file_name << std::endl; 46 | std::exit(1); 47 | } 48 | 49 | f.filename = std::filesystem::path(file_name).filename(); 50 | 51 | std::string line; 52 | std::vector seq; 53 | while (std::getline(infile, line)) { 54 | if (line[0] == '>') { 55 | if (!seq.empty()) { 56 | f.sequences.push_back(std::move(seq)); 57 | seq.clear(); 58 | } 59 | // Drop the leading '>'. 
60 | f.comments.emplace_back(line.begin() + 1, line.end()); 61 | } else if (!line.empty()) { 62 | if (input_format == "fasta") { 63 | for (char c : line) { 64 | seq.push_back(char2int(c)); 65 | } 66 | } else if (input_format == "csv") { 67 | std::stringstream ss(line); 68 | std::string item; 69 | while (std::getline(ss, item, ',')) { 70 | seq.push_back(std::stoi(item, 0, 16)); 71 | } 72 | f.comments.push_back("seq" + std::to_string(f.sequences.size())); 73 | f.sequences.push_back(std::move(seq)); 74 | seq.clear(); 75 | } else { 76 | std::cerr << "Invalid input foramt: " << input_format << std::endl; 77 | exit(1); 78 | } 79 | } 80 | } 81 | if (!seq.empty()) { 82 | f.sequences.push_back(std::move(seq)); 83 | seq.clear(); 84 | } 85 | if(f.sequences.size() != f.comments.size()) { 86 | std::cerr << "Invalid fasta file: " << file_name << std::endl; 87 | std::exit(1); 88 | } 89 | return f; 90 | } 91 | 92 | /** 93 | * Reads all .fasta and .fna files in the given directory and returns them. 94 | * @tparam seq_type type used for storing a character of the fasta file, typically uint8_t 95 | */ 96 | template 97 | std::vector> read_directory(const std::string &directory_name) { 98 | if (!std::filesystem::exists(directory_name)) { 99 | std::cerr << "Input directory does not exist: " << directory_name << std::endl; 100 | std::exit(1); 101 | } 102 | std::vector> files; 103 | 104 | // Handle the case where the argument is a single file as well. 105 | if (std::filesystem::is_regular_file(directory_name)) { 106 | files.emplace_back(read_fasta(directory_name, "fasta")); 107 | } else { 108 | for (const auto &f : std::filesystem::directory_iterator(directory_name)) { 109 | const std::filesystem::path ext = f.path().extension(); 110 | if (ext == ".fa" || ext == ".fna" || ext == ".fasta") { 111 | files.emplace_back(read_fasta(f.path(), "fasta")); 112 | } 113 | } 114 | } 115 | 116 | return files; 117 | } 118 | 119 | template 120 | void write_fasta(const std::string &file_name, const Vec2D &sequences, bool Abc = false) { 121 | std::ofstream fo(file_name); 122 | fo << "#" + std::to_string(random()) << std::endl; 123 | fo << "# " << flag_values(' ') << std::endl; 124 | for (uint32_t si = 0; si < sequences.size(); si++) { 125 | fo << ">s" << si << "\n"; 126 | auto &seq = sequences[si]; 127 | for (auto &c : seq) { 128 | if (Abc) { 129 | fo << (char)(c + (int)'A'); 130 | } else { 131 | fo << c << ","; 132 | } 133 | } 134 | fo << "\n\n"; 135 | } 136 | } 137 | 138 | } // namespace ts 139 | -------------------------------------------------------------------------------- /sequence/sequence_generator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | class SeqGen { 15 | public: 16 | SeqGen(uint8_t alphabet_size, 17 | bool fix_len, 18 | uint32_t num_seqs, 19 | uint32_t seq_len, 20 | uint32_t group_size, 21 | double max_mutation_rate, 22 | double min_mutation_rate, 23 | std::string phylogeny_shape) 24 | : alphabet_size(alphabet_size), 25 | fix_len(fix_len), 26 | num_seqs(num_seqs), 27 | seq_len(seq_len), 28 | group_size(group_size), 29 | max_mutation_rate(max_mutation_rate), 30 | min_mutation_rate(min_mutation_rate), 31 | phylogeny_shape(std::move(phylogeny_shape)) { 32 | assert(group_size >= 2 && "group size<=1 leads to completely independent sequences"); 33 | } 34 | 35 | /** 36 | * Generate sequences 
divided into independent groups of size `group_size`, and store 37 | * ingroup_pairs within each group in `ingroup_pairs` 38 | * @tparam T character type 39 | * @tparam C index type 40 | * @param seqs generated sequences 41 | * @param pairs sequence ingroup_pairs within each group 42 | */ 43 | template 44 | Vec2D generate_seqs() { 45 | if (phylogeny_shape == "pair") { // shape=path: implemented as path & group_size=2 46 | phylogeny_shape = "path"; 47 | group_size = 2; 48 | } 49 | Vec2D seqs; 50 | seqs.reserve(num_seqs); 51 | while (seqs.size() < num_seqs) { 52 | Vec2D group; 53 | 54 | // tree-like: g1->g2, add g2 to pool, g1->g3, g2->g4, add g3, g4 to pool 55 | if (phylogeny_shape == "tree") { 56 | group = Vec2D(1); 57 | random_sequence(group[0], seq_len); 58 | Vec2D children; 59 | while (group.size() < group_size) { 60 | for (auto &seq : group) { 61 | std::vector ch; 62 | mutate(seq, ch); 63 | children.push_back(seq); 64 | children.push_back(ch); 65 | } 66 | std::swap(group, children); 67 | children.clear(); 68 | } 69 | } else if (phylogeny_shape == "path") { // path-like: g0->g1->g2->g3->... 70 | group = Vec2D(group_size); 71 | random_sequence(group[0], seq_len); 72 | for (size_t i = 0; i < group_size - 1; i++) { 73 | mutate(group[i], group[i + 1]); 74 | } 75 | } else if (phylogeny_shape == "star") { // star-like: g0->g1, g0->g2,g0->g3 ... 76 | group = Vec2D(1); 77 | random_sequence(group[0], seq_len); 78 | for (size_t i = 1; i < group_size; i++) { 79 | mutate(group[0], group[i]); 80 | } 81 | } 82 | 83 | group.resize(group_size); 84 | seqs.insert(seqs.end(), group.begin(), group.end()); 85 | if (seqs.size() > num_seqs) { 86 | seqs.resize(num_seqs); 87 | } 88 | } 89 | return seqs; 90 | } 91 | 92 | template 93 | void ingroup_pairs(std::vector> &pairs) { 94 | for (size_t go = 0; go < num_seqs; go += group_size) { // group-offset 95 | for (size_t i = 0; i < group_size && go + i < num_seqs; i++) { // group-member i 96 | for (size_t j = i + 1; j < group_size && go + j < num_seqs; j++) { // group-member j 97 | pairs.push_back({ go + i, go + j }); 98 | } 99 | } 100 | } 101 | } 102 | 103 | 104 | private: 105 | template 106 | void mutate(const std::vector &ref, std::vector &seq) { 107 | std::uniform_real_distribution unif(min_mutation_rate, max_mutation_rate); 108 | mutate(ref, seq, unif(gen)); 109 | if (fix_len) 110 | make_fix_len(seq); 111 | } 112 | 113 | /** 114 | * Mutate seq from ref, by mutating each position with the probability = `rate` 115 | * @tparam T element type in the sequence 116 | * @param ref 117 | * @param seq mutated sequence 118 | * @param rate probability of mutation at each index 119 | */ 120 | template 121 | void mutate(const std::vector &ref, std::vector &seq, double rate) { 122 | assert((rate >= 0.0) && (rate <= 1.0) && " rate must be strictly in the range [0,1]"); 123 | // probabilities for each index position: no mutation, insert, delete, substitute 124 | std::discrete_distribution mut { 1 - rate, rate / 3, rate / 3, rate / 3 }; 125 | // the range chosen such that (sub_char+ref % alphabet_size) will different from ref 126 | std::uniform_int_distribution sub_char(1, alphabet_size - 1); 127 | // random character from the alphabet 128 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 129 | for (size_t i = 0; i < ref.size(); i++) { 130 | switch (mut(gen)) { 131 | case 0: { // no mutation 132 | seq.push_back(ref[i]); 133 | break; 134 | } 135 | case 1: { // insert 136 | seq.push_back(rand_char(gen)); 137 | i--; // init_tensor_slide_params negate the increment 
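                        // (the decrement cancels the loop's i++, so ref[i] is
                        // considered again after the inserted random character)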
138 | break; 139 | } 140 | case 2: { // delete 141 | break; 142 | } 143 | case 3: { // substitute 144 | seq.push_back((sub_char(gen) + ref[i]) % alphabet_size); 145 | break; 146 | } 147 | } 148 | } 149 | } 150 | 151 | 152 | template 153 | void make_fix_len(std::vector &seq) { 154 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 155 | if (seq.size() > seq_len) { 156 | seq = std::vector(seq.begin(), seq.end()); 157 | } else if (seq.size() < seq_len) { 158 | while (seq.size() < seq_len) { 159 | seq.push_back(rand_char(gen)); 160 | } 161 | } 162 | } 163 | 164 | /** 165 | * Generate a random sequence of length `len` 166 | * @tparam T 167 | * @param seq : the result will be stored in `seq` 168 | * @param len : length of the random sequence 169 | */ 170 | template 171 | void random_sequence(std::vector &seq, size_t len) { 172 | seq.resize(len); 173 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 174 | for (uint32_t i = 0; i < len; i++) { 175 | seq[i] = rand_char(gen); 176 | } 177 | } 178 | 179 | 180 | private: 181 | std::mt19937 gen = std::mt19937(341234); 182 | 183 | uint8_t alphabet_size; 184 | bool fix_len; 185 | uint32_t num_seqs; 186 | uint32_t seq_len; 187 | uint32_t group_size; 188 | double max_mutation_rate; 189 | double min_mutation_rate; 190 | std::string phylogeny_shape; 191 | }; 192 | 193 | } // namespace ts 194 | -------------------------------------------------------------------------------- /sequence_generator_main.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence/sequence_generator.hpp" 2 | #include "sequence/fasta_io.hpp" 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | DEFINE_int32(alphabet_size, 4, "Size of the alphabet for generated sequences"); 11 | DEFINE_int32(A, 4, "Short hand for --alphabet_size"); 12 | 13 | DEFINE_bool(fix_len, false, "Force generated sequence length to be equal"); 14 | DEFINE_bool(F, false, "Short hand for --fix_len"); 15 | 16 | 17 | DEFINE_uint32(num_seqs, 200, "Number of sequences to be generated"); 18 | DEFINE_uint32(N, 200, "Short hand for --num_seqs"); 19 | 20 | DEFINE_uint32(seq_len, 256, "The length of sequence to be generated"); 21 | DEFINE_uint32(L, 256, "Short hand for --seq_len"); 22 | 23 | DEFINE_uint32(group_size, 2, "The number of sequences in each group"); 24 | DEFINE_uint32(G, 2, "Short hand for --group_size"); 25 | 26 | DEFINE_double(max_mutation_rate, 0.3, "Maximum rate of point mutation for sequence generation"); 27 | DEFINE_double(R, 0.3, "Short hand for --max_mutation_rate"); 28 | 29 | DEFINE_double(min_mutation_rate, 0.0, "min rate for sequence mutation for sequence generation"); 30 | DEFINE_double(r, 0.00, "Short hand for --min_mutation_rate"); 31 | 32 | 33 | DEFINE_string(output_dir, "/tmp/", "File name where the generated sequence should be written"); 34 | DEFINE_string(o, "./seqs.fa", "Short hand for --output"); 35 | 36 | 37 | static bool validatePhylogenyShape(const char *flagname, const std::string &value) { 38 | if (value == "path" || value == "tree" || value == "star" || value == "pair") 39 | return true; 40 | printf("Invalid value for --%s: %s\n", flagname, value.c_str()); 41 | return false; 42 | } 43 | DEFINE_string(phylogeny_shape, 44 | "path", 45 | "shape of the phylogeny can be 'path', 'tree', 'star', or 'pair'"); 46 | DEFINE_validator(phylogeny_shape, &validatePhylogenyShape); 47 | 48 | 49 | 50 | void adjust_short_names() { 51 | if (!gflags::GetCommandLineFlagInfoOrDie("A").is_default) { 52 
| FLAGS_alphabet_size = FLAGS_A; 53 | } 54 | if (!gflags::GetCommandLineFlagInfoOrDie("N").is_default) { 55 | FLAGS_num_seqs = FLAGS_N; 56 | } 57 | if (!gflags::GetCommandLineFlagInfoOrDie("L").is_default) { 58 | FLAGS_seq_len = FLAGS_L; 59 | } 60 | if (!gflags::GetCommandLineFlagInfoOrDie("R").is_default) { 61 | FLAGS_max_mutation_rate = FLAGS_R; 62 | } 63 | if (!gflags::GetCommandLineFlagInfoOrDie("r").is_default) { 64 | FLAGS_min_mutation_rate = FLAGS_r; 65 | } 66 | if (!gflags::GetCommandLineFlagInfoOrDie("o").is_default) { 67 | FLAGS_output_dir = FLAGS_o; 68 | } 69 | if (!gflags::GetCommandLineFlagInfoOrDie("G").is_default) { 70 | FLAGS_group_size = FLAGS_G; 71 | } 72 | } 73 | 74 | int main(int argc, char *argv[]) { 75 | gflags::ParseCommandLineFlags(&argc, &argv, true); 76 | adjust_short_names(); 77 | 78 | ts::Vec2D seqs; 79 | std::vector seq_names; 80 | std::string test_id; 81 | 82 | ts::SeqGen seq_gen(FLAGS_alphabet_size, FLAGS_fix_len, FLAGS_num_seqs, FLAGS_seq_len, 83 | FLAGS_group_size, FLAGS_max_mutation_rate, FLAGS_min_mutation_rate, FLAGS_phylogeny_shape); 84 | 85 | seqs = seq_gen.generate_seqs(); 86 | ts::write_fasta(std::filesystem::path(FLAGS_output_dir) / "seqs.fa", seqs); 87 | } 88 | -------------------------------------------------------------------------------- /sketch/dim_reduce.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/utils.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | namespace ts { 11 | 12 | class Int32Flattener { 13 | public: 14 | using sketch_type = std::vector; 15 | 16 | Int32Flattener(uint32_t flat_dim, uint32_t sketch_dim, uint32_t max_len, uint32_t seed) 17 | : flat_dim(flat_dim), sketch_dim(sketch_dim), max_len(max_len) { 18 | std::mt19937 gen(seed); 19 | std::uniform_int_distribution distribution; 20 | rand_proj = new2D(flat_dim, this->max_len * sketch_dim * 2); 21 | for (auto &v : rand_proj) { 22 | for (auto &e : v) { 23 | e = distribution(gen); 24 | } 25 | } 26 | } 27 | 28 | std::vector flatten(const Vec2D &sketch) { 29 | Timer timer("Int32Flattener"); 30 | assert(rand_proj.size() == flat_dim); 31 | std::vector v(flat_dim, 0); 32 | for (uint32_t s1 = 0; s1 < flat_dim; s1++) { 33 | for (uint32_t s2 = 0; s2 < 32; s2++) { // iterate over 32 bits 34 | size_t j = s1 % sketch_dim; 35 | double val = 0; 36 | for (size_t i = 0; i < sketch.size(); i++) { 37 | auto bit = rand_proj[s1][i * sketch_dim + j] >> s2; // random bit 38 | val += (bit & 1) ? 
sketch[i][j] : -sketch[i][j]; 39 | } 40 | v[s1] = (v[s1] << 1) + std::signbit(val); // insert sgn(val) into v[s1] 41 | } 42 | } 43 | return v; 44 | } 45 | 46 | static double dist(const std::vector &v1, const std::vector &v2) { 47 | Timer timer("Int32Flattener_dist"); 48 | assert(v1.size() == v2.size()); 49 | std::vector d(v1.size()); 50 | double val = 0; 51 | for (size_t i = 0; i < d.size(); i++) { 52 | val += __builtin_popcount(v1[i] ^ v2[i]); 53 | } 54 | return val; 55 | } 56 | 57 | private: 58 | uint32_t flat_dim; 59 | uint32_t sketch_dim; 60 | uint32_t max_len; 61 | Vec2D rand_proj; 62 | }; 63 | 64 | 65 | class DoubleFlattener { 66 | public: 67 | using sketch_type = std::vector; 68 | 69 | DoubleFlattener(uint32_t output_dim, 70 | uint32_t input_dim, 71 | uint32_t input_max_len, 72 | uint32_t seed) 73 | : flat_dim(output_dim), sketch_dim(input_dim), max_len(input_max_len) { 74 | std::mt19937 gen(seed); 75 | std::cauchy_distribution distribution(0, 1.0); 76 | rand_proj = new2D(this->flat_dim, this->max_len * input_dim * 2); 77 | for (auto &v : rand_proj) { 78 | for (double &e : v) { 79 | e = distribution(gen); 80 | } 81 | } 82 | } 83 | 84 | std::vector flatten(const Vec2D &sketch) { 85 | Timer timer("DoubleFlattener"); 86 | assert(rand_proj.size() == flat_dim); 87 | std::vector v(this->flat_dim, 0); 88 | for (size_t s = 0; s < this->flat_dim; s++) { 89 | size_t j = s % this->sketch_dim; 90 | for (size_t i = 0; i < sketch.size(); i++) { 91 | v[s] += rand_proj[s][i * this->sketch_dim + j] * sketch[i][j]; 92 | } 93 | v[s] /= (double)(sketch.size() 94 | * sketch[0] 95 | .size()); // divide by number of elements to compute the mean 96 | } 97 | 98 | return v; 99 | } 100 | 101 | static double dist(const std::vector &v1, const std::vector &v2) { 102 | Timer timer("DoubleFlattener_dist"); 103 | assert(v1.size() == v2.size()); 104 | std::vector d(v1.size()); 105 | for (size_t i = 0; i < d.size(); i++) { 106 | d[i] = abs(v1[i] - v2[i]); 107 | } 108 | std::sort(d.begin(), d.end()); 109 | return d[d.size() / 2]; // return the median 110 | } 111 | 112 | private: 113 | uint32_t flat_dim; 114 | uint32_t sketch_dim; 115 | uint32_t max_len; 116 | Vec2D rand_proj; 117 | }; 118 | 119 | } // namespace ts 120 | -------------------------------------------------------------------------------- /sketch/edit_distance.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sketch/sketch_base.hpp" 4 | #include "util/multivec.hpp" 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | template 15 | class EditDistance : public SketchBase *, false> { 16 | public: 17 | explicit EditDistance(const std::string &name = "ED") 18 | : SketchBase *, false>(name) { 19 | init(); 20 | } 21 | 22 | void init() {} 23 | 24 | const std::vector *compute(const std::vector &seq) { return &seq; } 25 | 26 | static double dist(const std::vector *a, const std::vector *b) { 27 | Timer timer("edit_dist"); 28 | return edit_distance(*a, *b); 29 | } 30 | }; 31 | 32 | } // namespace ts 33 | -------------------------------------------------------------------------------- /sketch/hash_base.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | 3 | namespace ts { 4 | 5 | HashAlgorithm parse_hash_algorithm(const std::string &name) { 6 | if (name == "uniform") { 7 | return HashAlgorithm::uniform; 8 | } 9 | if (name 
== "crc32") { 10 | return HashAlgorithm::crc32; 11 | } 12 | if (name == "murmur") { 13 | return HashAlgorithm::murmur; 14 | } 15 | assert(false); 16 | } 17 | 18 | } // namespace ts 19 | -------------------------------------------------------------------------------- /sketch/hash_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "sketch/sketch_base.hpp" 4 | #include "util/timer.hpp" 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace ts { 16 | 17 | enum class HashAlgorithm { uniform, crc32, murmur }; 18 | 19 | HashAlgorithm parse_hash_algorithm(const std::string &name); 20 | 21 | /** 22 | * @tparam T the type of elements in the hash. 23 | */ 24 | template 25 | class HashBase : public SketchBase, true> { 26 | public: 27 | HashBase(T set_size, 28 | size_t sketch_dim, 29 | size_t hash_size, 30 | HashAlgorithm hash_algorithm, 31 | uint32_t seed, 32 | const std::string &name = "HashBase", 33 | size_t kmer_size = 1) 34 | : SketchBase, true>(name, kmer_size), 35 | set_size(set_size), 36 | sketch_dim(sketch_dim), 37 | hash_size(2 * hash_size), 38 | hash_algorithm(hash_algorithm), 39 | rand(0, this->hash_size - 1), 40 | rng(seed) { 41 | init(); 42 | } 43 | 44 | void init() { 45 | hash_seed = rand(rng); 46 | hash_seed2 = rand(rng); 47 | hashes.assign(sketch_dim, {}); 48 | hash_values.assign(sketch_dim, {}); 49 | } 50 | 51 | void set_hashes_for_testing(const std::vector> &h) { hashes = h; } 52 | 53 | protected: 54 | T set_size; 55 | size_t sketch_dim; 56 | size_t hash_size; 57 | 58 | /** 59 | * Returns the hash value for the given #key of the #index-th hash function. 60 | */ 61 | T hash(uint64_t index, uint64_t key) { 62 | switch (hash_algorithm) { 63 | case HashAlgorithm::uniform: { 64 | T val; 65 | // TODO multiple read Semaphore instead of critical 66 | #pragma omp critical 67 | { 68 | auto [it, inserted] = hashes[index].insert({ key, -1 }); 69 | if (!inserted) { 70 | val = it->second; 71 | } else { 72 | do { 73 | val = rand(rng); 74 | } while (!hash_values[index].insert(val).second); 75 | it->second = val; 76 | } 77 | } 78 | assert(val >= 0 && val < hash_size 79 | && " Hash values are not in [0,set_size-1] range"); 80 | return val; 81 | } 82 | case HashAlgorithm::crc32: { 83 | uint32_t val = _mm_crc32_u32(hash_seed, (uint32_t)key); 84 | val = _mm_crc32_u32((uint32_t)val, (uint32_t)index); 85 | if constexpr (sizeof(T) <= 4) { 86 | return static_cast(val); 87 | } else if constexpr (sizeof(T) <=8) { 88 | uint32_t val2 = _mm_crc32_u32(hash_seed2, (uint32_t)key); 89 | val2 = _mm_crc32_u32((uint32_t)val2, (uint32_t)index); 90 | return (val << 4) | val2; 91 | } 92 | } 93 | case HashAlgorithm::murmur: { 94 | uint64_t to_hash[] = { index, key }; 95 | uint8_t result[16]; 96 | MurmurHash3_x86_128(to_hash, 16, hash_seed, result); 97 | T v = *((T *)result); 98 | return v; 99 | } 100 | default: 101 | return -1; 102 | } 103 | } 104 | 105 | private: 106 | HashAlgorithm hash_algorithm; 107 | 108 | /** Contains the sketch_dim permutations (hashes) that are used to compute the min-hash */ 109 | std::vector> hashes; 110 | /** Contains the values used so far for each on-demand permutation */ 111 | std::vector> hash_values; 112 | std::uniform_int_distribution rand; 113 | std::mt19937 rng; 114 | uint32_t hash_seed; 115 | uint32_t hash_seed2; 116 | }; 117 | 118 | 119 | } // namespace ts 120 | 
-------------------------------------------------------------------------------- /sketch/hash_min.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Implements min-hash-based sketching, as described in https://en.wikipedia.org/wiki/MinHash. 16 | * Given a set S, and a sequence s=s1...sn with elements from S, this class computes a vector 17 | * {hmin_1(s), hmin_2(s), ..., hmin_sketch_size(s)}, where hmin_k(s)=s_i, such that h_k(s_i) is the 18 | * smallest of h_k(s_1), h_k(s_2), ..., h_k(s_n) and h_k:S->{1..#set_size} is a random permuation of 19 | * the elements in S. 20 | * This class assumes that S= {0,1,2....,#set_size}. 21 | * @tparam T the type of S's elements. 22 | */ 23 | template 24 | class MinHash : public HashBase { 25 | public: 26 | /** 27 | * Constructs a min-hasher for the given alphabet size which constructs sketches of the set size 28 | * and sketch dimension. 29 | * @param set_size the number of elements in S, 30 | * @param sketch_dim the number of components (elements) in the sketch vector. 31 | * @param seed the seed to initialize the random number generator used for the random hash 32 | * functions. 33 | */ 34 | MinHash(T set_size, 35 | size_t sketch_dim, 36 | HashAlgorithm hash_algorithm, 37 | uint32_t seed, 38 | const std::string &name = "MH", 39 | size_t kmer_size = 1) 40 | : HashBase(set_size, sketch_dim, set_size, hash_algorithm, seed, name, kmer_size) {} 41 | 42 | /** 43 | * Computes the min-hash sketch for the given kmers. 44 | * @param kmers kmers extracted from a sequence 45 | * @return the min-hash sketch of #kmers 46 | */ 47 | std::vector compute(const std::vector &kmers) { 48 | Timer timer("minhash"); 49 | std::vector sketch(this->sketch_dim); 50 | if (kmers.empty()) { 51 | return sketch; 52 | } 53 | 54 | for (size_t si = 0; si < this->sketch_dim; si++) { 55 | T min_char = T(0); 56 | T min_rank = std::numeric_limits::max(); 57 | for (auto s : kmers) { 58 | T hash = this->hash(si, s); 59 | if (hash < min_rank) { 60 | min_rank = hash; 61 | min_char = s; 62 | } 63 | } 64 | sketch[si] = min_char; 65 | } 66 | return sketch; 67 | } 68 | 69 | /** 70 | * Computes the min-hash sketch for the given sequence. 
71 | * @param sequence the sequence to compute the min-hash for 72 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 73 | * hashed 74 | * @param number of characters in the alphabet over which sequence is defined 75 | * @return the min-hash sketch of sequence 76 | * @tparam C the type of characters in the sequence 77 | */ 78 | template 79 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 80 | std::vector kmers = seq2kmer(sequence, k, alphabet_size); 81 | return compute(kmers); 82 | } 83 | 84 | static T dist(const std::vector &a, const std::vector &b) { 85 | Timer timer("minhash_dist"); 86 | return hamming_dist(a, b); 87 | } 88 | }; 89 | } // namespace ts 90 | -------------------------------------------------------------------------------- /sketch/hash_ordered.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace ts { // ts = Tensor Sketch 12 | 13 | /** 14 | * Naive implementation of the Ordered MinHash sketching method described in: 15 | * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6612865/ 16 | * 17 | * @tparam T the type of element in the sequences to be sketched 18 | */ 19 | template 20 | class OrderedMinHash : public HashBase { 21 | public: 22 | /** 23 | * @param set_size the number of elements in S 24 | * @param sketch_dim the number of components (elements) in the sketch vector. 25 | * @param max_len maximum sequence length to be hashed. 26 | * @param tup_len the sketching will select the tup_len lowest values for each hash function 27 | * @param seed the seed to initialize the random number generator used for the random hash 28 | * functions. 29 | */ 30 | OrderedMinHash(T set_size, 31 | size_t sketch_dim, 32 | size_t max_len, 33 | size_t tup_len, 34 | HashAlgorithm hash_algorithm, 35 | uint32_t seed, 36 | const std::string &name = "OMH", 37 | size_t kmer_size = 1) 38 | : HashBase(set_size, sketch_dim, set_size * max_len, hash_algorithm, seed, name, kmer_size), 39 | max_len(max_len), 40 | tup_len(tup_len) {} 41 | 42 | Vec2D compute_2d(const std::vector &kmers) { 43 | Vec2D sketch(this->sketch_dim); 44 | if (kmers.size() < tup_len) { 45 | throw std::invalid_argument("Sequence of kmers must be longer than tuple length"); 46 | } 47 | for (size_t pi = 0; pi < this->sketch_dim; pi++) { 48 | std::unordered_map counts; 49 | std::vector> ranks; 50 | for (size_t i = 0; i < kmers.size(); i++) { 51 | auto s = kmers[i]; 52 | ranks.push_back({ this->hash(pi, s + this->set_size * counts[s]), i }); 53 | counts[s]++; 54 | #ifndef NDEBUG 55 | assert(counts[s] != 0); // no overflow 56 | if (counts[s] > max_len) { 57 | throw std::invalid_argument("Kmer " + std::to_string(s) + " repeats more than " 58 | + std::to_string(max_len) 59 | + " times. 
Set --max_len to a higher value."); 60 | } 61 | #endif 62 | } 63 | std::sort(ranks.begin(), ranks.end()); 64 | std::vector tup; 65 | for (auto pair = ranks.begin(); pair != ranks.end() && pair != ranks.begin() + tup_len; 66 | pair++) { 67 | tup.push_back(pair->second); 68 | } 69 | std::sort(tup.begin(), tup.end()); // sort indices of kmers 70 | for (auto idx : tup) 71 | sketch[pi].push_back(kmers[idx]); 72 | } 73 | return sketch; 74 | } 75 | 76 | std::vector compute(const std::vector &kmers) { 77 | Timer timer("ordered_minhash"); 78 | std::vector sketch; 79 | 80 | Vec2D sketch2D = compute_2d(kmers); 81 | for (const auto &tuple : sketch2D) { 82 | T sum = 0; 83 | for (const auto &item : tuple) { 84 | sum = sum * this->set_size + item; // TODO: deal with overflows 85 | } 86 | sketch.push_back(sum); 87 | } 88 | 89 | return sketch; 90 | } 91 | 92 | /** 93 | * Computes the ordered min-hash sketch for the given sequence. 94 | * @param sequence the sequence to compute the ordered min-hash for 95 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 96 | * hashed 97 | * @param number of characters in the alphabet over which sequence is defined 98 | * @return the ordered min-hash sketch of sequence 99 | * @tparam C the type of characters in the sequence 100 | */ 101 | template 102 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 103 | return compute(seq2kmer(sequence, k, alphabet_size)); 104 | } 105 | 106 | static T dist(const std::vector &a, const std::vector &b) { 107 | Timer timer("ordered_minhash_dist"); 108 | return hamming_dist(a, b); 109 | } 110 | 111 | private: 112 | size_t max_len; 113 | size_t tup_len; 114 | }; 115 | 116 | } // namespace ts 117 | -------------------------------------------------------------------------------- /sketch/hash_weighted.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hash_base.hpp" 4 | 5 | #include "util/timer.hpp" 6 | #include "util/utils.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Naive implementation of weighted min-hash sketching. For more efficient implementations, see 16 | * https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36928.pdf and 17 | * https://www.microsoft.com/en-us/research/wp-content/uploads/2010/06/ConsistentWeightedSampling2.pdf 18 | * 19 | * Given a set S, and a sequence s=s1...sn with elements from S, this class computes a vector 20 | * {hmin_1(s), hmin_2(s), ..., hmin_sketch_size(s)}, where hmin_k(s)=s_i, such that h_k(s_i, #s_i) 21 | * is the smallest of h_k(s_1, 1..#s_1), h_k(s_2, 1..#s_2), ..., h_k(s_n, 1..#s_n) and 22 | * h_k:Sx{1..n} -> {1..#set_size} is a random permuation of the elements in S and #s_i denotes the 23 | * number of occurences of s_i in the sequence s. 24 | * @tparam T the type of S's elements 25 | */ 26 | template 27 | class WeightedMinHash : public HashBase { 28 | public: 29 | /** 30 | * Constructs a weighted min-hasher for the given alphabet size which constructs sketches of the 31 | * given set size, dimension and maximum length. 32 | * @param set_size the number of elements in S, 33 | * @param sketch_dim the number of components (elements) in the sketch vector. 34 | * @param max_len maximum sequence length to be hashed. 35 | * @param seed the seed to initialize the random number generator used for the random hash 36 | * functions. 
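     * For an illustrative setting (not project defaults): with set_size=16
     * (2-mers over DNA4) and max_len=128, each of the sketch_dim hash functions
     * ranks up to 16*128 = 2048 distinct (k-mer, occurrence-index) keys.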
37 | */ 38 | WeightedMinHash(T set_size, 39 | size_t sketch_dim, 40 | size_t max_len, 41 | HashAlgorithm hash_algorithm, 42 | uint32_t seed, 43 | const std::string &name = "WMH", 44 | size_t kmer_size = 1) 45 | : HashBase(set_size, sketch_dim, max_len * set_size, hash_algorithm, seed, name, kmer_size), 46 | max_len(max_len) {} 47 | 48 | std::vector compute(const std::vector &kmers) { 49 | Timer timer("weighted_minhash"); 50 | std::vector sketch = std::vector(this->sketch_dim); 51 | if (kmers.empty()) { 52 | return sketch; 53 | } 54 | 55 | for (size_t si = 0; si < this->sketch_dim; si++) { 56 | T min_char = T(0); 57 | T min_rank = std::numeric_limits::max(); 58 | std::unordered_map cnts; 59 | for (const auto s : kmers) { 60 | T r = this->hash(si, s + cnts[s] * this->set_size); 61 | cnts[s]++; 62 | #ifndef NDEBUG 63 | assert(cnts[s] != 0); // no overflow 64 | if (cnts[s] > max_len) { 65 | throw std::invalid_argument("Kmer " + std::to_string(s) + " repeats more than " 66 | + std::to_string(max_len) 67 | + " times. Set --max_len to a higher value."); 68 | } 69 | #endif 70 | 71 | if (r < min_rank) { 72 | min_rank = r; 73 | min_char = s; 74 | } 75 | } 76 | sketch[si] = min_char; 77 | } 78 | return sketch; 79 | } 80 | 81 | /** 82 | * Computes the ordered min-hash sketch for the given sequence. 83 | * @param sequence the sequence to compute the ordered min-hash for 84 | * @param k-mer length; the sequence will be transformed into k-mers and the k-mers will be 85 | * hashed 86 | * @param number of characters in the alphabet over which sequence is defined 87 | * @return the ordered min-hash sketch of #sequence 88 | * @tparam C the type of characters in #sequence 89 | */ 90 | template 91 | std::vector compute(const std::vector &sequence, uint32_t k, uint32_t alphabet_size) { 92 | std::vector kmers = seq2kmer(sequence, k, alphabet_size); 93 | std::vector sketch = compute(kmers); 94 | return sketch; 95 | } 96 | 97 | static T dist(const std::vector &a, const std::vector &b) { 98 | Timer timer("weighted_minhash_dist"); 99 | return hamming_dist(a, b); 100 | } 101 | 102 | private: 103 | size_t max_len; 104 | }; 105 | 106 | } // namespace ts 107 | -------------------------------------------------------------------------------- /sketch/sketch_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | namespace ts { 7 | 8 | /** 9 | * A base class for sketch algorithms. 10 | * 11 | * @tparam SketchType the type returned by the compute() function. 12 | * @tparam KmerInput is true when the hash algorithm first should convert the sequence to kmers. 13 | * */ 14 | template 15 | class SketchBase { 16 | public: 17 | // The type that the compute function returns. 18 | using sketch_type = SketchType; 19 | 20 | // Whether the compute function takes a list of kmers. 21 | constexpr static bool kmer_input = KmerInput; 22 | 23 | // Whether transformations should be applied to the sketch output of this algorithm. 24 | constexpr static bool transform_sketches = false; 25 | 26 | // The name of the sketching algorithm. 27 | const std::string name; 28 | 29 | // If kmer_input=true, value of kmer_size 30 | size_t kmer_size = 1; 31 | 32 | explicit SketchBase(std::string name, size_t kmer_size = 1) : name(std::move(name)), kmer_size(kmer_size) {} 33 | 34 | // Must be overridden by implementations. 35 | // Calling it will initialize the random hashes, overwriting any previous hash functions. 
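// (Sketches without random state, e.g. TensorEmbedding, can override this with a no-op; see tensor_embedding.hpp.)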
36 | // May be called multiple times on the same object to reset the state before running it on a new 37 | // set of sequences. 38 | void init() { static_assert(!sizeof(SketchType *), "Sketch type should implement init()."); } 39 | }; 40 | 41 | } // namespace ts 42 | -------------------------------------------------------------------------------- /sketch/tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sketch//sketch_base.hpp" 6 | #include "util/multivec.hpp" 7 | #include "util/timer.hpp" 8 | #include "util/utils.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace ts { // ts = Tensor Sketch 15 | 16 | /** 17 | * Computes tensor sketches for a given sequence as described in 18 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 19 | * @tparam seq_type the type of elements in the sequences to be sketched. 20 | */ 21 | template 22 | class Tensor : public SketchBase, false> { 23 | public: 24 | // Tensor sketch output should be transformed if the command line flag is set. 25 | constexpr static bool transform_sketches = false; 26 | 27 | /** 28 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 29 | * defined (e.g. 4 for DNA) 30 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 31 | * @param subsequence_len the length of the subsequences considered for sketching, denoted by t 32 | * in the paper 33 | * @param seed the seed to initialize the random number generator used for the random hash 34 | * functions. 35 | */ 36 | Tensor(seq_type alphabet_size, 37 | size_t sketch_dim, 38 | size_t subsequence_len, 39 | uint32_t seed, 40 | const std::string &name = "TS") 41 | : SketchBase, false>(name), 42 | alphabet_size(alphabet_size), 43 | sketch_dim(sketch_dim), 44 | subsequence_len(subsequence_len), 45 | rng(seed) { 46 | init(); 47 | } 48 | 49 | void init() { 50 | hashes = new2D(subsequence_len, alphabet_size); 51 | signs = new2D(subsequence_len, alphabet_size); 52 | 53 | std::uniform_int_distribution rand_hash2(0, sketch_dim - 1); 54 | std::uniform_int_distribution rand_bool(0, 1); 55 | 56 | for (size_t h = 0; h < subsequence_len; h++) { 57 | for (size_t c = 0; c < alphabet_size; c++) { 58 | hashes[h][c] = rand_hash2(rng); 59 | signs[h][c] = rand_bool(rng); 60 | } 61 | } 62 | } 63 | 64 | /** 65 | * Computes the sketch of the given sequence. 66 | * @param seq the sequence to be sketched 67 | * @return an array of size #sketch_dim containing the sequence's sketch 68 | */ 69 | std::vector compute(const std::vector &seq) { 70 | Timer timer("tensor_sketch"); 71 | // Tp corresponds to T+, Tm to T- in the paper; Tp[0], Tm[0] are sentinels and contain the 72 | // initial condition for empty strings; Tp[p], Tm[p] represent the partial sketch when 73 | // considering hashes h1...hp, over the prefix x1...xi. The final result is then 74 | // Tp[t]-Tm[t], where t is #sequence_len 75 | auto Tp = new2D(subsequence_len + 1, sketch_dim, 0); 76 | auto Tm = new2D(subsequence_len + 1, sketch_dim, 0); 77 | 78 | // the initial condition states that the sketch for the empty string is (1,0,..) 
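// (Tp[0][0] = 1 below encodes that initial condition.) The subsequent loop implements the
// recurrence via shift_sum_inplace: with z = p / (i + 1), r = hashes[p-1][c] and
// s = signs[p-1][c],
//   Tp[p] <- (1 - z) * Tp[p] + z * shift_r(Tp[p-1]),   and likewise for Tm[p],
// with the roles of Tp[p-1] and Tm[p-1] swapped whenever signs[p-1][c] is false.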
79 | Tp[0][0] = 1; 80 | for (uint32_t i = 0; i < seq.size(); i++) { 81 | const seq_type c = seq[i]; 82 | if (c < 0 or c >= alphabet_size) { 83 | continue; 84 | } 85 | // must traverse in reverse order, to avoid overwriting the values of Tp and Tm before 86 | // they are used in the recurrence 87 | for (uint32_t p = std::min(i + 1, (uint32_t)subsequence_len); p >= 1; --p) { 88 | const double z = p / (i + 1.0); // probability that the last index is i 89 | const seq_type r = hashes[p - 1][c]; 90 | const bool s = signs[p - 1][c]; 91 | if (s) { 92 | this->shift_sum_inplace(Tp[p], Tp[p - 1], r, z); 93 | this->shift_sum_inplace(Tm[p], Tm[p - 1], r, z); 94 | } else { 95 | this->shift_sum_inplace(Tp[p], Tm[p - 1], r, z); 96 | this->shift_sum_inplace(Tm[p], Tp[p - 1], r, z); 97 | } 98 | } 99 | } 100 | std::vector sketch(sketch_dim, 0); 101 | for (uint32_t m = 0; m < sketch_dim; m++) { 102 | sketch[m] = Tp[subsequence_len][m] - Tm[subsequence_len][m]; 103 | } 104 | 105 | return sketch; 106 | } 107 | 108 | /** Sets the hash and sign functions to predetermined values for testing */ 109 | void set_hashes_for_testing(const Vec2D &h, const Vec2D &s) { 110 | hashes = h; 111 | signs = s; 112 | } 113 | 114 | static double dist(const std::vector &a, const std::vector &b) { 115 | Timer timer("tensor_sketch_dist"); 116 | return l2_dist(a, b); 117 | } 118 | 119 | protected: 120 | /** Computes (1-z)*a + z*b_shift */ 121 | void shift_sum_inplace(std::vector &a, 122 | const std::vector &b, 123 | seq_type shift, 124 | double z) { 125 | assert(a.size() == b.size()); 126 | size_t len = a.size(); 127 | for (uint32_t i = 0; i < len; i++) { 128 | a[i] = (1 - z) * a[i] + z * b[(len + i - shift) % len]; 129 | assert(a[i] <= 1 + 1e-5 && a[i] >= -1e-5); 130 | } 131 | } 132 | 133 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 4 for DNA */ 134 | seq_type alphabet_size; 135 | /** Number of elements in the sketch, denoted by D in the paper */ 136 | uint32_t sketch_dim; 137 | /** The length of the subsequences considered for sketching, denoted by t in the paper */ 138 | uint8_t subsequence_len; 139 | 140 | /** 141 | * Denotes the hash functions h1,....ht:A->{1....D}, where t is #subsequence_len and D is 142 | * #sketch_dim 143 | */ 144 | Vec2D hashes; 145 | 146 | /** The sign functions s1...st:A->{-1,1} */ 147 | Vec2D signs; 148 | 149 | std::mt19937 rng; 150 | }; 151 | 152 | } // namespace ts 153 | -------------------------------------------------------------------------------- /sketch/tensor_block.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sketch//sketch_base.hpp" 6 | #include "util/multivec.hpp" 7 | #include "util/timer.hpp" 8 | #include "util/utils.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace ts { // ts = Tensor Sketch 16 | 17 | /** 18 | * Computes tensor sketches for a given sequence as described in 19 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1, with the additional limitation that 20 | * the subsequences must be made of continuous blocks of a certain size. The adaptation of the 21 | * recurrence formula for this case is at https://go.grlab.org/tensor_block. 22 | * For block_size=1, the normal the #TensorBlock sketch is identical with #Tensor sketch. 23 | * @tparam seq_type the type of elements in the sequences to be sketched. 
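*
* Illustrative usage (a sketch only; parameter values are hypothetical, and block_size must
* divide subsequence_len, as asserted in the constructor):
*   TensorBlock<uint8_t> tsb(/*alphabet_size=*/4, /*sketch_dim=*/16,
*                            /*subsequence_len=*/6, /*block_size=*/2, /*seed=*/31415);
*   std::vector<double> sketch = tsb.compute(encoded_dna); // values in [0, alphabet_size)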
24 | */ 25 | template 26 | class TensorBlock : public SketchBase, false> { 27 | public: 28 | // Tensor sketch output should be transformed if the command line flag is set. 29 | constexpr static bool transform_sketches = false; 30 | 31 | /** 32 | * @param block_size only subsequences formed out of block_size continuous elements are sketched 33 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 34 | * defined (e.g. 4 for DNA, 20 for protein, etc.) 35 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 36 | * @param subsequence_len the length of the subsequences considered for sketching, denoted by t 37 | * in the paper 38 | * @param seed the seed to initialize the random number generator used for the random hash 39 | * functions. 40 | */ 41 | TensorBlock(seq_type alphabet_size, 42 | size_t sketch_dim, 43 | size_t subsequence_len, 44 | uint8_t block_size, 45 | uint32_t seed, 46 | const std::string &name = "TSB") 47 | : SketchBase, false>(name), 48 | block_size(block_size), 49 | alphabet_size(alphabet_size), 50 | sketch_dim(sketch_dim), 51 | subsequence_len(subsequence_len), 52 | rng(seed) { 53 | assert(block_size > 0 && subsequence_len > 0 && subsequence_len % block_size == 0); 54 | init(); 55 | } 56 | 57 | void init() { 58 | hashes = new2D(subsequence_len, alphabet_size); 59 | signs = new2D(subsequence_len, alphabet_size); 60 | 61 | std::uniform_int_distribution rand_hash2(0, sketch_dim - 1); 62 | std::uniform_int_distribution rand_bool(0, 1); 63 | 64 | for (size_t h = 0; h < subsequence_len; h++) { 65 | for (size_t c = 0; c < alphabet_size; c++) { 66 | hashes[h][c] = rand_hash2(rng); 67 | signs[h][c] = rand_bool(rng); 68 | } 69 | } 70 | } 71 | 72 | /** 73 | * Computes the sketch of the given sequence. 74 | * @param seq the sequence to be sketched 75 | * @return an array of size #sketch_dim containing the sequence's sketch 76 | */ 77 | std::vector compute(const std::vector &seq) { 78 | Timer timer("tensor_sketch"); 79 | // Tp corresponds to T+, Tm to T- in the paper; Tp[0], Tm[0] are sentinels and contain the 80 | // initial condition for empty strings; Tp[p], Tm[p] at step i represent the partial sketch 81 | // when considering hashes h1...hp, over the prefix x1...xi. The final result is then 82 | // Tp[t]-Tm[t], where t is #sequence_len 83 | // since the recurrence formula references the T_[1:N-k], i.e. the element situated 84 | // k=block_size positions behind, we need to always keep the last block_size Tp and Tm 85 | // matrices. At each iteration we create a new pair of Tp and Tm and then discard the oldest 86 | // Tp/Tn pair. 87 | // TODO(ddanciu): use a circular queue on top of vector instead 88 | std::deque> Tp; 89 | std::deque> Tm; 90 | 91 | // The number of blocks. 92 | uint32_t m = subsequence_len / block_size; 93 | 94 | for (uint32_t i = 0; i < block_size; ++i) { 95 | Tp.push_back(new2D(m + 1, sketch_dim, 0)); 96 | Tm.push_back(new2D(m + 1, sketch_dim, 0)); 97 | // the initial condition states that the sketch for the empty string is (1,0,..) 
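// (Tp.back()[0][0] = 1 below seeds each of the block_size buffered Tp matrices with that
// empty-string state. In the main loop, Tp[0]/Tm[0], i.e. the state from block_size steps
// ago, supply the T[bc-1] terms of the recurrence, while Tp.back()/Tm.back(), the previous
// time step, carry the (1 - z) * T[bc] term.)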
98 | Tp.back()[0][0] = 1; 99 | } 100 | 101 | // the are the "new" Tp and Tm, computed at every iteration and appended to Tp and Tm 102 | auto nTp = new2D(m + 1, sketch_dim, 0); 103 | auto nTm = new2D(m + 1, sketch_dim, 0); 104 | for (uint32_t i = block_size - 1; i < seq.size(); i++) { 105 | uint32_t block_count = std::min(m, (i + 1) / block_size); 106 | // must traverse in reverse order, to avoid overwriting the values of Tp and Tm before 107 | // they are used in the recurrence 108 | // p must be a multiple of block_size 109 | for (uint32_t bc = block_count; bc > 0; bc--) { 110 | uint32_t p = bc * block_size; 111 | double z = bc / (i + 1.0 - p + bc); // probability that the last index is i 112 | seq_type r = 0; 113 | bool s = true; 114 | for (uint32_t j = 0; j < block_size; ++j) { 115 | r += hashes[p - j - 1][seq[i - j]]; 116 | s = s == signs[p - j - 1][seq[i - j]]; 117 | } 118 | r %= sketch_dim; 119 | if (s) { 120 | nTp[bc] = this->shift_sum(Tp.back()[bc], Tp[0][bc - 1], r, z); 121 | nTm[bc] = this->shift_sum(Tm.back()[bc], Tm[0][bc - 1], r, z); 122 | } else { 123 | nTp[bc] = this->shift_sum(Tp.back()[bc], Tm[0][bc - 1], r, z); 124 | nTm[bc] = this->shift_sum(Tm.back()[bc], Tp[0][bc - 1], r, z); 125 | } 126 | } 127 | nTp[0][0] = 1; 128 | Tp.push_back(std::move(nTp)); 129 | Tm.push_back(std::move(nTm)); 130 | nTp = std::move(Tp.front()); 131 | nTm = std::move(Tm.front()); 132 | for (uint32_t j = 0; j < m + 1; ++j) { 133 | std::fill(nTp[j].begin(), nTp[j].end(), 0); 134 | std::fill(nTm[j].begin(), nTm[j].end(), 0); 135 | } 136 | Tp.pop_front(); 137 | Tm.pop_front(); 138 | } 139 | std::vector sketch(sketch_dim, 0); 140 | for (uint32_t l = 0; l < sketch_dim; l++) { 141 | sketch[l] = Tp.back()[m][l] - Tm.back()[m][l]; 142 | } 143 | 144 | return sketch; 145 | } 146 | 147 | /** Sets the hash and sign functions to predetermined values for testing */ 148 | void set_hashes_for_testing(const Vec2D &h, const Vec2D &s) { 149 | hashes = h; 150 | signs = s; 151 | } 152 | 153 | static double dist(const std::vector &a, const std::vector &b) { 154 | Timer timer("tensor_sketch_dist"); 155 | return l2_dist(a, b); 156 | } 157 | 158 | protected: 159 | /** Computes (1-z)*a + z*b_shift */ 160 | inline std::vector shift_sum(const std::vector &a, 161 | const std::vector &b, 162 | seq_type shift, 163 | double z) { 164 | assert(a.size() == b.size()); 165 | size_t len = a.size(); 166 | std::vector result(a.size()); 167 | for (uint32_t i = 0; i < a.size(); i++) { 168 | result[i] = (1 - z) * a[i] + z * b[(len + i - shift) % len]; 169 | assert(result[i] <= 1 + 1e-5 && result[i] >= -1e-5); 170 | } 171 | return result; 172 | } 173 | 174 | /** The size of the block each subsequence is made of. Must be a divisor of #subsequence_len. 175 | * A block size of 1 means that the class is operating on a character basis. */ 176 | uint8_t block_size; 177 | 178 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 
4 for DNA */ 179 | seq_type alphabet_size; 180 | /** Number of elements in the sketch, denoted by D in the paper */ 181 | uint32_t sketch_dim; 182 | /** The length of the subsequences considered for sketching, denoted by t in the paper */ 183 | uint8_t subsequence_len; 184 | 185 | /** 186 | * Denotes the hash functions h1,....ht:A->{1....D}, where t is #subsequence_len and D is 187 | * #sketch_dim 188 | */ 189 | Vec2D hashes; 190 | 191 | /** The sign functions s1...st:A->{-1,1} */ 192 | Vec2D signs; 193 | 194 | std::mt19937 rng; 195 | }; 196 | 197 | } // namespace ts 198 | -------------------------------------------------------------------------------- /sketch/tensor_embedding.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "immintrin.h" // for AVX 4 | #include "nmmintrin.h" // for SSE4.2 5 | #include "sequence/alphabets.hpp" 6 | #include "sketch//sketch_base.hpp" 7 | #include "util/multivec.hpp" 8 | #include "util/timer.hpp" 9 | #include "util/utils.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace ts { // ts = Tensor Sketch 17 | 18 | /** 19 | * Computes the tensor of subsequence counts for a given sequence as described in 20 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1. 21 | * In contrast with TensorSketch, this class does not do any sketching and preserves the full 22 | * tensor. 23 | * @tparam seq_type the type of elements in the sequences to be sketched. 24 | */ 25 | template 26 | class TensorEmbedding : public SketchBase, false> { 27 | public: 28 | /** 29 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 30 | * defined (e.g. 4 for DNA) 31 | * @param normalize when true the counts will be normalized to relative frequencies with sum 1. 32 | */ 33 | TensorEmbedding(seq_type alphabet_size, 34 | uint32_t t, 35 | const std::string &name = "Tensor", 36 | bool normalize = true) 37 | : SketchBase, false>(name), 38 | alphabet_size(alphabet_size), 39 | t(t), 40 | normalize(normalize) {} 41 | 42 | void init() {} 43 | 44 | /** 45 | * Computes the sketch of the given sequence. 46 | * @param seq the sequence to be sketched 47 | * @return an array of size alphabet_size^t containing the sequence's sketch 48 | */ 49 | std::vector compute(const std::vector &seq) { 50 | // ts[i] contains the counts for subsequences of length i. 51 | Vec2D ts(t + 1); 52 | { 53 | uint32_t num_tmers = 1; 54 | for (auto &t : ts) { 55 | t.resize(num_tmers); 56 | num_tmers *= alphabet_size; 57 | } 58 | } 59 | 60 | // The base case is the one empty sequence. 61 | ts[0][0] = 1; 62 | 63 | // The number of successfully read nucleotides. 64 | int32_t length = 0; 65 | for (auto s : seq) { 66 | // TODO(ragnar): Figure out a nice way to deal with uncertain reads. 
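// Characters outside [0, alphabet_size) are skipped. For valid characters, the update
// below extends every already-counted subsequence of length i by this character:
// the i-mer encoded by index j becomes the (i+1)-mer encoded by alphabet_size * j + s.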
67 | if (s < 0 || s >= alphabet_size) 68 | continue; 69 | length += 1; 70 | for (int32_t i = static_cast(t) - 1; i >= 0; --i) { 71 | for (size_t j = 0; j < ts[i].size(); ++j) { 72 | ts[i + 1][alphabet_size * j + s] += ts[i][j]; 73 | } 74 | } 75 | } 76 | 77 | if (normalize) { 78 | double nchooset = 1; 79 | for (uint32_t i = 0; i < t; ++i) { 80 | nchooset = nchooset * (length - i) / (i + 1); 81 | } 82 | 83 | for (auto &c : ts.back()) { 84 | c /= nchooset; 85 | } 86 | } 87 | 88 | return std::move(ts.back()); 89 | } 90 | 91 | static double dist(const std::vector &a, const std::vector &b) { 92 | Timer timer("full_tensor_dist"); 93 | return l2_dist(a, b); 94 | } 95 | 96 | protected: 97 | /** Size of the alphabet over which sequences to be sketched are defined, e.g. 4 for DNA. */ 98 | const seq_type alphabet_size; 99 | 100 | /** The length of the subsequences considered for sketching. */ 101 | const uint32_t t; 102 | 103 | /** Whether to normalize the counts to relative frequencies with sum 1. */ 104 | const bool normalize; 105 | }; 106 | 107 | } // namespace ts 108 | -------------------------------------------------------------------------------- /sketch/tensor_slide.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tensor.hpp" 4 | 5 | #include "util/utils.hpp" 6 | 7 | #include 8 | #include 9 | 10 | namespace ts { 11 | /** 12 | * Computes sliding tensor sketches for a given sequence as described in 13 | * https://www.biorxiv.org/content/10.1101/2020.11.13.381814v1 14 | * @tparam seq_type the type of elements in the sequences to be sketched. 15 | */ 16 | template 17 | class TensorSlide : public Tensor { 18 | public: 19 | using sketch_type = Vec2D; 20 | 21 | /** 22 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 23 | * defined (e.g. 4 for DNA) 24 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 25 | * @param tup_len the length of the subsequences considered for sketching, denoted by t 26 | * in the paper 27 | * @param win_len sliding sketches are computed for substrings of size win_len 28 | * @param stride sliding sketches are computed every stride characters 29 | * @param seed the seed to initialize the random number generator used for the random hash 30 | * functions. 31 | * @param name the name of the algorithm in the output 32 | */ 33 | TensorSlide(seq_type alphabet_size, 34 | size_t sketch_dim, 35 | size_t tup_len, 36 | size_t win_len, 37 | size_t stride, 38 | uint32_t seed, 39 | const std::string &name = "TSS") 40 | : Tensor(alphabet_size, sketch_dim, tup_len, seed, name), 41 | win_len(win_len), 42 | stride(stride) { 43 | assert(stride <= win_len && "Stride cannot be larger than the window length"); 44 | assert(tup_len <= win_len && "Tuple length (t) cannot be larger than the window length"); 45 | } 46 | 47 | /** 48 | * Computes sliding sketches for the given sequence. 49 | * A sketch is computed every #stride characters on substrings of length #window. 
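* For example, with win_len = 32 and stride = 8 (the defaults used in sketch_main.cpp), a
* sketch is emitted after every 8th character, each covering at most the last 32 characters.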
50 | * @return seq.size()/stride sketches of size #sketch_dim 51 | */ 52 | Vec2D compute(const std::vector &seq) { 53 | Timer timer("tensor_slide_sketch"); 54 | Vec2D sketches; 55 | if (seq.size() < this->subsequence_len) { 56 | return new2D(seq.size() / this->stride, this->sketch_dim, double(0)); 57 | } 58 | auto &hashes = this->hashes; 59 | auto &signs = this->signs; 60 | auto tup_len = this->subsequence_len; 61 | // first index: p; second index: q; third index: r 62 | // p,q go from 1 to tup_len; p==0 and p==tup_len+1 are sentinels for termination condition 63 | auto T1 = new3D(tup_len + 2, tup_len + 1, this->sketch_dim, 0); 64 | auto T2 = new3D(tup_len + 2, tup_len + 1, this->sketch_dim, 0); 65 | 66 | for (uint32_t p = 0; p <= tup_len; p++) { 67 | T1[p + 1][p][0] = 1; 68 | } 69 | 70 | // T[p][q] at step i represents the sketch for seq[i-w+1]...seq[i] when only using hash 71 | // functions 1<=p,p+1,...q<=t, where t is the sketch size 72 | for (uint32_t i = 0; i < seq.size(); i++) { 73 | for (uint32_t p = 1; p <= tup_len; p++) { 74 | // q-p must be smaller than i, hence the min in the condition 75 | for (uint32_t q = std::min(p + i, (uint32_t)tup_len); q >= p; q--) { 76 | double z = (double)(q - p + 1) / std::min(i + 1, win_len + 1); 77 | auto r = hashes[q - 1][seq[i]]; 78 | bool s = signs[q - 1][seq[i]]; 79 | if (s) { 80 | this->shift_sum_inplace(T1[p][q], T1[p][q - 1], r, z); 81 | this->shift_sum_inplace(T2[p][q], T2[p][q - 1], r, z); 82 | } else { 83 | this->shift_sum_inplace(T1[p][q], T2[p][q - 1], r, z); 84 | this->shift_sum_inplace(T2[p][q], T1[p][q - 1], r, z); 85 | } 86 | } 87 | } 88 | 89 | if (i >= win_len) { // only start deleting from front after reaching #win_len 90 | uint32_t ws = i - win_len; // the element to be removed from the sketch 91 | for (uint32_t diff = 0; diff < tup_len; ++diff) { 92 | for (uint32_t p = 1; p <= tup_len - diff; p++) { 93 | auto r = hashes[p - 1][seq[ws]]; 94 | bool s = signs[p - 1][seq[ws]]; 95 | uint32_t q = p + diff; 96 | // this computes t/(w-t); in our case t (the tuple length) is diff+1 97 | double z = (double)(diff + 1) / (win_len - diff); 98 | if (s) { 99 | this->shift_sum_inplace(T1[p][q], T1[p + 1][q], r, -z); 100 | this->shift_sum_inplace(T2[p][q], T2[p + 1][q], r, -z); 101 | } else { 102 | this->shift_sum_inplace(T1[p][q], T2[p + 1][q], r, -z); 103 | this->shift_sum_inplace(T2[p][q], T1[p + 1][q], r, -z); 104 | } 105 | } 106 | } 107 | } 108 | 109 | if ((i + 1) % stride == 0) { // save a sketch every stride times 110 | sketches.push_back(diff(T1[1].back(), T2[1].back())); 111 | } 112 | } 113 | return sketches; 114 | } 115 | 116 | double dist(const Vec2D &a, const Vec2D &b) { 117 | Timer timer("tensor_slide_sketch_dist"); 118 | return l2_dist2D_minlen(a, b); 119 | } 120 | 121 | 122 | private: 123 | std::vector diff(const std::vector &a, const std::vector &b) { 124 | assert(a.size() == b.size()); 125 | std::vector result(a.size()); 126 | for (uint32_t i = 0; i < result.size(); ++i) { 127 | result[i] = a[i] - b[i]; 128 | } 129 | return result; 130 | } 131 | 132 | uint32_t win_len; 133 | uint32_t stride; 134 | }; 135 | 136 | } // namespace ts 137 | -------------------------------------------------------------------------------- /sketch/tensor_slide_flat.hpp: -------------------------------------------------------------------------------- 1 | #include "sketch/tensor_slide.hpp" 2 | 3 | namespace ts { 4 | 5 | /** 6 | * A wrapper class around TensorSlide that flattens the output of TensorSlide::compute() from a 2D 7 | * vector to a 1D vector. 
The Flattener type must have a .flatten method that does the conversion. 8 | * Typically used with a class form sketch/dim_reduce.h. 9 | * 10 | * @tparam seq_type the type of elements in the sequences to be sketched. 11 | * @tparam Flattener: one of the classes from util/dim_reduce.h. 12 | */ 13 | template 14 | class TensorSlideFlat : public TensorSlide { 15 | Flattener flattener; 16 | 17 | public: 18 | using sketch_type = typename Flattener::sketch_type; 19 | 20 | /** 21 | * @param alphabet_size the number of elements in the alphabet S over which sequences are 22 | * defined (e.g. 4 for DNA) 23 | * @param sketch_dim the dimension of the embedded (sketched) space, denoted by D in the paper 24 | * @param tup_len the length of the subsequences considered for sketching, denoted by t 25 | * in the paper 26 | * @param win_len sliding sketches are computed for substrings of size win_len 27 | * @param stride sliding sketches are computed every stride characters 28 | * @param flattener the object to use to flatten the compute output. 29 | * @param seed the seed to initialize the random number generator used for the random hash 30 | * functions. 31 | */ 32 | TensorSlideFlat(seq_type alphabet_size, 33 | size_t sketch_dim, 34 | size_t tup_len, 35 | size_t win_len, 36 | size_t stride, 37 | Flattener flattener, 38 | uint32_t seed, 39 | const std::string &name = "TSS") 40 | : TensorSlide(alphabet_size, sketch_dim, tup_len, win_len, stride, seed, name), 41 | flattener(flattener) {} 42 | 43 | 44 | /** 45 | * Computes sliding sketches for the given sequence. 46 | * A sketch is computed every #stride characters on substrings of length #window. 47 | * @return seq.size()/stride sketches of size #sketch_dim 48 | */ 49 | sketch_type compute(const std::vector &seq) { 50 | return flattener.flatten(TensorSlide::compute(seq)); 51 | } 52 | 53 | static double dist(const sketch_type &a, const sketch_type &b) { return Flattener::dist(a, b); } 54 | }; 55 | 56 | } // namespace ts 57 | -------------------------------------------------------------------------------- /sketch_main.cpp: -------------------------------------------------------------------------------- 1 | #include "sequence/alphabets.hpp" 2 | #include "sequence/fasta_io.hpp" 3 | #include "sketch/edit_distance.hpp" 4 | #include "sketch/hash_base.hpp" 5 | #include "sketch/hash_min.hpp" 6 | #include "sketch/hash_ordered.hpp" 7 | #include "sketch/hash_weighted.hpp" 8 | #include "sketch/tensor.hpp" 9 | #include "sketch/tensor_block.hpp" 10 | #include "sketch/tensor_embedding.hpp" 11 | #include "sketch/tensor_slide.hpp" 12 | #include "util/multivec.hpp" 13 | #include "util/progress.hpp" 14 | #include "util/utils.hpp" 15 | 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace ts; 26 | 27 | DEFINE_string(alphabet, 28 | "dna4", 29 | "The alphabet over which sequences are defined (dna4, dna5, protein)"); 30 | 31 | DEFINE_string(sketch_method, "TSS", "The sketching method to use: MH, WMH, OMH, TS, TSB or TSS"); 32 | DEFINE_string(m, "TSS", "Short hand for --sketch_method"); 33 | 34 | DEFINE_uint32(kmer_length, 1, "The kmer length for: MH, WMH, OMH"); 35 | DEFINE_uint32(k, 3, "Short hand for --kmer_length"); 36 | 37 | DEFINE_string(o, "", "Output file, containing the sketches for each sequence"); 38 | 39 | DEFINE_string(i, 40 | "", 41 | "Input file or directory, containing the sequences to be sketched in .fa format"); 42 | 43 | DEFINE_int32(embed_dim, 4, "Embedding dimension, used for all sketching 
methods"); 44 | 45 | DEFINE_int32(tuple_length, 46 | 3, 47 | "Ordered tuple length, used in ordered MinHash and Tensor-based sketches"); 48 | DEFINE_int32(t, 3, "Short hand for --tuple_length"); 49 | 50 | static bool ValidateBlockSize(const char *flagname, int32_t value) { 51 | if (FLAGS_tuple_length % value == 0 || FLAGS_t % value == 0) { 52 | return true; 53 | } 54 | printf("Invalid value for --%s: %d. Must be a divisor of --tuple_len\n", flagname, value); 55 | return false; 56 | } 57 | DEFINE_int32(block_size, 58 | 1, 59 | "Only consider tuples made out of block-size continuous characters for Tensor sketch"); 60 | DEFINE_validator(block_size, &ValidateBlockSize); 61 | 62 | DEFINE_int32(window_size, 32, "Window length: the size of sliding window in Tensor Slide Sketch"); 63 | DEFINE_int32(w, 32, "Short hand for --window_size"); 64 | 65 | DEFINE_int32(max_len, 32, "The maximum accepted sequence length for Ordered and Weighted min-hash"); 66 | 67 | DEFINE_int32(stride, 8, "Stride for sliding window: shift step for sliding window"); 68 | DEFINE_int32(s, 8, "Short hand for --stride"); 69 | 70 | static bool ValidateInput(const char * /*unused*/, const std::string &value) { 71 | if (!value.empty()) { 72 | return true; 73 | } 74 | std::cerr << "Please specify a fasta input file using '-i '" << std::endl; 75 | return false; 76 | } 77 | DEFINE_validator(i, &ValidateInput); 78 | 79 | static bool ValidateOutput(const char * /*unused*/, const std::string &value) { 80 | if (value.empty()) { 81 | FLAGS_o = FLAGS_i + "." + FLAGS_sketch_method; 82 | } 83 | return true; 84 | } 85 | DEFINE_validator(o, &ValidateOutput); 86 | 87 | void adjust_short_names() { 88 | if (!gflags::GetCommandLineFlagInfoOrDie("m").is_default) { 89 | FLAGS_sketch_method = FLAGS_m; 90 | } 91 | if (!gflags::GetCommandLineFlagInfoOrDie("k").is_default) { 92 | FLAGS_kmer_length = FLAGS_k; 93 | } 94 | if (!gflags::GetCommandLineFlagInfoOrDie("w").is_default) { 95 | FLAGS_window_size = FLAGS_w; 96 | } 97 | if (!gflags::GetCommandLineFlagInfoOrDie("s").is_default) { 98 | FLAGS_stride = FLAGS_s; 99 | } 100 | if (!gflags::GetCommandLineFlagInfoOrDie("t").is_default) { 101 | FLAGS_tuple_length = FLAGS_t; 102 | } 103 | } 104 | 105 | // Some global constant types. 106 | using seq_type = uint8_t; 107 | 108 | // Run the given sketch method on input specified by the command line arguments, and write a 109 | // triangular distance matrix to the output file. 110 | template 111 | void run_triangle(SketchAlgorithm &algorithm) { 112 | std::cerr << "Reading input .." << std::endl; 113 | std::vector> files = read_directory(FLAGS_i); 114 | std::cerr << "Read " << files.size() << " files" << std::endl; 115 | 116 | const size_t n = files.size(); 117 | 118 | std::vector sketches(n); 119 | 120 | std::cerr << "Sketching .." << std::endl; 121 | progress_bar::init(n); 122 | #pragma omp parallel for default(shared) 123 | for (size_t i = 0; i < n; ++i) { 124 | assert(files[i].sequences.size() == 1 125 | && "Each input file must contain exactly one sequence!"); 126 | if constexpr (SketchAlgorithm::kmer_input) { 127 | sketches[i] 128 | = algorithm.compute(files[i].sequences[0], FLAGS_kmer_length, alphabet_size); 129 | } else { 130 | sketches[i] = algorithm.compute(files[i].sequences[0]); 131 | } 132 | progress_bar::iter(); 133 | } 134 | 135 | std::cerr << "Computing all pairwise distances .." 
<< std::endl; 136 | 137 | std::vector> pairs; 138 | for (size_t i = 0; i < n; ++i) 139 | for (size_t j = 0; j < i; ++j) 140 | pairs.emplace_back(i, j); 141 | 142 | std::vector> distances(n); 143 | for (size_t i = 0; i < n; ++i) 144 | distances[i].resize(i); 145 | 146 | progress_bar::init(n * (n - 1) / 2); 147 | #pragma omp parallel for default(shared) 148 | for (auto it = pairs.begin(); it < pairs.end(); ++it) { // NOLINT 149 | auto [i, j] = *it; 150 | distances[i][j] = algorithm.dist(sketches[i], sketches[j]); 151 | progress_bar::iter(); 152 | } 153 | 154 | std::cerr << "Writing distances triangle to " << FLAGS_o << " .." << std::endl; 155 | std::filesystem::path ofile = std::filesystem::absolute(std::filesystem::path(FLAGS_o)); 156 | 157 | write_output_meta(); 158 | std::ofstream fo(ofile); 159 | if (!fo.is_open()) { 160 | std::cerr << "Could not open " << FLAGS_o << " for writing." << std::endl; 161 | std::exit(1); 162 | } 163 | 164 | // MASH adds an extra tab before the number of lines, so mirror that. 165 | fo << "\t" << n << '\n'; 166 | for (size_t i = 0; i < n; ++i) { 167 | fo << files[i].filename; 168 | for (size_t j = 0; j < i; ++j) 169 | fo << '\t' << distances[i][j]; 170 | fo << '\n'; 171 | } 172 | fo.close(); 173 | }; 174 | 175 | // Runs function f on the sketch method specified by the command line options. 176 | template 177 | void run_function_on_algorithm(F f) { 178 | using kmer_type = uint64_t; 179 | 180 | auto kmer_word_size = int_pow(alphabet_size, FLAGS_kmer_length); 181 | 182 | std::random_device rd; 183 | if (FLAGS_sketch_method == "MH") { 184 | f(MinHash(kmer_word_size, FLAGS_embed_dim, HashAlgorithm::murmur, rd())); 185 | return; 186 | } 187 | if (FLAGS_sketch_method == "WMH") { 188 | f(WeightedMinHash(kmer_word_size, FLAGS_embed_dim, FLAGS_max_len, 189 | HashAlgorithm::murmur, rd())); 190 | return; 191 | } 192 | if (FLAGS_sketch_method == "OMH") { 193 | f(OrderedMinHash(kmer_word_size, FLAGS_embed_dim, FLAGS_max_len, 194 | FLAGS_tuple_length, HashAlgorithm::murmur, rd())); 195 | return; 196 | } 197 | if (FLAGS_sketch_method == "ED") { 198 | f(EditDistance()); 199 | return; 200 | } 201 | if (FLAGS_sketch_method == "TE") { 202 | f(TensorEmbedding(alphabet_size, FLAGS_tuple_length, "TensorEmbedding")); 203 | return; 204 | } 205 | if (FLAGS_sketch_method == "TS") { 206 | f(Tensor(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, rd())); 207 | return; 208 | } 209 | if (FLAGS_sketch_method == "TSB") { 210 | f(TensorBlock(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, 211 | FLAGS_block_size, rd())); 212 | return; 213 | } 214 | if (FLAGS_sketch_method == "TSS") { 215 | f(TensorSlide(kmer_word_size, FLAGS_embed_dim, FLAGS_tuple_length, 216 | FLAGS_window_size, FLAGS_stride, rd())); 217 | return; 218 | } 219 | std::cerr << "Unknown sketch method: " << FLAGS_sketch_method << "\n"; 220 | } 221 | 222 | 223 | int main(int argc, char *argv[]) { 224 | gflags::ParseCommandLineFlags(&argc, &argv, true); 225 | adjust_short_names(); 226 | 227 | init_alphabet(FLAGS_alphabet); 228 | 229 | if (std::pow(alphabet_size, FLAGS_kmer_length) > (double)std::numeric_limits::max()) { 230 | std::cerr << "Kmer size is too large to fit in 64 bits " << std::endl; 231 | std::exit(1); 232 | } 233 | 234 | run_function_on_algorithm([](auto x) { run_triangle(x); }); 235 | } 236 | -------------------------------------------------------------------------------- /tests/phylogeny/data.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ratschlab/Project2020-seq-tensor-sketching/20b19ddd19751840d33af97abe314d29b34dc0d4/tests/phylogeny/data.txt -------------------------------------------------------------------------------- /tests/phylogeny/test_upgma.cpp: -------------------------------------------------------------------------------- 1 | #include "phylogeny/upgma.hpp" 2 | 3 | #include 4 | 5 | namespace { 6 | 7 | using namespace ts; 8 | 9 | TEST(upgma, empty) { 10 | ASSERT_TRUE(upgma({}).empty()); 11 | } 12 | 13 | TEST(upgma, one) { 14 | Tree graph = upgma({ { 11.5 } }); 15 | ASSERT_TRUE(graph.size() == 1); 16 | ASSERT_TRUE(graph[0].age == 0); 17 | ASSERT_TRUE(graph[0].left == NO_CHILD); 18 | ASSERT_TRUE(graph[0].right == NO_CHILD); 19 | } 20 | 21 | TEST(upgma, some_values) { 22 | std::vector> dist_mat = { { 0, 17, 21, 31, 23 }, 23 | { 17, 0, 30, 34, 21 }, 24 | { 21, 30, 0, 28, 39 }, 25 | { 31, 34, 28, 0, 43 }, 26 | { 23, 21, 39, 43, 0 } }; 27 | Tree graph = upgma(dist_mat); 28 | ASSERT_EQ(9, graph.size()); 29 | ASSERT_EQ(graph[8].age, 16.5); 30 | ASSERT_EQ(graph[7].age, 14); 31 | ASSERT_EQ(graph[6].age, 11); 32 | ASSERT_EQ(graph[5].age, 8.5); 33 | for (uint32_t i = 0; i < 5; ++i) { 34 | ASSERT_EQ(graph[i].age, 0); 35 | } 36 | } 37 | 38 | } // namespace 39 | -------------------------------------------------------------------------------- /tests/sketch/test_hash_base.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | constexpr uint8_t SKETCH_DIM = 3; 15 | constexpr uint8_t SET_SIZE = 4 * 4; 16 | constexpr uint32_t MAX_LEN = 3; 17 | constexpr uint8_t HASH_SIZE = MAX_LEN * SET_SIZE; 18 | 19 | class Hash : public HashBase, public testing::TestWithParam { 20 | public: 21 | Hash() 22 | : HashBase(SET_SIZE, 23 | SKETCH_DIM, 24 | HASH_SIZE, 25 | HashAlgorithm::uniform, 26 | /*seed=*/31415) {} 27 | }; 28 | 29 | // test that the uniform hash function is bijective, i.e. it is in effect a permutation: 30 | TEST_F(Hash, HashesDistinct) { 31 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 32 | std::unordered_set seen(SKETCH_DIM); 33 | for (uint32_t i = 0; i < hash_size; ++i) { 34 | uint8_t v = this->hash(s, i); 35 | ASSERT_FALSE(seen.find(v) != seen.end()); 36 | seen.insert(v); 37 | } 38 | ASSERT_EQ(hash_size, seen.size()); 39 | } 40 | } 41 | 42 | class Hash2 : public HashBase, public testing::TestWithParam { 43 | public: 44 | Hash2() 45 | : HashBase(SET_SIZE, 46 | SKETCH_DIM, 47 | HASH_SIZE, 48 | GetParam(), 49 | /*seed=*/31415) {} 50 | }; 51 | 52 | // test that the hash values are consistent - i.e. 
asking for the same value returns the same result 53 | TEST_P(Hash2, HashesConsistent) { 54 | std::vector> hashes(SKETCH_DIM); 55 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 56 | for (uint32_t i = 0; i < hash_size; ++i) { 57 | uint8_t v = this->hash(s, i); 58 | ASSERT_FALSE(hashes[s].find(i) != hashes[s].end()); 59 | hashes[s][i] = v; 60 | } 61 | ASSERT_EQ(hash_size, hashes[s].size()); 62 | } 63 | 64 | for (uint32_t s = 0; s < SKETCH_DIM; ++s) { 65 | for (uint32_t i = 0; i < hash_size; ++i) { 66 | uint8_t v = this->hash(s, i); 67 | ASSERT_EQ(v, hashes[s][i]); 68 | } 69 | } 70 | } 71 | 72 | INSTANTIATE_TEST_SUITE_P(Method, 73 | Hash2, 74 | ::testing::Values(HashAlgorithm::uniform, 75 | HashAlgorithm::crc32, 76 | HashAlgorithm::murmur)); 77 | 78 | } // namespace 79 | -------------------------------------------------------------------------------- /tests/sketch/test_min_hash.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_base.hpp" 2 | #include "sketch/hash_min.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | TEST(MinHash, Empty) { 15 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 16 | std::vector sketch = under_test.compute(std::vector()); 17 | ASSERT_THAT(sketch, ElementsAre(0, 0, 0)); 18 | } 19 | 20 | TEST(MinHash, Repeat) { 21 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 22 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 23 | std::vector sketch1 = under_test.compute(sequence); 24 | std::vector sketch2 = under_test.compute(sequence); 25 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 26 | } 27 | 28 | TEST(MinHash, Permute) { 29 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 30 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 31 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 32 | std::vector sketch1 = under_test.compute(sequence1); 33 | std::vector sketch2 = under_test.compute(sequence2); 34 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 35 | } 36 | 37 | TEST(MinHash, PermuteAndRepeat) { 38 | MinHash under_test(4 * 4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 39 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 40 | std::vector sequence2 = { 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 }; 41 | std::vector sketch1 = under_test.compute(sequence1); 42 | std::vector sketch2 = under_test.compute(sequence2); 43 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 44 | } 45 | 46 | std::vector> hash_init(uint32_t set_sz, uint32_t sketch_size) { 47 | std::vector> hashes(sketch_size); 48 | for (size_t m = 0; m < sketch_size; m++) { 49 | for (uint32_t v = 0; v < set_sz; ++v) { 50 | hashes[m][v] = v; 51 | } 52 | } 53 | return hashes; 54 | } 55 | 56 | TEST(MinHash, PresetHash) { 57 | MinHash under_test(4 * 4, 3, HashAlgorithm::uniform, /*seed=*/ 31415); 58 | under_test.set_hashes_for_testing(hash_init(4 * 4, 3)); 59 | for (uint32_t i = 0; i < 4 * 4; ++i) { 60 | std::vector sequence(4 * 4 - i); 61 | std::iota(sequence.begin(), sequence.end(), i); 62 | std::vector sketch = under_test.compute(sequence); 63 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 64 | } 65 | } 66 | 67 | } // namespace 68 | -------------------------------------------------------------------------------- /tests/sketch/test_ordered_min_hash.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/hash_ordered.hpp" 2 | 3 | #include 4 
| #include 5 | 6 | #include 7 | 8 | namespace { 9 | 10 | using namespace ts; 11 | using namespace ::testing; 12 | 13 | constexpr uint32_t alphabet_size = 4; 14 | const uint32_t set_size = int_pow(alphabet_size, 3); // k-mers of length 3 15 | constexpr uint32_t sketch_dim = 2; 16 | constexpr uint32_t tuple_length = 3; 17 | constexpr uint32_t max_sequence_len = 200; 18 | 19 | TEST(OrderedMinHash, Empty) { 20 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 21 | HashAlgorithm::uniform, /*seed=*/31415); 22 | ASSERT_THROW(under_test.compute(std::vector()), std::invalid_argument); 23 | } 24 | 25 | TEST(OrderedMinHash, Repeat) { 26 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 27 | HashAlgorithm::uniform, /*seed=*/31415); 28 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 29 | Vec2D sketch1 = under_test.compute_2d(sequence); 30 | Vec2D sketch2 = under_test.compute_2d(sequence); 31 | ASSERT_EQ(sketch_dim, sketch1.size()); 32 | ASSERT_EQ(sketch_dim, sketch2.size()); 33 | for (uint32_t i = 0; i < sketch_dim; ++i) { 34 | ASSERT_THAT(sketch1[i], ElementsAreArray(sketch2[i])); 35 | } 36 | } 37 | 38 | TEST(OrderedMinHash, ReverseOrder) { 39 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 40 | HashAlgorithm::uniform, /*seed=*/31415); 41 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 42 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 43 | Vec2D sketch1 = under_test.compute_2d(sequence1); 44 | Vec2D sketch2 = under_test.compute_2d(sequence2); 45 | ASSERT_EQ(sketch_dim, sketch1.size()); 46 | ASSERT_EQ(sketch_dim, sketch2.size()); 47 | for (uint32_t i = 0; i < sketch_dim; ++i) { 48 | std::reverse(sketch1[i].begin(), sketch1[i].end()); // reversed order of appearance 49 | ASSERT_THAT(sketch1[i], ElementsAreArray(sketch2[i])); 50 | } 51 | } 52 | 53 | std::vector> 54 | hash_init(uint32_t set_sz, uint32_t sketch_size, uint32_t max_seq_len) { 55 | std::vector> hashes(sketch_size); 56 | for (size_t m = 0; m < sketch_size; m++) { 57 | for (uint32_t v = 0; v < set_sz * max_seq_len; ++v) { 58 | hashes[m][v] = v; 59 | } 60 | } 61 | return hashes; 62 | } 63 | 64 | TEST(OrderedMinHash, PresetHash) { 65 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 66 | HashAlgorithm::uniform, /*seed=*/31415); 67 | under_test.set_hashes_for_testing(hash_init(set_size, sketch_dim, max_sequence_len)); 68 | for (uint32_t i = 0; i < set_size - tuple_length; ++i) { 69 | std::vector sequence(set_size - i); 70 | std::iota(sequence.begin(), sequence.end(), i); 71 | Vec2D sketch = under_test.compute_2d(sequence); 72 | for (uint32_t s = 0; s < sketch_dim; ++s) { 73 | ASSERT_THAT(sketch[s], ElementsAreArray({ i, i + 1, i + 2 })); 74 | } 75 | } 76 | } 77 | 78 | TEST(OrderedMinHash, PresetHashRepeat) { 79 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 80 | HashAlgorithm::uniform, /*seed=*/ 31415); 81 | under_test.set_hashes_for_testing(hash_init(set_size, sketch_dim, max_sequence_len)); 82 | for (uint32_t i = 0; i < set_size - tuple_length; ++i) { 83 | std::vector sequence(2 * (set_size - i)); 84 | std::iota(sequence.begin(), sequence.begin() + sequence.size() / 2, i); 85 | std::iota(sequence.begin() + sequence.size() / 2, sequence.end(), i); 86 | Vec2D sketch = under_test.compute_2d(sequence); 87 | for (uint32_t s = 0; s < sketch_dim; ++s) { 88 | ASSERT_THAT(sketch[s], ElementsAreArray({ i, i + 1, i + 2 })); 89 | } 90 | } 91 | } 92 | 93 | #ifndef NDEBUG 94 | TEST(OrderedMinhash, 
SequenceTooLong) { 95 | OrderedMinHash under_test(set_size, sketch_dim, max_sequence_len, tuple_length, 96 | HashAlgorithm::uniform, /*seed=*/ 31415); 97 | std::vector sequence(max_sequence_len + 1); 98 | ASSERT_THROW(under_test.compute(sequence), std::invalid_argument); 99 | } 100 | #endif 101 | 102 | } // namespace 103 | -------------------------------------------------------------------------------- /tests/sketch/test_tensor.cpp: -------------------------------------------------------------------------------- 1 | #include "sketch/tensor.hpp" 2 | #include "util/utils.hpp" 3 | 4 | #include 5 | #include 6 | 7 | namespace { 8 | 9 | using namespace ts; 10 | using namespace ::testing; 11 | 12 | constexpr uint8_t alphabet_size = 4; 13 | const uint32_t set_size = int_pow(alphabet_size, 3); // k-mers of length 3 14 | constexpr uint32_t sketch_dim = 2; 15 | constexpr uint32_t tuple_length = 3; 16 | 17 | template 18 | void rand_init(uint32_t sketch_size, Vec2D *hashes, Vec2D *signs) { 19 | std::mt19937 gen(3412343); 20 | std::uniform_int_distribution rand_hash2(0, sketch_size - 1); 21 | std::uniform_int_distribution rand_bool(0, 1); 22 | 23 | for (size_t h = 0; h < hashes->size(); h++) { 24 | for (size_t c = 0; c < alphabet_size; c++) { 25 | (*hashes)[h][c] = rand_hash2(gen); 26 | (*signs)[h][c] = rand_bool(gen); 27 | } 28 | } 29 | } 30 | 31 | TEST(Tensor, Empty) { 32 | Tensor under_test(alphabet_size, sketch_dim, tuple_length, /*seed=*/31415); 33 | std::vector sketch = under_test.compute(std::vector()); 34 | ASSERT_EQ(sketch.size(), sketch_dim); 35 | ASSERT_THAT(sketch, ElementsAre(0, 0)); 36 | } 37 | 38 | /** The sequence has one char, which is shorter than the tuple length, so the sketch will be 0 */ 39 | TEST(Tensor, OneChar) { 40 | Tensor under_test(alphabet_size, sketch_dim, tuple_length, /*seed=*/31415); 41 | for (uint8_t c = 0; c < alphabet_size; ++c) { 42 | std::vector sketch = under_test.compute({ c }); 43 | ASSERT_THAT(sketch, ElementsAre(0, 0)); 44 | } 45 | } 46 | 47 | /** The sequence has one char, the tuple length is 1, so we should have a value of +/-1 on position 48 | * h(seq[0]) */ 49 | TEST(Tensor, OneCharTuple1) { 50 | constexpr uint32_t tuple_len = 1; 51 | Tensor under_test(alphabet_size, sketch_dim, tuple_len, /*seed=*/31415); 52 | 53 | Vec2D hashes = new2D(tuple_len, alphabet_size); 54 | Vec2D signs = new2D(tuple_len, alphabet_size); 55 | rand_init(sketch_dim, &hashes, &signs); 56 | under_test.set_hashes_for_testing(hashes, signs); 57 | 58 | for (uint8_t c = 0; c < alphabet_size; ++c) { 59 | std::vector sketch = under_test.compute({ c }); 60 | for (uint32_t i = 0; i < sketch_dim; ++i) { 61 | int8_t sign = signs[0][c] ? 
1 : -1; 62 | ASSERT_EQ(sketch[i] * sign, hashes[0][c] % sketch_dim == i) << "Char: " << (int)c; 63 | } 64 | } 65 | } 66 | 67 | /** 68 | * The size of the sequence equals the size of the tuple, so the sketch will be 1 in one position 69 | * (position H(x)), and 0 in all the other positions 70 | */ 71 | TEST(Tensor, FullStringDistinctChars) { 72 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 73 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 74 | Tensor under_test(tuple_len, sketch_dimension, tuple_len, /*seed=*/31415); 75 | std::vector sequence(tuple_len); 76 | std::iota(sequence.begin(), sequence.end(), 0U); 77 | std::vector sketch = under_test.compute(sequence); 78 | ASSERT_EQ(sketch.size(), sketch_dimension); 79 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 80 | ASSERT_TRUE(std::abs(sketch[i]) == 0 || std::abs(sketch[i]) == 1); 81 | } 82 | ASSERT_EQ(1, std::abs(std::accumulate(sketch.begin(), sketch.end(), 0))) 83 | << "D=" << sketch_dimension << " t=" << tuple_len; 84 | } 85 | } 86 | } 87 | 88 | /** 89 | * The size of the sequence equals the size of the tuple, so the sketch will be 1 or -1 in one 90 | * position (position H(x)), and 0 in all the other positions. 91 | */ 92 | TEST(Tensor, FullStringRandomChars) { 93 | std::mt19937 gen(1234567); 94 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 95 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 96 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 97 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 98 | 99 | Vec2D hashes = new2D(tuple_len, alphabet_size); 100 | Vec2D signs = new2D(tuple_len, alphabet_size); 101 | rand_init(sketch_dim, &hashes, &signs); 102 | under_test.set_hashes_for_testing(hashes, signs); 103 | 104 | std::vector sequence(tuple_len); 105 | for (uint8_t &c : sequence) { 106 | c = rand_char(gen); 107 | } 108 | std::vector sketch = under_test.compute(sequence); 109 | 110 | uint32_t pos = 0; // the position where the sketch must be one 111 | int8_t s = 1; // the sign of the sketch 112 | for (uint32_t i = 0; i < sequence.size(); ++i) { 113 | pos += hashes[i][sequence[i]]; 114 | s *= signs[i][sequence[i]] ? 1 : -1; 115 | } 116 | pos %= sketch_dimension; 117 | 118 | ASSERT_EQ(sketch.size(), sketch_dimension); 119 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 120 | ASSERT_EQ(i == pos ? s : 0, sketch[i]); 121 | } 122 | } 123 | } 124 | } 125 | 126 | /** 127 | * If a sequence contains identical characters, its sketch will be +/-1 in one position and 0 in all 128 | * others, because all subsequences of length t are identical. 
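* (For instance, for "AAAA" and t = 2, all C(4,2) = 6 length-2 subsequences equal "AA", so the
* entire normalized mass falls into the single bucket determined by the hashes of "AA",
* with a single sign.)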
129 | */ 130 | TEST(Tensor, SameChars) { 131 | std::mt19937 gen(342111); 132 | std::uniform_int_distribution rand_char(0, alphabet_size - 1); 133 | std::uniform_int_distribution rand_seq_len(0, 100); 134 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 135 | for (uint32_t tuple_len = 2; tuple_len < 10; ++tuple_len) { 136 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 137 | uint8_t sequence_length = tuple_len + rand_seq_len(gen); 138 | std::vector sequence(sequence_length, rand_char(gen)); 139 | std::vector sketch = under_test.compute(sequence); 140 | ASSERT_EQ(sketch.size(), sketch_dimension); 141 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 142 | ASSERT_TRUE(std::abs(sketch[i]) == 0 || std::abs(sketch[i]) == 1); 143 | } 144 | ASSERT_EQ(1, std::abs(std::accumulate(sketch.begin(), sketch.end(), 0))) 145 | << "Dim=" << sketch_dimension << " t=" << tuple_len; 146 | } 147 | } 148 | } 149 | 150 | /** 151 | * If a sequence contains distinct characters, then the tensor sketch for t=1 will contain multiples 152 | * of (1/alphabet_size), because T(a)=1/alphabet_size for all characters a. 153 | */ 154 | TEST(Tensor, DistinctCharsTuple1) { 155 | std::mt19937 gen(321567); 156 | constexpr uint8_t tuple_len = 1; 157 | std::vector sequence(alphabet_size); 158 | std::iota(sequence.begin(), sequence.end(), 0); 159 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 160 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 161 | 162 | std::vector sketch = under_test.compute(sequence); 163 | ASSERT_EQ(sketch.size(), sketch_dimension); 164 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 165 | double factor = sketch[i] / (1. / alphabet_size); 166 | ASSERT_NEAR(factor, std::round(factor), 1e-3); 167 | } 168 | } 169 | } 170 | 171 | /** 172 | * If a sequence of length seq_len contains distinct characters, then the tensor sketch for 173 | * t=seq_len-1 will contain multiples of (1/seq_len), because T(a)=1/seq_len for all the seq_len 174 | * subsequences of length seq_len-1. 175 | */ 176 | TEST(Tensor, DistinctCharsTupleTMinus1) { 177 | std::mt19937 gen(321567); 178 | for (uint32_t tuple_len = 1; tuple_len < 10; ++tuple_len) { 179 | const uint8_t alphabet_size = tuple_len + 1; 180 | std::vector sequence(alphabet_size); 181 | std::iota(sequence.begin(), sequence.end(), 0); 182 | for (uint32_t sketch_dimension = 3; sketch_dimension < 10; ++sketch_dimension) { 183 | Tensor under_test(alphabet_size, sketch_dimension, tuple_len, /*seed=*/31415); 184 | 185 | std::vector sketch = under_test.compute(sequence); 186 | ASSERT_EQ(sketch.size(), sketch_dimension); 187 | for (uint32_t i = 0; i < sketch_dimension; ++i) { 188 | double factor = sketch[i] / (1. 
/ alphabet_size); 189 | ASSERT_NEAR(factor, std::round(factor), 1e-3); 190 | } 191 | } 192 | } 193 | } 194 | 195 | } // namespace 196 | -------------------------------------------------------------------------------- /tests/sketch/test_weighted_min_hash.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "sketch/hash_weighted.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace { 10 | 11 | using namespace ts; 12 | using namespace ::testing; 13 | 14 | TEST(WeightedMinHash, Empty) { 15 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 16 | std::vector sketch = under_test.compute(std::vector()); 17 | ASSERT_THAT(sketch, ElementsAre(0, 0, 0)); 18 | } 19 | 20 | TEST(WeightedMinHash, Repeat) { 21 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 22 | std::vector sequence = { 0, 1, 2, 3, 4, 5 }; 23 | std::vector sketch1 = under_test.compute(sequence); 24 | std::vector sketch2 = under_test.compute(sequence); 25 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 26 | } 27 | 28 | TEST(WeightedMinHash, Permute) { 29 | WeightedMinHash under_test(4 * 4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 30 | std::vector sequence1 = { 0, 1, 2, 3, 4, 5 }; 31 | std::vector sequence2 = { 5, 4, 3, 2, 1, 0 }; 32 | std::vector sketch1 = under_test.compute(sequence1); 33 | std::vector sketch2 = under_test.compute(sequence2); 34 | ASSERT_THAT(sketch1, ElementsAreArray(sketch2)); 35 | } 36 | 37 | std::vector> 38 | hash_init(uint32_t set_sz, uint32_t sketch_size, uint32_t max_seq_len) { 39 | std::vector> hashes(sketch_size); 40 | for (size_t m = 0; m < sketch_size; m++) { 41 | for (uint32_t v = 0; v < set_sz * max_seq_len; ++v) { 42 | hashes[m][v] = v; 43 | } 44 | } 45 | return hashes; 46 | } 47 | 48 | TEST(WeightedMinHash, PresetHash) { 49 | WeightedMinHash under_test(4 * 4, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 50 | under_test.set_hashes_for_testing(hash_init(4 * 4, 3, 100)); 51 | for (uint32_t i = 0; i < 4 * 4; ++i) { 52 | std::vector sequence(4 * 4 - i); 53 | std::iota(sequence.begin(), sequence.end(), i); 54 | std::vector sketch = under_test.compute(sequence); 55 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 56 | } 57 | } 58 | 59 | TEST(WeightedMinHash, PresetHashRepeat) { 60 | constexpr uint32_t set_size = 4 * 4; // corresponds to k-mers of length 2 over the DNA alphabet 61 | WeightedMinHash under_test(set_size, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 62 | under_test.set_hashes_for_testing(hash_init(set_size, 3, 100)); 63 | for (uint32_t i = 0; i < set_size; ++i) { 64 | std::vector sequence(2 * (set_size - i)); 65 | std::iota(sequence.begin(), sequence.begin() + sequence.size() / 2, i); 66 | std::iota(sequence.begin() + sequence.size() / 2, sequence.end(), i); 67 | std::vector sketch = under_test.compute(sequence); 68 | ASSERT_THAT(sketch, ElementsAreArray({ i, i, i })); 69 | } 70 | } 71 | 72 | #ifndef NDEBUG 73 | TEST(WeightedMinhash, SequenceTooLong) { 74 | constexpr uint32_t set_size = 4 * 4; // corresponds to k-mers of length 2 over the DNA alphabet 75 | WeightedMinHash under_test(set_size, 3, 100, HashAlgorithm::uniform, /*seed=*/31415); 76 | std::vector sequence(100 + 1); 77 | ASSERT_THROW(under_test.compute(sequence), std::invalid_argument); 78 | } 79 | #endif 80 | 81 | } // namespace 82 | -------------------------------------------------------------------------------- /tests/util/test_multivec.cpp: 
-------------------------------------------------------------------------------- 1 | #include "util/utils.hpp" 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace { 8 | template 9 | class Pow : public ::testing::Test {}; 10 | 11 | typedef ::testing::Types PowTypes; 12 | 13 | TYPED_TEST_SUITE(Pow, PowTypes); 14 | 15 | TYPED_TEST(Pow, Zero) { 16 | std::mt19937 rng(123457); 17 | std::uniform_int_distribution dist(0, 10000); 18 | 19 | for (uint32_t i = 0; i < 10; ++i) { 20 | EXPECT_EQ(1, ts::int_pow(dist(rng), 0)); 21 | } 22 | } 23 | 24 | TYPED_TEST(Pow, Random) { 25 | std::mt19937 rng(123457); 26 | std::uniform_int_distribution dist(0, 10); 27 | std::uniform_int_distribution pow_dist(0, 5); 28 | 29 | for (uint32_t i = 0; i < 10; ++i) { 30 | TypeParam base = pow_dist(rng); 31 | TypeParam exp = dist(rng); 32 | EXPECT_EQ(std::pow(base, exp), ts::int_pow(base, exp)); 33 | } 34 | } 35 | 36 | } // namespace 37 | -------------------------------------------------------------------------------- /tests/util/test_spearman.cpp: -------------------------------------------------------------------------------- 1 | #include "util/spearman.hpp" 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace { 8 | 9 | TEST(Spearman, Identical) { 10 | std::mt19937 rng(123457); 11 | std::uniform_int_distribution dist(0, 10000); 12 | std::uniform_int_distribution sz_dist(5, 10); 13 | for (uint32_t trial = 0; trial < 10; ++trial) { 14 | size_t size = sz_dist(rng); 15 | std::vector a; 16 | std::vector b; 17 | for (uint32_t i = 0; i < size; ++i) { 18 | size_t v = dist(rng); 19 | a.push_back(v); 20 | b.push_back(v); 21 | } 22 | ASSERT_EQ(1, spearman(a, b)); 23 | } 24 | } 25 | 26 | TEST(Spearman, Linear) { 27 | std::mt19937 rng(123457); 28 | std::uniform_real_distribution<> dist(1, 10000); 29 | std::uniform_int_distribution<> sz_dist(5, 10); 30 | double coef = dist(rng); 31 | for (uint32_t trial = 0; trial < 10; ++trial) { 32 | size_t size = sz_dist(rng); 33 | std::vector a; 34 | std::vector b; 35 | for (uint32_t i = 0; i < size; ++i) { 36 | size_t v = dist(rng); 37 | a.push_back(v); 38 | b.push_back(v * coef); 39 | } 40 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 41 | } 42 | } 43 | 44 | TEST(Spearman, LinearInverse) { 45 | size_t size = 10; 46 | std::vector a(size); 47 | std::vector b(size); 48 | for (uint32_t i = 0; i < size; ++i) { 49 | a[i] = 2 * i + 5; 50 | b[size - i - 1] = 2 * i + 5; 51 | } 52 | ASSERT_EQ(-1, spearman(a, b)); 53 | } 54 | 55 | TEST(Spearman, Quadratic) { 56 | std::mt19937 rng(123457); 57 | std::uniform_real_distribution<> dist(1, 10000); 58 | std::uniform_int_distribution<> sz_dist(5, 10); 59 | double coef = dist(rng); 60 | for (uint32_t trial = 0; trial < 10; ++trial) { 61 | size_t size = sz_dist(rng); 62 | std::vector a; 63 | std::vector b; 64 | for (uint32_t i = 0; i < size; ++i) { 65 | size_t v = dist(rng); 66 | a.push_back(v); 67 | b.push_back(v * v); 68 | } 69 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 70 | } 71 | } 72 | 73 | TEST(Spearman, QuadraticInverse) { 74 | size_t size = 10; 75 | std::vector a(size); 76 | std::vector b(size); 77 | for (uint32_t i = 0; i < size; ++i) { 78 | a[i] = 2 * i * i + 5; 79 | b[size - i - 1] = 2 * i * i + 5; 80 | } 81 | ASSERT_EQ(-1, spearman(a, b)); 82 | } 83 | 84 | TEST(Spearman, AllIdentical) { 85 | size_t size = 10; 86 | std::vector a(size); 87 | std::vector b(size); 88 | for (uint32_t i = 0; i < size; ++i) { 89 | a[i] = 2 * i * i + 5; 90 | b[i] = 2 * i * i + 5; 91 | } 92 | ASSERT_EQ(1, spearman(a, b)); 93 
| } 94 | 95 | TEST(Spearman, SomeValues) { 96 | std::vector a = { 35, 23, 47, 17, 10, 43, 9, 6, 28 }; 97 | std::vector b = { 30, 33, 45, 23, 8, 49, 12, 4, 31 }; 98 | ASSERT_EQ(0.9, spearman(a, b)); 99 | } 100 | 101 | TEST(Spearman, LinearRepeats) { 102 | std::mt19937 rng(123457); 103 | std::uniform_real_distribution<> dist(1, 10000); 104 | std::uniform_int_distribution<> sz_dist(5, 10); 105 | double coef = dist(rng); 106 | for (uint32_t trial = 0; trial < 10; ++trial) { 107 | size_t size = sz_dist(rng); 108 | std::vector a; 109 | std::vector b; 110 | for (uint32_t i = 0; i < size; ++i) { 111 | size_t v = dist(rng); 112 | a.push_back(v); 113 | a.push_back(v); 114 | b.push_back(v * coef); 115 | b.push_back(v * coef); 116 | } 117 | ASSERT_EQ(1, spearman(a, b)) << "Trial " << trial << " Coef: " << coef; 118 | } 119 | } 120 | 121 | TEST(Spearman, Rankify) { 122 | std::vector a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 }; 123 | std::vector expected_ranks { 1.5, 1.5, 3.5, 3.5, 5.5, 5.5, 7, 8.5, 8.5 }; 124 | ASSERT_EQ(expected_ranks, rankify(a)); 125 | } 126 | 127 | TEST(Spearman, SomeValuesRepeats) { 128 | std::vector a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 }; 129 | std::vector b = { 7, 8, 8, 19, 19, 3, 3, 5, 9 }; 130 | 131 | ASSERT_NEAR(-0.19314, spearman(a, b), 1e-5); 132 | } 133 | 134 | 135 | } // namespace 136 | -------------------------------------------------------------------------------- /third_party/murmur_hash/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(murmur_hash) 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_EXPORT_COMPILE_COMMANDS 1) 5 | 6 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 7 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 8 | 9 | 10 | file(GLOB murmur_files "*.cpp") 11 | add_library(murmur_lib ${murmur_files}) 12 | target_include_directories(murmur_lib INTERFACE .) 13 | -------------------------------------------------------------------------------- /third_party/murmur_hash/murmur_hash3.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #include <cstdint> 6 | 7 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 8 | 9 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 10 | 11 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 12 | -------------------------------------------------------------------------------- /util/multivec.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/transformer.hpp" 4 | 5 | #include <cstddef> 6 | #include <type_traits> 7 | #include <vector> 8 | 9 | namespace ts { // ts = Tensor Sketch 10 | 11 | template <typename T> 12 | using is_u_integral = typename std::enable_if<std::is_unsigned<T>::value>::type; 13 | 14 | template <typename T> 15 | using Vec2D = std::vector<std::vector<T>>; 16 | 17 | template <typename T> 18 | using Vec3D = std::vector<Vec2D<T>>; 19 | 20 | template <typename T> 21 | using Vec4D = std::vector<Vec3D<T>>; 22 | 23 | 24 | template <typename T> 25 | auto new2D(size_t d1, size_t d2, T val = 0) { 26 | return Vec2D<T>(d1, std::vector<T>(d2, val)); 27 | } 28 | template <typename T> 29 | auto new3D(size_t d1, size_t d2, size_t d3, T val = 0) { 30 | return Vec3D<T>(d1, new2D(d2, d3, val)); 31 | } 32 | 33 | template <typename T> 34 | void apply(std::vector<T> &vec, const transformer<T> &tr) { 35 | for (auto &v : vec) { 36 | v = tr.transform(v); 37 | } 38 | } 39 | 40 | template <typename T> 41 | void apply(Vec2D<T> &vec2D, const transformer<T> &tr) { 42 | for (auto &vec : vec2D) { 43 | apply(vec, tr); 44 | } 45 | } 46 | 47 | template <typename T> 48 | void apply(Vec3D<T> &vec3D, const transformer<T> &tr) { 49 | for (auto &vec2D : vec3D) { 50 | apply(vec2D, tr); 51 | } 52 | } 53 | 54 | } // namespace ts 55 | -------------------------------------------------------------------------------- /util/progress.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by amir on 1/9/21. 3 | // 4 | #include "util/progress.hpp" 5 | #include <iomanip> 6 | #include <iostream> 7 | 8 | namespace ts { 9 | 10 | size_t progress_bar::it; 11 | size_t progress_bar::total; 12 | size_t progress_bar::bar_len; 13 | size_t progress_bar::bar_step; 14 | 15 | void progress_bar::init(size_t total_iterations, size_t len) { 16 | progress_bar::it = 0; 17 | progress_bar::total = total_iterations; 18 | progress_bar::bar_len = len; 19 | progress_bar::bar_step = 0; 20 | } 21 | 22 | void progress_bar::iter() { 23 | #pragma omp critical 24 | { 25 | ++it; 26 | auto step = (it * bar_len) / total; 27 | while (step > bar_step) { 28 | if (bar_step > 0) 29 | std::cerr << "\b\b\b\b"; 30 | ++bar_step; 31 | std::cerr << "#" << std::setw(3) << (int)(100.0 * it / total) << "%" << std::flush; 32 | } 33 | if (it == total) { 34 | std::cerr << "\033[2K\r" << std::flush; 35 | } 36 | } 37 | } 38 | 39 | } // namespace ts 40 | -------------------------------------------------------------------------------- /util/progress.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by amir on 18/12/2020.
3 | // 4 | #pragma once 5 | #include <cstddef> 6 | 7 | namespace ts { 8 | 9 | 10 | struct progress_bar { 11 | static size_t it; 12 | static size_t total; 13 | static size_t bar_len; 14 | static size_t bar_step; 15 | 16 | static void init(size_t total_iterations, size_t bar_len = 50); 17 | static void iter(); 18 | }; 19 | 20 | 21 | } // namespace ts 22 | -------------------------------------------------------------------------------- /util/spearman.hpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <cassert> 3 | #include <cmath> 4 | #include <cstddef> 5 | #include <vector> 6 | 7 | // Returns the 1-based fractional ranks of the observations in v (ties receive the average rank) 8 | template <typename T> 9 | std::vector<double> rankify(const std::vector<T> &v) { 10 | std::vector<T> sorted = v; 11 | std::sort(begin(sorted), end(sorted)); 12 | 13 | std::vector<double> result(v.size()); 14 | 15 | for (size_t i = 0; i < v.size(); i++) { 16 | const auto lb = std::lower_bound(std::begin(sorted), std::end(sorted), v[i]); 17 | const auto ub = std::upper_bound(std::begin(sorted), std::end(sorted), v[i]); 18 | const size_t r = 1 + (lb - std::begin(sorted)), s = ub - lb; 19 | 20 | // Use Fractional Rank formula fractional_rank = r + (s-1)/2 21 | result[i] = r + (s - 1) * 0.5; 22 | } 23 | 24 | return result; 25 | } 26 | 27 | /* Compute the Pearson correlation coefficient of a and b */ 28 | template <typename T> 29 | double pearson(const std::vector<T> &a, const std::vector<T> &b) { 30 | assert(a.size() == b.size()); 31 | T sum_a = 0, sum_b = 0, sum_ab = 0; 32 | T square_sum_a = 0, square_sum_b = 0; 33 | 34 | for (size_t i = 0; i < a.size(); i++) { 35 | sum_a = sum_a + a[i]; 36 | sum_b = sum_b + b[i]; 37 | sum_ab = sum_ab + a[i] * b[i]; 38 | square_sum_a = square_sum_a + a[i] * a[i]; 39 | square_sum_b = square_sum_b + b[i] * b[i]; 40 | } 41 | 42 | // compute (scaled) variances 43 | T var_a = a.size() * square_sum_a - sum_a * sum_a; 44 | T var_b = a.size() * square_sum_b - sum_b * sum_b; 45 | // treat degenerate cases 46 | if (var_a == 0 && var_b == 0) { 47 | return 1; 48 | } 49 | if (var_a == 0 || var_b == 0) { 50 | return 0; 51 | } 52 | 53 | return (a.size() * sum_ab - sum_a * sum_b) / std::sqrt(var_a * var_b); 54 | } 55 | 56 | template <typename T> 57 | double spearman(const std::vector<T> &a, const std::vector<T> &b) { 58 | std::vector<double> rank1 = rankify(a); 59 | std::vector<double> rank2 = rankify(b); 60 | return pearson(rank1, rank2); 61 | } 62 | -------------------------------------------------------------------------------- /util/timer.cpp: -------------------------------------------------------------------------------- 1 | #include <map> 2 | #include <omp.h> 3 | #include "timer.hpp" 4 | 5 | namespace ts { 6 | 7 | using namespace std::chrono; 8 | 9 | std::vector<std::map<std::string, nanoseconds>> Timer::durations_vec 10 | = std::vector<std::map<std::string, nanoseconds>>(100); 11 | std::vector<std::map<std::string, size_t>> Timer::counts_vec 12 | = std::vector<std::map<std::string, size_t>>(100); 13 | 14 | 15 | void Timer::add_duration(const std::string &func_name, nanoseconds dur) { 16 | int tid = omp_get_thread_num(); 17 | auto &durations = durations_vec[tid]; 18 | 19 | if (durations.find(func_name) == durations.end()) { // doesn't contain `func_name` 20 | durations[func_name] = dur; 21 | counts_vec[tid][func_name] = 1; 22 | } else { 23 | durations[func_name] += dur; 24 | counts_vec[tid][func_name]++; 25 | } 26 | } 27 | 28 | 29 | std::string Timer::summary() { 30 | std::map<std::string, std::string> trans = { 31 | { "edit_distance", "ED" }, 32 | { "minhash", "MH" }, 33 | { "weighted_minhash", "WMH" }, 34 | { "ordered_minhash", "OMH" }, 35 | { "tensor_sketch", "TS" }, 36 | { "tensor_slide_sketch", "TSS" }, 37 | { "Int32Flattener", "I32FLAT" }, 38 | { "DoubleFlattener", "FLAT" }, 39 |
{"seq2kmer", "S2K"} 40 | }; 41 | std::map total_counts; 42 | for (auto &counts : counts_vec) { 43 | for (auto const &[arg_name, arg_count] : counts) { 44 | if (total_counts.find(arg_name) != total_counts.end()) 45 | total_counts[arg_name] += arg_count; 46 | else 47 | total_counts[arg_name] = arg_count; 48 | } 49 | } 50 | std::map acc; 51 | for (auto &durations : Timer::durations_vec) { 52 | for (auto const &[arg_name, arg] : durations) { 53 | if (acc.find(arg_name) != acc.end()) { 54 | acc[arg_name] += arg.count(); 55 | } else { 56 | acc[arg_name] = arg.count(); 57 | } 58 | } 59 | } 60 | 61 | std::string str = "long name,short name, time, time sketch, time dist\n"; 62 | 63 | for (auto const &[arg_name, arg] : acc) { 64 | double sk_time = (double)arg, dist_time; 65 | if (arg_name.find("hash") != std::string::npos && // contains *hash* 66 | arg_name.find("dist") == std::string::npos) { // doesn't contain *dist* 67 | sk_time += acc["seq2kmer"]; // add kmer computation time to MH* methods 68 | } 69 | if (arg_name == "edit_distance") { 70 | sk_time = sk_time /1e6/total_counts[arg_name]; 71 | str += arg_name + "," + trans[arg_name] + "," + std::to_string(sk_time) + ",0,0\n"; 72 | } else if (arg_name.find("dist") == std::string::npos && arg_name!="seq2kmer") { 73 | sk_time = sk_time /1e6/total_counts[arg_name] ; // mean sketching time (ms) 74 | dist_time = acc[arg_name + "_dist"]/1e6/total_counts[arg_name + "_dist"]; // mean distance computation time (ms) 75 | str += arg_name + "," + trans[arg_name] + 76 | "," + std::to_string(sk_time + dist_time) + 77 | "," + std::to_string(sk_time) + 78 | "," + std::to_string(dist_time) + '\n'; 79 | } 80 | } 81 | return str; 82 | } 83 | 84 | } // namespace ts 85 | -------------------------------------------------------------------------------- /util/timer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | namespace ts { // ts = Tensor Sketch 12 | 13 | using namespace std::chrono; 14 | 15 | 16 | class Timer { 17 | public: 18 | Timer(std::string name) : 19 | name(std::move(name)), 20 | birth(high_resolution_clock::now()){} 21 | 22 | Timer(const Timer &tt) : 23 | name(tt.name), 24 | birth(high_resolution_clock::now()){} 25 | ~Timer() { 26 | auto dur = high_resolution_clock::now() - birth; 27 | Timer::add_duration(name, dur); 28 | } 29 | 30 | static std::string summary(); 31 | 32 | private: 33 | static void add_duration(const std::string &func_name, std::chrono::nanoseconds dur); 34 | 35 | 36 | std::string name; 37 | high_resolution_clock::time_point birth; 38 | 39 | static std::vector> durations_vec; 40 | static std::vector> counts_vec; 41 | }; 42 | 43 | } // namespace ts 44 | -------------------------------------------------------------------------------- /util/transformer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace ts { 8 | 9 | template 10 | class transformer { 11 | public: 12 | virtual T transform(T val) const = 0; 13 | }; 14 | 15 | template 16 | class discretize : public transformer { 17 | public: 18 | explicit discretize(size_t num_bins) : num_bins(num_bins) { 19 | bins = std::vector(num_bins); 20 | for (size_t b = 0; b < num_bins; b++) { 21 | bins[b] = std::tan(M_PI * (((double)b + .5) / num_bins - .5)); 22 | } 23 | bins.push_back(std::numeric_limits::max()); 24 | bins.insert(bins.begin(), 
-std::numeric_limits<T>::max()); 25 | } 26 | 27 | 28 | // bin edges used to discretize the sketch output 29 | std::vector<T> bins; 30 | 31 | T transform(T val) const override { 32 | return std::upper_bound(bins.begin(), bins.end(), val) - bins.begin(); 33 | } 34 | 35 | private: 36 | /** number of bins used to discretize the output */ 37 | size_t num_bins; 38 | }; 39 | 40 | template <class T> 41 | class atan_scaler : public transformer<T> { 42 | public: 43 | T transform(T val) const override { return std::atan(val); } 44 | }; 45 | 46 | } // namespace ts 47 | -------------------------------------------------------------------------------- /util/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.hpp" 2 | 3 | #include <cmath> 4 | #include <filesystem> 5 | #include <fstream> 6 | #include <gflags/gflags.h> 7 | #include <numeric> 8 | 9 | namespace ts { 10 | 11 | std::string flag_values(char delimiter, bool skip_empty, bool include_flagfile) { 12 | const std::string short_letters = "fkmstw"; 13 | std::vector<gflags::CommandLineFlagInfo> flags; 14 | gflags::GetAllFlags(&flags); 15 | std::string result; 16 | for (const auto &flag : flags) { 17 | if (skip_empty && flag.current_value.empty()) 18 | continue; 19 | if (!include_flagfile && flag.name == "flagfile") 20 | continue; 21 | // Exclude short name flags. 22 | if (flag.name.size() == 1 && short_letters.find(flag.name[0]) != std::string::npos) 23 | continue; 24 | result += "--" + flag.name + "=" + flag.current_value + delimiter; 25 | } 26 | return result; 27 | } 28 | 29 | void write_output_meta() { 30 | std::string output_path; 31 | if (!gflags::GetCommandLineOption("o", &output_path)) 32 | return; 33 | 34 | std::string meta_path = output_path + ".meta"; 35 | std::ofstream meta(meta_path); 36 | meta << "#!/bin/sh\n"; 37 | meta << "cd " << std::filesystem::current_path() << "\n"; 38 | meta << gflags::GetArgv0() << " " << flag_values(' ', true, false) << "\n"; 39 | 40 | std::filesystem::permissions(meta_path, std::filesystem::perms::owner_exec, 41 | std::filesystem::perm_options::add); 42 | } 43 | 44 | std::pair<double, double> avg_stddev(const std::vector<double> &v) { 45 | if (v.empty()) 46 | return { 0, 0 }; 47 | const double sum = std::accumulate(begin(v), end(v), 0.0); 48 | const double avg = sum / v.size(); 49 | 50 | double var = 0; 51 | for (const auto &x : v) 52 | var += (x - avg) * (x - avg); 53 | 54 | return { avg, sqrt(var / v.size()) }; 55 | } 56 | 57 | double median(const std::vector<double> &v) { 58 | assert(!v.empty()); 59 | if (v.size() % 2) 60 | return v[v.size() / 2]; 61 | return (v[v.size() / 2 - 1] + v[v.size() / 2]) / 2; 62 | } 63 | 64 | } // namespace ts 65 | -------------------------------------------------------------------------------- /util/utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/multivec.hpp" 4 | #include "util/timer.hpp" 5 | 6 | #include <algorithm> 7 | #include <cassert> 8 | #include <cmath> 9 | #include <tuple> 10 | #include <vector> 11 | 12 | namespace ts { // ts = Tensor Sketch 13 | 14 | /** 15 | * Extracts k-mers from a sequence. Each k-mer is treated as a number in base alphabet_size and 16 | * converted to decimal; the sequence s1...sk is converted to s1 + s2*S + ... + sk*S^(k-1), where 17 | * S is the alphabet size, k is the k-mer size, and the first character is the least-significant digit.
18 | * @tparam chr types of elements in the sequence 19 | * @tparam kmer type that stores a kmer 20 | * @param seq the sequence to extract kmers from 21 | * @param kmer_size number of characters in a kmer 22 | * @param alphabet_size size of the alphabet 23 | * @return the extracted kmers, as integers converted from base #alphabet_size 24 | */ 25 | template <class chr, class kmer> 26 | std::vector<kmer> seq2kmer(const std::vector<chr> &seq, uint8_t kmer_size, uint8_t alphabet_size) { 27 | Timer timer("seq2kmer"); 28 | if (seq.size() < (size_t)kmer_size) { 29 | return std::vector<kmer>(); 30 | } 31 | 32 | std::vector<kmer> result(seq.size() - kmer_size + 1, 0); 33 | 34 | kmer c = 1; 35 | for (uint8_t i = 0; i < kmer_size; i++) { 36 | result[0] += c * seq[i]; 37 | c *= alphabet_size; 38 | } 39 | c /= alphabet_size; 40 | 41 | for (size_t i = 0; i < result.size() - 1; i++) { 42 | kmer base = result[i] - seq[i]; 43 | assert(base % alphabet_size == 0); 44 | result[i + 1] = base / alphabet_size + seq[i + kmer_size] * c; 45 | } 46 | return result; 47 | } 48 | 49 | template <class T> 50 | T l1_dist(const std::vector<T> &a, const std::vector<T> &b) { 51 | assert(a.size() == b.size()); 52 | T res = 0; 53 | for (size_t i = 0; i < a.size(); i++) { 54 | auto el = std::abs(a[i] - b[i]); 55 | res += el; 56 | } 57 | return res; 58 | } 59 | 60 | 61 | template <class T> 62 | T l2_dist(const std::vector<T> &a, const std::vector<T> &b) { 63 | assert(a.size() == b.size()); 64 | T res = 0; 65 | for (size_t i = 0; i < a.size(); i++) { 66 | auto el = std::abs(a[i] - b[i]); 67 | res += el * el; 68 | } 69 | return res; 70 | } 71 | 72 | 73 | template <class T> 74 | T l1_dist2D_minlen(const Vec2D<T> &a, const Vec2D<T> &b) { 75 | auto len = std::min(a.size(), b.size()); 76 | T val = 0; 77 | for (size_t i = 0; i < len; i++) { 78 | for (size_t j = 0; j < a[i].size() and j < b[i].size(); j++) { 79 | auto el = std::abs(a[i][j] - b[i][j]); 80 | val += el; 81 | } 82 | } 83 | return val; 84 | } 85 | 86 | template <class T> 87 | T l2_dist2D_minlen(const Vec2D<T> &a, const Vec2D<T> &b) { 88 | auto len = std::min(a.size(), b.size()); 89 | T val = 0; 90 | for (size_t i = 0; i < len; i++) { 91 | for (size_t j = 0; j < a[i].size() and j < b[i].size(); j++) { 92 | auto el = (a[i][j] - b[i][j]); 93 | val += el * el; 94 | } 95 | } 96 | return val; 97 | } 98 | 99 | 100 | template <class T> 101 | T hamming_dist(const std::vector<T> &a, const std::vector<T> &b) { 102 | assert(a.size() == b.size()); 103 | T diff = 0; 104 | for (size_t i = 0; i < a.size(); i++) { 105 | if (a[i] != b[i]) { 106 | diff++; 107 | } 108 | } 109 | return diff; 110 | } 111 | 112 | template <class T> 113 | int lcs(const std::vector<T> &s1, const std::vector<T> &s2) { 114 | size_t m = s1.size(); 115 | size_t n = s2.size(); 116 | // int L[m + 1][n + 1]; 117 | Vec2D<int> L(m + 1, std::vector<int>(n + 1, 0)); 118 | for (size_t i = 0; i <= m; i++) { 119 | for (size_t j = 0; j <= n; j++) { 120 | if (i == 0 || j == 0) { 121 | L[i][j] = 0; 122 | } else if (s1[i - 1] == s2[j - 1]) { 123 | L[i][j] = L[i - 1][j - 1] + 1; 124 | } else { 125 | L[i][j] = std::max(L[i - 1][j], L[i][j - 1]); 126 | } 127 | } 128 | } 129 | return L[m][n]; 130 | } 131 | 132 | template <class T> 133 | size_t lcs_distance(const std::vector<T> &s1, const std::vector<T> &s2) { 134 | return s1.size() + s2.size() - 2 * lcs(s1, s2); 135 | } 136 | 137 | template <class T> 138 | size_t edit_distance(const std::vector<T> &s1, const std::vector<T> &s2) { 139 | Timer timer("edit_distance"); 140 | const size_t m(s1.size()); 141 | const size_t n(s2.size()); 142 | 143 | if (m == 0) 144 | return n; 145 | if (n == 0) 146 | return m; 147 | 148 | auto costs = std::vector<size_t>(n + 1); 149 | 150 | for 
(size_t k = 0; k <= n; k++) 151 | costs[k] = k; 152 | 153 | size_t i = 0; 154 | for (auto it1 = s1.begin(); it1 != s1.end(); ++it1, ++i) { 155 | costs[0] = i + 1; 156 | size_t corner = i; 157 | 158 | size_t j = 0; 159 | for (auto it2 = s2.begin(); it2 != s2.end(); ++it2, ++j) { 160 | size_t upper = costs[j + 1]; 161 | if (*it1 == *it2) { 162 | costs[j + 1] = corner; 163 | } else { 164 | size_t t(upper < corner ? upper : corner); 165 | costs[j + 1] = (costs[j] < t ? costs[j] : t) + 1; 166 | } 167 | 168 | corner = upper; 169 | } 170 | } 171 | 172 | size_t result = costs[n]; 173 | 174 | return result; 175 | } 176 | 177 | template <class T, class = is_u_integral<T>> 178 | T int_pow(T x, T pow) { 179 | T result = 1; 180 | for (;;) { 181 | if (pow & 1) 182 | result *= x; 183 | pow >>= 1; 184 | if (!pow) 185 | break; 186 | x *= x; 187 | } 188 | 189 | return result; 190 | } 191 | 192 | std::string 193 | flag_values(char delimiter = ' ', bool skip_empty = false, bool include_flagfile = true); 194 | 195 | // If the -o output flag is set, this writes a small shell script <output>.meta containing the 196 | // command line used to generate the output. 197 | void write_output_meta(); 198 | 199 | // A simple wrapper around std::apply that applies a given lambda on each element of a tuple. 200 | template <class F, class T> 201 | void apply_tuple(F &&f, T &tuple_t) { 202 | std::apply([&](auto &...t) { (f(t), ...); }, tuple_t); 203 | } 204 | 205 | 206 | // A simple wrapper around std::apply that applies f on pairs of elements of two tuples. 207 | template <class F, class T, class U> 208 | void apply_tuple(F &&f, T &tuple_t, U &tuple_u) { 209 | std::apply([&](auto &...t) { std::apply([&](auto &...u) { (f(t, u), ...); }, tuple_u); }, 210 | tuple_t); 211 | } 212 | 213 | 214 | std::pair<double, double> avg_stddev(const std::vector<double> &v); 215 | 216 | // v must be sorted. 217 | double median(const std::vector<double> &v); 218 | 219 | } // namespace ts 220 | --------------------------------------------------------------------------------
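
The fractional-rank and correlation helpers in util/spearman.hpp can be exercised directly. The sketch below is a minimal, hypothetical driver (not part of the repository) that assumes it is compiled with the repository root on the include path; the expected numbers come straight from tests/util/test_spearman.cpp (Rankify and SomeValues).

#include "util/spearman.hpp"

#include <iostream>
#include <vector>

int main() {
    // Ties share the average of the positions they occupy:
    // {1,1,2,2,3,3,4,5,5} -> {1.5, 1.5, 3.5, 3.5, 5.5, 5.5, 7, 8.5, 8.5}.
    std::vector<double> a = { 1, 1, 2, 2, 3, 3, 4, 5, 5 };
    for (double rank : rankify(a))
        std::cout << rank << ' ';
    std::cout << '\n';

    // Spearman correlation = Pearson correlation of the two rank vectors.
    std::vector<double> x = { 35, 23, 47, 17, 10, 43, 9, 6, 28 };
    std::vector<double> y = { 30, 33, 45, 23, 8, 49, 12, 4, 31 };
    std::cout << spearman(x, y) << '\n'; // 0.9
}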
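
util/multivec.hpp builds nested vectors (Vec2D/Vec3D) and applies a transformer element-wise. A small hypothetical usage sketch, assuming the template signatures shown in util/multivec.hpp and util/transformer.hpp above:

#include "util/multivec.hpp" // also pulls in util/transformer.hpp

#include <iostream>

int main() {
    // A 2 x 3 matrix of doubles, every entry initialised to 1.5.
    ts::Vec2D<double> m = ts::new2D<double>(2, 3, 1.5);

    // atan_scaler squashes unbounded sketch values into (-pi/2, pi/2);
    // apply() walks the nested vectors and transforms each entry in place.
    ts::atan_scaler<double> squash;
    ts::apply(m, squash);

    std::cout << m[0][0] << '\n'; // atan(1.5) ~= 0.9828
}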
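
util/progress.* and util/timer.* are meant to be used together in the experiment loops: progress_bar is a static, OpenMP-safe counter, and Timer is an RAII guard whose per-thread durations feed Timer::summary(). A hypothetical sketch; sketch_one_sequence and compare_to_reference are placeholders, and the file must be compiled with OpenMP and linked against progress.cpp and timer.cpp.

#include "util/progress.hpp"
#include "util/timer.hpp"

#include <iostream>

// Placeholders for one unit of work (not part of the repository).
static void sketch_one_sequence(int) {}
static void compare_to_reference(int) {}

int main() {
    const int n = 1000;
    ts::progress_bar::init(n); // default bar length of 50 characters

#pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        { ts::Timer timer("tensor_sketch"); sketch_one_sequence(i); }       // sketching time
        { ts::Timer timer("tensor_sketch_dist"); compare_to_reference(i); } // distance time
        ts::progress_bar::iter(); // guarded by '#pragma omp critical' internally
    }

    // One CSV row per sketch method, pairing "<name>" with "<name>_dist" timings.
    std::cout << ts::Timer::summary();
}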
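
discretize in util/transformer.hpp places its num_bins cut points at tan(pi*((b+0.5)/num_bins - 0.5)), so bins are narrow near zero and wide in the tails, and pads them with -max/+max sentinels; transform() then returns the index of the first edge above the value. A small sketch of the resulting mapping (edge values are approximate):

#include <algorithm> // transformer.hpp relies on std::upper_bound being visible

#include "util/transformer.hpp"

#include <iostream>

int main() {
    // num_bins = 4 gives edges ~ -2.414, -0.414, 0.414, 2.414 plus the two sentinels.
    ts::discretize<double> disc(4);
    for (double v : { -3.0, 0.0, 0.5, 10.0 })
        std::cout << disc.transform(v) << ' '; // prints: 1 3 4 5
    std::cout << '\n';
}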
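
The k-mer packing in ts::seq2kmer treats the first character of each window as the least-significant base-S digit and reuses the previous value to compute the next one in O(1); the 4*4 "k-mers of length 2 over the DNA alphabet" mentioned in the WeightedMinHash tests are exactly this encoding. A hypothetical sketch of the encoding for DNA 2-mers, assuming the <chr, kmer> template-parameter order documented above and linking against util/timer.cpp because of the internal Timer guard:

#include "util/utils.hpp"

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // "ACGT" with A,C,G,T encoded as 0,1,2,3 over an alphabet of size 4.
    std::vector<uint8_t> seq = { 0, 1, 2, 3 };

    // 2-mers: "AC" -> 0 + 1*4 = 4, "CG" -> 1 + 2*4 = 9, "GT" -> 2 + 3*4 = 14.
    std::vector<uint64_t> kmers =
            ts::seq2kmer<uint8_t, uint64_t>(seq, /*kmer_size=*/2, /*alphabet_size=*/4);

    for (uint64_t k : kmers)
        std::cout << k << ' '; // prints: 4 9 14
    std::cout << '\n';
}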
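
ts::edit_distance keeps only a single DP row (costs) of size n+1, and ts::lcs_distance is defined as |s1| + |s2| - 2*LCS(s1, s2). A hypothetical check on the classic kitten/sitting pair, again assuming util/timer.cpp is linked for the Timer guard:

#include "util/utils.hpp"

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::string s = "kitten", t = "sitting";
    std::vector<char> a(s.begin(), s.end()), b(t.begin(), t.end());

    std::cout << ts::edit_distance(a, b) << '\n'; // 3 (substitute k->s, e->i, insert g)
    std::cout << ts::lcs_distance(a, b) << '\n';  // 5 (LCS "ittn" has length 4: 6 + 7 - 2*4)
}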
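
ts::int_pow is square-and-multiply on unsigned integers (enforced by the is_u_integral constraint): the exponent is consumed bit by bit while the base is squared, so the loop runs O(log pow) times. A short usage sketch:

#include "util/utils.hpp"

#include <cstdint>
#include <iostream>

int main() {
    std::cout << ts::int_pow<uint64_t>(3, 10) << '\n'; // 59049
    std::cout << ts::int_pow<uint64_t>(2, 63) << '\n'; // 9223372036854775808
}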