├── .github └── workflows │ └── main.yml ├── LICENSE ├── README.md ├── conda └── meta.yaml ├── requirements.txt ├── setup.py ├── tests └── py │ ├── censat_sim_worst_allowed_results.tsv │ ├── check_deletion.py │ ├── check_indel_errors.py │ ├── check_real_censat_chm13.py │ ├── check_sim_censat_chm13.py │ └── check_sim_diploid.py └── veritymap ├── .clang-format ├── CMakeLists.txt ├── Makefile ├── __init__.py ├── __version__.py ├── config ├── config_hifi_diploid.tsv ├── config_hifi_haploid.tsv ├── config_hifi_haploid_complete.tsv └── config_ont_haploid_complete.tsv ├── main.py ├── py_src ├── __init__.py ├── assembly.py ├── mapper.py ├── reporting.py └── utils.py ├── src ├── projects │ ├── CMakeLists.txt │ └── veritymap │ │ ├── CMakeLists.txt │ │ ├── chaining.hpp │ │ ├── cigar.cpp │ │ ├── cigar.hpp │ │ ├── cms_utils.hpp │ │ ├── config │ │ ├── config.cpp │ │ └── config.hpp │ │ ├── dp_scoring.hpp │ │ ├── hash_utils.hpp │ │ ├── kmer_index │ │ ├── filter_rep_kmers.hpp │ │ ├── index_builders │ │ │ ├── approx_canon_kmer_indexer_single_thread.hpp │ │ │ ├── approx_kmer_indexer_builder.hpp │ │ │ ├── exact_canon_kmer_indexer.hpp │ │ │ ├── exact_kmer_index_builder.hpp │ │ │ ├── kmer_filter.hpp │ │ │ ├── kmer_index_builder.hpp │ │ │ └── kmer_window.hpp │ │ ├── indexed_contigs.hpp │ │ ├── kmer_filter_canon.hpp │ │ ├── kmer_index.hpp │ │ └── target_indexer.hpp │ │ ├── ksw_align.hpp │ │ ├── mapper.hpp │ │ ├── matches.hpp │ │ ├── query_indexer.hpp │ │ ├── rolling_hash.hpp │ │ ├── strand.hpp │ │ ├── veritymap.cpp │ │ └── veritymap.hpp └── tools │ ├── CMakeLists.txt │ ├── bloom │ ├── CMakeLists.txt │ └── bloom.hpp │ ├── common │ ├── CMakeLists.txt │ ├── cl_parser.cpp │ ├── cl_parser.hpp │ ├── coverage_utils.cpp │ ├── coverage_utils.hpp │ ├── dir_utils.hpp │ ├── logging.hpp │ ├── math_utils.hpp │ ├── oneline_utils.hpp │ ├── output_utils.hpp │ ├── parallel.h │ └── string_utils.hpp │ ├── ksw2 │ ├── CMakeLists.txt │ ├── LICENSE.txt │ ├── kalloc.cpp │ ├── kalloc.h │ ├── ksw2.h │ └── 
ksw2_extz2_sse.cpp │ ├── sequences │ ├── CMakeLists.txt │ ├── IntrusiveRefCntPtr.h │ ├── contigs.cpp │ ├── contigs.hpp │ ├── nucl.hpp │ ├── seqio.hpp │ ├── sequence.cpp │ ├── sequence.hpp │ ├── stream.hpp │ └── verify.hpp │ ├── sketch │ ├── CMakeLists.txt │ └── include │ │ ├── aesctr │ │ ├── aesctr.h │ │ └── wy.h │ │ ├── circularqueue │ │ └── cq.h │ │ ├── compact_vector │ │ ├── compact_iterator.hpp │ │ ├── compact_vector.hpp │ │ ├── const_iterator_traits.hpp │ │ ├── parallel_iterator_traits.hpp │ │ └── prefetch_iterator_traits.hpp │ │ ├── flat_hash_map │ │ └── flat_hash_map.hpp │ │ ├── libpopcnt │ │ └── libpopcnt.h │ │ ├── sketch │ │ ├── bbmh.h │ │ ├── bf.h │ │ ├── cbf.h │ │ ├── ccm.h │ │ ├── common.h │ │ ├── count_eq.h │ │ ├── dd.h │ │ ├── div.h │ │ ├── exception.h │ │ ├── filterhll.h │ │ ├── fixed_vector.h │ │ ├── fy.h │ │ ├── hash.h │ │ ├── hbb.h │ │ ├── heap.h │ │ ├── hedley.h │ │ ├── hk.h │ │ ├── hll.h │ │ ├── hmh.h │ │ ├── integral.h │ │ ├── isz.h │ │ ├── kthread.h │ │ ├── macros.h │ │ ├── median.h │ │ ├── mh.h │ │ ├── mod.h │ │ ├── mult.h │ │ ├── pc.h │ │ ├── pmh.h │ │ ├── policy.h │ │ ├── rnla.h │ │ ├── setsketch.h │ │ ├── sketch.h │ │ ├── sparse.h │ │ ├── sseutil.h │ │ ├── tsg.h │ │ ├── update.h │ │ ├── vac.h │ │ └── wip │ │ │ └── cuda │ │ │ └── hll.cuh │ │ ├── vec │ │ ├── stats.h │ │ ├── vec.h │ │ └── welford_sd.h │ │ └── xxHash │ │ ├── xxh3.h │ │ └── xxhash.h │ └── version │ ├── CMakeLists.txt │ ├── version.cpp.in │ └── version.hpp └── test_dataset ├── test_query.fasta └── test_target.fasta /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: BuildAndTestLaunch 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | env: 6 | # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 7 | BUILD_TYPE: Release 8 | 9 | jobs: 10 | build: 11 | # The CMake configure and build commands are platform agnostic and should work equally 12 | # well on Windows or Mac. 
You can convert this to a matrix build if you need 13 | # cross-platform coverage. 14 | # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix 15 | runs-on: ${{ matrix.os }} 16 | 17 | strategy: 18 | matrix: 19 | os: [ ubuntu-latest ] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: Build 25 | working-directory: ${{github.workspace}} 26 | shell: bash 27 | run: | 28 | cd veritymap 29 | make build_type=$BUILD_TYPE 30 | 31 | - name: TestLaunch 32 | working-directory: ${{github.workspace}} 33 | shell: bash 34 | run: | 35 | cd veritymap 36 | make test_launch 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VerityMap 2 | 3 | ## Quick start 4 | ```shell 5 | veritymap --reads test_dataset/test_query.fasta test_dataset/test_target.fasta -o test_outdir -d hifi 6 | ``` 7 | 8 | ## Introduction 9 | 10 | **VerityMap** (formerly known as TandemMapper2) is designed for mapping long reads (PacBio HiFi or ONT) to assemblies of extra-long tandem repeats, such as centromeres, but can be applied to whole-genome assemblies. The tool outputs a SAM file that can be used in any downstream analysis. In addition, VerityMap yields information about possible errors and heterozygous sites in the assembly based on analysis of rare k-mers. 11 | 12 | ## Installation 13 | 14 | Requirements are listed in ```requirements.txt``` and can be installed through Conda as ```conda install --file requirements.txt``` or pip as ```pip install -r requirements.txt```. 
15 | 16 | ## Usage 17 | 18 | ```shell 19 | veritymap [options] --reads READS_FILE -d {hifi,ont} -o OUTPUT_DIR ASSEMBLY_FILE 20 | 21 | Required arguments: 22 | --reads PATH File with Oxford Nanopore or PacBio HiFi reads used for ETR assembly 23 | -o PATH Folder to store all result files 24 | -d Sequencing platform used ("hifi" for PacBio HiFi reads and "ont" for ONT reads) 25 | 26 | Optional arguments: 27 | -t INT Maximum number of threads [default: 4] 28 | -l \"label,label,...\" Human-readable names of assemblies to use in reports, comma-separated. If labels contain spaces, use quotes 29 | ``` 30 | In case VerityMap is built locally, it should be run as follows: 31 | 32 | ```shell 33 | python veritymap/main.py [options] --reads READS_FILE -d {hifi,ont} -o OUTPUT_DIR ASSEMBLY_FILE 34 | ``` 35 | 36 | ## Output files 37 | 38 | The following files are contained in the `OUTPUT_DIR` directory (specified by `-o`) and include results 39 | for all input assemblies. 40 | 41 | `OUTPUT_DIR/*_alignment.bed` - VerityMap alignments in BED format 42 | 43 | `OUTPUT_DIR/*_alignment.sam` - VerityMap alignments in SAM format 44 | 45 | `OUTPUT_DIR/*_kmers_dist_diff.bed` - BED file with coordinates of possible heterozygous sites and errors. The format is `{ref_name} {start} {end} {misassembly_len} {% discordant reads}`. Please do not pay much attention to the last value, and use the interactive HTML plot described below to see % discordant reads. Negative misassembly length corresponds to a deletion in the reference. 46 | `OUTPUT_DIR/*_kmers_dist_diff.html` - interactive HTML plot showing possible heterozygous sites and errors. 47 | VerityMap analyzes distances between consecutive rare k-mers in the assembly and in a read. If these distances are inconsistent, i.e., a read does not support the assembly in this position, it may indicate the presence of a heterozygous site (if the percent of deviated reads is 20-80%) or an assembly error (if the percent of such reads is 80-100%). 48 | In the plot, OX shows position in the assembly, OY shows the percent of reads that disagree with the assembly in this position. 
By hovering over the plot, you can also get the information about the number of supporting reads, the mean difference in distances (essentially, the length of an indel) and standard deviation of the difference. 49 | In the BED file, VerityMap outputs approximate coordinates, length, and frequency of detected variants. Importantly, reported coordinates are coordinates of the rare k-mers that are nearest to the variant, i.e., the real coordinates of the variant are unknown and lie somewhere inside the reported region. You can find the real coordinates by analyzing the SAM file with the alignments. 50 | 51 | The remaining output files are technical and likely should not concern the end user. 52 | 53 | ## Citation 54 | 55 | Currently, you can refer to the first TandemTools paper: 56 | 57 | Alla Mikheenko, Andrey V Bzikadze, Alexey Gurevich, Karen H Miga, Pavel A Pevzner, TandemTools: mapping long reads and assessing/improving assembly quality in extra-long tandem repeats, Bioinformatics, Volume 36, Issue Supplement_1, July 2020, Pages i75–i83, https://doi.org/10.1093/bioinformatics/btaa440 58 | 59 | The paper describing the VerityMap algorithm is in preparation. 60 | 61 | ## Contacts 62 | 63 | Please report any problems to the [issue tracker](https://github.com/ablab/VerityMap/issues). Alternatively, you can write directly to [a.mikheenko@spbu.ru](mailto:a.mikheenko@spbu.ru). 64 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: "veritymap" 3 | version: "2.0.0" 4 | 5 | source: 6 | # Relative path to the parent directory. 7 | path: .. 8 | 9 | build: 10 | number: 0 11 | script: {{ PYTHON }} -m pip install . 
"""Build script for VerityMap.

Compiles the C++ binary with ``make`` (inside ``veritymap/``) as part of the
regular ``build`` command and then packages the Python sources together with
the produced binary and configuration files.
"""
import os
import subprocess
import sys


try:
    # Imported only to fail early with an actionable message.
    import setuptools  # noqa: F401
except ImportError:
    sys.exit("setuptools package not found. "
             "Please use 'pip install setuptools' first")

from setuptools import setup
from distutils.command.build import build as DistutilsBuild
from distutils.spawn import find_executable

from veritymap.__version__ import __version__


# Make sure we're running from the setup.py directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
if script_dir != os.getcwd():
    os.chdir(script_dir)


description = """
VerityMap aligns long PacBio Hi-Fi and ONT reads to
genome assemblies (including long repetitive regions)
"""


class MakeBuild(DistutilsBuild):
    """``build`` command that compiles the native binary before the default build."""

    def run(self):
        """Run ``make`` in the ``veritymap`` directory, then the standard build.

        Exits the process with an error message when ``make`` is not installed
        or when compilation fails.
        """
        if not find_executable("make"):
            sys.exit("ERROR: 'make' command is unavailable")
        try:
            # Passing cwd= keeps the interpreter's working directory untouched,
            # so nothing has to be restored when the compilation fails.
            subprocess.check_call(
                ["make"], cwd=os.path.join(script_dir, "veritymap"))
        except subprocess.CalledProcessError as e:
            # sys.exit() accepts a single argument; format the error into it.
            sys.exit("Compilation error: %s" % e)
        DistutilsBuild.run(self)


setup(
    name="VerityMap",
    # Single source of truth for the version: veritymap/__version__.py.
    version=__version__,
    description=description,
    url='https://github.com/ablab/VerityMap',
    author='Alla Mikheenko',
    author_email='al.miheenko@gmail.com',
    license='GNU General Public License v3.0',
    # Kept in sync with requirements.txt and conda/meta.yaml.
    install_requires=[
        'plotly', 'python-slugify', 'biopython', 'numpy', 'click'],
    packages=['veritymap'],
    package_dir={'veritymap': 'veritymap'},
    package_data={'veritymap': ['build/bin/veritymap', 'config/*', '*', 'py_src/*', 'test_dataset/*',]},
    entry_points={
        'console_scripts': ['veritymap=veritymap.main:main']
    },
    cmdclass={'build': MakeBuild}
)
"""Regression test: map simulated reads to a reference carrying a 10 kbp deletion.

Usage: check_deletion.py DATADIR OUTDIR VERITYMAP_BIN

Runs VerityMap on ``chr7_ext_del.fasta`` / ``chr7_ext_subreads.fasta`` from
DATADIR, then compares the primary chains written to ``OUTDIR/chains.tsv``
with the true read positions from ``chr7_ext_0001.maf``.
"""
import shlex
import subprocess
import sys
from os.path import join

point_del = 456400
deletion = 10000
threshold = 100

MIN_READS_W_DIFF = 5
MAX_READS_WO_DIFF = 0


def check_thresholds(reads_w_diff, reads_wo_diff):
    """Return a failure message, or None when both counters are acceptable.

    The test fails when too few chains span the deletion with the expected
    ~10 kbp distance difference, or when any chain spans it with a wrong one.
    """
    if reads_w_diff >= MIN_READS_W_DIFF and reads_wo_diff <= MAX_READS_WO_DIFF:
        return None
    return (f"Failure: "
            f"MIN_READS_W_DIFF = {MIN_READS_W_DIFF}, real = {reads_w_diff}. "
            f"MAX_READS_WO_DIFF = {MAX_READS_WO_DIFF}, real = {reads_wo_diff}")


def load_real_positions(maf_file):
    """Read true (reference start, read length) per read from the MAF file.

    Reads are named ``S1_<n>`` in the order their "ref" lines appear.
    """
    real_pos = {}
    i = 1
    with open(maf_file) as f:
        for line in f:
            if "ref" not in line:
                continue
            fs = line.split()
            ref_s, read_len = int(fs[2]), int(fs[3])
            real_pos["S1_%d" % i] = (ref_s, read_len)
            i += 1
    return real_pos


def scan_chains(chains_path):
    """Parse primary chains and count how they behave around the deletion.

    Returns ``(tm_pos, reads_w_diff, reads_wo_diff)`` where ``tm_pos`` maps a
    read name to its inferred start on the original (undeleted) reference.
    """
    tm_pos = {}
    reads_w_diff = 0
    reads_wo_diff = 0
    with open(chains_path) as f:
        for line in f:
            fs = line.split()
            if not fs or 'Aln' not in fs[0]:
                continue
            if not bool(int(fs[10])):  # keep primary chains only
                continue
            read_name, ref_name, read_s, read_e, read_len, ref_s, ref_e = fs[1:8]
            read_name = read_name.replace('+', '').replace('-', '')
            try:
                ref_s, ref_e, read_s, read_e = map(int, (ref_s, ref_e, read_s, read_e))
            except ValueError:
                continue
            # Chains starting after the breakpoint are shifted back by the
            # deletion length to get coordinates on the original reference.
            shift1 = deletion if ref_s >= point_del else 0
            if ref_s < point_del < ref_e:
                prev_pos, prev_read_pos = 0, 0
                f.readline()  # the line right after the header is skipped
                # NOTE(review): this loop also consumes the next 'Aln' header
                # line, so the chain that follows is not examined by the outer
                # loop; check_indel_errors.py carries the header over instead.
                while True:
                    line = f.readline()
                    if not line or 'Aln' in line:
                        break
                    fs = line.split()
                    read_pos, ref_pos = int(fs[0]), int(fs[1])
                    if prev_pos <= point_del < ref_pos:
                        ref_diff = abs(ref_pos - prev_pos)
                        read_diff = abs(read_pos - prev_read_pos)
                        diff = ref_diff - read_diff
                        if -10500 < diff < -9500:
                            reads_w_diff += 1
                        else:
                            reads_wo_diff += 1
                    prev_pos = ref_pos
                    prev_read_pos = read_pos
            tm_pos[read_name] = (max(0, ref_s - read_s + shift1), 10000)
    return tm_pos, reads_w_diff, reads_wo_diff


def main(argv):
    """Run VerityMap, evaluate its chains and exit non-zero on failure."""
    if len(argv) < 4:
        sys.exit("Usage: check_deletion.py DATADIR OUTDIR VERITYMAP_BIN")
    datadir, outdir, veritymap_bin = argv[1:4]

    fasta_file = join(datadir, "chr7_ext_del.fasta")
    reads_fasta_file = join(datadir, "chr7_ext_subreads.fasta")
    # Build the argument list directly so paths are never re-split on spaces;
    # the binary itself may still be given as a multi-word command.
    cmd = shlex.split(veritymap_bin) + [
        "--target", fasta_file, "--queries", reads_fasta_file,
        "-o", outdir, "-t", "10"]
    print(" ".join(cmd))
    subprocess.call(cmd)

    real_pos = load_real_positions(join(datadir, "chr7_ext_0001.maf"))
    tm_pos, reads_w_diff, reads_wo_diff = scan_chains(join(outdir, "chains.tsv"))

    correct = 0
    wrong = 0
    for read_name, (mapped_start, _) in tm_pos.items():
        true_start = real_pos[read_name][0]
        if abs(mapped_start - true_start) >= threshold:
            wrong += 1
            print("Wrongly mapped", read_name, mapped_start, true_start)
        else:
            correct += 1
    print("Wrongly mapped reads", wrong, "total mapped", correct + wrong)
    print("Chains extended through deletion with 10kbp diff", reads_w_diff, "(prev result: 8)")
    print("Chains extended through deletion with incorrect diff", reads_wo_diff, "(should be 0)")

    failure = check_thresholds(reads_w_diff, reads_wo_diff)
    if failure is not None:
        print(failure)
        sys.exit(1)

    print("Successful test on a dataset with deletion")


if __name__ == "__main__":
    main(sys.argv)
except: continue 56 | shift1 = 0 57 | if ref_s >= del_pos: 58 | shift1 += del_len 59 | prev_pos, prev_read_pos = 0, 0 60 | line = f.readline() 61 | while True: 62 | line = f.readline() 63 | fs = line.split() 64 | if not line or 'Aln' in line: break 65 | read_pos, ref_pos = int(fs[0]), int(fs[1]) 66 | ref_diff = abs(ref_pos - prev_pos) 67 | read_diff = abs(read_pos - prev_read_pos) 68 | diff = abs(ref_diff-read_diff) 69 | if prev_pos and prev_pos < del_pos-1 <= ref_pos: 70 | if abs(del_len)-100 < abs(diff) < abs(del_len)+100: 71 | #print(read_name,diff, line) 72 | reads_w_diff+=1 73 | else: 74 | #print("Mapped with incorrect diff", read_name,diff) 75 | reads_wo_diff +=1 76 | elif prev_pos and abs(diff) > DIFF_THRESHOLD: 77 | #print("Mapped with diff", diff, "outside the del_len:", read_name, prev_pos) 78 | outside_diff +=1 79 | prev_pos = ref_pos 80 | prev_read_pos = read_pos 81 | #read_len = read_lens[read_name] 82 | read_shift = read_s 83 | tm_pos[read_name] = (max(0,ref_s-read_shift+shift1),10000) 84 | 85 | a=0 86 | b = 0 87 | for read_name in tm_pos: 88 | if abs(tm_pos[read_name][0] - real_pos[read_name][0]) >= DIFF_POS_THRESHOLD: 89 | b += 1 90 | #print("Wrongly mapped", read_name, tm_pos[read_name][0], real_pos[read_name][0]) 91 | else: a+=1 92 | print("Total mapped reads",a+b) 93 | print("Wrongly mapped reads",b) 94 | print("Chains extended through del_len with correct diff", reads_w_diff) 95 | print("Chains extended through del_len with incorrect diff", reads_wo_diff, "(should be 0)") 96 | print("Reads with discrepancies outside the del_len", outside_diff) 97 | 98 | MIN_READS_W_DIFF = 5 99 | MAX_READS_WO_DIFF = 0 100 | MAX_OUTSIDE_DIFF = 20 101 | 102 | if reads_w_diff <= MIN_READS_W_DIFF or reads_wo_diff > MAX_READS_WO_DIFF or outside_diff > MAX_OUTSIDE_DIFF: 103 | print(f"Failure: " 104 | f"MIN_READS_W_DIFF = {MIN_READS_W_DIFF}, real = {reads_w_diff}. " 105 | f"MAX_READS_WO_DIFF = {MAX_READS_WO_DIFF}, real = {reads_wo_diff}. 
"""Run VerityMap on every CHM13 centromeric satellite array and merge the results.

For each array listed in the censat BED file the script launches VerityMap on
the per-chromosome reads/reference pair, then merges the per-chromosome k-mer
indexes (and, unless ``--only-index`` is given, the SAM alignments) into
whole-genome coordinates.
"""
import argparse
import os
from pathlib import Path
import shlex
import subprocess
import sys


SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
DEFAULT_INPUT = "/Poppy/abzikadze/centroFlye/centroFlye_repo/data-share/tandemtools2/censat"
DEFAULT_VMBIN = os.path.join(SCRIPT_DIR, os.pardir, os.pardir,
                             'build', 'bin', 'veritymap')
DEFAULT_CENSAT_BED = os.path.join(DEFAULT_INPUT, "cenAnnotation.merged.bed")
DEFAULT_REF_FAI = os.path.join(DEFAULT_INPUT, "ref_v1", "chm13.draft_v1.0.fasta.fai")


class CenSatInfo:
    """Coordinates of one centromeric satellite array on its chromosome."""

    def __init__(self, chrom, s, e, full_len=None):
        self.chrom = chrom
        self.s = int(s)
        self.e = int(e)
        # Full chromosome length; filled in from the FAI file when available.
        self.full_len = full_len

    def __len__(self):
        """Return the length of the array (end minus start)."""
        return self.e - self.s


def get_censats_info(censat_bed_fn, ref_fai_fn):
    """Build a ``{chrom: CenSatInfo}`` map from a BED file and a FASTA index.

    Only the first three BED columns are used; blank lines are skipped.
    Chromosome lengths are taken from the first two columns of the FAI file.
    """
    censats_info = {}
    with open(censat_bed_fn) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            chrom, s, e = fields[:3]
            censats_info[chrom] = CenSatInfo(chrom, s, e)
    with open(ref_fai_fn) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            chrom, full_len = fields[:2]
            if chrom in censats_info:
                censats_info[chrom].full_len = int(full_len)
    return censats_info


def parallel_process(outdir, datadir, censat_info, veritymap_bin, threads,
                     only_index):
    """Run VerityMap for a single satellite array into ``outdir/<chrom>``."""
    outdir = os.path.join(outdir, censat_info.chrom)
    queries_fn = os.path.join(datadir, "real_reads", "censat",
                              censat_info.chrom + "_censat.fasta")
    target_fn = os.path.join(datadir, "references",
                             censat_info.chrom + "_censat.fasta")
    # Argument list instead of a re-split string: data paths may contain
    # spaces, while the binary may still be given as a multi-word command.
    cmd = shlex.split(veritymap_bin) + [
        "--target", target_fn, "--queries", queries_fn,
        "-o", outdir, "-t", str(threads)]
    if only_index:
        cmd.append("--only-index")
    print(" ".join(cmd))
    subprocess.call(cmd)
    print(censat_info.chrom, " FINISHED!")


def merge_sam(outdir, censats_info, cmd, threads):
    """Merge per-chromosome SAM files into one sorted, indexed BAM.

    Alignment positions are shifted by the start of the array so they refer to
    whole-chromosome coordinates. Requires ``awk`` and ``samtools``.
    """
    merged_sam_fn = os.path.join(outdir, 'alignments_merged.sam')
    with open(merged_sam_fn, 'w') as f:
        for chrom, censat_info in censats_info.items():
            if censat_info.full_len is None:
                raise ValueError(
                    f"Length of {chrom} is unknown: it is missing from the FAI file")
            print(f'@SQ\tSN:{chrom}\tLN:{censat_info.full_len}', file=f)
        print(f'@PG\tID:VerityMap\tPN:VerityMap\tVN:2.0\tCL:{cmd}',
              file=f)
    with open(merged_sam_fn, 'a') as f:
        for chrom, censat_info in censats_info.items():
            print(chrom)
            sam_fn = os.path.join(outdir, censat_info.chrom,
                                  'alignments.sam')
            # NR>2 drops the two header lines of the per-chromosome SAM file.
            awk_cmd = \
                f"awk 'NR>2 {{OFS=\"\t\"; $3=\"{chrom}\"; $4+={censat_info.s}; print}}' {shlex.quote(sam_fn)}"
            subprocess.call(awk_cmd, shell=True, stdout=f)

    merged_bam_fn = os.path.join(outdir, 'alignments_merged.bam')
    sam2bam_cmd = (f"samtools view -b {shlex.quote(merged_sam_fn)} | "
                   f"samtools sort -@{threads} -o {shlex.quote(merged_bam_fn)}")
    subprocess.call(sam2bam_cmd, shell=True)
    subprocess.call(["samtools", "index", merged_bam_fn])
    os.remove(merged_sam_fn)


def merge_indexes(outdir, censats_info, cmd):
    """Concatenate per-chromosome k-mer indexes in whole-chromosome coordinates.

    Reads ``outdir/<chrom>/kmer_indexes.tsv`` (two columns; the second is a
    coordinate inside the array) and writes ``outdir/kmer_indexes_merged.tsv``
    with ``<chrom>\\t<coordinate + array start>`` lines. ``cmd`` is unused and
    kept for signature compatibility.
    """
    outfn = os.path.join(outdir, 'kmer_indexes_merged.tsv')
    with open(outfn, 'w') as fout:
        for chrom, censat_info in censats_info.items():
            fn = os.path.join(outdir, chrom, 'kmer_indexes.tsv')
            with open(fn) as fin:
                for line in fin:
                    _, coord = line.split()
                    coord = int(coord) + censat_info.s
                    fout.write(f'{censat_info.chrom}\t{coord}\n')


def merge_results(outdir, censats_info, cmd, threads, only_index):
    """Merge k-mer indexes and, unless ``only_index`` is set, the alignments."""
    merge_indexes(outdir, censats_info, cmd)
    if only_index:
        return
    merge_sam(outdir, censats_info, cmd, threads)


def main():
    """Parse the command line, run all arrays in parallel and merge results."""
    # Imported here because only the parallel launch needs joblib; the merging
    # helpers above stay importable on machines without it.
    from joblib import Parallel, delayed

    cmd = ' '.join(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument("--datadir", "-i",
                        default=DEFAULT_INPUT)
    parser.add_argument("--threads", type=int, default=10)
    parser.add_argument("--n-jobs", type=int, default=3)
    parser.add_argument("--outdir", "-o", required=True)
    parser.add_argument("--veritymap-bin",
                        default=DEFAULT_VMBIN)
    parser.add_argument("--censat-bed", default=DEFAULT_CENSAT_BED)
    parser.add_argument("--ref-fai", default=DEFAULT_REF_FAI)
    parser.add_argument("--only-index", action="store_true")
    params = parser.parse_args()

    Path(params.outdir).mkdir(parents=True, exist_ok=True)
    censats_info = get_censats_info(params.censat_bed, params.ref_fai)

    Parallel(n_jobs=params.n_jobs)(
        delayed(parallel_process)(
            params.outdir, params.datadir, censat_info,
            params.veritymap_bin, params.threads, params.only_index)
        for censat_info in censats_info.values())

    merge_results(params.outdir, censats_info, cmd,
                  params.threads, params.only_index)


if __name__ == "__main__":
    main()
"maf_second.txt")) as f: 20 | for line in f: 21 | read_name, ref_s = line.split()[:2] 22 | read_name = "S2_%d" % i 23 | real_pos[read_name] = int(ref_s) 24 | i += 1 25 | 26 | tm_pos = dict() 27 | d = 0 28 | with open(chains_file) as f: 29 | for line in f: 30 | fs = line.split() 31 | if 'Aln' not in fs[0]: 32 | continue 33 | is_primary = bool(int(fs[10])) 34 | if not is_primary: 35 | continue 36 | read_name, ref_name, read_s, read_e, read_len, ref_s, ref_e = fs[1:8] 37 | if 'S2' in read_name and 'hg' not in ref_name: 38 | d += 1 39 | print(fs) 40 | continue 41 | if 'S2' not in read_name and 'hg' in ref_name: 42 | d += 1 43 | print(fs) 44 | continue 45 | score = float(fs[-1]) 46 | read_name = read_name.replace('+', '').replace('-', '') 47 | try: 48 | ref_s, ref_e, read_s, read_e = map( 49 | int, (ref_s, ref_e, read_s, read_e)) 50 | except BaseException: 51 | continue 52 | shift1 = 0 53 | read_shift = read_s # if strand != 'backward' else read_len - read_e 54 | if read_name in read_scores and read_scores[read_name] > score: 55 | continue 56 | read_scores[read_name] = score 57 | tm_pos[read_name] = (max(0, ref_s-read_shift+shift1), 10000) 58 | 59 | a = 0 60 | b = 0 61 | for read_name in tm_pos: 62 | if abs(tm_pos[read_name][0] - real_pos[read_name]) >= threshold: 63 | b += 1 64 | #if b <10:print(read_name, tm_pos[read_name][0], real_pos[read_name]) 65 | else: 66 | #if a <4:print(read_name, tm_pos[read_name][0], real_pos[read_name]) 67 | a += 1 68 | print("Wrong target: %d reads" % d) 69 | print("Wrong mapped:", b, "reads, correct mapped:", 70 | a, "reads, unmapped:", len(real_pos) - (a+b)) 71 | 72 | 73 | if len(sys.argv) >= 4: 74 | outfn = sys.argv[3] 75 | with open(outfn, 'w') as f: 76 | print("Wrong_target\t%d" % d, file=f) 77 | print("Misplaced\t", b, "\nCorrect\t", a, "\nUnmapped\t", 78 | len(real_pos) - (a+b), sep='', file=f) 79 | -------------------------------------------------------------------------------- /veritymap/.clang-format: 
-------------------------------------------------------------------------------- 1 | # Generated from CLion C/C++ Code Style settings 2 | BasedOnStyle: Google 3 | AccessModifierOffset: -1 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: None 6 | AlignOperands: DontAlign 7 | AllowAllArgumentsOnNextLine: false 8 | AllowAllConstructorInitializersOnNextLine: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: Always 11 | AllowShortCaseLabelsOnASingleLine: true 12 | AllowShortFunctionsOnASingleLine: All 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLambdasOnASingleLine: All 15 | AllowShortLoopsOnASingleLine: true 16 | AlwaysBreakAfterReturnType: None 17 | AlwaysBreakTemplateDeclarations: Yes 18 | BreakBeforeBraces: Custom 19 | BraceWrapping: 20 | AfterCaseLabel: false 21 | AfterClass: false 22 | AfterControlStatement: Never 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterUnion: false 27 | BeforeCatch: false 28 | BeforeElse: false 29 | IndentBraces: false 30 | SplitEmptyFunction: false 31 | SplitEmptyRecord: false 32 | BreakBeforeBinaryOperators: NonAssignment 33 | BreakBeforeTernaryOperators: true 34 | BreakConstructorInitializers: BeforeColon 35 | BreakInheritanceList: BeforeColon 36 | ColumnLimit: 120 37 | CompactNamespaces: false 38 | ContinuationIndentWidth: 4 39 | IndentCaseLabels: true 40 | IndentPPDirectives: None 41 | IndentWidth: 2 42 | KeepEmptyLinesAtTheStartOfBlocks: true 43 | MaxEmptyLinesToKeep: 1 44 | NamespaceIndentation: None 45 | ObjCSpaceAfterProperty: false 46 | ObjCSpaceBeforeProtocolList: false 47 | PointerAlignment: Right 48 | ReflowComments: false 49 | SpaceAfterCStyleCast: true 50 | SpaceAfterLogicalNot: false 51 | SpaceAfterTemplateKeyword: false 52 | SpaceBeforeAssignmentOperators: true 53 | SpaceBeforeCpp11BracedList: false 54 | SpaceBeforeCtorInitializerColon: true 55 | SpaceBeforeInheritanceColon: true 56 | SpaceBeforeParens: 
ControlStatements 57 | SpaceBeforeRangeBasedForLoopColon: true 58 | SpaceInEmptyParentheses: false 59 | SpacesBeforeTrailingComments: 0 60 | SpacesInAngles: false 61 | SpacesInCStyleCastParentheses: false 62 | SpacesInContainerLiterals: false 63 | SpacesInParentheses: false 64 | SpacesInSquareBrackets: false 65 | TabWidth: 4 66 | UseTab: Never 67 | -------------------------------------------------------------------------------- /veritymap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(veritymap) 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | 6 | if(NOT CMAKE_BUILD_TYPE) 7 | set(CMAKE_BUILD_TYPE Release) 8 | endif() 9 | 10 | set(CMAKE_CXX_FLAGS "-ggdb3 -pthread -lz -lrt -fopenmp" ) 11 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 12 | set(CMAKE_CXX_FLAGS_DEBUG "-g -pg") 13 | set(CMAKE_SHARED_LINKER_FLAGS "-static-libgcc -static-libstdc++ -Wall -Wc++-compat -O2 -msse4.1 -DHAVE_KALLOC -DKSW_CPU_DISPATCH -D_FILE_OFFSET_BITS=64 -ltbb -fsigned-char -fsanitize=address -pg") 14 | set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") 15 | 16 | # Define various dirs 17 | set(MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) 18 | # set(LIB_SRC_DIR ${MAIN_SRC_DIR}/lib) 19 | set(PROJECTS_SRC_DIR ${MAIN_SRC_DIR}/projects) 20 | # set(TESTS_SRC_DIR ${MAIN_SRC_DIR}/tests) 21 | set(TOOLS_SRC_DIR ${MAIN_SRC_DIR}/tools) 22 | 23 | set(VM_SRC_DIR ${PROJECTS_SRC_DIR}/veritymap) 24 | 25 | 26 | 27 | find_package(Git) 28 | find_package (ZLIB) 29 | include_directories(SYSTEM "${ZLIB_INCLUDE_DIRS}") 30 | 31 | # the commit's SHA1, and whether the building workspace was dirty or not 32 | execute_process(COMMAND 33 | "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=40 --dirty 34 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 35 | OUTPUT_VARIABLE GIT_SHA1 36 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 37 | 38 | # the date of the commit 39 | execute_process(COMMAND 40 | "${GIT_EXECUTABLE}" 
build_type?="Release"

# Every target here is a command, not a file: declare them all phony so a
# stray file called `veritymap` or `test_launch` can never mask the rule.
.PHONY: all clean cmake veritymap test_launch

all: veritymap

# Configure the out-of-source build tree.
cmake:
	mkdir -p build
	cd build && cmake .. -DCMAKE_BUILD_TYPE="${build_type}"

# Build, then collect the binary and its config directory under build/bin.
veritymap: cmake
	$(MAKE) -C build all
	mkdir -p build/bin
	mv $(abspath build/src/projects/veritymap/veritymap) build/bin/veritymap
	-rm -r build/bin/config
	mv $(abspath build/src/projects/veritymap/config) build/bin

# Smoke test: run on the bundled dataset and check the log's closing line.
test_launch: veritymap
	build/bin/veritymap \
		--target test_dataset/test_target.fasta \
		--queries test_dataset/test_query.fasta -o test_dataset/test_launch
	grep -q "Thank you for using VerityMap!" test_dataset/test_launch/veritymap.log

clean:
	-rm -r build
	-rm -r test_dataset/test_launch
test_dataset/test_launch/veritymap.log 23 | 24 | clean: 25 | -rm -r build 26 | -rm -r test_dataset/test_launch 27 | 28 | -------------------------------------------------------------------------------- /veritymap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ablab/VerityMap/d24aa797be9c977dbcb9164ecfe18b3af6e4a026/veritymap/__init__.py -------------------------------------------------------------------------------- /veritymap/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.0" -------------------------------------------------------------------------------- /veritymap/config/config_hifi_diploid.tsv: -------------------------------------------------------------------------------- 1 | base 239 2 | k 301 3 | min_uncovered_len 5000 4 | min_end_ident 0.997 5 | max_rare_cnt_target 10 6 | max_rare_cnt_query 1 7 | k_step_size 30 8 | k_window_size 30000 9 | window_regular_density 0.9 10 | strategy approximate_canon 11 | false_positive_probability 0.01 12 | exp_base 0.01 13 | nhash 5 14 | chunk_size 5000000 15 | false_positive_probability_canon 0.01 16 | exp_base_canon 0.01 17 | nhash_canon 5 18 | chunk_size_canon 5000000 19 | false_positive_probability_canon_single_thread 0.01 20 | exp_base_canon_single_thread 0.01 21 | nhash_canon_single_thread 5 22 | diploid 0 23 | careful_upper_bnd_cov_mult 3 24 | min_matches 20 25 | min_score 0 26 | max_top_score_prop 0.95 27 | max_jump 100000 28 | misassembly_penalty_base 5 29 | diff_penalty_mult 1 30 | min_chain_range 5000 31 | max_supp_dist_diff 5 32 | min_uniq_kmers 0 33 | match_score_unique 3 34 | match_score_dup 1 35 | match_score_rare 0.1 36 | match_score 1 37 | mis_score -2 38 | gapo 2 39 | gape 1 -------------------------------------------------------------------------------- /veritymap/config/config_hifi_haploid.tsv: 
-------------------------------------------------------------------------------- 1 | base 239 2 | k 301 3 | min_uncovered_len 5000 4 | min_end_ident 0.997 5 | max_rare_cnt_target 10 6 | max_rare_cnt_query 1 7 | k_step_size 50 8 | k_window_size 30000 9 | window_regular_density 0.9 10 | strategy approximate_canon 11 | false_positive_probability 0.01 12 | exp_base 0.01 13 | nhash 5 14 | chunk_size 5000000 15 | false_positive_probability_canon 0.01 16 | exp_base_canon 0.01 17 | nhash_canon 5 18 | chunk_size_canon 5000000 19 | false_positive_probability_canon_single_thread 0.01 20 | exp_base_canon_single_thread 0.01 21 | nhash_canon_single_thread 5 22 | diploid 0 23 | careful_upper_bnd_cov_mult 3 24 | min_matches 20 25 | min_score 0 26 | max_top_score_prop 0.9 27 | max_jump 100000 28 | misassembly_penalty_base 5 29 | diff_penalty_mult 1 30 | min_chain_range 5000 31 | max_supp_dist_diff 5 32 | min_uniq_kmers 0 33 | match_score_unique 3 34 | match_score_dup 0.1 35 | match_score_rare 0.1 36 | match_score 1 37 | mis_score -2 38 | gapo 2 39 | gape 1 -------------------------------------------------------------------------------- /veritymap/config/config_hifi_haploid_complete.tsv: -------------------------------------------------------------------------------- 1 | base 239 2 | k 301 3 | min_uncovered_len 5000 4 | min_end_ident 0.997 5 | max_rare_cnt_target 10 6 | max_rare_cnt_query 1 7 | k_step_size 50 8 | k_window_size 30000 9 | window_regular_density 0.9 10 | strategy approximate 11 | false_positive_probability 0.01 12 | exp_base 0.01 13 | nhash 5 14 | chunk_size 5000000 15 | false_positive_probability_canon 0.01 16 | exp_base_canon 0.01 17 | nhash_canon 5 18 | chunk_size_canon 5000000 19 | false_positive_probability_canon_single_thread 0.01 20 | exp_base_canon_single_thread 0.01 21 | nhash_canon_single_thread 5 22 | diploid 0 23 | careful_upper_bnd_cov_mult 3 24 | min_matches 20 25 | min_score 0 26 | max_top_score_prop 0.9 27 | max_jump 100000 28 | misassembly_penalty_base 
5 29 | diff_penalty_mult 1 30 | min_chain_range 5000 31 | max_supp_dist_diff 5 32 | min_uniq_kmers 0 33 | match_score_unique 3 34 | match_score_dup 0.1 35 | match_score_rare 0.1 36 | match_score 1 37 | mis_score -2 38 | gapo 2 39 | gape 1 -------------------------------------------------------------------------------- /veritymap/config/config_ont_haploid_complete.tsv: -------------------------------------------------------------------------------- 1 | base 239 2 | k 51 3 | min_uncovered_len 5000 4 | min_end_ident 0.9 5 | max_rare_cnt_target 30 6 | max_rare_cnt_query 1 7 | k_step_size 10 8 | k_window_size 30000 9 | window_regular_density 0.9 10 | strategy approximate 11 | false_positive_probability 0.01 12 | exp_base 0.01 13 | nhash 5 14 | chunk_size 5000000 15 | false_positive_probability_canon 0.01 16 | exp_base_canon 0.01 17 | nhash_canon 5 18 | chunk_size_canon 5000000 19 | false_positive_probability_canon_single_thread 0.01 20 | exp_base_canon_single_thread 0.01 21 | nhash_canon_single_thread 5 22 | diploid 0 23 | careful_upper_bnd_cov_mult 3 24 | min_matches 20 25 | min_score 0 26 | max_top_score_prop 0.9 27 | max_jump 100000 28 | misassembly_penalty_base 1000 29 | diff_penalty_mult 10 30 | min_chain_range 5000 31 | max_supp_dist_diff 5 32 | min_uniq_kmers 0 33 | match_score_unique 1 34 | match_score_dup 0.2 35 | match_score_rare 0.2 36 | match_score 1 37 | mis_score -2 38 | gapo 2 39 | gape 1 40 | -------------------------------------------------------------------------------- /veritymap/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import sys 4 | from os.path import isdir, abspath 5 | 6 | root = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir) 7 | sys.path.insert(0, root) 8 | 9 | import click as click 10 | 11 | from veritymap.py_src.assembly import Assembly 12 | from veritymap.py_src.mapper import do 13 | 14 | 15 | @click.command() 16 | 
@click.command()
@click.argument('assembly_fnames', type=click.Path(exists=True), nargs=-1)
@click.option('--reads', 'reads_fname', type=click.Path(exists=True), help='File with ONT/PacBio reads')
@click.option('-o', 'out_dir', type=click.Path(), required=True, help='Output folder')
@click.option('-t', 'threads', type=click.INT, help='Threads', default=4)
@click.option('-d', 'datatype',
              type=click.Choice(['hifi-haploid', 'hifi-haploid-complete', 'hifi-diploid', 'ont-haploid-complete']),
              help='Sequencing platform and assembly type (default: hifi-haploid-complete). '
                   '"hifi-*" presets are for PacBio HiFi reads and "ont-*" presets are for ONT reads. '
                   'Please note that "ont" mode is experimental and '
                   'should be used with extra care')
@click.option('-f', '--no-reuse', 'no_reuse', is_flag=True, help='Do not reuse old files')
@click.option('--careful', 'is_careful', is_flag=True, help='Run mapper in a careful mode to better detect inconsistencies. Can be time- and memory-consuming. Not recommended to run on the whole genome. ')
@click.option('-l', 'labels', help='Comma separated list of assembly labels')
def main(assembly_fnames, reads_fname, labels, out_dir, threads, no_reuse, is_careful, datatype):
    """Command-line entry point: validate arguments, then run the mapper.

    Exits with status 2 when no reads file or no assembly is given, or when
    the number of labels differs from the number of assemblies.
    """
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("%s VerityMap started" % date)
    if not reads_fname:
        print("ERROR! You should specify ONE path to a file with reads (ONT or Pacbio CLR reads)")
        sys.exit(2)

    if not assembly_fnames:
        print("ERROR! You should specify at least one assembly file.")
        sys.exit(2)

    if not datatype:
        datatype = "hifi-haploid-complete"

    list_labels = [None] * len(assembly_fnames)
    if labels:
        # Tolerate quoting and spaces around the commas ("a, b").
        list_labels = [label.strip() for label in labels.replace('"', '').split(',')]
        if len(list_labels) != len(assembly_fnames):
            print("ERROR! Number of labels must correspond to the number of analyzed assemblies")
            sys.exit(2)

    # Normalise and create the output folder *before* building the Assembly
    # objects so the per-assembly paths they derive are absolute.
    out_dir = abspath(out_dir)
    os.makedirs(out_dir, exist_ok=True)
    assemblies = [Assembly(fname, name=label, out_dir=out_dir)
                  for fname, label in zip(assembly_fnames, list_labels)]
    do(assemblies, reads_fname, datatype, out_dir, threads, no_reuse, is_careful)
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print()
    print("%s VerityMap finished!" % date)
class Assembly:
    """Per-assembly bookkeeping: display label, slug name and output paths."""

    def __init__(self, fname=None, name=None, out_dir=None):
        """Derive the label, slug and output file names for one assembly.

        Args:
            fname: path to the assembly file (required despite the default).
            name: optional display label; falls back to the file stem.
            out_dir: folder for the per-assembly outputs (required despite
                the default).

        Raises:
            TypeError: if ``fname`` or ``out_dir`` is missing.
        """
        # The None defaults used to fail deep inside os.path with an opaque
        # message; fail early, with the same exception type, naming the culprit.
        if fname is None or out_dir is None:
            raise TypeError("Assembly requires both 'fname' and 'out_dir'")

        stem = splitext(basename(fname))[0]
        self.fname = fname
        self.real_coords = None
        self.label = name or stem
        # File names always come from the slugified stem, never from the label.
        self.name = slugify(stem)
        self.contig_name = "contig"
        self.chains_fname = join(out_dir, "%s.txt" % self.name)
        self.bed_fname = join(out_dir, "%s.bed" % self.name)
        self.sam_fname = join(out_dir, "%s.sam" % self.name)
def format_func(value, tick_number):
    """Format an axis tick given in base pairs as whole megabases.

    ``tick_number`` is unused; it is kept for the tick-formatter signature.
    """
    megabases = value / 1000000
    if megabases == 0:
        return "0"
    return "%d Mb" % megabases


def _flagged_regions(vals, threshold):
    """Yield (first_idx, last_idx) for each maximal run of vals >= threshold."""
    start = None
    for idx, val in enumerate(vals):
        if val >= threshold:
            if start is None:
                start = idx
        elif start is not None:
            yield start, idx - 1
            start = None
    if start is not None:
        # A run reaching the end of the contig used to be silently dropped.
        yield start, len(vals) - 1


def make_plotly_html(assemblies, all_data, out_dir):
    """Write per-contig deviation plots plus BED/TSV/TXT summaries.

    Args:
        assemblies: objects with a ``label`` attribute, parallel to all_data.
        all_data: one ``(errors, coverage)`` pair per assembly. ``errors`` maps
            contig name to records ``e`` where ``e[0]:e[1]`` is a position
            range, ``e[2]`` a read name and ``e[3]`` a numeric distance
            difference; ``coverage`` maps contig name to per-base coverage.
        out_dir: folder receiving ``<label>_kmers_dist_diff.bed``,
            ``<label>_reads_dist_diff.txt``, ``<label>_errors.tsv`` and one
            ``<contig>_kmers_dist_diff.html`` per contig.
    """
    step = 200
    all_refs = set()
    for errors, coverage in all_data:
        all_refs.update(errors.keys())

    # Truncate the per-assembly outputs (they are appended to per contig
    # below); use a context manager so the handles are closed.
    for asm in assemblies:
        for suffix in ("_kmers_dist_diff.bed", "_reads_dist_diff.txt", "_errors.tsv"):
            with open(join(out_dir, asm.label + suffix), "w"):
                pass

    for ref_name in all_refs:
        fig = make_subplots(rows=len(all_data), cols=1,
                            subplot_titles=[a.label for a in assemblies])
        for plot_idx, (errors, coverage) in enumerate(all_data):
            # all_refs is a union over assemblies: this one may lack the contig.
            if ref_name not in coverage:
                continue
            asm_id = assemblies[plot_idx].label

            base_cov = coverage[ref_name]
            binned_cov = [max(base_cov[i:i + step]) for i in range(0, len(base_cov), step)]
            n_bins = len(binned_cov)
            reads_per_bin = [0] * n_bins
            diffs_per_bin = [[] for _ in range(n_bins)]
            for e in errors.get(ref_name, []):
                for i in range(e[0], e[1], step):
                    real_pos = int(i / step)
                    if real_pos >= n_bins:
                        break
                    reads_per_bin[real_pos] += 1
                    diffs_per_bin[real_pos].append((e[2], e[3]))

            vals = [0] * n_bins
            diffs = [0] * n_bins
            reads = [0] * n_bins
            customdata = []
            new_errors = []
            for i in range(n_bins):
                diff_arr = [d for _, d in diffs_per_bin[i]]
                median_diff = np.median(diff_arr) if diff_arr else 0
                stddev = min(np.std(diff_arr) if diff_arr else 0, 50)
                # Keep only reads that agree with the bin's median difference.
                tolerance = min(abs(median_diff) / 5, stddev)
                kept = [(read, d) for read, d in diffs_per_bin[i]
                        if abs(d - median_diff) <= tolerance]
                filt_reads = [read for read, _ in kept]
                filt_diff = [d for _, d in kept]
                reads[i] = filt_reads
                mean_diff2 = np.mean(filt_diff) if filt_diff else 0
                stddev2 = np.std(filt_diff) if filt_diff else 0
                total = binned_cov[i] + reads_per_bin[i]
                if total >= MIN_COVERAGE and len(filt_reads) > 1:
                    vals[i] = len(filt_reads) * 100.0 / total
                diffs[i] = mean_diff2
                customdata.append((len(filt_reads), total, mean_diff2, stddev2))
                if vals[i] > MIN_AF:
                    new_errors.append((ref_name, i * step, len(filt_reads), binned_cov[i],
                                       total, mean_diff2, stddev2))

            real_x = [i * step for i in range(n_bins)]
            fig.add_trace(
                go.Scatter(x=real_x, y=vals, showlegend=False, customdata = customdata,
                           hovertemplate="%{customdata[0]} out of %{customdata[1]} reads, "
                                         "mean diff %{customdata[2]:.2f} std deviation %{customdata[3]:.2f}"), row=plot_idx+1, col=1)
            fig.update_yaxes(range=[-3,105],title_text="% deviated reads", titlefont=dict(size=18), tickfont=dict(size=18),
                             hoverformat="d", row=plot_idx+1, col=1)
            fig.update_xaxes(title_text="Position", titlefont=dict(size=18), tickfont=dict(size=18), hoverformat="d",
                             row=plot_idx+1, col=1)

            # Explicit index pairs replace the old `prev_i = 0` sentinel,
            # which could not represent a region starting at position 0.
            regions = list(_flagged_regions(vals, MIN_AF))

            bed_fname = join(out_dir, asm_id + "_kmers_dist_diff.bed")
            with open(bed_fname, "a") as f:
                for first, last in regions:
                    # Difference and percentage come from the region's first bin.
                    # NOTE(review): "%2.f" prints zero decimals; "%.2f" may have
                    # been intended - kept as is to preserve the file format.
                    f.write("%s\t%d\t%d\t%d\t%2.f\n" %
                            (ref_name, real_x[first], real_x[last], diffs[first], vals[first]))

            errors_fname = join(out_dir, asm_id + "_errors.tsv")
            with open(errors_fname, "a") as f:
                for err in new_errors:
                    f.write("\t".join([str(s) for s in err]))
                    f.write("\n")

            reads_fname = join(out_dir, asm_id + "_reads_dist_diff.txt")
            with open(reads_fname, "a") as f:
                for first, last in regions:
                    f.write("%s\t%d\t%d\n" % (ref_name, real_x[first], real_x[last]))
                    support_reads = set()
                    for idx in range(first, last + 1):
                        support_reads.update(reads[idx])
                    # Sorted so that repeated runs produce identical files.
                    for read in sorted(support_reads):
                        f.write(read + "\n")

        plot_fname = join(out_dir, ref_name + "_kmers_dist_diff.html")
        fig.write_html(plot_fname)
        print("  Difference in k-mer distances plot for %s saved to %s" % (ref_name, plot_fname))
def get_asm_lenghts(fname):
    """Return ``[(record_id, sequence_length), ...]`` for a FASTA/FASTQ file.

    The format is guessed from the extension (``fa``/``fasta`` or
    ``fq``/``fastq``), optionally followed by ``.gz`` for gzipped input.

    Raises:
        ValueError: if the extension is not one of the recognised ones.
    """
    ref_names = []

    opener, fname_wogzip = (gzip.open, os.path.splitext(fname)[0]) \
        if fname.endswith('.gz') else (open, fname)
    _, ext = os.path.splitext(fname_wogzip)
    ext = ext[1:]
    if ext in ['fa', 'fasta']:
        formt = 'fasta'
    elif ext in ['fq', 'fastq']:
        formt = 'fastq'
    else:
        raise ValueError("Can't guess format of " + fname +
                         " from its extension " + ext)

    with opener(fname, 'rt') as handle:
        for record in SeqIO.parse(handle, formt):
            ref_names.append((record.id, len(record.seq)))
    return ref_names


def calculate_coverage(assembly_lenghts, bed_fname, read_names=None):
    """Compute per-base read coverage for every contig from a 7-column BED.

    Args:
        assembly_lenghts: iterable of ``(contig_name, contig_length)``.
        bed_fname: whitespace-separated file with the columns
            ``ref ref_start ref_end read_name align_start align_end read_len``.
        read_names: optional container; when given, only records whose read
            name is in it are counted.

    Returns:
        dict mapping contig name to a list with one coverage value per base.
    """
    assembly_lenghts = list(assembly_lenghts)
    # One pass over the BED file, bucketing intervals per contig; previously
    # the whole file was re-read for every contig.
    intervals = {ref_name: [] for ref_name, _ in assembly_lenghts}
    with open(bed_fname) as f:
        for line in f:
            fs = line.split()
            if not fs:
                continue  # tolerate blank lines
            ref, ref_s, ref_e, read_name, align_start, align_end, read_len = fs
            if ref not in intervals:
                continue
            if read_names is not None and read_name not in read_names:
                continue
            intervals[ref].append((int(ref_s), int(ref_e)))

    asm_cov = dict()
    for ref_name, assembly_len in assembly_lenghts:
        # Difference array: +1 where a read starts, -1 where it stops.
        coverage = [0] * assembly_len
        for ref_s, ref_e in intervals[ref_name]:
            # Records lying entirely outside the contig used to raise
            # IndexError (start) or wrap around to the last base (end).
            if ref_s >= assembly_len or ref_e <= 0:
                continue
            coverage[max(0, ref_s)] += 1
            # NOTE(review): the decrement lands on ref_e - 1, so that base is
            # reported as uncovered; kept to preserve existing results -
            # confirm against the BED coordinate convention of the mapper.
            coverage[min(assembly_len - 1, ref_e - 1)] -= 1
        cur_cov = 0
        for i, delta in enumerate(coverage):
            cur_cov += delta
            coverage[i] = cur_cov
        asm_cov[ref_name] = coverage
    return asm_cov
3 | // 4 | 5 | #include "cigar.hpp" 6 | 7 | #include "sequences/verify.hpp" 8 | 9 | using namespace veritymap::cigar_utils; 10 | 11 | inline char veritymap::cigar_utils::cigar_mode2str(const CigarMode& fragment) { 12 | if (fragment == CigarMode::M) { 13 | return 'M'; 14 | } else if (fragment == CigarMode::I) { 15 | return 'I'; 16 | } else if (fragment == CigarMode::D) { 17 | return 'D'; 18 | } else { 19 | VERIFY(fragment == CigarMode::S); 20 | return 'S'; 21 | } 22 | } 23 | 24 | Cigar::Cigar(const ksw_extz_t& ez) { 25 | for (size_t i = 0; i < ez.n_cigar; ++i) { 26 | const auto mode = static_cast(ez.cigar[i] & 0xf); 27 | const size_t length{ez.cigar[i] >> 4}; 28 | cigar_vec.push_back({length, mode}); 29 | } 30 | } 31 | 32 | const std::vector& Cigar::get_cigar_vec() const { return cigar_vec; } 33 | 34 | void Cigar::extend(const size_t length, const CigarMode mode) { 35 | if (empty() or cigar_vec.back().mode != mode) { 36 | cigar_vec.push_back({length, mode}); 37 | } else { 38 | cigar_vec.back().length += length; 39 | } 40 | } 41 | 42 | void Cigar::extend(Cigar cigar) { 43 | if ((not empty()) and (not cigar.empty()) and (cigar_vec.back().mode == cigar.cigar_vec.front().mode)) { 44 | cigar.cigar_vec.front().length += cigar_vec.back().length; 45 | cigar_vec.pop_back(); 46 | } 47 | cigar_vec.insert(cigar_vec.end(), std::make_move_iterator(cigar.cigar_vec.begin()), 48 | std::make_move_iterator(cigar.cigar_vec.end())); 49 | } 50 | 51 | [[nodiscard]] size_t Cigar::query_length() const { 52 | size_t length{0}; 53 | for (const CigarFragment& fragment : cigar_vec) { 54 | if (fragment.mode != CigarMode::D) { 55 | length += fragment.length; 56 | } 57 | } 58 | return length; 59 | } 60 | 61 | [[nodiscard]] size_t Cigar::target_length() const { 62 | size_t length{0}; 63 | for (const CigarFragment& fragment : cigar_vec) { 64 | if (fragment.mode != CigarMode::I and fragment.mode != CigarMode::S) { 65 | length += fragment.length; 66 | } 67 | } 68 | return length; 69 | } 70 | 71 | 
[[nodiscard]] int Cigar::nmismatches(const Sequence& target, const Sequence& query) const { 72 | int nmism{0}; 73 | size_t t{0}, q{0}; 74 | for (const CigarFragment& fragment : cigar_vec) { 75 | if (fragment.mode == CigarMode::I or fragment.mode == CigarMode::S) { 76 | q += fragment.length; 77 | nmism += fragment.length; 78 | } else if (fragment.mode == CigarMode::D) { 79 | t += fragment.length; 80 | nmism += fragment.length; 81 | } else { 82 | VERIFY(fragment.mode == CigarMode::M); 83 | for (size_t i = 0; i < fragment.length; ++i) { 84 | if (target[t] != query[q]) { 85 | ++nmism; 86 | } 87 | ++t, ++q; 88 | } 89 | } 90 | } 91 | return nmism; 92 | } 93 | 94 | int Cigar::alignment_length() const { 95 | int length{0}; 96 | for (const CigarFragment& fragment : cigar_vec) { length += fragment.length; } 97 | return length; 98 | } 99 | 100 | double Cigar::identity(const Sequence& target, const Sequence& query) const { 101 | const int al_len = alignment_length(); 102 | if (al_len > 0) { 103 | const double identity = 1. - static_cast(nmismatches(target, query)) / al_len; 104 | VERIFY(identity >= 0); 105 | return identity; 106 | } 107 | return 1.; 108 | } 109 | 110 | std::ostream& veritymap::cigar_utils::operator<<(std::ostream& os, const Cigar& cigar) { 111 | for (const CigarFragment& fragment : cigar.get_cigar_vec()) { 112 | os << fragment.length << cigar_mode2str(fragment.mode); 113 | } 114 | return os; 115 | } -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/cigar.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 03/02/21. 
3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "ksw2/ksw2.h" 12 | #include "sequences/sequence.hpp" 13 | 14 | namespace veritymap::cigar_utils { 15 | 16 | enum class CigarMode { M, I, D, S }; 17 | 18 | inline char cigar_mode2str(const CigarMode &fragment); 19 | 20 | struct CigarFragment { 21 | size_t length{0}; 22 | CigarMode mode{}; 23 | }; 24 | 25 | class Cigar { 26 | std::vector cigar_vec; 27 | 28 | public: 29 | explicit Cigar(const ksw_extz_t &ez); 30 | Cigar(const size_t length, const CigarMode mode) : cigar_vec{{length, mode}} {} 31 | 32 | Cigar() = default; 33 | Cigar(Cigar &) = default; 34 | Cigar(Cigar &&) = default; 35 | Cigar &operator=(const Cigar &) = default; 36 | Cigar &operator=(Cigar &&) = default; 37 | ~Cigar() = default; 38 | 39 | [[nodiscard]] bool empty() const { return cigar_vec.empty(); } 40 | 41 | [[nodiscard]] const std::vector &get_cigar_vec() const; 42 | 43 | void extend(size_t length, CigarMode mode); 44 | 45 | void extend(Cigar cigar); 46 | 47 | [[nodiscard]] size_t query_length() const; 48 | 49 | [[nodiscard]] size_t target_length() const; 50 | 51 | [[nodiscard]] int nmismatches(const Sequence &target, const Sequence &query) const; 52 | 53 | [[nodiscard]] int alignment_length() const; 54 | 55 | [[nodiscard]] double identity(const Sequence &target, const Sequence &query) const; 56 | 57 | std::pair trim(const CigarMode &mode) { 58 | size_t left_trim = {0}, right_trim{0}; 59 | 60 | if (cigar_vec.empty()) { 61 | return {left_trim, right_trim}; 62 | } 63 | if (cigar_vec.front().mode == mode) { 64 | left_trim = cigar_vec.front().length; 65 | cigar_vec.erase(cigar_vec.begin()); 66 | } 67 | 68 | if (cigar_vec.empty()) { 69 | return {left_trim, right_trim}; 70 | } 71 | if (cigar_vec.back().mode == mode) { 72 | right_trim = cigar_vec.back().length; 73 | cigar_vec.pop_back(); 74 | } 75 | return {left_trim, right_trim}; 76 | } 77 | 78 | void soft_clip() { 79 | if (cigar_vec.empty()) { 80 | return; 81 | } 
82 | if (cigar_vec.front().mode == CigarMode::I) { 83 | cigar_vec.front().mode = CigarMode::S; 84 | } 85 | if (cigar_vec.back().mode == CigarMode::I) { 86 | cigar_vec.back().mode = CigarMode::S; 87 | } 88 | } 89 | }; 90 | 91 | std::ostream &operator<<(std::ostream &os, const Cigar &cigar); 92 | 93 | }// namespace veritymap::cigar_utils -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/cms_utils.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 10/26/21. 3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include "config/config.hpp" 10 | 11 | namespace veritymap::cms_utils { 12 | struct CMSParams { 13 | int nbits{1}; 14 | int l2sz{1}; 15 | int64_t nhash{1}; 16 | 17 | CMSParams(const Config::CommonParams &common_params, const Config::KmerIndexerParams &kmer_indexer_params, 18 | const uint64_t tot_len, const uint nthreads) 19 | : nbits{(int) ceil(log2(kmer_indexer_params.max_rare_cnt_target))}, 20 | l2sz{(int) ceil( 21 | log2(std::exp(kmer_indexer_params.approximate_kmer_indexer_params.exp_base) * tot_len / nthreads))}, 22 | nhash{kmer_indexer_params.approximate_kmer_indexer_params.nhash} {} 23 | }; 24 | 25 | }// namespace veritymap::cms_utils -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/config/config.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/19/21. 
3 | // 4 | 5 | #include "config.hpp" 6 | 7 | #include 8 | #include 9 | 10 | using namespace veritymap; 11 | 12 | Config Config::load_config_file(const std::filesystem::path& config_fn) { 13 | std::ifstream is(config_fn); 14 | std::string key, str_val; 15 | std::map m; 16 | while (is >> key >> str_val) { 17 | VERIFY(not m.contains(key)); 18 | m[key] = str_val; 19 | } 20 | using std::stoull, std::stoll, std::stod, std::stoi; 21 | Config::CommonParams common_params{stoull(m.at("k")), (bool) stoi(m.at("diploid"))}; 22 | Config::HashParams hash_params{stoull(m.at("base"))}; 23 | Config::KmerIndexerParams::ApproximateKmerIndexerParams aprx_kmer_indexer_params{ 24 | stod(m.at("false_positive_probability")), stod(m.at("exp_base")), stoi(m.at("nhash")), 25 | stoull(m.at("chunk_size"))}; 26 | Config::KmerIndexerParams::ApproximateCanonKmerIndexerParams aprx_canon_kmer_indexer_params{ 27 | stod(m.at("false_positive_probability_canon")), stod(m.at("exp_base_canon")), stoi(m.at("nhash_canon")), 28 | stoull(m.at("chunk_size_canon"))}; 29 | Config::KmerIndexerParams::ApproximateCanonSingleThreadKmerIndexerParams aprx_canon_single_thread_kmer_indexer_params{ 30 | stod(m.at("false_positive_probability_canon_single_thread")), stod(m.at("exp_base_canon_single_thread")), 31 | stoi(m.at("nhash_canon_single_thread"))}; 32 | Config::KmerIndexerParams kmer_indexer_params{ 33 | stoull(m.at("min_uncovered_len")), stoull(m.at("max_rare_cnt_target")), 34 | // stoull(m.at("max_rare_cnt_query")), 35 | stoull(m.at("k_step_size")), stoull(m.at("k_window_size")), stod(m.at("window_regular_density")), 36 | Config::KmerIndexerParams::str2strategy(m.at("strategy")), aprx_kmer_indexer_params, 37 | aprx_canon_kmer_indexer_params, aprx_canon_single_thread_kmer_indexer_params, 38 | stod(m.at("careful_upper_bnd_cov_mult"))}; 39 | Config::Chain2SAMParams::KSW2Params ksw_2_params{ 40 | static_cast(stoi(m.at("match_score"))), static_cast(stoi(m.at("mis_score"))), 41 | static_cast(stoi(m.at("gapo"))), 
// Aggregate of all run-time parameters, filled by load_config_file from a
// "key value" text file. Unless noted, each field is read from the config key
// of the same name.
struct Config {
  struct CommonParams {
    size_t k;
    bool diploid;// parsed as an integer and converted to bool
  };
  CommonParams common_params;

  struct HashParams {
    using htype = uint64_t;
    size_t base;
  };
  HashParams hash_params;

  struct KmerIndexerParams {
    size_t min_uncovered_len;

    size_t max_rare_cnt_target;
    //  size_t max_rare_cnt_query;
    //  static_assert(max_rare_cnt_target <= std::numeric_limits<uint8_t>::max(),
    //                "Match stores frequency as uint8_t to economize memory usage");
    size_t k_step_size;
    size_t k_window_size;
    double window_regular_density;

    enum class Strategy { exact, approximate, approximate_canon, exact_canon };
    Strategy strategy;
    // Name of a strategy, identical to its enumerator name.
    static std::string strategy2str(const Strategy& strategy) {
      if (strategy == Strategy::exact)
        return "exact";
      if (strategy == Strategy::approximate)
        return "approximate";
      if (strategy == Strategy::approximate_canon)
        return "approximate_canon";
      VERIFY(strategy == Strategy::exact_canon);
      return "exact_canon";
    }
    // Inverse of strategy2str; any other string fails the VERIFY.
    static Strategy str2strategy(const std::string& str) {
      VERIFY(str == "exact" or str == "approximate" or str == "approximate_canon" or str == "exact_canon");
      if (str == "exact")
        return Strategy::exact;
      if (str == "approximate")
        return Strategy::approximate;
      if (str == "approximate_canon")
        return Strategy::approximate_canon;
      return Strategy::exact_canon;
    }

    // Read from the un-suffixed keys (false_positive_probability, exp_base,
    // nhash, chunk_size).
    struct ApproximateKmerIndexerParams {
      double false_positive_probability;
      double exp_base;
      int nhash;
      size_t chunk_size;
    };
    ApproximateKmerIndexerParams approximate_kmer_indexer_params;

    // Read from the same keys with the "_canon" suffix.
    struct ApproximateCanonKmerIndexerParams {
      double false_positive_probability;
      double exp_base;
      int nhash;
      size_t chunk_size;
    };
    ApproximateCanonKmerIndexerParams approximate_canon_kmer_indexer_params;

    // Read from the same keys with the "_canon_single_thread" suffix.
    struct ApproximateCanonSingleThreadKmerIndexerParams {
      double false_positive_probability;
      double exp_base;
      int nhash;
    };
    ApproximateCanonSingleThreadKmerIndexerParams approximate_canon_single_thread_kmer_indexer_params;

    double careful_upper_bnd_cov_mult;
  };
  KmerIndexerParams kmer_indexer_params;

  struct ChainingParams {
    using match_pos_type = int64_t;
    using score_type = double;
    size_t min_matches;
    score_type min_score;
    double max_top_score_prop;
    match_pos_type max_jump;
    score_type misassembly_penalty;// read from config key "misassembly_penalty_base"
    score_type diff_penalty_mult;
    size_t min_chain_range;
    size_t max_supp_dist_diff;
    //  static_assert(min_chain_range / KmerIndexerParams::k_step_size >= min_matches);
    int min_uniq_kmers;

    score_type match_score_unique;
    score_type match_score_dup;
    score_type match_score_rare;
  };
  ChainingParams chaining_params;

  struct Chain2SAMParams {
    double min_end_ident;
    // Each value is parsed as an int and narrowed to int8_t by the loader.
    struct KSW2Params {
      int8_t match_score;
      int8_t mis_score;
      int8_t gapo;
      int8_t gape;
    };
    KSW2Params ksw2_params;
  };
  Chain2SAMParams chain2sam_params;

  // Defined in config.cpp.
  static Config load_config_file(const std::filesystem::path& config_fn);
};
score_type match_score_rare; 105 | }; 106 | ChainingParams chaining_params; 107 | 108 | struct Chain2SAMParams { 109 | double min_end_ident; 110 | struct KSW2Params { 111 | int8_t match_score; 112 | int8_t mis_score; 113 | int8_t gapo; 114 | int8_t gape; 115 | }; 116 | KSW2Params ksw2_params; 117 | }; 118 | Chain2SAMParams chain2sam_params; 119 | 120 | static Config load_config_file(const std::filesystem::path& config_fn); 121 | }; 122 | 123 | }// namespace veritymap -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/dp_scoring.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 2/22/21. 3 | // 4 | 5 | #pragma once 6 | 7 | #include "config/config.hpp" 8 | 9 | namespace veritymap::scoring { 10 | 11 | using ScoresBacktracks = std::pair, std::vector>; 12 | 13 | /* Scores k-mer matches by dynamic programming: every match gets the best score reachable through an earlier match, plus the index of that predecessor. The parameter structs are held by reference, so they must outlive the scorer. */ class DPScorer { 14 | const Config::CommonParams &common_params; 15 | const Config::ChainingParams &chaining_params; 16 | std::vector diff_pens; 17 | 18 | public: 19 | DPScorer(const DPScorer &) = delete; 20 | DPScorer(DPScorer &&) = delete; 21 | DPScorer &operator=(const DPScorer &) = delete; 22 | DPScorer &operator=(DPScorer &&) = delete; 23 | 24 | /* Precomputes diff_pens, the table GetScores indexes by dist_diff: max_supp_dist_diff zeros followed by i^(1/4) - 1 for i in [1, max_jump). NOTE(review): both loops compare a counter against a member of different signedness (int64_t vs size_t, uint64_t vs int64_t) - confirm the parameters are never negative. */ DPScorer(const Config::CommonParams &common_params, const Config::ChainingParams &chaining_params) 25 | : common_params{common_params}, 26 | chaining_params{chaining_params} { 27 | for (int64_t i = 0; i < chaining_params.max_supp_dist_diff; ++i) { diff_pens.emplace_back(0); } 28 | for (uint64_t i = 1; i < chaining_params.max_jump; ++i) { 29 | diff_pens.emplace_back(std::sqrt(std::sqrt((double) i)) - 1); 30 | } 31 | } 32 | 33 | /* Returns {scores, backtracks} with one entry per match, in iteration order. backtracks[i] is the index of the predecessor chosen for match i, or def_backtrack when the match keeps only its own weight. */ [[nodiscard]] ScoresBacktracks GetScores(const matches::Matches &matches) const { 34 | using score_type = Config::ChainingParams::score_type; 35 | using match_pos_type = Config::ChainingParams::match_pos_type; 36 | 37 | std::vector scores; 38 | std::vector backtracks; 39 | 40 | size_t def_backtrack =
std::numeric_limits::max(); 41 | 42 | for (auto it = matches.cbegin(); it != matches.cend(); ++it) { 43 | const matches::Match &match{*it}; 44 | 45 | /* weight of this match by frequency class: unique, dup, or the rare score for anything else */ const score_type freq_weight = match.is_unique() ? chaining_params.match_score_unique 46 | : match.is_dup() ? chaining_params.match_score_dup 47 | : chaining_params.match_score_rare; 48 | 49 | score_type score{freq_weight}; 50 | size_t backtrack{def_backtrack}; 51 | VERIFY(scores.size() == it - matches.cbegin()); 52 | 53 | /* scan earlier matches from nearest to farthest, in lockstep with their already computed scores */ for (auto [it2, sc_it] = std::pair{std::make_reverse_iterator(it), scores.crbegin()}; 54 | (it2 != matches.crend()) and (sc_it != scores.crend()); ++it2, ++sc_it) { 55 | 56 | const matches::Match &prev_match{*it2}; 57 | 58 | const match_pos_type target_jump = match.target_pos - prev_match.target_pos - common_params.k; 59 | /* the scan stops at the first predecessor at least max_jump away on the target - presumably matches are ordered by target_pos; confirm against the Matches producer */ if (target_jump >= chaining_params.max_jump) { 60 | break; 61 | } 62 | 63 | const match_pos_type query_jump = match.query_pos - prev_match.query_pos - common_params.k; 64 | 65 | if ((std::min(query_jump, target_jump) == -common_params.k) or// no jump on either query or target 66 | ((std::min(query_jump, target_jump) < 0) and (query_jump != target_jump)) or// non-equal overlap 67 | (query_jump >= chaining_params.max_jump)) { // excessive jump on query 68 | continue; 69 | } 70 | 71 | /* NOTE(review): jump_penalty is computed here but not read anywhere below */ const match_pos_type jump_penalty = std::min(std::abs(query_jump), std::abs(target_jump)); 72 | const match_pos_type dist_diff = std::abs(std::abs(query_jump) - std::abs(target_jump)); 73 | 74 | const score_type diff_penalty = diff_pens[dist_diff]; 75 | 76 | /* a multiplier capped at 1 that scales freq_weight: (query_jump + k) / k */ const score_type overlap_penalty = 77 | std::min(1, static_cast(query_jump + common_params.k) / common_params.k); 78 | 79 | if (overlap_penalty < 1) { 80 | VERIFY(diff_penalty == 0); 81 | } 82 | /* the distance-difference penalty is capped at misassembly_penalty */ const score_type cur_score = *sc_it + freq_weight * overlap_penalty 83 | - std::min(chaining_params.diff_penalty_mult * diff_penalty, chaining_params.misassembly_penalty); 84 | 85 | if (cur_score > score) { 86 | score = cur_score; 87 | VERIFY(scores.size() -
1 >= sc_it - scores.rbegin()); 88 | /* convert the reverse-iterator offset into a forward index into scores */ backtrack = scores.size() - 1 - (sc_it - scores.rbegin()); 89 | VERIFY(backtrack < scores.size()); 90 | } 91 | } 92 | VERIFY(score >= freq_weight); 93 | scores.emplace_back(score); 94 | if (score == freq_weight) { 95 | VERIFY(backtrack == def_backtrack); 96 | } 97 | backtracks.emplace_back(backtrack); 98 | } 99 | VERIFY(scores.size() == matches.size()); 100 | 101 | return {scores, backtracks}; 102 | } 103 | }; 104 | 105 | }// End namespace veritymap::scoring -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/hash_utils.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 2/20/21. 3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | /* Streams an unsigned 128-bit value in decimal: digits are collected least-significant first, then written in reverse. NOTE(review): for val == 0 the while loop never runs, so nothing is written - confirm whether "0" is expected. */ inline std::ostream &operator<<(std::ostream &os, unsigned __int128 val) { 11 | std::vector res; 12 | while (val != 0) { 13 | res.push_back(val % 10); 14 | val /= 10; 15 | } 16 | for (auto it = res.rbegin(); it != res.rend(); ++it) { os << *it; } 17 | return os; 18 | } 19 | 20 | /* Reads one whitespace-delimited token and accumulates its characters as decimal digits. NOTE(review): characters are not checked to be digits and overflow is not detected. */ inline std::istream &operator>>(std::istream &is, unsigned __int128 &val) { 21 | val = 0; 22 | std::string s; 23 | is >> s; 24 | for (char &it : s) { 25 | val *= 10; 26 | val += it - '0'; 27 | } 28 | return is; 29 | } 30 | -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/filter_rep_kmers.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 10/18/21.
3 | // 4 | 5 | #pragma once 6 | 7 | #include "../rolling_hash.hpp" 8 | #include "bloom/bloom.hpp" 9 | 10 | namespace veritymap::kmer_index::filter_rep_kmers { 11 | 12 | /* Returns a Bloom filter holding the forward hashes (get_fhash) of k-mers seen at least twice in sequence, up to Bloom false positives. Both internal filters are sized for sequence.size() elements at the requested false positive probability. A sequence shorter than hasher.k yields a default-constructed filter. */ template 13 | BloomFilter get_bloom_rep_kmers(const Sequence& sequence, const RollingHash& hasher, 14 | const double false_positive_probability) { 15 | if (sequence.size() < hasher.k) { 16 | return {}; 17 | } 18 | BloomParameters bloom_params; 19 | bloom_params.projected_element_count = sequence.size(); 20 | bloom_params.false_positive_probability = false_positive_probability; 21 | bloom_params.compute_optimal_parameters(); 22 | 23 | BloomFilter once_filter{bloom_params}; 24 | BloomFilter twice_filter{bloom_params}; 25 | 26 | KWH kwh(hasher, sequence, 0); 27 | while (true) { 28 | const htype hash = kwh.get_fhash(); 29 | /* a second sighting (or a false positive of once_filter) promotes the hash to twice_filter */ if (once_filter.contains(hash)) { 30 | twice_filter.insert(hash); 31 | } else { 32 | once_filter.insert(hash); 33 | } 34 | if (not kwh.hasNext()) { 35 | break; 36 | } 37 | kwh = kwh.next(); 38 | } 39 | return twice_filter; 40 | } 41 | 42 | }// End namespace veritymap::kmer_index::filter_rep_kmers -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/index_builders/approx_canon_kmer_indexer_single_thread.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/02/22.
3 | // 4 | 5 | #pragma once 6 | 7 | #include "kmer_index_builder.hpp" 8 | #include "kmer_window.hpp" 9 | 10 | namespace veritymap::kmer_index_builder::approx_canon { 11 | 12 | /* Single-threaded index builder (it passes nthreads = 1 to the base class) that estimates k-mer counts with a sketch::cm::ccm_t sketch and indexes window minimizers whose estimated count is at most max_rare_cnt_target. */ class ApproxCanonKmerIndexer : public AbstractKmerIndexBuilder { 13 | public: 14 | ApproxCanonKmerIndexer(const RollingHash &hasher, 15 | const Config::CommonParams &common_params, 16 | const Config::KmerIndexerParams &kmer_indexer_params, logging::Logger &logger) 17 | : AbstractKmerIndexBuilder{/*nthreads=*/1, hasher, common_params, kmer_indexer_params, logger} {} 18 | 19 | /* Builds the index in two passes over contigs: first fill the sketch, then slide a minimizer window over each contig. */ [[nodiscard]] kmer_index::KmerIndex Build(const std::vector &contigs) const override { 20 | int64_t tot_len{0}; 21 | for (const Contig &contig : contigs) { tot_len += contig.size(); } 22 | const cms_utils::CMSParams kCmsParams(common_params, kmer_indexer_params, tot_len, 1); 23 | sketch::cm::ccm_t cms(kCmsParams.nbits, kCmsParams.l2sz, kCmsParams.nhash); 24 | 25 | /* pass 1: add every kwh.hash() to the sketch, but stop adding a hash once its estimate exceeds max_rare_cnt_target */ for (const Contig &contig : contigs) { 26 | if (contig.size() < hasher.k) { 27 | continue; 28 | } 29 | KWH kwh(hasher, contig.seq, 0); 30 | while (true) { 31 | Config::HashParams::htype hash = kwh.hash(); 32 | if (cms.est_count(hash) <= kmer_indexer_params.max_rare_cnt_target) { 33 | cms.add(kwh.hash()); 34 | } 35 | 36 | if (!kwh.hasNext()) { 37 | break; 38 | } 39 | kwh = kwh.next(); 40 | } 41 | } 42 | 43 | kmer_index::KmerIndex::KmerCounter counter; 44 | kmer_index::KmerIndex::Kmer2Pos kmer2pos; 45 | /* pass 2: every contig gets an entry in kmer2pos; contigs shorter than k + k_step_size keep it empty */ for (const Contig &contig : contigs) { 46 | auto &k2p{kmer2pos.emplace_back()}; 47 | if (contig.size() < hasher.k + kmer_indexer_params.k_step_size) { 48 | continue; 49 | } 50 | KWH kwh(hasher, contig.seq, 0); 51 | int64_t latest_pos{0}; 52 | /* the window constructor takes kwh by reference and advances it past the first k_step_size k-mers */ for (kmer_index::kmer_window::KmerMinimizerWindow window(kwh, kmer_indexer_params.k_step_size, cms); 53 | kwh.hasNext(); 54 | kwh = kwh.next(), window.Add(cms.est_count(kwh.hash()), kwh.hash(), kwh.get_fhash(), kwh.pos)) { 55 | const int64_t cur_pos = window.GetMinimizerPos(); 56 | if (cur_pos != latest_pos) { 57 | auto [freq,
hash, fhash] = window.GetMinimizer(); 58 | if (freq <= kmer_indexer_params.max_rare_cnt_target) { 59 | counter[fhash] = cms.est_count(hash); 60 | /* NOTE(review): this stores kwh.pos (the k-mer most recently fed to the window), not cur_pos (the minimizer position) - confirm this is intended */ k2p[fhash].emplace_back(kwh.pos); 61 | latest_pos = cur_pos; 62 | } 63 | } 64 | } 65 | } 66 | 67 | return {kmer2pos, counter, contigs}; 68 | } 69 | }; 70 | 71 | }// namespace veritymap::kmer_index_builder::approx_canon -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/index_builders/exact_canon_kmer_indexer.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/02/22. 3 | // 4 | 5 | #pragma once 6 | 7 | #include "kmer_index_builder.hpp" 8 | #include "kmer_window.hpp" 9 | 10 | namespace veritymap::kmer_index_builder::exact_canon { 11 | 12 | /* Single-threaded index builder that counts every kwh.hash() exactly in a KmerCounter (full_counter) and indexes window minimizers whose count is at most max_rare_cnt_target. */ class ExactCanonKmerIndexer : public AbstractKmerIndexBuilder { 13 | public: 14 | ExactCanonKmerIndexer(const RollingHash &hasher, const Config::CommonParams &common_params, 15 | const Config::KmerIndexerParams &kmer_indexer_params, logging::Logger &logger) 16 | : AbstractKmerIndexBuilder{/*nthreads=*/1, hasher, common_params, kmer_indexer_params, logger} {} 17 | 18 | [[nodiscard]] kmer_index::KmerIndex Build(const std::vector &contigs) const override { 19 | kmer_index::KmerIndex::KmerCounter full_counter, counter; 20 | 21 | /* pass 1: exact occurrence count of every k-mer hash over all contigs */ for (const Contig &contig : contigs) { 22 | if (contig.size() < hasher.k) { 23 | continue; 24 | } 25 | KWH kwh(hasher, contig.seq, 0); 26 | while (true) { 27 | full_counter[kwh.hash()] += 1; 28 | if (!kwh.hasNext()) { 29 | break; 30 | } 31 | kwh = kwh.next(); 32 | } 33 | } 34 | 35 | kmer_index::KmerIndex::Kmer2Pos kmer2pos; 36 | /* pass 2: every contig gets an entry in kmer2pos; contigs shorter than k + k_step_size keep it empty */ for (const Contig &contig : contigs) { 37 | auto &k2p{kmer2pos.emplace_back()}; 38 | if (contig.size() < hasher.k + kmer_indexer_params.k_step_size) { 39 | continue; 40 | } 41 | KWH kwh(hasher, contig.seq, 0); 42 | int64_t latest_pos{0}; 43 | for
(kmer_index::kmer_window::KmerMinimizerWindow window(kwh, kmer_indexer_params.k_step_size, full_counter); 44 | kwh.hasNext(); 45 | kwh = kwh.next(), window.Add(full_counter.at(kwh.hash()), kwh.hash(), kwh.get_fhash(), kwh.pos)) { 46 | const int64_t cur_pos = window.GetMinimizerPos(); 47 | if (cur_pos != latest_pos) { 48 | auto [freq, hash, fhash] = window.GetMinimizer(); 49 | if (freq <= kmer_indexer_params.max_rare_cnt_target) { 50 | counter[fhash] = full_counter.at(hash); 51 | /* NOTE(review): this stores kwh.pos (the k-mer most recently fed to the window), not cur_pos (the minimizer position) - confirm this is intended */ k2p[fhash].emplace_back(kwh.pos); 52 | latest_pos = cur_pos; 53 | } 54 | } 55 | } 56 | } 57 | 58 | return {kmer2pos, counter, contigs}; 59 | } 60 | }; 61 | 62 | }// namespace veritymap::kmer_index_builder::exact_canon -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/index_builders/exact_kmer_index_builder.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/14/22.
3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include "../../rolling_hash.hpp" 10 | #include "kmer_index_builder.hpp" 11 | 12 | namespace veritymap::kmer_index_builder::exact { 13 | 14 | /* Single-threaded index builder based on exact per-contig counts of forward k-mer hashes (get_fhash). */ class ExactKmerIndexBuilder : public AbstractKmerIndexBuilder { 15 | /* Returns one counter per contig, in contig order: occurrences of each forward hash. Contigs shorter than hasher.k get an empty counter. */ [[nodiscard]] std::vector GetCounters(const std::vector &ctgs) const { 16 | std::vector counters; 17 | for (const auto &ctg : ctgs) { 18 | std::unordered_map &counter{counters.emplace_back()}; 19 | if (ctg.size() < hasher.k) { 20 | continue; 21 | } 22 | KWH kwh(hasher, ctg.seq, 0); 23 | while (true) { 24 | counter[kwh.get_fhash()] += 1; 25 | if (!kwh.hasNext()) { 26 | break; 27 | } 28 | kwh = kwh.next(); 29 | } 30 | } 31 | return counters; 32 | } 33 | 34 | /* Maps each forward hash to the set of contig indices whose counter contains it. */ [[nodiscard]] std::unordered_map> Hash2Seqs( 35 | const std::vector &counters) const { 36 | std::unordered_map> hash2seqs; 37 | for (auto it = counters.cbegin(); it != counters.cend(); ++it) { 38 | for (const auto &[hash, cnt] : *it) { hash2seqs[hash].insert(it - counters.cbegin()); } 39 | } 40 | return hash2seqs; 41 | } 42 | 43 | /* Records a k-mer position when three conditions hold: its forward hash occurs in exactly one contig, the value of get_rhash() (presumably the reverse-complement hash - confirm) is not a forward hash of any contig, and its count in this contig is at most max_rare_cnt_target. kmer_counter counts the recorded positions per hash. */ [[nodiscard]] kmer_index::KmerIndex Counter2Index( 44 | const std::vector &ctgs, 45 | const std::unordered_map> &hash2seqs, 46 | const std::vector &counters) const { 47 | kmer_index::KmerIndex::KmerCounter kmer_counter; 48 | kmer_index::KmerIndex::Kmer2Pos kmer2pos; 49 | for (auto it = ctgs.cbegin(); it != ctgs.cend(); ++it) { 50 | auto &k2p{kmer2pos.emplace_back()}; 51 | const Contig &ctg = *it; 52 | const auto &counter = counters.at(it - ctgs.cbegin()); 53 | if (ctg.size() < hasher.k) { 54 | continue; 55 | } 56 | KWH kwh(hasher, ctg.seq, 0); 57 | while (true) { 58 | Config::HashParams::htype fhash{kwh.get_fhash()}; 59 | Config::HashParams::htype rhash{kwh.get_rhash()}; 60 | if ((hash2seqs.at(fhash).size() == 1) and (not hash2seqs.contains(rhash)) 61 | and (counter.at(fhash) <= kmer_indexer_params.max_rare_cnt_target)) { 62 | k2p[fhash].emplace_back(kwh.pos); 63 | ++kmer_counter[fhash]; 64 | } 65 | if (!kwh.hasNext()) { 66 |
break; 67 | } 68 | kwh = kwh.next(); 69 | } 70 | } 71 | return {kmer2pos, kmer_counter, ctgs}; 72 | } 73 | 74 | public: 75 | ExactKmerIndexBuilder(const RollingHash &hasher, const Config::CommonParams &common_params, 76 | const Config::KmerIndexerParams &kmer_indexer_params, logging::Logger &logger) 77 | : AbstractKmerIndexBuilder{/*nthreads=*/1, hasher, common_params, kmer_indexer_params, logger} {} 78 | 79 | /* Runs the three steps in order: per-contig counters, hash-to-contigs map, then the index itself. */ [[nodiscard]] kmer_index::KmerIndex Build(const std::vector &contigs) const override { 80 | std::vector counters = GetCounters(contigs); 81 | std::unordered_map> hash2seqs = Hash2Seqs(counters); 82 | return Counter2Index(contigs, hash2seqs, counters); 83 | } 84 | }; 85 | 86 | }// End namespace veritymap::kmer_index_builder::exact 87 | -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/index_builders/kmer_index_builder.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/14/22.
3 | // 4 | 5 | #pragma once 6 | 7 | #include "../../config/config.hpp" 8 | #include "../../rolling_hash.hpp" 9 | #include "../kmer_index.hpp" 10 | 11 | namespace veritymap::kmer_index_builder { 12 | 13 | /* Base class of the k-mer index builders; subclasses implement Build. The parameter structs are stored as copies, while hasher and logger are stored as references and must outlive the builder. Copying and moving are disabled. NOTE(review): the class has a virtual function but declares no virtual destructor - confirm builders are never deleted through a base pointer. */ class AbstractKmerIndexBuilder { 14 | protected: 15 | int64_t nthreads{1}; 16 | const RollingHash &hasher; 17 | Config::CommonParams common_params; 18 | Config::KmerIndexerParams kmer_indexer_params; 19 | logging::Logger &logger; 20 | 21 | public: 22 | AbstractKmerIndexBuilder(const int64_t nthreads, const RollingHash &hasher, 23 | const Config::CommonParams &common_params, 24 | const Config::KmerIndexerParams &kmer_indexer_params, logging::Logger &logger) 25 | : nthreads{nthreads}, 26 | hasher{hasher}, 27 | common_params{common_params}, 28 | kmer_indexer_params{kmer_indexer_params}, 29 | logger{logger} {} 30 | 31 | AbstractKmerIndexBuilder(const AbstractKmerIndexBuilder &) = delete; 32 | AbstractKmerIndexBuilder(AbstractKmerIndexBuilder &&) = delete; 33 | AbstractKmerIndexBuilder &operator=(const AbstractKmerIndexBuilder &) = delete; 34 | AbstractKmerIndexBuilder &operator=(AbstractKmerIndexBuilder &&) = delete; 35 | 36 | /* Builds a KmerIndex for the given contigs. */ [[nodiscard]] virtual kmer_index::KmerIndex Build(const std::vector &contigs) const = 0; 37 | }; 38 | 39 | };// End namespace veritymap::kmer_index_builder 40 | -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/index_builders/kmer_window.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 10/19/21.
3 | // 4 | 5 | #pragma once 6 | 7 | #include "../../cms_utils.hpp" 8 | #include "sketch/ccm.h" 9 | 10 | namespace veritymap::kmer_index::kmer_window { 11 | /* Window over a list of (pos, hash, is_regular) tuples that tracks in tot_regular how many entries within half_length of the current entry are flagged regular. Presumably the list is sorted by pos - confirm with the caller. The list is held by reference. */ template 12 | class KmerWindow { 13 | int64_t half_length{0}; 14 | int64_t tot_regular{0}; 15 | const std::vector> &pos_hash_regular; 16 | typename std::vector>::const_iterator next_it; 17 | typename std::vector>::const_iterator cur_it; 18 | std::deque> deque; 19 | 20 | /* Pulls in entries to the right until one lies more than half_length past the current position. */ void IncRight() { 21 | for (const size_t pos = std::get<0>(*cur_it); next_it != pos_hash_regular.cend(); ++next_it) { 22 | auto [next_pos, _, is_next_regular] = *next_it; 23 | if (next_pos - pos > half_length) { 24 | break; 25 | } 26 | deque.emplace_back(next_pos, is_next_regular); 27 | tot_regular += is_next_regular; 28 | /* NOTE(review): next_pos is a per-iteration copy made by the structured binding, so this increment has no effect */ ++next_pos; 29 | } 30 | } 31 | 32 | /* Drops entries from the left that lie more than half_length before the current position. */ void IncLeft() { 33 | const size_t pos = std::get<0>(*cur_it); 34 | while (not deque.empty()) { 35 | const auto [pos_front, is_regular_front]{deque.front()}; 36 | if (pos - pos_front <= half_length) { 37 | break; 38 | } 39 | deque.pop_front(); 40 | tot_regular -= is_regular_front; 41 | } 42 | } 43 | 44 | public: 45 | KmerWindow(const size_t length, const std::vector> &pos_hash_regular) 46 | : half_length{(int64_t) length / 2}, 47 | pos_hash_regular{pos_hash_regular}, 48 | next_it{pos_hash_regular.cbegin()}, 49 | cur_it{pos_hash_regular.cbegin()} { 50 | VERIFY(length >= 1); 51 | } 52 | 53 | /* NOTE(review): half_length is length / 2, so length == 1 passes the constructor check but makes the divisor here 0 */ [[nodiscard]] double RegularFrac() const { return tot_regular / double(half_length * 2); } 54 | 55 | void Inc() { 56 | ++cur_it; 57 | IncLeft(); 58 | IncRight(); 59 | } 60 | 61 | /* NOTE(review): rewinds both iterators but does not clear deque or tot_regular - confirm Reset is only called on a fresh window */ void Reset() { 62 | next_it = pos_hash_regular.cbegin(); 63 | cur_it = pos_hash_regular.cbegin(); 64 | IncRight(); 65 | } 66 | }; 67 | 68 | /* Deque-based sliding-window minimum. PushBack removes trailing elements e for which value_compare(next_el, e) holds, so the front is the current minimum. PopFront advances the window start (first) and removes the front when it sits at the departing index. GetMin, GetMinIndex and GetMinPair read deque.front() without checking for emptiness. */ template> 69 | class MinQueue { 70 | // elements in vector with deque indixes are non-increasing 71 | std::deque> deque; 72 | Compare value_compare; 73 | int64_t first{0}; 74 | 75 | public: 76 | void PushBack(const int64_t next_pos, const T &next_el) { 77 | while (not deque.empty() and
value_compare(next_el, deque.back().second)) { deque.pop_back(); } 78 | deque.emplace_back(next_pos, next_el); 79 | } 80 | 81 | [[nodiscard]] int64_t GetMinIndex() const { return deque.front().first; } 82 | [[nodiscard]] T GetMin() const { return deque.front().second; } 83 | [[nodiscard]] std::pair GetMinPair() const { return deque.front(); } 84 | 85 | void PopFront() { 86 | if (not deque.empty() and first == GetMinIndex()) { 87 | deque.pop_front(); 88 | } 89 | ++first; 90 | } 91 | }; 92 | 93 | /* Sliding-window minimizer over k-mers, built on MinQueue. Elements are ordered by freq first and by hash on ties (FreqHash::operator<). */ class KmerMinimizerWindow { 94 | public: 95 | struct FreqHash { 96 | int64_t freq{0}; 97 | Config::HashParams::htype hash{0}; 98 | Config::HashParams::htype fhash{0}; 99 | 100 | // bool operator==(const FreqHashPos &rhs) const { pos == rhs.pos; } 101 | // bool operator!=(const FreqHashPos &rhs) { return not operator==(rhs); } 102 | bool operator<(const FreqHash &rhs) const { return freq < rhs.freq or (freq == rhs.freq and hash < rhs.hash); } 103 | bool operator>(const FreqHash &rhs) const { return rhs.operator<(*this); } 104 | }; 105 | 106 | private: 107 | MinQueue> queue; 108 | 109 | /* Reads window_size k-mers from kwh with exact counts. kwh is taken by reference, so the caller's iterator is advanced. The VERIFY runs before each advance, so it requires a further k-mer after every one read. */ static std::vector GetInitWindow(KWH &kwh, const int64_t window_size, 110 | const kmer_index::KmerIndex::KmerCounter &counter) { 111 | std::vector init_window; 112 | for (int i = 0; i < window_size; ++i, kwh = kwh.next()) { 113 | init_window.push_back({counter.at(kwh.hash()), kwh.hash(), kwh.get_fhash()}); 114 | VERIFY(kwh.hasNext()); 115 | } 116 | return init_window; 117 | } 118 | 119 | /* Same as the overload above, but counts are sketch estimates (cms.est_count). */ static std::vector GetInitWindow(KWH &kwh, const int64_t window_size, 120 | const sketch::cm::ccm_t &cms) { 121 | std::vector init_window; 122 | for (int i = 0; i < window_size; ++i, kwh = kwh.next()) { 123 | init_window.push_back({static_cast(cms.est_count(kwh.hash())), kwh.hash(), kwh.get_fhash()}); 124 | VERIFY(kwh.hasNext()); 125 | } 126 | return init_window; 127 | } 128 | 129 | /* NOTE(review): freq is int32_t here and in Add, while FreqHash::freq is int64_t - callers that pass larger counts are narrowed */ void PushBack(const int32_t freq, const Config::HashParams::htype hash, const Config::HashParams::htype fhash, 130 | const int64_t
pos) { 131 | queue.PushBack(pos, {freq, hash, fhash}); 132 | } 133 | void PopFront() { queue.PopFront(); } 134 | 135 | public: 136 | /* Seeds the queue with init_window; element i is given position i. */ explicit KmerMinimizerWindow(const std::vector &init_window) { 137 | for (auto it = init_window.cbegin(); it != init_window.cend(); ++it) { 138 | PushBack(it->freq, it->hash, it->fhash, it - init_window.cbegin()); 139 | } 140 | } 141 | 142 | KmerMinimizerWindow(KWH &kwh, const int64_t window_size, 143 | const kmer_index::KmerIndex::KmerCounter &counter) 144 | : KmerMinimizerWindow(GetInitWindow(kwh, window_size, counter)) {} 145 | 146 | KmerMinimizerWindow(KWH &kwh, const int64_t window_size, const sketch::cm::ccm_t &cms) 147 | : KmerMinimizerWindow(GetInitWindow(kwh, window_size, cms)) {} 148 | 149 | /* Slides the window by one: pushes the new k-mer, then retires the oldest position. */ void Add(const int32_t freq, const Config::HashParams::htype hash, const Config::HashParams::htype fhash, 150 | const int64_t pos) { 151 | PushBack(freq, hash, fhash, pos); 152 | PopFront(); 153 | } 154 | 155 | [[nodiscard]] FreqHash GetMinimizer() const { return queue.GetMin(); } 156 | [[nodiscard]] int64_t GetMinimizerPos() const { return queue.GetMinIndex(); } 157 | }; 158 | }// End namespace veritymap::kmer_index::kmer_window -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/indexed_contigs.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/15/22.
3 | // 4 | 5 | #pragma once 6 | 7 | #include "kmer_index.hpp" 8 | 9 | namespace veritymap::indexed_contigs { 10 | 11 | /* Bundles a contig list with its k-mer index. contigs and hasher are held by reference and must outlive this object; the index is owned (moved in by the constructor). */ class IndexedContigs { 12 | const std::vector& contigs; 13 | const RollingHash& hasher; 14 | kmer_index::KmerIndex index; 15 | 16 | /* Gathers all indexed positions of contig i, sorts them, and reports each gap longer than max_dist between the end of one indexed k-mer (p + k) and the start of the next. NOTE(review): the local contig is not used, and the stretch after the last indexed k-mer is never reported - confirm both are intended. */ std::vector> GetNoSolidRegionsPerContig(const int i, const int max_dist, 17 | const int64_t k) const { 18 | const Contig& contig = contigs.at(i); 19 | std::vector pos; 20 | for (const auto& [hash, kmer_pos] : index[i]) { 21 | for (const int64_t p : kmer_pos) { pos.emplace_back(p); } 22 | } 23 | std::sort(pos.begin(), pos.end()); 24 | 25 | int64_t prev_p{k}; 26 | std::vector> norare_regions; 27 | for (const auto& p : pos) { 28 | if (p <= prev_p) { 29 | continue; 30 | } 31 | if (p - prev_p > max_dist) { 32 | norare_regions.emplace_back(prev_p, p); 33 | } 34 | prev_p = p + k; 35 | } 36 | return norare_regions; 37 | } 38 | 39 | public: 40 | IndexedContigs(const std::vector& contigs, const RollingHash& hasher, 41 | kmer_index::KmerIndex index) 42 | : contigs{contigs}, 43 | hasher{hasher}, 44 | index{std::move(index)} {} 45 | 46 | /* Logs the value of index.NSolidKmers() for each contig. */ void Summary(logging::Logger& logger) const { 47 | std::vector n_solid_kmers = index.NSolidKmers(); 48 | for (auto it = contigs.cbegin(); it != contigs.cend(); ++it) { 49 | logger.info() << "Sequence " << it->id << ", # Solid kmers = " << n_solid_kmers[it - contigs.cbegin()] 50 | << std::endl; 51 | } 52 | } 53 | 54 | /* Writes one tab-separated line per region: contig id, start, end, length. NOTE(review): max_dist is int64_t here but narrowed to int by GetNoSolidRegionsPerContig. */ std::ostream& NoSolidRegions2Bed(const int64_t max_dist, const int64_t k, std::ostream& os) const { 55 | for (int64_t i = 0; i < contigs.size(); ++i) { 56 | const std::vector> norare_regions = GetNoSolidRegionsPerContig(i, max_dist, k); 57 | const Contig& contig = contigs[i]; 58 | for (const auto [s, e] : norare_regions) { 59 | os << contig.id << "\t" << s << "\t" << e << "\t" << e - s << " bp\n"; 60 | } 61 | } 62 | return os; 63 | } 64 | 65 | [[nodiscard]] const std::vector& Contigs() const { return contigs; } 66 | [[nodiscard]] int64_t Size() const { return contigs.size(); } 67 | 68
| [[nodiscard]] const kmer_index::KmerIndex& Index() const { return index; } 69 | }; 70 | 71 | }// namespace veritymap::indexed_contigs -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/kmer_index/kmer_filter_canon.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 06/02/22. 3 | // 4 | 5 | #pragma once 6 | 7 | #include "../cms_utils.hpp" 8 | #include "sketch/ccm.h" 9 | 10 | namespace veritymap::kmer_index::kmer_filter_canon { 11 | 12 | enum class KmerTypeCanon { unique, duplicate, rare, frequent }; 13 | 14 | /* A k-mer is regular when it is unique, or when it is a duplicate and diploid is set. NOTE(review): this is a non-inline free function defined in a header, which gives a multiple-definition link error if the header is included from more than one translation unit - confirm it is included only once. */ bool IsKmerTypeCanonRegular(const KmerTypeCanon kmer_type, const bool diploid = false) { 15 | return kmer_type == KmerTypeCanon::unique or (diploid and kmer_type == KmerTypeCanon::duplicate); 16 | } 17 | 18 | /* Holds the count sketches filled by KmerFilterCanonBuilder: cmss[ctg_ind][i] is sketch number i of contig ctg_ind. */ class KmerFilterCanon { 19 | std::vector> cmss; 20 | 21 | template 22 | friend class KmerFilterCanonBuilder; 23 | 24 | public: 25 | /* Classifies a hash by its estimated count in sketch i of the contig: more than max_rare_cnt is frequent, 1 is unique, 2 is duplicate, anything else (including an estimate of 0) is rare. */ template 26 | [[nodiscard]] KmerTypeCanon GetKmerType(const size_t ctg_ind, const htype canhash, const size_t i, 27 | const size_t max_rare_cnt) const { 28 | const sketch::cm::ccm_t &cms = cmss[ctg_ind][i]; 29 | const size_t fcnt{cms.est_count(canhash)}; 30 | if (fcnt > max_rare_cnt) 31 | return KmerTypeCanon::frequent; 32 | if (fcnt == 1) 33 | return KmerTypeCanon::unique; 34 | if (fcnt == 2) 35 | return KmerTypeCanon::duplicate; 36 | return KmerTypeCanon::rare; 37 | } 38 | }; 39 | 40 | /* Fills a KmerFilterCanon with nthreads sketches per contig. hasher is held by reference; the parameter structs are copies. */ template 41 | class KmerFilterCanonBuilder { 42 | size_t nthreads{0}; 43 | const RollingHash &hasher; 44 | Config::CommonParams common_params; 45 | Config::KmerIndexerParams kmer_indexer_params; 46 | 47 | /* Appends nthreads fresh sketches for the contig, then walks it in chunks of chunk_size k-mers. Each hash goes to bucket hash % nthreads, and one thread per bucket adds its hashes to its own sketch, so no sketch is written by two threads. NOTE(review): nthreads == 0 would make the modulo undefined - confirm callers always pass at least 1. */ void AddContigToFilter(KmerFilterCanon &kmer_filter, const Contig &contig, logging::Logger &logger) const { 48 | const cms_utils::CMSParams kCmsParams(common_params, kmer_indexer_params, contig.size(), nthreads); 49 | std::vector cms; 50 | for (size_t i = 0; i < nthreads; ++i) {
cms.emplace_back(kCmsParams.nbits, kCmsParams.l2sz, kCmsParams.nhash); } 51 | kmer_filter.cmss.emplace_back(std::move(cms)); 52 | 53 | if (contig.size() < common_params.k) { 54 | return; 55 | } 56 | 57 | /* per-bucket hash buffers are reused across chunks; sizes[i] is the number of valid entries in hashes[i] for the current chunk */ std::vector> hashes(nthreads); 58 | std::vector sizes(nthreads, 0); 59 | 60 | auto process_chunk = [&kmer_filter, &sizes, &hashes](const size_t i) { 61 | sketch::cm::ccm_t &sketch = kmer_filter.cmss.back()[i]; 62 | const std::vector &hashes_th = hashes[i]; 63 | const size_t size = sizes[i]; 64 | for (int j = 0; j < size; ++j) { 65 | const htype hash = hashes_th[j]; 66 | sketch.add(hash); 67 | } 68 | }; 69 | 70 | /* NOTE(review): chunk_size is read from approximate_kmer_indexer_params although KmerIndexerParams also has approximate_canon_kmer_indexer_params with its own chunk_size - confirm which one this canon filter should use */ const size_t chunk_size = kmer_indexer_params.approximate_kmer_indexer_params.chunk_size; 71 | 72 | KWH kwh({hasher, contig.seq, 0}); 73 | while (true) { 74 | logger.info() << "Generating task list for chunk starting at pos " << kwh.pos << "\n"; 75 | for (size_t cnt = 0; cnt < chunk_size; ++cnt) { 76 | const htype hash = kwh.hash(); 77 | const size_t ithread = hash % nthreads; 78 | /* grow the buffer only when it is full; otherwise overwrite a slot left from an earlier chunk */ if (hashes[ithread].size() == sizes[ithread]) { 79 | hashes[ithread].emplace_back(hash); 80 | } else { 81 | hashes[ithread][sizes[ithread]] = hash; 82 | } 83 | ++sizes[ithread]; 84 | if (not kwh.hasNext()) { 85 | break; 86 | } 87 | kwh = kwh.next(); 88 | } 89 | logger.info() << "Parallel run for chunk\n"; 90 | std::vector threads(nthreads); 91 | for (size_t i = 0; i < threads.size(); ++i) { threads[i] = std::thread(process_chunk, i); } 92 | for (auto &thread : threads) { thread.join(); } 93 | 94 | if (not kwh.hasNext()) { 95 | break; 96 | } 97 | 98 | std::fill(sizes.begin(), sizes.end(), 0); 99 | } 100 | } 101 | 102 | public: 103 | KmerFilterCanonBuilder(size_t nthreads, const RollingHash &hasher, const Config::CommonParams &common_params, 104 | const Config::KmerIndexerParams &kmer_indexer_params) 105 | : nthreads(nthreads), 106 | hasher(hasher), 107 | common_params(common_params), 108 | kmer_indexer_params(kmer_indexer_params) {} 109 | 110 | KmerFilterCanonBuilder(const
KmerFilterCanonBuilder &) = delete; 111 | KmerFilterCanonBuilder(KmerFilterCanonBuilder &&) = delete; 112 | KmerFilterCanonBuilder &operator=(const KmerFilterCanonBuilder &) = delete; 113 | KmerFilterCanonBuilder &operator=(KmerFilterCanonBuilder &&) = delete; 114 | 115 | /* Builds the filter by adding the contigs one at a time, in order, so cmss is indexed by contig position. */ [[nodiscard]] KmerFilterCanon GetKmerFilterCanon(const std::vector &contigs, logging::Logger &logger) const { 116 | logger.info() << "Init filter\n"; 117 | KmerFilterCanon kmer_filter; 118 | logger.info() << "Start adding contigs to filter\n"; 119 | for (const Contig &contig : contigs) { 120 | logger.info() << "Add contig " << contig.id << "\n"; 121 | AddContigToFilter(kmer_filter, contig, logger); 122 | } 123 | return kmer_filter; 124 | } 125 | }; 126 | 127 | }// namespace veritymap::kmer_index::kmer_filter_canon -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/ksw_align.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cigar.hpp" 4 | #include "ksw2/ksw2.h" 5 | 6 | namespace veritymap::ksw_align { 7 | 8 | /* Aligns the first q_en - q_st symbols of qseq against the first t_en - t_st symbols of tseq with ksw_extz2_sse and returns the result as a Cigar. The mismatch score is forced negative. NOTE(review): t_st and q_st are used only to compute the lengths; the copy loops index both sequences from 0 - confirm callers pass already-trimmed sequences. NOTE(review): this is a non-inline function defined in a header. */ cigar_utils::Cigar align(const Sequence &tseq, const int t_st, const int t_en, const Sequence &qseq, const int q_st, 9 | const int q_en, const int8_t match_score, const int8_t mis_score, const int8_t gapo, 10 | const int8_t gape) { 11 | // Based on example at https://github.com/lh3/ksw2 12 | const int8_t a = match_score, b = mis_score < 0 ?
mis_score : -mis_score;// a>0 and b<0 13 | const int8_t mat[25]{a, b, b, b, 0, b, a, b, b, 0, b, b, a, b, 0, b, b, b, a, 0, 0, 0, 0, 0, 0}; 14 | const int32_t tl{t_en - t_st}, ql{q_en - q_st}; 15 | uint8_t *ts, *qs, c[256]; 16 | ksw_extz_t ez; 17 | 18 | memset(&ez, 0, sizeof(ksw_extz_t)); 19 | /* NOTE(review): the table c is filled here but never read below; the sequences are copied as returned by operator[] - presumably Sequence already yields codes 0..3, confirm */ memset(c, 4, 256); 20 | c['A'] = c['a'] = 0; 21 | c['C'] = c['c'] = 1; 22 | c['G'] = c['g'] = 2; 23 | c['T'] = c['t'] = 3;// build the encoding table 24 | /* NOTE(review): the malloc results are not checked */ ts = (uint8_t *) malloc(tl); 25 | qs = (uint8_t *) malloc(ql); 26 | for (size_t i = 0; i < tl; ++i) ts[i] = (uint8_t) tseq[i];// encode to 0/1/2/3 27 | for (size_t i = 0; i < ql; ++i) qs[i] = (uint8_t) qseq[i]; 28 | ksw_extz2_sse(nullptr, ql, qs, tl, ts, 5, mat, gapo, gape, -1, -1, 10, 0, &ez); 29 | 30 | /* the Cigar is constructed from ez before ez.cigar is freed */ cigar_utils::Cigar cigar(ez); 31 | 32 | free(ez.cigar); 33 | free(ts); 34 | free(qs); 35 | 36 | return cigar; 37 | } 38 | 39 | }// End namespace veritymap::ksw_align 40 | -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/mapper.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 03/31/22.
3 | // 4 | 5 | #include "kmer_index/indexed_contigs.hpp" 6 | 7 | namespace veritymap::mapper { 8 | 9 | /* Maps queries to indexed targets: match k-mers, score them by DP, extract chains. It owns a copy of the Config; logger and hasher are held by reference. */ class Mapper { 10 | const Config config; 11 | logging::Logger &logger; 12 | const size_t nthreads = 1; 13 | const matches::Matcher matcher; 14 | const scoring::DPScorer dp_scorer; 15 | const chaining::Chainer chainer; 16 | const RollingHash &hasher; 17 | 18 | private: 19 | /* Collects chains of one query strand against every target. Targets with fewer than min_matches matches are skipped. */ [[nodiscard]] chaining::Chains MapSingleQueryStrand(const indexed_contigs::IndexedContigs &indexed_targets, 20 | const Contig &query, 21 | const dna_strand::Strand &query_strand) const { 22 | chaining::Chains chains; 23 | for (int i = 0; i < indexed_targets.Size(); ++i) { 24 | const matches::Matches matches = matcher.GetMatches(indexed_targets, i, query, query_strand); 25 | if (matches.size() < config.chaining_params.min_matches) { 26 | continue; 27 | } 28 | const auto [scores, backtracks] = dp_scorer.GetScores(matches); 29 | chaining::Chains new_chains = 30 | chainer.GetChains(indexed_targets.Contigs().at(i), query, query_strand, matches, scores, backtracks); 31 | for (chaining::Chain &chain : new_chains) { chains.emplace_back(std::move(chain)); } 32 | } 33 | return chains; 34 | } 35 | 36 | /* Maps both strands of the query and returns every chain whose score exceeds max_top_score_prop times the top score. Returns an empty result when there are no chains or when the top chain's Range is below min_chain_range. NOTE(review): SetPrimary is called only when exactly one chain survives, so with several survivors none is marked primary - confirm. */ [[nodiscard]] std::vector MapSingleQuery( 37 | const Contig &query, const indexed_contigs::IndexedContigs &indexed_targets) const { 38 | using score_type = typename Config::ChainingParams::score_type; 39 | 40 | chaining::Chains chains = MapSingleQueryStrand(indexed_targets, query, dna_strand::Strand::forward); 41 | chaining::Chains chains_r = MapSingleQueryStrand(indexed_targets, query, dna_strand::Strand::reverse); 42 | 43 | for (chaining::Chain &chain : chains_r) { chains.emplace_back(std::move(chain)); } 44 | 45 | if (chains.empty()) 46 | return {}; 47 | 48 | /* pr_it points at the highest-scoring chain */ auto pr_it = std::max_element(chains.begin(), chains.end(), 49 | [](const auto &lhs, const auto &rhs) { return lhs.score < rhs.score; }); 50 | int64_t top_range = pr_it->Range(config.common_params.k); 51 | if (top_range <
config.chaining_params.min_chain_range) 52 | return {}; 53 | 54 | // auto count = [](decltype(pr_it) it) -> std::tuple { 55 | // int uniq{0}, dup{0}, rare{0}; 56 | // for (const auto &match : it->matches) { 57 | // if (match.is_unique()) 58 | // ++uniq; 59 | // else if (match.target_freq == 2) 60 | // ++dup; 61 | // else 62 | // ++rare; 63 | // } 64 | // return {uniq, dup, rare}; 65 | // }; 66 | // if (query.id == "S2_18802") 67 | // { 68 | // auto count_pr = count(pr_it); 69 | // auto count_sc = count(sc_it); 70 | // std::cout << query.id << "\n"; 71 | // bool first = query.id[1] == '1'; 72 | // bool top_correct = first == (pr_it->target.id.substr(5, 5) == "chm13"); 73 | // std::cout << pr_it -> target.id.substr(5, 5) << " " << pr_it -> matches.front().target_pos << " " << pr_it -> score << " " << std::get<0>(count_pr) << " " << std::get<1>(count_pr) << " " << std::get<2>(count_pr) << " " << (top_correct ? "*" : "") << '\n'; 74 | // std::cout << sc_it -> target.id.substr(5, 5) << " " << sc_it -> matches.front().target_pos << " " << sc_it -> score << " " << std::get<0>(count_sc) << " " << std::get<1>(count_sc) << " " << std::get<2>(count_sc) << " " << (not top_correct ?
"*" : "") << '\n'; 75 | // std::cout << "\n"; 76 | // } 77 | 78 | std::vector new_chains; 79 | /* pr_it stays valid while chains is moved from, because only elements are moved and the vector itself is not resized; the top chain's score is read before and after its own move - presumably a plain number that survives the move, confirm */ for (chaining::Chain &chain : chains) { 80 | if (chain.score > pr_it->score * config.chaining_params.max_top_score_prop) { 81 | new_chains.emplace_back(std::move(chain)); 82 | } 83 | } 84 | if (new_chains.size() == 1) { 85 | new_chains.front().SetPrimary(); 86 | } 87 | return new_chains; 88 | } 89 | 90 | public: 91 | Mapper(const Mapper &mapper) = delete; 92 | Mapper(Mapper &&mapper) = delete; 93 | Mapper &operator=(const Mapper &mapper) = delete; 94 | Mapper &operator=(Mapper &&mapper) = delete; 95 | 96 | /* NOTE(review): inside the member initializers the name config refers to the constructor parameter, not the member copy. DPScorer stores references to the structs it is given, so the caller's Config must outlive this Mapper - confirm, or initialize from this->config. */ Mapper(const Config &config, logging::Logger &logger, const size_t nthreads, 97 | const RollingHash &hasher) 98 | : config{config}, 99 | logger{logger}, 100 | nthreads{nthreads}, 101 | matcher{config.kmer_indexer_params, hasher}, 102 | dp_scorer{config.common_params, config.chaining_params}, 103 | chainer{config.common_params, config.chaining_params}, 104 | hasher{hasher} {} 105 | 106 | void ParallelRun(const indexed_contigs::IndexedContigs &indexed_targets, const std::vector &queries, 107 | const std::filesystem::path &chains_fn, const std::filesystem::path &sam_fn, 108 | const std::string &cmd) { 109 | std::mutex chainsMutex; 110 | 111 | std::ofstream chains_os(chains_fn); 112 | std::ofstream sam_os(sam_fn); 113 | for (const Contig &target : indexed_targets.Contigs()) { 114 | sam_os << "@SQ\tSN:" << target.id << "\tLN:" << target.seq.size() << "\n"; 115 | } 116 | sam_os << "@PG\tID:VerityMap\tPN:VerityMap\tVN:2.0\tCL:" << cmd << "\n"; 117 | 118 | std::function align_read = [&indexed_targets, &chainsMutex, &chains_os, &sam_os, 119 | this](const Contig &query) { 120 | std::vector chains = MapSingleQuery(query, indexed_targets); 121 | 122 | if (not chains.empty()) { 123 | for (const chaining::Chain &chain : chains) { 124 | std::string sam_record = chaining::chain2samrecord(chain, config.common_params, config.chain2sam_params); 125 | chainsMutex.lock(); 126 | chains_os <<
chain; 127 | sam_os << sam_record << "\n"; 128 | chainsMutex.unlock(); 129 | } 130 | } 131 | }; 132 | 133 | process_in_parallel(queries, align_read, nthreads, true); 134 | } 135 | }; 136 | 137 | }// namespace veritymap::mapper -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/matches.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 2/22/21. 3 | // 4 | 5 | #pragma once 6 | 7 | #include "config/config.hpp" 8 | #include "kmer_index/filter_rep_kmers.hpp" 9 | #include "kmer_index/indexed_contigs.hpp" 10 | #include "strand.hpp" 11 | 12 | namespace veritymap::matches { 13 | 14 | struct Match { 15 | Config::ChainingParams::match_pos_type target_pos{0}; 16 | int32_t query_pos{0}; 17 | uint8_t target_freq{0};// TODO Change to "is_unique" 18 | 19 | [[nodiscard]] bool is_unique() const { return target_freq == 1; } 20 | [[nodiscard]] bool is_dup() const { return target_freq == 2; } 21 | }; 22 | 23 | inline bool operator<(const Match& lhs, const Match& rhs) { return lhs.target_pos < rhs.target_pos; } 24 | 25 | inline std::ostream& operator<<(std::ostream& os, const Match& match) { 26 | os << match.query_pos << "\t" << match.target_pos << "\t" << static_cast(match.target_freq) << "\n"; 27 | return os; 28 | } 29 | 30 | using Matches = std::vector; 31 | 32 | inline std::ostream& operator<<(std::ostream& os, const Matches& matches) { 33 | size_t prev_pos = 0; 34 | for (const auto& match : matches) { 35 | if (match.target_pos - prev_pos > 10) {// TODO: fix to k or k/2 36 | os << match; 37 | prev_pos = match.target_pos; 38 | } 39 | } 40 | return os; 41 | } 42 | 43 | class Matcher { 44 | const Config::KmerIndexerParams& config; 45 | const RollingHash& hasher; 46 | 47 | public: 48 | Matcher(const Matcher&) = delete; 49 | Matcher(Matcher&&) = delete; 50 | Matcher& operator=(const Matcher&) = delete; 51 | Matcher& operator=(Matcher&&) = delete; 
52 | 53 | Matcher(const Config::KmerIndexerParams& config, const RollingHash& hasher) 54 | : config{config}, 55 | hasher{hasher} {} 56 | 57 | [[nodiscard]] Matches GetMatches(const indexed_contigs::IndexedContigs& indexed_targets, const int64_t i, 58 | const Contig& query, const dna_strand::Strand& query_strand) const { 59 | Sequence seq = query_strand == dna_strand::Strand::forward ? query.seq : query.RC().seq; 60 | if (seq.size() < hasher.k) { 61 | return {}; 62 | } 63 | 64 | // We are using approximate kmer detection 65 | const double fpp{config.approximate_kmer_indexer_params.false_positive_probability}; 66 | BloomFilter rep_kmer_bf = veritymap::kmer_index::filter_rep_kmers::get_bloom_rep_kmers(seq, hasher, fpp); 67 | 68 | Matches matches; 69 | KWH kwh(hasher, seq, 0); 70 | const kmer_index::KmerIndex& index = indexed_targets.Index(); 71 | while (true) { 72 | const Config::HashParams::htype hash = kwh.get_fhash(); 73 | const int64_t count64 = index.GetCount(hash); 74 | const std::vector* pos = index.GetPos(hash, i); 75 | const int64_t count_i = pos == nullptr ? 0 : pos->size(); 76 | VERIFY(count64 >= count_i); 77 | if (not rep_kmer_bf.contains(hash) and count_i) { 78 | VERIFY(count64 <= config.max_rare_cnt_target); 79 | VERIFY(count64 <= std::numeric_limits::max()); 80 | const auto count = static_cast(count64); 81 | for (const int64_t tp : *pos) { 82 | VERIFY(kwh.pos < std::numeric_limits::max()); 83 | matches.push_back( 84 | {static_cast(tp), static_cast(kwh.pos), count}); 85 | } 86 | } 87 | if (not kwh.hasNext()) { 88 | break; 89 | } 90 | kwh = kwh.next(); 91 | } 92 | std::sort(matches.begin(), matches.end()); 93 | return matches; 94 | } 95 | }; 96 | 97 | }// End namespace veritymap::matches -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/query_indexer.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 03/15/21. 
3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | #include "../config/config.hpp" 11 | #include "../rolling_hash.hpp" 12 | #include "bloom/bloom.hpp" 13 | #include "include/sketch/ccm.h" 14 | #include "sketch_contigs.hpp" 15 | 16 | namespace veritymap::kmer_index { 17 | 18 | kmer_index::IndexedContigs get_indexed_queries(const std::vector& queries, const std::filesystem::path& outdir, 19 | const RollingHash& hasher, 20 | const size_t nthreads, logging::Logger& logger, 21 | const std::filesystem::path& index_path, 22 | const Config::CommonParams& common_params, 23 | const Config::KmerIndexerParams& kmer_indexer_params) { 24 | using htype = Config::HashParams::htype; 25 | const std::vector kmers_indexes = [&queries, &hasher, &common_params, &kmer_indexer_params, &index_path, 26 | &logger] { 27 | if (kmer_indexer_params.strategy == Config::KmerIndexerParams::Strategy::exact) { 28 | logger.info() << "Getting exact kmer indexes..." << std::endl; 29 | std::vector kmers_indexes = get_rare_kmers(queries, hasher, kmer_indexer_params.max_rare_cnt_target); 30 | logger.info() << "Finished getting exact kmer indexes" << std::endl; 31 | return kmers_indexes; 32 | } else { 33 | VERIFY(kmer_indexer_params.strategy == Config::KmerIndexerParams::Strategy::approximate) 34 | logger.info() << "Getting approximate kmer indexes..." 
/// Raise `base` to the power `p` by recursive squaring.
///
/// `p == 0` yields 1. Arithmetic is done in `T`, so for unsigned integer `T`
/// the result is the power modulo 2^bits.
template<typename T, typename U>
T pow_custom(T base, U p) {
  if (p == 0) {
    return 1;
  }
  const T half = pow_custom(base, p / 2);
  const bool odd = (p % 2 == 1);
  return odd ? base * half * half : half * half;
}
hbase + c; } 45 | 46 | htype extendLeft(const Sequence &seq, size_t pos, htype hash, unsigned char c) const { 47 | return hash + c * kpow * hbase; 48 | } 49 | 50 | htype shiftRight(const Sequence &seq, size_t pos, htype hash, unsigned char c) const { 51 | return (hash - kpow * seq[pos]) * hbase + c; 52 | } 53 | 54 | htype shiftLeft(const Sequence &seq, size_t pos, htype hash, unsigned char c) const { 55 | return (hash - seq[pos + k - 1]) * inv + c * kpow; 56 | } 57 | 58 | htype next(const Sequence &seq, size_t pos, htype hash) const { return shiftRight(seq, pos, hash, seq[pos + k]); } 59 | 60 | htype prev(const Sequence &seq, size_t pos, htype hash) const { return shiftLeft(seq, pos, hash, seq[pos - 1]); } 61 | 62 | bool hasNext(const Sequence &seq, size_t pos) const { return pos + k < seq.size(); } 63 | 64 | bool hasPrev(const Sequence &seq, size_t pos) const { return pos > 0; } 65 | }; 66 | 67 | template 68 | class KWH { 69 | private: 70 | KWH(const RollingHash &_hasher, const Sequence &_seq, size_t _pos, htype _fhash, htype _rhash) 71 | : hasher(_hasher), 72 | seq(_seq), 73 | pos(_pos), 74 | fhash(_fhash), 75 | rhash(_rhash) {} 76 | 77 | htype fhash; 78 | htype rhash; 79 | Sequence seq; 80 | 81 | public: 82 | const RollingHash &hasher; 83 | size_t pos; 84 | 85 | KWH(const RollingHash &_hasher, const Sequence &_seq, size_t _pos) 86 | : hasher(_hasher), 87 | seq(_seq), 88 | pos(_pos), 89 | fhash(_hasher.hash(_seq, _pos)), 90 | rhash(_hasher.hash(!_seq, _seq.size() - _pos - _hasher.k)) {} 91 | 92 | KWH(const KWH &other) = default; 93 | 94 | Sequence getSeq() const { return seq.Subseq(pos, pos + hasher.k); } 95 | 96 | KWH operator!() const { return KWH(hasher, !seq, seq.size() - pos - hasher.k, rhash, fhash); } 97 | 98 | htype hash() const { return std::min(fhash, rhash); } 99 | 100 | htype get_fhash() const { return fhash; } 101 | 102 | htype get_rhash() const { return rhash; } 103 | 104 | htype extendRight(unsigned char c) const { 105 | return 
namespace veritymap::dna_strand {

/// Orientation of a query sequence.
enum class Strand { forward, reverse };

/// Return "+" for Strand::forward and "-" for Strand::reverse.
///
/// Declared `inline` because this definition lives in a header: without it,
/// including the header from more than one translation unit violates the
/// one-definition rule and fails at link time.
inline std::string strand2str(const Strand& strand) { return strand == Strand::forward ? "+" : "-"; }

}// namespace veritymap::dna_strand
3 | // 4 | 5 | #include "veritymap.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "config/config.hpp" 13 | #include "version/version.hpp" 14 | 15 | int main(int argc, char** argv) { 16 | CLParser parser{{"output-dir=", "target=", "queries=none", "threads=40", "only-index", "careful", "diploid", 17 | "index=none", "config=hifi-haploid-complete"}, 18 | {}, 19 | {"o=output-dir", "t=threads"}}; 20 | parser.parseCL(argc, argv); 21 | if (!parser.check().empty()) { 22 | std::cerr << "Incorrect parameters" << std::endl; 23 | std::cerr << parser.check() << std::endl; 24 | return 1; 25 | } 26 | 27 | size_t nthreads = std::stoi(parser.getValue("threads")); 28 | if (nthreads == 0) { 29 | std::cerr << "# threads can not be set 0" << std::endl; 30 | return 1; 31 | } 32 | 33 | const std::filesystem::path output_dir{parser.getValue("output-dir")}; 34 | ensure_dir_existance(output_dir); 35 | 36 | logging::LoggerStorage ls{output_dir, "veritymap"}; 37 | logging::Logger logger; 38 | const std::filesystem::path logfn = ls.newLoggerFile(); 39 | logger.addLogFile(logfn); 40 | 41 | logger << "Log is written to " << logfn << std::endl; 42 | logger << "Git commit SHA1: " << tools::Version::GIT_SHA1 << std::endl; 43 | logger << "Git commit date: " << tools::Version::GIT_DATE << std::endl; 44 | 45 | auto time_point{std::chrono::system_clock::now()}; 46 | std::time_t now = std::chrono::system_clock::to_time_t(time_point); 47 | logger << "Launch time: " << std::put_time(std::localtime(&now), "%c %Z") << std::endl; 48 | 49 | std::stringstream cmd_ss; 50 | for (size_t i = 0; i < argc; i++) { cmd_ss << argv[i] << " "; } 51 | const std::string cmd = cmd_ss.str(); 52 | logger << "CMD: " << cmd << std::endl; 53 | 54 | const std::filesystem::path target_path = std::filesystem::canonical(parser.getValue("target")); 55 | const std::filesystem::path queries_path = std::filesystem::canonical(parser.getValue("queries")); 56 | 57 | bool only_index = 
parser.getCheck("only-index"); 58 | bool careful_mode = parser.getCheck("careful"); 59 | if (careful_mode and queries_path == "") { 60 | std::cerr << "Cannot use careful mode if no queries are provided\n"; 61 | return 1; 62 | } 63 | 64 | auto get_path_w_def = [&parser](const std::string& parameter) -> std::optional { 65 | std::filesystem::path path = parser.getValue(parameter); 66 | std::optional path_opt; 67 | if (path != "none") { 68 | return std::filesystem::canonical(path); 69 | } 70 | return {}; 71 | }; 72 | const std::optional index_path = get_path_w_def("index"); 73 | 74 | const std::filesystem::path binary_path = argv[0]; 75 | const std::filesystem::path config_fn = [&parser, &logger, &binary_path] { 76 | std::string config = parser.getValue("config"); 77 | std::filesystem::path dirpath = binary_path.parent_path(); 78 | if (config == "hifi-haploid-complete") { 79 | return dirpath / "config/config_hifi_haploid_complete.tsv"; 80 | } else if (config == "hifi-haploid") { 81 | return dirpath / "config/config_hifi_haploid.tsv"; 82 | } else if (config == "hifi-diploid") { 83 | return dirpath / "config/config_hifi_diploid.tsv"; 84 | } else if (config == "ont-haploid-complete") { 85 | return dirpath / "config/config_ont_haploid_complete.tsv"; 86 | } 87 | return static_cast(config); 88 | }(); 89 | veritymap::Config config = veritymap::Config::load_config_file(config_fn); 90 | // bool diploid_mode = parser.getCheck("diploid"); 91 | // if (diploid_mode) { 92 | // // TODO refactor this out and modify config before copying into the output file 93 | // config.common_params.diploid = true; 94 | // config.kmer_indexer_params.strategy = veritymap::Config::KmerIndexerParams::Strategy::approximate_canon; 95 | // } 96 | 97 | const auto config_out_fn = output_dir / "config.tsv"; 98 | std::filesystem::copy_file(config_fn, config_out_fn, std::filesystem::copy_options::overwrite_existing); 99 | logger.info() << "Config exported to " << config_out_fn << "\n"; 100 | 101 | 
veritymap::VerityMap mapper(config, logger, only_index, careful_mode, nthreads); 102 | mapper.Map(target_path, queries_path, output_dir, cmd, index_path); 103 | 104 | logger.info() << "Thank you for using VerityMap!" << std::endl; 105 | } 106 | -------------------------------------------------------------------------------- /veritymap/src/projects/veritymap/veritymap.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 2/19/21. 3 | // 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "chaining.hpp" 15 | #include "config/config.hpp" 16 | #include "dp_scoring.hpp" 17 | #include "hash_utils.hpp" 18 | #include "kmer_index/indexed_contigs.hpp" 19 | // #include "kmer_index/kmer_index.hpp" 20 | #include "kmer_index/target_indexer.hpp" 21 | #include "mapper.hpp" 22 | #include "matches.hpp" 23 | 24 | namespace veritymap { 25 | 26 | class VerityMap { 27 | const Config config; 28 | logging::Logger &logger; 29 | const bool only_index = false; 30 | const bool careful_mode = true; 31 | const size_t nthreads = 1; 32 | const RollingHash hasher; 33 | 34 | private: 35 | public: 36 | VerityMap(const Config &config, logging::Logger &logger, const bool only_index, const bool careful_mode, 37 | const size_t nthreads) 38 | : config{config}, 39 | logger{logger}, 40 | only_index{only_index}, 41 | careful_mode{careful_mode}, 42 | nthreads{nthreads}, 43 | hasher{config.common_params.k, config.hash_params.base} {} 44 | 45 | VerityMap(const VerityMap &) = delete; 46 | VerityMap(VerityMap &&) = delete; 47 | VerityMap &operator=(const VerityMap &) = delete; 48 | VerityMap &operator=(VerityMap &&) = delete; 49 | 50 | void Map(const std::filesystem::path &target_path, const std::filesystem::path &queries_path, 51 | const std::filesystem::path &outdir, const std::string &cmd, 52 | const std::optional &index_path) { 53 | std::vector 
targets{io::SeqReader(target_path).readAllContigs()}; 54 | for (const Contig &target : targets) { 55 | logger.info() << "Target length " << target.seq.size() << ", name " << target.id << " from " << target_path 56 | << std::endl; 57 | } 58 | 59 | std::vector queries(io::SeqReader(queries_path).readAllContigs()); 60 | logger.info() << "Queries from " << queries_path << ", total " << queries.size() << " sequences " << std::endl; 61 | 62 | kmer_index::TargetIndexer target_indexer(config.common_params, config.kmer_indexer_params, logger, hasher); 63 | const indexed_contigs::IndexedContigs indexed_targets = 64 | target_indexer.GetIndexedTargets(targets, queries, index_path, outdir, nthreads); 65 | 66 | { 67 | const auto no_solid_kmers_fn = outdir / "no_solid_kmers.bed"; 68 | std::ofstream no_solid_kmers_os(no_solid_kmers_fn); 69 | indexed_targets.NoSolidRegions2Bed(config.kmer_indexer_params.min_uncovered_len, config.common_params.k, 70 | no_solid_kmers_os); 71 | logger.info() << "Finished exporting long (>= " << config.kmer_indexer_params.min_uncovered_len << " bp) " 72 | << "regions without solid k-mers to " << no_solid_kmers_fn << std::endl; 73 | } 74 | 75 | if (only_index) { 76 | return; 77 | } 78 | 79 | const auto chains_fn = outdir / "chains.tsv"; 80 | const auto sam_fn = outdir / "alignments.sam"; 81 | 82 | logger.info() << "Computing chains and sam records..." 
<< std::endl; 83 | mapper::Mapper mapper(config, logger, nthreads, hasher); 84 | mapper.ParallelRun(indexed_targets, queries, chains_fn, sam_fn, cmd); 85 | 86 | logger.info() << "Finished outputting chains to " << chains_fn << " and sam records to " << sam_fn << std::endl; 87 | } 88 | }; 89 | 90 | }// End namespace veritymap -------------------------------------------------------------------------------- /veritymap/src/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(bloom) 2 | add_subdirectory(common) 3 | add_subdirectory(sequences) 4 | add_subdirectory(ksw2) 5 | add_subdirectory(sketch) 6 | add_subdirectory(version) 7 | -------------------------------------------------------------------------------- /veritymap/src/tools/bloom/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(bloomfilter) 2 | 3 | add_library(bloomfilter INTERFACE) 4 | 5 | target_include_directories(bloomfilter INTERFACE .) 6 | -------------------------------------------------------------------------------- /veritymap/src/tools/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(common) 2 | 3 | include_directories(.) 
4 | add_library(common STATIC 5 | cl_parser.cpp oneline_utils.hpp coverage_utils.cpp) 6 | target_link_libraries(common m) -------------------------------------------------------------------------------- /veritymap/src/tools/common/cl_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "cl_parser.hpp" 2 | 3 | #include 4 | #include 5 | 6 | const std::string CLParser::emptystring; 7 | 8 | void CLParser::parseCL(const std::vector &args) { 9 | std::stringstream ss; 10 | for (const std::string &arg : args) { 11 | ss << arg; 12 | ss << " "; 13 | } 14 | command_line = command_line + ss.str(); 15 | bool isstart = true; 16 | std::string name; 17 | for (const std::string &s : args) { 18 | if (!name.empty()) { 19 | if (!values[name].empty() && values[name][0] == ',') { 20 | values[name] += "," + s; 21 | } else { 22 | values[name] = s; 23 | } 24 | name = ""; 25 | } else if (s[0] == '-') { 26 | isstart = false; 27 | if (s[1] == '-') { 28 | name = s.substr(2, s.size() - 2); 29 | } else { 30 | name = short_to_long[s[1]]; 31 | } 32 | if (checks.count(name)) { 33 | checks[name] = true; 34 | name = ""; 35 | } else if (values.count(name) == 0) { 36 | errors.push_back("Unknown option " + s); 37 | } 38 | } else if (isstart) { 39 | start.push_back(s); 40 | } else { 41 | extra.push_back(s); 42 | } 43 | } 44 | } 45 | 46 | void CLParser::parseCL(int argc, char **argv) { parseCL(oneline::initialize(argv, argv + argc)); } 47 | 48 | const std::string &CLParser::getValue(const std::string &s) const { 49 | auto it = values.find(s); 50 | if (it == values.end()) { 51 | std::cerr << "Missing parameter " << s << std::endl; 52 | exit(1); 53 | } 54 | VERIFY(it != values.end()); 55 | if (it == values.end()) { 56 | return emptystring; 57 | } else { 58 | return it->second; 59 | } 60 | } 61 | 62 | bool CLParser::getCheck(const std::string &s) const { return checks.find(s)->second; } 63 | 64 | CLParser::CLParser(std::vector _long_params, std::vector 
_list_params, 65 | std::vector _short_params) 66 | : long_params(std::move(_long_params)), 67 | list_params(std::move(_list_params)), 68 | short_params(std::move(_short_params)) { 69 | for (const std::string &s : long_params) { 70 | size_t pos = s.find('='); 71 | if (pos != -1) { 72 | values[s.substr(0, pos)] = s.substr(pos + 1, s.size() - pos - 1); 73 | } else { 74 | checks[s] = false; 75 | } 76 | } 77 | for (const std::string &s : list_params) { 78 | size_t pos = s.find('='); 79 | if (pos != size_t(-1)) { 80 | values[s.substr(0, pos)] = s.substr(pos + 1, s.size() - pos - 1); 81 | } else { 82 | values[s] = ","; 83 | } 84 | } 85 | for (const std::string &s : short_params) { short_to_long[s[0]] = s.substr(2, s.size() - 2); } 86 | } 87 | -------------------------------------------------------------------------------- /veritymap/src/tools/common/cl_parser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "oneline_utils.hpp" 8 | #include "string_utils.hpp" 9 | 10 | class CLParser { 11 | private: 12 | const std::vector long_params; 13 | const std::vector list_params; 14 | const std::vector short_params; 15 | std::map values; 16 | std::map checks; 17 | std::map short_to_long; 18 | std::vector start; 19 | std::vector extra; 20 | std::vector errors; 21 | const static std::string emptystring; 22 | std::string command_line; 23 | 24 | public: 25 | CLParser(std::vector _long_params, std::vector _list_params, 26 | std::vector _short_params); 27 | 28 | // TODO: check what happens with quotes 29 | // TODO: make failsafe 30 | void parseCL(const std::vector &args); 31 | 32 | void parseCL(int argc, char **argv); 33 | 34 | std::string check() { 35 | for (const auto &key : values) { 36 | if (key.second.empty()) { 37 | return key.first + " missing"; 38 | } 39 | } 40 | return ""; 41 | } 42 | 43 | const std::string &getValue(const std::string &s) const; 44 | 45 | 
std::vector getListValue(const std::string &s) const { return split(getValue(s), ","); } 46 | 47 | bool getCheck(const std::string &s) const; 48 | 49 | const std::vector &getStart() const { return start; } 50 | 51 | const std::vector &getExtra() const { return extra; } 52 | 53 | const std::vector &getErrors() const { return errors; } 54 | 55 | const std::string &getCL() const { return command_line; } 56 | }; -------------------------------------------------------------------------------- /veritymap/src/tools/common/coverage_utils.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 10/19/21. 3 | // 4 | 5 | #include "coverage_utils.hpp" 6 | 7 | using namespace tools::common::coverage_utils; 8 | 9 | double tools::common::coverage_utils::get_coverage(const std::vector& contigs, 10 | const std::vector& readset) { 11 | uint64_t cnt_len{0}; 12 | for (const Contig& contig : contigs) { cnt_len += contig.size(); } 13 | 14 | uint64_t reads_len{0}; 15 | for (const Contig& read : readset) { reads_len += read.size(); } 16 | return static_cast(reads_len) / static_cast(cnt_len); 17 | } 18 | -------------------------------------------------------------------------------- /veritymap/src/tools/common/coverage_utils.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Andrey Bzikadze on 10/19/21. 
/// Make sure `path` is a directory, creating it and any missing parents.
/// An existing directory is left untouched.
//TODO: throw exception if this is file
inline void ensure_dir_existance(const std::filesystem::path& path) {
  if (not std::filesystem::is_directory(path)) {
    std::filesystem::create_directories(path);
  }
}

/// Replace `path` with an empty directory: an existing directory is removed
/// together with its contents and then created again.
//TODO: throw exception if this is file
inline void recreate_dir(const std::filesystem::path& path) {
  if (std::filesystem::is_directory(path)) {
    std::filesystem::remove_all(path);
  }
  std::filesystem::create_directories(path);
}
/// Stream a std::pair as "(first, second)".
template<class U, class V>
std::ostream& operator<<(std::ostream& out, const std::pair<U, V>& item) {
  out << "(" << item.first;
  out << ", " << item.second << ")";
  return out;
}
/**
 * Stream a vector as "[a, b, c]"; an empty vector is written as "[]".
 *
 * The previous version appended std::endl (newline + flush) only in the empty
 * case, so the output shape depended on the vector's size. No trailing newline
 * is written now in either case.
 *
 * @param out  destination stream.
 * @param tree vector whose elements are themselves streamable.
 * @return `out`, to allow chaining.
 */
template<class T>
std::ostream& operator<<(std::ostream& out, const std::vector<T>& tree) {
  out << "[";
  for (size_t i = 0; i < tree.size(); ++i) {
    if (i != 0) {
      out << ", ";
    }
    out << tree[i];
  }
  return out << "]";
}
/**
 * Minimal thread pool: run `updateFun` once for every element of
 * `scheduledTasks`, spreading the calls over worker threads.
 *
 * Tasks are handed out through a shared atomic counter, so each task is
 * processed exactly once; the order of execution is unspecified.
 *
 * @param scheduledTasks tasks to process; nothing happens when empty.
 * @param updateFun      callback invoked per task; it is called concurrently
 *                       and therefore must be thread-safe.
 * @param maxThreads     upper bound on worker threads. A value of 0 is treated
 *                       as 1 (previously 0 workers were started and every task
 *                       was silently skipped).
 * @param progressBar    currently unused (progress reporting is disabled).
 */
template<class T>
void process_in_parallel(const std::vector<T>& scheduledTasks, std::function<void(const T&)> updateFun,
                         size_t maxThreads, bool progressBar) {
  if (scheduledTasks.empty())
    return;
  (void) progressBar;

  std::atomic<size_t> nextJob(0);
  const size_t numTasks = scheduledTasks.size();

  auto threadWorker = [&nextJob, &scheduledTasks, &updateFun, numTasks]() {
    while (true) {
      // fetch_add claims a unique index; indices past the end mean "no work left".
      const size_t jobId = nextJob.fetch_add(1);
      if (jobId >= numTasks) {
        return;
      }
      updateFun(scheduledTasks[jobId]);
    }
  };

  const size_t numThreads = std::min(std::max<size_t>(maxThreads, 1), numTasks);
  std::vector<std::thread> threads;
  threads.reserve(numThreads);
  for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(threadWorker); }
  for (auto& thread : threads) { thread.join(); }
}
/**
 * Split `s` on every occurrence of `delimiter`, dropping empty tokens.
 *
 * @param s         string to split.
 * @param delimiter separator string. An empty delimiter cannot match anything
 *                  meaningful; in that case the whole of `s` is returned as a
 *                  single token (the previous code looped forever because the
 *                  cursor never advanced).
 * @return the non-empty pieces of `s`, in order.
 */
inline std::vector<std::string> split(const std::string &s, const std::string &delimiter) {
  std::vector<std::string> res;
  if (delimiter.empty()) {
    if (!s.empty()) {
      res.push_back(s);
    }
    return res;
  }
  size_t cur = 0;
  while (cur < s.size()) {
    size_t next = s.find(delimiter, cur);
    if (next == std::string::npos) {
      next = s.size();
    }
    if (next > cur) {
      res.push_back(s.substr(cur, next - cur));
    }
    cur = next + delimiter.size();
  }
  return res;
}

/**
 * Split `s` on runs of blanks (space, newline, tab), dropping empty tokens.
 *
 * @param s string to split.
 * @return the blank-separated words of `s`, in order.
 */
inline std::vector<std::string> split(const std::string &s) {
  static const char *const kBlanks = " \n\t";
  std::vector<std::string> res;
  size_t cur = s.find_first_not_of(kBlanks);
  while (cur != std::string::npos) {
    const size_t next = s.find_first_of(kBlanks, cur);
    if (next == std::string::npos) {
      res.push_back(s.substr(cur));
      break;
    }
    res.push_back(s.substr(cur, next - cur));
    cur = s.find_first_not_of(kBlanks, next);
  }
  return res;
}
5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
-------------------------------------------------------------------------------- /veritymap/src/tools/ksw2/kalloc.h: -------------------------------------------------------------------------------- 1 | #ifndef _KALLOC_H_ 2 | #define _KALLOC_H_ 3 | 4 | #include 5 | 6 | #define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t)) 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | void *kmalloc(void *km, size_t size); 13 | void *krealloc(void *km, void *ptr, size_t size); 14 | void *kcalloc(void *km, size_t count, size_t size); 15 | void kfree(void *km, void *ptr); 16 | 17 | void *km_init(void); 18 | void km_destroy(void *km); 19 | 20 | void km_stat(const void *km); // TODO: return numbers instead of print to stderr 21 | 22 | #ifdef __cplusplus 23 | } 24 | #endif 25 | 26 | #endif -------------------------------------------------------------------------------- /veritymap/src/tools/sequences/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(sequences) 2 | 3 | include_directories(.) 4 | add_library(sequence STATIC contigs.cpp sequence.cpp) 5 | 6 | find_package(ZLIB) 7 | target_link_libraries(sequence ${CMAKE_THREAD_LIBS_INIT} ${ZLIB_LIBRARIES} m) 8 | -------------------------------------------------------------------------------- /veritymap/src/tools/sequences/contigs.cpp: -------------------------------------------------------------------------------- 1 | #include "contigs.hpp" 2 | 3 | bool StringContig::needs_compressing = false; -------------------------------------------------------------------------------- /veritymap/src/tools/sequences/nucl.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by anton on 19.12.2019. 3 | // 4 | 5 | #pragma once 6 | 7 | /** 8 | * 0123 -> true 9 | * @param char c 10 | * @return true if c is 0, 1, 2 or 3. 
/**
 * 0123 -> true
 * @param char c
 * @return true if c is 0, 1, 2 or 3.
 *
 * The comparison is done on the unsigned value: where `char` is signed, the
 * plain `c < 4` test also accepted every negative byte, including INVALID_NUCL.
 */
inline bool is_dignucl(char c) { return static_cast<unsigned char>(c) < 4; }

/**
 * 0123 -> 3210
 * @param char c
 * @return c ^ 3
 */
inline char complement(char c) { return char(c ^ 3u); }

/** Marker returned by the conversion helpers below for unsupported input. */
static const char INVALID_NUCL = char(-1);

/**
 * ACGTNacgtn0123 -> true
 * @param char c
 * @return true if c is 'A/a/0', 'C/c/1', 'G/g/2', 'T/t/3' or 'N/n'.
 */
inline bool is_nucl(char c) {
  switch (c) {
    case 'n':
    case 'N':
    case 0:
    case 'a':
    case 'A':
    case 1:
    case 'c':
    case 'C':
    case 2:
    case 'g':
    case 'G':
    case 3:
    case 't':
    case 'T': return true;
    default: return false;
  }
}

/**
 * ACGT -> TGCA
 * @param char c is 'A/a/0', 'C/c/1', 'G/g/2', 'T/t/3' or 'N/n'
 * @return complement symbol, i.e. 'A/a/0' => 'T/t/3', 'C/c/1' => 'G/g/2', 'G/g/2' => 'C/c/1', 'T/t/3' => 'A/a/0'.
 *         'N/n' is handled exactly like 'A/a' and therefore yields 'T/t'; any other input yields INVALID_NUCL.
 */
inline char nucl_complement(char c) {
  switch (c) {
    case 0: return 3;
    case 'a':
    case 'n': return 't';
    case 'A':
    case 'N': return 'T';
    case 1: return 2;
    case 'c': return 'g';
    case 'C': return 'G';
    case 2: return 1;
    case 'g': return 'c';
    case 'G': return 'C';
    case 3: return 0;
    case 't': return 'a';
    case 'T': return 'A';
    default: return INVALID_NUCL;
  }
}

/**
 * 0123acgtACGT -> ACGT
 * @param char c is 'A/a/0', 'C/c/1', 'G/g/2', 'T/t/3' or 'N/n'
 * @return 'A/a/0' => 'A', 'C/c/1' => 'C', 'G/g/2' => 'G', 'T/t/3' => 'T'; 'N/n' => 'A'; anything else => INVALID_NUCL
 */
inline char nucl(char c) {
  switch (c) {
    case 0:
    case 'a':
    case 'A':
    case 'n':
    case 'N': return 'A';
    case 1:
    case 'c':
    case 'C': return 'C';
    case 2:
    case 'g':
    case 'G': return 'G';
    case 3:
    case 't':
    case 'T': return 'T';
    default: return INVALID_NUCL;
  }
}
/**
 * Dump the current call stack (at most 1000 frames) to std::cerr.
 *
 * The banner and the frames are written to the same stream; previously the
 * banner went to std::cout while the frames went to std::cerr, so the two
 * could be separated or reordered when the streams were redirected.
 */
inline void print_stacktrace() {
  std::cerr << "=== Stack Trace ===" << std::endl;

  const int max_stack_size = 1000;

  void *stack_pointers[max_stack_size];
  const int count = backtrace(stack_pointers, max_stack_size);

  // backtrace_symbols allocates a single block holding all strings; it returns NULL on failure.
  char **func_names = backtrace_symbols(stack_pointers, count);
  if (func_names == nullptr) {
    std::cerr << "(stack trace unavailable)" << std::endl;
    return;
  }

  // Print the stack trace
  for (int i = 0; i < count; ++i) std::cerr << func_names[i] << std::endl;

  // Free the string pointers
  free(func_names);
}

// Abort with a stack trace when `expr` is false.
// NOTE(review): `expr` is evaluated a second time inside assert(), and the macro
// ends with `while (0);` (trailing semicolon), which makes `if (c) VERIFY(x); else ...`
// ill-formed. Both are left untouched here because existing call sites may rely on
// the exact expansion (e.g. `VERIFY(x)` written without a semicolon).
#define VERIFY(expr)      \
  do {                    \
    if (!(expr)) {        \
      print_stacktrace(); \
      assert(expr);       \
      abort();            \
    };                    \
  } while (0);
./include) 6 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/compact_vector/const_iterator_traits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CONST_ITERATOR_TRAITS_H__ 2 | #define __CONST_ITERATOR_TRAITS_H__ 3 | 4 | #include 5 | 6 | 7 | namespace compact { 8 | template struct const_iterator_traits { }; 9 | 10 | template 11 | struct const_iterator_traits { 12 | typedef typename std::add_const::type* type; 13 | }; 14 | 15 | template 16 | struct const_iterator_traits { 17 | typedef const T* type; 18 | }; 19 | } // namespace compact 20 | 21 | #endif /* __CONST_ITERATOR_TRAITS_H__ */ 22 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/compact_vector/parallel_iterator_traits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __PARALLEL_POINTER_TRAITS_H__ 2 | #define __PARALLEL_POINTER_TRAITS_H__ 3 | 4 | #include 5 | 6 | namespace compact { 7 | // Traits for a parallel iterator. Very weak requirements that if two 8 | // threads hold iterators to two different location, then the pointers 9 | // can be read and stored. 10 | // 11 | // This holds for pointers. But it requires attention when dealing 12 | // with compact iterators. 
13 | 14 | template struct parallel_iterator_traits { }; 15 | 16 | template 17 | struct parallel_iterator_traits { 18 | typedef T* type; 19 | static bool cas(type x, T& expected, const T& val) { 20 | const T old = expected; 21 | expected = __sync_val_compare_and_swap(x, expected, val); 22 | return old == expected; 23 | } 24 | }; 25 | 26 | template 27 | struct parallel_iterator_traits { 28 | typedef const T* type; 29 | }; 30 | } // namespace compact 31 | 32 | #endif /* __PARALLEL_POINTER_TRAITS_H__ */ 33 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/compact_vector/prefetch_iterator_traits.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __PREFETCH_TRAITS_H__ 2 | #define __PREFETCH_TRAITS_H__ 3 | 4 | #include 5 | 6 | namespace compact { 7 | // Traits to prefetch an iterator 8 | 9 | template struct prefetch_iterator_traits { }; 10 | 11 | template 12 | struct prefetch_iterator_traits { 13 | template 14 | static void read(T* ptr) { __builtin_prefetch((void*)ptr, 0, level); } 15 | template 16 | static void write(T* ptr) { __builtin_prefetch((void*)ptr, 1, level); } 17 | }; 18 | 19 | template 20 | struct prefetch_iterator_traits { 21 | template 22 | static void read(const T* ptr) { __builtin_prefetch((void*)ptr, 0, level); } 23 | template 24 | static void write(const T* ptr) { __builtin_prefetch((void*)ptr, 1, level); } 25 | }; 26 | } // namespace compact 27 | 28 | #endif /* __PREFETCH_TRAITS_H__ */ 29 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/div.h: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_DIV_H__ 2 | #define SKETCH_DIV_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #undef INLINE 10 | #if __GNUC__ || __clang__ 11 | # define INLINE __attribute__((always_inline)) inline 12 | #elif __CUDACC__ 
#undef INLINE
#if __GNUC__ || __clang__
#  define INLINE __attribute__((always_inline)) inline
#elif __CUDACC__
#  define INLINE __forceinline__ inline
#else
#  define INLINE inline
#endif

#ifndef CONST_IF
#if defined(__cpp_if_constexpr) && __cplusplus >= __cpp_if_constexpr
#define CONST_IF(...) if constexpr(__VA_ARGS__)
#else
#define CONST_IF(...) if(__VA_ARGS__)
#endif
#endif

// Extrapolated from the 32-bit method at https://github.com/lemire/fastmod and its accompanying paper
// Method for 64-bit integers developed and available at https://github.com/dnbaker/fastmod

namespace schism {
using std::uint32_t;
using std::uint64_t;

// Precompute the 128-bit magic constant for dividing 64-bit values by d.
// d == 0 divides by zero and d == 1 wraps the constant to 0, so callers must use d > 1.
static inline __uint128_t computeM_u64(uint64_t d) {
    return (__uint128_t(-1) / d) + 1;
}

// Return the top 64 bits of the 192-bit product lowbits * d.
static inline uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) {
    __uint128_t bottom_half = (lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow
    bottom_half >>= 64;  // Only need the top 64 bits, as we'll shift the lower half away;
    __uint128_t top_half = (lowbits >> 64) * d;
    __uint128_t both_halves = bottom_half + top_half; // Both halves are already shifted down by 64
    return (both_halves >>= 64); // Get top half of both_halves
}
// fastdiv computes (a / d) given the precomputed 128-bit M for d
static inline uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
    return mul128_u64(M, a);
}

// fastmod computes (a % d) given the precomputed 128-bit M for d
static inline uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) {
    __uint128_t lowbits = M * a;
    return mul128_u64(lowbits, d);
}
// Precompute the 64-bit magic constant for dividing 32-bit values by d (d > 1).
static inline uint64_t computeM_u32(uint32_t d) {
    return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1;
}
static inline uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
    return ((__uint128_t)lowbits * d) >> 64;
}
// fastmod computes (a % d) given the precomputed 64-bit M for d
static inline uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) {
    uint64_t lowbits = M * a;
    return (uint32_t)(mul128_u32(lowbits, d));
}

// fastdiv computes (a / d) given precomputed M for d>1
static inline uint32_t fastdiv_u32(uint32_t a, uint64_t M) {
    return (uint32_t)(mul128_u32(M, a));
}

// Quotient/remainder pair that can also be viewed as a std::pair<T, T>.
template<typename T> struct div_t {
    T quot;
    T rem;
    operator std::pair<T, T> &() {
        return *reinterpret_cast<std::pair<T, T> *>(this);
    }
    auto &first() {
        return this->quot;
    }
    auto &first() const {
        return this->quot;
    }
    auto &second() {
        return this->rem;
    }
    auto &second() const {
        return this->rem;
    }
    std::pair<T, T> to_pair() const {return std::make_pair(quot, rem);}
    // Was a const_cast to an unrelated type, which does not compile once the
    // operator is actually instantiated; mirror the non-const overload instead.
    operator const std::pair<T, T> &() const {
        return *reinterpret_cast<const std::pair<T, T> *>(this);
    }
};


// Divider by a fixed, runtime-chosen d. `shortcircuit` enables a cheaper 32-bit
// path whenever both the divisor and the operand fit in 32 bits.
template<typename T, bool shortcircuit=false>
struct Schismatic;
template<bool shortcircuit> struct Schismatic<uint64_t, shortcircuit> {
private:
    uint64_t d_;
    __uint128_t M_;
    uint64_t m32_;
    uint64_t &m32() {
        return m32_;
    }
    // We swap location here so that m32 can be 64-bit aligned.
public:
    const auto &d() const {return d_;}
    const uint64_t &m32() const {assert(shortcircuit); return m32_;}
    using DivType = div_t<uint64_t>;
    Schismatic(uint64_t d): d_(d), M_(computeM_u64(d)), m32_(0) {
        // The 32-bit magic is only meaningful (and only ever used, see test_limits)
        // when d fits in 32 bits. Computing it from a truncated d could even divide
        // by zero, e.g. for d == 2^32.
        if(shortcircuit && d <= std::numeric_limits<uint32_t>::max()) {
            m32_ = computeM_u32(static_cast<uint32_t>(d));
        }
    }
    // True when both divisor and operand fit the 32-bit fast path.
    INLINE bool test_limits(uint64_t v) const {
        assert(shortcircuit);
        static constexpr uint64_t threshold = std::numeric_limits<uint32_t>::max();
        return d_ <= threshold && v <= threshold;
    }
    INLINE uint64_t div(uint64_t v) const {
        if(shortcircuit) {
            // Wide fallback must use the 128-bit magic M_, not the 32-bit one.
            return test_limits(v) ? uint64_t(fastdiv_u32(v, m32_)): fastdiv_u64(v, M_);
        }
        return fastdiv_u64(v, M_);
    }
    INLINE uint64_t mod(uint64_t v) const {
        if(shortcircuit)
            // Wide fallback must use the 128-bit magic M_, not the 32-bit one.
            return test_limits(v) ? uint64_t(fastmod_u32(v, m32_, d_)): fastmod_u64(v, M_, d_);
        return fastmod_u64(v, M_, d_);
    }
    INLINE div_t<uint64_t> divmod(uint64_t v) const {
        auto d = div(v);
        return div_t<uint64_t> {d, v - d_ * d};
    }
};
template<> struct Schismatic<uint32_t> {
    const uint32_t d_;
    const uint64_t M_;
    Schismatic(uint32_t d): d_(d), M_(computeM_u32(d)) {}
    auto d() const {return d_;}
    INLINE uint32_t div(uint32_t v) const {return fastdiv_u32(v, M_);}
    INLINE uint32_t mod(uint32_t v) const {return fastmod_u32(v, M_, d_);}
    INLINE div_t<uint32_t> divmod(uint32_t v) const {
        auto tmpd = div(v);
        return div_t<uint32_t> {tmpd, v - d_ * tmpd};
    }
};
// Signed element types reuse the unsigned implementations.
template<> struct Schismatic<int32_t>: Schismatic<uint32_t> {
    template<typename...Args> Schismatic(Args &&...args):
        Schismatic<uint32_t>(std::forward<Args>(args)...){}
};
template<> struct Schismatic<int64_t>: Schismatic<uint64_t> {
    template<typename...Args> Schismatic(Args &&...args):
        Schismatic<uint64_t>(std::forward<Args>(args)...){}
};

} // namespace schism
// Thrown by precondition_require / PREC_REQ when a stated precondition does not hold.
class UnsatisfiedPreconditionError: public std::runtime_error {
public:
    UnsatisfiedPreconditionError(std::string msg): std::runtime_error(std::string("Unsatisfied precondition: ") + msg) {}

    UnsatisfiedPreconditionError(): std::runtime_error("Unsatisfied precondition.") {}
};


// Throws UnsatisfiedPreconditionError (message `s`, plus the error code when it is
// non-zero) if `condition` is false; otherwise returns `ec` unchanged.
static int precondition_require(bool condition, std::string s, int ec=0) {
    if(!condition) {
        if(ec) throw UnsatisfiedPreconditionError(s + " Error code: " + std::to_string(ec));
        else   throw UnsatisfiedPreconditionError(s);
    }
    return ec;
}

// Thrown by postcondition_require / POST_REQ when a stated postcondition does not hold.
// The messages used to read "Unsatisfied precondition", which made post- and
// precondition failures indistinguishable in logs.
class UnsatisfiedPostconditionError: public std::runtime_error {
public:
    UnsatisfiedPostconditionError(std::string msg): std::runtime_error(std::string("Unsatisfied postcondition: ") + msg) {}

    UnsatisfiedPostconditionError(): std::runtime_error("Unsatisfied postcondition.") {}
};

// Throws UnsatisfiedPostconditionError (message `s`, plus the error code when it is
// non-zero) if `condition` is false; otherwise returns `ec` unchanged.
static int postcondition_require(bool condition, std::string s, int ec=0) {
    if(!condition) {
        if(ec) throw UnsatisfiedPostconditionError(s + " Error code: " + std::to_string(ec));
        else   throw UnsatisfiedPostconditionError(s);
    }
    return ec;
}

// Convenience wrappers that append file, function, line and the failing expression text.
#define PREC_REQ_EC(condition, s, ec) \
    ::sketch::exception::precondition_require(condition, std::string(s) + '[' + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"', ec)
#define PREC_REQ(condition, s) PREC_REQ_EC(condition, s, 0)
#define POST_REQ_EC(condition, s, ec) \
    ::sketch::exception::postcondition_require(condition, std::string(s) + '[' + __FILE__ + '|' + __PRETTY_FUNCTION__ + "|#L" + std::to_string(__LINE__) + "] Failing condition: \"" + #condition + '"', ec)
#define POST_REQ(condition, s) POST_REQ_EC(condition, s, 0)
will be deprecated.\n", __PRETTY_FUNCTION__, __LINE__, __FILE__, #condition) 63 | 64 | class ZlibError: public std::runtime_error { 65 | static const char *es(int c) { 66 | #ifndef Z_NEED_DICT 67 | #define Z_NEED_DICT 2 68 | #define UNDEF_Z_NEED_DICT 69 | #endif 70 | static constexpr const char * const z_errmsg[10] = { 71 | (const char *)"need dictionary", /* Z_NEED_DICT 2 */ 72 | (const char *)"stream end", /* Z_STREAM_END 1 */ 73 | (const char *)"", /* Z_OK 0 */ 74 | (const char *)"file error", /* Z_ERRNO (-1) */ 75 | (const char *)"stream error", /* Z_STREAM_ERROR (-2) */ 76 | (const char *)"data error", /* Z_DATA_ERROR (-3) */ 77 | (const char *)"insufficient memory", /* Z_MEM_ERROR (-4) */ 78 | (const char *)"buffer error", /* Z_BUF_ERROR (-5) */ 79 | (const char *)"incompatible version",/* Z_VERSION_ERROR (-6) */ 80 | (const char *)"" 81 | }; 82 | c = Z_NEED_DICT - c; 83 | return c >= 0 ? z_errmsg[c]: "no message"; 84 | #ifdef UNDEF_Z_NEED_DICT 85 | #undef UNDEF_Z_NEED_DICT 86 | #undef Z_NEED_DICT 87 | #endif 88 | } 89 | public: 90 | ZlibError(int ze, std::string s): std::runtime_error(std::string("zlibError [") + es(ze) + "]" + s) {} 91 | ZlibError(std::string s): ZlibError(Z_ERRNO, s) {} 92 | }; 93 | 94 | #ifdef __CUDACC__ 95 | struct CudaError: public std::runtime_error { 96 | public: 97 | CudaError(cudaError_t ce, std::string s): std::runtime_error(std::string("cudaError_t [") + cudaGetErrorString(ce) + "]" + s) {} 98 | }; 99 | #endif // __CUDACC__ 100 | } // exception 101 | 102 | } // sketch 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/filterhll.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_HLL_H__ 2 | #define FILTER_HLL_H__ 3 | #include "common.h" 4 | #include "hll.h" 5 | #include "cbf.h" 6 | 7 | namespace sketch { 8 | inline namespace fhll { 9 | 10 | template 11 | class fhllbase_t { 12 | using 
cbf_t = bf::cbfbase_t; 13 | using hll_t = hll::hllbase_t; 14 | cbf_t cbf_; 15 | hll_t hll_; 16 | unsigned threshold_; 17 | public: 18 | fhllbase_t(unsigned np_, size_t nbfs, size_t l2sz, unsigned nhashes, uint64_t seedseedseedval, 19 | unsigned threshold, hll::EstimationMethod estim=hll::ERTL_MLE, hll::JointEstimationMethod jestim=hll::ERTL_JOINT_MLE): 20 | cbf_(nbfs, l2sz, nhashes, seedseedseedval), hll_(np_, estim, jestim), threshold_(threshold) { 21 | if(threshold > (1u << (nbfs - 1))) throw std::runtime_error("Count threshold must be countable-to"); 22 | } 23 | void addh(uint64_t val) { 24 | cbf_.addh(val); // This wastes one check in bf1. TODO: elide this. 25 | if(cbf_.est_count(val) >= threshold_) hll_.addh(val); 26 | } 27 | void addh(VType val) { 28 | cbf_.addh(val); // This wastes one check in bf1. TODO: elide this. 29 | val.for_each([&](uint64_t val){if(cbf_.est_count(val) >= threshold_) hll_.addh(val);}); 30 | } 31 | void clear() { 32 | hll_.clear(); 33 | cbf_.clear(); 34 | } 35 | void set_threshold(unsigned threshold) {threshold_ = threshold;} 36 | void resize_bloom(unsigned newnp) { 37 | cbf_.resize_sketches(newnp); 38 | } 39 | auto threshold() const {return threshold_;} 40 | void not_ready() {hll_.not_ready();} 41 | hll_t &hll() {return hll_;} 42 | const hll_t &hll() const {return hll_;} 43 | void free_cbf() {cbf_.free();} 44 | void free_hll() {hll_.free();} 45 | void reseed(uint64_t seed) { 46 | cbf_.reseed(seed); 47 | } 48 | fhllbase_t(const fhllbase_t&) = default; 49 | fhllbase_t clone(uint64_t seed=0) const { 50 | auto ret = fhllbase_t(*this); 51 | ret.clear(); 52 | ret.reseed(seed ? 
seed: (uint64_t(std::rand()) << 32) | std::rand()); 53 | } 54 | }; 55 | using fhll_t = fhllbase_t<>; 56 | 57 | 58 | template 59 | class pcbfhllbase_t { 60 | using hll_t = hll::hllbase_t; 61 | bf::pcbfbase_t pcb_; 62 | hll_t hll_; 63 | unsigned threshold_; 64 | uint64_t seedseedseedval_; 65 | public: 66 | pcbfhllbase_t(unsigned filternp_, unsigned subnp_, size_t nbfs, size_t l2sz, unsigned nhashes, uint64_t seedseedseedval, 67 | unsigned threshold, hll::EstimationMethod estim=hll::ERTL_MLE, hll::JointEstimationMethod jestim=hll::ERTL_JOINT_MLE, bool shrinkpow2=true): 68 | pcb_(nbfs, l2sz, nhashes, seedseedseedval, subnp_, estim, jestim, shrinkpow2), 69 | hll_(filternp_, estim, jestim), threshold_{threshold}, seedseedseedval_(seedseedseedval) 70 | { 71 | if(threshold > (1u << (pcb_.size() - 1))) throw std::runtime_error("Count threshold must be countable-to"); 72 | } 73 | void addh(uint64_t val) { 74 | pcb_.addh(val); // This wastes a check. TODO: elide this. 75 | if(pcb_.est_count(val) >= threshold_) hll_.addh(val); 76 | } 77 | void addh(VType val) { 78 | pcb_.addh(val); // This wastes a check. TODO: elide this. 79 | val.for_each([&](uint64_t val){if(pcb_.est_count(val) >= threshold_) hll_.addh(val);}); 80 | } 81 | void reseed(uint64_t newseed) { 82 | seedseedseedval_ = newseed; 83 | pcb_.reseed(newseed); 84 | } 85 | void clear() { 86 | hll_.clear(); 87 | pcb_.clear(); 88 | } 89 | void resize_bloom(unsigned newsize) { 90 | pcb_.resize_bloom(newsize); 91 | } 92 | void set_threshold(unsigned threshold) {threshold_ = threshold;} 93 | auto threshold() const {return threshold_;} 94 | void not_ready() {hll_.not_ready();} 95 | hll_t &hll() {return hll_;} 96 | const hll_t &hll() const {return hll_;} 97 | pcbfhllbase_t(const pcbfhllbase_t &other) = default; 98 | pcbfhllbase_t clone(uint64_t seed=0) const { 99 | auto ret = pcbfhllbase_t(*this); 100 | ret.clear(); 101 | ret.reseed(seed ? 
#ifndef CONST_IF
#if __cplusplus >= 201703L
#define CONST_IF(...) if constexpr(__VA_ARGS__)
#else
#define CONST_IF(...) if(__VA_ARGS__)
#endif
#endif

namespace fixed {

// Minimal owning array for trivially destructible element types, backed by
// malloc (or posix_memalign when `aln` is non-zero) and released with free.
// NOTE(review): the template parameter list is not visible in this view; it is
// reconstructed from the uses of `T` and `aln` below — confirm against the real header.
template<typename T, size_t aln=0>
class vector {
    static_assert(std::is_trivially_destructible<T>::value, "T must not have a destructor to call");
    T *data_;
    size_t n_;
public:
    using value_type = T;
    // Allocate uninitialised storage for `nelem` elements; throws std::bad_alloc on failure.
    static T *allocate(size_t nelem) {
        void *ret = nullptr;
        const size_t nb = nelem * sizeof(T);
        CONST_IF(aln != 0) {
            if(posix_memalign(&ret, aln, nb)) {
                throw std::bad_alloc();
            }
        } else {
            // malloc(0) may legitimately return nullptr; only a failed non-empty request is an error.
            if((ret = std::malloc(nb)) == nullptr && nb != 0) {
                throw std::bad_alloc();
            }
        }
        return static_cast<T *>(ret);
    }
    // Range constructor. Constrained to real iterators so that calls such as
    // vector<int>(3, 1) select the (count, value) constructor instead of failing here.
    template<typename It, typename = typename std::iterator_traits<It>::iterator_category>
    vector(It i1, It i2): data_(allocate(std::distance(i1, i2))), n_(std::distance(i1, i2)) {
        std::copy(i1, i2, data_);
    }
    vector(size_t n, T initial_value=T()): data_(allocate(n)), n_(n) {
        std::fill_n(data_, n_, initial_value);
    }
    ~vector() {std::free(data_);}
    vector(): data_(nullptr), n_(0) {}
    vector &operator=(const vector &o) {
        if(this == std::addressof(o)) return *this;
        // Fresh allocation instead of realloc: realloc does not keep posix_memalign
        // alignment and may return nullptr for a zero-sized request.
        T *tmp = allocate(o.n_);
        std::copy(o.data_, o.data_ + o.n_, tmp);
        std::free(data_);
        data_ = tmp;
        n_ = o.n_;
        return *this;
    }
    // Shrinking only lowers the size; growing reallocates, keeps the existing
    // elements and fills the new tail with `initial_value`.
    void resize(size_t newsize, const T initial_value=T()) {
        if(newsize <= n_) {
            n_ = newsize;
            return;
        }
        T *tmp = allocate(newsize);
        std::copy(data_, data_ + n_, tmp);
        std::fill_n(tmp + n_, newsize - n_, initial_value);
        std::free(data_);   // previously leaked
        data_ = tmp;
        n_ = newsize;       // previously left at the old size
    }
    vector &operator=(vector &&o) {
        if(this != std::addressof(o)) {
            std::free(data_);
            data_ = o.data_;
            n_ = o.n_;
            // Detach the source so its destructor does not free the buffer we now own.
            o.data_ = nullptr;
            o.n_ = 0;
        }
        return *this;
    }
    vector(vector &&o): data_(o.data_), n_(o.n_) {
        o.data_ = nullptr;
        o.n_ = 0;
    }
    vector(const vector &o): data_(allocate(o.n_)), n_(o.n_) {
        std::copy(o.begin(), o.end(), data_);
    }
    auto begin() {return data_;}
    auto begin() const {return data_;}
    auto end() {return data_ + n_;}
    auto end() const {return data_ + n_;}
    auto size() const {return n_;}
    T &operator[](size_t k) {return data_[k];}
    const T &operator[](size_t k) const {return data_[k];}
    const T *data() const {return data_;}
    T *data() {return data_;}
    T &front() {return data_[0];}
    const T &front() const {return data_[0];}
    T &back() {return data_[n_ - 1];}
    const T &back() const {return data_[n_ - 1];}
    void fill(const T val) {
        std::fill(this->begin(), this->end(), val);
    }
    bool operator<(const vector &o) const {
        return std::lexicographical_compare(begin(), end(), o.begin(), o.end());
    }
    bool operator>(const vector &o) const {
        return std::lexicographical_compare(begin(), end(), o.begin(), o.end(), std::greater<T>());
    }
};

}
#include 6 | #include 7 | #include "aesctr/wy.h" 8 | #include 9 | 10 | namespace fisher_yates { 11 | 12 | using std::size_t; 13 | 14 | struct LazyShuffler { 15 | // Algorithm 6, https://arxiv.org/pdf/1911.00675.pdf 16 | // Uses 32-bit integers for cheaper modulo reductions, 17 | // and uses the fastmod https://arxiv.org/abs/1902.01961 trick 18 | using IT = uint32_t; 19 | private: 20 | std::vector data_; 21 | wy::WyRand rng_; 22 | size_t i_ = 0, c_ = 0, sz_; 23 | std::vector> divs_; 24 | 25 | 26 | IT &getg(size_t i) {return data_[i << 1];} 27 | IT &getv(size_t i) {return data_[(i << 1) + 1];} 28 | public: 29 | LazyShuffler(size_t n, uint64_t seed=0): data_(n * 2), rng_(seed), sz_(n) { 30 | divs_.reserve(n); 31 | for(size_t i = 0; i < n; ++i) 32 | divs_.emplace_back(n - i); 33 | reset(); 34 | } 35 | size_t size() const {return sz_;} 36 | IT step() { 37 | IT samp = divs_[i_].mod(rng_()); 38 | IT j = i_ + samp; 39 | assert(j < size()); 40 | IT &gj = getg(j); 41 | const IT &gi = getg(i_); 42 | IT &vj = getv(j); 43 | const IT k = vj == c_ ? gj: j; 44 | gj = getv(i_) == c_ ? 
gi: i_; 45 | vj = c_; 46 | ++i_; 47 | return k; 48 | } 49 | bool has_next() {return i_ < sz_;} 50 | void seed(uint64_t seed) { 51 | rng_.seed(seed); 52 | } 53 | void resize(size_t newsize, uint64_t seed=0) { 54 | data_.resize(newsize * 2); 55 | std::fill(data_.begin(), data_.end(), IT(0)); 56 | divs_.clear(); 57 | for(size_t i = 0; i < newsize; ++i) 58 | divs_.emplace_back(newsize - i); 59 | rng_.seed(seed); 60 | reset(); 61 | } 62 | void reset() { 63 | i_ = 0; 64 | ++c_; 65 | } 66 | }; 67 | 68 | } // namespace fisher_yates 69 | 70 | namespace fy = fisher_yates; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/hbb.h: -------------------------------------------------------------------------------- 1 | #ifndef HYPERBITBIT_H__ 2 | #define HYPERBITBIT_H__ 3 | #include "sketch/common.h" 4 | #include "sketch/hash.h" 5 | 6 | namespace sketch { 7 | 8 | inline namespace hbb { 9 | 10 | /** 11 | * 12 | * HyperBitBit algorithm (c/o Sedgewick) from 13 | * https://www.cs.princeton.edu/~rs/talks/AC11-Cardinality.pdf 14 | * Based on https://github.com/thomasmueller/tinyStats/blob/master/src/main/java/org/tinyStats/cardinality/HyperBitBit.java 15 | */ 16 | template 17 | class HyperBitBit { 18 | 19 | uint32_t logn_; 20 | uint64_t s1_, s2_; 21 | HashStruct hf_; 22 | public: 23 | uint64_t hash(uint64_t item) const {return hf_(item);} 24 | template 25 | HyperBitBit(Args &&...args): logn_(5), s1_(0), s2_(0), hf_(std::forward(args)...) 
{} 26 | 27 | void addh(uint64_t item) {add(hash(item));} 28 | void add(uint64_t hv) { 29 | unsigned r = ctz(hv); 30 | if(r > logn_) { 31 | const auto k = (hv >> (sizeof(hv) * CHAR_BIT - 6)); 32 | const auto bit = 1uL << k; 33 | s1_ |= bit; 34 | if (r > logn_ + 1u) s2_ |= bit; 35 | if(popcount(s1_) > 31) 36 | s1_ = s2_, s2_ = 0, ++logn_; 37 | } 38 | } 39 | 40 | double cardinality_estimate() const { 41 | //std::fprintf(stderr, "pcsum for this: %g\n", logn_ + 5.15 + popcount(s1_) / 32.); 42 | return std::pow(2., (logn_ + 5.8 + popcount(s1_) / 32.)); 43 | } 44 | double report() const {return cardinality_estimate();} 45 | }; 46 | 47 | struct HyperHyperBitBitSimple { 48 | std::vector> data_; 49 | std::vector seeds_; 50 | HyperHyperBitBitSimple(size_t n): data_(n), seeds_(n) { 51 | std::mt19937_64 mt(n); 52 | for(auto &i: seeds_) i = mt(); 53 | } 54 | void addh(uint64_t x) { 55 | add(x); 56 | } 57 | void add(uint64_t x) { 58 | data_[x % data_.size()].addh(hash::WangHash()(x ^ seeds_[x % data_.size()])); 59 | } 60 | double report() { 61 | double estsum = 0.; 62 | double harmestsum = 0.; 63 | std::vector v; 64 | for(const auto &i: data_) { 65 | auto r = i.report(); 66 | estsum += r; 67 | harmestsum += 1. / r; 68 | v.push_back(r); 69 | } 70 | std::sort(v.begin(), v.end()); 71 | double oret = .5 * (v[v.size() / 2] + v[(v.size() - 1) / 2]); 72 | fprintf(stderr, "median: %g\n", oret); 73 | harmestsum = data_.size() / harmestsum; 74 | std::fprintf(stderr, "total sum: %g. 
harmestsum: %g\n", estsum, harmestsum); 75 | return estsum; 76 | } 77 | }; 78 | struct HyperHyperBitBit { 79 | using lntype = uint32_t; 80 | uint32_t nelem_; 81 | std::unique_ptr logns_; 82 | std::unique_ptr s1s_, s2s_; 83 | HyperHyperBitBit(size_t n): nelem_(n), logns_(new lntype[n]), s1s_(new uint64_t[nelem_]()), s2s_(new uint64_t[nelem_]()) { 84 | std::fill_n(logns_.get(), nelem_, uint32_t(5)); 85 | std::fill_n(s1s_.get(), nelem_, uint64_t(0)); 86 | std::fill_n(s2s_.get(), nelem_, uint64_t(0)); 87 | } 88 | void addh(uint64_t x) { 89 | wy::wyhash64_stateless(&x); 90 | return add(x); 91 | } 92 | void add(uint64_t v) { 93 | auto idx = v % nelem_; 94 | v /= nelem_; 95 | auto r = ctz(v); 96 | auto &logn = logns_[idx]; 97 | if(r > logn) { 98 | auto bit = uint64_t(1) << ((v>>(r + 1))%64); 99 | auto &sketch = s1s_[idx], sketch2 = s2s_[idx]; 100 | sketch |= bit; 101 | if(r > logn + 1) { 102 | sketch2 |= bit; 103 | } 104 | if(popcount(sketch) > 31) { 105 | sketch = sketch2; 106 | sketch2 = 0; 107 | ++logn; 108 | } 109 | } 110 | } 111 | double report() const { 112 | double pcsum = 0; 113 | double est_sums = 0, ies = 0., hes = 0.; 114 | for(size_t i = 0; i < nelem_; ++i) { 115 | double cinc = popcount(s1s_[i]) / 32. + 6.43 + logns_[i]; 116 | pcsum += cinc; 117 | est_sums += std::pow(2., cinc); 118 | ies += 1. / std::pow(2., cinc); 119 | hes += 1. / cinc; 120 | } 121 | ies = nelem_ * nelem_ / ies; 122 | hes = nelem_ / hes; 123 | std::fprintf(stderr, "est sum: %g. iestsum: %g. 
hestsum: %g\n", est_sums, ies, std::pow(2., hes)); 124 | std::fprintf(stderr, "pcsum before: %g\n", pcsum); 125 | pcsum /= nelem_; 126 | std::fprintf(stderr, "pcsum: %g\n", pcsum); 127 | return ies; 128 | } 129 | }; 130 | 131 | } // hbb 132 | } // sketch 133 | 134 | #endif /* HYPERBITBIT_H__ */ 135 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/isz.h: -------------------------------------------------------------------------------- 1 | #ifndef ISZ_H__ 2 | #define ISZ_H__ 3 | #include 4 | 5 | namespace sketch { 6 | namespace isz { 7 | template> 8 | std::uint64_t intersection_size(const Container &c1, const Container &c2, const Cmp &cmp=Cmp()) { 9 | // These containers must be sorted. 10 | //static_assert(std::is_same::value, "Containers must derefernce to the same type."); 11 | //for(const auto v: c1) std::fprintf(stderr, "element is %zu\n", size_t(v)); 12 | assert(std::is_sorted(c2.begin(), c2.end(), cmp)); 13 | assert(std::is_sorted(c1.begin(), c1.end(), cmp)); 14 | auto it1 = std::begin(c1); 15 | auto it2 = std::begin(c2); 16 | const auto e1 = std::cend(c1); 17 | const auto e2 = std::cend(c2); 18 | if(it1 == e1 || it2 == e2) return 0; 19 | std::uint64_t ret = 0; 20 | FOREVER { 21 | if(*it1 == *it2) { // Easily predicted 22 | ++ret; 23 | if(++it1 == e1 || ++it2 == e2) break; 24 | } else if(cmp(*it1, *it2)) { 25 | if(++it1 == e1) break; 26 | } else { 27 | if(++it2 == e2) break; 28 | } 29 | } 30 | return ret; 31 | } 32 | } // common 33 | } // sketch 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/kthread.h: -------------------------------------------------------------------------------- 1 | #ifndef KTHREAD_H 2 | #define KTHREAD_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); 9 | void kt_pipeline(int 
n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); 10 | 11 | void *kt_forpool_init(int n_threads); 12 | void kt_forpool_destroy(void *_fp); 13 | void kt_forpool(void *_fp, void (*func)(void*,long,int), void *data, long n); 14 | 15 | #ifdef __cplusplus 16 | } 17 | #endif 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef SKETCH_MACROS_H__ 3 | #define SKETCH_MACROS_H__ 4 | #include "hedley.h" 5 | 6 | 7 | // INLINE 8 | #ifndef INLINE 9 | # if __GNUC__ || __clang__ 10 | # define INLINE __attribute__((always_inline)) inline 11 | # else 12 | # define INLINE inline 13 | # endif 14 | #endif 15 | 16 | // unlikely/likely 17 | #ifndef unlikely 18 | # if defined(__GNUC__) || defined(__INTEL_COMPILER) 19 | # define unlikely(x) HEDLEY_UNLIKELY((x)) 20 | # else 21 | # define unlikely(x) (x) 22 | # endif 23 | #endif 24 | 25 | #ifndef likely 26 | # if defined(__GNUC__) || defined(__INTEL_COMPILER) 27 | # define likely(x) HEDLEY_LIKELY(!!(x)) 28 | # else 29 | # define likely(x) (x) 30 | # endif 31 | #endif 32 | 33 | 34 | // OpenMP 35 | 36 | #ifdef _OPENMP 37 | # ifndef OMP_PRAGMA 38 | # define OMP_PRAGMA(x) _Pragma(x) 39 | # endif 40 | # ifndef OMP_ONLY 41 | # define OMP_ONLY(...) 
__VA_ARGS__ 42 | # endif 43 | # ifndef OMP_PFOR 44 | # define OMP_PFOR OMP_PRAGMA("omp parallel for") 45 | # endif 46 | # ifndef OMP_PFOR_DYN 47 | # define OMP_PFOR_DYN OMP_PRAGMA("omp parallel for schedule(dynamic)") 48 | # endif 49 | # ifndef OMP_ELSE 50 | # define OMP_ELSE(x, y) x 51 | # endif 52 | # ifndef OMP_ATOMIC 53 | # define OMP_ATOMIC OMP_PRAGMA("omp atomic") 54 | # endif 55 | # ifndef OMP_CRITICAL 56 | # define OMP_CRITICAL OMP_PRAGMA("omp critical") 57 | # endif 58 | # ifndef OMP_SECTIONS 59 | # define OMP_SECTIONS OMP_PRAGMA("omp sections") 60 | # endif 61 | # ifndef OMP_SECTION 62 | # define OMP_SECTION OMP_PRAGMA("omp section") 63 | # endif 64 | # ifndef OMP_BARRIER 65 | # define OMP_BARRIER OMP_PRAGMA("omp barrier") 66 | # endif 67 | # ifndef OMP_SET_NT 68 | # define OMP_SET_NT(x) omp_set_num_threads(x) 69 | # endif 70 | #else 71 | # ifndef OMP_PRAGMA 72 | # define OMP_PRAGMA(x) 73 | # endif 74 | # ifndef OMP_ONLY 75 | # define OMP_ONLY(...) 76 | # endif 77 | # ifndef OMP_ELSE 78 | # define OMP_ELSE(x, y) y 79 | # endif 80 | # ifndef OMP_PFOR 81 | # define OMP_PFOR 82 | # endif 83 | # ifndef OMP_PFOR_DYN 84 | # define OMP_PFOR_DYN 85 | # endif 86 | # ifndef OMP_ATOMIC 87 | # define OMP_ATOMIC 88 | # endif 89 | # ifndef OMP_CRITICAL 90 | # define OMP_CRITICAL 91 | # endif 92 | # ifndef OMP_SECTIONS 93 | # define OMP_SECTIONS 94 | # endif 95 | # ifndef OMP_SECTION 96 | # define OMP_SECTION 97 | # endif 98 | # ifndef OMP_BARRIER 99 | # define OMP_BARRIER 100 | # endif 101 | # ifndef OMP_SET_NT 102 | # define OMP_SET_NT(x) 103 | # endif 104 | #endif 105 | 106 | 107 | #ifndef SK_RESTRICT 108 | # if __CUDACC__ || __GNUC__ || __clang__ 109 | # define SK_RESTRICT __restrict__ 110 | # elif _MSC_VER 111 | # define SK_RESTRICT __restrict 112 | # else 113 | # define SK_RESTRICT 114 | # endif 115 | #endif 116 | 117 | #ifdef __CUDA_ARCH__ 118 | # define CUDA_ARCH_ONLY(...) __VA_ARGS__ 119 | # define HOST_ONLY(...) 120 | #else 121 | # define CUDA_ARCH_ONLY(...) 
122 | # define HOST_ONLY(...) __VA_ARGS__ 123 | #endif 124 | 125 | #ifdef __CUDACC__ 126 | # define CUDA_PRAGMA(x) _Pragma(x) 127 | # define CUDA_ONLY(...) __VA_ARGS__ 128 | #else 129 | # define CUDA_PRAGMA(x) 130 | # define CUDA_ONLY(...) 131 | #endif 132 | 133 | #define CPP_PASTE(...) sk__xstr__(__VA_ARGS__) 134 | #define CPP_PASTE_UNROLL(...) sk__xstr__("unroll" __VA_ARGS__) 135 | 136 | 137 | #ifndef THREADSAFE_ELSE 138 | # ifndef NOT_THREADSAFE 139 | # define THREADSAFE_ELSE(x, y) x 140 | # define THREADSAFE_ONLY(...) __VA_ARGS__ 141 | # else 142 | # define THREADSAFE_ELSE(x, y) y 143 | # define THREADSAFE_ONLY(...) 144 | # endif 145 | #endif 146 | 147 | 148 | #if !NDEBUG 149 | # define DBG_ONLY(...) __VA_ARGS__ 150 | # define DBG_ELSE(x, y) x 151 | #else 152 | # define DBG_ONLY(...) 153 | # define DBG_ELSE(x, y) y 154 | #endif 155 | 156 | #if VERBOSE_AF 157 | # define VERBOSE_ONLY(...) __VA_ARGS__ 158 | #else 159 | # define VERBOSE_ONLY(...) 160 | #endif 161 | 162 | #ifndef FOREVER 163 | # define FOREVER for(;;) 164 | #endif 165 | 166 | #ifndef SK_UNROLL 167 | # define SK_UNROLL _Pragma("message \"The macro, it does nothing\"") 168 | // Don't use SK_UNROLL, it only tells you if these below macros are defined. 
169 | # if defined(__GNUC__) && !defined(__clang__) 170 | # define SK_UNROLL_4 _Pragma("GCC unroll 4") 171 | # define SK_UNROLL_8 _Pragma("GCC unroll 8") 172 | # define SK_UNROLL_16 _Pragma("GCC unroll 16") 173 | # define SK_UNROLL_32 _Pragma("GCC unroll 32") 174 | # define SK_UNROLL_64 _Pragma("GCC unroll 64") 175 | # elif defined(__CUDACC__) || defined(__clang__) 176 | # define SK_UNROLL_4 _Pragma("unroll 4") 177 | # define SK_UNROLL_8 _Pragma("unroll 8") 178 | # define SK_UNROLL_16 _Pragma("unroll 16") 179 | # define SK_UNROLL_32 _Pragma("unroll 32") 180 | # define SK_UNROLL_64 _Pragma("unroll 64") 181 | # else 182 | # define SK_UNROLL_4 183 | # define SK_UNROLL_8 184 | # define SK_UNROLL_16 185 | # define SK_UNROLL_32 186 | # define SK_UNROLL_64 187 | # endif 188 | #endif 189 | 190 | #if defined(__has_cpp_attribute) && __cplusplus >= __has_cpp_attribute(no_unique_address) 191 | # define SK_NO_ADDRESS [[no_unique_address]] 192 | #else 193 | # define SK_NO_ADDRESS 194 | #endif 195 | 196 | #ifndef CONST_IF 197 | # if defined(__cpp_if_constexpr) && __cplusplus >= __cpp_if_constexpr 198 | # define CONST_IF(...) if constexpr(__VA_ARGS__) 199 | # else 200 | # define CONST_IF(...) 
if(__VA_ARGS__) 201 | # endif 202 | #endif 203 | 204 | #ifndef BLAZE_CHECK_DEBUG 205 | # ifndef NDEBUG 206 | # define BLAZE_CHECK_DEBUG 207 | # else 208 | # define BLAZE_CHECK_DEBUG , ::blaze::unchecked 209 | # endif 210 | #endif 211 | 212 | #endif /* SKETCH_MACROS_H__ */ 213 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/median.h: -------------------------------------------------------------------------------- 1 | #ifndef SK_MEDIAN_H 2 | #define SK_MEDIAN_H 3 | #include 4 | #include "macros.h" 5 | 6 | namespace sketch { 7 | inline namespace med { 8 | 9 | template 10 | INLINE constexpr T median3(T x, T y, T z) { 11 | using std::max; 12 | using std::min; 13 | return max(min(z, x), min(max(z, x), y)); 14 | } 15 | 16 | template 17 | INLINE constexpr T median3(const C &c) { 18 | return median3(c[0], c[1], c[2]); 19 | } 20 | 21 | template 22 | INLINE constexpr T median5(T x, T y, T z, T a, T b) { 23 | using std::max; 24 | using std::min; 25 | return median3(max(min(a, y), min(z, x)), 26 | min(max(a, y), max(z, x)), 27 | b); 28 | } 29 | template 30 | INLINE constexpr T median5(const C &c) { 31 | return median5(c[0], c[1], c[2], c[3], c[4]); 32 | } 33 | 34 | namespace detail { 35 | template 36 | inline void insertion_sort(Iter begin, Iter end, Compare comp) { 37 | using T = typename std::iterator_traits::value_type; 38 | 39 | for (Iter cur = begin + 1; cur < end; ++cur) { 40 | Iter sift = cur; 41 | Iter sift_1 = cur - 1; 42 | 43 | // Compare first so we can avoid 2 moves for an element already positioned correctly. 
44 | if (comp(*sift, *sift_1)) { 45 | T tmp = std::move(*sift); 46 | 47 | do { *sift-- = std::move(*sift_1); } 48 | while (sift != begin && comp(tmp, *--sift_1)); 49 | 50 | *sift = std::move(tmp); 51 | } 52 | } 53 | } 54 | template 55 | inline void insertion_sort(Iter begin, Iter end) { 56 | insertion_sort(begin, end, std::less>()); 57 | } 58 | } // detail 59 | template 60 | INLINE T median(T *v, size_t n) { 61 | static_assert(std::is_arithmetic::value, "must be arithmetic"); 62 | switch(n) { 63 | case 1: return v[0]; 64 | case 2: return (v[0] + v[1]) / 2; 65 | case 3: return median3(v[0], v[1], v[2]); 66 | case 5: return median5(v[0], v[1], v[2], v[3], v[4]); 67 | } 68 | if(n < 50) 69 | detail::insertion_sort(v, v + n); 70 | else 71 | #ifdef PDQSORT_H 72 | pdqsort(v, v + n); 73 | #else 74 | std::sort(v, v + n); 75 | #endif 76 | T ret; 77 | if(n&1) ret = v[n / 2]; 78 | else ret = (v[n / 2] + v[(n - 1) / 2]) / 2; 79 | return ret; 80 | } 81 | 82 | } //inline namespace med 83 | } // sketch 84 | #endif 85 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/pc.h: -------------------------------------------------------------------------------- 1 | #ifndef PROBCOUNTING_H__ 2 | #define PROBCOUNTING_H__ 3 | #include "sketch/integral.h" 4 | #include "sketch/hash.h" 5 | 6 | namespace sketch { 7 | 8 | namespace pc { 9 | 10 | namespace detail { 11 | template 12 | INLINE T R(T x) {return ~x & (x + 1);} 13 | template 14 | INLINE T r(T x) { 15 | //return ctz(~x); 16 | return popcount(R(x) - 1); 17 | } 18 | } // namespace detail 19 | 20 | template::value>> 21 | class ProbabilisticCounter { 22 | protected: 23 | T sketch_; 24 | public: 25 | ProbabilisticCounter(): sketch_(0) { 26 | } 27 | void add(uint64_t hv) { 28 | sketch_ |= detail::R(hv); 29 | } 30 | ProbabilisticCounter &operator|=(const ProbabilisticCounter &o) { 31 | sketch_ |= o.sketch_; 32 | return *this; 33 | } 34 | void addh(uint64_t item) { 35 | 
wy::wyhash64_stateless(&item); 36 | add(item); 37 | } 38 | double report() const { 39 | return detail::R(sketch_) * 1.292808; 40 | } 41 | T getregister() const {return sketch_;} 42 | }; 43 | 44 | template::value>> 45 | class PCSA { 46 | /* 47 | * Note: See https://arxiv.org/abs/2007.08051 48 | for recent theory on space/accuracy results. 49 | */ 50 | std::unique_ptr counters_; 51 | const size_t n_; 52 | public: 53 | PCSA(size_t n): counters_(new T[n]), n_(n) { 54 | std::memset(counters_.get(), 0, sizeof(T) * n_); 55 | } 56 | PCSA(const PCSA &o): counters_(new T[o.n_]), n_(o.n_) { 57 | std::memcpy(counters_.get(), o.counters_.get(), sizeof(T) * n_); 58 | } 59 | PCSA(PCSA &&o) = default; 60 | PCSA &operator|=(const PCSA &o) { 61 | for(unsigned i = 0; i < n_; ++i) counters_[i] |= o.counters_[i]; 62 | return *this; 63 | } 64 | void addh(uint64_t value) { 65 | add(value); 66 | } 67 | void add(uint64_t value) { 68 | auto ind = value % n_; 69 | value /= n_; 70 | counters_[ind] |= detail::R(value); 71 | } 72 | double report() const { 73 | CONST_IF(sizeof(T) == 4) { 74 | /* Notes: this could be accelerated for more cases. 75 | 1. Apply R(x) - 1 using SIMD 76 | 2. Apply popcount using SIMD 77 | 3. convert to floats 78 | 4. 
Accumulate into a result 79 | */ 80 | #if __AVX2__ 81 | __m256 sums = _mm256_setzero_ps(); 82 | static constexpr size_t nper = sizeof(__m256i) / 4; 83 | const size_t nsimd = n_ / nper; 84 | size_t i; 85 | for(i = 0; i < nsimd; ++i) { 86 | __m256i vals = _mm256_loadu_si256((const __m256i *)&counters_[i * nper]); 87 | // x = R(x) - 1 = (~x & (x + 1)) 88 | __m256i vx = _mm256_andnot_si256(vals, _mm256_add_epi32(vals, _mm256_set1_epi32(1))); 89 | // x = popcount(x) 90 | auto start = (uint32_t *)&vx; 91 | SK_UNROLL_8 92 | for(unsigned i = 0; i < nper; ++i) { 93 | start[i] = popcount(start[i]); 94 | } 95 | // x = vector of floats via popcount(x) 96 | // Acccumulate 97 | sums = _mm256_add_ps(sums, _mm256_cvtepi32_ps(vx)); 98 | } 99 | // Reduce 100 | double sum = 0.; 101 | for(unsigned i = 0; i < nper; ++i) sum += ((float *)&sums)[i]; 102 | for(i *= nper; i < n_; ++i) { 103 | sum += detail::r(counters_[i]); 104 | } 105 | sum /= n_; 106 | return n_ * 1.292808 * sum * sum; 107 | #endif 108 | } 109 | double mean = 110 | double(std::accumulate(counters_.get(), counters_.get() + n_, 0u, [](auto x, auto y) { 111 | return detail::r(y) + x; 112 | })) / n_; 113 | return n_ * 1.292808 * std::pow(2, mean); 114 | } 115 | }; 116 | 117 | } // pc 118 | using pc::PCSA; 119 | using pc::ProbabilisticCounter; 120 | 121 | } // sketch 122 | 123 | #endif /* PROBCOUNTING_H__ */ 124 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/pmh.h: -------------------------------------------------------------------------------- 1 | #ifndef P_MINHASH_H__ 2 | #define P_MINHASH_H__ 3 | #include "sseutil.h" 4 | #include "blaze/Math.h" 5 | #include "common.h" 6 | #include "aesctr/wy.h" 7 | 8 | namespace sketch { 9 | 10 | namespace jp { // British Steel 11 | 12 | template 13 | void maxify(T &x); 14 | 15 | template class TD; 16 | 17 | template 18 | void for_each_nonzero(const T &x, const Func &func) { 19 | #if VERBOSE_AF 20 | 
std::fprintf(stderr, "Using default %s\n", __PRETTY_FUNCTION__); 21 | #endif 22 | size_t i = 0; 23 | auto it = std::cbegin(x); 24 | while(it != std::cend(x)) { 25 | if(*it) 26 | func(i, *it); 27 | ++i; ++it; 28 | } 29 | } 30 | 31 | template 32 | void for_each_nonzero(const std::vector &x, const Func &func) { 33 | #if VERBOSE_AF 34 | std::fprintf(stderr, "Using vector %s\n", __PRETTY_FUNCTION__); 35 | #endif 36 | for(size_t i = 0; i < x.size(); ++i) 37 | if(x[i]) func(i, x[i]); 38 | } 39 | template class MatrixType, bool SO, bool B1, bool B2, bool B3> 40 | void for_each_nonzero(const blaze::Row, B1, B2, B3> &x, const Func &func) { 41 | #if VERBOSE_AF 42 | std::fprintf(stderr, "Using row %s\n", __PRETTY_FUNCTION__); 43 | #endif 44 | for(size_t i = 0; i < x.size(); ++i) 45 | if(x[i]) 46 | func(i, x[i]); 47 | } 48 | 49 | template 50 | void for_each_nonzero(const blaze::DynamicVector &x, const Func &func) { 51 | #if VERBOSE_AF 52 | std::fprintf(stderr, "Using dv %s\n", __PRETTY_FUNCTION__); 53 | #endif 54 | for(size_t i = 0; i < x.size(); ++i) 55 | if(x[i]) func(i, x[i]); 56 | } 57 | 58 | template 59 | void for_each_nonzero(const blaze::CompressedVector &x, const Func &func) { 60 | #if VERBOSE_AF 61 | std::fprintf(stderr, "Using blaze compressed matrix %s\n", __PRETTY_FUNCTION__); 62 | #endif 63 | for(const auto &el: x) { 64 | func(el.index(), el.value()); 65 | } 66 | } 67 | 68 | template class Map, typename K, typename V, typename Func> 69 | void for_each_nonzero(const Map &x, const Func &func) { 70 | #if VERBOSE_AF 71 | std::fprintf(stderr, "Using blaze compressed matrix %s\n", __PRETTY_FUNCTION__); 72 | #endif 73 | for(const auto &el: x) { 74 | func(el.first, el.second); 75 | } 76 | } 77 | 78 | 79 | template 80 | class PMinHasher { 81 | uint64_t *seeds_; 82 | size_t d_, n_; 83 | Hasher hf_; 84 | public: 85 | template 86 | PMinHasher(size_t dim, uint64_t nelem, uint64_t seed=137, Args &&...args): d_(dim), n_(nelem), hf_(std::forward(args)...) 
{ 87 | if(posix_memalign((void **)&seeds_, sizeof(Space::VType), nelem * sizeof(*seeds_))) 88 | throw std::bad_alloc(); 89 | DefaultRNGType rng(seed); 90 | std::for_each(seeds_, seeds_ + nelem, [&rng](uint64_t &x) {x = rng();}); 91 | } 92 | PMinHasher(PMinHasher &&o): seeds_(o.seed_), d_(o.d_), n_(o.n_), hf_(std::move(o.hf_)) { 93 | o.seeds_ = nullptr; o.d_ = o.n_ = 0; 94 | } 95 | PMinHasher(const PMinHasher &o): seeds_(nullptr), d_(o.d_), n_(o.n_), hf_(o.hf_) { 96 | if(posix_memalign((void **)&seeds_, sizeof(Space::VType), n_ * sizeof(*seeds_))) 97 | throw std::bad_alloc(); 98 | } 99 | ~PMinHasher() {std::free(seeds_);} 100 | template 101 | auto hash(T x, uint64_t seed) const { 102 | if(!x) return FType(0); 103 | static_assert(sizeof(x) >= 4, "must be at least 4 bytes"); 104 | seed ^= sizeof(x) == 8 ? *reinterpret_cast(&x): *reinterpret_cast(&x); 105 | wy::WyHash rng(seed); 106 | std::uniform_real_distribution gen; 107 | return -std::log(gen(rng)) / x; 108 | } 109 | template, typename FType=double> 110 | auto hash(RAContainer &vec) const { 111 | if(vec.size() != d_) throw std::runtime_error("Wrong dimensions"); 112 | using std::min; 113 | RetType ret(n_); 114 | for(auto &e: ret) e = n_; // To work with different containers 115 | std::vector cvals; 116 | std::vector nzs; 117 | for_each_nonzero(vec, [&](auto index, auto value) { 118 | nzs.push_back(index); 119 | cvals.resize(cvals.size() + n_); 120 | for(size_t j = 0; j < n_; ++j) { 121 | cvals[(nzs.size() - 1) * n_ + j] = this->hash(value, seeds_[j]); 122 | } 123 | }); 124 | assert(cvals.size() == nzs.size() * n_); 125 | for(size_t i = 0; i < n_; ++i) { 126 | size_t minind = 0; 127 | FType minval = cvals[i]; 128 | for(size_t j = 1; j < nzs.size(); ++j) { 129 | if(cvals[j * n_ + i] < minval) { 130 | minval = cvals[j * n_ + i]; 131 | minind = j; 132 | } 133 | } 134 | ret[i] = nzs[minind]; 135 | } 136 | return ret; 137 | } 138 | }; 139 | 140 | 141 | } // Screaming For Vengeance 142 | 143 | } // namespace sketch 144 | 
145 | #endif /* #ifndef P_MINHASH_H__ */ 146 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/policy.h: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_POLICY_H 2 | #define SKETCH_POLICY_H 3 | #include "integral.h" 4 | #include "div.h" 5 | namespace sketch { 6 | namespace policy { 7 | 8 | template 9 | struct SizePow2Policy { 10 | T mask_; 11 | T shift_; 12 | SizePow2Policy(size_t n): mask_((1ull << nelem2arg(n)) - 1), shift_(ilog2(mask_ + 1)) { 13 | } 14 | static size_t nelem2arg(size_t nelem) { 15 | // Return the floor of nelem, but increment by one if it wasn't a power of two. 16 | return ilog2(nelem) + ((nelem & (nelem - 1)) != 0); 17 | } 18 | size_t nelem() const {return size_t(mask_) + 1;} 19 | static size_t arg2vecsize(size_t arg) {return size_t(1) << nelem2arg(arg);} 20 | auto divmod(T rv) const {return schism::div_t{rv >> shift_, rv & mask_};} 21 | T mod(T rv) const { 22 | return rv & mask_; 23 | } 24 | T div(T rv) const { 25 | return rv >> shift_; 26 | } 27 | }; 28 | 29 | template 30 | struct SizeDivPolicy { 31 | schism::Schismatic div_; 32 | static size_t nelem2arg(size_t nelem) { 33 | return nelem; 34 | } 35 | size_t nelem() const {return div_.d();} 36 | static size_t arg2vecsize(size_t arg) {return arg;} 37 | T mod(T rv) const {return div_.mod(rv);} 38 | T div(T rv) const {return div_.div(rv);} 39 | auto divmod(T rv) const {return div_.divmod(rv);} 40 | SizeDivPolicy(T div): div_(div) {} 41 | }; 42 | 43 | } // policy 44 | } // sketch 45 | 46 | #endif /* SKETCH_POLICY_H */ 47 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/sketch.h: -------------------------------------------------------------------------------- 1 | #ifndef SKETCH_SINGLE_HEADER_H__ 2 | #define SKETCH_SINGLE_HEADER_H__ 3 | #include "./hll.h" 4 | #include "./bf.h" 5 | #include "./mh.h" 
6 | #include "./bbmh.h" 7 | #include "./ccm.h" 8 | #include "./cbf.h" 9 | #include "./mult.h" 10 | #include "./heap.h" 11 | #include "./filterhll.h" 12 | #include "./mult.h" 13 | #include "./sparse.h" 14 | #include "./dd.h" 15 | #include "./hk.h" 16 | #include "./vac.h" 17 | #include "./hbb.h" 18 | #include "./mod.h" 19 | #include "./setsketch.h" 20 | 21 | #ifdef __CUDACC__ 22 | #include "hllgpu.h" 23 | #endif 24 | 25 | namespace sketch { 26 | // Flatten all classes to global sketch namespace. 27 | // Subnamespaces can still be subsampled 28 | 29 | // Set representations 30 | using namespace hll; // HyperLogLog 31 | using namespace bf; // Bloom Filters 32 | using namespace minhash; // Minhash 33 | using namespace fhll; // Filtered HLLs 34 | 35 | // Multiplicities 36 | using namespace cws; // Consistent Weighted Sampling 37 | using namespace nt; // ntcard 38 | using namespace wj; // Weighted Jaccard adapters 39 | 40 | // Count point estimators 41 | using namespace hk; // Heavy-Keeper 42 | using namespace cm; // Count/Count-Min 43 | 44 | // Utilities 45 | using namespace heap; // Heap maintainers for multisets based on a variety of criteria 46 | using namespace vac; // Approximate Multiplicity samplers for streams 47 | } 48 | 49 | namespace sk = sketch; 50 | 51 | #endif /* SKETCH_SINGLE_HEADER_H__ */ 52 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/sseutil.h: -------------------------------------------------------------------------------- 1 | #ifndef __SSE_UTIL_H__ 2 | #define __SSE_UTIL_H__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace sse { 11 | 12 | #ifndef unlikely 13 | # if __GNUC__ || __clang__ || defined(BUILTIN_EXPECT_AVAILABLE) 14 | # define unlikely(x) HEDLEY_LIKELY((x) != 0, 0) 15 | # else 16 | # define unlikely(x) (x) 17 | # endif 18 | #endif 19 | 20 | #ifndef likely 21 | # if __GNUC__ || __clang__ || 
defined(BUILTIN_EXPECT_AVAILABLE) 22 | # define likely(x) HEDLEY_LIKELY((x) != 0, 1) 23 | # else 24 | # define likely(x) (x) 25 | # endif 26 | #endif 27 | 28 | // From http://stackoverflow.com/questions/12942548/making-stdvector-allocate-aligned-memory 29 | // Accessed 11/7/16 30 | enum class Alignment : size_t 31 | { 32 | Normal = sizeof(void*), 33 | SSE = 16, 34 | AVX = 32, 35 | KB = 64, 36 | KL = 64, 37 | AVX512 = 64 38 | }; 39 | 40 | 41 | #ifndef USE_ALIGNED_ALLOC 42 | # if (__cplusplus >= 201703L && defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)) 43 | # define USE_ALIGNED_ALLOC 1 44 | # else 45 | # define USE_ALIGNED_ALLOC 0 46 | # endif 47 | #endif 48 | 49 | namespace detail { 50 | static inline void* allocate_aligned_memory(const size_t align, size_t size) { 51 | assert(align >= sizeof(void*)); 52 | assert((align & (align - 1)) == 0); // Assert is power of two 53 | 54 | void *ret; 55 | return posix_memalign(&ret, align, size) ? nullptr: ret; 56 | } 57 | } 58 | 59 | 60 | template 61 | class AlignedAllocator; 62 | 63 | 64 | template 65 | class AlignedAllocator 66 | { 67 | public: 68 | using pointer = void *; 69 | using const_pointer = const void *; 70 | using value_type = void; 71 | 72 | template struct rebind { using other = AlignedAllocator; }; 73 | }; 74 | 75 | 76 | template 77 | class AlignedAllocator 78 | { 79 | public: 80 | typedef T value_type; 81 | typedef T* pointer; 82 | typedef const T* const_pointer; 83 | typedef T& reference; 84 | typedef const T& const_reference; 85 | typedef size_t size_type; 86 | typedef std::ptrdiff_t difference_type; 87 | 88 | typedef std::true_type propagate_on_container_move_assignment; 89 | 90 | template 91 | struct rebind { typedef AlignedAllocator other; }; 92 | 93 | public: 94 | AlignedAllocator() noexcept {} 95 | template 96 | AlignedAllocator(const AlignedAllocator&) noexcept {} 97 | 98 | static constexpr size_type max_size() {return (size_type(~0) - size_type(Align)) / sizeof(T);} 99 | 100 | pointer address(reference x) const 
noexcept { 101 | return std::addressof(x); 102 | } 103 | const_pointer address(const_reference x) const noexcept { 104 | return std::addressof(x); 105 | } 106 | 107 | pointer allocate(size_type n, typename AlignedAllocator::const_pointer = 0) 108 | { 109 | pointer ret(reinterpret_cast(detail::allocate_aligned_memory(static_cast(Align) , n * sizeof(T)))); 110 | if(unlikely(!ret)) throw std::bad_alloc(); 111 | return ret; 112 | } 113 | 114 | void deallocate(pointer p, size_type) noexcept {std::free(p);} 115 | 116 | template 117 | void construct(U* p, Args&&... args) { 118 | ::new(reinterpret_cast(p)) U(std::forward(args)...); 119 | } 120 | 121 | void destroy(pointer p) { p->~T(); } 122 | }; 123 | 124 | 125 | template 126 | class AlignedAllocator 127 | { 128 | public: 129 | typedef T value_type; 130 | typedef const T* pointer; 131 | typedef const T* const_pointer; 132 | typedef const T& reference; 133 | typedef const T& const_reference; 134 | typedef size_t size_type; 135 | typedef std::ptrdiff_t difference_type; 136 | 137 | typedef std::true_type propagate_on_container_move_assignment; 138 | 139 | template 140 | struct rebind { typedef AlignedAllocator other; }; 141 | 142 | public: 143 | AlignedAllocator() noexcept 144 | {} 145 | 146 | template 147 | AlignedAllocator(const AlignedAllocator&) noexcept 148 | {} 149 | 150 | size_type 151 | max_size() const noexcept 152 | { return (size_type(~0) - size_type(Align)) / sizeof(T); } 153 | 154 | const_pointer 155 | address(const_reference x) const noexcept 156 | { return std::addressof(x); } 157 | 158 | pointer 159 | allocate(size_type n, typename AlignedAllocator::const_pointer = 0) 160 | { 161 | pointer ret(reinterpret_cast(detail::allocate_aligned_memory(static_cast(Align) , n * sizeof(T)))); 162 | if(unlikely(!ret)) throw std::bad_alloc(); 163 | return ret; 164 | } 165 | 166 | void 167 | deallocate(pointer p, size_type) noexcept 168 | { std::free(p); } 169 | 170 | template 171 | void 172 | construct(U* p, Args&&... 
args) 173 | { ::new(reinterpret_cast(p)) U(std::forward(args)...); } 174 | 175 | void 176 | destroy(pointer p) { p->~T(); } 177 | }; 178 | 179 | template 180 | inline bool operator== (const AlignedAllocator&, const AlignedAllocator&) noexcept 181 | { return TAlign == UAlign; } 182 | 183 | template 184 | inline bool operator!= (const AlignedAllocator&, const AlignedAllocator&) noexcept 185 | { return TAlign != UAlign; } 186 | 187 | } // namespace sse 188 | 189 | #endif 190 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/tsg.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_SEEDED_GEN_H__ 2 | #define THREAD_SEEDED_GEN_H__ 3 | #include 4 | #include 5 | 6 | namespace tsg { 7 | template 8 | struct ThreadSeededGen: public RNG { 9 | template 10 | ThreadSeededGen(Args &&...args): RNG(std::forward(args)...) { 11 | this->seed(std::hash{}(std::this_thread::get_id())); 12 | } 13 | template 14 | decltype(auto) operator()(Args &&...args) {return RNG::operator()(std::forward(args)...);} 15 | template 16 | decltype(auto) operator()(Args &&...args) const {return RNG::operator()(std::forward(args)...);} 17 | }; 18 | 19 | } // tsg 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/update.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef UPDATE_H__ 3 | #include "common.h" 4 | 5 | namespace sketch { 6 | namespace update { 7 | struct Increment { 8 | // Saturates 9 | template 10 | void operator()(T &ref, IntType maxval) const { 11 | if(static_cast(ref) < maxval) 12 | ref = static_cast(ref) + 1; 13 | //ref += (ref < maxval); 14 | } 15 | template 16 | void operator()(std::vector &ref, Container &con, IntType nbits) const { 17 | int64_t count = con[ref[0]]; 18 | ++count; 19 | 
if(range_check().cbegin()))>>(nbits, count) == 0) { 20 | for(const auto el: ref) 21 | con[el] = count; 22 | } 23 | } 24 | template 25 | Increment(Args &&... args) {} 26 | static uint64_t est_count(uint64_t val) { 27 | return val; 28 | } 29 | template 30 | static auto combine(const T1 &i, const T2 &j) { 31 | using RetType = std::common_type_t; 32 | return RetType(i) + RetType(j); 33 | } 34 | }; 35 | struct PowerOfTwo { 36 | common::DefaultRNGType rng_; 37 | uint64_t gen_; 38 | uint8_t nbits_; 39 | // Also saturates 40 | template 41 | void operator()(T &ref, IntType maxval) { 42 | #if !NDEBUG 43 | std::fprintf(stderr, "maxval: %zu. ref: %zu\n", size_t(maxval), size_t(ref)); 44 | #endif 45 | if(static_cast(ref) == 0) ref = 1; 46 | else { 47 | if(ref >= maxval) return; 48 | if(HEDLEY_UNLIKELY(nbits_ < ref)) gen_ = rng_(), nbits_ = 64; 49 | const IntType oldref = ref; 50 | ref = oldref + ((gen_ & (UINT64_C(-1) >> (64 - oldref))) == 0); 51 | gen_ >>= oldref, nbits_ -= oldref; 52 | } 53 | } 54 | template 55 | void operator()(std::vector &ref, Container &con, IntType nbits) { 56 | uint64_t val = con[ref[0]]; 57 | if(val == 0) { 58 | for(const auto el: ref) 59 | con[el] = 1; 60 | } else { 61 | if(HEDLEY_UNLIKELY(nbits_ < val)) gen_ = rng_(), nbits_ = 64; 62 | auto oldval = val; 63 | if((gen_ & (UINT64_C(-1) >> (64 - val))) == 0) { 64 | ++val; 65 | if(range_check(nbits, val) == 0) 66 | for(const auto el: ref) 67 | con[el] = val; 68 | } 69 | gen_ >>= oldval; 70 | nbits_ -= oldval; 71 | } 72 | } 73 | template 74 | static auto combine(const T1 &i, const T2 &j) { 75 | using RetType = std::common_type_t; 76 | RetType i_(i), j_(j); 77 | return std::max(i_, j_) + (i == j); 78 | } 79 | PowerOfTwo(uint64_t seed=0): rng_(seed), gen_(rng_()), nbits_(64) {} 80 | static constexpr uint64_t est_count(uint64_t val) { 81 | return val ? 
uint64_t(1) << (val - 1): 0; 82 | } 83 | }; 84 | struct CountSketch { 85 | // Saturates 86 | template 87 | void operator()(T &ref, IntType maxval, IntType2 hash) const { 88 | ref = int64_t(ref) + (hash&1 ? 1: -1); 89 | } 90 | template 91 | ssize_t operator()(std::vector &ref, std::vector &hashes, Container &con, IntType nbits) const { 92 | using IDX = std::decay_t; 93 | IDX newval; 94 | std::vector s; 95 | assert(ref.size() == hashes.size()); 96 | for(size_t i(0); i < ref.size(); ++i) { 97 | newval = con[ref[i]] + (hashes[i]&1 ? 1: -1); 98 | s.push_back(newval); 99 | if(range_check(nbits, newval) == 0) 100 | con[ref[i]] = newval; 101 | } 102 | if(s.size()) { 103 | common::sort::insertion_sort(s.begin(), s.end()); 104 | return (s[s.size()>>1] + s[(s.size()-1)>>1]) >> 1; 105 | } 106 | return 0; 107 | } 108 | template 109 | static void Increment(Args &&... args) {} 110 | uint64_t est_count(uint64_t val) const { 111 | return val; 112 | } 113 | template 114 | static uint64_t combine(const T1 &i, const T2 &j) { 115 | using RetType = std::common_type_t; 116 | std::fprintf(stderr, "[%s:%d:%s] I'm not sure this is actually right; this is essentially a placeholder.\n", __FILE__, __LINE__, __PRETTY_FUNCTION__); 117 | return RetType(i) + RetType(j); 118 | } 119 | template 120 | CountSketch(Args &&... 
args) {} 121 | }; 122 | 123 | } // update 124 | } // sketch 125 | #define UPDATE_H__ 126 | #endif /* UPDATE_H__ */ 127 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/sketch/vac.h: -------------------------------------------------------------------------------- 1 | #ifndef VAC_SKETCH_H__ 2 | #define VAC_SKETCH_H__ 3 | #include "./mult.h" 4 | #include "./hll.h" 5 | #include "./fixed_vector.h" 6 | #include "aesctr/wy.h" 7 | #include "./exception.h" 8 | #include "./tsg.h" 9 | 10 | namespace sketch { 11 | 12 | namespace vac { 13 | 14 | using tsg::ThreadSeededGen; 15 | 16 | template class Container=std::vector, 18 | typename RNG=wy::WyHash, 19 | typename...VectorArgs> 20 | struct VACSketch { 21 | using base = BaseSketch; 22 | 23 | // Members 24 | Container sketches_; 25 | const unsigned n_; 26 | 27 | // Construction 28 | template 29 | VACSketch(size_t n, Args &&...args): n_(n > 1? n: size_t(1)) { 30 | if(n <= 1) 31 | std::fputs((std::string(__PRETTY_FUNCTION__) + " requires n >= 2. Provided: " + std::to_string(n)).data(), stderr); 32 | // Build n_ sketches (the clamped count), so sketches_[0] exists for addh() even when n == 0. 33 | sketches_.reserve(n_); 34 | for(size_t i = n_; i--; sketches_.emplace_back(std::forward(args)...)); 35 | } 36 | // Addition 37 | void addh(uint64_t x) { 38 | thread_local static ThreadSeededGen gen; 39 | const auto end = std::min(ctz(gen()) + 1, n_); 40 | unsigned i = 0; 41 | do sketches_[i++].addh(x); while(i < end); 42 | } 43 | // Composition 44 | VACSketch &operator+=(const VACSketch &o) { 45 | if(n_ != o.n_) throw std::runtime_error("Mismatched vacsketch counts"); 46 | auto i1 = sketches_.begin(); 47 | auto i2 = o.sketches_.begin(); 48 | auto e1 = sketches_.end(); 49 | while(i1 != e1) (*i1++ += *i2++); 50 | return *this; 51 | } 52 | VACSketch operator+(const VACSketch &o) const { 53 | auto tmp = *this; 54 | tmp += o; 55 | return tmp; 56 | } 57 | }; 58 | // Returns a table of n thresholds where entry i >= 1 is std::numeric_limits::max() / base^i. Throws std::runtime_error if base <= 1. 59 | static fixed::vector construct_power_table(double base, size_t n) { 60 | if(base <= 1.)
throw std::runtime_error(std::to_string(base) + " is forbidden. Must be > 1."); 61 | fixed::vector ret(n); // one slot per sketch: PowerVACSketch::addh reads lut_[1] .. lut_[n - 1] 62 | std::vector mem(n ? n: size_t(1)); // running powers of base; never empty, so p[0] below is valid 63 | auto p = mem.data(); 64 | p[0] = 1.; if(n) ret[0] = std::numeric_limits::max(); // slot 0 is never compared against, give it a defined value 65 | for(size_t i = 1; i < n; ++i) { 66 | auto tmp = base * p[i - 1]; // base^i 67 | ret[i] = std::numeric_limits::max() / tmp; 68 | p[i] = tmp; 69 | } 70 | return ret; 71 | } 72 | 73 | template class Container=std::vector, 75 | typename RNG=wy::WyHash, 76 | typename...VectorArgs> 77 | struct PowerVACSketch: public VACSketch { 78 | using super = VACSketch; 79 | 80 | const fixed::vector lut_; 81 | const double base_; 82 | 83 | template 84 | PowerVACSketch(double base, size_t n, Args &&... args): 85 | super(n, std::forward(args)...), 86 | lut_(construct_power_table(base, n)), 87 | base_(base) 88 | { 89 | std::fprintf(stderr, "base: %f. n: %zu\n", base, n); 90 | } 91 | // Addition 92 | void addh(uint64_t x) { 93 | thread_local static ThreadSeededGen gen; 94 | auto v = gen(); 95 | unsigned i = 0; 96 | do { 97 | this->sketches_[i++].addh(x); 98 | } while(i < this->n_ && v < lut_[i]); 99 | } 100 | // Composition 101 | PowerVACSketch &operator+=(const PowerVACSketch &o) { 102 | PREC_REQ(this->n_ == o.n_, "Must be same n"); 103 | PREC_REQ(this->base_ == o.base_, "Must be same base"); 104 | auto i1 = this->sketches_.begin(); 105 | auto i2 = o.sketches_.begin(); 106 | auto e1 = this->sketches_.end(); 107 | while(i1 != e1) (*i1++ += *i2++); 108 | return *this; 109 | } 110 | PowerVACSketch operator+(const PowerVACSketch &o) const { 111 | auto tmp = *this; 112 | tmp += o; 113 | return tmp; 114 | } 115 | }; 116 | 117 | using HVAC = VACSketch; 118 | using PowerHVAC = PowerVACSketch; 119 | 120 | } // vac 121 | using namespace vac; 122 | 123 | } 124 | 125 | #endif /* VAC_SKETCH_H__ */ 126 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/vec/stats.h: -------------------------------------------------------------------------------- 1 | #ifndef
VEC_STATS_H 2 | #define VEC_STATS_H 3 | #include "vec.h" 4 | #include 5 | #include 6 | 7 | #ifdef _BLAZE_CONFIG_CONFIG_H_ 8 | #define HAS_BLAZE 1 9 | #else 10 | #define HAS_BLAZE 0 11 | #endif 12 | 13 | namespace stats { 14 | using namespace std::literals; 15 | // Sums the elements of c: whole SIMD vectors first, then a scalar loop for the remainder. Returns 0 for an empty container. 16 | template 17 | auto sum(const Container &c) { 18 | using Type = std::decay_t; 19 | using Space = vec::SIMDTypes; 20 | using VType = typename Space::VType; 21 | if(__builtin_expect(c.size() == 0, 0)) return static_cast(0); 22 | // If this saturates the type, this result will be wrong. 23 | VType tmp, tsum = 0; 24 | const VType *ptr = (const VType *)&*std::cbegin(c); 25 | auto eptr = &*std::end(c); 26 | if(Space::aligned(ptr)) { 27 | while((const Type *)(ptr + 1) <= eptr) { // load only while a whole vector still fits before eptr 28 | tmp.simd_ = Space::load((const Type *)ptr++); 29 | tsum.simd_ = Space::add(tsum.simd_, tmp.simd_); 30 | } 31 | } else { 32 | while((const Type *)(ptr + 1) <= eptr) { 33 | tmp.simd_ = Space::loadu((const Type *)ptr++); 34 | tsum.simd_ = Space::add(tsum.simd_, tmp.simd_); 35 | } 36 | } 37 | Type ret = tsum.sum(); // horizontal sum of the accumulator; tmp only holds the last load 38 | auto lptr = (const Type *)ptr; 39 | while(lptr < eptr) ret += *lptr++; 40 | return ret; 41 | } 42 | 43 | template 44 | FloatType mean(const Container &c) { 45 | return c.size() ? static_cast(sum(c)) / c.size(): std::numeric_limits::quiet_NaN(); 46 | } 47 | 48 | // Pearson correlation of c1 and c2, clamped to [-1, 1]; NaN when the denominator is zero. Throws std::runtime_error on a size mismatch. 49 | template 50 | auto pearsonr(const Container &c1, const Container &c2) { 51 | using FType = std::decay_t; 52 | static_assert(std::is_floating_point_v, "Containers must hold floating points."); 53 | if(c1.size() != c2.size()) 54 | throw std::runtime_error("Wrong sizes.
size1: "s + std::to_string(c1.size()) + ", " + std::to_string(c2.size())); 55 | using Space = vec::SIMDTypes; 56 | using VType = typename Space::VType; 57 | auto m1 = mean(c1), m2 = mean(c2); 58 | VType v1, v2; 59 | VType sum1sq(0), sum2sq(0), sumdot(0); 60 | const VType mb1 = m1, mb2 = m2; 61 | const VType *p1((const VType *)&c1[0]), *p2((const VType *)&c2[0]); 62 | if(Space::aligned(p1) && Space::aligned(p2)) { 63 | while((const FType *)(p1 + 1) <= &*(std::cend(c1))) { // aligned loads; whole vectors only, the scalar loop below takes the rest 64 | v1.simd_ = Space::sub(Space::load((const FType *)p1++), mb1.simd_); 65 | v2.simd_ = Space::sub(Space::load((const FType *)p2++), mb2.simd_); 66 | sum1sq = Space::add(sum1sq.simd_, Space::mul(v1.simd_, v1.simd_)); 67 | sum2sq = Space::add(sum2sq.simd_, Space::mul(v2.simd_, v2.simd_)); 68 | sumdot = Space::add(sumdot.simd_, Space::mul(v1.simd_, v2.simd_)); 69 | } 70 | } else { // unaligned loads 71 | while((const FType *)(p1 + 1) <= &*(std::cend(c1))) { 72 | v1.simd_ = Space::sub(Space::loadu((const FType *)p1++), mb1.simd_); 73 | v2.simd_ = Space::sub(Space::loadu((const FType *)p2++), mb2.simd_); 74 | sum1sq = Space::add(sum1sq.simd_, Space::mul(v1.simd_, v1.simd_)); 75 | sum2sq = Space::add(sum2sq.simd_, Space::mul(v2.simd_, v2.simd_)); 76 | sumdot = Space::add(sumdot.simd_, Space::mul(v1.simd_, v2.simd_)); 77 | } 78 | } 79 | auto sd = sumdot.sum(); 80 | auto s1s = sum1sq.sum(); 81 | auto s2s = sum2sq.sum(); 82 | const FType *fp1 = (FType *)p1, *fp2 = (FType *)p2; 83 | while(fp1 < &*(std::cend(c1))) { 84 | auto v1 = (*fp1++) - m1; 85 | auto v2 = (*fp2++) - m2; 86 | sd += v1 * v2; 87 | s1s += v1 * v1; 88 | s2s += v2 * v2; 89 | } 90 | auto rden = std::sqrt(s1s) * std::sqrt(s2s); // two square roots for better floating-point accuracy. 91 | return rden ?
std::min(std::max(sd / rden, FType(-1.)), FType(1.)): std::numeric_limits::quiet_NaN(); 92 | } 93 | 94 | 95 | } // stats 96 | #endif 97 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/vec/welford_sd.h: -------------------------------------------------------------------------------- 1 | #ifndef WELFORD_ONLINE_STDEV_H__ 2 | #define WELFORD_ONLINE_STDEV_H__ 3 | 4 | // based on John D. Cook's blog https://www.johndcook.com/blog/standard_deviation/ 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "blaze/Math.h" 10 | 11 | namespace stats { 12 | 13 | template::value>::type, 15 | typename=typename std::enable_if::value>::type> 16 | class OnlineSD { 17 | T old_mean_, new_mean_, olds_, news_; 18 | SizeType n_; 19 | public: 20 | OnlineSD() {std::memset(this, 0, sizeof(*this));} 21 | 22 | void add(T x) 23 | { 24 | // See Knuth TAOCP vol 2, 3rd edition, page 232 25 | if (__builtin_expect(++n_ == 1, 0)) old_mean_ = new_mean_ = x, olds_ = 0.; 26 | else 27 | { 28 | new_mean_ = old_mean_ + (x - old_mean_)/n_; 29 | news_ = olds_ + (x - old_mean_)*(x - new_mean_); 30 | // set up for next iteration 31 | old_mean_ = new_mean_, olds_ = news_; 32 | } 33 | } 34 | size_t n() const {return n_;} 35 | T mean() const {return n_ ? new_mean_: 0.0;} 36 | T variance() const {return n_ > 1 ?
news_ / (n_ - 1): 0.0;} 37 | T stdev() const {return std::sqrt(variance());} 38 | }; 39 | 40 | template, typename SizeType=std::uint64_t> 41 | class OnlineVectorSD { 42 | VecType old_mean_, new_mean_, olds_, news_; 43 | SizeType n_; 44 | public: 45 | 46 | template 47 | OnlineVectorSD(const VType2 &vec): OnlineVectorSD(vec.size()) { 48 | add(vec); 49 | } 50 | OnlineVectorSD(size_t d): old_mean_(d, 0), new_mean_(d, 0), olds_(d, 0), news_(d, 0), n_(0) {} 51 | 52 | template 53 | void add(const VType2 &x) 54 | { 55 | // See Knuth TAOCP vol 2, 3rd edition, page 232 56 | if (__builtin_expect(++n_ == 1, 0)) old_mean_ = new_mean_ = x, olds_ = 0.; 57 | else 58 | { 59 | new_mean_ = old_mean_ + (x - old_mean_)* (1./n_); 60 | news_ = olds_ + (x - old_mean_)*(x - new_mean_); 61 | // set up for next iteration 62 | old_mean_ = new_mean_, olds_ = news_; 63 | } 64 | } 65 | #ifndef ASSERTFULL // defined in every build: the accessors below call it unconditionally 66 | #define ASSERTFULL() do {if(!n_) {throw std::runtime_error("Cannot calculate stats on an empty stream.");}} while(0) 67 | #endif 68 | size_t n() const {return n_;} 69 | const VecType &mean() const {ASSERTFULL(); return new_mean_;} 70 | VecType variance() const {ASSERTFULL(); return news_ / (n_ > 1 ? n_ - 1: SizeType(1));} // by value: the quotient is a temporary; a single sample yields zeros instead of dividing by zero 71 | VecType stdev() const {ASSERTFULL(); return blaze::sqrt(variance());} // by value: the result is a temporary 72 | }; 73 | 74 | template::value>::type, 76 | typename=typename std::enable_if::value>::type> 77 | class OnlineStatistics 78 | { 79 | public: 80 | OnlineStatistics() {clear();} 81 | void clear() {std::memset(this, 0, sizeof(*this));} 82 | void add(T x) { 83 | T delta, delta_n, delta_n2, term1; 84 | 85 | SizeType n1 = n_++; 86 | delta = x - m1_; 87 | delta_n = delta / n_; 88 | delta_n2 = delta_n * delta_n; 89 | term1 = delta * delta_n * n1; 90 | m1_ += delta_n; 91 | m4_ += term1 * delta_n2 * (n_*n_ - 3*n_ + static_cast(3)) + \ 92 | 6. * delta_n2 * m2_ - 4.
* delta_n * m3_; 93 | m3_ += term1 * delta_n * (n_ - 2) - 3 * delta_n * m2_; 94 | m2_ += term1; 95 | } 96 | SizeType n() const {return n_;} 97 | T mean() const {return m1_;} 98 | T variance() const {return m2_/(n_- (n_ > 1.0));} 99 | T stdev() const {return std::sqrt(variance());} 100 | T skewness() const { 101 | assert(m2_ >= 0.); 102 | return std::sqrt(static_cast(n_)) * m3_/ std::pow(m2_, 1.5); 103 | } 104 | T kurtosis() const {return static_cast(n_)*m4_ / (m2_*m2_) - 3.0;} 105 | // Merges the running moments and sample count of b into *this; a b without samples leaves *this unchanged. 106 | template 107 | OnlineStatistics& operator+=(const OnlineStatistics& b) 108 | { 109 | if(b.n_ == 0) return *this; // nothing to merge; also avoids 0 / 0 when both sides are empty 110 | const T na(this->n_), nb(b.n_), newn(na + nb); // counts as T so that (na - nb) below cannot wrap as an unsigned difference 111 | const T delta = b.m1_ - this->m1_; 112 | const T delta2 = delta*delta; 113 | const T delta3 = delta*delta2; 114 | const T delta4 = delta2*delta2; 115 | 116 | auto newm1 = (na*this->m1_ + nb*b.m1_) / newn; 117 | auto newm2 = this->m2_ + b.m2_ + 118 | delta2 * na * nb / newn; 119 | 120 | auto newm3 = this->m3_ + b.m3_ + 121 | delta3 * na * nb * (na - nb)/(newn*newn); 122 | newm3 += 3.0*delta * (na*b.m2_ - nb*this->m2_) / newn; 123 | 124 | auto newm4 = this->m4_ + b.m4_ + delta4*na*nb * (na*na - na*nb + nb*nb) / 125 | (newn*newn*newn); 126 | newm4 += 6.0*delta2 * (na*na*b.m2_ + nb*nb*this->m2_)/(newn*newn) + 127 | 4.0*delta*(na*b.m3_ - nb*this->m3_) / newn; 128 | this->n_ += b.n_; 129 | this->m4_ = newm4; 130 | this->m3_ = newm3; 131 | this->m2_ = newm2; 132 | this->m1_ = newm1; 133 | return *this; 134 | } 135 | 136 | private: 137 | T m1_, m2_, m3_, m4_; 138 | SizeType n_; 139 | }; 140 | 141 | template 142 | auto operator+(const OnlineStatistics &a, const OnlineStatistics &b) { 143 | auto ret(a); // Copy a 144 | ret += b; 145 | return ret; 146 | } 147 | 148 | } // namespace stats 149 | 150 | #endif /* #ifndef WELFORD_ONLINE_STDEV_H__ */ 151 | -------------------------------------------------------------------------------- /veritymap/src/tools/sketch/include/xxHash/xxh3.h:
-------------------------------------------------------------------------------- 1 | /* 2 | * xxHash - Extremely Fast Hash algorithm 3 | * Development source file for `xxh3` 4 | * Copyright (C) 2019-2020 Yann Collet 5 | * 6 | * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions are 10 | * met: 11 | * 12 | * * Redistributions of source code must retain the above copyright 13 | * notice, this list of conditions and the following disclaimer. 14 | * * Redistributions in binary form must reproduce the above 15 | * copyright notice, this list of conditions and the following disclaimer 16 | * in the documentation and/or other materials provided with the 17 | * distribution. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | * 31 | * You can contact the author at: 32 | * - xxHash homepage: https://www.xxhash.com 33 | * - xxHash source repository: https://github.com/Cyan4973/xxHash 34 | */ 35 | 36 | /* 37 | * Note: This file used to host the source code of XXH3_* variants. 38 | * during the development period. 
39 | * The source code is now properly integrated within xxhash.h. 40 | * 41 | * xxh3.h is no longer useful, 42 | * but it is still provided for compatibility with source code 43 | * which used to include it directly. 44 | * 45 | * Programs are now highly discouraged to include xxh3.h. 46 | * Include `xxhash.h` instead, which is the officially supported interface. 47 | * 48 | * In the future, xxh3.h will start to generate warnings, then errors, 49 | * then it will be removed from source package and from include directory. 50 | */ 51 | 52 | /* Simulate the same impact as including the old xxh3.h source file */ 53 | 54 | #define XXH_INLINE_ALL 55 | #include "xxhash.h" 56 | -------------------------------------------------------------------------------- /veritymap/src/tools/version/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ablab/VerityMap/d24aa797be9c977dbcb9164ecfe18b3af6e4a026/veritymap/src/tools/version/CMakeLists.txt -------------------------------------------------------------------------------- /veritymap/src/tools/version/version.cpp.in: -------------------------------------------------------------------------------- 1 | #include "version.hpp" 2 | 3 | using namespace tools; 4 | 5 | const std::string Version::GIT_SHA1 = "@GIT_SHA1@"; 6 | const std::string Version::GIT_DATE = "@GIT_DATE@"; 7 | const std::string Version::GIT_COMMIT_SUBJECT = "@GIT_COMMIT_SUBJECT@"; 8 | -------------------------------------------------------------------------------- /veritymap/src/tools/version/version.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace tools::Version { 6 | extern const std::string GIT_SHA1; 7 | extern const std::string GIT_DATE; 8 | extern const std::string GIT_COMMIT_SUBJECT; 9 | }// End namespace tools::Version 10 | 
--------------------------------------------------------------------------------