├── .github └── workflows │ ├── deploy.yml │ ├── main.yml │ └── self-hosted.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── data └── pathogens.list ├── libs ├── prebuild.bat └── refresh │ ├── active_thread_pool │ └── lib │ │ └── active_thread_pool.h │ ├── compression │ └── lib │ │ └── file_wrapper.h │ ├── logs │ └── lib │ │ └── progress.h │ └── sort │ └── lib │ └── pdqsort_par.h ├── makefile ├── quick-start.sh ├── refresh.mk ├── src ├── aligned_vector.h ├── alphabet.h ├── array.h ├── bubble_helper.h ├── console.h ├── console_all2all.cpp ├── console_all2all_parts.cpp ├── console_all2all_sparse.cpp ├── console_build.cpp ├── console_db2db.cpp ├── console_distance.cpp ├── console_minhash.cpp ├── console_new2all.cpp ├── console_one2all.cpp ├── conversion.h ├── elias_gamma.h ├── filter.h ├── genome_input_file.h ├── hashmap_lp.h ├── input_file.cpp ├── input_file.h ├── input_file_factory.h ├── instr_set_detect.h ├── kmc_api │ ├── kmc_file.cpp │ ├── kmc_file.h │ ├── kmer_api.cpp │ ├── kmer_api.h │ ├── kmer_defs.h │ ├── mmer.cpp │ └── mmer.h ├── kmc_input_file.h ├── kmer_db.h ├── kmer_db.sln ├── kmer_db.vcxproj ├── kmer_db.vcxproj.filters ├── kmer_extract.h ├── loader_ex.cpp ├── loader_ex.h ├── loader_tasks.h ├── log.cpp ├── log.h ├── main.cpp ├── minhashed_input_file.h ├── parallel_sorter.cpp ├── parallel_sorter.h ├── params.cpp ├── params.h ├── pattern.cpp ├── pattern.h ├── prefix_kmer_db.cpp ├── prefix_kmer_db.h ├── queue.h ├── sampler.h ├── simd │ ├── row_add.h │ ├── row_add_avx.cpp │ ├── row_add_avx2.cpp │ └── row_add_neon.cpp ├── similarity_calculator.cpp ├── similarity_calculator.h ├── sparse_filters.h ├── types.h └── version.h └── test ├── ictv ├── a2a-above10.csv ├── a2a-above10.csv.ani-shorter └── ictv.list ├── protein ├── aa.a2a ├── aa11_diamond.a2a ├── aa12_mmseqs.a2a ├── aa6_dayhoff.a2a ├── aa_100x1000.fasta ├── aa_k7.a2a ├── dna-preserve.a2a ├── dna.a2a ├── dna_100x1000.fasta └── translate.py ├── run-dev.bat ├── run-ictv.bat ├── run-protein.bat 
├── run-synth.bat ├── synth ├── a2a ├── a2a-sparse ├── a2a-sparse.ani ├── a2a-sparse.mash ├── a2a.ani ├── a2a.mash ├── a2a.mash-sparse-min2max ├── a2a.mash.above-below ├── a2a.sparse.above-below ├── n2a ├── n2a-sparse ├── n2a-sparse.ani ├── n2a-sparse.mash ├── n2a.ani ├── n2a.mash ├── n2a.sparse.above-below ├── synth-local.list ├── synth.fa └── synth.list └── virus ├── MT159713.csv ├── data ├── MN908947.fasta ├── MN938384.fasta ├── MN975262.fasta ├── MN985325.fasta ├── MN988668.fasta ├── MN988669.fasta ├── MN988713.fasta ├── MN994467.fasta ├── MN994468.fasta ├── MN996527.fasta ├── MN996528.fasta ├── MN996529.fasta ├── MN996530.fasta ├── MN996531.fasta ├── MN997409.fasta ├── MT007544.fasta ├── MT012098.fasta ├── MT019529.fasta ├── MT019530.fasta ├── MT019531.fasta ├── MT019532.fasta ├── MT019533.fasta ├── MT020880.fasta ├── MT020881.fasta ├── MT027062.fasta ├── MT027063.fasta ├── MT027064.fasta ├── MT039873.fasta ├── MT039887.fasta ├── MT039888.fasta ├── MT039890.fasta ├── MT044257.fasta ├── MT044258.fasta ├── MT049951.fasta ├── MT050493.fasta ├── MT066156.fasta ├── MT066175.fasta ├── MT066176.fasta ├── MT072688.fasta ├── MT093571.fasta ├── MT093631.fasta ├── MT106052.fasta ├── MT106053.fasta ├── MT106054.fasta ├── MT118835.fasta ├── MT121215.fasta ├── MT123290.fasta ├── MT123291.fasta ├── MT123292.fasta ├── MT123293.fasta ├── MT126808.fasta ├── MT135041.fasta ├── MT135042.fasta ├── MT135043.fasta ├── MT135044.fasta ├── MT152824.fasta ├── MT159705.fasta ├── MT159706.fasta ├── MT159707.fasta ├── MT159708.fasta ├── MT159709.fasta ├── MT159710.fasta ├── MT159711.fasta ├── MT159712.fasta ├── MT159713.fasta ├── MT159714.fasta ├── MT159715.fasta ├── MT159716.fasta ├── MT159717.fasta ├── MT159718.fasta ├── MT159719.fasta ├── MT159720.fasta ├── MT159721.fasta ├── MT159722.fasta ├── MT163716.fasta ├── MT163717.fasta ├── MT163718.fasta ├── MT163719.fasta ├── MT184907.fasta ├── MT184908.fasta ├── MT184909.fasta ├── MT184910.fasta ├── MT184911.fasta ├── MT184912.fasta ├── 
MT184913.fasta ├── MT188339.fasta ├── MT188340.fasta ├── MT188341.fasta ├── MT192759.fasta ├── MT192765.fasta ├── MT192772.fasta ├── MT192773.fasta ├── MT198652.fasta ├── MT226610.fasta ├── MT233519.fasta ├── MT233522.fasta ├── MT233523.fasta ├── MT233526.fasta ├── MT240479.fasta ├── MT246449.fasta ├── MT246450.fasta ├── MT246451.fasta ├── MT246452.fasta ├── MT246453.fasta ├── MT246454.fasta ├── MT246455.fasta ├── MT246456.fasta ├── MT246457.fasta ├── MT246458.fasta ├── MT246459.fasta ├── MT246460.fasta ├── MT246461.fasta ├── MT246462.fasta ├── MT246463.fasta ├── MT246464.fasta ├── MT246466.fasta ├── MT246467.fasta ├── MT246468.fasta ├── MT246469.fasta ├── MT246470.fasta ├── MT246471.fasta ├── MT246472.fasta ├── MT246473.fasta ├── MT246474.fasta ├── MT246475.fasta ├── MT246476.fasta ├── MT246477.fasta ├── MT246478.fasta ├── MT246479.fasta ├── MT246480.fasta ├── MT246481.fasta ├── MT246482.fasta ├── MT246484.fasta ├── MT246485.fasta ├── MT246486.fasta ├── MT246487.fasta ├── MT246488.fasta ├── MT246489.fasta ├── MT246490.fasta ├── MT246667.fasta ├── MT251972.fasta ├── MT251973.fasta ├── MT251974.fasta ├── MT251975.fasta ├── MT251976.fasta ├── MT251977.fasta ├── MT251978.fasta ├── MT251979.fasta ├── MT251980.fasta ├── MT253696.fasta ├── MT253697.fasta ├── MT253698.fasta ├── MT253699.fasta ├── MT253700.fasta ├── MT253701.fasta ├── MT253702.fasta ├── MT253703.fasta ├── MT253704.fasta ├── MT253705.fasta ├── MT253706.fasta ├── MT253707.fasta ├── MT253708.fasta ├── MT253709.fasta ├── MT253710.fasta ├── NC_045512.fasta ├── seqs.fasta ├── seqs.part1.fasta └── seqs.part2.fasta ├── k18.csv ├── k18.csv.cosine ├── k18.csv.jaccard ├── k18.csv.mash ├── k18.csv.max ├── k18.csv.min ├── k18.frac.csv ├── k18.n2a.csv ├── k18.n2a.itself.csv ├── k18.n2a.sparse.csv ├── k18.sparse.csv ├── k24.csv ├── multi.list ├── multi.part2.list ├── multi.split.list ├── seqs.list ├── seqs.part1.list └── seqs.part2.list /.github/workflows/deploy.yml: 
-------------------------------------------------------------------------------- 1 | name: Deploy 2 | 3 | on: 4 | release: 5 | types: 6 | - created 7 | 8 | jobs: 9 | 10 | 11 | ######################################################################################## 12 | checkout: 13 | name: Checkout 14 | strategy: 15 | matrix: 16 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 17 | runs-on: [self-hosted, kmer-db, '${{ matrix.machine }}'] 18 | 19 | steps: 20 | - name: clean 21 | run: rm -rf ${{ github.workspace }}/* 22 | - uses: actions/checkout@v4 23 | with: 24 | submodules: recursive 25 | 26 | ######################################################################################## 27 | make: 28 | name: Make 29 | needs: checkout 30 | strategy: 31 | fail-fast: false 32 | matrix: 33 | machine: [x64_linux] 34 | platform: [avx2] 35 | compiler: [14] 36 | include: 37 | - {machine: arm64_linux, platform: arm8, compiler: 12} 38 | - {machine: x64_mac, platform: avx2, compiler: 13} 39 | - {machine: arm64_mac, platform: m1, compiler: 13} 40 | 41 | runs-on: [self-hosted, kmer-db, '${{ matrix.machine }}'] 42 | 43 | steps: 44 | - name: make 45 | run: | 46 | gmake -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} STATIC_LINK=true PLATFORM=${{ matrix.platform }} 47 | - name: tar artifacts 48 | run: tar -cvzf kmer-db.tar.gz LICENSE -C ./bin kmer-db 49 | 50 | ######################################################################################## 51 | help: 52 | name: Print usage 53 | needs: make 54 | strategy: 55 | fail-fast: false 56 | matrix: 57 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 58 | runs-on: [self-hosted, kmer-db, '${{ matrix.machine }}'] 59 | 60 | steps: 61 | - name: help 62 | run: ./bin/kmer-db 63 | 64 | - name: version 65 | run: ./bin/kmer-db -version 66 | 67 | ######################################################################################## 68 | upload: 69 | name: Upload 70 | needs: help 71 | strategy: 72 | fail-fast: false 73 
| matrix: 74 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 75 | runs-on: [self-hosted, kmer-db, '${{ matrix.machine }}'] 76 | 77 | steps: 78 | - name: deploy 79 | uses: actions/upload-release-asset@v1.0.1 80 | env: 81 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 82 | with: 83 | upload_url: ${{ github.event.release.upload_url }} 84 | asset_path: ./kmer-db.tar.gz 85 | asset_name: kmer-db-${{ github.event.release.tag_name }}-${{matrix.machine}}.tar.gz 86 | asset_content_type: application/gzip 87 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Build and tests 2 | 3 | on: 4 | push: 5 | branches: [ master, develop] 6 | paths-ignore: 7 | - '**.md' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | 12 | 13 | ######################################################################################## 14 | make: 15 | name: Make 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | machine: [ubuntu-latest] 20 | gmake_install_command: ['gmake --version'] 21 | include: 22 | - {machine: macOS-13, gmake_install_command: 'brew install make && gmake --version'} 23 | runs-on: ['${{ matrix.machine }}'] 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | with: 28 | submodules: recursive 29 | - name: install gmake 30 | run: ${{ matrix.gmake_install_command }} 31 | 32 | - name: make 33 | run: | 34 | gmake -j CXX=g++-12 CC=gcc-12 STATIC_LINK=true 35 | 36 | - name: tar artifacts 37 | run: tar -cvf kmer-db.tar ./test/virus -C ./bin kmer-db 38 | 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | name: executable-artifact-${{ matrix.machine }} 42 | path: ./kmer-db.tar 43 | 44 | ######################################################################################## 45 | virus: 46 | needs: make 47 | name: Virus data 48 | strategy: 49 | fail-fast: false 50 | matrix: 51 | machine: [ubuntu-latest, macOS-13] 52 | runs-on: ['${{ 
matrix.machine }}'] 53 | env: 54 | INPUT_DIR: ./test/virus 55 | 56 | steps: 57 | - uses: actions/download-artifact@v4 58 | with: 59 | name: executable-artifact-${{ matrix.machine }} 60 | path: ./ 61 | 62 | - name: untar artifacts 63 | run: tar -xf kmer-db.tar 64 | 65 | - name: help 66 | run: ./kmer-db 67 | 68 | - name: version 69 | run: ./kmer-db -version 70 | 71 | - name: build 72 | run: | 73 | ./kmer-db build ${INPUT_DIR}/seqs.part1.list k18.parts.db 74 | 75 | - name: new2all 76 | run: | 77 | ./kmer-db new2all k18.parts.db ${INPUT_DIR}/seqs.part2.list k18.n2a.csv 78 | cmp k18.n2a.csv ${INPUT_DIR}/k18.n2a.csv 79 | 80 | - name: new2all (sparse) 81 | run: | 82 | ./kmer-db new2all -sparse k18.parts.db ${INPUT_DIR}/seqs.part2.list k18.n2a.sparse.csv 83 | cmp k18.n2a.sparse.csv ${INPUT_DIR}/k18.n2a.sparse.csv 84 | 85 | - name: extend 86 | run: | 87 | ./kmer-db build -extend -k 25 ${INPUT_DIR}/seqs.part2.list k18.parts.db 88 | 89 | - name: all2all 90 | run: | 91 | ./kmer-db all2all k18.parts.db k18.csv 92 | cmp k18.csv ${INPUT_DIR}/k18.csv 93 | 94 | - name: all2all (sparse) 95 | run: | 96 | ./kmer-db all2all -sparse k18.parts.db k18.sparse.csv 97 | cmp k18.sparse.csv ${INPUT_DIR}/k18.sparse.csv 98 | 99 | - name: distance 100 | run: | 101 | ./kmer-db distance jaccard k18.csv k18.csv.jaccard 102 | ./kmer-db distance min k18.csv k18.csv.min 103 | ./kmer-db distance max k18.csv k18.csv.max 104 | ./kmer-db distance cosine k18.csv k18.csv.cosine 105 | ./kmer-db distance mash k18.csv k18.csv.mash 106 | cmp k18.csv.jaccard ${INPUT_DIR}/k18.csv.jaccard 107 | cmp k18.csv.min ${INPUT_DIR}/k18.csv.min 108 | cmp k18.csv.max ${INPUT_DIR}/k18.csv.max 109 | cmp k18.csv.cosine ${INPUT_DIR}/k18.csv.cosine 110 | cmp k18.csv.mash ${INPUT_DIR}/k18.csv.mash 111 | 112 | - name: build (default k) + all2all 113 | run: | 114 | ./kmer-db build ${INPUT_DIR}/seqs.list k18.db 115 | ./kmer-db all2all k18.db k18.csv 116 | cmp k18.csv ${INPUT_DIR}/k18.csv 117 | 118 | - name: build (default k, 
multifasta) + all2all 119 | run: | 120 | ./kmer-db build -multisample-fasta ${INPUT_DIR}/multi.list k18.multi.db 121 | ./kmer-db all2all k18.multi.db k18.multi.csv 122 | cmp k18.multi.csv ${INPUT_DIR}/k18.csv 123 | 124 | - name: build (default k, 2 x multifasta) + all2all 125 | run: | 126 | ./kmer-db build -multisample-fasta ${INPUT_DIR}/multi.split.list k18.multi.split.db 127 | ./kmer-db all2all k18.multi.split.db k18.multi.split.csv 128 | cmp k18.multi.split.csv ${INPUT_DIR}/k18.csv 129 | 130 | - name: build (default k) + extend + all2all 131 | run: | 132 | ./kmer-db build ${INPUT_DIR}/seqs.part1.list k18.parts.db 133 | ./kmer-db build -extend -k 25 ${INPUT_DIR}/seqs.part2.list k18.parts.db 134 | ./kmer-db all2all k18.parts.db k18.parts.csv 135 | cmp k18.parts.csv ${INPUT_DIR}/k18.csv 136 | 137 | - name: build (default k, fraction 0.1) + all2all 138 | run: | 139 | ./kmer-db build -f 0.1 ${INPUT_DIR}/seqs.list k18.frac.db 140 | ./kmer-db all2all k18.frac.db k18.frac.csv 141 | cmp k18.frac.csv ${INPUT_DIR}/k18.frac.csv 142 | 143 | - name: minhash (default k, fraction 0.1) + build + all2all 144 | run: | 145 | ./kmer-db minhash -f 0.1 ${INPUT_DIR}/seqs.list 146 | ./kmer-db build -from-minhash ${INPUT_DIR}/seqs.list k18.minhash.db 147 | ./kmer-db all2all k18.minhash.db k18.minhash.csv 148 | cmp k18.minhash.csv ${INPUT_DIR}/k18.frac.csv 149 | 150 | - name: build (k=24) + all2all 151 | run: | 152 | ./kmer-db build -k 24 ${INPUT_DIR}/seqs.list k24.db 153 | ./kmer-db all2all k24.db k24.csv 154 | cmp k24.csv ${INPUT_DIR}/k24.csv 155 | 156 | - name: build (k=25, f=0.1) + one2all 157 | run: | 158 | ./kmer-db build -k 25 -f 0.1 ${INPUT_DIR}/seqs.part1.list k25.db 159 | ./kmer-db one2all k25.db ${INPUT_DIR}/data/MT159713 MT159713.csv 160 | cmp MT159713.csv ${INPUT_DIR}/MT159713.csv 161 | 162 | - name: new2all (against itself) 163 | run: | 164 | ./kmer-db new2all k18.db ${INPUT_DIR}/seqs.list k18.n2a.itself.csv 165 | cmp k18.n2a.itself.csv ${INPUT_DIR}/k18.n2a.itself.csv 166 | 
167 | 168 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /src/Debug 2 | /src/Release 3 | /src/x64 4 | /src/.vs/kmer_db/v14/*.suo 5 | /src/kmer_db.VC.VC.opendb 6 | /src/kmer_db.VC.db 7 | *.txt 8 | *.db 9 | *.stat 10 | *.ipch 11 | *.vspx 12 | *.psess 13 | *.opendb 14 | *.sqlite 15 | *.json 16 | *.suo 17 | *.o 18 | /src/kmer_db.vcxproj.user 19 | /src/.vs 20 | /kmer-db 21 | /src/kmer_db/x64 22 | /libs/nasm 23 | /bin 24 | /obj 25 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rd_party/mimalloc"] 2 | path = libs/mimalloc 3 | url = https://github.com/refresh-bio-dependencies/mimalloc.git 4 | [submodule "3rd_party/zlib-ng"] 5 | path = libs/zlib-ng 6 | url = https://github.com/refresh-bio-dependencies/zlib-ng.git 7 | [submodule "3rd_party/isa-l"] 8 | path = libs/isa-l 9 | url = https://github.com/refresh-bio-dependencies/isa-l.git 10 | -------------------------------------------------------------------------------- /libs/prebuild.bat: -------------------------------------------------------------------------------- 1 | rem %1 - $(SolutionDir) 2 | rem %2 - $(Configuration) 3 | 4 | cd %1\..\libs 5 | 6 | 7 | @echo "nasm" 8 | 9 | if exist nasm/nasm.exe ( 10 | @echo "nasm.exe already exists" 11 | cd nasm 12 | ) else ( 13 | rmdir /S /Q nasm 14 | mkdir nasm 15 | cd nasm 16 | curl -L --ssl-no-revoke https://github.com/refresh-bio-dependencies/nasm/releases/download/v2.16.01/nasm-2.16.01-win64.zip --output nasm-2.16.01-win64.zip 17 | tar -xf nasm-2.16.01-win64.zip --strip-components 1 18 | ) 19 | 20 | set PATH=%PATH%;%cd% 21 | cd .. 22 | 23 | 24 | @echo "zlib-ng" 25 | cd zlib-ng 26 | cmake -B build-vs -S . -DZLIB_COMPAT=ON 27 | cmake --build build-vs --config %2 28 | cd .. 
29 | 30 | @echo "isa-l" 31 | cd isa-l 32 | nmake -f Makefile.nmake 33 | -------------------------------------------------------------------------------- /libs/refresh/logs/lib/progress.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace refresh 9 | { 10 | class progress_state 11 | { 12 | enum class type_t {none, number, of_total, percent}; 13 | 14 | type_t type{ type_t::none }; 15 | 16 | uint64_t counter{ 0 }; 17 | uint64_t total{ 0 }; 18 | 19 | std::string prefix; 20 | std::string separator; 21 | std::string suffix; 22 | 23 | int precision{ -1 }; 24 | double mult{ 1 }; 25 | 26 | std::string message; 27 | bool message_checked{ false }; 28 | 29 | void adjust_precision() 30 | { 31 | if (total == 0) 32 | { 33 | total = 1; // Should be error message here? 34 | } 35 | 36 | if (precision < 0) 37 | { 38 | if (total <= 100) 39 | precision = 0; 40 | else if (total <= 10000) 41 | precision = 1; 42 | else if (total <= 1000000) 43 | precision = 2; 44 | else 45 | precision = 3; 46 | } 47 | else if (precision > 6) 48 | precision = 6; 49 | 50 | mult = pow(10, precision); 51 | } 52 | 53 | void build_message() 54 | { 55 | std::string s; 56 | char buffer[16]; 57 | 58 | if (type == type_t::number) 59 | s = std::to_string(counter); 60 | else if (type == type_t::of_total) 61 | s = prefix + std::to_string(counter) + separator + std::to_string(total) + suffix; 62 | else if (type == type_t::percent) 63 | { 64 | auto r = std::to_chars(buffer, buffer + 16, 100.0 * counter / total, std::chars_format::fixed, precision); 65 | 66 | if (r.ec != std::errc()) 67 | s = ""; 68 | else 69 | { 70 | s = prefix; 71 | s.append(buffer, r.ptr - buffer); 72 | s += suffix; 73 | } 74 | } 75 | 76 | if (s != message) 77 | { 78 | message = move(s); 79 | message_checked = false; 80 | } 81 | } 82 | 83 | public: 84 | progress_state() : 85 | type(type_t::number) 86 | {} 87 | 88 | 
progress_state(uint64_t total, const std::string &prefix, const std::string &separator, const std::string &suffix) : 89 | type(type_t::of_total), 90 | total(total), 91 | prefix(prefix), 92 | separator(separator), 93 | suffix(suffix) 94 | {} 95 | 96 | progress_state(uint64_t total, const std::string& prefix, const std::string& suffix, int precision) : 97 | type(type_t::percent), 98 | total(total), 99 | prefix(prefix), 100 | suffix(suffix), 101 | precision(precision) 102 | { 103 | adjust_precision(); 104 | } 105 | 106 | const std::string& str() 107 | { 108 | message_checked = true; 109 | return message; 110 | } 111 | 112 | bool increment(uint64_t n) 113 | { 114 | counter += n; 115 | build_message(); 116 | 117 | return !message_checked; 118 | } 119 | 120 | bool was_checked() const 121 | { 122 | return message_checked; 123 | } 124 | }; 125 | } -------------------------------------------------------------------------------- /libs/refresh/sort/lib/pdqsort_par.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/refresh-bio/kmer-db/bd8c42a862298ef30ef101814db8214ae7074161/libs/refresh/sort/lib/pdqsort_par.h -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: kmer-db 2 | 3 | # *** REFRESH makefile utils 4 | include refresh.mk 5 | 6 | $(call INIT_SUBMODULES) 7 | $(call INIT_GLOBALS) 8 | $(call CHECK_OS_ARCH, $(PLATFORM)) 9 | 10 | # *** Project directories 11 | $(call SET_SRC_OBJ_BIN,src,obj,bin) 12 | 3RD_PARTY_DIR := ./libs 13 | 14 | SRC_SIMD_DIR := $(SRC_DIR)/simd 15 | OBJ_SIMD_DIR := $(OBJ_DIR)/simd 16 | 17 | # *** Project configuration 18 | $(call CHECK_NASM) 19 | $(call PROPOSE_ZLIB_NG, $(3RD_PARTY_DIR)/zlib-ng) 20 | $(call PROPOSE_ISAL, $(3RD_PARTY_DIR)/isa-l) 21 | $(call ADD_MIMALLOC, $(3RD_PARTY_DIR)/mimalloc) 22 | $(call CHOOSE_GZIP_DECOMPRESSION) 23 | $(call 
ADD_REFRESH_LIB, $(3RD_PARTY_DIR)) 24 | $(call SET_STATIC, $(STATIC_LINK)) 25 | $(call SET_C_CPP_STANDARDS, c11, c++20) 26 | $(call SET_GIT_COMMIT) 27 | 28 | $(call SET_FLAGS, $(TYPE)) 29 | 30 | $(call SET_COMPILER_VERSION_ALLOWED, GCC, Linux_x86_64, 10, 20) 31 | $(call SET_COMPILER_VERSION_ALLOWED, GCC, Linux_aarch64, 11, 20) 32 | $(call SET_COMPILER_VERSION_ALLOWED, GCC, Darwin_x86_64, 11, 13) 33 | $(call SET_COMPILER_VERSION_ALLOWED, GCC, Darwin_arm64, 11, 13) 34 | 35 | ifneq ($(MAKECMDGOALS),clean) 36 | $(call CHECK_COMPILER_VERSION) 37 | endif 38 | 39 | # *** Source files and rules 40 | $(eval $(call PREPARE_DEFAULT_COMPILE_RULE,MAIN,)) 41 | $(eval $(call PREPARE_DEFAULT_COMPILE_RULE,KMC_API,kmc_api)) 42 | 43 | # *** SIMD rules 44 | # Main kmer-db files 45 | ifeq ($(ARCH_TYPE),x86_64) 46 | SRC_SIMD := $(SRC_SIMD_DIR)/row_add_avx.cpp $(SRC_SIMD_DIR)/row_add_avx2.cpp 47 | $(OBJ_SIMD_DIR)/row_add_avx.cpp.o: $(SRC_SIMD_DIR)/row_add_avx.cpp 48 | @mkdir -p $(OBJ_SIMD_DIR) 49 | $(CXX) $(CPP_FLAGS_AVX) $(OPTIMIZATION_FLAGS) $(ARCH_FLAGS) $(INCLUDE_DIRS) -MMD -MF $@.d -c $< -o $@ 50 | $(OBJ_SIMD_DIR)/row_add_avx2.cpp.o: $(SRC_SIMD_DIR)/row_add_avx2.cpp 51 | @mkdir -p $(OBJ_SIMD_DIR) 52 | $(CXX) $(CPP_FLAGS_AVX2) $(OPTIMIZATION_FLAGS) $(ARCH_FLAGS) $(INCLUDE_DIRS) -MMD -MF $@.d -c $< -o $@ 53 | else 54 | SRC_SIMD := $(SRC_SIMD_DIR)/row_add_neon.cpp 55 | $(OBJ_SIMD_DIR)/row_add_neon.cpp.o: $(SRC_SIMD_DIR)/row_add_neon.cpp 56 | @mkdir -p $(OBJ_SIMD_DIR) 57 | $(CXX) $(CPP_FLAGS_NEON) $(OPTIMIZATION_FLAGS) $(ARCH_FLAGS) $(INCLUDE_DIRS) -MMD -MF $@.d -c $< -o $@ 58 | endif 59 | 60 | OBJ_SIMD := $(patsubst $(SRC_SIMD_DIR)/%.cpp, $(OBJ_SIMD_DIR)/%.cpp.o, $(SRC_SIMD)) 61 | 62 | # Dependency files (needed only for SIMD) 63 | -include $(OBJ_SIMD:.o=.o.d) 64 | 65 | # *** Targets 66 | kmer-db: $(OUT_BIN_DIR)/kmer-db 67 | $(OUT_BIN_DIR)/kmer-db: $(GZ_TARGET) mimalloc_obj \ 68 | $(OBJ_MAIN) $(OBJ_KMC_API) $(OBJ_SIMD) 69 | -mkdir -p $(OUT_BIN_DIR) 70 | $(CXX) -o $@ \ 71 | 
$(MIMALLOC_OBJ) \ 72 | $(OBJ_MAIN) $(OBJ_KMC_API) $(OBJ_SIMD) \ 73 | $(LIBRARY_FILES) $(LINKER_FLAGS) $(LINKER_DIRS) 74 | 75 | # *** Cleaning 76 | .PHONY: clean init 77 | clean: clean-zlib-ng clean-isa-l clean-mimalloc_obj 78 | -rm -r $(OBJ_DIR) 79 | -rm -r $(OUT_BIN_DIR) 80 | 81 | init: 82 | $(call INIT_SUBMODULES) 83 | 84 | -------------------------------------------------------------------------------- /quick-start.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | INPUT=./test/virus 3 | OUTPUT=./output 4 | 5 | mkdir $OUTPUT 6 | 7 | # build a database from all 18-mers (default) contained in a set of sequences 8 | ./bin/kmer-db build $INPUT/seqs.part1.list $OUTPUT/k18.db 9 | 10 | # establish numbers of common k-mers between new sequences and the database 11 | ./bin/kmer-db new2all $OUTPUT/k18.db $INPUT/seqs.part2.list $OUTPUT/n2a.csv 12 | 13 | # calculate jaccard index from common k-mers 14 | ./bin/kmer-db distance jaccard $OUTPUT/n2a.csv $OUTPUT/n2a.jaccard 15 | 16 | # extend the database with new sequences 17 | ./bin/kmer-db build -extend $INPUT/seqs.part2.list $OUTPUT/k18.db 18 | 19 | # establish numbers of common k-mers between all sequences in the database 20 | ./bin/kmer-db all2all $OUTPUT/k18.db $OUTPUT/a2a.csv 21 | 22 | # build a database from 10% of 25-mers using 16 threads 23 | ./bin/kmer-db build -k 25 -f 0.1 -t 16 $INPUT/seqs.part1.list $OUTPUT/k25.db 24 | 25 | # establish number of common 25-mers between single sequence and the database 26 | # (minhash filtering that retains 10% of MT159713 k-mers is done prior to the comparison) 27 | ./bin/kmer-db one2all $OUTPUT/k25.db $INPUT/data/MT159713.fasta $OUTPUT/MT159713.csv 28 | 29 | # build two partial databases 30 | ./bin/kmer-db build $INPUT/seqs.part1.list $OUTPUT/k18.parts1.db 31 | ./bin/kmer-db build $INPUT/seqs.part2.list $OUTPUT/k18.parts2.db 32 | 33 | # establish numbers of common k-mers between all sequences in the databases, 34 | # 
computations are done in the sparse mode, the output matrix is also sparse 35 | echo $OUTPUT/k18.parts1.db > $OUTPUT/db.list 36 | echo $OUTPUT/k18.parts2.db >> $OUTPUT/db.list 37 | ./bin/kmer-db all2all-parts $OUTPUT/db.list $OUTPUT/k18.parts.csv 38 | -------------------------------------------------------------------------------- /src/aligned_vector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | 10 | // ***************************************************************************************** 11 | // 12 | template class aligned_vector 13 | { 14 | 15 | size_t data_size; 16 | size_t data_allocated; 17 | T *data_aligned; 18 | char *data_raw; 19 | 20 | // ***************************************************************************************** 21 | // 22 | void allocate(void) 23 | { 24 | if (data_raw) 25 | delete[] data_raw; 26 | 27 | data_allocated = data_size; 28 | if (data_allocated < 16) 29 | data_allocated = 16; 30 | 31 | size_t bytes_to_allocate = data_allocated * sizeof(T) + ALIGNMENT; 32 | 33 | data_raw = new char[bytes_to_allocate]; 34 | 35 | size_t address = reinterpret_cast(data_raw); 36 | size_t offset = ALIGNMENT - address % ALIGNMENT; 37 | 38 | data_aligned = reinterpret_cast(data_raw + offset); 39 | } 40 | 41 | // ***************************************************************************************** 42 | // 43 | void free() 44 | { 45 | if (data_raw) 46 | delete[] data_raw; 47 | data_raw = nullptr; 48 | } 49 | 50 | public: 51 | typedef T value_type; 52 | 53 | size_t get_bytes() const { 54 | return data_allocated * sizeof(T) + ALIGNMENT; 55 | } 56 | 57 | // 
***************************************************************************************** 58 | // 59 | aligned_vector(size_t _data_size = 0) : data_size(_data_size), data_aligned(nullptr), data_raw(nullptr) 60 | { 61 | allocate(); 62 | } 63 | 64 | // ***************************************************************************************** 65 | // 66 | ~aligned_vector() 67 | { 68 | free(); 69 | } 70 | 71 | // ***************************************************************************************** 72 | // 73 | void swap(aligned_vector &x) 74 | { 75 | ::swap(data_aligned, x.data_aligned); 76 | ::swap(data_raw, x.data_raw); 77 | ::swap(data_size, x.data_size); 78 | ::swap(data_allocated, x.data_allocated); 79 | } 80 | 81 | // ***************************************************************************************** 82 | // 83 | size_t size() 84 | { 85 | return data_size; 86 | } 87 | 88 | // ***************************************************************************************** 89 | // 90 | void resize(size_t new_size) 91 | { 92 | if (new_size > data_allocated) 93 | { 94 | data_size = new_size; 95 | allocate(); 96 | } 97 | else 98 | data_size = new_size; 99 | } 100 | 101 | // ***************************************************************************************** 102 | // 103 | T* begin() // pseudo iterator 104 | { 105 | return data_aligned; 106 | } 107 | 108 | // ***************************************************************************************** 109 | // 110 | T* end() // pseudo iterator 111 | { 112 | return data_aligned + data_size; 113 | } 114 | 115 | // ***************************************************************************************** 116 | // 117 | T* data() 118 | { 119 | return data_aligned; 120 | } 121 | 122 | // ***************************************************************************************** 123 | // 124 | T& operator[](size_t pos) 125 | { 126 | return data_aligned[pos]; 127 | } 128 | 129 | // 
***************************************************************************************** 130 | // 131 | const T& operator[](size_t pos) const 132 | { 133 | return data_aligned[pos]; 134 | } 135 | }; -------------------------------------------------------------------------------- /src/alphabet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | enum class AlphabetType { 11 | nt, 12 | nt_preserve, 13 | aa, 14 | aa11_diamond, 15 | aa12_mmseqs, 16 | aa6_dayhoff, 17 | unknown 18 | }; 19 | 20 | // ***************************************************************************************** 21 | // 22 | class Alphabet { 23 | 24 | public: 25 | struct Description { 26 | AlphabetType type; 27 | std::string name; 28 | std::string groups; 29 | bool preserveStrand; 30 | }; 31 | 32 | Alphabet(const Description& desc) : 33 | type(desc.type), 34 | name(desc.name), 35 | preserveStrand(desc.preserveStrand), 36 | size(std::count(desc.groups.begin(), desc.groups.end(), ',') + 1), 37 | bitsPerSymbol(std::round(std::ceil(std::log2(size)))), 38 | maxKmerLen(64 / bitsPerSymbol - 1) // highest bit in a value is reserved for the hash table use 39 | 40 | { 41 | std::fill_n(mapping, 256, -1); 42 | 43 | // translate comma-separated string into vector of strings 44 | std::string line; 45 | std::vector vec; 46 | std::stringstream ss(desc.groups); 47 | while (std::getline(ss, line, ',')) { 48 | vec.push_back(line); 49 | } 50 | 51 | for (int gi = 0; gi < vec.size(); ++gi) { 52 | const auto& group = vec[gi]; 53 | 54 | for (unsigned char c : group) { 55 | mapping[std::tolower(c)] = mapping[std::toupper(c)] = gi; 56 | } 57 | } 58 | } 59 | 60 | public: 61 | const AlphabetType type; 62 | const std::string name; 63 | const bool preserveStrand; 64 | const int size; 65 | const int bitsPerSymbol; 66 | const int maxKmerLen; 67 | 68 | int8_t map(char c) const { 
return mapping[c]; } 69 | 70 | protected: 71 | int8_t mapping[256]; 72 | }; 73 | 74 | // ***************************************************************************************** 75 | // 76 | class AlphabetFactory { 77 | private: 78 | 79 | std::vector descriptions { 80 | { AlphabetType::nt, "nt", "A,C,G,TU", false }, 81 | { AlphabetType::nt_preserve, "nt-preserve", "A,C,G,TU", true }, 82 | { AlphabetType::aa, "aa", "K,R,E,D,Q,N,C,G,H,I,L,V,M,F,Y,W,P,S,T,A", true }, 83 | { AlphabetType::aa11_diamond, "aa11_diamond", "KREDQN,C,G,H,ILV,M,F,Y,W,P,STA", true }, 84 | { AlphabetType::aa12_mmseqs, "aa12_mmseqs", "AST,C,DN,EQ,FY,G,H,IV,KR,LM,P,W", true }, 85 | { AlphabetType::aa6_dayhoff, "aa6_dayhoff", "STPAG,NDEQ,HRK,MILV,FYW,C", true }, 86 | }; 87 | 88 | AlphabetFactory() {} 89 | 90 | public: 91 | 92 | // Creates an alphabet from enumeration 93 | Alphabet* create(AlphabetType type) { 94 | 95 | auto it = std::find_if(descriptions.begin(), descriptions.end(), 96 | [type](const Alphabet::Description& d) {return d.type == type; }); 97 | 98 | if (it == descriptions.end()) { 99 | throw std::runtime_error("Invalid alphabet type"); 100 | } 101 | 102 | return new Alphabet(*it); 103 | } 104 | 105 | // Creates an alphabet from string 106 | Alphabet* create(const std::string& name) { 107 | return create(str2type(name)); 108 | } 109 | 110 | // Converts string to enumeration 111 | AlphabetType str2type(const std::string& name) { 112 | 113 | auto it = std::find_if(descriptions.begin(), descriptions.end(), 114 | [&name](const Alphabet::Description& d) {return d.name == name; }); 115 | 116 | if (it == descriptions.end()) { 117 | throw std::runtime_error("Invalid alphabet type"); 118 | } 119 | 120 | return it->type; 121 | } 122 | 123 | static AlphabetFactory& instance() { 124 | static AlphabetFactory factory; 125 | return factory; 126 | } 127 | }; -------------------------------------------------------------------------------- /src/bubble_helper.h: 
#pragma once

#include <algorithm>
#include <cstdint>
#include <vector>

using namespace std;

// *****************************************************************************************
// Keeps a list of "bubbles" (a k-mer count plus one or two lists of ids) together with an
// inverted index telling, for every id, which bubbles it belongs to. get_adders() uses that
// index to return, for a queried id, the id ranges that have to be processed for each
// bubble containing it.
//
// NOTE(review): every id range handed to add() is assumed to be sorted ascending - add()
// takes *(last - 1) as the largest id and get_adders() runs lower_bound on the stored
// ids. Confirm against the callers.
class CBubbleHelper
{
public:
    // One entry of the get_adders() result: the bubble's k-mer count and a range of ids.
    // The iterators point into id vectors owned by this helper, so they are meaningful
    // only as long as the helper and its bubbles are alive and unmodified.
    struct bubble_adders_t
    {
        uint32_t num_kmers;
        vector<uint32_t>::iterator first;
        vector<uint32_t>::iterator last;

        bubble_adders_t(uint32_t num_kmers, vector<uint32_t>::iterator first, vector<uint32_t>::iterator last) :
            num_kmers(num_kmers), first(first), last(last)
        {}
    };

private:
    // Size threshold used by is_bubble().
    size_t bubble_size_thr;

    // A stored bubble. ids2 is left empty by the single-range constructor; get_adders()
    // uses that emptiness to tell the two modes apart.
    struct bubble_t
    {
        uint32_t num_kmers;
        vector<uint32_t> ids1;
        vector<uint32_t> ids2;

        template<typename Iter>
        bubble_t(uint32_t num_kmers, Iter first1, Iter last1, Iter first2, Iter last2) :
            num_kmers(num_kmers), ids1(first1, last1), ids2(first2, last2)
        {}

        template<typename Iter>
        bubble_t(uint32_t num_kmers, Iter first, Iter last) :
            num_kmers(num_kmers), ids1(first, last), ids2{}
        {}
    };

    // presence1[id] / presence2[id]: indices (into `bubbles`) of the bubbles whose
    // ids1 / ids2 contain `id`.
    vector<vector<uint32_t>> presence1;
    vector<vector<uint32_t>> presence2;

    vector<bubble_t> bubbles;

public:
    // bubble_size_thr - minimal size from which is_bubble() answers true.
    CBubbleHelper(size_t bubble_size_thr = 8000) :
        bubble_size_thr(bubble_size_thr)
    {}

    // True when no bubble has been stored.
    bool empty() const
    {
        return bubbles.empty();
    }

    // Drops the presence indices and pre-sizes presence1 for ids 0..n-1.
    void resize(size_t n)
    {
        clear();
        presence1.resize(n);
    }

    // Drops the presence indices and pre-sizes presence1 for n1 ids and presence2 for n2 ids.
    void resize(size_t n1, size_t n2)
    {
        clear();
        presence1.resize(n1);
        presence2.resize(n2);       // fixed: presence1 used to be resized a second time here
    }

    // Releases both presence indices.
    // NOTE(review): `bubbles` is not cleared here, so empty() keeps reporting the bubbles
    // stored earlier - confirm this is intended before changing it.
    void clear()
    {
        presence1.clear();
        presence2.clear();
        presence1.shrink_to_fit();
        presence2.shrink_to_fit();
    }

    // Single-list test: n reaches the threshold.
    bool is_bubble(size_t n)
    {
        return n >= bubble_size_thr;
    }

    // Two-list test: the product n1 * n2 reaches the squared threshold.
    bool is_bubble(size_t n1, size_t n2)
    {
        return n1 * n2 >= bubble_size_thr * bubble_size_thr;
    }

    // Stores a single-list bubble and records it in presence1 for each of its ids.
    // An empty range is ignored (it used to dereference *(last - 1), which is undefined).
    template<typename Iter>
    void add(uint32_t num_kmers, Iter first, Iter last)
    {
        if (first == last)
            return;

        uint32_t bubble_id = (uint32_t) bubbles.size();

        // The last element is taken as the largest id; grow the index when it does not fit.
        uint32_t max_id = *(last - 1);
        if (presence1.size() <= max_id)
            presence1.resize(max_id + 1);

        for (auto p = first; p != last; ++p)
            presence1[*p].emplace_back(bubble_id);

        bubbles.emplace_back(num_kmers, first, last);
    }

    // Stores a two-list bubble and records it in presence1 / presence2 for the ids of the
    // first / second list. If either range is empty the call is ignored (it used to
    // dereference *(last - 1) of an empty range, which is undefined).
    template<typename Iter>
    void add(uint32_t num_kmers, Iter first1, Iter last1, Iter first2, Iter last2)
    {
        if (first1 == last1 || first2 == last2)
            return;

        uint32_t bubble_id = (uint32_t) bubbles.size();

        uint32_t max_id1 = *(last1 - 1);
        if (presence1.size() <= max_id1)
            presence1.resize(max_id1 + 1);

        // !!! TODO: Probably presence2 is unnecessary
        uint32_t max_id2 = *(last2 - 1);
        if (presence2.size() <= max_id2)
            presence2.resize(max_id2 + 1);

        for (auto p = first1; p != last1; ++p)
            presence1[*p].emplace_back(bubble_id);

        for (auto p = first2; p != last2; ++p)
            presence2[*p].emplace_back(bubble_id);

        bubbles.emplace_back(num_kmers, first1, last1, first2, last2);
    }

    // Fills bubble_adders (cleared first) with one entry per relevant bubble that lists
    // id_query in ids1:
    //  - bubble without ids2: the ids of ids1 that are smaller than id_query; the bubble
    //    is skipped when there are none,
    //  - bubble with ids2: the whole ids2 list.
    // Returns true when at least one entry was produced.
    bool get_adders(uint32_t id_query, vector<bubble_adders_t>& bubble_adders)
    {
        bubble_adders.clear();

        if (id_query >= presence1.size())
            return false;

        for (auto bid : presence1[id_query])
        {
            auto& bubble = bubbles[bid];

            if (bubble.ids2.empty())        // all2all-sp mode
            {
/*              auto p = upper_bound(bubbles[bid].ids1.begin(), bubbles[bid].ids1.end(), id_query);
                if (p != bubbles[bid].ids1.end())
                    bubble_adders.emplace_back(bubbles[bid].num_kmers, p, bubbles[bid].ids1.end());*/

                // [begin, p) holds exactly the ids that compare less than id_query
                auto p = lower_bound(bubble.ids1.begin(), bubble.ids1.end(), id_query);

                if (p != bubble.ids1.begin())
                    bubble_adders.emplace_back(bubble.num_kmers, bubble.ids1.begin(), p);
            }
            else                            // db2db-sp mode
            {
                bubble_adders.emplace_back(bubble.num_kmers, bubble.ids2.begin(), bubble.ids2.end());
            }
        }

        return !bubble_adders.empty();
    }
};
-------------------------------------------------------------------------------- /src/console.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #pragma once 10 | #include "params.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // ***************************************************************************************** 17 | class usage_error : public std::runtime_error { 18 | Params::Mode mode; 19 | public: 20 | usage_error(Params::Mode mode) : std::runtime_error(""), mode(mode) {} 21 | Params::Mode getMode() const { return mode; } 22 | }; 23 | 24 | // ***************************************************************************************** 25 | class Console 26 | { 27 | protected: 28 | refresh::active_thread_pool atp; 29 | public: 30 | Console() : atp(4, 1024, std::chrono::milliseconds(2)) {}; 31 | virtual void run(const Params& params) = 0; 32 | virtual ~Console() {} 33 | }; 34 | 35 | // ***************************************************************************************** 36 | class BuildConsole : public Console { 37 | public: 38 | virtual void run(const Params& params) override; 39 | }; 40 | 41 | class All2AllConsole : public Console { 42 | public: 43 | virtual void run(const Params& params) override; 44 | }; 45 | 46 | 47 | class All2AllSparseConsole : public Console { 48 | public: 49 | virtual void run(const Params& params) override; 50 | }; 51 | 52 | class All2AllPartsConsole : public Console { 53 | public: 54 | virtual void run(const Params& params) override; 55 | }; 56 | 57 | class New2AllConsole : public Console { 58 | public: 59 | virtual void run(const Params& params) override; 60 | }; 61 | 62 | class One2AllConsole : 
public Console { 63 | public: 64 | virtual void run(const Params& params) override; 65 | }; 66 | 67 | class Db2DbConsole : public Console { 68 | public: 69 | virtual void run(const Params& params) override; 70 | }; 71 | 72 | class MinhashConsole : public Console { 73 | public: 74 | virtual void run(const Params& params) override; 75 | }; 76 | 77 | class DistanceConsole : public Console { 78 | public: 79 | virtual void run(const Params& params) override; 80 | }; 81 | 82 | // ***************************************************************************************** 83 | class ConsoleFactory { 84 | public: 85 | 86 | static std::unique_ptr create(Params::Mode mode) { 87 | 88 | Console* p = nullptr; 89 | 90 | if (mode == Params::Mode::build) { 91 | p = new BuildConsole(); 92 | } else if (mode == Params::Mode::all2all) { 93 | p = new All2AllConsole(); 94 | } else if (mode == Params::Mode::all2all_parts) { 95 | p = new All2AllPartsConsole(); 96 | } else if (mode == Params::Mode::all2all_sparse) { 97 | p = new All2AllSparseConsole(); 98 | } else if (mode == Params::Mode::new2all) { 99 | p = new New2AllConsole(); 100 | } else if (mode == Params::Mode::one2all) { 101 | p = new One2AllConsole(); 102 | } else if (mode == Params::Mode::distance) { 103 | p = new DistanceConsole(); 104 | } else if (mode == Params::Mode::minhash) { 105 | p = new MinhashConsole(); 106 | } 107 | 108 | return std::unique_ptr(p); 109 | } 110 | }; -------------------------------------------------------------------------------- /src/console_all2all.cpp: -------------------------------------------------------------------------------- 1 | #include "console.h" 2 | #include "similarity_calculator.h" 3 | 4 | #include 5 | #include 6 | 7 | void All2AllConsole::run(const Params& params) { 8 | 9 | if (params.files.size() != 2) { 10 | throw usage_error(params.mode); 11 | } 12 | 13 | LOG_NORMAL("All versus all comparison" << endl); 14 | 15 | const std::string& dbFilename = params.files[0]; 16 | const std::string& 
similarityFile = params.files[1]; 17 | 18 | std::ifstream dbFile(dbFilename, std::ios::binary); 19 | std::ofstream ofs(similarityFile); 20 | PrefixKmerDb* db = new PrefixKmerDb(params.numThreads); 21 | SimilarityCalculator calculator(params.numThreads, params.cacheBufferMb); 22 | 23 | std::chrono::duration dt{ 0 }; 24 | LOG_NORMAL("Loading k-mer database " << dbFilename << "..." << endl); 25 | auto start = std::chrono::high_resolution_clock::now(); 26 | if (!dbFile || !db->deserialize(dbFile, AbstractKmerDb::DeserializationMode::SkipHashtables)) { 27 | throw runtime_error("Cannot open k-mer database " + dbFilename); 28 | } 29 | dt = std::chrono::high_resolution_clock::now() - start; 30 | 31 | LOG_NORMAL("Calculating matrix of common k-mers..." << endl); 32 | start = std::chrono::high_resolution_clock::now(); 33 | LowerTriangularMatrix matrix; 34 | calculator.all2all(*db, matrix); 35 | dt = std::chrono::high_resolution_clock::now() - start; 36 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 37 | 38 | LOG_NORMAL("Storing matrix of common k-mers in " << similarityFile << "..."); 39 | start = std::chrono::high_resolution_clock::now(); 40 | ofs << "kmer-length: " << db->getKmerLength() << " fraction: " << db->getFraction() << " ,db-samples ,"; 41 | std::copy(db->getSampleNames().cbegin(), db->getSampleNames().cend(), ostream_iterator(ofs, ",")); 42 | ofs << endl; 43 | 44 | // allocate row buffer (10000 for sample name + 100 for each row) 45 | char* row = new char[10000 + db->getSamplesCount() * 100]; 46 | char* ptr = row; 47 | 48 | ptr += sprintf(ptr, "query-samples,total-kmers,"); 49 | ptr += num2str(db->getSampleKmersCount().data(), db->getSampleKmersCount().size(), ',', ptr); 50 | *ptr++ = '\n'; 51 | ofs.write(row, ptr - row); 52 | 53 | if (params.sparseOut) { 54 | 55 | CombinedFilter filter( 56 | params.metricFilters, 57 | params.kmerFilter, 58 | db->getSampleKmersCount(), 59 | db->getSampleKmersCount(), 60 | db->getKmerLength()); 61 | 62 | 
matrix.compact(filter); 63 | } 64 | 65 | for (size_t sid = 0; sid < db->getSamplesCount(); ++sid) { 66 | ptr = row; 67 | ptr += sprintf(ptr, "%s,%lu,", db->getSampleNames()[sid].c_str(), (unsigned long)db->getSampleKmersCount()[sid]); 68 | 69 | if (params.sparseOut) { 70 | ptr += matrix.saveRowSparse(sid, ptr); 71 | } 72 | else { 73 | ptr += matrix.saveRow(sid, ptr); 74 | } 75 | 76 | *ptr++ = '\n'; 77 | ofs.write(row, ptr - row); 78 | } 79 | 80 | delete[] row; 81 | 82 | dt = std::chrono::high_resolution_clock::now() - start; 83 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 84 | 85 | LOG_NORMAL("Releasing memory..."); 86 | start = std::chrono::high_resolution_clock::now(); 87 | delete db; 88 | dt = std::chrono::high_resolution_clock::now() - start; 89 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 90 | } -------------------------------------------------------------------------------- /src/console_all2all_sparse.cpp: -------------------------------------------------------------------------------- 1 | #include "console.h" 2 | #include "prefix_kmer_db.h" 3 | #include "similarity_calculator.h" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | using sampler_t = Sampler; 10 | 11 | // ***************************************************************************************** 12 | // 13 | void All2AllSparseConsole::run(const Params& params) { 14 | uint32_t sampling_max_no_items = params.samplingSize; 15 | bool do_sampling = sampling_max_no_items != 0; 16 | sampler_t::strategy_t sampling_strategy = params.samplingCriterion ? 
sampler_t::strategy_t::best : sampler_t::strategy_t::random; 17 | 18 | if (params.files.size() != 2) { 19 | throw usage_error(params.mode); 20 | } 21 | 22 | LOG_NORMAL("All versus all comparison (sparse computation)" << endl); 23 | 24 | const std::string& dbFilename = params.files[0]; 25 | const std::string& similarityFile = params.files[1]; 26 | 27 | std::ifstream dbFile(dbFilename, std::ios::binary); 28 | std::ofstream ofs(similarityFile, std::ios::binary); 29 | PrefixKmerDb* db = new PrefixKmerDb(params.numThreads); 30 | SimilarityCalculator calculator(params.numThreads, params.cacheBufferMb); 31 | 32 | std::chrono::duration dt{ 0 }; 33 | LOG_NORMAL("Loading k-mer database " << dbFilename << "..." << endl); 34 | auto start = std::chrono::high_resolution_clock::now(); 35 | if (!dbFile || !db->deserialize(dbFile, AbstractKmerDb::DeserializationMode::SkipHashtables)) { 36 | throw runtime_error("Cannot open k-mer database " + dbFilename); 37 | } 38 | dt = std::chrono::high_resolution_clock::now() - start; 39 | 40 | LOG_NORMAL("Calculating matrix of common k-mers..."); 41 | start = std::chrono::high_resolution_clock::now(); 42 | SparseMatrix matrix; 43 | CBubbleHelper bubbles(params.bubbleSize); 44 | calculator.all2all_sp(*db, matrix, bubbles); 45 | dt = std::chrono::high_resolution_clock::now() - start; 46 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 47 | 48 | LOG_NORMAL("Storing matrix of common k-mers in " << similarityFile << "..."); 49 | start = std::chrono::high_resolution_clock::now(); 50 | ofs << "kmer-length: " << db->getKmerLength() << " fraction: " << db->getFraction() << " ,db-samples ,"; 51 | std::copy(db->getSampleNames().cbegin(), db->getSampleNames().cend(), ostream_iterator(ofs, ",")); 52 | ofs << endl; 53 | 54 | // allocate row buffer (10000 for sample name + 100 for each row) 55 | char* row = new char[10000 + db->getSamplesCount() * 100]; 56 | char* ptr = row; 57 | 58 | ptr += sprintf(ptr, "query-samples,total-kmers,"); 59 | ptr += 
num2str(db->getSampleKmersCount().data(), db->getSampleKmersCount().size(), ',', ptr); 60 | *ptr++ = '\n'; 61 | ofs.write(row, ptr - row); 62 | 63 | CombinedFilter filter( 64 | params.metricFilters, 65 | params.kmerFilter, 66 | db->getSampleKmersCount(), 67 | db->getSampleKmersCount(), 68 | db->getKmerLength()); 69 | 70 | sampler_t sampler(do_sampling ? db->getSamplesCount() : 0, sampling_max_no_items, sampling_strategy); 71 | 72 | if (do_sampling) 73 | { 74 | // !!! TODO: add support for bubbles 75 | matrix.add_to_sampler(filter, sampler, params.samplingCriterion, db->getSampleKmersCount(), db->getSampleKmersCount(), 0, 0, db->getKmerLength(), params.numThreads, bubbles); 76 | matrix.clear(); 77 | } 78 | else 79 | matrix.compact2(filter, params.numThreads, bubbles); 80 | 81 | size_t no_pairs_saved = 0; 82 | 83 | for (size_t sid = 0; sid < db->getSamplesCount(); ++sid) { 84 | ptr = row; 85 | ptr += sprintf(ptr, "%s,%lu,", db->getSampleNames()[sid].c_str(), (unsigned long)db->getSampleKmersCount()[sid]); 86 | if (do_sampling) 87 | { 88 | ptr += sampler.saveRowSparse(sid, ptr, 0); 89 | no_pairs_saved += sampler.getNoInRow(sid); 90 | } 91 | else 92 | { 93 | ptr += matrix.saveRowSparse(sid, ptr, 0); 94 | no_pairs_saved += matrix.getNoInRow(sid); 95 | } 96 | *ptr++ = '\n'; 97 | ofs.write(row, ptr - row); 98 | } 99 | 100 | delete[] row; 101 | 102 | dt = std::chrono::high_resolution_clock::now() - start; 103 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 104 | 105 | LOG_NORMAL("Releasing memory..."); 106 | start = std::chrono::high_resolution_clock::now(); 107 | delete db; 108 | dt = std::chrono::high_resolution_clock::now() - start; 109 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 110 | 111 | LOG_NORMAL("No. 
saved pairs: " << no_pairs_saved << endl); 112 | } -------------------------------------------------------------------------------- /src/console_build.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "console.h" 23 | #include "kmer_db.h" 24 | #include "loader_ex.h" 25 | #include "prefix_kmer_db.h" 26 | #include "kmer_extract.h" 27 | 28 | #include "../libs/refresh/sort/lib/pdqsort_par.h" 29 | 30 | using namespace std; 31 | 32 | 33 | void BuildConsole::run(const Params& params){ 34 | if (params.files.size() != 2) { 35 | throw usage_error(params.mode); 36 | } 37 | 38 | LOG_NORMAL("Building database (from " << InputFile::format2string(params.inputFormat) << ")" << endl); 39 | // const std::string& multipleSamples(params.files[0]); 40 | const std::string multipleSamples(params.files[0]); 41 | const std::string dbFilename(params.files[1]); 42 | 43 | LOG_DEBUG("Creating PrefixKmerDb object" << endl); 44 | AbstractKmerDb* db = new PrefixKmerDb(params.numThreads); 45 | std::shared_ptr filter; 46 | std::shared_ptr alphabet; 47 | 48 | if (params.extendDb) { 49 | std::ifstream ifs; 50 | LOG_NORMAL("Loading k-mer database " << dbFilename << "..."); 51 | ifs.open(dbFilename, std::ios::binary); 52 | if (!ifs || !db->deserialize(ifs)) { 53 | throw runtime_error("Cannot open k-mer database " + dbFilename); 54 | } 55 | filter = std::shared_ptr(FilterFactory::create(db->getFraction(), db->getStartFraction(), db->getKmerLength())); 56 | alphabet = 
std::shared_ptr(AlphabetFactory::instance().create(db->getAlphabetType())); 57 | } 58 | else { 59 | filter = std::shared_ptr(FilterFactory::create(params.fraction, params.fractionStart, params.kmerLength)); 60 | alphabet = params.alphabet; 61 | } 62 | 63 | std::chrono::duration sortingTime{ 0 }, processingTime{ 0 }; 64 | 65 | LOG_NORMAL("Processing samples..." << endl); 66 | LOG_DEBUG("Creating Loader object..." << endl); 67 | 68 | LoaderEx loader(filter, alphabet, params.inputFormat, params.numReaderThreads, params.numThreads, params.multisampleFasta); 69 | loader.configure(multipleSamples); 70 | 71 | LOG_DEBUG("Starting loop..." << endl); 72 | auto totalStart = std::chrono::high_resolution_clock::now(); 73 | int sample_id = 0; 74 | for (; !loader.isCompleted(); ++sample_id) { 75 | auto partialTime = std::chrono::duration(std::chrono::high_resolution_clock::now() - totalStart); 76 | LOG_VERBOSE("Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl); 77 | 78 | auto task = loader.popTask(sample_id); 79 | 80 | if (task) { 81 | if ((sample_id + 1) % 10 == 0) { 82 | size_t cnt = loader.getSamplesCount(); 83 | if (cnt > 0) { 84 | LOG_NORMAL("\r" << sample_id + 1 << "/" << cnt << "..."); 85 | } 86 | else { 87 | LOG_NORMAL("\r" << sample_id + 1 << "..."); 88 | } 89 | } 90 | 91 | auto start = std::chrono::high_resolution_clock::now(); 92 | 93 | // postprocess k-mers if neccessary 94 | if (params.inputFormat == InputFile::Format::GENOME) { 95 | // KmerHelper::sortAndUnique(task->kmers, task->kmersCount, params.numThreads); 96 | 97 | refresh::sort::pdqsort_branchless_tp(refresh::sort::pdqsort_adjust_threads(task->kmersCount, params.numThreads), task->kmers, task->kmers + task->kmersCount, atp); 98 | // refresh::sort::pdqsort_branchless(refresh::sort::pdqsort_adjust_threads(task->kmersCount, params.numThreads), task->kmers, task->kmers + task->kmersCount); 99 | // refresh::sort::pdqsort_branchless(task->kmers, 
task->kmers + task->kmersCount); 100 | // stable_sort(task->kmers, task->kmers + task->kmersCount); 101 | auto it = std::unique(task->kmers, task->kmers + task->kmersCount); 102 | task->kmersCount = it - task->kmers; 103 | } 104 | else if (params.inputFormat == InputFile::Format::KMC) { 105 | KmerHelper::sort(task->kmers, task->kmersCount, params.numThreads); 106 | } 107 | 108 | auto start2 = std::chrono::high_resolution_clock::now(); 109 | sortingTime += start2 - start; 110 | 111 | db->addKmers( 112 | task->sampleName, 113 | task->kmers, 114 | task->kmersCount, 115 | task->kmerLength, 116 | task->fraction, 117 | params.alphabet->type, 118 | atp); 119 | 120 | processingTime += std::chrono::high_resolution_clock::now() - start2; 121 | 122 | loader.releaseTask(*task); 123 | LOG_VERBOSE(db->printProgress() << endl); 124 | } 125 | } 126 | 127 | LOG_NORMAL("\r" << sample_id << "/" << sample_id << " " << endl); 128 | 129 | auto totalTime = std::chrono::duration(std::chrono::high_resolution_clock::now() - totalStart); 130 | 131 | LOG_NORMAL(endl << endl << "EXECUTION TIMES" << endl 132 | << "Total: " << totalTime.count() << endl 133 | << "Kmer sorting/unique time: " << sortingTime.count() << endl 134 | << "Database update time:" << processingTime.count() << endl); 135 | #ifdef COLLECT_DETAILED_TIMES 136 | LOG_NORMAL(db->printDetailedTimes() << endl); 137 | #endif 138 | LOG_NORMAL("STATISTICS" << endl << db->printStats() << endl); 139 | 140 | std::chrono::duration dt{ 0 }; 141 | 142 | LOG_NORMAL("Serializing database..." 
<< endl); 143 | 144 | const size_t io_buffer_size = 64 << 20; 145 | std::ofstream ofs; 146 | ofs.open(dbFilename, ios_base::out | std::ios::binary); 147 | char* io_buffer = new char[io_buffer_size]; 148 | ofs.rdbuf()->pubsetbuf(io_buffer, io_buffer_size); 149 | db->serialize(ofs, true); 150 | ofs.close(); 151 | delete[] io_buffer; 152 | 153 | LOG_NORMAL(endl << "Releasing memory..."); 154 | auto start = std::chrono::high_resolution_clock::now(); 155 | delete db; 156 | dt = std::chrono::high_resolution_clock::now() - start; 157 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 158 | } -------------------------------------------------------------------------------- /src/console_db2db.cpp: -------------------------------------------------------------------------------- 1 | #include "console.h" 2 | #include "prefix_kmer_db.h" 3 | #include "similarity_calculator.h" 4 | #include "loader_ex.h" 5 | #include "kmer_extract.h" 6 | 7 | #include 8 | #include 9 | 10 | void Db2DbConsole::run(const Params& params) 11 | { 12 | if (params.files.size() != 3) { 13 | throw usage_error(params.mode); 14 | } 15 | 16 | const std::string& dbFilename1 = params.files[0]; 17 | const std::string& dbFilename2 = params.files[1]; 18 | const std::string& similarityFilename = params.files[2]; 19 | 20 | std::ifstream dbFile1(dbFilename1, std::ios::binary); 21 | PrefixKmerDb* db1 = new PrefixKmerDb(params.numThreads); 22 | 23 | std::ifstream dbFile2(dbFilename2, std::ios::binary); 24 | PrefixKmerDb* db2 = new PrefixKmerDb(params.numThreads); 25 | 26 | std::ofstream ofs(similarityFilename, std::ios::binary); 27 | 28 | SimilarityCalculator calculator(params.numThreads, params.cacheBufferMb); 29 | 30 | std::chrono::duration loadingTime{ 0 }, processingTime{ 0 }, dt{ 0 }; 31 | 32 | // !!! TODO: parallel load of databases 33 | LOG_NORMAL("Loading k-mer database " << dbFilename1 << "..." 
<< endl); 34 | auto start = std::chrono::high_resolution_clock::now(); 35 | if (!dbFile1 || !db1->deserialize(dbFile1)) { 36 | throw runtime_error("Cannot open k-mer database " + dbFilename1); 37 | } 38 | 39 | LOG_NORMAL("Loading k-mer database " << dbFilename2 << "..." << endl); 40 | // auto start = std::chrono::high_resolution_clock::now(); 41 | if (!dbFile2 || !db2->deserialize(dbFile2)) { 42 | throw runtime_error("Cannot open k-mer database " + dbFilename2); 43 | } 44 | 45 | dt = std::chrono::high_resolution_clock::now() - start; 46 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl << db1->printStats() << endl << db2->printStats() << endl); 47 | 48 | 49 | LOG_NORMAL("Calculating matrix of common k-mers..."); 50 | start = std::chrono::high_resolution_clock::now(); 51 | SparseMatrix matrix; 52 | CBubbleHelper bubbles; 53 | calculator.db2db_sp(*db1, *db2, matrix, bubbles); 54 | dt = std::chrono::high_resolution_clock::now() - start; 55 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 56 | 57 | LOG_NORMAL("Storing matrix of common k-mers in " << similarityFilename << "..."); 58 | start = std::chrono::high_resolution_clock::now(); 59 | ofs << "kmer-length: " << db1->getKmerLength() << " fraction: " << db1->getFraction() << " ,db-samples ,"; 60 | std::copy(db1->getSampleNames().cbegin(), db1->getSampleNames().cend(), ostream_iterator(ofs, ",")); 61 | std::copy(db2->getSampleNames().cbegin(), db2->getSampleNames().cend(), ostream_iterator(ofs, ",")); 62 | ofs << endl; 63 | 64 | // allocate row buffer (10000 for sample name + 100 for each row) 65 | char* row = new char[10000 + (db1->getSamplesCount() + db2->getSamplesCount()) * 100]; 66 | char* ptr = row; 67 | 68 | ptr += sprintf(ptr, "query-samples,total-kmers,"); 69 | ptr += num2str(db1->getSampleKmersCount().data(), db1->getSampleKmersCount().size(), ',', ptr); 70 | ptr += num2str(db2->getSampleKmersCount().data(), db2->getSampleKmersCount().size(), ',', ptr); 71 | *ptr++ = '\n'; 72 | ofs.write(row, 
ptr - row); 73 | 74 | /* if (sparse) { 75 | for(auto &row : matrix.getData()) 76 | std::replace_if(row.begin(), row.end(), 77 | [below, above](pair x) { return x.second >= below || x.second <= above; }, 0); 78 | }*/ 79 | 80 | for (size_t sid = 0; sid < db1->getSamplesCount(); ++sid) { 81 | ptr = row; 82 | ptr += sprintf(ptr, "%s,%lu,", db1->getSampleNames()[sid].c_str(), db1->getSampleKmersCount()[sid]); 83 | 84 | /* if (sparse) { 85 | ptr += matrix.saveRowSparse(sid, ptr); 86 | }*/ 87 | /* else { 88 | ptr += matrix.saveRow(sid, ptr); 89 | }*/ 90 | 91 | *ptr++ = '\n'; 92 | ofs.write(row, ptr - row); 93 | } 94 | 95 | size_t samples_count1 = db1->getSamplesCount(); 96 | 97 | for (size_t sid = 0; sid < db2->getSamplesCount(); ++sid) { 98 | ptr = row; 99 | ptr += sprintf(ptr, "%s,%lu,", db2->getSampleNames()[sid].c_str(), db2->getSampleKmersCount()[sid]); 100 | 101 | if (params.sparseOut) { 102 | ptr += matrix.saveRowSparse(samples_count1 + sid, ptr); 103 | } 104 | /* else { 105 | ptr += matrix.saveRow(sid, ptr); 106 | }*/ 107 | 108 | *ptr++ = '\n'; 109 | ofs.write(row, ptr - row); 110 | } 111 | 112 | delete[] row; 113 | 114 | dt = std::chrono::high_resolution_clock::now() - start; 115 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 116 | 117 | LOG_NORMAL("Releasing memory..."); 118 | start = std::chrono::high_resolution_clock::now(); 119 | 120 | delete db1; 121 | delete db2; 122 | 123 | dt = std::chrono::high_resolution_clock::now() - start; 124 | LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl); 125 | } -------------------------------------------------------------------------------- /src/console_distance.cpp: -------------------------------------------------------------------------------- 1 | #include "console.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void DistanceConsole::run(const Params& params) { 8 | 9 | if (params.files.size() < 2) { 10 | throw usage_error(params.mode); 11 | } 12 | LOG_NORMAL("Calculating distance measures" << 
endl); 13 | 14 | const std::string& similarityFilename = params.files[0]; 15 | 16 | std::vector dbKmerCounts; 17 | uint32_t kmerLength; 18 | 19 | LOG_NORMAL("Loading file with common k-mer counts: " << similarityFilename << "..."); 20 | ifstream similarityFile(similarityFilename); 21 | if (!similarityFile) { 22 | throw runtime_error("Cannot open common k-mers table: " + similarityFilename); 23 | } 24 | LOG_NORMAL("OK" << endl); 25 | 26 | const size_t io_buf_size = 128 << 20; 27 | char* io_buffer1 = new char[io_buf_size]; 28 | char* io_buffer2 = new char[io_buf_size]; 29 | similarityFile.rdbuf()->pubsetbuf(io_buffer1, io_buf_size); 30 | 31 | 32 | std::ofstream file(params.files[1]); 33 | std::vector dbSampleNames; 34 | 35 | string tmp, in; 36 | double fraction; 37 | similarityFile >> tmp >> kmerLength >> tmp >> fraction >> tmp; 38 | 39 | getline(similarityFile, in); // copy sample names to output files 40 | if (!params.phylipOut) { 41 | file << "kmer-length: " << kmerLength << " fraction: " << fraction << in << endl; 42 | } 43 | 44 | std::replace(in.begin(), in.end(), ',', ' '); 45 | istringstream iss(in); 46 | std::copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter(dbSampleNames)); 47 | 48 | getline(similarityFile, in); // get number of kmers for all samples 49 | std::replace(in.begin(), in.end(), ',', ' '); 50 | istringstream iss2(in); 51 | iss2 >> tmp >> tmp; 52 | std::copy(std::istream_iterator(iss2), std::istream_iterator(), std::back_inserter(dbKmerCounts)); 53 | 54 | if (params.phylipOut) { 55 | file << dbKmerCounts.size() << endl; 56 | } 57 | 58 | std::vector intersections_dense(dbKmerCounts.size()); 59 | std::vector> intersections_sparse; 60 | std::vector values_dense(dbKmerCounts.size()); 61 | std::vector> values_sparse; 62 | 63 | const size_t bufsize = 1ULL << 30; // 1 GB buffer 64 | char* outBuffer = new char[bufsize]; 65 | char* line = new char[bufsize]; 66 | char* begin, * end, * p; 67 | 68 | LOG_NORMAL("Processing rows..." 
<< endl); 69 | bool triangle = false; 70 | bool sparseOut = params.sparseOut && !params.phylipOut; // output in Phylip format is always dense 71 | 72 | auto& metric = params.availableMetrics.at(params.metricName); 73 | 74 | // fixme: check if bufsize can be removed here - maybe some auto-adjustment of outBuffer can be applied 75 | for (int row_id = 0; similarityFile.getline(line, bufsize); ++row_id) { 76 | if ((row_id + 1) % 10 == 0) { 77 | LOG_NORMAL("\r" << row_id + 1 << "/" << dbKmerCounts.size() << "... "); 78 | } 79 | 80 | // extract name 81 | end = line + similarityFile.gcount() - 1; // do not count \n 82 | begin = line; 83 | p = std::find(begin, end, ','); 84 | string queryName(begin, p); 85 | begin = p + 1; 86 | 87 | // extract kmer count 88 | num_kmers_t queryKmersCount = NumericConversions::strtol(begin, &p); // assume no white characters after the number -> p points comma 89 | begin = p + 1; 90 | 91 | std::vector queryKmerCounts(1, queryKmersCount); 92 | 93 | CombinedFilter filter( 94 | params.metricFilters, 95 | params.kmerFilter, 96 | queryKmerCounts, 97 | dbKmerCounts, 98 | kmerLength); 99 | 100 | int numRead = 0; 101 | for (numRead = 0; end - begin > 1; ++numRead) { 102 | long v = NumericConversions::strtol(begin, &p); 103 | 104 | if (*p == ':') { 105 | begin = p + 1; 106 | num_kmers_t common = NumericConversions::strtol(begin, &p); 107 | 108 | if (params.phylipOut) { 109 | intersections_dense[v - 1] = common; // 1-based indexing in file 110 | } 111 | else { 112 | // sparse input always produces sparse outputs 113 | sparseOut = true; 114 | if (common > 0 && filter(common, 0, v - 1)) { 115 | intersections_sparse.emplace_back(v - 1, common); 116 | } 117 | } 118 | 119 | } 120 | else { 121 | num_kmers_t common = v; 122 | if (sparseOut) { 123 | if (common > 0 && filter(common, 0, numRead)) { 124 | intersections_sparse.emplace_back(numRead, common); 125 | } 126 | } else { 127 | // dense form 128 | intersections_dense[numRead] = common; 129 | } 130 | } 131 
| 132 | begin = p + 1; 133 | } 134 | 135 | // determine if matrix is triangle 136 | bool emptyDiagonal = sparseOut ? (intersections_sparse.size() == 0) : (intersections_dense[0] == 0); 137 | if (row_id == 0 && queryName == dbSampleNames[0] && emptyDiagonal) { 138 | triangle = true; 139 | } 140 | 141 | // number of processed elements: 142 | // - dense triangle matrices - same as row id 143 | // - dense non-triangle - entire row 144 | // - others - same as input 145 | int numToProcess = !sparseOut 146 | ? (triangle ? row_id : intersections_dense.size()) 147 | : intersections_sparse.size(); 148 | 149 | if (sparseOut) { 150 | values_sparse.resize(intersections_sparse.size()); 151 | 152 | // non-empty row 153 | if (intersections_sparse.size() > 0) { 154 | 155 | std::transform(intersections_sparse.begin(), intersections_sparse.begin() + numToProcess, values_sparse.begin(), 156 | [&metric, &dbKmerCounts, queryKmersCount, kmerLength](const std::pair& entry) { 157 | return std::make_pair( 158 | entry.first + 1, 159 | metric(entry.second, queryKmersCount, dbKmerCounts[entry.first], kmerLength)); 160 | }); 161 | } 162 | } 163 | else { 164 | std::transform(intersections_dense.begin(), intersections_dense.begin() + numToProcess, dbKmerCounts.begin(), values_dense.begin(), 165 | [&metric, queryKmersCount, kmerLength](size_t intersection, num_kmers_t dbKmerCount) { 166 | return metric(intersection, queryKmersCount, dbKmerCount, kmerLength); 167 | }); 168 | } 169 | 170 | char* ptr = outBuffer; 171 | memcpy(ptr, queryName.c_str(), queryName.size()); 172 | ptr += queryName.size(); 173 | 174 | if (params.phylipOut) { 175 | // phylip matrices are always stored in the dense form 176 | *ptr++ = ' '; 177 | ptr += num2str(values_dense.data(), numRead, ' ', ptr); 178 | } 179 | else { 180 | *ptr++ = ','; 181 | if (sparseOut) { 182 | for (auto& x : values_sparse) { 183 | ptr += num2str(x, ptr); 184 | *ptr++ = ','; 185 | } 186 | } 187 | else { 188 | // dense matrix - write the same number of 
elements as was read 189 | ptr += num2str(values_dense.data(), numToProcess, ',', ptr); 190 | } 191 | } 192 | 193 | *ptr = 0; 194 | size_t len = ptr - outBuffer; 195 | file.write(outBuffer, len); 196 | file << endl; 197 | 198 | if (params.phylipOut || !sparseOut) { 199 | intersections_dense.assign(intersections_dense.size(), 0); 200 | } 201 | else { 202 | intersections_sparse.clear(); 203 | } 204 | } 205 | 206 | LOG_NORMAL("\r" << dbKmerCounts.size() << "/" << dbKmerCounts.size() << "..."); 207 | LOG_NORMAL("OK" << endl); 208 | 209 | delete[] outBuffer; 210 | delete[] line; 211 | delete[] io_buffer1; 212 | delete[] io_buffer2; 213 | } 214 | -------------------------------------------------------------------------------- /src/console_minhash.cpp: -------------------------------------------------------------------------------- 1 | #include "console.h" 2 | #include "minhashed_input_file.h" 3 | #include "loader_ex.h" 4 | #include "kmer_extract.h" 5 | 6 | void MinhashConsole::run(const Params& params) { 7 | 8 | if (params.files.size() != 1) { 9 | throw usage_error(params.mode); 10 | } 11 | 12 | LOG_NORMAL("Minhashing samples..." << endl); 13 | 14 | const std::string& multipleKmcSamples = params.files[0]; 15 | std::chrono::duration loadingTime{ 0 }, processingTime{ 0 }; 16 | 17 | LOG_DEBUG("Creating Loader object..." << endl); 18 | 19 | std::shared_ptr filter(FilterFactory::create(params.fraction, 0, params.kmerLength)); 20 | 21 | LoaderEx loader(filter, params.alphabet, params.inputFormat, params.numReaderThreads, params.numThreads, params.multisampleFasta); 22 | loader.configure(multipleKmcSamples); 23 | 24 | LOG_DEBUG("Starting loop..." 
// Minhashes every sample listed in the single input file and stores each
// result through MihashedInputFile::store().
//
// Throws usage_error when the number of positional file arguments is not 1.
//
// NOTE(review): in this copy of the source the template arguments of
// std::chrono::duration and std::shared_ptr are missing (they look stripped
// by text extraction) - restore them from the repository before compiling.
void MinhashConsole::run(const Params& params) {

	if (params.files.size() != 1) {
		throw usage_error(params.mode);
	}

	LOG_NORMAL("Minhashing samples..." << endl);

	const std::string& multipleKmcSamples = params.files[0];
	// loadingTime is never updated in this function; processingTime is
	// accumulated below but not reported anywhere in the visible code.
	std::chrono::duration loadingTime{ 0 }, processingTime{ 0 };

	LOG_DEBUG("Creating Loader object..." << endl);

	// start value 0: the filter accepts hashes from the bottom of the range
	std::shared_ptr filter(FilterFactory::create(params.fraction, 0, params.kmerLength));

	LoaderEx loader(filter, params.alphabet, params.inputFormat, params.numReaderThreads, params.numThreads, params.multisampleFasta);
	loader.configure(multipleKmcSamples);

	LOG_DEBUG("Starting loop..." << endl);
	auto totalStart = std::chrono::high_resolution_clock::now();
	// NOTE(review): i advances on every iteration, also when popTask(i)
	// yields no task - presumably popTask blocks until task i is ready;
	// confirm against LoaderEx.
	for (int i = 0; !loader.isCompleted(); ++i) {
		auto partialTime = std::chrono::duration(std::chrono::high_resolution_clock::now() - totalStart);
		LOG_VERBOSE("Processing time: " << partialTime.count() << ", loader buffers: " << (loader.getBytes() >> 20) << " MB" << endl);

		auto task = loader.popTask(i);

		if (task) {
			auto start = std::chrono::high_resolution_clock::now();

			MihashedInputFile file;

			// postprocess k-mers if necessary: genome input is sorted and
			// deduplicated, KMC input is only sorted
			if (params.inputFormat == InputFile::Format::GENOME) {
				KmerHelper::sortAndUnique(task->kmers, task->kmersCount, params.numThreads);
			}
			else if (params.inputFormat == InputFile::Format::KMC) {
				KmerHelper::sort(task->kmers, task->kmersCount, params.numThreads);
			}

			file.store(task->filePath, task->kmers, task->kmersCount, task->kmerLength, params.fraction);

			processingTime += std::chrono::high_resolution_clock::now() - start;
			// hand the task back to the loader once it has been stored
			loader.releaseTask(*task);
		}
	}
}
// Compares a set of new samples against an existing k-mer database and
// writes one CSV row of similarities per query sample.
//
// params.files must hold exactly three entries: database file, list of
// query samples, output file; otherwise usage_error is thrown.
// Throws runtime_error when the database cannot be opened or deserialized.
//
// Structure: params.numThreads worker threads pull tasks from the loader,
// compute one row each and push it to similarityQueue; the calling thread
// pops rows in task_id order and writes them to the output file.
//
// NOTE(review): in this copy of the source the template arguments
// (std::chrono::duration, std::shared_ptr, std::vector, RegisteringQueue,
// SynchronizedPriorityQueue, std::atomic, ostream_iterator, CombinedFilter)
// are missing and the first lambda capture reads "¶ms" instead of
// "&params" - both look like text-extraction damage; restore from the
// repository before compiling.
void New2AllConsole::run(const Params& params)
{
	if (params.files.size() != 3) {
		throw usage_error(params.mode);
	}

	LOG_NORMAL("Set of new samples (from " << InputFile::format2string(params.inputFormat) << ") versus entire database comparison" << endl);

	const std::string& dbFilename = params.files[0];
	const std::string& multipleSamples = params.files[1];
	const std::string & similarityFile = params.files[2];

	std::ifstream dbFile(dbFilename, std::ios::binary);
	PrefixKmerDb db(params.numThreads);
	SimilarityCalculator calculator(params.numThreads, params.cacheBufferMb);

	// loadingTime and processingTime are not used further in this function
	std::chrono::duration loadingTime{ 0 }, processingTime{ 0 }, dt{ 0 };

	LOG_NORMAL("Loading k-mer database " << dbFilename << "..." << endl);
	auto start = std::chrono::high_resolution_clock::now();
	if (!dbFile || !db.deserialize(dbFile)) {
		throw runtime_error("Cannot open k-mer database " + dbFilename);
	}
	dt = std::chrono::high_resolution_clock::now() - start;
	LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl << db.printStats() << endl);

	LOG_DEBUG("Creating Loader object..." << endl);
	// filter and alphabet are taken from the database, not from params
	std::shared_ptr filter(FilterFactory::create(db.getFraction(), db.getStartFraction(), db.getKmerLength()));
	std::shared_ptr alphabet(AlphabetFactory::instance().create(db.getAlphabetType()));

	LoaderEx loader(filter, alphabet, params.inputFormat, params.numReaderThreads, params.numThreads, params.multisampleFasta);
	loader.configure(multipleSamples);
	LOG_NORMAL(endl);

	// not used further in this function
	std::vector sims;

	LOG_NORMAL("Processing queries..." << endl);
	auto totalStart = std::chrono::high_resolution_clock::now();

	// create set of buffers for storing similarities
	std::vector> buffers(loader.getOutputBuffersCount());
	std::vector>> sparseBuffers(loader.getOutputBuffersCount());

	// every buffer index starts out as free
	RegisteringQueue freeBuffersQueue(1);
	for (size_t i = 0; i < buffers.size(); ++i) {
		freeBuffersQueue.Push(i);
	}

	SynchronizedPriorityQueue> similarityQueue(params.numThreads);
	std::vector workers(params.numThreads);
	// shared counter from which each worker draws the next task id
	std::atomic sample_id{ 0 };

	for (int tid = 0; tid < params.numThreads; ++tid) {
		workers[tid] = thread([¶ms, &db, &loader, &freeBuffersQueue, &similarityQueue, &buffers, &sparseBuffers, &calculator, &sample_id, tid]() {
			int task_id = sample_id.fetch_add(1);
			while (!loader.isCompleted()) {
				std::shared_ptr task;
				// a free output buffer is claimed only after the task itself was obtained
				if ((task = loader.popTask(task_id)) && freeBuffersQueue.Pop(task->bufferId2)) {
					LOG_DEBUG("loader queue " << task_id + 1 << " -> (" << task->id + 1 << ", " << task->sampleName << ")" << endl);

					// only unique k-mers are needed
					KmerHelper::unique(task->kmers, task->kmersCount);

					// run sparse variant when needed
					if (params.sparseOut) {
						sparseBuffers[task->bufferId2].clear();
						calculator.one2all_sp(db, task->kmers, task->kmersCount, sparseBuffers[task->bufferId2]);
					}
					else {
						buffers[task->bufferId2].clear();
						calculator.one2all(db, task->kmers, task->kmersCount, buffers[task->bufferId2]);
					}
					similarityQueue.Push(task_id, task);

					LOG_DEBUG("(" << task->id + 1 << ", " << task->sampleName << ") -> similarity queue, tid:" << tid << ", buf:" << task->bufferId2 << endl);
					// a new task id is drawn only after the current one was processed
					task_id = sample_id.fetch_add(1);

				}
			}

			similarityQueue.MarkCompleted();
			LOG_DEBUG("similarity thread completed: " << tid << endl);
		});
	}


	// Opening file
	// NOTE(review): the stream state is not checked after opening.
	std::ofstream ofs(similarityFile);
	ofs << "kmer-length: " << db.getKmerLength() << " fraction: " << db.getFraction() << " ,db-samples ,";
	std::copy(db.getSampleNames().cbegin(), db.getSampleNames().cend(), ostream_iterator(ofs, ","));
	ofs << endl;

	// allocate row buffer (10000 for sample name + 100 for each row)
	// NOTE(review): nothing below enforces these limits - a longer sample
	// name would overrun the buffer.
	char* row = new char[10000 + db.getSamplesCount() * 100];
	char* ptr = row;

	// second header line: k-mer count of every database sample
	ptr += sprintf(ptr, "query-samples,total-kmers,");
	ptr += num2str(db.getSampleKmersCount().data(), db.getSampleKmersCount().size(), ',', ptr);
	*ptr++ = '\n';
	ofs.write(row, ptr - row);

	// Gather results in one thread
	// NOTE(review): task_id advances on every iteration, also when Pop()
	// returns false - presumably Pop() blocks until row task_id is
	// available; confirm against SynchronizedPriorityQueue.
	for (int task_id = 0; !similarityQueue.IsCompleted(); ++task_id) {

		std::shared_ptr task;
		if (similarityQueue.Pop(task_id, task)) {

			// progress indicator every 10th row
			if ((task_id + 1) % 10 == 0) {
				LOG_NORMAL("\r" << task_id + 1 << "... ");
			}

			LOG_DEBUG("similarity queue -> (" << task_id + 1 << ", " << task->sampleName << "), buf:" << task->bufferId2 << endl);
			auto& buf = buffers[task->bufferId2];
			auto& sparseBuf = sparseBuffers[task->bufferId2];

			ptr = row;
			// NOTE(review): %lu presumes task->kmersCount is unsigned long;
			// if it is size_t this is wrong on 64-bit Windows - confirm.
			ptr += sprintf(ptr, "%s,%lu,", task->sampleName.c_str(), task->kmersCount);

			if (params.sparseOut) {

				std::vector queryKmersCounts(1, task->kmersCount);
				CombinedFilter filter(
					params.metricFilters,
					params.kmerFilter,
					queryKmersCounts,
					db.getSampleKmersCount(),
					db.getKmerLength());

				// filter row
				for (int j = 0; j < sparseBuf.size(); ++j) {
					// x is a copy, so the += below does not modify the buffer
					auto x = sparseBuf[j];
					if (filter(x.second, 0, x.first)) {
						x.first += 1; // 1-based indexing
						ptr += num2str(x, ptr);
						*ptr++ = ',';
					}
				}

			}
			else {
				ptr += num2str(buf.data(), buf.size(), ',', ptr);
			}

			// the row is serialized - give the buffer and the task back
			freeBuffersQueue.Push(task->bufferId2);
			loader.releaseTask(*task);

			*ptr++ = '\n';
			ofs.write(row, ptr - row);
		}
	}

	delete[] row;

	// make sure all threads have finished
	for (auto& w : workers) {
		w.join();
	}

	auto totalTime = std::chrono::duration(std::chrono::high_resolution_clock::now() - totalStart);

	LOG_NORMAL(endl << endl << "EXECUTION TIMES" << endl
		<< "Total: " << totalTime.count() << endl);
}
// Compares one new sample against an existing k-mer database and writes
// the resulting similarity vector as CSV.
//
// params.files must hold exactly three entries: database file, sample
// file, output file; otherwise usage_error is thrown.
// Throws runtime_error when the database or the sample cannot be read, or
// when the sample's k-mer length differs from the database's.
//
// NOTE(review): in this copy of the source the template arguments of
// std::chrono::duration, std::vector, std::shared_ptr and ostream_iterator
// are missing (they look stripped by text extraction) - restore them from
// the repository before compiling.
void One2AllConsole::run(const Params& params) {

	if (params.files.size() != 3) {
		throw usage_error(params.mode);
	}

	LOG_NORMAL("One new sample (from " << InputFile::format2string(params.inputFormat) << ") versus entire database comparison" << endl);

	const std::string& dbFilename = params.files[0];
	const std::string& sampleFasta = params.files[1];
	const std::string& similarityFile = params.files[2];

	//uint32_t below = (uint32_t)lrint(params.below);
	//uint32_t above = (uint32_t)std::max(0l, lrint(params.above));

	std::ifstream dbFile(dbFilename, std::ios::binary);
	PrefixKmerDb db(params.numThreads);
	SimilarityCalculator calculator(params.numThreads, params.cacheBufferMb);

	std::chrono::duration dt{ 0 };

	LOG_NORMAL("Loading k-mer database " << dbFilename << ":" << endl);
	auto start = std::chrono::high_resolution_clock::now();
	if (!dbFile || !db.deserialize(dbFile)) {
		throw runtime_error("Cannot open k-mer database " + dbFilename);
	}
	dt = std::chrono::high_resolution_clock::now() - start;
	LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl << db.printStats() << endl);

	LOG_NORMAL("Loading sample kmers...");

	start = std::chrono::high_resolution_clock::now();

	std::vector kmersBuffer;
	std::vector positions;
	uint32_t kmerLength;

	// filter and alphabet are taken from the database, not from params
	std::shared_ptr filter(FilterFactory::create(db.getFraction(), db.getStartFraction(), db.getKmerLength()));
	std::shared_ptr alphabet(AlphabetFactory::instance().create(db.getAlphabetType()));

	std::shared_ptr file(InputFileFactory::create(params.inputFormat, filter, alphabet));

	// dummy receives the filter value reported by load(); it is not used here
	double dummy;
	size_t queryKmersCount;
	kmer_t* queryKmers;

	if (!file->open(sampleFasta) || !file->load(kmersBuffer, positions, queryKmers, queryKmersCount, kmerLength, dummy)) {
		throw runtime_error("Cannot open sample file: " + sampleFasta);
	}

	// postprocess k-mers if necessary (only genome input is sorted and deduplicated here)
	if (params.inputFormat == InputFile::Format::GENOME) {
		KmerHelper::sortAndUnique(queryKmers, queryKmersCount, params.numThreads);
	}

	if (kmerLength != db.getKmerLength()) {
		throw runtime_error("Sample and database k-mer length differ");
	}

	dt = std::chrono::high_resolution_clock::now() - start;
	LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl
		<< "Number of k-mers: " << queryKmersCount << endl
		<< "Minhash fraction: " << db.getFraction() << endl);

	LOG_NORMAL("Calculating similarity vector...");
	start = std::chrono::high_resolution_clock::now();
	std::vector sims;
	calculator.one2all(db, queryKmers, queryKmersCount, sims);
	dt = std::chrono::high_resolution_clock::now() - start;
	LOG_NORMAL("OK (" << dt.count() << " seconds)" << endl);

	LOG_NORMAL("Storing similarity vector in " << similarityFile << "...");
	// NOTE(review): the stream state is not checked after opening.
	std::ofstream ofs(similarityFile);

	// line 1: parameters and database sample names
	ofs << "kmer-length: " << db.getKmerLength() << " fraction: " << db.getFraction() << " ,db-samples ,";
	std::copy(db.getSampleNames().cbegin(), db.getSampleNames().cend(), ostream_iterator(ofs, ","));

	// line 2: k-mer count of every database sample
	ofs << endl << "query-samples,total-kmers,";
	std::copy(db.getSampleKmersCount().cbegin(), db.getSampleKmersCount().cend(), ostream_iterator(ofs, ","));
	// line 3: the query sample with its similarity vector
	ofs << endl << sampleFasta << "," << queryKmersCount << ",";
	std::copy(sims.begin(), sims.end(), ostream_iterator(ofs, ","));

	ofs.close();
	LOG_NORMAL("OK" << endl);
}
// ************************************************************************************
// Fast number-to-text helpers built on a precomputed table of all 5-digit
// decimal strings ("00000".."99999").
//
// The static tables are only declared here; they are defined and filled
// (through _init) in another translation unit.
class NumericConversions
{
public:
	static char digits[100000 * 5];		// digits[i*5 .. i*5+4] = i as 5 zero-padded decimal characters
	static uint64_t powers10[15];		// powers10[i] = 10^i
	struct _si {
		// Fills both lookup tables.
		_si()
		{
			for (int i = 0; i < 100000; ++i)
			{
				int dig = i;

				digits[i * 5 + 4] = '0' + (dig % 10);
				dig /= 10;
				digits[i * 5 + 3] = '0' + (dig % 10);
				dig /= 10;
				digits[i * 5 + 2] = '0' + (dig % 10);
				dig /= 10;
				digits[i * 5 + 1] = '0' + (dig % 10);
				dig /= 10;
				digits[i * 5 + 0] = '0' + dig;
			}

			powers10[0] = 1;
			for (int i = 1; i < 15; ++i)
				powers10[i] = 10 * powers10[i - 1];
		}
	};
	static _si _init;

	// Returns the number of decimal digits of v (1 for v == 0).
	// Values below 10^8 use a branch tree; larger values (previously
	// reported as 8 digits) are counted with a short loop.
	static int NDigits(uint64_t v)
	{
		if (v < 100000000ull)
		{
			return (v < 10000)
				? (v < 100 ? (v < 10 ? 1 : 2) : (v < 1000 ? 3 : 4))
				: (v < 1000000 ? (v < 100000 ? 5 : 6) : (v < 10000000 ? 7 : 8));
		}

		int n = 8;
		for (v /= 100000000ull; v != 0; v /= 10)
			++n;
		return n;
	}

	// Copies len characters (len <= 5) from src to dest; no terminator is
	// written. len == 0 copies nothing.
	static void short_str_cpy_upto_5(char* dest, const char* src, size_t len)
	{
		assert(len <= 5);

		for (size_t i = 0; i < len; ++i)
			dest[i] = src[i];
	}

	// Copies exactly 5 characters from src to dest; no terminator is written.
	static void short_str_cpy_5(char* dest, const char* src)
	{
		dest[0] = src[0];
		dest[1] = src[1];
		dest[2] = src[2];
		dest[3] = src[3];
		dest[4] = src[4];
	}

	// Writes val in decimal to str (no terminator) and returns the number
	// of characters written. The value is split into groups of 5 digits,
	// each looked up in the digits table; only the leading group is
	// stripped of its zero padding.
	static int Int2PChar(uint64_t val, char *str)
	{
		if (val >= 1000000000000000ull)
		{
			// 16..20 digits: leading group + three full groups
			uint64_t dig1 = val / 1000000000000000ull;
			val -= dig1 * 1000000000000000ull;
			uint64_t dig2 = val / 10000000000ull;
			val -= dig2 * 10000000000ull;
			uint64_t dig3 = val / 100000ull;
			uint64_t dig4 = val - dig3 * 100000ull;

			int ndig = NDigits(dig1);

			short_str_cpy_upto_5(str, digits + dig1 * 5 + (5 - ndig), ndig);
			short_str_cpy_5(str + ndig, digits + dig2 * 5);
			short_str_cpy_5(str + ndig + 5, digits + dig3 * 5);
			short_str_cpy_5(str + ndig + 10, digits + dig4 * 5);

			return ndig + 15;
		}
		else if (val >= 10000000000ull)
		{
			// 11..15 digits: leading group + two full groups
			uint64_t dig1 = val / 10000000000ull;
			val -= dig1 * 10000000000ull;
			uint64_t dig2 = val / 100000ull;
			uint64_t dig3 = val - dig2 * 100000ull;

			int ndig = NDigits(dig1);

			short_str_cpy_upto_5(str, digits + dig1 * 5 + (5 - ndig), ndig);
			short_str_cpy_5(str + ndig, digits + dig2 * 5);
			short_str_cpy_5(str + ndig + 5, digits + dig3 * 5);

			return ndig + 10;
		}
		else if (val >= 100000ull)
		{
			// 6..10 digits: leading group + one full group
			uint64_t dig1 = val / 100000ull;
			uint64_t dig2 = val - dig1 * 100000ull;

			int ndig = NDigits(dig1);

			short_str_cpy_upto_5(str, digits + dig1 * 5 + (5 - ndig), ndig);
			short_str_cpy_5(str + ndig, digits + dig2 * 5);

			return ndig + 5;
		}
		else
		{
			// 1..5 digits
			int ndig = NDigits(val);

			short_str_cpy_upto_5(str, digits + val * 5 + (5 - ndig), ndig);

			return ndig;
		}
	}

	// Writes val with exactly prec fractional digits (rounded half up on
	// the magnitude) to str, without terminator, and returns the number of
	// characters written. prec must be below 15 (size of powers10).
	// NOTE(review): val * 10^prec must fit into uint64_t; NaN/infinity and
	// very large magnitudes are not handled.
	static int Double2PChar(double val, uint32_t prec, char *str)
	{
		assert(prec < 15);

		const int nfrac = (int)prec;
		int neg = 0;

		if (val < 0)
		{
			*str++ = '-';
			val = -val;
			neg = 1;
		}

		// fixed-point representation: all digits without the decimal point
		uint64_t x = (uint64_t) (val * powers10[prec] + 0.5);

		if (x < powers10[prec])	// |val| < 1.0
		{
			*str++ = '0';
			*str++ = '.';

			// digits are written left-aligned, then moved to the right end
			// of the fractional field and padded with leading zeros
			int to_move = Int2PChar(x, str);
			int shift = nfrac - to_move;

			if (shift)
			{
				for (int i = 0; i < to_move; ++i)
					str[nfrac - i - 1] = str[nfrac - i - 1 - shift];
				for (int i = 0; i < shift; ++i)
					str[i] = '0';
			}

			return nfrac + 2 + neg;
		}
		else
		{
			// digits are written one position to the right, then the
			// integer part is moved left to make room for the point
			int r = Int2PChar(x, str + 1);

			int to_move = r - nfrac;

			for (int i = 0; i < to_move; ++i)
				str[i] = str[i+1];
			str[to_move] = '.';

			return r + 1 + neg;
		}
	}

	// Minimal decimal parser: optional leading '-', then digits. No
	// whitespace skipping, no '+', no overflow detection. When endptr is
	// not null it receives the position of the first unparsed character.
	static long int strtol(const char* str, char** endptr) {
		long int val = 0;
		const char* p = str;
		bool is_negative = false;

		if (*p == '-')
		{
			is_negative = true;
			++p;
		}

		while (*p >= '0' && *p <= '9')
		{
			val = val * 10 + (*p++ - '0');
		}

		if (endptr)
			*endptr = const_cast<char*>(p);

		return is_negative ? -val : val;
	}
};
| for (int i = 0; i < shift; ++i) 202 | str[i] = '0'; 203 | } 204 | 205 | return prec + 2 + neg; 206 | } 207 | else 208 | { 209 | int r = Int2PChar(x, str + 1); 210 | 211 | int to_move = r - prec; 212 | 213 | for (int i = 0; i < to_move; ++i) 214 | str[i] = str[i+1]; 215 | str[to_move] = '.'; 216 | 217 | return r + 1 + neg; 218 | } 219 | } 220 | 221 | static long int strtol(const char* str, char** endptr) { 222 | long int val = 0; 223 | char* p = (char*)str; 224 | bool is_negative = false; 225 | 226 | 227 | if (*p == '-') 228 | { 229 | is_negative = true; 230 | ++p; 231 | } 232 | 233 | while (*p >= '0' && *p <= '9') 234 | { 235 | val = val * 10 + (*p++ - '0'); 236 | } 237 | 238 | if (endptr) 239 | *endptr = p; 240 | 241 | return is_negative ? -val : val; 242 | } 243 | }; 244 | 245 | 246 | // integral specialization 247 | template ::value, int>::type* = nullptr> 248 | int num2str(Integer val, char *out) { 249 | return NumericConversions::Int2PChar((uint64_t)val, out); 250 | } 251 | 252 | // floating point specialization 253 | template ::value, int>::type* = nullptr> 254 | int num2str(Floating val, char *out) { 255 | if (val == 0) { 256 | *out = '0'; 257 | return 1; 258 | } 259 | return NumericConversions::Double2PChar((double)val, 6, out); 260 | } 261 | 262 | // pair specialization 263 | template 264 | int num2str(const std::pair& val, char *out) { 265 | char* ptr = out; 266 | ptr += num2str(val.first, ptr); 267 | *ptr++ = ':'; 268 | ptr += num2str(val.second, ptr); 269 | 270 | return ptr - out; 271 | } 272 | 273 | // collection specialization 274 | template 275 | int num2str(const T* collection, size_t size, char delim, char* out) { 276 | char* ptr = out; 277 | for (size_t i = 0; i < size; ++i) { 278 | ptr += num2str(*collection++, ptr); 279 | *ptr++ = delim; 280 | } 281 | 282 | return ptr - out; 283 | } 284 | 285 | template 286 | int num2str_sparse(const T* collection, size_t size, char delim, char* out, const T sparse_val = 0) { 287 | char* ptr = out; 288 | for 
(size_t i = 0; i < size; ++i, collection++) { 289 | if (*collection != sparse_val) { 290 | ptr += num2str(i + 1, ptr); 291 | *ptr++ = ':'; 292 | ptr += num2str(*collection, ptr); 293 | *ptr++ = delim; 294 | } 295 | } 296 | 297 | return ptr - out; 298 | } -------------------------------------------------------------------------------- /src/filter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | 10 | #include "types.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | 21 | // ***************************************************************************************** 22 | // 23 | class AbstractFilter { 24 | public: 25 | virtual std::unique_ptr clone() const = 0; 26 | virtual void configure (int kmerLength) = 0; 27 | virtual ~AbstractFilter() {} 28 | }; 29 | 30 | 31 | // ***************************************************************************************** 32 | // 33 | class MinHashFilter : public AbstractFilter { 34 | public: 35 | 36 | int getLength() const { return kmer_length; } 37 | double getFraction() const { return fraction; } 38 | double getStartValue() const { return startValue; } 39 | 40 | MinHashFilter(double fraction, double startValue, int kmerLength) : kmer_length(kmerLength), fraction(fraction), startValue(startValue) { 41 | 42 | minThreshold = (uint64_t)((double)std::numeric_limits::max() * startValue); 43 | maxThreshold = (uint64_t)((double)std::numeric_limits::max() * (startValue + fraction)); 44 | 45 | configure(kmerLength); 46 | } 47 | 48 | bool operator()(kmer_t kmer) const { 49 | uint64_t h = hash(kmer); 50 | return (h >= minThreshold && h < maxThreshold); 
51 | } 52 | 53 | void configure(int kmer_length) override { 54 | this->k_div_4 = (uint64_t)ceil((double)kmer_length / 4); 55 | this->c42_xor_k_div_4 = 42 ^ k_div_4; 56 | } 57 | 58 | std::unique_ptr clone() const override { 59 | return unique_ptr(new MinHashFilter(*this)); 60 | } 61 | 62 | 63 | protected: 64 | 65 | uint64_t kmer_length; 66 | double fraction; 67 | double startValue; 68 | 69 | uint64_t maxThreshold; 70 | uint64_t minThreshold; 71 | uint64_t k_div_4; 72 | uint64_t c42_xor_k_div_4; 73 | 74 | 75 | FORCE_INLINE uint64_t fmix64(uint64_t k) const 76 | { 77 | k ^= k >> 33; 78 | k *= 0xff51afd7ed558ccdull; 79 | k ^= k >> 33; 80 | k *= 0xc4ceb9fe1a85ec53ull; 81 | k ^= k >> 33; 82 | 83 | return k; 84 | } 85 | 86 | 87 | uint64_t rotl64(uint64_t x, int32_t offset) const 88 | { 89 | #ifdef WIN32 90 | return _rotl64(x, offset); 91 | #else 92 | return (x << offset) | (x >> (64 - offset)); 93 | #endif 94 | } 95 | 96 | uint64_t hash(kmer_t kmer) const { 97 | uint64_t h, h1, h2; 98 | 99 | // calculate hash 100 | h = kmer; 101 | h *= 0x87c37b91114253d5ull; 102 | h = rotl64(h, 31); 103 | h *= 0x4cf5ad432745937full; 104 | h1 = 42 ^ h; 105 | h1 ^= k_div_4; //ceil(k / 4); 106 | h2 = c42_xor_k_div_4; // 42 ^ ceil(k / 4); 107 | h1 += h2; 108 | h2 += h1; 109 | h1 = fmix64(h1); 110 | h2 = fmix64(h2); 111 | h1 += h2; 112 | h2 += h1; 113 | 114 | return h1 ^ h2; // xor as final hash 115 | } 116 | }; 117 | 118 | // ***************************************************************************************** 119 | // 120 | class NullFilter : public MinHashFilter { 121 | public: 122 | 123 | NullFilter(int kmerLength) 124 | : MinHashFilter(1.0, 0.0, kmerLength) {} 125 | 126 | bool operator()(kmer_t kmer) const { return true; } 127 | 128 | std::unique_ptr clone() const override { 129 | return std::unique_ptr(new NullFilter(*this)); 130 | } 131 | }; 132 | 133 | 134 | // ***************************************************************************************** 135 | // 136 | class 
FilterFactory { 137 | public: 138 | static MinHashFilter* create(double fraction, double startValue, int kmerLength) { 139 | if (fraction < 1.0) { 140 | return new MinHashFilter(fraction, startValue, kmerLength); 141 | } 142 | else { 143 | return new NullFilter(kmerLength); 144 | } 145 | } 146 | 147 | }; -------------------------------------------------------------------------------- /src/input_file.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | #include "input_file.h" 9 | #include "kmc_api/kmc_file.h" 10 | #include "kmer_db.h" 11 | #include "filter.h" 12 | #include "kmer_extract.h" 13 | #include "parallel_sorter.h" 14 | 15 | #include "../libs/refresh/compression/lib/file_wrapper.h" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #ifdef USE_RADULS 25 | #include 26 | #endif 27 | 28 | using namespace std; 29 | 30 | -------------------------------------------------------------------------------- /src/input_file.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 
// NOTE(review): in this copy of the source the template arguments
// (std::vector, atomic, std::shared_ptr) and the template parameter list
// of FilteredInputFile are missing (they look stripped by text
// extraction) - restore them from the repository before compiling.

// *****************************************************************************************
// Interface of a single-sample input file.
class InputFile {
public:
	enum Format { KMC, MINHASH, GENOME };

	// Human-readable name of an input format; empty string for a value
	// outside the enumeration.
	static std::string format2string(enum Format f) {
		switch (f) {
		case GENOME: return "fasta genomes";
		case KMC: return "k-mers";
		case MINHASH: return "minhashed k-mers";
		}

		return "";
	}

	virtual bool open(const std::string& filename) = 0;

	// Loads the sample; results are returned through the reference
	// parameters. The bool result is treated as success/failure by the
	// caller visible in console_one2all.cpp.
	virtual bool load(
		std::vector& kmersBuffer,
		std::vector& positionsBuffer,
		kmer_t*& kmers,
		size_t& kmersCount,
		uint32_t& kmerLength,
		double& filterValue) = 0;

	virtual ~InputFile() {}
};


//******************************************************************************************
// Interface of an input file that holds several samples.
class IMultiSampleFile {

public:

	virtual bool initMultiFasta() = 0;

	// Like InputFile::load() with two additional parameters: the sample
	// name and a shared k-mer counter.
	virtual bool loadNext(
		std::vector& kmersBuffer,
		std::vector& positionsBuffer,
		kmer_t*& kmers,
		size_t& kmersCount,
		uint32_t& kmerLength,
		double& filterValue,
		std::string& sampleName,
		atomic& total_kmers_in_kmers_collections) = 0;

	virtual ~IMultiSampleFile() {}
};


// *****************************************************************************************
// Base of input files that hold a shared filter object.
template
class FilteredInputFile : public InputFile {
public:
	FilteredInputFile(std::shared_ptr& filter) : filter(filter) {}

protected:
	std::shared_ptr filter;
};
// Creates the InputFile implementation for a given input format.
//
// NOTE(review): in this copy of the source the template arguments of
// std::shared_ptr, std::dynamic_pointer_cast, GenomeInputFile and
// KmcInputFile are missing (they look stripped by text extraction) -
// restore them from the repository before compiling.
class InputFileFactory {
public:

	// Returns a new, caller-owned input file object.
	// - MINHASH: a MihashedInputFile; filter and alphabet are not used.
	// - GENOME / KMC: the matching file class, constructed with the filter
	//   cast to NullFilter when that cast succeeds, to MinHashFilter
	//   otherwise.
	// Throws std::runtime_error when the filter is not a MinHashFilter or
	// when the format is not one of the above.
	static InputFile* create(
		InputFile::Format format,
		std::shared_ptr filter,
		std::shared_ptr alphabet) {

		if (format == InputFile::MINHASH) {
			return new MihashedInputFile();
		}
		else {
			std::shared_ptr minhashFilter = std::dynamic_pointer_cast(filter);
			std::shared_ptr nullFilter = std::dynamic_pointer_cast(filter);

			if (!minhashFilter) {
				throw std::runtime_error("Only MinHashFilter is currently supported");
			}
			else if (nullFilter) {

				// the more specific filter type is tested first
				if (format == InputFile::GENOME) {
					return new GenomeInputFile(nullFilter, alphabet);
				}
				else if (format == InputFile::KMC) {
					return new KmcInputFile(nullFilter);
				}
				else {
					throw std::runtime_error("Unsupported input type");
				}
			}
			else {
				if (format == InputFile::GENOME) {
					return new GenomeInputFile(minhashFilter, alphabet);
				}
				else if (format == InputFile::KMC) {
					return new KmcInputFile(minhashFilter);
				}
				else {
					throw std::runtime_error("Unsupported input type");
				}
			}
		}
	}
};
// Run-time detection of the best SIMD instruction set.
namespace InstrSetDetect
{
	// Ordered from least to most capable on x86; NEON is reported when the
	// code is built without ARCH_X64.
	enum class Instr { NotSet, SSE, SSE2, SSE3, SSE4_1, SSE4_2, AVX, AVX2, NEON };

#ifdef ARCH_X64
	// Executes CPUID for the given leaf (eax) and sub-leaf (ecx) and stores
	// the resulting EAX, EBX, ECX, EDX in abcd[0..3].
	static void cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
	{
#if _MSC_VER
		__cpuidex((int*)abcd, eax, ecx);
#else
		uint32_t ebx = 0, edx = 0;
		__asm__("cpuid" : "+b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx));
		abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
	}

	// Reads extended control register 0 (XCR0). Must only be called when
	// CPUID reports OSXSAVE, otherwise the instruction faults.
	static uint64_t xgetbv0()
	{
#if _MSC_VER
		return _xgetbv(0);
#else
		uint32_t lo = 0, hi = 0;
		// XGETBV encoded as bytes so that old assemblers accept it
		__asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a" (lo), "=d" (hi) : "c" (0));
		return ((uint64_t)hi << 32) | lo;
#endif
	}

	// Returns the highest instruction set of the chain
	// SSE < SSE2 < SSE3 < SSE4.1 < SSE4.2 < AVX < AVX2 for which this and
	// all lower levels are available. AVX and AVX2 are reported only when
	// the operating system saves the YMM state as well.
	static Instr GetInstr()
	{
		Instr instr{ Instr::NotSet };
		uint32_t abcd[4]{};

		// leaf 0: highest supported basic leaf
		cpuid(0, 0, abcd);
		const uint32_t maxLeaf = abcd[0];
		if (maxLeaf < 1) return instr;

		cpuid(1, 0, abcd);

		uint32_t edx = abcd[3];

		uint32_t ecx = abcd[2];

		if (((edx >> 25) & 1) == 0) return instr;
		instr = Instr::SSE;

		if (((edx >> 26) & 1) == 0) return instr;
		instr = Instr::SSE2;

		if (((ecx >> 0) & 1) == 0) return instr;
		instr = Instr::SSE3;

		if (((ecx >> 19) & 1) == 0) return instr;
		instr = Instr::SSE4_1;

		if (((ecx >> 20) & 1) == 0) return instr;
		instr = Instr::SSE4_2;

		// AVX needs the CPU flag (bit 28), OSXSAVE (bit 27) and an OS that
		// enabled the XMM (bit 1) and YMM (bit 2) state in XCR0
		if (((ecx >> 28) & 1) == 0) return instr;
		if (((ecx >> 27) & 1) == 0) return instr;
		if ((xgetbv0() & 0x6) != 0x6) return instr;
		instr = Instr::AVX;

		// leaf 7 is meaningful only when the CPU reports it
		if (maxLeaf < 7) return instr;
		cpuid(7, 0, abcd);

		uint32_t ebx = abcd[1];
		if (((ebx >> 5) & 1) == 0) return instr;
		instr = Instr::AVX2;

		return instr;
	}
#else
	// Builds without ARCH_X64 always report NEON.
	static Instr GetInstr()
	{
		return Instr::NEON;
	}
#endif
}
#ifndef _KMC_FILE_H
#define _KMC_FILE_H

// NOTE(review): in this copy of the source two #include targets and the
// template arguments of std::vector (counters, super_kmers_t) are missing
// (they look stripped by text extraction); as a result both public
// GetCountersForRead overloads appear identical. Restore them from the
// repository before compiling.
#include "kmer_defs.h"
#include "kmer_api.h"
#include
#include

// Parameters of a KMC database, as filled in by CKMCFile::Info().
struct CKMCFileInfo
{
	uint32 kmer_length;
	uint32 mode;
	uint32 counter_size;
	uint32 lut_prefix_length;
	uint32 signature_len;
	uint32 min_count;
	uint64 max_count;
	bool both_strands;
	uint64 total_kmers;
};

// Reader of a KMC k-mer database stored as a *.kmc_pre / *.kmc_suf file
// pair, either for random access or for sequential listing.
class CKMCFile
{
	enum open_mode {closed, opened_for_RA, opened_for_listing};
	open_mode is_opened;

	bool end_of_file;

	FILE *file_pre;
	FILE *file_suf;

	uint64* prefix_file_buf;
	uint64 prefix_file_buf_size;
	uint64 prefix_index;			// The current prefix's index in an array "prefix_file_buf", read from *.kmc_pre
	uint32 single_LUT_size;			// The size of a single LUT (in no. of elements)

	uint32* signature_map;
	uint32 signature_map_size;

	uchar* sufix_file_buf;
	uint64 sufix_number;			// The suffix's number to be listed
	uint64 index_in_partial_buf;	// The current byte's number in an array "sufix_file_buf", for listing mode

	uint32 kmer_length;
	uint32 mode;
	uint32 counter_size;
	uint32 lut_prefix_length;
	uint32 signature_len;
	uint32 min_count;
	uint64 max_count;
	uint64 total_kmers;
	bool both_strands;

	uint32 kmc_version;
	uint32 sufix_size;		// suffix's size in bytes
	uint32 sufix_rec_size;	// sufix_size + counter_size

	uint32 original_min_count;
	uint64 original_max_count;

	static uint64 part_size; // the size of a block read to sufix_file_buf, in listing mode

	bool BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint64& counter, uint32 pattern_offset);

	// Open a file, recognize its size and check its marker. Auxiliary function.
	bool OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]);

	// Recognize current parameters. Auxiliary function.
	bool ReadParamsFrom_prefix_file_buf(uint64 &size);

	// Reload the contents of the array "sufix_file_buf" for listing mode. Auxiliary function.
	void Reload_sufix_file_buf();

	// Implementation of GetCountersForRead for kmc1 database format for both strands
	bool GetCountersForRead_kmc1_both_strands(const std::string& read, std::vector& counters);

	// Implementation of GetCountersForRead for kmc1 database format without choosing canonical k-mer
	bool GetCountersForRead_kmc1(const std::string& read, std::vector& counters);

	using super_kmers_t = std::vector>;//start_pos, len, bin_no
	void GetSuperKmers(const std::string& transformed_read, super_kmers_t& super_kmers);

	// Implementation of GetCountersForRead for kmc2 database format for both strands
	bool GetCountersForRead_kmc2_both_strands(const std::string& read, std::vector& counters);

	// Implementation of GetCountersForRead for kmc2 database format
	bool GetCountersForRead_kmc2(const std::string& read, std::vector& counters);
public:

	CKMCFile();
	~CKMCFile();

	// Open files *.kmc_pre & *.kmc_suf, read them to RAM, close files. *.kmc_suf is opened for random access
	bool OpenForRA(const std::string &file_name);

	// Open files *kmc_pre & *.kmc_suf, read *.kmc_pre to RAM, *.kmc_suf is buffered
	bool OpenForListing(const std::string& file_name);

	// Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF
	bool ReadNextKmer(CKmerAPI &kmer, float &count);

	bool ReadNextKmer(CKmerAPI &kmer, uint64 &count); //for small k-values when counter may be longer than 4bytes

	bool ReadNextKmer(CKmerAPI &kmer, uint32 &count);
	// Release memory and close files in case they were opened
	bool Close();

	// Set the minimal value for a counter. Kmers with counters below this threshold are ignored
	bool SetMinCount(uint32 x);

	// Return a value of min_count. Kmers with counters below this threshold are ignored
	uint32 GetMinCount(void);

	// Set the maximal value for a counter. Kmers with counters above this threshold are ignored
	bool SetMaxCount(uint32 x);

	// Return a value of max_count. Kmers with counters above this threshold are ignored
	uint64 GetMaxCount(void);

	//Return true if kmc was run without -b switch.
	bool GetBothStrands(void);

	// Return the total number of kmers between min_count and max_count
	uint64 KmerCount(void);

	// Return the length of kmers
	uint32 KmerLength(void);

	// Set initial values to enable listing kmers from the beginning. Only in listing mode
	bool RestartListing(void);

	// Return true if all kmers are listed
	bool Eof(void);

	// Return true if kmer exists. In this case return kmer's counter in count
	bool CheckKmer(CKmerAPI &kmer, float &count);

	bool CheckKmer(CKmerAPI &kmer, uint32 &count);

	bool CheckKmer(CKmerAPI &kmer, uint64 &count);

	// Return true if kmer exists
	bool IsKmer(CKmerAPI &kmer);

	// Set original (read from *.kmer_pre) values for min_count and max_count
	void ResetMinMaxCounts(void);

	// Get current parameters from kmer_database
	bool Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint64 &_max_count, uint64 &_total_kmers);

	// Get current parameters from kmer_database
	bool Info(CKMCFileInfo& info);

	// Get counters for all k-mers in read
	bool GetCountersForRead(const std::string& read, std::vector& counters);
	bool GetCountersForRead(const std::string& read, std::vector& counters);
private:
	uint32 count_for_kmer_kmc1(CKmerAPI& kmer);
	uint32 count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos);
};
#endif 173 | 174 | // ***** EOF 175 | -------------------------------------------------------------------------------- /src/kmc_api/kmer_api.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of KMC software distributed under GNU GPL 3 licence. 3 | The homepage of the KMC project is http://sun.aei.polsl.pl/kmc 4 | 5 | Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz 6 | 7 | Version: 3.0.0 8 | Date : 2017-01-28 9 | */ 10 | 11 | 12 | #include "kmer_api.h" 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | 18 | const char CKmerAPI::char_codes[] = {'A','C', 'G', 'T'}; 19 | signed char CKmerAPI::num_codes[]; 20 | CKmerAPI::_si CKmerAPI::_init; 21 | uchar CKmerAPI::rev_comp_bytes_LUT[] = { 22 | 0xff, 0xbf, 0x7f, 0x3f, 0xef, 0xaf, 0x6f, 0x2f, 0xdf, 0x9f, 0x5f, 0x1f, 0xcf, 0x8f, 0x4f, 0x0f, 23 | 0xfb, 0xbb, 0x7b, 0x3b, 0xeb, 0xab, 0x6b, 0x2b, 0xdb, 0x9b, 0x5b, 0x1b, 0xcb, 0x8b, 0x4b, 0x0b, 24 | 0xf7, 0xb7, 0x77, 0x37, 0xe7, 0xa7, 0x67, 0x27, 0xd7, 0x97, 0x57, 0x17, 0xc7, 0x87, 0x47, 0x07, 25 | 0xf3, 0xb3, 0x73, 0x33, 0xe3, 0xa3, 0x63, 0x23, 0xd3, 0x93, 0x53, 0x13, 0xc3, 0x83, 0x43, 0x03, 26 | 0xfe, 0xbe, 0x7e, 0x3e, 0xee, 0xae, 0x6e, 0x2e, 0xde, 0x9e, 0x5e, 0x1e, 0xce, 0x8e, 0x4e, 0x0e, 27 | 0xfa, 0xba, 0x7a, 0x3a, 0xea, 0xaa, 0x6a, 0x2a, 0xda, 0x9a, 0x5a, 0x1a, 0xca, 0x8a, 0x4a, 0x0a, 28 | 0xf6, 0xb6, 0x76, 0x36, 0xe6, 0xa6, 0x66, 0x26, 0xd6, 0x96, 0x56, 0x16, 0xc6, 0x86, 0x46, 0x06, 29 | 0xf2, 0xb2, 0x72, 0x32, 0xe2, 0xa2, 0x62, 0x22, 0xd2, 0x92, 0x52, 0x12, 0xc2, 0x82, 0x42, 0x02, 30 | 0xfd, 0xbd, 0x7d, 0x3d, 0xed, 0xad, 0x6d, 0x2d, 0xdd, 0x9d, 0x5d, 0x1d, 0xcd, 0x8d, 0x4d, 0x0d, 31 | 0xf9, 0xb9, 0x79, 0x39, 0xe9, 0xa9, 0x69, 0x29, 0xd9, 0x99, 0x59, 0x19, 0xc9, 0x89, 0x49, 0x09, 32 | 0xf5, 0xb5, 0x75, 0x35, 0xe5, 0xa5, 0x65, 0x25, 0xd5, 0x95, 0x55, 0x15, 0xc5, 0x85, 0x45, 0x05, 33 | 0xf1, 0xb1, 0x71, 0x31, 0xe1, 0xa1, 0x61, 0x21, 0xd1, 0x91, 0x51, 0x11, 0xc1, 0x81, 0x41, 0x01, 34 | 
0xfc, 0xbc, 0x7c, 0x3c, 0xec, 0xac, 0x6c, 0x2c, 0xdc, 0x9c, 0x5c, 0x1c, 0xcc, 0x8c, 0x4c, 0x0c, 35 | 0xf8, 0xb8, 0x78, 0x38, 0xe8, 0xa8, 0x68, 0x28, 0xd8, 0x98, 0x58, 0x18, 0xc8, 0x88, 0x48, 0x08, 36 | 0xf4, 0xb4, 0x74, 0x34, 0xe4, 0xa4, 0x64, 0x24, 0xd4, 0x94, 0x54, 0x14, 0xc4, 0x84, 0x44, 0x04, 37 | 0xf0, 0xb0, 0x70, 0x30, 0xe0, 0xa0, 0x60, 0x20, 0xd0, 0x90, 0x50, 0x10, 0xc0, 0x80, 0x40, 0x00 38 | }; 39 | uint64 CKmerAPI::alignment_mask[] = { 40 | 0xFFFFFFFFFFFFFFFFULL, 41 | 0x3FFFFFFFFFFFFFFFULL, 42 | 0x0FFFFFFFFFFFFFFFULL, 43 | 0x03FFFFFFFFFFFFFFULL, 44 | 0x00FFFFFFFFFFFFFFULL 45 | }; 46 | 47 | // ***** EOF 48 | -------------------------------------------------------------------------------- /src/kmc_api/kmer_defs.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of KMC software distributed under GNU GPL 3 licence. 3 | The homepage of the KMC project is http://sun.aei.polsl.pl/kmc 4 | 5 | Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz 6 | 7 | Version: 3.0.0 8 | Date : 2017-01-28 9 | */ 10 | 11 | 12 | #ifndef _KMER_DEFS_H 13 | #define _KMER_DEFS_H 14 | 15 | #define KMC_VER "3.0.0" 16 | #define KMC_DATE "2017-01-28" 17 | 18 | #define MIN(x,y) ((x) < (y) ? 
(x) : (y)) 19 | 20 | #ifndef WIN32 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define _TCHAR char 28 | #define _tmain main 29 | 30 | #define my_fopen fopen 31 | #define my_fseek fseek 32 | #define my_ftell ftell 33 | 34 | 35 | #include 36 | #include 37 | #include 38 | using namespace std; 39 | 40 | #else 41 | #define my_fopen fopen 42 | #define my_fseek _fseeki64 43 | #define my_ftell _ftelli64 44 | #endif 45 | //typedef unsigned char uchar; 46 | 47 | typedef int int32; 48 | typedef unsigned int uint32; 49 | typedef long long int64; 50 | typedef unsigned long long uint64; 51 | typedef unsigned char uchar; 52 | #endif 53 | 54 | // ***** EOF 55 | -------------------------------------------------------------------------------- /src/kmc_api/mmer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of KMC software distributed under GNU GPL 3 licence. 3 | The homepage of the KMC project is http://sun.aei.polsl.pl/kmc 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot 6 | 7 | Version: 3.0.0 8 | Date : 2017-01-28 9 | */ 10 | 11 | #include "../kmc_api/mmer.h" 12 | 13 | 14 | uint32 CMmer::norm5[]; 15 | uint32 CMmer::norm6[]; 16 | uint32 CMmer::norm7[]; 17 | uint32 CMmer::norm8[]; 18 | uint32 CMmer::norm9[]; 19 | uint32 CMmer::norm10[]; 20 | uint32 CMmer::norm11[]; 21 | 22 | CMmer::_si CMmer::_init; 23 | 24 | 25 | //-------------------------------------------------------------------------- 26 | CMmer::CMmer(uint32 _len) 27 | { 28 | switch (_len) 29 | { 30 | case 5: 31 | norm = norm5; 32 | break; 33 | case 6: 34 | norm = norm6; 35 | break; 36 | case 7: 37 | norm = norm7; 38 | break; 39 | case 8: 40 | norm = norm8; 41 | break; 42 | case 9: 43 | norm = norm9; 44 | break; 45 | case 10: 46 | norm = norm10; 47 | break; 48 | case 11: 49 | norm = norm11; 50 | break; 51 | default: 52 | break; 53 | } 54 | len = _len; 55 | mask = (1 << _len * 2) - 1; 56 | str = 
0; 57 | } 58 | 59 | //-------------------------------------------------------------------------- 60 | 61 | -------------------------------------------------------------------------------- /src/kmc_api/mmer.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of KMC software distributed under GNU GPL 3 licence. 3 | The homepage of the KMC project is http://sun.aei.polsl.pl/kmc 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot 6 | 7 | Version: 3.0.0 8 | Date : 2017-01-28 9 | */ 10 | 11 | #ifndef _MMER_H 12 | #define _MMER_H 13 | #include "kmer_defs.h" 14 | 15 | // ************************************************************************* 16 | // ************************************************************************* 17 | 18 | 19 | class CMmer 20 | { 21 | uint32 str; 22 | uint32 mask; 23 | uint32 current_val; 24 | uint32* norm; 25 | uint32 len; 26 | static uint32 norm5[1 << 10]; 27 | static uint32 norm6[1 << 12]; 28 | static uint32 norm7[1 << 14]; 29 | static uint32 norm8[1 << 16]; 30 | static uint32 norm9[1 << 18]; 31 | static uint32 norm10[1 << 20]; 32 | static uint32 norm11[1 << 22]; 33 | 34 | static bool is_allowed(uint32 mmer, uint32 len) 35 | { 36 | if ((mmer & 0x3f) == 0x3f) // TTT suffix 37 | return false; 38 | if ((mmer & 0x3f) == 0x3b) // TGT suffix 39 | return false; 40 | if ((mmer & 0x3c) == 0x3c) // TG* suffix 41 | return false; 42 | 43 | for (uint32 j = 0; j < len - 3; ++j) 44 | if ((mmer & 0xf) == 0) // AA inside 45 | return false; 46 | else 47 | mmer >>= 2; 48 | 49 | if (mmer == 0) // AAA prefix 50 | return false; 51 | if (mmer == 0x04) // ACA prefix 52 | return false; 53 | if ((mmer & 0xf) == 0) // *AA prefix 54 | return false; 55 | 56 | return true; 57 | } 58 | 59 | friend class CSignatureMapper; 60 | struct _si 61 | { 62 | static uint32 get_rev(uint32 mmer, uint32 len) 63 | { 64 | uint32 rev = 0; 65 | uint32 shift = len*2 - 2; 66 | for(uint32 i = 0 ; i < len ; 
++i) 67 | { 68 | rev += (3 - (mmer & 3)) << shift; 69 | mmer >>= 2; 70 | shift -= 2; 71 | } 72 | return rev; 73 | } 74 | 75 | 76 | 77 | static void init_norm(uint32* norm, uint32 len) 78 | { 79 | uint32 special = 1 << len * 2; 80 | for(uint32 i = 0 ; i < special ; ++i) 81 | { 82 | uint32 rev = get_rev(i, len); 83 | uint32 str_val = is_allowed(i, len) ? i : special; 84 | uint32 rev_val = is_allowed(rev, len) ? rev : special; 85 | norm[i] = MIN(str_val, rev_val); 86 | } 87 | } 88 | 89 | _si() 90 | { 91 | init_norm(norm5, 5); 92 | init_norm(norm6, 6); 93 | init_norm(norm7, 7); 94 | init_norm(norm8, 8); 95 | init_norm(norm9, 9); 96 | init_norm(norm10, 10); 97 | init_norm(norm11, 11); 98 | } 99 | 100 | }static _init; 101 | public: 102 | CMmer(uint32 _len); 103 | inline void insert(uchar symb); 104 | inline uint32 get() const; 105 | // inline bool operator==(const CMmer& x); // fixme: does not compile in C++20 106 | inline bool operator==(CMmer& x); 107 | inline bool operator<(const CMmer& x); 108 | inline void clear(); 109 | inline bool operator<=(const CMmer& x); 110 | inline void set(const CMmer& x); 111 | inline void insert(const char* seq); 112 | 113 | }; 114 | 115 | 116 | 117 | //-------------------------------------------------------------------------- 118 | inline void CMmer::insert(uchar symb) 119 | { 120 | str <<= 2; 121 | str += symb; 122 | str &= mask; 123 | 124 | current_val = norm[str]; 125 | } 126 | 127 | //-------------------------------------------------------------------------- 128 | inline uint32 CMmer::get() const 129 | { 130 | return current_val; 131 | } 132 | 133 | //-------------------------------------------------------------------------- 134 | //inline bool CMmer::operator==(const CMmer& x) 135 | inline bool CMmer::operator==(CMmer& x) 136 | { 137 | return current_val == x.current_val; 138 | } 139 | 140 | //-------------------------------------------------------------------------- 141 | inline bool CMmer::operator<(const CMmer& x) 142 | { 143 | 
return current_val < x.current_val; 144 | } 145 | 146 | //-------------------------------------------------------------------------- 147 | inline void CMmer::clear() 148 | { 149 | str = 0; 150 | } 151 | 152 | //-------------------------------------------------------------------------- 153 | inline bool CMmer::operator<=(const CMmer& x) 154 | { 155 | return current_val <= x.current_val; 156 | } 157 | 158 | //-------------------------------------------------------------------------- 159 | inline void CMmer::set(const CMmer& x) 160 | { 161 | str = x.str; 162 | current_val = x.current_val; 163 | } 164 | 165 | //-------------------------------------------------------------------------- 166 | inline void CMmer::insert(const char* seq) 167 | { 168 | switch (len) 169 | { 170 | case 5: 171 | str = (seq[0] << 8) + (seq[1] << 6) + (seq[2] << 4) + (seq[3] << 2) + (seq[4]); 172 | break; 173 | case 6: 174 | str = (seq[0] << 10) + (seq[1] << 8) + (seq[2] << 6) + (seq[3] << 4) + (seq[4] << 2) + (seq[5]); 175 | break; 176 | case 7: 177 | str = (seq[0] << 12) + (seq[1] << 10) + (seq[2] << 8) + (seq[3] << 6) + (seq[4] << 4 ) + (seq[5] << 2) + (seq[6]); 178 | break; 179 | case 8: 180 | str = (seq[0] << 14) + (seq[1] << 12) + (seq[2] << 10) + (seq[3] << 8) + (seq[4] << 6) + (seq[5] << 4) + (seq[6] << 2) + (seq[7]); 181 | break; 182 | case 9: 183 | str = (seq[0] << 16) + (seq[1] << 14) + (seq[2] << 12) + (seq[3] << 10) + (seq[4] << 8) + (seq[5] << 6) + (seq[6] << 4) + (seq[7] << 2) + (seq[8]); 184 | break; 185 | case 10: 186 | str = (seq[0] << 18) + (seq[1] << 16) + (seq[2] << 14) + (seq[3] << 12) + (seq[4] << 10) + (seq[5] << 8) + (seq[6] << 6) + (seq[7] << 4) + (seq[8] << 2) + (seq[9]); 187 | break; 188 | case 11: 189 | str = (seq[0] << 20) + (seq[1] << 18) + (seq[2] << 16) + (seq[3] << 14) + (seq[4] << 12) + (seq[5] << 10) + (seq[6] << 8) + (seq[7] << 6) + (seq[8] << 4) + (seq[9] << 2) + (seq[10]); 190 | break; 191 | default: 192 | break; 193 | } 194 | 195 | current_val = norm[str]; 
// *****************************************************************************************
//
// Reads every k-mer from the KMC database opened by open(), passes it through
// this->filter and stores the accepted k-mers in kmersBuffer.
//
// Outputs:
//   kmersBuffer  - resized here; backing storage of the loaded k-mers
//   kmers        - set to the first k-mer inside kmersBuffer
//   kmersCount   - number of k-mers accepted by the filter
//   kmerLength   - k-mer length reported by CKMCFile::Info
//   filterValue  - value returned by minhashFilter->getFraction()
// positionsBuffer is not touched by this function.
// Returns the result of kmcfile->Close().
//
// NOTE(review): template parameter lists / template arguments are missing in this
// copy of the file (bare `template`, `std::vector&`, `reinterpret_cast(...)`);
// they must be restored from the upstream sources.
template
bool KmcInputFile::load(
	std::vector& kmersBuffer,
	std::vector& positionsBuffer,
	kmer_t*& kmers,
	size_t& kmersCount,
	uint32_t& kmerLength,
	double& filterValue) {

	uint32_t counter;

	uint32 _mode;
	uint32 _counter_size;
	uint32 _lut_prefix_length;
	uint32 _signature_len;
	uint32 _min_count;
	uint64 _max_count;
	uint64 _total_kmers;

	// NOTE(review): the bool result of Info() is ignored; if it can fail, the
	// variables above (including _total_kmers used for sizing) stay uninitialized - confirm.
	kmcfile->Info(kmerLength, _mode, _counter_size, _lut_prefix_length, _signature_len, _min_count, _max_count, _total_kmers);

	CKmerAPI kmer(kmerLength);

	uint64_t u_kmer;
	vector tmp;

	// Load all k-mers from the file into a vector so that prefetching can be done later
	this->filter->configure(kmerLength);
	// NOTE(review): dynamic_pointer_cast yields an empty pointer when the filter is
	// not of the requested type; the result is dereferenced below without a check.
	std::shared_ptr minhashFilter = std::dynamic_pointer_cast(this->filter);

	// allocate buffers
#ifdef USE_RADULS
	// two halves (data + auxiliary sort buffer) plus slack so both can be aligned
	kmersBuffer.resize(2 * _total_kmers + 4 * raduls::ALIGNMENT / sizeof(kmer_t));
	kmers = kmersBuffer.data();
	kmer_t* aux = kmersBuffer.data() + kmersBuffer.size() / 2;
	// advance both pointers to the next address that is a multiple of raduls::ALIGNMENT
	while (reinterpret_cast(kmers) % raduls::ALIGNMENT) ++kmers;
	while (reinterpret_cast(aux) % raduls::ALIGNMENT) ++aux;
#else
	kmersBuffer.resize(_total_kmers);
	kmers = kmersBuffer.data();
#endif

	// calculate k-mers shifting to get prefix of at least 8 bits
	size_t kmer_prefix_shift = 0;
	kmer_t tail_mask = 0;

	// 2 bits per symbol; SUFFIX_BITS is defined elsewhere in the project
	int prefix_bits = (int)kmerLength * 2 - SUFFIX_BITS;

	if (prefix_bits < 8) {
		kmer_prefix_shift = (size_t)(8 - prefix_bits);
		tail_mask = (1ULL << kmer_prefix_shift) - 1;
	}

	kmersCount = 0;
	while (!kmcfile->Eof())
	{
		if (!kmcfile->ReadNextKmer(kmer, counter))
			break;
		kmer.to_long(tmp);
		// only the first element produced by to_long() is used
		u_kmer = tmp.front();

		// shift left by kmer_prefix_shift and copy the original low bits into the freed positions
		u_kmer = (u_kmer << kmer_prefix_shift) | (u_kmer & tail_mask);

		if ((*this->filter)(u_kmer)) {
			kmers[kmersCount++] = u_kmer;
		}
	}

#ifdef USE_RADULS
	// key size in bytes, rounded up
	size_t key_size = ((kmerLength * 2) + 7) / 8;
	raduls::PartialRadixSortMSD(reinterpret_cast(kmers), reinterpret_cast(aux), kmersCount, sizeof(kmer_t), key_size, key_size - 4, 4);
	// NOTE(review): presumably the sorted data ends up in aux for odd key sizes - confirm against the raduls documentation
	if (key_size % 2) {
		std::swap(kmers, aux);
	}
#else
	//ParallelSort(kmers, kmersCount);
#endif

	// empirical ratio; it is overwritten two statements below
	filterValue = ((double)kmersCount / _total_kmers); // this may differ from theoretical
	//LOG_DEBUG << "Filter passed: " << kmersCount << "/" << _total_kmers << "(" << filterValue << ")" << endl ;
	filterValue = minhashFilter->getFraction(); // use proper value
	return kmcfile->Close();
}
getStartFraction() const { return startFraction; } 72 | 73 | AlphabetType getAlphabetType() const { return alphabetType; } 74 | 75 | size_t getSamplesCount() const { return sampleNames.size(); } 76 | 77 | const std::vector& getSampleNames() const { return sampleNames; } 78 | 79 | const std::vector& getSampleKmersCount() const { return sampleKmersCount; } 80 | 81 | virtual size_t getKmersCount() const = 0; 82 | 83 | virtual size_t getPatternsCount() const = 0; 84 | 85 | virtual size_t getPatternBytes() const = 0; 86 | 87 | virtual size_t getHashtableBytes() const = 0; 88 | 89 | virtual size_t getHashtableEntrySize() const = 0; 90 | 91 | virtual void serialize(std::ofstream& file, bool rawHashtables) const = 0; 92 | 93 | virtual bool deserialize(std::ifstream& file, DeserializationMode mode = DeserializationMode::Everything) = 0; 94 | 95 | virtual std::string printStats() const = 0; 96 | 97 | virtual std::string printDetailedTimes() const = 0; 98 | 99 | virtual std::string printProgress() const = 0; 100 | 101 | 102 | virtual sample_id_t addKmers( 103 | const std::string& sampleName, 104 | const kmer_t* kmers, 105 | uint32_t kmersCount, 106 | uint32_t kmerLength, 107 | double fraction, 108 | AlphabetType alphabetType, 109 | refresh::active_thread_pool& atp) { 110 | LOG_VERBOSE("Adding sample " << sampleNames.size() + 1 << ": " << sampleName << " (" << kmersCount << " kmers)" << endl); 111 | 112 | if (!isInitialized) { 113 | initialize(kmerLength, fraction, alphabetType); 114 | } 115 | 116 | if (this->kmerLength != kmerLength) { 117 | throw std::runtime_error("Error in AbstractKmerDb::addKmers(): adding kmers of different length"); 118 | } 119 | if (this->fraction != fraction) { 120 | throw std::runtime_error("Error in AbstractKmerDb::addKmers(): adding kmers of different minhash fraction"); 121 | } 122 | 123 | if (this->alphabetType != alphabetType) { 124 | throw std::runtime_error("Error in AbstractKmerDb::addKmers(): adding samples from different alphabet"); 125 | } 
126 | 127 | sample_id_t newId = (sample_id_t) sampleNames.size(); 128 | sampleNames.push_back(sampleName); 129 | sampleKmersCount.push_back(kmersCount); 130 | 131 | return newId; 132 | } 133 | }; 134 | -------------------------------------------------------------------------------- /src/kmer_db.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kmer_db", "kmer_db.vcxproj", "{E196ABDC-0F62-4E7A-8714-C10BAD223FC3}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Debug|x64.ActiveCfg = Debug|x64 17 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Debug|x64.Build.0 = Debug|x64 18 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Debug|x86.ActiveCfg = Debug|Win32 19 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Debug|x86.Build.0 = Debug|Win32 20 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Release|x64.ActiveCfg = Release|x64 21 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Release|x64.Build.0 = Release|x64 22 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Release|x86.ActiveCfg = Release|Win32 23 | {E196ABDC-0F62-4E7A-8714-C10BAD223FC3}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | -------------------------------------------------------------------------------- /src/kmer_db.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | input 27 | 28 | 29 | input 30 | 31 | 32 | input 33 | 34 | 35 | input 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | input 68 | 69 | 70 | input 71 | 72 | 73 | input 74 | 75 | 76 | input 77 | 78 | 79 | input 80 | 81 | 82 | input 83 | 84 | 85 | input 86 | 87 | 88 | input 89 | 90 | 91 | input 92 | 93 | 94 | input 95 | 96 | 97 | input 98 | 99 | 100 | 101 | 102 | {f86f7d73-a327-4e3c-a22e-f1322af7f6d9} 103 | 104 | 105 | -------------------------------------------------------------------------------- /src/kmer_extract.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "alphabet.h" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class KmerHelper { 9 | 10 | public: 11 | 12 | template 13 | static size_t extract( 14 | char* sequence, 15 | size_t sequenceLength, 16 | uint32_t kmerLength, 17 | const Alphabet& alphabet, 18 | const Filter& filter, 19 | kmer_t* kmers) { 20 | 21 | 22 | size_t counter = 0; 23 | 24 | kmer_t kmer_str, kmer_rev, kmer_can; 25 | uint32_t kmer_len_shift = (kmerLength - 1) * alphabet.bitsPerSymbol; 26 | kmer_t kmer_mask = (1ull << (alphabet.bitsPerSymbol * kmerLength)) - 1; 27 | int omit_next_n_kmers; 28 | uint32_t i; 29 | 30 | kmer_str = kmer_rev = 0; 31 | 32 | uint32_t str_pos = kmer_len_shift - alphabet.bitsPerSymbol; 33 | uint32_t rev_pos = alphabet.bitsPerSymbol; 34 | 35 | omit_next_n_kmers = 0; 36 | 37 | // calculate k-mers shifting to get prefix of at least 8 bits 38 | size_t kmer_prefix_shift = 0; 39 | kmer_t tail_mask = 0; 40 | int prefix_bits = (int)kmerLength * alphabet.bitsPerSymbol - SUFFIX_BITS; 41 | 42 | if (prefix_bits < 8) { 43 | kmer_prefix_shift = (size_t)(8 - prefix_bits); 44 | tail_mask = (1ULL << kmer_prefix_shift) - 1; 45 | } 46 | 47 | 48 | for (i = 0; i < kmerLength - 1; ++i, str_pos -= 
alphabet.bitsPerSymbol, rev_pos += alphabet.bitsPerSymbol) 49 | { 50 | int8_t symb = alphabet.map(sequence[i]); 51 | if (symb < 0) 52 | { 53 | symb = 0; 54 | omit_next_n_kmers = i + 1; 55 | } 56 | kmer_str += (kmer_t)symb << str_pos; 57 | kmer_rev += (kmer_t)(3 - symb) << rev_pos; // this makes sense only for DNA alphabet 58 | } 59 | 60 | for (; i < sequenceLength; ++i) 61 | { 62 | int8_t symb = alphabet.map(sequence[i]); 63 | if (symb < 0) 64 | { 65 | symb = 0; 66 | omit_next_n_kmers = kmerLength; 67 | } 68 | kmer_str = (kmer_str << alphabet.bitsPerSymbol) + (kmer_t)symb; 69 | kmer_str &= kmer_mask; 70 | 71 | kmer_rev >>= alphabet.bitsPerSymbol; 72 | kmer_rev += (kmer_t)(alphabet.size - 1 - symb) << kmer_len_shift; 73 | 74 | if (omit_next_n_kmers > 0) 75 | { 76 | --omit_next_n_kmers; 77 | continue; 78 | } 79 | 80 | if (alphabet.preserveStrand) { 81 | kmer_can = kmer_str; 82 | } 83 | else { 84 | kmer_can = (kmer_str < kmer_rev) ? kmer_str : kmer_rev; 85 | } 86 | 87 | // ensure at least 8-bit prefix 88 | kmer_can = (kmer_can << kmer_prefix_shift) | (kmer_can & tail_mask); 89 | 90 | if (filter(kmer_can)) { 91 | kmers[counter++] = kmer_can; 92 | } 93 | 94 | } 95 | 96 | return counter; 97 | } 98 | 99 | static void sort(kmer_t* kmers, size_t count, uint32_t n_threads = 1) { 100 | // ParallelSort(kmers, count, n_threads); 101 | refresh::sort::pdqsort_branchless(kmers, kmers + count); 102 | } 103 | 104 | static void sortAndUnique(kmer_t* kmers, size_t& count, uint32_t n_threads = 1) { 105 | // ParallelSort(kmers, count, n_threads); 106 | refresh::sort::pdqsort_branchless(kmers, kmers + count); 107 | auto it = std::unique(kmers, kmers + count); 108 | 109 | count = it - kmers; 110 | } 111 | 112 | static void unique(kmer_t* kmers, size_t& count) { 113 | // std::sort(kmers, kmers + count); 114 | refresh::sort::pdqsort_branchless(kmers, kmers + count); 115 | auto it = std::unique(kmers, kmers + count); 116 | 117 | count = it - kmers; 118 | } 119 | 120 | }; 121 | 122 | 
-------------------------------------------------------------------------------- /src/loader_ex.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #include "loader_ex.h" 10 | #include "input_file_factory.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace std; 19 | 20 | // ***************************************************************************************** 21 | // Sets up the queues and a pool of numThreads + numConsumers output buffers (all initially free), then starts the prefetcher thread and the reader thread(s). 22 | LoaderEx::LoaderEx( 23 | std::shared_ptr filter, 24 | std::shared_ptr alphabet, 25 | InputFile::Format inputFormat, 26 | int suggestedNumThreads, 27 | int numConsumers, 28 | bool multisampleFasta, 29 | bool storePositions) : 30 | 31 | inputFormat(inputFormat), 32 | numThreads(multisampleFasta ? 
1 : suggestedNumThreads), // use only one reader thread for multisample fasta files 33 | multisampleFasta(multisampleFasta), 34 | storePositions(storePositions) 35 | { 36 | readers.resize(numThreads); 37 | 38 | int outputBuffersCount = numThreads + numConsumers; 39 | 40 | // configure queues 41 | queues.input.Restart(1); 42 | queues.readers.Restart(1, outputBuffersCount); 43 | queues.output.Restart(numThreads); 44 | queues.freeBuffers.Restart(1); 45 | 46 | kmersCollections.resize(outputBuffersCount); 47 | positionsCollections.resize(outputBuffersCount); 48 | bufferRefCounters.resize(outputBuffersCount); 49 | 50 | for (size_t id = 0; id < kmersCollections.size(); ++id) { 51 | queues.freeBuffers.Push((int)id); 52 | } 53 | 54 | // run prefetcher thread 55 | prefetcher = std::thread(&LoaderEx::prefetcherJob, this, filter, alphabet); 56 | 57 | // run loader threads 58 | if (multisampleFasta) { 59 | // multisample fasta - single reader thread 60 | readers[0] = std::thread(&LoaderEx::multifastaReaderJob, this); 61 | } else { 62 | // collection of fasta files - several reader threads 63 | for (int tid = 0; tid < numThreads; ++tid) { 64 | readers[tid] = std::thread(&LoaderEx::readerJob, this, tid); 65 | } 66 | } 67 | } 68 | 69 | // ***************************************************************************************** 70 | // Marks every queue as completed, then joins the reader threads and the prefetcher thread. 71 | LoaderEx::~LoaderEx() { 72 | queues.input.MarkCompleted(); 73 | queues.readers.MarkCompleted(); 74 | queues.output.MarkCompleted(); 75 | queues.freeBuffers.MarkCompleted(); 76 | 77 | for (auto& t : readers) { 78 | t.join(); 79 | } 80 | 81 | prefetcher.join(); 82 | } 83 | 84 | // ***************************************************************************************** 85 | // Registers the input. If multipleSamples ends with one of the known sequence-file extensions it is used as the only input file; otherwise it is opened as a text file of whitespace-separated file paths. One task per file is pushed to the input queue, which is then marked completed. Returns the number of files; throws std::runtime_error when the list file cannot be opened. 86 | int LoaderEx::configure(const std::string& multipleSamples) { 87 | 88 | // determine if single FASTA file was provided as an input 89 | set extensions{ 90 | ".fa", ".fna", ".fasta", ".fastq", 91 | ".gz", ".fa.gz", ".fna.gz", ".fasta.gz", ".fastq.gz" }; 92 | 93 | bool 
isFasta = false; 94 | for (const auto& ext : extensions) { 95 | /* length check first: std::equal would otherwise walk past the beginning of a path shorter than the extension */ if (ext.size() <= multipleSamples.size() && std::equal(ext.rbegin(), ext.rend(), multipleSamples.rbegin())) { 96 | isFasta = true; 97 | break; 98 | } 99 | } 100 | 101 | if (isFasta) { 102 | fileNames.push_back(multipleSamples); 103 | } 104 | else { 105 | std::ifstream ifs(multipleSamples); 106 | 107 | if (!ifs) { 108 | /* report the path; the bool member multisampleFasta used here before turned the concatenation into pointer arithmetic on the literal */ throw std::runtime_error("Unable to open input file " + multipleSamples); 109 | } 110 | 111 | string fname; 112 | 113 | while (ifs >> fname) { 114 | fileNames.push_back(fname); 115 | } 116 | } 117 | 118 | for (size_t i = 0; i < fileNames.size(); ++i) { 119 | queues.input.Push(std::make_shared(i, fileNames[i])); 120 | } 121 | 122 | queues.input.MarkCompleted(); 123 | return (int)fileNames.size(); 124 | } 125 | 126 | // ***************************************************************************************** 127 | // Prefetcher thread body: for every task popped from the input queue it creates an input file object for inputFormat and opens task->filePath. Opened tasks are pushed to the readers queue, failures are only logged. The readers queue is marked completed once the input queue is completed. 128 | void LoaderEx::prefetcherJob(std::shared_ptr filter, std::shared_ptr alphabet) { 129 | while (!this->queues.input.IsCompleted()) { 130 | std::shared_ptr task; 131 | 132 | if (this->queues.input.Pop(task)) { 133 | LOG_DEBUG("input queue -> (file " << task->fileId + 1 << ")" << endl); 134 | 135 | task->file = std::shared_ptr(InputFileFactory::create(this->inputFormat, filter, alphabet)); 136 | 137 | if (task->file->open(task->filePath)) { 138 | queues.readers.Push(task); 139 | LOG_DEBUG("(file " << task->fileId + 1 << ", " << task->filePath << ") -> readers queue " << endl); 140 | } 141 | else { 142 | LOG_NORMAL("failed:" << task->filePath << endl); 143 | } 144 | } 145 | } 146 | 147 | queues.readers.MarkCompleted(); 148 | LOG_DEBUG("reader thread completed" << endl); 149 | } 150 | 151 | // ***************************************************************************************** 152 | // Reader thread body (one input file = one sample): takes a free buffer and an opened input task, loads the k-mers into that buffer and pushes the sample task to the output queue under its file id. When the load fails the buffer goes straight back to the free pool. Marks the output queue completed on exit. 153 | void LoaderEx::readerJob(int tid) { 154 | 155 | while (!this->queues.readers.IsCompleted()) { 156 | std::shared_ptr inputTask; 157 | int bufferId = 0; 158 | bool ok = false; 159 | 160 | // get buffer and input task 161 | 
if (this->queues.freeBuffers.Pop(bufferId) && this->queues.readers.Pop(inputTask)) { 162 | 163 | LOG_DEBUG("readers queue -> (file " << inputTask->fileId + 1 << "), tid: " << tid << endl); 164 | 165 | auto sampleTask = make_shared( 166 | inputTask->fileId, 167 | inputTask->filePath, 168 | std::filesystem::path(inputTask->filePath).filename().string(), 169 | bufferId); 170 | 171 | ok = inputTask->file->load( 172 | kmersCollections[bufferId], positionsCollections[bufferId], 173 | sampleTask->kmers, sampleTask->kmersCount, sampleTask->kmerLength, sampleTask->fraction); 174 | 175 | if (ok) { 176 | ++bufferRefCounters[bufferId]; 177 | queues.output.Push((int)sampleTask->id, sampleTask); 178 | 179 | LOG_DEBUG("(sample " << sampleTask->id + 1 << ", " << sampleTask->sampleName <<") -> loader output queue, buf: " << bufferId << std::endl); 180 | LOG_VERBOSE("File loaded successfully: " << inputTask->fileId + 1 << endl); 181 | } 182 | else { 183 | LOG_NORMAL("File load failed: " << inputTask->fileId + 1 << endl); queues.freeBuffers.Push(bufferId); /* nothing was queued for this buffer, so hand it back to the pool instead of leaking it */ 184 | } 185 | } 186 | } 187 | 188 | queues.output.MarkCompleted(); 189 | LOG_DEBUG("loader thread completed: " << tid << endl); 190 | } 191 | 192 | // ***************************************************************************************** 193 | // Reader thread body for a multi-sample FASTA: for each opened input task it calls loadNext() repeatedly, giving every call its own free buffer and the next consecutive sample id, and pushes every resulting sample task to the output queue. Marks the output queue completed on exit. 194 | void LoaderEx::multifastaReaderJob() { 195 | 196 | size_t sample_id = 0; 197 | 198 | while (!this->queues.readers.IsCompleted()) { 199 | std::shared_ptr inputTask; 200 | int bufferId = 0; 201 | int count = 0; 202 | 203 | // wait for input task 204 | if (!this->queues.readers.Pop(inputTask)) { 205 | continue; 206 | } 207 | 208 | auto genomicFile = std::dynamic_pointer_cast(inputTask->file); 209 | 210 | // initialize multifasta file (a failed cast leaves genomicFile null and is reported as a failed load below) 211 | if (genomicFile && genomicFile->initMultiFasta()) { 212 | 213 | LOG_DEBUG("multifasta initialized : " << inputTask->fileId + 1 << endl); 214 | 215 | while (true) { 216 | 217 | LOG_DEBUG("wait for buf for sample " << sample_id + 1 << endl); 218 | if (!this->queues.freeBuffers.Pop(bufferId)) { break; } // stop when no buffer could be obtained instead of reusing a stale id; otherwise wait for 
free buffer 219 | 220 | LOG_DEBUG("acquired buf " << bufferId << " for sample " << sample_id + 1 << endl); 221 | 222 | auto sampleTask = make_shared( 223 | sample_id, 224 | inputTask->filePath, 225 | "", 226 | bufferId); 227 | 228 | bool ok = genomicFile->loadNext( 229 | kmersCollections[bufferId], positionsCollections[bufferId], 230 | sampleTask->kmers, sampleTask->kmersCount, sampleTask->kmerLength, sampleTask->fraction, sampleTask->sampleName, 231 | total_kmers_in_kmers_collections); 232 | 233 | ++sample_id; 234 | ++bufferRefCounters[bufferId]; 235 | queues.output.Push((int)sampleTask->id, sampleTask); 236 | ++count; 237 | 238 | LOG_DEBUG("(sample " << sampleTask->id + 1 << ") -> output queue, buf: " << bufferId << std::endl); 239 | 240 | // no more samples (NOTE(review): the task of this final call has already been queued above - presumably it still carries the last sample; confirm against loadNext) 241 | if (!ok) { 242 | break; 243 | } 244 | } 245 | } 246 | 247 | if (count > 0) { 248 | LOG_VERBOSE("File loaded successfully: " << inputTask->fileId + 1 << endl); 249 | } 250 | else { 251 | LOG_NORMAL("File load failed: " << inputTask->fileId + 1 << endl); 252 | } 253 | } 254 | queues.output.MarkCompleted(); 255 | 256 | LOG_DEBUG("output queue: mark completed" << endl); 257 | } 258 | 259 | -------------------------------------------------------------------------------- /src/loader_ex.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 
4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | 10 | #include "input_file.h" 11 | #include "queue.h" 12 | #include "filter.h" 13 | #include "loader_tasks.h" 14 | #include "alphabet.h" 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | 23 | 24 | // ***************************************************************************************** 25 | // Background sample loader: a prefetcher thread opens the input files, reader threads load k-mers into a pool of reusable buffers, and consumers fetch the results with popTask() and give the buffers back with releaseTask(). 26 | class LoaderEx { 27 | public: 28 | 29 | LoaderEx( 30 | std::shared_ptr filter, 31 | std::shared_ptr alphabet, 32 | InputFile::Format inputFormat, 33 | int suggestedNumThreads, 34 | int numConsumers, 35 | bool multisampleFasta, 36 | bool storePositions = false); 37 | 38 | ~LoaderEx(); 39 | 40 | int configure(const std::string& multipleKmcSamples); 41 | /* Pops the task queued under sampleId from the output queue; the returned pointer stays null when Pop() reports failure. */ 42 | std::shared_ptr popTask(int sampleId) { 43 | std::shared_ptr task; 44 | if (queues.output.Pop(sampleId, task)) { 45 | LOG_DEBUG("output queue -> (sample " << sampleId + 1 << ")" << std::endl); 46 | } 47 | return task; 48 | } 49 | /* Decrements the reference counter of the task's buffer and returns the buffer to the free pool when the counter reaches zero. */ 50 | void releaseTask(SampleTask& t) { 51 | if (--bufferRefCounters[t.bufferId] == 0) { 52 | queues.freeBuffers.Push(t.bufferId); 53 | LOG_DEBUG("sample " << t.id + 1 << ": release buffer " << t.bufferId << std::endl); 54 | } 55 | } 56 | /* Returns the total_kmers_in_kmers_collections counter; the capacity-based byte estimate is kept below only as commented-out code. */ 57 | size_t getBytes() { 58 | return total_kmers_in_kmers_collections; 59 | 60 | /* size_t mem = 0; 61 | for (const auto& col : kmersCollections) { 62 | mem += col.capacity() * sizeof(kmer_t); 63 | } 64 | 65 | return mem;*/ 66 | } 67 | /* True once the output queue reports completion. */ 68 | bool isCompleted() { 69 | return queues.output.IsCompleted(); 70 | } 71 | /* Number of registered input files, or 0 in multisample-FASTA mode. */ 72 | size_t getSamplesCount() const { 73 | return multisampleFasta ? 
0 : fileNames.size(); 74 | } 75 | /* Size of the buffer pool. */ 76 | int getOutputBuffersCount() { 77 | return kmersCollections.size(); 78 | } 79 | 80 | private: 81 | 82 | InputFile::Format inputFormat; 83 | 84 | int numThreads; 85 | 86 | bool multisampleFasta; 87 | 88 | bool storePositions; 89 | 90 | uint32_t kmerLength; 91 | 92 | std::thread prefetcher; 93 | 94 | std::vector fileNames; 95 | 96 | std::vector readers; 97 | 98 | std::vector> kmersCollections; 99 | atomic total_kmers_in_kmers_collections{ 0 }; 100 | 101 | std::vector> positionsCollections; 102 | 103 | std::vector bufferRefCounters; 104 | 105 | /* Pipeline queues: input (registered files) -> readers (opened files) -> output (loaded samples); freeBuffers holds the ids of buffers that may be reused. */ 106 | struct { 107 | RegisteringQueue> input{ 1 }; 108 | 109 | RegisteringQueue> readers{ 1 }; 110 | 111 | RegisteringQueue freeBuffers{ 1 }; 112 | 113 | SynchronizedPriorityQueue> output{ 1 }; 114 | } queues; 115 | 116 | 117 | void prefetcherJob(std::shared_ptr filter, std::shared_ptr alphabet); 118 | 119 | void readerJob(int tid); 120 | 121 | void multifastaReaderJob(); 122 | }; -------------------------------------------------------------------------------- /src/loader_tasks.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | class InputFile; 5 | 6 | // ***************************************************************************************** 7 | // One input file travelling through the loader: its index, its path and, once the prefetcher has created it, the input file object (null until then). 8 | struct InputTask { 9 | size_t fileId; 10 | // const std::string& filePath; 11 | const std::string filePath; 12 | std::shared_ptr file; 13 | 14 | 15 | InputTask(size_t fileId, const std::string& filePath) : 16 | fileId(fileId), filePath(filePath), file(nullptr) { 17 | } 18 | }; 19 | 20 | // ***************************************************************************************** 21 | // One loaded sample handed to the consumers: id, origin, name, the k-mer array with its size and parameters, and the id of the pooled buffer backing it. The constructor zero-initialises the fields that are filled in later by the input file, so a task queued after a failed load never exposes indeterminate values. 22 | struct SampleTask { 23 | size_t id; 24 | // const std::string& filePath; 25 | const std::string filePath; 26 | std::string sampleName; 27 | kmer_t *kmers; 28 | size_t kmersCount; 29 | uint32_t kmerLength; 30 | double fraction; 31 | int bufferId; 32 | int bufferId2; 33 | 34 | SampleTask(size_t 
id, const std::string& filePath, const std::string& sampleName, int bufferId) : 35 | id(id), filePath(filePath), sampleName(sampleName), kmers(nullptr), kmersCount(0), kmerLength(0), fraction(0.0), bufferId(bufferId), bufferId2(-1) /* bufferId2 is not set anywhere in the visible code; -1 = no second buffer (assumption, confirm against its users) */ {} 36 | 37 | }; 38 | -------------------------------------------------------------------------------- /src/log.cpp: -------------------------------------------------------------------------------- 1 | #include "log.h" 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | const int Log::LEVEL_DEBUG = 0; 16 | const int Log::LEVEL_VERBOSE = 1; 17 | const int Log::LEVEL_NORMAL = 2; 18 | 19 | 20 | // ************************************************************************************ 21 | // NumericConversions statics 22 | char NumericConversions::digits[]; 23 | NumericConversions::_si NumericConversions::_init; 24 | uint64_t NumericConversions::powers10[]; 25 | 26 | 27 | 28 | // ***************************************************************************************** 29 | // Formats num in decimal with ',' between groups of three digits and left-pads the result with spaces to at least minWidth characters. 30 | std::string Log::formatLargeNumber(uint64_t num, int minWidth) { 31 | std::string ret = ""; 32 | 33 | do { 34 | uint64_t part = num % 1000uLL; 35 | num = num / 1000uLL; 36 | 37 | if (num > 0) { 38 | std::ostringstream oss; 39 | oss << "," << std::setw(3) << std::setfill('0') << part; 40 | ret = oss.str() + ret; 41 | /* auto s = std::to_string(part); 42 | if (s.length() < 3) 43 | ret = "," + std::string(3 - s.length(), '0') + s + ret; 44 | else 45 | ret = "," + s + ret;*/ 46 | } 47 | else { 48 | ret = std::to_string(part) + ret; 49 | } 50 | 51 | } while (num > 0); 52 | 53 | int initialSpaces = minWidth - (int)ret.length(); 54 | 55 | if (initialSpaces > 0) { 56 | ret = string(initialSpaces, ' ') + ret; 57 | } 58 | 59 | return ret; 60 | } 61 | 
-------------------------------------------------------------------------------- /src/log.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | #include "conversion.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | /* Logging macros: stream msg to the logger of the given level and flush. LOG_VERBOSE and LOG_DEBUG test the logger first, so msg is not evaluated when that level is disabled; LOG_NORMAL always evaluates it. */ 18 | #define LOG_VERBOSE(msg) { if (Log::getInstance(Log::LEVEL_VERBOSE)) { Log::getInstance(Log::LEVEL_VERBOSE) << msg << std::flush; }} 19 | #define LOG_DEBUG(msg) { if (Log::getInstance(Log::LEVEL_DEBUG)) { Log::getInstance(Log::LEVEL_DEBUG) << msg << std::flush; }} 20 | #define LOG_NORMAL(msg) { Log::getInstance(Log::LEVEL_NORMAL) << msg << std::flush; } 21 | 22 | /* Stream proxy returned by Log::operator<<. When built from a stream and a mutex it forwards every inserted value to the stream and, on destruction, flushes the stream and unlocks the mutex (the lock itself is taken by Log, not here). A default-constructed proxy discards everything. */ 23 | class LockedStream { 24 | std::ostream* out{ nullptr }; 25 | std::recursive_mutex* mtx{ nullptr }; 26 | 27 | public: 28 | // LockedStream() : mtx() {} 29 | LockedStream() = default; 30 | LockedStream(std::ostream& out, std::recursive_mutex& mtx) : out(&out), mtx(&mtx) {} 31 | ~LockedStream() { 32 | if (out) { 33 | out->flush(); 34 | mtx->unlock(); 35 | } 36 | } 37 | /* NOTE(review): with the generic overload commented out, the template header below attaches to the next live declaration, i.e. the bool overload - confirm that this is intended. */ 38 | template 39 | // LockedStream& operator<< (const T& v) { if (out) { *out << v; }; return *this; } 40 | LockedStream& operator<< (bool v) { if (out) { *out << v; }; return *this; } 41 | LockedStream& operator<< (long v) { if (out) { *out << v; }; return *this; } 42 | LockedStream& operator<< (unsigned long v) { if (out) { *out << v; }; return *this; } 43 | LockedStream& operator<< (long long v) { if (out) { *out << v; }; return *this; } 44 | LockedStream& operator<< (unsigned long long v) { if (out) { *out << v; }; return *this; } 45 | LockedStream& operator<< (float v) { if (out) { *out << v; }; return *this; } 46 | LockedStream& operator<< (double v) 
{ if (out) { *out << v; }; return *this; } 47 | LockedStream& operator<< (long double v) { if (out) { *out << v; }; return *this; } 48 | // LockedStream& operator<< (const void *v) { if (out) { *out << v; }; return *this; } 49 | LockedStream& operator<< (short v) { if (out) { *out << v; }; return *this; } 50 | LockedStream& operator<< (unsigned short v) { if (out) { *out << v; }; return *this; } 51 | LockedStream& operator<< (int v) { if (out) { *out << v; }; return *this; } 52 | LockedStream& operator<< (unsigned int v) { if (out) { *out << v; }; return *this; } 53 | 54 | LockedStream& operator<< (std::string v) { if (out) { *out << v; }; return *this; } 55 | /* stream manipulators such as std::endl and std::flush */ 56 | LockedStream& operator<< (std::ostream& (*pf)(std::ostream&)) { if (out) { *out << pf; }; return *this; } 57 | LockedStream& operator<< (std::ios& (*pf)(std::ios&)) { if (out) { *out << pf; }; return *this; } 58 | LockedStream& operator<< (std::ios& (*pf)(std::ios_base&)) { if (out) { *out << pf; }; return *this; } 59 | 60 | friend LockedStream& operator<<(LockedStream& os, const char* s); 61 | friend LockedStream& operator<<(LockedStream& os, const signed char* s); 62 | friend LockedStream& operator<<(LockedStream& os, const unsigned char* s); 63 | }; 64 | /* C-string overloads as free functions; like the members they write only when the proxy holds a stream. */ 65 | inline LockedStream& operator<<(LockedStream& os, const char*s) { if (os.out) { *(os.out) << s; }; return os; } 66 | inline LockedStream& operator<<(LockedStream& os, const signed char*s) { if (os.out) { *(os.out) << s; }; return os; } 67 | inline LockedStream& operator<<(LockedStream& os, const unsigned char*s) { if (os.out) { *(os.out) << s; }; return os; } 68 | 69 | // ***************************************************************************************** 70 | // Logger for one verbosity level, writing to std::cerr. Instances are obtained through getInstance() and start disabled. The first operator<< of a statement locks the logger's mutex, writes the value and returns a LockedStream that keeps the lock until the end of the statement. 71 | class Log 72 | { 73 | public: 74 | static const int LEVEL_NORMAL; 75 | static const int LEVEL_VERBOSE; 76 | static const int LEVEL_DEBUG; 77 | 78 | void enable() { enabled = true; } 79 | void disable() { enabled = false; } 80 | 81 | // 
***************************************************************************************** 82 | /* Returns the logger for the given level; three loggers are created on first use and level is used as an unchecked index into them. */ static Log& getInstance(int level) { 83 | static std::vector> logs{ 84 | std::shared_ptr(new Log()), 85 | std::shared_ptr(new Log()), 86 | std::shared_ptr(new Log()) 87 | }; 88 | 89 | return *logs[level]; 90 | } 91 | /* The operator<< overloads below behave alike: when enabled, lock mtx, write the value or manipulator and return a LockedStream owning the unlock; when disabled, return an inactive LockedStream. */ 92 | // ***************************************************************************************** 93 | template 94 | LockedStream operator<<(const T& v) { 95 | if (enabled) { 96 | mtx.lock(); 97 | out << v; 98 | return LockedStream(out, mtx); 99 | } 100 | return LockedStream(); 101 | } 102 | 103 | // ***************************************************************************************** 104 | LockedStream operator<< (std::ostream& (*pf)(std::ostream&)) { 105 | if (enabled) { 106 | mtx.lock(); 107 | out << pf; 108 | return LockedStream(out, mtx); 109 | } 110 | return LockedStream(); 111 | } 112 | 113 | // ***************************************************************************************** 114 | LockedStream operator<< (std::ios& (*pf)(std::ios&)) { 115 | if (enabled) { 116 | mtx.lock(); 117 | out << pf; 118 | return LockedStream(out, mtx); 119 | } 120 | return LockedStream(); 121 | } 122 | 123 | // ***************************************************************************************** 124 | LockedStream operator<< (std::ios& (*pf)(std::ios_base&)) { 125 | if (enabled) { 126 | mtx.lock(); 127 | out << pf; 128 | return LockedStream(out, mtx); 129 | } 130 | return LockedStream(); 131 | } 132 | 133 | static std::string formatLargeNumber(uint64_t num, int minWidth = 0); 134 | /* true when this logger is enabled; used by the LOG_VERBOSE and LOG_DEBUG macros */ 135 | operator bool() const 136 | { 137 | return enabled; 138 | } 139 | 140 | protected: 141 | bool enabled; 142 | std::ostream& out{std::cerr}; 143 | std::recursive_mutex mtx; 144 | 145 | Log() : enabled(false) {} 146 | }; 147 | 148 | -------------------------------------------------------------------------------- /src/main.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #ifdef _MSC_VER 10 | //#include 11 | #include 12 | #endif 13 | 14 | #include "log.h" 15 | #include "version.h" 16 | 17 | #include "console.h" 18 | #include "params.h" 19 | 20 | #include 21 | 22 | int main(int argc, char **argv) 23 | { 24 | Log::getInstance(Log::LEVEL_NORMAL).enable(); 25 | 26 | Params params; 27 | 28 | try { 29 | // returns false when help message was desired 30 | if (!params.parse(argc, argv)) { 31 | return 0; 32 | } 33 | 34 | time_t rawtime; 35 | struct tm* timeinfo; 36 | time(&rawtime); 37 | timeinfo = localtime(&rawtime); 38 | LOG_NORMAL("Analysis started at " << asctime(timeinfo) << endl); 39 | 40 | auto console = ConsoleFactory::create(params.mode); 41 | if (!console) { 42 | throw std::runtime_error("Invalid mode selected"); 43 | } 44 | 45 | console->run(params); 46 | 47 | time(&rawtime); 48 | timeinfo = localtime(&rawtime); 49 | LOG_NORMAL(endl << "Analysis finished at " << asctime(timeinfo) << endl); 50 | } 51 | catch (usage_error& err) { 52 | LOG_NORMAL("ERROR: Incorrect usage" << endl << "See detailed instructions below" << endl << endl); 53 | params.showInstructions(err.getMode()); 54 | return -1; 55 | } 56 | catch (std::runtime_error& err) { 57 | LOG_NORMAL("ERROR: " << err.what() << endl); 58 | return -1; 59 | } 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /src/minhashed_input_file.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "input_file.h" 3 | #include "kmer_db.h" 4 | #include "filter.h" 5 | #include "kmer_extract.h" 6 | #include 
"parallel_sorter.h" 7 | 8 | #include "../libs/refresh/compression/lib/file_wrapper.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #ifdef USE_RADULS 18 | #include 19 | #endif 20 | 21 | // ***************************************************************************************** 22 | // 23 | class MihashedInputFile : public InputFile { 24 | public: 25 | inline bool open(const std::string& filename) override; 26 | 27 | inline bool load( 28 | std::vector& kmersBuffer, 29 | std::vector& positionsBuffer, 30 | kmer_t*& kmers, 31 | size_t& kmersCount, 32 | uint32_t& kmerLength, 33 | double& filterValue) override; 34 | 35 | inline bool store( 36 | const std::string& filename, 37 | const kmer_t* kmers, 38 | size_t kmersCount, 39 | uint32_t kmerLength, 40 | double filterValue); 41 | 42 | protected: 43 | const uint32_t MINHASH_FORMAT_SIGNATURE = 0xfedcba98; 44 | 45 | std::vector kmers; 46 | 47 | uint32_t kmerLength{ 0 }; 48 | 49 | double fraction{ 0 }; 50 | 51 | bool status{ false }; 52 | }; 53 | 54 | 55 | 56 | // ***************************************************************************************** 57 | // 58 | bool MihashedInputFile::open(const std::string& filename) { 59 | std::ifstream file(filename + ".minhash", std::ios_base::binary); 60 | status = false; 61 | if (file) { 62 | uint32_t signature = 0; 63 | file.read(reinterpret_cast(&signature), sizeof(uint32_t)); 64 | if (signature == MINHASH_FORMAT_SIGNATURE) { 65 | size_t numKmers; 66 | file.read(reinterpret_cast(&numKmers), sizeof(size_t)); 67 | kmers.resize(numKmers); 68 | 69 | file.read(reinterpret_cast(kmers.data()), sizeof(kmer_t) * numKmers); 70 | file.read(reinterpret_cast(&kmerLength), sizeof(kmerLength)); 71 | file.read(reinterpret_cast(&fraction), sizeof(fraction)); 72 | if (file) { 73 | status = true; 74 | } 75 | } 76 | file.close(); 77 | } 78 | 79 | if (!status) { 80 | kmers.clear(); 81 | } 82 | 83 | return status; 84 | } 85 | 86 | // 
***************************************************************************************** 87 | // Hands the k-mers read by open() to the caller: they are moved into kmersBuffer, and kmers/kmersCount are pointed at that buffer; k-mer length and fraction are copied. positionsBuffer is left untouched. Returns false when open() did not succeed. Because the k-mers are moved out, a repeated call yields an empty set. 88 | bool MihashedInputFile::load( 89 | std::vector& kmersBuffer, 90 | std::vector& positionsBuffer, 91 | kmer_t*& kmers, 92 | size_t& kmersCount, 93 | uint32_t& kmerLength, 94 | double& filterValue) { 95 | if (!status) { 96 | return false; 97 | } 98 | 99 | kmersBuffer = std::move(this->kmers); 100 | kmers = kmersBuffer.data(); 101 | kmersCount = kmersBuffer.size(); 102 | kmerLength = this->kmerLength; 103 | filterValue = this->fraction; 104 | return true; 105 | } 106 | 107 | // ***************************************************************************************** 108 | // Writes the k-mer set to filename + ".minhash" in the layout read by open(): signature, k-mer count, k-mers, k-mer length, fraction. Returns true only when the file could be opened and every write, including the final flush, succeeded. 109 | bool MihashedInputFile::store(const std::string& filename, const kmer_t* kmers, size_t kmersCount, uint32_t kmerLength, double filterValue) { 110 | ofstream ofs(filename + ".minhash", std::ios_base::binary); 111 | ofs.write(reinterpret_cast(&MINHASH_FORMAT_SIGNATURE), sizeof(MINHASH_FORMAT_SIGNATURE)); 112 | 113 | ofs.write(reinterpret_cast(&kmersCount), sizeof(size_t)); 114 | ofs.write(reinterpret_cast(kmers), kmersCount * sizeof(kmer_t)); 115 | ofs.write(reinterpret_cast(&kmerLength), sizeof(kmerLength)); 116 | ofs.write(reinterpret_cast(&filterValue), sizeof(filterValue)); 117 | ofs.flush(); /* push buffered data out so that a failed write is visible in the stream state */ return ofs.good(); 118 | } 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /src/parallel_sorter.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 
3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #define USE_PDQSORT 10 | 11 | #include "parallel_sorter.h" 12 | #include "types.h" 13 | 14 | #ifdef USE_PDQSORT 15 | #include "../libs/refresh/sort/lib/pdqsort_par.h" 16 | #else 17 | #ifdef WIN32 18 | #include 19 | #elif defined __APPLE__ 20 | #include 21 | #else 22 | #include 23 | #endif 24 | #endif 25 | 26 | #if 0 27 | /* Everything from here to the final #endif of this file is excluded from compilation by the "#if 0" above; the matching declarations in parallel_sorter.h are commented out as well. */ 28 | // ***************************************************************************************** 29 | // Sorts arr[0, arr_size) without an explicit comparator. With USE_PDQSORT (defined at the top of this file) the thread count is adjusted by pdqsort_adjust_threads and the thread-pool variant is used when atp is non-null. 30 | void ParallelSort(kmer_t *arr, size_t arr_size, uint32_t max_n_threads, refresh::active_thread_pool* atp) 31 | { 32 | #ifdef USE_PDQSORT 33 | if(atp) 34 | refresh::sort::pdqsort_branchless_tp(refresh::sort::pdqsort_adjust_threads(arr_size, max_n_threads), arr, arr + arr_size, *atp); 35 | else 36 | refresh::sort::pdqsort_branchless(refresh::sort::pdqsort_adjust_threads(arr_size, max_n_threads), arr, arr + arr_size); 37 | #else 38 | #ifdef WIN32 39 | // concurrency::parallel_sort(arr, arr + arr_size); 40 | std::stable_sort(arr, arr + arr_size); 41 | //std::stable_sort(samplePatterns.begin(), samplePatterns.end(), pid_comparer); 42 | #elif defined __APPLE__ 43 | std:: stable_sort(arr, arr + arr_size); 44 | #else 45 | __gnu_parallel::sort(arr, arr + arr_size); 46 | #endif 47 | #endif 48 | } 49 | 50 | // ***************************************************************************************** 51 | // Sorts an array of pairs by the first member only. The tmp, rec_size and key_size parameters are not used by this implementation. 52 | void ParallelSort(pair *arr, size_t arr_size, pair *tmp, int rec_size, int key_size, int n_threads, refresh::active_thread_pool* atp) 53 | { 54 | auto pid_comparer = [](const std::pair& a, const std::pair& b)->bool { 55 | return a.first < b.first; 56 | }; 57 | 58 | #ifdef USE_PDQSORT 59 | if(atp) 60 | refresh::sort::pdqsort_branchless_tp(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer, *atp); 61 | else 62 
| refresh::sort::pdqsort_branchless(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer); 63 | #else 64 | #ifdef WIN32 65 | // concurrency::parallel_sort(arr, arr + arr_size, pid_comparer); 66 | std::stable_sort(arr, arr + arr_size, pid_comparer); 67 | #elif defined __APPLE__ 68 | std:: stable_sort(arr, arr + arr_size, pid_comparer); 69 | #else 70 | __gnu_parallel::sort(arr, arr + arr_size, pid_comparer); 71 | #endif 72 | #endif 73 | } 74 | 75 | // ***************************************************************************************** 76 | // Sorts an array of pairs using the full pair comparison (a < b), i.e. by first and then by second. 77 | void ParallelSort(pair* arr, size_t arr_size, int n_threads, refresh::active_thread_pool* atp) 78 | { 79 | auto pid_comparer = [](const std::pair& a, const std::pair& b)->bool { 80 | return a < b; 81 | }; 82 | 83 | #ifdef USE_PDQSORT 84 | if(atp) 85 | refresh::sort::pdqsort_branchless_tp(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer, *atp); 86 | else 87 | refresh::sort::pdqsort_branchless(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer); 88 | #else 89 | #ifdef WIN32 90 | // concurrency::parallel_sort(arr, arr + arr_size, pid_comparer); 91 | std::stable_sort(arr, arr + arr_size, pid_comparer); 92 | #elif defined __APPLE__ 93 | std::stable_sort(arr, arr + arr_size, pid_comparer); 94 | #else 95 | __gnu_parallel::sort(arr, arr + arr_size, pid_comparer); 96 | #endif 97 | #endif 98 | } 99 | 100 | // ***************************************************************************************** 101 | // Sorts an array of pairs by the pattern_id member of the first element. The tmp, rec_size and key_size parameters are not used by this implementation. 102 | void ParallelSort(pair *arr, size_t arr_size, pair *tmp, int rec_size, int key_size, int n_threads, refresh::active_thread_pool* atp) 103 | { 104 | auto pid_comparer = [](const std::pair& a, const std::pair& b)->bool { 105 | return a.first.pattern_id < b.first.pattern_id; 106 | }; 107 | 108 | #ifdef USE_PDQSORT 109 | if(atp) 110 | 
refresh::sort::pdqsort_branchless_tp(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer, *atp); 111 | // refresh::sort::pdqsort_tp(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer, *atp); 112 | else 113 | refresh::sort::pdqsort_branchless(refresh::sort::pdqsort_adjust_threads(arr_size, n_threads), arr, arr + arr_size, pid_comparer); 114 | #else 115 | #ifdef WIN32 116 | // concurrency::parallel_sort(arr, arr + arr_size, pid_comparer); 117 | std::stable_sort(arr, arr + arr_size, pid_comparer); 118 | #elif defined __APPLE__ 119 | std:: stable_sort(arr, arr + arr_size, pid_comparer); 120 | #else 121 | __gnu_parallel::sort(arr, arr + arr_size, pid_comparer); 122 | #endif 123 | #endif 124 | } 125 | #endif -------------------------------------------------------------------------------- /src/parallel_sorter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 
4 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 5 | 6 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 7 | 8 | */ 9 | 10 | #include 11 | #include "types.h" 12 | #include 13 | 14 | using namespace std; 15 | 16 | 17 | /*void ParallelSort(kmer_t* arr, size_t arr_size, uint32_t n_threads, refresh::active_thread_pool* atp = nullptr); 18 | 19 | void ParallelSort(pair *arr, size_t arr_size, pair *tmp, int rec_size, int key_size, int n_threads, refresh::active_thread_pool* atp = nullptr); 20 | 21 | void ParallelSort(pair *arr, size_t arr_size, pair *tmp, int rec_size, int key_size, int n_threads, refresh::active_thread_pool* atp = nullptr); 22 | 23 | void ParallelSort(pair* arr, size_t arr_size, int n_threads, refresh::active_thread_pool* atp = nullptr); 24 | */ -------------------------------------------------------------------------------- /src/params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "input_file.h" 4 | #include "sparse_filters.h" 5 | #include "alphabet.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | // ***************************************************************************************** 13 | // Command-line parameters: the names of modes, switches and options, the parsed values with their defaults, and small helpers that find and remove switches/options from the argument list. 14 | 15 | class Params { 16 | public: 17 | enum Mode { 18 | build, 19 | minhash, 20 | all2all, 21 | all2all_sparse, 22 | all2all_parts, 23 | new2all, 24 | one2all, 25 | distance, 26 | unknown 27 | }; 28 | 29 | const std::string MODE_BUILD = "build"; 30 | const std::string MODE_MINHASH = "minhash"; 31 | const std::string MODE_ALL_2_ALL = "all2all"; 32 | const std::string MODE_ALL_2_ALL_SPARSE = "all2all-sp"; 33 | const std::string MODE_ALL_2_ALL_PARTS = "all2all-parts"; 34 | const std::string MODE_NEW_2_ALL = "new2all"; 35 | const std::string MODE_ONE_2_ALL = "one2all"; 36 | const std::string MODE_DISTANCE = "distance"; 37 | 38 | const std::string SWITCH_HELP = "-help"; 39 | const std::string 
SWITCH_VERSION = "-version"; 40 | const std::string SWITCH_KMC_SAMPLES = "-from-kmers"; 41 | const std::string SWITCH_MINHASH_SAMPLES = "-from-minhash"; 42 | const std::string SWITCH_MULTISAMPLE_FASTA = "-multisample-fasta"; 43 | const std::string SWITCH_PHYLIP_OUT = "-phylip-out"; 44 | const std::string SWITCH_EXTEND_DB = "-extend"; 45 | const std::string SWITCH_SPARSE = "-sparse"; 46 | 47 | const std::string OPTION_FRACTION = "-f"; 48 | const std::string OPTION_FRACTION_START = "-f-start"; 49 | const std::string OPTION_LENGTH = "-k"; 50 | const std::string OPTION_VERBOSE = "-v"; 51 | const std::string OPTION_DEBUG = "-vv"; 52 | const std::string OPTION_THREADS = "-t"; 53 | const std::string OPTION_READER_THREADS = "-rt"; 54 | const std::string OPTION_BUFFER = "-buffer"; 55 | const std::string OPTION_BUBBLE_SIZE = "-bubble-size"; 56 | 57 | const std::string OPTION_MAX = "-max"; 58 | const std::string OPTION_MIN = "-min"; 59 | 60 | const std::string OPTION_ALPHABET = "-alphabet"; 61 | const std::string SWITCH_PRESERVE_STRAND = "-preserve-strand"; 62 | 63 | const std::string OPTION_SAMPLE_ROWS = "-sample-rows"; 64 | 65 | std::map availableMetrics; 66 | 67 | private: 68 | 69 | 70 | 71 | public: 72 | double fraction{ 1.0 }; 73 | uint32_t kmerLength{ 18 }; 74 | 75 | int numThreads{ 0 }; 76 | int numReaderThreads{ 0 }; 77 | int cacheBufferMb{ 8 }; 78 | int bubbleSize{ 8000 }; 79 | bool multisampleFasta{ false }; 80 | 81 | double fractionStart{ 0.0 }; 82 | bool fractionSpecified{ false }; 83 | 84 | bool sparseOut{ false }; 85 | bool extendDb{ false }; 86 | bool phylipOut{ false }; 87 | 88 | int samplingSize{ 0 }; 89 | metric_fun_t samplingCriterion; 90 | 91 | 92 | InputFile::Format inputFormat { InputFile::GENOME }; 93 | Mode mode; 94 | 95 | std::shared_ptr alphabet{ AlphabetFactory::instance().create(AlphabetType::nt) }; 96 | 97 | std::vector files; 98 | 99 | std::map metricFilters; 100 | KmerFilter kmerFilter; 101 | 102 | std::string metricName; 103 | 104 | 105 | 
Params(); 106 | 107 | bool parse(int argc, char** argv); 108 | 109 | 110 | void showInstructions(Mode mode) const; 111 | 112 | void showHeader() const; 113 | 114 | /* Looks for the switch name in params; when present, removes its first occurrence and returns true. */ 115 | bool findSwitch(std::vector& params, const std::string& name) const { 116 | auto it = std::find(params.begin(), params.end(), name); // locate the switch 117 | if (it != params.end()) { 118 | params.erase(it); 119 | return true; 120 | } 121 | 122 | return false; 123 | } 124 | /* Looks for the option name followed by one value. When found and the value parses into v, both elements are removed from params and true is returned; otherwise params is left unchanged and false is returned. */ 125 | template 126 | bool findOption(std::vector& params, const std::string& name, T& v) const { 127 | if (params.size() < 2) { return false; } /* std::prev(end()) is undefined on an empty vector, and a single element cannot be a name plus a value */ auto prevToEnd = std::prev(params.end()); 128 | auto it = std::find(params.begin(), prevToEnd, name); // the last element is excluded: a value has to follow the name 129 | if (it != prevToEnd) { 130 | std::istringstream iss(*std::next(it)); 131 | if (iss >> v) { 132 | params.erase(it, it + 2); 133 | return true; 134 | } 135 | } 136 | 137 | return false; 138 | } 139 | /* Two-value variant: the option name must be followed by two elements that parse into value1 and value2; on success all three elements are removed from params. */ 140 | template 141 | bool findOption(std::vector& params, const std::string& name, T& value1, U& value2) { 142 | if (params.size() < 3) { 143 | return false; 144 | } 145 | 146 | auto stop = std::prev(params.end(), 2); 147 | auto it = find(params.begin(), stop, name); // the last two elements are excluded: two values have to follow the name 148 | if (it != stop) { 149 | if (std::istringstream(*std::next(it)) >> value1 150 | && std::istringstream(*std::next(it, 2)) >> value2) { 151 | params.erase(it, it + 3); 152 | return true; 153 | } 154 | } 155 | return false; 156 | } 157 | 158 | Mode str2mode(const std::string& str); 159 | 160 | private: 161 | void parseFilters(std::vector& params); 162 | 163 | bool parse_build(std::vector& params); 164 | bool parse_all2all(std::vector& params); 165 | bool parse_new2all(std::vector& params); 166 | bool parse_distance(std::vector& params); 167 | bool parse_minhash(std::vector& params); 168 | }; 169 | -------------------------------------------------------------------------------- /src/pattern.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of Kmer-db software 
distributed under GNU GPL 3 licence. 3 | The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db 4 | 5 | Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek 6 | 7 | */ 8 | 9 | #include "pattern.h" 10 | 11 | CEliasGamma pattern_t::elias; 12 | 13 | // ***************************************************************************************** 14 | // 15 | char* pattern_t::pack(char* buffer) const { 16 | // store members 17 | memcpy(buffer, &num_kmers, sizeof(int64_t)); 18 | buffer += sizeof(int64_t); 19 | 20 | memcpy(buffer, &parent_id, sizeof(int64_t)); 21 | buffer += sizeof(int64_t); 22 | 23 | memcpy(buffer, &num_samples, sizeof(sample_id_t)); 24 | buffer += sizeof(sample_id_t); 25 | 26 | memcpy(buffer, &num_local_samples, sizeof(sample_id_t)); 27 | buffer += sizeof(sample_id_t); 28 | 29 | memcpy(buffer, &last_sample_id, sizeof(sample_id_t)); 30 | buffer += sizeof(sample_id_t); 31 | 32 | memcpy(buffer, &num_bits, sizeof(uint32_t)); 33 | buffer += sizeof(uint32_t); 34 | 35 | uint64_t tmp = (uint64_t)is_parent; 36 | memcpy(buffer, &tmp, sizeof(uint32_t)); 37 | buffer += sizeof(uint64_t); 38 | 39 | size_t data_bytes = get_data_bytes(); 40 | if (data_bytes) { 41 | memcpy(buffer, reinterpret_cast(data), data_bytes); 42 | buffer += data_bytes; 43 | } 44 | 45 | return buffer; 46 | } 47 | 48 | // ***************************************************************************************** 49 | // 50 | char * pattern_t::unpack(char* buffer) { 51 | if (num_local_samples) { 52 | #ifdef USE_MALLOC 53 | free(data); 54 | #else 55 | delete[] data; 56 | #endif 57 | } 58 | 59 | memcpy(&num_kmers, buffer, sizeof(int64_t)); 60 | buffer += sizeof(int64_t); 61 | 62 | memcpy(&parent_id, buffer, sizeof(int64_t)); 63 | buffer += sizeof(int64_t); 64 | 65 | memcpy(&num_samples, buffer, sizeof(sample_id_t)); 66 | buffer += sizeof(sample_id_t); 67 | 68 | memcpy(&num_local_samples, buffer, sizeof(sample_id_t)); 69 | buffer += 
sizeof(sample_id_t); 70 | 71 | memcpy(&last_sample_id, buffer, sizeof(sample_id_t)); 72 | buffer += sizeof(sample_id_t); 73 | 74 | memcpy(&num_bits, buffer, sizeof(uint32_t)); 75 | buffer += sizeof(uint32_t); 76 | 77 | uint64_t tmp; 78 | memcpy(&tmp, buffer, sizeof(uint32_t)); 79 | is_parent = tmp; 80 | buffer += sizeof(uint64_t); 81 | 82 | size_t num_bytes = get_data_bytes(); 83 | 84 | if (num_bytes) { 85 | #ifdef USE_MALLOC 86 | data = (uint64_t*)malloc(num_bytes); 87 | #else 88 | data = new uint64_t[num_bytes / sizeof(uint64_t)]; 89 | #endif 90 | std::memcpy(reinterpret_cast(data), buffer, num_bytes); 91 | buffer += num_bytes; 92 | } 93 | 94 | return buffer; 95 | } 96 | 97 | // ***************************************************************************************** 98 | // 99 | void pattern_t::decodeSamples(uint32_t* out) const { 100 | if (num_local_samples) { 101 | out[num_local_samples - 1] = last_sample_id; 102 | if (num_local_samples > 1) { 103 | elias.Decode(data, num_bits, out); 104 | for (int i = num_local_samples - 2; i >= 0; --i) { 105 | out[i] = out[i + 1] - out[i]; 106 | } 107 | } 108 | } 109 | } -------------------------------------------------------------------------------- /src/pattern.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | /* 3 | This file is a part of Kmer-db software distributed under GNU GPL 3 licence. 
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek

*/

#include "types.h"
#include "elias_gamma.h"

// NOTE(review): the names of the four standard headers below are missing from the
// reviewed copy of this file - restore them from the repository.
#include
#include
#include
#include



// Reduced pattern record holding only the k-mer count, the parent link and the total
// sample count. Its only constructor is commented out.
class pattern_minimal_t {
private:
    int64_t num_kmers;                        // number of kmers with this pattern
    int64_t parent_id;                        // parent pattern id

    sample_id_t num_samples;                // number of samples in current node and its parents (cannot be larger than sample id)
/*
    pattern_minimal_t(pattern_t &v, int64_t parent_id, uint64_t num_kmers) :
        num_kmers(num_kmers), parent_id(-1), num_samples(v.num_samples + 1)
    {
        if (v.num_samples > 0) {
            v.is_parent = true;
            this->parent_id = parent_id;
        }
    }
*/
};



// *****************************************************************************************
// A node of the pattern tree: the set of sample ids stored locally in this node plus a
// link (parent_id) to the node holding the remaining ids. Local ids are kept as
// last_sample_id plus an Elias-Gamma encoded bit stream in `data`, which this object
// owns (freed in the destructor and in release()). The type is move-only.
class pattern_t {
private:
    int64_t num_kmers;                        // number of kmers with this pattern
    int64_t parent_id;                        // parent pattern id
    sample_id_t num_samples;                // number of samples in current node and its parents (cannot be larger than sample id)
    sample_id_t num_local_samples;            // number of samples in current node (cannot be larger than sample id)
    sample_id_t last_sample_id;                // id passed to the constructor or to the most recent expand()
    uint32_t num_bits;                        // number of used bits in `data`

    bool is_parent;                            // tells whether the node is parent of some other

    uint64_t* data;                            // array of samples id (Elias-Gamma encoded)

    static CEliasGamma elias;

public:

    // *****************************************************************************************
    // Plain accessors.
    uint64_t* get_data() const { return data; }

    int64_t get_num_kmers() const { return num_kmers; }

    void set_num_kmers(int64_t v) { num_kmers = v; }

    void add_num_kmers(int64_t v) { num_kmers += v; }

    sample_id_t get_num_samples() const { return num_samples; }

    sample_id_t get_num_local_samples() const { return num_local_samples; }

    size_t get_num_bits() const { return num_bits; }

    bool get_is_parrent() const { return is_parent; }

    int64_t get_parent_id() const { return parent_id; }

    // Size of the encoded payload in bytes: num_bits rounded up to whole 128-bit
    // (16-byte) units, or 0 when nothing has been encoded.
    size_t get_data_bytes() const {
        return (num_bits == 0) ? 0 : ((num_bits + 127) / 128) * 16;
    }

    // sizeof(pattern_t) plus the payload size.
    size_t get_bytes(void) const {
        return sizeof(pattern_t) + get_data_bytes();
    }

    // *****************************************************************************************
    // Empty pattern: no samples, no parent, no payload.
    pattern_t() :
        num_kmers(0), parent_id(-1), num_samples(0), num_local_samples(0),
        last_sample_id(0), num_bits(0), is_parent(false), data(nullptr)
    {
    }

    // *****************************************************************************************
    // Root pattern containing the single sample `x`.
    pattern_t(sample_id_t x, uint64_t num_kmers) :
        num_kmers(num_kmers), parent_id(-1), num_samples(1), num_local_samples(1),
        last_sample_id(x), num_bits(0), is_parent(false), data(nullptr)

    {
    }

    // *****************************************************************************************
    // Child pattern: the samples of `v` plus the single local sample `x`.
    // When `v` is non-empty it is marked as a parent and linked through `parent_id`;
    // when `v` is empty the new node stays a root (parent_id == -1).
    pattern_t(pattern_t &v, int64_t parent_id, sample_id_t x, uint64_t num_kmers) :
        num_kmers(num_kmers), parent_id(-1), num_samples(v.num_samples + 1),
        num_local_samples(1), last_sample_id(x), num_bits(0), is_parent(false), data(nullptr)
    {
        if (v.num_samples > 0) {
            v.is_parent = true;
            this->parent_id = parent_id;
        }
    }


    // *****************************************************************************************
    //
/*    void from_minimal(const pattern_minimal_t& ref, sample_id_t sample_id) {
        this->num_kmers = ref.num_kmers;
        this->parent_id = ref.parent_id;
        this->num_samples = ref.num_samples;
        this->last_sample_id = sample_id;
        this->num_local_samples = 1;
    }
    */
    // *****************************************************************************************
    // Copying is disabled because the object owns `data`.
    pattern_t(const pattern_t &v) = delete;

    // *****************************************************************************************
    // Move constructor. It delegates to move assignment while this object's members are
    // still uninitialized; that is only valid because move assignment never reads the
    // previous state of *this.
    pattern_t(pattern_t &&v) noexcept
    {
        *this = std::move(v);
    }

    // *****************************************************************************************
    // Frees the payload, using the deallocator that matches USE_MALLOC.
    ~pattern_t()
    {
        if (data)
        {
#ifdef USE_MALLOC
            free(data);
#else
            delete[] data;
#endif
        }
    }

    // *****************************************************************************************
    //
    pattern_t& operator=(const pattern_t &v) = delete;

    // *****************************************************************************************
    // Move assignment: takes over every field of `v` and resets `v` to the empty state
    // (except v.is_parent, which is left unchanged).
    // NOTE(review): the buffer previously held in this->data is overwritten without
    // being freed, so assigning onto a pattern that already owns a payload leaks it.
    // Releasing it here would break the move constructor above (it calls this operator
    // on uninitialized members) - both would have to change together.
    pattern_t& operator=(pattern_t &&v) noexcept {
        is_parent = v.is_parent;
        num_samples = v.num_samples;
        num_local_samples = v.num_local_samples;
        num_kmers = v.num_kmers;
        data = v.data;
        parent_id = v.parent_id;
        num_bits = v.num_bits;
        last_sample_id = v.last_sample_id;

        v.data = nullptr;
        v.parent_id = -1;
        v.num_kmers = 0;
        v.num_local_samples = 0;
        v.num_samples = 0;
        v.num_bits = 0;
        v.last_sample_id = 0;

        return *this;
    }

    // *****************************************************************************************
    // Frees the payload and nulls the pointer; the counters (including num_bits) are
    // left untouched.
    void release() {
        if (data)
        {
#ifdef USE_MALLOC
            free(data);
#else
            delete[] data;
#endif
            data = nullptr;
        }
    }

    // *****************************************************************************************
    // Appends sample `x` to the local samples: the difference to the previous last id
    // is handed to elias.Encode() together with `data` and `num_bits`, and `x` becomes
    // the new last_sample_id.
    // NOTE(review): presumably Encode() grows `data` and advances `num_bits` - confirm
    // against elias_gamma.h.
    void expand(const sample_id_t x)
    {
        ++num_samples;
        ++num_local_samples;

        uint32_t delta = x - last_sample_id;
        elias.Encode(delta, data, num_bits);
        last_sample_id = x;
    }

    // *****************************************************************************************
    //
    char* pack(char* buffer) const;

    // *****************************************************************************************
    //
    char * unpack(char* buffer);

    // *****************************************************************************************
    //
    void decodeSamples(uint32_t* out) const;
};

--------------------------------------------------------------------------------
/src/sampler.h:
--------------------------------------------------------------------------------
#pragma once

// NOTE(review): the names of the four standard headers below, the template parameter
// list of Sampler and the element types of its three vectors are missing from the
// reviewed copy of this file - restore them from the repository.
#include
#include
#include
#include

#include "conversion.h"
#include "../libs/refresh/sort/lib/pdqsort_par.h"

using namespace std;

// *****************************************************************************************
// Keeps, independently for each of `no_samples` rows, at most `max_items_per_sample`
// (item, value, score) entries. Once a row is full, every further add() discards one
// entry according to the chosen strategy.
template
class Sampler
{
public:
    enum class strategy_t {none, best, random};

private:
    size_t no_samples;
    size_t max_items_per_sample;

    struct item_t
    {
        Item item;
        Value value;
        Score score;

        item_t(Item item, Value value, Score score) :
            item(item), value(value), score(score)
        {}

        item_t(const item_t&) = default;
        item_t(item_t&&) = default;
        item_t& operator=(const item_t&) = default;
        item_t& operator=(item_t&&) = default;
    };

    vector> data;                // entries of every row
    vector data_sizes;            // random strategy only: number of add() calls per row
    vector mts;                    // random strategy only: one generator per row
    strategy_t strategy{ strategy_t::none };

    // Heap ordering: a higher score compares as "smaller", ties are broken by item.
    // With the std heap functions this keeps the lowest-scoring entry at front().
    static bool heap_comparer(const item_t& x, const item_t& y)
    {
        if (x.score != y.score)
            return x.score > y.score;
        return x.item < y.item;
    }

    // Turns the whole row into a heap ordered by heap_comparer.
    void prepare_heap(size_t sample_id)
    {
        make_heap(data[sample_id].begin(), data[sample_id].end(), &(this->heap_comparer));
    }

    // "best" strategy; the row holds a heap plus the freshly appended entry at back().
    // The newcomer is either dropped right away, or pushed into the heap after which
    // the heap top is moved to the back; in both cases pop_back() removes one entry.
    void select_best(size_t sample_id)
    {
        if (data[sample_id].back().score >= data[sample_id].front().score)        // if worse than min of heap, do not push into heap
        {
            push_heap(data[sample_id].begin(), data[sample_id].end(), &(this->heap_comparer));
            pop_heap(data[sample_id].begin(), data[sample_id].end(), &(this->heap_comparer));
        }
        data[sample_id].pop_back();
    }

    // "random" strategy; the freshly appended entry is at back(). Either the newcomer
    // is dropped, or it overwrites a randomly chosen slot; pop_back() then removes the
    // now redundant last element.
    // NOTE(review): the newcomer is dropped when a random draw is divisible by the
    // number of add() calls so far, i.e. with a probability that shrinks as the row
    // grows. Uniform reservoir sampling would keep it with probability
    // max_items_per_sample / data_sizes instead - confirm this is intended.
    void select_random(size_t sample_id)
    {
        if (mts[sample_id]() % data_sizes[sample_id] == 0)            // remove the new one?
            ;
        else
        {
            size_t id = mts[sample_id]() % max_items_per_sample;    // which to remove
            data[sample_id][id] = data[sample_id].back();
        }

        data[sample_id].pop_back();
    }

public:
    // Creates `no_samples` empty rows. Each row reserves one slot more than the limit
    // because add() appends first and trims afterwards.
    // NOTE(review): the generators are default-constructed by resize(), so every row
    // presumably starts from the same seed - confirm that this is acceptable.
    Sampler(size_t no_samples, size_t max_items_per_sample, strategy_t strategy) :
        no_samples(no_samples),
        max_items_per_sample(max_items_per_sample),
        strategy(strategy)
    {
        data.resize(no_samples);
        if (strategy == strategy_t::random)
        {
            data_sizes.resize(no_samples, 0);
            mts.resize(no_samples);
        }

        for (auto& x : data)
            x.reserve(max_items_per_sample + 1);
    }

    // Appends an entry to row `sample_id` and, once the row exceeds the limit, removes
    // one entry according to the strategy. With strategy_t::none nothing is removed.
    void add(size_t sample_id, Item item, Value value, Score score)
    {
        assert(sample_id < no_samples);

        data[sample_id].emplace_back(item, value, score);

        if(strategy == strategy_t::random)
            data_sizes[sample_id]++;

        // the heap is built exactly once, at the moment the row becomes full
        if (strategy == strategy_t::best && data[sample_id].size() == max_items_per_sample)
            prepare_heap(sample_id);

        if (data[sample_id].size() <= max_items_per_sample)
            return;

        switch (strategy)
        {
        case strategy_t::best:
            select_best(sample_id);
            break;
        case strategy_t::random:
            select_random(sample_id);
            break;
        }
    }

    // Sorts row `row_id` by item (in place) and writes it to `out` as a sequence of
    // "<idx_shift + item + 1>:<value>," entries. Returns the number of characters
    // written. No terminator is appended and the capacity of `out` is not checked.
    // NOTE(review): relies on num2str() returning the number of characters it wrote -
    // confirm against conversion.h.
    int saveRowSparse(size_t row_id, char* out, uint32_t idx_shift)
    {
        auto out0 = out;

        refresh::sort::pdqsort(data[row_id].begin(), data[row_id].end(), [](const auto& x, const auto& y) {return x.item < y.item; });

        for (auto& x : data[row_id])
        {
            out += num2str(idx_shift + x.item + 1, out);
            *out++ = ':';
            out += num2str(x.value, out);
            *out++ = ',';
        }

        return (int) (out - out0);
    }

    // Number of entries currently kept in row `row_id`.
    size_t getNoInRow(size_t row_id) const
    {
        return data[row_id].size();
    }
};
--------------------------------------------------------------------------------
/src/simd/row_add.h:
--------------------------------------------------------------------------------
#pragma once
/*
This file is a part of Kmer-db software distributed under GNU GPL 3 licence.
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek

*/

// NOTE(review): the header name is missing from the reviewed copy of this file.
#include

// uncomment this to disable AVX2 compilation
//#define NO_AVX2


// Adds `to_add` to row[src_ids[i]] for every i in [0, num_elems).
// Dispatcher; exactly one definition is compiled in (see row_add_avx.cpp,
// row_add_avx2.cpp and row_add_neon.cpp).
void row_add(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add, bool avx2_present);

#if defined(ARCH_X64)
void row_add_avx(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add);
void row_add_avx2(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add);
#endif

#if defined(ARCH_ARM)
void row_add_neon(uint32_t* row, uint32_t* src_ids, uint32_t num_elems, uint32_t to_add);
#endif

--------------------------------------------------------------------------------
/src/simd/row_add_avx.cpp:
--------------------------------------------------------------------------------
/*
This file is a part of Kmer-db software distributed under GNU GPL 3 licence.
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek

*/

#include "row_add.h"
// NOTE(review): the header name is missing from the reviewed copy of this file.
#include


// *****************************************************************************************
// Dispatcher used when AVX2 support is compiled out: always forwards to the 128-bit
// implementation; `avx2_present` is ignored.
#ifdef NO_AVX2
void row_add(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add, bool avx2_present) {
    row_add_avx(row, src_ids, num_elems, to_add);
}
#endif


// *****************************************************************************************
// Adds `to_add` to row[src_ids[i]] for every i in [0, num_elems).
//
// The ids are consumed in groups of 16. When the first and the last id of a group
// differ by exactly 15 the group is treated as one contiguous run and updated with four
// 128-bit unaligned load/add/store operations; otherwise the 16 cells are updated one
// by one. The loop body handles two groups per iteration.
// NOTE(review): the contiguity test looks only at the two end ids, so it assumes that
// src_ids is strictly increasing - confirm with the callers.
void row_add_avx(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add)
{
    __m128i _to_add = _mm_set1_epi32((int)to_add);
    auto p = src_ids;

    uint32_t j;

    // When there is an odd number of whole 16-id groups, enter the loop body at its
    // second half. j starts at (uint32_t)-16, so the loop's `j += 32` wraps it to 16,
    // which is exactly the number of ids consumed by that half iteration.
    if (num_elems % 32 >= 16)
    {
        j = -16;
        goto inner_start;
    }

    for (j = 0; j + 32 <= num_elems; j += 32)
    {
        // first group of 16 ids
        if (*p + 15 == *(p + 15))
        {
            auto _q = (__m128i*) (row + *p);

            _mm_storeu_si128(_q, _mm_add_epi32(_mm_loadu_si128(_q), _to_add));
            _mm_storeu_si128(_q + 1, _mm_add_epi32(_mm_loadu_si128(_q + 1), _to_add));
            _mm_storeu_si128(_q + 2, _mm_add_epi32(_mm_loadu_si128(_q + 2), _to_add));
            _mm_storeu_si128(_q + 3, _mm_add_epi32(_mm_loadu_si128(_q + 3), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }

        // second group of 16 ids (also the entry point for the odd leading group)
    inner_start:
        if (*p + 15 == *(p + 15))
        {
            auto _q = (__m128i*) (row + *p);

            _mm_storeu_si128(_q, _mm_add_epi32(_mm_loadu_si128(_q), _to_add));
            _mm_storeu_si128(_q + 1, _mm_add_epi32(_mm_loadu_si128(_q + 1), _to_add));
            _mm_storeu_si128(_q + 2, _mm_add_epi32(_mm_loadu_si128(_q + 2), _to_add));
            _mm_storeu_si128(_q + 3, _mm_add_epi32(_mm_loadu_si128(_q + 3), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }
    }
    // j is the number of ids consumed so far; fewer than 16 remain
    num_elems -= j;

    // tail: every case deliberately falls through to the next one
    switch (num_elems % 16)
    {
    case 15:    row[*p++] += to_add;
    case 14:    row[*p++] += to_add;
    case 13:    row[*p++] += to_add;
    case 12:    row[*p++] += to_add;
    case 11:    row[*p++] += to_add;
    case 10:    row[*p++] += to_add;
    case 9:    row[*p++] += to_add;
    case 8:    row[*p++] += to_add;
    case 7:    row[*p++] += to_add;
    case 6:    row[*p++] += to_add;
    case 5:    row[*p++] += to_add;
    case 4:    row[*p++] += to_add;
    case 3:    row[*p++] += to_add;
    case 2:    row[*p++] += to_add;
    case 1:    row[*p++] += to_add;
    }
}

--------------------------------------------------------------------------------
/src/simd/row_add_avx2.cpp:
--------------------------------------------------------------------------------
/*
This file is a part of Kmer-db software distributed under GNU GPL 3 licence.
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek

*/

#include "row_add.h"
// NOTE(review): the names of the two headers below are missing from the reviewed copy
// of this file.
#include
#include

// prevent from compiling following functions when NO_AVX2 is defined
#ifndef NO_AVX2

// *****************************************************************************************
// Dispatcher: on ARCH_X64 picks the 256-bit or the 128-bit implementation depending on
// `avx2_present`; on any other architecture forwards to the NEON implementation.
void row_add(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add, bool avx2_present) {
#if defined(ARCH_X64)
    if (avx2_present)
        row_add_avx2(row, src_ids, num_elems, to_add);
    else
        row_add_avx(row, src_ids, num_elems, to_add);
#else
    row_add_neon(row, src_ids, num_elems, to_add);
#endif
}

// *****************************************************************************************
// Adds `to_add` to row[src_ids[i]] for every i in [0, num_elems).
//
// The ids are consumed in groups of 16. When the first and the last id of a group
// differ by exactly 15 the group is treated as one contiguous run and updated with two
// 256-bit unaligned load/add/store operations; otherwise the 16 cells are updated one
// by one. The loop body handles two groups per iteration.
// NOTE(review): the contiguity test looks only at the two end ids, so it assumes that
// src_ids is strictly increasing - confirm with the callers.
void row_add_avx2(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add)
{
    __m256i _to_add = _mm256_set1_epi32((int)to_add);
    auto p = src_ids;

    uint32_t j;

    // When there is an odd number of whole 16-id groups, enter the loop body at its
    // second half. j starts at (uint32_t)-16, so the loop's `j += 32` wraps it to 16,
    // which is exactly the number of ids consumed by that half iteration.
    if (num_elems % 32 >= 16)
    {
        j = -16;
        goto inner_start;
    }

    for (j = 0; j + 32 <= num_elems; j += 32)
    {
        // first group of 16 ids
        if (*p + 15 == *(p + 15))
        {
            auto _q = (__m256i*) (row + *p);

            _mm256_storeu_si256(_q, _mm256_add_epi32(_mm256_loadu_si256(_q), _to_add));
            _mm256_storeu_si256(_q+1, _mm256_add_epi32(_mm256_loadu_si256(_q+1), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }

        // second group of 16 ids (also the entry point for the odd leading group)
    inner_start:
        if (*p + 15 == *(p + 15))
        {
            auto _q = (__m256i*) (row + *p);

            _mm256_storeu_si256(_q, _mm256_add_epi32(_mm256_loadu_si256(_q), _to_add));
            _mm256_storeu_si256(_q + 1, _mm256_add_epi32(_mm256_loadu_si256(_q + 1), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }
    }
    // j is the number of ids consumed so far; fewer than 16 remain
    num_elems -= j;

    // tail: every case deliberately falls through to the next one
    switch (num_elems % 16)
    {
    case 15:    row[*p++] += to_add;
    case 14:    row[*p++] += to_add;
    case 13:    row[*p++] += to_add;
    case 12:    row[*p++] += to_add;
    case 11:    row[*p++] += to_add;
    case 10:    row[*p++] += to_add;
    case 9:    row[*p++] += to_add;
    case 8:    row[*p++] += to_add;
    case 7:    row[*p++] += to_add;
    case 6:    row[*p++] += to_add;
    case 5:    row[*p++] += to_add;
    case 4:    row[*p++] += to_add;
    case 3:    row[*p++] += to_add;
    case 2:    row[*p++] += to_add;
    case 1:    row[*p++] += to_add;
    }
}

#endif
--------------------------------------------------------------------------------
/src/simd/row_add_neon.cpp:
--------------------------------------------------------------------------------
/*
This file is a part of Kmer-db software distributed under GNU GPL 3 licence.
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek

*/

#if defined(ARCH_ARM)

#include "row_add.h"
// NOTE(review): the header name is missing from the reviewed copy of this file.
#include

// *****************************************************************************************
// Dispatcher for ARM builds: always forwards to the NEON implementation;
// `avx2_present` is ignored.
void row_add(uint32_t* row, uint32_t* src_ids, uint32_t num_elems, uint32_t to_add, bool avx2_present) {
    row_add_neon(row, src_ids, num_elems, to_add);
}

// *****************************************************************************************
// Adds `to_add` to row[src_ids[i]] for every i in [0, num_elems).
//
// The ids are consumed in groups of 16. When the first and the last id of a group
// differ by exactly 15 the group is treated as one contiguous run and updated with four
// 4-lane load/add/store operations; otherwise the 16 cells are updated one by one.
// The loop body handles two groups per iteration.
// NOTE(review): the contiguity test looks only at the two end ids, so it assumes that
// src_ids is strictly increasing - confirm with the callers.
void row_add_neon(uint32_t *row, uint32_t *src_ids, uint32_t num_elems, uint32_t to_add)
{
    uint32x4_t _to_add = vdupq_n_u32(to_add);
    auto p = src_ids;

    uint32_t j;

    // When there is an odd number of whole 16-id groups, enter the loop body at its
    // second half. j starts at (uint32_t)-16, so the loop's `j += 32` wraps it to 16,
    // which is exactly the number of ids consumed by that half iteration.
    if (num_elems % 32 >= 16)
    {
        j = -16;
        goto inner_start;
    }

    for (j = 0; j + 32 <= num_elems; j += 32)
    {
        // first group of 16 ids
        if (*p + 15 == *(p + 15))
        {
            auto _q = (row + *p);

//            _mm_storeu_si128(_q, _mm_add_epi32(_mm_loadu_si128(_q), _to_add));
            vst1q_u32(_q, vaddq_u32(vld1q_u32(_q), _to_add));
            vst1q_u32(_q + 4, vaddq_u32(vld1q_u32(_q + 4), _to_add));
            vst1q_u32(_q + 8, vaddq_u32(vld1q_u32(_q + 8), _to_add));
            vst1q_u32(_q + 12, vaddq_u32(vld1q_u32(_q + 12), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }

        // second group of 16 ids (also the entry point for the odd leading group)
    inner_start:
        if (*p + 15 == *(p + 15))
        {
            auto _q = (row + *p);

            vst1q_u32(_q, vaddq_u32(vld1q_u32(_q), _to_add));
            vst1q_u32(_q + 4, vaddq_u32(vld1q_u32(_q + 4), _to_add));
            vst1q_u32(_q + 8, vaddq_u32(vld1q_u32(_q + 8), _to_add));
            vst1q_u32(_q + 12, vaddq_u32(vld1q_u32(_q + 12), _to_add));

            p += 16;
        }
        else
        {
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
            row[*p++] += to_add;
        }
    }
    // j is the number of ids consumed so far; fewer than 16 remain
    num_elems -= j;

    // tail: every case deliberately falls through to the next one
    switch (num_elems % 16)
    {
    case 15:    row[*p++] += to_add;
    case 14:    row[*p++] += to_add;
    case 13:    row[*p++] += to_add;
    case 12:    row[*p++] += to_add;
    case 11:    row[*p++] += to_add;
    case 10:    row[*p++] += to_add;
    case 9:    row[*p++] += to_add;
    case 8:    row[*p++] += to_add;
    case 7:    row[*p++] += to_add;
    case 6:    row[*p++] += to_add;
    case 5:    row[*p++] += to_add;
    case 4:    row[*p++] += to_add;
    case 3:    row[*p++] += to_add;
    case 2:    row[*p++] += to_add;
    case 1:    row[*p++] += to_add;
    }
}
#endif
--------------------------------------------------------------------------------
/src/similarity_calculator.h:
--------------------------------------------------------------------------------
#pragma once
#include "prefix_kmer_db.h"

// Declares the similarity computations between samples (all-vs-all, one-vs-all and
// database-vs-database, each in a dense and a sparse "_sp" flavour) together with the
// helpers that expand a pattern into its list of sample ids.
// NOTE(review): template parameter lists and template arguments (e.g. of the matrix
// and vector types) are missing from the reviewed copy of this file - restore them
// from the repository.
class SimilarityCalculator {
public:
    SimilarityCalculator(int _num_threads, size_t cacheBufferMb);

    void all2all(PrefixKmerDb& db, LowerTriangularMatrix& matrix) const;
    void all2all_sp(PrefixKmerDb& db, SparseMatrix& matrix, CBubbleHelper& bubbles) const;

    template
    void one2all(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector& similarities) const;
    void one2all_sp(const PrefixKmerDb& db, const kmer_t* kmers, size_t kmersCount, std::vector>& similarities) const;

    void db2db(const PrefixKmerDb& db1, const PrefixKmerDb& db2, LowerTriangularMatrix& matrix) const;
    void db2db_sp(PrefixKmerDb& db1, PrefixKmerDb& db2, SparseMatrix& matrix, CBubbleHelper& bubbles) const;

protected:

    static const int PREFETCH_DIST = 48;

    int num_threads;

    size_t cacheBufferMb;

    mutable refresh::active_thread_pool atp;

    bool avx2_present;

    // Writes all sample ids of pattern `pid` (its own and those of all its ancestors)
    // into `samples` and returns their number.
    // The parent chain is walked from `pid` towards the root; each node's local ids are
    // decoded into the slots directly in front of the ids written so far, so the output
    // is filled from the end. Before a node is decoded, a prefetch is issued for its
    // parent. `samples` must have room for get_num_samples() entries.
    template
    int decode_pattern_samples_prefetch(const vector& patterns, int pid, uint32_t* samples) const
    {
        const auto& pattern = patterns[pid];
        int num_samples = pattern.get_num_samples();

        uint32_t* out = samples + pattern.get_num_samples(); // start from the end

        int64_t current_id = pid;
        while (current_id >= 0) {
            const auto& cur = patterns[current_id];
            auto parent_id = cur.get_parent_id();

            // the prefetch below is the body of this if; the #ifdef only selects which call
            if (parent_id >= 0)
#ifdef WIN32
                _mm_prefetch((const char*)(patterns.data() + parent_id), _MM_HINT_T0);
#else
                __builtin_prefetch(patterns.data() + parent_id);
#endif

            out -= cur.get_num_local_samples();
            cur.decodeSamples(out);

            current_id = parent_id;
        }

        return num_samples;
    }

    // Same as decode_pattern_samples_prefetch() but without the prefetch of the parent.
    int decode_pattern_samples(const vector& patterns, int pid, uint32_t* samples) const
    {
        const auto& pattern = patterns[pid];
        int num_samples = pattern.get_num_samples();

        uint32_t* out = samples + pattern.get_num_samples(); // start from the end

        int64_t current_id = pid;
        while (current_id >= 0) {
            const auto& cur = patterns[current_id];

            out -= cur.get_num_local_samples();
            cur.decodeSamples(out);

            current_id = cur.get_parent_id();
        }

        return num_samples;
    }
};
--------------------------------------------------------------------------------
/src/sparse_filters.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "types.h" 8 | 9 | using metric_fun_t = std::function; 10 | 11 | // Filter applied on metric 12 | struct MetricFilter { 13 | 14 | double bounds[2] { std::numeric_limits::lowest(), std::numeric_limits::max() }; 15 | 16 | metric_fun_t metric; 17 | 18 | // calculate measure and check if passed 19 | bool operator()(num_kmers_t common, num_kmers_t cnt1, num_kmers_t cnt2, int kmerLength) const { 20 | double value = metric(common, cnt1, cnt2, kmerLength); 21 | return (value >= bounds[0] && value <= bounds[1]); 22 | } 23 | }; 24 | 25 | // Filter applied on kmer count 26 | struct KmerFilter { 27 | num_kmers_t bounds[2] { 0, std::numeric_limits::max() }; 28 | 29 | bool operator()(num_kmers_t n) const { return n >= bounds[0] && n <= bounds[1]; } 30 | }; 31 | 32 | 33 | template 34 | struct CombinedFilter { 35 | 36 | const std::map& metricFilters; 37 | const KmerFilter& kmerFilter; 38 | const std::vector& rowCounts; 39 | const std::vector& colCounts; 40 | int kmerLength; 41 | 42 | CombinedFilter(const std::map& metricFilters, const KmerFilter& kmerFilter, const std::vector& rowCounts, const std::vector& colCounts, int kmerLength) : 43 | metricFilters(metricFilters), 44 | kmerFilter(kmerFilter), 45 | rowCounts(rowCounts), 46 | colCounts(colCounts), 47 | kmerLength(kmerLength) {} 48 | 49 | bool operator()(T common, int row_id, int col_id) const { 50 | for (const auto& filter : metricFilters) { 51 | if (!filter.second(common, rowCounts[row_id], colCounts[col_id], kmerLength)) { 52 | return false; 53 | } 54 | } 55 | 56 | if (!kmerFilter(common)) { 57 | return false; 58 | } 59 | return true; 60 | } 61 | 62 | }; -------------------------------------------------------------------------------- /src/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 
| 5 | #define KMER_MSB (1ULL << 63) 6 | 7 | #define SUFFIX_BITS 32 8 | 9 | #define SUFFIX_MASK ((1ULL << SUFFIX_BITS) - 1) 10 | #define PREFIX_MASK (~SUFFIX_MASK) 11 | 12 | typedef uint64_t kmer_t; 13 | typedef uint32_t suffix_t; 14 | 15 | typedef int32_t pattern_id_t; 16 | typedef uint32_t sample_id_t; 17 | 18 | typedef uint32_t num_kmers_t; 19 | 20 | union kmer_or_pattern_t { 21 | kmer_t kmer; 22 | pattern_id_t pattern_id; 23 | }; 24 | 25 | #define GET_PREFIX(kmer) ((kmer) & PREFIX_MASK) 26 | #define GET_SUFFIX(kmer) (static_cast((kmer) & SUFFIX_MASK)) 27 | #define GET_PREFIX_SHIFTED(kmer) ((kmer) >> SUFFIX_BITS) 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VERSION "2.3.1" 4 | #define DATE "20.12.2024" 5 | 6 | /* 7 | 8 | Version history 9 | 10 | 2.3.1 (20.12.2024) 11 | - Improved sparse processing of samples in new2all mode. 12 | 13 | 2.3.0 (16.12.2024) 14 | - Added amino acid alphabets and -preserve-strand option. 15 | 16 | 2.2.5 (02.11.2024) 17 | - Improved logging. 18 | 19 | 2.2.4 (31.10.2024) 20 | - Improved support of patterns shared by large number of samples ('bubbles'). 21 | 22 | 2.2.3 (21.10.2024) 23 | - Added `-version` switch. 24 | 25 | 2.2.2 (04.10.2024) 26 | - Fixed bug with filtering/sampling in all2all variants and new2all always using 18 as a kmer length. 27 | 28 | 2.2.1 (03.10.2024) 29 | - Fixed slight slow down in `build` mode introduced in the previous version. 30 | 31 | 2.2.0 (02.10.2024) 32 | - Added rows sampling (`-sample-rows` option) in `all2all-sp` and `all2all-parts` modes. 33 | 34 | 2.1.0 (27.09.2024) 35 | - Added unified filtering based on any specified measure (parameters `-above`, `-above_eq`, `-below`, and `-below_eq` replaced with `-min` and `-max` options). 
36 | - Changed interface in `distance` mode: only one measure allowed, output file has to be specified. 37 | 38 | 2.0.6 (23.09.2024) 39 | - Speed-ups in all2all-sp and all2all-parts modes. 40 | 41 | 2.0.5 (12.09.2024) 42 | - Updates in tests and automatic building scripts. 43 | 44 | 2.0.4 (20.08.2024) 45 | - Above and below options working correctly in all2all-sp mode. 46 | 47 | 2.0.3 (28.06.2024) 48 | - Fixed bug with empty sample. 49 | 50 | 2.0.2 (19.06.2024) 51 | - Some unnecessary stuff removed from the database. 52 | 53 | 2.0.1 (18.06.2024) 54 | - Improved parallelization scheme. 55 | 56 | 57 | 2.0.0 (31.05.2024) 58 | - Added new modes: all2all_sparse, all2all_parts, 59 | - Serious time and memory optimizations, 60 | - Support of MacOS (Apple and x86 CPUs) and ARM platforms. 61 | 62 | 63 | 1.11.1 (07.03.2023) 64 | - Removed deadlock in the -multisample-fasta mode. 65 | 66 | 1.11.0 (27.02.2023) 67 | - Added -below and -above thresholds for all2all and new2all modes. 68 | 69 | 1.10.0 (18.07.2022) 70 | - Added support of sparse inputs in distance mode, 71 | - Added support of sparse outputs in distance mode (-sparse switch) with optional filtering (-above/-below options), 72 | - Extended help information. 73 | 74 | 1.9.4 (19.04.2022) 75 | - fixed database construction for very small samples (#kmers < #threads) 76 | - fixed synchronization issues in new2all mode (non-deterministic row order in output matrix). 77 | - fixed deadlock during database construction when -multisample-fasta mode is run on more than one file. 78 | 79 | 1.9.3 (27.08.2021) 80 | - Disabled h-mer hashtables loading in all2all mode. 81 | - Fast CSV saving in all2all and new2all modes. 82 | 83 | 1.9.2 (16.08.2021) 84 | - Synchronization bugfix in new2all. 85 | 86 | 1.9.1 (11.08.2021) 87 | - Output matrices can be stored in sparse format (-sparse switch). 88 | - Better workload balancing. 
89 | 90 | 1.9.0 (09.08.2021) 91 | - Improved parallelization scheme in new2all mode (few-fold speed improvement). 92 | - Reduced memory footprint of -multisample-fasta mode. 93 | - More than one input FASTA files supported in -multisample-fasta mode. 94 | 95 | 1.8.0 (19.03.2021) 96 | - Added -extend switch which allows extending existing kmer database. 97 | - Serialization/deserialization works much faster now. 98 | - Fixed serious bug in -multisample-fasta mode which caused incorrect kmers counting. 99 | 100 | 1.7.6 (31.03.2020) 101 | - Fixed bug in distance mode when sequence id contained spaces. 102 | 103 | 1.7.5 (13.02.2020) 104 | - Some compilation warnings removed. 105 | - Fixed crash on samples with small k-mers count or very small filter values. 106 | 107 | 1.7.4 (29.01.2020) 108 | - Proper handling of triangle input matrices in `distance` mode (triangle outputs are produced). 109 | 110 | 1.7.3 (17.01.2020) 111 | - Fixed rare bug in hashtable when k-mer containing only T bases was treated as an empty entry. Now an empty item is indicated by a special value instead of a special key. 112 | 113 | 1.7.2 (15.01.2020) 114 | - Added new distance measure `-mash-query` which is a mash distance calculated w.r.t. a query length (use if the query is much shorter than database sequences). 115 | - C++11 compatibility (compiles with G++ 4.8.5). 116 | 117 | 1.7.1 (31.10.2019) 118 | - Possibility to specify low threshold of k-mer minhash filter (-f-start parameter). 119 | - When loading genome files, exact filenames are examined first. If this fails, an attempt to add predefined extensions is made. 120 | 121 | 1.7.0 (27.09.2019) 122 | - For performance reasons upper triangle (with diagonal) of distance matrix in all2all mode is no longer saved. 123 | - Preparations for raw serialization of hashtables. 124 | 125 | 1.6.2: (28.05.2019) After data structure update - stable 126 | Note: Starting from this release version numbering conforms to major.minor.patch scheme. 
127 | Added: 128 | - Switch -phylip-out in distance mode which allows storing distance/similarity matrices in Phylip format. 129 | 130 | Fixed several bugs from 1.51 release: 131 | - Incorrect support of k-mer lengths < 16. 132 | - Very long processing of long k-mers (k >= 26). 133 | - Segmentation fault when storing minhashed k-mers on a disk (minhash mode). 134 | 135 | 136 | 1.51 (11.04.2019) 137 | - Serious reduction of time and memory requirements of build mode caused by the changes of the data structures. 138 | E.g., when tested on full k-mer spectrum of 40715 pathogen genomes, time and memory footprint decreased by 1/3 (1h30 to 1h, 60 to 40GB). 139 | - Several new parameters added. 140 | - Lots of bugs fixed. 141 | 142 | 1.20 (12.02.2019) 143 | Changes: 144 | - new2all mode added, 145 | - uniform output table format for all2all, new2all, and one2all modes, 146 | 147 | Bugs fixed: 148 | - proper support of samples with no k-mers of given length, 149 | - problems with building database from minhashed k-mers. 150 | 151 | 1.12 (19.07.2018) 152 | - Support of no-AVX2 build. 153 | 154 | 1.11 (13.07.2018) 155 | - Proper handling of uncompressed genomes. 156 | - Linking against pre-installed zlib available. 157 | 158 | 1.1 (12.06.2018) 159 | - File loading refactored. 160 | - Small bugs fixed. 
161 | 162 | 1.0 (10.02.2018) 163 | Initial release 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | */ 179 | -------------------------------------------------------------------------------- /test/ictv/ictv.list: -------------------------------------------------------------------------------- 1 | ./ictv.phage.fna -------------------------------------------------------------------------------- /test/protein/translate.py: -------------------------------------------------------------------------------- 1 | from Bio.SeqIO.FastaIO import SimpleFastaParser 2 | import random 3 | 4 | aa2dna = { 'F': 'TTT', 'L': 'CTT', 'I': 'ATT', 'M': 'ATG', 'V': 'GTT', 5 | 'S': 'TCT', 'P': 'CCT', 'T': 'ACT', 'A': 'GCT', 'Y': 'TAT', 6 | 'H': 'CAT', 'Q': 'CAA', 'N': 'AAT', 'K': 'AAA', 'D': 'GAT', 7 | 'E': 'GAA', 'C': 'TGT', 'W': 'TGG', 'R': 'CGT', 'G': 'GGT'} 8 | 9 | aa = [*aa2dna.keys()] 10 | 11 | kmer_len = 8 12 | 13 | # generate 100000 random k-mers (duplicates are possible) 14 | aa_kmers = [] 15 | dna_fwd_kmers = [] 16 | dna_rev_kmers = [] 17 | 18 | rev_cmp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 19 | 20 | for i in range(100000): 21 | aa_seq = [random.choice(aa) for j in range(kmer_len)] 22 | dna_seq = [aa2dna[c] for c in aa_seq] 23 | 24 | aa_str = ''.join(aa_seq) 25 | dna_str = ''.join(dna_seq) 26 | dna_rev_str = ''.join([rev_cmp[c] for c in dna_str[::-1]]) 27 | 28 | aa_kmers.append(aa_str) 29 | dna_fwd_kmers.append(dna_str) 30 | dna_rev_kmers.append(dna_rev_str) 31 | 32 | dna_kmers = [dna_fwd_kmers, dna_rev_kmers] 33 | 34 | # generate proteomes and genomes 35 | n_samples = 100 36 | 37 | for n_kmers_per_sample in [1, 1000]: 38 | aa_file = open(f'aa_{n_samples}x{n_kmers_per_sample}.fasta', 'w') 39 | dna_file = open(f'dna_{n_samples}x{n_kmers_per_sample}.fasta', 'w') 40 | 41 | for i in range(0, n_samples): 42 | 43 | indices = [random.choice(range(0,len(aa_kmers))) for j in range(n_kmers_per_sample)] 44 | fwd_or_rev = [random.choice([0, 1]) for j in 
range(n_kmers_per_sample)] 45 | 46 | aa_seq = [aa_kmers[j] for j in indices] 47 | dna_seq =[dna_kmers[fwd_or_rev[j]][indices[j]] for j in range(n_kmers_per_sample)] 48 | 49 | aa_str = '.'.join(aa_seq) 50 | dna_str = '.'.join(dna_seq) 51 | id = f'>seq_{i+1}\n' 52 | 53 | aa_file.write(id) 54 | aa_file.write(aa_str) 55 | aa_file.write('\n') 56 | 57 | dna_file.write(id) 58 | dna_file.write(dna_str) 59 | dna_file.write('\n') 60 | 61 | aa_file.close() 62 | dna_file.close() 63 | -------------------------------------------------------------------------------- /test/run-dev.bat: -------------------------------------------------------------------------------- 1 | set EXEC="./../../src/x64/Release/kmer-db.exe" 2 | set INPUT_DIR=./virus 3 | 4 | rem %EXEC% build %INPUT_DIR%/seqs.list k18.db 5 | 6 | rem %EXEC% all2all k18.db k18.csv 7 | 8 | rem %EXEC% all2all -sparse k18.db k18.sparse.csv 9 | 10 | rem %EXEC% build -f 0.1 %INPUT_DIR%/seqs.list k18.frac.db 11 | rem %EXEC% all2all k18.frac.db k18.frac.csv 12 | 13 | rem %EXEC% minhash 0.1 %INPUT_DIR%/seqs.list 14 | rem %EXEC% build -from-minhash -k 25 %INPUT_DIR%/seqs.list k18.minhash.db 15 | rem %EXEC% all2all k18.minhash.db k18.minhash.csv 16 | 17 | 18 | rem %EXEC% build -multisample-fasta %INPUT_DIR%/multi.list k18.multi.db 19 | rem %EXEC% all2all k18.multi.db k18.multi.csv 20 | 21 | rem %EXEC% build %INPUT_DIR%/seqs.part1.list k18.parts.db 22 | rem %EXEC% build -extend -k 25 %INPUT_DIR%/seqs.part2.list k18.parts.db 23 | rem %EXEC% all2all k18.parts.db k18.parts.csv 24 | 25 | rem %EXEC% build -k 24 %INPUT_DIR%/seqs.list k24.db 26 | rem %EXEC% all2all k24.db k24.csv 27 | 28 | rem %EXEC% build %INPUT_DIR%/seqs.part1.list k18.parts.22.db 29 | rem %EXEC% new2all k18.parts.22.db %INPUT_DIR%/seqs.part2.list k18.n2a.22.csv 30 | 31 | rem %EXEC% new2all -sparse k18.parts.db %INPUT_DIR%/seqs.part2.list k18.n2a.sparse.csv 32 | 33 | rem %EXEC% new2all k18.db %INPUT_DIR%/seqs.list k18.n2a.itself.csv 34 | 35 | rem %EXEC% distance jaccard min 
max cosine mash k18.csv 36 | 37 | 38 | rem %EXEC% build -k 25 -f 0.1 -t 16 %INPUT_DIR%/seqs.part1.list k25.db 39 | rem %EXEC% one2all k25.db %INPUT_DIR%/data/MT159713 MT159713.csv 40 | 41 | cd %INPUT_DIR% 42 | 43 | %EXEC% build seqs.part1-local.list k18.part1.db 44 | %EXEC% build seqs.part2-local.list k18.part2.db 45 | %EXEC% all2all-parts multi.db.list k18.parts.csv 46 | 47 | cd .. -------------------------------------------------------------------------------- /test/run-ictv.bat: -------------------------------------------------------------------------------- 1 | set EXEC="./../../src/x64/Release/kmer-db.exe" 2 | set INPUT_DIR=./ictv 3 | 4 | cd %INPUT_DIR% 5 | 6 | #%EXEC% build -multisample-fasta -k 25 ictv.list k25.db 7 | #%EXEC% all2all-sp -above 10 k25.db a2a-above10.csv 8 | #%EXEC% distance ani-shorter -above 0.7 a2a-above10.csv 9 | 10 | #%EXEC% build -multisample-fasta -k 25 ictv.list k25.db 11 | %EXEC% all2all-sp -min num-kmers:11 k25.db a2a-min11.csv 12 | %EXEC% distance ani-shorter -min ani-shorter:0.7 a2a-min11.csv a2a-min11.dist-min0p7.csv 13 | 14 | %EXEC% all2all-sp -min num-kmers:11 -min ani-shorter:0.7 k25.db a2a-min11-min0p7.csv 15 | %EXEC% distance ani-shorter a2a-min11-min0p7.csv a2a-min11-min0p7.dist.csv 16 | %EXEC% distance ani-shorter -min ani-shorter:0.7 a2a-min11-min0p7.csv a2a-min11-min0p7.dist-min0p7.csv 17 | 18 | cd .. 
-------------------------------------------------------------------------------- /test/run-protein.bat: -------------------------------------------------------------------------------- 1 | set EXEC="./../../src/x64/Release/kmer-db.exe" 2 | set INPUT_DIR=./protein 3 | 4 | cd %INPUT_DIR% 5 | 6 | 7 | %EXEC% build -t 1 -multisample-fasta -k 24 dna_100x1000.fasta dna.db 8 | %EXEC% build -t 1 -multisample-fasta -k 24 -preserve-strand dna_100x1000.fasta dna-preserve.db 9 | %EXEC% build -t 1 -multisample-fasta -k 8 -alphabet aa aa_100x1000.fasta aa.db 10 | 11 | %EXEC% build -t 1 -multisample-fasta -k 7 -alphabet aa aa_100x1000.fasta aa_k7.db 12 | 13 | %EXEC% build -t 1 -multisample-fasta -k 8 -alphabet aa11_diamond aa_100x1000.fasta aa11_diamond.db 14 | %EXEC% build -t 1 -multisample-fasta -k 8 -alphabet aa12_mmseqs aa_100x1000.fasta aa12_mmseqs.db 15 | %EXEC% build -t 1 -multisample-fasta -k 8 -alphabet aa6_dayhoff aa_100x1000.fasta aa6_dayhoff.db 16 | 17 | rem dense inputs 18 | %EXEC% all2all -t 1 dna.db dna.a2a 19 | %EXEC% all2all -t 1 dna-preserve.db dna-preserve.a2a 20 | %EXEC% all2all -t 1 aa.db aa.a2a 21 | %EXEC% all2all -t 1 aa_k7.db aa_k7.a2a 22 | %EXEC% all2all -t 1 aa11_diamond.db aa11_diamond.a2a 23 | %EXEC% all2all -t 1 aa12_mmseqs.db aa12_mmseqs.a2a 24 | %EXEC% all2all -t 1 aa6_dayhoff.db aa6_dayhoff.a2a 25 | 26 | 27 | cd ../ -------------------------------------------------------------------------------- /test/run-synth.bat: -------------------------------------------------------------------------------- 1 | set EXEC="./../../src/x64/Release/kmer-db.exe" 2 | set INPUT_DIR=./synth 3 | 4 | cd %INPUT_DIR% 5 | 6 | %EXEC% build -multisample-fasta -k 21 synth-local.list synth.db 7 | 8 | rem dense inputs 9 | %EXEC% all2all synth.db a2a 10 | %EXEC% distance mash a2a a2a.mash 11 | %EXEC% distance ani a2a a2a.ani 12 | %EXEC% distance -sparse ani a2a a2a.sparse-ani 13 | %EXEC% distance -sparse -above -1.0 -below 1.0 mash a2a a2a.sparse-mash 14 | %EXEC% distance -sparse 
-min 0.03 -max mash:1.0 mash a2a a2a.sparse-mash-minmax 15 | %EXEC% distance -sparse -min 0.03 -max mash:1.0 -min num-kmers:36 mash a2a a2a.mash-sparse-min2max 16 | 17 | %EXEC% new2all -multisample-fasta synth.db synth-local.list n2a 18 | %EXEC% distance mash n2a n2a.mash 19 | %EXEC% distance ani n2a n2a.ani 20 | %EXEC% distance -sparse mash n2a n2a.sparse-mash 21 | %EXEC% distance -sparse ani n2a n2a.sparse-ani 22 | 23 | rem sparse inputs 24 | %EXEC% all2all -sparse synth.db a2a-sparse 25 | %EXEC% distance mash a2a-sparse a2a-sparse.mash 26 | %EXEC% distance ani a2a-sparse a2a-sparse.ani 27 | %EXEC% distance -sparse ani a2a-sparse a2a-sparse.sparse-ani 28 | %EXEC% distance -sparse mash a2a-sparse a2a-sparse.sparse-mash 29 | 30 | %EXEC% new2all -multisample-fasta -sparse synth.db synth-local.list n2a-sparse 31 | %EXEC% distance mash n2a-sparse n2a-sparse.mash 32 | %EXEC% distance ani n2a-sparse n2a-sparse.ani 33 | %EXEC% distance -sparse mash n2a-sparse n2a-sparse.sparse-mash 34 | %EXEC% distance -sparse ani n2a-sparse n2a-sparse.sparse-ani 35 | 36 | %EXEC% all2all-sp synth.db a2a-sp 37 | 38 | cd ../ -------------------------------------------------------------------------------- /test/synth/a2a: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80, 4 | B,80,36, 5 | C,39,13,34, 6 | D,31,0,0,0, 7 | E,80,80,36,13,0, 8 | -------------------------------------------------------------------------------- /test/synth/a2a-sparse: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80, 4 | B,80,1:36, 5 | C,39,1:13,2:34, 6 | D,31, 7 | E,80,1:80,2:36,3:13, 8 | -------------------------------------------------------------------------------- /test/synth/a2a-sparse.ani: 
-------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,1:0.961976, 4 | C,1:0.927570,2:0.973352, 5 | D, 6 | E,1:1.000000,2:0.961976,3:0.927570, 7 | -------------------------------------------------------------------------------- /test/synth/a2a-sparse.mash: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,1:0.038024, 4 | C,1:0.072430,2:0.026648, 5 | D, 6 | E,1:0,2:0.038024,3:0.072430, 7 | -------------------------------------------------------------------------------- /test/synth/a2a.ani: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,0.961976, 4 | C,0.927570,0.973352, 5 | D,0,0,0, 6 | E,1.000000,0.961976,0.927570,0, 7 | -------------------------------------------------------------------------------- /test/synth/a2a.mash: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,0.038024, 4 | C,0.072430,0.026648, 5 | D,1.000000,1.000000,1.000000, 6 | E,0,0.038024,0.072430,1.000000, 7 | -------------------------------------------------------------------------------- /test/synth/a2a.mash-sparse-min2max: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,1:0.038024, 4 | C, 5 | D, 6 | E,2:0.038024, 7 | -------------------------------------------------------------------------------- /test/synth/a2a.mash.above-below: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A, 3 | B,1:0.038024, 4 | C,1:0.072430, 5 | D, 6 | E,2:0.038024,3:0.072430, 7 | -------------------------------------------------------------------------------- /test/synth/a2a.sparse.above-below: 
-------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80, 4 | B,80,1:36, 5 | C,39,2:34, 6 | D,31, 7 | E,80,2:36, 8 | -------------------------------------------------------------------------------- /test/synth/n2a: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80,80,36,13,0,80, 4 | B,80,36,80,34,0,36, 5 | C,39,13,34,39,0,13, 6 | D,31,0,0,0,31,0, 7 | E,80,80,36,13,0,80, 8 | -------------------------------------------------------------------------------- /test/synth/n2a-sparse: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80,1:80,2:36,3:13,5:80, 4 | B,80,1:36,2:80,3:34,5:36, 5 | C,39,1:13,2:34,3:39,5:13, 6 | D,31,4:31, 7 | E,80,1:80,2:36,3:13,5:80, 8 | -------------------------------------------------------------------------------- /test/synth/n2a-sparse.ani: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A,1:1.000000,2:0.961976,3:0.927570,5:1.000000, 3 | B,1:0.961976,2:1.000000,3:0.973352,5:0.961976, 4 | C,1:0.927570,2:0.973352,3:1.000000,5:0.927570, 5 | D,4:1.000000, 6 | E,1:1.000000,2:0.961976,3:0.927570,5:1.000000, 7 | -------------------------------------------------------------------------------- /test/synth/n2a-sparse.mash: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A,1:0,2:0.038024,3:0.072430,5:0, 3 | B,1:0.038024,2:0,3:0.026648,5:0.038024, 4 | C,1:0.072430,2:0.026648,3:0,5:0.072430, 5 | D,4:0, 6 | E,1:0,2:0.038024,3:0.072430,5:0, 7 | 
-------------------------------------------------------------------------------- /test/synth/n2a.ani: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A,1.000000,0.961976,0.927570,0,1.000000, 3 | B,0.961976,1.000000,0.973352,0,0.961976, 4 | C,0.927570,0.973352,1.000000,0,0.927570, 5 | D,0,0,0,1.000000,0, 6 | E,1.000000,0.961976,0.927570,0,1.000000, 7 | -------------------------------------------------------------------------------- /test/synth/n2a.mash: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,A,B,C,D,E, 2 | A,0,0.038024,0.072430,1.000000,0, 3 | B,0.038024,0,0.026648,1.000000,0.038024, 4 | C,0.072430,0.026648,0,1.000000,0.072430, 5 | D,1.000000,1.000000,1.000000,0,1.000000, 6 | E,0,0.038024,0.072430,1.000000,0, 7 | -------------------------------------------------------------------------------- /test/synth/n2a.sparse.above-below: -------------------------------------------------------------------------------- 1 | kmer-length: 21 fraction: 1 ,db-samples ,A,B,C,D,E, 2 | query-samples,total-kmers,80,80,39,31,80, 3 | A,80,2:36, 4 | B,80,1:36,3:34,5:36, 5 | C,39,2:34,3:39, 6 | D,31,4:31, 7 | E,80,2:36, 8 | -------------------------------------------------------------------------------- /test/synth/synth-local.list: -------------------------------------------------------------------------------- 1 | synth.fa -------------------------------------------------------------------------------- /test/synth/synth.fa: -------------------------------------------------------------------------------- 1 | >A 2 | GAGGGGCCCACAGCGAGGAAGTAAACTGTTATTCGTCGGCGATGGTGGTAGCTAATTATGTTCCTTGCCACTACAATAGTATCTAAGCCGTGTAATGGGA 3 | >B 4 | GAGGGGCCCACAGCGAGGAAGTAAACTGTTATACGTCGGCGATGGTGGTAGCTAATTATGTTCCTTGCGACTACAATAGTATCTAAGCCGTGTAATGGCA 5 | >C 6 | GAGGAGCCCACAGCGAGGAAGTAAACTGTTATACGTCGGCGATGGTGGTAGCTAATTAT 7 | >D 8 | 
TAGTATAGGAGATCTAGATAGGAGATAGAGCGATGAGAGAGCGCGCAAAAA 9 | >E 10 | GAGGGGCCCACAGCGAGGAAGTAAACTGTTATTCGTCGGCGATGGTGGTAGCTAATTATGTTCCTTGCCACTACAATAGTATCTAAGCCGTGTAATGGGA -------------------------------------------------------------------------------- /test/synth/synth.list: -------------------------------------------------------------------------------- 1 | ./test/synth/synth.fa -------------------------------------------------------------------------------- /test/virus/MT159713.csv: -------------------------------------------------------------------------------- 1 | kmer-length: 25 fraction: 0.1 ,db-samples ,NC_045512,MT253708,MT253709,MT251972,MT251973,MT253696,MT253697,MT253703,MT253704,MT253705,MT253706,MT253707,MT253700,MT253701,MT253698,MT253699,MT253710,MT251974,MT251975,MT251976,MT251977,MT251978,MT251979,MT251980,MT253702,MT246463,MT246464,MT246467,MT246469,MT246471,MT246473,MT246476,MT246489,MT246468,MT246472,MT246452,MT246453,MT246454,MT246455,MT246460,MT246461,MT246462,MT246466,MT246456,MT246450,MT246451,MT246458,MT246459,MT246470,MT246474,MT246475,MT246477,MT246478,MT246479,MT246480,MT246481,MT246482,MT246484,MT246485,MT246487,MT246488,MT246667,MT246486,MT246490,MT233526,MT246449,MT246457,MT240479,MT233522,MT233519,MT233523,MT226610,MT198652,MT192765,MT192759,MT192773,MT192772,MT188340,MT188339,MT188341,MT184911,MT184913,MT184910,MT184907,MT184912,MT184908,MT184909,MT163719,MT163716,MT163717,MT163718,MT159708,MT121215,MT159720,MT159706,MT159709,MT159721,MT066156,MT159707,MT159711, 2 | 
query-samples,total-kmers,3017,3010,3010,3013,2999,3010,3010,3010,3010,3010,3008,3010,3010,3010,3010,3010,3010,2997,3022,3003,3011,3019,2994,3014,3010,2984,3015,3001,3012,3017,3004,3006,3017,3009,3015,3023,2997,3022,3024,3004,3018,3021,3015,3002,2999,3002,2988,3012,3002,3010,3024,3015,3022,3010,3003,2998,3012,3000,2998,3000,3017,3019,3012,2998,3017,2993,3016,3019,2960,3007,3013,3033,3010,3003,3014,3020,3018,3015,3018,3015,3007,3008,3024,3017,3022,3009,3017,3017,3024,3020,3017,3015,3014,3017,3016,3017,3017,3020,3017,3017, 3 | ./test/virus/data/MT159713,3017,3017,3010,3010,2998,2992,3010,3010,3010,3010,3010,3008,3010,3010,3010,3010,3010,3010,2992,2995,2990,2992,3004,2987,2984,3010,2971,2998,2990,2995,2994,2979,2991,2998,2995,2990,3006,2984,3005,2995,2996,2999,3005,3002,2976,2992,2988,2971,2996,2996,2994,2998,3000,3003,2993,2998,2992,2987,2987,2978,2989,2998,3010,2994,2991,3008,2986,2991,3000,2953,2986,2992,2968,2989,2998,3014,3014,3017,3008,2993,2987,2994,3002,3005,3017,3017,3001,3017,3001,3000,3005,3004,3014,3010,3007,3016,3014,3017,3014,3014,3017, -------------------------------------------------------------------------------- /test/virus/multi.list: -------------------------------------------------------------------------------- 1 | ./test/virus/data/seqs.fasta -------------------------------------------------------------------------------- /test/virus/multi.part2.list: -------------------------------------------------------------------------------- 1 | ./test/virus/data/seqs.part2.fasta -------------------------------------------------------------------------------- /test/virus/multi.split.list: -------------------------------------------------------------------------------- 1 | ./test/virus/data/seqs.part1.fasta 2 | ./test/virus/data/seqs.part2.fasta -------------------------------------------------------------------------------- /test/virus/seqs.list: -------------------------------------------------------------------------------- 1 | 
./test/virus/data/NC_045512 2 | ./test/virus/data/MT253708 3 | ./test/virus/data/MT253709 4 | ./test/virus/data/MT251972 5 | ./test/virus/data/MT251973 6 | ./test/virus/data/MT253696 7 | ./test/virus/data/MT253697 8 | ./test/virus/data/MT253703 9 | ./test/virus/data/MT253704 10 | ./test/virus/data/MT253705 11 | ./test/virus/data/MT253706 12 | ./test/virus/data/MT253707 13 | ./test/virus/data/MT253700 14 | ./test/virus/data/MT253701 15 | ./test/virus/data/MT253698 16 | ./test/virus/data/MT253699 17 | ./test/virus/data/MT253710 18 | ./test/virus/data/MT251974 19 | ./test/virus/data/MT251975 20 | ./test/virus/data/MT251976 21 | ./test/virus/data/MT251977 22 | ./test/virus/data/MT251978 23 | ./test/virus/data/MT251979 24 | ./test/virus/data/MT251980 25 | ./test/virus/data/MT253702 26 | ./test/virus/data/MT246463 27 | ./test/virus/data/MT246464 28 | ./test/virus/data/MT246467 29 | ./test/virus/data/MT246469 30 | ./test/virus/data/MT246471 31 | ./test/virus/data/MT246473 32 | ./test/virus/data/MT246476 33 | ./test/virus/data/MT246489 34 | ./test/virus/data/MT246468 35 | ./test/virus/data/MT246472 36 | ./test/virus/data/MT246452 37 | ./test/virus/data/MT246453 38 | ./test/virus/data/MT246454 39 | ./test/virus/data/MT246455 40 | ./test/virus/data/MT246460 41 | ./test/virus/data/MT246461 42 | ./test/virus/data/MT246462 43 | ./test/virus/data/MT246466 44 | ./test/virus/data/MT246456 45 | ./test/virus/data/MT246450 46 | ./test/virus/data/MT246451 47 | ./test/virus/data/MT246458 48 | ./test/virus/data/MT246459 49 | ./test/virus/data/MT246470 50 | ./test/virus/data/MT246474 51 | ./test/virus/data/MT246475 52 | ./test/virus/data/MT246477 53 | ./test/virus/data/MT246478 54 | ./test/virus/data/MT246479 55 | ./test/virus/data/MT246480 56 | ./test/virus/data/MT246481 57 | ./test/virus/data/MT246482 58 | ./test/virus/data/MT246484 59 | ./test/virus/data/MT246485 60 | ./test/virus/data/MT246487 61 | ./test/virus/data/MT246488 62 | ./test/virus/data/MT246667 63 | 
./test/virus/data/MT246486 64 | ./test/virus/data/MT246490 65 | ./test/virus/data/MT233526 66 | ./test/virus/data/MT246449 67 | ./test/virus/data/MT246457 68 | ./test/virus/data/MT240479 69 | ./test/virus/data/MT233522 70 | ./test/virus/data/MT233519 71 | ./test/virus/data/MT233523 72 | ./test/virus/data/MT226610 73 | ./test/virus/data/MT198652 74 | ./test/virus/data/MT192765 75 | ./test/virus/data/MT192759 76 | ./test/virus/data/MT192773 77 | ./test/virus/data/MT192772 78 | ./test/virus/data/MT188340 79 | ./test/virus/data/MT188339 80 | ./test/virus/data/MT188341 81 | ./test/virus/data/MT184911 82 | ./test/virus/data/MT184913 83 | ./test/virus/data/MT184910 84 | ./test/virus/data/MT184907 85 | ./test/virus/data/MT184912 86 | ./test/virus/data/MT184908 87 | ./test/virus/data/MT184909 88 | ./test/virus/data/MT163719 89 | ./test/virus/data/MT163716 90 | ./test/virus/data/MT163717 91 | ./test/virus/data/MT163718 92 | ./test/virus/data/MT159708 93 | ./test/virus/data/MT121215 94 | ./test/virus/data/MT159720 95 | ./test/virus/data/MT159706 96 | ./test/virus/data/MT159709 97 | ./test/virus/data/MT159721 98 | ./test/virus/data/MT066156 99 | ./test/virus/data/MT159707 100 | ./test/virus/data/MT159711 101 | ./test/virus/data/MT159713 102 | ./test/virus/data/MT159718 103 | ./test/virus/data/MT159712 104 | ./test/virus/data/MT159719 105 | ./test/virus/data/MT159717 106 | ./test/virus/data/MT159715 107 | ./test/virus/data/MT159716 108 | ./test/virus/data/MT159722 109 | ./test/virus/data/MT159705 110 | ./test/virus/data/MT159710 111 | ./test/virus/data/MT159714 112 | ./test/virus/data/MT012098 113 | ./test/virus/data/MT050493 114 | ./test/virus/data/MT152824 115 | ./test/virus/data/MT135041 116 | ./test/virus/data/MT135042 117 | ./test/virus/data/MT135043 118 | ./test/virus/data/MT135044 119 | ./test/virus/data/MT126808 120 | ./test/virus/data/MT123293 121 | ./test/virus/data/MT123290 122 | ./test/virus/data/MT123291 123 | ./test/virus/data/MT123292 124 | 
./test/virus/data/MT118835 125 | ./test/virus/data/MT106053 126 | ./test/virus/data/MT106052 127 | ./test/virus/data/MT106054 128 | ./test/virus/data/MT093571 129 | ./test/virus/data/MT093631 130 | ./test/virus/data/MT072688 131 | ./test/virus/data/MT066175 132 | ./test/virus/data/MT066176 133 | ./test/virus/data/MT044258 134 | ./test/virus/data/MT049951 135 | ./test/virus/data/MT044257 136 | ./test/virus/data/MT039873 137 | ./test/virus/data/MT039887 138 | ./test/virus/data/MT039888 139 | ./test/virus/data/MT039890 140 | ./test/virus/data/MT027063 141 | ./test/virus/data/MT027064 142 | ./test/virus/data/MT027062 143 | ./test/virus/data/MT020880 144 | ./test/virus/data/MT019529 145 | ./test/virus/data/MT019531 146 | ./test/virus/data/MT019530 147 | ./test/virus/data/MT019533 148 | ./test/virus/data/MT019532 149 | ./test/virus/data/MT020881 150 | ./test/virus/data/MT007544 151 | ./test/virus/data/MN996527 152 | ./test/virus/data/MN996528 153 | ./test/virus/data/MN996529 154 | ./test/virus/data/MN996530 155 | ./test/virus/data/MN996531 156 | ./test/virus/data/MN994467 157 | ./test/virus/data/MN988669 158 | ./test/virus/data/MN994468 159 | ./test/virus/data/MN997409 160 | ./test/virus/data/MN988668 161 | ./test/virus/data/MN988713 162 | ./test/virus/data/MN938384 163 | ./test/virus/data/MN975262 164 | ./test/virus/data/MN985325 165 | ./test/virus/data/MN908947 -------------------------------------------------------------------------------- /test/virus/seqs.part1.list: -------------------------------------------------------------------------------- 1 | ./test/virus/data/NC_045512 2 | ./test/virus/data/MT253708 3 | ./test/virus/data/MT253709 4 | ./test/virus/data/MT251972 5 | ./test/virus/data/MT251973 6 | ./test/virus/data/MT253696 7 | ./test/virus/data/MT253697 8 | ./test/virus/data/MT253703 9 | ./test/virus/data/MT253704 10 | ./test/virus/data/MT253705 11 | ./test/virus/data/MT253706 12 | ./test/virus/data/MT253707 13 | ./test/virus/data/MT253700 14 | 
./test/virus/data/MT253701 15 | ./test/virus/data/MT253698 16 | ./test/virus/data/MT253699 17 | ./test/virus/data/MT253710 18 | ./test/virus/data/MT251974 19 | ./test/virus/data/MT251975 20 | ./test/virus/data/MT251976 21 | ./test/virus/data/MT251977 22 | ./test/virus/data/MT251978 23 | ./test/virus/data/MT251979 24 | ./test/virus/data/MT251980 25 | ./test/virus/data/MT253702 26 | ./test/virus/data/MT246463 27 | ./test/virus/data/MT246464 28 | ./test/virus/data/MT246467 29 | ./test/virus/data/MT246469 30 | ./test/virus/data/MT246471 31 | ./test/virus/data/MT246473 32 | ./test/virus/data/MT246476 33 | ./test/virus/data/MT246489 34 | ./test/virus/data/MT246468 35 | ./test/virus/data/MT246472 36 | ./test/virus/data/MT246452 37 | ./test/virus/data/MT246453 38 | ./test/virus/data/MT246454 39 | ./test/virus/data/MT246455 40 | ./test/virus/data/MT246460 41 | ./test/virus/data/MT246461 42 | ./test/virus/data/MT246462 43 | ./test/virus/data/MT246466 44 | ./test/virus/data/MT246456 45 | ./test/virus/data/MT246450 46 | ./test/virus/data/MT246451 47 | ./test/virus/data/MT246458 48 | ./test/virus/data/MT246459 49 | ./test/virus/data/MT246470 50 | ./test/virus/data/MT246474 51 | ./test/virus/data/MT246475 52 | ./test/virus/data/MT246477 53 | ./test/virus/data/MT246478 54 | ./test/virus/data/MT246479 55 | ./test/virus/data/MT246480 56 | ./test/virus/data/MT246481 57 | ./test/virus/data/MT246482 58 | ./test/virus/data/MT246484 59 | ./test/virus/data/MT246485 60 | ./test/virus/data/MT246487 61 | ./test/virus/data/MT246488 62 | ./test/virus/data/MT246667 63 | ./test/virus/data/MT246486 64 | ./test/virus/data/MT246490 65 | ./test/virus/data/MT233526 66 | ./test/virus/data/MT246449 67 | ./test/virus/data/MT246457 68 | ./test/virus/data/MT240479 69 | ./test/virus/data/MT233522 70 | ./test/virus/data/MT233519 71 | ./test/virus/data/MT233523 72 | ./test/virus/data/MT226610 73 | ./test/virus/data/MT198652 74 | ./test/virus/data/MT192765 75 | ./test/virus/data/MT192759 76 | 
./test/virus/data/MT192773 77 | ./test/virus/data/MT192772 78 | ./test/virus/data/MT188340 79 | ./test/virus/data/MT188339 80 | ./test/virus/data/MT188341 81 | ./test/virus/data/MT184911 82 | ./test/virus/data/MT184913 83 | ./test/virus/data/MT184910 84 | ./test/virus/data/MT184907 85 | ./test/virus/data/MT184912 86 | ./test/virus/data/MT184908 87 | ./test/virus/data/MT184909 88 | ./test/virus/data/MT163719 89 | ./test/virus/data/MT163716 90 | ./test/virus/data/MT163717 91 | ./test/virus/data/MT163718 92 | ./test/virus/data/MT159708 93 | ./test/virus/data/MT121215 94 | ./test/virus/data/MT159720 95 | ./test/virus/data/MT159706 96 | ./test/virus/data/MT159709 97 | ./test/virus/data/MT159721 98 | ./test/virus/data/MT066156 99 | ./test/virus/data/MT159707 100 | ./test/virus/data/MT159711 -------------------------------------------------------------------------------- /test/virus/seqs.part2.list: -------------------------------------------------------------------------------- 1 | ./test/virus/data/MT159713 2 | ./test/virus/data/MT159718 3 | ./test/virus/data/MT159712 4 | ./test/virus/data/MT159719 5 | ./test/virus/data/MT159717 6 | ./test/virus/data/MT159715 7 | ./test/virus/data/MT159716 8 | ./test/virus/data/MT159722 9 | ./test/virus/data/MT159705 10 | ./test/virus/data/MT159710 11 | ./test/virus/data/MT159714 12 | ./test/virus/data/MT012098 13 | ./test/virus/data/MT050493 14 | ./test/virus/data/MT152824 15 | ./test/virus/data/MT135041 16 | ./test/virus/data/MT135042 17 | ./test/virus/data/MT135043 18 | ./test/virus/data/MT135044 19 | ./test/virus/data/MT126808 20 | ./test/virus/data/MT123293 21 | ./test/virus/data/MT123290 22 | ./test/virus/data/MT123291 23 | ./test/virus/data/MT123292 24 | ./test/virus/data/MT118835 25 | ./test/virus/data/MT106053 26 | ./test/virus/data/MT106052 27 | ./test/virus/data/MT106054 28 | ./test/virus/data/MT093571 29 | ./test/virus/data/MT093631 30 | ./test/virus/data/MT072688 31 | ./test/virus/data/MT066175 32 | 
./test/virus/data/MT066176 33 | ./test/virus/data/MT044258 34 | ./test/virus/data/MT049951 35 | ./test/virus/data/MT044257 36 | ./test/virus/data/MT039873 37 | ./test/virus/data/MT039887 38 | ./test/virus/data/MT039888 39 | ./test/virus/data/MT039890 40 | ./test/virus/data/MT027063 41 | ./test/virus/data/MT027064 42 | ./test/virus/data/MT027062 43 | ./test/virus/data/MT020880 44 | ./test/virus/data/MT019529 45 | ./test/virus/data/MT019531 46 | ./test/virus/data/MT019530 47 | ./test/virus/data/MT019533 48 | ./test/virus/data/MT019532 49 | ./test/virus/data/MT020881 50 | ./test/virus/data/MT007544 51 | ./test/virus/data/MN996527 52 | ./test/virus/data/MN996528 53 | ./test/virus/data/MN996529 54 | ./test/virus/data/MN996530 55 | ./test/virus/data/MN996531 56 | ./test/virus/data/MN994467 57 | ./test/virus/data/MN988669 58 | ./test/virus/data/MN994468 59 | ./test/virus/data/MN997409 60 | ./test/virus/data/MN988668 61 | ./test/virus/data/MN988713 62 | ./test/virus/data/MN938384 63 | ./test/virus/data/MN975262 64 | ./test/virus/data/MN985325 65 | ./test/virus/data/MN908947 --------------------------------------------------------------------------------