├── .github └── workflows │ ├── deploy.yml │ ├── main.yml │ └── self-hosted.yml ├── .gitignore ├── .gitmodules ├── FAMSA.sln ├── LICENSE ├── README.md ├── libs └── prebuild.bat ├── makefile ├── run.sh ├── src ├── core │ ├── defs.h │ ├── io_service.cpp │ ├── io_service.h │ ├── params.cpp │ ├── params.h │ ├── profile.cpp │ ├── profile.h │ ├── profile_par.cpp │ ├── profile_seq.cpp │ ├── queues.cpp │ ├── queues.h │ ├── sequence.cpp │ ├── sequence.h │ └── version.h ├── famsa.cpp ├── famsa.vcxproj ├── famsa.vcxproj.filters ├── lcs │ ├── lcsbp.cpp │ ├── lcsbp.h │ ├── lcsbp_avx2_intr.cpp │ ├── lcsbp_avx2_intr.h │ ├── lcsbp_avx_intr.cpp │ ├── lcsbp_avx_intr.h │ ├── lcsbp_classic.cpp │ ├── lcsbp_classic.h │ ├── lcsbp_neon_intr.cpp │ └── lcsbp_neon_intr.h ├── msa.cpp ├── msa.h ├── msa_refinement.cpp ├── tree │ ├── AbstractTreeGenerator.cpp │ ├── AbstractTreeGenerator.h │ ├── AbstractTreeGenerator.hpp │ ├── Chained.h │ ├── Clustering.cpp │ ├── Clustering.h │ ├── DistanceCalculator.cpp │ ├── DistanceCalculator.h │ ├── FastTree.cpp │ ├── FastTree.h │ ├── GuideTree.cpp │ ├── GuideTree.h │ ├── IPartialGenerator.h │ ├── MSTPrim.cpp │ ├── MSTPrim.h │ ├── NeighborJoining.cpp │ ├── NeighborJoining.h │ ├── NewickParser.cpp │ ├── NewickParser.h │ ├── SingleLinkage.cpp │ ├── SingleLinkage.h │ ├── SingleLinkageQueue.h │ ├── TreeDefs.h │ ├── UPGMA.cpp │ └── UPGMA.h └── utils │ ├── array.h │ ├── conversion.h │ ├── cpuid.h │ ├── deterministic_random.h │ ├── log.cpp │ ├── log.h │ ├── memory_monotonic.h │ ├── meta_oper.h │ ├── pooled_threads.cpp │ ├── pooled_threads.h │ ├── statistics.h │ ├── timer.cpp │ ├── timer.h │ ├── utils.cpp │ ├── utils.h │ ├── utils_avx.cpp │ ├── utils_avx2.cpp │ └── utils_neon.cpp └── test ├── LRR ├── LRR └── sl.dnd ├── adeno_fiber ├── adeno_fiber ├── dist.csv ├── dist_sq.csv ├── gaps.fasta ├── pid.csv ├── pid_sq.csv ├── sl.dnd ├── sl.fasta ├── slink.dnd ├── upgma.dnd ├── upgma.fasta ├── upgma.no_refine.fasta ├── upgma.no_refine.part1.fasta ├── 
upgma.no_refine.part2.fasta └── upgma.pp.fasta ├── adeno_fiber_duplicates ├── adeno_fiber_duplicates ├── sl.dnd └── sl.fasta ├── adeno_fiber_extra ├── adeno_fiber_extra └── ref.fasta ├── dummy ├── many-seq ├── many-seq.aln ├── one-seq ├── one-seq.aln ├── two-seq └── two-seq.aln ├── hemopexin ├── hemopexin ├── medoid-nj-params.dnd ├── medoid-nj.dnd ├── medoid-nj.fasta ├── medoid-sl-params.dnd ├── medoid-sl.dnd ├── medoid-sl.fasta ├── medoid-slink-params.dnd ├── medoid-slink.dnd ├── medoid-upgma-params.dnd ├── medoid-upgma.dnd └── medoid-upgma.fasta ├── hemopexin_duplicates ├── hemopexin_duplicates ├── medoid-sl-dups.dnd ├── medoid-sl-dups.fasta ├── medoid-sl.dnd └── medoid-sl.fasta └── scripts ├── reorder.py └── split.py /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | 3 | on: 4 | release: 5 | types: 6 | - created 7 | 8 | jobs: 9 | 10 | 11 | ######################################################################################## 12 | checkout: 13 | name: Checkout 14 | strategy: 15 | matrix: 16 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 17 | runs-on: [self-hosted, kmer-db, '${{ matrix.machine }}'] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | submodules: recursive 23 | 24 | ######################################################################################## 25 | make: 26 | name: Make 27 | needs: checkout 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | machine: [x64_linux] 32 | platform: [avx2] 33 | compiler: [g++-13] 34 | static: [true] 35 | include: 36 | - {machine: arm64_linux, platform: arm8, compiler: g++-12, static: true} 37 | - {machine: x64_mac, platform: avx2, compiler: g++-13, static: false} 38 | - {machine: arm64_mac, platform: m1, compiler: g++-13, static: false} 39 | 40 | runs-on: [self-hosted, famsa, '${{ matrix.machine }}'] 41 | 42 | steps: 43 | - name: make 44 | run: | 45 | make clean 46 | make -j32 CXX=${{matrix.compiler}} 
STATIC_LINK=${{ matrix.static }} PLATFORM=${{ matrix.platform }} 47 | - name: tar artifacts 48 | run: tar -cvzf famsa.tar.gz famsa LICENSE 49 | 50 | 51 | ######################################################################################## 52 | help: 53 | name: Print usage 54 | needs: make 55 | strategy: 56 | fail-fast: false 57 | matrix: 58 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 59 | runs-on: [self-hosted, famsa, '${{ matrix.machine }}'] 60 | 61 | steps: 62 | - name: help 63 | run: ./famsa 64 | 65 | ######################################################################################## 66 | upload: 67 | name: Upload 68 | needs: help 69 | strategy: 70 | fail-fast: false 71 | matrix: 72 | machine: [x64_linux, x64_mac, arm64_linux, arm64_mac] 73 | runs-on: [self-hosted, famsa, '${{ matrix.machine }}'] 74 | 75 | steps: 76 | - name: deploy 77 | uses: actions/upload-release-asset@v1.0.1 78 | env: 79 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 80 | with: 81 | upload_url: ${{ github.event.release.upload_url }} 82 | asset_path: ./famsa.tar.gz 83 | asset_name: famsa-${{ github.event.release.tag_name }}-${{matrix.machine}}.tar.gz 84 | asset_content_type: application/gzip 85 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions CI 2 | 3 | on: 4 | push: 5 | branches: [ master, experimental, develop ] 6 | paths-ignore: 7 | - '**.md' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | 12 | make-tests: 13 | name: Make 14 | strategy: 15 | matrix: 16 | machine: [ubuntu-latest, macOS-12] 17 | runs-on: ['${{ matrix.machine }}'] 18 | 19 | env: 20 | REF_DIR: ./test/adeno_fiber 21 | INPUT: ./test/adeno_fiber/adeno_fiber 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | submodules: recursive 27 | - name: make 28 | run: | 29 | g++ --version 30 | make -j 31 | 32 | - name: copy sl->slink references 33 
| run: | 34 | cp ./test/adeno_fiber/sl.fasta ./test/adeno_fiber/slink.fasta 35 | cp ./test/hemopexin/medoid-sl.fasta ./test/hemopexin/medoid-slink.fasta 36 | 37 | - name: tar artifacts 38 | run: tar -cvf famsa.tar ./famsa ./test 39 | 40 | - uses: actions/upload-artifact@v4 41 | with: 42 | name: executable-artifact-${{ matrix.machine }} 43 | path: ./famsa.tar 44 | 45 | 46 | ######################################################################################## 47 | 48 | full-tree: 49 | name: Full tree (adeno-fiber) 50 | needs: make-tests 51 | strategy: 52 | fail-fast: false 53 | matrix: 54 | tree: [sl, upgma] 55 | machine: [ubuntu-latest, macOS-12] 56 | 57 | runs-on: ['${{ matrix.machine }}'] 58 | 59 | env: 60 | REF_DIR: ./test/adeno_fiber 61 | INPUT: ./test/adeno_fiber/adeno_fiber 62 | 63 | steps: 64 | - uses: actions/download-artifact@v4 65 | with: 66 | name: executable-artifact-${{ matrix.machine }} 67 | path: ./ 68 | 69 | - name: untar artifacts 70 | run: tar -xf famsa.tar 71 | 72 | - name: ${{matrix.tree}} (tree only) 73 | run: | 74 | ./famsa -v -gt ${{matrix.tree}} -gt_export ${INPUT} ${{matrix.tree}}.dnd 75 | cmp ${{matrix.tree}}.dnd ${REF_DIR}/${{matrix.tree}}.dnd 76 | 77 | - name: ${{matrix.tree}} (from tree) 78 | run: | 79 | ./famsa -v -gt import ${{matrix.tree}}.dnd ${INPUT} ${{matrix.tree}}.dnd.fasta 80 | cmp ${{matrix.tree}}.dnd.fasta ${REF_DIR}/${{matrix.tree}}.fasta 81 | 82 | - name: ${{matrix.tree}} (complete alignment) 83 | run: | 84 | ./famsa -v -gt ${{matrix.tree}} ${INPUT} ${{matrix.tree}}.fasta 85 | cmp ${{matrix.tree}}.fasta ${REF_DIR}/${{matrix.tree}}.fasta 86 | 87 | ######################################################################################## 88 | 89 | medoid-tree: 90 | name: Medoid tree (hemopexin) 91 | needs: full-tree 92 | strategy: 93 | fail-fast: false 94 | matrix: 95 | tree: [sl, upgma, nj] 96 | machine: [ubuntu-latest, macOS-12] 97 | 98 | runs-on: ['${{ matrix.machine }}'] 99 | 100 | env: 101 | REF_DIR: ./test/hemopexin 102 
| INPUT: ./test/hemopexin/hemopexin 103 | 104 | steps: 105 | - uses: actions/download-artifact@v4 106 | with: 107 | name: executable-artifact-${{ matrix.machine }} 108 | path: ./ 109 | 110 | - name: untar artifacts 111 | run: tar -xf famsa.tar 112 | 113 | - name: medoid + ${{matrix.tree}} (tree only) 114 | run: | 115 | ./famsa -medoidtree -gt ${{matrix.tree}} -gt_export ${INPUT} medoid-${{matrix.tree}}.dnd 116 | cmp medoid-${{matrix.tree}}.dnd ${REF_DIR}/medoid-${{matrix.tree}}.dnd 117 | 118 | - name: medoid + ${{matrix.tree}} (complete alignment) 119 | run: | 120 | ./famsa -medoidtree -gt ${{matrix.tree}} ${INPUT} medoid-${{matrix.tree}}.fasta 121 | cmp medoid-${{matrix.tree}}.fasta ${REF_DIR}/medoid-${{matrix.tree}}.fasta 122 | 123 | - name: medoid + ${{matrix.tree}} (from tree) 124 | run: | 125 | ./famsa -gt import medoid-${{matrix.tree}}.dnd ${INPUT} medoid-${{matrix.tree}}.dnd.fasta 126 | cmp medoid-${{matrix.tree}}.dnd.fasta ${REF_DIR}/medoid-${{matrix.tree}}.fasta 127 | 128 | - name: medoid + ${{matrix.tree}} (non-default params) 129 | run: | 130 | ./famsa -medoidtree -gt ${{matrix.tree}} -gt_export -subtree_size 10 -sample_size 100 -cluster_fraction 0.2 -cluster_iters 1 ${INPUT} medoid-${{matrix.tree}}-params.dnd 131 | cmp medoid-${{matrix.tree}}-params.dnd ${REF_DIR}/medoid-${{matrix.tree}}-params.dnd 132 | 133 | ######################################################################################## 134 | 135 | other-tests: 136 | name: Other tests (adeno-fiber) 137 | needs: medoid-tree 138 | strategy: 139 | fail-fast: false 140 | matrix: 141 | machine: [ubuntu-latest, macOS-12] 142 | 143 | runs-on: ['${{ matrix.machine }}'] 144 | 145 | env: 146 | REF_DIR: ./test/adeno_fiber 147 | INPUT: ./test/adeno_fiber/adeno_fiber 148 | 149 | steps: 150 | - uses: actions/download-artifact@v4 151 | with: 152 | name: executable-artifact-${{ matrix.machine }} 153 | path: ./ 154 | 155 | - name: untar artifacts 156 | run: tar -xf famsa.tar 157 | 158 | - name: non-default 
gaps 159 | run: | 160 | ./famsa -go 10 -ge 2 -tgo 0.5 -tge 1.0 -gsd 3 -gsl 30 ${INPUT} gaps.fasta 161 | cmp gaps.fasta ${REF_DIR}/gaps.fasta 162 | 163 | - name: export distance 164 | run: | 165 | ./famsa -dist_export ${INPUT} dist.csv 166 | cmp dist.csv ${REF_DIR}/dist.csv 167 | 168 | - name: export distance (square) 169 | run: | 170 | ./famsa -dist_export -square_matrix ${INPUT} dist_sq.csv 171 | cmp dist_sq.csv ${REF_DIR}/dist_sq.csv 172 | 173 | - name: export pid 174 | run: | 175 | ./famsa -dist_export -pid ${INPUT} pid.csv 176 | cmp pid.csv ${REF_DIR}/pid.csv 177 | 178 | - name: export pid (square) 179 | run: | 180 | ./famsa -dist_export -square_matrix -pid ${INPUT} pid_sq.csv 181 | cmp pid_sq.csv ${REF_DIR}/pid_sq.csv 182 | 183 | 184 | ######################################################################################## 185 | 186 | 187 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | FAMSA.VC.VC.opendb 2 | Debug 3 | x64 4 | FAMSA.VC.db 5 | *.ipch 6 | *.db 7 | *.db-shm 8 | *.db-wal 9 | *.opendb 10 | /.vs 11 | /src/famsa.vcxproj.user 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libs/mimalloc"] 2 | path = libs/mimalloc 3 | url = https://github.com/refresh-bio-dependencies/mimalloc 4 | [submodule "libs/libdeflate"] 5 | path = libs/libdeflate 6 | url = https://github.com/refresh-bio-dependencies/libdeflate 7 | [submodule "libs/atomic_wait"] 8 | path = libs/atomic_wait 9 | url = https://github.com/refresh-bio-dependencies/atomic_wait 10 | -------------------------------------------------------------------------------- /FAMSA.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # 
Visual Studio Version 17 4 | VisualStudioVersion = 17.1.32228.430 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "famsa", "src\famsa.vcxproj", "{CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|ARM64 = Debug|ARM64 11 | Debug|x64 = Debug|x64 12 | Debug|x86 = Debug|x86 13 | Release|ARM64 = Release|ARM64 14 | Release|x64 = Release|x64 15 | Release|x86 = Release|x86 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|ARM64.ActiveCfg = Debug|ARM64 19 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|ARM64.Build.0 = Debug|ARM64 20 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|x64.ActiveCfg = Debug|x64 21 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|x64.Build.0 = Debug|x64 22 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|x86.ActiveCfg = Debug|Win32 23 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Debug|x86.Build.0 = Debug|Win32 24 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|ARM64.ActiveCfg = Release|ARM64 25 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|ARM64.Build.0 = Release|ARM64 26 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|x64.ActiveCfg = Release|x64 27 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|x64.Build.0 = Release|x64 28 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|x86.ActiveCfg = Release|Win32 29 | {CDAA5F75-7B1B-4E32-82E3-1026DDEAFFC5}.Release|x86.Build.0 = Release|Win32 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | GlobalSection(ExtensibilityGlobals) = postSolution 35 | VisualSVNWorkingCopyRoot = . 
36 | SolutionGuid = {08EB8121-BD6D-40C2-AEB5-E191519A8D66} 37 | EndGlobalSection 38 | EndGlobal 39 | -------------------------------------------------------------------------------- /libs/prebuild.bat: -------------------------------------------------------------------------------- 1 | cd %1\libs 2 | 3 | @echo "Building libdeflate" 4 | cd libdeflate 5 | 6 | cmake -B build 7 | cmake --build build --config Debug 8 | cmake --build build --config Release 9 | 10 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: famsa 2 | 3 | 4 | #################### 5 | 6 | 7 | UNAME_S := $(shell uname -s) 8 | GCC_VERSION= $(shell $(CXX) -dumpversion | cut -f1 -d.) 9 | 10 | ifeq ($(UNAME_S),Darwin) 11 | ABI_FLAG = 12 | CLINK_FLAGS = 13 | else 14 | ABI_FLAG = -fabi-version=0 15 | CLINK_FLAGS = -lrt 16 | endif 17 | 18 | 19 | ifeq ($(GCC_VERSION), 5) 20 | $(info *** Detecting g++ version 5 ***) 21 | CPP_STD=c++11 22 | DEFINE_FLAGS = -DNO_PROFILE_PAR -DOLD_ATOMIC_FLAG 23 | else ifeq ($(GCC_VERSION), 6) 24 | $(info *** Detecting g++ version 6 ***) 25 | CPP_STD=c++11 26 | DEFINE_FLAGS = -DNO_PROFILE_PAR -DOLD_ATOMIC_FLAG 27 | else ifeq ($(GCC_VERSION), 7) 28 | $(info *** Detecting g++ version 7 ***) 29 | CPP_STD=c++14 30 | DEFINE_FLAGS = -DNO_PROFILE_PAR -DOLD_ATOMIC_FLAG 31 | else ifeq ($(GCC_VERSION), 8) 32 | $(info *** Detecting g++ version 8 ***) 33 | CPP_STD=c++2a 34 | DEFINE_FLAGS = -DOLD_ATOMIC_FLAG 35 | else ifeq ($(GCC_VERSION), 9) 36 | $(info *** Detecting g++ version 9 ***) 37 | CPP_STD=c++2a 38 | DEFINE_FLAGS = -DOLD_ATOMIC_FLAG 39 | else ifeq ($(GCC_VERSION), 10) 40 | $(info *** Detecting g++ version 10 ***) 41 | CPP_STD=c++2a 42 | DEFINE_FLAGS = -DOLD_ATOMIC_FLAG 43 | else ifeq ($(GCC_VERSION), 11) 44 | $(info *** Detecting g++ version 11 ***) 45 | ifeq ($(UNAME_S),Darwin) 46 | CPP_STD=c++2a 47 | DEFINE_FLAGS = -DOLD_ATOMIC_FLAG 48 | else 49 | 
CPP_STD=c++20 50 | DEFINE_FLAGS = 51 | endif 52 | else 53 | $(info *** Detecting g++ version 12 or higher ***) 54 | ifeq ($(UNAME_S),Darwin) 55 | CPP_STD=c++2a 56 | DEFINE_FLAGS = -DOLD_ATOMIC_FLAG 57 | else 58 | CPP_STD=c++20 59 | DEFINE_FLAGS = 60 | endif 61 | # DEFINE_FLAGS = -DUSE_NATIVE_BARRIERS 62 | endif 63 | 64 | SIMD_NONE=0 65 | SIMD_AVX1=1 66 | SIMD_AVX2=2 67 | SIMD_AVX512=3 68 | SIMD_NEON=4 69 | 70 | 71 | # Detecting user's options and add flags 72 | ifeq ($(PLATFORM), none) 73 | $(info *** Unspecified platform w/o extensions ***) 74 | COMMON_FLAGS := 75 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_NONE) 76 | SIMD=NONE 77 | else ifeq ($(PLATFORM), arm8) 78 | $(info *** ARMv8 with NEON extensions ***) 79 | COMMON_FLAGS := -march=armv8-a 80 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_NEON) 81 | SIMD=NEON 82 | else ifeq ($(PLATFORM), m1) 83 | $(info *** Apple M1 with NEON extensions ***) 84 | COMMON_FLAGS := -march=armv8.4-a 85 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_NEON) 86 | SIMD=NEON 87 | else ifeq ($(PLATFORM), sse4) 88 | $(info *** x86-64 with SSE4 extensions***) 89 | COMMON_FLAGS := -msse4 90 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_NONE) 91 | SIMD=NONE 92 | else ifeq ($(PLATFORM), avx) 93 | $(info *** x86-64 with AVX extensions***) 94 | COMMON_FLAGS := -msse4 95 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_AVX1) 96 | SIMD=AVX1 97 | else ifeq ($(PLATFORM), native) 98 | $(info *** x86-64 with AVX2 extensions and native architecture ***) 99 | COMMON_FLAGS := -mavx2 -march=native 100 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_AVX2) 101 | SIMD=AVX2 102 | else 103 | $(info *** x86-64 with AVX2 extensions***) 104 | COMMON_FLAGS := -msse4 105 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DSIMD=$(SIMD_AVX2) 106 | SIMD=AVX2 107 | endif 108 | 109 | 110 | # get commit hash 111 | GIT_COMMIT = $(shell git describe --always --dirty) 112 | DEFINE_FLAGS := $(DEFINE_FLAGS) -DGIT_COMMIT=$(GIT_COMMIT) 113 | 114 | INC_DIRS =. 
libs/mimalloc/include libs 115 | INCLUDES=$(foreach d, $(INC_DIRS), -I$d) 116 | 117 | ifeq ($(STATIC_LINK), true) 118 | CXXFLAGS = -Wall -Wno-char-subscripts -Wno-attributes -O3 $(COMMON_FLAGS) $(DEFINE_FLAGS) -static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=$(CPP_STD) $(INCLUDES) 119 | CLINK = -lm -static -O3 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=$(CPP_STD) 120 | else 121 | CXXFLAGS = -Wall -Wno-char-subscripts -Wno-attributes -O3 $(COMMON_FLAGS) $(DEFINE_FLAGS) -std=$(CPP_STD) -pthread $(INCLUDES) 122 | CLINK = -lm $(CLINK_FLAGS) -O3 $(COMMON_FLAGS) -std=$(CPP_STD) -pthread 123 | endif 124 | 125 | CXXFLAGS_AVX = $(CXXFLAGS) -mavx ${ABI_FLAG} -mpopcnt -funroll-loops 126 | CXXFLAGS_AVX2 = $(CXXFLAGS) -mavx2 ${ABI_FLAG} -mpopcnt -funroll-loops 127 | CXXFLAGS_NEON = $(CXXFLAGS) ${ABI_FLAG} -funroll-loops 128 | 129 | 130 | LIB_DEFLATE=libs/libdeflate/build/libdeflate.a 131 | 132 | deflate: 133 | cmake -S libs/libdeflate -B libs/libdeflate/build 134 | cmake --build libs/libdeflate/build 135 | 136 | 137 | MIMALLOC_OBJ=libs/mimalloc/mimalloc.o 138 | 139 | $(MIMALLOC_OBJ): 140 | $(CXX) -DMI_MALLOC_OVERRIDE -O3 -DNDEBUG -fPIC -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden -Wstrict-prototypes -ftls-model=initial-exec -fno-builtin-malloc -std=$(CPP_STD) -c -I libs/mimalloc/include libs/mimalloc/src/static.c -o $(MIMALLOC_OBJ) 141 | 142 | COMMON_OBJS := src/msa.o \ 143 | src/msa_refinement.o \ 144 | src/tree/AbstractTreeGenerator.o \ 145 | src/tree/Clustering.o \ 146 | src/tree/DistanceCalculator.o \ 147 | src/tree/FastTree.o \ 148 | src/tree/GuideTree.o \ 149 | src/tree/MSTPrim.o \ 150 | src/tree/NeighborJoining.o \ 151 | src/tree/NewickParser.o \ 152 | src/tree/SingleLinkage.o \ 153 | src/tree/UPGMA.o \ 154 | src/utils/timer.o \ 155 | src/utils/log.o \ 156 | src/core/io_service.o \ 157 | src/core/params.o \ 158 | src/core/profile.o \ 159 | src/core/profile_par.o \ 160 | src/core/profile_seq.o \ 161 | src/core/sequence.o \ 162 | 
src/core/queues.o 163 | 164 | src/lcs/lcsbp_classic.o : src/lcs/lcsbp_classic.cpp 165 | $(CXX) $(CXXFLAGS) -c src/lcs/lcsbp_classic.cpp -o $@ 166 | 167 | ifeq ($(SIMD), NONE) 168 | LCS_OBJS := src/lcs/lcsbp.o \ 169 | src/lcs/lcsbp_classic.o 170 | UTILS_OBJS := src/utils/utils.o 171 | 172 | src/lcs/lcsbp.o : src/lcs/lcsbp.cpp 173 | $(CXX) $(CXXFLAGS) -c src/lcs/lcsbp.cpp -o $@ 174 | src/utils/utils.o : src/utils/utils.cpp 175 | $(CXX) $(CXXFLAGS) -c src/utils/utils.cpp -o $@ 176 | 177 | else ifeq ($(SIMD), AVX1) 178 | LCS_OBJS := src/lcs/lcsbp.o \ 179 | src/lcs/lcsbp_classic.o \ 180 | src/lcs/lcsbp_avx_intr.o 181 | UTILS_OBJS := src/utils/utils.o \ 182 | src/utils/utils_avx.o 183 | 184 | src/lcs/lcsbp.o : src/lcs/lcsbp.cpp 185 | $(CXX) $(CXXFLAGS) -c src/lcs/lcsbp.cpp -o $@ 186 | src/lcs/lcsbp_avx_intr.o : src/lcs/lcsbp_avx_intr.cpp 187 | $(CXX) $(CXXFLAGS_AVX) -c src/lcs/lcsbp_avx_intr.cpp -o $@ 188 | 189 | src/utils/utils.o : src/utils/utils.cpp 190 | $(CXX) $(CXXFLAGS) -c src/utils/utils.cpp -o $@ 191 | src/utils/utils_avx.o : src/utils/utils_avx.cpp 192 | $(CXX) $(CXXFLAGS_AVX) -c src/utils/utils_avx.cpp -o $@ 193 | else ifeq ($(SIMD), NEON) 194 | LCS_OBJS := src/lcs/lcsbp.o \ 195 | src/lcs/lcsbp_classic.o \ 196 | src/lcs/lcsbp_neon_intr.o 197 | UTILS_OBJS := src/utils/utils.o \ 198 | src/utils/utils_neon.o 199 | 200 | src/lcs/lcsbp.o : src/lcs/lcsbp.cpp 201 | $(CXX) $(CXXFLAGS) -c src/lcs/lcsbp.cpp -o $@ 202 | src/lcs/lcsbp_neon_intr.o : src/lcs/lcsbp_neon_intr.cpp 203 | $(CXX) $(CXXFLAGS_NEON) -c src/lcs/lcsbp_neon_intr.cpp -o $@ 204 | 205 | src/utils/utils.o : src/utils/utils.cpp 206 | $(CXX) $(CXXFLAGS) -c src/utils/utils.cpp -o $@ 207 | src/utils/utils_neon.o : src/utils/utils_neon.cpp 208 | $(CXX) $(CXXFLAGS_NEON) -c src/utils/utils_neon.cpp -o $@ 209 | else 210 | LCS_OBJS := src/lcs/lcsbp.o \ 211 | src/lcs/lcsbp_classic.o \ 212 | src/lcs/lcsbp_avx_intr.o \ 213 | src/lcs/lcsbp_avx2_intr.o 214 | 215 | UTILS_OBJS := src/utils/utils.o \ 216 | 
src/utils/utils_avx.o \ 217 | src/utils/utils_avx2.o 218 | 219 | src/lcs/lcsbp.o : src/lcs/lcsbp.cpp 220 | $(CXX) $(CXXFLAGS) -c src/lcs/lcsbp.cpp -o $@ 221 | src/lcs/lcsbp_avx_intr.o : src/lcs/lcsbp_avx_intr.cpp 222 | $(CXX) $(CXXFLAGS_AVX) -c src/lcs/lcsbp_avx_intr.cpp -o $@ 223 | src/lcs/lcsbp_avx2_intr.o : src/lcs/lcsbp_avx2_intr.cpp 224 | $(CXX) $(CXXFLAGS_AVX2) -c src/lcs/lcsbp_avx2_intr.cpp -o $@ 225 | 226 | src/utils/utils.o : src/utils/utils.cpp 227 | $(CXX) $(CXXFLAGS) -c src/utils/utils.cpp -o $@ 228 | src/utils/utils_avx.o : src/utils/utils_avx.cpp 229 | $(CXX) $(CXXFLAGS_AVX) -c src/utils/utils_avx.cpp -o $@ 230 | src/utils/utils_avx2.o : src/utils/utils_avx2.cpp 231 | $(CXX) $(CXXFLAGS_AVX2) -c src/utils/utils_avx2.cpp -o $@ 232 | endif 233 | 234 | 235 | .cpp.o: 236 | $(CXX) $(CXXFLAGS) -c $< -o $@ 237 | 238 | famsa: deflate $(MIMALLOC_OBJ) src/famsa.o $(COMMON_OBJS) $(LCS_OBJS) $(UTILS_OBJS) 239 | $(CXX) $(CLINK) -o $@ $(MIMALLOC_OBJ) src/famsa.o $(COMMON_OBJS) $(LCS_OBJS) $(UTILS_OBJS) $(LIB_DEFLATE) 240 | 241 | clean: 242 | cd libs/libdeflate/build && make clean 243 | -rm src/core/*.o 244 | -rm src/lcs/*.o 245 | -rm src/tree/*.o 246 | -rm src/utils/*.o 247 | -rm src/*.o 248 | -rm libs/mimalloc/*.o 249 | -rm famsa 250 | 251 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | # align sequences with default parameters (single linkage tree) 4 | ./famsa ./test/adeno_fiber/adeno_fiber sl.aln 5 | 6 | # align sequences using UPGMA tree with 8 computing threads, store the result in the GZ archive 7 | ./famsa -gt upgma -t 8 -gz ./test/adeno_fiber/adeno_fiber upgma.aln.gz 8 | 9 | # export a neighbour joining guide tree to the Newick format 10 | ./famsa -gt nj -gt_export ./test/adeno_fiber/adeno_fiber nj.dnd 11 | 12 | # align sequences with the previously generated guide tree 13 | ./famsa -gt import nj.dnd 
 ./test/adeno_fiber/adeno_fiber nj.aln 14 | 15 | # align sequences with an approximated medoid guide tree and UPGMA subtrees 16 | ./famsa -medoidtree -gt upgma ./test/hemopexin/hemopexin upgma.medoid.aln 17 | 18 | # export distance matrix to CSV format (lower triangular) 19 | ./famsa -dist_export ./test/adeno_fiber/adeno_fiber dist.csv 20 | 21 | # export pairwise identity (PID) matrix to CSV format (square) 22 | ./famsa -dist_export -pid -square_matrix ./test/adeno_fiber/adeno_fiber pid.csv 23 | 24 | # profile-profile alignment without refining output 25 | ./famsa -refine_mode off ./test/adeno_fiber/upgma.no_refine.part1.fasta ./test/adeno_fiber/upgma.no_refine.part2.fasta pp.fasta 26 | -------------------------------------------------------------------------------- /src/core/defs.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _DEFS_H 10 | #define _DEFS_H 11 | 12 | #include 13 | #include 14 | 15 | // Uncomment for huge alignments (e.g., number of sequences > 10^6 and final alignment length > 10^5), when 16 | // the default int64_t type for storing alignment scores could be too small 17 | //#define HUGE_ALIGNMENTS 18 | 19 | // Uncommenting enables additional parameters that were important for the development of FAMSA. 20 | // They were also used to obtain some results presented in the FAMSA paper. 21 | // Nevertheless, they are unimportant for end users, so avoid turning this mode on unless you are 22 | // really sure what you want to do. 23 | // Warning: FAMSA was not designed to be user-friendly in the developer mode. 
24 | //#define DEVELOPER_MODE 25 | 26 | 27 | // ***** Internal defines 28 | //#define DEBUG_MODE 29 | 30 | #define ALWAYS_3_DIRS 31 | 32 | //#define NO_GAP_CORRECTION 33 | 34 | #define LOG_STATS 35 | 36 | #ifdef HUGE_ALIGNMENTS 37 | typedef double score_t; 38 | #else 39 | typedef int64_t score_t; 40 | #endif 41 | 42 | #define SIMD_NONE 0 43 | #define SIMD_AVX1 1 44 | #define SIMD_AVX2 2 45 | #define SIMD_AVX512 3 46 | #define SIMD_NEON 4 47 | 48 | enum class instruction_set_t { none, sse, sse2, sse3, sse3s, sse41, sse42, avx, avx2 }; 49 | 50 | typedef char symbol_t; 51 | typedef int counter_t; 52 | 53 | typedef unsigned long long bit_vec_t; 54 | 55 | const int bv_size = sizeof(bit_vec_t) * 8; 56 | const int bv_size128 = 64; // length of a single word in AVX type used for bit-par LCS computation 57 | const int bv_size256 = 64; // length of a single word in AVX2 type used for bit-par LCS computation 58 | 59 | #ifdef HUGE_ALIGNMENTS 60 | const score_t infty = 1e30; 61 | #else 62 | const score_t infty = (1ll << 62); 63 | const double cost_cast_factor = 1000.0; 64 | #endif 65 | 66 | 67 | const symbol_t GAP = 30; // value representing gap 68 | const symbol_t GAP_OPEN = 25; 69 | const symbol_t GAP_EXT = 26; 70 | const symbol_t GAP_TERM_EXT = 27; 71 | const symbol_t GAP_TERM_OPEN = 28; 72 | const symbol_t UNKNOWN_SYMBOL = 22; 73 | 74 | const size_t NO_SYMBOLS = 32; // alphabet of protein sequences (including gaps and special symbols) 75 | const symbol_t GUARD = (symbol_t) (NO_SYMBOLS) - 1; 76 | const symbol_t NO_AMINOACIDS = 24; 77 | const symbol_t NO_VALID_AMINOACIDS = 20; 78 | const symbol_t NO_AMINOACIDS_AND_GAPS = 30; 79 | const symbol_t NO_AA_SYMBOLS = UNKNOWN_SYMBOL; // no. of symbols that can be compared 80 | 81 | #define MAX3(x, y, z) (max(x, max(y, z))) 82 | #define ABS(x) ((x) >= 0 ? 
(x) : -(x)) 83 | 84 | 85 | inline void *my_align(std::size_t alignment, std::size_t size, 86 | void *&ptr, std::size_t &space) { 87 | std::uintptr_t pn = reinterpret_cast< std::uintptr_t >(ptr); 88 | std::uintptr_t aligned = (pn + alignment - 1) & -alignment; 89 | std::size_t padding = aligned - pn; 90 | if (space < size + padding) return nullptr; 91 | space -= padding; 92 | return ptr = reinterpret_cast< void * >(aligned); 93 | } 94 | 95 | #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) 96 | #define FALL_THROUGH [[fallthrough]]; 97 | #else 98 | #define FALL_THROUGH 99 | #endif 100 | 101 | #endif -------------------------------------------------------------------------------- /src/core/io_service.cpp: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 4 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 5 | 6 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 7 | 8 | */ 9 | 10 | #include "../core/io_service.h" 11 | #include "../core/queues.h" 12 | #include 13 | 14 | #ifdef _WIN32 15 | #include 16 | #include 17 | #endif 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | 27 | // ******************************************************************* 28 | bool IOService::saveAlignment(const std::string& file_name, vector& sequences, int no_threads, int gzip_level) 29 | { 30 | string s; 31 | string id, seq; 32 | 33 | int pack_size = gzip_level < 0 ? 
5 : 10; 34 | int clear_pack_size = 100; 35 | 36 | atomic seq_id {0}; 37 | vector v_threads; 38 | v_threads.reserve(no_threads); 39 | 40 | for (int i = 0; i < no_threads; ++i) 41 | v_threads.emplace_back([clear_pack_size, &seq_id, &sequences] { 42 | int no_seqs = static_cast(sequences.size()); 43 | 44 | while (true) 45 | { 46 | int32_t id_from = seq_id.fetch_add(clear_pack_size); 47 | int32_t id_to = id_from + clear_pack_size; 48 | 49 | if (id_from >= no_seqs) 50 | break; 51 | if (id_to >= no_seqs) 52 | id_to = no_seqs; 53 | 54 | for (int i = id_from; i < id_to; ++i) 55 | sequences[i]->ClearDPS(); 56 | } 57 | }); 58 | 59 | for (auto& t : v_threads) 60 | t.join(); 61 | 62 | v_threads.clear(); 63 | 64 | seq_id = 0; 65 | 66 | CLimitedPriorityQueue> lpq(no_threads, 5 * no_threads); 67 | 68 | for (int i = 0; i < no_threads; ++i) 69 | v_threads.emplace_back([pack_size, gzip_level, &seq_id, &sequences, &lpq] { 70 | int no_seqs = static_cast(sequences.size()); 71 | string s_tmp; 72 | 73 | libdeflate_compressor* compressor = gzip_level >= 0 ? 
libdeflate_alloc_compressor(gzip_level) : nullptr; 74 | vector gz_vec, raw_vec; 75 | 76 | while (true) 77 | { 78 | int id_from = seq_id.fetch_add(pack_size); 79 | int id_to = id_from + pack_size; 80 | 81 | if (id_from >= no_seqs) 82 | break; 83 | if (id_to >= no_seqs) 84 | id_to = no_seqs; 85 | 86 | s_tmp.clear(); 87 | 88 | for (int i = id_from; i < id_to; ++i) 89 | { 90 | auto p = sequences[i]; 91 | 92 | string seq = p->Decode(); 93 | s_tmp.append(p->id); 94 | s_tmp.push_back('\n'); 95 | 96 | size_t seq_size = seq.size(); 97 | auto ptr = seq.c_str(); 98 | size_t step; 99 | 100 | for (size_t pos = 0; pos < seq_size; pos += step, ptr += step) 101 | { 102 | step = 60; 103 | if (pos + step > seq_size) 104 | step = seq_size - pos; 105 | 106 | s_tmp.append(ptr, step); 107 | s_tmp.push_back('\n'); 108 | } 109 | 110 | // Clear internal data here to save memory 111 | p->Clear(); 112 | } 113 | 114 | if (gzip_level >= 0) 115 | { 116 | size_t need_alloc = libdeflate_gzip_compress_bound(compressor, s_tmp.size()); 117 | if (gz_vec.size() < need_alloc) 118 | gz_vec.resize(need_alloc); 119 | 120 | size_t gzipped_size = libdeflate_gzip_compress(compressor, s_tmp.data(), s_tmp.size(), gz_vec.data(), gz_vec.size()); 121 | 122 | vector v(gz_vec.begin(), gz_vec.begin() + gzipped_size); 123 | //v_gz_sequences[id_from / pack_size].assign(gz_vec.begin(), gz_vec.begin() + gzipped_size); 124 | lpq.Emplace(id_from / pack_size, move(v)); 125 | } 126 | else 127 | { 128 | raw_vec.assign(s_tmp.begin(), s_tmp.end()); 129 | lpq.Emplace(id_from / pack_size, move(raw_vec)); 130 | } 131 | } 132 | 133 | lpq.MarkCompleted(); 134 | 135 | if(gzip_level >= 0) 136 | libdeflate_free_compressor(compressor); 137 | 138 | }); 139 | 140 | if (file_name == "STDOUT") 141 | { 142 | #ifdef _WIN32 143 | _setmode(_fileno(stdout), _O_BINARY); 144 | #endif 145 | vector dat; 146 | 147 | while (!lpq.IsCompleted()) 148 | { 149 | if (!lpq.Pop(dat)) 150 | continue; 151 | 152 | fwrite(dat.data(), 1, dat.size(), stdout); 153 
| } 154 | } 155 | else 156 | { 157 | ofstream outfile; 158 | const size_t BUFFER_SIZE = 128 << 20; 159 | char* buffer = new char[BUFFER_SIZE]; 160 | 161 | outfile.open(file_name.c_str(), ios_base::out | ios_base::binary); 162 | outfile.rdbuf()->pubsetbuf(buffer, BUFFER_SIZE); 163 | 164 | vector dat; 165 | 166 | while (!lpq.IsCompleted()) 167 | { 168 | if (!lpq.Pop(dat)) 169 | continue; 170 | 171 | outfile.write((char*)dat.data(), dat.size()); 172 | } 173 | 174 | outfile.close(); 175 | delete[] buffer; 176 | } 177 | 178 | for (auto& t : v_threads) 179 | t.join(); 180 | 181 | return true; 182 | } 183 | -------------------------------------------------------------------------------- /src/core/io_service.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "../core/sequence.h" 11 | 12 | #include 13 | #include 14 | 15 | 16 | class IOService { 17 | 18 | public: 19 | template 20 | static size_t loadFasta(const std::string& file_name, std::vector& sequences, memory_monotonic_safe* mma = nullptr); 21 | static bool saveAlignment(const std::string& file_name, vector & sequences, int no_threads, int gzip_level); 22 | }; 23 | 24 | 25 | // ******************************************************************* 26 | template 27 | size_t IOService::loadFasta( 28 | const std::string& file_name, std::vector& sequences, memory_monotonic_safe* mma) { 29 | 30 | istream* in; 31 | ifstream infile; 32 | 33 | if (file_name == "STDIN") { 34 | in = &cin; 35 | } 36 | else { 37 | infile.open(file_name.c_str(), ios_base::in); 38 | if (!infile.good()) 39 | return 0; 40 | in = &infile; 41 | } 42 | 43 | string s; 44 | string id, seq; 45 | int seq_no = 0; 46 | 47 | while (in->good()) 48 | { 
49 | getline(*in, s); 50 | 51 | while (!s.empty() && (s[s.length() - 1] == '\n' || s[s.length() - 1] == '\r')) 52 | s.pop_back(); 53 | if (s.empty()) 54 | continue; 55 | 56 | if (s[0] == '>') 57 | { 58 | if (!id.empty() && !seq.empty()) 59 | { 60 | sequences.emplace_back(id, seq, seq_no++, mma); 61 | seq.clear(); 62 | } 63 | id = s; 64 | } 65 | else { 66 | seq += s; 67 | } 68 | } 69 | 70 | if (!id.empty() && !seq.empty()) 71 | sequences.emplace_back(id, seq, seq_no++, mma); 72 | 73 | return sequences.size(); 74 | } -------------------------------------------------------------------------------- /src/core/params.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _PARAMS_H 10 | #define _PARAMS_H 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "../core/defs.h" 18 | #include "../tree/TreeDefs.h" 19 | 20 | 21 | using namespace std; 22 | 23 | class Refinement { 24 | public: 25 | enum Mode {ON, OFF, AUTO}; 26 | 27 | static std::string toString(Mode v) { 28 | switch (v) { 29 | case ON: return "on"; 30 | case OFF: return "off"; 31 | case AUTO: return "auto"; 32 | default: 33 | throw new std::runtime_error("Error: Illegal refinment mode."); 34 | } 35 | 36 | return "Unknown"; 37 | } 38 | 39 | static Mode fromString(const std::string& name) { 40 | if (name == "on") { return ON; } 41 | if (name == "off") { return OFF; } 42 | if (name == "auto") { return AUTO; } 43 | 44 | // something went wrong 45 | throw new std::runtime_error("Error: Illegal refinment mode."); 46 | 47 | return ON; 48 | } 49 | }; 50 | 51 | 52 | class CParams 53 | { 54 | private: 55 | double gap_open_base = 14.85; 56 | double gap_ext_base = 1.25; 57 | double gap_term_open_base 
= 0.66; 58 | double gap_term_ext_base = 0.66; 59 | 60 | public: 61 | 62 | score_t gap_open; 63 | score_t gap_ext; 64 | score_t gap_term_open; 65 | score_t gap_term_ext; 66 | 67 | uint32_t scaler_div = 7; 68 | uint32_t scaler_log = 45; 69 | int guided_alignment_radius = 50; 70 | 71 | bool enable_gap_rescaling = true; 72 | bool enable_gap_optimization = true; 73 | bool enable_total_score_calculation = true; 74 | 75 | Refinement::Mode refinement_mode = Refinement::AUTO; 76 | uint32_t n_refinements = 100; 77 | uint32_t thr_refinement = 1000; 78 | uint32_t thr_internal_refinement = 0; 79 | 80 | GT::Method gt_method = GT::MST_Prim; 81 | GT::Heuristic gt_heuristic = GT::None; 82 | Distance distance = Distance::indel_div_lcs; 83 | int heuristic_threshold = 0; 84 | 85 | int guide_tree_seed = 0; 86 | int subtree_size = 100; 87 | int sample_size = 2000; 88 | float cluster_fraction = 0.1f; 89 | int cluster_iters = 2; 90 | 91 | string guide_tree_in_file; 92 | bool export_distances = false; 93 | bool export_tree = false; 94 | bool generate_square_matrix = false; 95 | bool calculate_pid = false; 96 | bool keepDuplicates = false; 97 | 98 | bool test_ref_sequences = false; 99 | uint64_t ref_seq_subtree_size = 0; 100 | string ref_file_name; 101 | 102 | int64_t shuffle = -1; 103 | uint32_t n_threads = 0; 104 | 105 | bool gzippd_output = false; 106 | int gzip_level = 7; 107 | 108 | instruction_set_t instruction_set = instruction_set_t::none; 109 | 110 | bool verbose_mode = false; 111 | bool very_verbose_mode = false; 112 | 113 | bool profile_aligning = false; 114 | string input_file_name; 115 | string input_file_name_2; 116 | string output_file_name; 117 | 118 | vector> score_matrix; 119 | vector score_vector; 120 | 121 | CParams(); 122 | bool parse(int argc, char** argv, bool& showExpert); 123 | void show_usage(bool expert); 124 | 125 | protected: 126 | bool findSwitch(std::vector& params, const std::string& name) { 127 | auto it = find(params.begin(), params.end(), name); // verbose 
mode 128 | if (it != params.end()) { 129 | params.erase(it); 130 | return true; 131 | } 132 | 133 | return false; 134 | } 135 | 136 | template 137 | bool findOption(std::vector& params, const std::string& name, T& v) { 138 | auto prevToEnd = std::prev(params.end()); 139 | auto it = find(params.begin(), prevToEnd, name); // verbose mode 140 | if (it != prevToEnd) { 141 | std::istringstream iss(*std::next(it)); 142 | if (iss >> v) { 143 | params.erase(it, it + 2); 144 | return true; 145 | } 146 | } 147 | 148 | return false; 149 | } 150 | 151 | template 152 | bool findOption( 153 | std::vector& params, 154 | const std::string& name, 155 | T& v, 156 | std::vector::iterator & next) { 157 | 158 | auto prevToEnd = std::prev(params.end()); 159 | auto it = find(params.begin(), prevToEnd, name); // verbose mode 160 | if (it != prevToEnd) { 161 | std::istringstream iss(*std::next(it)); 162 | if (iss >> v) { 163 | next = params.erase(it, it + 2); 164 | return true; 165 | } 166 | } 167 | 168 | return false; 169 | } 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /src/core/profile_seq.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/refresh-bio/FAMSA/1669fc1444c8bc4000d71121ec2a7aa62d848b57/src/core/profile_seq.cpp -------------------------------------------------------------------------------- /src/core/queues.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include "../core/queues.h" 10 | #include 11 | 12 | //#define PRODUCE_LOG 13 | // The constructor builds the scheduling state for a guide tree: node depths, the initial queue of sequence-to-profile tasks and the child -> parent map. 14 | // ******************************************************************* 15 | // CProfileQueue 16 | // ******************************************************************* 17 | CProfileQueue::CProfileQueue(vector *_gapped_sequences, map *_profiles, vector> *_guide_tree, uint32_t _max_no_threads) 18 | { 19 | gapped_sequences = _gapped_sequences; 20 | profiles = _profiles; 21 | guide_tree = _guide_tree; 22 | max_no_threads = _max_no_threads; 23 | no_working_threads = 0; 24 | 25 | eoq_flag = false; 26 | // Propagate depths from the highest-index node downwards: both children of node i get prof_depth[i] + 1. 27 | prof_depth.assign(guide_tree->size(), 0); 28 | for (size_t i = guide_tree->size(); i-- > gapped_sequences->size(); ) // `i-- >` form cannot wrap around when either container is empty 29 | { 30 | prof_depth[(*guide_tree)[i].first] = prof_depth[i] + 1; 31 | prof_depth[(*guide_tree)[i].second] = prof_depth[i] + 1; 32 | } 33 | 34 | // Insert all sequence to profile conversions as ready to process 35 | // (each one is queued under the depth computed above) 36 | 37 | 38 | 39 | for (size_t i = 0; i < gapped_sequences->size(); ++i) 40 | pq.push(make_pair(prof_depth[i], i)); 41 | 42 | // Number of child profiles ready for each parent profile 43 | ready_profiles.assign(guide_tree->size(), 0); 44 | // Map both children of every node to that node; nodes whose first child id is -1 are skipped. 45 | child_parent_mapping.assign(guide_tree->size(), 0); 46 | for (size_t i = 0; i < guide_tree->size(); ++i) 47 | { 48 | int id1 = (*guide_tree)[i].first; 49 | int id2 = (*guide_tree)[i].second; 50 | 51 | if (id1 == -1) 52 | continue; 53 | 54 | child_parent_mapping[id1] = i; 55 | child_parent_mapping[id2] = i; 56 | } 57 | } 58 | 59 | // ******************************************************************* 60 | CProfileQueue::~CProfileQueue() 61 | { 62 | // Nothing to do 63 | } 64 | 65 | //
******************************************************************* 66 | void CProfileQueue::CheckAlignInParallel(CProfile* prof1, CProfile* prof2, uint32_t& no_threads, uint32_t& no_rows_per_box) 67 | { 68 | const uint32_t min_box_width_per_thread = 512; 69 | // const uint32_t min_box_width_per_thread = 12; 70 | 71 | uint32_t no_available_threads = max_no_threads - no_working_threads; 72 | 73 | //uint32_t min_prof_width = min(prof1->width, prof2->width); 74 | uint32_t max_prof_width = (uint32_t) max(prof1->width, prof2->width); 75 | 76 | if(no_available_threads == 1) 77 | { 78 | no_threads = 1; 79 | no_rows_per_box = 0; 80 | 81 | /* cout << "max_no_threads: " + to_string(max_no_threads) + 82 | " no_working_threads: " + to_string(no_working_threads) + 83 | " no_threads: " + to_string(no_threads) + "\n";*/ 84 | 85 | return; 86 | } 87 | 88 | if (max_prof_width < 2 * min_box_width_per_thread) 89 | { 90 | no_threads = 1; 91 | no_rows_per_box = 0; 92 | 93 | /* cout << "max_no_threads: " + to_string(max_no_threads) + 94 | " no_working_threads: " + to_string(no_working_threads) + 95 | " no_threads: " + to_string(no_threads) + "\n";*/ 96 | 97 | return; 98 | } 99 | 100 | uint32_t est_no_threads = max(1u, no_available_threads / (uint32_t) (pq.size() + 1u)); 101 | 102 | no_threads = min(est_no_threads, max_prof_width / min_box_width_per_thread); 103 | no_threads = max(no_threads, 1u); 104 | //uint32_t box_width_per_thread = max_prof_width / no_threads; 105 | 106 | if (no_threads > 1) 107 | { 108 | // if (box_width_per_thread < 512) 109 | no_rows_per_box = 4; 110 | /* else if (box_width_per_thread < 1024) 111 | no_rows_per_box = 3; 112 | else if (box_width_per_thread < 2048) 113 | no_rows_per_box = 2; 114 | else 115 | no_rows_per_box = 1;*/ 116 | } 117 | else 118 | no_rows_per_box = 0; 119 | 120 | /* cout << "max_no_threads: " + to_string(max_no_threads) + 121 | " no_working_threads: " + to_string(no_working_threads) + 122 | " no_threads: " + to_string(no_threads) + "\n";*/ 
123 | } 124 | 125 | // ******************************************************************* 126 | bool CProfileQueue::GetTask(size_t &prof_id, CGappedSequence *&gs, CProfile *&prof1, CProfile *&prof2, uint32_t& no_threads, uint32_t& no_rows_per_block) 127 | { 128 | unique_lock lck(mtx); 129 | cv.wait(lck, [this]{return !this->pq.empty() || this->eoq_flag; }); 130 | 131 | if (eoq_flag) 132 | return false; // End of data in the profiles queue 133 | 134 | prof_id = pq.top().second; 135 | pq.pop(); 136 | 137 | if ((*guide_tree)[prof_id].first == -1) 138 | { 139 | gs = (*gapped_sequences)[prof_id]; 140 | prof1 = nullptr; 141 | prof2 = nullptr; 142 | no_threads = 1; 143 | no_rows_per_block = 0; 144 | } 145 | else 146 | { 147 | gs = nullptr; 148 | prof1 = (*profiles)[(*guide_tree)[prof_id].first]; 149 | prof2 = (*profiles)[(*guide_tree)[prof_id].second]; 150 | 151 | CheckAlignInParallel(prof1, prof2, no_threads, no_rows_per_block); 152 | } 153 | 154 | no_working_threads += no_threads; 155 | m_reserved_threads[prof_id] = no_threads; 156 | 157 | return true; 158 | } 159 | 160 | // ******************************************************************* 161 | void CProfileQueue::AddSolution(size_t prof_id, CProfile *prof) 162 | { 163 | lock_guard lck(mtx); 164 | 165 | if ((*guide_tree)[prof_id].first == -1) // Just construct profile from a sequence 166 | (*profiles)[prof_id] = prof; 167 | else 168 | { 169 | // Add new profile and remove old profiles 170 | (*profiles)[prof_id] = prof; 171 | 172 | profiles->erase((*guide_tree)[prof_id].first); 173 | profiles->erase((*guide_tree)[prof_id].second); 174 | } 175 | 176 | if (++ready_profiles[child_parent_mapping[prof_id]] == 2) // Profile is ready to be computed as both child profiles are already computed 177 | pq.push(make_pair(prof_depth[prof_id], child_parent_mapping[prof_id])); 178 | 179 | if (ready_profiles[0] == 1) // final profile was computed 180 | eoq_flag = true; 181 | 182 | no_working_threads -= m_reserved_threads[prof_id]; 
183 | m_reserved_threads.erase(prof_id); 184 | 185 | cv.notify_all(); 186 | } 187 | -------------------------------------------------------------------------------- /src/core/sequence.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _SEQUENCE_H 10 | #define _SEQUENCE_H 11 | 12 | #include "../core/defs.h" 13 | #include "../utils/memory_monotonic.h" 14 | #include "../utils/array.h" 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | using namespace std; 21 | using namespace refresh; 22 | 23 | // ******************************************************************* 24 | class CSequence 25 | { 26 | static char mapping_table[25]; 27 | 28 | public: 29 | uint32_t length; 30 | uint32_t data_size; 31 | symbol_t* data = nullptr; 32 | bit_vec_t *p_bit_masks; 33 | uint32_t p_bv_len; 34 | 35 | const int original_no; 36 | int sequence_no; 37 | string id; 38 | 39 | memory_monotonic_safe *mma; 40 | 41 | vector uppercase; 42 | vector> extra_symbols; 43 | 44 | public: 45 | CSequence() = delete; 46 | CSequence(const string& _id, const string& seq, int sequence_no = -1, memory_monotonic_safe *mma = nullptr); 47 | 48 | // sequences are not copyable 49 | CSequence(const CSequence& x) noexcept = delete; 50 | CSequence& operator=(const CSequence& x) noexcept = delete; 51 | 52 | CSequence(CSequence&& x) noexcept; 53 | CSequence& operator=(CSequence&& x) noexcept = delete; 54 | 55 | ~CSequence(); 56 | 57 | void DataResize(uint32_t new_size, symbol_t new_symbol); 58 | 59 | void ComputeBitMasks(); 60 | void ReleaseBitMasks(); 61 | //string DecodeSequence(); 62 | 63 | memory_monotonic_safe* get_mma() 64 | { 65 | return mma; 66 | } 67 | }; 68 | 69 | // 
******************************************************************* 70 | struct CSequenceView 71 | { 72 | uint32_t length; 73 | uint32_t padding1; 74 | symbol_t* data; 75 | }; 76 | 77 | // ******************************************************************* 78 | class CGappedSequence 79 | { 80 | static char mapping_table[25]; 81 | 82 | void RecalculateDPS(); 83 | void InitialiseDPS(); 84 | 85 | memory_monotonic_safe* mma = nullptr; 86 | 87 | public: 88 | symbol_t* symbols = nullptr; 89 | size_t size; 90 | size_t symbols_size; 91 | size_t gapped_size; 92 | 93 | size_t dps_size; 94 | size_t dps_size_div2; 95 | int original_no; 96 | int sequence_no; 97 | 98 | vector n_gaps; 99 | vector dps; // dynamic position statistics (DSP) for the sequence 100 | 101 | string id; 102 | vector uppercase; 103 | vector> extra_symbols; 104 | 105 | CGappedSequence() = delete; 106 | CGappedSequence(const string& _id, const string& seq, int seq_no=-1, memory_monotonic_safe *mma=nullptr); 107 | CGappedSequence(CSequence &&_sequence); 108 | CGappedSequence(const CGappedSequence &_gapped_sequence); 109 | CGappedSequence(CGappedSequence &&_gapped_sequence) noexcept; 110 | ~CGappedSequence(); 111 | 112 | bool operator==(const CGappedSequence &gs) const; 113 | 114 | void InsertGap(uint32_t pos); 115 | void InsertGaps(uint32_t pos, uint32_t n); 116 | void InsertGapsVector(const vector> &v_gaps); 117 | 118 | void RemoveGap(size_t pos); 119 | void RemoveGaps(size_t pos, uint32_t n); 120 | symbol_t GetSymbol(size_t pos); 121 | 122 | // void DecodeRaw(symbol_t *seq); 123 | string Decode(); 124 | uint32_t NoSymbols(); 125 | 126 | void InsertFront(symbol_t new_symbol); 127 | 128 | void Clear(); 129 | void ClearDPS(); 130 | }; 131 | #endif 132 | -------------------------------------------------------------------------------- /src/core/version.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 
licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _VERSION_H 10 | #define _VERSION_H 11 | 12 | #define FAMSA_VER "2.2.3" 13 | #define FAMSA_DATE "2024-09-17" 14 | #define FAMSA_AUTHORS "S. Deorowicz, A. Debudaj-Grabysz, A. Gudys" 15 | 16 | #endif 17 | 18 | /* 19 | Version history: 20 | 2.2.3 (2024-09-17): 21 | - Fixed rare bug in single linkage when analyzing sequences with zero similarity. Release deployment scripts added. 22 | 23 | 2.2.2 (2022-10-09): 24 | - Fixed slowdown caused by the duplicate removal (feature added in 2.1.0). 25 | 26 | 2.2.1 (2022-10-05): 27 | - Pairwise identity (-pid switch) properly calculated as the number of matching residues divided by the shorter sequence length. 28 | 29 | 2.2.0 (2022-10-05): 30 | - Added possibility to align two pre-aligned profiles (two input files specified). 31 | 32 | 2.1.3 (2022-09-30): 33 | - Fixed incorrect handling of single sequence sets (first residue cut) or sets containing only duplicates (hang). 34 | 35 | 2.1.2 (2022-08-04): 36 | - Makefile updates improving cross-platform compilation. 37 | 38 | 2.1.1 (2022-08-01): 39 | - Preserving non-standard amino acid symbols in the output alignment (instead of replacing with X). 40 | 41 | 2.1.0 (2022-07-26): 42 | - Duplicated sequences are removed prior to the alignment and restored afterwards. 43 | 44 | 2.0.4 (2022-07-25) 45 | - Fixed stack overflow exception when saving/loading large Newick trees 46 | (boost-based algorithm replaced with own non-recursive approach). 47 | 48 | 2.0.3 (2022-05-27): 49 | - The ordering of the input sequences preserved in the final alignment. 50 | 51 | 2.0.2 (2022-05-24): 52 | - Alignment allowed as an input (gaps are removed). 53 | 54 | 2.0.1 (2022-05-18): 55 | - Several fixes for bioconda. 
56 | 57 | 2.0.0-rc (2022-05-18): 58 | - Default algorithm for single linkage trees changed from SLINK to MST Prim, 59 | - Small fixes. 60 | 61 | 1.16.0 (2022-04-29): 62 | - mimalloc added as a single source library (no external linking). 63 | 64 | 1.15.0 (2022-04-28): 65 | 1.14.0 (2022-04-22): 66 | - Highly optimized MST Prim implementation. 67 | 68 | 1.13.0 (2022-04-20): 69 | - Added support of NEON SIMD extensions for ARM architectures. 70 | 71 | 1.12.5 (2022-04-13) 72 | - Further memory optimizations. 73 | - Some refactoring. 74 | - Warnings removal. 75 | 76 | 1.12.4 (2022-04-11) 77 | 1.12.3 (2022-04-11) 78 | - Further memory optimizations. 79 | 80 | 1.12.2 (2022-04-09) 81 | - Memory optimizations in gapped sequence representation. 82 | 83 | 1.12.1 (2022-04-08) 84 | - Non-AVX and 32-bit compilation fixed. 85 | - Proper handling of . and | symbols in sequence identifiers when importing Newick tree. 86 | 87 | 1.12.0 (2022-04-06) 88 | - Parallel medoid trees. 89 | 90 | 1.11.0 (2022-04-01) 91 | - Gzipped output. 92 | 93 | 1.10.0 (2022-03-24) 94 | - Uniform distance measures. 95 | 96 | 1.9.0 (2022-03-11) 97 | - Export of distance matrix significantly improved. 98 | 99 | 1.8.0 (2021-06-09) 100 | - Added MST Prim algorithm for single linkage trees. 101 | 102 | 1.7.0 (2021-06-08) 103 | - Parallel profile construction. 104 | - Multiple optimizations. 105 | - Single linkage draws resolution. 106 | 107 | 1.6.2 (2020-06-19) 108 | - Clang compilation fixed. 109 | 110 | 1.6.1 (2020-06-18) 111 | - Added parameter for automatic medoid tree usage. Some refactoring. Added license file. 112 | 113 | 1.6.0 (2020-06-18) 114 | - Removed VCL and ASMLIB dependencies. 115 | - Some low-level optimizations in LCS calculations. 116 | 117 | 1.5.20 (2020-05-26) 118 | - PartTree always selects assumed number of seeds (1.5.16 patch applied only to MedoidTree). 119 | - Small fix in Sackin index calculation.
120 | 121 | 1.5.19 (2020-04-18) 122 | - Uniform distance computation in PartTree and MedoidTree. 123 | 124 | 1.5.18 (2020-04-17) 125 | - Added modified UPGMA algorithm and distance correction (MAFFT-inspired). 126 | 127 | 1.5.17 (2020-04-16) 128 | - Stats dumped to file in the verbose mode. 129 | - Sackin index calculation right after tree construction. 130 | 131 | 1.5.16 (2020-04-10) 132 | - PartTree and MedoidTree always select assumed number of seeds. 133 | 134 | 1.5.15 (2020-04-08) 135 | - Fixed bug in calculating clustering cost and assignment update. 136 | 137 | 1.5.14 (2020-04-07) 138 | - Bug in Neighbor Joining fixed. 139 | 140 | 1.5.13 (2020-04-06) 141 | - Alternative method of combining children and parental trees in PartTree. 142 | - Neighbor Joining algorithm added. 143 | 144 | 1.5.12 (2020-03-24) 145 | - Fixed bug with -dist_export mode (sequences not ordered as in input FASTA file) 146 | 147 | 1.5.11 (2020-03-23) 148 | - Possibility to choose between PartTree and MedoidTree heuristic. 149 | - K-medoid clustering with CLARANS heuristic. 150 | 151 | 1.5.8 (2020-03-18) 152 | - Deterministic random generator added (to make Windows and Unix results the same). 153 | 154 | 1.5.7 (2020-03-17) 155 | - Further memory improvements. 156 | 157 | 1.5.6 (2020-03-15) 158 | - Serious refactoring. 159 | - Bit vectors computed when needed and released afterwards. 160 | 161 | 1.5.5 (2020-03-13) 162 | - Segmentation fault fix. 163 | 164 | 1.5.4 (2020-03-10) 165 | - Interface for tree import/export changed a bit. 166 | 167 | 1.5.3 (2020-03-03) 168 | - PartTree + UPGMA support. 169 | - Size of a cluster in PartTree added as a command line parameter. 170 | - Optimizations in the sequence representation. 171 | - Added option for sequence shuffling. 172 | 173 | 1.5.0 (2020-03-02) 174 | - PartTree mode added (only single linkage trees supported). 175 | - Since sequences are sorted, the first one can be taken as the longest one. 176 | - Serious refactoring.
177 | 178 | 1.4.0 (2020-02-27) 179 | - GPU mode no longer supported. 180 | - Possibility to calculate distance matrix or guide tree without doing alignment. 181 | - Lots of refactoring. 182 | 183 | 1.3.2 (2020-02-21) 184 | - Approved pull request "Fix * char emitted for unknown residues, emits X instead" 185 | - Approved pull request "Support single input fasta files" 186 | - Version.h file added 187 | 188 | */ 189 | -------------------------------------------------------------------------------- /src/famsa.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "./core/io_service.h" 16 | #include "./core/params.h" 17 | #include "./msa.h" 18 | #include "./utils/timer.h" 19 | #include "./utils/log.h" 20 | 21 | #include "./core/version.h" 22 | 23 | #define VAL(str) #str 24 | #define TOSTRING(str) VAL(str) 25 | 26 | #undef min 27 | #undef max 28 | 29 | 30 | // **************************************************************************** 31 | int main(int argc, char *argv[]) 32 | { 33 | bool ok = true; 34 | 35 | try { 36 | 37 | Log::getInstance(Log::LEVEL_NORMAL).enable(); 38 | 39 | LOG_NORMAL << "FAMSA (Fast and Accurate Multiple Sequence Alignment) \n" 40 | << " version " << FAMSA_VER 41 | #ifdef GIT_COMMIT 42 | << "-" << TOSTRING(GIT_COMMIT) 43 | #endif 44 | << " (" << FAMSA_DATE << ")\n" 45 | << " " << FAMSA_AUTHORS << "\n\n"; 46 | 47 | bool showExpert = 0; 48 | CParams params; 49 | 50 | if (!params.parse(argc, argv, showExpert)) { 51 | // some parameters could be parsed - used default values for printing 52 | CParams def_params; 53 | def_params.show_usage(showExpert); 54 | return 0; 55 | } 56 | 57 | 
CStopWatch timer, timer_saving; 58 | 59 | timer.StartTimer(); 60 | 61 | if (params.verbose_mode) { 62 | Log::getInstance(Log::LEVEL_VERBOSE).enable(); 63 | } 64 | if (params.very_verbose_mode) { 65 | Log::getInstance(Log::LEVEL_VERBOSE).enable(); 66 | Log::getInstance(Log::LEVEL_DEBUG).enable(); 67 | } 68 | 69 | memory_monotonic_safe mma(16 << 20, 64); 70 | vector result; 71 | vector sequences; 72 | 73 | // profile - profile alignment 74 | if (params.profile_aligning) { 75 | LOG_VERBOSE << "Aligning " << params.input_file_name << " with " << params.input_file_name_2 << "\n"; 76 | 77 | vector profile1; 78 | vector profile2; 79 | 80 | size_t size1 = IOService::loadFasta(params.input_file_name, profile1, &mma); 81 | size_t size2 = IOService::loadFasta(params.input_file_name_2, profile2, &mma); 82 | CFAMSA profile_aligner(params); 83 | 84 | profile_aligner.adjustParams((int)(size1 + size2)); 85 | profile_aligner.alignProfiles(profile1, profile2); 86 | 87 | profile_aligner.GetAlignment(result); 88 | 89 | IOService::saveAlignment(params.output_file_name, result, params.n_threads, 90 | params.gzippd_output ? 
params.gzip_level : -1); 91 | return 0; 92 | } 93 | 94 | LOG_VERBOSE << "Aligning " << params.input_file_name << "\n"; 95 | 96 | size_t input_seq_cnt = IOService::loadFasta(params.input_file_name, sequences, &mma); 97 | 98 | if (input_seq_cnt == 0) { 99 | // no sequences loaded - signal error 100 | throw(std::runtime_error("No (or incorrect) input file.")); 101 | } 102 | else { 103 | // at least one input sequences - run alignment 104 | CFAMSA famsa(params); 105 | famsa.getStatistics().put("input.n_sequences", sequences.size()); 106 | 107 | if (famsa.ComputeMSA(sequences)) { 108 | timer_saving.StartTimer(); 109 | 110 | // Save alignment if it was generated 111 | if (famsa.GetAlignment(result)) { 112 | 113 | famsa.getStatistics().put("alignment.length", result[0]->gapped_size); 114 | 115 | LOG_VERBOSE << "Saving alignment in " << params.output_file_name; 116 | ok = IOService::saveAlignment(params.output_file_name, result, params.n_threads, 117 | params.gzippd_output ? params.gzip_level : -1); 118 | 119 | LOG_VERBOSE << " [OK]" << endl; 120 | } 121 | 122 | timer_saving.StopTimer(); 123 | timer.StopTimer(); 124 | LOG_NORMAL << "Done!\n"; 125 | 126 | if (params.verbose_mode || params.very_verbose_mode) { 127 | famsa.getStatistics().put("time.save", timer_saving.GetElapsedTime()); 128 | famsa.getStatistics().put("time.total", timer.GetElapsedTime()); 129 | 130 | string stats = famsa.getStatistics().toString(); 131 | 132 | LOG_VERBOSE << endl << endl << "Statistics:" << endl << stats << endl; 133 | 134 | std::ofstream ofs("famsa.stats"); 135 | ofs << "[stats]" << endl << stats; 136 | ofs.close(); 137 | } 138 | 139 | } 140 | else { 141 | throw(std::runtime_error("Some interal error occured")); 142 | } 143 | } 144 | 145 | sequences.clear(); 146 | } 147 | catch (std::runtime_error& err) { 148 | LOG_NORMAL << endl << "[ERROR] " << err.what() << endl; 149 | ok = false; 150 | } 151 | 152 | return ok ? 
0 : -1; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /src/famsa.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {ef795228-abf5-48ab-8066-c443301d43ea} 6 | 7 | 8 | {feadd0e7-1814-4acf-8501-072e28f6762f} 9 | 10 | 11 | {2cea1071-a989-47b3-a25e-f9c804ce945f} 12 | 13 | 14 | {263818da-dcee-49b9-9610-b54e00e4d18b} 15 | 16 | 17 | {142c24dd-1f53-42a6-b0e5-a9aa8ea15d80} 18 | 19 | 20 | 21 | 22 | 23 | 24 | core 25 | 26 | 27 | lcs 28 | 29 | 30 | lcs 31 | 32 | 33 | utils 34 | 35 | 36 | utils 37 | 38 | 39 | core 40 | 41 | 42 | core 43 | 44 | 45 | core 46 | 47 | 48 | tree 49 | 50 | 51 | tree 52 | 53 | 54 | tree 55 | 56 | 57 | tree 58 | 59 | 60 | tree 61 | 62 | 63 | tree 64 | 65 | 66 | tree 67 | 68 | 69 | lcs 70 | 71 | 72 | lcs 73 | 74 | 75 | utils 76 | 77 | 78 | utils 79 | 80 | 81 | utils 82 | 83 | 84 | tree 85 | 86 | 87 | core 88 | 89 | 90 | core 91 | 92 | 93 | utils 94 | 95 | 96 | tree 97 | 98 | 99 | tree 100 | 101 | 102 | core 103 | 104 | 105 | lcs 106 | 107 | 108 | utils 109 | 110 | 111 | 112 | libraries 113 | 114 | 115 | 116 | 117 | 118 | utils 119 | 120 | 121 | core 122 | 123 | 124 | core 125 | 126 | 127 | lcs 128 | 129 | 130 | lcs 131 | 132 | 133 | utils 134 | 135 | 136 | utils 137 | 138 | 139 | utils 140 | 141 | 142 | core 143 | 144 | 145 | core 146 | 147 | 148 | core 149 | 150 | 151 | core 152 | 153 | 154 | core 155 | 156 | 157 | tree 158 | 159 | 160 | tree 161 | 162 | 163 | tree 164 | 165 | 166 | tree 167 | 168 | 169 | tree 170 | 171 | 172 | tree 173 | 174 | 175 | tree 176 | 177 | 178 | tree 179 | 180 | 181 | tree 182 | 183 | 184 | utils 185 | 186 | 187 | tree 188 | 189 | 190 | tree 191 | 192 | 193 | lcs 194 | 195 | 196 | lcs 197 | 198 | 199 | utils 200 | 201 | 202 | utils 203 | 204 | 205 | tree 206 | 207 | 208 | utils 209 | 210 | 211 | tree 212 | 213 | 214 | tree 215 | 216 | 217 | utils 218 | 219 | 220 | tree 221 | 222 | 223 | utils 224 | 225 | 
226 | lcs 227 | 228 | 229 | utils 230 | 231 | 232 | -------------------------------------------------------------------------------- /src/lcs/lcsbp.cpp: -------------------------------------------------------------------------------- 1 | #include "../core/sequence.h" 2 | #include "lcsbp.h" 3 | #include "lcsbp_classic.h" 4 | 5 | 6 | #if SIMD==SIMD_AVX1 || SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 7 | #include "lcsbp_avx_intr.h" 8 | #endif 9 | 10 | #if SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 11 | #include "lcsbp_avx2_intr.h" 12 | #endif 13 | 14 | #if SIMD==SIMD_NEON 15 | #include "lcsbp_neon_intr.h" 16 | #endif 17 | 18 | #include 19 | 20 | // ******************************************************************* 21 | CLCSBP::CLCSBP(instruction_set_t _instruction_set) 22 | { 23 | instruction_set = _instruction_set; 24 | 25 | lcsbp_classic = std::shared_ptr(new CLCSBP_Classic()); 26 | 27 | #if SIMD==SIMD_AVX1 || SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 28 | lcsbp_avx_intr = std::shared_ptr(new CLCSBP_AVX_INTR()); 29 | #endif 30 | 31 | #if SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 32 | lcsbp_avx2_intr = std::shared_ptr(new CLCSBP_AVX2_INTR()); 33 | #endif 34 | 35 | #if SIMD==SIMD_NEON 36 | lcsbp_neon_intr = std::shared_ptr(new CLCSBP_NEON_INTR()); 37 | #endif 38 | } 39 | 40 | // ******************************************************************* 41 | void CLCSBP::GetLCSBP(CSequence *seq0, CSequence *seq1, CSequence *seq2, CSequence *seq3, CSequence *seq4, 42 | uint32_t *dist) 43 | { 44 | if (seq4 == nullptr) 45 | { 46 | if (seq1 != nullptr) 47 | lcsbp_classic->Calculate(seq0, seq1, dist + 0); 48 | if (seq2 != nullptr) 49 | lcsbp_classic->Calculate(seq0, seq2, dist + 1); 50 | if (seq3 != nullptr) 51 | lcsbp_classic->Calculate(seq0, seq3, dist + 2); 52 | if (seq4 != nullptr) 53 | lcsbp_classic->Calculate(seq0, seq4, dist + 3); 54 | } 55 | else { 56 | #if SIMD==SIMD_NONE 57 | lcsbp_classic->Calculate(seq0, seq1, dist + 0); 58 | lcsbp_classic->Calculate(seq0, seq2, dist + 1); 59 | 
lcsbp_classic->Calculate(seq0, seq3, dist + 2); 60 | lcsbp_classic->Calculate(seq0, seq4, dist + 3); 61 | #endif 62 | 63 | #if SIMD==SIMD_AVX1 64 | if (instruction_set < instruction_set_t::avx) // In theory SSE2 will suffice, but the SSE2-compiled code is too slow 65 | { 66 | lcsbp_classic->Calculate(seq0, seq1, dist + 0); 67 | lcsbp_classic->Calculate(seq0, seq2, dist + 1); 68 | lcsbp_classic->Calculate(seq0, seq3, dist + 2); 69 | lcsbp_classic->Calculate(seq0, seq4, dist + 3); 70 | } 71 | else { 72 | lcsbp_avx_intr->Calculate(seq0, seq1, seq2, dist + 0); 73 | lcsbp_avx_intr->Calculate(seq0, seq3, seq4, dist + 2); 74 | } 75 | #endif 76 | 77 | #if SIMD==SIMD_AVX2 78 | if (instruction_set < instruction_set_t::avx) // In theory SSE2 will suffice, but the SSE2-compiled code is too slow 79 | { 80 | lcsbp_classic->Calculate(seq0, seq1, dist + 0); 81 | lcsbp_classic->Calculate(seq0, seq2, dist + 1); 82 | lcsbp_classic->Calculate(seq0, seq3, dist + 2); 83 | lcsbp_classic->Calculate(seq0, seq4, dist + 3); 84 | } 85 | else if (instruction_set < instruction_set_t::avx2) { 86 | lcsbp_avx_intr->Calculate(seq0, seq1, seq2, dist + 0); 87 | lcsbp_avx_intr->Calculate(seq0, seq3, seq4, dist + 2); 88 | } 89 | else { 90 | lcsbp_avx2_intr->Calculate(seq0, seq1, seq2, seq3, seq4, dist); 91 | } 92 | #endif 93 | 94 | #if SIMD==SIMD_NEON 95 | lcsbp_neon_intr->Calculate(seq0, seq1, seq2, dist + 0); 96 | lcsbp_neon_intr->Calculate(seq0, seq3, seq4, dist + 2); 97 | #endif 98 | } 99 | } 100 | 101 | // ******************************************************************* 102 | void CLCSBP::GetLCSBP(CSequence *seq0, CSequenceView *sv1, CSequenceView *sv2, CSequenceView *sv3, CSequenceView *sv4, 103 | uint32_t *dist) 104 | { 105 | if (sv4 == nullptr) 106 | { 107 | if (sv1 != nullptr) 108 | lcsbp_classic->Calculate(seq0, sv1, dist + 0); 109 | if (sv2 != nullptr) 110 | lcsbp_classic->Calculate(seq0, sv2, dist + 1); 111 | if (sv3 != nullptr) 112 | lcsbp_classic->Calculate(seq0, sv3, dist + 2); 113 | 
if (sv4 != nullptr) 114 | lcsbp_classic->Calculate(seq0, sv4, dist + 3); 115 | } 116 | else { 117 | #if SIMD==SIMD_NONE 118 | lcsbp_classic->Calculate(seq0, sv1, dist + 0); 119 | lcsbp_classic->Calculate(seq0, sv2, dist + 1); 120 | lcsbp_classic->Calculate(seq0, sv3, dist + 2); 121 | lcsbp_classic->Calculate(seq0, sv4, dist + 3); 122 | #endif 123 | 124 | #if SIMD==SIMD_AVX1 125 | if (instruction_set < instruction_set_t::avx) // In theory SSE2 will suffice, but the SSE2-compiled code is too slow 126 | { 127 | lcsbp_classic->Calculate(seq0, sv1, dist + 0); 128 | lcsbp_classic->Calculate(seq0, sv2, dist + 1); 129 | lcsbp_classic->Calculate(seq0, sv3, dist + 2); 130 | lcsbp_classic->Calculate(seq0, sv4, dist + 3); 131 | } 132 | else { 133 | lcsbp_avx_intr->Calculate(seq0, sv1, sv2, dist + 0); 134 | lcsbp_avx_intr->Calculate(seq0, sv3, sv4, dist + 2); 135 | } 136 | #endif 137 | 138 | #if SIMD==SIMD_AVX2 139 | if (instruction_set < instruction_set_t::avx) // In theory SSE2 will suffice, but the SSE2-compiled code is too slow 140 | { 141 | lcsbp_classic->Calculate(seq0, sv1, dist + 0); 142 | lcsbp_classic->Calculate(seq0, sv2, dist + 1); 143 | lcsbp_classic->Calculate(seq0, sv3, dist + 2); 144 | lcsbp_classic->Calculate(seq0, sv4, dist + 3); 145 | } 146 | else if (instruction_set < instruction_set_t::avx2) { 147 | lcsbp_avx_intr->Calculate(seq0, sv1, sv2, dist + 0); 148 | lcsbp_avx_intr->Calculate(seq0, sv3, sv4, dist + 2); 149 | } 150 | else { 151 | lcsbp_avx2_intr->Calculate(seq0, sv1, sv2, sv3, sv4, dist); 152 | } 153 | #endif 154 | 155 | #if SIMD==SIMD_NEON 156 | lcsbp_neon_intr->Calculate(seq0, sv1, sv2, dist + 0); 157 | lcsbp_neon_intr->Calculate(seq0, sv3, sv4, dist + 2); 158 | #endif 159 | } 160 | } 161 | 162 | #ifdef DEVELOPER_MODE 163 | // ******************************************************************* 164 | // Compute LCS length for two sequences in the classical way - just for development 165 | double CLCSBP::GetLCS(CSequence &seq1, CSequence &seq2) 166 | 
{ 167 | int **dp_row = new int*[2]; 168 | 169 | for (int i = 0; i < 2; ++i) 170 | dp_row[i] = new int[seq2.length + 1]; 171 | 172 | fill(dp_row[0], dp_row[0] + seq2.length + 1, 0); 173 | 174 | for (int i = 1; i <= (int)seq1.length; ++i) 175 | { 176 | int ii = i % 2; 177 | dp_row[ii][0] = 0; 178 | for (int j = 1; j <= (int)seq2.length; ++j) 179 | if (seq1.data[i - 1] == seq2.data[j - 1]) 180 | dp_row[ii][j] = dp_row[!ii][j - 1] + 1; 181 | else 182 | dp_row[ii][j] = max(dp_row[ii][j - 1], dp_row[!ii][j]); 183 | } 184 | 185 | return dp_row[seq1.length % 2][seq2.length]; 186 | } 187 | #endif 188 | -------------------------------------------------------------------------------- /src/lcs/lcsbp.h: -------------------------------------------------------------------------------- 1 | #ifndef _LSCBP_H 2 | #define _LSCBP_H 3 | 4 | #include 5 | #include "../core/defs.h" 6 | 7 | class CLCSBP_Classic; 8 | class CLCSBP_AVX_INTR; 9 | class CLCSBP_AVX2_INTR; 10 | class CLCSBP_NEON_INTR; 11 | 12 | class CSequence; 13 | struct CSequenceView; 14 | 15 | 16 | class CLCSBP 17 | { 18 | instruction_set_t instruction_set; 19 | 20 | std::shared_ptr lcsbp_classic; 21 | std::shared_ptr lcsbp_avx_intr; 22 | std::shared_ptr lcsbp_avx2_intr; 23 | std::shared_ptr lcsbp_neon_intr; 24 | 25 | public: 26 | CLCSBP(instruction_set_t _instruction_set = instruction_set_t::none); 27 | 28 | void GetLCSBP(CSequence *seq0, CSequence *seq1, CSequence *seq2, CSequence *seq3, CSequence *seq4, 29 | uint32_t *dist); 30 | void GetLCSBP(CSequence *seq0, CSequenceView *sv1, CSequenceView *sv2, CSequenceView *sv3, CSequenceView *sv4, 31 | uint32_t *dist); 32 | 33 | #ifdef DEVELOPER_MODE 34 | double GetLCS(CSequence &seq1, CSequence &seq2); 35 | #endif 36 | }; 37 | 38 | #endif -------------------------------------------------------------------------------- /src/lcs/lcsbp_classic.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software 
distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include "../lcs/lcsbp_classic.h" 10 | 11 | // ******************************************************************* 12 | // Prepares (if necessary) a sufficient amount of memory for the LCS calculation: grows X to bv_len words, discarding its old contents 13 | void CLCSBP_Classic::prepare_X(uint32_t bv_len) 14 | { 15 | if (bv_len > X_size) // the buffer only ever grows; a smaller request reuses the existing allocation 16 | { 17 | if (X) 18 | delete[] X; 19 | 20 | X_size = bv_len; 21 | X = new bit_vec_t[X_size]; 22 | } 23 | } 24 | 25 | // ******************************************************************* 26 | void CLCSBP_Classic::prefetch_bitmasks(CSequence *seq0) 27 | { // Caches in s0bm[i] a pointer to the i-th row (p_bv_len words each) of seq0->p_bit_masks 28 | if (seq0 == pf_seq0) // same sequence object as in the previous call: keep the cached row pointers 29 | return; 30 | // NOTE(review): the cache is keyed on the pointer only - presumably seq0's bit masks are not reallocated between calls; confirm 31 | pf_seq0 = seq0; 32 | 33 | for (int i = 0; i < (int) NO_SYMBOLS; ++i) { 34 | s0bm[i] = seq0->p_bit_masks + i * seq0->p_bv_len; 35 | } 36 | } 37 | 38 | // ******************************************************************* 39 | void CLCSBP_Classic::Calculate(CSequence *seq0, CSequence *seq1, 40 | uint32_t *dist) 41 | { // Scalar kernel for one pair (seq0, seq1); dist[0] is reset here and then filled by the selected implementation 42 | uint32_t bv_len = (seq0->length + bv_size - 1) / bv_size; // number of bit-vector words needed for seq0 (rounded up) 43 | 44 | prepare_X(bv_len); 45 | prefetch_bitmasks(seq0); 46 | 47 | dist[0] = 0; 48 | // bv_len 1..32 uses an implementation instantiated with BV_LEN as a template argument; other lengths fall to the default case 49 | switch (bv_len) 50 | { 51 | case 1: CLCSBP_Classic_Impl<1, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 52 | case 2: CLCSBP_Classic_Impl<2, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 53 | case 3: CLCSBP_Classic_Impl<3, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 54 | case 4: CLCSBP_Classic_Impl<4, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 55 | case 5: CLCSBP_Classic_Impl<5, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 56 | case 6: CLCSBP_Classic_Impl<6, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 57 | case 7: CLCSBP_Classic_Impl<7, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm);
break; 58 | case 8: CLCSBP_Classic_Impl<8, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 59 | case 9: CLCSBP_Classic_Impl<9, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 60 | case 10: CLCSBP_Classic_Impl<10, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 61 | case 11: CLCSBP_Classic_Impl<11, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 62 | case 12: CLCSBP_Classic_Impl<12, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 63 | case 13: CLCSBP_Classic_Impl<13, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 64 | case 14: CLCSBP_Classic_Impl<14, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 65 | case 15: CLCSBP_Classic_Impl<15, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 66 | case 16: CLCSBP_Classic_Impl<16, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 67 | case 17: CLCSBP_Classic_Impl<17, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 68 | case 18: CLCSBP_Classic_Impl<18, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 69 | case 19: CLCSBP_Classic_Impl<19, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 70 | case 20: CLCSBP_Classic_Impl<20, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 71 | case 21: CLCSBP_Classic_Impl<21, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 72 | case 22: CLCSBP_Classic_Impl<22, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 73 | case 23: CLCSBP_Classic_Impl<23, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 74 | case 24: CLCSBP_Classic_Impl<24, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 75 | case 25: CLCSBP_Classic_Impl<25, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 76 | case 26: CLCSBP_Classic_Impl<26, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 77 | case 27: CLCSBP_Classic_Impl<27, 
CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 78 | case 28: CLCSBP_Classic_Impl<28, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 79 | case 29: CLCSBP_Classic_Impl<29, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 80 | case 30: CLCSBP_Classic_Impl<30, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 81 | case 31: CLCSBP_Classic_Impl<31, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 82 | case 32: CLCSBP_Classic_Impl<32, CSequence>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 83 | default: 84 | CLCSBP_Classic_Impl<1, CSequence>::LoopCalculate(seq0, seq1, dist, bv_len, X, s0bm); 85 | } 86 | } 87 | 88 | // ******************************************************************* 89 | void CLCSBP_Classic::Calculate(CSequence *seq0, CSequenceView *seq1, 90 | uint32_t *dist) 91 | { 92 | uint32_t bv_len = (seq0->length + bv_size - 1) / bv_size; 93 | 94 | prepare_X(bv_len); 95 | prefetch_bitmasks(seq0); 96 | 97 | dist[0] = 0; 98 | 99 | switch (bv_len) 100 | { 101 | case 1: CLCSBP_Classic_Impl<1, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 102 | case 2: CLCSBP_Classic_Impl<2, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 103 | case 3: CLCSBP_Classic_Impl<3, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 104 | case 4: CLCSBP_Classic_Impl<4, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 105 | case 5: CLCSBP_Classic_Impl<5, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 106 | case 6: CLCSBP_Classic_Impl<6, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 107 | case 7: CLCSBP_Classic_Impl<7, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 108 | case 8: CLCSBP_Classic_Impl<8, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 109 | case 9: CLCSBP_Classic_Impl<9, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, 
s0bm); break; 110 | case 10: CLCSBP_Classic_Impl<10, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 111 | case 11: CLCSBP_Classic_Impl<11, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 112 | case 12: CLCSBP_Classic_Impl<12, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 113 | case 13: CLCSBP_Classic_Impl<13, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 114 | case 14: CLCSBP_Classic_Impl<14, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 115 | case 15: CLCSBP_Classic_Impl<15, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 116 | case 16: CLCSBP_Classic_Impl<16, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 117 | case 17: CLCSBP_Classic_Impl<17, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 118 | case 18: CLCSBP_Classic_Impl<18, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 119 | case 19: CLCSBP_Classic_Impl<19, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 120 | case 20: CLCSBP_Classic_Impl<20, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 121 | case 21: CLCSBP_Classic_Impl<21, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 122 | case 22: CLCSBP_Classic_Impl<22, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 123 | case 23: CLCSBP_Classic_Impl<23, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 124 | case 24: CLCSBP_Classic_Impl<24, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 125 | case 25: CLCSBP_Classic_Impl<25, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 126 | case 26: CLCSBP_Classic_Impl<26, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 127 | case 27: CLCSBP_Classic_Impl<27, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 128 | case 28: CLCSBP_Classic_Impl<28, 
CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 129 | case 29: CLCSBP_Classic_Impl<29, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 130 | case 30: CLCSBP_Classic_Impl<30, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 131 | case 31: CLCSBP_Classic_Impl<31, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 132 | case 32: CLCSBP_Classic_Impl<32, CSequenceView>::UnrolledCalculate(seq0, seq1, dist, X, s0bm); break; 133 | default: 134 | CLCSBP_Classic_Impl<1, CSequenceView>::LoopCalculate(seq0, seq1, dist, bv_len, X, s0bm); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/lcs/lcsbp_classic.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _LCSBP_CLASSIC_H 10 | #define _LCSBP_CLASSIC_H 11 | 12 | #include "../core/sequence.h" 13 | #include "../utils/meta_oper.h" 14 | 15 | template class CLCSBP_Classic_Impl; 16 | 17 | using bit_vec_iterator_t = bit_vec_t*; 18 | 19 | class CLCSBP_Classic 20 | { 21 | bit_vec_t *X; 22 | uint32_t X_size; 23 | 24 | CSequence *pf_seq0; 25 | bit_vec_iterator_t s0bm[NO_SYMBOLS]; 26 | 27 | inline void prepare_X(uint32_t bv_len); 28 | void prefetch_bitmasks(CSequence* seq0); 29 | 30 | public: 31 | CLCSBP_Classic() { 32 | X_size = 0; 33 | X = nullptr; 34 | 35 | pf_seq0 = nullptr; 36 | }; 37 | 38 | ~CLCSBP_Classic() { 39 | if (X) 40 | delete[] X; 41 | }; 42 | 43 | void Calculate(CSequence* seq0, CSequence* seq1, 44 | uint32_t* dist); 45 | void Calculate(CSequence* seq0, CSequenceView* seq1, 46 | uint32_t* dist); 47 | }; 48 | 49 | template class CLCSBP_Classic_Impl { 50 | public: 51 | #define CLASSIC_LCS_INNER_LOOP 
\ 52 | { \ 53 | V = *pX; \ 54 | tB = V & *s0b++; \ 55 | V2 = V + tB + sB; \ 56 | sB = V2 < V; \ 57 | *pX++ = V2 | (V - tB); \ 58 | } 59 | 60 | #define CLASSIC_POP_CNT_LOOP \ 61 | { \ 62 | for (V = ~*pX; V; V &= V - 1) \ 63 | ++res[0]; \ 64 | pX++; \ 65 | } 66 | 67 | static void LoopCalculate(CSequence* seq0, SeqType* seq1, uint32_t* res, uint32_t bv_len, bit_vec_t* X, bit_vec_iterator_t* s0bm) 68 | { 69 | bit_vec_t V, tB, V2, sB; 70 | 71 | for (size_t i = 0; i < bv_len; ++i) 72 | X[i] = ~(uint64_t)0; 73 | 74 | auto pc = seq1->data; 75 | 76 | for (size_t i = 0; i < seq1->length; ++i) 77 | { 78 | sB = (bit_vec_t)0; 79 | auto s0b = s0bm[*pc]; 80 | auto pX = X; 81 | 82 | if (*pc++ == UNKNOWN_SYMBOL) // Unknown aminoacid 83 | continue; 84 | 85 | for (size_t j = 0; j < bv_len; ++j) 86 | { 87 | V = *pX; 88 | tB = V & *s0b++; 89 | V2 = V + tB + sB; 90 | sB = V2 < V; 91 | *pX++ = V2 | (V - tB); 92 | } 93 | } 94 | 95 | for (size_t i = 0; i < bv_len; ++i) 96 | for (V = ~X[i]; V; V &= V - 1) 97 | ++res[0]; 98 | } 99 | 100 | 101 | 102 | static void UnrolledCalculate(CSequence *seq0, SeqType *seq1, uint32_t *res, bit_vec_t *X, bit_vec_iterator_t *s0bm) 103 | { 104 | bit_vec_t V, tB, V2, sB; 105 | 106 | auto pc = seq1->data; 107 | auto pX0 = X; 108 | 109 | if (BV_LEN > 0) *pX0++ = ~(uint64_t)0; 110 | if (BV_LEN > 1) *pX0++ = ~(uint64_t)0; 111 | if (BV_LEN > 2) *pX0++ = ~(uint64_t)0; 112 | if (BV_LEN > 3) *pX0++ = ~(uint64_t)0; 113 | if (BV_LEN > 4) *pX0++ = ~(uint64_t)0; 114 | if (BV_LEN > 5) *pX0++ = ~(uint64_t)0; 115 | if (BV_LEN > 6) *pX0++ = ~(uint64_t)0; 116 | if (BV_LEN > 7) *pX0++ = ~(uint64_t)0; 117 | if (BV_LEN > 8) *pX0++ = ~(uint64_t)0; 118 | if (BV_LEN > 9) *pX0++ = ~(uint64_t)0; 119 | if (BV_LEN > 10) *pX0++ = ~(uint64_t)0; 120 | if (BV_LEN > 11) *pX0++ = ~(uint64_t)0; 121 | if (BV_LEN > 12) *pX0++ = ~(uint64_t)0; 122 | if (BV_LEN > 13) *pX0++ = ~(uint64_t)0; 123 | if (BV_LEN > 14) *pX0++ = ~(uint64_t)0; 124 | if (BV_LEN > 15) *pX0++ = ~(uint64_t)0; 125 | if (BV_LEN 
> 16) *pX0++ = ~(uint64_t)0; 126 | if (BV_LEN > 17) *pX0++ = ~(uint64_t)0; 127 | if (BV_LEN > 18) *pX0++ = ~(uint64_t)0; 128 | if (BV_LEN > 19) *pX0++ = ~(uint64_t)0; 129 | if (BV_LEN > 20) *pX0++ = ~(uint64_t)0; 130 | if (BV_LEN > 21) *pX0++ = ~(uint64_t)0; 131 | if (BV_LEN > 22) *pX0++ = ~(uint64_t)0; 132 | if (BV_LEN > 23) *pX0++ = ~(uint64_t)0; 133 | if (BV_LEN > 24) *pX0++ = ~(uint64_t)0; 134 | if (BV_LEN > 25) *pX0++ = ~(uint64_t)0; 135 | if (BV_LEN > 26) *pX0++ = ~(uint64_t)0; 136 | if (BV_LEN > 27) *pX0++ = ~(uint64_t)0; 137 | if (BV_LEN > 28) *pX0++ = ~(uint64_t)0; 138 | if (BV_LEN > 29) *pX0++ = ~(uint64_t)0; 139 | if (BV_LEN > 30) *pX0++ = ~(uint64_t)0; 140 | if (BV_LEN > 31) *pX0++ = ~(uint64_t)0; 141 | 142 | for (size_t i = 0; i < seq1->length; ++i) 143 | { 144 | sB = (bit_vec_t)0; 145 | 146 | auto pX = X; 147 | auto s0b = s0bm[*pc]; 148 | 149 | if (*pc++ == UNKNOWN_SYMBOL) // Unknown aminoacid 150 | continue; 151 | 152 | if (BV_LEN > 0) CLASSIC_LCS_INNER_LOOP; 153 | if (BV_LEN > 1) CLASSIC_LCS_INNER_LOOP; 154 | if (BV_LEN > 2) CLASSIC_LCS_INNER_LOOP; 155 | if (BV_LEN > 3) CLASSIC_LCS_INNER_LOOP; 156 | if (BV_LEN > 4) CLASSIC_LCS_INNER_LOOP; 157 | if (BV_LEN > 5) CLASSIC_LCS_INNER_LOOP; 158 | if (BV_LEN > 6) CLASSIC_LCS_INNER_LOOP; 159 | if (BV_LEN > 7) CLASSIC_LCS_INNER_LOOP; 160 | if (BV_LEN > 8) CLASSIC_LCS_INNER_LOOP; 161 | if (BV_LEN > 9) CLASSIC_LCS_INNER_LOOP; 162 | if (BV_LEN > 10) CLASSIC_LCS_INNER_LOOP; 163 | if (BV_LEN > 11) CLASSIC_LCS_INNER_LOOP; 164 | if (BV_LEN > 12) CLASSIC_LCS_INNER_LOOP; 165 | if (BV_LEN > 13) CLASSIC_LCS_INNER_LOOP; 166 | if (BV_LEN > 14) CLASSIC_LCS_INNER_LOOP; 167 | if (BV_LEN > 15) CLASSIC_LCS_INNER_LOOP; 168 | if (BV_LEN > 16) CLASSIC_LCS_INNER_LOOP; 169 | if (BV_LEN > 17) CLASSIC_LCS_INNER_LOOP; 170 | if (BV_LEN > 18) CLASSIC_LCS_INNER_LOOP; 171 | if (BV_LEN > 19) CLASSIC_LCS_INNER_LOOP; 172 | if (BV_LEN > 20) CLASSIC_LCS_INNER_LOOP; 173 | if (BV_LEN > 21) CLASSIC_LCS_INNER_LOOP; 174 | if (BV_LEN > 22) 
CLASSIC_LCS_INNER_LOOP; 175 | if (BV_LEN > 23) CLASSIC_LCS_INNER_LOOP; 176 | if (BV_LEN > 24) CLASSIC_LCS_INNER_LOOP; 177 | if (BV_LEN > 25) CLASSIC_LCS_INNER_LOOP; 178 | if (BV_LEN > 26) CLASSIC_LCS_INNER_LOOP; 179 | if (BV_LEN > 27) CLASSIC_LCS_INNER_LOOP; 180 | if (BV_LEN > 28) CLASSIC_LCS_INNER_LOOP; 181 | if (BV_LEN > 29) CLASSIC_LCS_INNER_LOOP; 182 | if (BV_LEN > 30) CLASSIC_LCS_INNER_LOOP; 183 | if (BV_LEN > 31) CLASSIC_LCS_INNER_LOOP; 184 | } 185 | 186 | 187 | auto pX = X; 188 | 189 | if (BV_LEN > 0) CLASSIC_POP_CNT_LOOP; 190 | if (BV_LEN > 1) CLASSIC_POP_CNT_LOOP; 191 | if (BV_LEN > 2) CLASSIC_POP_CNT_LOOP; 192 | if (BV_LEN > 3) CLASSIC_POP_CNT_LOOP; 193 | if (BV_LEN > 4) CLASSIC_POP_CNT_LOOP; 194 | if (BV_LEN > 5) CLASSIC_POP_CNT_LOOP; 195 | if (BV_LEN > 6) CLASSIC_POP_CNT_LOOP; 196 | if (BV_LEN > 7) CLASSIC_POP_CNT_LOOP; 197 | if (BV_LEN > 8) CLASSIC_POP_CNT_LOOP; 198 | if (BV_LEN > 9) CLASSIC_POP_CNT_LOOP; 199 | if (BV_LEN > 10) CLASSIC_POP_CNT_LOOP; 200 | if (BV_LEN > 11) CLASSIC_POP_CNT_LOOP; 201 | if (BV_LEN > 12) CLASSIC_POP_CNT_LOOP; 202 | if (BV_LEN > 13) CLASSIC_POP_CNT_LOOP; 203 | if (BV_LEN > 14) CLASSIC_POP_CNT_LOOP; 204 | if (BV_LEN > 15) CLASSIC_POP_CNT_LOOP; 205 | if (BV_LEN > 16) CLASSIC_POP_CNT_LOOP; 206 | if (BV_LEN > 17) CLASSIC_POP_CNT_LOOP; 207 | if (BV_LEN > 18) CLASSIC_POP_CNT_LOOP; 208 | if (BV_LEN > 19) CLASSIC_POP_CNT_LOOP; 209 | if (BV_LEN > 20) CLASSIC_POP_CNT_LOOP; 210 | if (BV_LEN > 21) CLASSIC_POP_CNT_LOOP; 211 | if (BV_LEN > 22) CLASSIC_POP_CNT_LOOP; 212 | if (BV_LEN > 23) CLASSIC_POP_CNT_LOOP; 213 | if (BV_LEN > 24) CLASSIC_POP_CNT_LOOP; 214 | if (BV_LEN > 25) CLASSIC_POP_CNT_LOOP; 215 | if (BV_LEN > 26) CLASSIC_POP_CNT_LOOP; 216 | if (BV_LEN > 27) CLASSIC_POP_CNT_LOOP; 217 | if (BV_LEN > 28) CLASSIC_POP_CNT_LOOP; 218 | if (BV_LEN > 29) CLASSIC_POP_CNT_LOOP; 219 | if (BV_LEN > 30) CLASSIC_POP_CNT_LOOP; 220 | if (BV_LEN > 31) CLASSIC_POP_CNT_LOOP; 221 | } 222 | }; 223 | 224 | #endif 225 | 
-------------------------------------------------------------------------------- /src/msa.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _MSA_H 10 | #define _MSA_H 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./core/queues.h" 21 | #include "./core/sequence.h" 22 | #include "./core/profile.h" 23 | #include "./core/params.h" 24 | #include "./utils/timer.h" 25 | #include "./utils/statistics.h" 26 | 27 | #include "./lcs/lcsbp.h" 28 | #include "./lcs/lcsbp_classic.h" 29 | 30 | class AbstractTreeGenerator; 31 | 32 | class CFAMSA 33 | { 34 | protected: 35 | static const int TIMER_SORTING = 0; 36 | static const int TIMER_TREE_BUILD = 1; 37 | static const int TIMER_ALIGNMENT = 2; 38 | static const int TIMER_REFINMENT = 3; 39 | static const int TIMER_TREE_STORE = 4; 40 | 41 | static double SM_MIQS[24][24]; 42 | 43 | CParams params; 44 | instruction_set_t instruction_set; 45 | 46 | vector> score_matrix; 47 | vector score_vector; 48 | 49 | vector gapped_sequences; 50 | 51 | map profiles; 52 | CProfile *final_profile; 53 | 54 | mt19937 rnd_rfn; 55 | 56 | set already_refined; 57 | 58 | CStopWatch timers[5]; 59 | 60 | Statistics statistics; 61 | 62 | #ifdef DEBUG_MODE 63 | double estimated_identity; 64 | #endif 65 | 66 | void initScoreMatrix(); 67 | 68 | #ifdef DEVELOPER_MODE 69 | vector ref_sequences; 70 | bool LoadRefSequences(); 71 | #endif 72 | 73 | void RefineRandom(CProfile* profile_to_refine, vector &dest_prof_id); 74 | void RefineMostEmptyAndFullColumn(CProfile *profile_to_refine, vector &dest_prof_id, vector &gap_stats, bool valid_gap_stats); 75 | 76 | std::shared_ptr createTreeGenerator(const 
CParams& params); 77 | 78 | void sortAndExtendSequences(std::vector& sequences); 79 | void extendSequences(std::vector& sequences); 80 | void shrinkSequences(std::vector& sequences); 81 | void removeDuplicates(std::vector& sorted_seqs, std::vector& original2sorted); 82 | 83 | public: 84 | 85 | 86 | CFAMSA(CParams& _params); 87 | ~CFAMSA(); 88 | 89 | CProfile* ComputeAlignment(std::vector& gapped_sequences, tree_structure& guide_tree); 90 | #ifdef DEBUG_MODE 91 | bool RefineAlignment(string output_file_name = ""); 92 | #else 93 | bool RefineAlignment(CProfile *&profile_to_refine); 94 | #endif 95 | 96 | bool GetAlignment(vector &result); 97 | void adjustParams(int n_seqs); 98 | 99 | score_t GetScore() { return final_profile != nullptr ? final_profile->total_score : 0; } 100 | 101 | const Statistics& getStatistics() const { return statistics; } 102 | Statistics& getStatistics() { return statistics; } 103 | 104 | #ifdef DEBUG_MODE 105 | double GetEstimatedIdentity() 106 | { 107 | return estimated_identity; 108 | }; 109 | #endif 110 | 111 | bool ComputeMSA(vector& sequences); 112 | 113 | bool alignProfiles(vector& p1, vector& p2); 114 | }; 115 | 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /src/msa_refinement.cpp: -------------------------------------------------------------------------------- 1 | #include "msa.h" 2 | 3 | #include "./utils/log.h" 4 | #include 5 | 6 | // ******************************************************************* 7 | void CFAMSA::RefineRandom(CProfile* profile_to_refine, vector& dest_prof_id) 8 | { 9 | for (size_t i = 0; i < profile_to_refine->data.size(); ++i) 10 | dest_prof_id.emplace_back(rnd_rfn() % 2); 11 | 12 | if (count(dest_prof_id.begin(), dest_prof_id.end(), 0) == 0 || 13 | count(dest_prof_id.begin(), dest_prof_id.end(), 1) == 0) // Both profiles must contain at least 1 sequence 14 | { 15 | size_t id = rnd_rfn() % dest_prof_id.size(); 16 | dest_prof_id[id] = !dest_prof_id[id]; 
17 | } 18 | } 19 | 20 | // ******************************************************************* 21 | void CFAMSA::RefineMostEmptyAndFullColumn(CProfile* profile_to_refine, vector& dest_prof_id, vector& gap_stats, bool valid_gap_stats) 22 | { 23 | size_t size = profile_to_refine->data.front()->gapped_size; 24 | size_t card = profile_to_refine->data.size(); 25 | 26 | dest_prof_id.clear(); 27 | 28 | if (!valid_gap_stats) 29 | profile_to_refine->GetGapStats(gap_stats); 30 | 31 | vector> tmp; 32 | 33 | for (size_t i = 1; i <= size; ++i) 34 | { 35 | int x = (int)min(gap_stats[i], card - gap_stats[i]); 36 | if (x > 0) 37 | tmp.emplace_back(i, x); 38 | } 39 | 40 | stable_sort(tmp.begin(), tmp.end(), [](const pair& x, const pair& y) { 41 | if (x.second != y.second) 42 | return x.second < y.second; 43 | else 44 | return x.first < y.first; 45 | }); 46 | 47 | if (tmp.empty()) 48 | { 49 | RefineRandom(profile_to_refine, dest_prof_id); 50 | return; 51 | } 52 | 53 | size_t col_id = tmp[rnd_rfn() % tmp.size()].first; 54 | 55 | int first_prof_id = 0; 56 | int second_prof_id = 1; 57 | 58 | if (profile_to_refine->data[0]->GetSymbol(col_id) == GAP) 59 | swap(first_prof_id, second_prof_id); 60 | 61 | for (size_t j = 0; j < card; ++j) 62 | if (profile_to_refine->data[j]->GetSymbol(col_id) == GAP) 63 | dest_prof_id.emplace_back(first_prof_id); 64 | else 65 | dest_prof_id.emplace_back(second_prof_id); 66 | } 67 | 68 | // ******************************************************************* 69 | // Refine alignment 70 | #ifdef DEBUG_MODE 71 | bool CFAMSA::RefineAlignment(string output_file_name) 72 | #else 73 | bool CFAMSA::RefineAlignment(CProfile*& profile_to_refine) 74 | #endif 75 | { 76 | // Restart generator 77 | rnd_rfn.seed(5489u); 78 | 79 | if (params.refinement_mode == Refinement::OFF || 80 | (params.refinement_mode == Refinement::AUTO && profile_to_refine->Size() > params.thr_refinement)) { 81 | return true; 82 | } 83 | 84 | size_t n_ref = params.n_refinements; 85 | size_t n_seq = 
profile_to_refine->Size(); 86 | 87 | vector gap_stats; 88 | 89 | if (n_ref > 2 * n_seq) 90 | n_ref = 2 * n_seq; 91 | if (n_ref > 0 && n_ref < 100 && n_seq < 100) 92 | n_ref = 100; 93 | 94 | #ifdef DEBUG_MODE 95 | FILE* f_stat; 96 | 97 | if (output_file_name != "") 98 | { 99 | vector result; 100 | GetAlignment(result); 101 | 102 | COutputFile out_file; 103 | 104 | out_file.PutSequences(result); 105 | out_file.SaveFile(output_file_name + to_string(0)); 106 | 107 | f_stat = fopen((output_file_name + "_stats").c_str(), "wt"); 108 | 109 | fprintf(f_stat, "%d %f\n", final_profile->width, final_profile->CalculateTotalScore()); 110 | } 111 | #endif 112 | 113 | int n_ref_succ = 0; 114 | score_t prev_total_score = profile_to_refine->CalculateTotalScore(); 115 | 116 | sort(profile_to_refine->data.begin(), profile_to_refine->data.end(), [](CGappedSequence* p, CGappedSequence* q) {return p->id < q->id; }); 117 | 118 | vector dest_prof_id; 119 | vector> old_dest_prof_ids; 120 | 121 | vector column_mapping1, column_mapping2; 122 | 123 | size_t i_ref; 124 | size_t i_succ_ref; 125 | bool valid_gap_stats = false; 126 | #ifdef DEBUG_MODE 127 | int ref_upd[2] = { 0 }; 128 | int hist_size[20] = { 0 }; 129 | #endif 130 | 131 | for (i_ref = i_succ_ref = 0; i_succ_ref < n_ref && i_ref < 20 * n_ref; ++i_ref) 132 | { 133 | LOG_DEBUG << "Computing refinement - " << fixed << setprecision(1) << 100.0 * (double)i_succ_ref / (double)n_ref << "% (" << i_succ_ref << " of " << n_ref << ") \r"; 134 | 135 | CProfile profile1(¶ms), profile2(¶ms); 136 | 137 | RefineMostEmptyAndFullColumn(profile_to_refine, dest_prof_id, gap_stats, valid_gap_stats); 138 | valid_gap_stats = true; 139 | 140 | if (find(old_dest_prof_ids.begin(), old_dest_prof_ids.end(), dest_prof_id) == old_dest_prof_ids.end()) 141 | { 142 | // Split into two profiles 143 | for (size_t i = 0; i < profile_to_refine->data.size(); ++i) 144 | if (dest_prof_id[i]) 145 | profile1.AppendRawSequence(*profile_to_refine->data[i]); 146 | else 147 | 
profile2.AppendRawSequence(*profile_to_refine->data[i]); 148 | 149 | // Condense the profiles (remove empty columns) 150 | profile1.Condense(column_mapping1); 151 | profile2.Condense(column_mapping2); 152 | 153 | profile1.OptimizeGaps(); 154 | profile2.OptimizeGaps(); 155 | 156 | profile1.Size(); 157 | profile2.Size(); 158 | 159 | #ifdef DEBUG_MODE 160 | int size_min = min(p1_size, p2_size); 161 | hist_size[min(9, size_min)]++; 162 | #endif 163 | 164 | CProfile* prof = new CProfile(¶ms); 165 | 166 | // TODO: Enable parallelization here! 167 | prof->Align(&profile1, &profile2, 1, 0, &column_mapping1, &column_mapping2); 168 | sort(prof->data.begin(), prof->data.end(), [](CGappedSequence* p, CGappedSequence* q) {return p->id < q->id; }); 169 | 170 | if (!(*prof == *profile_to_refine)) // if the new profile is the same as previous do not score it 171 | { 172 | prof->CalculateTotalScore(); 173 | #ifdef DEBUG_MODE 174 | ref_upd[0]++; 175 | #endif 176 | 177 | if (prof->total_score >= prev_total_score) 178 | { 179 | prev_total_score = prof->total_score; 180 | swap(profile_to_refine, prof); 181 | ++n_ref_succ; 182 | old_dest_prof_ids.clear(); 183 | valid_gap_stats = false; 184 | #ifdef DEBUG_MODE 185 | ref_upd[1]++; 186 | hist_size[10 + min(9, size_min)]++; 187 | #endif 188 | } 189 | } 190 | 191 | delete prof; 192 | 193 | old_dest_prof_ids.emplace_back(dest_prof_id); 194 | i_succ_ref++; 195 | 196 | #ifdef DEBUG_MODE 197 | if (output_file_name != "") 198 | { 199 | vector result; 200 | GetAlignment(result); 201 | 202 | COutputFile out_file; 203 | 204 | out_file.PutSequences(result); 205 | out_file.SaveFile(output_file_name + to_string(i_ref + 1)); 206 | fprintf(f_stat, "%d %f p.sizes: %5d %5d\n", final_profile->width, final_profile->total_score, p1_size, p2_size); 207 | } 208 | #endif 209 | } 210 | } 211 | 212 | #ifdef DEBUG_MODE 213 | if (output_file_name != "") 214 | fclose(f_stat); 215 | #endif 216 | 217 | return true; 218 | } 
-------------------------------------------------------------------------------- /src/tree/AbstractTreeGenerator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #include "AbstractTreeGenerator.h" 9 | 10 | #if SIMD==SIMD_AVX1 || SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 11 | #include "../utils/cpuid.h" 12 | #endif 13 | 14 | #include 15 | 16 | using namespace std; 17 | 18 | 19 | // ******************************************************************* 20 | AbstractTreeGenerator::AbstractTreeGenerator(int n_threads, instruction_set_t instruction_set) 21 | : n_threads(n_threads), instruction_set(instruction_set) { 22 | 23 | } 24 | 25 | // ******************************************************************* 26 | void AbstractTreeGenerator::operator()(std::vector& sequences, tree_structure& tree) 27 | { 28 | tree.clear(); 29 | tree.resize(sequences.size(), std::make_pair(-1, -1)); 30 | 31 | // build the tree 32 | run(sequences, tree); 33 | } 34 | 35 | 36 | // ******************************************************************* 37 | #ifdef DEVELOPER_MODE 38 | size_t AbstractTreeGenerator::refSequencesSubTreeSize( 39 | const vector& sequences, 40 | const vector& ref_sequences, 41 | double *monte_carlo_subtree_size) 42 | { // Developer-only: returns subTreeSize() for the positions in sequences whose id matches a reference sequence (1 for a single reference); if monte_carlo_subtree_size is non-null it receives the mean over monte_carlo_trials random id sets of the same cardinality 43 | const int monte_carlo_trials = 1000; 44 | 45 | set ref_seq_ids; 46 | int n_seq = sequences.size(); 47 | int r = 0; 48 | 49 | if (ref_sequences.size() == 1) 50 | return 1; 51 | 52 | // Find the ids of the referential sequences in the input file 53 | for (int i = 0; i < n_seq; ++i) 54 | { 55 | bool is_ref = false; 56 | for (auto &y : ref_sequences) 57 | if (sequences[i].id == y.id) 58 | is_ref = true; 59 | 60 | if (is_ref) 61 | ref_seq_ids.insert(i); 62 | } 63 | 64 | r =
subTreeSize(sequences, ref_sequences, ref_seq_ids); 65 | 66 | if (monte_carlo_subtree_size) 67 | { 68 | mt19937 mt; 69 | double mc_r = 0; 70 | 71 | for (int i = 0; i < monte_carlo_trials; ++i) 72 | { 73 | set mc_seq_ids; 74 | 75 | while (mc_seq_ids.size() < ref_seq_ids.size()) 76 | mc_seq_ids.insert(mt() % n_seq); 77 | mc_r += subTreeSize(sequences, ref_sequences, mc_seq_ids); 78 | } 79 | 80 | *monte_carlo_subtree_size = mc_r / (double)monte_carlo_trials; 81 | } 82 | 83 | return r; 84 | } 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/tree/AbstractTreeGenerator.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "TreeDefs.h" 11 | 12 | #include "../core/sequence.h" 13 | #include "../core/defs.h" 14 | 15 | #include 16 | 17 | class CLCSBP; 18 | #include "../lcs/lcsbp.h" 19 | 20 | 21 | class AbstractTreeGenerator { 22 | public: 23 | 24 | AbstractTreeGenerator(int n_threads, instruction_set_t instruction_set); 25 | 26 | virtual ~AbstractTreeGenerator() {} 27 | 28 | void operator()(std::vector& sequences, tree_structure& tree); 29 | 30 | template 31 | void calculateDistanceVector( 32 | Transform& transform, 33 | seq_type& ref, 34 | seq_type* sequences, 35 | int n_seqs, 36 | distance_type* out_vector, 37 | CLCSBP& lcsbp); 38 | 39 | template 40 | void calculateDistanceRange( 41 | Transform& transform, 42 | seq_type& ref, 43 | seq_type* sequences, 44 | pair ids_range, 45 | distance_type* out_vector, 46 | CLCSBP& lcsbp); 47 | 48 | template 49 | void calculateDistanceRangeSV( 50 | Transform& transform, 51 | seq_type& ref, 52 | sv_type* sv, 53 | pair ids_range, 54 | distance_type* out_vector, 55 | CLCSBP& 
lcsbp); 56 | 57 | template 58 | void calculateDistanceMatrix( 59 | Transform& transform, 60 | seq_type* sequences, 61 | int n_seq, 62 | distance_type* out_matrix, 63 | CLCSBP& lcsbp); 64 | 65 | 66 | #ifdef DEVELOPER_MODE 67 | size_t refSequencesSubTreeSize( 68 | const std::vector& sequences, 69 | const std::vector& ref_sequences, 70 | double *monte_carlo_subtree_size); 71 | 72 | #endif 73 | 74 | protected: 75 | int n_threads; 76 | instruction_set_t instruction_set; 77 | 78 | virtual void run(std::vector& sequences, tree_structure& tree) = 0; 79 | 80 | #ifdef DEVELOPER_MODE 81 | size_t subTreeSize( 82 | const std::vector& sequences, 83 | const std::vector& ref_sequences, 84 | const set &seq_ids); 85 | #endif 86 | 87 | }; 88 | -------------------------------------------------------------------------------- /src/tree/Chained.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "AbstractTreeGenerator.h" 4 | 5 | 6 | // Developer-mode only: shuffles the sequence indices, joins the first two shuffled indices into a node, then pairs each following index with the index of the last node in guide_tree. Does nothing for fewer than two sequences. 7 | #ifdef DEVELOPER_MODE 8 | void GuideTree::computeChained(std::vector& sequences) 9 | { 10 | mt19937 rnd; 11 | 12 | if (sequences.size() < 2) 13 | return; 14 | 15 | vector idx(sequences.size()); 16 | 17 | for (int i = 0; i < sequences.size(); ++i) 18 | idx[i] = i; 19 | 20 | random_device rd; 21 | 22 | // Skip some number of initial values 23 | for (int i = 0; i < seed; ++i) 24 | rd(); 25 | 26 | mt19937 g(rd()); 27 | 28 | shuffle(idx.begin(), idx.end(), g); 29 | 30 | guide_tree.emplace_back(idx[0], idx[1]); 31 | 32 | for (int i = 2; i < sequences.size(); ++i) 33 | guide_tree.emplace_back(idx[i], guide_tree.size() - 1); 34 | } 35 | 36 | #endif 37 | 38 | -------------------------------------------------------------------------------- /src/tree/Clustering.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | class IClustering { 6 | public: 7 | virtual void operator()( 8 | const float* distanceMatrix, 9 | int n_elems, 10 | 
int n_medoids, 11 | int n_fixed_medoids, 12 | int* centers) = 0; 13 | 14 | virtual ~IClustering() {} 15 | }; 16 | 17 | class CLARANS : public IClustering { 18 | 19 | const int minMaxNeighbor = 250; 20 | const float exploreFraction; 21 | const int numLocal; 22 | 23 | public: 24 | 25 | CLARANS(float exploreFraction, int numLocal) : exploreFraction(exploreFraction), numLocal(numLocal) {} 26 | 27 | void operator()(const float* distanceMatrix, int n_elems, int n_medoids, int n_fixed_medoids, int* medoids) override; 28 | 29 | protected: 30 | 31 | float calculateCost(const float* distanceMatrix, int *candidate, int n_elems, int n_medoids); 32 | 33 | void updateAssignment( 34 | int x, 35 | int *candidate, 36 | int n_medoids, 37 | const float* D, 38 | float& dist_nearest, 39 | float& dist_second, 40 | int& assign_nearest, 41 | int& assign_second); 42 | 43 | }; 44 | 45 | -------------------------------------------------------------------------------- /src/tree/DistanceCalculator.cpp: -------------------------------------------------------------------------------- 1 | #include "DistanceCalculator.h" 2 | #include "AbstractTreeGenerator.hpp" 3 | #include "SingleLinkage.h" 4 | 5 | #include "../utils/conversion.h" 6 | #include "SingleLinkageQueue.h" 7 | 8 | #include 9 | 10 | template 11 | void DistanceCalculator<_distance>::run(std::vector& sequences, tree_structure& tree) { 12 | 13 | std::ofstream ofs(out_file); 14 | // put header line only when full matrix is needed 15 | if (generate_square_matrix) { 16 | for (const auto seq : sequences) { 17 | ofs << ',' << seq->id.c_str() + 1; 18 | } 19 | ofs << endl; 20 | } 21 | 22 | // 23 | int n_seqs = (int)sequences.size(); 24 | CSingleLinkageQueue queue(&sequences, (uint32_t) sequences.size(), n_threads * 8); 25 | std::vector workers(n_threads); 26 | 27 | // run workers 28 | for (int tid = 0; tid < n_threads; ++tid) { 29 | workers[tid] = thread([&queue, this]() { 30 | 31 | CLCSBP lcsbp(instruction_set); 32 | int row_id; 33 | std::vector* 
sequences; 34 | vector* dist_vector; 35 | vector loc_dist_vector; 36 | 37 | if (calculate_pid) { 38 | Transform transform; 39 | 40 | while (queue.GetTask(row_id, sequences, dist_vector)) { 41 | loc_dist_vector.resize(dist_vector->size()); 42 | int to_calculate = generate_square_matrix ? (int)sequences->size() : row_id; 43 | 44 | calculateDistanceVector( 45 | transform, 46 | (*sequences)[row_id], 47 | sequences->data(), 48 | to_calculate, 49 | loc_dist_vector.data(), 50 | lcsbp); 51 | 52 | //loc_dist_vector[row_id] = 1.0; 53 | swap(*dist_vector, loc_dist_vector); 54 | 55 | //cout << "push " << row_id << endl; 56 | queue.RegisterSolution(row_id); 57 | } 58 | } 59 | else { 60 | Transform transform; 61 | 62 | while (queue.GetTask(row_id, sequences, dist_vector)) { 63 | loc_dist_vector.resize(dist_vector->size()); 64 | int to_calculate = generate_square_matrix ? (int)sequences->size() : row_id; 65 | 66 | calculateDistanceVector( 67 | transform, 68 | (*sequences)[row_id], 69 | sequences->data(), 70 | to_calculate, 71 | loc_dist_vector.data(), 72 | lcsbp); 73 | 74 | swap(*dist_vector, loc_dist_vector); 75 | 76 | //cout << "push " << row_id << endl; 77 | queue.RegisterSolution(row_id); 78 | } 79 | } 80 | 81 | }); 82 | } 83 | 84 | char* out_row = new char[10000 + sequences.size() * 100]; 85 | char *ptr = out_row; 86 | 87 | // Gather results in one thread 88 | for (int row_id = 0; row_id < n_seqs; ++row_id) { 89 | 90 | vector* dist_vector; 91 | 92 | queue.GetSolution(row_id, dist_vector); 93 | //cout << "pop " << row_id << endl; 94 | //if ((row_id + 1) % 100 == 0) { 95 | // cout << "\r" << row_id + 1 << "... 
" << std::flush; 96 | //} 97 | 98 | ptr = out_row; 99 | ptr += sprintf(ptr, "%s,", sequences[row_id]->id.c_str() + 1); 100 | 101 | if (generate_square_matrix) { 102 | ptr += num2str(dist_vector->data(), dist_vector->size(), ',', ptr); 103 | } 104 | else { 105 | ptr += num2str(dist_vector->data(), row_id, ',', ptr); 106 | } 107 | 108 | queue.ReleaseSolution(row_id); 109 | //cout << "return " << row_id << endl; 110 | --ptr; 111 | *ptr++ = '\n'; 112 | ofs.write(out_row, ptr - out_row); 113 | } 114 | 115 | delete[] out_row; 116 | 117 | // make sure all threads have finished 118 | for (auto &w : workers) { 119 | w.join(); 120 | } 121 | 122 | } 123 | 124 | 125 | 126 | // ******************************************************************* 127 | // Explicit template specializations for specified distance measures 128 | 129 | template class DistanceCalculator; 130 | template class DistanceCalculator; 131 | -------------------------------------------------------------------------------- /src/tree/DistanceCalculator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "AbstractTreeGenerator.h" 3 | 4 | template 5 | class DistanceCalculator : public AbstractTreeGenerator { 6 | private: 7 | string out_file; 8 | bool generate_square_matrix; 9 | bool calculate_pid; 10 | public: 11 | DistanceCalculator( 12 | int n_threads, 13 | instruction_set_t instruction_set, 14 | const string& out_file, 15 | bool generate_square_matrix, 16 | bool calculate_pid) 17 | : 18 | AbstractTreeGenerator(n_threads, instruction_set), 19 | out_file(out_file), 20 | generate_square_matrix(generate_square_matrix), 21 | calculate_pid(calculate_pid) {} 22 | 23 | protected: 24 | void run(std::vector& sequences, tree_structure& tree) override; 25 | 26 | }; -------------------------------------------------------------------------------- /src/tree/FastTree.h: -------------------------------------------------------------------------------- 1 | /* 2 | This 
file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "AbstractTreeGenerator.h" 11 | #include "IPartialGenerator.h" 12 | #include "Clustering.h" 13 | 14 | #include 15 | 16 | template 17 | class FastTree : public AbstractTreeGenerator { 18 | public: 19 | 20 | FastTree( 21 | int n_threads, 22 | instruction_set_t instruction_set, 23 | std::shared_ptr partialGenerator, 24 | int subtreeSize, 25 | std::shared_ptr clustering, 26 | int sampleSize); 27 | 28 | virtual void run(std::vector& sequences, tree_structure& tree) override; 29 | 30 | protected: 31 | std::shared_ptr partialGenerator; 32 | int subtreeSize; 33 | std::shared_ptr clustering; 34 | int sampleSize; 35 | int clusteringThreshold; 36 | 37 | void doStep(std::vector& sequences, tree_structure& tree, int previousTop, bool parallel); 38 | 39 | int randomSeeds( 40 | std::vector& sequences, 41 | int n_seeds, 42 | int * seed_ids, 43 | float * similarity_row); 44 | 45 | int clusterSeeds( 46 | std::vector& sequences, 47 | int n_seeds, 48 | int n_samples, 49 | int * seed_ids, 50 | float * similarity_row); 51 | }; -------------------------------------------------------------------------------- /src/tree/GuideTree.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #include "GuideTree.h" 9 | #include "NewickParser.h" 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | 17 | // ******************************************************************* 18 | bool GuideTree::loadNewick( 19 | const std::string& file_name, 20 | std::vector& sequences) 21 | { 22 | // Load newick description 23 | ifstream newickFile; 24 | newickFile.open(file_name); 25 | if (!newickFile.good()) { 26 | throw std::runtime_error("Unable to open Newick file: " + file_name); 27 | return false; 28 | } 29 | 30 | std::stringstream ss; 31 | ss << newickFile.rdbuf(); 32 | std::string description(ss.str()); 33 | auto newend = std::remove_if(description.begin(), description.end(), 34 | [](char c)->bool { return c == '\r' || c == '\n'; }); 35 | description.erase(newend, description.end()); 36 | 37 | // Load guide tree 38 | NewickParser nw(false); 39 | nw.parse(sequences, description, guide_tree); 40 | 41 | return true; 42 | } 43 | 44 | // ******************************************************************* 45 | bool GuideTree::saveNewick( 46 | const std::string& file_name, 47 | const std::vector& sequences) const 48 | { 49 | // store guide tree 50 | string description; 51 | NewickParser nw(false); 52 | nw.store(sequences, guide_tree, description); 53 | 54 | // Open file 55 | ofstream newickFile; 56 | newickFile.open(file_name); 57 | if (!newickFile.good()) { 58 | return false; 59 | } 60 | 61 | newickFile << description; 62 | 63 | return true; 64 | } 65 | 66 | 67 | // ******************************************************************* 68 | int64_t GuideTree::calculateSackinIndex() { 69 | 70 | uint64_t idx = 0; 71 | int n_sequences = getSequenceCount(); 72 | 73 | if (n_sequences) { 74 | std::vector depths(this->guide_tree.size()); 75 | for (int i = (int)guide_tree.size() - 1; i 
>= n_sequences; --i) 76 | { 77 | depths[guide_tree[i].first] = depths[i] + 1; 78 | depths[guide_tree[i].second] = depths[i] + 1; 79 | } 80 | 81 | 82 | for (int i = 0; i < n_sequences; ++i) { 83 | idx += depths[i] + 1; 84 | } 85 | } 86 | 87 | return idx; 88 | } 89 | 90 | // ******************************************************************* 91 | void GuideTree::toUnique(const std::vector& original2unique, int n_uniques) { 92 | 93 | int n_total_seqs = original2unique.size(); 94 | auto& vt = guide_tree; 95 | 96 | // remove duplicated sequences from the imported tree 97 | int offset = n_total_seqs - n_uniques; 98 | 99 | vt.erase(vt.begin() + n_uniques, vt.begin() + n_total_seqs); 100 | 101 | std::vector out_ids(vt.size()); 102 | std::iota(out_ids.begin(), out_ids.begin() + n_uniques, 0); 103 | int n_dups = 0; 104 | 105 | auto is_duplicate = [&out_ids, n_uniques](int node_id) ->bool { return out_ids[node_id] < n_uniques; }; 106 | 107 | for (int i = n_uniques; i < (int)vt.size(); ++i) { 108 | auto& node = vt[i]; 109 | // correct indices 110 | node.first = (node.first < n_total_seqs) ? original2unique[node.first] : node.first - offset; 111 | node.second = (node.second < n_total_seqs) ? 
original2unique[node.second] : node.second - offset; 112 | 113 | if (node.first == node.second) { 114 | // merge a leaf with itself - make a duplicate node 115 | ++n_dups; 116 | out_ids[i] = node.second; 117 | } 118 | else if (is_duplicate(node.first) && node.second == out_ids[node.first]) { 119 | // merge a duplicate node (first) with itself (second) 120 | ++n_dups; 121 | out_ids[i] = node.second; 122 | } 123 | else if (is_duplicate(node.second) && node.first == out_ids[node.second]) { 124 | // merge a duplicate node (second) with itself (left) 125 | ++n_dups; 126 | out_ids[i] = node.first; 127 | } 128 | else { 129 | // merge two different nodes 130 | node.first = out_ids[node.first]; 131 | node.second = out_ids[node.second]; 132 | out_ids[i] = i - n_dups; 133 | } 134 | } 135 | 136 | for (int i = n_uniques; i < (int)vt.size(); ++i) { 137 | if (!is_duplicate(i)) { 138 | vt[out_ids[i]] = vt[i]; 139 | } 140 | } 141 | 142 | vt.erase(vt.end() - n_dups, vt.end()); 143 | } 144 | 145 | // ******************************************************************* 146 | void GuideTree::fromUnique(const std::vector& original2unique) { 147 | 148 | int n_total_seqs = (int)original2unique.size(); 149 | int n_uniques = this->getSequenceCount(); 150 | int n_dups = n_total_seqs - n_uniques; 151 | auto& vt = guide_tree; 152 | 153 | std::vector> unique2original(n_uniques, std::vector()); 154 | std::vector out_ids(n_uniques, -1); 155 | std::iota(out_ids.begin(), out_ids.end(), 0); 156 | 157 | for (int i = 0; i < n_total_seqs; ++i) { 158 | unique2original[original2unique[i]].push_back(i); 159 | } 160 | 161 | // add duplicated leafs (n_dups) and nodes joining duplicated leafs (n_dups) 162 | vt.insert(vt.begin() + n_uniques, 2 * n_dups, node_t(-1, -1)); 163 | 164 | // add subtrees joining duplicated leafs [n_unique + dups, n_unique + 2 * dups] 165 | int node_id = n_uniques + n_dups; 166 | for (int iu = 0; iu < n_uniques; ++iu) { 167 | const vector& occs = unique2original[iu]; 168 | 169 | // 
iterate over occurrences of unique 170 | for (int i = 1; i < (int)occs.size(); ++i, ++node_id) { 171 | if (i == 1) { 172 | // first duplication - join original leaf with duplicated leaf 173 | vt[node_id].first = occs[0]; 174 | vt[node_id].second = occs[1]; 175 | } 176 | else { 177 | // following duplicates - join original leaf with previous node 178 | vt[node_id].first = occs[i]; 179 | vt[node_id].second = node_id - 1; 180 | } 181 | } 182 | 183 | if (occs.size() > 1) { 184 | out_ids[iu] = node_id - 1; // represent sequence by a subtree 185 | } 186 | else { 187 | out_ids[iu] = occs[0]; // represent by the original id 188 | } 189 | } 190 | 191 | // in the previously-existing nodes replace duplicated sequences with subtrees 192 | for (int i = node_id; i < (int)vt.size(); ++i) { 193 | auto& node = vt[i]; 194 | if (node.first < n_uniques) { 195 | node.first = out_ids[node.first]; 196 | } 197 | else { 198 | node.first += 2 * n_dups; 199 | } 200 | 201 | if (node.second < n_uniques) { 202 | node.second = out_ids[node.second]; 203 | } 204 | else { 205 | node.second += 2 * n_dups; 206 | } 207 | } 208 | } -------------------------------------------------------------------------------- /src/tree/GuideTree.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "TreeDefs.h" 11 | #include "../core/sequence.h" 12 | 13 | #include 14 | 15 | 16 | class GuideTree { 17 | public: 18 | 19 | tree_structure & raw() { return guide_tree; } 20 | 21 | GuideTree() {} 22 | 23 | int getSequenceCount() const { return (int)(guide_tree.size() + 1) / 2; } 24 | 25 | bool loadNewick( 26 | const std::string& file_name, 27 | std::vector& sequences); 28 | 29 | bool saveNewick( 30 | const std::string& file_name, 31 | const std::vector& sequences) const; 32 | 33 | int64_t calculateSackinIndex(); 34 | 35 | void toUnique(const std::vector& original2unique, int n_uniques); 36 | void fromUnique(const std::vector& original2unique); 37 | 38 | protected: 39 | 40 | tree_structure guide_tree; 41 | 42 | }; 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/tree/IPartialGenerator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "TreeDefs.h" 4 | 5 | #include 6 | 7 | class CSequence; 8 | 9 | 10 | class IPartialGenerator { 11 | 12 | public: 13 | virtual void runPartial(std::vector& sequences, tree_structure& tree) = 0; 14 | 15 | }; 16 | -------------------------------------------------------------------------------- /src/tree/MSTPrim.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "AbstractTreeGenerator.h" 11 | #include "IPartialGenerator.h" 12 | #include "../lcs/lcsbp.h" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "math.h" 26 | 27 | //#define MANY_CAND 10 28 | 29 | class MSTPartitioner 30 | { 31 | public: 32 | typedef vector::iterator iterator; 33 | 34 | private: 35 | int64_t n_threads; 36 | int64_t n_parts; 37 | int64_t min_part_size; 38 | int64_t n_tail_parts; 39 | 40 | struct part_elem_t { 41 | vector data; 42 | uint32_t i_begin; 43 | uint32_t i_end; 44 | 45 | part_elem_t() : i_begin(0), i_end(0) {}; 46 | 47 | part_elem_t(const vector &_data, uint32_t _i_begin, uint32_t _i_end) : data(_data), i_begin(_i_begin), i_end(_i_end) {}; 48 | 49 | part_elem_t(const part_elem_t&) = default; 50 | part_elem_t(part_elem_t&&) = default; 51 | 52 | part_elem_t& operator=(const part_elem_t& x) noexcept = delete; 53 | part_elem_t& operator=(part_elem_t&& x) noexcept = default; 54 | }; 55 | vector vd_parts; 56 | 57 | public: 58 | MSTPartitioner(int _n_threads, int _n_parts, int _min_part_size, int _n_tail_parts) : n_threads(_n_threads), n_parts(_n_parts), min_part_size(_min_part_size), n_tail_parts(_n_tail_parts) {}; 59 | void InitPartition(int n_elements); 60 | void Remove(int id); 61 | pair GetPart(int part_id); 62 | int GetNoParts(); 63 | bool IsAlmostEmpty(); 64 | }; 65 | 66 | template 67 | class CMaxRangeQueries 68 | { 69 | int n_levels; 70 | vector> vv_data; 71 | 72 | void prepare(const vector& v_data) { 73 | size_t n = v_data.size(); 74 | 75 | n_levels = 1 + (int)log2(n); 76 | 77 | vv_data.resize(n_levels); 78 | 79 | vv_data[0] = v_data; 80 | 81 | for (int i = 1; i < n_levels; ++i) 82 | { 83 | auto& vc = vv_data[i]; 84 | auto& 
vp = vv_data[i - 1u]; 85 | 86 | vc.resize(n - (1ull << i) + 1ull); 87 | 88 | for (uint32_t j = 0; j + (1u << i) - 1 < n; ++j) 89 | if (vp[j] > vp[j + (1u << (i - 1))]) 90 | vc[j] = vp[j]; 91 | else 92 | vc[j] = vp[j + (1u << (i - 1))]; 93 | } 94 | } 95 | 96 | public: 97 | CMaxRangeQueries() { 98 | n_levels = 0; 99 | } 100 | 101 | void Init(const vector& v_data) { 102 | prepare(v_data); 103 | } 104 | 105 | T MaxElement(int begin, int end) { 106 | uint32_t lev = (uint32_t)log2(end - begin); 107 | 108 | if (vv_data[lev][begin] > vv_data[lev][end - (1u << lev)]) 109 | return vv_data[lev][begin]; 110 | else 111 | return vv_data[lev][end - (1u << lev)]; 112 | } 113 | }; 114 | 115 | template 116 | class MSTPrim : public AbstractTreeGenerator { 117 | #ifdef MANY_CAND 118 | static const int N_CAND = MANY_CAND; 119 | 120 | using dist_value_t = array; 121 | #else 122 | using dist_value_t = double; 123 | // using dist_value_t = uint64_t; 124 | #endif 125 | 126 | using dist_t = pair; 127 | vector v_distances; 128 | vector v_processed; 129 | 130 | void* raw_sequence_views; 131 | CSequenceView* sequence_views; 132 | 133 | static uint64_t ids_to_uint64(int id1, int id2) 134 | { 135 | if (id1 < 0 || id2 < 0) 136 | return 0u; 137 | if (id1 > id2) 138 | return (((uint64_t)id2) << 32) + (uint64_t)id1; 139 | return (((uint64_t)id1) << 32) + (uint64_t)id2; 140 | } 141 | 142 | constexpr pair uint64_to_id(uint64_t packed_ids) 143 | { 144 | int id1 = (int)(packed_ids >> 32); 145 | int id2 = (int)(packed_ids & 0xffffffffull); 146 | 147 | if (id1 < id2) 148 | return make_pair(id1, id2); 149 | else 150 | return make_pair(id2, id1); 151 | } 152 | 153 | struct mst_edge_t { 154 | int seq_from; 155 | int seq_to; 156 | int prim_order; 157 | dist_value_t dist; 158 | 159 | #ifdef MANY_CAND 160 | mst_edge_t(int _seq_from, int _seq_to, int _prim_order, dist_value_t _dist) : seq_from(_seq_from), seq_to(_seq_to), prim_order(_prim_order), dist(_dist) {} 161 | mst_edge_t() { 162 | seq_from = -1; 163 | 
seq_to = -1; 164 | prim_order = -1; 165 | fill(sim.begin(), sim.end(), numeric_limits::max()); 166 | } 167 | #else 168 | mst_edge_t(int _seq_from = -1, int _seq_to = -1, int _prim_order = -1, dist_value_t _dist = 0) : seq_from(_seq_from), seq_to(_seq_to), prim_order(_prim_order), dist(_dist) {} 169 | #endif 170 | 171 | bool is_less(const mst_edge_t& x, const mst_edge_t& y) 172 | { 173 | if (x.dist != y.dist) 174 | return x.dist > y.dist; 175 | 176 | return ids_to_uint64(x.seq_from, x.seq_to) > ids_to_uint64(y.seq_from, y.seq_to); 177 | } 178 | 179 | bool operator<(const mst_edge_t& x) { 180 | return is_less(*this, x); 181 | } 182 | 183 | bool operator>(const mst_edge_t& x) { 184 | return is_less(x, *this); 185 | } 186 | 187 | bool operator==(const mst_edge_t& x) { 188 | return !is_less(*this, x) && !is_less(x, *this); 189 | } 190 | 191 | bool operator!=(const mst_edge_t& x) { 192 | return is_less(*this, x) || is_less(x, *this); 193 | } 194 | }; 195 | 196 | struct dend_range_t { 197 | int id; 198 | int prim_from; 199 | int prim_to; 200 | 201 | dend_range_t(int _id, int _prim_from, int _prim_to) : id(_id), prim_from(_prim_from), prim_to(_prim_to) {} 202 | }; 203 | 204 | void mst_to_dendogram(vector& mst_edges, vector& v_prim_orders, tree_structure& tree); 205 | void prepare_sequences_view(std::vector& sequences); 206 | void prepare_bit_masks_for_sequence(CSequence& seq, bit_vec_t*& bm, uint32_t& p_bv_len); 207 | 208 | public: 209 | MSTPrim(int n_threads, instruction_set_t instruction_set) : AbstractTreeGenerator(n_threads, instruction_set) { 210 | #ifdef MANY_CAND 211 | // fill(sim_value_empty.begin(), sim_value_empty.end(), 0.0); 212 | #else 213 | // sim_value_empty = 0.0; 214 | #endif 215 | 216 | sequence_views = nullptr; 217 | raw_sequence_views = nullptr; 218 | } 219 | 220 | ~MSTPrim() 221 | { 222 | if (raw_sequence_views) 223 | free(raw_sequence_views); 224 | } 225 | 226 | void run(std::vector& sequences, tree_structure& tree) override; 227 | 228 | void 
run_view(std::vector& sequences, tree_structure& tree); 229 | }; 230 | 231 | -------------------------------------------------------------------------------- /src/tree/NeighborJoining.cpp: -------------------------------------------------------------------------------- 1 | #include "NeighborJoining.h" 2 | #include "AbstractTreeGenerator.hpp" 3 | 4 | #include "../lcs/lcsbp.h" 5 | 6 | #include 7 | 8 | // ******************************************************************* 9 | template 10 | void NeighborJoining<_distance>::run(std::vector& sequences, tree_structure& tree) { 11 | 12 | float* distances = TriangleMatrix::allocate(sequences.size()); 13 | CLCSBP lcsbp(instruction_set); 14 | 15 | Transform transform; 16 | calculateDistanceMatrix(transform, sequences.data(), (int) sequences.size(), distances, lcsbp); 17 | 18 | computeTree(distances, sequences.size(), tree); 19 | 20 | delete[] distances; 21 | } 22 | 23 | // ******************************************************************* 24 | template 25 | void NeighborJoining<_distance>::runPartial(std::vector& sequences, tree_structure& tree) { 26 | 27 | run(sequences, tree); 28 | } 29 | 30 | 31 | // ******************************************************************* 32 | template 33 | void NeighborJoining<_distance>::computeTree(float* distances, int n_seq, tree_structure& tree) { 34 | 35 | struct cluster { 36 | float sum_of_dists; 37 | int row_id; 38 | int node_id; 39 | }; 40 | 41 | float *D = distances; 42 | std::vector clusters(n_seq); 43 | 44 | // initialize clusters 45 | for (int i = 0; i < n_seq; ++i) { 46 | auto& ci = clusters[i]; 47 | ci.row_id = ci.node_id = i; 48 | ci.sum_of_dists = 0; 49 | 50 | for (int j = 0; j < n_seq; ++j) { 51 | if (i != j) { 52 | ci.sum_of_dists += D[TriangleMatrix::access(i, j)]; 53 | } 54 | } 55 | } 56 | 57 | // merge clusters as long as there are two left 58 | for (int iter = 0, n_clusters = n_seq; n_clusters > 2; ++iter) { 59 | 60 | // find minimum element in Q matrix 61 | float min_q 
= std::numeric_limits::max(); 62 | int min_i = 0, min_j = 0; 63 | 64 | for (int i = 0; i < n_clusters; ++i) { 65 | for (int j = i + 1; j < n_clusters; ++j) { 66 | const auto & ci = clusters[i]; 67 | const auto & cj = clusters[j]; 68 | 69 | float q = (n_clusters - 2) * D[TriangleMatrix::access(ci.row_id, cj.row_id)] - ci.sum_of_dists - cj.sum_of_dists; 70 | if (q < min_q) { 71 | min_q = q; 72 | min_i = i; 73 | min_j = j; 74 | } 75 | } 76 | } 77 | 78 | // merge two resulting clusters 79 | auto & ci = clusters[min_i]; 80 | auto & cj = clusters[min_j]; 81 | float Dij = D[TriangleMatrix::access(ci.row_id, cj.row_id)]; 82 | 83 | 84 | // ci is going to be replaced by a new cluster 85 | tree.push_back(node_t(ci.node_id, cj.node_id)); 86 | 87 | ci.sum_of_dists = 0; 88 | ci.node_id = n_seq + iter; 89 | 90 | // recalculate distances 91 | for (int k = 0; k < n_clusters; ++k) { 92 | 93 | if (k != min_i && k != min_j) { 94 | auto & ck = clusters[k]; 95 | 96 | float Dik = D[TriangleMatrix::access(ci.row_id, ck.row_id)]; 97 | float Djk = D[TriangleMatrix::access(cj.row_id, ck.row_id)]; 98 | 99 | // remove contribution of ci and cj from ck sums 100 | ck.sum_of_dists -= Dik + Djk; 101 | 102 | // ci is replaced 103 | Dik = (Dik + Djk - Dij) / 2; // updated distance 104 | ck.sum_of_dists += Dik; 105 | ci.sum_of_dists += Dik; 106 | 107 | D[TriangleMatrix::access(ci.row_id, ck.row_id)] = Dik; 108 | } 109 | } 110 | 111 | // remove cj 112 | clusters.erase(clusters.begin() + min_j); 113 | --n_clusters; 114 | } 115 | 116 | // join the two remaining clusters (NOTE(review): reads clusters[1], so this assumes n_seq >= 2 - confirm against callers) 117 | tree.push_back(node_t(clusters[0].node_id, clusters[1].node_id)); 118 | } 119 | 120 | 121 | // ******************************************************************* 122 | // Explicit template instantiations for the specified distance measures 123 | 124 | template class NeighborJoining; 125 | template class NeighborJoining; -------------------------------------------------------------------------------- /src/tree/NeighborJoining.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "IPartialGenerator.h" 3 | #include "AbstractTreeGenerator.h" 4 | 5 | template 6 | class NeighborJoining : public AbstractTreeGenerator, public IPartialGenerator { 7 | public: 8 | 9 | NeighborJoining(int n_threads, instruction_set_t instruction_set) 10 | : AbstractTreeGenerator(n_threads, instruction_set) {} 11 | 12 | void run(std::vector& sequences, tree_structure& tree) override; 13 | 14 | void runPartial(std::vector& sequences, tree_structure& tree) override; 15 | 16 | protected: 17 | void computeTree(float* distances, int n_seq, tree_structure& tree); 18 | }; 19 | -------------------------------------------------------------------------------- /src/tree/NewickParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | #include "NewickParser.h" 17 | #include "../utils/log.h" 18 | 19 | #undef min 20 | 21 | using namespace std; 22 | 23 | void NewickParser::parse( 24 | const std::vector& sequences, 25 | const std::string& description, 26 | std::vector>& guideTree) 27 | { 28 | if (description.length() == 0) { 29 | throw std::runtime_error("Error while parsing Newick tree: empty description."); 30 | } 31 | 32 | LOG_VERBOSE << endl << "Newick description length: " << description.length() << endl; 33 | 34 | // map sequence names to ids 35 | std::map sequencesToIds; 36 | guideTree.resize(2 * sequences.size(), std::pair(-1, -1)); // add extra node at the end - will be removed 37 | 38 | // fill in mappings 39 | int n_seqs = (int)sequences.size(); 40 | for (int i = 0; i < n_seqs; ++i) { 41 | 
const auto& seq = sequences[i]; 42 | if (seq.id[0] == '>') { 43 | sequencesToIds[seq.id.substr(1, seq.id.size())] = i; // omit > 44 | } 45 | else { 46 | sequencesToIds[seq.id] = i; 47 | } 48 | } 49 | 50 | const char* p = description.c_str(); 51 | const char* end = p + description.size(); 52 | 53 | int cur_pos = (int)guideTree.size() - 1; 54 | int free_pos = cur_pos - 1; 55 | bool secondBranch = false; 56 | std::vector prevs(guideTree.size() + 1, -1); 57 | 58 | while (p < end) { 59 | // subtree begin 60 | if (*p == '(') { 61 | auto& out_branch = secondBranch ? guideTree[cur_pos].second : guideTree[cur_pos].first; 62 | out_branch = free_pos; 63 | prevs[free_pos] = cur_pos; 64 | cur_pos = free_pos; 65 | 66 | ++p; 67 | --free_pos; 68 | secondBranch = false; 69 | } 70 | else if (*p == ',') { 71 | ++p; 72 | secondBranch = true; 73 | } 74 | else if (*p == ')') { 75 | ++p; 76 | cur_pos = prevs[cur_pos]; 77 | } 78 | else if (*p == ':') { 79 | // branch length 80 | ++p; 81 | char* len_end; 82 | strtof(p, &len_end); 83 | p = len_end; 84 | } 85 | else if (isspace(*p)) { 86 | ++p; // ignore whitespaces 87 | } 88 | else { 89 | // find end of the sequence name 90 | const char* name_end = std::find_if(p, end, [](char c) { return c == ')' || c == ',' || c == ':' || c == '('; }); 91 | string name(p, name_end); 92 | int id = sequencesToIds[name]; 93 | 94 | auto& out_branch = secondBranch ? 
guideTree[cur_pos].second : guideTree[cur_pos].first; 95 | out_branch = id; 96 | p = name_end; 97 | } 98 | } 99 | 100 | guideTree.resize(guideTree.size() - 1); // remove an extra node 101 | } 102 | 103 | void NewickParser::store( 104 | const std::vector& sequences, 105 | const std::vector>& guideTree, 106 | std::string& description) { 107 | 108 | ostringstream oss; 109 | 110 | std::vector prevs(guideTree.size() + 1, -1); 111 | std::vector num_visits(guideTree.size() + 1, 0); 112 | int last_pos = guideTree.size() - 1; 113 | int cur_pos = last_pos; 114 | 115 | while (true) { 116 | 117 | if (cur_pos < (int)sequences.size()) { 118 | // if sequence was reached 119 | 120 | const char* begin = sequences[cur_pos].id.c_str(); 121 | 122 | // remove trailing '<' if present 123 | if (*begin == '>') { ++begin; } 124 | 125 | oss << begin << ":1.0"; 126 | cur_pos = prevs[cur_pos]; 127 | 128 | } 129 | else { 130 | // if internal node 131 | 132 | if (num_visits[cur_pos] == 0) { 133 | // no visits - left branch 134 | oss << '('; 135 | int dest_pos = guideTree[cur_pos].first; 136 | ++num_visits[cur_pos]; 137 | prevs[dest_pos] = cur_pos; 138 | cur_pos = dest_pos; 139 | 140 | } 141 | else if (num_visits[cur_pos] == 1) { 142 | // one visit - right branch 143 | oss << ','; 144 | int dest_pos = guideTree[cur_pos].second; 145 | ++num_visits[cur_pos]; 146 | prevs[dest_pos] = cur_pos; 147 | cur_pos = dest_pos; 148 | } 149 | else { 150 | // two visits - node processed 151 | if (cur_pos == last_pos) { 152 | // root processed 153 | oss << ");"; 154 | break; 155 | } 156 | 157 | oss << "):1.0"; 158 | ++num_visits[cur_pos]; 159 | cur_pos = prevs[cur_pos]; 160 | } 161 | } 162 | } 163 | 164 | description = oss.str(); 165 | } 166 | -------------------------------------------------------------------------------- /src/tree/NewickParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _NEWICK_TREE_H 10 | #define _NEWICK_TREE_H 11 | 12 | #include "../core/sequence.h" 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | class NewickParser { 19 | 20 | protected: 21 | bool verbose; 22 | 23 | public: 24 | NewickParser(bool verbose) : verbose(verbose) {} 25 | 26 | void parse( 27 | const std::vector& sequences, 28 | const std::string& description, 29 | std::vector>& guideTree); 30 | 31 | void store( 32 | const std::vector& sequences, 33 | const std::vector>& guideTree, 34 | std::string& description); 35 | }; 36 | 37 | 38 | #endif -------------------------------------------------------------------------------- /src/tree/SingleLinkage.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #include "SingleLinkage.h" 9 | #include "AbstractTreeGenerator.hpp" 10 | #include "SingleLinkageQueue.h" 11 | #include "../lcs/lcsbp.h" 12 | #include "../utils/log.h" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef _MSC_VER 24 | #include 25 | #endif 26 | 27 | using namespace std; 28 | 29 | // ******************************************************************* 30 | template 31 | void SingleLinkage<_distance>::run(std::vector& sequences, tree_structure& tree) { 32 | int next; 33 | int n_seq = (int)sequences.size(); 34 | 35 | int prefetch_offset = 64 * 2; 36 | int prefetch_offset_2nd = 128 * 2; 37 | 38 | vector pi(n_seq + max(prefetch_offset, prefetch_offset_2nd), 0); 39 | vector lambda(n_seq); 40 | vector *dist_vector; 41 | 42 | CSingleLinkageQueue slq(&sequences, n_seq, n_threads * 8); 43 | vector workers(n_threads, nullptr); 44 | 45 | mutex mtx; 46 | 47 | // Calculation of similarities is made in working threads 48 | for (int i = 0; i < n_threads; ++i) 49 | workers[i] = new thread([&] { 50 | CLCSBP lcsbp(instruction_set); 51 | int row_id; 52 | vector *sequences; 53 | vector *dist_vector; 54 | Transform transform; 55 | 56 | vector loc_dist_vector; 57 | 58 | while (slq.GetTask(row_id, sequences, dist_vector)) 59 | { 60 | loc_dist_vector.resize(dist_vector->size()); 61 | 62 | calculateDistanceVector( 63 | transform, 64 | (*sequences)[row_id], 65 | sequences->data(), 66 | row_id, 67 | loc_dist_vector.data(), 68 | lcsbp); 69 | 70 | #ifdef SLINK_HANDLE_TIES 71 | for (size_t i = 0; i < loc_dist_vector.size(); ++i) 72 | { 73 | (*dist_vector)[i].first = loc_dist_vector[i]; 74 | (*dist_vector)[i].second = ids_to_uint64(i, row_id); 75 | } 76 | #else 77 | swap(*dist_vector, loc_dist_vector); 78 | #endif 79 | 80 | 
slq.RegisterSolution(row_id); 81 | } 82 | }); 83 | 84 | // Single linkage algorithm is here 85 | for (int i = 0; i < n_seq; ++i) 86 | { 87 | pi[i] = i; 88 | #ifdef SLINK_HANDLE_TIES 89 | lambda[i] = slink_dist_t{ std::numeric_limits::max(), 0 }; 90 | #else 91 | lambda[i] = std::numeric_limits::max(); 92 | #endif 93 | 94 | if (i % (100) == 0) { 95 | LOG_DEBUG << "Computing guide tree - " << fixed << setprecision(1) 96 | << 100.0 * ((double)i * (i + 1) / 2) / ((double)n_seq * (n_seq + 1) / 2) << "% (" << i << " of " << n_seq << ") \r"; 97 | } 98 | 99 | slq.GetSolution(i, dist_vector); 100 | 101 | auto p_lambda = lambda.begin(); 102 | auto p_dist_vector = (*dist_vector).begin(); 103 | auto p_pi = pi.begin(); 104 | 105 | for (int j = 0; j < i; ++j) 106 | { 107 | next = pi[j]; 108 | 109 | #ifdef _MSC_VER // Visual C++ 110 | _mm_prefetch((const char*)&(*dist_vector)[*(p_pi + prefetch_offset)], 2); 111 | #endif 112 | #ifdef __GNUC__ 113 | // __builtin_prefetch((&(*dist_vector)[pi[j + prefetch_offset]]), 1, 2); 114 | __builtin_prefetch((&(*dist_vector)[*(p_pi + prefetch_offset)]), 1, 2); 115 | #endif 116 | 117 | auto &x = (*dist_vector)[next]; 118 | 119 | if (*p_lambda < *p_dist_vector) 120 | { 121 | x = min(x, *p_dist_vector); 122 | } 123 | else 124 | { 125 | x = min(x, *p_lambda); 126 | *p_pi = i; 127 | *p_lambda = *p_dist_vector; 128 | } 129 | 130 | ++p_pi; 131 | ++p_lambda; 132 | ++p_dist_vector; 133 | } 134 | 135 | slq.ReleaseSolution(i); 136 | 137 | p_pi = pi.begin(); 138 | p_lambda = lambda.begin(); 139 | for (int j = 0; j < i; ++j) 140 | { 141 | #ifdef _MSC_VER // Visual C++ 142 | _mm_prefetch((const char*)&lambda[*(p_pi + prefetch_offset_2nd)], 0); 143 | #endif 144 | #ifdef __GNUC__ 145 | __builtin_prefetch((&lambda[*(p_pi + prefetch_offset_2nd)]), 1, 0); 146 | #endif 147 | 148 | next = *p_pi; 149 | if (lambda[next] <= *p_lambda) 150 | *p_pi = i; 151 | 152 | ++p_pi; 153 | ++p_lambda; 154 | } 155 | } 156 | 157 | for (auto p : workers) 158 | { 159 | p->join(); 160 | 
delete p; 161 | } 162 | workers.clear(); 163 | 164 | LOG_DEBUG << "Computing guide tree - 100.0% \r"; 165 | 166 | vector elements(n_seq - 1); 167 | for (int i = 0; i < n_seq - 1; ++i) 168 | elements[i] = i; 169 | 170 | #ifdef DEBUG_MODE 171 | identity /= n_seq * (n_seq - 1) / 2.0; 172 | #endif 173 | 174 | stable_sort(elements.begin(), elements.end(), [&](int x, int y) { 175 | return lambda[x] < lambda[y]; 176 | }); 177 | 178 | vector index(n_seq); 179 | for (int i = 0; i < n_seq; ++i) 180 | index[i] = i; 181 | 182 | for (int i = 0; i < n_seq - 1; ++i) 183 | { 184 | int j = elements[i]; 185 | next = pi[j]; 186 | tree.emplace_back(index[j], index[next]); 187 | index[next] = n_seq + i; 188 | } 189 | } 190 | 191 | // ******************************************************************* 192 | template 193 | void SingleLinkage<_distance>::runPartial(std::vector& sequences, tree_structure& tree) 194 | { 195 | int next; 196 | int n_seq = sequences.size(); 197 | 198 | int prefetch_offset = 64 * 2; 199 | int prefetch_offset_2nd = 128 * 2; 200 | 201 | vector pi(n_seq + max(prefetch_offset, prefetch_offset_2nd), 0); 202 | vector lambda(n_seq); 203 | vector dist_vector(n_seq); 204 | vector loc_dist_vector(dist_vector.size()); 205 | 206 | Transform transform; 207 | 208 | CLCSBP lcsbp(instruction_set); 209 | 210 | // Single linkage algorithm is here 211 | for (int i = 0; i < n_seq; ++i) 212 | { 213 | pi[i] = i; 214 | 215 | #ifdef SLINK_HANDLE_TIES 216 | lambda[i] = slink_dist_t{ std::numeric_limits::max(), 0 }; 217 | #else 218 | lambda[i] = std::numeric_limits::max(); 219 | #endif 220 | 221 | /* if (i % (100) == 0) { 222 | LOG_DEBUG << "Computing guide tree - " << fixed << setprecision(1) 223 | << 100.0 * ((double)i * (i + 1) / 2) / ((double)n_seq * (n_seq + 1) / 2) << "\% (" << i << " of " << n_seq << ") \r"; 224 | } 225 | */ 226 | calculateDistanceVector( 227 | transform, 228 | sequences[i], 229 | sequences.data(), 230 | i, 231 | loc_dist_vector.data(), 232 | lcsbp); 233 | 234 | 
#ifdef SLINK_HANDLE_TIES 235 | for (int j = 0; j < (int) loc_dist_vector.size(); ++j) 236 | { 237 | dist_vector[j].first = loc_dist_vector[j]; 238 | dist_vector[j].second = ids_to_uint64(j, i); 239 | } 240 | #else 241 | swap(dist_vector, loc_dist_vector); 242 | #endif 243 | 244 | auto p_lambda = lambda.begin(); 245 | auto p_dist_vector = dist_vector.begin(); 246 | auto p_pi = pi.begin(); 247 | 248 | for (int j = 0; j < i; ++j) 249 | { 250 | next = pi[j]; 251 | 252 | #ifdef _MSC_VER // Visual C++ 253 | _mm_prefetch((const char*)&(dist_vector)[*(p_pi + prefetch_offset)], 2); 254 | #endif 255 | #ifdef __GNUC__ 256 | // __builtin_prefetch((&(*dist_vector)[pi[j + prefetch_offset]]), 1, 2); 257 | __builtin_prefetch((&(dist_vector)[*(p_pi + prefetch_offset)]), 1, 2); 258 | #endif 259 | 260 | auto &x = (dist_vector)[next]; 261 | 262 | if (*p_lambda < *p_dist_vector) 263 | { 264 | x = min(x, *p_dist_vector); 265 | } 266 | else 267 | { 268 | x = min(x, *p_lambda); 269 | *p_pi = i; 270 | *p_lambda = *p_dist_vector; 271 | } 272 | 273 | ++p_pi; 274 | ++p_lambda; 275 | ++p_dist_vector; 276 | } 277 | 278 | p_pi = pi.begin(); 279 | p_lambda = lambda.begin(); 280 | for (int j = 0; j < i; ++j) 281 | { 282 | #ifdef _MSC_VER // Visual C++ 283 | _mm_prefetch((const char*)&lambda[*(p_pi + prefetch_offset_2nd)], 0); 284 | #endif 285 | #ifdef __GNUC__ 286 | __builtin_prefetch((&lambda[*(p_pi + prefetch_offset_2nd)]), 1, 0); 287 | #endif 288 | 289 | next = *p_pi; 290 | if (lambda[next] <= *p_lambda) 291 | *p_pi = i; 292 | 293 | ++p_pi; 294 | ++p_lambda; 295 | } 296 | } 297 | 298 | // LOG_DEBUG << "Computing guide tree - 100.0\% \r"; 299 | 300 | vector elements(n_seq - 1); 301 | for (int i = 0; i < n_seq - 1; ++i) 302 | elements[i] = i; 303 | 304 | #ifdef DEBUG_MODE 305 | identity /= n_seq * (n_seq - 1) / 2.0; 306 | #endif 307 | 308 | stable_sort(elements.begin(), elements.end(), [&](int x, int y) { 309 | return lambda[x] < lambda[y]; 310 | }); 311 | 312 | vector index(n_seq); 313 | for 
(int i = 0; i < n_seq; ++i) 314 | index[i] = i; 315 | 316 | for (int i = 0; i < n_seq - 1; ++i) 317 | { 318 | int j = elements[i]; 319 | next = pi[j]; 320 | tree.emplace_back(index[j], index[next]); 321 | index[next] = n_seq + i; 322 | } 323 | } 324 | 325 | 326 | // ******************************************************************* 327 | // Explicit template specializations for specified distance measures 328 | 329 | template class SingleLinkage; 330 | template class SingleLinkage; 331 | -------------------------------------------------------------------------------- /src/tree/SingleLinkage.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "AbstractTreeGenerator.h" 11 | #include "IPartialGenerator.h" 12 | 13 | #include 14 | #include 15 | 16 | #define SLINK_HANDLE_TIES 17 | 18 | #ifdef SLINK_HANDLE_TIES 19 | struct slink_dist_t { 20 | double first; 21 | uint64_t second; 22 | 23 | // this is to preserve consistency with smilarity variant 24 | // - increasingly by distance 25 | // - decreasingly by id 26 | bool operator<(const slink_dist_t& rhs) const { 27 | return (this->first == rhs.first) 28 | ? (this->second > rhs.second) 29 | : (this->first < rhs.first); 30 | } 31 | 32 | bool operator<=(const slink_dist_t& rhs) const { 33 | return (this->first == rhs.first) 34 | ? 
(this->second >= rhs.second) 35 | : (this->first <= rhs.first); 36 | } 37 | 38 | }; 39 | 40 | //using slink_similarity_t = pair; 41 | #else 42 | using slink_dist_t = double; 43 | #endif 44 | 45 | 46 | 47 | template 48 | class SingleLinkage : public AbstractTreeGenerator, public IPartialGenerator { 49 | uint64_t ids_to_uint64(int id1, int id2) 50 | { 51 | if (id1 < 0 || id2 < 0) 52 | return 0u; 53 | if (id1 > id2) 54 | return (((uint64_t)id2) << 32) + (uint64_t)id1; 55 | return (((uint64_t)id1) << 32) + (uint64_t)id2; 56 | } 57 | 58 | public: 59 | 60 | SingleLinkage(int n_threads, instruction_set_t instruction_set) 61 | : AbstractTreeGenerator(n_threads, instruction_set) {} 62 | 63 | void run(std::vector& sequences, tree_structure& tree) override; 64 | 65 | void runPartial(std::vector& sequences, tree_structure& tree) override; 66 | }; 67 | 68 | -------------------------------------------------------------------------------- /src/tree/SingleLinkageQueue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../core/sequence.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | class CSingleLinkageQueue 13 | { 14 | std::vector* sequences; 15 | 16 | std::vector> sim_vector_2d; 17 | // vector> lcs_len_2d; 18 | 19 | std::vector> ready_rows; 20 | std::stack> available_buffers; 21 | 22 | uint32_t lowest_uncomputed_row; 23 | uint32_t n_rows; 24 | uint32_t max_buffered_rows; 25 | 26 | bool eoq_flag; 27 | 28 | std::mutex mtx; 29 | std::condition_variable cv_tasks, cv_rows; 30 | 31 | public: 32 | 33 | // ******************************************************************* 34 | // CSingleLinkageQueue 35 | // ******************************************************************* 36 | CSingleLinkageQueue(vector* _sequences, uint32_t _n_rows, uint32_t _max_buffered_rows) 37 | { 38 | sequences = _sequences; 39 | n_rows = _n_rows; 40 | max_buffered_rows = min(n_rows, 
_max_buffered_rows); 41 | 42 | sim_vector_2d.resize(max_buffered_rows); 43 | for (auto& x : sim_vector_2d) 44 | x.resize(n_rows); 45 | 46 | ready_rows.resize(n_rows, make_pair(-1, false)); 47 | 48 | lowest_uncomputed_row = 0; 49 | 50 | for (int i = 0; i < (int) max_buffered_rows; ++i) 51 | available_buffers.push(i); 52 | 53 | eoq_flag = false; 54 | } 55 | 56 | // ******************************************************************* 57 | ~CSingleLinkageQueue() 58 | { 59 | } 60 | 61 | // ******************************************************************* 62 | bool GetTask(int& row_id, vector*& _sequences, vector*& sim_vector) 63 | { 64 | unique_lock lck(mtx); 65 | cv_tasks.wait(lck, [this] {return !this->available_buffers.empty() || this->eoq_flag; }); 66 | 67 | if (eoq_flag) 68 | return false; // End of data in the profiles queue 69 | 70 | row_id = lowest_uncomputed_row++; 71 | 72 | if (lowest_uncomputed_row >= n_rows) 73 | eoq_flag = true; 74 | 75 | _sequences = sequences; 76 | 77 | int buffer_row_id = available_buffers.top(); 78 | available_buffers.pop(); 79 | 80 | sim_vector = &sim_vector_2d[buffer_row_id]; 81 | 82 | ready_rows[row_id].first = buffer_row_id; 83 | 84 | #ifdef PRODUCE_LOG 85 | cerr << "GetTask : " << row_id << "\n"; 86 | cerr << available_buffers.size() << " " << lowest_uncomputed_row << "\n"; 87 | #endif 88 | 89 | return true; 90 | } 91 | 92 | // ******************************************************************* 93 | void RegisterSolution(int row_id) 94 | { 95 | unique_lock lck(mtx); 96 | 97 | ready_rows[row_id].second = true; 98 | 99 | #ifdef PRODUCE_LOG 100 | cerr << "Registered : " << row_id << "\n"; 101 | cerr << available_buffers.size() << " " << lowest_uncomputed_row << "\n"; 102 | #endif 103 | 104 | cv_rows.notify_one(); 105 | } 106 | 107 | // ******************************************************************* 108 | bool GetSolution(int row_id, vector*& sim_vector) 109 | { 110 | unique_lock lck(mtx); 111 | cv_rows.wait(lck, [this, row_id] 
{return this->ready_rows[row_id].second; }); 112 | 113 | int buffer_row_id = ready_rows[row_id].first; 114 | 115 | sim_vector = &sim_vector_2d[buffer_row_id]; 116 | 117 | #ifdef PRODUCE_LOG 118 | cerr << "GetSol : " << row_id << "\n"; 119 | cerr << available_buffers.size() << " " << lowest_uncomputed_row << "\n"; 120 | #endif 121 | 122 | return true; 123 | } 124 | 125 | // ******************************************************************* 126 | void ReleaseSolution(int row_id) 127 | { 128 | unique_lock lck(mtx); 129 | 130 | int buffer_row_id = ready_rows[row_id].first; 131 | 132 | available_buffers.push(buffer_row_id); 133 | 134 | #ifdef PRODUCE_LOG 135 | cerr << "Release : " << row_id << "\n"; 136 | cerr << available_buffers.size() << " " << lowest_uncomputed_row << "\n"; 137 | #endif 138 | 139 | cv_tasks.notify_all(); 140 | } 141 | }; -------------------------------------------------------------------------------- /src/tree/TreeDefs.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
/*
This file is a part of FAMSA software distributed under GNU GPL 3 licence.
The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA

Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys

*/
#pragma once

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// tree structure
using node_t = std::pair<int, int>;
using tree_structure = std::vector<node_t>;


// class representing distance
enum class Distance {
    indel_div_lcs,
    sqrt_indel_div_lcs,
    neg_lcs_div_indel,
    neg_lcs_div_minlen,
    neg_lcs_div_len_corrected,
    pairwise_identity
};

// Converts a textual distance name to the Distance enumerator.
// Distance::pairwise_identity has no textual name here.
// Throws std::runtime_error (by value) for any unrecognised name.
inline static Distance str2dist(const std::string& s) {
    if (s == "indel_div_lcs") { return Distance::indel_div_lcs; }
    if (s == "sqrt_indel_div_lcs") { return Distance::sqrt_indel_div_lcs; }
    if (s == "neg_lcs_div_indel") { return Distance::neg_lcs_div_indel; }
    if (s == "neg_lcs_div_minlen") { return Distance::neg_lcs_div_minlen; }
    if (s == "neg_lcs_div_len_corrected") { return Distance::neg_lcs_div_len_corrected; }

    // thrown by value: `throw new ...` would throw a pointer that handlers
    // catching std::runtime_error& / std::exception& never see
    throw std::runtime_error("Error: Illegal pairwise distance measure.");
}

// Converts a Distance enumerator to its textual name (inverse of str2dist).
// Throws std::runtime_error (by value) for enumerators without a name,
// which includes Distance::pairwise_identity.
inline static std::string dist2str(Distance d) {
    switch (d) {
    case Distance::indel_div_lcs: return "indel_div_lcs";
    case Distance::sqrt_indel_div_lcs: return "sqrt_indel_div_lcs";
    case Distance::neg_lcs_div_indel: return "neg_lcs_div_indel";
    case Distance::neg_lcs_div_minlen: return "neg_lcs_div_minlen";
    case Distance::neg_lcs_div_len_corrected: return "neg_lcs_div_len_corrected";
    default:
        throw std::runtime_error("Error: Illegal pairwise distance measure.");
    }
}

// Class representing guide tree method
class GT {
public:
    enum Method { SLINK, MST_Prim, UPGMA, UPGMA_modified, NJ, chained, imported };
    enum Heuristic { None, PartTree, ClusterTree };

    // Human-readable name of a guide tree method.
    // Throws std::runtime_error (by value) for a value outside the enum.
    static std::string toString(Method v) {
        switch (v) {
        case SLINK: return "single linkage (SLINK)";
        case MST_Prim: return "single linkage (MST+Prim)";
        case UPGMA: return "upgma";
        case UPGMA_modified: return "upgma_modified";
        case NJ: return "nj";
        case chained: return "chained";
        case imported: return "import";
        default:
            throw std::runtime_error("Error: Illegal guide tree method.");
        }
    }

    // Human-readable name of a guide tree heuristic; note that ClusterTree
    // is reported as "MedoidTree".
    // Throws std::runtime_error (by value) for a value outside the enum.
    static std::string toString(Heuristic v) {
        switch (v) {
        case None: return "None";
        case PartTree: return "PartTree";
        case ClusterTree: return "MedoidTree";
        }

        // something went wrong
        throw std::runtime_error("Error: Illegal guide tree heuristic.");
    }

    // Maps a method name to the Method enumerator ("sl" selects MST_Prim,
    // "slink" selects SLINK; "chained" is accepted only when DEVELOPER_MODE
    // is defined).
    // Throws std::runtime_error (by value) for any other name.
    static Method fromString(const std::string& name) {
        if (name == "sl") { return MST_Prim; }
        if (name == "slink") { return SLINK; }
        if (name == "upgma") { return UPGMA; }
        if (name == "upgma_modified") { return UPGMA_modified; }
        if (name == "nj") { return NJ; }
        if (name == "import") { return imported; }
#ifdef DEVELOPER_MODE
        if (name == "chained") { return chained; }
#endif
        // something went wrong
        throw std::runtime_error("Error: Illegal guide tree method.");
    }
};


// Strictly lower-triangular matrix stored row by row in a flat array:
// row i holds the i entries (i, 0) .. (i, i - 1).
class TriangleMatrix {
public:
    // Allocates storage for a size x size triangle, i.e. size * (size - 1) / 2
    // elements. The caller owns the array and releases it with delete[].
    template <class T>
    static T* allocate(size_t size) {
        return new T[(size * (size - 1)) / 2];
    }

    // Flat index of element (i, j); the order of the indices does not matter.
    static size_t access(int64_t i, int64_t j) {
        if (i >= j)
            return j + (i * (i - 1)) / 2;
        else
            return i + (j * (j - 1)) / 2;
    }
};
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include "AbstractTreeGenerator.h" 11 | #include "IPartialGenerator.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | // UPGMA defines and consts 20 | typedef float UPGMA_dist_t; 21 | 22 | // ******************************************************************* 23 | // Queue for UPGMA 24 | // ******************************************************************* 25 | class CUPGMAQueue 26 | { 27 | std::vector *sequences; 28 | uint32_t n_rows; 29 | UPGMA_dist_t *dist_matrix; 30 | uint32_t lowest_uncomputed_row; 31 | bool eoq_flag; 32 | 33 | std::mutex mtx; 34 | 35 | 36 | public: 37 | CUPGMAQueue(std::vector *_sequences, uint32_t _n_rows, UPGMA_dist_t *_dist_matrix) : 38 | sequences(_sequences), n_rows(_n_rows), dist_matrix(_dist_matrix), 39 | lowest_uncomputed_row(0), eoq_flag(false) 40 | {} 41 | 42 | ~CUPGMAQueue() {} 43 | 44 | bool GetTask(int& row_id, std::vector*& _sequences, UPGMA_dist_t*& dist_row) { 45 | unique_lock lck(mtx); 46 | 47 | if (eoq_flag) 48 | return false; // End of data in the profiles queue 49 | 50 | row_id = lowest_uncomputed_row++; 51 | 52 | if (lowest_uncomputed_row >= n_rows) 53 | eoq_flag = true; 54 | 55 | _sequences = sequences; 56 | dist_row = dist_matrix + TriangleMatrix::access(row_id, 0); 57 | 58 | return true; 59 | } 60 | }; 61 | 62 | // ******************************************************************* 63 | // UPGMA algo 64 | // ******************************************************************* 65 | template 66 | class UPGMA : public AbstractTreeGenerator, public IPartialGenerator { 67 | public: 68 | 69 | UPGMA(int n_threads, instruction_set_t instruction_set, bool is_modified) 70 | : AbstractTreeGenerator(n_threads, instruction_set), is_modified(is_modified) {} 71 | 72 | void run(std::vector& sequences, 
#pragma once
/*
This file is a part of Kmer-db software distributed under GNU GPL 3 licence.
The homepage of the Kmer-db project is http://sun.aei.polsl.pl/REFRESH/kmer-db

Authors: Sebastian Deorowicz, Adam Gudys, Maciej Dlugosz, Marek Kokot, Agnieszka Danek
*/

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>
#include <utility>

// ************************************************************************************
// Number-to-text conversion through a lookup table of all 5-digit groups.
// None of the functions writes a terminating '\0'; each returns the number of
// characters written. The static members are only declared here and must be
// defined in exactly one translation unit.
class NumericConversions
{
public:
    static char digits[100000 * 5];   // "00000" .. "99999", 5 characters each
    static uint64_t powers10[15];     // powers10[i] == 10^i

    // Fills both tables from its constructor.
    struct _si {
        _si()
        {
            for (int i = 0; i < 100000; ++i)
            {
                int dig = i;

                digits[i * 5 + 4] = '0' + (dig % 10);
                dig /= 10;
                digits[i * 5 + 3] = '0' + (dig % 10);
                dig /= 10;
                digits[i * 5 + 2] = '0' + (dig % 10);
                dig /= 10;
                digits[i * 5 + 1] = '0' + (dig % 10);
                dig /= 10;
                digits[i * 5 + 0] = '0' + dig;
            }

            powers10[0] = 1;
            for (int i = 1; i < 15; ++i)
                powers10[i] = 10 * powers10[i - 1];
        }
    };
    static _si _init;

    // Number of decimal digits of v (1 for v == 0).
    static int NDigits(uint64_t v)
    {
        if (v >= 100000000ull)
        {
            // beyond the range of the branch tree below, which used to
            // report 8 for every such value
            int n = 8;
            for (v /= 100000000ull; v > 0; v /= 10)
                ++n;
            return n;
        }

        return (v < 10000)
            ? (v < 100 ? (v < 10 ? 1 : 2) : (v < 1000 ? 3 : 4))
            : (v < 1000000 ? (v < 100000 ? 5 : 6) : (v < 10000000 ? 7 : 8));
    }

    // Writes val in decimal to str (at most 20 characters).
    // The value is split into groups of five digits; the leading group is
    // written without its leading zeros, the remaining ones in full.
    static int Int2PChar(uint64_t val, char *str)
    {
        if (val >= 1000000000000000ull)
        {
            uint64_t dig1 = val / 1000000000000000ull;
            val -= dig1 * 1000000000000000ull;
            uint64_t dig2 = val / 10000000000ull;
            val -= dig2 * 10000000000ull;
            uint64_t dig3 = val / 100000ull;
            uint64_t dig4 = val - dig3 * 100000ull;

            int ndig = NDigits(dig1);

            std::memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig);
            std::memcpy(str + ndig, digits + dig2 * 5, 5);
            std::memcpy(str + ndig + 5, digits + dig3 * 5, 5);
            std::memcpy(str + ndig + 10, digits + dig4 * 5, 5);

            return ndig + 15;
        }
        else if (val >= 10000000000ull)
        {
            uint64_t dig1 = val / 10000000000ull;
            val -= dig1 * 10000000000ull;
            uint64_t dig2 = val / 100000ull;
            uint64_t dig3 = val - dig2 * 100000ull;

            int ndig = NDigits(dig1);

            std::memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig);
            std::memcpy(str + ndig, digits + dig2 * 5, 5);
            std::memcpy(str + ndig + 5, digits + dig3 * 5, 5);

            return ndig + 10;
        }
        else if (val >= 100000ull)
        {
            uint64_t dig1 = val / 100000ull;
            uint64_t dig2 = val - dig1 * 100000ull;

            int ndig = NDigits(dig1);

            std::memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig);
            std::memcpy(str + ndig, digits + dig2 * 5, 5);

            return ndig + 5;
        }
        else
        {
            int ndig = NDigits(val);

            std::memcpy(str, digits + val * 5 + (5 - ndig), ndig);

            return ndig;
        }
    }

    // Writes val with exactly prec digits after the decimal point
    // (prec == 0 yields a trailing '.'). prec is limited to 14, the largest
    // exponent available in powers10. Negative values get a leading '-'.
    static int Double2PChar(double val, uint32_t prec, char *str)
    {
        const uint32_t max_prec = (uint32_t)(sizeof(powers10) / sizeof(powers10[0])) - 1;
        if (prec > max_prec)
            prec = max_prec;

        char* p = str;
        if (val < 0)
        {
            // the integer part used to be cast to uint64_t with its sign
            *p++ = '-';
            val = -val;
        }

        const uint64_t scale = powers10[prec];
        uint64_t a = (uint64_t)val;
        // fraction offset by 1.0 so that it always prints as '1' followed by
        // exactly prec digits; the '1' is replaced by the decimal point below
        uint64_t b = (uint64_t)((1.0 + (val - (double)a)) * scale + 0.5);

        if (b >= 2 * scale)
        {
            // rounding carried into the integer part (e.g. 0.9999996 -> 1.000000);
            // without this the carry was overwritten by the decimal point
            b -= scale;
            ++a;
        }

        p += Int2PChar(a, p);
        char* dot = p;
        p += Int2PChar(b, p);
        *dot = '.';

        return (int)(p - str);
    }
};


// integral specialization
// Negative values of signed types are written with a leading '-'.
template <typename Integer, typename std::enable_if<std::is_integral<Integer>::value, int>::type* = nullptr>
int num2str(Integer val, char *out) {
    if (std::is_signed<Integer>::value && (int64_t)val < 0) {
        *out = '-';
        // negate in unsigned arithmetic so the most negative value is safe
        return 1 + NumericConversions::Int2PChar(0 - (uint64_t)val, out + 1);
    }
    return NumericConversions::Int2PChar((uint64_t)val, out);
}

// floating point specialization (6 digits after the decimal point)
template <typename Floating, typename std::enable_if<std::is_floating_point<Floating>::value, int>::type* = nullptr>
int num2str(Floating val, char *out) {
    return NumericConversions::Double2PChar((double)val, 6, out);
}

// pair specialization: "first:second"
template <typename T, typename U>
int num2str(const std::pair<T, U> val, char *out) {
    char* ptr = out;
    ptr += num2str(val.first, ptr);
    *ptr++ = ':';
    ptr += num2str(val.second, ptr);

    return (int)(ptr - out);
}

// collection specialization: every element is followed by delim
template <typename T>
int num2str(const T* collection, size_t size, char delim, char* out) {
    char* ptr = out;
    for (size_t i = 0; i < size; ++i) {
        ptr += num2str(*collection++, ptr);
        *ptr++ = delim;
    }

    return (int)(ptr - out);
}

// ---- src/utils/cpuid.h ----
#ifndef CPUID_H
#define CPUID_H
// Taken from https://stackoverflow.com/questions/1666093/cpuid-implementations-in-c

#ifdef _WIN32
#include <limits.h>
#include <intrin.h>
typedef unsigned __int32 uint32_t;

#else
#include <stdint.h>
#endif

// Executes the CPUID instruction for leaf i and keeps the four result registers.
class CPUID {
    uint32_t regs[4];

public:
    explicit CPUID(unsigned i) {
#ifdef _WIN32
        __cpuid((int *)regs, (int)i);

#else
        asm volatile
            ("cpuid" : "=a" (regs[0]), "=b" (regs[1]), "=c" (regs[2]), "=d" (regs[3])
                : "a" (i), "c" (0));
        // ECX is set to zero for CPUID function 4
#endif
    }

    const uint32_t &EAX() const { return regs[0]; }
    const uint32_t &EBX() const { return regs[1]; }
    const uint32_t &ECX() const { return regs[2]; }
    const uint32_t &EDX() const { return regs[3]; }
};

#endif // CPUID_H
#pragma once
#include <iterator>
#include <limits>
#include <stdexcept>
#include <type_traits>
#include <utility>

#undef min
#undef max

// Uniform integer distribution with a fully specified algorithm, so that the
// same generator output always maps to the same values.
// The interface follows std::uniform_int_distribution; the range [a, b] is
// inclusive on both ends.
template <class IntType = int>
class det_uniform_int_distribution {
public:
    // types
    typedef IntType result_type;
    typedef std::pair<IntType, IntType> param_type;   // (lower, upper)

    // constructors and reset functions
    explicit det_uniform_int_distribution(IntType a = 0, IntType b = std::numeric_limits<IntType>::max());
    explicit det_uniform_int_distribution(const param_type& parm);
    void reset();

    // generating functions
    template <class URNG>
    result_type operator()(URNG& g);
    template <class URNG>
    result_type operator()(URNG& g, const param_type& parm);

    // property functions
    result_type a() const;
    result_type b() const;
    param_type param() const;
    void param(const param_type& parm);
    result_type min() const;
    result_type max() const;

private:
    typedef typename std::make_unsigned<IntType>::type diff_type;

    IntType lower;
    IntType upper;
};

// Stores the bounds as given (no validation, unlike param()).
template <class IntType>
det_uniform_int_distribution<IntType>::det_uniform_int_distribution(IntType a, IntType b) {
    lower = a;
    upper = b;
}

// Same as param(parm): throws when parm.second < parm.first.
template <class IntType>
det_uniform_int_distribution<IntType>::det_uniform_int_distribution(const param_type& parm) {
    param(parm);
}

// The distribution keeps no state between draws, so there is nothing to reset.
template <class IntType>
void det_uniform_int_distribution<IntType>::reset() {}

// Draws a value from the stored range.
template <class IntType>
template <class URNG>
auto det_uniform_int_distribution<IntType>::operator()(URNG& g) -> result_type {
    return operator()(g, param());
}

// Draws a value from [parm.first, parm.second] using rejection sampling;
// the stored range is ignored and parm is not validated.
template <class IntType>
template <class URNG>
auto det_uniform_int_distribution<IntType>::operator()(URNG& g, const param_type& parm) -> result_type {
    diff_type diff = (diff_type)parm.second - (diff_type)parm.first + 1;
    if (diff == 0) // If the +1 overflows we are using the full range, just return g()
        return g();

    // draws whose quotient reaches this limit are rejected and redrawn
    diff_type badDistLimit = std::numeric_limits<diff_type>::max() / diff;
    do {
        diff_type generatedRand = g();

        if (generatedRand / diff < badDistLimit)
            return (IntType)((generatedRand % diff) + (diff_type)parm.first);
    } while (true);
}

template <class IntType>
auto det_uniform_int_distribution<IntType>::a() const -> result_type {
    return lower;
}

template <class IntType>
auto det_uniform_int_distribution<IntType>::b() const -> result_type {
    return upper;
}

template <class IntType>
auto det_uniform_int_distribution<IntType>::param() const -> param_type {
    return param_type(lower, upper);
}

// Replaces the stored range.
// Throws std::invalid_argument (a std::exception) when the upper bound is
// below the lower bound; the stored range is left unchanged in that case.
template <class IntType>
void det_uniform_int_distribution<IntType>::param(const param_type& parm) {
    if (parm.second < parm.first)
        throw std::invalid_argument("det_uniform_int_distribution: upper bound is below lower bound");

    lower = parm.first;
    upper = parm.second;
}

template <class IntType>
auto det_uniform_int_distribution<IntType>::min() const -> result_type {
    return lower;
}

template <class IntType>
auto det_uniform_int_distribution<IntType>::max() const -> result_type {
    return upper;
}


// Partial Fisher-Yates shuffle: for each position i in [first, middle) swaps
// first[i] with an element drawn from positions [i, last - first - 1].
// Elements in [middle, last) only take part as swap partners.
template <class RandomIt, class URBG>
void partial_shuffle(RandomIt first, RandomIt middle, RandomIt last, URBG&& g)
{
    typedef typename std::iterator_traits<RandomIt>::difference_type diff_t;
    typedef det_uniform_int_distribution<diff_t> distr_t;
    typedef typename distr_t::param_type param_t;

    distr_t D;
    diff_t n = middle - first;
    diff_t N = last - first - 1;
    for (diff_t i = 0; i < n; ++i) {
        using std::swap;
        swap(first[i], first[D(g, param_t(i, N))]);
    }
}
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #include "log.h" 9 | 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | const int Log::LEVEL_DEBUG = 0; 16 | const int Log::LEVEL_VERBOSE = 1; 17 | const int Log::LEVEL_NORMAL = 2; 18 | 19 | 20 | // ************************************************************************************ 21 | // NumericConversions statics 22 | char NumericConversions::digits[]; 23 | NumericConversions::_si NumericConversions::_init; 24 | uint64_t NumericConversions::powers10[]; 25 | 26 | 27 | 28 | // ***************************************************************************************** 29 | // 30 | Log::Log() 31 | { 32 | enabled = false; 33 | out = &std::cerr; 34 | } 35 | 36 | // ***************************************************************************************** 37 | // 38 | Log& Log::operator<< (std::ostream& (*pf)(std::ostream&)) 39 | { 40 | if (enabled) { 41 | *this->out << pf; 42 | out->flush(); 43 | } 44 | return *this; 45 | } 46 | 47 | // ***************************************************************************************** 48 | // 49 | Log& Log::operator<< (std::ios& (*pf)(std::ios&)) 50 | { 51 | if (enabled) { 52 | *this->out << pf; 53 | out->flush(); 54 | } 55 | 56 | return *this; 57 | } 58 | 59 | // ***************************************************************************************** 60 | // 61 | Log& Log::operator<< (std::ios_base& (*pf)(std::ios_base&)) 62 | { 63 | if (enabled) { 64 | *this->out << pf; 65 | out->flush(); 66 | } 67 | 68 | return *this; 69 | } 70 | 71 | // ***************************************************************************************** 72 | // 73 | std::string Log::formatLargeNumber(uint64_t num, int minWidth) { 74 | std::string out = ""; 75 | 76 | do { 77 | uint64_t part = num % 1000LL; 78 | num = num / 1000LL; 79 | 80 | if (num > 0) { 81 | 
std::ostringstream oss; 82 | oss << "," << std::setw(3) << std::setfill('0') << part; 83 | out = oss.str() + out; 84 | } 85 | else { 86 | out = std::to_string(part) + out; 87 | } 88 | 89 | } while (num > 0); 90 | 91 | int initialSpaces = (int) (minWidth - out.length()); 92 | 93 | if (initialSpaces > 0) { 94 | out = string(initialSpaces, ' ') + out; 95 | } 96 | 97 | return out; 98 | } -------------------------------------------------------------------------------- /src/utils/log.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define LOG_NORMAL Log::getInstance(Log::LEVEL_NORMAL) 18 | #define LOG_VERBOSE Log::getInstance(Log::LEVEL_VERBOSE) 19 | #define LOG_DEBUG Log::getInstance(Log::LEVEL_DEBUG) 20 | 21 | // ***************************************************************************************** 22 | // 23 | class Log 24 | { 25 | public: 26 | static const int LEVEL_NORMAL; 27 | static const int LEVEL_VERBOSE; 28 | static const int LEVEL_DEBUG; 29 | 30 | void enable() { enabled = true; } 31 | void disable() { enabled = false; } 32 | bool isEnabled() const { return enabled; } 33 | 34 | // ***************************************************************************************** 35 | // 36 | static Log& getInstance(int level) { 37 | static std::vector> logs; 38 | if (logs.size() == 0) { 39 | logs.push_back(std::shared_ptr(new Log())); 40 | logs.push_back(std::shared_ptr(new Log())); 41 | logs.push_back(std::shared_ptr(new Log())); 42 | } 43 | 44 | return *logs[level]; 45 | } 46 | 47 | // 
***************************************************************************************** 48 | // 49 | template 50 | Log& operator<<(T v) { 51 | if (enabled) { *out << v; } 52 | return *this; 53 | } 54 | 55 | Log& operator<< (std::ostream& (*pf)(std::ostream&)); 56 | Log& operator<< (std::ios& (*pf)(std::ios&)); 57 | Log& operator<< (std::ios_base& (*pf)(std::ios_base&)); 58 | 59 | static std::string formatLargeNumber(uint64_t num, int minWidth = 0); 60 | 61 | protected: 62 | bool enabled; 63 | std::ostream* out; 64 | 65 | Log(); 66 | }; 67 | 68 | 69 | 70 | // ************************************************************************************ 71 | class NumericConversions 72 | { 73 | public: 74 | static char digits[100000 * 5]; 75 | static uint64_t powers10[15]; 76 | struct _si { 77 | _si() 78 | { 79 | for (int i = 0; i < 100000; ++i) 80 | { 81 | int dig = i; 82 | 83 | digits[i * 5 + 4] = '0' + (dig % 10); 84 | dig /= 10; 85 | digits[i * 5 + 3] = '0' + (dig % 10); 86 | dig /= 10; 87 | digits[i * 5 + 2] = '0' + (dig % 10); 88 | dig /= 10; 89 | digits[i * 5 + 1] = '0' + (dig % 10); 90 | dig /= 10; 91 | digits[i * 5 + 0] = '0' + dig; 92 | } 93 | 94 | powers10[0] = 1; 95 | for (int i = 1; i < 15; ++i) 96 | powers10[i] = 10 * powers10[i - 1]; 97 | } 98 | } static _init; 99 | 100 | static int NDigits(uint64_t v) 101 | { 102 | return (v < 10000) 103 | ? (v < 100 ? (v < 10 ? 1 : 2) : (v < 1000 ? 3 : 4)) 104 | : (v < 1000000 ? (v < 100000 ? 5 : 6) : (v < 10000000 ? 
7 : 8)); 105 | } 106 | 107 | static int Int2PChar(uint64_t val, char *str) 108 | { 109 | if (val >= 1000000000000000ull) 110 | { 111 | uint64_t dig1 = val / 1000000000000000ull; 112 | val -= dig1 * 1000000000000000ull; 113 | uint64_t dig2 = val / 10000000000ull; 114 | val -= dig2 * 10000000000ull; 115 | uint64_t dig3 = val / 100000ull; 116 | uint64_t dig4 = val - dig3 * 100000ull; 117 | 118 | int ndig = NDigits(dig1); 119 | 120 | std::memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig); 121 | std::memcpy(str + ndig, digits + dig2 * 5, 5); 122 | std::memcpy(str + ndig + 5, digits + dig3 * 5, 5); 123 | std::memcpy(str + ndig + 10, digits + dig4 * 5, 5); 124 | 125 | return ndig + 15; 126 | } 127 | else if (val >= 10000000000ull) 128 | { 129 | uint64_t dig1 = val / 10000000000ull; 130 | val -= dig1 * 10000000000ull; 131 | uint64_t dig2 = val / 100000ull; 132 | uint64_t dig3 = val - dig2 * 100000ull; 133 | 134 | int ndig = NDigits(dig1); 135 | 136 | std::memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig); 137 | std::memcpy(str + ndig, digits + dig2 * 5, 5); 138 | std::memcpy(str + ndig + 5, digits + dig3 * 5, 5); 139 | 140 | return ndig + 10; 141 | } 142 | else if (val >= 100000ull) 143 | { 144 | uint64_t dig1 = val / 100000ull; 145 | uint64_t dig2 = val - dig1 * 100000ull; 146 | 147 | int ndig = NDigits(dig1); 148 | 149 | memcpy(str, digits + dig1 * 5 + (5 - ndig), ndig); 150 | memcpy(str + ndig, digits + dig2 * 5, 5); 151 | 152 | return ndig + 5; 153 | } 154 | else 155 | { 156 | int ndig = NDigits(val); 157 | 158 | memcpy(str, digits + val * 5 + (5 - ndig), ndig); 159 | 160 | return ndig; 161 | } 162 | } 163 | 164 | static int Double2PChar(double val, uint32_t prec, char *str) 165 | { 166 | int64_t a = (int64_t)val; 167 | int64_t b = (int64_t)((1.0 + (val - (double)a)) * powers10[prec] + 0.5); 168 | 169 | int r1 = Int2PChar(a, str); 170 | int r2 = Int2PChar(b, str + r1); 171 | str[r1] = '.'; 172 | 173 | return r1 + r2; 174 | } 175 | }; 176 | 
-------------------------------------------------------------------------------- /src/utils/memory_monotonic.h: -------------------------------------------------------------------------------- 1 | #ifndef _MEMORY_MONOTONIC_H 2 | #define _MEMORY_MONOTONIC_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace refresh { 12 | class memory_monotonic_base 13 | { 14 | protected: 15 | size_t block_size; 16 | size_t alignment; 17 | size_t total_allocated; 18 | size_t total_requested; 19 | size_t no_allocs; 20 | size_t no_deallocs; 21 | 22 | std::vector blocks; 23 | std::vector freezed_blocks; 24 | char* cur_block; 25 | size_t in_block_pos; 26 | 27 | void _release_blocks() 28 | { 29 | for (auto p : blocks) 30 | free(p); 31 | 32 | blocks.clear(); 33 | cur_block = nullptr; 34 | in_block_pos = block_size; 35 | } 36 | 37 | void _allocate_block(size_t size) 38 | { 39 | cur_block = (char*)malloc(size + alignment); 40 | total_allocated += size + alignment; 41 | 42 | blocks.push_back(cur_block); 43 | 44 | cur_block += alignment - (uint64_t)(cur_block) % alignment; 45 | in_block_pos = 0; 46 | } 47 | 48 | bool _deallocation_status() 49 | { 50 | return no_allocs == no_deallocs; 51 | } 52 | 53 | void* _allocate(size_t size) 54 | { 55 | if (in_block_pos + size > block_size) 56 | _allocate_block(std::max(block_size, size)); 57 | 58 | auto p = cur_block + in_block_pos; 59 | 60 | in_block_pos += (size + alignment - 1) / alignment * alignment; 61 | ++no_allocs; 62 | 63 | total_requested += size; 64 | 65 | return p; 66 | } 67 | 68 | template 69 | void _deallocate(T*& p) 70 | { 71 | if (!p) 72 | return; 73 | 74 | p = nullptr; 75 | ++no_deallocs; 76 | } 77 | 78 | void _freeze() 79 | { 80 | freezed_blocks.insert(freezed_blocks.end(), blocks.begin(), blocks.end()); 81 | blocks.clear(); 82 | cur_block = nullptr; 83 | in_block_pos = block_size; 84 | } 85 | 86 | void _release_freezed() 87 | { 88 | for (auto& p : freezed_blocks) 89 | free(p); 90 | 91 
| freezed_blocks.clear(); 92 | } 93 | 94 | void _release() 95 | { 96 | _release_freezed(); 97 | _release_blocks(); 98 | } 99 | 100 | public: 101 | memory_monotonic_base(size_t _block_size, size_t _alignment) : 102 | block_size(_block_size), 103 | alignment(_alignment), 104 | total_allocated(0), 105 | total_requested(0), 106 | no_allocs(0), 107 | no_deallocs(0), 108 | cur_block(nullptr), 109 | in_block_pos(_block_size) 110 | { 111 | } 112 | 113 | memory_monotonic_base() = delete; 114 | memory_monotonic_base(const memory_monotonic_base &x) = delete; 115 | memory_monotonic_base(memory_monotonic_base &&x) = delete; 116 | memory_monotonic_base& operator=(const memory_monotonic_base &x) = delete; 117 | memory_monotonic_base& operator=(const memory_monotonic_base &&x) = delete; 118 | 119 | ~memory_monotonic_base() 120 | { 121 | _release_freezed(); 122 | _release_blocks(); 123 | } 124 | }; 125 | 126 | // **** 127 | class memory_monotonic_unsafe : public memory_monotonic_base 128 | { 129 | public: 130 | memory_monotonic_unsafe(size_t _block_size = (1ull << 20), size_t _alignment = 64) 131 | : memory_monotonic_base(_block_size, _alignment) 132 | { 133 | } 134 | 135 | memory_monotonic_unsafe() = delete; 136 | memory_monotonic_unsafe(const memory_monotonic_unsafe& x) = delete; 137 | memory_monotonic_unsafe(memory_monotonic_unsafe&& x) = delete; 138 | memory_monotonic_unsafe& operator=(const memory_monotonic_unsafe& x) = delete; 139 | memory_monotonic_unsafe& operator=(const memory_monotonic_unsafe&& x) = delete; 140 | 141 | ~memory_monotonic_unsafe() 142 | {} 143 | 144 | bool deallocation_status() 145 | { 146 | return _deallocation_status(); 147 | } 148 | 149 | void* allocate(size_t size) 150 | { 151 | return _allocate(size); 152 | } 153 | 154 | template 155 | void deallocate(T*& p) 156 | { 157 | _deallocate(p); 158 | } 159 | 160 | void freeze() 161 | { 162 | _freeze(); 163 | } 164 | 165 | void release() 166 | { 167 | _release(); 168 | } 169 | 170 | void release_freezed() 171 
| { 172 | _release_freezed(); 173 | } 174 | }; 175 | 176 | 177 | // **** 178 | class memory_monotonic_safe : public memory_monotonic_base 179 | { 180 | std::mutex mtx; 181 | 182 | public: 183 | memory_monotonic_safe(size_t _block_size = (1ull << 20), size_t _alignment = 64) 184 | : memory_monotonic_base(_block_size, _alignment) 185 | { 186 | } 187 | 188 | memory_monotonic_safe() = delete; 189 | memory_monotonic_safe(const memory_monotonic_safe& x) = delete; 190 | memory_monotonic_safe(memory_monotonic_safe&& x) = delete; 191 | memory_monotonic_safe& operator=(const memory_monotonic_safe& x) = delete; 192 | memory_monotonic_safe& operator=(const memory_monotonic_safe&& x) = delete; 193 | 194 | ~memory_monotonic_safe() 195 | { 196 | } 197 | 198 | bool deallocation_status() 199 | { 200 | std::lock_guard lck(mtx); 201 | 202 | return _deallocation_status(); 203 | } 204 | 205 | void* allocate(size_t size) 206 | { 207 | std::lock_guard lck(mtx); 208 | 209 | return _allocate(size); 210 | } 211 | 212 | template 213 | void deallocate(T*& p) 214 | { 215 | std::lock_guard lck(mtx); 216 | 217 | _deallocate(p); 218 | } 219 | 220 | void freeze() 221 | { 222 | std::lock_guard lck(mtx); 223 | 224 | _freeze(); 225 | } 226 | 227 | void release() 228 | { 229 | std::lock_guard lck(mtx); 230 | 231 | _release(); 232 | } 233 | 234 | void release_freezed() 235 | { 236 | std::lock_guard lck(mtx); 237 | 238 | _release_freezed(); 239 | } 240 | }; 241 | } 242 | 243 | #endif -------------------------------------------------------------------------------- /src/utils/meta_oper.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _META_OPER_H 10 | #define _META_OPER_H 11 | 12 | //#include 13 | 14 | //#define UNROLL(N, x) ((N > 0) ? ((x), UNROLL((N)-1, (x))) : (x)) 15 | 16 | template struct uint_{ }; 17 | 18 | // For loop (forward) 19 | template 20 | static inline void IterFwd(const Lambda &oper, uint_) { 21 | IterFwd(oper, uint_()); 22 | oper(N); 23 | } 24 | 25 | template 26 | static inline void IterFwd(const Lambda &oper, uint_<0>) { 27 | oper(0); 28 | } 29 | 30 | // For loop (backward) 31 | template 32 | inline void IterRev(const Lambda &oper, uint_) { 33 | oper(N); 34 | IterRev(oper, uint_()); 35 | } 36 | 37 | template 38 | inline void IterRev(const Lambda &oper, uint_<0>) { 39 | oper(0); 40 | } 41 | 42 | #ifdef _MSC_VER // Visual C++ 43 | #include 44 | #define POPCNT(x) (uint32_t) __popcnt64(x) 45 | #endif 46 | 47 | #ifdef __GNUC__ 48 | #define POPCNT(x) (uint32_t) __builtin_popcountll(x) 49 | #endif 50 | 51 | 52 | #endif 53 | 54 | // ***** EOF 55 | -------------------------------------------------------------------------------- /src/utils/pooled_threads.cpp: -------------------------------------------------------------------------------- 1 | #include "pooled_threads.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | //define to create separate thread pool for each thread (than each thread is a producer of tasks) 12 | //#define USE_THREAD_LOCAL 13 | 14 | namespace pooled_threads 15 | { 16 | class ThreadPoolTask 17 | { 18 | int id; 19 | std::function fun; 20 | std::mutex mtx; 21 | std::condition_variable cv_join; 22 | bool done = false; 23 | public: 24 | int GetId() const 25 | { 26 | return id; 27 | } 28 | ThreadPoolTask(int id) : id(id) {} 29 | 30 | void SetFunction(std::function&& _f) 31 | { 32 | done = false; 33 | 34 | //I am not sure it it is legal, 
but if it is it is possible that it would work a little faster than move assignment operator 35 | //this->fun.~function(); 36 | //new (&this->fun) std::function(move(_f)); 37 | 38 | this->fun = std::move(_f); 39 | } 40 | void operator()() 41 | { 42 | fun(); 43 | } 44 | 45 | void Join() 46 | { 47 | std::unique_lock lck(mtx); 48 | cv_join.wait(lck, [this] {return done; }); 49 | } 50 | 51 | void NotifyDone() 52 | { 53 | std::lock_guard lck(mtx); 54 | done = true; 55 | cv_join.notify_one(); 56 | } 57 | }; 58 | 59 | 60 | class ThreadPoolTaskQueue 61 | { 62 | std::vector tasks; 63 | std::mutex mtx; 64 | std::condition_variable cv_pop; 65 | bool finished = false; 66 | 67 | unsigned long pos = 0; 68 | public: 69 | void SetMaxTasks(unsigned long m) 70 | { 71 | tasks.resize(m); 72 | } 73 | bool PopTask(ThreadPoolTask*& task) 74 | { 75 | std::unique_lock lck(mtx); 76 | cv_pop.wait(lck, [this] {return pos || finished; }); 77 | if (!pos) 78 | return false; 79 | task = tasks[--pos]; 80 | return true; 81 | } 82 | void AddTask(ThreadPoolTask& task) 83 | { 84 | std::lock_guard lck(mtx); 85 | tasks[pos++] = &task; 86 | if (pos == 1) 87 | cv_pop.notify_all(); 88 | } 89 | 90 | void Finish() 91 | { 92 | std::lock_guard lck(mtx); 93 | finished = true; 94 | cv_pop.notify_all(); 95 | } 96 | }; 97 | 98 | class ThreadPool 99 | { 100 | std::vector threads; 101 | std::vector> tasks; 102 | std::vector free_tasks; //free tasks ids 103 | ThreadPoolTaskQueue task_queue; 104 | std::mutex mtx; 105 | public: 106 | void AddTask(ThreadPoolTask& task) 107 | { 108 | task_queue.AddTask(task); 109 | } 110 | 111 | ThreadPoolTask* GetFreeTask() 112 | { 113 | #ifndef USE_THREAD_LOCAL 114 | std::lock_guard lck(mtx); 115 | #endif 116 | if (!free_tasks.size()) 117 | { 118 | tasks.emplace_back(std::make_unique(tasks.size())); 119 | task_queue.SetMaxTasks(static_cast(tasks.size())); 120 | free_tasks.emplace_back(static_cast(tasks.size()) - 1); 121 | threads.emplace_back([this]() 122 | { 123 | ThreadPoolTask* 
task; 124 | while (task_queue.PopTask(task)) 125 | { 126 | (*task)(); 127 | task->NotifyDone(); 128 | } 129 | }); 130 | } 131 | auto r = tasks[free_tasks.back()].get(); 132 | free_tasks.pop_back(); 133 | return r; 134 | } 135 | 136 | void ReturnTask(ThreadPoolTask* task) 137 | { 138 | #ifndef USE_THREAD_LOCAL 139 | std::lock_guard lck(mtx); 140 | #endif 141 | free_tasks.push_back(task->GetId()); 142 | } 143 | 144 | ~ThreadPool() 145 | { 146 | //std::cout << "ThreadPool dtor: \n"; 147 | //std::cout << "n threads: " << threads.size() << "\n"; 148 | //std::cout << "n tasks: " << tasks.size() << "\n"; 149 | task_queue.Finish(); 150 | for (auto& th : threads) 151 | if (th.joinable()) 152 | th.join(); 153 | } 154 | }; 155 | 156 | //Global Instance of thread pool 157 | #ifdef USE_THREAD_LOCAL 158 | thread_local ThreadPool thread_pool; 159 | #else 160 | ThreadPool thread_pool; 161 | #endif 162 | 163 | 164 | void thread::join() 165 | { 166 | if (task && _joinable) 167 | { 168 | task->Join(); 169 | thread_pool.ReturnTask(task); 170 | task = nullptr; 171 | _joinable = false; 172 | } 173 | } 174 | 175 | thread::~thread() 176 | { 177 | if (task && _joinable) 178 | std::terminate(); 179 | } 180 | 181 | void thread::Create(std::function&& f) 182 | { 183 | task = thread_pool.GetFreeTask(); 184 | task->SetFunction(std::move(f)); 185 | thread_pool.AddTask(*task); 186 | } 187 | } -------------------------------------------------------------------------------- /src/utils/pooled_threads.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace pooled_threads 5 | { 6 | class ThreadPoolTask; 7 | 8 | class thread 9 | { 10 | bool _joinable = false; 11 | ThreadPoolTask* task; 12 | void Create(std::function&& f); 13 | public: 14 | thread(const thread&) = delete; 15 | thread& operator=(const thread&) = delete; 16 | 17 | thread(thread&& rhs) : 18 | _joinable(rhs._joinable), task(rhs.task) 19 | { 20 | rhs._joinable = 
false; 21 | rhs.task = nullptr; 22 | } 23 | thread& operator=(thread&& rhs) 24 | { 25 | _joinable = rhs._joinable; 26 | task = rhs.task; 27 | rhs._joinable = false; 28 | rhs.task = nullptr; 29 | return *this; 30 | } 31 | 32 | template 33 | explicit thread(_Callable&& __f, _Args&&... __args) : 34 | _joinable(true) 35 | { 36 | Create(std::bind(std::forward<_Callable>(__f), std::forward<_Args>(__args)...)); 37 | } 38 | 39 | bool joinable() 40 | { 41 | return _joinable; 42 | } 43 | 44 | void join(); 45 | 46 | ~thread(); 47 | }; 48 | } -------------------------------------------------------------------------------- /src/utils/statistics.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define GET_STATS 9 | 10 | #ifdef GET_STATS 11 | #define STATS_WRITE(key,value) writeStats(key, value) 12 | #define STATS_ADD(key,value) addStats(key, value) 13 | #else 14 | #define STATS_WRITE(key,value) 15 | #define STATS_ADD(key,value) 16 | #endif 17 | 18 | 19 | class IStat { 20 | public: 21 | virtual ~IStat() {} 22 | virtual std::string toString() const = 0; 23 | virtual void add(const IStat& other) = 0; 24 | 25 | virtual std::shared_ptr clone() = 0; 26 | }; 27 | 28 | template 29 | class Stat : public IStat { 30 | public: 31 | typedef T value_type; 32 | 33 | Stat(T value) : value(value) {} 34 | 35 | std::shared_ptr clone() { 36 | auto copy = std::make_shared>(this->value); 37 | return copy; 38 | } 39 | 40 | virtual void add(T other) { 41 | value += other; 42 | } 43 | 44 | virtual void add(const IStat& other) { 45 | auto casted = dynamic_cast&>(other); 46 | value += casted.value; 47 | } 48 | 49 | virtual std::string toString() const { return std::to_string(value); } 50 | protected: 51 | T value; 52 | }; 53 | 54 | inline std::ostream& operator<<(std::ostream& os, IStat& stats) { 55 | os << stats.toString(); 56 | return os; 57 | } 58 | 59 | class Statistics 60 | { 
61 | public: 62 | 63 | virtual ~Statistics() {} 64 | 65 | template 66 | void put(const std::string& key, const T& value) { 67 | statistics[key] = std::make_shared>(value); 68 | } 69 | 70 | template 71 | void add(const std::string& key, const T& other) { 72 | auto casted = std::dynamic_pointer_cast>(statistics[key]); 73 | casted->add(other); 74 | } 75 | 76 | void clear() { statistics.clear(); } 77 | 78 | std::string toString() const 79 | { 80 | std::ostringstream oss; 81 | 82 | for (auto s = statistics.begin(); s != statistics.end(); s++) { 83 | oss << s->first << "=" << s->second->toString() << std::endl; 84 | } 85 | 86 | return oss.str(); 87 | } 88 | 89 | 90 | protected: 91 | std::map> statistics; 92 | }; -------------------------------------------------------------------------------- /src/utils/timer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifdef WIN32 10 | #include 11 | #endif 12 | 13 | #include // NULL 14 | #include "timer.h" 15 | 16 | 17 | #ifdef WIN32 18 | // ********************************************************** 19 | double CStopWatch::LIToSecs( LARGE_INTEGER & L) 20 | { 21 | return ((double)L.QuadPart /(double)frequency.QuadPart); 22 | } 23 | 24 | // ********************************************************** 25 | // CStopWatch 26 | // ********************************************************** 27 | CStopWatch::CStopWatch() 28 | { 29 | timer.start.QuadPart=0; 30 | timer.stop.QuadPart=0; 31 | 32 | QueryPerformanceFrequency( &frequency ); 33 | } 34 | 35 | // ********************************************************** 36 | void CStopWatch::StartTimer( ) 37 | { 38 | QueryPerformanceCounter(&timer.start); 39 | } 40 | 41 | // 
********************************************************** 42 | void CStopWatch::StopTimer( ) 43 | { 44 | QueryPerformanceCounter(&timer.stop); 45 | } 46 | 47 | // ********************************************************** 48 | double CStopWatch::GetElapsedTime() 49 | { 50 | LARGE_INTEGER time; 51 | time.QuadPart = timer.stop.QuadPart - timer.start.QuadPart; 52 | return LIToSecs(time); 53 | } 54 | 55 | 56 | #else 57 | // ********************************************************** 58 | CStopWatch::CStopWatch() 59 | { 60 | gettimeofday(&(timer.start), NULL); 61 | timer.stop = timer.start; 62 | } 63 | 64 | // ********************************************************** 65 | void CStopWatch::StartTimer( ) 66 | { 67 | gettimeofday(&(timer.start),NULL); 68 | } 69 | 70 | // ********************************************************** 71 | void CStopWatch::StopTimer( ) 72 | { 73 | gettimeofday(&(timer.stop),NULL); 74 | } 75 | 76 | // ********************************************************** 77 | double CStopWatch::GetElapsedTime() 78 | { 79 | timeval res; 80 | timersub(&(timer.stop),&(timer.start),&res); 81 | return res.tv_sec + res.tv_usec/1000000.0; // 10^6 uSec per second 82 | } 83 | 84 | 85 | #endif -------------------------------------------------------------------------------- /src/utils/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _TIMER_H 10 | #define _TIMER_H 11 | 12 | #ifdef WIN32 13 | #include 14 | 15 | // ********************************************************** 16 | typedef struct 17 | { 18 | LARGE_INTEGER start; 19 | LARGE_INTEGER stop; 20 | } stop_watch_t; 21 | 22 | // ********************************************************** 23 | class CStopWatch 24 | { 25 | stop_watch_t timer; 26 | LARGE_INTEGER frequency; 27 | double LIToSecs( LARGE_INTEGER & L); 28 | 29 | public: 30 | CStopWatch(); 31 | void StartTimer( ); 32 | void StopTimer( ); 33 | double GetElapsedTime(); 34 | }; 35 | 36 | // ********************************************************** 37 | typedef struct 38 | { 39 | ULARGE_INTEGER start; 40 | ULARGE_INTEGER stop; 41 | } thread_watch_t; 42 | 43 | 44 | 45 | #else 46 | #include 47 | #include 48 | #include 49 | 50 | typedef struct 51 | { 52 | timeval start; 53 | timeval stop; 54 | } stop_watch_t; 55 | 56 | class CStopWatch 57 | { 58 | stop_watch_t timer; 59 | 60 | public: 61 | CStopWatch(); 62 | void StartTimer( ); 63 | void StopTimer( ); 64 | double GetElapsedTime(); 65 | }; 66 | 67 | typedef timeval thread_watch_t; 68 | 69 | 70 | #endif 71 | 72 | #endif 73 | // ***** EOF 74 | -------------------------------------------------------------------------------- /src/utils/utils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include 10 | 11 | void mem_clear(void* ptr, size_t size) 12 | { 13 | memset(ptr, 0, size); 14 | } 15 | 16 | -------------------------------------------------------------------------------- /src/utils/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #ifndef _UTILS_H 10 | #define _UTILS_H 11 | 12 | #include "../core/defs.h" 13 | #include 14 | 15 | #ifdef _MSC_VER 16 | #include 17 | #endif 18 | 19 | void mem_clear(void* ptr, size_t size); 20 | 21 | #if SIMD==SIMD_AVX1 || SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 22 | void mem_clear_avx(void* ptr, size_t size); 23 | #endif 24 | 25 | #if SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 26 | void mem_clear_avx2(void* ptr, size_t size); 27 | #endif 28 | 29 | #if SIMD==SIMD_NEON 30 | void mem_clear_neon(void* ptr, size_t size); 31 | #endif 32 | 33 | // ******************************************************************* 34 | template 35 | T max4(T x1, T x2, T x3, T x4) 36 | { 37 | T p1 = (x1 > x2) ? x1 : x2; 38 | T p2 = (x3 > x4) ? x3 : x4; 39 | 40 | return (p1 > p2) ? 
p1 : p2; 41 | } 42 | 43 | // ******************************************************************* 44 | template 45 | void clear_vector(std::vector& vec) 46 | { 47 | std::vector().swap(vec); 48 | } 49 | 50 | // ******************************************************************* 51 | template 52 | void delete_arr_ptr(T* &ptr) 53 | { 54 | if (!ptr) 55 | return; 56 | 57 | delete[] ptr; 58 | ptr = nullptr; 59 | } 60 | 61 | // ******************************************************************* 62 | template 63 | void delete_ptr(T* &ptr) 64 | { 65 | if (!ptr) 66 | return; 67 | 68 | delete ptr; 69 | ptr = nullptr; 70 | } 71 | 72 | // ******************************************************************* 73 | template 74 | void tpl_prefetch(T* ptr) 75 | { 76 | #ifdef _MSC_VER // Visual C++ 77 | _mm_prefetch((const char*) ptr, 2); 78 | #endif 79 | #ifdef __GNUC__ 80 | // __builtin_prefetch((&(*dist_vector)[pi[j + prefetch_offset]]), 1, 2); 81 | __builtin_prefetch(ptr, 1, 2); 82 | #endif 83 | } 84 | 85 | #endif -------------------------------------------------------------------------------- /src/utils/utils_avx.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include 10 | #include "../core/defs.h" 11 | 12 | #if SIMD==SIMD_AVX1 || SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 13 | 14 | void mem_clear_avx(void* ptr, size_t size) 15 | { 16 | memset(ptr, 0, size); 17 | } 18 | #endif 19 | -------------------------------------------------------------------------------- /src/utils/utils_avx2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 
3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include 10 | #include "../core/defs.h" 11 | 12 | #if SIMD==SIMD_AVX2 || SIMD==SIMD_AVX512 13 | void mem_clear_avx2(void* ptr, size_t size) 14 | { 15 | memset(ptr, 0, size); 16 | } 17 | #endif 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/utils/utils_neon.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This file is a part of FAMSA software distributed under GNU GPL 3 licence. 3 | The homepage of the FAMSA project is https://github.com/refresh-bio/FAMSA 4 | 5 | Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Adam Gudys 6 | 7 | */ 8 | 9 | #include 10 | #include "../core/defs.h" 11 | 12 | #if SIMD==SIMD_NEON 13 | 14 | void mem_clear_neon(void* ptr, size_t size) 15 | { 16 | memset(ptr, 0, size); 17 | } 18 | #endif 19 | -------------------------------------------------------------------------------- /test/adeno_fiber/sl.dnd: -------------------------------------------------------------------------------- 1 | 
(((((((((((((((((((P1;1qhva:1.0,(B5MEN1_ADE02/1-178:1.0,J9Z5J6_9ADEN/401-578:1.0):1.0):1.0,V5RH14_ADE02/341-460:1.0):1.0,(((T1UF00_9ADEN/350-524:1.0,W6A1S0_9ADEN/350-524:1.0):1.0,J7IEE1_9ADEN/350-524:1.0):1.0,(((J9Z4R6_9ADEN/401-578:1.0,J9Z589_9ADEN/401-578:1.0):1.0,Q6R2L0_ADE01/401-578:1.0):1.0,(Q2KRZ3_ADE05/401-577:1.0,P1;1knb:1.0):1.0):1.0):1.0):1.0,(G9G864_9ADEN/260-439:1.0,Q6QPC5_9ADEN/260-439:1.0):1.0):1.0,((Q8UY68_9ADEN/248-421:1.0,((B5MEN3_ADE04/1-174:1.0,SPIKE_ADE04/249-422:1.0):1.0,(Q6BEA7_ADE04/248-421:1.0,Q6BEA6_ADE04/248-421:1.0):1.0):1.0):1.0,Q5YKW9_ADE04/1-140:1.0):1.0):1.0,(Q6QP89_9ADEN/266-439:1.0,Q6QPG1_9ADEN/268-441:1.0):1.0):1.0,(((((B5MES3_9ADEN/1-176:1.0,T1UK39_9ADEN/190-365:1.0):1.0,((((G1FBZ2_9ADEN/182-356:1.0,H9XUK9_9ADEN/1-174:1.0):1.0,(T1UKF6_9ADEN/182-356:1.0,H9XUK7_9ADEN/1-174:1.0):1.0):1.0,((B5MES1_9ADEN/1-174:1.0,H9XUJ9_9ADEN/1-173:1.0):1.0,O56784_9ADEN/182-355:1.0):1.0):1.0,((M0QUR0_9ADEN/188-363:1.0,T1UM78_9ADEN/188-363:1.0):1.0,(Q2Z0K1_9ADEN/122-298:1.0,H9XUJ6_9ADEN/1-176:1.0):1.0):1.0):1.0):1.0,(((((O56783_ADE17/186-362:1.0,T1UHF5_9ADEN/186-362:1.0):1.0,C8ZJU4_9ADEN/144-321:1.0):1.0,((((T1UKZ7_9ADEN/182-358:1.0,(H0PPH6_9ADEN/182-358:1.0,(((((E1AI37_9ADEN/182-358:1.0,H9XUJ2_ADE09/1-176:1.0):1.0,SPIKE_ADE09/182-358:1.0):1.0,F8UFS4_9ADEN/182-358:1.0):1.0,T1UKN9_9ADEN/182-358:1.0):1.0,(H9XUI9_ADE09/1-176:1.0,H9XUJ1_ADE09/1-176:1.0):1.0):1.0):1.0):1.0,((B9A5P2_9ADEN/181-357:1.0,(((((B5MET2_ADE08/1-177:1.0,M5AN00_ADE08/139-315:1.0):1.0,B9A5H0_ADE08/182-358:1.0):1.0,B5MET0_ADE08/1-177:1.0):1.0,SPIKE_ADE08/182-358:1.0):1.0,Q75Q72_ADE08/182-358:1.0):1.0):1.0,Q6I7S0_ADE09/181-357:1.0):1.0):1.0,G9MBX5_ADE08/1-136:1.0):1.0,((Q64822_9ADEN/186-361:1.0,Q76SK0_9ADEN/186-361:1.0):1.0,(B5MEN8_9ADEN/1-176:1.0,Q2Z0K8_9ADEN/114-289:1.0):1.0):1.0):1.0):1.0,Q76EL1_ADE08/181-289:1.0):1.0,((T1UHQ0_9ADEN/188-366:1.0,(H5T769_9ADEN/189-367:1.0,M0QUM1_9ADEN/189-367:1.0):1.0):1.0,(T1UGW1_9ADEN/189-367:1.0,((((T1UMC1_9ADEN/189-367:1.0,H9XUJ7_9ADEN/1-178:1.0):1.0
,C8ZJU1_9ADEN/144-322:1.0):1.0,Q2Z0J9_9ADEN/122-300:1.0):1.0,E9P574_9ADEN/189-367:1.0):1.0):1.0):1.0):1.0):1.0,(((((((Q2Z0I4_9ADEN/122-307:1.0,H9XUK6_9ADEN/1-185:1.0):1.0,B5MES9_9ADEN/1-187:1.0):1.0,(((W8CZP4_9ADEN/189-371:1.0,H9XUJ5_9ADEN/1-182:1.0):1.0,(M0QUE2_9ADEN/189-371:1.0,Q2Z0K4_9ADEN/122-304:1.0):1.0):1.0,((B5MES5_9ADEN/1-183:1.0,M0QUA4_9ADEN/189-369:1.0):1.0,H9XUK1_9ADEN/1-182:1.0):1.0):1.0):1.0,(((G3CK99_9ADEN/182-366:1.0,H9XUK8_9ADEN/1-184:1.0):1.0,W8VTP6_9ADEN/182-366:1.0):1.0,(((M0QVB6_9ADEN/189-373:1.0,C8ZJU5_9ADEN/145-329:1.0):1.0,(D2CH24_9ADEN/188-372:1.0,H9XUJ8_9ADEN/1-184:1.0):1.0):1.0,B5MER6_9ADEN/1-184:1.0):1.0):1.0):1.0,D4N3K6_9ADEN/183-367:1.0):1.0,((((((((Q76QY0_ADE15/181-361:1.0,B5MES0_9ADEN/1-181:1.0):1.0,Q2Z0J3_9ADEN/114-294:1.0):1.0,M0QU65_ADE15/181-352:1.0):1.0,((H9XUJ3_9ADEN/1-180:1.0,C5HDT4_9ADEN/182-362:1.0):1.0,H9XUJ4_9ADEN/1-175:1.0):1.0):1.0,Q2Z0K5_9ADEN/115-293:1.0):1.0,(((((B8LFM5_ADE17/188-375:1.0,T1UKU4_9ADEN/188-375:1.0):1.0,H9XUK4_9ADEN/1-187:1.0):1.0,(T1UIE7_9ADEN/188-375:1.0,T1UJE7_9ADEN/188-375:1.0):1.0):1.0,B5MES7_9ADEN/1-188:1.0):1.0,H9XUK5_9ADEN/1-187:1.0):1.0):1.0,((((Q2Z0I7_9ADEN/116-302:1.0,A4ZKK6_9ADEN/183-370:1.0):1.0,H9XUK2_9ADEN/1-186:1.0):1.0,H9XUK3_9ADEN/1-182:1.0):1.0,(T1UM09_9ADEN/183-372:1.0,C8ZJU2_9ADEN/140-323:1.0):1.0):1.0):1.0,((M0QUI2_9ADEN/188-371:1.0,(Q4KSI5_9ADEN/188-371:1.0,H9XUK0_9ADEN/1-183:1.0):1.0):1.0,B5MER1_9ADEN/1-184:1.0):1.0):1.0):1.0,SPIKE_ADE15/182-363:1.0):1.0):1.0,Q76C82_ADE08/182-236:1.0):1.0):1.0,(((((Q695Q9_9ADEN/377-542:1.0,F2WTI9_9ADEN/376-541:1.0):1.0,H9TET4_9ADEN/364-528:1.0):1.0,F6KSW0_9ADEN/392-557:1.0):1.0,((H9AAG5_9ADEN/243-415:1.0,H9AAJ7_9ADEN/243-415:1.0):1.0,((((((H9AAU5_9ADEN/242-414:1.0,H9AAX7_9ADEN/242-414:1.0):1.0,M9Z4H7_9ADEN/242-414:1.0):1.0,H9AAA0_9ADEN/242-414:1.0):1.0,H9AB10_9ADEN/242-414:1.0):1.0,H9AAD1_9ADEN/242-414:1.0):1.0,F2WTM1_9ADEN/242-414:1.0):1.0):1.0):1.0,(((B5MEP6_ADE18/1-173:1.0,Q2Z0H6_ADE18/113-285:1.0):1.0,((SPIKE_ADE12/410-583:1.0,P1;1noba:1.0):1.0
,((SPIKE_ADE31/379-552:1.0,G1DE39_9ADEN/378-551:1.0):1.0,T1UDQ2_9ADEN/378-551:1.0):1.0):1.0):1.0,G9MC87_ADE31/74-152:1.0):1.0):1.0):1.0,((((M9YY75_9ADEN/438-608:1.0,M9YZ57_9ADEN/423-593:1.0):1.0,Q0PLX4_ADES7/386-556:1.0):1.0,(H8PG12_9ADEN/433-603:1.0,((B5MER8_ADE40/1-171:1.0,(E2IJZ9_ADE41/388-558:1.0,B5SNT7_ADE41/388-558:1.0):1.0):1.0,F8WQR5_ADE41/373-543:1.0):1.0):1.0):1.0,(A0MK71_9ADEN/386-546:1.0,Q5C8N5_9ADEN/386-546:1.0):1.0):1.0):1.0,(Q8B4M4_ADEB2/378-544:1.0,Q9DLD8_ADEB2/391-557:1.0):1.0):1.0,((Q5C8N6_9ADEN/197-359:1.0,A0MK70_9ADEN/197-359:1.0):1.0,Q0PLX5_ADES7/181-343:1.0):1.0):1.0,Q8QVG3_ADEBA/970-1140:1.0):1.0,((SPIK2_ADE40/233-381:1.0,((E2IJZ8_ADE41/233-381:1.0,SPIK2_ADE41/233-381:1.0):1.0,(F8WQQ3_ADE41/233-380:1.0,W0S1J1_ADE41/233-380:1.0):1.0):1.0):1.0,F8WQQ9_ADE41/233-381:1.0):1.0):1.0,((((((T2CHU5_9ADEN/163-348:1.0,(D7P6B4_9ADEN/163-348:1.0,B5MEP4_ADE16/1-186:1.0):1.0):1.0,(T1ULE5_9ADEN/148-333:1.0,R4HLF4_ADE16/163-348:1.0):1.0):1.0,(T2D2F0_ADE16/1-168:1.0,T2D137_ADE16/1-168:1.0):1.0):1.0,((((((((((((((R4HLK2_9ADEN/133-314:1.0,A2I928_ADE03/100-281:1.0):1.0,A2I921_ADE03/100-281:1.0):1.0,A2I917_ADE03/100-281:1.0):1.0,I6LEU2_9ADEN/133-314:1.0):1.0,A2I923_ADE03/100-281:1.0):1.0,S4X4Y5_ADE03/133-314:1.0):1.0,U5T7T2_ADE03/108-289:1.0):1.0,L7PJL3_ADE03/90-265:1.0):1.0,A2I924_ADE03/100-281:1.0):1.0,(T2D132_ADE03/1-164:1.0,T2D2E1_ADE03/1-164:1.0):1.0):1.0,(((A2I922_ADE03/100-281:1.0,A2I920_ADE03/100-281:1.0):1.0,A2I934_ADE03/100-281:1.0):1.0,A2I941_ADE03/100-281:1.0):1.0):1.0,B5MEN2_ADE03/1-182:1.0):1.0,(((T2D2C2_ADE03/1-164:1.0,T2D2E7_ADE03/1-164:1.0):1.0,T2D333_ADE03/1-164:1.0):1.0,T2D2A0_ADE03/1-164:1.0):1.0):1.0,((Q2V0H2_ADE03/93-197:1.0,Q1HAV0_ADE03/104-206:1.0):1.0,Q5YKX1_ADE03/119-223:1.0):1.0):1.0):1.0,(((((((Q1ERP6_ADE07/134-319:1.0,(((((((SPIKE_ADE07/152-338:1.0,Q5EY45_ADE07/134-320:1.0):1.0,Q77SK6_ADE07/134-320:1.0):1.0,C7ED85_ADE07/134-320:1.0):1.0,J7I6T1_9ADEN/134-320:1.0):1.0,C7ED82_ADE07/134-320:1.0):1.0,C7ED83_ADE07/134-320:1.0):1.0,D7RGN5_ADE07
/134-318:1.0):1.0):1.0,L7PGT3_ADE07/92-272:1.0):1.0,(T2D324_ADE07/1-167:1.0,T2D293_ADE07/1-167:1.0):1.0):1.0,(T1UK06_9ADEN/133-319:1.0,((((T1UF82_9ADEN/134-320:1.0,Q67713_9ADEN/134-320:1.0):1.0,W6EK83_9ADEN/134-320:1.0):1.0,Q5UVZ3_9ADEN/134-320:1.0):1.0,Q76I92_ADE35/134-320:1.0):1.0):1.0):1.0,T2D126_9ADEN/1-167:1.0):1.0,(((((Q8V791_9ADEN/134-320:1.0,J7H5I2_9ADEN/120-306:1.0):1.0,D6BP12_ADE1A/134-320:1.0):1.0,S4UNB6_9ADEN/134-320:1.0):1.0,S4UM78_9ADEN/134-320:1.0):1.0,D6BP13_9ADEN/134-318:1.0):1.0):1.0,Q5YKW4_ADE07/112-214:1.0):1.0):1.0,(Q5YKW0_9ADEN/120-224:1.0,((T2D338_9ADEN/1-166:1.0,((Q27V31_9ADEN/115-298:1.0,B5MES8_9ADEN/1-184:1.0):1.0,Q67712_9ADEN/134-317:1.0):1.0):1.0,((((Q3ZKX3_9ADEN/134-317:1.0,B5MER3_ADE35/1-184:1.0):1.0,T1UFV8_9ADEN/134-317:1.0):1.0,Q91CL7_9ADEN/134-317:1.0):1.0,(T2D2C6_ADE35/1-166:1.0,T2D2A3_ADE35/1-166:1.0):1.0):1.0):1.0):1.0):1.0):1.0,G0ZAJ9_9ADEN/320-500:1.0):1.0,(H9AAD0_9ADEN/375-486:1.0,(((F2WTM0_9ADEN/360-473:1.0,((M9YVC2_9ADEN/375-484:1.0,(H9AB09_9ADEN/374-483:1.0,H9AAX6_9ADEN/374-483:1.0):1.0):1.0,H9AAR1_9ADEN/375-484:1.0):1.0):1.0,(((H9AAJ6_9ADEN/375-486:1.0,H9AAM9_9ADEN/375-486:1.0):1.0,H9AAG4_9ADEN/375-485:1.0):1.0,H9AAU4_9ADEN/375-486:1.0):1.0):1.0,H9AA99_9ADEN/373-482:1.0):1.0):1.0):1.0,M9Z2Q1_9ADEN/232-338:1.0):1.0,G1FQN7_9ADEN/377-552:1.0):1.0,Q83467_ADEP4/121-287:1.0); -------------------------------------------------------------------------------- /test/adeno_fiber/slink.dnd: -------------------------------------------------------------------------------- 1 | 
(Q83467_ADEP4/121-287:1.0,(G1FQN7_9ADEN/377-552:1.0,(M9Z2Q1_9ADEN/232-338:1.0,((H9AAD0_9ADEN/375-486:1.0,(H9AA99_9ADEN/373-482:1.0,((H9AAU4_9ADEN/375-486:1.0,((H9AAM9_9ADEN/375-486:1.0,H9AAJ6_9ADEN/375-486:1.0):1.0,H9AAG4_9ADEN/375-485:1.0):1.0):1.0,(F2WTM0_9ADEN/360-473:1.0,(H9AAR1_9ADEN/375-484:1.0,(M9YVC2_9ADEN/375-484:1.0,(H9AB09_9ADEN/374-483:1.0,H9AAX6_9ADEN/374-483:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(G0ZAJ9_9ADEN/320-500:1.0,((((((Q91CL7_9ADEN/134-317:1.0,(T1UFV8_9ADEN/134-317:1.0,(Q3ZKX3_9ADEN/134-317:1.0,B5MER3_ADE35/1-184:1.0):1.0):1.0):1.0,(T2D2A3_ADE35/1-166:1.0,T2D2C6_ADE35/1-166:1.0):1.0):1.0,(((B5MES8_9ADEN/1-184:1.0,Q27V31_9ADEN/115-298:1.0):1.0,Q67712_9ADEN/134-317:1.0):1.0,T2D338_9ADEN/1-166:1.0):1.0):1.0,Q5YKW0_9ADEN/120-224:1.0):1.0,(((((R4HLF4_ADE16/163-348:1.0,T1ULE5_9ADEN/148-333:1.0):1.0,((B5MEP4_ADE16/1-186:1.0,D7P6B4_9ADEN/163-348:1.0):1.0,T2CHU5_9ADEN/163-348:1.0):1.0):1.0,(T2D137_ADE16/1-168:1.0,T2D2F0_ADE16/1-168:1.0):1.0):1.0,(((T2D2A0_ADE03/1-164:1.0,(T2D333_ADE03/1-164:1.0,(T2D2E7_ADE03/1-164:1.0,T2D2C2_ADE03/1-164:1.0):1.0):1.0):1.0,(B5MEN2_ADE03/1-182:1.0,((A2I941_ADE03/100-281:1.0,(A2I934_ADE03/100-281:1.0,(A2I922_ADE03/100-281:1.0,A2I920_ADE03/100-281:1.0):1.0):1.0):1.0,((A2I924_ADE03/100-281:1.0,((U5T7T2_ADE03/108-289:1.0,(S4X4Y5_ADE03/133-314:1.0,(A2I923_ADE03/100-281:1.0,(I6LEU2_9ADEN/133-314:1.0,(A2I917_ADE03/100-281:1.0,(A2I921_ADE03/100-281:1.0,(A2I928_ADE03/100-281:1.0,R4HLK2_9ADEN/133-314:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,L7PJL3_ADE03/90-265:1.0):1.0):1.0,(T2D2E1_ADE03/1-164:1.0,T2D132_ADE03/1-164:1.0):1.0):1.0):1.0):1.0):1.0,(Q5YKX1_ADE03/119-223:1.0,(Q2V0H2_ADE03/93-197:1.0,Q1HAV0_ADE03/104-206:1.0):1.0):1.0):1.0):1.0,((((S4UM78_9ADEN/134-320:1.0,(S4UNB6_9ADEN/134-320:1.0,(D6BP12_ADE1A/134-320:1.0,(J7H5I2_9ADEN/120-306:1.0,Q8V791_9ADEN/134-320:1.0):1.0):1.0):1.0):1.0,D6BP13_9ADEN/134-318:1.0):1.0,(((T1UK06_9ADEN/133-319:1.0,(Q76I92_ADE35/134-320:1.0,(Q5UVZ3_9ADEN/134-320:1.0,(W6EK83_9ADEN/134-320:1.0,(Q67713_9AD
EN/134-320:1.0,T1UF82_9ADEN/134-320:1.0):1.0):1.0):1.0):1.0):1.0,(((Q1ERP6_ADE07/134-319:1.0,((C7ED83_ADE07/134-320:1.0,(C7ED82_ADE07/134-320:1.0,(J7I6T1_9ADEN/134-320:1.0,(C7ED85_ADE07/134-320:1.0,(Q77SK6_ADE07/134-320:1.0,(Q5EY45_ADE07/134-320:1.0,SPIKE_ADE07/152-338:1.0):1.0):1.0):1.0):1.0):1.0):1.0,D7RGN5_ADE07/134-318:1.0):1.0):1.0,L7PGT3_ADE07/92-272:1.0):1.0,(T2D324_ADE07/1-167:1.0,T2D293_ADE07/1-167:1.0):1.0):1.0):1.0,T2D126_9ADEN/1-167:1.0):1.0):1.0,Q5YKW4_ADE07/112-214:1.0):1.0):1.0):1.0,((F8WQQ9_ADE41/233-381:1.0,(SPIK2_ADE40/233-381:1.0,((E2IJZ8_ADE41/233-381:1.0,SPIK2_ADE41/233-381:1.0):1.0,(F8WQQ3_ADE41/233-380:1.0,W0S1J1_ADE41/233-380:1.0):1.0):1.0):1.0):1.0,(Q8QVG3_ADEBA/970-1140:1.0,((Q0PLX5_ADES7/181-343:1.0,(Q5C8N6_9ADEN/197-359:1.0,A0MK70_9ADEN/197-359:1.0):1.0):1.0,((Q8B4M4_ADEB2/378-544:1.0,Q9DLD8_ADEB2/391-557:1.0):1.0,((((H8PG12_9ADEN/433-603:1.0,(((B5SNT7_ADE41/388-558:1.0,E2IJZ9_ADE41/388-558:1.0):1.0,B5MER8_ADE40/1-171:1.0):1.0,F8WQR5_ADE41/373-543:1.0):1.0):1.0,(Q0PLX4_ADES7/386-556:1.0,(M9YY75_9ADEN/438-608:1.0,M9YZ57_9ADEN/423-593:1.0):1.0):1.0):1.0,(A0MK71_9ADEN/386-546:1.0,Q5C8N5_9ADEN/386-546:1.0):1.0):1.0,(((((H9AAJ7_9ADEN/243-415:1.0,H9AAG5_9ADEN/243-415:1.0):1.0,((H9AAD1_9ADEN/242-414:1.0,(H9AB10_9ADEN/242-414:1.0,((M9Z4H7_9ADEN/242-414:1.0,(H9AAU5_9ADEN/242-414:1.0,H9AAX7_9ADEN/242-414:1.0):1.0):1.0,H9AAA0_9ADEN/242-414:1.0):1.0):1.0):1.0,F2WTM1_9ADEN/242-414:1.0):1.0):1.0,(F6KSW0_9ADEN/392-557:1.0,((F2WTI9_9ADEN/376-541:1.0,Q695Q9_9ADEN/377-542:1.0):1.0,H9TET4_9ADEN/364-528:1.0):1.0):1.0):1.0,((((P1;1noba:1.0,SPIKE_ADE12/410-583:1.0):1.0,(T1UDQ2_9ADEN/378-551:1.0,(G1DE39_9ADEN/378-551:1.0,SPIKE_ADE31/379-552:1.0):1.0):1.0):1.0,(B5MEP6_ADE18/1-173:1.0,Q2Z0H6_ADE18/113-285:1.0):1.0):1.0,G9MC87_ADE31/74-152:1.0):1.0):1.0,(((Q6QPG1_9ADEN/268-441:1.0,Q6QP89_9ADEN/266-439:1.0):1.0,(((Q8UY68_9ADEN/248-421:1.0,((Q6BEA7_ADE04/248-421:1.0,Q6BEA6_ADE04/248-421:1.0):1.0,(B5MEN3_ADE04/1-174:1.0,SPIKE_ADE04/249-422:1.0):1.0):1.0):1.0,Q5YKW9_A
DE04/1-140:1.0):1.0,((Q6QPC5_9ADEN/260-439:1.0,G9G864_9ADEN/260-439:1.0):1.0,((((Q6R2L0_ADE01/401-578:1.0,(J9Z4R6_9ADEN/401-578:1.0,J9Z589_9ADEN/401-578:1.0):1.0):1.0,(P1;1knb:1.0,Q2KRZ3_ADE05/401-577:1.0):1.0):1.0,(J7IEE1_9ADEN/350-524:1.0,(W6A1S0_9ADEN/350-524:1.0,T1UF00_9ADEN/350-524:1.0):1.0):1.0):1.0,((P1;1qhva:1.0,(B5MEN1_ADE02/1-178:1.0,J9Z5J6_9ADEN/401-578:1.0):1.0):1.0,V5RH14_ADE02/341-460:1.0):1.0):1.0):1.0):1.0):1.0,(((SPIKE_ADE15/182-363:1.0,((D4N3K6_9ADEN/183-367:1.0,(((W8VTP6_9ADEN/182-366:1.0,(G3CK99_9ADEN/182-366:1.0,H9XUK8_9ADEN/1-184:1.0):1.0):1.0,(B5MER6_9ADEN/1-184:1.0,((M0QVB6_9ADEN/189-373:1.0,C8ZJU5_9ADEN/145-329:1.0):1.0,(D2CH24_9ADEN/188-372:1.0,H9XUJ8_9ADEN/1-184:1.0):1.0):1.0):1.0):1.0,((B5MES9_9ADEN/1-187:1.0,(Q2Z0I4_9ADEN/122-307:1.0,H9XUK6_9ADEN/1-185:1.0):1.0):1.0,(((M0QUE2_9ADEN/189-371:1.0,Q2Z0K4_9ADEN/122-304:1.0):1.0,(W8CZP4_9ADEN/189-371:1.0,H9XUJ5_9ADEN/1-182:1.0):1.0):1.0,(H9XUK1_9ADEN/1-182:1.0,(B5MES5_9ADEN/1-183:1.0,M0QUA4_9ADEN/189-369:1.0):1.0):1.0):1.0):1.0):1.0):1.0,((B5MER1_9ADEN/1-184:1.0,(M0QUI2_9ADEN/188-371:1.0,(Q4KSI5_9ADEN/188-371:1.0,H9XUK0_9ADEN/1-183:1.0):1.0):1.0):1.0,(((T1UM09_9ADEN/183-372:1.0,C8ZJU2_9ADEN/140-323:1.0):1.0,(((A4ZKK6_9ADEN/183-370:1.0,Q2Z0I7_9ADEN/116-302:1.0):1.0,H9XUK2_9ADEN/1-186:1.0):1.0,H9XUK3_9ADEN/1-182:1.0):1.0):1.0,(((B5MES7_9ADEN/1-188:1.0,((T1UJE7_9ADEN/188-375:1.0,T1UIE7_9ADEN/188-375:1.0):1.0,((T1UKU4_9ADEN/188-375:1.0,B8LFM5_ADE17/188-375:1.0):1.0,H9XUK4_9ADEN/1-187:1.0):1.0):1.0):1.0,H9XUK5_9ADEN/1-187:1.0):1.0,(Q2Z0K5_9ADEN/115-293:1.0,(((C5HDT4_9ADEN/182-362:1.0,H9XUJ3_9ADEN/1-180:1.0):1.0,H9XUJ4_9ADEN/1-175:1.0):1.0,((Q2Z0J3_9ADEN/114-294:1.0,(Q76QY0_ADE15/181-361:1.0,B5MES0_9ADEN/1-181:1.0):1.0):1.0,M0QU65_ADE15/181-352:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(((T1UK39_9ADEN/190-365:1.0,B5MES3_9ADEN/1-176:1.0):1.0,(((M0QUR0_9ADEN/188-363:1.0,T1UM78_9ADEN/188-363:1.0):1.0,(Q2Z0K1_9ADEN/122-298:1.0,H9XUJ6_9ADEN/1-176:1.0):1.0):1.0,(((T1UKF6_9ADEN/182-356:1.0,H9XUK7_9ADEN/1
-174:1.0):1.0,(G1FBZ2_9ADEN/182-356:1.0,H9XUK9_9ADEN/1-174:1.0):1.0):1.0,(O56784_9ADEN/182-355:1.0,(B5MES1_9ADEN/1-174:1.0,H9XUJ9_9ADEN/1-173:1.0):1.0):1.0):1.0):1.0):1.0,((((H5T769_9ADEN/189-367:1.0,M0QUM1_9ADEN/189-367:1.0):1.0,T1UHQ0_9ADEN/188-366:1.0):1.0,(T1UGW1_9ADEN/189-367:1.0,(E9P574_9ADEN/189-367:1.0,(Q2Z0J9_9ADEN/122-300:1.0,(C8ZJU1_9ADEN/144-322:1.0,(T1UMC1_9ADEN/189-367:1.0,H9XUJ7_9ADEN/1-178:1.0):1.0):1.0):1.0):1.0):1.0):1.0,(((C8ZJU4_9ADEN/144-321:1.0,(O56783_ADE17/186-362:1.0,T1UHF5_9ADEN/186-362:1.0):1.0):1.0,(((B5MEN8_9ADEN/1-176:1.0,Q2Z0K8_9ADEN/114-289:1.0):1.0,(Q64822_9ADEN/186-361:1.0,Q76SK0_9ADEN/186-361:1.0):1.0):1.0,(((Q6I7S0_ADE09/181-357:1.0,((Q75Q72_ADE08/182-358:1.0,(SPIKE_ADE08/182-358:1.0,(B5MET0_ADE08/1-177:1.0,(B9A5H0_ADE08/182-358:1.0,(B5MET2_ADE08/1-177:1.0,M5AN00_ADE08/139-315:1.0):1.0):1.0):1.0):1.0):1.0,B9A5P2_9ADEN/181-357:1.0):1.0):1.0,(T1UKZ7_9ADEN/182-358:1.0,(H0PPH6_9ADEN/182-358:1.0,((T1UKN9_9ADEN/182-358:1.0,(F8UFS4_9ADEN/182-358:1.0,(SPIKE_ADE09/182-358:1.0,(E1AI37_9ADEN/182-358:1.0,H9XUJ2_ADE09/1-176:1.0):1.0):1.0):1.0):1.0,(H9XUI9_ADE09/1-176:1.0,H9XUJ1_ADE09/1-176:1.0):1.0):1.0):1.0):1.0):1.0,G9MBX5_ADE08/1-136:1.0):1.0):1.0):1.0,Q76EL1_ADE08/181-289:1.0):1.0):1.0):1.0):1.0,Q76C82_ADE08/182-236:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0); -------------------------------------------------------------------------------- /test/adeno_fiber/upgma.dnd: -------------------------------------------------------------------------------- 1 | 
(((M9Z2Q1_9ADEN/232-338:1.0,((H9AAU4_9ADEN/375-486:1.0,(H9AAG4_9ADEN/375-485:1.0,(H9AAM9_9ADEN/375-486:1.0,H9AAJ6_9ADEN/375-486:1.0):1.0):1.0):1.0,(H9AAD0_9ADEN/375-486:1.0,(H9AA99_9ADEN/373-482:1.0,(F2WTM0_9ADEN/360-473:1.0,(M9YVC2_9ADEN/375-484:1.0,(H9AAR1_9ADEN/375-484:1.0,(H9AB09_9ADEN/374-483:1.0,H9AAX6_9ADEN/374-483:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,((G9MC87_ADE31/74-152:1.0,((B5MEP6_ADE18/1-173:1.0,Q2Z0H6_ADE18/113-285:1.0):1.0,((SPIKE_ADE31/379-552:1.0,(T1UDQ2_9ADEN/378-551:1.0,G1DE39_9ADEN/378-551:1.0):1.0):1.0,(P1;1noba:1.0,SPIKE_ADE12/410-583:1.0):1.0):1.0):1.0):1.0,(((F8WQQ9_ADE41/233-381:1.0,SPIK2_ADE40/233-381:1.0):1.0,((E2IJZ8_ADE41/233-381:1.0,SPIK2_ADE41/233-381:1.0):1.0,(F8WQQ3_ADE41/233-380:1.0,W0S1J1_ADE41/233-380:1.0):1.0):1.0):1.0,(Q8QVG3_ADEBA/970-1140:1.0,(((A0MK71_9ADEN/386-546:1.0,Q5C8N5_9ADEN/386-546:1.0):1.0,((H8PG12_9ADEN/433-603:1.0,(F8WQR5_ADE41/373-543:1.0,((B5SNT7_ADE41/388-558:1.0,E2IJZ9_ADE41/388-558:1.0):1.0,B5MER8_ADE40/1-171:1.0):1.0):1.0):1.0,(Q0PLX4_ADES7/386-556:1.0,(M9YY75_9ADEN/438-608:1.0,M9YZ57_9ADEN/423-593:1.0):1.0):1.0):1.0):1.0,((Q0PLX5_ADES7/181-343:1.0,(Q5C8N6_9ADEN/197-359:1.0,A0MK70_9ADEN/197-359:1.0):1.0):1.0,((Q8B4M4_ADEB2/378-544:1.0,Q9DLD8_ADEB2/391-557:1.0):1.0,(((H9AAJ7_9ADEN/243-415:1.0,H9AAG5_9ADEN/243-415:1.0):1.0,(F2WTM1_9ADEN/242-414:1.0,(H9AAD1_9ADEN/242-414:1.0,(H9AAA0_9ADEN/242-414:1.0,(H9AB10_9ADEN/242-414:1.0,(H9AAU5_9ADEN/242-414:1.0,(M9Z4H7_9ADEN/242-414:1.0,H9AAX7_9ADEN/242-414:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(F6KSW0_9ADEN/392-557:1.0,(H9TET4_9ADEN/364-528:1.0,(F2WTI9_9ADEN/376-541:1.0,Q695Q9_9ADEN/377-542:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(((Q5YKW4_ADE07/112-214:1.0,(((T2D324_ADE07/1-167:1.0,T2D293_ADE07/1-167:1.0):1.0,(L7PGT3_ADE07/92-272:1.0,(Q1ERP6_ADE07/134-319:1.0,(D7RGN5_ADE07/134-318:1.0,(SPIKE_ADE07/152-338:1.0,(Q5EY45_ADE07/134-320:1.0,(C7ED85_ADE07/134-320:1.0,(J7I6T1_9ADEN/134-320:1.0,(Q77SK6_ADE07/134-320:1.0,(C7ED83_ADE07/134-320:1.0,C7ED82
_ADE07/134-320:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,((D6BP13_9ADEN/134-318:1.0,(Q8V791_9ADEN/134-320:1.0,(D6BP12_ADE1A/134-320:1.0,(S4UNB6_9ADEN/134-320:1.0,(S4UM78_9ADEN/134-320:1.0,J7H5I2_9ADEN/120-306:1.0):1.0):1.0):1.0):1.0):1.0,(T2D126_9ADEN/1-167:1.0,(T1UK06_9ADEN/133-319:1.0,(T1UF82_9ADEN/134-320:1.0,(W6EK83_9ADEN/134-320:1.0,(Q67713_9ADEN/134-320:1.0,(Q76I92_ADE35/134-320:1.0,Q5UVZ3_9ADEN/134-320:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,((Q5YKW0_9ADEN/120-224:1.0,((T2D338_9ADEN/1-166:1.0,(Q67712_9ADEN/134-317:1.0,(B5MES8_9ADEN/1-184:1.0,Q27V31_9ADEN/115-298:1.0):1.0):1.0):1.0,((B5MER3_ADE35/1-184:1.0,(T1UFV8_9ADEN/134-317:1.0,(Q91CL7_9ADEN/134-317:1.0,Q3ZKX3_9ADEN/134-317:1.0):1.0):1.0):1.0,(T2D2A3_ADE35/1-166:1.0,T2D2C6_ADE35/1-166:1.0):1.0):1.0):1.0):1.0,(((T2D137_ADE16/1-168:1.0,T2D2F0_ADE16/1-168:1.0):1.0,((R4HLF4_ADE16/163-348:1.0,T1ULE5_9ADEN/148-333:1.0):1.0,((B5MEP4_ADE16/1-186:1.0,D7P6B4_9ADEN/163-348:1.0):1.0,T2CHU5_9ADEN/163-348:1.0):1.0):1.0):1.0,((Q2V0H2_ADE03/93-197:1.0,(Q5YKX1_ADE03/119-223:1.0,Q1HAV0_ADE03/104-206:1.0):1.0):1.0,((A2I941_ADE03/100-281:1.0,(A2I922_ADE03/100-281:1.0,(A2I934_ADE03/100-281:1.0,A2I920_ADE03/100-281:1.0):1.0):1.0):1.0,((T2D2A0_ADE03/1-164:1.0,(T2D2C2_ADE03/1-164:1.0,(T2D333_ADE03/1-164:1.0,T2D2E7_ADE03/1-164:1.0):1.0):1.0):1.0,(B5MEN2_ADE03/1-182:1.0,((T2D2E1_ADE03/1-164:1.0,T2D132_ADE03/1-164:1.0):1.0,(A2I924_ADE03/100-281:1.0,(L7PJL3_ADE03/90-265:1.0,(R4HLK2_9ADEN/133-314:1.0,(A2I921_ADE03/100-281:1.0,(A2I917_ADE03/100-281:1.0,(I6LEU2_9ADEN/133-314:1.0,(A2I923_ADE03/100-281:1.0,(S4X4Y5_ADE03/133-314:1.0,(U5T7T2_ADE03/108-289:1.0,A2I928_ADE03/100-281:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,((Q76C82_ADE08/182-236:1.0,(((T1UGW1_9ADEN/189-367:1.0,(E9P574_9ADEN/189-367:1.0,(C8ZJU1_9ADEN/144-322:1.0,(Q2Z0J9_9ADEN/122-300:1.0,(T1UMC1_9ADEN/189-367:1.0,H9XUJ7_9ADEN/1-178:1.0):1.0):1.0):1.0):1.0):1.0,(T1UHQ0_9ADEN/188-366:1.0,(H5T769_9ADEN/189-367:1.
0,M0QUM1_9ADEN/189-367:1.0):1.0):1.0):1.0,(Q76EL1_ADE08/181-289:1.0,((C8ZJU4_9ADEN/144-321:1.0,(O56783_ADE17/186-362:1.0,T1UHF5_9ADEN/186-362:1.0):1.0):1.0,(((B5MEN8_9ADEN/1-176:1.0,Q2Z0K8_9ADEN/114-289:1.0):1.0,(Q64822_9ADEN/186-361:1.0,Q76SK0_9ADEN/186-361:1.0):1.0):1.0,(G9MBX5_ADE08/1-136:1.0,((Q6I7S0_ADE09/181-357:1.0,(B9A5P2_9ADEN/181-357:1.0,(Q75Q72_ADE08/182-358:1.0,(M5AN00_ADE08/139-315:1.0,(B9A5H0_ADE08/182-358:1.0,(B5MET0_ADE08/1-177:1.0,(SPIKE_ADE08/182-358:1.0,B5MET2_ADE08/1-177:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(T1UKZ7_9ADEN/182-358:1.0,(H0PPH6_9ADEN/182-358:1.0,(H9XUJ1_ADE09/1-176:1.0,(H9XUI9_ADE09/1-176:1.0,(SPIKE_ADE09/182-358:1.0,(F8UFS4_9ADEN/182-358:1.0,(T1UKN9_9ADEN/182-358:1.0,(E1AI37_9ADEN/182-358:1.0,H9XUJ2_ADE09/1-176:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(Q83467_ADEP4/121-287:1.0,(G1FQN7_9ADEN/377-552:1.0,(G0ZAJ9_9ADEN/320-500:1.0,(((Q5YKW9_ADE04/1-140:1.0,(Q8UY68_9ADEN/248-421:1.0,(SPIKE_ADE04/249-422:1.0,(B5MEN3_ADE04/1-174:1.0,(Q6BEA7_ADE04/248-421:1.0,Q6BEA6_ADE04/248-421:1.0):1.0):1.0):1.0):1.0):1.0,((Q6QPG1_9ADEN/268-441:1.0,Q6QP89_9ADEN/266-439:1.0):1.0,((V5RH14_ADE02/341-460:1.0,(P1;1qhva:1.0,(B5MEN1_ADE02/1-178:1.0,J9Z5J6_9ADEN/401-578:1.0):1.0):1.0):1.0,((Q6QPC5_9ADEN/260-439:1.0,G9G864_9ADEN/260-439:1.0):1.0,((T1UF00_9ADEN/350-524:1.0,(J7IEE1_9ADEN/350-524:1.0,W6A1S0_9ADEN/350-524:1.0):1.0):1.0,((Q6R2L0_ADE01/401-578:1.0,(J9Z4R6_9ADEN/401-578:1.0,J9Z589_9ADEN/401-578:1.0):1.0):1.0,(P1;1knb:1.0,Q2KRZ3_ADE05/401-577:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(((T1UK39_9ADEN/190-365:1.0,B5MES3_9ADEN/1-176:1.0):1.0,((((G1FBZ2_9ADEN/182-356:1.0,H9XUK9_9ADEN/1-174:1.0):1.0,(T1UKF6_9ADEN/182-356:1.0,H9XUK7_9ADEN/1-174:1.0):1.0):1.0,(O56784_9ADEN/182-355:1.0,(B5MES1_9ADEN/1-174:1.0,H9XUJ9_9ADEN/1-173:1.0):1.0):1.0):1.0,((Q2Z0K1_9ADEN/122-298:1.0,H9XUJ6_9ADEN/1-176:1.0):1.0,(M0QUR0_9ADEN/188-363:1.0,T1UM78_9ADEN/188-363:1.0):1.0):1.0):1.0):1.0,((((B5MES9_9ADEN/1-187:1.0,(Q2Z0I4_9ADEN/122-307:1.0,H9
XUK6_9ADEN/1-185:1.0):1.0):1.0,((H9XUK1_9ADEN/1-182:1.0,(B5MES5_9ADEN/1-183:1.0,M0QUA4_9ADEN/189-369:1.0):1.0):1.0,((W8CZP4_9ADEN/189-371:1.0,H9XUJ5_9ADEN/1-182:1.0):1.0,(M0QUE2_9ADEN/189-371:1.0,Q2Z0K4_9ADEN/122-304:1.0):1.0):1.0):1.0):1.0,(D4N3K6_9ADEN/183-367:1.0,(((M0QVB6_9ADEN/189-373:1.0,C8ZJU5_9ADEN/145-329:1.0):1.0,(B5MER6_9ADEN/1-184:1.0,(D2CH24_9ADEN/188-372:1.0,H9XUJ8_9ADEN/1-184:1.0):1.0):1.0):1.0,(W8VTP6_9ADEN/182-366:1.0,(G3CK99_9ADEN/182-366:1.0,H9XUK8_9ADEN/1-184:1.0):1.0):1.0):1.0):1.0):1.0,((((H9XUK3_9ADEN/1-182:1.0,(H9XUK2_9ADEN/1-186:1.0,(A4ZKK6_9ADEN/183-370:1.0,Q2Z0I7_9ADEN/116-302:1.0):1.0):1.0):1.0,(T1UM09_9ADEN/183-372:1.0,C8ZJU2_9ADEN/140-323:1.0):1.0):1.0,((H9XUK5_9ADEN/1-187:1.0,(B5MES7_9ADEN/1-188:1.0,((T1UJE7_9ADEN/188-375:1.0,T1UIE7_9ADEN/188-375:1.0):1.0,(H9XUK4_9ADEN/1-187:1.0,(T1UKU4_9ADEN/188-375:1.0,B8LFM5_ADE17/188-375:1.0):1.0):1.0):1.0):1.0):1.0,(Q2Z0K5_9ADEN/115-293:1.0,((H9XUJ4_9ADEN/1-175:1.0,(C5HDT4_9ADEN/182-362:1.0,H9XUJ3_9ADEN/1-180:1.0):1.0):1.0,(M0QU65_ADE15/181-352:1.0,(Q2Z0J3_9ADEN/114-294:1.0,(Q76QY0_ADE15/181-361:1.0,B5MES0_9ADEN/1-181:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0,(SPIKE_ADE15/182-363:1.0,(B5MER1_9ADEN/1-184:1.0,(M0QUI2_9ADEN/188-371:1.0,(Q4KSI5_9ADEN/188-371:1.0,H9XUK0_9ADEN/1-183:1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0):1.0); -------------------------------------------------------------------------------- /test/dummy/many-seq: -------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 3 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 4 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 5 | >SPIKE_ADE08/182-358.2 6 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 7 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 8 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 9 | >SPIKE_ADE08/182-358.3 10 | 
LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 11 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 12 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY -------------------------------------------------------------------------------- /test/dummy/many-seq.aln: -------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 3 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 4 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 5 | >SPIKE_ADE08/182-358.2 6 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 7 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 8 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 9 | >SPIKE_ADE08/182-358.3 10 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 11 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 12 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 13 | -------------------------------------------------------------------------------- /test/dummy/one-seq: -------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 3 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 4 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY -------------------------------------------------------------------------------- /test/dummy/one-seq.aln: -------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 3 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 4 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 5 | -------------------------------------------------------------------------------- /test/dummy/two-seq: 
-------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 3 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 4 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 5 | >SPIKE_ADE08/182-358.2 6 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFTIKLLFDKNGV 7 | LMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyARDIVYGNIYLGGKPHQPVTIK 8 | TTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY -------------------------------------------------------------------------------- /test/dummy/two-seq.aln: -------------------------------------------------------------------------------- 1 | >SPIKE_ADE08/182-358 2 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 3 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 4 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 5 | >SPIKE_ADE08/182-358.2 6 | LWTTPDTSPNCRIDQDKDSKLSLVLTKCGSQILANVSLIVVAGRYKIINNntnpALKGFT 7 | IKLLFDKNGVLMESSNLGKSYWNFRNQNSIMSTAYEKAIGFMPNLVAYPKptTGSKKyAR 8 | DIVYGNIYLGGKPHQPVTIKTTFNQETGCEYSITFDFSWAKTYVNVEFETTSFTFSY 9 | -------------------------------------------------------------------------------- /test/scripts/reorder.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | from Bio.Seq import Seq 3 | 4 | 5 | # reorder part 6 | inputs = [seq for seq in SeqIO.parse('adeno_fiber', 'fasta')] 7 | outputs = {seq.id : seq for seq in SeqIO.parse('upgma.pp.fasta', 'fasta')} 8 | 9 | ordered_outputs = [] 10 | for r in inputs: 11 | ordered_outputs.append(outputs[r.id]) 12 | 13 | SeqIO.write(ordered_outputs, f'upgma.pp.ordered.fasta', 'fasta') 14 | 15 | -------------------------------------------------------------------------------- /test/scripts/split.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | from Bio.Seq import Seq 3 | import ete3 4 | 5 
# For each of the two sub-trees (upgma.part1.dnd, upgma.part2.dnd): pick the
# sequences named in the tree out of the unrefined alignment, remove the
# columns that contain only gaps within that subset, and write the resulting
# sub-alignment to upgma.part{i}.fasta.
for i in [1, 2]:

    # Read the Newick string; the context manager closes the file even if
    # reading fails.
    with open(f'upgma.part{i}.dnd', 'r') as file:
        tree_str = file.read().strip()

    # Sequence ids may themselves contain ';' (e.g. 'P1;1qhva'), which is also
    # the Newick terminator. Remove only the terminating ';' (tolerating a
    # trailing newline in the file), mask the remaining semicolons so the
    # parser does not stop early, then put the terminator back.
    if tree_str.endswith(';'):
        tree_str = tree_str[:-1]
    tree = ete3.Tree(tree_str.replace(';', '___') + ';')

    # Names of all tree nodes with the original ';' restored.
    nodes = {node.name.replace('___', ';') for node in tree.traverse("postorder")}

    # Keep only the aligned sequences that belong to this sub-tree.
    records = [seq for seq in SeqIO.parse('upgma.no_refine.fasta', 'fasta')
               if seq.id in nodes]

    # All rows are indexed up to the length of the first one; this raises
    # IndexError if no sequence matched the tree.
    length = len(records[0].seq)

    # Columns in which every selected sequence has a gap.
    to_remove = {c for c in range(length)
                 if all(r.seq[c] == '-' for r in records)}

    # The surviving columns are the same for every record, so compute them once.
    kept = [c for c in range(length) if c not in to_remove]

    for record in records:
        seq = record.seq
        record.seq = Seq(''.join(seq[c] for c in kept))

    SeqIO.write(records, f'upgma.part{i}.fasta', 'fasta')