├── .gitignore ├── .gitmodules ├── INSTALL.md ├── LICENCE ├── Makefile ├── README.md ├── doc ├── GraphMap-description.pdf ├── README-v0.21.md ├── README-v0.22.md ├── changelog.md ├── img │ ├── anchors-normal.png │ ├── anchors-rna.png │ ├── region_selection-rna.png │ └── region_selection.png ├── rnaseq.md └── sam_output.md ├── overlap.md ├── reproducibility ├── README.md ├── run.py └── setup.py ├── scripts └── scatterplot8.py └── src ├── aligner ├── aligner_base.h ├── aligner_containers.h ├── aligner_ksw2.cc ├── aligner_ksw2.h ├── aligner_util.cc ├── aligner_util.hpp ├── anchor_aligner.cc ├── anchor_aligner.h ├── pairwise_penalties.h ├── sam_parser.cc └── sam_parser.h ├── alignment ├── alignment.cc ├── alignment.h ├── alignment_wrappers.cc ├── alignment_wrappers.h ├── anchored.cc ├── cigargen.cc ├── cigargen.h ├── semiglobal.cc ├── transcriptome_mod.cc └── transcriptome_mod.h ├── containers ├── mapping_data.cc ├── mapping_data.h ├── path_graph_entry.cc ├── path_graph_entry.h ├── range.h ├── raw_alignment.h ├── region.cc ├── region.h ├── results.h ├── score_registry.cc ├── score_registry.h ├── vertices.cc └── vertices.h ├── graphmap ├── core_graphmap.cc ├── experimental.cc ├── filter_anchors.cc ├── filter_anchors.h ├── graphmap.cc ├── graphmap.h ├── lcs_anchored.cc ├── lcs_semiglobal.cc ├── process_read.cc ├── region_selection.cc ├── rna.cc ├── transcriptome.cc └── transcriptome.h ├── index ├── index_util.cc └── index_util.h ├── ksw2 ├── LICENSE.txt ├── kalloc.cc ├── kalloc.h ├── kseq.h ├── ksw2.h ├── ksw2_extd2_sse.cc ├── ksw2_exts2_sse.cc ├── ksw2_extz2_sse.cc └── ksw2_ll_sse.cc ├── main.cc ├── owler ├── lcsk.cc ├── owler.cc ├── owler.h ├── owler_data.h ├── owler_experimental.cc └── process_read.cc ├── program_parameters.cc ├── program_parameters.h └── sparsehash ├── COPYING ├── dense_hash_map ├── dense_hash_set ├── internal ├── densehashtable.h ├── hashtable-common.h ├── libc_allocator_with_realloc.h ├── sparseconfig.h └── sparsehashtable.h ├── sparse_hash_map ├── 
sparse_hash_set ├── sparsetable ├── template_util.h └── type_traits.h /.gitignore: -------------------------------------------------------------------------------- 1 | deprecated/ 2 | temp/ 3 | temp/* 4 | obj/ 5 | obj_debug/ 6 | obj_linux/ 7 | obj_mac/ 8 | obj_test/ 9 | obj_testext/ 10 | obj_extcigar/ 11 | # bin/graphmap-not_release 12 | # bin/graphmap-debug 13 | bin/ 14 | .project 15 | .cproject 16 | .settings/ 17 | reproducibility/*/ 18 | test-data/ 19 | .vscode* 20 | 21 | !reproducibility/*.py 22 | !reproducibility/*.md 23 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "codebase/seqlib"] 2 | path = codebase/seqlib 3 | url = https://github.com/isovic/seqlib.git 4 | [submodule "codebase/argumentparser"] 5 | path = codebase/argumentparser 6 | url = https://github.com/isovic/argumentparser.git 7 | [submodule "codebase/gindex"] 8 | path = codebase/gindex 9 | url = https://github.com/isovic/gindex 10 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## 1. Installation 2 | 3 | You will need a recent GCC/G++ version (>=4.7) to compile the source. 4 | 5 | To override the default compiler choice you can set GCC (or GCC_MAC on Mac), e.g.: 6 | 7 | ``` 8 | GCC=/usr/local/bin/g++ make 9 | ``` 10 | 11 | ### 1.1 Initialize submodules 12 | This will automatically initialize/pull the latest version of submodules. 13 | ``` 14 | make modules 15 | ``` 16 | 17 | Submodules are used as source files, so there is no need to pre-compile them in any way. 
18 | 19 | 20 | ### 1.2 Linux 21 | For a Linux release version, type: 22 | ``` 23 | make 24 | ``` 25 | 26 | To clean, type: 27 | ``` 28 | make clean 29 | ``` 30 | 31 | One can also rebuild, which will cause clean and make to be run sequentially: 32 | ``` 33 | make rebuild 34 | ``` 35 | 36 | ### 1.3 Mac 37 | ``` 38 | make mac 39 | 40 | make cleanmac 41 | make rebuildmac 42 | ``` 43 | 44 | ### 1.4. Compiling the debug version 45 | ``` 46 | make debug 47 | 48 | make cleandebug 49 | make rebuilddebug 50 | ``` 51 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Ivan Sovic, Mile Sikic and Niranjan Nagarajan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BIN = ./bin/graphmap2 2 | BIN_DEBUG = ./bin/graphmap-debug 3 | BIN_LINUX = ./bin/Linux-x64/graphmap2 4 | BIN_MAC = ./bin/Mac/graphmap 5 | OBJ_TESTING = ./obj_test 6 | OBJ_TESTING_EXT = ./obj_testext 7 | OBJ_DEBUG = ./obj_debug 8 | OBJ_LINUX = ./obj_linux 9 | OBJ_EXTCIGAR = ./obj_extcigar 10 | OBJ_MAC = ./obj_mac 11 | SOURCE = src 12 | CODEBASE = codebase 13 | # This finds all 'src' folders at maximum depth 2 (level one inside each submodule's folder). 14 | CODEBASE_SRC_FOLDERS = $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{} \;) 15 | # $(shell find $(CODEBASE) -maxdepth 3 -type d -name "libs" -exec echo "-I"{} \;) 16 | # $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{}"/*/" \;) 17 | 18 | # ? allows override by user using env var 19 | GCC ?= g++ 20 | # define variables for GCC version check here; the minor-version flag is also 1 for any major version > 4, so that e.g. GCC 5.4 or 10.2 does not trigger a spurious "minor version <7" warning 21 | GCC_MAJOR_VERSION_GE_4 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \>= 4) 22 | GCC_MINOR_VERSION_GE_7 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \> 4 \| `$(GCC) -dumpversion | cut -f2 -d.` \>= 7) 23 | GCC_MAC ?= g++ 24 | 25 | 26 | # CPP_FILES := $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp) 27 | # CC_FILES := $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) $(wildcard $(SOURCE)/libs/*/*.cc) 28 | # H_FILES := $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(SOURCE)/libs/*/*.h) 29 | CPP_FILES := $(wildcard $(CODEBASE)/*/src/*.cpp) $(wildcard $(CODEBASE)/*/src/libs/*/*.cpp) $(wildcard $(CODEBASE)/*/src/*/*.cpp) $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp) 30 | CC_FILES := $(wildcard $(CODEBASE)/*/src/*.cc) $(wildcard $(CODEBASE)/*/src/libs/*/*.cc) $(wildcard $(CODEBASE)/*/src/*/*.cc) $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) 
$(wildcard $(SOURCE)/libs/*/*.cc) 31 | H_FILES := $(wildcard $(CODEBASE)/*/src/*.h) $(wildcard $(CODEBASE)/*/src/libs/*/*.h) $(wildcard $(CODEBASE)/*/src/*/*.h) $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(CODEBASE)/*/src/*.hpp) $(wildcard $(CODEBASE)/*/src/*/*.hpp) $(wildcard $(SOURCE)/*/*.hpp) $(wildcard $(SOURCE)/*.hpp) $(wildcard $(SOURCE)/libs/*/*.h) 32 | 33 | OBJ_FILES := $(CPP_FILES:.cpp=.o) $(CC_FILES:.cc=.o) 34 | OBJ_FILES_FOLDER_TESTING := $(addprefix $(OBJ_TESTING)/,$(OBJ_FILES)) 35 | OBJ_FILES_FOLDER_TESTING_EXT := $(addprefix $(OBJ_TESTING_EXT)/,$(OBJ_FILES)) 36 | OBJ_FILES_FOLDER_DEBUG := $(addprefix $(OBJ_DEBUG)/,$(OBJ_FILES)) 37 | OBJ_FILES_FOLDER_LINUX := $(addprefix $(OBJ_LINUX)/,$(OBJ_FILES)) 38 | OBJ_FILES_FOLDER_EXTCIGAR := $(addprefix $(OBJ_EXTCIGAR)/,$(OBJ_FILES)) 39 | OBJ_FILES_FOLDER_MAC := $(addprefix $(OBJ_MAC)/,$(OBJ_FILES)) 40 | 41 | LIB_DIRS = -L"/usr/local/lib" 42 | CC_LIBS = -static-libgcc -static-libstdc++ -D__cplusplus=201103L 43 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"libs/libdivsufsort-2.0.1/build/include" -I"libs/seqan-library-1.4.2/include" 44 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"src/libs/seqan-library-1.4.2/include" 45 | INCLUDE = -I"./src/" -I"/usr/include/" -I"$(CODEBASE)/seqlib/src/libs/seqan-library-2.0.1/include" -I"$(CODEBASE)/seqlib/src/libs/libdivsufsort-2.0.1-64bit/" $(CODEBASE_SRC_FOLDERS) 46 | 47 | CC_FLAGS_DEBUG = -O3 -g -rdynamic -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native 48 | CC_FLAGS_RELEASE = -DRELEASE_VERSION -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread # -march=native 49 | CC_FLAGS_EXTCIGAR = -DRELEASE_VERSION -DUSE_EXTENDED_CIGAR_FORMAT -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native 50 | CC_FLAGS_NOT_RELEASE = -g -O3 
-fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native 51 | CC_FLAGS_NOT_RELEASE_EXT = -g -O3 -DUSE_EXTENDED_CIGAR_FORMAT -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native 52 | LD_FLAGS = -static-libgcc -static-libstdc++ -m64 -ffreestanding 53 | # LD_LIBS = -lpthread -lgomp -lm -lz -ldivsufsort64 54 | LD_LIBS = -lpthread -lgomp -lm -lz 55 | 56 | 57 | 58 | all: gcc_version_check linux 59 | 60 | install: /usr/bin/graphmap2 61 | # Installs the Linux release binary; run 'make' first so that $(BIN_LINUX) exists (no rule rebuilds it by file name). 62 | /usr/bin/graphmap2: $(BIN_LINUX) 63 | cp $(BIN_LINUX) /usr/bin/graphmap2 64 | 65 | modules: 66 | git submodule update --init --recursive 67 | # git submodule foreach git pull origin master 68 | 69 | testing: $(OBJ_FILES_FOLDER_TESTING) 70 | mkdir -p $(dir $(BIN)) 71 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING) $(LD_LIBS) 72 | 73 | obj_test/%.o: %.cc $(H_FILES) 74 | mkdir -p $(dir $@) 75 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $< 76 | 77 | obj_test/%.o: %.cpp $(H_FILES) 78 | mkdir -p $(dir $@) 79 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $< 80 | 81 | testingext: $(OBJ_FILES_FOLDER_TESTING_EXT) 82 | mkdir -p $(dir $(BIN)) 83 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING_EXT) $(LD_LIBS) 84 | 85 | obj_testext/%.o: %.cc $(H_FILES) 86 | mkdir -p $(dir $@) 87 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $< 88 | 89 | obj_testext/%.o: %.cpp $(H_FILES) 90 | mkdir -p $(dir $@) 91 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $< 92 | 93 | 94 | 95 | gcc_version_check: 96 | ifneq ($(GCC_MAJOR_VERSION_GE_4), 1) 97 | $(warning "*** WARNING $(GCC) major version <4 ***") 98 | endif 99 | ifneq ($(GCC_MINOR_VERSION_GE_7), 1) 100 | $(warning "*** WARNING $(GCC) minor version <7 ***") 101 | endif 102 | 103 | 104 
| debug: $(OBJ_FILES_FOLDER_DEBUG) 105 | mkdir -p $(dir $(BIN_DEBUG)) 106 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_DEBUG) $(OBJ_FILES_FOLDER_DEBUG) $(LD_LIBS) 107 | 108 | obj_debug/%.o: %.cc $(H_FILES) 109 | mkdir -p $(dir $@) 110 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $< 111 | 112 | obj_debug/%.o: %.cpp $(H_FILES) 113 | mkdir -p $(dir $@) 114 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $< 115 | 116 | 117 | 118 | linux: $(OBJ_FILES_FOLDER_LINUX) 119 | mkdir -p $(dir $(BIN_LINUX)) 120 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_LINUX) $(LD_LIBS) 121 | 122 | obj_linux/%.o: %.cc $(H_FILES) 123 | mkdir -p $(dir $@) 124 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $< 125 | 126 | obj_linux/%.o: %.cpp $(H_FILES) 127 | mkdir -p $(dir $@) 128 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $< 129 | 130 | 131 | 132 | extcigar: $(OBJ_FILES_FOLDER_EXTCIGAR) 133 | mkdir -p $(dir $(BIN_LINUX)) 134 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_EXTCIGAR) $(LD_LIBS) 135 | 136 | obj_extcigar/%.o: %.cc $(H_FILES) 137 | mkdir -p $(dir $@) 138 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $< 139 | 140 | obj_extcigar/%.o: %.cpp $(H_FILES) 141 | mkdir -p $(dir $@) 142 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $< 143 | 144 | 145 | 146 | mac: $(OBJ_FILES_FOLDER_MAC) 147 | mkdir -p $(dir $(BIN_MAC)) 148 | $(GCC_MAC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_MAC) $(OBJ_FILES_FOLDER_MAC) $(LD_LIBS) 149 | 150 | obj_mac/%.o: %.cc $(H_FILES) 151 | mkdir -p $(dir $@) 152 | $(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $< 153 | 154 | obj_mac/%.o: %.cpp $(H_FILES) 155 | mkdir -p $(dir $@) 156 | $(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $< 157 | 158 | 159 | 160 | # deps: 161 | # cd libs; cd libdivsufsort-2.0.1; make clean; rm -rf build; ./configure; mkdir build ;cd build; cmake -DBUILD_DIVSUFSORT64:BOOL=ON -DCMAKE_BUILD_TYPE="Release" -DBUILD_SHARED_LIBS=OFF 
-DCMAKE_INSTALL_PREFIX="/usr/local" .. ; make 162 | 163 | 164 | 165 | clean: 166 | -rm -rf $(OBJ_LINUX) $(BIN_LINUX) 167 | 168 | cleantesting: 169 | -rm -rf $(OBJ_TESTING) $(BIN) 170 | 171 | cleandebug: 172 | -rm -rf $(OBJ_DEBUG) $(BIN_DEBUG) 173 | 174 | cleanlinux: 175 | -rm -rf $(OBJ_LINUX) $(BIN_LINUX) 176 | 177 | cleanextcigar: 178 | -rm -rf $(OBJ_EXTCIGAR) $(BIN_LINUX) 179 | 180 | cleanmac: 181 | -rm -rf $(OBJ_MAC) $(BIN_MAC) 182 | 183 | cleanbin: 184 | -rm -rf bin/ 185 | # Runs the clean targets above (cleanlinux is omitted because it is identical to clean). 186 | cleanall: clean cleantesting cleandebug cleanextcigar cleanmac cleanbin 187 | 188 | 189 | 190 | rebuild: clean all 191 | 192 | rebuilddebug: cleandebug debug 193 | 194 | rebuildlinux: cleanlinux linux 195 | 196 | rebuildtesting: cleantesting testing 197 | 198 | rebuildmac: cleanmac mac 199 | 200 | # divsufsort: 201 | # cd libs; ./build-libdivsufsort.sh 202 | 203 | -------------------------------------------------------------------------------- /doc/GraphMap-description.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/GraphMap-description.pdf -------------------------------------------------------------------------------- /doc/README-v0.21.md: -------------------------------------------------------------------------------- 1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads 2 | 3 | Preprint of our paper now available on Biorxiv: 4 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719) 5 | 6 | Sequencing data of E. Coli UTI89 generated in-house and used in the paper now available on: 7 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557) 8 | 9 | 10 | 11 | **__Version: 0.21__** 12 | **Update** 13 | Release date: 02 June 2015 14 | 15 | New alignment mode available: anchored alignment. 
16 | 17 | Anchored alignment is an alternative to the default semiglobal alignment. It is less sensitive than default semiglobal, but faster and creates alignments around determined homologies (anchors). 18 | This is a very powerful addition to alignment, as it creates highly accurate and confident alignments even in the presence of high error rates. 19 | To run the anchored alignment, use the '-a anchor' option. 20 | 21 | Standard Gotoh alignment is also available now, as an alternative to the default Myers' bit-vector alignment. Custom alignment parameters can be specified via commandline. 22 | To use the Gotoh alignment, use the '-a gotoh' commandline option. 23 | 24 | Additionally, E-value and mapping quality thresholds can now be applied directly from commandline (-z and -c options). 25 | 26 | More to follow. 27 | 28 | 29 | 30 | **__Version: v0.20b__** 31 | **Update** 32 | Release date: 26 April 2015 33 | 34 | Added the source code. 35 | 36 | To build from source: 37 | ``` 38 | make 39 | ``` 40 | If the libraries have to be recompiled on your system, type: 41 | ``` 42 | make deps 43 | ``` 44 | More installation instructions can be found in the INSTALL file. 45 | 46 | 47 | Release date: 02 April 2015 48 | Precompiled binary, built on Ubuntu 10.04 x64. 49 | Tested on Mint 17.1 x64. 50 | 51 | Significantly improved speed and sensitivity. 52 | 53 | Added some important features: 54 | - Mapping quality. 55 | - Sensible alignment score. 56 | - E-value added in reported alignments! Look for a custom ZE parameter in the SAM lines. 57 | - Secondary alignments can now be output as well (use the -Z parameter). 58 | 59 | Addressed several reported issues: 60 | - Output only the first whitespace-separated token in the qname field of the SAM output. Previously the entire read's header was output. 61 | - The same for the rname. 62 | - Reads that are marked unmapped no longer contain any additional mapping information. 
63 | 64 | Please note that by default, GraphMap will use more memory to allow higher speed and sensitivity. 65 | To run in parsimonious mode (half the memory requirements), please use the -P parameter. 66 | 67 | Note #2: some command line parameters were changed (removed/added) since the last version, but most stayed the same. 68 | In case you are using one of the removed parameters, you will be warned and the process will not run. 69 | 70 | 71 | **__Version: v0.19b__** 72 | Release date: 16 January 2015 73 | Precompiled binary, built on Ubuntu 10.04 x64. 74 | Tested on Mint 17.1 x64. 75 | 76 | **Update** 77 | Compiled a MacOS version too, which can now also be found in the bin directory. 78 | Built on MacOS X 10.9.5 79 | 80 | Important updates: 81 | - Better support for circular genomes - use '-C' option if your reference is circular! 82 | - Added a more sensitive mode (though much slower) - check out the '-x' option in the help! 83 | - Better alignments for Illumina reads - again, check out the '-x' option. 84 | - Better dynamic range of the AS (alignment score) - value 254 is the best score, value 0 is the worst/unmapped. 85 | 86 | To use the normal (fast) mode, simply use the default parameters (nothing is changed, just omit the '-x' option). 87 | 88 | 89 | **__Version: v0.18b__** 90 | Release date: 11 December 2014 91 | Precompiled binary, built on Ubuntu 10.04 x64. 92 | Tested on Mint 17 (Ubuntu 14.04), Ubuntu Server 14.04, Fedora 20 and Gentoo. 93 | 94 | ### Description 95 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data. 96 | It can handle Oxford Nanopore data with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers (namely, compared to BLASR and BWA-MEM). 97 | 98 | GraphMap was designed for ease-of-use: the default parameters can handle a wide range of read lengths and error profiles. 
This is an important feature for technologies where the error rates and error profiles can vary widely across sequencing runs. In addition, GraphMap allows users to uniformly map read datasets from disparate technologies with high sensitivity and accuracy. While GraphMap is not runtime optimized for short-read data (e.g. compared to Bowtie2), it provides accurate and typically more sensitive mappings for Illumina and Ion Torrent reads. 99 | 100 | Please keep in mind that this is an early development version and we welcome your comments and feedback on GraphMap. 101 | 102 | ### Comparison to other mappers 103 | 104 | Comparison statistics will be uploaded soon. 105 | 106 | ### Usage 107 | 108 | ``` 109 | # Process all reads from a given FASTA/FASTQ file with default number of threads: 110 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam 111 | 112 | # Process reads using more sensitive parameters for Illumina data: 113 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam 114 | 115 | # Process reads from a circular genome: 116 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam 117 | 118 | # Limit the number of threads to 8, and load reads in batches of 50MB: 119 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 120 | 121 | # Process only the first 1000 reads: 122 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 123 | 124 | # Process all reads from a given folder. 125 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder 126 | 127 | # Generate only the index. 
128 | ./graphmap -I -r escherichia_coli.fa 129 | ``` 130 | 131 | ### Contact information 132 | 133 | For additional information, help and bug reports please send an email to one of the following: 134 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg 135 | -------------------------------------------------------------------------------- /doc/README-v0.22.md: -------------------------------------------------------------------------------- 1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads 2 | **__Current Version: 0.22__** 3 | Release date: 12 November 2015 4 | 5 | Updates: 6 | - Many tiny bug fixes, mostly related to anchored alignment. It should be slightly more sensitive now. 7 | - Two overlap modes merged from the dev branch: ```-w owler``` (fast, uses a trimmed GraphMap pipeline, reports output in the MHAP format) and ```-w overlapper``` (full GraphMap pipeline including alignment, output in SAM format). For usage - check examples at the bottom. 8 | - GraphMap integration into marginAlign - we forked marginAlign and extended it to support GraphMap alongside to LAST and BWA-MEM ([https://github.com/isovic/marginAlign](https://github.com/isovic/marginAlign)). Use parameters ```--graphmap``` or ```--graphmapanchor``` with marginAlign to specify the mapper. 9 | 10 | For more information on overlapping, take a look at [overlap.md](overlap.md). 11 | 12 | GraphMap is also used as an overlapper in a new *de novo* genome assembly project called [Ra](https://github.com/mariokostelac/ra-integrate) ([https://github.com/mariokostelac/ra-integrate](https://github.com/mariokostelac/ra-integrate)). 13 | Ra attempts to create *de novo* assemblies from raw nanopore and PacBio reads without requiring error correction, for which a highly sensitive overlapper is required. 
14 | 15 | 16 | ### Quick start on Linux x64 17 | ``` 18 | git clone https://github.com/isovic/graphmap.git 19 | cd graphmap 20 | make 21 | 22 | # To align: 23 | ./bin/Linux-x64/graphmap -r reference.fa -d reads.fasta -o output.sam 24 | 25 | # To overlap: 26 | ./bin/Linux-x64/graphmap -w owler -r reads.fasta -d reads.fasta -o output.mhap 27 | ``` 28 | 29 | ### Description 30 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data. 31 | It is **designed to handle Oxford Nanopore MinION 1d and 2d reads** with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers. 32 | 33 | GraphMap was also designed for ease-of-use: the **default parameters** can handle a wide range of read lengths and error profiles, including: *Illumina*, *PacBio* and *Oxford Nanopore*. 34 | This is an especially important feature for technologies where the error rates and error profiles can vary widely across, or even within, sequencing runs. 35 | 36 | **The GraphMap algorithm** is structured to achieve high-sensitivity and speed using a five-stage 37 | read-funneling approach. In stage I, GraphMap uses a novel adaptation of gapped spaced seeds to efficiently reduce the search space and get seed hits as a form of coarse alignment. These are then refined in stage II using graph-based vertex-centric processing of seeds to efficiently construct alignment anchors. GraphMap then chains anchors using a kmer 38 | version of longest common subsequence (LCS) construction (stage III), refines 39 | alignments with a form of L1 linear regression (stage IV) and finally evaluates the 40 | remaining candidates to select the best location to reconstruct a final alignment (stage V). 41 | GraphMap computes a BLAST-like E-value as well as a mapping quality for its alignments. 
42 | 43 | **Evaluation** on MinION sequencing datasets against short and long-read mappers indicates that GraphMap increases mapping sensitivity by at least 15-80%. GraphMap alignments are the first to demonstrate consensus calling with <1 error in 100,000 bases, variant calling on the human genome with 76% improvement in sensitivity over the next best mapper (BWA-MEM), precise detection of structural variants from 100bp to 4kbp in length and species and strain-specific identification of pathogens using MinION reads. 44 | 45 | Further details about the algorithm, comparison with other mappers and usage applications can be found in the **preprint** of our paper: 46 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719) 47 | 48 | **Nanopore sequencing data** of E. Coli UTI89 generated in-house and used in the paper now available on ENA: 49 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557) 50 | 51 | ### Features 52 | - Mapping position agnostic to alignment parameters. 53 | - Consistently very high sensitivity and precision across different error profiles, rates and sequencing technologies even with default parameters. 54 | - Circular genome handling to resolve coverage drops near ends of the genome. 55 | - E-value. 56 | - Meaningful mapping quality. 57 | - Various alignment strategies (semiglobal bit-vector and Gotoh, anchored). 58 | 59 | ### Installation 60 | To build GraphMap from source type: 61 | ``` 62 | make 63 | ``` 64 | Required libraries are prebuilt for Linux x64 systems. 65 | To rebuild them for other systems, type: 66 | ``` 67 | make deps 68 | ``` 69 | 70 | You will need a recent GCC/G++ version (>=4.7). 71 | 72 | More installation instructions can be found in the INSTALL file. 
73 | 74 | 75 | ### Usage examples 76 | ``` 77 | # Align all reads from a given FASTA/FASTQ file with default number of threads using semiglobal bit-vector alignment: 78 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam 79 | 80 | # Align all reads from a given FASTA/FASTQ file using anchored alignment approach: 81 | ./graphmap -a anchor -r escherichia_coli.fa -d reads.fastq -o alignments.sam 82 | 83 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast): 84 | ./graphmap -w owler -r reads.fa -d reads.fa -o overlaps.mhap 85 | 86 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow): 87 | ./graphmap -w overlapper -r reads.fa -d reads.fa -o overlaps.sam 88 | 89 | # Align reads using the Gotoh for semiglobal alignment: 90 | ./graphmap -a gotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam 91 | 92 | # Align reads using Gotoh alignment with anchored approach: 93 | ./graphmap -a anchorgotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam 94 | 95 | # Process reads from a circular genome: 96 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam 97 | 98 | # Threshold the E-value of alignments to 1e-100. 
Alignments with E-value > 1e-100 will be called unmapped: 99 | ./graphmap -z 1e-100 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 100 | 101 | # Output all similarly good alignments (to within F*num_kmers_of_best_alnmnt) instead of only one best: 102 | ./graphmap -Z -F 0.05 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 103 | 104 | # Limit the number of threads to 8, and load reads in batches of 50MB: 105 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 106 | 107 | # Align reads using more sensitive parameters for Illumina data (currently equivalent to "-a gotoh"): 108 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam 109 | 110 | # Load all reads in one batch and align only the first 1000 reads: 111 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 112 | 113 | # Process all reads from a given folder. 114 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder 115 | 116 | # Generate only the index. 117 | ./graphmap -I -r escherichia_coli.fa 118 | 119 | # Run a debug version of GraphMap (build with "make debug") and verbose the SAM output to see various info about alignment: 120 | ./graphmap-debug -b 3 -r escherichia_coli.fa -d reads.fastq -o alignments.sam 121 | 122 | ``` 123 | 124 | ### Contact information 125 | 126 | For additional information, help and bug reports please send an email to one of the following: 127 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg 128 | 129 | ### Acknowledgement 130 | This work was supported by the IMaGIN platform (project No. 102 101 0025), through a grant from the Science and Engineering Research Council, funding to the Genome Institute of Singapore from the Agency for Science, Technology and Research (A*STAR), Singapore, and funding from the Croatian Science Foundation (Project no. UIP-11-2013-7353 - Algorithms for Genome Sequence Analysis). 
131 | -------------------------------------------------------------------------------- /doc/changelog.md: -------------------------------------------------------------------------------- 1 | ## GraphMap - ChangeLog 2 | 3 | **__Version 0.5.0 -> 0.5.1__** 4 | Release date: 04 March 2017 5 | - Updated the gindex module for smaller memory consumption when building the index. Index construction is now a bit slower (single thread is used for collecting minimizers), but collection of minimizers is now performed on the fly. Previously, all seeds would be collected first, and then they would be pulled through a minimizer generation function. Now, each seed is pushed into the minimizer queue and if the queue yields a seed which is different from the previous one, it is emplaced on the list. 6 | The memory consumption is still large (similar to the index in versions 0.4.x), which is due to 128-bit integer representation of all seeds (seed key, sequence ID and sequence position). This could be reduced further by careful redesign. 7 | The disk version of the index is fully compatible with version 0.5.0. 8 | The reduced memory consumption also directly impacts the Owler mode. 9 | 10 | **__Version 0.4.1 -> 0.5.0__** 11 | Release date: 28 February 2017 12 | - Re-implemented the index. Removed all other indexes that were previously implemented, and cleaned up the code to only use the new index (MinimizerIndex). MinimizerIndex is implemented in a separate repo added to the codebase. It also uses a hash table to store the seeds, however instead of the perfect hash as before, Google's DenseHash is used. Seeds are first compiled in a giant list (each sequence in its space, in parallel), and afterwards the list is sorted (also multithreaded). Basic statistics on seed key distribution are calculated (mean, median, standard deviation). 
The index also allows thresholding the number of hits during lookup (keys with a count higher than a user-specified percentile are skipped) which is very significant for large, repetitive genomes. The index can also generate minimizers (also user specified). Index also allows for custom indexing shapes to be defined, and creates the lookup shapes automatically. 13 | - Changed the command line parameters to allow for new features, concretely: 14 | 1. Removed the parameter ```max-hits``` which is now obsolete. 15 | 2. Added parameter ```minimizer-window``` to specify the length of the minimizer window to choose minimizers from. If equal to 1, minimizers won't be used. 16 | 3. Added parameter ```freq-percentil``` to specify the percentile of key occurrences which will be kept. E.g. if 0.99, then 1% of most repetitive keys will be skipped. If 1.0, no filtering will be used. 17 | 4. Added parameter ```fly-index``` which will generate index on the fly and won't store it to disk. If the index already exists on disk, it will be loaded. To completely generate a new index on the fly, use ```--fly-index --rebuild-index```. 18 | 5. Renamed the parameter which was previously known as ```sensitive``` to ```double-index```. 19 | 6. Added a composite parameter called ```-x sensitive``` which will turn off minimizers and key frequency filtering. 20 | 21 | - Fixed an issue with RNA-seq transcriptome mapping, where recall would be lower than expected. There was a bug when checking if alignment is sane - the check would occur *after* the alignment was converted from transcriptome space to genome space, instead of while it was still in transcriptome space. This could not have caused false positives, but definitely caused many reads to be unmapped. 22 | - The reimplemented index now fixes the issue of segmentation fault on the human genome. 23 | 24 | 25 | 26 | **__Version 0.4.0 -> 0.4.1__** 27 | Release date: 28 January 2017 28 | - Fixed the SAM headers for transcriptome mapping. 
In the last version, the headers corresponded to the transcriptome headers, although the alignments are in the genome space. 29 | 30 | **__Version 0.3.2 -> 0.4.0__** 31 | Release date: 22 January 2017 32 | - GraphMap can now accept a GTF file for mapping to a transcriptome. Transcriptome is internally generated using the reference file and the GTF file, and index built from the transcriptome. Reads are then mapped to the transcriptome, and final alignments converted back to the genome coordinate space by introducing 'N' operations at splice sites. 33 | - Transcriptome mapping is only available in anchored alignment modes. 34 | - Updated Edlib to the newest version. Previous version had a bug in the traceback. 35 | - Recent changes in Edlib produced leading and trailing deletions in some cases. This is now handled by removing the deletions and shifting the alignment start position. 36 | - Fixed several (possible) memory leaks and invalid reads/writes. Generating the MD tag in SAM files had an invalid read which for some reason caused strange artifacts in CIGAR strings. 37 | 38 | **__Version 0.3.1 -> 0.3.2__** 39 | Release date: 19 December 2016 40 | - There were segfaults caused by recently-introduced bugs to Edlib. It has since been updated, and this version of GraphMap now includes the fixed version of Edlib. 41 | - There was a memory leak when generating clusters. 42 | - Minor fixes to some syntax. 43 | 44 | **__Version 0.3.0 -> 0.3.1__** 45 | Release date: 12 October 2016 46 | - Important: Fixed MD field issues 47 | - Minor bug fixes: composite command line parameter ```-x illumina``` depended on a parameter which wasn't defined properly, filtered empty SAM lines, etc. 48 | 49 | **__Version 0.22 -> 0.3.0__** 50 | Release date: 15 April 2016 51 | If you are using versions 0.3.x please update to the most recent commit. There were several important memory access issues which are now resolved. 
52 | GraphMap's command line has changed significantly between version 0.3.x and 0.2x - although many options remain similar, the usage is incompatible with the older releases due to explicit tool specification. 53 | The first parameter is now mandatory, and specifies whether the **mapping/alignment** (```./graphmap align```) or **overlapping** (```./graphmap owler```) should be used. 54 | **Important change in command line parameters.** The new version is not completely compatible with the previous one. For this reason, the minor version number has changed. 55 | - Changed the version numbering from: ```x.yz``` to ```x.y.z``` 56 | - Implemented a new argument parser. 57 | - Fixed a bug with overhanging base (Issue #14), commit: 41ae30b0d8603469c62794cba1960dc42f739d4e 58 | - Fixed the extensions of alignment to read ends when near an overhang (Issue #18). 59 | - Fixed Issue #19 - inconsistent behaviour for parameter ```-F```. 60 | - Cleaned up the code a bit. 61 | - Restructured the code. Majority of the code was extracted from the repository to be used as the codebase for this and other projects. GraphMap's main code is left in this repo, while the rest is linked via git submodules. 62 | - Added support for reading SAM and GFA files as the input sequences. Gzipped versions of all formats are supported as well. By default the format is chosen by the extension of the file (--infmt auto), but can be specified manually. 63 | - Added support for the M5 output format. 64 | - Added the MD field to the SAM output. 65 | - New and better anchor filtering (anchored modes only) using chaining of anchors that passed the LCSk. 66 | - New and better clustering of anchor stretches. This will be used for implementing RNA-seq alignment. 67 | - No need to precompile libraries for your system anymore. Libraries are now included in the source, or in the submodules. To initialize submodules, either clone recursively, or call ```make modules``` once GraphMap repo has been cloned. 
68 | - Anchored alignment is now the default one. 69 | 70 | Important command line changes: 71 | - Long argument names are now provided. 72 | - Extended CIGAR format can now be used via commandline through the --extcigar parameter (unlike before, where the code needed to be recompiled). 73 | - By default, GraphMap now uses only one gapped spaced index (previously, two were used by default; one could have been used by specifying the parsimonious mode). The defaults now correspond to the former parsimonious mode. To use two indexes, specify the parameter: --sensitive 74 | - The ```-w owler``` and ```-w overlapper``` have been moved. The alignment/owler mode is chosen as the first parameter in the commandline now (a "subprogram"; e.g. run ```graphmap owler```). To use the former ```-w overlapper```, specify ```-x overlap``` instead. This mode has now been simply converted to a composite parameter. There is also a command line parameter ```--overlapper``` which only controls the counting of hits in order to skip self-hits. 75 | - There is now a default E-value filter set at ```1e0``` 76 | - There is now a default MAPQ filter set at ```1``` 77 | - It is now possible to switch off extension of alignments to read ends (parameter: ```--no-end2end```). 
78 | - If the index needs to be rebuilt, it can now be done using a single command line with parameter: ```--rebuild-index``` 79 | -------------------------------------------------------------------------------- /doc/img/anchors-normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-normal.png -------------------------------------------------------------------------------- /doc/img/anchors-rna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-rna.png -------------------------------------------------------------------------------- /doc/img/region_selection-rna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection-rna.png -------------------------------------------------------------------------------- /doc/img/region_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection.png -------------------------------------------------------------------------------- /doc/rnaseq.md: -------------------------------------------------------------------------------- 1 | ## Mapping RNA-seq reads 2 | 3 | ### 1. Transcriptome mapping 4 | Since version 0.4.0, GraphMap has support for mapping reads to internally generated transcriptomes. 5 | These features are available on the ```master``` branch of the GraphMap repo. 6 | To use this feature, a ```GTF``` annotations file is needed alongside the reference ```FASTA```. 7 | The goal of this option is to simplify the process for end-users. 
The final alignments are also automatically transformed back to genomic coordinates, thus completely wrapping the entire process. 8 | 9 | 10 | 11 | To use this feature, simply specify ```--gtf annotations.gtf``` alongside the other command line parameters. 12 | 13 | **Acknowledgements** 14 | Mile Šikić (MS) and Niranjan Nagarajan (NN) proposed the implementation of this approach for RNA-seq mapping. Ivan Krpelnik (IK) implemented the initial version of the transcriptome generator, as well as the conversion utility to convert from transcriptome space back to genome space. IK worked under guidance from Krešimir Križanović (KK) and Ivan Sović (IS). The new methods were embedded in existing GraphMap source code by IS. KK was/is working on evaluation and benchmarking of our RNA-seq methods. Mile Šikić (MS) supervised the project. 15 | 16 | ### 2. Spliced alignments 17 | Support for spliced alignments in GraphMap is a work in progress and currently experimental. 18 | To activate this mode, specify ```-x rnaseq``` alongside the other command line parameters. This feature is available on the ```rna-alpha``` branch. Install and compile in the testing mode as such: 19 | ``` 20 | git clone https://github.com/isovic/graphmap 21 | cd graphmap 22 | git checkout rna-alpha 23 | make modules 24 | make -j 4 testing 25 | ``` 26 | 27 | After this, run GraphMap using: 28 | ``` 29 | bin/graphmap-not_release align -x rnaseq -r ref.fa -d reads.fastq -o out.sam 30 | ``` 31 | 32 | ***Please be aware*** that this is currently a highly experimental feature. It is not production-ready. The implementation may change significantly from version to version. 33 | 35 | 36 | Here is a short description of the approach we are taking. 37 | 38 | **2.1 Region selection** 39 | The GraphMap [paper](http://www.nature.com/ncomms/2016/160415/ncomms11307/full/ncomms11307.html) describes the region selection process (first step in GraphMap). 
In short, an array of bins is constructed, where each bin represents a consecutive, non-overlapping region of the reference, where each bin is of size ```read_length / 3```. For each seed of an analyzed read, all hits on the reference are looked-up. For each hit, ```+1``` is added to a bin corresponding to region where the hit falls into. (If a seed has multiple hits in the same region, only one is counted). 40 | 41 | 42 | 43 | Regions are then sorted in descending order of their counts and further processed one by one. Before a region is processed, it is first extended on both ends (by ```read_length```) so that the entire read may fit inside after the alignment. 44 | This approach was shown to be very well suited for detecting secondary alignments, as different regions which might contain similarly good alignments would be processed individually. 45 | In the default alignment mode, this approach can produce pretty sensitive alignments. 46 | 47 | Now consider mapping of RNA-seq reads. In this case, a read can actually be split into several distant regions across a chromosome. Should the same region selection strategy be applied, the bin counts would simply redistribute to different regions. This means that, should the exons have a few good seed hits, we could detect the correct regions and further process them to obtain the spliced alignments. Of course, noise hits will cause trouble (this will be addressed in continuation). For RNA-seq, regions are also sorted by their bin counts and further processed using the Graph Mapping and the LCSk steps. 48 | 49 | 50 | 51 | **2.2. Graph Mapping and LCSk** 52 | For each region, Graph Mapping is performed to obtain anchors (matches between the read and the region). Anchors are filtered using the LCSk method. 53 | These steps are the same as in normal DNA mapping case. 54 | 55 | However, here we add a method of **clustering anchors** after they have been filtered. 
Clusters are obtained using the classic chaining approach which joins anchors that are close enough, and are nearly on the same diagonal. 56 | Clusters then represent larger matching chunks between a read and a reference. 57 | 58 | For normal DNA mapping, one would ideally (in the absence of structural variants) expect to see one large cluster, such as shown below: 59 | 60 | 61 | 62 | However, in case of RNA-seq mapping (or in presence of structural variants) such a graph might look like something closer to the following figure: 63 | 64 | 65 | 66 | What's more, viewed in such a way, a cluster actually can represent an *exon*! 67 | Now, if we collect all clusters (some of them being possible repeats), we can use this information to create our spliced alignments! 68 | 69 | But, hold on. Since some exons can be separated by a large gap on the reference (much larger than the read), we need to consider other regions simultaneously. 70 | 71 | For this reason, all clusters (represented with their start and end coordinates in both the reference and the read) for all analyzed regions are first collected in a single list. 72 | 73 | Then, the **knapsack** algorithm is applied on the list of clusters. 74 | 75 | **2.3 Knapsack algorithm** 76 | [Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem) is a problem of combinatorial optimization. Given a set of items, each with a weight and a value, the problem it tries to solve is "how to fill a knapsack with items so that the total weight is less than or equal to the given limit, and the value is as large as possible". 77 | 78 | In our case: 79 | - Knapsack is a *read* 80 | - Weight limit is the read length 81 | - An item is a cluster 82 | - Item weight is the length of the cluster in the read coordinate space 83 | 84 | Now, solving the knapsack problem would result in a list of clusters which fill the read the most. 
Alignment is then performed only on those clusters, and reported as separate SAM lines - one for each cluster (exon). 85 | 86 | **Acknowledgements** 87 | Ivan Sović (IS) proposed this solution for mapping of RNA-seq reads based on the knapsack algorithm. The initial version of the knapsack algorithm was implemented by Antonio Jurić (AJ). It was embedded in existing GraphMap source code by IS. Krešimir Križanović (KK) was/is working on evaluation and benchmarking of our RNA-seq methods and helped guide AJ. Mile Šikić (MS) supervised the project. 88 | -------------------------------------------------------------------------------- /doc/sam_output.md: -------------------------------------------------------------------------------- 1 | ### Details on the SAM output generated by GraphMap 2 | 3 | Description of special tags in the SAM output: 4 | - **ZE** - The E-value. More accurately - a pessimistic approximation of the E-value obtained by rescoring the generated alignment with scores/penalties for which pre-calculated Gumbel parameters exist. Concretely, scores/penalties are: ```match = 5, mismatch = -4, gap_open = -8, gap_extend = -6```. By default, there is no threshold on the E-value so even weak homologies would be reported, but there is a parameter which provides this functionality (```-z```), e.g.: ```-z 1e0```. 5 | - **ZF** - An internal parameter for quality of alignment calculated using equation (8) in our preprint: (http://biorxiv.org/content/early/2015/06/10/020719). In GraphMap, potential regions for a read are sorted by this parameter, and the primary alignment is the one with the largest ZF value. ZF values for different reads are not mutually comparable. 6 | - **ZQ** - Query (read) length. 7 | - **ZR** - Reference length. 8 | - **H0** - Specified by SAM format as the "number of perfect hits", GraphMap reports here the number of possible mapping positions with the same number of kmer hits. 9 | - **NM** - Edit distance, specified by the SAM format. 
10 | - **AS** - Alignment score, specified by the SAM format. 11 | 12 | There are two hidden gems in GraphMap's output, providing more detailed reporting of the alignment process. Compiling GraphMap with ```make testing``` will generate a binary file on path ```bin/graphmap-not_release```. Running this version using parameter ```-b 3``` will generate a more verbose version of the SAM output file: 13 | - **X3** - A string containing very verbose information about the alignment of a particular read. 14 | - **X4** - Measurement of the CPU time spent on major parts of the algorithm, in a human-readable text format. 15 | -------------------------------------------------------------------------------- /overlap.md: -------------------------------------------------------------------------------- 1 | ## GraphMap Owler - Overlap With Long Erroneous Reads 2 | GraphMap implements two overlap modes: 3 | - ```./graphmap owler``` - fast, uses a trimmed GraphMap pipeline, reports output in MHAP or PAF formats, and 4 | - ```./graphmap align -x overlap``` - full GraphMap pipeline including alignment, output in SAM format. 5 | 6 | Owler mode (Overlap With Long Erroneous Reads) skips the graph-mapping and alignment steps. The full pipeline consists of the following steps: 7 | 1. Construct a gapped spaced index of the reads for only one shape (6-mers, "1111110111111"). 8 | 2. For a read, collect all gapped spaced seed hits. 9 | 3. LCSk++. 10 | 4. Filtering seeds reported by LCSk++. 11 | 5. Output overlaps in MHAP-like or PAF format. For details, see below. 12 | 13 | Currently, no seed hits are discarded, which can make overlapping slow on larger or more repetitive datasets, but very sensitive. 14 | 15 | Note that the overlappers are still experimental, and require thorough testing. 16 | 17 | ### Output formats 18 | **MHAP** format is described here: [http://mhap.readthedocs.org/en/latest/quickstart.html#output](http://mhap.readthedocs.org/en/latest/quickstart.html#output). 
19 | GraphMap's output uses the same columns, but the meaning of columns 3 and 4 (```Jaccard score``` and ```# shared min-mers``` respectively) is different in our context. 20 | Instead of ```Jaccard score``` the fraction of bases covered by seeds is reported. 21 | Instead of ```# shared min-mers``` the number of seeds which survived filtering is reported. 22 | 23 | GraphMap can also output overlaps to **PAF** format. Specification of the format can be found here: [https://github.com/lh3/miniasm/blob/master/PAF.md](https://github.com/lh3/miniasm/blob/master/PAF.md). 24 | 25 | ### Comparison to other methods 26 | We are working on scripts to benchmark various overlapping tools on simulated and real (later) data. 27 | An initial functioning version can be found here: [https://github.com/isovic/overlap-benchmark](https://github.com/isovic/overlap-benchmark). 28 | 29 | ### Examples 30 | ``` 31 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast): 32 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.mhap 33 | 34 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in PAF format: 35 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.paf -L paf 36 | 37 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow): 38 | ./graphmap align -x overlap -r reads.fa -d reads.fa -o overlaps.sam 39 | ``` 40 | -------------------------------------------------------------------------------- /reproducibility/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/reproducibility/README.md -------------------------------------------------------------------------------- /reproducibility/run.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | import os; 4 | import sys; 5 | import subprocess; 6 | 7 | def execute_command(command): 8 | sys.stderr.write('Executing command: %s\n' % (command)); 9 | subprocess.call(command, shell=True); 10 | 11 | def run_simulations(): 12 | sys.stderr.write('Starting the alignment process on simulated data.\n'); 13 | sys.stderr.write('Note that this might take a very long time.\n'); 14 | sys.stderr.write('E.g. BLAST took 110670 CPU secs in our tests on hg19_chr3 Oxford Nanopore 2d simulated dataset.\n'); 15 | execute_command('aligneval/run-alignment.py'); 16 | sys.stderr.write('\n'); 17 | 18 | sys.stderr.write('Alignment script returned.\n'); 19 | sys.stderr.write('\n'); 20 | 21 | sys.stderr.write('Running the evaluation script.\n'); 22 | execute_command('aligneval/run-evaluation.py'); 23 | sys.stderr.write('\n'); 24 | 25 | sys.stderr.write('Copying the results to reproducibility/results-simulated folder.\n'); 26 | execute_command('cp aligneval/results/*.csv results-simulated'); 27 | 28 | sys.stderr.write('Done!\n'); 29 | sys.stderr.write('\n'); 30 | 31 | def main(): 32 | if (os.path.exists('samscripts') == False or os.path.exists('aligneval') == False): 33 | sys.stderr.write('Please run setup.py first, to install all dependencies. Exiting.\n'); 34 | exit(1); 35 | 36 | if (len(sys.argv) < 2): 37 | sys.stderr.write('Run the alignment and evaluation processes from the GraphMap preprint paper.\n'); 38 | sys.stderr.write('Usage:\n'); 39 | sys.stderr.write('\tsim - Runs alignment on all simulation datasets. This might take quite a while to execute.\n'); 40 | exit(0); 41 | 42 | if (sys.argv[1] == 'sim'): 43 | if (len(sys.argv) != 2): 44 | sys.stderr.write('Runs alignment on all simulation datasets. 
This might take quite a while to execute.\n'); 45 | sys.stderr.write('Usage:\n'); 46 | sys.stderr.write('\t%s %s\n' % (sys.argv[0], sys.argv[1])); 47 | exit(0); 48 | 49 | run_simulations(); 50 | exit(0); 51 | 52 | else: 53 | sys.stderr.write('ERROR: Unknown subcommand!\n'); 54 | exit(0); 55 | 56 | if __name__ == "__main__": 57 | main(); 58 | -------------------------------------------------------------------------------- /reproducibility/setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | import os; 4 | import sys; 5 | import subprocess; 6 | 7 | def execute_command(command): 8 | sys.stderr.write('Executing command: %s\n' % (command)); 9 | subprocess.call(command, shell=True); 10 | 11 | def main(): 12 | if (not os.path.exists('samscripts')): 13 | execute_command('git clone https://github.com/isovic/samscripts.git'); 14 | 15 | if (not os.path.exists('aligneval')): 16 | execute_command('git clone https://github.com/isovic/aligneval.git'); 17 | execute_command('cd aligneval; ./setup.py all'); 18 | 19 | folders_to_generate = ['data/reads', 'data/reference', 'results-simulated', 'results-real']; 20 | for folder_to_generate in folders_to_generate: 21 | if (not os.path.exists(folder_to_generate)): 22 | os.makedirs(folder_to_generate); 23 | 24 | if __name__ == "__main__": 25 | main(); 26 | -------------------------------------------------------------------------------- /src/aligner/aligner_base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * aligner_base.h 3 | * 4 | * Created on: Jan 7, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_ALIGNER_ALIGNER_BASE_H_ 9 | #define SRC_ALIGNER_ALIGNER_BASE_H_ 10 | 11 | #include 12 | #include 13 | #include "aligner_containers.h" 14 | #include "pairwise_penalties.h" 15 | 16 | namespace is { 17 | 18 | class AlignerBase { 19 | public: 20 | virtual ~AlignerBase() { } 21 | 22 | // virtual AlignmentReturnValue Align(const 
char* q, int64_t qlen, const char* t, int64_t tlen, AlignmentType type) = 0; // Selects the alignment mode based on a parameter. 23 | 24 | virtual AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type) = 0; // Global alignment mode. 25 | 26 | virtual AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0; // Local alignment mode. 27 | 28 | virtual AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0; // Semiglobal alignment mode. 29 | 30 | virtual AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, // Extend alignment mode. Does not necessarily 31 | int32_t bandwidth, int32_t zdrop) = 0; // produce CIGAR,but generate max alignment coords 32 | 33 | virtual std::shared_ptr getResults() = 0; 34 | 35 | }; 36 | 37 | } /* namespace is */ 38 | 39 | #endif /* SRC_ALIGNER_ALIGNER_BASE_H_ */ 40 | -------------------------------------------------------------------------------- /src/aligner/aligner_containers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * aligner_containers.h 3 | * 4 | * Created on: Jan 7, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_CONTAINERS_H_ 9 | #define SRC_CONTAINERS_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "sam_parser.h" 17 | 18 | namespace is { 19 | 20 | static constexpr int64_t LARGE_NEGATIVE_INT64 = std::numeric_limits::min() + 10000; 21 | 22 | enum class AlignmentReturnValue { // Strongly typed enum, C++11 feature. 23 | OK, // Everything went ok. 24 | Suboptimal, // Alignment stepped out of defined band. Result is not optimal. 25 | InvalidOptions, // In case parameters of values are invalid. 26 | QlenIsZero, 27 | TlenIsZero, 28 | WrongEditDist, 29 | AlignmentNotPerformed, // A default value for an alignment which wasn't performed. 30 | NotImplementedYet // For features in development. 
31 | }; 32 | 33 | enum class AlignmentType { // Strongly typed enum, C++11 feature. 34 | Global, 35 | Local 36 | }; 37 | 38 | class AlignmentPosition { 39 | public: 40 | AlignmentPosition() : qstart(0), qend(0), tstart(0), tend(0) { } 41 | AlignmentPosition(int64_t _qstart, int64_t _qend, int64_t _tstart, int64_t _tend) : 42 | qstart(_qstart), qend(_qend), tstart(_tstart), tend(_tend) { } 43 | AlignmentPosition(const AlignmentPosition& op) : 44 | AlignmentPosition(op.qstart, op.qend, op.tstart, op.tend) { } 45 | AlignmentPosition& operator=(const AlignmentPosition& op) { 46 | qstart = op.qstart; 47 | qend = op.qend; 48 | tstart = op.tstart; 49 | tend = op.tend; 50 | return *this; 51 | } 52 | 53 | int64_t qstart, qend; // Query and target alignment start and end positions. End position 54 | int64_t tstart, tend; // is inclusive (the position of the last base). 55 | }; 56 | 57 | class AlignmentResult { 58 | public: 59 | AlignmentResult() : score(0), edit_dist(0), position(), 60 | max_score(LARGE_NEGATIVE_INT64), 61 | max_q_pos(-1), 62 | max_t_pos(-1), k(-1), rv(AlignmentReturnValue::AlignmentNotPerformed) { 63 | } 64 | 65 | AlignmentResult(const AlignmentResult& op) : 66 | score(op.score), edit_dist(op.edit_dist), 67 | position(op.position), cigar(op.cigar), 68 | max_score(op.max_score), max_q_pos(op.max_q_pos), 69 | max_t_pos(op.max_t_pos), 70 | k(op.k), rv(op.rv) { // Copy constructor. 71 | } 72 | 73 | ~AlignmentResult() { }; 74 | 75 | AlignmentResult& operator=(const AlignmentResult& op) { 76 | score = op.score; 77 | edit_dist = op.edit_dist; 78 | position = op.position; 79 | cigar = op.cigar; 80 | max_score = op.max_score; 81 | max_q_pos = op.max_q_pos; 82 | max_t_pos = op.max_t_pos; 83 | k = op.k; 84 | rv = op.rv; 85 | return *this; 86 | } 87 | 88 | // Alignment results. 89 | int64_t score; 90 | int64_t edit_dist; 91 | is::AlignmentPosition position; // There can be multiple alignments with the same score. 
92 | // Only the first position and the corresponding alignment 93 | std::vector cigar; // are reported 94 | int64_t max_score, max_q_pos, max_t_pos; // Maximum score in the alignment, and the coordinates on query and target. 95 | int64_t k; // Value of band k used in the final alignment. 96 | AlignmentReturnValue rv; // Return value of the aligner. 97 | }; 98 | 99 | // If any global margin is true, then the corresponding will be penalized. 100 | // Concretely, if top/left are true, then the first row/column will be initialized 101 | // to the multiple of the gap extend penalty in global alignment. 102 | // If bottom is false, the maximum of last row will be found instead of taking 103 | // the bottom right corner for global alignment. 104 | // If right is false, the maximum of last column will be found instead of taking 105 | // the bottom right corner for global alignment. 106 | class GlobalMargins { 107 | public: 108 | GlobalMargins() 109 | : top(true), 110 | left(true), 111 | bottom(true), 112 | right(true) { 113 | } 114 | GlobalMargins(bool _top, bool _left, bool _bottom, bool _right) 115 | : top(_top), 116 | left(_left), 117 | bottom(_bottom), 118 | right(_right) { 119 | } 120 | bool top, left, bottom, right; 121 | }; 122 | 123 | class AlignmentOptions { 124 | public: 125 | AlignmentOptions() : k(-1), 126 | do_traceback(true) { 127 | } 128 | 129 | int32_t k; // Band for banded alignment. If < 0, banded alignment is turned off. 130 | bool do_traceback; // If traceback is not needed, then there is no need to alocate a large 131 | // matrix to store directions. 
132 | GlobalMargins gm; 133 | }; 134 | 135 | } /* namespace is */ 136 | 137 | 138 | 139 | #endif /* SRC_CONTAINERS_H_ */ 140 | -------------------------------------------------------------------------------- /src/aligner/aligner_ksw2.cc: -------------------------------------------------------------------------------- 1 | #include "aligner_ksw2.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | // #include "ksw2/kseq.h" 8 | #include "aligner_util.hpp" 9 | #include 10 | 11 | // KSEQ_INIT(gzFile, gzread) 12 | 13 | namespace is { 14 | 15 | uint8_t seq_nt4_table[256] = { 16 | 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 17 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 18 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 19 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 21 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 22 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 23 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 24 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 25 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 26 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 27 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 28 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 29 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 30 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 31 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 32 | }; 33 | 34 | std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) { 35 | return std::shared_ptr(new AlignerKSW2(p, opt)); 36 | } 37 | 38 | static void print_aln(const char *tname, const char *qname, ksw_extz_t *ez) 39 | { 40 | printf("%s\t%s\t%d", tname, qname, ez->score); 41 | printf("\t%d\t%d\t%d", ez->max, ez->max_t, ez->max_q); 42 | if (ez->n_cigar > 0) { 43 | int i; 44 | putchar('\t'); 45 | for (i = 0; i < ez->n_cigar; ++i) 46 | printf("%d%c", ez->cigar[i]>>4, "MID"[ez->cigar[i]&0xf]); 47 | } 48 | 
putchar('\n'); 49 | } 50 | 51 | 52 | AlignerKSW2::AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) : p_(p), opt_(opt), result_(nullptr) { 53 | 54 | } 55 | 56 | AlignerKSW2::~AlignerKSW2() { 57 | 58 | } 59 | 60 | is::AlignmentReturnValue AlignerKSW2::Global(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, bool type) { 61 | void *km = 0; 62 | ksw_extz_t ez; // Alignment result. 63 | int w = -1, flag = 0, zdrop = -1; 64 | 65 | #ifdef HAVE_KALLOC 66 | km = km_init(); 67 | #endif 68 | 69 | memset(&ez, 0, sizeof(ksw_extz_t)); 70 | 71 | auto mat = GenerateSimpleMatchMatrix((int8_t) p_.match, (int8_t) p_.mismatch, 5); 72 | // In GraphMap definition, penalties are negative. KSW2 expects positive values. 73 | int8_t q = -p_.w[0].v; // Gap open. The intercept component of the affine function. 74 | int8_t e = -p_.w[0].u; // Gap extend. The slope of the affine function. 75 | int8_t q2 = -p_.w[1].v; 76 | int8_t e2 = -p_.w[1].u; 77 | 78 | KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, w, zdrop, flag, &ez, type); 79 | 80 | // print_aln("Query", "Target", &ez); 81 | 82 | result_ = std::shared_ptr(new is::AlignmentResult); 83 | result_->score = ez.score; 84 | result_->position = is::AlignmentPosition(0, qlen, 0, tlen); 85 | result_->k = -1; 86 | result_->rv = is::AlignmentReturnValue::OK; 87 | 88 | result_->cigar.clear(); 89 | std::vector basic_cigar; 90 | for (size_t i=0; i>4)); 92 | } 93 | result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar); 94 | 95 | result_->edit_dist = EditDistFromExtCIGAR(result_->cigar); 96 | 97 | // printf ("Converted CIGAR:\n"); 98 | // for (size_t i=0; icigar.size(); i++) { 99 | // printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op); 100 | // } 101 | // printf ("\n"); 102 | // printf ("Edit distance: %ld\n", result_->edit_dist); 103 | 104 | kfree(km, ez.cigar); 105 | #ifdef HAVE_KALLOC 106 | km_destroy(km); 107 | 
#endif 108 | 109 | return is::AlignmentReturnValue::OK; 110 | } 111 | 112 | is::AlignmentReturnValue AlignerKSW2::Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop) { 113 | result_ = std::shared_ptr(new is::AlignmentResult); 114 | 115 | if (qseq == NULL || tseq == NULL || qlen <= 0 || tlen <= 0) { 116 | return is::AlignmentReturnValue::InvalidOptions; 117 | } 118 | 119 | void *km = 0; 120 | ksw_extz_t ez; // Alignment result. 121 | int flag = KSW_EZ_SCORE_ONLY | KSW_EZ_EXTZ_ONLY; 122 | 123 | #ifdef HAVE_KALLOC 124 | km = km_init(); 125 | #endif 126 | 127 | memset(&ez, 0, sizeof(ksw_extz_t)); 128 | 129 | auto mat = GenerateSimpleMatchMatrix((int8_t) p_.match, (int8_t) p_.mismatch, 5); 130 | // In GraphMap definition, penalties are negative. KSW2 expects positive values for affine pieces. 131 | int8_t q = -p_.w[0].v; // Gap open. The intercept component of the affine function. 132 | int8_t e = -p_.w[0].u; // Gap extend. The slope of the affine function. 
133 | int8_t q2 = -p_.w[1].v; 134 | int8_t e2 = -p_.w[1].u; 135 | 136 | KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, bandwidth, zdrop, flag, &ez, true); 137 | 138 | // print_aln("Query", "Target", &ez); 139 | 140 | result_->score = ez.score; 141 | result_->position = is::AlignmentPosition(0, qlen, 0, tlen); 142 | result_->k = -1; 143 | result_->rv = is::AlignmentReturnValue::OK; 144 | result_->max_score = ez.max; 145 | result_->max_q_pos = ez.max_q; 146 | result_->max_t_pos = ez.max_t; 147 | 148 | result_->cigar.clear(); 149 | std::vector basic_cigar; 150 | for (size_t i=0; i>4)); 152 | } 153 | result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar); 154 | 155 | result_->edit_dist = EditDistFromExtCIGAR(result_->cigar); 156 | 157 | // printf ("Converted CIGAR:\n"); 158 | // for (size_t i=0; icigar.size(); i++) { 159 | // printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op); 160 | // } 161 | // printf ("\n"); 162 | // printf ("Edit distance: %ld\n", result_->edit_dist); 163 | 164 | kfree(km, ez.cigar); 165 | #ifdef HAVE_KALLOC 166 | km_destroy(km); 167 | #endif 168 | 169 | return is::AlignmentReturnValue::OK; 170 | } 171 | 172 | is::AlignmentReturnValue AlignerKSW2::Local(const char* q, int64_t qlen, const char* t, int64_t tlen) { 173 | return is::AlignmentReturnValue::NotImplementedYet; 174 | } 175 | 176 | is::AlignmentReturnValue AlignerKSW2::Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) { 177 | return is::AlignmentReturnValue::NotImplementedYet; 178 | } 179 | 180 | std::shared_ptr AlignerKSW2::getResults() { 181 | return result_; 182 | } 183 | 184 | void AlignerKSW2::KSW2GlobalAlnWrapper_(void *km, 185 | const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen, 186 | int8_t m, const int8_t *mat, 187 | int8_t q, int8_t e, int8_t q2, int8_t e2, 188 | int w, int zdrop, int flag, ksw_extz_t *ez, bool type) { 189 | int i; 190 | ez->max_q = 
ez->max_t = ez->mqe_t = ez->mte_q = -1; 191 | ez->max = 0, ez->mqe = ez->mte = KSW_NEG_INF; 192 | ez->n_cigar = 0; 193 | 194 | auto qseq = ConvertSeqAlphabet(qseq_, qlen, &seq_nt4_table[0]); 195 | auto tseq = ConvertSeqAlphabet(tseq_, tlen, &seq_nt4_table[0]); 196 | /* type selects the kernel: true calls ksw_extd2_sse, false calls ksw_exts2_sse. */ 197 | if (type) { 198 | ksw_extd2_sse(km, qlen, (const uint8_t*) &qseq[0], 199 | tlen, (const uint8_t*) &tseq[0], 200 | m, mat, q, e, q2, e2, w, zdrop, flag, ez); 201 | } else { /* NOTE(review): this branch overwrites the caller-supplied q, e, q2, zdrop and flag with fixed values before calling ksw_exts2_sse; e2 and w are not passed to that call. */ 202 | int noncan = 9; 203 | q = 4; 204 | e = 2; 205 | q2 = 32; 206 | zdrop = 200; 207 | flag = 1600; 208 | 209 | ksw_exts2_sse(km, qlen, (const uint8_t*) &qseq[0], 210 | tlen, (const uint8_t*) &tseq[0], 211 | m, mat, q, e, q2, noncan, zdrop, flag, ez); 212 | } 213 | 214 | // const char *algo = "extd2_sse"; 215 | // if (strcmp(algo, "extz2_sse") == 0) ksw_extz2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, w, zdrop, flag, ez); 216 | // else if (strcmp(algo, "extd2_sse") == 0) ksw_extd2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, q2, e2, w, zdrop, flag, ez); 217 | // // else if (strcmp(algo, "extf2_sse") == 0) ksw_extf2_sse(km, qlen, (uint8_t*)qseq, tlen, (uint8_t*)tseq, mat[0], mat[1], e, w, zdrop, ez); 218 | // else { 219 | // fprintf(stderr, "ERROR: can't find algorithm '%s'\n", algo); 220 | // exit(1); 221 | // } 222 | } 223 | 224 | } 225 | -------------------------------------------------------------------------------- /src/aligner/aligner_ksw2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * aligner_ksw2.h 3 | * 4 | * Created on: Jan 7, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_ALIGNER_ALIGNER_KSW2_H_ 9 | #define SRC_ALIGNER_ALIGNER_KSW2_H_ 10 | 11 | #include 12 | #include 13 | #include "aligner_base.h" 14 | #include "aligner_containers.h" 15 | #include "pairwise_penalties.h" 16 | #include "aligner_util.hpp" 17 | #include "ksw2/ksw2.h" 18 | 19 | namespace is { 20 | 21 | class 
AlignerKSW2; 22 | 23 | std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt); 24 | /* Aligner derived from AlignerBase. The constructor is protected and createAlignerKSW2 is a friend, so instances are meant to be obtained through that factory function. */ 25 | class AlignerKSW2 : public AlignerBase { 26 | public: 27 | friend std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt); 28 | 29 | ~AlignerKSW2(); 30 | 31 | AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type); // Global alignment mode. 32 | 33 | AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen); // Local alignment mode (the definition in aligner_ksw2.cc returns NotImplementedYet). 34 | 35 | AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen); // Semiglobal alignment mode (the definition in aligner_ksw2.cc returns NotImplementedYet). 36 | 37 | AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop); // Extension mode with caller-supplied bandwidth and zdrop. 38 | 39 | std::shared_ptr getResults(); 40 | 41 | protected: 42 | AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt); // We don't want users attempting to instantiate manually, even though the class is virtual. 43 | 44 | private: 45 | AlignerKSW2(const AlignerKSW2&) = delete; // No copying. 46 | AlignerKSW2& operator=(const AlignerKSW2&) = delete; // No copying. 47 | AlignerKSW2(AlignerKSW2&&) = delete; // No move constructor. 48 | AlignerKSW2& operator=(const AlignerKSW2&&) = delete; // No move assignment (note: declared on a const rvalue reference). 
49 | 50 | void KSW2GlobalAlnWrapper_(void *km, 51 | const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen, 52 | int8_t m, const int8_t *mat, 53 | int8_t q, int8_t e, int8_t q2, int8_t e2, 54 | int w, int zdrop, int flag, ksw_extz_t *ez, bool type); 55 | /* p_ and opt_ are stored as references to the constructor arguments, so those objects must outlive the aligner. */ 56 | const is::PiecewisePenalties& p_; 57 | const is::AlignmentOptions& opt_; 58 | std::shared_ptr result_; 59 | }; 60 | 61 | } /* namespace is */ 62 | 63 | #endif /* SRC_ALIGNER_ALIGNER_KSW2_H_ */ 64 | -------------------------------------------------------------------------------- /src/aligner/aligner_util.cc: -------------------------------------------------------------------------------- 1 | #include "aligner_util.hpp" 2 | #include "assert.h" 3 | 4 | #include 5 | #include 6 | 7 | namespace is { 8 | 9 | std::vector ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table) { 10 | std::vector ret(seqlen + 33); // 32 for gaba 11 | for (size_t i=0; i ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen, 18 | const char* tseq, int64_t tlen, 19 | const std::vector& basic_cigar) { 20 | std::vector ret; 21 | 22 | int64_t qpos = 0, tpos = 0; 23 | for (size_t i=0; i 0) { 53 | ret.push_back(is::CigarOp(prev_m, curr_count)); 54 | } 55 | } 56 | } 57 | 58 | return ret; 59 | } 60 | 61 | int64_t EditDistFromExtCIGAR(const std::vector& extended_cigar) { 62 | int64_t edit_dist = 0; 63 | for (size_t i=0; i ExtractCigarBetweenQueryCoords(const std::vector& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q) { 74 | std::vector ret; 75 | 76 | int64_t qpos = 0; 77 | 78 | int lengthOfRef = 0; 79 | int lengthOfRead = 0; 80 | 81 | for (auto& c: cigar) { 82 | 83 | int64_t qpos_next = (c.op == 'M' || c.op == '=' || c.op == 'X' || c.op == 'I' || c.op == 'S') ? 
(qpos + c.count) : qpos; 84 | 85 | if (qpos > qend) { break; } 86 | 87 | if (qpos_next < qstart) { 88 | qpos = qpos_next; 89 | continue; 90 | } 91 | 92 | int64_t b = 0, e = c.count; 93 | 94 | if (qstart >= qpos && qstart < qpos_next) { b = qstart - qpos; } 95 | if (qend >= qpos && qend < qpos_next) { e = qend - qpos; } 96 | 97 | if ((e - b) > 0) { 98 | ret.emplace_back(is::CigarOp(c.op, (e - b))); 99 | /* NOTE(review): every op other than 'I' (including 'S') is added to the reference length below; confirm that is intended. */ 100 | if (c.op != 'I') { 101 | lengthOfRef += (e - b); 102 | } 103 | if(c.op != 'D' && c.op != 'N') { 104 | lengthOfRead += (e - b); 105 | } 106 | } 107 | 108 | qpos = qpos_next; 109 | } 110 | /* Report the accumulated reference and query lengths through the output pointers. */ 111 | *cigar_length = lengthOfRef; 112 | *cigar_length_q = lengthOfRead; 113 | 114 | return ret; 115 | } 116 | 117 | std::string CigarToString(const std::vector& cigar) { 118 | std::stringstream ss; 119 | for (size_t i=0; i 5 | #include 6 | #include 7 | #include "sam_parser.h" 8 | 9 | namespace is { 10 | /* Builds a row-major alphabet_size x alphabet_size score matrix: match on the diagonal, mismatch elsewhere, and 0 in the last row and last column except their shared corner cell, which keeps the mismatch value. NOTE(review): alphabet_size == 0 makes the unsigned loop bound (alphabet_size - 1) wrap around. */ 11 | template 12 | std::vector GenerateSimpleMatchMatrix(T match, T mismatch, size_t alphabet_size) { 13 | std::vector matrix(alphabet_size * alphabet_size, mismatch); // Set the mismatch score. 14 | // Goes to "-1" to allow for 'N' bases which should not match to themselves. 15 | for (size_t i=0; i<(alphabet_size - 1); i++) { 16 | matrix[i*alphabet_size + i] = match; // Set the match score. 17 | matrix[i*alphabet_size + alphabet_size - 1] = 0; // Reset the last column to 0. 18 | matrix[(alphabet_size - 1) * alphabet_size + i] = 0; // Reset the last row to 0. 
19 | } 20 | return matrix; 21 | } 22 | 23 | std::vector ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table); 24 | 25 | std::vector ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen, 26 | const char* tseq, int64_t tlen, 27 | const std::vector& basic_cigar); 28 | 29 | int64_t EditDistFromExtCIGAR(const std::vector& extended_cigar); 30 | 31 | std::vector ExtractCigarBetweenQueryCoords(const std::vector& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q); 32 | 33 | std::string CigarToString(const std::vector& cigar); 34 | 35 | } 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/aligner/anchor_aligner.h: -------------------------------------------------------------------------------- 1 | /* 2 | * anchor_aligner.h 3 | * 4 | * Created on: Aug 23, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_ANCHOR_ALIGNER_H_ 9 | #define SRC_ANCHOR_ALIGNER_H_ 10 | 11 | #include 12 | #include "aligner_base.h" 13 | #include "containers/results.h" 14 | 15 | #include 16 | #include 17 | 18 | namespace is { 19 | 20 | class AnchorAligner; 21 | 22 | std::shared_ptr createAnchorAligner(std::shared_ptr aligner); 23 | 24 | class AlignmentAnchor { 25 | public: 26 | AlignmentAnchor() : qstart(0), qend(0), rstart(0), rend(0) { } 27 | AlignmentAnchor(int64_t _qstart, int64_t _qend, 28 | int64_t _rstart, int64_t _rend) : 29 | qstart(_qstart), qend(_qend), rstart(_rstart), rend(_rend) { } 30 | 31 | int64_t qstart, qend; 32 | int64_t rstart, rend; 33 | }; 34 | 35 | class AnchorAligner { 36 | public: 37 | friend std::shared_ptr createAnchorAligner(std::shared_ptr aligner); 38 | 39 | ~AnchorAligner(); 40 | 41 | std::shared_ptr CreateAlignmentResult(int64_t qstart, int64_t qend, int64_t rstart, int64_t rend, std::vector rez); 42 | 43 | double AlignEdges(const char *query, const char *ref, int leftRef, int rightRef, int64_t start_position_read, int64_t start_position_ref, int 
number_of_bases, std::stack cigar_stack, std::deque cigar_queue); 44 | void AdjustEnds(int left_offset_ref, int right_offset_ref, const char *query, const char *ref, int64_t *start_position_ref, int64_t *start_position_read, int number_of_bases, std::stack *cigar_stack, std::deque *cigar_queue, bool type); 45 | 46 | /* Sorts anchors and then performs global alignment between the minimum and maximum anchor coordinates. 47 | */ 48 | std::shared_ptr GlobalEndToEnd(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors); 49 | 50 | /* Sorts the anchors, and aligns every neighboring pair of anchors. It does not extend beyond 51 | the ends of the first and last anchor. 52 | */ 53 | std::shared_ptr GlobalAnchored(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors, bool type); 54 | std::shared_ptr GlobalAnchoredWithClipping(const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors); 55 | 56 | /* Sorts the anchors, and aligns every neighboring pair of anchors. This extends alignments beyond 57 | the ends of the first and last anchor in an attempt to produce end-to-end alignment. 
58 | */ 59 | std::shared_ptr GlobalAnchoredWithExtend(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen, 60 | const std::vector& anchors, int32_t bandwidth, int32_t zdrop, bool type); 61 | 62 | private: 63 | AnchorAligner(const AnchorAligner&) = delete; 64 | AnchorAligner& operator=(const AnchorAligner&) = delete; 65 | 66 | AnchorAligner(std::shared_ptr aligner); 67 | 68 | const std::shared_ptr aligner_; 69 | }; 70 | 71 | } 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/aligner/pairwise_penalties.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_ALIGNER_PAIRWISE_PENALTIES_H_ 2 | #define SRC_ALIGNER_PAIRWISE_PENALTIES_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace is { 8 | 9 | /* Regular alignment penalties for a single piece Gotoh alignment. 10 | */ 11 | class Penalties { 12 | public: 13 | Penalties() : match(5), mismatch(-4), gapopen(-8), gapext(-6) { } 14 | Penalties(int32_t _match, int32_t _mismatch, int32_t _gapopen, int32_t _gapext) : 15 | match(_match), mismatch(_mismatch), gapopen(_gapopen), gapext(_gapext) { } 16 | int32_t match, mismatch, gapopen, gapext; 17 | }; 18 | 19 | /* A helper class for a linear function. Used for piecewise Gotoh alignment. 20 | */ 21 | class AffinePiece { 22 | public: 23 | AffinePiece() : u(-6.0), v(-8.0) { } 24 | AffinePiece(float _u, float _v) : u(_u), v(_v) { } 25 | 26 | inline float calc(int32_t k) const { 27 | return (u * (k) + v); 28 | } 29 | 30 | float u, v; // Line equation parameters: w(k) = u * k + v. 31 | }; 32 | 33 | /* Penalties for a multiple affine function alignment. 
34 | */ /* NOTE(review): the default constructor below builds w with a single AffinePiece, so any code indexing w[1] needs an object constructed with at least two pieces. */ 35 | class PiecewisePenalties { 36 | public: 37 | PiecewisePenalties() : match(5), mismatch(-4), w(std::vector{AffinePiece(-6.0, -8.0)}) { } 38 | /* match and mismatch are taken as int32_t here but stored in float members. */ 39 | PiecewisePenalties(int32_t _match, int32_t _mismatch, const std::vector& _w) : 40 | match(_match), mismatch(_mismatch), w(_w) { } 41 | /* Returns a one-line, newline-terminated text dump of match, mismatch and each piece's u and v. */ 42 | std::string Verbose() { 43 | std::stringstream ss; 44 | ss << "match = " << match << ", mismatch = " << mismatch << ""; 45 | for (int32_t l = 0; l < w.size(); l++) { 46 | ss << ", w[" << l << "] = {u = " << w[l].u << ", v = " << w[l].v << "}"; 47 | } 48 | ss << "\n"; 49 | return ss.str(); 50 | } 51 | 52 | float match, mismatch; 53 | std::vector w; 54 | }; 55 | 56 | } 57 | 58 | #endif -------------------------------------------------------------------------------- /src/aligner/sam_parser.cc: -------------------------------------------------------------------------------- 1 | #include "sam_parser.h" 2 | 3 | #include 4 | 5 | namespace is { 6 | 7 | int SplitCigar(const std::string &cigar_str, std::vector& ret) { 8 | ret.clear(); 9 | CigarOp op; 10 | // int32_t digit_count = 0; 11 | int64_t pos_ref = 0, pos_query = 0; 12 | const char *first_digit = NULL; 13 | for (size_t i=0; i= 0; i--) { 93 | auto& c = cigar[i]; 94 | if (c.op == 'H') { 95 | continue; 96 | } else if (c.op == 'S') { 97 | q_end -= c.count; 98 | } else { 99 | break; 100 | } 101 | } 102 | 103 | // Find reference alignment start. (Convert from 1-based to 0-based). 104 | r_start = pos - 1; 105 | 106 | // Find reference alignment end. 107 | r_end = r_start + CalcReferenceLengthFromCigar(cigar); 108 | 109 | // Do not perform reverse complementing here, we do not know 110 | // the length of the reference. 
111 | 112 | return 0; 113 | } 114 | /* Splits str on delimiter into words (cleared first); empty tokens are skipped. */ 115 | void SamLine::Tokenize_(const std::string& str, const char delimiter, std::vector& words) { 116 | words.clear(); 117 | std::stringstream ss(str); 118 | std::string line; 119 | while(std::getline(ss, line, delimiter)) { 120 | if (line.size() == 0) { continue; } 121 | words.push_back(line); 122 | } 123 | } 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/aligner/sam_parser.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_SAM_PARSER_H_ 2 | #define SRC_SAM_PARSER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace is { 10 | /* Character-class tests for CIGAR operation codes. The macro argument is not parenthesized and is expanded once per comparison, so pass a simple side-effect-free expression. */ 11 | #define is_cigar_op(x) (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'D' || x == 'S' || x == 'H') 12 | #define is_cigar_match(x) (x == 'M' || x == '=' || x == 'X') 13 | #define is_cigar_ins(x) (x == 'I') 14 | #define is_cigar_del(x) (x == 'D') 15 | #define is_cigar_soft(x) (x == 'S') 16 | #define is_cigar_hard(x) (x == 'H') 17 | #define is_cigar_ref(x) (x == 'M' || x == '=' || x == 'X' || x == 'D') 18 | #define is_cigar_read(x) (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'S') 19 | 20 | // class CigarOp { 21 | // public: 22 | // char op = '-'; 23 | // int32_t count = 0; 24 | // int64_t pos_ref = -1; // Relative to the pos_ field of the corresponding SequenceAlignment object. pos_ref starts from zero, even though the actual alignment starts at an arbitrary position on the reference. 25 | // int64_t pos_query = - 1; 26 | 27 | // CigarOp() { } 28 | // CigarOp(char _op, int32_t _count, int64_t _pos_ref, int64_t _pos_query) : op(_op), count(_count), pos_ref(_pos_ref), pos_query(_pos_query) { } 29 | 30 | // }; 31 | 32 | /** @brief A container for a single CIGAR operation. 
33 | * 34 | */ /* NOTE(review): the two-argument constructor takes an int32_t count while the member is int64_t, and the copy constructor delegates to it, so counts outside the int32_t range would not survive a copy. */ 35 | class CigarOp { 36 | public: 37 | CigarOp() : op(0), count(0) { } 38 | CigarOp(char _op, int32_t _count) : op(_op), count(_count) { } 39 | CigarOp(const CigarOp& t) : CigarOp(t.op, t.count) { } 40 | ~CigarOp() { } 41 | CigarOp& operator=(const CigarOp t) { 42 | op = t.op; 43 | count = t.count; 44 | return *this; 45 | } 46 | std::string get() { std::stringstream ss; ss << count << op; return ss.str(); } 47 | /* get() above renders the operation as the count followed by the op character. */ 48 | char op; 49 | int64_t count; 50 | }; 51 | 52 | 53 | int SplitCigar(const std::string &cigar_str, std::vector& ret); 54 | int64_t CalcReferenceLengthFromCigar(const std::vector& split_cigar); 55 | 56 | class SamLine { 57 | public: 58 | SamLine(); 59 | SamLine(const std::string& line); 60 | // SamLine(); 61 | // SequenceAlignment::SequenceAlignment(uint32_t _flag, std::string &rname, int64_t pos, int32_t mapq, std::string &cigar_string, std::string &rnext, int64_t pnext, int64_t tlen, std::vector &optional) 62 | // : flag_(flag), rname_(rname), pos_(pos), mapq_(mapq), rnext_(rnext), pnext_(pnext), tlen_(tlen), optional_(optional) { 63 | // SplitCigar(cigar_string, cigar_); 64 | // ProcessOptional(); 65 | // } 66 | ~SamLine(); 67 | 68 | int ParseLine(const std::string& line); 69 | std::string YieldString(); 70 | bool IsMapped(); 71 | bool IsReverse(); 72 | int FindAlignmentPosition(int64_t& q_start, int64_t& q_end, 73 | int64_t& r_start, int64_t& r_end); 74 | 75 | std::string qname; // Field #1. 76 | uint32_t flag; // Field #2. 77 | std::string rname; // Field #3. 78 | int64_t pos; // Field #4. 79 | int32_t mapq; // Field #5. 80 | // std::string cigar; // Field #6. 81 | std::vector cigar; 82 | std::string rnext; // Field #7. 83 | int64_t pnext; // Field #8. 84 | int64_t tlen; // Field #9. 85 | std::string seq; // Field #10. 86 | std::string qual; // Field #11. 87 | 88 | // Optional fields in the SAM format: 89 | int64_t as; // Alignment score. 90 | double evalue; // E-value. 
There is no dedicated field in the SAM format, but GraphMap uses ZE to specify the E-value. 91 | std::vector optional; // Raw values (strings) of optional fields, not explicitly converted to expected values; 92 | 93 | 94 | 95 | private: 96 | void Tokenize_(const std::string& str, const char delimiter, std::vector& words); 97 | 98 | }; 99 | 100 | } 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /src/alignment/alignment.h: -------------------------------------------------------------------------------- 1 | /* 2 | * alignment.h 3 | * 4 | * Created on: Jan 17, 2016 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_ALIGNMENT_ALIGNMENT_H_ 9 | #define SRC_ALIGNMENT_ALIGNMENT_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "sequences/single_sequence.h" 21 | #include "utility/utility_general.h" 22 | #include "program_parameters.h" 23 | #include "utility/utility_conversion-inl.h" 24 | #include "containers/path_graph_entry.h" 25 | #include "libs/edlib.h" 26 | #include "alignment/cigargen.h" 27 | #include "alignment_wrappers.h" 28 | #include "log_system/log_system.h" 29 | #include "containers/region.h" 30 | #include "seqan/basic.h" 31 | #include "seqan/align.h" 32 | #include "seqan/sequence.h" 33 | #include "seqan/stream.h" 34 | #include "utility/evalue.h" 35 | #include "graphmap/transcriptome.h" 36 | 37 | 38 | 39 | 40 | int AlignRegion(const SingleSequence *read, std::shared_ptr index, std::shared_ptr transcriptome, const ProgramParameters *parameters, const EValueParams *evalue_params, bool extend_to_end, PathGraphEntry *region_results); 41 | int SemiglobalAlignment(AlignmentFunctionType AlignmentFunction, 42 | const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, 43 | const EValueParams *evalue_params, PathGraphEntry *region_results); 44 | int AnchoredAlignmentNew(AlignmentFunctionType 
AlignmentFunctionNW, AlignmentFunctionType AlignmentFunctionSHW, 45 | const SingleSequence *read, std::shared_ptr index, std::shared_ptr transcriptome, const ProgramParameters *parameters, 46 | const EValueParams *evalue_params, PathGraphEntry *region_results, bool align_end_to_end, bool spliced_alignment); 47 | 48 | void VerboseAlignment(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const AlignmentResults *aln); 49 | 50 | /// Determines the start and end locations for semiglobal alignment, keeping in mind the boundaries of the reference being aligned to. Works with circular alignment as well. 51 | //int GetAlignmentWindowFromRegion(const SingleSequence *read, const Index *index, const ProgramParameters *parameters, const PathGraphEntry *region_results, 52 | // int64_t *win_start, int64_t *win_end, int64_t *win_len); 53 | int GetL1PosInRegion(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const PathGraphEntry *region_results, 54 | int64_t *l1_start, int64_t *l1_end); 55 | 56 | // Checks if the region is linear or circular. If it's linear, only a pointer to the beginning of the region (in the index) will be returned. Otherwise, a data array will be created containing the 57 | // concatenated region. 58 | // Returns 0 if the region was linear, otherwise 1. Value of 1 means that manual cleanup of ret_data is required, using free(). 
59 | int GetAlignmentWindowData(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const PathGraphEntry *region_results, 60 | int8_t** data, int64_t* data_length, int8_t **pos_of_win_start, int8_t **pos_of_win_end, int64_t* offset_from_ref_start, int64_t* pos_of_ref_end, bool *is_cleanup_required); 61 | 62 | int FindCircularEnd(const std::vector &alignment, int64_t pos_of_ref_end, 63 | int64_t *ret_end_on_aln, int64_t *ret_end_on_read, int64_t *ret_end_on_ref, 64 | int64_t *ret_start_on_aln, int64_t *ret_start_on_read, int64_t *ret_start_on_ref); 65 | 66 | int SplitCircularAlignment(const AlignmentResults *aln, int64_t pos_of_ref_end, int64_t ref_start, int64_t ref_len, AlignmentResults *aln_l, AlignmentResults *aln_r); 67 | 68 | 69 | int CheckAlignmentSane(std::vector &alignment, const SingleSequence* read=NULL,std::shared_ptr index=nullptr, int64_t reference_hit_id=-1, int64_t reference_hit_pos=-1); 70 | 71 | #endif /* SRC_ALIGNMENT_ALIGNMENT_H_ */ 72 | -------------------------------------------------------------------------------- /src/alignment/alignment_wrappers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * local_realignment_generic.h 3 | * 4 | * Created on: Jan 16, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef LOCAL_REALIGNMENT_GENERIC_H_ 9 | #define LOCAL_REALIGNMENT_GENERIC_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "utility/utility_general.h" 20 | #include "program_parameters.h" 21 | #include "libs/edlib.h" 22 | #include "alignment/cigargen.h" 23 | #include "log_system/log_system.h" 24 | #include "containers/region.h" 25 | #include "seqan/basic.h" 26 | #include "seqan/align.h" 27 | #include "seqan/sequence.h" 28 | #include "seqan/stream.h" 29 | 30 | #define ALIGNMENT_TYPE_SHW 0 /// Gaps at the end are not penalized. 
31 | #define ALIGNMENT_TYPE_HW 1 /// Gaps at the beginning and the end are not penalized. 32 | #define ALIGNMENT_TYPE_NW 2 /// Global alignment (gaps at the beginning and the end are penalized). 33 | 34 | #ifndef RELEASE_VERSION 35 | #include "libs/opal.h" 36 | #endif 37 | 38 | typedef int (*AlignmentFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t*, int64_t*, int64_t*, std::vector &); 39 | typedef int (*EditDistanceFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t*, int64_t*, int); 40 | 41 | int LocalizeAlignmentPosWithMyers(const int8_t *read_data, int64_t read_length, 42 | const int8_t *reference_data, int64_t reference_length, 43 | int64_t rough_reference_start, int64_t rough_reference_end, 44 | int64_t *ret_alignment_start, int64_t *ret_alignment_end, 45 | int64_t *ret_start_ambiguity, int64_t *ret_end_ambiguity, 46 | int64_t *ret_edit_distance, int64_t *ret_band_width, 47 | bool verbose_debug_output=false); 48 | 49 | int MyersSemiglobalWrapper(const int8_t *read_data, int64_t read_length, 50 | const int8_t *reference_data, int64_t reference_length, 51 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 52 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 53 | int64_t *ret_edit_distance, std::vector &ret_alignment); 54 | 55 | int MyersNWWrapper(const int8_t *read_data, int64_t read_length, 56 | const int8_t *reference_data, int64_t reference_length, 57 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 58 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 59 | int64_t *ret_edit_distance, std::vector &ret_alignment); 60 | 61 | int MyersSHWWrapper(const int8_t *read_data, int64_t read_length, 62 | const int8_t *reference_data, int64_t 
reference_length, 63 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 64 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 65 | int64_t *ret_edit_distance, std::vector &ret_alignment); 66 | 67 | #ifndef RELEASE_VERSION 68 | int OpalNWWrapper(const int8_t *read_data, int64_t read_length, 69 | const int8_t *reference_data, int64_t reference_length, 70 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 71 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 72 | int64_t *ret_edit_distance, std::vector &ret_alignment); 73 | 74 | int OpalSHWWrapper(const int8_t *read_data, int64_t read_length, 75 | const int8_t *reference_data, int64_t reference_length, 76 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 77 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 78 | int64_t *ret_edit_distance, std::vector &ret_alignment); 79 | #endif 80 | 81 | int SeqAnSemiglobalWrapper(const int8_t *read_data, int64_t read_length, 82 | const int8_t *reference_data, int64_t reference_length, 83 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 84 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 85 | int64_t *ret_edit_distance, std::vector &ret_alignment); 86 | int SeqAnSemiglobalWrapperWithMyersLocalization(const int8_t *read_data, int64_t read_length, 87 | const int8_t *reference_data, int64_t reference_length, 88 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 89 | int64_t* ret_alignment_position_start, int64_t 
*ret_alignment_position_end, 90 | int64_t *ret_edit_distance, std::vector &ret_alignment); 91 | 92 | int MyersEditDistanceWrapper(const int8_t *read_data, int64_t read_length, 93 | const int8_t *reference_data, int64_t reference_length, 94 | int64_t *ret_alignment_position_end, 95 | int64_t *ret_edit_distance, EdlibAlignMode edlib_mode_code=EDLIB_MODE_HW); 96 | 97 | int SeqAnNWWrapper(const int8_t *read_data, int64_t read_length, 98 | const int8_t *reference_data, int64_t reference_length, 99 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 100 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 101 | int64_t *ret_edit_distance, std::vector &ret_alignment); 102 | int SeqAnSHWWrapper(const int8_t *read_data, int64_t read_length, 103 | const int8_t *reference_data, int64_t reference_length, 104 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty, 105 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end, 106 | int64_t *ret_edit_distance, std::vector &ret_alignment); 107 | 108 | //int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align, bool is_global, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector &ret_alignment); 109 | int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align, int alignment_type, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector &ret_alignment); 110 | 111 | int CheckAlignmentSaneSimple(std::vector &alignment); 112 | 113 | 114 | 115 | #endif /* LOCAL_REALIGNMENT_GENERIC_H_ */ 116 | -------------------------------------------------------------------------------- /src/alignment/cigargen.h: -------------------------------------------------------------------------------- 1 | /* 2 | * cigargen.h 3 | * 4 | * Created on: Aug 28, 2014 5 | * 
Author: ivan 6 | */ 7 | 8 | #ifndef CIGARGEN_H_ 9 | #define CIGARGEN_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "libs/edlib.h" 20 | #include "utility/utility_general.h" 21 | #include "sequences/sequence_alignment.h" 22 | /* Alignment operation codes. EDLIB_M and EDLIB_EQUAL share the value 0. */ 23 | #define EDLIB_M 0 24 | #define EDLIB_EQUAL 0 25 | #define EDLIB_X 3 26 | #define EDLIB_I 1 27 | #define EDLIB_D 2 28 | #define EDLIB_S 4 29 | #define EDLIB_H 5 /// Not used in GraphMap currently (26.01.2016.) 30 | #define EDLIB_NOP 6 31 | #define EDLIB_N 7 // Large gaps (e.g. splicing sites). 32 | #define EDLIB_P 8 // Padding. 33 | /* Maps an operation code to a basic CIGAR character ('M' covers the match, equal and mismatch codes); any code not listed yields 0. */ 34 | inline char EdlibOpToChar(int8_t op) { 35 | return (op == EDLIB_M || op == EDLIB_EQUAL || op == EDLIB_X) ? 'M' : 36 | (op == EDLIB_I) ? 'I' : 37 | (op == EDLIB_D) ? 'D' : 38 | (op == EDLIB_S) ? 'S' : 39 | (op == EDLIB_H) ? 'H' : 0; 40 | } 41 | /* Extended variant that emits '=' and 'X'. Because EDLIB_EQUAL and EDLIB_M are both 0, the '=' test always wins and the 'M' branch is never reached. Any code not listed yields 0. */ 42 | inline char EdlibOpToCharExtended(int8_t op) { 43 | return (op == EDLIB_EQUAL) ? '=' : 44 | (op == EDLIB_X) ? 'X' : 45 | (op == EDLIB_M) ? 'M' : 46 | (op == EDLIB_I) ? 'I' : 47 | (op == EDLIB_D) ? 'D' : 48 | (op == EDLIB_S) ? 'S' : 49 | (op == EDLIB_H) ? 'H' : 0; 50 | } 51 | 52 | std::string AlignmentToCigar(unsigned char *alignment, int alignmentLength, bool extended_format); 53 | int AlignmentToBasicCigar(unsigned char* alignment, int alignmentLength, char** cigar_); 54 | int AlignmentToExtendedCigar(unsigned char* alignment, int alignmentLength, char** cigar_); 55 | int AlignmentToExtendedCigarArray(unsigned char* alignment, int alignmentLength, std::vector &cigar); 56 | std::string AlignmentToMD(std::vector& alignment, const int8_t *ref_data, int64_t alignment_position_start); 57 | 58 | /// Searches for consecutive EDLIB_I and EDLIB_D (or vice versa) operations, and replaces the overlap with EDLIB_X. 59 | std::vector FixAlignment(unsigned char* alignment, int alignmentLength); 60 | /// In case an alignment has leading/trailing EDLIB_I operations, they will be replaced with EDLIB_S. 
61 | int ConvertInsertionsToClipping(unsigned char* alignment, int alignmentLength); 62 | /// Counts the number of leading and trailing clipped bases (or insertions). 63 | int CountClippedBases(unsigned char* alignment, int alignmentLength, int64_t *ret_num_clipped_front, int64_t *ret_num_clipped_back); 64 | /// Sums up the bases on the reference the alignment spans through (EDLIB_M and EDLIB_D operations). 65 | int64_t CalculateReconstructedLength(unsigned char *alignment, int alignmentLength); 66 | 67 | int CalculateAlignmentScore(std::vector& alignment, int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend); 68 | 69 | /// Counts each operation type, and calculates the alignment score as well (while rescoring the alignment with the given scores/penalties). 70 | int CountAlignmentOperations(std::vector &alignment, const int8_t *read_data, const int8_t *ref_data, int64_t reference_hit_id, int64_t alignment_position_start, SeqOrientation orientation, 71 | int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend, 72 | bool skip_leading_and_trailing_insertions, 73 | int64_t *ret_eq, int64_t *ret_x, int64_t *ret_i, int64_t *ret_d, int64_t *ret_alignment_score, int64_t *ret_edit_dist, int64_t *ret_nonclipped_length); 74 | /// Reverses the operations in a CIGAR string. 
75 | std::string ReverseCigarString(std::string &cigar); 76 | 77 | std::string PrintAlignmentToString(const unsigned char* query, const int queryLength, 78 | const unsigned char* target, const int targetLength, 79 | const unsigned char* alignment, const int alignmentLength, 80 | const int position, const int modeCode, int row_width=100); 81 | 82 | int GetAlignmentPatterns(const unsigned char* query, const int64_t queryLength, 83 | const unsigned char* target, const int64_t targetLength, 84 | const unsigned char* alignment, const int64_t alignmentLength, 85 | std::string &ret_query, std::string &ret_target, std::string &ret_match_pattern); 86 | 87 | void FixAlignmentLeadingTrailingID(std::vector& alignment, int64_t *ref_start, int64_t *ref_end); 88 | 89 | #endif /* CIGARGEN_H_ */ 90 | -------------------------------------------------------------------------------- /src/alignment/transcriptome_mod.h: -------------------------------------------------------------------------------- 1 | /* 2 | * transcriptome_mod.h 3 | * 4 | * Created on: Jan 5, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_ 9 | #define SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_ 10 | 11 | //#include "index/index.h" 12 | //#include "index/index_spaced_hash_fast.h" 13 | #include "minimizer_index/minimizer_index.h" 14 | #include "containers/results.h" 15 | #include "program_parameters.h" 16 | #include "graphmap/transcriptome.h" 17 | #include 18 | 19 | int ConvertFromTranscriptomeToGenomeAln(const ProgramParameters *parameters, std::shared_ptr index, std::shared_ptr transcriptome, AlignmentResults *aln); 20 | 21 | #endif /* SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_ */ 22 | -------------------------------------------------------------------------------- /src/containers/mapping_data.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * mapping_data.cc 3 | * 4 | * Created on: Mar 19, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #include 
"mapping_data.h" 9 | 10 | MappingData::MappingData() { 11 | bins.clear(); 12 | intermediate_mappings.clear(); 13 | final_mapping_ptrs.clear(); 14 | 15 | bin_size = -1; 16 | 17 | num_seeds_over_limit = 0; 18 | num_seeds_with_no_hits = 0; 19 | num_seeds_errors = 0; 20 | 21 | num_similar_mappings = 0; 22 | num_same_mappings = 0; 23 | avg_covered_bases_of_all_mappings = 0; 24 | std_covered_bases_of_all_mappings = 0; 25 | median_covered_bases_of_all_mappings = 0; 26 | 27 | iteration = 0; 28 | 29 | unmapped_reason = std::string(""); 30 | 31 | num_region_iterations = 0; 32 | mapping_quality = 0; 33 | metagen_alignment_score = 0; 34 | 35 | time_region_selection = 0.0; 36 | time_mapping = 0.0; 37 | time_alignment = 0.0; 38 | time_region_seed_lookup = 0.0; 39 | time_region_hitsort = 0.0; 40 | time_region_conversion = 0.0; 41 | time_region_alloc = 0.0; 42 | time_region_counting = 0.0; 43 | } 44 | 45 | MappingData::~MappingData() { 46 | clear(); 47 | } 48 | 49 | bool MappingData::IsMapped() { 50 | for (int32_t i=0; iIsMapped() == true) { return true; }; 52 | } 53 | return false; 54 | } 55 | 56 | bool MappingData::IsAligned() { 57 | for (int32_t i=0; iIsAligned() == true) { return true; }; 59 | } 60 | return false; 61 | } 62 | 63 | std::string MappingData::VerboseMappingDataToString_(const std::vector *mapping_data, std::shared_ptr index, const SingleSequence *read) const { 64 | std::stringstream ss; 65 | 66 | int64_t reference_length = index->get_data().size(); 67 | int64_t read_length = read->get_data_length(); 68 | 69 | ss << "-----------------------\n"; 70 | ss << "--- num_entries = " << mapping_data->size() << "\n"; 71 | ss << "--- read id = " << read->get_sequence_absolute_id() << "\n"; 72 | ss << "--- read name = " << read->get_header() << "\n"; 73 | ss << "--- read_length = " << read_length << "\n"; 74 | ss << "--- reference_length = " << reference_length << "\n"; 75 | 76 | for (int64_t i = (mapping_data->size() - 1); i >= 0; i--) { 77 | // ss << "--- [" << i << "] "; 
78 | ss << "[" << i << "/" << mapping_data->size() << "] "; 79 | int64_t start_location = 0, start_location_raw = 0; 80 | 81 | ss << "local_score_id = " << mapping_data->at(i)->get_mapping_data().local_score_id; 82 | ss << "\n ° " << mapping_data->at(i)->VerboseToString(); 83 | ss << "\n ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", fwd_r_id = " << (mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()) << ", region_index = " << mapping_data->at(i)->get_region_data().region_index; 84 | ss << "\n ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\""; 85 | ss << "\n ° Unmapped reason: \"" << unmapped_reason << "\""; 86 | int64_t relative_position = 0; 87 | int64_t absolute_position = 0; 88 | SeqOrientation orientation = kForward; 89 | 90 | ///// TODO: 06.02.2017. 91 | // This chunk below was removed due to the incompatibilities with the new index. 92 | // int64_t reference_id = index->RawPositionConverter(start_location, 0, &absolute_position, &relative_position, &orientation); 93 | int64_t reference_id = mapping_data->at(i)->get_region_data().reference_id; 94 | 95 | int64_t reference_start = mapping_data->at(i)->get_mapping_data().ref_coords.start; 96 | // index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.start, 0, &absolute_position, &reference_start, &orientation); 97 | int64_t reference_end = mapping_data->at(i)->get_mapping_data().ref_coords.end; 98 | // index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.end, 0, &absolute_position, &reference_end, &orientation); 99 | 100 | for (int64_t j = 0; j < mapping_data->at(i)->get_alignments().size(); j++) { 101 | ss << "\n ° Alignment " << j << " / " << mapping_data->at(i)->get_alignments().size(); 102 | ss << "\n ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", region_index = " << 
mapping_data->at(i)->get_region_data().region_index << ", region_votes = " << mapping_data->at(i)->get_region_data().region_votes << ", position = " << relative_position << ", r1[" << reference_start << ", " << reference_end << "], " << ((orientation == kForward) ? "forward" : "reverse"); 103 | ss << ", sam_NM = " << mapping_data->at(i)->get_alignments()[j].edit_distance << ", sam_AS = " << mapping_data->at(i)->get_alignments()[j].alignment_score << ", sam_evalue = " << mapping_data->at(i)->get_alignments()[j].evalue << ", sam_pos = " << mapping_data->at(i)->get_alignments()[j].ref_start << ", sam_mapq = " << ((int64_t) mapping_data->at(i)->get_alignments()[j].mapping_quality) << ", relative_position = " << relative_position; 104 | ss << "\n ° r_len = " << index->get_reference_lengths()[mapping_data->at(i)->get_region_data().reference_id] << ", l1_l = " << mapping_data->at(i)->get_l1_data().l1_l << 105 | ", match_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_eq_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << 106 | ", error_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops + mapping_data->at(i)->get_alignments()[j].num_d_ops + mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << 107 | " (X: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << 108 | ", I = " << ((float) mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << 109 | ", D: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_d_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << ")"; 110 | 111 | ss << "\n ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\""; 112 | } 113 | ss << "\n-----------"; 
114 | if (i == 0) { 115 | ss << "\n"; 116 | } 117 | ss << "\n"; 118 | } 119 | 120 | return ss.str(); 121 | } 122 | 123 | std::string MappingData::VerboseFinalMappingsToString(std::shared_ptr index, const SingleSequence *read) const { 124 | return VerboseMappingDataToString_(&final_mapping_ptrs, index, read); 125 | } 126 | 127 | std::string MappingData::VerboseIntermediateMappingsToString(std::shared_ptr index, const SingleSequence *read) const { 128 | return VerboseMappingDataToString_(&intermediate_mappings, index, read); 129 | } 130 | 131 | void MappingData::clear() { 132 | vertices.Clear(); 133 | bins.clear(); 134 | for (int64_t i = 0; i < intermediate_mappings.size(); i++) { 135 | if (intermediate_mappings[i]) 136 | delete intermediate_mappings[i]; 137 | intermediate_mappings[i] = NULL; 138 | } 139 | intermediate_mappings.clear(); 140 | final_mapping_ptrs.clear(); 141 | unmapped_reason = std::string(""); 142 | num_region_iterations = 0; 143 | mapping_quality = 0; 144 | metagen_alignment_score = 0; 145 | } 146 | -------------------------------------------------------------------------------- /src/containers/mapping_data.h: -------------------------------------------------------------------------------- 1 | /* 2 | * mapping_data.h 3 | * 4 | * Created on: Mar 19, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef MAPPING_DATA_H_ 9 | #define MAPPING_DATA_H_ 10 | 11 | #include "log_system/log_system.h" 12 | #include "program_parameters.h" 13 | #include "utility/utility_general.h" 14 | #include "containers/region.h" 15 | //#include "index/index.h" 16 | //#include "index/index_hash.h" 17 | //#include "index/index_sa.h" 18 | #include "minimizer_index/minimizer_index.h" 19 | #include "containers/vertices.h" 20 | #include "utility/evalue.h" 21 | #include "containers/path_graph_entry.h" 22 | 23 | //#define UNMAPPED_CODE_NO_VALID_GRAPH_PATHS (1 << 0) 24 | 25 | #define MAPPED_CODE_READ_UNPROCESSED_YET (0) 26 | #define MAPPED_CODE_UNIQUE_MAPPING (1 << 0) 27 | #define 
MAPPED_CODE_MULTIPLE_EQ_MAPPINGS (1 << 1) 28 | 29 | #define ITERATION_RESET_LIMIT ((int64_t) 0x1000000000000000) 30 | 31 | 32 | 33 | struct ChromosomeBin { 34 | int64_t reference_id = 0; 35 | int64_t bin_id = 0; 36 | float bin_value = 0.0f; 37 | }; 38 | 39 | struct bins_greater_than_key 40 | { 41 | inline bool operator() (const ChromosomeBin& op1, const ChromosomeBin& op2) { 42 | if (op1.bin_value > op2.bin_value) 43 | return true; 44 | return false; 45 | } 46 | }; 47 | 48 | class MappingData { 49 | public: 50 | MappingData(); 51 | ~MappingData(); 52 | 53 | void clear(); 54 | 55 | Vertices vertices; 56 | std::vector bins; 57 | std::vector intermediate_mappings; 58 | std::vector final_mapping_ptrs; // Do not free the pointers here! Bad design. These point to intermediate_mappings pointers, which will be freed upon destruction. 59 | 60 | int64_t bin_size; 61 | int64_t num_seeds_over_limit; 62 | int64_t num_seeds_with_no_hits; 63 | int64_t num_seeds_errors; 64 | int64_t iteration; 65 | 66 | int64_t num_similar_mappings; // Number of found mapping positions with very similar (estimated) scores. E.g. to within some difference from the top mapping. 
67 | int64_t num_same_mappings; 68 | int64_t avg_covered_bases_of_all_mappings; 69 | int64_t std_covered_bases_of_all_mappings; 70 | int64_t median_covered_bases_of_all_mappings; 71 | 72 | std::string unmapped_reason; 73 | 74 | int64_t num_region_iterations; 75 | int8_t mapping_quality; 76 | int64_t metagen_alignment_score; 77 | 78 | double time_region_selection; 79 | double time_mapping; 80 | double time_alignment; 81 | double time_region_seed_lookup; 82 | double time_region_hitsort; 83 | double time_region_conversion; 84 | double time_region_alloc; 85 | double time_region_counting; 86 | 87 | bool IsMapped(); 88 | bool IsAligned(); 89 | 90 | std::string VerboseFinalMappingsToString(std::shared_ptr index, const SingleSequence *read) const; 91 | std::string VerboseIntermediateMappingsToString(std::shared_ptr index, const SingleSequence *read) const; 92 | 93 | private: 94 | std::string VerboseMappingDataToString_(const std::vector *mapping_data, std::shared_ptr index, const SingleSequence *read) const; 95 | 96 | }; 97 | 98 | #endif /* MAPPING_DATA_H_ */ 99 | -------------------------------------------------------------------------------- /src/containers/range.h: -------------------------------------------------------------------------------- 1 | /* 2 | * range.h 3 | * 4 | * Created on: Jul 16, 2014 5 | * Author: ivan 6 | */ 7 | 8 | #ifndef RANGE_H_ 9 | #define RANGE_H_ 10 | 11 | #include 12 | 13 | class Range { 14 | public: 15 | Range() : start(0), end(0) { } 16 | Range(int64_t _start, int64_t _end) : start(_start), end(_end) { } 17 | 18 | int64_t dist() const { 19 | return (end - start); 20 | } 21 | 22 | int64_t start = 0; 23 | int64_t end = 0; 24 | }; 25 | 26 | #endif /* RANGE_H_ */ 27 | -------------------------------------------------------------------------------- /src/containers/raw_alignment.h: -------------------------------------------------------------------------------- 1 | /* 2 | * raw_alignment.h 3 | * 4 | * Created on: Nov 12, 2015 5 | * Author: isovic 
6 | */ 7 | 8 | #ifndef SRC_CONTAINERS_RAW_ALIGNMENT_H_ 9 | #define SRC_CONTAINERS_RAW_ALIGNMENT_H_ 10 | 11 | struct RawAlignment { 12 | int64_t aln_start = 0; 13 | int64_t aln_end = 0; 14 | std::vector alignment; 15 | std::string cigar = "*"; 16 | std::string md = "*"; /// MD field from SAM output. 17 | SeqOrientation orientation = kForward; 18 | int64_t ref_id = 0; 19 | std::string ref_header = ""; 20 | int64_t query_id = 0; 21 | std::string query_header = ""; 22 | int64_t eq_ops = 0, x_ops = 0, i_ops = 0, d_ops = 0; /// Counts of CIGAR operations. 23 | int64_t aligned_len = 0; /// Number of aligned bases from the read (not counting clipped bases). 24 | int64_t num_clipped_front = 0; /// Number of clipped bases at the beginning of the read. 25 | int64_t num_clipped_back = 0; /// Number of clipped bases at the end of the read. 26 | }; 27 | 28 | #endif /* SRC_CONTAINERS_RAW_ALIGNMENT_H_ */ 29 | -------------------------------------------------------------------------------- /src/containers/region.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * region.cc 3 | * 4 | * Created on: Dec 26, 2014 5 | * Author: isovic 6 | */ 7 | 8 | #include "containers/region.h" 9 | 10 | //int CopyLinearRegion(const Index *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset) { 11 | // if (region->is_split == true) 12 | // return 1; 13 | // 14 | // int8_t *data_copy = new int8_t[(region->end - region->start + 1) + 1]; 15 | // if (data_copy == NULL) { 16 | // return 3; 17 | // } 18 | // 19 | // memmove((data_copy), &(index_reference->get_data()[region->start]), (region->end - region->start + 1)); 20 | // 21 | // data_copy[(region->end - region->start + 1)] = '\0'; 22 | // 23 | // *ret_concatenated_data = data_copy; 24 | // *ret_data_length = (region->end - region->start + 1); 25 | // *ret_start_offset = region->start; 26 | // 27 | // return 0; 28 | //} 29 | 30 | int 
ConcatenateSplitRegion(std::shared_ptr index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end) { 31 | if (region->is_split == false) 32 | return 1; 33 | 34 | int64_t region_length_first = (region->end - region->start + 1); 35 | int64_t region_length_second = (region->split_end - region->split_start + 1); 36 | int64_t region_length_joined = region_length_first + region_length_second; 37 | if (region_length_first <= 0 || region_length_second <= 0 || region_length_joined <= 0) 38 | return 2; 39 | 40 | int8_t *data_copy = new int8_t[region_length_joined + 1]; 41 | if (data_copy == NULL) { 42 | return 3; 43 | } 44 | 45 | int64_t start_offset = 0; 46 | int64_t position_of_ref_end = 0; 47 | 48 | // If the main region is at the beginning of the reference. The region is then expanded towards left and right, but on the left it zips back 49 | // to the end of the circular reference. 50 | if (region->start < region->split_start) { 51 | memmove(data_copy, &(index_reference->get_data()[region->split_start]), region_length_second); 52 | memmove((data_copy + region_length_second), &(index_reference->get_data()[region->start]), region_length_first); 53 | position_of_ref_end = region->split_end - region->split_start; // + 1; 54 | start_offset = region->split_start; 55 | 56 | // If the main region is at the end of the reference. The region is then expanded towards left and right, but on the right it zips back 57 | // to the beginning of the circular reference. 
58 | } else { 59 | memmove((data_copy), &(index_reference->get_data()[region->start]), region_length_first); 60 | memmove((data_copy + region_length_first), &(index_reference->get_data()[region->split_start]), region_length_second); 61 | position_of_ref_end = region->end - region->start; 62 | start_offset = region->start; 63 | 64 | } 65 | 66 | data_copy[region_length_joined] = '\0'; 67 | 68 | *ret_concatenated_data = data_copy; 69 | *ret_data_length = region_length_joined; 70 | *ret_start_offset = start_offset; 71 | *ret_position_of_ref_end = position_of_ref_end; 72 | 73 | return 0; 74 | } 75 | 76 | int GetRegionData(std::shared_ptr index, const Region *region, 77 | int8_t **region_data, int64_t *data_len, int64_t *index_reg_start, int64_t *pos_of_ref_end, bool *is_cleanup_required) { 78 | 79 | if (region->is_split == false) { 80 | *region_data = (int8_t *) (&index->get_data()[0] + region->start); 81 | *data_len = (region->end - region->start); 82 | *index_reg_start = region->start; 83 | *pos_of_ref_end = -1; 84 | *is_cleanup_required = false; 85 | 86 | } else { 87 | ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_start, pos_of_ref_end); 88 | *is_cleanup_required = true; 89 | 90 | } 91 | 92 | return 0; 93 | } 94 | 95 | //int GetRegionDataCopy(const Index *index, const Region *region, 96 | // int8_t **region_data, int64_t *data_len, int64_t *index_reg_pos, int64_t *reg_pos_of_ref_end) { 97 | // 98 | // if (region->is_split == false) { 99 | // CopyLinearRegion(index, region, region_data, data_len, index_reg_pos); 100 | // *reg_pos_of_ref_end = -1; 101 | // 102 | // } else { 103 | // ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_pos, reg_pos_of_ref_end); 104 | // 105 | // } 106 | // 107 | // return 0; 108 | //} 109 | 110 | std::string VerboseRegionAsString(Region ®ion) { 111 | std::stringstream ss; 112 | 113 | ss << "start = " << region.start; 114 | ss << ", end = " << region.end; 115 | ss << ", reference_id = " << 
region.reference_id; 116 | ss << ", region_index = " << region.region_index; 117 | ss << ", region_votes = " << region.region_votes; 118 | ss << ", is_split = " << ((int) region.is_split); 119 | ss << ", split_start = " << region.split_start; 120 | ss << ", split_end = " << region.split_end; 121 | 122 | return ss.str(); 123 | } 124 | -------------------------------------------------------------------------------- /src/containers/region.h: -------------------------------------------------------------------------------- 1 | /* 2 | * region.h 3 | * 4 | * Created on: Dec 21, 2014 5 | * Author: ivan 6 | */ 7 | 8 | #ifndef REGION_H_ 9 | #define REGION_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | // #include "index/index.h" 15 | #include "minimizer_index/minimizer_index.h" 16 | 17 | struct Region { 18 | int64_t start = 0; 19 | int64_t end = 0; 20 | int64_t reference_id = -1; 21 | std::string rname; 22 | int64_t region_index = -1; 23 | int64_t region_votes = 0; 24 | bool is_split = false; 25 | int64_t split_start = 0; 26 | int64_t split_end = 0; 27 | }; 28 | 29 | //// Creates a copy of the region data from the Index. 30 | //int CopyLinearRegion(const MinimizerIndex *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset); 31 | 32 | // If the region is split in two parts, that is if the genome is circular, this function copies both parts in a new data array. 33 | // It is users responsibility to free the allocated space using delete[]. 34 | int ConcatenateSplitRegion(std::shared_ptr index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end); 35 | 36 | // Checks if the region is linear or split. If the region is linear, it returns the pointer to the existing part of the Index data and is_cleanup_required is set to false. 
37 | // Otherwise, a new data array is allocated and the data copied from the split parts of the Index. 38 | // If the is_cleanup_required parameter is true, region_data was allocated with new[] and needs to be freed by the user using delete[]. 39 | int GetRegionData(std::shared_ptr index, const Region *region, 40 | int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end, bool *is_cleanup_required); 41 | 42 | //// Checks if the region is linear or split. It copies the data to a new array, and returns the pointer to the region data. 43 | //// region_data needs to be freed by the user using free(). 44 | //int GetRegionDataCopy(const MinimizerIndex *index, const Region *region, 45 | // int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end); 46 | 47 | // Returns a verbose string listing the region's fields. 48 | std::string VerboseRegionAsString(Region &region); 49 | 50 | #endif /* REGION_H_ */ 51 | -------------------------------------------------------------------------------- /src/containers/results.h: -------------------------------------------------------------------------------- 1 | /* 2 | * results.h 3 | * 4 | * Created on: Jan 16, 2016 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_CONTAINERS_RESULTS_H_ 9 | #define SRC_CONTAINERS_RESULTS_H_ 10 | 11 | #include 12 | #include 13 | #include "containers/range.h" 14 | #include "utility/utility_general.h" 15 | #include "containers/region.h" 16 | 17 | 18 | 19 | typedef struct Cluster { 20 | public: 21 | Range query; 22 | Range ref; 23 | int32_t num_anchors = 0; 24 | int32_t coverage = 0; 25 | bool valid = false; 26 | SeqOrientation orientation; 27 | Region region; 28 | } Cluster; 29 | 30 | typedef struct MappingResults { 31 | int64_t lcs_length = 0; 32 | int64_t cov_bases_max = 0; 33 | int64_t cov_bases_query = 0; 34 | int64_t cov_bases_ref = 0; 35 | int64_t num_covering_kmers = 0; 36 | float deviation = 0.0f; 37 | Range query_coords; 38 | Range ref_coords; 39 | bool is_mapped = false; 40 | bool
is_reverse = false; 41 | int64_t local_score_id = 0; 42 | std::vector clusters; 43 | 44 | // int64_t num_same_mappings = 0; // How many mapping positions have exactly the same score. 45 | } MappingResults; 46 | 47 | typedef struct L1Results { 48 | int64_t l1_l = 0; 49 | double l1_k = 1.0f; 50 | int64_t l1_lmin = 0; 51 | int64_t l1_lmax = 0; 52 | double l1_confidence_abs = 0; 53 | double l1_std = 0; 54 | int64_t l1_rough_start = 0; 55 | int64_t l1_rough_end = 0; 56 | } L1Results; 57 | 58 | typedef struct AlignmentResults { 59 | bool is_aligned = false; 60 | bool is_reverse = false; // This should be deprecated and replaced with 'orientation'. 61 | int64_t ref_start = 0; // Starting position of the alignment on the reference. If orientation == kReverse, this assumes that the read should be reverse complemented and the reference stays fwd. ref_start is adjusted accordingly to denote the starting position of the alignment of the reversed read. 62 | int64_t ref_end = 0; // See ref_start. This is the end position of the alignment. 63 | int64_t query_start = 0; // Starting position of the alignment on the read. Everything before this position should be clipped. 64 | int64_t query_end = 0; // Ending position of the alignment on the read. Everything after this position should be clipped. 65 | std::string cigar = "*"; // In case orientation == kReverse, 'cigar' contains the reverse of the 'alignment' operations. 66 | std::string md = ""; 67 | int64_t edit_distance = 0; 68 | int64_t alignment_score = 0; 69 | int64_t mapping_quality = 0; 70 | double evalue = 0.0f; 71 | int64_t num_secondary_alns = 0; // How many mapping positions have similar score. 72 | 73 | int64_t raw_pos_start = 0; // Internally, the fwd read is mapped to a reference and its reverse complement (which have been joined in a single massive sequence). The raw_pos_start then holds the absolute coordinate of the alignment in such joined sequence data. 74 | int64_t raw_pos_end = 0; // See raw_pos_start.
This is the end position of the alignment in global coordinates. 75 | std::vector raw_alignment; // Holds the alignment in the global coordinate space (between raw_pos_start and raw_pos_end). Cannot be used with ref_start and ref_end in case the read should be reverse complemented. In this case, the alignment needs to be reversed. 76 | std::vector alignment; // Holds the alignment in the local coordinate space (between ref_start and ref_end). If orientation == kForward, alignment == raw_alignment. Otherwise it's the reverse complement. 77 | 78 | SeqOrientation orientation = kForward; 79 | int64_t ref_id = -1; 80 | std::string ref_header = "*"; 81 | int64_t ref_len = 0; 82 | int64_t query_id = -1; 83 | std::string query_header = "*"; 84 | int64_t query_len = 0; 85 | 86 | int64_t num_eq_ops = 0; 87 | int64_t num_x_ops = 0; 88 | int64_t num_i_ops = 0; 89 | int64_t num_d_ops = 0; 90 | int64_t nonclipped_length = 0; 91 | 92 | // int8_t *ref_data = NULL; 93 | // int8_t *read_data = NULL; 94 | 95 | // These are parameters of alignment which were used to produce the results. 96 | int32_t aln_mode_code = 0; // Type of alignment which was performed to produce the results stored in this structure. 97 | 98 | int64_t reg_pos_start = 0; // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function. 99 | int64_t reg_pos_end = 0; // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function.
100 | 101 | } AlignmentResults; 102 | 103 | 104 | 105 | typedef struct MappingMetadata { 106 | std::string unmapped_reason = "Not processed."; 107 | 108 | double time_region_selection = 0.0; 109 | double time_mapping = 0.0; 110 | double time_alignment = 0.0; 111 | double time_region_seed_lookup = 0.0; 112 | double time_region_hitsort = 0.0; 113 | double time_region_conversion = 0.0; 114 | double time_region_alloc = 0.0; 115 | double time_region_counting = 0.0; 116 | 117 | 118 | 119 | } MappingMetadata; 120 | 121 | #endif /* SRC_CONTAINERS_RESULTS_H_ */ 122 | -------------------------------------------------------------------------------- /src/containers/score_registry.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * score_registry.cc 3 | * 4 | * Created on: Jul 14, 2014 5 | * Author: ivan 6 | */ 7 | 8 | #include "containers/score_registry.h" 9 | 10 | ScoreRegistry::ScoreRegistry() { 11 | scores_id_ = 0; 12 | } 13 | 14 | ScoreRegistry::ScoreRegistry(const Region& region, int64_t scores_id) { 15 | set_region(region); 16 | set_scores_id(scores_id); 17 | } 18 | 19 | ScoreRegistry::~ScoreRegistry() { 20 | Clear(); 21 | } 22 | 23 | void ScoreRegistry::Clear() { 24 | // registry_.clear(); 25 | registry_entries_.Clear(); 26 | scores_id_ = 0; 27 | } 28 | 29 | void ScoreRegistry::Add(Vertices &src_vertices, int64_t vertex_idx) { 30 | // registry_.push_back(vertex_data); 31 | registry_entries_.Add(src_vertices, vertex_idx); 32 | } 33 | 34 | void ScoreRegistry::Register(Vertices &src_vertices, int64_t vertex_idx) { 35 | if (src_vertices.registry_numbers[vertex_idx] < 0) { // || vertex_data.registry_number >= registry_.size()) { 36 | src_vertices.registry_numbers[vertex_idx] = registry_entries_.num_vertices; 37 | registry_entries_.Add(src_vertices, vertex_idx); 38 | 39 | } 40 | else { 41 | // Handle the case where a repeating kmer causes a 'jump' in the middle of an existing long path. 
42 | // Edit 07.11.2014.: Because of the condition that a kmer needs to be within l iterations from the 43 | // vertex's path that it want's to extend, the kmer cannot hit it somewhere in the middle of the path. 44 | // It can only occur near the end of the path, and can only cause the path to have a more-or-less 45 | // even/uneven length in the reference and the query. For this reason, I think that forking a path 46 | // is perhaps not a good option, but instead to check for its ratio in query and in reference, and 47 | // choose to extend the path with the new kmer only if the ratio is closer to 1.0f. 48 | // For precaution sake, I'll keep the previous version here in comments. 49 | // if (vertex_data.covered_bases < registry_[vertex_data.registry_number].covered_bases) { 50 | // vertex_data.registry_number = registry_.size(); 51 | // registry_.push_back(vertex_data); 52 | // } else { 53 | // registry_[vertex_data.registry_number] = vertex_data; 54 | // } 55 | 56 | int64_t registry_number = src_vertices.registry_numbers[vertex_idx]; 57 | 58 | if ((src_vertices.num_kmers[vertex_idx] > registry_entries_.num_kmers[registry_number]) || 59 | (src_vertices.num_kmers[vertex_idx] <= registry_entries_.num_kmers[registry_number] && 60 | src_vertices.CalculateSuppress(vertex_idx) < registry_entries_.CalculateSuppress(registry_number))) { 61 | 62 | registry_entries_.CopyValuesFromOut(src_vertices, vertex_idx, registry_number); 63 | } 64 | } 65 | } 66 | 67 | std::string ScoreRegistry::VerboseToString() { 68 | std::stringstream ss; 69 | 70 | ss << "Num scores: " << registry_entries_.num_vertices << std::endl; 71 | 72 | for (int64_t i=0; i 12 | #include 13 | #include 14 | #include 15 | #include "containers/region.h" 16 | #include "sequences/single_sequence.h" 17 | #include "sequences/sequence_file.h" 18 | #include "containers/vertices.h" 19 | 20 | class ScoreRegistry { 21 | public: 22 | ScoreRegistry(); 23 | ScoreRegistry(const Region& region, int64_t scores_id); 24 | 
~ScoreRegistry(); 25 | 26 | /// Empties the registry and sets all values to zero. 27 | void Clear(); 28 | 29 | /// Simply appends the vertex to the end of the registry. 30 | /// No additional checks are performed. 31 | void Add(Vertices &src_vertices, int64_t vertex_idx); 32 | 33 | /// If the vertex has no registry number yet (< 0), it is appended to the end of the registry 34 | /// and its registry number is set to the index of the new entry. Otherwise, the existing entry 35 | /// with that registry number is overwritten, but only if the vertex has more kmers than the entry 36 | /// or a smaller suppress value (CalculateSuppress()); if neither holds, nothing changes. 37 | void Register(Vertices &src_vertices, int64_t vertex_idx); 38 | 39 | // Allocates space for vertices. 40 | void Reserve(int64_t size); 41 | 42 | /// Formats the debug verbose to a std::string. 43 | std::string VerboseToString(); 44 | const Region& get_region() const; 45 | void set_region(const Region& region); 46 | int64_t get_scores_id() const; 47 | void set_scores_id(int64_t scoresId); 48 | const Vertices& get_registry_entries() const; 49 | void set_registry_entries(Vertices& registryEntries); 50 | 51 | private: 52 | Vertices registry_entries_; 53 | Region region_; 54 | int64_t scores_id_; 55 | }; 56 | 57 | #endif /* SCORE_REGISTRY_H_ */ 58 | -------------------------------------------------------------------------------- /src/containers/vertices.h: -------------------------------------------------------------------------------- 1 | /* 2 | * vertices.h 3 | * 4 | * Created on: Feb 13, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef VERTICES_H_ 9 | #define VERTICES_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "log_system/log_system.h" 16 | 17 | 18 | 19 | // Quite an ugly data structure, but very cache friendly.
20 | class Vertices { 21 | public: 22 | int64_t *timestamps; 23 | int64_t *reference_starts; 24 | int64_t *reference_ends; 25 | int64_t *query_starts; 26 | int64_t *query_ends; 27 | int64_t *num_kmers; 28 | int64_t *covered_bases_queries; 29 | int64_t *covered_bases_references; 30 | int64_t *registry_numbers; 31 | 32 | int64_t num_vertices; 33 | int64_t container_capacity; 34 | 35 | Vertices(); 36 | ~Vertices(); 37 | void Clear(); 38 | 39 | inline int Init(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start, 40 | int64_t query_start, int64_t kmer_length, int64_t registry_number) { 41 | return Set(dest_vertex_idx, timestamp, reference_start, reference_start, query_start, query_start, 1, kmer_length, kmer_length, registry_number); 42 | } 43 | 44 | inline int Set(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start, 45 | int64_t reference_end, int64_t query_start, int64_t query_end, 46 | int64_t num_kmer, int64_t covered_bases_query, 47 | int64_t covered_bases_reference, int64_t registry_number) { 48 | if (dest_vertex_idx >= num_vertices || dest_vertex_idx < 0) { 49 | return 1; 50 | } 51 | 52 | timestamps[dest_vertex_idx] = timestamp; 53 | reference_starts[dest_vertex_idx] = reference_start; 54 | reference_ends[dest_vertex_idx] = reference_end; 55 | query_starts[dest_vertex_idx] = query_start; 56 | query_ends[dest_vertex_idx] = query_end; 57 | num_kmers[dest_vertex_idx] = num_kmer; 58 | covered_bases_queries[dest_vertex_idx] = covered_bases_query; 59 | covered_bases_references[dest_vertex_idx] = covered_bases_reference; 60 | registry_numbers[dest_vertex_idx] = registry_number; 61 | 62 | return 0; 63 | } 64 | 65 | inline int Add(int64_t timestamp, int64_t reference_start, 66 | int64_t reference_end, int64_t query_start, int64_t query_end, 67 | int64_t num_kmer, int64_t covered_bases_query, 68 | int64_t covered_bases_reference, int64_t registry_number) { 69 | if (num_vertices >= container_capacity) { 70 | Reserve(container_capacity + 
capacity_increment_size_); 71 | } 72 | 73 | num_vertices += 1; 74 | Set((num_vertices - 1), timestamp, reference_start, reference_end, query_start, query_end, num_kmer, covered_bases_query, covered_bases_reference, registry_number); 75 | 76 | return 0; 77 | } 78 | 79 | inline int Add(const Vertices &src_vertices, int64_t src_vertex_idx) { 80 | return Add(src_vertices.timestamps[src_vertex_idx], 81 | src_vertices.reference_starts[src_vertex_idx], 82 | src_vertices.reference_ends[src_vertex_idx], 83 | src_vertices.query_starts[src_vertex_idx], 84 | src_vertices.query_ends[src_vertex_idx], 85 | src_vertices.num_kmers[src_vertex_idx], 86 | src_vertices.covered_bases_queries[src_vertex_idx], 87 | src_vertices.covered_bases_references[src_vertex_idx], 88 | src_vertices.registry_numbers[src_vertex_idx]); 89 | } 90 | 91 | void Reserve(int64_t size); 92 | void Resize(int64_t size); 93 | 94 | inline int CopyValuesWithin(int64_t source_idx, int64_t dest_idx) { 95 | if (source_idx >= num_vertices || dest_idx >= num_vertices || source_idx < 0 || dest_idx < 0) { 96 | LogSystem::GetInstance().Error(SEVERITY_INT_WARNING, __FUNCTION__, LogSystem::GetInstance().GenerateErrorMessage(ERR_MEMORY, "When CopyValuesWithin is called. 
source_idx = %ld, dest_idx = %ld, num_vertices = %ld\n", source_idx, dest_idx, num_vertices)); 97 | return 1; 98 | } 99 | 100 | timestamps[dest_idx] = timestamps[source_idx]; 101 | reference_starts[dest_idx] = reference_starts[source_idx]; 102 | reference_ends[dest_idx] = reference_ends[source_idx]; 103 | query_starts[dest_idx] = query_starts[source_idx]; 104 | query_ends[dest_idx] = query_ends[source_idx]; 105 | num_kmers[dest_idx] = num_kmers[source_idx]; 106 | covered_bases_queries[dest_idx] = covered_bases_queries[source_idx]; 107 | covered_bases_references[dest_idx] = covered_bases_references[source_idx]; 108 | registry_numbers[dest_idx] = registry_numbers[source_idx]; 109 | 110 | return 0; 111 | } 112 | 113 | inline int CopyValuesFromOut(Vertices &src_vertices, int64_t src_vertex_idx, int64_t dest_idx) { 114 | return Set(dest_idx, 115 | src_vertices.timestamps[src_vertex_idx], 116 | src_vertices.reference_starts[src_vertex_idx], 117 | src_vertices.reference_ends[src_vertex_idx], 118 | src_vertices.query_starts[src_vertex_idx], 119 | src_vertices.query_ends[src_vertex_idx], 120 | src_vertices.num_kmers[src_vertex_idx], 121 | src_vertices.covered_bases_queries[src_vertex_idx], 122 | src_vertices.covered_bases_references[src_vertex_idx], 123 | src_vertices.registry_numbers[src_vertex_idx]); 124 | } 125 | 126 | inline void EraseValues() { 127 | if (num_vertices <= 0) 128 | return; 129 | 130 | memset(timestamps, -1, num_vertices); 131 | memset(reference_starts, 0, num_vertices); 132 | memset(reference_ends, 0, num_vertices); 133 | memset(query_starts, 0, num_vertices); 134 | memset(query_ends, 0, num_vertices); 135 | memset(num_kmers, 0, num_vertices); 136 | memset(covered_bases_queries, 0, num_vertices); 137 | memset(covered_bases_references, 0, num_vertices); 138 | memset(registry_numbers, -1, num_vertices); 139 | } 140 | 141 | inline float CalculateRatio(int64_t vertex_idx) { 142 | float ratio = 0.0f; 143 | int64_t query_start = query_starts[vertex_idx]; 144 | 
int64_t query_end = query_ends[vertex_idx]; 145 | int64_t reference_start = reference_starts[vertex_idx]; 146 | int64_t reference_end = reference_ends[vertex_idx]; 147 | 148 | int64_t query_distance = (query_end >= query_start) ? (query_end - query_start) : (query_start - query_end); 149 | int64_t ref_distance = (reference_end >= reference_start) ? (reference_end - reference_start) : (reference_start - reference_end); 150 | 151 | if (query_distance != 0) 152 | ratio = ((float) std::min(query_distance, ref_distance)) / ((float) std::max(query_distance, ref_distance)); 153 | else 154 | ratio = 1.0f; 155 | 156 | return ratio; 157 | } 158 | 159 | inline float CalculateSuppress(int64_t vertex_idx) { 160 | float ratio = 0.0f, ratio_suppress = 0.0f; 161 | 162 | ratio = CalculateRatio(vertex_idx); 163 | 164 | ratio_suppress = (ratio < 1.0f) ? (1.0f - ratio) : (ratio - 1.0f); 165 | 166 | return ratio_suppress; 167 | } 168 | 169 | inline std::string VerboseToString(int64_t vertex_idx) const { 170 | std::stringstream ret; 171 | 172 | if (vertex_idx < 0 || vertex_idx >= num_vertices) { 173 | ret << "Error with vertex_idx! 
vertex_idx = " << vertex_idx << ", containter_capacity = " << container_capacity << ", num_vertices = " << num_vertices; 174 | return ret.str(); 175 | } 176 | 177 | ret << "timestamp = " << timestamps[vertex_idx]; 178 | ret << "; q[" << query_starts[vertex_idx] << ", " << query_ends[vertex_idx] << "]; r[" << reference_starts[vertex_idx]<< ", " << 179 | reference_ends[vertex_idx] << 180 | "]; d[" << (query_ends[vertex_idx] - query_starts[vertex_idx]) << ", " << (reference_ends[vertex_idx] - reference_starts[vertex_idx]) << 181 | "]; length = " << num_kmers[vertex_idx] << 182 | "; dist_ratio = " << ((double) std::min((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) / ((double) std::max((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) << 183 | "; cov_bases_query = " << covered_bases_queries[vertex_idx] << "; cov_bases_ref = " << covered_bases_references[vertex_idx] << "; registry_num = " << registry_numbers[vertex_idx]; 184 | 185 | return ret.str(); 186 | } 187 | 188 | private: 189 | inline int ReallocArray_(int64_t **array_ptr, int64_t size); 190 | int64_t capacity_increment_size_; 191 | }; 192 | 193 | #endif /* VERTICES_H_ */ 194 | -------------------------------------------------------------------------------- /src/graphmap/experimental.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * experimental.cc 3 | * 4 | * Created on: Jan 20, 2016 5 | * Author: isovic 6 | */ 7 | 8 | #include "graphmap/graphmap.h" 9 | 10 | -------------------------------------------------------------------------------- /src/graphmap/filter_anchors.h: -------------------------------------------------------------------------------- 1 | /* 2 | * filter_anchors.h 3 | * 4 | * Created on: Mar 22, 2016 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_GRAPHMAP_FILTER_ANCHORS_H_ 9 | #define SRC_GRAPHMAP_FILTER_ANCHORS_H_ 10 | 
using int128_t = __int128;
using uint128_t = unsigned __int128;

/// Helpers for packing a seed hit into one 128-bit word, laid out as four
/// 32-bit fields (most significant first):
///      d                 c                     b                  a
///   ref_id << 96 | query_start << 64 | ref_start << 32 | query_id
///
/// Every macro argument is parenthesised so that arbitrary expressions
/// (e.g. `cond ? a : b`) can be passed safely.
#define get128_qid(x)       ((int32_t) ((x) & 0x0FFFFFFFF))
#define get128_rpos(x)      ((int32_t) (((x) >> 32) & 0x0FFFFFFFF))
#define get128_qpos(x)      ((int32_t) (((x) >> 64) & 0x0FFFFFFFF))
#define get128_rid(x)       ((int32_t) (((x) >> 96) & 0x0FFFFFFFF))

/// Each field is first narrowed to 32 bits. Without that step a negative value
/// would be sign-extended to 128 bits and overwrite all higher fields.
#define pack128(qstart,rstart,qid,rid) ((((uint128_t) (uint32_t) (rid)) << 96) | (((uint128_t) (uint32_t) (qstart)) << 64) | (((uint128_t) (uint32_t) (rstart)) << 32) | ((uint128_t) (uint32_t) (qid)))
int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); 55 | void GetPositionsFromRegistry(const Vertices& registry_entries, const std::vector &lcskpp_indices, int64_t lcskpp_id, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); 56 | void GetPositionsFrom128bit(const std::vector &hits, const std::vector &lcskpp_indices, int64_t lcskpp_id, int32_t seed_len, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); 57 | 58 | int FilterAnchorsByDiff(const SingleSequence* read, ScoreRegistry* local_score, const ProgramParameters *parameters, 59 | const std::vector &lcskpp_indices, std::vector &ret_filtered_lcskpp_indices); 60 | 61 | int FilterAnchorsByChaining(const SingleSequence* seq, ScoreRegistry* local_score, const ProgramParameters *parameters, 62 | const std::vector &lcskpp_indices, double indel_bandwidth_margin, int32_t max_dist, int32_t lookahead_dist_factor, int64_t min_covered_bases, int32_t cluster_size_cutoff, 63 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids); 64 | 65 | int GenerateClusters(int64_t min_num_anchors_in_cluster, int64_t min_cluster_length, int64_t min_cluster_covered_bases, float min_cluster_coverage, std::vector &lcskpp_indices, 66 | ScoreRegistry* local_score, MappingData* mapping_data, 67 | const SingleSequence* read, const ProgramParameters* parameters, std::vector &ret_clusters, 68 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids); 69 | int GenerateClustersDummy(int64_t min_cluster_length, float min_cluster_coverage, std::vector &lcskpp_indices, 70 | ScoreRegistry* local_score, MappingData* mapping_data, 71 | const SingleSequence* read, const ProgramParameters* parameters, std::vector &ret_clusters, 72 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids); 73 | 74 | int VerboseClustersToFile_(std::string out_file, const ScoreRegistry* local_score, const MappingData* mapping_data, std::vector> &indexes, const 
SingleSequence* read, const ProgramParameters* parameters, const std::vector &clusters); 75 | 76 | #endif /* SRC_GRAPHMAP_FILTER_ANCHORS_H_ */ 77 | -------------------------------------------------------------------------------- /src/graphmap/transcriptome.h: -------------------------------------------------------------------------------- 1 | /* 2 | * transcriptome.h 3 | * 4 | * Created on: Feb 6, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_GRAPHMAP_TRANSCRIPTOME_H_ 9 | #define SRC_GRAPHMAP_TRANSCRIPTOME_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "sequences/sequence_file.h" 18 | 19 | namespace is { 20 | 21 | class Transcriptome; 22 | 23 | std::shared_ptr createTranscriptome(); 24 | 25 | class Transcriptome { 26 | public: 27 | friend std::shared_ptr createTranscriptome(); 28 | ~Transcriptome(); 29 | 30 | /* Parses exons and extracts regions from the given GTF file. 31 | */ 32 | int LoadGTF(const std::string >f_path); 33 | 34 | /* Constructs transcriptome sequences from the preloaded GTF file. 35 | */ 36 | std::shared_ptr GenerateTranscriptomeSeqs(const std::shared_ptr sequences); 37 | 38 | /* Generates a header for a SAM file. The header is composed of 39 | * genomic sequence names. 40 | */ 41 | std::string GenerateSAMHeaders(); 42 | 43 | const std::map > >& get_genome_id_to_trans_id() const { 44 | return genome_id_to_trans_id_; 45 | } 46 | 47 | const std::map > >& get_trans_id_to_exons() const { 48 | return trans_id_to_exons_; 49 | } 50 | 51 | const std::map > >& get_trans_id_to_regions() const { 52 | return trans_id_to_regions_; 53 | } 54 | 55 | const std::map>& get_trans_id_to_genome_id() const { 56 | return trans_id_to_genome_id_; 57 | } 58 | 59 | const std::map& get_genome_id_to_len() const { 60 | return genome_id_to_len_; 61 | } 62 | 63 | private: 64 | std::string gtf_path_; 65 | 66 | // A map from genome (chromosome) name (e.g. 
header split to first space) to a vector containing all transcriptomes which can be generated from that chromosome. 67 | // Each pair is a (transcript_id, strand), where strand is either '+' or '-'; 68 | std::map>> genome_id_to_trans_id_; 69 | // Reverse map, to obtain the chromosome name when converting from transcriptome space back to genome space. 70 | // Second parameter of the pair is the orientation on the genome. 71 | std::map> trans_id_to_genome_id_; 72 | // A map from transcript_id to a vector containing pairs of coordinates. Each pair of coordinates presents one exon which makes the transcriptome. 73 | std::map>> trans_id_to_exons_; 74 | // A list of exons in such way that it combines overlapping exons into regions. 75 | std::map>> trans_id_to_regions_; 76 | // Length of each chromosome in genome space. Needed for reversing the mapping if transcriptome was reverse complemented. 77 | std::map genome_id_to_len_; 78 | 79 | Transcriptome(); // Private constructor, prevent memory leaks; 80 | Transcriptome(const Transcriptome&) = delete; 81 | const Transcriptome& operator=(const Transcriptome&) = delete; 82 | 83 | // Creates a transcriptome from a given reference sequence and a path to a file with gene annotations. 84 | // Parameters: 85 | // @param annotations_path Path to a GFF file (or another supported format) which contains the annotations of exonic regions. 86 | // @param references A SequenceFile object which contains reference sequences already loaded from disk. 87 | // @param transcripts A SequenceFile which will contain the generated transcriptomes. 88 | // @return 0 if everything went fine (C-style). 89 | int MakeTranscript_(const std::map>> &genome_id_to_trans_id, 90 | const std::map>> &trans_id_to_exons, 91 | const std::shared_ptr references, std::shared_ptr transcripts) const; 92 | /** Resolves lists of exons in such way that it combines overlapping exons into regions. 93 | * Returns dict that maps transcript id to list of regions. 
94 | * @param trans_id_to_exons A map from transcriptome ID (name) to a vector of exons which make this transcriptome. 95 | * @param trans_id_to_regions Generated return map from transcriptome ID (name) to a vector containing regions. 96 | * @return 0 if everything went fine (C-style). 97 | */ 98 | int MakeRegions_(const std::map>> &trans_id_to_exons, 99 | std::map>> &trans_id_to_regions) const; 100 | int ParseExons_(const std::string &annotations_path, 101 | std::map>> &genomeToTrans, 102 | std::map> &transIdToGenomeId, 103 | std::map>> &transToExons) const; 104 | void HashGenomeLengths_(const std::shared_ptr sequences, std::map &rlens) const; 105 | std::string trim_(std::string s) const; 106 | std::vector split_(std::string s, char c) const; 107 | std::string getSequenceName_(const SingleSequence &seq) const; 108 | std::string getTID_(const std::string &chr_name, const std::string &attributes) const; 109 | // void outputSeq_(char *header, size_t headerLen, const int8_t *seq, size_t seqLen) const; 110 | 111 | }; 112 | 113 | } /* namespace is */ 114 | 115 | #endif /* SRC_GRAPHMAP_TRANSCRIPTOME_H_ */ 116 | -------------------------------------------------------------------------------- /src/index/index_util.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * index_util.cc 3 | * 4 | * Created on: Feb 6, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #include "index_util.h" 9 | #include "utility/utility_general.h" 10 | 11 | namespace is { 12 | 13 | std::string GenerateSAMHeader(std::shared_ptr index, 14 | ProgramParameters& parameters) { 15 | // Output reference sequence information. 
16 | std::stringstream ss_header; 17 | 18 | ss_header << "@HD\t" << 19 | "VN:1.0\t" << 20 | "SO:unknown" << 21 | "\n"; 22 | 23 | for (int64_t rid=0; ridget_num_sequences_forward(); rid++) { 24 | std::string reference_header = TrimToFirstSpace(index->get_headers()[rid]); 25 | uint64_t rlen = (uint64_t) index->get_reference_lengths()[rid]; 26 | 27 | ss_header << "@SQ\t" << 28 | "SN:" << reference_header << "\t" << 29 | "LN:" << rlen << "" << 30 | "\n"; 31 | } 32 | 33 | // If verbose_sam_output == 1, then print out a special version of the PG line. This was used for the web server 34 | // to omit paths from the output (not to share server sensitive information with users). 35 | if (parameters.verbose_sam_output == 1) { 36 | ss_header << "@PG\tID:graphmap\tPN:graphmap"; 37 | } else { 38 | // Output the command line used to run the process to the file. 39 | ss_header << "@PG\t" << 40 | "ID:graphmap\t" << 41 | "PN:graphmap\t" << 42 | "CL:" << parameters.command_line << "\t" << 43 | "VN:" << std::string(GRAPHMAP_CURRENT_VERSION) << " compiled on " << std::string(GRAPHMAP_CURRENT_VERSION_RELEASE_DATE); 44 | } 45 | 46 | return ss_header.str(); 47 | } 48 | 49 | std::string GenerateSAMHeader(std::shared_ptr transcriptome) { 50 | return transcriptome->GenerateSAMHeaders(); 51 | } 52 | 53 | } /* namespace is */ 54 | -------------------------------------------------------------------------------- /src/index/index_util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * index_util.h 3 | * 4 | * Created on: Feb 6, 2017 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef SRC_INDEX_INDEX_UTIL_H_ 9 | #define SRC_INDEX_INDEX_UTIL_H_ 10 | 11 | #include "minimizer_index/minimizer_index.h" 12 | #include "graphmap/transcriptome.h" 13 | #include "../program_parameters.h" 14 | 15 | namespace is { 16 | 17 | std::string GenerateSAMHeader(std::shared_ptr index, ProgramParameters ¶meters); 18 | 19 | std::string GenerateSAMHeader(std::shared_ptr 
transcriptome); 20 | 21 | } /* namespace is */ 22 | 23 | #endif /* SRC_INDEX_INDEX_UTIL_H_ */ 24 | -------------------------------------------------------------------------------- /src/ksw2/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 Broad Institute, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /src/ksw2/kalloc.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "kalloc.h" 6 | 7 | /* The whole thing is: ("@" for the kheader_t of the block, "-" for free 8 | * memory, and "+" for allocated memory. One char for one unit.) 9 | * 10 | * This region is core 1. This region is core 2. 
/* Free blocks form a circular singly linked list. The first size_t of a block
 * (its "header") holds the block size in units of sizeof(size_t); for a free
 * block the second size_t holds the address of the next free block. PTR(p)
 * reads that link. */
#define PTR(p) ((size_t*)((size_t*)p)[1])

/* One node per "core" (large chunk obtained from malloc), so that km_destroy()
 * can release every core. The list always ends with an empty node. */
typedef struct _allocated_t {
	struct _allocated_t *next;
	size_t *ptr;
} allocated_t;

/* Allocator state.
 *   base            zero-sized sentinel block anchoring the free list
 *   loop_head       current entry point into the circular free list
 *   list_head/tail  list of cores obtained from malloc
 *   total_allocated bytes obtained from malloc so far */
typedef struct {
	size_t base[2], *loop_head;
	allocated_t list_head, *list_tail;
	size_t total_allocated;
} kmem_t;

/* Creates an empty allocator; returns NULL if the state cannot be allocated. */
void *km_init()
{
	return calloc(1, sizeof(kmem_t));
}

/* Prints the message to stderr and terminates the process. */
static void kerror(const char *s)
{
	fprintf(stderr, "%s\n", s);
	exit(1);
}

/* Prints free-list statistics to stderr. Does nothing for a NULL or not yet
 * used allocator. */
void km_stat(const void *_km)
{
	kmem_t *km = (kmem_t*)_km;
	unsigned n_blocks, n_units;
	size_t max_block = 0, *p, *q;
	float frag;

	if (km == 0 || !(p = km->loop_head)) return;
	n_blocks = n_units = 0;
	do {
		q = PTR(p);
		if (*p > max_block) max_block = *p;
		n_units += *p;
		if (p + (*p) > q && q > p)
			kerror("[kr_stat] The end of a free block enters another free block.");
		p = q;
		++n_blocks;
	} while (p != km->loop_head);

	--n_blocks; /* do not count the zero-sized sentinel km->base */
	/* With no free block left the average free-block size is reported as 0
	 * instead of dividing by zero. */
	frag = n_blocks? 1.0/1024.0 * n_units * sizeof(size_t) / n_blocks : 0.0;
	/* size_t values are printed with %zu; %lu is only correct where size_t
	 * happens to be unsigned long. */
	fprintf(stderr, "[kr_stat] tot=%zu, free=%zu, n_block=%u, max_block=%zu, frag_len=%.3fK\n",
			km->total_allocated, n_units * sizeof(size_t), n_blocks, max_block * sizeof(size_t), frag);
}

/* Returns a block to the allocator (or to free() when km is NULL). Freeing
 * NULL is a no-op. Adjacent free blocks are merged. */
void kfree(void *_km, void *ap)
{
	size_t *p, *q;
	kmem_t *km = (kmem_t*)_km;

	if (!ap) return;
	if (km == 0) {
		free(ap);
		return;
	}
	p = (size_t*)ap - 1; /* *p is the size of the current block */
	/* Find the free block q after which p has to be linked. The loop stops
	 * on one of two conditions:
	 *   a) "p>q && p<q->ptr": p lies between two consecutive free blocks
	 *      (which may live in different cores);
	 *   b) "q>=q->ptr && (p>q || p<q->ptr)": q is the free block with the
	 *      highest address (its link wraps around) and p lies beyond either
	 *      end of the address-ordered list.
	 */
	for (q = km->loop_head; !(p > q && p < PTR(q)); q = PTR(q))
		if (q >= PTR(q) && (p > q || p < PTR(q))) break;
	if (p + (*p) == PTR(q)) { /* two adjacent blocks, merge p and q->ptr */
		*p += *PTR(q); /* this is the new q->ptr size */
		p[1] = (size_t)PTR(PTR(q)); /* this is the new q->ptr->ptr */
		/* p is actually the new q->ptr. The actual change happens a few lines below. */
	} else if (p + (*p) > PTR(q) && PTR(q) >= p) { /* the end of the allocated block is in the next free block */
		kerror("[kfree] The end of the allocated block enters a free block.");
	} else p[1] = (size_t)PTR(q); /* backup q->ptr */

	if (q + (*q) == p) { /* two adjacent blocks, merge q and p */
		*q += *p;
		q[1] = (size_t)PTR(p);
		km->loop_head = q;
	} else if (q + (*q) > p && p >= q) { /* the end of a free block in the allocated block */
		kerror("[kfree] The end of a free block enters the allocated block.");
	} else km->loop_head = p, q[1] = (size_t)p; /* in two cores, cannot be merged */
}

/* Obtains a new core of at least nu units from malloc (rounded up to a
 * multiple of 0x100000 units), records it for km_destroy() and hands it to
 * the free list. Terminates the process when memory is not available. */
static size_t *morecore(kmem_t *km, size_t nu)
{
	size_t rnu, *up;
	allocated_t *node;

	rnu = (nu + 0xfffff) & (~(size_t)0xfffff);
	/* Reject requests whose rounding or byte count would wrap around size_t;
	 * otherwise malloc() would be handed a far too small size. */
	if (rnu < nu || rnu > (size_t)-1 / sizeof(size_t)) {
		km_stat(km);
		kerror("[morecore] The requested size overflows size_t.");
	}
	up = (size_t*)malloc(rnu * sizeof(size_t));
	if (!up) { /* fail to allocate memory */
		km_stat(km);
		fprintf(stderr, "[morecore] %zu bytes requested but not available.\n", rnu * sizeof(size_t));
		exit(1);
	}
	/* The list of cores always ends with an empty node. Allocate the next
	 * empty node before linking, so that a failure cannot leave the list
	 * without its terminator (which would lose track of cores). */
	node = (allocated_t*)calloc(1, sizeof(allocated_t));
	if (!node) {
		free(up);
		kerror("[morecore] Failed to allocate a bookkeeping node.");
	}
	/* put the pointer in km->list_head */
	if (km->list_tail == 0) km->list_tail = &km->list_head;
	km->list_tail->ptr = up;
	km->list_tail->next = node;
	km->list_tail = node;

	km->total_allocated += rnu * sizeof(size_t);
	*up = rnu; /* the size of the current block, and in this case the block is the same as the new core */
	kfree(km, up + 1); /* initialize the new "core" */
	return km->loop_head;
}

/* Releases every core and the allocator itself. NULL is accepted. */
void km_destroy(void *_km)
{
	kmem_t *km = (kmem_t*)_km;
	allocated_t *p, *q;
	if (km == 0) return;
	p = &km->list_head;
	do {
		q = p->next;
		free(p->ptr);
		if (p != &km->list_head) free(p); /* list_head is embedded in *km */
		p = q;
	} while (p && p->next);
	if (p != &km->list_head) free(p); /* the trailing empty node (or NULL) */
	free(km);
}

/* Allocates n_bytes from km (or from malloc() when km is NULL). Returns NULL
 * for a zero-sized request. */
void *kmalloc(void *_km, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units, *p, *q;

	if (n_bytes == 0) return 0;
	if (km == 0) return malloc(n_bytes);
	/* "n_units" means the number of units. The size of one unit equals to sizeof(kheader_t).
	 * "1" is the kheader_t of a block, which is always required. */
	n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
	if (n_units&1) ++n_units; /* make n_units an even number, or it will segfault if only one unit remains */

	if (!(q = km->loop_head)) { /* the first time when kmalloc() is called, intialization */
		km->base[1] = (size_t)(km->loop_head = q = km->base); *q = 0;
	}
	for (p = PTR(q);; q = p, p = PTR(p)) { /* search for a suitable block */
		if (*p >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */
			if (*p == n_units) q[1] = (size_t)PTR(p); /* no need to split the block */
			else { /* split the block */
				/* memory is allocated at the end of the block */
				*p -= n_units; /* reduce the size of the free block */
				p += *p; /* skip to the kheader_t of the allocated block */
				*p = n_units; /* set the size */
			}
			km->loop_head = q; /* set the end of chain */
			return p + 1; /* skip the kheader_t */
		}
		if (p == km->loop_head) { /* then ask for more "cores" */
			if ((p = morecore(km, n_units)) == 0) return 0;
		}
	}
}

/* Resizes a block. n_bytes == 0 frees the block and returns NULL; a NULL ap
 * behaves like kmalloc(). A block that is already large enough is returned
 * unchanged (blocks are never shrunk). */
void *krealloc(void *_km, void *ap, size_t n_bytes)
{
	kmem_t *km = (kmem_t*)_km;
	size_t n_units, *p, *q;

	if (n_bytes == 0) {
		kfree(km, ap); return 0;
	}
	if (km == 0) return realloc(ap, n_bytes);
	if (!ap) return kmalloc(km, n_bytes);
	n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
	p = (size_t*)ap - 1;
	if (*p >= n_units) return ap; /* TODO: this prevents shrinking */
	q = (size_t*)kmalloc(km, n_bytes);
	memcpy(q, ap, (*p - 1) * sizeof(size_t)); /* payload of the old block, header excluded */
	kfree(km, ap);
	return q;
}

/* Allocates a zero-filled array of count elements of size bytes each. Returns
 * NULL when either argument is 0 or when count * size does not fit in size_t. */
void *kcalloc(void *_km, size_t count, size_t size)
{
	kmem_t *km = (kmem_t*)_km;
	void *p;
	if (size == 0 || count == 0) return 0;
	/* A wrapped product would yield an undersized block and a memset() over
	 * memory the allocator never handed out. */
	if (count > (size_t)-1 / size) return 0;
	if (km == 0) return calloc(count, size);
	p = kmalloc(km, count * size);
	memset(p, 0, count * size);
	return p;
}
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' ' 37 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 38 | #define KS_SEP_MAX 2 39 | 40 | #define __KS_TYPE(type_t) \ 41 | typedef struct __kstream_t { \ 42 | int begin, end; \ 43 | int is_eof:2, bufsize:30; \ 44 | type_t f; \ 45 | unsigned char *buf; \ 46 | } kstream_t; 47 | 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 50 | 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 52 | SCOPE kstream_t *ks_init(type_t f) \ 53 | { \ 54 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ 55 | ks->f = f; ks->bufsize = __bufsize; \ 56 | ks->buf = (unsigned char*)malloc(__bufsize); \ 57 | return ks; \ 58 | } \ 59 | SCOPE void ks_destroy(kstream_t *ks) \ 60 | { \ 61 | if (!ks) return; \ 62 | free(ks->buf); \ 63 | free(ks); \ 64 | } 65 | 66 | #define __KS_INLINED(__read) \ 67 | static inline int ks_getc(kstream_t *ks) \ 68 | { \ 69 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 70 | if (ks->begin >= ks->end) { \ 71 | ks->begin = 0; \ 72 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 73 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 74 | if (ks->end == 0) return -1; \ 75 | } \ 76 | return 
(int)ks->buf[ks->begin++]; \ 77 | } \ 78 | static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 79 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 80 | 81 | #ifndef KSTRING_T 82 | #define KSTRING_T kstring_t 83 | typedef struct __kstring_t { 84 | unsigned l, m; 85 | char *s; 86 | } kstring_t; 87 | #endif 88 | 89 | #ifndef kroundup32 90 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 91 | #endif 92 | 93 | #define __KS_GETUNTIL(SCOPE, __read) \ 94 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 95 | { \ 96 | if (dret) *dret = 0; \ 97 | str->l = append? str->l : 0; \ 98 | if (ks->begin >= ks->end && ks->is_eof) return -1; \ 99 | for (;;) { \ 100 | int i; \ 101 | if (ks->begin >= ks->end) { \ 102 | if (!ks->is_eof) { \ 103 | ks->begin = 0; \ 104 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 105 | if (ks->end < ks->bufsize) ks->is_eof = 1; \ 106 | if (ks->end == 0) break; \ 107 | } else break; \ 108 | } \ 109 | if (delimiter == KS_SEP_LINE) { \ 110 | for (i = ks->begin; i < ks->end; ++i) \ 111 | if (ks->buf[i] == '\n') break; \ 112 | } else if (delimiter > KS_SEP_MAX) { \ 113 | for (i = ks->begin; i < ks->end; ++i) \ 114 | if (ks->buf[i] == delimiter) break; \ 115 | } else if (delimiter == KS_SEP_SPACE) { \ 116 | for (i = ks->begin; i < ks->end; ++i) \ 117 | if (isspace(ks->buf[i])) break; \ 118 | } else if (delimiter == KS_SEP_TAB) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 121 | } else i = 0; /* never come to here! 
*/ \ 122 | if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ 123 | str->m = str->l + (i - ks->begin) + 1; \ 124 | kroundup32(str->m); \ 125 | str->s = (char*)realloc(str->s, str->m); \ 126 | } \ 127 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 128 | str->l = str->l + (i - ks->begin); \ 129 | ks->begin = i + 1; \ 130 | if (i < ks->end) { \ 131 | if (dret) *dret = ks->buf[i]; \ 132 | break; \ 133 | } \ 134 | } \ 135 | if (str->s == 0) { \ 136 | str->m = 1; \ 137 | str->s = (char*)calloc(1, 1); \ 138 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 139 | str->s[str->l] = '\0'; \ 140 | return str->l; \ 141 | } 142 | 143 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 144 | __KS_TYPE(type_t) \ 145 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 146 | __KS_GETUNTIL(SCOPE, __read) \ 147 | __KS_INLINED(__read) 148 | 149 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 150 | 151 | #define KSTREAM_DECLARE(type_t, __read) \ 152 | __KS_TYPE(type_t) \ 153 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 154 | extern kstream_t *ks_init(type_t f); \ 155 | extern void ks_destroy(kstream_t *ks); \ 156 | __KS_INLINED(__read) 157 | 158 | /****************** 159 | * FASTA/Q parser * 160 | ******************/ 161 | 162 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 163 | 164 | #define __KSEQ_BASIC(SCOPE, type_t) \ 165 | SCOPE kseq_t *kseq_init(type_t fd) \ 166 | { \ 167 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 168 | s->f = ks_init(fd); \ 169 | return s; \ 170 | } \ 171 | SCOPE void kseq_destroy(kseq_t *ks) \ 172 | { \ 173 | if (!ks) return; \ 174 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 175 | ks_destroy(ks->f); \ 176 | free(ks); \ 177 | } 178 | 179 | /* Return value: 180 | >=0 length of the sequence (normal) 181 | -1 
end-of-file 182 | -2 truncated quality string 183 | */ 184 | #define __KSEQ_READ(SCOPE) \ 185 | SCOPE int kseq_read(kseq_t *seq) \ 186 | { \ 187 | int c; \ 188 | kstream_t *ks = seq->f; \ 189 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 190 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ 191 | if (c == -1) return -1; /* end of file */ \ 192 | seq->last_char = c; \ 193 | } /* else: the first header char has been read in the previous call */ \ 194 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 195 | if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ 196 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 197 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 198 | seq->seq.m = 256; \ 199 | seq->seq.s = (char*)malloc(seq->seq.m); \ 200 | } \ 201 | while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ 202 | if (c == '\n') continue; /* skip empty lines */ \ 203 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 204 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 205 | } \ 206 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 207 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 208 | seq->seq.m = seq->seq.l + 2; \ 209 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 210 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 211 | } \ 212 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 213 | if (c != '+') return seq->seq.l; /* FASTA */ \ 214 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ 215 | seq->qual.m = seq->seq.m; \ 216 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 217 | } \ 218 | while ((c = ks_getc(ks)) != -1 && c != '\n'); /* 
skip the rest of '+' line */ \ 219 | if (c == -1) return -2; /* error: no quality string */ \ 220 | while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ 221 | seq->last_char = 0; /* we have not come to the next header line */ \ 222 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 223 | return seq->seq.l; \ 224 | } 225 | 226 | #define __KSEQ_TYPE(type_t) \ 227 | typedef struct { \ 228 | kstring_t name, comment, seq, qual; \ 229 | int last_char; \ 230 | kstream_t *f; \ 231 | } kseq_t; 232 | 233 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 234 | KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ 235 | __KSEQ_TYPE(type_t) \ 236 | __KSEQ_BASIC(SCOPE, type_t) \ 237 | __KSEQ_READ(SCOPE) 238 | 239 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 240 | 241 | #define KSEQ_DECLARE(type_t) \ 242 | __KS_TYPE(type_t) \ 243 | __KSEQ_TYPE(type_t) \ 244 | extern kseq_t *kseq_init(type_t fd); \ 245 | void kseq_destroy(kseq_t *ks); \ 246 | int kseq_read(kseq_t *seq); 247 | 248 | #endif 249 | -------------------------------------------------------------------------------- /src/ksw2/ksw2.h: -------------------------------------------------------------------------------- 1 | #ifndef KSW2_H_ 2 | #define KSW2_H_ 3 | 4 | #include 5 | 6 | #define KSW_NEG_INF -0x40000000 7 | 8 | #define KSW_EZ_SCORE_ONLY 0x01 // don't record alignment path/cigar 9 | #define KSW_EZ_RIGHT 0x02 // right-align gaps 10 | #define KSW_EZ_GENERIC_SC 0x04 // without this flag: match/mismatch only; last symbol is a wildcard 11 | #define KSW_EZ_APPROX_MAX 0x08 // approximate max; this is faster with sse 12 | #define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse 13 | #define KSW_EZ_EXTZ_ONLY 0x40 // only perform extension 14 | #define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output 15 | #define KSW_EZ_SPLICE_FOR 0x100 16 | #define KSW_EZ_SPLICE_REV 0x200 17 | 18 | #ifdef __cplusplus 19 | extern
"C" { 20 | #endif 21 | 22 | typedef struct { 23 | uint32_t max:31, zdropped:1; 24 | int max_q, max_t; // max extension coordinate 25 | int mqe, mqe_t; // max score when reaching the end of query 26 | int mte, mte_q; // max score when reaching the end of target 27 | int score; // max score reaching both ends; may be KSW_NEG_INF 28 | int m_cigar, n_cigar; 29 | uint32_t *cigar; 30 | } ksw_extz_t; 31 | 32 | /** 33 | * NW-like extension 34 | * 35 | * @param km memory pool, when used with kalloc 36 | * @param qlen query length 37 | * @param query query sequence with 0 <= query[i] < m 38 | * @param tlen target length 39 | * @param target target sequence with 0 <= target[i] < m 40 | * @param m number of residue types 41 | * @param mat m*m scoring matrix in a one-dimensional array 42 | * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)" 43 | * @param gape gap extension penalty 44 | * @param w band width (<0 to disable) 45 | * @param zdrop off-diagonal drop-off to stop extension (positive; <0 to disable) 46 | * @param flag flag (see KSW_EZ_* macros) 47 | * @param ez (out) scores and cigar 48 | */ 49 | void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); 50 | void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); 51 | 52 | void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, 53 | int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); 54 | 55 | void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, 56 | int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); 57 | 58 | void
ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, 59 | int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); 60 | 61 | void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); 62 | 63 | /** 64 | * Global alignment 65 | * 66 | * (first 10 parameters identical to ksw_extz()) 67 | * @param m_cigar (modified) max CIGAR length; feed 0 if cigar==0 68 | * @param n_cigar (out) number of CIGAR elements 69 | * @param cigar (out) BAM-encoded CIGAR; the caller needs to deallocate it with kfree(km, cigar) 70 | * 71 | * @return score of the alignment 72 | */ 73 | int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); 74 | int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); 75 | int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); 76 | 77 | void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat); 78 | int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te); 79 | 80 | #ifdef __cplusplus 81 | } 82 | #endif 83 | 84 | /************************************ 85 | *** Private macros and functions *** 86 | ************************************/ 87 | 88 | #ifdef HAVE_KALLOC 89 | #include "kalloc.h" 90 | #else 91 | #include 92 | #define kmalloc(km, size) malloc((size)) 93 | #define kcalloc(km, count, size) calloc((count), (size)) 94 | #define krealloc(km, ptr, size) realloc((ptr),
(size)) 95 | #define kfree(km, ptr) free((ptr)) 96 | #endif 97 | 98 | /* Append an operation of type op and length len to the CIGAR array: if op equals the op of the last element, len is added to that element; otherwise a new element (len<<4 | op) is appended, growing the array via krealloc() (capacity 4, then doubling) when it is full. Returns the possibly reallocated array; *n_cigar and *m_cigar are updated in place. */ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len) 99 | { 100 | if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { 101 | if (*n_cigar == *m_cigar) { 102 | *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; 103 | cigar = (uint32_t*)krealloc(km, cigar, (*m_cigar) << 2); 104 | } 105 | cigar[(*n_cigar)++] = len<<4 | op; 106 | } else cigar[(*n_cigar)-1] += len<<4; 107 | return cigar; 108 | } 109 | 110 | // In the backtrack matrix, value p[] has the following structure: 111 | // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F} 112 | // bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E}) 113 | // bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F}) 114 | static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, 115 | int *m_cigar_, int *n_cigar_, uint32_t **cigar_) 116 | { // p[] - lower 3 bits: which type gets the max; higher bits: continuation flags (see the layout described above) 117 | int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; 118 | uint32_t *cigar = *cigar_, tmp; 119 | while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check 120 | int force_state = -1; 121 | if (is_rot) { 122 | r = i + j; 123 | if (i < off[r]) force_state = 2; 124 | if (off_end && i > off_end[r]) force_state = 1; 125 | tmp = force_state < 0? p[r * n_col + i - off[r]] : 0; 126 | } else { 127 | if (j < off[i]) force_state = 2; 128 | if (off_end && j > off_end[i]) force_state = 1; 129 | tmp = force_state < 0? p[i * n_col + j - off[i]] : 0; 130 | } 131 | if (state == 0) state = tmp & 7; // if requesting the H state, find the state that maximizes it.
132 | else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H 133 | if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure 134 | if (force_state >= 0) state = force_state; 135 | if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match 136 | else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion 137 | else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron 138 | else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion 139 | } 140 | if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion 141 | if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion 142 | if (!is_rev) 143 | for (i = 0; i < n_cigar>>1; ++i) // the CIGAR was built walking back from (i0, j0); reverse it in place unless is_rev is set 144 | tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; 145 | *m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; 146 | } 147 | 148 | /* Reset ez before an alignment: coordinates to -1, max to 0, score/mqe/mte to KSW_NEG_INF, n_cigar and zdropped to 0. m_cigar and cigar are left untouched. */ static inline void ksw_reset_extz(ksw_extz_t *ez) 149 | { 150 | ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1; 151 | ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF; 152 | ez->n_cigar = 0, ez->zdropped = 0; 153 | } 154 | 155 | /* Update the running maximum in ez with score H at the cell described by (a, b) - their interpretation depends on is_rot - and test the Z-drop condition. Returns 1 and sets ez->zdropped when zdrop >= 0 and the drop from ez->max exceeds zdrop + l * e, where l is the difference between the target and query offsets from the maximum; returns 0 otherwise. */ static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e) 156 | { 157 | int r, t; 158 | if (is_rot) r = a, t = b; 159 | else r = a + b, t = a; 160 | if (H > (int32_t)ez->max) { 161 | ez->max = H, ez->max_t = t, ez->max_q = r - t; 162 | } else if (t >= ez->max_t && r - t >= ez->max_q) { 163 | int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l; 164 | l = tl > ql?
tl - ql : ql - tl; 165 | if (zdrop >= 0 && ez->max - H > zdrop + l * e) { 166 | ez->zdropped = 1; 167 | return 1; 168 | } 169 | } 170 | return 0; 171 | } 172 | 173 | #endif 174 | -------------------------------------------------------------------------------- /src/ksw2/ksw2_ll_sse.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "ksw2.h" 6 | 7 | #ifdef __GNUC__ 8 | #define LIKELY(x) __builtin_expect((x),1) 9 | #define UNLIKELY(x) __builtin_expect((x),0) 10 | #else 11 | #define LIKELY(x) (x) 12 | #define UNLIKELY(x) (x) 13 | #endif 14 | 15 | typedef struct { 16 | int qlen, slen; 17 | uint8_t shift, mdiff, max, size; 18 | __m128i *qp, *H0, *H1, *E, *Hmax; 19 | } kswq_t; 20 | 21 | /** 22 | * Initialize the query data structure (km is the memory pool handle passed to kmalloc()) 23 | * 24 | * @param size Number of bytes used to store a score; valid values are 1 or 2 (any value > 1 is treated as 2) 25 | * @param qlen Length of the query sequence 26 | * @param query Query sequence 27 | * @param m Size of the alphabet 28 | * @param mat Scoring matrix in a one-dimensional array 29 | * 30 | * @return Query data structure 31 | */ 32 | void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) 33 | { 34 | kswq_t *q; 35 | int slen, a, tmp, p; 36 | 37 | size = size > 1?
2 : 1; 38 | p = 8 * (3 - size); // # values per __m128i 39 | slen = (qlen + p - 1) / p; // segmented length 40 | q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory 41 | q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory 42 | q->H0 = q->qp + slen * m; 43 | q->H1 = q->H0 + slen; 44 | q->E = q->H1 + slen; 45 | q->Hmax = q->E + slen; 46 | q->slen = slen; q->qlen = qlen; q->size = size; 47 | // compute shift 48 | tmp = m * m; 49 | for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score 50 | if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; 51 | if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; 52 | } 53 | q->max = q->mdiff; 54 | q->shift = 256 - q->shift; // NB: q->shift is uint8_t 55 | q->mdiff += q->shift; // this is the difference between the min and max scores 56 | // An example: p=8, qlen=19, slen=3 and segmentation: 57 | // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} 58 | if (size == 1) { 59 | int8_t *t = (int8_t*)q->qp; 60 | for (a = 0; a < m; ++a) { 61 | int i, k, nlen = slen * p; 62 | const int8_t *ma = mat + a * m; 63 | for (i = 0; i < slen; ++i) 64 | for (k = i; k < nlen; k += slen) // p iterations 65 | *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; 66 | } 67 | } else { 68 | int16_t *t = (int16_t*)q->qp; 69 | for (a = 0; a < m; ++a) { 70 | int i, k, nlen = slen * p; 71 | const int8_t *ma = mat + a * m; 72 | for (i = 0; i < slen; ++i) 73 | for (k = i; k < nlen; k += slen) // p iterations 74 | *t++ = (k >= qlen? 
0 : ma[query[k]]); 75 | } 76 | } 77 | return q; 78 | } 79 | 80 | int ksw_ll_i16(void *q_, int tlen, const uint8_t *target, int _gapo, int _gape, int *qe, int *te) 81 | { 82 | kswq_t *q = (kswq_t*)q_; 83 | int slen, i, gmax = 0, qlen8; 84 | __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; 85 | uint16_t *H8; 86 | 87 | #define __max_8(ret, xx) do { \ 88 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ 89 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ 90 | (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ 91 | (ret) = _mm_extract_epi16((xx), 0); \ 92 | } while (0) 93 | 94 | // initialization 95 | *qe = *te = -1; 96 | zero = _mm_set1_epi32(0); 97 | gapoe = _mm_set1_epi16(_gapo + _gape); 98 | gape = _mm_set1_epi16(_gape); 99 | H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; 100 | slen = q->slen, qlen8 = slen * 8; 101 | memset(E, 0, slen * sizeof(__m128i)); 102 | memset(H0, 0, slen * sizeof(__m128i)); 103 | memset(Hmax, 0, slen * sizeof(__m128i)); 104 | // the core loop 105 | for (i = 0; i < tlen; ++i) { 106 | int j, k, imax; 107 | __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector 108 | h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example 109 | h = _mm_slli_si128(h, 2); 110 | for (j = 0; LIKELY(j < slen); ++j) { 111 | h = _mm_adds_epi16(h, *S++); 112 | e = _mm_load_si128(E + j); 113 | h = _mm_max_epi16(h, e); 114 | h = _mm_max_epi16(h, f); 115 | max = _mm_max_epi16(max, h); 116 | _mm_store_si128(H1 + j, h); 117 | h = _mm_subs_epu16(h, gapoe); 118 | e = _mm_subs_epu16(e, gape); 119 | e = _mm_max_epi16(e, h); 120 | _mm_store_si128(E + j, e); 121 | f = _mm_subs_epu16(f, gape); 122 | f = _mm_max_epi16(f, h); 123 | h = _mm_load_si128(H0 + j); 124 | } 125 | for (k = 0; LIKELY(k < 16); ++k) { 126 | f = _mm_slli_si128(f, 2); 127 | for (j = 0; LIKELY(j < slen); ++j) { 128 | h = _mm_load_si128(H1 + j); 129 | h = _mm_max_epi16(h, f); 130 | _mm_store_si128(H1 + j, h); 131 | h = 
_mm_subs_epu16(h, gapoe); 132 | f = _mm_subs_epu16(f, gape); 133 | if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16; 134 | } 135 | } 136 | end_loop_i16: 137 | __max_8(imax, max); 138 | if (imax >= gmax) { 139 | gmax = imax; *te = i; 140 | memcpy(Hmax, H1, slen * sizeof(__m128i)); 141 | } 142 | S = H1; H1 = H0; H0 = S; 143 | } 144 | for (i = 0, H8 = (uint16_t*)Hmax; i < qlen8; ++i) 145 | if ((int)H8[i] == gmax) *qe = i / 8 + i % 8 * slen; 146 | return gmax; 147 | } 148 | -------------------------------------------------------------------------------- /src/main.cc: -------------------------------------------------------------------------------- 1 | //============================================================================ 2 | // Name : graphmap.cpp 3 | // Author : Ivan Sovic 4 | // Version : 5 | // Copyright : Copyright Ivan Sovic, 2014. All rights reserved. 6 | // Description : Hello World in C++, Ansi-style 7 | //============================================================================ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "sequences/sequence_file.h" 15 | #include "sequences/single_sequence.h" 16 | #include "log_system/log_system.h" 17 | #include "graphmap/graphmap.h" 18 | 19 | #include "program_parameters.h" 20 | #include "utility/utility_general.h" 21 | 22 | #include "owler/owler.h" 23 | #include "argparser.h" 24 | 25 | int main(int argc, char *argv[]) { 26 | std::string program_name(argv[0]); 27 | std::string subprogram(""); 28 | 29 | ArgumentParser argparser; 30 | argparser.AddArgument(&subprogram, VALUE_TYPE_STRING, "", "tool", "", "Specifies the tool to run:\n align - the entire GraphMap pipeline.\n owler - Overlapping With Long Erroneous Reads.", -1, ""); 31 | argparser.set_program_name(program_name); 32 | 33 | if (argc == 1) { 34 | fprintf (stderr, "%s", argparser.VerboseUsage().c_str()); 35 | fprintf (stderr, "\n"); 36 | fprintf (stderr, "%s\n", LICENCE_INFORMATION); 37 | 
fprintf (stderr, "Version: %s\n", std::string(GRAPHMAP_CURRENT_VERSION).c_str()); 38 | fprintf (stderr, "Build date: %s\n", std::string(GRAPHMAP_CURRENT_VERSION_RELEASE_DATE).c_str()); 39 | fprintf (stderr, "\n"); 40 | exit(1); 41 | } 42 | 43 | // The ArgumentParser's function for processing arguments is never explicitly called, because it's overly complicated for this purpose. 44 | // Instead, we just take the value of argv[1] and that's it. ArgumentParser is used only for neat formatting of the usage. 45 | subprogram = std::string(argv[1]); 46 | 47 | // Remove the 'tool' param to format the command line so it can be seamlessly processed in the next step. 48 | std::vector argv2; 49 | argv2.push_back(argv[0]); 50 | for (int32_t i=2; i 1) { 63 | LogSystem::GetInstance().LOG_VERBOSE_TYPE = LOG_VERBOSE_FULL | LOG_VERBOSE_STD; 64 | } 65 | fflush(stdout); 66 | 67 | GraphMap graphmap; 68 | graphmap.Run(program_parameters); 69 | 70 | } else if (subprogram == "owler") { 71 | if (ProcessArgsOwler(argc2, &argv2[0], &program_parameters)) 72 | return 1; 73 | 74 | if (program_parameters.verbose_level == 1) { 75 | LogSystem::GetInstance().LOG_VERBOSE_TYPE = LOG_VERBOSE_STD; 76 | } else if (program_parameters.verbose_level > 1) { 77 | LogSystem::GetInstance().LOG_VERBOSE_TYPE = LOG_VERBOSE_FULL | LOG_VERBOSE_STD; 78 | } 79 | fflush(stdout); 80 | 81 | Owler owler; 82 | owler.Run(program_parameters); 83 | 84 | } else { 85 | fprintf (stderr, "ERROR: Unknown value of 'tool' parameter.
Exiting.\n\n"); 86 | fprintf (stderr, "%s\n", argparser.VerboseUsage().c_str()); 87 | exit(1); 88 | 89 | } 90 | 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /src/owler/owler.h: -------------------------------------------------------------------------------- 1 | /* 2 | * owler.h 3 | * 4 | * Created on: Jul 2, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef OWLER_H_ 9 | #define OWLER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "sequences/single_sequence.h" 22 | #include "sequences/sequence_file.h" 23 | #include "containers/score_registry.h" 24 | #include "program_parameters.h" 25 | #include "utility/utility_general.h" 26 | #include "alignment/cigargen.h" 27 | #include "containers/region.h" 28 | #include "containers/mapping_data.h" 29 | #include "utility/evalue.h" 30 | #include "containers/vertices.h" 31 | 32 | #include "owler/owler_data.h" 33 | 34 | #include "minimizer_index/minimizer_index.h" 35 | 36 | #include "utility/tictoc.h" 37 | 38 | 39 | 40 | class Owler { 41 | public: 42 | Owler(); 43 | ~Owler(); 44 | 45 | // Main function for running the mapping process. It generates/loads the index, and handles batch loading of sequences from the reads file. 46 | void Run(ProgramParameters &parameters); 47 | 48 | private: 49 | std::shared_ptr ref_; 50 | std::shared_ptr reads_; 51 | std::shared_ptr index_; 52 | clock_t run_begin_time_; 53 | clock_t run_end_time_; 54 | 55 | // Opens the output SAM file for writing if the path is specified. If the path is empty, then output is set to STDOUT. 56 | FILE* OpenOutFile_(std::string out_sam_path=""); 57 | 58 | // Generates or loads the index of the reference genome. 59 | int BuildIndex_(ProgramParameters &parameters); 60 | 61 | // Process the loaded batch of reads. Uses OpenMP to do it in parallel. Calls ProcessOneRead for each read in the SequenceFile.
62 | int ProcessSequenceFileInParallel_(ProgramParameters &parameters, std::shared_ptr reads, TicToc &tt_all, FILE *fp_out); 63 | 64 | int ProcessRead_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data); 65 | 66 | int CollectHits_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data); 67 | 68 | // int ClusterHits_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, int32_t diag_epsilon, OwlerData &owler_data); 69 | int ClusterHits2_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, int32_t diag_epsilon, OwlerData &owler_data); 70 | 71 | void GenerateOutput_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data); 72 | 73 | 74 | void AppendSeedHits_(const uint128_t& seed, std::shared_ptr index, bool threshold_hits, double count_cutoff, bool is_overlapper, int64_t qid, std::vector &all_hits); 75 | 76 | int WrapLCSk_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, const std::vector &hits, int64_t begin_hit, int64_t end_hit, int32_t seed_len, PairwiseOverlap &overlap); 77 | 78 | void LCSk_(std::vector &events, int64_t n, int64_t k, std::vector &matches_starts, std::vector &matches_indices, std::vector &lcsk_indices, int64_t &lcsk_len); 79 | 80 | void FilterColinear_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, 81 | const std::vector &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len, const std::vector &raw_lcsk_indices, 82 | std::vector &lcsk_indices, std::vector *cluster_ids, int32_t &num_sv); 83 | 84 | int PrepareEvents_(const std::vector &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len, 85 | std::vector &events, std::vector &matches_starts, std::vector &matches_indices, int64_t &max_seq_len); 86 | 87 | int
CalcCoveredBases_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, 88 | const std::vector &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len, PairwiseOverlap &overlap); 89 | 90 | bool CheckOverlap_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, PairwiseOverlap& overlap); 91 | 92 | 93 | 94 | std::string GenerateMHAPLine_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap); 95 | std::string GeneratePAFLine_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap); 96 | 97 | std::string GenerateDebugInfo_(std::shared_ptr index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap); 98 | int64_t CalcEditDist_(std::shared_ptr index, const SingleSequence *read, const PairwiseOverlap& overlap); 99 | double CalcRatio_(const PairwiseOverlap& overlap); 100 | 101 | int FilterAnchorBreakpoints_(const std::vector &lcskpp_indices, int64_t ref_hits_start, int64_t ref_hits_end, int64_t seed_length, 102 | int64_t min_cluster_length, float min_cluster_coverage, const std::vector &hits, 103 | const ProgramParameters* parameters, std::vector &ret_filtered_lcskpp_indices, 104 | std::vector *ret_cluster_ids); 105 | bool CheckDistanceTooBig_(const std::vector &hits, int64_t index_last, int64_t index_current, float error_rate); 106 | 107 | void WriteHits_(std::string out_path, const std::vector &hits, int64_t hits_start, int64_t hits_end, 108 | int64_t ref_id, std::string read_header, int64_t read_length, 109 | std::string reference_header, int64_t reference_length, 110 | const std::vector *indices_to_output, const std::vector *cluster_ids); 111 | 112 | static inline uint128_t MakeHit_(const uint128_t& seq_id, const uint128_t& diag, const uint128_t& pos_ref, const uint128_t& pos_read) { 113 | return ((seq_id << 96) | (diag << 64) 
| (pos_ref << 32) | (pos_read)); 114 | } 115 | /* Accessors that unpack one field of a 128-bit hit: read position (masked, no shift), reference position (>> 32), diagonal (>> 64) and sequence id (>> 96); each result is truncated to int32_t. */ 116 | static inline int32_t HitPosRead_(const uint128_t& hit) { 117 | return (int32_t) (hit & kSeedMask32_1); 118 | } 119 | 120 | static inline int32_t HitPosRef_(const uint128_t& hit) { 121 | return (int32_t) ((hit & kSeedMask32_2) >> 32); 122 | } 123 | 124 | static inline int32_t HitDiag_(const uint128_t& hit) { 125 | return (int32_t) ((hit & kSeedMask32_3) >> 64); 126 | } 127 | 128 | static inline int32_t HitSeqId_(const uint128_t& hit) { 129 | return (int32_t) ((hit & kSeedMask32_4) >> 96); 130 | } 131 | }; 132 | 133 | #endif /* OWLER_H_ */ 134 | -------------------------------------------------------------------------------- /src/owler/owler_data.h: -------------------------------------------------------------------------------- 1 | /* 2 | * owler_data.h 3 | * 4 | * Created on: Jul 2, 2015 5 | * Author: isovic 6 | */ 7 | 8 | #ifndef OWLER_DATA_H_ 9 | #define OWLER_DATA_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "minimizer_index/minimizer_index.h" 19 | #include "containers/range.h" 20 | /* Record describing one overlap between a query (qid) and a target (tid). The constructor stores the three ids and zero-initializes every counter; the ranges, index vectors and reject_reason are left default-constructed. */ 21 | class PairwiseOverlap { 22 | public: 23 | // PairwiseOverlap() : qid(0), tid(0), num_seeds(0), cov_bases(0), num_sv(0) { } 24 | PairwiseOverlap(int64_t _qid, int64_t _tid, int64_t _tid_fwd) : qid(_qid), tid(_tid), tid_fwd(_tid_fwd), num_seeds(0), num_hits(0), cov_bases_query(0), cov_bases_target(0), num_sv(0), lcsk_len(0) { } 25 | 26 | Range query, target; 27 | int64_t qid, tid, tid_fwd; 28 | int64_t num_seeds; // Number of seed hits which survived all LCSk filters. 29 | int64_t num_hits; // Number of raw seed hits, without any sort of LCSk filtering. 
30 | int64_t cov_bases_query; 31 | int64_t cov_bases_target; 32 | int32_t num_sv; 33 | 34 | std::vector lcsk_indices; 35 | std::vector cluster_ids; 36 | int64_t lcsk_len; 37 | 38 | std::string reject_reason; 39 | }; 40 | /* Container bundling the hit list, the overlap list and two result strings (unmapped_reason, overlap_lines). */ 41 | class OwlerData { 42 | public: 43 | OwlerData() { }; 44 | ~OwlerData() { }; 45 | 46 | std::vector hits; 47 | std::vector overlaps; 48 | std::string unmapped_reason; 49 | std::string overlap_lines; 50 | // std::vector out_lines; 51 | }; 52 | 53 | #endif /* OWLER_DATA_H_ */ 54 | -------------------------------------------------------------------------------- /src/program_parameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * program_parameters.h 3 | * 4 | * Created on: Jul 24, 2014 5 | * Author: ivan 6 | */ 7 | 8 | #ifndef PROGRAM_PARAMETERS_H_ 9 | #define PROGRAM_PARAMETERS_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define SOFTWARE_NAME "GraphMap" 19 | #define GRAPHMAP_CURRENT_VERSION "v0.6.5" 20 | #define GRAPHMAP_CURRENT_VERSION_RELEASE_DATE (std::string(__DATE__) + std::string(" at ") + std::string(__TIME__)) // __TIMESTAMP__ // "12 October 2014" 21 | #define COPYRIGHT "Copyright Ivan Sovic, Mile Sikic and Niranjan Nagarajan, 2015, 2016.\n" \ 22 | "\n" \ 23 | "Affiliations: Ivan Sovic (1, 3), Mile Sikic (2), Niranjan Nagarajan (3)\n" \ 24 | " (1) Ruder Boskovic Institute, Zagreb, Croatia\n" \ 25 | " (2) University of Zagreb, Faculty of Electrical Engineering and Computing\n" \ 26 | " (3) Genome Institute of Singapore, A*STAR, Singapore\n" 27 | 28 | #define LICENCE_INFORMATION \ 29 | "GraphMap (c) by Ivan Sovic, Mile Sikic and Niranjan Nagarajan\n" \ 30 | "GraphMap is licensed under The MIT License.\n" \ 31 | 32 | #define AFFILIATIONS \ 33 | "Affiliations: Ivan Sovic (1, 3), Mile Sikic (2), Niranjan Nagarajan (3)\n" \ 34 | " (1) Ruder Boskovic Institute, Zagreb, Croatia\n" \ 35 | " (2) University of Zagreb, Faculty of Electrical 
Engineering and Computing\n" \ 36 | " (3) Genome Institute of Singapore, A*STAR, Singapore\n" 37 | 38 | struct ProgramParameters { 39 | std::string subprogram = ""; 40 | 41 | int64_t k_region = 13; // 'j', Kmer size for region search (binning). 42 | int64_t k_graph = 6; // 'k', Kmer size for graph building. 43 | int64_t num_links = 9; // 'l', Number of backward edges to check. 44 | float error_rate = 0.45; // 'e', Approximate error rate of the input read sequences. 45 | int64_t start_read = 0; // 's', Start processing reads from the one specified. 46 | int64_t num_reads_to_process = -1; // 'n', Number of reads to process. If equal to -1, all reads will be processed. 47 | int64_t debug_read = -1; // 'y', Verbose output for read marked with this variable. 48 | std::string debug_read_by_qname = ""; 49 | int64_t num_threads = -1; // 't', Number of threads to use. If equal to -1, number of threads will be equal to number of processors. 50 | std::string reference_path = ""; // 'r', The path to the reference file. 51 | std::string index_file = ""; // 'i', The path to the reference file's index. If it does not exist, index will be created in this path. 52 | std::string reads_path = ""; // 'd', The path to the reads file, in FASTA or FASTQ format. 53 | std::string out_sam_path = ""; // 'o', The output path. If left blank, all sam output will be placed to stdout. 54 | int64_t verbose_sam_output = 0; // 'b', Helpful debug comments can be placed in SAM output lines (at the end), however, some tools (like SAMtools) don't quite like them. Comments can be turned off by setting this variable to 0. Different values increase/decrease verbosity level. 55 | int64_t verbose_level = 5; // 'v', Verbose level. If equal to 0 nothing except strict output will be placed on stdout. 56 | std::string command_line = ""; // The actual command line that was used to generate the parameters. 
57 | int64_t max_num_regions_cutoff = 0; // 'q' Before the read is skipped, it will be attempted to reduce the number of selected regions if their number is higher than max_num_regions_cutoff. 58 | int64_t max_num_regions = 0; // 'g' If still more regions than this are selected, the read is too ambiguous for processing, so it will be skipped. 59 | 60 | // Binning parameters 61 | // int64_t max_num_hits = 0; // 'm' Maximum number of hits per kmer during the binning process. 62 | bool skip_multiple_kmers_per_bin = true; // 'p' One kmer of a read can have multiple hits within the same bin. If true, this parameter prevents this. 63 | 64 | bool output_in_original_order = false; // 'u' If true, SAM alignments will be output after the processing has finished, in the order of input reads. 65 | int64_t kmer_step = 1; // 'w' The number of bases to skip between beginnings of every adjacent kmer. 66 | 67 | std::string reads_folder = ""; // 'D', The path to a folder that contains reads, in FASTA or FASTQ format. Intended for batch processing. 68 | std::string output_folder = ""; // 'O', The path to the output folder for batch processing. 69 | bool process_reads_from_folder = false; 70 | int64_t batch_size_in_mb = -1; // 'B', specifies the size of a batch for sequence loading. If <= 0, all sequences will be loaded at once, otherwise the specified number of megabytes will be loaded consecutively. 71 | std::string alignment_algorithm = "sg"; // 'a', specifies whether EDlib or SSW or hybrid should be used for realignment in the last step. 
72 | std::string alignment_approach = "normal"; // 'w' NOTE(review): kmer_step is also tagged 'w'; confirm which option letter is correct here. 73 | bool calc_only_index = false; 74 | int64_t match_score = 5; 75 | int64_t mex_score = 1; 76 | int64_t mismatch_penalty = 4; 77 | int64_t gap_open_penalty = 8; 78 | int64_t gap_extend_penalty = 6; 79 | int64_t evalue_match = 5; 80 | int64_t evalue_mismatch = 4; 81 | int64_t evalue_gap_open = 8; 82 | int64_t evalue_gap_extend = 6; 83 | bool is_reference_circular = false; // 'C' 84 | std::string composite_parameters = ""; // 'x', specifies several parameters at the same time, such as 'nanopore' and 'illumina'. 85 | float margin_for_ambiguity = 0.05; // All mapping positions within the given fraction of the top score will be counted for ambiguity (mapping quality). Value of 0.0f counts only identical mappings. 86 | bool output_multiple_alignments = false; // If 0, only one best alignment will be output. Otherwise, all alignments within margin_for_ambiguity will be output to a file. 87 | bool use_double_index = false; // If false, only one index will be used, but the memory consumption will be reduced by half. If true, the sensitive and memory-hungry mode will be used. 88 | int64_t min_num_anchor_bases = 12; 89 | double evalue_threshold = -1; 90 | int64_t mapq_threshold = 0; 91 | std::string infmt = "auto"; 92 | std::string outfmt = "sam"; 93 | 94 | // bool extend_aln_to_end = true; 95 | 96 | bool use_extended_cigar = false; 97 | 98 | int64_t min_read_len = 80; // If a read is shorter than this, it will be marked as unmapped. 
99 | 100 | double min_bin_percent = 0.75f; 101 | double bin_threshold_step = 0.10f; 102 | 103 | bool use_spliced = false; 104 | bool use_split = false; 105 | bool disable_end_to_end = true; 106 | bool overlapper = false; 107 | bool no_self_hits = false; 108 | bool rebuild_index = false; 109 | 110 | double max_error_rate = 1.0f; 111 | double max_indel_error_rate = 1.0f; 112 | 113 | std::string gtf_path; 114 | bool is_transcriptome = false; 115 | 116 | bool auto_rebuild_index = false; 117 | 118 | bool use_minimizers = false; 119 | int64_t minimizer_window = 5; 120 | bool threshold_hits = false; 121 | double frequency_percentil = 0.99; 122 | bool index_on_the_fly = false; 123 | std::string index_shape = "1111110111111"; 124 | bool load_index = false; 125 | bool store_index = false; 126 | int64_t min_overlap_len = 100; 127 | double overhang_percent = 0.20; 128 | int64_t max_allowed_overhang = 1000; 129 | double min_percent_cov_bases = 0.01; 130 | int64_t min_num_seeds = 4; 131 | 132 | double anchor_chain_indel_bandwidth = 0.23; // error_rate / 2 + 0.1f; NOTE(review): with the default error_rate of 0.45 this formula gives 0.325, not 0.23; confirm which is intended. 133 | int64_t anchor_chain_max_dist = 200; 134 | int64_t anchor_chain_min_cov_bases = 50; 135 | int64_t anchor_chain_size_cutoff = 2; 136 | }; 137 | 138 | int ProcessArgsGraphMap(int argc, char **argv, ProgramParameters *parameters); 139 | int ProcessArgsOwler(int argc, char **argv, ProgramParameters *parameters); 140 | void VerboseProgramParameters(ProgramParameters *parameters); 141 | void VerboseShortHelpAndExit(int argc, char **argv); 142 | 143 | #endif /* PROGRAM_PARAMETERS_H_ */ 144 | -------------------------------------------------------------------------------- /src/sparsehash/COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005, Google Inc. 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following disclaimer 12 | in the documentation and/or other materials provided with the 13 | distribution. 14 | * Neither the name of Google Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /src/sparsehash/internal/libc_allocator_with_realloc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010, Google Inc. 2 | // All rights reserved. 
3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | // --- 31 | 32 | #ifndef UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 33 | #define UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 34 | 35 | #include 36 | #include // for malloc/realloc/free 37 | #include // for ptrdiff_t 38 | #include // for placement new 39 | 40 | _START_GOOGLE_NAMESPACE_ 41 | 42 | template 43 | class libc_allocator_with_realloc { 44 | public: 45 | typedef T value_type; 46 | typedef size_t size_type; 47 | typedef ptrdiff_t difference_type; 48 | 49 | typedef T* pointer; 50 | typedef const T* const_pointer; 51 | typedef T& reference; 52 | typedef const T& const_reference; 53 | 54 | libc_allocator_with_realloc() {} 55 | libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} 56 | ~libc_allocator_with_realloc() {} 57 | 58 | pointer address(reference r) const { return &r; } 59 | const_pointer address(const_reference r) const { return &r; } 60 | 61 | pointer allocate(size_type n, const_pointer = 0) { 62 | return static_cast(malloc(n * sizeof(value_type))); 63 | } 64 | void deallocate(pointer p, size_type) { 65 | free(p); 66 | } 67 | pointer reallocate(pointer p, size_type n) { 68 | return static_cast(realloc(p, n * sizeof(value_type))); 69 | } 70 | 71 | size_type max_size() const { 72 | return static_cast(-1) / sizeof(value_type); 73 | } 74 | 75 | void construct(pointer p, const value_type& val) { 76 | new(p) value_type(val); 77 | } 78 | void destroy(pointer p) { p->~value_type(); } 79 | 80 | template 81 | libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} 82 | 83 | template 84 | struct rebind { 85 | typedef libc_allocator_with_realloc other; 86 | }; 87 | }; 88 | 89 | // libc_allocator_with_realloc specialization. 
90 | template<> 91 | class libc_allocator_with_realloc { 92 | public: 93 | typedef void value_type; 94 | typedef size_t size_type; 95 | typedef ptrdiff_t difference_type; 96 | typedef void* pointer; 97 | typedef const void* const_pointer; 98 | 99 | template 100 | struct rebind { 101 | typedef libc_allocator_with_realloc other; 102 | }; 103 | }; 104 | 105 | template 106 | inline bool operator==(const libc_allocator_with_realloc&, 107 | const libc_allocator_with_realloc&) { 108 | return true; 109 | } 110 | 111 | template 112 | inline bool operator!=(const libc_allocator_with_realloc&, 113 | const libc_allocator_with_realloc&) { 114 | return false; 115 | } 116 | 117 | _END_GOOGLE_NAMESPACE_ 118 | 119 | #endif // UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 120 | -------------------------------------------------------------------------------- /src/sparsehash/internal/sparseconfig.h: -------------------------------------------------------------------------------- 1 | /* 2 | * NOTE: This file is for internal use only. 3 | * Do not use these #defines in your own program! 4 | */ 5 | 6 | /* Namespace for Google classes */ 7 | #define GOOGLE_NAMESPACE ::google 8 | 9 | /* the location of the header defining hash functions */ 10 | #define HASH_FUN_H 11 | 12 | /* the namespace of the hash<> function */ 13 | #define HASH_NAMESPACE std::tr1 14 | 15 | /* Define to 1 if you have the header file. */ 16 | #define HAVE_INTTYPES_H 1 17 | 18 | /* Define to 1 if the system has the type `long long'. */ 19 | #define HAVE_LONG_LONG 1 20 | 21 | /* Define to 1 if you have the `memcpy' function. */ 22 | #define HAVE_MEMCPY 1 23 | 24 | /* Define to 1 if you have the header file. */ 25 | #define HAVE_STDINT_H 1 26 | 27 | /* Define to 1 if you have the header file. */ 28 | #define HAVE_SYS_TYPES_H 1 29 | 30 | /* Define to 1 if the system has the type `uint16_t'. */ 31 | #define HAVE_UINT16_T 1 32 | 33 | /* Define to 1 if the system has the type `u_int16_t'. 
*/ 34 | #define HAVE_U_INT16_T 1 35 | 36 | /* Define to 1 if the system has the type `__uint16'. */ 37 | /* #undef HAVE___UINT16 */ 38 | 39 | /* The system-provided hash function including the namespace. */ 40 | #define SPARSEHASH_HASH HASH_NAMESPACE::hash 41 | 42 | /* Stops putting the code inside the Google namespace */ 43 | #define _END_GOOGLE_NAMESPACE_ } 44 | 45 | /* Puts following code inside the Google namespace */ 46 | #define _START_GOOGLE_NAMESPACE_ namespace google { 47 | -------------------------------------------------------------------------------- /src/sparsehash/template_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2005 Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // ---- 31 | // 32 | // Template metaprogramming utility functions. 33 | // 34 | // This code is compiled directly on many platforms, including client 35 | // platforms like Windows, Mac, and embedded systems. Before making 36 | // any changes here, make sure that you're not breaking any platforms. 37 | // 38 | // 39 | // The names choosen here reflect those used in tr1 and the boost::mpl 40 | // library, there are similar operations used in the Loki library as 41 | // well. I prefer the boost names for 2 reasons: 42 | // 1. I think that portions of the Boost libraries are more likely to 43 | // be included in the c++ standard. 44 | // 2. It is not impossible that some of the boost libraries will be 45 | // included in our own build in the future. 46 | // Both of these outcomes means that we may be able to directly replace 47 | // some of these with boost equivalents. 48 | // 49 | #ifndef BASE_TEMPLATE_UTIL_H_ 50 | #define BASE_TEMPLATE_UTIL_H_ 51 | 52 | #include 53 | _START_GOOGLE_NAMESPACE_ 54 | 55 | // Types small_ and big_ are guaranteed such that sizeof(small_) < 56 | // sizeof(big_) 57 | typedef char small_; 58 | 59 | struct big_ { 60 | char dummy[2]; 61 | }; 62 | 63 | // Identity metafunction. 64 | template 65 | struct identity_ { 66 | typedef T type; 67 | }; 68 | 69 | // integral_constant, defined in tr1, is a wrapper for an integer 70 | // value. 
// We don't really need this generality; we could get away
// with hardcoding the integer type to bool. We use the fully
// general integer_constant for compatibility with tr1.

// integral_constant<T, v> wraps the compile-time constant v of type T:
// ::value is v, ::value_type is T and ::type is the wrapper itself.
template<class T, T v>
struct integral_constant {
  static const T value = v;
  typedef T value_type;
  typedef integral_constant<T, v> type;
};

// Out-of-class definition so that `value` can be odr-used.
template <class T, T v> const T integral_constant<T, v>::value;


// Abbreviations: true_type and false_type are structs that represent boolean
// true and false values. Also define the boost::mpl versions of those names,
// true_ and false_.
typedef integral_constant<bool, true>  true_type;
typedef integral_constant<bool, false> false_type;
typedef true_type  true_;
typedef false_type false_;

// if_ is a templatized conditional statement.
// if_<cond, A, B> is a compile time evaluation of cond.
// if_<>::type contains A if cond is true, B otherwise.
template<bool cond, typename A, typename B>
struct if_{
  typedef A type;
};

// Partial specialization selected when cond is false.
template<typename A, typename B>
struct if_<false, A, B> {
  typedef B type;
};


// type_equals_ is a template type comparator, similar to Loki IsSameType.
// type_equals_<A, B>::value is true iff "A" is the same type as "B".
//
// New code should prefer base::is_same, defined in base/type_traits.h.
// It is functionally identical, but is_same is the standard spelling.
template<typename A, typename B>
struct type_equals_ : public false_ {
};

// Partial specialization selected when both arguments are the same type.
template<typename A>
struct type_equals_<A, A> : public true_ {
};

// and_ is a template && operator.
// and_<A, B>::value evaluates "A::value && B::value".
template<typename A, typename B>
struct and_ : public integral_constant<bool, (A::value && B::value)> {
};

// or_ is a template || operator.
// or_<A, B>::value evaluates "A::value || B::value".
127 | template 128 | struct or_ : public integral_constant { 129 | }; 130 | 131 | 132 | _END_GOOGLE_NAMESPACE_ 133 | 134 | #endif // BASE_TEMPLATE_UTIL_H_ 135 | --------------------------------------------------------------------------------