├── .gitignore
├── .gitmodules
├── INSTALL.md
├── LICENCE
├── Makefile
├── README.md
├── doc
    ├── GraphMap-description.pdf
    ├── README-v0.21.md
    ├── README-v0.22.md
    ├── changelog.md
    ├── img
    │   ├── anchors-normal.png
    │   ├── anchors-rna.png
    │   ├── region_selection-rna.png
    │   └── region_selection.png
    ├── rnaseq.md
    └── sam_output.md
├── overlap.md
├── reproducibility
    ├── README.md
    ├── run.py
    └── setup.py
├── scripts
    └── scatterplot8.py
└── src
    ├── aligner
        ├── aligner_base.h
        ├── aligner_containers.h
        ├── aligner_ksw2.cc
        ├── aligner_ksw2.h
        ├── aligner_util.cc
        ├── aligner_util.hpp
        ├── anchor_aligner.cc
        ├── anchor_aligner.h
        ├── pairwise_penalties.h
        ├── sam_parser.cc
        └── sam_parser.h
    ├── alignment
        ├── alignment.cc
        ├── alignment.h
        ├── alignment_wrappers.cc
        ├── alignment_wrappers.h
        ├── anchored.cc
        ├── cigargen.cc
        ├── cigargen.h
        ├── semiglobal.cc
        ├── transcriptome_mod.cc
        └── transcriptome_mod.h
    ├── containers
        ├── mapping_data.cc
        ├── mapping_data.h
        ├── path_graph_entry.cc
        ├── path_graph_entry.h
        ├── range.h
        ├── raw_alignment.h
        ├── region.cc
        ├── region.h
        ├── results.h
        ├── score_registry.cc
        ├── score_registry.h
        ├── vertices.cc
        └── vertices.h
    ├── graphmap
        ├── core_graphmap.cc
        ├── experimental.cc
        ├── filter_anchors.cc
        ├── filter_anchors.h
        ├── graphmap.cc
        ├── graphmap.h
        ├── lcs_anchored.cc
        ├── lcs_semiglobal.cc
        ├── process_read.cc
        ├── region_selection.cc
        ├── rna.cc
        ├── transcriptome.cc
        └── transcriptome.h
    ├── index
        ├── index_util.cc
        └── index_util.h
    ├── ksw2
        ├── LICENSE.txt
        ├── kalloc.cc
        ├── kalloc.h
        ├── kseq.h
        ├── ksw2.h
        ├── ksw2_extd2_sse.cc
        ├── ksw2_exts2_sse.cc
        ├── ksw2_extz2_sse.cc
        └── ksw2_ll_sse.cc
    ├── main.cc
    ├── owler
        ├── lcsk.cc
        ├── owler.cc
        ├── owler.h
        ├── owler_data.h
        ├── owler_experimental.cc
        └── process_read.cc
    ├── program_parameters.cc
    ├── program_parameters.h
    └── sparsehash
        ├── COPYING
        ├── dense_hash_map
        ├── dense_hash_set
        ├── internal
            ├── densehashtable.h
            ├── hashtable-common.h
            ├── libc_allocator_with_realloc.h
            ├── sparseconfig.h
            └── sparsehashtable.h
        ├── sparse_hash_map
        ├── sparse_hash_set
        ├── sparsetable
        ├── template_util.h
        └── type_traits.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | deprecated/
 2 | temp/
 3 | temp/*
 4 | obj/
 5 | obj_debug/
 6 | obj_linux/
 7 | obj_mac/
 8 | obj_test/
 9 | obj_testext/
10 | obj_extcigar/
11 | # bin/graphmap-not_release
12 | # bin/graphmap-debug
13 | bin/
14 | .project
15 | .cproject
16 | .settings/
17 | reproducibility/*/
18 | test-data/
19 | .vscode*
20 | 
21 | !reproducibility/*.py
22 | !reproducibility/*.md
23 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "codebase/seqlib"]
 2 | 	path = codebase/seqlib
 3 | 	url = https://github.com/isovic/seqlib.git
 4 | [submodule "codebase/argumentparser"]
 5 | 	path = codebase/argumentparser
 6 | 	url = https://github.com/isovic/argumentparser.git
 7 | [submodule "codebase/gindex"]
 8 | 	path = codebase/gindex
 9 | 	url = https://github.com/isovic/gindex
10 | 


--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
 1 | ## 1. Installation
 2 | 
 3 | You will need a recent GCC/G++ version (>=4.7) to compile the source.  
 4 | 
 5 | To override the default compiler choice you can set GCC (or GCC_MAC on Mac), e.g.:  
 6 | 
 7 | ```
 8 | GCC=/usr/local/bin/g++ make  
 9 | ```
10 | 
11 | ### 1.1 Initialize submodules
12 | This will automatically initialize/pull the latest version of submodules.  
13 | ```
14 | make modules  
15 | ```
16 | 
17 | Submodules are used as source files, so there is no need to pre-compile them in any way.  
18 | 
19 | 
20 | ### 1.2 Linux
21 | For a Linux release version type:
22 | ```
23 | make  
24 | ```
25 | 
26 | To clean, type:
27 | ```
28 | make clean  
29 | ```
30 | 
 31 | One can also rebuild, which runs clean and then make sequentially:
32 | ```
33 | make rebuild  
34 | ```
35 | 
36 | ### 1.3 Mac
37 | ```
38 | make mac  
39 | 
40 | make cleanmac  
41 | make rebuildmac  
42 | ```
43 | 
44 | ### 1.4. Compiling the debug version
45 | ```
46 | make debug  
47 | 
48 | make cleandebug  
49 | make rebuilddebug  
50 | ```
51 | 


--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Ivan Sovic, Mile Sikic and Niranjan Nagarajan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | BIN = ./bin/graphmap2
  2 | BIN_DEBUG = ./bin/graphmap-debug
  3 | BIN_LINUX = ./bin/Linux-x64/graphmap2
  4 | BIN_MAC = ./bin/Mac/graphmap
  5 | OBJ_TESTING = ./obj_test
  6 | OBJ_TESTING_EXT = ./obj_testext
  7 | OBJ_DEBUG = ./obj_debug
  8 | OBJ_LINUX = ./obj_linux
  9 | OBJ_EXTCIGAR = ./obj_extcigar
 10 | OBJ_MAC = ./obj_mac
 11 | SOURCE = src
 12 | CODEBASE = codebase
 13 | # This finds all 'src' folders at maximum depth 2 (level one inside each submodule's folder).
 14 | CODEBASE_SRC_FOLDERS = $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{} \;)
 15 | # $(shell find $(CODEBASE) -maxdepth 3 -type d -name "libs" -exec echo "-I"{} \;)
 16 | # $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{}"/*/" \;)
 17 | 
 18 | # ? allows override by user using env var
 19 | GCC ?= g++
 20 | # GCC version check (needs >=4.7): the minor-version flag is also 1 for any major version >4, so GCC 5+ does not trigger a false "<7" warning
 21 | GCC_MAJOR_VERSION_GE_4 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \>= 4)
 22 | GCC_MINOR_VERSION_GE_7 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \> 4 \| `$(GCC) -dumpversion | cut -f2 -d.` \>= 7)
 23 | GCC_MAC ?= g++
 24 | 
 25 | 
 26 | # CPP_FILES := $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp)
 27 | # CC_FILES := $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) $(wildcard $(SOURCE)/libs/*/*.cc)
 28 | # H_FILES := $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(SOURCE)/libs/*/*.h)
 29 | CPP_FILES :=  $(wildcard $(CODEBASE)/*/src/*.cpp) $(wildcard $(CODEBASE)/*/src/libs/*/*.cpp) $(wildcard $(CODEBASE)/*/src/*/*.cpp) $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp)
 30 | CC_FILES :=  $(wildcard $(CODEBASE)/*/src/*.cc) $(wildcard $(CODEBASE)/*/src/libs/*/*.cc) $(wildcard $(CODEBASE)/*/src/*/*.cc) $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) $(wildcard $(SOURCE)/libs/*/*.cc)
 31 | H_FILES := $(wildcard $(CODEBASE)/*/src/*.h) $(wildcard $(CODEBASE)/*/src/libs/*/*.h) $(wildcard $(CODEBASE)/*/src/*/*.h) $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(CODEBASE)/*/src/*.hpp) $(wildcard $(CODEBASE)/*/src/*/*.hpp) $(wildcard $(SOURCE)/*/*.hpp) $(wildcard $(SOURCE)/*.hpp) $(wildcard $(SOURCE)/libs/*/*.h)
 32 | 
 33 | OBJ_FILES := $(CPP_FILES:.cpp=.o) $(CC_FILES:.cc=.o)
 34 | OBJ_FILES_FOLDER_TESTING := $(addprefix $(OBJ_TESTING)/,$(OBJ_FILES))
 35 | OBJ_FILES_FOLDER_TESTING_EXT := $(addprefix $(OBJ_TESTING_EXT)/,$(OBJ_FILES))
 36 | OBJ_FILES_FOLDER_DEBUG := $(addprefix $(OBJ_DEBUG)/,$(OBJ_FILES))
 37 | OBJ_FILES_FOLDER_LINUX := $(addprefix $(OBJ_LINUX)/,$(OBJ_FILES))
 38 | OBJ_FILES_FOLDER_EXTCIGAR := $(addprefix $(OBJ_EXTCIGAR)/,$(OBJ_FILES))
 39 | OBJ_FILES_FOLDER_MAC := $(addprefix $(OBJ_MAC)/,$(OBJ_FILES))
 40 | 
 41 | LIB_DIRS = -L"/usr/local/lib"
 42 | CC_LIBS = -static-libgcc -static-libstdc++ -D__cplusplus=201103L
 43 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"libs/libdivsufsort-2.0.1/build/include" -I"libs/seqan-library-1.4.2/include"
 44 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"src/libs/seqan-library-1.4.2/include"
 45 | INCLUDE = -I"./src/" -I"/usr/include/" -I"$(CODEBASE)/seqlib/src/libs/seqan-library-2.0.1/include" -I"$(CODEBASE)/seqlib/src/libs/libdivsufsort-2.0.1-64bit/" $(CODEBASE_SRC_FOLDERS)
 46 | 
 47 | CC_FLAGS_DEBUG = -O3 -g -rdynamic -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native
 48 | CC_FLAGS_RELEASE = -DRELEASE_VERSION -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread # -march=native
 49 | CC_FLAGS_EXTCIGAR = -DRELEASE_VERSION -DUSE_EXTENDED_CIGAR_FORMAT -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native
 50 | CC_FLAGS_NOT_RELEASE = -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
 51 | CC_FLAGS_NOT_RELEASE_EXT = -g -O3 -DUSE_EXTENDED_CIGAR_FORMAT -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
 52 | LD_FLAGS = -static-libgcc -static-libstdc++ -m64 -ffreestanding
 53 | # LD_LIBS = -lpthread -lgomp -lm -lz -ldivsufsort64
 54 | LD_LIBS = -lpthread -lgomp -lm -lz
 55 | 
 56 | # These targets are commands, not files; declaring them phony keeps a same-named file or directory from masking them.
 57 | .PHONY: all install modules testing testingext gcc_version_check debug linux extcigar mac clean cleantesting cleandebug cleanlinux cleanextcigar cleanmac cleanbin cleanall rebuild rebuilddebug rebuildlinux rebuildtesting rebuildmac
 58 | all: gcc_version_check linux
 59 | 
 60 | install: /usr/bin/graphmap
 61 | # The Linux release build writes its binary to $(BIN_LINUX), so that is the file to copy (run 'make' first).
 62 | /usr/bin/graphmap: $(BIN_LINUX)
 63 | 	cp $(BIN_LINUX) /usr/bin/graphmap
 64 | 
 65 | modules:
 66 | 	git submodule update --init --recursive
 67 | 	# git submodule foreach git pull origin master
 68 | 
 69 | testing: $(OBJ_FILES_FOLDER_TESTING)
 70 | 	mkdir -p $(dir $(BIN))
 71 | 	$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING) $(LD_LIBS)	
 72 | 	
 73 | obj_test/%.o: %.cc $(H_FILES)
 74 | 	mkdir -p $(dir $@)
 75 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $<
 76 | 	
 77 | obj_test/%.o: %.cpp $(H_FILES)
 78 | 	mkdir -p $(dir $@)
 79 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $<
 80 | 
 81 | testingext: $(OBJ_FILES_FOLDER_TESTING_EXT)
 82 | 	mkdir -p $(dir $(BIN))
 83 | 	$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING_EXT) $(LD_LIBS)
 84 | 	
 85 | obj_testext/%.o: %.cc $(H_FILES)
 86 | 	mkdir -p $(dir $@)
 87 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<
 88 | 	
 89 | obj_testext/%.o: %.cpp $(H_FILES)
 90 | 	mkdir -p $(dir $@)
 91 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<
 92 | 
 93 | 
 94 | 
 95 | gcc_version_check:
 96 | ifneq ($(GCC_MAJOR_VERSION_GE_4), 1)
 97 | 	$(warning "*** WARNING $(GCC) major version <4 ***")
 98 | endif	
 99 | ifneq ($(GCC_MINOR_VERSION_GE_7), 1)
100 | 	$(warning "*** WARNING $(GCC) minor version <7 ***")
101 | endif
102 | 
103 | 
104 | debug: $(OBJ_FILES_FOLDER_DEBUG)
105 | 	mkdir -p $(dir $(BIN_DEBUG))
106 | 	$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_DEBUG) $(OBJ_FILES_FOLDER_DEBUG) $(LD_LIBS)
107 | 	
108 | obj_debug/%.o: %.cc $(H_FILES)
109 | 	mkdir -p $(dir $@)
110 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $<
111 | 	
112 | obj_debug/%.o: %.cpp $(H_FILES)
113 | 	mkdir -p $(dir $@)
114 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $<
115 | 
116 | 
117 | 
118 | linux: $(OBJ_FILES_FOLDER_LINUX)
119 | 	mkdir -p $(dir $(BIN_LINUX))
120 | 	$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_LINUX) $(LD_LIBS)
121 | 	
122 | obj_linux/%.o: %.cc $(H_FILES)
123 | 	mkdir -p $(dir $@)
124 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
125 | 	
126 | obj_linux/%.o: %.cpp $(H_FILES)
127 | 	mkdir -p $(dir $@)
128 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
129 | 
130 | 
131 | 
132 | extcigar: $(OBJ_FILES_FOLDER_EXTCIGAR)
133 | 	mkdir -p $(dir $(BIN_LINUX))
134 | 	$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_EXTCIGAR) $(LD_LIBS)
135 | 	
136 | obj_extcigar/%.o: %.cc $(H_FILES)
137 | 	mkdir -p $(dir $@)
138 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $<
139 | 	
140 | obj_extcigar/%.o: %.cpp $(H_FILES)
141 | 	mkdir -p $(dir $@)
142 | 	$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $<
143 | 
144 | 
145 | 
146 | mac: $(OBJ_FILES_FOLDER_MAC)
147 | 	mkdir -p $(dir $(BIN_MAC))
148 | 	$(GCC_MAC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_MAC) $(OBJ_FILES_FOLDER_MAC) $(LD_LIBS)
149 | 	
150 | obj_mac/%.o: %.cc $(H_FILES)
151 | 	mkdir -p $(dir $@)
152 | 	$(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
153 | 	
154 | obj_mac/%.o: %.cpp $(H_FILES)
155 | 	mkdir -p $(dir $@)
156 | 	$(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
157 | 
158 | 
159 | 
160 | # deps:
161 | # 	cd libs; cd libdivsufsort-2.0.1; make clean; rm -rf build; ./configure; mkdir build ;cd build; cmake -DBUILD_DIVSUFSORT64:BOOL=ON -DCMAKE_BUILD_TYPE="Release" -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX="/usr/local" .. ; make
162 | 
163 | 
164 | 	
165 | clean:
166 | 	-rm -rf $(OBJ_LINUX) $(BIN_LINUX)
167 | 
168 | cleantesting:
169 | 	-rm -rf $(OBJ_TESTING) $(BIN)
170 | 
171 | cleandebug:
172 | 	-rm -rf $(OBJ_DEBUG) $(BIN_DEBUG)
173 | 
174 | cleanlinux:
175 | 	-rm -rf $(OBJ_LINUX) $(BIN_LINUX)
176 | 
177 | cleanextcigar:
178 | 	-rm -rf $(OBJ_EXTCIGAR) $(BIN_LINUX)
179 | 
180 | cleanmac:
181 | 	-rm -rf $(OBJ_MAC) $(BIN_MAC)
182 | 
183 | cleanbin:
184 | 	-rm -rf bin/
185 | # Run the clean targets defined above (cleanlinux is identical to clean, so it is not repeated).
186 | cleanall: clean cleantesting cleanextcigar cleandebug cleanmac cleanbin
187 | 
188 | 
189 | 
190 | rebuild: clean all
191 | 
192 | rebuilddebug: cleandebug debug
193 | 
194 | rebuildlinux: cleanlinux linux
195 | 
196 | rebuildtesting: cleantesting testing
197 | 
198 | rebuildmac: cleanmac mac
199 | 
200 | # divsufsort:
201 | # 	cd libs; ./build-libdivsufsort.sh
202 | 
203 | 


--------------------------------------------------------------------------------
/doc/GraphMap-description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/GraphMap-description.pdf


--------------------------------------------------------------------------------
/doc/README-v0.21.md:
--------------------------------------------------------------------------------
  1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads 
  2 | 
  3 | Preprint of our paper now available on Biorxiv:  
  4 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719)  
  5 | 
  6 | Sequencing data of E. Coli UTI89 generated in-house and used in the paper now available on:  
  7 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557)  
  8 |   
  9 | 
 10 | 
 11 | **__Version: 0.21__**
 12 | **Update**
 13 | Release date: 02 June 2015  
 14 | 
 15 | New alignment mode available: anchored alignment.  
 16 | 
 17 | Anchored alignment is an alternative to the default semiglobal alignment. It is less sensitive than default semiglobal, but faster and creates alignments around determined homologies (anchors).  
 18 | This is a very powerful addition to alignment, as it creates highly accurate and confident alignments even in the presence of high error rates.  
 19 | To run the anchored alignment, use the '-a anchor' option.  
 20 | 
 21 | Standard Gotoh alignment is now also available as an alternative to the default Myers' bit-vector alignment. Custom alignment parameters can be specified via the command line.  
 22 | To use the Gotoh alignment, use '-a gotoh' commandline option.  
 23 | 
 24 | Additionally, E-value and mapping quality thresholds can now be applied directly from commandline (-z and -c options).  
 25 | 
 26 | More to follow.  
 27 | 
 28 | 
 29 | 
 30 | **__Version: v0.20b__**
 31 | **Update**
 32 | Release date: 26 April 2015  
 33 | 
 34 | Added the source code.
 35 | 
 36 | To build from source:
 37 | ```
 38 | make
 39 | ```
 40 | If the libraries have to be recompiled on your system, type:
 41 | ```
 42 | make deps
 43 | ```
 44 | More installation instructions can be found in the INSTALL file.  
 45 | 
 46 | 
 47 | Release date: 02 April 2015  
 48 | Precompiled binary, built on Ubuntu 10.04 x64.  
 49 | Tested on Mint 17.1 x64.
 50 | 
 51 | Significantly improved speed and sensitivity.
 52 | 
 53 | Added some important features:
 54 | - Mapping quality.
 55 | - Sensible alignment score.
 56 | - E-value added in reported alignments! Look for a custom ZE parameter in the SAM lines.
 57 | - Secondary alignments can now be output as well (use the -Z parameter).
 58 | 
 59 | Addressed several reported issues:
 60 | - Output only the first whitespace-separated token in the qname field of the SAM output. Previously the entire read's header was output.
 61 | - The same for the rname.
 62 | - Reads that are marked unmapped no longer contain any additional mapping information, unlike before.
 63 | 
 64 | Please note that by default, GraphMap will use more memory to allow higher speed and sensitivity.
 65 | To run in parsimonious mode (half the memory requirements), please use the -P parameter.
 66 | 
 67 | Note #2: some command line parameters were changed (removed/added) since the last version, but most stayed the same.
 68 | In case you are using one of the removed parameters, you will be warned and the process will not run.
 69 | 
 70 | 
 71 | **__Version: v0.19b__**
 72 | Release date: 16 January 2015  
 73 | Precompiled binary, built on Ubuntu 10.04 x64.  
 74 | Tested on Mint 17.1 x64.  
 75 | 
 76 | **Update**
 77 | Compiled a MacOS version too, now can also be found in the bin directory.  
 78 | Built on MacOS X 10.9.5  
 79 | 
 80 | Important updates:
 81 | - Better support for circular genomes - use '-C' option if your reference is circular!
 82 | - Added a more sensitive mode (though much slower) - check out the '-x' option in the help!
 83 | - Better alignments for Illumina reads - again, check out the '-x' option.
 84 | - Better dynamic range of the AS (alignment score) - 254 is the best score, 0 is the worst score/unmapped.
 85 | 
 86 | To use the normal (fast) mode, simply use the default parameters (nothing is changed, just omit the '-x' option).
 87 | 
 88 |   
 89 | **__Version: v0.18b__**
 90 | Release date: 11 December 2014  
 91 | Precompiled binary, built on Ubuntu 10.04 x64.  
 92 | Tested on Mint 17 (Ubuntu 14.04), Ubuntu Server 14.04, Fedora 20 and Gentoo.
 93 | 
 94 | ### Description
 95 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data.  
 96 | It can handle Oxford Nanopore data with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers (namely, compared to BLASR and BWA-MEM).
 97 | 
 98 | GraphMap was designed for ease-of-use: the default parameters can handle a wide range of read lengths and error profiles. This is an important feature for technologies where the error rates and error profiles can vary widely across sequencing runs. In addition, GraphMap allows users to uniformly map read datasets from disparate technologies with high sensitivity and accuracy. While GraphMap is not runtime optimized for short-read data (e.g. compared to Bowtie2), it provides accurate and typically more sensitive mappings for Illumina and Ion Torrent reads.
 99 | 
100 | Please keep in mind that this is an early development version and we welcome your comments and feedback on GraphMap.
101 | 
102 | ### Comparison to other mappers
103 | 
104 | Comparison statistics will be uploaded soon.
105 | 
106 | ### Usage
107 | 
108 | ```
109 | # Process all reads from a given FASTA/FASTQ file with default number of threads:
110 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam
111 | 
112 | # Process reads using more sensitive parameters for Illumina data:
113 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam
114 | 
115 | # Process reads from a circular genome:
116 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam
117 | 
118 | # Limit the number of threads to 8, and load reads in batches of 50MB:
119 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
120 | 
121 | # Process only the first 1000 reads:
122 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
123 | 
124 | # Process all reads from a given folder.
125 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder
126 | 
127 | # Generate only the index.
128 | ./graphmap -I -r escherichia_coli.fa
129 | ```
130 | 
131 | ### Contact information
132 | 
133 | For additional information, help and bug reports please send an email to one of the following:
134 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg
135 | 


--------------------------------------------------------------------------------
/doc/README-v0.22.md:
--------------------------------------------------------------------------------
  1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads  
  2 | **__Current Version: 0.22__**  
  3 | Release date: 12 November 2015  
  4 |   
  5 | Updates:  
  6 | - Many tiny bug fixes, mostly related to anchored alignment. It should be slightly more sensitive now.  
  7 | - Two overlap modes merged from the dev branch: ```-w owler``` (fast, uses a trimmed GraphMap pipeline, reports output in the MHAP format) and ```-w overlapper``` (full GraphMap pipeline including alignment, output in SAM format). For usage - check examples at the bottom.  
  8 | - GraphMap integration into marginAlign - we forked marginAlign and extended it to support GraphMap alongside LAST and BWA-MEM ([https://github.com/isovic/marginAlign](https://github.com/isovic/marginAlign)). Use parameters ```--graphmap``` or ```--graphmapanchor``` with marginAlign to specify the mapper.  
  9 | 
 10 | For more information on overlapping, take a look at [overlap.md](overlap.md).  
 11 | 
 12 | GraphMap is also used as an overlapper in a new *de novo* genome assembly project called [Ra](https://github.com/mariokostelac/ra-integrate) ([https://github.com/mariokostelac/ra-integrate](https://github.com/mariokostelac/ra-integrate)).  
 13 | Ra attempts to create *de novo* assemblies from raw nanopore and PacBio reads without requiring error correction, for which a highly sensitive overlapper is required.  
 14 | 
 15 | 
 16 | ### Quick start on Linux x64
 17 | ```  
 18 | git clone https://github.com/isovic/graphmap.git  
 19 | cd graphmap  
 20 | make  
 21 |   
 22 | # To align:  
 23 | ./bin/Linux-x64/graphmap -r reference.fa -d reads.fasta -o output.sam  
 24 | 
 25 | # To overlap:  
 26 | ./bin/Linux-x64/graphmap -w owler -r reads.fasta -d reads.fasta -o output.mhap  
 27 | ```  
 28 | 
 29 | ### Description
 30 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data.  
 31 | It is **designed to handle Oxford Nanopore MinION 1d and 2d reads** with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers.
 32 | 
 33 | GraphMap was also designed for ease-of-use: the **default parameters** can handle a wide range of read lengths and error profiles, including: *Illumina*, *PacBio* and *Oxford Nanopore*.  
 34 | This is an especially important feature for technologies where the error rates and error profiles can vary widely across, or even within, sequencing runs.  
 35 | 
 36 | **The GraphMap algorithm** is structured to achieve high-sensitivity and speed using a five-stage
 37 | read-funneling approach. In stage I, GraphMap uses a novel adaptation of gapped spaced seeds to efficiently reduce the search space and get seed hits as a form of coarse alignment. These are then refined in stage II using graph-based vertex-centric processing of seeds to efficiently construct alignment anchors. GraphMap then chains anchors using a kmer
 38 | version of longest common subsequence (LCS) construction (stage III), refines
 39 | alignments with a form of L1 linear regression (stage IV) and finally evaluates the
 40 | remaining candidates to select the best location to reconstruct a final alignment (stage V).
 41 | GraphMap computes a BLAST-like E-value as well as a mapping quality for its alignments.
 42 | 
 43 | **Evaluation** on MinION sequencing datasets against short and long-read mappers indicates that GraphMap increases mapping sensitivity by at least 15-80%. GraphMap alignments are the first to demonstrate consensus calling with <1 error in 100,000 bases, variant calling on the human genome with 76% improvement in sensitivity over the next best mapper (BWA-MEM), precise detection of structural variants from 100bp to 4kbp in length and species and strain-specific identification of pathogens using MinION reads.
 44 | 
 45 | Further details about the algorithm, comparison with other mappers and usage applications can be found in the **preprint** of our paper:  
 46 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719)  
 47 | 
 48 | **Nanopore sequencing data** of E. Coli UTI89 generated in-house and used in the paper now available on ENA:  
 49 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557)  
 50 | 
 51 | ### Features  
 52 | - Mapping position agnostic to alignment parameters.
 53 | - Consistently very high sensitivity and precision across different error profiles, rates and sequencing technologies even with default parameters.
 54 | - Circular genome handling to resolve coverage drops near ends of the genome.
 55 | - E-value.
 56 | - Meaningful mapping quality.
 57 | - Various alignment strategies (semiglobal bit-vector and Gotoh, anchored).
 58 | 
 59 | ### Installation
 60 | To build GraphMap from source type:  
 61 | ```
 62 | make
 63 | ```  
 64 | Required libraries are prebuilt for Linux x64 systems.  
 65 | To rebuild them for other systems, type:  
 66 | ```
 67 | make deps
 68 | ```  
 69 | 
 70 | You will need a recent GCC/G++ version (>=4.7).
 71 | 
 72 | More installation instructions can be found in the INSTALL file.
 73 | 
 74 | 
 75 | ### Usage examples
 76 | ```
 77 | # Align all reads from a given FASTA/FASTQ file with default number of threads using semiglobal bit-vector alignment:  
 78 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
 79 | 
 80 | # Align all reads from a given FASTA/FASTQ file using anchored alignment approach:  
 81 | ./graphmap -a anchor -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
 82 | 
 83 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast):
 84 | ./graphmap -w owler -r reads.fa -d reads.fa -o overlaps.mhap  
 85 | 
 86 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow):
 87 | ./graphmap -w overlapper -r reads.fa -d reads.fa -o overlaps.sam  
 88 | 
 89 | # Align reads using the Gotoh for semiglobal alignment:  
 90 | ./graphmap -a gotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
 91 | 
 92 | # Align reads using Gotoh alignment with anchored approach: 
 93 | ./graphmap -a anchorgotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
 94 | 
 95 | # Process reads from a circular genome:  
 96 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
 97 | 
 98 | # Threshold the E-value of alignments to 1e-100. Alignments with E-value > 1e-100 will be called unmapped:  
 99 | ./graphmap -z 1e-100 -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
100 | 
101 | # Output all similarly good alignments (to within F*num_kmers_of_best_alnmnt) instead of only one best:  
102 | ./graphmap -Z -F 0.05 -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
103 | 
104 | # Limit the number of threads to 8, and load reads in batches of 50MB:  
105 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
106 | 
107 | # Align reads using more sensitive parameters for Illumina data (currently equivalent to "-a gotoh"):  
108 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
109 | 
110 | # Load all reads in one batch and align only the first 1000 reads:  
111 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
112 | 
113 | # Process all reads from a given folder.  
114 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder  
115 | 
116 | # Generate only the index.  
117 | ./graphmap -I -r escherichia_coli.fa  
118 | 
119 | # Run a debug version of GraphMap (build with "make debug") with verbose SAM output to see various info about the alignment:  
120 | ./graphmap-debug -b 3 -r escherichia_coli.fa -d reads.fastq -o alignments.sam  
121 | 
122 | ```  
123 | 
124 | ### Contact information
125 | 
126 | For additional information, help and bug reports please send an email to one of the following:  
127 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg
128 | 
129 | ### Acknowledgement  
130 | This work was supported by the IMaGIN platform (project No. 102 101 0025), through a grant from the Science and Engineering Research Council, funding to the Genome Institute of Singapore from the Agency for Science, Technology and Research (A*STAR), Singapore, and funding from the Croatian Science Foundation (Project no. UIP-11-2013-7353 - Algorithms for Genome Sequence Analysis).  
131 | 


--------------------------------------------------------------------------------
/doc/changelog.md:
--------------------------------------------------------------------------------
 1 | ## GraphMap - ChangeLog
 2 | 
 3 | **__Version 0.5.0 -> 0.5.1__**  
 4 | Release date: 04 March 2017
 5 | - Updated the gindex module for smaller memory consumption when building the index. Index construction is now a bit slower (single thread is used for collecting minimizers), but collection of minimizers is now performed on the fly. Previously, all seeds would be collected first, and then they would be pulled through a minimizer generation function. Now, each seed is pushed into the minimizer queue and if the queue yields a seed which is different than the previous one, it is emplaced on the list.
 6 | The memory consumption is still large (similar to index in versions 0.4.x), which is due to 128-bit integer representation of all seeds (seed key, sequence ID and sequence position). This could be reduced further by careful redesign.
 7 | The disk version of the index is fully compatible to version 0.5.0.
 8 | The reduced memory consumption directly benefits the Owler mode as well.
 9 | 
10 | **__Version 0.4.1 -> 0.5.0__**  
11 | Release date: 28 February 2017
12 | - Re-implemented the index. Removed all other indexes that were previously implemented, and cleaned up the code to only use the new index (MinimizerIndex). MinimizerIndex is implemented in a separate repo added to the codebase. It also uses a hash table to store the seeds, however instead of the perfect hash as before, Google's DenseHash is used. Seeds are first compiled in a giant list (each sequence in its space, in parallel), and afterwards the list is sorted (also multithreaded). Basic statistics on seed key distribution are calculated (mean, median, standard deviation). The index also allows thresholding the number of hits during lookup (keys with a count higher than a user-specified percentile are skipped) which is very significant for large, repetitive genomes. The index can also generate minimizers (also user specified). Index also allows for custom indexing shapes to be defined, and creates the lookup shapes automatically.  
13 | - Changed the command line parameters to allow for new features, concretely:  
14 |   1. Removed the parameter ```max-hits``` which is now obsolete.
15 |   2. Added parameter ```minimizer-window``` to specify the length of the minimizer window to choose minimizers from. If equal to 1, minimizers won't be used.  
16 |   3. Added parameter ```freq-percentil``` to specify the percentile of key occurrences which will be kept. E.g. if 0.99, then 1% of most repetitive keys will be skipped. If 1.0, no filtering will be used.  
17 |   4. Added parameter ```fly-index``` which will generate index on the fly and won't store it to disk. If the index already exists on disk, it will be loaded. To completely generate a new index on the fly, use ```--fly-index --rebuild-index```.  
18 |   5. Renamed the parameter which was previously known as ```sensitive``` to ```double-index```.
19 |   6. Added a composite parameter called ```-x sensitive``` which will turn off minimizers and key frequency filtering.  
20 | 
21 | - Fixed an issue with RNA-seq transcriptome mapping, where recall would be lower than expected. There was a bug when checking if alignment is sane - the check would occur *after* the alignment was converted from transcriptome space to genome space, instead of while it was still in transcriptome space. This could not have caused false positives, but definitely caused many reads to be unmapped.  
22 | - The reimplemented index now fixes the issue of segmentation fault on the human genome.  
23 | 
24 | 
25 | 
26 | **__Version 0.4.0 -> 0.4.1__**  
27 | Release date: 28 January 2017  
28 | - Fixed the SAM headers for transcriptome mapping. In the last version, the headers corresponded to the transcriptome headers, although the alignments are in the genome space.
29 | 
30 | **__Version 0.3.2 -> 0.4.0__**  
31 | Release date: 22 January 2017  
32 | - GraphMap can now accept a GTF file for mapping to a transcriptome. Transcriptome is internally generated using the reference file and the GTF file, and index built from the transcriptome. Reads are then mapped to the transcriptome, and final alignments converted back to the genome coordinate space by introducing 'N' operations at splice sites.  
33 | - Transcriptome mapping is only available in anchored alignment modes.  
34 | - Updated Edlib to the newest version. Previous version had a bug in the traceback.  
35 | - Recent changes in Edlib produced leading and trailing deletions in some cases. This is now handled by removing the deletions and shifting the alignment start position.  
36 | - Fixed several (possible) memory leaks and invalid reads/writes. Generating the MD tag in SAM files had an invalid read which for some reason caused strange artifacts in CIGAR strings.  
37 | 
38 | **__Version 0.3.1 -> 0.3.2__**  
39 | Release date: 19 December 2016  
40 | - There were segfaults caused by recently-introduced bugs to Edlib. It has since been updated, and this version of GraphMap now includes the fixed version of Edlib.
41 | - There was a memory leak when generating clusters.
42 | - Minor fixes to some syntax.
43 | 
44 | **__Version 0.3.0 -> 0.3.1__**  
45 | Release date: 12 October 2016  
46 | - Important: Fixed MD field issues
47 | - Minor bug fixes: composite command line parameter ```-x illumina``` depended on a parameter which wasn't defined properly, filtered empty SAM lines, etc.
48 | 
49 | **__Version 0.22 -> 0.3.0__**  
50 | Release date: 15 April 2016  
51 | If you are using versions 0.3.x please update to the most recent commit. There were several important memory access issues which are now resolved.  
52 | GraphMap's command line has changed significantly between version 0.3.x and 0.2x - although many options remain similar, the usage is incompatible with the older releases due to explicit tool specification.  
53 | The first parameter is now mandatory, and specifies whether the **mapping/alignment** (```./graphmap align```) or **overlapping** (```./graphmap owler```) should be used.  
54 | **Important change in command line parameters.** The new version is not completely compatible to the previous one. For this reason, the minor version number has changed.  
55 | - Changed the version numbering from: ```x.yz``` to ```x.y.z```
56 | - Implemented a new argument parser.
57 | - Fixed a bug with overhanging base (Issue #14), commit: 41ae30b0d8603469c62794cba1960dc42f739d4e
58 | - Fixed the extensions of alignment to read ends when near an overhang (Issue #18).
59 | - Fixed Issue #19 - inconsistent behaviour for parameter ```-F```.
60 | - Cleaned up the code a bit.
61 | - Restructured the code. Majority of the code was extracted from the repository to be used as the codebase for this and other projects. GraphMap's main code is left in this repo, while the rest is linked via git submodules.
62 | - Added support for reading SAM and GFA files as the input sequences. Gzipped versions of all formats are supported as well. By default the format is chosen by the extension of the file (--infmt auto), but can be specified manually.
63 | - Added support for the M5 output format.
64 | - Added the MD field to the SAM output.
65 | - New and better anchor filtering (anchored modes only) using chaining of anchors that passed the LCSk.
66 | - New and better clustering of anchor stretches. This will be used for implementing RNA-seq alignment.
67 | - No need to precompile libraries for your system anymore. Libraries are now included in the source, or in the submodules. To initialize submodules, either clone recursively, or call ```make modules``` once GraphMap repo has been cloned.
68 | - Anchored alignment is now the default one.  
69 | 
70 | Important command line changes:
71 | - Long argument names are now provided.
72 | - Extended CIGAR format can now be used via commandline through the --extcigar parameter (unlike before, where the code needed to be recompiled).
73 | - By default, GraphMap now uses only one gapped spaced index (previously, two were used by default; one could have been used by specifying the parsimonious mode). The defaults now are the ex parsimonious mode. To use two indexes, specify the parameter: --sensitive
74 | - The ```-w owler``` and ```-w overlapper``` have been moved. The alignment/owler mode is chosen as the first parameter in the commandline now (a "subprogram"; e.g. run ```graphmap owler```). To use the former ```-w overlapper```, specify ```-x overlap``` instead. This mode has now been simply converted to a composite parameter. There is also a command line parameter ```--overlapper``` which only controls the counting of hits in order to skip self-hits.
75 | - There is now a default E-value filter set at ```1e0```
76 | - There is now a default MAPQ filter set at ```1```
77 | - It is now possible to switch off extension of alignments to read ends (parameter: ```--no-end2end```).
78 | - If the index needs to be rebuilt, it can now be done using a single command line with parameter: ```--rebuild-index```
79 | 


--------------------------------------------------------------------------------
/doc/img/anchors-normal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-normal.png


--------------------------------------------------------------------------------
/doc/img/anchors-rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-rna.png


--------------------------------------------------------------------------------
/doc/img/region_selection-rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection-rna.png


--------------------------------------------------------------------------------
/doc/img/region_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection.png


--------------------------------------------------------------------------------
/doc/rnaseq.md:
--------------------------------------------------------------------------------
 1 | ## Mapping RNA-seq reads  
 2 | 
 3 | ### 1. Transcriptome mapping  
 4 | Since version 0.4.0, GraphMap has support for mapping reads to internally generated transcriptomes.  
 5 | These features are available on the ```master``` branch of the GraphMap repo.  
 6 | To use this feature, a ```GTF``` annotations file is needed alongside the reference ```FASTA```.  
 7 | The goal of this option is to simplify the process for end-users. The final alignments are also automatically transformed back to genomic coordinates, thus completely wrapping the entire process.
 8 | 
 9 | <!-- Depiction of how this is implemented is presented in [Figure 1](link). -->
10 | 
11 | To use this feature, simply specify ```--gtf annotations.gtf``` alongside the other command line parameters.  
12 | 
13 | **Acknowledgements**  
14 | Mile Šikić (MS) and Niranjan Nagarajan (NN) proposed the implementation of this approach for RNA-seq mapping. Ivan Krpelnik (IK) implemented the initial version of the transcriptome generator, as well as the conversion utility to convert from transcriptome space back to genome space. IK worked under guidance from Krešimir Križanović (KK) and Ivan Sović (IS). The new methods were embedded in existing GraphMap source code by IS. KK was/is working on evaluation and benchmarking of our RNA-seq methods. Mile Šikić (MS) supervised the project.
15 | 
16 | ### 2. Spliced alignments  
17 | Support for spliced alignments in GraphMap is a work in progress and currently experimental.  
18 | To activate this mode, specify ```-x rnaseq``` alongside the other command line parameters. This feature is available on the ```rna-alpha``` branch. Install and compile in the testing mode as such:  
19 | ```  
20 | git clone https://github.com/isovic/graphmap  
21 | cd graphmap  
22 | git checkout rna-alpha  
23 | make modules  
24 | make -j 4 testing  
25 | ```  
26 | 
27 | After this, run GraphMap using:  
28 | ```  
29 | bin/graphmap-not_release align -x rnaseq -r ref.fa -d reads.fastq -o out.sam  
30 | ```  
31 | 
32 | ***Please be aware*** that this feature is currently highly experimental. It is not production-ready. The implementation may vary significantly from version to version.
33 | <!-- This is a composite option which is equivalent to
34 | Equivalent to: ```--ambiguity 0.5 --secondary --min-bin-perc 0.01 --bin-step 0.99 --max-regions 20 --spliced``` -->
35 | 
36 | Here is a short description of the approach we are taking.
37 | 
38 | **2.1 Region selection**  
39 | The GraphMap [paper](http://www.nature.com/ncomms/2016/160415/ncomms11307/full/ncomms11307.html) describes the region selection process (first step in GraphMap). In short, an array of bins is constructed, where each bin represents a consecutive, non-overlapping region of the reference, where each bin is of size ```read_length / 3```. For each seed of an analyzed read, all hits on the reference are looked-up. For each hit, ```+1``` is added to a bin corresponding to region where the hit falls into. (If a seed has multiple hits in the same region, only one is counted).
40 | 
41 | <img src="img/region_selection.png" width="316" height="300" align="middle">
42 | 
43 | Regions are then sorted in descending order of their counts and further processed one by one. Before a region is processed, it is first extended on both ends (by ```read_length```) so that the entire read may fit inside after the alignment.  
44 | This approach was shown to be very well suited for detecting secondary alignments, as different regions which might contain similarly good alignments would be processed individually.  
45 | In the default alignment mode, this approach can produce pretty sensitive alignments.
46 | 
47 | Now consider mapping of RNA-seq reads. In this case, a read can actually be split into several distant regions across a chromosome. Should the same region selection strategy be applied, the bin counts would simply redistribute to different regions. This means that, should the exons have a few good seed hits, we could detect the correct regions and further process them to obtain the spliced alignments. Of course, noise hits will cause trouble (this will be addressed in continuation). For RNA-seq, regions are also sorted by their bin counts and further processed using the Graph Mapping and the LCSk steps.
48 | 
49 | <img src="img/region_selection-rna.png" width="316" height="300" align="middle">
50 | 
51 | **2.2. Graph Mapping and LCSk**  
52 | For each region, Graph Mapping is performed to obtain anchors (matches between the read and the region). Anchors are filtered using the LCSk method.  
53 | These steps are the same as in normal DNA mapping case.  
54 | 
55 | However, here we add a method of **clustering anchors** after they have been filtered. Clusters are obtained using the classic chaining approach which joins anchors that are close enough, and are nearly on the same diagonal.  
56 | Clusters then represent larger matching chunks between a read and a reference.  
57 | 
58 | For normal DNA mapping, one would ideally (in the absence of structural variants) expect to see one large cluster, such as shown below:  
59 | 
60 | <img src="img/anchors-normal.png" width="316" height="300" align="middle">
61 | 
62 | However, in case of RNA-seq mapping (or in presence of structural variants) such a graph might look like something closer to the following figure:  
63 | 
64 | <img src="img/anchors-rna.png" width="316" height="300" align="middle">
65 | 
66 | What's more, viewed in such way, a cluster actually can represent an *exon*!  
67 | Now, if we collect all clusters (some of them being possible repeats), we can use this information to create our spliced alignments!  
68 | 
69 | But, hold on. Since some exons can be separated by a large gap on the reference (much larger than the read), we need to consider other regions simultaneously.  
70 | 
71 | For this reason, all clusters (represented with their start and end coordinates in both the reference and the read) for all analyzed regions are first collected in a single list.  
72 | 
73 | Then, the **knapsack** algorithm is applied on the list of clusters.  
74 | 
75 | **2.3 Knapsack algorithm**  
76 | [Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem) is a problem of combinatorial optimization. Given a set of items, each with a weight and a value, the problem it tries to solve is "how to fill a knapsack with items so that the total weight is less than or equal to the given limit, and the value is as large as possible".
77 | 
78 | In our case:  
79 | - Knapsack is a *read*
80 | - Weight limit is the read length
81 | - An item is a cluster
82 | - Item weight is the length of the cluster in the read coordinate space  
83 | 
84 | Now, solving the knapsack problem would result in a list of clusters which fill the read the most. Alignment is then performed only on those clusters, and reported as separate SAM lines - one for each cluster (exon).  
85 | 
86 | **Acknowledgements**  
87 | Ivan Sović (IS) proposed this solution for mapping of RNA-seq reads based on the knapsack algorithm. The initial version of the knapsack algorithm was implemented by Antonio Jurić (AJ). It was embedded in existing GraphMap source code by IS. Krešimir Križanović (KK) was/is working on evaluation and benchmarking of our RNA-seq methods and helped guide AJ. Mile Šikić (MS) supervised the project.
88 | 


--------------------------------------------------------------------------------
/doc/sam_output.md:
--------------------------------------------------------------------------------
 1 | ### Details on the SAM output generated by GraphMap
 2 | 
 3 | Description of special tags in the SAM output:  
 4 | - **ZE** - The E-value. More accurately - a pessimistic approximation of the E-value obtained by rescoring the generated alignment with scores/penalties for which pre-calculated Gumbel parameters exist. Concretely, scores/penalties are: ```match = 5, mismatch = -4, gap_open = -8, gap_extend = -6```. By default, there is no threshold on the E-value so even weak homologies would be reported, but there is a parameter which provides this functionality (```-z```), e.g.: ```-z 1e0```.  
 5 | - **ZF** - An internal parameter for quality of alignment calculated using equation (8) in our preprint: (http://biorxiv.org/content/early/2015/06/10/020719). In GraphMap, potential regions for a read are sorted by this parameter, and the primary alignment is the one with the largest ZF value. ZF values for different reads are not mutually comparable.  
 6 | - **ZQ** - Query (read) length.
 7 | - **ZR** - Reference length.  
 8 | - **H0** - Specified by SAM format as the "number of perfect hits", GraphMap reports here the number of possible mapping positions with the same number of kmer hits.
 9 | - **NM** - Edit distance, specified by the SAM format.  
10 | - **AS** - Alignment score, specified by the SAM format.  
11 | 
12 | There are two hidden gems in GraphMap's output, providing more detailed reporting of the alignment process. Compiling GraphMap with ```make testing``` will generate a binary file on path ```bin/graphmap-not_release```. Running this version using parameter ```-b 3``` will generate a more verbose version of the SAM output file:
13 | - **X3** - A string containing very verbose information about the alignment of a particular read.
14 | - **X4** - Measurement of the CPU time spent on major parts of the algorithm, in a human-readable text format.  
15 | 


--------------------------------------------------------------------------------
/overlap.md:
--------------------------------------------------------------------------------
 1 | ## GraphMap Owler - Overlap With Long Erroneous Reads
 2 | GraphMap implements two overlap modes:  
 3 | - ```./graphmap owler``` - fast, uses a trimmed GraphMap pipeline, reports output in MHAP or PAF formats, and  
 4 | - ```./graphmap align -x overlap``` - full GraphMap pipeline including alignment, output in SAM format.  
 5 |   
 6 | Owler mode (Overlap With Long Erroneous Reads) skips the graph-mapping and alignment steps. The full pipeline consists of the following steps:  
 7 | 1. Construct a gapped spaced index of the reads for only one shape (6-mers, "1111110111111").  
 8 | 2. For a read, collect all gapped spaced seed hits.  
 9 | 3. LCSk++.  
10 | 4. Filtering seeds reported by LCSk++.  
11 | 5. Output overlaps in MHAP-like or PAF format. For details, see below.  
12 | 
13 | Currently, no seed hits are discarded, which can make overlapping slow on larger or more repetitive datasets, but very sensitive.  
14 | 
15 | Note that the overlappers are still experimental, and require thorough testing.  
16 | 
17 | ### Output formats
18 | **MHAP** format is described here: [http://mhap.readthedocs.org/en/latest/quickstart.html#output](http://mhap.readthedocs.org/en/latest/quickstart.html#output).  
19 | GraphMap's output uses the same columns, but the meaning of columns 3 and 4 (```Jaccard score``` and ```# shared min-mers``` respectively) is different in our context.  
20 | Instead of ```Jaccard score``` the fraction of bases covered by seeds is reported.  
21 | Instead of ```# shared min-mers``` the number of seeds which survived filtering is reported.  
22 | 
23 | GraphMap can also output overlaps to **PAF** format. Specification of the format can be found here: [https://github.com/lh3/miniasm/blob/master/PAF.md](https://github.com/lh3/miniasm/blob/master/PAF.md).  
24 | 
25 | ### Comparison to other methods  
26 | We are working on scripts to benchmark various overlapping tools on simulated and real (later) data.  
27 | An initial functioning version can be found here: [https://github.com/isovic/overlap-benchmark](https://github.com/isovic/overlap-benchmark).  
28 | 
29 | ### Examples   
30 | ```  
31 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast):  
32 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.mhap  
33 | 
34 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in PAF format:  
35 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.paf -L paf  
36 | 
37 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow):  
38 | ./graphmap align -x overlap -r reads.fa -d reads.fa -o overlaps.sam  
39 | ```  
40 | 


--------------------------------------------------------------------------------
/reproducibility/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/reproducibility/README.md


--------------------------------------------------------------------------------
/reproducibility/run.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python
 2 | 
 3 | import os;
 4 | import sys;
 5 | import subprocess;
 6 | 
 7 | def execute_command(command):
 8 | 	sys.stderr.write('Executing command: %s\n' % (command));
 9 | 	subprocess.call(command, shell=True);
10 | 
def run_simulations():
	"""Run the aligneval alignment and evaluation scripts, then copy the CSV results.

	Progress messages are written to stderr between the three shell commands,
	in the same order and with the same text as before.
	"""
	log = sys.stderr.write;

	intro_lines = (
		'Starting the alignment process on simulated data.\n',
		'Note that this might take a very long time.\n',
		'E.g. BLAST took 110670 CPU secs in our tests on hg19_chr3 Oxford Nanopore 2d simulated dataset.\n',
	);
	for line in intro_lines:
		log(line);

	# Step 1: alignment.
	execute_command('aligneval/run-alignment.py');
	for line in ('\n', 'Alignment script returned.\n', '\n'):
		log(line);

	# Step 2: evaluation.
	log('Running the evaluation script.\n');
	execute_command('aligneval/run-evaluation.py');
	log('\n');

	# Step 3: collect the results.
	log('Copying the results to reproducibility/results-simulated folder.\n');
	execute_command('cp aligneval/results/*.csv results-simulated');

	for line in ('Done!\n', '\n'):
		log(line);
30 | 
def main():
	"""Command line entry point for the reproducibility runner.

	Checks that the 'samscripts' and 'aligneval' folders exist in the current
	directory, then dispatches on the first command line argument. Only the
	'sim' subcommand is recognised.

	Always terminates through sys.exit():
		0 - usage was printed, or the 'sim' subcommand was dispatched;
		1 - dependencies are missing, or the subcommand is unknown.
	"""
	if (not os.path.exists('samscripts') or not os.path.exists('aligneval')):
		sys.stderr.write('Please run setup.py first, to install all dependencies. Exiting.\n');
		sys.exit(1);

	if (len(sys.argv) < 2):
		sys.stderr.write('Run the alignment and evaluation processes from the GraphMap preprint paper.\n');
		sys.stderr.write('Usage:\n');
		sys.stderr.write('\tsim - Runs alignment on all simulation datasets. This might take quite a while to execute.\n');
		sys.exit(0);

	if (sys.argv[1] == 'sim'):
		# 'sim' takes no further arguments; anything extra prints its usage.
		if (len(sys.argv) != 2):
			sys.stderr.write('Runs alignment on all simulation datasets. This might take quite a while to execute.\n');
			sys.stderr.write('Usage:\n');
			sys.stderr.write('\t%s %s\n' % (sys.argv[0], sys.argv[1]));
			sys.exit(0);

		run_simulations();
		sys.exit(0);

	# An unrecognised subcommand is an error, so report it with a non-zero
	# status (previously this path exited with 0, hiding the failure from
	# calling scripts).
	sys.stderr.write('ERROR: Unknown subcommand!\n');
	sys.exit(1);

if __name__ == "__main__":
	main();
58 | 


--------------------------------------------------------------------------------
/reproducibility/setup.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python
 2 | 
 3 | import os;
 4 | import sys;
 5 | import subprocess;
 6 | 
 7 | def execute_command(command):
 8 | 	sys.stderr.write('Executing command: %s\n' % (command));
 9 | 	subprocess.call(command, shell=True);
10 | 
def main():
	"""Fetch the helper repositories and create the working folders.

	Clones 'samscripts' and 'aligneval' into the current directory when they
	are not already present, runs aligneval's own setup script after cloning
	it, and creates the data/results folders that do not exist yet.
	"""
	if (not os.path.exists('samscripts')):
		execute_command('git clone https://github.com/isovic/samscripts.git');

	if (not os.path.exists('aligneval')):
		execute_command('git clone https://github.com/isovic/aligneval.git');
		# Use '&&' rather than ';': if the clone failed and 'cd aligneval'
		# cannot succeed, './setup.py all' must NOT run from the current
		# directory, where it would pick up whatever setup.py lives there
		# (possibly this very script).
		execute_command('cd aligneval && ./setup.py all');

	folders_to_generate = ['data/reads', 'data/reference', 'results-simulated', 'results-real'];
	for folder_to_generate in folders_to_generate:
		if (not os.path.exists(folder_to_generate)):
			os.makedirs(folder_to_generate);

if __name__ == "__main__":
	main();
26 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * aligner_base.h
 3 |  *
 4 |  *  Created on: Jan 7, 2017
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_ALIGNER_ALIGNER_BASE_H_
 9 | #define SRC_ALIGNER_ALIGNER_BASE_H_
10 | 
11 | #include <memory>
12 | #include <vector>
13 | #include "aligner_containers.h"
14 | #include "pairwise_penalties.h"
15 | 
16 | namespace is {
17 | 
18 | class AlignerBase {
19 |  public:
20 |   virtual ~AlignerBase() { }
21 | 
22 |   // virtual AlignmentReturnValue Align(const char* q, int64_t qlen, const char* t, int64_t tlen, AlignmentType type) = 0;    // Selects the alignment mode based on a parameter.
23 | 
24 |   virtual AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type) = 0;      // Global alignment mode.
25 | 
26 |   virtual AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0;       // Local alignment mode.
27 | 
28 |   virtual AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0;  // Semiglobal alignment mode.
29 | 
30 |   virtual AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen,     // Extend alignment mode. Does not necessarily
31 |                                       int32_t bandwidth, int32_t zdrop) = 0;                              //  produce CIGAR,but generate max alignment coords
32 | 
33 |   virtual std::shared_ptr<AlignmentResult> getResults() = 0;
34 | 
35 | };
36 | 
37 | } /* namespace is */
38 | 
39 | #endif /* SRC_ALIGNER_ALIGNER_BASE_H_ */
40 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_containers.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * aligner_containers.h
  3 |  *
  4 |  *  Created on: Jan 7, 2017
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef SRC_CONTAINERS_H_
  9 | #define SRC_CONTAINERS_H_
 10 | 
 11 | #include <stdint.h>
 12 | #include <limits>
 13 | #include <string>
 14 | #include <sstream>
 15 | 
 16 | #include "sam_parser.h"
 17 | 
 18 | namespace is {
 19 | 
 20 | static constexpr int64_t LARGE_NEGATIVE_INT64 = std::numeric_limits<int64_t>::min() + 10000;
 21 | 
 22 | enum class AlignmentReturnValue { // Strongly typed enum, C++11 feature.
 23 |   OK,                   // Everything went ok.
 24 |   Suboptimal,            // Alignment stepped out of defined band. Result is not optimal.
 25 |   InvalidOptions,       // In case parameters of values are invalid.
 26 |   QlenIsZero,
 27 |   TlenIsZero,
 28 |   WrongEditDist,
 29 |   AlignmentNotPerformed,  // A default value for an alignment which wasn't performed.
 30 |   NotImplementedYet     // For features in development.
 31 | };
 32 | 
 33 | enum class AlignmentType {  // Strongly typed enum, C++11 feature.
 34 |   Global,
 35 |   Local
 36 | };
 37 | 
 38 | class AlignmentPosition {
 39 |  public:
 40 |   AlignmentPosition() : qstart(0), qend(0), tstart(0), tend(0) { }
 41 |   AlignmentPosition(int64_t _qstart, int64_t _qend, int64_t _tstart, int64_t _tend) :
 42 |                     qstart(_qstart), qend(_qend), tstart(_tstart), tend(_tend) { }
 43 |   AlignmentPosition(const AlignmentPosition& op) :
 44 |                     AlignmentPosition(op.qstart, op.qend, op.tstart, op.tend) { }
 45 |   AlignmentPosition& operator=(const AlignmentPosition& op) {
 46 |     qstart = op.qstart;
 47 |     qend = op.qend;
 48 |     tstart = op.tstart;
 49 |     tend = op.tend;
 50 |     return *this;
 51 |   }
 52 | 
 53 |   int64_t qstart, qend;              // Query and target alignment start and end positions. End position
 54 |   int64_t tstart, tend;              // is inclusive (the position of the last base).
 55 | };
 56 | 
 57 | class AlignmentResult {
 58 |  public:
 59 |   AlignmentResult() : score(0), edit_dist(0), position(),
 60 |                       max_score(LARGE_NEGATIVE_INT64),
 61 |                       max_q_pos(-1),
 62 |                       max_t_pos(-1), k(-1), rv(AlignmentReturnValue::AlignmentNotPerformed) {
 63 |   }
 64 | 
 65 |   AlignmentResult(const AlignmentResult& op) :
 66 |     score(op.score), edit_dist(op.edit_dist),
 67 |     position(op.position), cigar(op.cigar),
 68 |     max_score(op.max_score), max_q_pos(op.max_q_pos),
 69 |     max_t_pos(op.max_t_pos),
 70 |     k(op.k), rv(op.rv) { // Copy constructor.
 71 |   }
 72 | 
 73 |   ~AlignmentResult() { };
 74 | 
 75 |   AlignmentResult& operator=(const AlignmentResult& op) {
 76 |     score = op.score;
 77 |     edit_dist = op.edit_dist;
 78 |     position = op.position;
 79 |     cigar = op.cigar;
 80 |     max_score = op.max_score;
 81 |     max_q_pos = op.max_q_pos;
 82 |     max_t_pos = op.max_t_pos;
 83 |     k = op.k;
 84 |     rv = op.rv;
 85 |     return *this;
 86 |   }
 87 | 
 88 |   // Alignment results.
 89 |   int64_t score;
 90 |   int64_t edit_dist;
 91 |   is::AlignmentPosition position;                   // There can be multiple alignments with the same score.
 92 |                                                     // Only the first position and the corresponding alignment
 93 |   std::vector<is::CigarOp> cigar;                   // are reported
 94 |   int64_t max_score, max_q_pos, max_t_pos;          // Maximum score in the alignment, and the coordinates on query and target.
 95 |   int64_t k;                                        // Value of band k used in the final alignment.
 96 |   AlignmentReturnValue rv;                          // Return value of the aligner.
 97 | };
 98 | 
 99 | // If any global margin is true, then the corresponding will be penalized.
100 | // Concretely, if top/left are true, then the first row/column will be initialized
101 | // to the multiple of the gap extend penalty in global alignment.
102 | // If bottom is false, the maximum of last row will be found instead of taking
103 | // the bottom right corner for global alignment.
104 | // If right is false, the maximum of last column will be found instead of taking
105 | // the bottom right corner for global alignment.
106 | class GlobalMargins {
107 |  public:
108 |   GlobalMargins()
109 |       : top(true),
110 |         left(true),
111 |         bottom(true),
112 |         right(true) {
113 |   }
114 |   GlobalMargins(bool _top, bool _left, bool _bottom, bool _right)
115 |       : top(_top),
116 |         left(_left),
117 |         bottom(_bottom),
118 |         right(_right) {
119 |   }
120 |   bool top, left, bottom, right;
121 | };
122 | 
123 | class AlignmentOptions {
124 |  public:
125 |   AlignmentOptions() :  k(-1),
126 |                         do_traceback(true) {
127 |   }
128 | 
129 |   int32_t k;                // Band for banded alignment. If < 0, banded alignment is turned off.
130 |   bool do_traceback;        // If traceback is not needed, then there is no need to alocate a large
131 |                             // matrix to store directions.
132 |   GlobalMargins gm;
133 | };
134 | 
135 | } /* namespace is */
136 | 
137 | 
138 | 
139 | #endif /* SRC_CONTAINERS_H_ */
140 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_ksw2.cc:
--------------------------------------------------------------------------------
  1 | #include "aligner_ksw2.h"
  2 | 
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <unistd.h>
  6 | #include <zlib.h>
  7 | // #include "ksw2/kseq.h"
  8 | #include "aligner_util.hpp"
  9 | #include <iostream>
 10 | 
 11 | // KSEQ_INIT(gzFile, gzread)
 12 | 
 13 | namespace is {
 14 | 
 15 | uint8_t seq_nt4_table[256] = {
 16 | 	0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 17 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 18 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 19 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 20 | 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
 21 | 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 22 | 	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
 23 | 	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 24 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 25 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 26 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 27 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 28 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 29 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 30 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
 31 | 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
 32 | };
 33 | 
 34 | std::shared_ptr<AlignerBase> createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) {
 35 |   return std::shared_ptr<AlignerBase>(new AlignerKSW2(p, opt));
 36 | }
 37 | 
 38 | static void print_aln(const char *tname, const char *qname, ksw_extz_t *ez)
 39 | {
 40 | 	printf("%s\t%s\t%d", tname, qname, ez->score);
 41 | 	printf("\t%d\t%d\t%d", ez->max, ez->max_t, ez->max_q);
 42 | 	if (ez->n_cigar > 0) {
 43 | 		int i;
 44 | 		putchar('\t');
 45 | 		for (i = 0; i < ez->n_cigar; ++i)
 46 | 			printf("%d%c", ez->cigar[i]>>4, "MID"[ez->cigar[i]&0xf]);
 47 | 	}
 48 | 	putchar('\n');
 49 | }
 50 | 
 51 | 
 52 | AlignerKSW2::AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) : p_(p), opt_(opt), result_(nullptr) {
 53 | 
 54 | }
 55 | 
 56 | AlignerKSW2::~AlignerKSW2() {
 57 | 
 58 | }
 59 | 
 60 | is::AlignmentReturnValue AlignerKSW2::Global(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, bool type) {
 61 | 	void *km = 0;
 62 | 	ksw_extz_t ez;  // Alignment result.
 63 | 	int w = -1, flag = 0, zdrop = -1;
 64 | 
 65 |   #ifdef HAVE_KALLOC
 66 |     km = km_init();
 67 |   #endif
 68 | 
 69 | 	memset(&ez, 0, sizeof(ksw_extz_t));
 70 | 
 71 |   auto mat = GenerateSimpleMatchMatrix<int8_t>((int8_t) p_.match, (int8_t) p_.mismatch, 5);
 72 |   // In GraphMap definition, penalties are negative. KSW2 expects positive values.
 73 |   int8_t q = -p_.w[0].v;  // Gap open. The intercept component of the affine function.
 74 |   int8_t e = -p_.w[0].u;  // Gap extend. The slope of the affine function.
 75 |   int8_t q2 = -p_.w[1].v;
 76 |   int8_t e2 = -p_.w[1].u;
 77 | 
 78 |   KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, w, zdrop, flag, &ez, type);
 79 | 
 80 |   // print_aln("Query", "Target", &ez);
 81 | 
 82 |   result_ = std::shared_ptr<is::AlignmentResult>(new is::AlignmentResult);
 83 |   result_->score = ez.score;
 84 |   result_->position = is::AlignmentPosition(0, qlen, 0, tlen);
 85 |   result_->k = -1;
 86 |   result_->rv = is::AlignmentReturnValue::OK;
 87 | 
 88 |   result_->cigar.clear();
 89 |   std::vector<is::CigarOp> basic_cigar;
 90 |   for (size_t i=0; i<ez.n_cigar; i++) {
 91 |     basic_cigar.push_back(is::CigarOp("MIDN"[ez.cigar[i]&0xf], ez.cigar[i]>>4));
 92 |   }
 93 |   result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar);
 94 | 
 95 |   result_->edit_dist = EditDistFromExtCIGAR(result_->cigar);
 96 | 
 97 |   // printf ("Converted CIGAR:\n");
 98 |   // for (size_t i=0; i<result_->cigar.size(); i++) {
 99 |   //   printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op);
100 |   // }
101 |   // printf ("\n");
102 |   // printf ("Edit distance: %ld\n", result_->edit_dist);
103 | 
104 |   kfree(km, ez.cigar);
105 |   #ifdef HAVE_KALLOC
106 |     km_destroy(km);
107 |   #endif
108 | 
109 |   return is::AlignmentReturnValue::OK;
110 | }
111 | 
112 | is::AlignmentReturnValue AlignerKSW2::Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop) {
113 |   result_ = std::shared_ptr<is::AlignmentResult>(new is::AlignmentResult);
114 | 
115 |   if (qseq == NULL || tseq == NULL || qlen <= 0 || tlen <= 0) {
116 |     return is::AlignmentReturnValue::InvalidOptions;
117 |   }
118 | 
119 | 	void *km = 0;
120 | 	ksw_extz_t ez;  // Alignment result.
121 | 	int flag = KSW_EZ_SCORE_ONLY | KSW_EZ_EXTZ_ONLY;
122 | 
123 |   #ifdef HAVE_KALLOC
124 |     km = km_init();
125 |   #endif
126 | 
127 | 	memset(&ez, 0, sizeof(ksw_extz_t));
128 | 
129 |   auto mat = GenerateSimpleMatchMatrix<int8_t>((int8_t) p_.match, (int8_t) p_.mismatch, 5);
130 |   // In GraphMap definition, penalties are negative. KSW2 expects positive values for affine pieces.
131 |   int8_t q = -p_.w[0].v;  // Gap open. The intercept component of the affine function.
132 |   int8_t e = -p_.w[0].u;  // Gap extend. The slope of the affine function.
133 |   int8_t q2 = -p_.w[1].v;
134 |   int8_t e2 = -p_.w[1].u;
135 | 
136 |   KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, bandwidth, zdrop, flag, &ez, true);
137 | 
138 |   // print_aln("Query", "Target", &ez);
139 | 
140 |   result_->score = ez.score;
141 |   result_->position = is::AlignmentPosition(0, qlen, 0, tlen);
142 |   result_->k = -1;
143 |   result_->rv = is::AlignmentReturnValue::OK;
144 |   result_->max_score = ez.max;
145 |   result_->max_q_pos = ez.max_q;
146 |   result_->max_t_pos = ez.max_t;
147 | 
148 |   result_->cigar.clear();
149 |   std::vector<is::CigarOp> basic_cigar;
150 |   for (size_t i=0; i<ez.n_cigar; i++) {
151 |     basic_cigar.push_back(is::CigarOp("MID"[ez.cigar[i]&0xf], ez.cigar[i]>>4));
152 |   }
153 |   result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar);
154 | 
155 |   result_->edit_dist = EditDistFromExtCIGAR(result_->cigar);
156 | 
157 |   // printf ("Converted CIGAR:\n");
158 |   // for (size_t i=0; i<result_->cigar.size(); i++) {
159 |   //   printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op);
160 |   // }
161 |   // printf ("\n");
162 |   // printf ("Edit distance: %ld\n", result_->edit_dist);
163 | 
164 |   kfree(km, ez.cigar);
165 |   #ifdef HAVE_KALLOC
166 |     km_destroy(km);
167 |   #endif
168 | 
169 |   return is::AlignmentReturnValue::OK;
170 | }
171 | 
172 | is::AlignmentReturnValue AlignerKSW2::Local(const char* q, int64_t qlen, const char* t, int64_t tlen) {
173 |   return is::AlignmentReturnValue::NotImplementedYet;
174 | }
175 | 
176 | is::AlignmentReturnValue AlignerKSW2::Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) {
177 |   return is::AlignmentReturnValue::NotImplementedYet;
178 | }
179 | 
180 | std::shared_ptr<is::AlignmentResult> AlignerKSW2::getResults() {
181 |   return result_;
182 | }
183 | 
184 | void AlignerKSW2::KSW2GlobalAlnWrapper_(void *km,
185 |                        const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen,
186 |                        int8_t m, const int8_t *mat,
187 |                        int8_t q, int8_t e, int8_t q2, int8_t e2,
188 |                        int w, int zdrop, int flag, ksw_extz_t *ez, bool type) {
189 | 	int i;
190 | 	ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1;
191 | 	ez->max = 0, ez->mqe = ez->mte = KSW_NEG_INF;
192 | 	ez->n_cigar = 0;
193 | 
194 |   auto qseq = ConvertSeqAlphabet(qseq_, qlen, &seq_nt4_table[0]);
195 |   auto tseq = ConvertSeqAlphabet(tseq_, tlen, &seq_nt4_table[0]);
196 | 
197 |   if (type) {
198 | 		ksw_extd2_sse(km, qlen, (const uint8_t*) &qseq[0],
199 | 	                tlen, (const uint8_t*) &tseq[0],
200 | 	                m, mat, q, e, q2, e2, w, zdrop, flag, ez);
201 |   } else {
202 | 	  	int noncan = 9;
203 | 	  	q = 4;
204 | 	  	e = 2;
205 | 	  	q2 = 32;
206 | 	  	zdrop = 200;
207 | 	  	flag = 1600;
208 | 
209 | 		ksw_exts2_sse(km, qlen, (const uint8_t*) &qseq[0],
210 | 	                tlen, (const uint8_t*) &tseq[0],
211 | 	                m, mat, q, e, q2, noncan, zdrop, flag, ez);
212 |   }
213 | 
214 |   // const char *algo = "extd2_sse";
215 | 	// if (strcmp(algo, "extz2_sse") == 0)   ksw_extz2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, w, zdrop, flag, ez);
216 | 	// else if (strcmp(algo, "extd2_sse") == 0)   ksw_extd2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, q2, e2, w, zdrop, flag, ez);
217 | 	// // else if (strcmp(algo, "extf2_sse") == 0)   ksw_extf2_sse(km, qlen, (uint8_t*)qseq, tlen, (uint8_t*)tseq, mat[0], mat[1], e, w, zdrop, ez);
218 | 	// else {
219 | 	// 	fprintf(stderr, "ERROR: can't find algorithm '%s'\n", algo);
220 | 	// 	exit(1);
221 | 	// }
222 | }
223 | 
224 | }
225 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_ksw2.h:
--------------------------------------------------------------------------------
/*
 * aligner_ksw2.h
 *
 *  Created on: Jan 7, 2017
 *      Author: isovic
 */
 7 | 
 8 | #ifndef SRC_ALIGNER_ALIGNER_KSW2_H_
 9 | #define SRC_ALIGNER_ALIGNER_KSW2_H_
10 | 
11 | #include <memory>
12 | #include <vector>
13 | #include "aligner_base.h"
14 | #include "aligner_containers.h"
15 | #include "pairwise_penalties.h"
16 | #include "aligner_util.hpp"
17 | #include "ksw2/ksw2.h"
18 | 
19 | namespace is {
20 | 
21 | class AlignerKSW2;
22 | 
23 | std::shared_ptr<AlignerBase> createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt);
24 | 
25 | class AlignerKSW2 : public AlignerBase {
26 |  public:
27 |   friend std::shared_ptr<AlignerBase> createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt);
28 | 
29 |   ~AlignerKSW2();
30 | 
31 |   AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type);   // Global alignment mode.
32 | 
33 |   AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen);    // Local alignment mode.
34 | 
35 |   AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen);   // Semiglobal alignment mode.
36 | 
37 |   AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop);
38 | 
39 |   std::shared_ptr<AlignmentResult> getResults();
40 | 
41 |  protected:
42 |   AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt);   // We don't want users attempting to instantiate manually, even though the class is virtual.
43 | 
44 |  private:
45 |   AlignerKSW2(const AlignerKSW2&) = delete;                       // No copying.
46 |   AlignerKSW2& operator=(const AlignerKSW2&) = delete;            // No copying.
47 |   AlignerKSW2(AlignerKSW2&&) = delete;                            // No move constructor.
48 |   AlignerKSW2& operator=(const AlignerKSW2&&) = delete;           // No copying.
49 | 
50 |   void KSW2GlobalAlnWrapper_(void *km,
51 |                         const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen,
52 |                         int8_t m, const int8_t *mat,
53 |                         int8_t q, int8_t e, int8_t q2, int8_t e2,
54 |                         int w, int zdrop, int flag, ksw_extz_t *ez, bool type);
55 | 
56 |   const is::PiecewisePenalties& p_;
57 |   const is::AlignmentOptions& opt_;
58 |   std::shared_ptr<is::AlignmentResult> result_;
59 | };
60 | 
61 | } /* namespace is */
62 | 
63 | #endif /* SRC_ALIGNER_ALIGNER_BASE_H_ */
64 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_util.cc:
--------------------------------------------------------------------------------
  1 | #include "aligner_util.hpp"
  2 | #include "assert.h"
  3 | 
  4 | #include <sstream>
  5 | #include <string>
  6 | 
  7 | namespace is {
  8 | 
  9 | std::vector<int8_t> ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table) {
 10 |   std::vector<int8_t> ret(seqlen + 33); // 32 for gaba
 11 |   for (size_t i=0; i<seqlen; i++) {
 12 |     ret[i] = (int8_t) conv_table[(uint8_t) seq[i]];
 13 |   }
 14 |   return ret;
 15 | }
 16 | 
 17 | std::vector<is::CigarOp> ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen,
 18 |                                                 const char* tseq, int64_t tlen,
 19 |                                                 const std::vector<is::CigarOp>& basic_cigar) {
 20 |   std::vector<is::CigarOp> ret;
 21 | 
 22 |   int64_t qpos = 0, tpos = 0;
 23 |   for (size_t i=0; i<basic_cigar.size(); i++) {
 24 |     char op = basic_cigar[i].op;
 25 |     int64_t count = basic_cigar[i].count;
 26 | 
 27 |     if (op != 'M') {
 28 |       ret.push_back(basic_cigar[i]);
 29 | 
 30 |       if (op == 'I' || op == 'S') {
 31 |         qpos += count;
 32 |       }
 33 |       if (op == 'D' || op == 'N') {
 34 |         tpos += count;
 35 |       }
 36 |     } else {
 37 |       char prev_m = 0;
 38 |       int64_t curr_count = 0;
 39 |       for (int64_t j=0; j<count; j++) {
 40 |         char curr_m = (qseq[qpos] == tseq[tpos]) ? '=' : 'X';
 41 |         if (j == 0) { prev_m = curr_m; }
 42 |         if (curr_m == prev_m) {
 43 |           curr_count += 1;
 44 |         } else {
 45 |           ret.push_back(is::CigarOp(prev_m, curr_count));
 46 |           prev_m = curr_m;
 47 |           curr_count = 1;
 48 |         }
 49 |         qpos += 1;
 50 |         tpos += 1;
 51 |       }
 52 |       if (curr_count > 0) {
 53 |         ret.push_back(is::CigarOp(prev_m, curr_count));
 54 |       }
 55 |     }
 56 |   }
 57 | 
 58 |   return ret;
 59 | }
 60 | 
 61 | int64_t EditDistFromExtCIGAR(const std::vector<is::CigarOp>& extended_cigar) {
 62 |   int64_t edit_dist = 0;
 63 |   for (size_t i=0; i<extended_cigar.size(); i++) {
 64 |     char op = extended_cigar[i].op;
 65 |     assert(op != 'M');
 66 |     if (op == 'X' || op == 'I' || op == 'D') {
 67 |       edit_dist += extended_cigar[i].count;
 68 |     }
 69 |   }
 70 |   return edit_dist;
 71 | }
 72 | 
 73 | std::vector<is::CigarOp> ExtractCigarBetweenQueryCoords(const std::vector<is::CigarOp>& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q) {
 74 |   std::vector<is::CigarOp> ret;
 75 | 
 76 |   int64_t qpos = 0;
 77 | 
 78 |   int lengthOfRef = 0;
 79 |   int lengthOfRead = 0;
 80 | 
 81 |   for (auto& c: cigar) {
 82 | 
 83 |     int64_t qpos_next = (c.op == 'M' || c.op == '=' || c.op == 'X' || c.op == 'I' || c.op == 'S') ? (qpos + c.count) : qpos;
 84 | 
 85 |     if (qpos > qend) { break; }
 86 | 
 87 |     if (qpos_next < qstart) {
 88 |       qpos = qpos_next;
 89 |       continue;
 90 |     }
 91 | 
 92 |     int64_t b = 0, e = c.count;
 93 | 
 94 |     if (qstart >= qpos && qstart < qpos_next) { b = qstart - qpos; }
 95 |     if (qend >= qpos && qend < qpos_next) { e = qend - qpos; }
 96 | 
 97 |     if ((e - b) > 0) {
 98 |       ret.emplace_back(is::CigarOp(c.op, (e - b)));
 99 | 
100 |       if (c.op != 'I') {
101 |     	  	  lengthOfRef += (e - b);
102 |       }
103 |       if(c.op != 'D' && c.op != 'N') {
104 |     	  	  lengthOfRead += (e - b);
105 |       }
106 |     }
107 | 
108 |     qpos = qpos_next;
109 |   }
110 | 
111 |   *cigar_length = lengthOfRef;
112 |   *cigar_length_q = lengthOfRead;
113 | 
114 |   return ret;
115 | }
116 | 
117 | std::string CigarToString(const std::vector<is::CigarOp>& cigar) {
118 |   std::stringstream ss;
119 |   for (size_t i=0; i<cigar.size(); i++) {
120 |     ss << cigar[i].count << cigar[i].op;
121 |   }
122 |   return ss.str();
123 | }
124 | 
125 | }
126 | 


--------------------------------------------------------------------------------
/src/aligner/aligner_util.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SRC_ALIGNER_ALIGNER_UTIL_H_
 2 | #define SRC_ALIGNER_ALIGNER_UTIL_H_
 3 | 
 4 | #include <stdint.h>
 5 | #include <stdlib.h>
 6 | #include <vector>
 7 | #include "sam_parser.h"
 8 | 
 9 | namespace is {
10 | 
// Builds a square, row-major substitution matrix of alphabet_size x alphabet_size.
//
// The last symbol of the alphabet is treated as the ambiguous base ('N'):
//  - diagonal entries of all other symbols get `match`,
//  - off-diagonal entries between other symbols get `mismatch`,
//  - the last row and last column are 0, except for the bottom right corner,
//    which keeps `mismatch` so that 'N' does not match itself.
//
// An alphabet_size of 0 yields an empty matrix.
template<typename T>
std::vector<T> GenerateSimpleMatchMatrix(T match, T mismatch, size_t alphabet_size) {
  // Guard the `alphabet_size - 1` below against unsigned wrap-around.
  if (alphabet_size == 0) {
    return std::vector<T>();
  }

  std::vector<T> matrix(alphabet_size * alphabet_size, mismatch);  // Set the mismatch score.

  // Goes to "-1" to allow for 'N' bases which should not match to themselves.
  const size_t last = alphabet_size - 1;
  for (size_t i = 0; i < last; i++) {
    matrix[i * alphabet_size + i] = match;    // Set the match score.
    matrix[i * alphabet_size + last] = 0;     // Reset the last column to 0.
    matrix[last * alphabet_size + i] = 0;     // Reset the last row to 0.
  }
  return matrix;
}
22 | 
23 | std::vector<int8_t> ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table);
24 | 
25 | std::vector<is::CigarOp> ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen,
26 |                                                 const char* tseq, int64_t tlen,
27 |                                                 const std::vector<is::CigarOp>& basic_cigar);
28 | 
29 | int64_t EditDistFromExtCIGAR(const std::vector<is::CigarOp>& extended_cigar);
30 | 
31 | std::vector<is::CigarOp> ExtractCigarBetweenQueryCoords(const std::vector<is::CigarOp>& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q);
32 | 
33 | std::string CigarToString(const std::vector<is::CigarOp>& cigar);
34 | 
35 | }
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/src/aligner/anchor_aligner.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * anchor_aligner.h
 3 |  *
 4 |  *  Created on: Aug 23, 2017
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_ANCHOR_ALIGNER_H_
 9 | #define SRC_ANCHOR_ALIGNER_H_
10 | 
11 | #include <memory>
12 | #include "aligner_base.h"
13 | #include "containers/results.h"
14 | 
15 | #include <deque>
16 | #include <stack>
17 | 
18 | namespace is {
19 | 
20 | class AnchorAligner;
21 | 
22 | std::shared_ptr<AnchorAligner> createAnchorAligner(std::shared_ptr<is::AlignerBase> aligner);
23 | 
// A pair of intervals: [qstart, qend] on the query and [rstart, rend] on the
// reference. All four coordinates default to 0.
class AlignmentAnchor {
 public:
  AlignmentAnchor() : AlignmentAnchor(0, 0, 0, 0) { }

  AlignmentAnchor(int64_t _qstart, int64_t _qend,
                  int64_t _rstart, int64_t _rend)
      : qstart(_qstart),
        qend(_qend),
        rstart(_rstart),
        rend(_rend) { }

  int64_t qstart, qend;   // Query coordinates.
  int64_t rstart, rend;   // Reference coordinates.
};
34 | 
35 | class AnchorAligner {
36 |  public:
37 |   friend std::shared_ptr<AnchorAligner> createAnchorAligner(std::shared_ptr<is::AlignerBase> aligner);
38 | 
39 |   ~AnchorAligner();
40 | 
41 |   std::shared_ptr<AlignmentResult> CreateAlignmentResult(int64_t qstart, int64_t qend, int64_t rstart, int64_t rend, std::vector<is::CigarOp> rez);
42 | 
43 |   double AlignEdges(const char *query, const char *ref, int leftRef, int rightRef, int64_t start_position_read, int64_t start_position_ref, int number_of_bases,  std::stack<is::CigarOp> cigar_stack, std::deque<is::CigarOp> cigar_queue);
44 |   void AdjustEnds(int left_offset_ref, int right_offset_ref, const char *query, const char *ref, int64_t *start_position_ref, int64_t *start_position_read, int number_of_bases, std::stack<is::CigarOp> *cigar_stack, std::deque<is::CigarOp> *cigar_queue, bool type);
45 | 
46 |   /* Sorts anchors and then performs global alignment between the minimum and maximum anchor coordinates.
47 |   */
48 |   std::shared_ptr<AlignmentResult> GlobalEndToEnd(int64_t abs_ref_id, std::shared_ptr<is::MinimizerIndex> index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector<AlignmentAnchor>& anchors);
49 | 
50 |   /* Sorts the anchors, and aligns every neighboring pair of anchors. It does not extend beyond
51 |      the ends of the first and last anchor.
52 |   */
53 |   std::shared_ptr<AlignmentResult> GlobalAnchored(int64_t abs_ref_id, std::shared_ptr<is::MinimizerIndex> index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector<AlignmentAnchor>& anchors, bool type);
54 |   std::shared_ptr<AlignmentResult> GlobalAnchoredWithClipping(const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector<AlignmentAnchor>& anchors);
55 | 
56 |   /* Sorts the anchors, and aligns every neighboring pair of anchors. This extends alignments beyond
57 |      the ends of the first and last anchor in an attempt to produce end-to-end alignment.
58 |   */
59 |   std::shared_ptr<AlignmentResult> GlobalAnchoredWithExtend(int64_t abs_ref_id, std::shared_ptr<is::MinimizerIndex> index, const char *query, int64_t qlen, const char *ref, int64_t rlen,
60 |                                                             const std::vector<AlignmentAnchor>& anchors, int32_t bandwidth, int32_t zdrop, bool type);
61 | 
62 |  private:
63 |   AnchorAligner(const AnchorAligner&) = delete;
64 |   AnchorAligner& operator=(const AnchorAligner&) = delete;
65 | 
66 |   AnchorAligner(std::shared_ptr<is::AlignerBase> aligner);
67 | 
68 |   const std::shared_ptr<is::AlignerBase> aligner_;
69 | };
70 | 
71 | }
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/src/aligner/pairwise_penalties.h:
--------------------------------------------------------------------------------
 1 | #ifndef SRC_ALIGNER_PAIRWISE_PENALTIES_H_
 2 | #define SRC_ALIGNER_PAIRWISE_PENALTIES_H_
 3 | 
#include <stdint.h>
#include <sstream>
#include <string>
#include <vector>
 6 | 
 7 | namespace is {
 8 | 
 9 | /* Regular alignment penalties for a single piece Gotoh alignment.
10 | */
11 | class Penalties {
12 |  public:
13 |     Penalties() : match(5), mismatch(-4), gapopen(-8), gapext(-6) { }
14 |     Penalties(int32_t _match, int32_t _mismatch, int32_t _gapopen, int32_t _gapext) :
15 |               match(_match), mismatch(_mismatch), gapopen(_gapopen), gapext(_gapext) { }
16 |     int32_t match, mismatch, gapopen, gapext;
17 | };
18 | 
/* A helper class for a linear function. Used for piecewise Gotoh alignment.
   Defaults to u = -6, v = -8.
*/
class AffinePiece {
 public:
  AffinePiece() : AffinePiece(-6.0, -8.0) { }
  AffinePiece(float _u, float _v) : u(_u), v(_v) { }

  // Evaluates the line at k: w(k) = u * k + v.
  inline float calc(int32_t k) const {
    return u * k + v;
  }

  float u, v;     // Line equation parameters: w(k) = u * k + v.
};
32 | 
33 | /* Penalties for a multiple affine function alignment.
34 | */
35 | class PiecewisePenalties {
36 |   public:
37 |     PiecewisePenalties() : match(5), mismatch(-4), w(std::vector<AffinePiece>{AffinePiece(-6.0, -8.0)}) { }
38 | 
39 |     PiecewisePenalties(int32_t _match, int32_t _mismatch, const std::vector<AffinePiece>& _w) :
40 |               match(_match), mismatch(_mismatch), w(_w) { }
41 | 
42 |     std::string Verbose() {
43 |       std::stringstream ss;
44 |       ss << "match = " << match << ", mismatch = " << mismatch << "";
45 |       for (int32_t l = 0; l < w.size(); l++) {
46 |         ss << ", w[" << l << "] = {u = " << w[l].u << ", v = " << w[l].v << "}";
47 |       }
48 |       ss << "\n";
49 |       return ss.str();
50 |     }
51 | 
52 |     float match, mismatch;
53 |     std::vector<AffinePiece> w;
54 | };
55 | 
56 | }
57 | 
58 | #endif


--------------------------------------------------------------------------------
/src/aligner/sam_parser.cc:
--------------------------------------------------------------------------------
  1 | #include "sam_parser.h"
  2 | 
  3 | #include <sstream>
  4 | 
  5 | namespace is {
  6 | 
  7 | int SplitCigar(const std::string &cigar_str, std::vector<CigarOp>& ret) {
  8 |   ret.clear();
  9 |   CigarOp op;
 10 |   // int32_t digit_count = 0;
 11 |   int64_t pos_ref = 0, pos_query = 0;
 12 |   const char *first_digit = NULL;
 13 |   for (size_t i=0; i<cigar_str.size(); i++) {
 14 |     if (isalpha(cigar_str[i]) || cigar_str[i] == '=') {
 15 |       // op.pos_ref = pos_ref;
 16 |       // op.pos_query = pos_query;
 17 |       op.op = cigar_str[i];
 18 |       sscanf(first_digit, "%d", &op.count);
 19 |       ret.push_back(op);
 20 |       first_digit = NULL;
 21 |       if (is_cigar_ref(op.op)) pos_ref += op.count;
 22 |       if (is_cigar_read(op.op)) pos_query += op.count;
 23 |     } else if (first_digit == NULL) {
 24 |       first_digit = &(cigar_str[i]);
 25 |     }
 26 |   }
 27 |   return 0;
 28 | }
 29 | 
 30 | int64_t CalcReferenceLengthFromCigar(const std::vector<CigarOp>& split_cigar) {
 31 |   int64_t len = 0;
 32 |   for (size_t i=0; i<split_cigar.size(); i++) {
 33 |     if (is_cigar_ref(split_cigar[i].op)) { len += split_cigar[i].count; }
 34 |   }
 35 |   return len;
 36 | }
 37 | 
 38 | SamLine::SamLine() {
 39 | 
 40 | }
 41 | 
 42 | SamLine::SamLine(const std::string& line) {
 43 |   ParseLine(line);
 44 | }
 45 | 
 46 | SamLine::~SamLine() {
 47 | 
 48 | }
 49 | 
 50 | int SamLine::ParseLine(const std::string& line) {
 51 |   if (line.size() == 0) { return 1; }
 52 | 
 53 |   std::stringstream ss(line);
 54 |   std::string cigar_string;
 55 |   ss >> qname >> flag >> rname >> pos >>
 56 |         mapq >> cigar_string >> rnext >> pnext >> tlen >> seq >> qual;
 57 | 
 58 |   SplitCigar(cigar_string, cigar);
 59 | 
 60 |   std::string all_optional;
 61 |   std::getline(ss, all_optional);
 62 |   Tokenize_(all_optional, '\t', optional);
 63 |   return 0;
 64 | }
 65 | 
 66 | bool SamLine::IsMapped() {
 67 |   return (!(flag & 4));
 68 | }
 69 | 
 70 | bool SamLine::IsReverse() {
 71 |   return ((flag & 16));
 72 | }
 73 | 
 74 | 
 75 | int SamLine::FindAlignmentPosition(int64_t& q_start, int64_t& q_end,
 76 |                                    int64_t& r_start, int64_t& r_end) {
 77 |   q_start = 0;
 78 |   q_end = seq.size();
 79 | 
 80 |   // Find query alignment start (skip the soft clipped bases).
 81 |   for (auto& c: cigar) {
 82 |     if (c.op == 'H') {
 83 |       continue;
 84 |     } else if (c.op == 'S') {
 85 |       q_start += c.count;
 86 |     } else {
 87 |       break;
 88 |     }
 89 |   }
 90 | 
 91 |   // Find query alignment end (skip the soft clipped bases).
 92 |   for (int64_t i=(cigar.size() - 1); i >= 0; i--) {
 93 |     auto& c = cigar[i];
 94 |     if (c.op == 'H') {
 95 |       continue;
 96 |     } else if (c.op == 'S') {
 97 |       q_end -= c.count;
 98 |     } else {
 99 |       break;
100 |     }
101 |   }
102 | 
103 |   // Find reference alignment start. (Convert from 1-based to 0-based).
104 |   r_start = pos - 1;
105 | 
106 |   // Find reference alignment end.
107 |   r_end = r_start + CalcReferenceLengthFromCigar(cigar);
108 | 
109 |   // Do not performe reverse complementing here, we do not know
110 |   // the length of the reference.
111 | 
112 |   return 0;
113 | }
114 | 
115 | void SamLine::Tokenize_(const std::string& str, const char delimiter, std::vector<std::string>& words) {
116 |   words.clear();
117 |   std::stringstream ss(str);
118 |   std::string line;
119 |   while(std::getline(ss, line, delimiter)) {
120 |     if (line.size() == 0) { continue; }
121 |     words.push_back(line);
122 |   }
123 | }
124 | 
125 | }
126 | 


--------------------------------------------------------------------------------
/src/aligner/sam_parser.h:
--------------------------------------------------------------------------------
  1 | #ifndef SRC_SAM_PARSER_H_
  2 | #define SRC_SAM_PARSER_H_
  3 | 
  4 | #include <stdio.h>
  5 | #include <string>
  6 | #include <sstream>
  7 | #include <vector>
  8 | 
  9 | namespace is {
 10 | 
 11 | #define is_cigar_op(x)  (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'D' || x == 'S' || x == 'H')
 12 | #define is_cigar_match(x)  (x == 'M' || x == '=' || x == 'X')
 13 | #define is_cigar_ins(x)  (x == 'I')
 14 | #define is_cigar_del(x)  (x == 'D')
 15 | #define is_cigar_soft(x)  (x == 'S')
 16 | #define is_cigar_hard(x)  (x == 'H')
 17 | #define is_cigar_ref(x)  (x == 'M' || x == '=' || x == 'X' || x == 'D')
 18 | #define is_cigar_read(x)  (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'S')
 19 | 
 20 | // class CigarOp {
 21 | //  public:
 22 | //   char op = '-';
 23 | //   int32_t count = 0;
 24 | //   int64_t pos_ref = -1;     // Relative to the pos_ field of the corresponding SequenceAlignment object. pos_ref starts from zero, eventhough the actuall alignment starts at an arbitrary position on the reference.
 25 | //   int64_t pos_query = - 1;
 26 | 
 27 | //   CigarOp() { }
 28 | //   CigarOp(char _op, int32_t _count, int64_t _pos_ref, int64_t _pos_query) : op(_op), count(_count), pos_ref(_pos_ref), pos_query(_pos_query) { }
 29 | 
 30 | // };
 31 | 
 32 | /** @brief A container for a single CIGAR operation.
 33 |  *
 34 |  */
 35 | class CigarOp {
 36 |  public:
 37 |   CigarOp() : op(0), count(0) { }
 38 |   CigarOp(char _op, int32_t _count) : op(_op), count(_count) { }
 39 |   CigarOp(const CigarOp& t) : CigarOp(t.op, t.count) { }
 40 |   ~CigarOp() { }
 41 |   CigarOp& operator=(const CigarOp t) {
 42 |     op = t.op;
 43 |     count = t.count;
 44 |     return *this;
 45 |   }
 46 |   std::string get() { std::stringstream ss; ss << count << op; return ss.str(); }
 47 | 
 48 |   char op;
 49 |   int64_t count;
 50 | };
 51 | 
 52 | 
 53 | int SplitCigar(const std::string &cigar_str, std::vector<CigarOp>& ret);
 54 | int64_t CalcReferenceLengthFromCigar(const std::vector<CigarOp>& split_cigar);
 55 | 
 56 | class SamLine {  // One SAM record: the mandatory fields (numbered below) plus a few optional fields.
 57 |  public:
 58 |   SamLine();
 59 |   SamLine(const std::string& line);  // Presumably parses 'line' on construction (see ParseLine) — confirm in the .cc file.
 60 | //   SamLine();
 61 | // SequenceAlignment::SequenceAlignment(uint32_t _flag, std::string &rname, int64_t pos, int32_t mapq, std::string &cigar_string, std::string &rnext, int64_t pnext, int64_t tlen, std::vector<std::string> &optional)
 62 | // : flag_(flag), rname_(rname), pos_(pos), mapq_(mapq), rnext_(rnext), pnext_(pnext), tlen_(tlen), optional_(optional) {
 63 | //   SplitCigar(cigar_string, cigar_);
 64 | //   ProcessOptional();
 65 | // }
 66 |   ~SamLine();
 67 | 
 68 |   int ParseLine(const std::string& line);  // Presumably fills the public fields from one SAM text line; return-code meaning is not visible in this header.
 69 |   std::string YieldString();               // Presumably serializes the record back into SAM text — confirm in the .cc file.
 70 |   bool IsMapped();
 71 |   bool IsReverse();
 72 |   int FindAlignmentPosition(int64_t& q_start, int64_t& q_end,
 73 |                                     int64_t& r_start, int64_t& r_end);  // Results are returned through the four reference parameters (query start/end, reference start/end).
 74 | 
 75 |   std::string qname;      // Field #1.
 76 |   uint32_t flag;          // Field #2.
 77 |   std::string rname;      // Field #3.
 78 |   int64_t pos;            // Field #4.
 79 |   int32_t mapq;           // Field #5.
 80 |   //  std::string cigar;  // Field #6.
 81 |   std::vector<CigarOp> cigar;  // Field #6, kept as a list of CigarOp entries instead of the raw string.
 82 |   std::string rnext;      // Field #7.
 83 |   int64_t pnext;          // Field #8.
 84 |   int64_t tlen;           // Field #9.
 85 |   std::string seq;        // Field #10.
 86 |   std::string qual;       // Field #11.
 87 | 
 88 |   // Optional fields in the SAM format:
 89 |   int64_t as;           // Alignment score.
 90 |   double evalue;        // E-value. There is no dedicated field in the SAM format, but GraphMap uses ZE to specify the E-value.
 91 |   std::vector<std::string> optional;  // Raw values (strings) of optional fields, not explicitly converted to expected values.
 92 | 
 93 | 
 94 | 
 95 |  private:
 96 |   void Tokenize_(const std::string& str, const char delimiter, std::vector<std::string>& words);  // Presumably splits str on delimiter into words — confirm in the .cc file.
 97 | 
 98 | };
 99 | 
100 | }
101 | 
102 | #endif
103 | 


--------------------------------------------------------------------------------
/src/alignment/alignment.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * alignment.h
 3 |  *
 4 |  *  Created on: Jan 17, 2016
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_ALIGNMENT_ALIGNMENT_H_
 9 | #define SRC_ALIGNMENT_ALIGNMENT_H_
10 | 
11 | #include <stdint.h>
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <dirent.h>
15 | #include <string>
16 | #include <sstream>
17 | #include <vector>
18 | #include <memory>
19 | 
20 | #include "sequences/single_sequence.h"
21 | #include "utility/utility_general.h"
22 | #include "program_parameters.h"
23 | #include "utility/utility_conversion-inl.h"
24 | #include "containers/path_graph_entry.h"
25 | #include "libs/edlib.h"
26 | #include "alignment/cigargen.h"
27 | #include "alignment_wrappers.h"
28 | #include "log_system/log_system.h"
29 | #include "containers/region.h"
30 | #include "seqan/basic.h"
31 | #include "seqan/align.h"
32 | #include "seqan/sequence.h"
33 | #include "seqan/stream.h"
34 | #include "utility/evalue.h"
35 | #include "graphmap/transcriptome.h"
36 | 
37 | 
38 | 
39 | 
40 | int AlignRegion(const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, std::shared_ptr<is::Transcriptome> transcriptome, const ProgramParameters *parameters, const EValueParams *evalue_params, bool extend_to_end, PathGraphEntry *region_results);  // Presumably the per-region alignment entry point; region_results is the only non-const pointer, so results are expected there — confirm in alignment.cc.
41 | int SemiglobalAlignment(AlignmentFunctionType AlignmentFunction,
42 |                         const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, const ProgramParameters *parameters,
43 |                         const EValueParams *evalue_params, PathGraphEntry *region_results);  // The aligner is injected as an AlignmentFunctionType function pointer (typedef in alignment_wrappers.h).
44 | int AnchoredAlignmentNew(AlignmentFunctionType AlignmentFunctionNW, AlignmentFunctionType AlignmentFunctionSHW,
45 |                          const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, std::shared_ptr<is::Transcriptome> transcriptome, const ProgramParameters *parameters,
46 |                          const EValueParams *evalue_params, PathGraphEntry *region_results, bool align_end_to_end, bool spliced_alignment);  // Takes separate NW and SHW aligner callbacks plus the align_end_to_end and spliced_alignment switches.
47 | 
48 | void VerboseAlignment(const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, const ProgramParameters *parameters, const AlignmentResults *aln);  // All inputs are const and nothing is returned; presumably debug output only — confirm in the implementation.
49 | 
50 | /// Determines the start and end locations for semiglobal alignment, keeping in mind the boundaries of the reference being aligned to. Works with circular alignment as well.
51 | //int GetAlignmentWindowFromRegion(const SingleSequence *read, const Index *index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
52 | //                                 int64_t *win_start, int64_t *win_end, int64_t *win_len);
53 | int GetL1PosInRegion(const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
54 |                      int64_t *l1_start, int64_t *l1_end);  // NOTE(review): the '///' description above directly precedes the commented-out GetAlignmentWindowFromRegion; it is unclear whether it also describes this function. Outputs are l1_start and l1_end.
55 | 
56 | // Checks if the region is linear or circular. If it's linear, only a pointer to the beginning of the region (in the index) will be returned. Otherwise, a data array will be created containing the
57 | // concatenated region.
58 | // Returns 0 if the region was linear, otherwise 1. Value of 1 means that manual cleanup of the returned data array is required, using free(). NOTE(review): the declaration also has an is_cleanup_required out-parameter; confirm which of the two callers should rely on.
59 | int GetAlignmentWindowData(const SingleSequence *read, std::shared_ptr<is::MinimizerIndex> index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
60 |                            int8_t** data, int64_t* data_length, int8_t **pos_of_win_start, int8_t **pos_of_win_end, int64_t* offset_from_ref_start, int64_t* pos_of_ref_end, bool *is_cleanup_required);
61 | 
62 | int FindCircularEnd(const std::vector<uint8_t> &alignment, int64_t pos_of_ref_end,
63 |                     int64_t *ret_end_on_aln, int64_t *ret_end_on_read, int64_t *ret_end_on_ref,
64 |                     int64_t *ret_start_on_aln, int64_t *ret_start_on_read, int64_t *ret_start_on_ref);  // Six outputs: end and start positions on the alignment, the read and the reference; presumably located around pos_of_ref_end — confirm in the implementation.
65 | 
66 | int SplitCircularAlignment(const AlignmentResults *aln, int64_t pos_of_ref_end, int64_t ref_start, int64_t ref_len, AlignmentResults *aln_l, AlignmentResults *aln_r);  // aln is read-only; aln_l and aln_r are the outputs (presumably the parts left and right of pos_of_ref_end — confirm).
67 | 
68 | 
69 | int CheckAlignmentSane(std::vector<unsigned char> &alignment, const SingleSequence* read=NULL,std::shared_ptr<is::MinimizerIndex> index=nullptr, int64_t reference_hit_id=-1, int64_t reference_hit_pos=-1);  // Every argument after 'alignment' is optional (defaults: NULL, nullptr, -1, -1).
70 | 
71 | #endif /* SRC_ALIGNMENT_ALIGNMENT_H_ */
72 | 


--------------------------------------------------------------------------------
/src/alignment/alignment_wrappers.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * local_realignment_generic.h
  3 |  *
  4 |  *  Created on: Jan 16, 2015
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef LOCAL_REALIGNMENT_GENERIC_H_
  9 | #define LOCAL_REALIGNMENT_GENERIC_H_
 10 | 
 11 | #include <stdint.h>
 12 | #include <stdio.h>
 13 | #include <stdlib.h>
 14 | #include <dirent.h>
 15 | #include <string>
 16 | #include <sstream>
 17 | #include <vector>
 18 | 
 19 | #include "utility/utility_general.h"
 20 | #include "program_parameters.h"
 21 | #include "libs/edlib.h"
 22 | #include "alignment/cigargen.h"
 23 | #include "log_system/log_system.h"
 24 | #include "containers/region.h"
 25 | #include "seqan/basic.h"
 26 | #include "seqan/align.h"
 27 | #include "seqan/sequence.h"
 28 | #include "seqan/stream.h"
 29 | 
 30 | #define ALIGNMENT_TYPE_SHW  0     /// Gaps at the end are not penalized.
 31 | #define ALIGNMENT_TYPE_HW   1     /// Gaps at the beginning and the end are not penalized.
 32 | #define ALIGNMENT_TYPE_NW   2     /// Global alignment (gaps at the beginning and the end are penalized).
 33 | 
 34 | #ifndef RELEASE_VERSION
 35 |   #include "libs/opal.h"
 36 | #endif
 37 | 
 38 | typedef int (*AlignmentFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t*, int64_t*, int64_t*, std::vector<unsigned char> &);  // Same parameter list as the *Wrapper aligners declared below (e.g. MyersNWWrapper): read data/length, reference data/length, band width, five scoring values, three int64_t outputs, alignment vector.
 39 | typedef int (*EditDistanceFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t*, int64_t*, int);  // Same shape as MyersEditDistanceWrapper, except that the last parameter is a plain int here and an EdlibAlignMode there.
 40 | 
 41 | int LocalizeAlignmentPosWithMyers(const int8_t *read_data, int64_t read_length,
 42 |                                   const int8_t *reference_data, int64_t reference_length,
 43 |                                   int64_t rough_reference_start, int64_t rough_reference_end,
 44 |                                   int64_t *ret_alignment_start, int64_t *ret_alignment_end,
 45 |                                   int64_t *ret_start_ambiguity, int64_t *ret_end_ambiguity,
 46 |                                   int64_t *ret_edit_distance, int64_t *ret_band_width,
 47 |                                   bool verbose_debug_output=false);  // All results are returned through the ret_* pointers; verbose_debug_output defaults to false.
 48 | 
 49 | int MyersSemiglobalWrapper(const int8_t *read_data, int64_t read_length,
 50 |                            const int8_t *reference_data, int64_t reference_length,
 51 |                            int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 52 |                            int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 53 |                            int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 54 | 
 55 | int MyersNWWrapper(const int8_t *read_data, int64_t read_length,
 56 |                    const int8_t *reference_data, int64_t reference_length,
 57 |                    int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 58 |                    int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 59 |                    int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 60 | 
 61 | int MyersSHWWrapper(const int8_t *read_data, int64_t read_length,
 62 |                    const int8_t *reference_data, int64_t reference_length,
 63 |                    int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 64 |                    int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 65 |                    int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 66 | 
 67 | #ifndef RELEASE_VERSION  // The Opal wrappers are only declared when RELEASE_VERSION is not defined (same guard as the libs/opal.h include above).
 68 | int OpalNWWrapper(const int8_t *read_data, int64_t read_length,
 69 |                    const int8_t *reference_data, int64_t reference_length,
 70 |                    int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 71 |                    int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 72 |                    int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 73 | 
 74 | int OpalSHWWrapper(const int8_t *read_data, int64_t read_length,
 75 |                    const int8_t *reference_data, int64_t reference_length,
 76 |                    int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 77 |                    int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 78 |                    int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 79 | #endif
 80 | 
 81 | int SeqAnSemiglobalWrapper(const int8_t *read_data, int64_t read_length,
 82 |                            const int8_t *reference_data, int64_t reference_length,
 83 |                            int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 84 |                            int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 85 |                            int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
 86 | int SeqAnSemiglobalWrapperWithMyersLocalization(const int8_t *read_data, int64_t read_length,
 87 |                                                 const int8_t *reference_data, int64_t reference_length,
 88 |                                                 int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
 89 |                                                 int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
 90 |                                                 int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType; presumably narrows the reference with LocalizeAlignmentPosWithMyers first — confirm in the .cc file.
 91 | 
 92 | int MyersEditDistanceWrapper(const int8_t *read_data, int64_t read_length,
 93 |                              const int8_t *reference_data, int64_t reference_length,
 94 |                              int64_t *ret_alignment_position_end,
 95 |                              int64_t *ret_edit_distance, EdlibAlignMode edlib_mode_code=EDLIB_MODE_HW);  // Outputs: alignment end position and edit distance; the mode defaults to EDLIB_MODE_HW.
 96 | 
 97 | int SeqAnNWWrapper(const int8_t *read_data, int64_t read_length,
 98 |                            const int8_t *reference_data, int64_t reference_length,
 99 |                            int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
100 |                            int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
101 |                            int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
102 | int SeqAnSHWWrapper(const int8_t *read_data, int64_t read_length,
103 |                            const int8_t *reference_data, int64_t reference_length,
104 |                            int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
105 |                            int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
106 |                            int64_t *ret_edit_distance, std::vector<unsigned char> &ret_alignment);  // Signature matches AlignmentFunctionType.
107 | 
108 | //int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align, bool is_global, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector<unsigned char> &ret_alignment);
109 | int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align, int alignment_type, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector<unsigned char> &ret_alignment);  // Replaces the older bool is_global variant kept in the comment above; alignment_type presumably takes one of the ALIGNMENT_TYPE_* constants — confirm in the .cc file.
110 | 
111 | int CheckAlignmentSaneSimple(std::vector<unsigned char> &alignment);
112 | 
113 | 
114 | 
115 | #endif /* LOCAL_REALIGNMENT_GENERIC_H_ */
116 | 


--------------------------------------------------------------------------------
/src/alignment/cigargen.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * cigargen.h
 3 |  *
 4 |  *  Created on: Aug 28, 2014
 5 |  *      Author: ivan
 6 |  */
 7 | 
 8 | #ifndef CIGARGEN_H_
 9 | #define CIGARGEN_H_
10 | 
11 | #include <stdio.h>
12 | #include <string.h>
13 | #include <stdlib.h>
14 | #include <stdint.h>
15 | #include <string>
16 | #include <sstream>
17 | #include <algorithm>
18 | 
19 | #include "libs/edlib.h"
20 | #include "utility/utility_general.h"
21 | #include "sequences/sequence_alignment.h"
22 | 
23 | #define EDLIB_M 0  // Shares the value 0 with EDLIB_EQUAL, so the two codes cannot be told apart at runtime.
24 | #define EDLIB_EQUAL 0
25 | #define EDLIB_X 3  // Rendered as 'X' by EdlibOpToCharExtended and as 'M' by EdlibOpToChar.
26 | #define EDLIB_I 1  // Rendered as 'I'.
27 | #define EDLIB_D 2  // Rendered as 'D'.
28 | #define EDLIB_S 4  // Rendered as 'S'.
29 | #define EDLIB_H 5     /// Not used in GraphMap currently (26.01.2016.)
30 | #define EDLIB_NOP 6
31 | #define EDLIB_N 7     // Large gaps (e.g. splicing sites).
32 | #define EDLIB_P 8     // Padding.
33 | 
34 | inline char EdlibOpToChar(int8_t op) {  // Maps an EDLIB_* code to a basic CIGAR character; any code not tested below (e.g. EDLIB_NOP, EDLIB_N, EDLIB_P) yields 0.
35 |   return (op == EDLIB_M || op == EDLIB_EQUAL || op == EDLIB_X) ? 'M' :  // Match, equal and mismatch all collapse to 'M'.
36 |           (op == EDLIB_I) ? 'I' :
37 |              (op == EDLIB_D) ? 'D' :
38 |                  (op == EDLIB_S) ? 'S' :
39 |                      (op == EDLIB_H) ? 'H' : 0;
40 | }
41 | 
42 | inline char EdlibOpToCharExtended(int8_t op) {  // Maps an EDLIB_* code to an extended CIGAR character ('=' and 'X' kept separate); any code not tested below yields 0.
43 |   return (op == EDLIB_EQUAL) ? '=' :
44 |            (op == EDLIB_X) ? 'X' :
45 |                (op == EDLIB_M) ? 'M' :  // NOTE(review): EDLIB_M and EDLIB_EQUAL are both defined as 0, so code 0 is already consumed by the '=' test and this branch never selects 'M'.
46 |                    (op == EDLIB_I) ? 'I' :
47 |                        (op == EDLIB_D) ? 'D' :
48 |                            (op == EDLIB_S) ? 'S' :
49 |                                (op == EDLIB_H) ? 'H' : 0;
50 | }
51 | 
52 | std::string AlignmentToCigar(unsigned char *alignment, int alignmentLength, bool extended_format);  // extended_format presumably selects '='/'X' output instead of 'M' — confirm in cigargen.cc.
53 | int AlignmentToBasicCigar(unsigned char* alignment, int alignmentLength, char** cigar_);  // The CIGAR text is returned through cigar_; ownership of that buffer is not visible from this header.
54 | int AlignmentToExtendedCigar(unsigned char* alignment, int alignmentLength, char** cigar_);  // The CIGAR text is returned through cigar_; ownership of that buffer is not visible from this header.
55 | int AlignmentToExtendedCigarArray(unsigned char* alignment, int alignmentLength, std::vector<CigarOp> &cigar);  // NOTE(review): std::vector is used in this header but <vector> is not included here directly.
56 | std::string AlignmentToMD(std::vector<unsigned char>& alignment, const int8_t *ref_data, int64_t alignment_position_start);
57 | 
58 | /// Searches for consecutive EDLIB_I and EDLIB_D (or vice versa) operations, and replaces the overlap with EDLIB_X.
59 | std::vector<unsigned char> FixAlignment(unsigned char* alignment, int alignmentLength);
60 | /// In case an alignment has leading/trailing EDLIB_I operations, they will be replaced with EDLIB_S.
61 | int ConvertInsertionsToClipping(unsigned char* alignment, int alignmentLength);
62 | /// Counts the number of leading and trailing clipped bases (or insertions).
63 | int CountClippedBases(unsigned char* alignment, int alignmentLength, int64_t *ret_num_clipped_front, int64_t *ret_num_clipped_back);
64 | /// Sums up the bases on the reference the alignment spans through (EDLIB_M and EDLIB_D operations).
65 | int64_t CalculateReconstructedLength(unsigned char *alignment, int alignmentLength);
66 | 
67 | int CalculateAlignmentScore(std::vector<unsigned char>& alignment, int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend);  // NOTE(review): returns int although the score parameters are int64_t.
68 | 
69 | /// Counts each operation type, and calculates the alignment score as well (while rescoring the alignment with the given scores/penalties).
70 | int CountAlignmentOperations(std::vector<unsigned char> &alignment, const int8_t *read_data, const int8_t *ref_data, int64_t reference_hit_id, int64_t alignment_position_start, SeqOrientation orientation,
71 |                              int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend,
72 |                              bool skip_leading_and_trailing_insertions,
73 |                              int64_t *ret_eq, int64_t *ret_x, int64_t *ret_i, int64_t *ret_d, int64_t *ret_alignment_score, int64_t *ret_edit_dist, int64_t *ret_nonclipped_length);
74 | /// Reverses the operations in a CIGAR string.
75 | std::string ReverseCigarString(std::string &cigar);
76 | 
77 | std::string PrintAlignmentToString(const unsigned char* query, const int queryLength,
78 |                     const unsigned char* target, const int targetLength,
79 |                     const unsigned char* alignment, const int alignmentLength,
80 |                     const int position, const int modeCode, int row_width=100);  // row_width defaults to 100.
81 | 
82 | int GetAlignmentPatterns(const unsigned char* query, const int64_t queryLength,
83 |                          const unsigned char* target, const int64_t targetLength,
84 |                          const unsigned char* alignment, const int64_t alignmentLength,
85 |                          std::string &ret_query, std::string &ret_target, std::string &ret_match_pattern);  // The three strings are outputs.
86 | 
87 | void FixAlignmentLeadingTrailingID(std::vector<unsigned char>& alignment, int64_t *ref_start, int64_t *ref_end);  // alignment, ref_start and ref_end are all non-const, so they may be modified — confirm in cigargen.cc.
88 | 
89 | #endif /* CIGARGEN_H_ */
90 | 


--------------------------------------------------------------------------------
/src/alignment/transcriptome_mod.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * transcriptome_mod.h
 3 |  *
 4 |  *  Created on: Jan 5, 2017
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_
 9 | #define SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_
10 | 
11 | //#include "index/index.h"
12 | //#include "index/index_spaced_hash_fast.h"
13 | #include "minimizer_index/minimizer_index.h"
14 | #include "containers/results.h"
15 | #include "program_parameters.h"
16 | #include "graphmap/transcriptome.h"
17 | #include <tuple>
18 | 
19 | int ConvertFromTranscriptomeToGenomeAln(const ProgramParameters *parameters, std::shared_ptr<is::MinimizerIndex> index, std::shared_ptr<is::Transcriptome> transcriptome, AlignmentResults *aln);  // aln is the only non-const pointer; presumably it is rewritten in place from transcriptome to genome coordinates — confirm in transcriptome_mod.cc. NOTE(review): std::shared_ptr is used but <memory> is not included here directly.
20 | 
21 | #endif /* SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_ */
22 | 


--------------------------------------------------------------------------------
/src/containers/mapping_data.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * mapping_data.cc
  3 |  *
  4 |  *  Created on: Mar 19, 2015
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #include "mapping_data.h"
  9 | 
 10 | MappingData::MappingData() {  // Puts every scalar member into its initial state; containers start out empty.
 11 |   bins.clear();
 12 |   intermediate_mappings.clear();
 13 |   final_mapping_ptrs.clear();
 14 | 
 15 |   bin_size = -1;  // The only counter that starts at -1 instead of 0.
 16 | 
 17 |   num_seeds_over_limit = 0;
 18 |   num_seeds_with_no_hits = 0;
 19 |   num_seeds_errors = 0;
 20 | 
 21 |   num_similar_mappings = 0;
 22 |   num_same_mappings = 0;
 23 |   avg_covered_bases_of_all_mappings = 0;
 24 |   std_covered_bases_of_all_mappings = 0;
 25 |   median_covered_bases_of_all_mappings = 0;
 26 | 
 27 |   iteration = 0;
 28 | 
 29 |   unmapped_reason = std::string("");
 30 | 
 31 |   num_region_iterations = 0;
 32 |   mapping_quality = 0;
 33 |   metagen_alignment_score = 0;
 34 | 
 35 |   time_region_selection = 0.0;  // All timing accumulators start at zero.
 36 |   time_mapping = 0.0;
 37 |   time_alignment = 0.0;
 38 |   time_region_seed_lookup = 0.0;
 39 |   time_region_hitsort = 0.0;
 40 |   time_region_conversion = 0.0;
 41 |   time_region_alloc = 0.0;
 42 |   time_region_counting = 0.0;
 43 | }
 44 | 
 45 | MappingData::~MappingData() {  // Cleanup is delegated to clear(), which deletes the entries held in intermediate_mappings.
 46 |   clear();
 47 | }
 48 | 
 49 | bool MappingData::IsMapped() {  // True if at least one entry of final_mapping_ptrs reports IsMapped(); false if none does or the list is empty.
 50 |   for (PathGraphEntry *entry : final_mapping_ptrs) {  // Range-for removes the signed (int32_t) vs. unsigned (size()) index comparison.
 51 |     if (entry->IsMapped() == true) { return true; }
 52 |   }
 53 |   return false;
 54 | }
 55 | 
 56 | bool MappingData::IsAligned() {  // True if at least one entry of final_mapping_ptrs reports IsAligned(); false if none does or the list is empty.
 57 |   for (PathGraphEntry *entry : final_mapping_ptrs) {  // Range-for removes the signed (int32_t) vs. unsigned (size()) index comparison.
 58 |     if (entry->IsAligned() == true) { return true; }
 59 |   }
 60 |   return false;
 61 | }
 62 | 
 63 | std::string MappingData::VerboseMappingDataToString_(const std::vector<PathGraphEntry *> *mapping_data, std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const {  // Builds a multi-line debug report: a header about the read and reference, then one section per entry of mapping_data.
 64 |   std::stringstream ss;
 65 | 
 66 |   int64_t reference_length = index->get_data().size();
 67 |   int64_t read_length = read->get_data_length();
 68 | 
 69 |   ss << "-----------------------\n";
 70 |   ss << "--- num_entries = " << mapping_data->size() << "\n";
 71 |   ss << "--- read id = " << read->get_sequence_absolute_id() << "\n";
 72 |   ss << "--- read name = " << read->get_header() << "\n";
 73 |   ss << "--- read_length = " << read_length << "\n";
 74 |   ss << "--- reference_length = " << reference_length << "\n";
 75 | 
 76 |   for (int64_t i = (mapping_data->size() - 1); i >= 0; i--) {  // Entries are reported from the last one down to index 0.
 77 | //    ss << "--- [" << i << "] ";
 78 |     ss << "[" << i << "/" << mapping_data->size() << "] ";
 79 |     int64_t start_location = 0, start_location_raw = 0;  // Not used by the active code (only referenced in a commented-out call below).
 80 | 
 81 |     ss << "local_score_id = " << mapping_data->at(i)->get_mapping_data().local_score_id;
 82 |     ss << "\n      ° " << mapping_data->at(i)->VerboseToString();
 83 |     ss << "\n      ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", fwd_r_id = " << (mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()) << ", region_index = " << mapping_data->at(i)->get_region_data().region_index;
 84 |     ss << "\n        ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\"";
 85 |     ss << "\n      ° Unmapped reason: \"" << unmapped_reason << "\"";
 86 |     int64_t relative_position = 0;  // Never updated below, so "position" and "relative_position" are always printed as 0.
 87 |     int64_t absolute_position = 0;  // Not used by the active code.
 88 |     SeqOrientation orientation = kForward;  // Never updated below, so the report always says "forward".
 89 | 
 90 |     ///// TODO: 06.02.2017.
 91 | //     This chunk below was removed due to the incompatibilities with the new index.
 92 | //        int64_t reference_id = index->RawPositionConverter(start_location, 0, &absolute_position, &relative_position, &orientation);
 93 |     int64_t reference_id = mapping_data->at(i)->get_region_data().reference_id;  // Local copy is not read afterwards; the lines below fetch reference_id from the entry again.
 94 | 
 95 |         int64_t reference_start = mapping_data->at(i)->get_mapping_data().ref_coords.start;
 96 | //        index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.start, 0, &absolute_position, &reference_start, &orientation);
 97 |         int64_t reference_end = mapping_data->at(i)->get_mapping_data().ref_coords.end;
 98 | //        index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.end, 0, &absolute_position, &reference_end, &orientation);
 99 | 
100 |         for (int64_t j = 0; j < mapping_data->at(i)->get_alignments().size(); j++) {  // One sub-section per alignment stored in this entry.
101 |           ss << "\n      ° Alignment " << j << " / " << mapping_data->at(i)->get_alignments().size();
102 |           ss << "\n        ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", region_index = " << mapping_data->at(i)->get_region_data().region_index << ", region_votes = " << mapping_data->at(i)->get_region_data().region_votes << ", position = " << relative_position << ", r1[" << reference_start << ", " << reference_end << "], " << ((orientation == kForward) ? "forward" : "reverse");
103 |           ss << ", sam_NM = " << mapping_data->at(i)->get_alignments()[j].edit_distance << ", sam_AS = " << mapping_data->at(i)->get_alignments()[j].alignment_score << ", sam_evalue = " << mapping_data->at(i)->get_alignments()[j].evalue << ", sam_pos = " << mapping_data->at(i)->get_alignments()[j].ref_start << ", sam_mapq = " << ((int64_t) mapping_data->at(i)->get_alignments()[j].mapping_quality) << ", relative_position = " << relative_position;
104 |           ss << "\n        ° r_len = " << index->get_reference_lengths()[mapping_data->at(i)->get_region_data().reference_id] << ", l1_l = " << mapping_data->at(i)->get_l1_data().l1_l <<  // NOTE(review): this lookup uses the raw reference_id, while the header lookups use it modulo get_num_sequences_forward() — confirm that is intended.
105 |               ", match_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_eq_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
106 |               ", error_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops + mapping_data->at(i)->get_alignments()[j].num_d_ops + mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
107 |               " (X: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
108 |               ", I = " << ((float) mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
109 |               ", D: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_d_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << ")";  // All rates are float divisions by nonclipped_length.
110 | 
111 |           ss << "\n        ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\"";
112 |         }
113 |     ss << "\n-----------";
114 |     if (i == 0) {  // The entry printed last (index 0) gets one extra trailing newline.
115 |       ss << "\n";
116 |     }
117 |     ss << "\n";
118 |   }
119 | 
120 |   return ss.str();
121 | }
122 | 
123 | std::string MappingData::VerboseFinalMappingsToString(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const {  // Debug report over final_mapping_ptrs.
124 |   return VerboseMappingDataToString_(&final_mapping_ptrs, index, read);
125 | }
126 | 
127 | std::string MappingData::VerboseIntermediateMappingsToString(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const {  // Debug report over intermediate_mappings.
128 |   return VerboseMappingDataToString_(&intermediate_mappings, index, read);
129 | }
130 | 
131 | void MappingData::clear() {  // Frees the owned mappings and resets part of the state; the seed counters, bin_size, iteration and the time_* fields are left untouched.
132 |   vertices.Clear();
133 |   bins.clear();
134 |   for (int64_t i = 0; i < intermediate_mappings.size(); i++) {  // intermediate_mappings holds the PathGraphEntry objects that get deleted here.
135 |     if (intermediate_mappings[i])
136 |       delete intermediate_mappings[i];
137 |     intermediate_mappings[i] = NULL;
138 |   }
139 |   intermediate_mappings.clear();
140 |   final_mapping_ptrs.clear();  // Only the pointer list is dropped; nothing is deleted through final_mapping_ptrs.
141 |   unmapped_reason = std::string("");
142 |   num_region_iterations = 0;
143 |   mapping_quality = 0;
144 |   metagen_alignment_score = 0;
145 | }
146 | 


--------------------------------------------------------------------------------
/src/containers/mapping_data.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * mapping_data.h
 3 |  *
 4 |  *  Created on: Mar 19, 2015
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef MAPPING_DATA_H_
 9 | #define MAPPING_DATA_H_
10 | 
11 | #include "log_system/log_system.h"
12 | #include "program_parameters.h"
13 | #include "utility/utility_general.h"
14 | #include "containers/region.h"
15 | //#include "index/index.h"
16 | //#include "index/index_hash.h"
17 | //#include "index/index_sa.h"
18 | #include "minimizer_index/minimizer_index.h"
19 | #include "containers/vertices.h"
20 | #include "utility/evalue.h"
21 | #include "containers/path_graph_entry.h"
22 | 
23 | //#define UNMAPPED_CODE_NO_VALID_GRAPH_PATHS  (1 << 0)
24 | 
25 | #define MAPPED_CODE_READ_UNPROCESSED_YET      (0)
26 | #define MAPPED_CODE_UNIQUE_MAPPING            (1 << 0)  // Bit 0.
27 | #define MAPPED_CODE_MULTIPLE_EQ_MAPPINGS      (1 << 1)  // Bit 1.
28 | 
29 | #define ITERATION_RESET_LIMIT ((int64_t) 0x1000000000000000)  // 2^60.
30 | 
31 | 
32 | 
33 | struct ChromosomeBin {  // One bin: the reference it belongs to, its id, and a float value (the key compared by bins_greater_than_key).
34 |   int64_t reference_id = 0;
35 |   int64_t bin_id = 0;
36 |   float bin_value = 0.0f;
37 | };
38 | 
39 | struct bins_greater_than_key  // Comparator functor: strict "greater than" on bin_value.
40 | {
41 |     inline bool operator() (const ChromosomeBin& op1, const ChromosomeBin& op2) {  // NOTE(review): operator() is not const-qualified.
42 |       if (op1.bin_value > op2.bin_value)
43 |         return true;
44 |       return false;  // Equal values compare as "not greater".
45 |     }
46 | };
47 | 
48 | class MappingData {  // Per-read mapping state: candidate bins, intermediate and final mappings, counters and timing accumulators.
49 |  public:
50 |   MappingData();
51 |   ~MappingData();  // NOTE(review): the destructor deletes the entries of intermediate_mappings (via clear()), but no copy constructor or copy assignment is declared here — copying an instance looks unsafe; confirm it is never copied.
52 | 
53 |   void clear();
54 | 
55 |   Vertices vertices;
56 |   std::vector<ChromosomeBin> bins;
57 |   std::vector<PathGraphEntry *> intermediate_mappings;      // Owning pointers; freed in clear().
58 |   std::vector<PathGraphEntry *> final_mapping_ptrs;         // Do not free the pointers here! Bad design. These point to intermediate_mappings pointers, which will be freed upon destruction.
59 | 
60 |   int64_t bin_size;
61 |   int64_t num_seeds_over_limit;
62 |   int64_t num_seeds_with_no_hits;
63 |   int64_t num_seeds_errors;
64 |   int64_t iteration;
65 | 
66 |   int64_t num_similar_mappings;                  // Number of found mapping positions with very similar (estimated) scores. E.g. to within some difference from the top mapping.
67 |   int64_t num_same_mappings;
68 |   int64_t avg_covered_bases_of_all_mappings;
69 |   int64_t std_covered_bases_of_all_mappings;
70 |   int64_t median_covered_bases_of_all_mappings;
71 | 
72 |   std::string unmapped_reason;                   // Printed as "Unmapped reason" in the verbose reports.
73 | 
74 |   int64_t num_region_iterations;
75 |   int8_t mapping_quality;
76 |   int64_t metagen_alignment_score;
77 | 
78 |   double time_region_selection;                  // Timing accumulators; units are not visible from this header.
79 |   double time_mapping;
80 |   double time_alignment;
81 |   double time_region_seed_lookup;
82 |   double time_region_hitsort;
83 |   double time_region_conversion;
84 |   double time_region_alloc;
85 |   double time_region_counting;
86 | 
87 |   bool IsMapped();    // True if any entry of final_mapping_ptrs reports IsMapped().
88 |   bool IsAligned();   // True if any entry of final_mapping_ptrs reports IsAligned().
89 | 
90 |   std::string VerboseFinalMappingsToString(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const;
91 |   std::string VerboseIntermediateMappingsToString(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const;
92 | 
93 |  private:
94 |   std::string VerboseMappingDataToString_(const std::vector<PathGraphEntry *> *mapping_data, std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read) const;  // Shared implementation behind the two public Verbose* methods.
95 | 
96 | };
97 | 
98 | #endif /* MAPPING_DATA_H_ */
99 | 


--------------------------------------------------------------------------------
/src/containers/range.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * range.h
 3 |  *
 4 |  *  Created on: Jul 16, 2014
 5 |  *      Author: ivan
 6 |  */
 7 | 
 8 | #ifndef RANGE_H_
 9 | #define RANGE_H_
10 | 
11 | #include <stdint.h>
12 | 
// A pair of 64-bit coordinates (start, end). Both default to zero.
class Range {
 public:
  // Creates a zero-length range located at position 0.
  Range() {
    start = 0;
    end = 0;
  }

  // Creates a range from explicitly given boundaries. No ordering of the two
  // values is enforced.
  Range(int64_t _start, int64_t _end) {
    start = _start;
    end = _end;
  }

  // Signed difference (end - start); negative if end lies before start.
  int64_t dist() const {
    const int64_t span = end - start;
    return span;
  }

  int64_t start = 0;
  int64_t end = 0;
};
25 | 
26 | #endif /* RANGE_H_ */
27 | 


--------------------------------------------------------------------------------
/src/containers/raw_alignment.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * raw_alignment.h
 3 |  *
 4 |  *  Created on: Nov 12, 2015
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_CONTAINERS_RAW_ALIGNMENT_H_
 9 | #define SRC_CONTAINERS_RAW_ALIGNMENT_H_
10 | 
11 | struct RawAlignment {
12 |   int64_t aln_start = 0;
13 |   int64_t aln_end = 0;
14 |   std::vector<int8_t> alignment;
15 |   std::string cigar = "*";
16 |   std::string md = "*"; /// MD field from SAM output.
17 |   SeqOrientation orientation = kForward;
18 |   int64_t ref_id = 0;
19 |   std::string ref_header = "";
20 |   int64_t query_id = 0;
21 |   std::string query_header = "";
22 |   int64_t eq_ops = 0, x_ops = 0, i_ops = 0, d_ops = 0;  /// Counts of CIGAR operations.
23 |   int64_t aligned_len = 0;  /// Number of aligned bases from the read (not counting clipped bases).
24 |   int64_t num_clipped_front = 0;  /// Number of clipped bases at the beginning of the read.
25 |   int64_t num_clipped_back = 0;   /// Number of clipped bases at the end of the read.
26 | };
27 | 
28 | #endif /* SRC_CONTAINERS_RAW_ALIGNMENT_H_ */
29 | 


--------------------------------------------------------------------------------
/src/containers/region.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * region.cc
  3 |  *
  4 |  *  Created on: Dec 26, 2014
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #include "containers/region.h"
  9 | 
 10 | //int CopyLinearRegion(const Index *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset) {
 11 | //  if (region->is_split == true)
 12 | //    return 1;
 13 | //
 14 | //  int8_t *data_copy = new int8_t[(region->end - region->start + 1) + 1];
 15 | //  if (data_copy == NULL) {
 16 | //    return 3;
 17 | //  }
 18 | //
 19 | //  memmove((data_copy), &(index_reference->get_data()[region->start]), (region->end - region->start + 1));
 20 | //
 21 | //  data_copy[(region->end - region->start + 1)] = '\0';
 22 | //
 23 | //  *ret_concatenated_data = data_copy;
 24 | //  *ret_data_length = (region->end - region->start + 1);
 25 | //  *ret_start_offset = region->start;
 26 | //
 27 | //  return 0;
 28 | //}
 29 | 
 30 | int ConcatenateSplitRegion(std::shared_ptr<is::MinimizerIndex> index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end) {
 31 |   if (region->is_split == false)
 32 |     return 1;
 33 | 
 34 |   int64_t region_length_first = (region->end - region->start + 1);
 35 |   int64_t region_length_second = (region->split_end - region->split_start + 1);
 36 |   int64_t region_length_joined = region_length_first + region_length_second;
 37 |   if (region_length_first <= 0 || region_length_second <= 0 || region_length_joined <= 0)
 38 |     return 2;
 39 | 
 40 |   int8_t *data_copy = new int8_t[region_length_joined + 1];
 41 |   if (data_copy == NULL) {
 42 |     return 3;
 43 |   }
 44 | 
 45 |   int64_t start_offset = 0;
 46 |   int64_t position_of_ref_end = 0;
 47 | 
 48 |   // If the main region is at the beginning of the reference. The region is then expanded towards left and right, but on the left it zips back
 49 |   // to the end of the circular reference.
 50 |   if (region->start < region->split_start) {
 51 |     memmove(data_copy, &(index_reference->get_data()[region->split_start]), region_length_second);
 52 |     memmove((data_copy + region_length_second), &(index_reference->get_data()[region->start]), region_length_first);
 53 |     position_of_ref_end = region->split_end - region->split_start; // + 1;
 54 |     start_offset = region->split_start;
 55 | 
 56 |     // If the main region is at the end of the reference. The region is then expanded towards left and right, but on the right it zips back
 57 |     // to the beginning of the circular reference.
 58 |   } else {
 59 |     memmove((data_copy), &(index_reference->get_data()[region->start]), region_length_first);
 60 |     memmove((data_copy + region_length_first), &(index_reference->get_data()[region->split_start]), region_length_second);
 61 |     position_of_ref_end = region->end - region->start;
 62 |     start_offset = region->start;
 63 | 
 64 |   }
 65 | 
 66 |   data_copy[region_length_joined] = '\0';
 67 | 
 68 |   *ret_concatenated_data = data_copy;
 69 |   *ret_data_length = region_length_joined;
 70 |   *ret_start_offset = start_offset;
 71 |   *ret_position_of_ref_end = position_of_ref_end;
 72 | 
 73 |   return 0;
 74 | }
 75 | 
 76 | int GetRegionData(std::shared_ptr<is::MinimizerIndex> index, const Region *region,
 77 |                   int8_t **region_data, int64_t *data_len, int64_t *index_reg_start, int64_t *pos_of_ref_end, bool *is_cleanup_required) {
 78 | 
 79 |   if (region->is_split == false) {
 80 |     *region_data = (int8_t *) (&index->get_data()[0] + region->start);
 81 |     *data_len = (region->end - region->start);
 82 |     *index_reg_start = region->start;
 83 |     *pos_of_ref_end = -1;
 84 |     *is_cleanup_required = false;
 85 | 
 86 |   } else {
 87 |     ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_start, pos_of_ref_end);
 88 |     *is_cleanup_required = true;
 89 | 
 90 |   }
 91 | 
 92 |   return 0;
 93 | }
 94 | 
 95 | //int GetRegionDataCopy(const Index *index, const Region *region,
 96 | //                  int8_t **region_data, int64_t *data_len, int64_t *index_reg_pos, int64_t *reg_pos_of_ref_end) {
 97 | //
 98 | //  if (region->is_split == false) {
 99 | //    CopyLinearRegion(index, region, region_data, data_len, index_reg_pos);
100 | //    *reg_pos_of_ref_end = -1;
101 | //
102 | //  } else {
103 | //    ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_pos, reg_pos_of_ref_end);
104 | //
105 | //  }
106 | //
107 | //  return 0;
108 | //}
109 | 
110 | std::string VerboseRegionAsString(Region &region) {
111 |   std::stringstream ss;
112 | 
113 |   ss << "start = " << region.start;
114 |   ss << ", end = " << region.end;
115 |   ss << ", reference_id = " << region.reference_id;
116 |   ss << ", region_index = " << region.region_index;
117 |   ss << ", region_votes = " << region.region_votes;
118 |   ss << ", is_split = " << ((int) region.is_split);
119 |   ss << ", split_start = " << region.split_start;
120 |   ss << ", split_end = " << region.split_end;
121 | 
122 |   return ss.str();
123 | }
124 | 


--------------------------------------------------------------------------------
/src/containers/region.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * region.h
 3 |  *
 4 |  *  Created on: Dec 21, 2014
 5 |  *      Author: ivan
 6 |  */
 7 | 
 8 | #ifndef REGION_H_
 9 | #define REGION_H_
10 | 
11 | #include <stdlib.h>
12 | #include <string>
13 | #include <sstream>
14 | // #include "index/index.h"
15 | #include "minimizer_index/minimizer_index.h"
16 | 
// Describes a candidate region on the indexed reference data. A region may be split in two
// parts (see is_split), in which case [split_start, split_end] describes the second part.
struct Region {
  // Coordinates of the main part of the region.
  int64_t start{0};
  int64_t end{0};

  // Identification of the reference the region belongs to; -1 while unassigned.
  int64_t reference_id{-1};
  std::string rname;

  // Position of the region among the selected regions (-1 while unassigned) and its vote count.
  int64_t region_index{-1};
  int64_t region_votes{0};

  // Set when the region consists of two parts; the second part is given below.
  bool is_split{false};
  int64_t split_start{0};
  int64_t split_end{0};
};
28 | 
29 | //// Creates a copy of the region data from the Index.
30 | //int CopyLinearRegion(const MinimizerIndex *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset);
31 | 
32 | // If the region is split in two parts, that is if the genome is circular, this function copies both parts in a new data array.
33 | // It is users responsibility to free the allocated space using delete[].
34 | int ConcatenateSplitRegion(std::shared_ptr<is::MinimizerIndex> index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end);
35 | 
36 | // Checks if the region is linear or split. If the region is linear, it returns the pointer to the existing part of the Index data and is_cleanup_required is set to false.
37 | // Otherwise, a new data array is allocated and the data copied from the split parts of the Index.
38 | // If the is_cleanup_required parameter is true, region_data needs to be freed by the user using free().
39 | int GetRegionData(std::shared_ptr<is::MinimizerIndex> index, const Region *region,
40 |                   int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end, bool *is_cleanup_required);
41 | 
42 | //// Checks if the region is linear or split. It copies the data to a new array, and returns the pointer to the region data.
43 | //// region_data needs to be freed by the user using free().
44 | //int GetRegionDataCopy(const MinimizerIndex *index, const Region *region,
45 | //                  int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end);
46 | 
47 | // Simply verbose region's details.
48 | std::string VerboseRegionAsString(Region &region);
49 | 
50 | #endif /* REGION_H_ */
51 | 


--------------------------------------------------------------------------------
/src/containers/results.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * results.h
  3 |  *
  4 |  *  Created on: Jan 16, 2016
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef SRC_CONTAINERS_RESULTS_H_
  9 | #define SRC_CONTAINERS_RESULTS_H_
 10 | 
 11 | #include <stdint.h>
 12 | #include <string>
 13 | #include "containers/range.h"
 14 | #include "utility/utility_general.h"
 15 | #include "containers/region.h"
 16 | 
 17 | 
 18 | 
 19 | typedef struct Cluster {
 20 |  public:
 21 |   Range query;
 22 |   Range ref;
 23 |   int32_t num_anchors = 0;
 24 |   int32_t coverage = 0;
 25 |   bool valid = false;
 26 |   SeqOrientation orientation;
 27 |   Region region;
 28 | } Cluster;
 29 | 
 30 | typedef struct MappingResults {
 31 |   int64_t lcs_length = 0;
 32 |   int64_t cov_bases_max = 0;
 33 |   int64_t cov_bases_query = 0;
 34 |   int64_t cov_bases_ref = 0;
 35 |   int64_t num_covering_kmers = 0;
 36 |   float deviation = 0.0f;
 37 |   Range query_coords;
 38 |   Range ref_coords;
 39 |   bool is_mapped = false;
 40 |   bool is_reverse = false;
 41 |   int64_t local_score_id = 0;
 42 |   std::vector<Cluster> clusters;
 43 | 
 44 | //  int64_t num_same_mappings = 0;      // How many mapping positions have exactly the same score.
 45 | } MappingResults;
 46 | 
 47 | typedef struct L1Results {
 48 |   int64_t l1_l = 0;
 49 |   double l1_k = 1.0f;
 50 |   int64_t l1_lmin = 0;
 51 |   int64_t l1_lmax = 0;
 52 |   double l1_confidence_abs = 0;
 53 |   double l1_std = 0;
 54 |   int64_t l1_rough_start = 0;
 55 |   int64_t l1_rough_end = 0;
 56 | } L1Results;
 57 | 
 58 | typedef struct AlignmentResults {
 59 |   bool is_aligned = false;
 60 |   bool is_reverse = false;            // This should be deprecated and replaced with 'orientation'.
 61 |   int64_t ref_start = 0;              // Starting position of the alignment on the reference. If orientation == kReverse, this assumes that the read should be reverse complemented and the reference stays fwd. pos_start is adjusted accordingly to denote the starting position of the alignment of the reversed read.
 62 |   int64_t ref_end = 0;                // See pos_start. This is the end position of the alignment.
 63 |   int64_t query_start = 0;            // Starting position of the alignment on the read. Everything before this position should be clipped.
 64 |   int64_t query_end = 0;              // Ending position of the alignment on the read. Everything after this position should be clipped.
 65 |   std::string cigar = "*";            // In case orientation == kReverse, 'cigar' contains the reverse of the 'alignment' operations.
 66 |   std::string md = "";
 67 |   int64_t edit_distance = 0;
 68 |   int64_t alignment_score = 0;
 69 |   int64_t mapping_quality = 0;
 70 |   double evalue = 0.0f;
 71 |   int64_t num_secondary_alns = 0;      // How many mapping positions have similar score.
 72 | 
 73 |   int64_t raw_pos_start = 0;          // Internally, the fwd read is mapped to a reference and its reverse complement (which have been joined in a single massive sequence). The raw_pos_start then holds the absolute coordinate of the alignment in such joined sequence data.
 74 |   int64_t raw_pos_end = 0;            // See raw_pos_start. This is the end position of the alignment in global coordinates.
 75 |   std::vector<uint8_t> raw_alignment;     // Hold the alignment in the global coordinate space (between raw_pos_start and raw_pos_end). Cannot be used with pos_start and pos_end in case the read should be reverse complemented. In this case, the alignment needs to be reversed.
 76 |   std::vector<uint8_t> alignment;     // Hold the alignment in the local coordinate space (between ref_start and ref_end). If orientation == kForward, alignment == raw_alignment. Otherwise it's the reverse complement.
 77 | 
 78 |   SeqOrientation orientation = kForward;
 79 |   int64_t ref_id = -1;
 80 |   std::string ref_header = "*";
 81 |   int64_t ref_len = 0;
 82 |   int64_t query_id = -1;
 83 |   std::string query_header = "*";
 84 |   int64_t query_len = 0;
 85 | 
 86 |   int64_t num_eq_ops = 0;
 87 |   int64_t num_x_ops = 0;
 88 |   int64_t num_i_ops = 0;
 89 |   int64_t num_d_ops = 0;
 90 |   int64_t nonclipped_length = 0;
 91 | 
 92 | //  int8_t *ref_data = NULL;
 93 | //  int8_t *read_data = NULL;
 94 | 
 95 |   // These are parameters of alignment which were used to produce the results.
 96 |   int32_t aln_mode_code = 0;          // Type of alignment which was performed to produce the results stored in this structure.
 97 | 
 98 |   int64_t reg_pos_start = 0;          // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function.
 99 |   int64_t reg_pos_end = 0;            // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function.
100 | 
101 | } AlignmentResults;
102 | 
103 | 
104 | 
// Diagnostic information collected while processing a read: the reason why it is unmapped
// (initially "Not processed.") and per-stage time measurements, all starting at zero.
struct MappingMetadata {
  std::string unmapped_reason{"Not processed."};

  // Top-level stages.
  double time_region_selection{0.0};
  double time_mapping{0.0};
  double time_alignment{0.0};

  // Region-related measurements.
  double time_region_seed_lookup{0.0};
  double time_region_hitsort{0.0};
  double time_region_conversion{0.0};
  double time_region_alloc{0.0};
  double time_region_counting{0.0};
};
120 | 
121 | #endif /* SRC_CONTAINERS_RESULTS_H_ */
122 | 


--------------------------------------------------------------------------------
/src/containers/score_registry.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * score_registry.cc
  3 |  *
  4 |  *  Created on: Jul 14, 2014
  5 |  *      Author: ivan
  6 |  */
  7 | 
  8 | #include "containers/score_registry.h"
  9 | 
 10 | ScoreRegistry::ScoreRegistry() {
 11 |   scores_id_ = 0;
 12 | }
 13 | 
 14 | ScoreRegistry::ScoreRegistry(const Region& region, int64_t scores_id) {
 15 |   set_region(region);
 16 |   set_scores_id(scores_id);
 17 | }
 18 | 
 19 | ScoreRegistry::~ScoreRegistry() {
 20 |   Clear();
 21 | }
 22 | 
 23 | void ScoreRegistry::Clear() {
 24 | //  registry_.clear();
 25 |   registry_entries_.Clear();
 26 |   scores_id_ = 0;
 27 | }
 28 | 
 29 | void ScoreRegistry::Add(Vertices &src_vertices, int64_t vertex_idx) {
 30 | //  registry_.push_back(vertex_data);
 31 |   registry_entries_.Add(src_vertices, vertex_idx);
 32 | }
 33 | 
 34 | void ScoreRegistry::Register(Vertices &src_vertices, int64_t vertex_idx) {
 35 |   if (src_vertices.registry_numbers[vertex_idx] < 0) { // || vertex_data.registry_number >= registry_.size()) {
 36 |     src_vertices.registry_numbers[vertex_idx] = registry_entries_.num_vertices;
 37 |     registry_entries_.Add(src_vertices, vertex_idx);
 38 | 
 39 |   }
 40 |   else {
 41 |     // Handle the case where a repeating kmer causes a 'jump' in the middle of an existing long path.
 42 |     // Edit 07.11.2014.: Because of the condition that a kmer needs to be within l iterations from the
 43 |     // vertex's path that it want's to extend, the kmer cannot hit it somewhere in the middle of the path.
 44 |     // It can only occur near the end of the path, and can only cause the path to have a more-or-less
 45 |     // even/uneven length in the reference and the query. For this reason, I think that forking a path
 46 |     // is perhaps not a good option, but instead to check for its ratio in query and in reference, and
 47 |     // choose to extend the path with the new kmer only if the ratio is closer to 1.0f.
 48 |     // For precaution sake, I'll keep the previous version here in comments.
 49 | //    if (vertex_data.covered_bases < registry_[vertex_data.registry_number].covered_bases) {
 50 | //      vertex_data.registry_number = registry_.size();
 51 | //      registry_.push_back(vertex_data);
 52 | //    } else {
 53 | //      registry_[vertex_data.registry_number] = vertex_data;
 54 | //    }
 55 | 
 56 |     int64_t registry_number = src_vertices.registry_numbers[vertex_idx];
 57 | 
 58 |     if ((src_vertices.num_kmers[vertex_idx] > registry_entries_.num_kmers[registry_number]) ||
 59 |         (src_vertices.num_kmers[vertex_idx] <= registry_entries_.num_kmers[registry_number] &&
 60 |             src_vertices.CalculateSuppress(vertex_idx) < registry_entries_.CalculateSuppress(registry_number))) {
 61 | 
 62 |         registry_entries_.CopyValuesFromOut(src_vertices, vertex_idx, registry_number);
 63 |     }
 64 |   }
 65 | }
 66 | 
 67 | std::string ScoreRegistry::VerboseToString() {
 68 |   std::stringstream ss;
 69 | 
 70 |   ss << "Num scores: " << registry_entries_.num_vertices << std::endl;
 71 | 
 72 |   for (int64_t i=0; i<registry_entries_.num_vertices; i++) {
 73 |     ss << "[" << i << "] (" << registry_entries_.VerboseToString(i) << ")" << std::endl;
 74 |   }
 75 | 
 76 | //  ss << std::endl;
 77 | 
 78 |   return ss.str();
 79 | }
 80 | 
 81 | const Region& ScoreRegistry::get_region() const {
 82 |   return region_;
 83 | }
 84 | 
 85 | void ScoreRegistry::set_region(const Region& region) {
 86 |   region_ = region;
 87 | }
 88 | 
 89 | int64_t ScoreRegistry::get_scores_id() const {
 90 |   return scores_id_;
 91 | }
 92 | 
 93 | void ScoreRegistry::set_scores_id(int64_t scoresId) {
 94 |   scores_id_ = scoresId;
 95 | }
 96 | 
 97 | const Vertices& ScoreRegistry::get_registry_entries() const {
 98 |   return registry_entries_;
 99 | }
100 | 
101 | void ScoreRegistry::Reserve(int64_t size) {
102 |   registry_entries_.Reserve(size);
103 | }
104 | 
105 | void ScoreRegistry::set_registry_entries(Vertices& registryEntries) {
106 |   registry_entries_ = registryEntries;
107 | }
108 | 


--------------------------------------------------------------------------------
/src/containers/score_registry.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * score_registry.h
 3 |  *
 4 |  *  Created on: Jul 14, 2014
 5 |  *      Author: ivan
 6 |  */
 7 | 
 8 | #ifndef SCORE_REGISTRY_H_
 9 | #define SCORE_REGISTRY_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include <list>
14 | #include <sstream>
15 | #include "containers/region.h"
16 | #include "sequences/single_sequence.h"
17 | #include "sequences/sequence_file.h"
18 | #include "containers/vertices.h"
19 | 
20 | class ScoreRegistry {
21 |  public:
22 |   ScoreRegistry();
23 |   ScoreRegistry(const Region& region, int64_t scores_id);
24 |   ~ScoreRegistry();
25 | 
26 |   /// Empties the registry and sets all values to zero.
27 |   void Clear();
28 | 
29 |   /// Simply appends the data to the end of the registry and updates the top score.
30 |   /// No additional checks are performed.
31 |   void Add(Vertices &src_vertices, int64_t vertex_idx);
32 | 
33 |   /// If the data has a registry number >= 0, then the entry with that index will be updated.
34 |   /// Otherwise, if registry number < 0 or if the suppress is smaller than the existing one,
35 |   /// the new data will only be appended to the end of the registry, and its registry number
36 |   /// will be updated.
37 |   void Register(Vertices &src_vertices, int64_t vertex_idx);
38 | 
39 |   // Allocates space for vertices.
40 |   void Reserve(int64_t size);
41 | 
42 |   /// Formats the debug verbose to a std::string.
43 |   std::string VerboseToString();
44 |   const Region& get_region() const;
45 |   void set_region(const Region& region);
46 |   int64_t get_scores_id() const;
47 |   void set_scores_id(int64_t scoresId);
48 |   const Vertices& get_registry_entries() const;
49 |   void set_registry_entries(Vertices& registryEntries);
50 | 
51 |  private:
52 |   Vertices registry_entries_;
53 |   Region region_;
54 |   int64_t scores_id_;
55 | };
56 | 
57 | #endif /* SCORE_REGISTRY_H_ */
58 | 


--------------------------------------------------------------------------------
/src/containers/vertices.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * vertices.h
  3 |  *
  4 |  *  Created on: Feb 13, 2015
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef VERTICES_H_
  9 | #define VERTICES_H_
 10 | 
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | #include <sstream>
 15 | #include "log_system/log_system.h"
 16 | 
 17 | 
 18 | 
 19 | // Quite an ugly data structure, but very cache friendly.
 20 | class Vertices {
 21 |  public:
 22 |   int64_t *timestamps;
 23 |   int64_t *reference_starts;
 24 |   int64_t *reference_ends;
 25 |   int64_t *query_starts;
 26 |   int64_t *query_ends;
 27 |   int64_t *num_kmers;
 28 |   int64_t *covered_bases_queries;
 29 |   int64_t *covered_bases_references;
 30 |   int64_t *registry_numbers;
 31 | 
 32 |   int64_t num_vertices;
 33 |   int64_t container_capacity;
 34 | 
 35 |   Vertices();
 36 |   ~Vertices();
 37 |   void Clear();
 38 | 
 39 |   inline int Init(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start,
 40 |            int64_t query_start, int64_t kmer_length, int64_t registry_number) {
 41 |     return Set(dest_vertex_idx, timestamp, reference_start, reference_start, query_start, query_start, 1, kmer_length, kmer_length, registry_number);
 42 |   }
 43 | 
 44 |   inline int Set(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start,
 45 |           int64_t reference_end, int64_t query_start, int64_t query_end,
 46 |            int64_t num_kmer, int64_t covered_bases_query,
 47 |            int64_t covered_bases_reference, int64_t registry_number) {
 48 |     if (dest_vertex_idx >= num_vertices || dest_vertex_idx < 0) {
 49 |       return 1;
 50 |     }
 51 | 
 52 |     timestamps[dest_vertex_idx] = timestamp;
 53 |     reference_starts[dest_vertex_idx] = reference_start;
 54 |     reference_ends[dest_vertex_idx] = reference_end;
 55 |     query_starts[dest_vertex_idx] = query_start;
 56 |     query_ends[dest_vertex_idx] = query_end;
 57 |     num_kmers[dest_vertex_idx] = num_kmer;
 58 |     covered_bases_queries[dest_vertex_idx] = covered_bases_query;
 59 |     covered_bases_references[dest_vertex_idx] = covered_bases_reference;
 60 |     registry_numbers[dest_vertex_idx] = registry_number;
 61 | 
 62 |     return 0;
 63 |   }
 64 | 
 65 |   inline int Add(int64_t timestamp, int64_t reference_start,
 66 |            int64_t reference_end, int64_t query_start, int64_t query_end,
 67 |            int64_t num_kmer, int64_t covered_bases_query,
 68 |            int64_t covered_bases_reference, int64_t registry_number) {
 69 |     if (num_vertices >= container_capacity) {
 70 |       Reserve(container_capacity + capacity_increment_size_);
 71 |     }
 72 | 
 73 |     num_vertices += 1;
 74 |     Set((num_vertices - 1), timestamp, reference_start, reference_end, query_start, query_end, num_kmer, covered_bases_query, covered_bases_reference, registry_number);
 75 | 
 76 |     return 0;
 77 |   }
 78 | 
 79 |   inline int Add(const Vertices &src_vertices, int64_t src_vertex_idx) {
 80 |     return Add(src_vertices.timestamps[src_vertex_idx],
 81 |                 src_vertices.reference_starts[src_vertex_idx],
 82 |                 src_vertices.reference_ends[src_vertex_idx],
 83 |                 src_vertices.query_starts[src_vertex_idx],
 84 |                 src_vertices.query_ends[src_vertex_idx],
 85 |                 src_vertices.num_kmers[src_vertex_idx],
 86 |                 src_vertices.covered_bases_queries[src_vertex_idx],
 87 |                 src_vertices.covered_bases_references[src_vertex_idx],
 88 |                 src_vertices.registry_numbers[src_vertex_idx]);
 89 |   }
 90 | 
 91 |   void Reserve(int64_t size);
 92 |   void Resize(int64_t size);
 93 | 
 94 |   inline int CopyValuesWithin(int64_t source_idx, int64_t dest_idx) {
 95 |     if (source_idx >= num_vertices || dest_idx >= num_vertices || source_idx < 0 || dest_idx < 0) {
 96 |       LogSystem::GetInstance().Error(SEVERITY_INT_WARNING, __FUNCTION__, LogSystem::GetInstance().GenerateErrorMessage(ERR_MEMORY, "When CopyValuesWithin is called. source_idx = %ld, dest_idx = %ld, num_vertices = %ld\n", source_idx, dest_idx, num_vertices));
 97 |       return 1;
 98 |     }
 99 | 
100 |     timestamps[dest_idx] = timestamps[source_idx];
101 |     reference_starts[dest_idx] = reference_starts[source_idx];
102 |     reference_ends[dest_idx] = reference_ends[source_idx];
103 |     query_starts[dest_idx] = query_starts[source_idx];
104 |     query_ends[dest_idx] = query_ends[source_idx];
105 |     num_kmers[dest_idx] = num_kmers[source_idx];
106 |     covered_bases_queries[dest_idx] = covered_bases_queries[source_idx];
107 |     covered_bases_references[dest_idx] = covered_bases_references[source_idx];
108 |     registry_numbers[dest_idx] = registry_numbers[source_idx];
109 | 
110 |     return 0;
111 |   }
112 | 
113 |   inline int CopyValuesFromOut(Vertices &src_vertices, int64_t src_vertex_idx, int64_t dest_idx) {
114 |     return Set(dest_idx,
115 |                src_vertices.timestamps[src_vertex_idx],
116 |                src_vertices.reference_starts[src_vertex_idx],
117 |                src_vertices.reference_ends[src_vertex_idx],
118 |                src_vertices.query_starts[src_vertex_idx],
119 |                src_vertices.query_ends[src_vertex_idx],
120 |                src_vertices.num_kmers[src_vertex_idx],
121 |                src_vertices.covered_bases_queries[src_vertex_idx],
122 |                src_vertices.covered_bases_references[src_vertex_idx],
123 |                src_vertices.registry_numbers[src_vertex_idx]);
124 |   }
125 | 
126 |   inline void EraseValues() {
127 |     if (num_vertices <= 0)
128 |       return;
129 | 
130 |     memset(timestamps, -1, num_vertices);
131 |     memset(reference_starts, 0, num_vertices);
132 |     memset(reference_ends, 0, num_vertices);
133 |     memset(query_starts, 0, num_vertices);
134 |     memset(query_ends, 0, num_vertices);
135 |     memset(num_kmers, 0, num_vertices);
136 |     memset(covered_bases_queries, 0, num_vertices);
137 |     memset(covered_bases_references, 0, num_vertices);
138 |     memset(registry_numbers, -1, num_vertices);
139 |   }
140 | 
141 |   inline float CalculateRatio(int64_t vertex_idx) {
142 |     float ratio = 0.0f;
143 |     int64_t query_start = query_starts[vertex_idx];
144 |     int64_t query_end = query_ends[vertex_idx];
145 |     int64_t reference_start = reference_starts[vertex_idx];
146 |     int64_t reference_end = reference_ends[vertex_idx];
147 | 
148 |     int64_t query_distance = (query_end >= query_start) ? (query_end - query_start) : (query_start - query_end);
149 |     int64_t ref_distance = (reference_end >= reference_start) ? (reference_end - reference_start) : (reference_start - reference_end);
150 | 
151 |     if (query_distance != 0)
152 |       ratio = ((float) std::min(query_distance, ref_distance)) / ((float) std::max(query_distance, ref_distance));
153 |     else
154 |       ratio = 1.0f;
155 | 
156 |     return ratio;
157 |   }
158 | 
159 |   inline float CalculateSuppress(int64_t vertex_idx) {
160 |     float ratio = 0.0f, ratio_suppress = 0.0f;
161 | 
162 |     ratio = CalculateRatio(vertex_idx);
163 | 
164 |     ratio_suppress = (ratio < 1.0f) ? (1.0f - ratio) : (ratio - 1.0f);
165 | 
166 |     return ratio_suppress;
167 |   }
168 | 
169 |   inline std::string VerboseToString(int64_t vertex_idx) const {
170 |     std::stringstream ret;
171 | 
172 |     if (vertex_idx < 0 || vertex_idx >= num_vertices) {
173 |       ret << "Error with vertex_idx! vertex_idx = " << vertex_idx << ", containter_capacity = " << container_capacity << ", num_vertices = " << num_vertices;
174 |       return ret.str();
175 |     }
176 | 
177 |     ret << "timestamp = " << timestamps[vertex_idx];
178 |     ret <<  "; q[" << query_starts[vertex_idx] << ", " << query_ends[vertex_idx] << "]; r[" << reference_starts[vertex_idx]<< ", " <<
179 |         reference_ends[vertex_idx] <<
180 |             "]; d[" << (query_ends[vertex_idx] - query_starts[vertex_idx]) << ", " << (reference_ends[vertex_idx] - reference_starts[vertex_idx]) <<
181 |             "]; length = " << num_kmers[vertex_idx] <<
182 |             "; dist_ratio = " << ((double) std::min((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) / ((double) std::max((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) <<
183 |             "; cov_bases_query = " << covered_bases_queries[vertex_idx] << "; cov_bases_ref = " << covered_bases_references[vertex_idx] << "; registry_num = " << registry_numbers[vertex_idx];
184 | 
185 |     return ret.str();
186 |   }
187 | 
188 |  private:
189 |   inline int ReallocArray_(int64_t **array_ptr, int64_t size);
190 |   int64_t capacity_increment_size_;
191 | };
192 | 
193 | #endif /* VERTICES_H_ */
194 | 


--------------------------------------------------------------------------------
/src/graphmap/experimental.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * experimental.cc
 3 |  *
 4 |  *  Created on: Jan 20, 2016
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #include "graphmap/graphmap.h"
 9 | 
10 | 


--------------------------------------------------------------------------------
/src/graphmap/filter_anchors.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * filter_anchors.h
 3 |  *
 4 |  *  Created on: Mar 22, 2016
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_GRAPHMAP_FILTER_ANCHORS_H_
 9 | #define SRC_GRAPHMAP_FILTER_ANCHORS_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include <algorithm>
14 | #include <stdint.h>
15 | #include "sequences/single_sequence.h"
16 | #include "sequences/sequence_file.h"
17 | #include "containers/vertices.h"
18 | #include "program_parameters.h"
19 | 
20 | #include "containers/score_registry.h"
21 | #include "utility/utility_general.h"
22 | #include "containers/region.h"
23 | #include "containers/mapping_data.h"
24 | #include "containers/vertices.h"
25 | 
26 | /// These are some constants used for filtering shady anchors.
27 | /// TODO: This can be omitted if dynamic programming was used to penalize the anchor distances.
28 | /// int64_t min_covered_bases = (new_cluster->query.end - new_cluster->query.start + 1) * MIN_CLUSTER_COVERAGE_FACTOR;
29 | #define MIN_CLUSTER_COVERAGE_FACTOR 0.05f
30 | /// int64_t min_cluster_length = read->get_sequence_length() * MIN_CLUSTER_LENGTH_FACTOR;
31 | #define MIN_CLUSTER_LENGTH_FACTOR 0.03f
32 | 
33 | using int128_t = __int128;            // Signed 128-bit integer (compiler extension type).
34 | using uint128_t = unsigned __int128;  // Unsigned 128-bit integer; holds one packed hit (see pack128).
35 | /// Field extractors for a packed hit. The argument is parenthesized so that compound expressions such as `a | b` are shifted and masked as a whole.
36 | #define get128_qid(x)   ((int32_t) ((x) & 0x0FFFFFFFF))
37 | #define get128_rpos(x)  ((int32_t) (((x) >> 32) & 0x0FFFFFFFF))
38 | #define get128_qpos(x)  ((int32_t) (((x) >> 64) & 0x0FFFFFFFF))
39 | #define get128_rid(x)   ((int32_t) (((x) >> 96) & 0x0FFFFFFFF))
40 | /// Packs four values into one uint128_t. Fields are not masked, so each value must fit in 32 unsigned bits or it spills into the neighbouring fields.
41 | /// Layout: ref_id << 96 | query_start << 64 | ref_start << 32 | query_id
42 | #define pack128(qstart,rstart,qid,rid)         ((((uint128_t) (rid)) << 96) | (((uint128_t) (qstart)) << 64) | (((uint128_t) (rstart)) << 32) | ((uint128_t) (qid)))
43 | 
44 | struct ClusterAndIndices {  // Per-cluster record: two ranges, two counters and a list of anchor indices.
45 |   Range query;              // Presumably the extent of the cluster on the query (Range is declared in another header) - TODO confirm.
46 |   Range ref;                // Presumably the extent of the cluster on the reference - TODO confirm.
47 |   int32_t num_anchors = 0;  // Starts at zero; presumably the number of anchors in the cluster.
48 |   int32_t coverage = 0;     // Starts at zero; units are not visible here (presumably covered bases) - TODO confirm.
49 |   std::vector<int> lcskpp_indices;  // Presumably indices of the member anchors in the same index space as the lcskpp_indices arguments below - verify against callers.
50 | };
51 | 
52 | int64_t CalcScore(int32_t qpos, int32_t rpos, int32_t next_qpos, int32_t next_rpos, double indel_bandwidth_margin, int32_t fwd_length, int32_t dist_aab, int32_t dist_dbm, double *score_gap, double *score_dist);
53 | 
54 | void GetPositionsFromRegistry2(const Vertices& registry_entries, int64_t vertex_id, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end);
55 | void GetPositionsFromRegistry(const Vertices& registry_entries, const std::vector<int> &lcskpp_indices, int64_t lcskpp_id, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end);
56 | void GetPositionsFrom128bit(const std::vector<uint128_t> &hits, const std::vector<int> &lcskpp_indices, int64_t lcskpp_id, int32_t seed_len, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end);
57 | 
58 | int FilterAnchorsByDiff(const SingleSequence* read, ScoreRegistry* local_score, const ProgramParameters *parameters,
59 |                   const std::vector<int> &lcskpp_indices, std::vector<int> &ret_filtered_lcskpp_indices);
60 | 
61 | int FilterAnchorsByChaining(const SingleSequence* seq, ScoreRegistry* local_score, const ProgramParameters *parameters,
62 |                   const std::vector<int> &lcskpp_indices, double indel_bandwidth_margin, int32_t max_dist, int32_t lookahead_dist_factor, int64_t min_covered_bases, int32_t cluster_size_cutoff,
63 |                   std::vector<int> &ret_filtered_lcskpp_indices, std::vector<int32_t> *ret_cluster_ids);
64 | 
65 | int GenerateClusters(int64_t min_num_anchors_in_cluster, int64_t min_cluster_length, int64_t min_cluster_covered_bases, float min_cluster_coverage, std::vector<int> &lcskpp_indices,
66 |                      ScoreRegistry* local_score, MappingData* mapping_data,
67 |                      const SingleSequence* read, const ProgramParameters* parameters, std::vector<ClusterAndIndices *> &ret_clusters,
68 |                      std::vector<int> &ret_filtered_lcskpp_indices, std::vector<int32_t> *ret_cluster_ids);
69 | int GenerateClustersDummy(int64_t min_cluster_length, float min_cluster_coverage, std::vector<int> &lcskpp_indices,
70 |                      ScoreRegistry* local_score, MappingData* mapping_data,
71 |                      const SingleSequence* read, const ProgramParameters* parameters, std::vector<ClusterAndIndices *> &ret_clusters,
72 |                      std::vector<int> &ret_filtered_lcskpp_indices, std::vector<int32_t> *ret_cluster_ids);
73 | /// Declaration only. NOTE(review): uses std::shared_ptr and is::MinimizerIndex, but this header includes neither <memory> nor a minimizer index header directly; presumably both arrive through the project includes above - confirm.
74 | int VerboseClustersToFile_(std::string out_file, const ScoreRegistry* local_score, const MappingData* mapping_data, std::vector<std::shared_ptr<is::MinimizerIndex>> &indexes, const SingleSequence* read, const ProgramParameters* parameters, const std::vector<ClusterAndIndices *> &clusters);
75 | 
76 | #endif /* SRC_GRAPHMAP_FILTER_ANCHORS_H_ */
77 | 


--------------------------------------------------------------------------------
/src/graphmap/transcriptome.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * transcriptome.h
  3 |  *
  4 |  *  Created on: Feb 6, 2017
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef SRC_GRAPHMAP_TRANSCRIPTOME_H_
  9 | #define SRC_GRAPHMAP_TRANSCRIPTOME_H_
 10 | 
 11 | #include <stdint.h>
 12 | #include <map>
 13 | #include <memory>
 14 | #include <string>
 15 | #include <vector>
 16 | 
 17 | #include "sequences/sequence_file.h"
 18 | 
 19 | namespace is {
 20 | 
 21 | class Transcriptome;
 22 | 
 23 | std::shared_ptr<Transcriptome> createTranscriptome();
 24 | 
 25 | class Transcriptome {  // Exon annotations parsed from a GTF file plus the lookup tables derived from them (see the private members).
 26 |  public:
 27 |   friend std::shared_ptr<Transcriptome> createTranscriptome();  // Factory; it is a friend because the constructor is private.
 28 |   ~Transcriptome();
 29 | 
 30 |   /* Parses exons and extracts regions from the given GTF file.
 31 |    */
 32 |   int LoadGTF(const std::string &gtf_path);
 33 | 
 34 |   /* Constructs transcriptome sequences from the preloaded GTF file.
 35 |    */
 36 |   std::shared_ptr<SequenceFile> GenerateTranscriptomeSeqs(const std::shared_ptr<SequenceFile> sequences);
 37 | 
 38 |   /* Generates a header for a SAM file. The header is composed of
 39 |    * genomic sequence names.
 40 |    */
 41 |   std::string GenerateSAMHeaders();
 42 |   // Read-only accessors; each returns a const reference to the private map of the same name.
 43 |   const std::map<std::string, std::vector<std::pair<std::string, char> > >& get_genome_id_to_trans_id() const {
 44 |     return genome_id_to_trans_id_;
 45 |   }
 46 | 
 47 |   const std::map<std::string, std::vector<std::pair<int64_t, int64_t> > >& get_trans_id_to_exons() const {
 48 |     return trans_id_to_exons_;
 49 |   }
 50 | 
 51 |   const std::map<std::string, std::vector<std::pair<int64_t, int64_t> > >& get_trans_id_to_regions() const {
 52 |     return trans_id_to_regions_;
 53 |   }
 54 | 
 55 |   const std::map<std::string, std::pair<std::string, char>>& get_trans_id_to_genome_id() const {
 56 |     return trans_id_to_genome_id_;
 57 |   }
 58 | 
 59 |   const std::map<std::string, int64_t>& get_genome_id_to_len() const {
 60 |     return genome_id_to_len_;
 61 |   }
 62 | 
 63 |  private:
 64 |   std::string gtf_path_;
 65 | 
 66 |   // A map from genome (chromosome) name (e.g. header split to first space) to a vector containing all transcripts which can be generated from that chromosome.
 67 |   // Each pair is a (transcript_id, strand), where strand is either '+' or '-';
 68 |   std::map<std::string, std::vector<std::pair<std::string, char>>> genome_id_to_trans_id_;
 69 |   // Reverse map, to obtain the chromosome name when converting from transcriptome space back to genome space.
 70 |   // Second parameter of the pair is the orientation on the genome.
 71 |   std::map<std::string, std::pair<std::string, char>> trans_id_to_genome_id_;
 72 |   // A map from transcript_id to a vector containing pairs of coordinates. Each pair of coordinates represents one exon of the transcript.
 73 |   std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> trans_id_to_exons_;
 74 |   // A list of exons in such way that it combines overlapping exons into regions.
 75 |   std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> trans_id_to_regions_;
 76 |   // Length of each chromosome in genome space. Needed for reversing the mapping if transcriptome was reverse complemented.
 77 |   std::map<std::string, int64_t> genome_id_to_len_;
 78 | 
 79 |   Transcriptome();      // Private constructor; instances are obtained through the friend factory createTranscriptome(), which returns a std::shared_ptr.
 80 |   Transcriptome(const Transcriptome&) = delete;  // Copying is disabled.
 81 |   const Transcriptome& operator=(const Transcriptome&) = delete;
 82 | 
 83 |   // Creates transcript sequences from the given reference sequences and previously parsed exon annotations.
 84 |   // @param genome_id_to_trans_id Map from genome (chromosome) name to the (transcript_id, strand) pairs located on it.
 85 |   // @param trans_id_to_exons Map from transcript_id to the coordinate pairs of its exons.
 86 |   // @param references A SequenceFile object which contains reference sequences already loaded from disk.
 87 |   // @param transcripts A SequenceFile which will contain the generated transcripts.
 88 |   // @return 0 if everything went fine (C-style).
 89 |   int MakeTranscript_(const std::map<std::string, std::vector<std::pair<std::string, char>>> &genome_id_to_trans_id,
 90 |                       const std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> &trans_id_to_exons,
 91 |                       const std::shared_ptr<SequenceFile> references, std::shared_ptr<SequenceFile> transcripts) const;
 92 |   /** Resolves lists of exons in such way that it combines overlapping exons into regions.
 93 |    * The result maps each transcript id to its list of regions and is written to the output parameter.
 94 |    * @param trans_id_to_exons A map from transcript ID (name) to a vector of exons which make this transcript.
 95 |    * @param trans_id_to_regions Generated return map from transcript ID (name) to a vector containing regions.
 96 |    * @return 0 if everything went fine (C-style).
 97 |    */
 98 |   int MakeRegions_(const std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> &trans_id_to_exons,
 99 |                    std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> &trans_id_to_regions) const;
100 |   int ParseExons_(const std::string &annotations_path,
101 |                   std::map<std::string, std::vector<std::pair<std::string, char>>> &genomeToTrans,
102 |                   std::map<std::string, std::pair<std::string, char>> &transIdToGenomeId,
103 |                   std::map<std::string, std::vector<std::pair<int64_t, int64_t>>> &transToExons) const;  // Presumably reads annotations_path and fills the three non-const maps; the implementation is not visible here - TODO confirm.
104 |   void HashGenomeLengths_(const std::shared_ptr<SequenceFile> sequences, std::map<std::string, int64_t> &rlens) const;  // Presumably fills rlens with sequence name -> length (compare genome_id_to_len_) - TODO confirm.
105 |   std::string trim_(std::string s) const;
106 |   std::vector<std::string> split_(std::string s, char c) const;
107 |   std::string getSequenceName_(const SingleSequence &seq) const;
108 |   std::string getTID_(const std::string &chr_name, const std::string &attributes) const;
109 | //  void outputSeq_(char *header, size_t headerLen, const int8_t *seq, size_t seqLen) const;
110 | 
111 | };
112 | 
113 | } /* namespace is */
114 | 
115 | #endif /* SRC_GRAPHMAP_TRANSCRIPTOME_H_ */
116 | 


--------------------------------------------------------------------------------
/src/index/index_util.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * index_util.cc
 3 |  *
 4 |  *  Created on: Feb 6, 2017
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #include "index_util.h"
 9 | #include "utility/utility_general.h"
10 | 
11 | namespace is {
12 | 
13 | std::string GenerateSAMHeader(std::shared_ptr<is::MinimizerIndex> index,
14 |                                   ProgramParameters& parameters) {
15 |   // Assembles the SAM header text: one @HD line, one @SQ line per forward
16 |   // reference sequence of the index, and a closing @PG line (no trailing newline).
17 |   std::stringstream sam_header;
18 | 
19 |   // @HD: file-level header line.
20 |   sam_header << "@HD\tVN:1.0\tSO:unknown\n";
21 | 
22 |   // One @SQ record per forward sequence: name cut at the first space, plus its length.
23 |   for (int64_t seq_id = 0; seq_id < index->get_num_sequences_forward(); ++seq_id) {
24 |     const std::string seq_name = TrimToFirstSpace(index->get_headers()[seq_id]);
25 |     const uint64_t seq_len = (uint64_t) index->get_reference_lengths()[seq_id];
26 |     sam_header << "@SQ\tSN:" << seq_name
27 |                << "\tLN:" << seq_len << "\n";
28 |   }
29 | 
30 |   const bool hide_command_line = (parameters.verbose_sam_output == 1);
31 | 
32 |   if (!hide_command_line) {
33 |     // Regular @PG line: records the command line used to run the process and the
34 |     // program version together with its release date.
35 |     sam_header << "@PG\tID:graphmap\tPN:graphmap\t";
36 |     sam_header << "CL:" << parameters.command_line << "\t";
37 |     sam_header << "VN:" << std::string(GRAPHMAP_CURRENT_VERSION);
38 |     sam_header << " compiled on " << std::string(GRAPHMAP_CURRENT_VERSION_RELEASE_DATE);
39 |   } else {
40 |     // verbose_sam_output == 1 selects a reduced @PG line without the command line.
41 |     // This was used for the web server to omit paths from the output (not to
42 |     // share server sensitive information with users).
43 |     sam_header << "@PG\tID:graphmap\tPN:graphmap";
44 |   }
45 | 
46 |   return sam_header.str();
47 | }
48 | 
49 | std::string GenerateSAMHeader(std::shared_ptr<is::Transcriptome> transcriptome) {  // Overload taking a transcriptome instead of an index.
50 |   return transcriptome->GenerateSAMHeaders();  // Pure forwarder; transcriptome is dereferenced without a null check.
51 | }
52 | 
53 | } /* namespace is */
54 | 


--------------------------------------------------------------------------------
/src/index/index_util.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * index_util.h
 3 |  *
 4 |  *  Created on: Feb 6, 2017
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef SRC_INDEX_INDEX_UTIL_H_
 9 | #define SRC_INDEX_INDEX_UTIL_H_
10 | 
11 | #include "minimizer_index/minimizer_index.h"
12 | #include "graphmap/transcriptome.h"
13 | #include "../program_parameters.h"
14 | 
15 | namespace is {
16 | 
17 | std::string GenerateSAMHeader(std::shared_ptr<is::MinimizerIndex> index, ProgramParameters &parameters);
18 | 
19 | std::string GenerateSAMHeader(std::shared_ptr<is::Transcriptome> transcriptome);
20 | 
21 | } /* namespace is */
22 | 
23 | #endif /* SRC_INDEX_INDEX_UTIL_H_ */
24 | 


--------------------------------------------------------------------------------
/src/ksw2/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2017 Broad Institute, Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/src/ksw2/kalloc.cc:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <limits.h>
  5 | #include "kalloc.h"
  6 | 
  7 | /* The whole thing is: ("@" for the kheader_t of the block, "-" for free
  8 |  * memory, and "+" for allocated memory. One char for one unit.)
  9 |  *                        
 10 |  *           This region is core 1.                             This region is core 2.
 11 |  *
 12 |  *   @-------@++++++@++++++++++++@------------           @----------@++++++++++++@+++++++@------------
 13 |  *   |                           |                       |                               |
 14 |  *   p=p->ptr->ptr->ptr->ptr     p->ptr             p->ptr->ptr                p->ptr->ptr->ptr
 15 |  */
 16 | 
 17 | #define PTR(p) ((size_t*)((size_t*)p)[1]) /* "next" link of a block header: word [0] holds the block size in units, word [1] the address of the next free block */
 18 | 
 19 | typedef struct _allocated_t { /* node of the singly linked list that records every core obtained from malloc() */
 20 | 	struct _allocated_t *next; /* following node, or NULL */
 21 | 	size_t *ptr; /* start of the malloc()'ed core; released with free() in km_destroy() */
 22 | } allocated_t;
 23 | 
 24 | typedef struct { /* allocator state; created zero-filled by km_init() */
 25 | 	size_t base[2], *loop_head; /* base: zero-sized dummy block that kmalloc() links into the circular free list on first use; loop_head: where the next free-list search starts */
 26 | 	allocated_t list_head, *list_tail; /* list of cores: the first node is embedded, list_tail is the node that receives the next core */
 27 | 	size_t total_allocated; /* bytes requested from malloc() for cores so far */
 28 | } kmem_t;
 29 | 
 30 | void *km_init() /* Creates a new allocator state: a zero-filled kmem_t from calloc(), or NULL if calloc() fails. */
 31 | {
 32 | 	return calloc(1, sizeof(kmem_t)); /* all-zero state: loop_head == 0 makes kmalloc() set up the free list on first use */
 33 | }
 34 | 
 35 | static void kerror(const char *s) /* Reports a fatal allocator inconsistency: prints s plus a newline to stderr, then exits with status 1. */
 36 | {
 37 | 	fprintf(stderr, "%s\n", s);
 38 | 	exit(1); /* never returns to the caller */
 39 | }
 40 | 
 41 | static size_t *morecore(kmem_t *km, size_t nu) /* Gets a new core of at least nu units (rounded up to a multiple of 0x100000 units) from malloc(), records it for km_destroy(), adds it to the free list via kfree() and returns the new km->loop_head. Exits the process if memory is unavailable. */
 42 | {
 43 | 	size_t rnu, *up;
 44 | 	allocated_t *node; /* empty bookkeeping node that becomes the new list tail */
 45 | 	rnu = (nu + 0xfffff) & (~(size_t)0xfffff);
 46 | 	up = (size_t*)malloc(rnu * sizeof(size_t));
 47 | 	node = (allocated_t*)calloc(1, sizeof(allocated_t));
 48 | 	if (!up || !node) { /* fail to allocate memory; a missing node must be fatal too, or the list of cores would be silently corrupted on the next call */
 49 | 		km_stat(km);
 50 | 		fprintf(stderr, "[morecore] %lu bytes requested but not available.\n", (unsigned long)(rnu * sizeof(size_t))); /* cast makes the argument match %lu on every platform */
 51 | 		exit(1);
 52 | 	}
 53 | 	/* put the pointer in km->list_head */
 54 | 	if (km->list_tail == 0) km->list_tail = &km->list_head;
 55 | 	km->list_tail->ptr = up;
 56 | 	km->list_tail->next = node;
 57 | 	km->list_tail = node;
 58 | 	km->total_allocated += rnu * sizeof(size_t);
 59 | 	*up = rnu; /* the size of the current block, and in this case the block is the same as the new core */
 60 | 	kfree(km, up + 1); /* initialize the new "core" */
 61 | 	return km->loop_head;
 62 | }
 63 | 
 64 | void km_destroy(void *_km) /* Releases every core, every heap-allocated list node and the allocator state itself. A NULL _km is a no-op. */
 65 | {
 66 | 	kmem_t *km = (kmem_t*)_km;
 67 | 	allocated_t *p, *q;
 68 | 	if (km == 0) return;
 69 | 	p = &km->list_head; /* the first node is embedded in kmem_t, so it must never be passed to free() */
 70 | 	do {
 71 | 		q = p->next; /* read the link before the node may be freed */
 72 | 		free(p->ptr); /* release the core; free(NULL) is harmless when no core was ever allocated */
 73 | 		if (p != &km->list_head) free(p);
 74 | 		p = q;
 75 | 	} while (p && p->next); /* stops at the last node, which morecore() appends empty (no core) after each core */
 76 | 	if (p != &km->list_head) free(p); /* free that trailing node; p may be NULL here, which free() accepts */
 77 | 	free(km);
 78 | }
 79 | 
 80 | void kfree(void *_km, void *ap) /* Returns the block at ap to the free list of pool _km, merging it with adjacent free blocks. With _km == NULL it simply calls free(ap); ap == NULL is a no-op. */
 81 | {
 82 | 	size_t *p, *q;
 83 | 	kmem_t *km = (kmem_t*)_km;
 84 | 	
 85 | 	if (!ap) return;
 86 | 	if (km == 0) { /* no pool: the C library allocator is used directly */
 87 | 		free(ap);
 88 | 		return;
 89 | 	}
 90 | 	p = (size_t*)ap - 1; /* *p is the size of the current block */
 91 | 	/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
 92 | 	 *
 93 | 	 * a) "p>q && p<q->ptr": @------@++++++++@+++++++@-------    @---------------@+++++++@-------
 94 | 	 *    (can also be in    |      |                |        -> |                       |
 95 | 	 *     two cores)        q      p           q->ptr           q                  q->ptr
 96 | 	 *
 97 | 	 *                       @--------    @+++++++++@--------    @--------    @------------------
 98 | 	 *                       |            |         |         -> |            |
 99 | 	 *                       q            p    q->ptr            q       q->ptr
100 | 	 *
101 | 	 * b) "q>=q->ptr && (p>q || p<q->ptr)":  @-------@+++++   @--------@+++++++     @-------@+++++   @----------------
102 | 	 *                                       |                |        |         -> |                |
103 | 	 *                                  q->ptr                q        p       q->ptr                q
104 | 	 *
105 | 	 *                                       @+++++++@-----   @++++++++@-------     @-------------   @++++++++@-------
106 | 	 *                                       |       |                 |         -> |                         |
107 | 	 *                                       p  q->ptr                 q       q->ptr                         q
108 | 	 */
109 | 	for (q = km->loop_head; !(p > q && p < PTR(q)); q = PTR(q)) /* condition a): p lies between q and its successor */
110 | 		if (q >= PTR(q) && (p > q || p < PTR(q))) break; /* condition b): q is where the address-ordered list wraps around and p lies beyond either end */
111 | 	if (p + (*p) == PTR(q)) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
112 | 		*p += *PTR(q); /* this is the new q->ptr size */
113 | 		p[1] = (size_t)PTR(PTR(q)); /* this is the new q->ptr->ptr */
114 | 		/* p is actually the new q->ptr. The actual change happens a few lines below. */
115 | 	} else if (p + (*p) > PTR(q) && PTR(q) >= p) { /* the end of the allocated block is in the next free block */
116 | 		kerror("[kfree] The end of the allocated block enters a free block.");
117 | 	} else p[1] = (size_t)PTR(q); /* backup q->ptr */
118 | 
119 | 	if (q + (*q) == p) { /* two adjacent blocks, merge q and p (the other two cases) */
120 | 		*q += *p;
121 | 		q[1] = (size_t)PTR(p);
122 | 		km->loop_head = q;
123 | 	} else if (q + (*q) > p && p >= q) { /* the end of a free block in the allocated block */
124 | 		kerror("[kfree] The end of a free block enters the allocated block.");
125 | 	} else km->loop_head = p, q[1] = (size_t)p; /* in two cores, cannot be merged */
126 | }
127 | 
128 | void *krealloc(void *_km, void *ap, size_t n_bytes) /* realloc() counterpart for pool _km: grows the block at ap to hold n_bytes and returns its (possibly new) address. */
129 | {
130 | 	kmem_t *km = (kmem_t*)_km;
131 | 	size_t old_units, need_units, *fresh;
132 | 	if (n_bytes == 0) { /* zero size: behaves as a free and yields NULL */
133 | 		kfree(km, ap);
134 | 		return 0;
135 | 	}
136 | 	if (km == 0) return realloc(ap, n_bytes); /* no pool: defer to the C library */
137 | 	if (ap == 0) return kmalloc(km, n_bytes); /* nothing to resize: plain allocation */
138 | 	old_units = *((size_t*)ap - 1); /* current block size in units, size word included */
139 | 	need_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
140 | 	if (old_units >= need_units) return ap; /* TODO: this prevents shrinking */
141 | 	fresh = (size_t*)kmalloc(km, n_bytes);
142 | 	memcpy(fresh, ap, (old_units - 1) * sizeof(size_t)); /* copy the entire old payload into the larger block */
143 | 	kfree(km, ap);
144 | 	return fresh;
145 | }
146 | 
147 | void *kmalloc(void *_km, size_t n_bytes) /* malloc() counterpart: returns at least n_bytes of uninitialized memory from pool _km, or NULL when n_bytes == 0. With _km == NULL it calls malloc() directly. */
148 | {
149 | 	kmem_t *km = (kmem_t*)_km;
150 | 	size_t n_units, *p, *q;
151 | 
152 | 	if (n_bytes == 0) return 0;
153 | 	if (km == 0) return malloc(n_bytes);
154 | 	/* "n_units" means the number of units. The size of one unit equals sizeof(size_t).
155 | 	 * The leading "1" is the size word that heads every block, which is always required. */
156 | 	n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
157 | 	if (n_units&1) ++n_units; /* make n_units an even number, or it will segfault if only one unit remains */
158 | 
159 | 	if (!(q = km->loop_head)) { /* the first time when kmalloc() is called, initialization */
160 | 		km->base[1] = (size_t)(km->loop_head = q = km->base); *q = 0; /* base becomes a zero-sized block that points to itself */
161 | 	}
162 | 	for (p = PTR(q);; q = p, p = PTR(p)) { /* search for a suitable block; q always trails p by one link */
163 | 		if (*p >= n_units) { /* *p is the size of the current block. This line means the current block is large enough. */
164 | 			if (*p == n_units) q[1] = (size_t)PTR(p); /* no need to split the block */
165 | 			else { /* split the block */
166 | 				/* memory is allocated at the end of the block */
167 | 				*p -= n_units; /* reduce the size of the free block */
168 | 				p += *p; /* skip to the size word of the allocated block */
169 | 				*p = n_units; /* set the size */
170 | 			}
171 | 			km->loop_head = q; /* set the end of chain */
172 | 			return p + 1; /* skip the size word */
173 | 		}
174 | 		if (p == km->loop_head) { /* a full lap found nothing large enough: then ask for more "cores" */
175 | 			if ((p = morecore(km, n_units)) == 0) return 0;
176 | 		}
177 | 	}
178 | }
179 | 
180 | void *kcalloc(void *_km, size_t count, size_t size) /* calloc() counterpart: returns count * size zero-filled bytes from pool _km (or from calloc() when _km is NULL). Returns NULL when either argument is 0 or when count * size does not fit in size_t. */
181 | {
182 | 	kmem_t *km = (kmem_t*)_km;
183 | 	void *p;
184 | 	if (size == 0 || count == 0 || count > (size_t)-1 / size) return 0; /* size != 0 is tested first, so the division is safe; the last test rejects a product that would wrap and under-allocate */
185 | 	if (km == 0) return calloc(count, size);
186 | 	p = kmalloc(km, count * size);
187 | 	memset(p, 0, count * size);
188 | 	return p;
189 | }
190 | 
191 | void km_stat(const void *_km) /* Walks the free list of pool _km and prints total, free, block count, largest free block and average free block size to stderr. Does nothing for a NULL pool or one that was never used. */
192 | {
193 | 	kmem_t *km = (kmem_t*)_km;
194 | 	unsigned n_blocks = 0;
195 | 	size_t n_units = 0, max_block = 0, *p, *q; /* n_units is a size_t so that the sum of block sizes is not truncated */
196 | 	float frag;
197 | 
198 | 	if (km == 0 || !(p = km->loop_head)) return;
199 | 	/* visit every block of the circular free list exactly once, starting at loop_head */
200 | 	do {
201 | 		q = PTR(p);
202 | 		if (*p > max_block) max_block = *p;
203 | 		n_units += *p;
204 | 		if (p + (*p) > q && q > p)
205 | 			kerror("[kr_stat] The end of a free block enters another free block.");
206 | 		p = q;
207 | 		++n_blocks;
208 | 	} while (p != km->loop_head);
209 | 	
210 | 	--n_blocks; /* discount the zero-sized dummy block km->base, which presumably never leaves the list */
211 | 	frag = n_blocks? 1.0/1024.0 * n_units * sizeof(size_t) / n_blocks : 0; /* average free block size in KiB; 0 instead of a division by zero when no real free block exists */
212 | 	fprintf(stderr, "[kr_stat] tot=%lu, free=%lu, n_block=%u, max_block=%lu, frag_len=%.3fK\n", /* the casts below make the size_t arguments match %lu on every platform */
213 | 			(unsigned long)km->total_allocated, (unsigned long)(n_units * sizeof(size_t)), n_blocks, (unsigned long)(max_block * sizeof(size_t)), frag);
214 | }
215 | 


--------------------------------------------------------------------------------
/src/ksw2/kalloc.h:
--------------------------------------------------------------------------------
 1 | #ifndef _KALLOC_H_
 2 | #define _KALLOC_H_
 3 | 
 4 | #include <stdlib.h>
 5 | 
 6 | #define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t)) /* size in bytes of the pool block that holds x, read from the size word stored just before x (the count includes that word); only meaningful for pointers handed out by a pool, not by the malloc() fallback */
 7 | 
 8 | #ifdef __cplusplus
 9 | extern "C" {
10 | #endif
11 | 
12 | void *kmalloc(void *km, size_t size);
13 | void *krealloc(void *km, void *ptr, size_t size);
14 | void *kcalloc(void *km, size_t count, size_t size);
15 | void kfree(void *km, void *ptr);
16 | 
17 | void *km_init(void);
18 | void km_destroy(void *km);
19 | 
20 | void km_stat(const void *km); // TODO: return numbers instead of print to stderr
21 | 
22 | #ifdef __cplusplus
23 | }
24 | #endif
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/ksw2/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 36 | #define KS_SEP_TAB   1 // isspace() && !' '
 37 | #define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
 38 | #define KS_SEP_MAX   2
 39 | /* Declares kstream_t, a buffered reader over a handle of type type_t: buf[begin..end) is the unread part of the buffer, bufsize its capacity, and is_eof is set once a read returns fewer bytes than requested. */
 40 | #define __KS_TYPE(type_t) \
 41 | 	typedef struct __kstream_t { \
 42 | 		int begin, end; \
 43 | 		int is_eof:2, bufsize:30; \
 44 | 		type_t f; \
 45 | 		unsigned char *buf; \
 46 | 	} kstream_t;
 47 | /* ks_eof: true once the EOF flag is set and the buffer is fully consumed. ks_rewind: clears the EOF flag and both buffer positions only; it does not touch the underlying handle f. */
 48 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
 49 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 50 | /* Defines ks_init(), which allocates a zeroed kstream_t plus a __bufsize-byte buffer for handle f, and ks_destroy(), which frees both and accepts NULL. NOTE(review): ks_init does not check either allocation, so a failed calloc() is dereferenced. */
 51 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \
 52 | 	SCOPE kstream_t *ks_init(type_t f) \
 53 | 	{ \
 54 | 		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
 55 | 		ks->f = f; ks->bufsize = __bufsize; \
 56 | 		ks->buf = (unsigned char*)malloc(__bufsize); \
 57 | 		return ks; \
 58 | 	} \
 59 | 	SCOPE void ks_destroy(kstream_t *ks) \
 60 | 	{ \
 61 | 		if (!ks) return; \
 62 | 		free(ks->buf); \
 63 | 		free(ks); \
 64 | 	}
 65 | /* Defines ks_getc(), which returns the next byte (0-255) or -1 at end of input and refills the buffer through __read when it is empty, and ks_getuntil(), which is ks_getuntil2() with append == 0. NOTE(review): a negative (error) return from __read is not distinguished from a short read. */
 66 | #define __KS_INLINED(__read) \
 67 | 	static inline int ks_getc(kstream_t *ks) \
 68 | 	{ \
 69 | 		if (ks->is_eof && ks->begin >= ks->end) return -1; \
 70 | 		if (ks->begin >= ks->end) { \
 71 | 			ks->begin = 0; \
 72 | 			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
 73 | 			if (ks->end < ks->bufsize) ks->is_eof = 1; \
 74 | 			if (ks->end == 0) return -1; \
 75 | 		} \
 76 | 		return (int)ks->buf[ks->begin++]; \
 77 | 	} \
 78 | 	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
 79 | 	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
 80 | 
 81 | #ifndef KSTRING_T
 82 | #define KSTRING_T kstring_t
 83 | typedef struct __kstring_t { /* growable character buffer filled by ks_getuntil2() */
 84 | 	unsigned l, m; /* l: bytes in use, terminator excluded; m: bytes allocated for s */
 85 | 	char *s; /* heap buffer grown with realloc(); may be NULL until first use */
 86 | } kstring_t;
 87 | #endif
 88 | /* Rounds the 32-bit lvalue x up to the next power of two in place (a power of two stays unchanged). x is evaluated several times, so it must be free of side effects. */
 89 | #ifndef kroundup32
 90 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
 91 | #endif
 92 | /* Defines ks_getuntil2(): reads bytes up to the next delimiter into str (appending when append != 0, otherwise overwriting), stores the delimiter found in *dret (0 if none was found), NUL-terminates str and returns its length, or -1 if the stream was already exhausted on entry. delimiter is KS_SEP_SPACE, KS_SEP_TAB, KS_SEP_LINE or a literal character code greater than KS_SEP_MAX; for KS_SEP_LINE a trailing '\r' is dropped. */
 93 | #define __KS_GETUNTIL(SCOPE, __read) \
 94 | 	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
 95 | 	{ \
 96 | 		if (dret) *dret = 0; \
 97 | 		str->l = append? str->l : 0; \
 98 | 		if (ks->begin >= ks->end && ks->is_eof) return -1; \
 99 | 		for (;;) { \
100 | 			int i; \
101 | 			if (ks->begin >= ks->end) { \
102 | 				if (!ks->is_eof) { \
103 | 					ks->begin = 0; \
104 | 					ks->end = __read(ks->f, ks->buf, ks->bufsize); \
105 | 					if (ks->end < ks->bufsize) ks->is_eof = 1; \
106 | 					if (ks->end == 0) break; \
107 | 				} else break; \
108 | 			} \
109 | 			if (delimiter == KS_SEP_LINE) { \
110 | 				for (i = ks->begin; i < ks->end; ++i) \
111 | 					if (ks->buf[i] == '\n') break; \
112 | 			} else if (delimiter > KS_SEP_MAX) { \
113 | 				for (i = ks->begin; i < ks->end; ++i) \
114 | 					if (ks->buf[i] == delimiter) break; \
115 | 			} else if (delimiter == KS_SEP_SPACE) { \
116 | 				for (i = ks->begin; i < ks->end; ++i) \
117 | 					if (isspace(ks->buf[i])) break; \
118 | 			} else if (delimiter == KS_SEP_TAB) { \
119 | 				for (i = ks->begin; i < ks->end; ++i) \
120 | 					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
121 | 			} else i = 0; /* never come to here! */ \
122 | 			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
123 | 				str->m = str->l + (i - ks->begin) + 1; \
124 | 				kroundup32(str->m); \
125 | 				str->s = (char*)realloc(str->s, str->m); \
126 | 			} \
127 | 			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
128 | 			str->l = str->l + (i - ks->begin); \
129 | 			ks->begin = i + 1; \
130 | 			if (i < ks->end) { \
131 | 				if (dret) *dret = ks->buf[i]; \
132 | 				break; \
133 | 			} \
134 | 		} \
135 | 		if (str->s == 0) { \
136 | 			str->m = 1; \
137 | 			str->s = (char*)calloc(1, 1); \
138 | 		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
139 | 		str->s[str->l] = '\0'; \
140 | 		return str->l; \
141 | 	}
142 | /* KSTREAM_INIT2 instantiates the whole stream API (type, ks_init/ks_destroy, ks_getuntil2, ks_getc/ks_getuntil) with linkage SCOPE for the handle type type_t and read function __read; KSTREAM_INIT is the same with SCOPE fixed to static. */
143 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
144 | 	__KS_TYPE(type_t) \
145 | 	__KS_BASIC(SCOPE, type_t, __bufsize) \
146 | 	__KS_GETUNTIL(SCOPE, __read) \
147 | 	__KS_INLINED(__read)
148 | 
149 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
150 | /* Declares the stream API without defining it: the type and the inline helpers are emitted here, while ks_getuntil2, ks_init and ks_destroy are declared extern and must be defined in another translation unit. */
151 | #define KSTREAM_DECLARE(type_t, __read) \
152 | 	__KS_TYPE(type_t) \
153 | 	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
154 | 	extern kstream_t *ks_init(type_t f); \
155 | 	extern void ks_destroy(kstream_t *ks); \
156 | 	__KS_INLINED(__read)
157 | 
158 | /******************
159 |  * FASTA/Q parser *
160 |  ******************/
161 | /* Resets the parser state of a kseq_t: clears last_char and the EOF flag and buffer positions of its stream f. The underlying handle is not repositioned by this macro. */
162 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
163 | /* Defines kseq_init(), which allocates a zeroed kseq_t and attaches a new stream for fd, and kseq_destroy(), which frees the four string buffers, the stream and the record itself and accepts NULL. NOTE(review): the calloc() result in kseq_init is not checked. */
164 | #define __KSEQ_BASIC(SCOPE, type_t) \
165 | 	SCOPE kseq_t *kseq_init(type_t fd) \
166 | 	{ \
167 | 		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
168 | 		s->f = ks_init(fd); \
169 | 		return s; \
170 | 	} \
171 | 	SCOPE void kseq_destroy(kseq_t *ks) \
172 | 	{ \
173 | 		if (!ks) return; \
174 | 		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
175 | 		ks_destroy(ks->f); \
176 | 		free(ks); \
177 | 	}
178 | 
179 | /* kseq_read(): parse the next FASTA/FASTQ record from seq->f into seq. Return value:
180 |    >=0  length of the sequence (normal)
181 |    -1   end-of-file
182 |    -2   truncated quality string (stream ended on the '+' line, or quality length differs from sequence length)
183 |  */
184 | #define __KSEQ_READ(SCOPE) \
185 | 	SCOPE int kseq_read(kseq_t *seq) \
186 | 	{ \
187 | 		int c; /* most recent character or delimiter obtained from the stream */ \
188 | 		kstream_t *ks = seq->f; \
189 | 		if (seq->last_char == 0) { /* then jump to the next header line */ \
190 | 			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
191 | 			if (c == -1) return -1; /* end of file */ \
192 | 			seq->last_char = c; \
193 | 		} /* else: the first header char has been read in the previous call */ \
194 | 		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset the lengths only; the buffers are kept and reused */ \
195 | 		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
196 | 		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment: the name did not end at a newline, so the rest of the header line follows */ \
197 | 		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
198 | 			seq->seq.m = 256; /* initial capacity in bytes */ \
199 | 			seq->seq.s = (char*)malloc(seq->seq.m); /* NOTE(review): the allocation result is not checked */ \
200 | 		} \
201 | 		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { /* stop at EOF, at the next header char, or at the '+' separator */ \
202 | 			if (c == '\n') continue; /* skip empty lines */ \
203 | 			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
204 | 			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line; last argument 1 = append to seq */ \
205 | 		} \
206 | 		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
207 | 		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
208 | 			seq->seq.m = seq->seq.l + 2; \
209 | 			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
210 | 			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); /* NOTE(review): the realloc result is not checked */ \
211 | 		} \
212 | 		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
213 | 		if (c != '+') return seq->seq.l; /* FASTA */ \
214 | 		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
215 | 			seq->qual.m = seq->seq.m; \
216 | 			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); /* NOTE(review): the realloc result is not checked */ \
217 | 		} \
218 | 		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
219 | 		if (c == -1) return -2; /* error: no quality string */ \
220 | 		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); /* append quality lines until qual is at least as long as seq or reading fails */ \
221 | 		seq->last_char = 0;	/* we have not come to the next header line */ \
222 | 		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
223 | 		return seq->seq.l; /* FASTQ record read completely */ \
224 | 	}
225 | 
226 | #define __KSEQ_TYPE(type_t) /* declares kseq_t, the per-file parser state; type_t is not used inside the expansion */ \
227 | 	typedef struct { \
228 | 		kstring_t name, comment, seq, qual; /* fields of the most recently parsed record; kseq_read() reuses the buffers */ \
229 | 		int last_char; /* header char ('>' or '@') already consumed by the previous kseq_read(), or 0 */ \
230 | 		kstream_t *f; /* buffered input stream created by ks_init() in kseq_init() */ \
231 | 	} kseq_t;
232 | 
233 | #define KSEQ_INIT2(SCOPE, type_t, __read) /* instantiate the stream layer and the FASTA/Q parser for file handle type type_t */ \
234 | 	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) /* 16384 is passed as the stream's __bufsize */ \
235 | 	__KSEQ_TYPE(type_t) /* kseq_t */ \
236 | 	__KSEQ_BASIC(SCOPE, type_t) /* kseq_init(), kseq_destroy() */ \
237 | 	__KSEQ_READ(SCOPE) /* kseq_read() */
238 | 
239 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) /* convenience form: KSEQ_INIT2 with SCOPE fixed to static */
240 | 
241 | #define KSEQ_DECLARE(type_t) /* declaration-only counterpart of KSEQ_INIT2: types plus prototypes, no function bodies */ \
242 | 	__KS_TYPE(type_t) \
243 | 	__KSEQ_TYPE(type_t) \
244 | 	extern kseq_t *kseq_init(type_t fd); \
245 | 	void kseq_destroy(kseq_t *ks); /* no explicit extern here, unlike kseq_init above */ \
246 | 	int kseq_read(kseq_t *seq);
247 | 
248 | #endif
249 | 


--------------------------------------------------------------------------------
/src/ksw2/ksw2.h:
--------------------------------------------------------------------------------
  1 | #ifndef KSW2_H_
  2 | #define KSW2_H_
  3 | 
  4 | #include <stdint.h>
  5 | 
  6 | #define KSW_NEG_INF -0x40000000 // "no score" sentinel; ksw_reset_extz() initialises score, mqe and mte to it
  7 | 
  8 | #define KSW_EZ_SCORE_ONLY  0x01 // don't record alignment path/cigar
  9 | #define KSW_EZ_RIGHT       0x02 // right-align gaps
 10 | #define KSW_EZ_GENERIC_SC  0x04 // without this flag: match/mismatch only; last symbol is a wildcard
 11 | #define KSW_EZ_APPROX_MAX  0x08 // approximate max; this is faster with sse
 12 | #define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse
 13 | #define KSW_EZ_EXTZ_ONLY   0x40 // only perform extension (note: 0x20 is not assigned in this header)
 14 | #define KSW_EZ_REV_CIGAR   0x80 // reverse CIGAR in the output
 15 | #define KSW_EZ_SPLICE_FOR  0x100 // NOTE(review): presumably a forward-strand splice option for ksw_exts2_sse(); not used in this header - confirm
 16 | #define KSW_EZ_SPLICE_REV  0x200 // NOTE(review): presumably the reverse-strand counterpart of KSW_EZ_SPLICE_FOR - confirm
 17 | 
 18 | #ifdef __cplusplus
 19 | extern "C" {
 20 | #endif
 21 | 
 22 | typedef struct { // result record filled by the ksw_ext* routines; see ksw_reset_extz() for the initial state
 23 | 	uint32_t max:31, zdropped:1; // max: best score so far (ksw_apply_zdrop updates it); zdropped: set to 1 when the Z-drop test fires
 24 | 	int max_q, max_t;      // max extension coordinate
 25 | 	int mqe, mqe_t;        // max score when reaching the end of query
 26 | 	int mte, mte_q;        // max score when reaching the end of target
 27 | 	int score;             // max score reaching both ends; may be KSW_NEG_INF
 28 | 	int m_cigar, n_cigar;  // capacity and number of used entries of cigar[], as maintained by ksw_push_cigar()
 29 | 	uint32_t *cigar;       // each entry is len<<4|op (see ksw_push_cigar)
 30 | } ksw_extz_t;
 31 | 
 32 | /**
 33 |  * NW-like extension
 34 |  *
 35 |  * @param km        memory pool, when used with kalloc
 36 |  * @param qlen      query length
 37 |  * @param query     query sequence with 0 <= query[i] < m
 38 |  * @param tlen      target length
 39 |  * @param target    target sequence with 0 <= target[i] < m
 40 |  * @param m         number of residue types
 41 |  * @param mat       m*m scoring matrix stored as a one-dimensional array
 42 |  * @param gapo      gap open penalty (named `q` in the ksw_extz/ksw_extz2_sse prototypes, where gape is `e`); a gap of length l costs "-(gapo+l*gape)"
 43 |  * @param gape      gap extension penalty
 44 |  * @param w         band width (<0 to disable)
 45 |  * @param zdrop     off-diagonal drop-off to stop extension (positive; <0 to disable)
 46 |  * @param flag      flag (see KSW_EZ_* macros)
 47 |  * @param ez        (out) scores and cigar
 48 |  */
 49 | void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
 50 | void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
 51 | 
 52 | void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 53 | 			  int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
 54 | 
 55 | void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 56 | 				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
 57 | 
 58 | void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 59 | 				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
 60 | 
 61 | void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
 62 | 
 63 | /**
 64 |  * Global alignment
 65 |  *
 66 |  * (first 10 parameters identical to ksw_extz_sse())
 67 |  * @param m_cigar   (modified) max CIGAR length; feed 0 if cigar==0
 68 |  * @param n_cigar   (out) number of CIGAR elements
 69 |  * @param cigar     (out) BAM-encoded CIGAR; caller need to deallocate with kfree(km, )
 70 |  *
 71 |  * @return          score of the alignment
 72 |  */
 73 | int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
 74 | int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
 75 | int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
 76 | 
 77 | void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
 78 | int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te);
 79 | 
 80 | #ifdef __cplusplus
 81 | }
 82 | #endif
 83 | 
 84 | /************************************
 85 |  *** Private macros and functions ***
 86 |  ************************************/
 87 | 
 88 | #ifdef HAVE_KALLOC
 89 | #include "kalloc.h"
 90 | #else
 91 | #include <stdlib.h>
 92 | #define kmalloc(km, size) malloc((size))
 93 | #define kcalloc(km, count, size) calloc((count), (size))
 94 | #define krealloc(km, ptr, size) realloc((ptr), (size))
 95 | #define kfree(km, ptr) free((ptr))
 96 | #endif
 97 | 
 98 | static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len)
 99 | { // Append `len` units of operation `op` to the CIGAR; *n_cigar is the used count, *m_cigar the capacity.
100 | 	const int last = *n_cigar - 1; // index of the newest entry; -1 while the CIGAR is still empty
101 | 	if (last == -1 || (cigar[last] & 0xf) != op) { // operation differs from the newest entry: append a new one
102 | 		if (*n_cigar == *m_cigar) // buffer full: 4 entries on first use, doubled afterwards; 4 bytes per entry
103 | 			*m_cigar = *m_cigar? (*m_cigar) << 1 : 4, cigar = (uint32_t*)krealloc(km, cigar, (*m_cigar) << 2);
104 | 		cigar[(*n_cigar)++] = len << 4 | op; // length in the upper bits, operation code in the low 4 bits
105 | 	} else cigar[last] += len << 4; // same operation as the newest entry: only bump its length
106 | 	// the caller must keep using the returned pointer, since krealloc() may have moved the array
107 | 	return cigar;
108 | }
109 | 
110 | // Walk the backtrack matrix p[] from cell (i0,j0) back towards the origin and emit the path as a CIGAR. Each value in p[] has the following structure:
111 | //   bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
112 | //   bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
113 | //   bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
114 | static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
115 | 								 int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
116 | { // is_rot: rows of p[] are anti-diagonals r=i+j instead of i; off/off_end (off_end may be NULL): first/last stored column of each row; is_rev: leave the CIGAR in traceback order; with_N: emit op 3 instead of op 2 for state 3; *m_cigar_/*n_cigar_/*cigar_ are updated on return
117 | 	int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
118 | 	uint32_t *cigar = *cigar_, tmp;
119 | 	while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
120 | 		int force_state = -1; // >= 0 when (i,j) lies outside the stored columns, in which case p[] is not read and the move is forced
121 | 		if (is_rot) {
122 | 			r = i + j;
123 | 			if (i < off[r]) force_state = 2;
124 | 			if (off_end && i > off_end[r]) force_state = 1;
125 | 			tmp = force_state < 0? p[(int64_t)r * n_col + i - off[r]] : 0; // 64-bit index: r * n_col can overflow int for large matrices
126 | 		} else {
127 | 			if (j < off[i]) force_state = 2;
128 | 			if (off_end && j > off_end[i]) force_state = 1;
129 | 			tmp = force_state < 0? p[(int64_t)i * n_col + j - off[i]] : 0; // 64-bit index, same reason as above
130 | 		}
131 | 		if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it.
132 | 		else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
133 | 		if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
134 | 		if (force_state >= 0) state = force_state;
135 | 		if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
136 | 		else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
137 | 		else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
138 | 		else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
139 | 	}
140 | 	if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
141 | 	if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
142 | 	if (!is_rev)
143 | 		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
144 | 			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
145 | 	*m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; // hand capacity, length and (possibly reallocated) buffer back to the caller
146 | }
147 | 
148 | static inline void ksw_reset_extz(ksw_extz_t *ez)
149 | { // Put *ez into its "nothing aligned yet" state; cigar and m_cigar are deliberately left untouched.
150 | 	ez->max = 0; ez->zdropped = 0; ez->n_cigar = 0; // best score, Z-drop flag and CIGAR length start at zero
151 | 	ez->mte = KSW_NEG_INF; ez->mqe = KSW_NEG_INF; ez->score = KSW_NEG_INF; // end-reaching scores start at the sentinel
152 | 	ez->mte_q = -1; ez->mqe_t = -1; ez->max_t = -1; ez->max_q = -1; // no coordinate recorded yet
153 | }
154 | 
155 | static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e)
156 | { // Track the best score seen so far; return 1 (and set ez->zdropped) when H has dropped too far below it, else 0.
157 | 	const int t = is_rot? b : a;     // with is_rot, (a,b) is (r,t); otherwise a is t and r is a+b
158 | 	const int r = is_rot? a : a + b; // r = q + t
159 | 	const int q = r - t;             // coordinate stored in max_q
160 | 	int dt, dq, diag_shift;
161 | 	if (H > (int32_t)ez->max) { // new best score: remember it together with its coordinates
162 | 		ez->max = H, ez->max_t = t, ez->max_q = q;
163 | 		return 0;
164 | 	}
165 | 	if (t < ez->max_t || q < ez->max_q) return 0; // only cells at or beyond the best cell on both axes are tested
166 | 	dt = t - ez->max_t, dq = q - ez->max_q;
167 | 	diag_shift = dt > dq? dt - dq : dq - dt; // |dt - dq|: how far the cell is off the best cell's diagonal
168 | 	if (zdrop < 0 || ez->max - H <= zdrop + diag_shift * e) return 0; // test disabled, or drop still within the allowance
169 | 	ez->zdropped = 1;
170 | 	return 1;
171 | }
172 | 
173 | #endif
174 | 


--------------------------------------------------------------------------------
/src/ksw2/ksw2_ll_sse.cc:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdint.h>
  3 | #include <string.h>
  4 | #include <emmintrin.h>
  5 | #include "ksw2.h"
  6 | 
  7 | #ifdef __GNUC__
  8 | #define LIKELY(x) __builtin_expect((x),1)
  9 | #define UNLIKELY(x) __builtin_expect((x),0)
 10 | #else
 11 | #define LIKELY(x) (x)
 12 | #define UNLIKELY(x) (x)
 13 | #endif
 14 | 
 15 | typedef struct { // query profile plus work rows, created by ksw_ll_qinit() as one memory block
 16 | 	int qlen, slen; // query length and segmented length (number of __m128i vectors per row)
 17 | 	uint8_t shift, mdiff, max, size; // shift/mdiff/max are derived from the scoring matrix in ksw_ll_qinit(); size is bytes per score (1 or 2)
 18 | 	__m128i *qp, *H0, *H1, *E, *Hmax; // qp: slen*m profile vectors; H0, H1, E, Hmax: slen vectors each, laid out after qp
 19 | } kswq_t;
 20 | 
 21 | /**
 22 |  * Initialize the query data structure: the query profile plus the H0/H1/E/Hmax work rows, in one block from kmalloc(km, ...)
 23 |  *
 24 |  * @param size   Number of bytes used to store a score; valid values are 1 or 2 (anything greater than 1 is treated as 2)
 25 |  * @param qlen   Length of the query sequence
 26 |  * @param query  Query sequence; each query[k] is used as an index into a row of mat
 27 |  * @param m      Size of the alphabet
 28 |  * @param mat    m*m scoring matrix in a one-dimension array (the row for residue a starts at mat + a*m)
 29 |  *
 30 |  * @return       Query data structure, passed as q_ to ksw_ll_i16(); NOTE(review): the kmalloc result is not checked
 31 |  */
 32 | void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
 33 | {
 34 | 	kswq_t *q;
 35 | 	int slen, a, tmp, p;
 36 | 
 37 | 	size = size > 1? 2 : 1;
 38 | 	p = 8 * (3 - size); // # values per __m128i
 39 | 	slen = (qlen + p - 1) / p; // segmented length
 40 | 	q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
 41 | 	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
 42 | 	q->H0 = q->qp + slen * m; // the profile occupies slen vectors per residue type
 43 | 	q->H1 = q->H0 + slen;
 44 | 	q->E  = q->H1 + slen;
 45 | 	q->Hmax = q->E + slen; // four work rows in total, hence the (m + 4) in the allocation size
 46 | 	q->slen = slen; q->qlen = qlen; q->size = size;
 47 | 	// compute shift
 48 | 	tmp = m * m;
 49 | 	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
 50 | 		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
 51 | 		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
 52 | 	}
 53 | 	q->max = q->mdiff; // maximum score, saved before mdiff is turned into a difference below
 54 | 	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
 55 | 	q->mdiff += q->shift; // this is the difference between the min and max scores
 56 | 	// An example: p=8, qlen=19, slen=3 and segmentation:
 57 | 	//  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
 58 | 	if (size == 1) {
 59 | 		int8_t *t = (int8_t*)q->qp; // 8-bit profile: every stored value has q->shift added
 60 | 		for (a = 0; a < m; ++a) {
 61 | 			int i, k, nlen = slen * p;
 62 | 			const int8_t *ma = mat + a * m;
 63 | 			for (i = 0; i < slen; ++i)
 64 | 				for (k = i; k < nlen; k += slen) // p iterations
 65 | 					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; // positions past the query end get score 0
 66 | 		}
 67 | 	} else {
 68 | 		int16_t *t = (int16_t*)q->qp; // 16-bit profile: raw scores, no shift applied
 69 | 		for (a = 0; a < m; ++a) {
 70 | 			int i, k, nlen = slen * p;
 71 | 			const int8_t *ma = mat + a * m;
 72 | 			for (i = 0; i < slen; ++i)
 73 | 				for (k = i; k < nlen; k += slen) // p iterations
 74 | 					*t++ = (k >= qlen? 0 : ma[query[k]]); // positions past the query end get score 0
 75 | 		}
 76 | 	}
 77 | 	return q;
 78 | }
 79 | 
 80 | int ksw_ll_i16(void *q_, int tlen, const uint8_t *target, int _gapo, int _gape, int *qe, int *te) // scores target[0..tlen) against the profile q_ using 16-bit lanes; returns the best score and reports where it ends through *qe (query) and *te (target)
 81 | {
 82 | 	kswq_t *q = (kswq_t*)q_; // NOTE(review): every vector is treated as 8 x 16-bit lanes, so q_ presumably must come from ksw_ll_qinit() with size=2 - confirm
 83 | 	int slen, i, gmax = 0, qlen8;
 84 | 	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
 85 | 	uint16_t *H8;
 86 | 
 87 | #define __max_8(ret, xx) do { /* ret = lane 0 after folding xx with max over byte shifts of 8, 4 and 2; xx is overwritten */ \
 88 | 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
 89 | 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
 90 | 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
 91 | 		(ret) = _mm_extract_epi16((xx), 0); \
 92 | 	} while (0)
 93 | 
 94 | 	// initialization
 95 | 	*qe = *te = -1; // -1 = no end position recorded yet
 96 | 	zero = _mm_set1_epi32(0);
 97 | 	gapoe = _mm_set1_epi16(_gapo + _gape); // cost of opening a gap: open + one extension
 98 | 	gape = _mm_set1_epi16(_gape);
 99 | 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
100 | 	slen = q->slen, qlen8 = slen * 8; // qlen8: number of 16-bit cells in one row, padding included
101 | 	memset(E,    0, slen * sizeof(__m128i));
102 | 	memset(H0,   0, slen * sizeof(__m128i));
103 | 	memset(Hmax, 0, slen * sizeof(__m128i));
104 | 	// the core loop
105 | 	for (i = 0; i < tlen; ++i) {
106 | 		int j, k, imax;
107 | 		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
108 | 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
109 | 		h = _mm_slli_si128(h, 2); // shift left by 2 bytes, i.e. one 16-bit lane
110 | 		for (j = 0; LIKELY(j < slen); ++j) {
111 | 			h = _mm_adds_epi16(h, *S++); // add the profile score for this segment (saturating)
112 | 			e = _mm_load_si128(E + j);
113 | 			h = _mm_max_epi16(h, e);
114 | 			h = _mm_max_epi16(h, f);
115 | 			max = _mm_max_epi16(max, h); // running per-lane maximum for this target position
116 | 			_mm_store_si128(H1 + j, h);
117 | 			h = _mm_subs_epu16(h, gapoe); // score after opening a gap from h (unsigned saturating subtract)
118 | 			e = _mm_subs_epu16(e, gape);
119 | 			e = _mm_max_epi16(e, h);
120 | 			_mm_store_si128(E + j, e); // E row is updated in place for the next target position
121 | 			f = _mm_subs_epu16(f, gape);
122 | 			f = _mm_max_epi16(f, h);
123 | 			h = _mm_load_si128(H0 + j); // previous row's value feeds the next segment
124 | 		}
125 | 		for (k = 0; LIKELY(k < 16); ++k) { // up to 16 correction rounds that push f back through H1, one lane shift per round
126 | 			f = _mm_slli_si128(f, 2);
127 | 			for (j = 0; LIKELY(j < slen); ++j) {
128 | 				h = _mm_load_si128(H1 + j);
129 | 				h = _mm_max_epi16(h, f);
130 | 				_mm_store_si128(H1 + j, h);
131 | 				h = _mm_subs_epu16(h, gapoe);
132 | 				f = _mm_subs_epu16(f, gape);
133 | 				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16; // stop once no lane of f exceeds h
134 | 			}
135 | 		}
136 | end_loop_i16:
137 | 		__max_8(imax, max);
138 | 		if (imax >= gmax) { // '>=': on ties the later target position wins
139 | 			gmax = imax; *te = i;
140 | 			memcpy(Hmax, H1, slen * sizeof(__m128i)); // keep the row holding the best score so the query end can be located afterwards
141 | 		}
142 | 		S = H1; H1 = H0; H0 = S; // swap rows: the row just computed becomes the previous row
143 | 	}
144 | 	for (i = 0, H8 = (uint16_t*)Hmax; i < qlen8; ++i)
145 | 		if ((int)H8[i] == gmax) *qe = i / 8 + i % 8 * slen; // map (segment, lane) back to a query index; the last matching cell wins
146 | 	return gmax;
147 | }
148 | 


--------------------------------------------------------------------------------
/src/main.cc:
--------------------------------------------------------------------------------
 1 | //============================================================================
 2 | // Name        : graphmap.cpp
 3 | // Author      : Ivan Sovic
 4 | // Version     :
 5 | // Copyright   : Copyright Ivan Sovic, 2014. All rights reserved.
 6 | // Description : Command-line entry point; dispatches to the 'align' (GraphMap) and 'owler' tools
 7 | //============================================================================
 8 | 
 9 | #include <omp.h>
10 | #include <stdio.h>
11 | #include <math.h>
12 | #include <time.h>
13 | #include <iostream>
14 | #include "sequences/sequence_file.h"
15 | #include "sequences/single_sequence.h"
16 | #include "log_system/log_system.h"
17 | #include "graphmap/graphmap.h"
18 | 
19 | #include "program_parameters.h"
20 | #include "utility/utility_general.h"
21 | 
22 | #include "owler/owler.h"
23 | #include "argparser.h"
24 | 
25 | int main(int argc, char *argv[]) {
26 |   // Entry point. argv[1] selects the tool ("align" or "owler"); the remaining arguments are handed to that tool.
27 |   std::string program_name(argv[0]);
28 |   std::string subprogram;
29 | 
30 |   // The parser below is used purely to render the usage text; its argument-processing routine is never invoked.
31 |   ArgumentParser argparser;
32 |   argparser.AddArgument(&subprogram, VALUE_TYPE_STRING, "", "tool", "", "Specifies the tool to run:\n  align - the entire GraphMap pipeline.\n  owler - Overlapping With Long Erroneous Reads.", -1, "");
33 |   argparser.set_program_name(program_name);
34 | 
35 |   // No tool given: show usage, licence and version information, then stop with a non-zero status.
36 |   if (argc == 1) {
37 |     fprintf (stderr, "%s", argparser.VerboseUsage().c_str());
38 |     fprintf (stderr, "\n");
39 |     fprintf (stderr, "%s\n", LICENCE_INFORMATION);
40 |     fprintf (stderr, "Version: %s\n", std::string(GRAPHMAP_CURRENT_VERSION).c_str());
41 |     fprintf (stderr, "Build date: %s\n", std::string(GRAPHMAP_CURRENT_VERSION_RELEASE_DATE).c_str());
42 |     fprintf (stderr, "\n");
43 |     exit(1);
44 |   }
45 | 
46 |   // The tool name is simply the first command-line argument.
47 |   subprogram = argv[1];
48 | 
49 |   // Build the tool's own argument vector: the program name followed by everything after the tool name.
50 |   std::vector<char *> argv2;
51 |   argv2.push_back(argv[0]);
52 |   argv2.insert(argv2.end(), argv + 2, argv + argc);
53 |   int32_t argc2 = argv2.size();
54 | 
55 |   ProgramParameters program_parameters;
56 |   program_parameters.subprogram = subprogram;
57 | 
58 |   // Maps the parsed verbosity level onto the logger: 1 -> standard, >1 -> full plus standard, otherwise unchanged.
59 |   const auto apply_verbosity = [&program_parameters]() {
60 |     if (program_parameters.verbose_level > 1) {
61 |       LogSystem::GetInstance().LOG_VERBOSE_TYPE = LOG_VERBOSE_FULL | LOG_VERBOSE_STD;
62 |     } else if (program_parameters.verbose_level == 1) {
63 |       LogSystem::GetInstance().LOG_VERBOSE_TYPE = LOG_VERBOSE_STD;
64 |     }
65 |   };
66 | 
67 |   const bool run_align = (subprogram == "align");
68 |   const bool run_owler = (subprogram == "owler");
69 |   if (!run_align && !run_owler) {
70 |     fprintf (stderr, "ERROR: Unknown value of 'tool' parameter. Exiting.\n\n");
71 |     fprintf (stderr, "%s\n", argparser.VerboseUsage().c_str());
72 |     exit(1);
73 |   }
74 | 
75 |   // Parse the tool-specific arguments, configure logging, then run the selected tool.
76 |   if (run_align) {
77 |     if (ProcessArgsGraphMap(argc2, &argv2[0], &program_parameters))
78 |       return 1;
79 |     apply_verbosity();
80 |     fflush(stdout);
81 |     GraphMap graphmap;
82 |     graphmap.Run(program_parameters);
83 |   } else {
84 |     if (ProcessArgsOwler(argc2, &argv2[0], &program_parameters))
85 |       return 1;
86 |     apply_verbosity();
87 |     fflush(stdout);
88 |     Owler owler;
89 |     owler.Run(program_parameters);
90 |   }
91 | 
92 |   return 0;
93 | }
94 | 


--------------------------------------------------------------------------------
/src/owler/owler.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * owler.h
  3 |  *
  4 |  *  Created on: Jul 2, 2015
  5 |  *      Author: isovic
  6 |  */
  7 | 
  8 | #ifndef OWLER_H_
  9 | #define OWLER_H_
 10 | 
 11 | #include <memory>
 12 | #include <stdint.h>
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <dirent.h>
 16 | #include <string>
 17 | #include <sstream>
 18 | #include <vector>
 19 | #include <algorithm>
 20 | 
 21 | #include "sequences/single_sequence.h"
 22 | #include "sequences/sequence_file.h"
 23 | #include "containers/score_registry.h"
 24 | #include "program_parameters.h"
 25 | #include "utility/utility_general.h"
 26 | #include "alignment/cigargen.h"
 27 | #include "containers/region.h"
 28 | #include "containers/mapping_data.h"
 29 | #include "utility/evalue.h"
 30 | #include "containers/vertices.h"
 31 | 
 32 | #include "owler/owler_data.h"
 33 | 
 34 | #include "minimizer_index/minimizer_index.h"
 35 | 
 36 | #include "utility/tictoc.h"
 37 | 
 38 | 
 39 | 
 40 | class Owler { // the "owler" tool (Overlapping With Long Erroneous Reads); Run() is its only public operation
 41 |  public:
 42 |   Owler();
 43 |   ~Owler();
 44 | 
 45 |   // Main function for running the mapping process. It generates/loads the index, and handles batch loading of sequences from the reads file.
 46 |   void Run(ProgramParameters &parameters);
 47 | 
 48 |  private:
 49 |   std::shared_ptr<SequenceFile> ref_;            // NOTE(review): presumably the reference sequences - confirm in the implementation
 50 |   std::shared_ptr<SequenceFile> reads_;          // NOTE(review): presumably the reads being overlapped - confirm
 51 |   std::shared_ptr<is::MinimizerIndex> index_;    // NOTE(review): presumably the index built or loaded by BuildIndex_() - confirm
 52 |   clock_t run_begin_time_;                       // NOTE(review): presumably the clock() value at the start of Run() - confirm
 53 |   clock_t run_end_time_;                         // NOTE(review): presumably the clock() value at the end of Run() - confirm
 54 | 
 55 |   // Opens the output SAM file for writing if the path is specified. If the path is empty, then output is set to STDOUT.
 56 |   FILE* OpenOutFile_(std::string out_sam_path="");
 57 | 
 58 |   // Generates or loads the index of the reference genome.
 59 |   int BuildIndex_(ProgramParameters &parameters);
 60 | 
 61 |   // Process the loaded batch of reads. Uses OpenMP to do it in parallel. Calls ProcessOneRead for each read in the SequenceFile.
 62 |   int ProcessSequenceFileInParallel_(ProgramParameters &parameters, std::shared_ptr<SequenceFile> reads, TicToc &tt_all, FILE *fp_out);
 63 | 
 64 |   int ProcessRead_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data); // per-read entry point; results are returned through owler_data
 65 | 
 66 |   int CollectHits_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data);
 67 | 
 68 | //  int ClusterHits_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, int32_t diag_epsilon, OwlerData &owler_data);
 69 |   int ClusterHits2_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, int32_t diag_epsilon, OwlerData &owler_data); // the variant without the "2" suffix is commented out above
 70 | 
 71 |   void GenerateOutput_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, OwlerData &owler_data);
 72 | 
 73 | 
 74 |   void AppendSeedHits_(const uint128_t& seed, std::shared_ptr<is::MinimizerIndex> index, bool threshold_hits, double count_cutoff, bool is_overlapper, int64_t qid, std::vector<uint128_t> &all_hits); // output parameter: all_hits
 75 | 
 76 |   int WrapLCSk_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, const std::vector<uint128_t> &hits, int64_t begin_hit, int64_t end_hit, int32_t seed_len, PairwiseOverlap &overlap); // works on hits in the index range given by begin_hit/end_hit; output parameter: overlap
 77 | 
 78 |   void LCSk_(std::vector<uint128_t> &events, int64_t n, int64_t k, std::vector<uint64_t> &matches_starts, std::vector<uint64_t> &matches_indices, std::vector<int32_t> &lcsk_indices, int64_t &lcsk_len); // output parameters: lcsk_indices, lcsk_len
 79 | 
 80 |   void FilterColinear_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters,
 81 |                        const std::vector<uint128_t> &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len, const std::vector<int32_t> &raw_lcsk_indices,
 82 |                        std::vector<int32_t> &lcsk_indices, std::vector<int32_t> *cluster_ids, int32_t &num_sv); // output parameters: lcsk_indices, cluster_ids, num_sv
 83 | 
 84 |   int PrepareEvents_(const std::vector<uint128_t> &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len,
 85 |                             std::vector<uint128_t> &events, std::vector<uint64_t> &matches_starts, std::vector<uint64_t> &matches_indices, int64_t &max_seq_len); // output parameters: events, matches_starts, matches_indices, max_seq_len
 86 | 
 87 |   int CalcCoveredBases_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters,
 88 |                               const std::vector<uint128_t> &hits, int64_t begin_hit, int64_t end_hit, int64_t seed_len, PairwiseOverlap &overlap); // output parameter: overlap
 89 | 
 90 |   bool CheckOverlap_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, PairwiseOverlap& overlap);
 91 | 
 92 | 
 93 | 
 94 |   std::string GenerateMHAPLine_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap); // one output line per overlap, MHAP flavour
 95 |   std::string GeneratePAFLine_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap); // one output line per overlap, PAF flavour
 96 | 
 97 |   std::string GenerateDebugInfo_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const ProgramParameters *parameters, const PairwiseOverlap& overlap);
 98 |   int64_t CalcEditDist_(std::shared_ptr<is::MinimizerIndex> index, const SingleSequence *read, const PairwiseOverlap& overlap);
 99 |   double CalcRatio_(const PairwiseOverlap& overlap);
100 | 
101 |   int FilterAnchorBreakpoints_(const std::vector<int32_t> &lcskpp_indices, int64_t ref_hits_start, int64_t ref_hits_end, int64_t seed_length,
102 |                                      int64_t min_cluster_length, float min_cluster_coverage, const std::vector<uint128_t> &hits,
103 |                                      const ProgramParameters* parameters, std::vector<int32_t> &ret_filtered_lcskpp_indices,
104 |                                      std::vector<int32_t> *ret_cluster_ids); // output parameters: the two ret_* arguments
105 |   bool CheckDistanceTooBig_(const std::vector<uint128_t> &hits, int64_t index_last, int64_t index_current, float error_rate);
106 | 
107 |   void WriteHits_(std::string out_path, const std::vector<uint128_t> &hits, int64_t hits_start, int64_t hits_end,
108 |                  int64_t ref_id, std::string read_header, int64_t read_length,
109 |                  std::string reference_header, int64_t reference_length,
110 |                  const std::vector<int32_t> *indices_to_output, const std::vector<int32_t> *cluster_ids);
111 | 
112 |   static inline uint128_t MakeHit_(const uint128_t& seq_id, const uint128_t& diag, const uint128_t& pos_ref, const uint128_t& pos_read) { // packs four fields into one 128-bit hit
113 |     return ((seq_id << 96) | (diag << 64) | (pos_ref << 32) | (pos_read)); // seq_id from bit 96, diag from bit 64, pos_ref from bit 32, pos_read from bit 0; inputs are OR-ed without masking
114 |   }
115 | 
116 |   static inline int32_t HitPosRead_(const uint128_t& hit) { // read position: masked with kSeedMask32_1 (defined elsewhere), no shift
117 |     return (int32_t) (hit & kSeedMask32_1);
118 |   }
119 | 
120 |   static inline int32_t HitPosRef_(const uint128_t& hit) { // reference position: masked with kSeedMask32_2, then shifted down by 32 bits
121 |     return (int32_t) ((hit & kSeedMask32_2) >> 32);
122 |   }
123 | 
124 |   static inline int32_t HitDiag_(const uint128_t& hit) { // diagonal: masked with kSeedMask32_3, then shifted down by 64 bits
125 |     return (int32_t) ((hit & kSeedMask32_3) >> 64);
126 |   }
127 | 
128 |   static inline int32_t HitSeqId_(const uint128_t& hit) { // sequence id: masked with kSeedMask32_4, then shifted down by 96 bits
129 |     return (int32_t) ((hit & kSeedMask32_4) >> 96);
130 |   }
131 | };
132 | 
133 | #endif /* OWLER_H_ */
134 | 


--------------------------------------------------------------------------------
/src/owler/owler_data.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * owler_data.h
 3 |  *
 4 |  *  Created on: Jul 2, 2015
 5 |  *      Author: isovic
 6 |  */
 7 | 
 8 | #ifndef OWLER_DATA_H_
 9 | #define OWLER_DATA_H_
10 | 
11 | #include <memory>
12 | #include <stdlib.h>
13 | #include <stdint.h>
14 | #include <string>
15 | #include <sstream>
16 | #include <vector>
17 | 
18 | #include "minimizer_index/minimizer_index.h"
19 | #include "containers/range.h"
20 | 
21 | class PairwiseOverlap { // plain record describing one candidate overlap between a query (qid) and a target (tid)
22 |  public:
23 | //  PairwiseOverlap() : qid(0), tid(0), num_seeds(0), cov_bases(0), num_sv(0) { }
24 |   PairwiseOverlap(int64_t _qid, int64_t _tid, int64_t _tid_fwd) : qid(_qid), tid(_tid), tid_fwd(_tid_fwd), num_seeds(0), num_hits(0), cov_bases_query(0), cov_bases_target(0), num_sv(0), lcsk_len(0) { } // stores the three ids and zeroes every counter; ranges, vectors and reject_reason are default-constructed
25 | 
26 |   Range query, target;              // NOTE(review): presumably the overlap's extent on the query and on the target - confirm against Range
27 |   int64_t qid, tid, tid_fwd;        // NOTE(review): tid_fwd is presumably the target id on the forward strand - confirm
28 |   int64_t num_seeds;                // Number of seed hits which survived all LCSk filters.
29 |   int64_t num_hits;                 // Number of raw seed hits, without any sort of LCSk filtering.
30 |   int64_t cov_bases_query;          // NOTE(review): presumably filled by Owler::CalcCoveredBases_() - confirm
31 |   int64_t cov_bases_target;         // NOTE(review): target-side counterpart of cov_bases_query - confirm
32 |   int32_t num_sv;                   // NOTE(review): same name and type as the num_sv output of Owler::FilterColinear_() - confirm meaning there
33 | 
34 |   std::vector<int32_t> lcsk_indices;
35 |   std::vector<int32_t> cluster_ids;
36 |   int64_t lcsk_len;
37 | 
38 |   std::string reject_reason;        // NOTE(review): presumably left empty unless the overlap is rejected - confirm
39 | };
40 | 
41 | class OwlerData { // per-read working data passed by reference through the Owler::*_ processing steps
42 |  public:
43 |   OwlerData() { };
44 |   ~OwlerData() { };
45 | 
46 |   std::vector<uint128_t> hits;            // packed 128-bit seed hits (same element type as the hits vectors in Owler's helpers)
47 |   std::vector<PairwiseOverlap> overlaps;  // candidate overlaps found for the read
48 |   std::string unmapped_reason;            // NOTE(review): presumably explains why no overlap was reported - confirm
49 |   std::string overlap_lines;              // NOTE(review): presumably the formatted output text for this read - confirm
50 | //  std::vector<std::string> out_lines;
51 | };
52 | 
53 | #endif /* OWLER_DATA_H_ */
54 | 


--------------------------------------------------------------------------------
/src/program_parameters.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * program_parameters.h
  3 |  *
  4 |  *  Created on: Jul 24, 2014
  5 |  *      Author: ivan
  6 |  */
  7 | 
  8 | #ifndef PROGRAM_PARAMETERS_H_
  9 | #define PROGRAM_PARAMETERS_H_
 10 | 
 11 | #include <stdio.h>
 12 | #include <ctype.h>
 13 | #include <stdlib.h>
 14 | #include <unistd.h>
 15 | #include <string>
 16 | #include <sstream>
 17 | 
 18 | #define SOFTWARE_NAME "GraphMap" // program name as a string literal
 19 | #define GRAPHMAP_CURRENT_VERSION "v0.6.5" // version string; main() prints it after "Version: "
 20 | #define GRAPHMAP_CURRENT_VERSION_RELEASE_DATE (std::string(__DATE__) + std::string(" at ") + std::string(__TIME__)) // despite the name this is the compile date/time of the translation unit expanding it (a std::string, printed by main() as "Build date"); earlier alternatives: __TIMESTAMP__, "12 October 2014"
 21 | #define COPYRIGHT "Copyright Ivan Sovic, Mile Sikic and Niranjan Nagarajan, 2015, 2016.\n" /* copyright line, a blank line, then the same affiliation text as AFFILIATIONS */ \
 22 |                   "\n" \
 23 |                   "Affiliations: Ivan Sovic (1, 3), Mile Sikic (2), Niranjan Nagarajan (3)\n" \
 24 |                   "  (1) Ruder Boskovic Institute, Zagreb, Croatia\n" \
 25 |                   "  (2) University of Zagreb, Faculty of Electrical Engineering and Computing\n" \
 26 |                   "  (3) Genome Institute of Singapore, A*STAR, Singapore\n"
 27 | 
 28 | #define LICENCE_INFORMATION   \
 29 |   "GraphMap (c) by Ivan Sovic, Mile Sikic and Niranjan Nagarajan\n" \
 30 |   "GraphMap is licensed under The MIT License.\n" \
 31 | 
 32 | #define AFFILIATIONS \
 33 |   "Affiliations: Ivan Sovic (1, 3), Mile Sikic (2), Niranjan Nagarajan (3)\n" \
 34 |   "  (1) Ruder Boskovic Institute, Zagreb, Croatia\n" \
 35 |   "  (2) University of Zagreb, Faculty of Electrical Engineering and Computing\n" \
 36 |   "  (3) Genome Institute of Singapore, A*STAR, Singapore\n"
 37 | 
 38 | struct ProgramParameters {
 39 |   std::string subprogram = "";
 40 | 
 41 |   int64_t k_region = 13;                    // 'j', Kmer size for region search (binning).
 42 |   int64_t k_graph = 6;                      // 'k', Kmer size for graph building.
 43 |   int64_t num_links = 9;                   // 'l', Number of backward edges to check.
 44 |   float error_rate = 0.45;                 // 'e', Approximate error rate of the input read sequences.
 45 |   int64_t start_read = 0;                   // 's', Start processing reads from the one specified.
 46 |   int64_t num_reads_to_process = -1;        // 'n', Number of reads to process. If equal to -1, all reads will be processed.
 47 |   int64_t debug_read = -1;                  // 'y', Verbose output for read marked with this variable.
 48 |   std::string debug_read_by_qname = "";
 49 |   int64_t num_threads = -1;                 // 't', Number of threads to use. If equal to -1, number of threads will be equal to number of processors.
 50 |   std::string reference_path = "";          // 'r', The path to the reference file.
 51 |   std::string index_file = "";    // 'i', The path to the reference file's index. If it does not exist, index will be created in this path.
 52 |   std::string reads_path = "";              // 'd', The path to the reads file, in FASTA or FASTQ format.
 53 |   std::string out_sam_path = "";            // 'o', The output path. If left blank, all sam output will be placed to stdout.
 54 |   int64_t verbose_sam_output = 0;           // 'b', Helpful debug comments can be placed in SAM output lines (at the end), however, some tools (like SAMtools) don't quite like them. Comments can be turned of by setting this variable to 0. Different values increase/decrease verbosity level.
 55 |   int64_t verbose_level = 5;                // 'v', Verbose level. If equal to 0 nothing except strict output will be placed on stdout.
 56 |   std::string command_line = "";            // The actuall commandline that was used to generate the parameters.
 57 |   int64_t max_num_regions_cutoff = 0;     // 'q' Before the read is skipped, it will be attempted to reduce the number of selected regions if their number is higher than max_num_regions_cutoff.
 58 |   int64_t max_num_regions = 0;            // 'g' If still more regions than this are selected, the read is too ambiguous for processing, so it will be skipped.
 59 | 
 60 |   // Binning parameters
 61 | //  int64_t max_num_hits = 0;               // 'm' Maximum number of hits per kmer during the binning process.
 62 |   bool skip_multiple_kmers_per_bin = true;  // 'p' One kmer of a read can have multiple hits withing the same bin. If true, this parameter prevents this.
 63 | 
 64 |   bool output_in_original_order = false;    // 'u' If true, SAM alignments will be output after the processing has finished, in the order of input reads.
 65 |   int64_t kmer_step = 1;              // 'w' The number of bases to skip between beginnings of every adjecent kmer.
 66 | 
 67 |   std::string reads_folder = "";            // 'D', The path to a folder that contains reads, in FASTA or FASTQ format. Intended for batch processing.
 68 |   std::string output_folder = "";           // 'O', The path to the output folder for batch processing.
 69 |   bool process_reads_from_folder = false;
 70 |   int64_t batch_size_in_mb = -1;             // 'B', specifies the size of a batch for sequence loading. If <= 0, all sequences will be loaded at once, otherwise the specified number of megabytes will be loaded consequentially.
 71 |   std::string alignment_algorithm = "sg";  // 'a', specifies whether EDlib or SSW or hybrid should be used for realignment in the last step.
 72 |   std::string alignment_approach = "normal";      // 'w'
 73 |   bool calc_only_index = false;
 74 |   int64_t match_score = 5;
 75 |   int64_t mex_score = 1;
 76 |   int64_t mismatch_penalty = 4;
 77 |   int64_t gap_open_penalty = 8;
 78 |   int64_t gap_extend_penalty = 6;
 79 |   int64_t evalue_match = 5;
 80 |   int64_t evalue_mismatch = 4;
 81 |   int64_t evalue_gap_open = 8;
 82 |   int64_t evalue_gap_extend = 6;
 83 |   bool is_reference_circular = false;     // 'C'
 84 |   std::string composite_parameters = "";  // 'x', specifies several parameters at the same time, such as 'nanopore' and 'illumina'.
 85 |   float margin_for_ambiguity = 0.05;  // All mapping positions within the given fraction of the top score will be counted for ambiguity (mapping quality). Value of 0.0f counts only identical mappings.
 86 |   bool output_multiple_alignments = false;  // If 0, only one best alignment will be output. Otherwise, all alignments within margin_for_ambiguity will be output to a file.
 87 |   bool use_double_index = false; // If false, only one index will be used, but the memory consumption will be reduced by half. If false, sensitive and memory-hungry mode will be used.
 88 |   int64_t min_num_anchor_bases = 12;
 89 |   double evalue_threshold = -1;
 90 |   int64_t mapq_threshold = 0;
 91 |   std::string infmt = "auto";
 92 |   std::string outfmt = "sam";
 93 | 
 94 | //  bool extend_aln_to_end = true;
 95 | 
 96 |   bool use_extended_cigar = false;
 97 | 
 98 |   int64_t min_read_len = 80;      // If a read is shorter than this, it will be marked as unmapped.
 99 | 
100 |   double min_bin_percent = 0.75f;
101 |   double bin_threshold_step = 0.10f;
102 | 
103 |   bool use_spliced = false;
104 |   bool use_split = false;
105 |   bool disable_end_to_end = true;
106 |   bool overlapper = false;
107 |   bool no_self_hits = false;
108 |   bool rebuild_index = false;
109 | 
110 |   double max_error_rate = 1.0f;
111 |   double max_indel_error_rate = 1.0f;
112 | 
113 |   std::string gtf_path;
114 |   bool is_transcriptome = false;
115 | 
116 |   bool auto_rebuild_index = false;
117 | 
118 |   bool use_minimizers = false;
119 |   int64_t minimizer_window = 5;
120 |   bool threshold_hits = false;
121 |   double frequency_percentil = 0.99;
122 |   bool index_on_the_fly = false;
123 |   std::string index_shape = "1111110111111";
124 |   bool load_index = false;
125 |   bool store_index = false;
126 |   int64_t min_overlap_len = 100;
127 |   double overhang_percent = 0.20;
128 |   int64_t max_allowed_overhang = 1000;
129 |   double min_percent_cov_bases = 0.01;
130 |   int64_t min_num_seeds = 4;
131 | 
132 |   double anchor_chain_indel_bandwidth = 0.23; // error_rate / 2 + 0.1f;
133 |   int64_t anchor_chain_max_dist = 200;
134 |   int64_t anchor_chain_min_cov_bases = 50;
135 |   int64_t anchor_chain_size_cutoff = 2;
136 | };
137 | 
// Argument-handling and help entry points. Only the declarations are visible in this header;
// the notes below are inferred from names and signatures and should be confirmed against the
// definitions.

// Presumably parses argc/argv for the GraphMap subprogram into *parameters.
// The meaning of the int result is not visible here -- TODO confirm.
int ProcessArgsGraphMap(int argc, char **argv, ProgramParameters *parameters);
// Presumably the same as above for the Owler subprogram -- TODO confirm.
int ProcessArgsOwler(int argc, char **argv, ProgramParameters *parameters);
// Presumably prints the values held in *parameters -- TODO confirm the output stream.
void VerboseProgramParameters(ProgramParameters *parameters);
// Going by the name, prints a short help text and terminates the process -- TODO confirm.
void VerboseShortHelpAndExit(int argc, char **argv);
142 | 
143 | #endif /* PROGRAM_PARAMETERS_H_ */
144 | 


--------------------------------------------------------------------------------
/src/sparsehash/COPYING:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2005, Google Inc.
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are
 6 | met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 | notice, this list of conditions and the following disclaimer.
10 |     * Redistributions in binary form must reproduce the above
11 | copyright notice, this list of conditions and the following disclaimer
12 | in the documentation and/or other materials provided with the
13 | distribution.
14 |     * Neither the name of Google Inc. nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/src/sparsehash/internal/libc_allocator_with_realloc.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2010, Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ---
 31 | 
 32 | #ifndef UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
 33 | #define UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
 34 | 
 35 | #include <sparsehash/internal/sparseconfig.h>
 36 | #include <stdlib.h>           // for malloc/realloc/free
 37 | #include <stddef.h>           // for ptrdiff_t
 38 | #include <new>                // for placement new
 39 | 
 40 | _START_GOOGLE_NAMESPACE_
 41 | 
 42 | template<class T>
 43 | class libc_allocator_with_realloc {
 44 |  public:
 45 |   typedef T value_type;
 46 |   typedef size_t size_type;
 47 |   typedef ptrdiff_t difference_type;
 48 | 
 49 |   typedef T* pointer;
 50 |   typedef const T* const_pointer;
 51 |   typedef T& reference;
 52 |   typedef const T& const_reference;
 53 | 
 54 |   libc_allocator_with_realloc() {}
 55 |   libc_allocator_with_realloc(const libc_allocator_with_realloc&) {}
 56 |   ~libc_allocator_with_realloc() {}
 57 | 
 58 |   pointer address(reference r) const  { return &r; }
 59 |   const_pointer address(const_reference r) const  { return &r; }
 60 | 
 61 |   pointer allocate(size_type n, const_pointer = 0) {
 62 |     return static_cast<pointer>(malloc(n * sizeof(value_type)));
 63 |   }
 64 |   void deallocate(pointer p, size_type) {
 65 |     free(p);
 66 |   }
 67 |   pointer reallocate(pointer p, size_type n) {
 68 |     return static_cast<pointer>(realloc(p, n * sizeof(value_type)));
 69 |   }
 70 | 
 71 |   size_type max_size() const  {
 72 |     return static_cast<size_type>(-1) / sizeof(value_type);
 73 |   }
 74 | 
 75 |   void construct(pointer p, const value_type& val) {
 76 |     new(p) value_type(val);
 77 |   }
 78 |   void destroy(pointer p) { p->~value_type(); }
 79 | 
 80 |   template <class U>
 81 |   libc_allocator_with_realloc(const libc_allocator_with_realloc<U>&) {}
 82 | 
 83 |   template<class U>
 84 |   struct rebind {
 85 |     typedef libc_allocator_with_realloc<U> other;
 86 |   };
 87 | };
 88 | 
 89 | // libc_allocator_with_realloc<void> specialization.
 90 | template<>
 91 | class libc_allocator_with_realloc<void> {
 92 |  public:
 93 |   typedef void value_type;
 94 |   typedef size_t size_type;
 95 |   typedef ptrdiff_t difference_type;
 96 |   typedef void* pointer;
 97 |   typedef const void* const_pointer;
 98 | 
 99 |   template<class U>
100 |   struct rebind {
101 |     typedef libc_allocator_with_realloc<U> other;
102 |   };
103 | };
104 | 
105 | template<class T>
106 | inline bool operator==(const libc_allocator_with_realloc<T>&,
107 |                        const libc_allocator_with_realloc<T>&) {
108 |   return true;
109 | }
110 | 
111 | template<class T>
112 | inline bool operator!=(const libc_allocator_with_realloc<T>&,
113 |                        const libc_allocator_with_realloc<T>&) {
114 |   return false;
115 | }
116 | 
117 | _END_GOOGLE_NAMESPACE_
118 | 
119 | #endif  // UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
120 | 


--------------------------------------------------------------------------------
/src/sparsehash/internal/sparseconfig.h:
--------------------------------------------------------------------------------
/*
 * NOTE: This file is for internal use only.
 *       Do not use these #defines in your own program!
 *
 * There is no include guard; every macro below is an object-like macro that is
 * redefined with the identical replacement list when the file is included again.
 */

/* Namespace for Google classes (fully qualified form, for use in expressions) */
#define GOOGLE_NAMESPACE ::google

/* the location of the header defining hash functions */
/* NOTE(review): this is the TR1 header; confirm it exists on every targeted toolchain. */
#define HASH_FUN_H <tr1/functional>

/* the namespace of the hash<> function */
#define HASH_NAMESPACE std::tr1

/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1

/* Define to 1 if the system has the type `long long'. */
#define HAVE_LONG_LONG 1

/* Define to 1 if you have the `memcpy' function. */
#define HAVE_MEMCPY 1

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* Define to 1 if the system has the type `uint16_t'. */
#define HAVE_UINT16_T 1

/* Define to 1 if the system has the type `u_int16_t'. */
#define HAVE_U_INT16_T 1

/* Define to 1 if the system has the type `__uint16'. (left undefined here) */
/* #undef HAVE___UINT16 */

/* The system-provided hash function including the namespace. */
/* Expands to std::tr1::hash through HASH_NAMESPACE above. */
#define SPARSEHASH_HASH HASH_NAMESPACE::hash

/* Stops putting the code inside the Google namespace (expands to the closing brace) */
#define _END_GOOGLE_NAMESPACE_ }

/* Puts following code inside the Google namespace (expands to `namespace google {') */
#define _START_GOOGLE_NAMESPACE_ namespace google {
47 | 


--------------------------------------------------------------------------------
/src/sparsehash/template_util.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2005 Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ----
 31 | //
 32 | // Template metaprogramming utility functions.
 33 | //
 34 | // This code is compiled directly on many platforms, including client
 35 | // platforms like Windows, Mac, and embedded systems.  Before making
 36 | // any changes here, make sure that you're not breaking any platforms.
 37 | //
 38 | //
 39 | // The names chosen here reflect those used in tr1 and the boost::mpl
 40 | // library, there are similar operations used in the Loki library as
 41 | // well.  I prefer the boost names for 2 reasons:
 42 | // 1.  I think that portions of the Boost libraries are more likely to
 43 | // be included in the c++ standard.
 44 | // 2.  It is not impossible that some of the boost libraries will be
 45 | // included in our own build in the future.
 46 | // Both of these outcomes means that we may be able to directly replace
 47 | // some of these with boost equivalents.
 48 | //
 49 | #ifndef BASE_TEMPLATE_UTIL_H_
 50 | #define BASE_TEMPLATE_UTIL_H_
 51 | 
 52 | #include <sparsehash/internal/sparseconfig.h>
 53 | _START_GOOGLE_NAMESPACE_
 54 | 
 55 | // Types small_ and big_ are guaranteed such that sizeof(small_) <
 56 | // sizeof(big_)
 57 | typedef char small_;
 58 | 
 59 | struct big_ {
 60 |   char dummy[2];
 61 | };
 62 | 
 63 | // Identity metafunction.
 64 | template <class T>
 65 | struct identity_ {
 66 |   typedef T type;
 67 | };
 68 | 
 69 | // integral_constant, defined in tr1, is a wrapper for an integer
 70 | // value. We don't really need this generality; we could get away
 71 | // with hardcoding the integer type to bool. We use the fully
 72 | // general integral_constant for compatibility with tr1.
 73 | 
 74 | template<class T, T v>
 75 | struct integral_constant {
 76 |   static const T value = v;
 77 |   typedef T value_type;
 78 |   typedef integral_constant<T, v> type;
 79 | };
 80 | 
 81 | template <class T, T v> const T integral_constant<T, v>::value;
 82 | 
 83 | 
 84 | // Abbreviations: true_type and false_type are structs that represent boolean
 85 | // true and false values. Also define the boost::mpl versions of those names,
 86 | // true_ and false_.
 87 | typedef integral_constant<bool, true>  true_type;
 88 | typedef integral_constant<bool, false> false_type;
 89 | typedef true_type  true_;
 90 | typedef false_type false_;
 91 | 
 92 | // if_ is a templatized conditional statement.
 93 | // if_<cond, A, B> is a compile time evaluation of cond.
 94 | // if_<>::type contains A if cond is true, B otherwise.
 95 | template<bool cond, typename A, typename B>
 96 | struct if_{
 97 |   typedef A type;
 98 | };
 99 | 
100 | template<typename A, typename B>
101 | struct if_<false, A, B> {
102 |   typedef B type;
103 | };
104 | 
105 | 
106 | // type_equals_ is a template type comparator, similar to Loki IsSameType.
107 | // type_equals_<A, B>::value is true iff "A" is the same type as "B".
108 | //
109 | // New code should prefer base::is_same, defined in base/type_traits.h.
110 | // It is functionally identical, but is_same is the standard spelling.
111 | template<typename A, typename B>
112 | struct type_equals_ : public false_ {
113 | };
114 | 
115 | template<typename A>
116 | struct type_equals_<A, A> : public true_ {
117 | };
118 | 
119 | // and_ is a template && operator.
120 | // and_<A, B>::value evaluates "A::value && B::value".
121 | template<typename A, typename B>
122 | struct and_ : public integral_constant<bool, (A::value && B::value)> {
123 | };
124 | 
125 | // or_ is a template || operator.
126 | // or_<A, B>::value evaluates "A::value || B::value".
127 | template<typename A, typename B>
128 | struct or_ : public integral_constant<bool, (A::value || B::value)> {
129 | };
130 | 
131 | 
132 | _END_GOOGLE_NAMESPACE_
133 | 
134 | #endif  // BASE_TEMPLATE_UTIL_H_
135 | 


--------------------------------------------------------------------------------