├── .gitignore
├── .gitmodules
├── INSTALL.md
├── LICENCE
├── Makefile
├── README.md
├── doc
├── GraphMap-description.pdf
├── README-v0.21.md
├── README-v0.22.md
├── changelog.md
├── img
│ ├── anchors-normal.png
│ ├── anchors-rna.png
│ ├── region_selection-rna.png
│ └── region_selection.png
├── rnaseq.md
└── sam_output.md
├── overlap.md
├── reproducibility
├── README.md
├── run.py
└── setup.py
├── scripts
└── scatterplot8.py
└── src
├── aligner
├── aligner_base.h
├── aligner_containers.h
├── aligner_ksw2.cc
├── aligner_ksw2.h
├── aligner_util.cc
├── aligner_util.hpp
├── anchor_aligner.cc
├── anchor_aligner.h
├── pairwise_penalties.h
├── sam_parser.cc
└── sam_parser.h
├── alignment
├── alignment.cc
├── alignment.h
├── alignment_wrappers.cc
├── alignment_wrappers.h
├── anchored.cc
├── cigargen.cc
├── cigargen.h
├── semiglobal.cc
├── transcriptome_mod.cc
└── transcriptome_mod.h
├── containers
├── mapping_data.cc
├── mapping_data.h
├── path_graph_entry.cc
├── path_graph_entry.h
├── range.h
├── raw_alignment.h
├── region.cc
├── region.h
├── results.h
├── score_registry.cc
├── score_registry.h
├── vertices.cc
└── vertices.h
├── graphmap
├── core_graphmap.cc
├── experimental.cc
├── filter_anchors.cc
├── filter_anchors.h
├── graphmap.cc
├── graphmap.h
├── lcs_anchored.cc
├── lcs_semiglobal.cc
├── process_read.cc
├── region_selection.cc
├── rna.cc
├── transcriptome.cc
└── transcriptome.h
├── index
├── index_util.cc
└── index_util.h
├── ksw2
├── LICENSE.txt
├── kalloc.cc
├── kalloc.h
├── kseq.h
├── ksw2.h
├── ksw2_extd2_sse.cc
├── ksw2_exts2_sse.cc
├── ksw2_extz2_sse.cc
└── ksw2_ll_sse.cc
├── main.cc
├── owler
├── lcsk.cc
├── owler.cc
├── owler.h
├── owler_data.h
├── owler_experimental.cc
└── process_read.cc
├── program_parameters.cc
├── program_parameters.h
└── sparsehash
├── COPYING
├── dense_hash_map
├── dense_hash_set
├── internal
├── densehashtable.h
├── hashtable-common.h
├── libc_allocator_with_realloc.h
├── sparseconfig.h
└── sparsehashtable.h
├── sparse_hash_map
├── sparse_hash_set
├── sparsetable
├── template_util.h
└── type_traits.h
/.gitignore:
--------------------------------------------------------------------------------
1 | deprecated/
2 | temp/
3 | temp/*
4 | obj/
5 | obj_debug/
6 | obj_linux/
7 | obj_mac/
8 | obj_test/
9 | obj_testext/
10 | obj_extcigar/
11 | # bin/graphmap-not_release
12 | # bin/graphmap-debug
13 | bin/
14 | .project
15 | .cproject
16 | .settings/
17 | reproducibility/*/
18 | test-data/
19 | .vscode*
20 |
21 | !reproducibility/*.py
22 | !reproducibility/*.md
23 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "codebase/seqlib"]
2 | path = codebase/seqlib
3 | url = https://github.com/isovic/seqlib.git
4 | [submodule "codebase/argumentparser"]
5 | path = codebase/argumentparser
6 | url = https://github.com/isovic/argumentparser.git
7 | [submodule "codebase/gindex"]
8 | path = codebase/gindex
9 | url = https://github.com/isovic/gindex
10 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | ## 1. Installation
2 |
3 | You will need a recent GCC/G++ version (>=4.7) to compile the source.
4 |
5 | To override the default compiler choice you can set GCC (or GCC_MAC on Mac), e.g.:
6 |
7 | ```
8 | GCC=/usr/local/bin/g++ make
9 | ```
10 |
11 | ### 1.1 Initialize submodules
12 | This will automatically initialize/pull the latest version of submodules.
13 | ```
14 | make modules
15 | ```
16 |
17 | Submodules are used as source files, so there is no need to pre-compile them in any way.
18 |
19 |
20 | ### 1.2 Linux
21 | For a Linux release version type:
22 | ```
23 | make
24 | ```
25 |
26 | To clean, type:
27 | ```
28 | make clean
29 | ```
30 |
31 | One can also rebuild, which will cause clean and make to be run sequentially:
32 | ```
33 | make rebuild
34 | ```
35 |
36 | ### 1.3 Mac
37 | ```
38 | make mac
39 |
40 | make cleanmac
41 | make rebuildmac
42 | ```
43 |
44 | ### 1.4. Compiling the debug version
45 | ```
46 | make debug
47 |
48 | make cleandebug
49 | make rebuilddebug
50 | ```
51 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Ivan Sovic, Mile Sikic and Niranjan Nagarajan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BIN = ./bin/graphmap2
2 | BIN_DEBUG = ./bin/graphmap-debug
3 | BIN_LINUX = ./bin/Linux-x64/graphmap2
4 | BIN_MAC = ./bin/Mac/graphmap
5 | OBJ_TESTING = ./obj_test
6 | OBJ_TESTING_EXT = ./obj_testext
7 | OBJ_DEBUG = ./obj_debug
8 | OBJ_LINUX = ./obj_linux
9 | OBJ_EXTCIGAR = ./obj_extcigar
10 | OBJ_MAC = ./obj_mac
11 | SOURCE = src
12 | CODEBASE = codebase
13 | # This finds all 'src' folders at maximum depth 2 (level one inside each submodule's folder).
14 | CODEBASE_SRC_FOLDERS = $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{} \;)
15 | # $(shell find $(CODEBASE) -maxdepth 3 -type d -name "libs" -exec echo "-I"{} \;)
16 | # $(shell find $(CODEBASE) -maxdepth 2 -type d -name "src" -exec echo "-I"{}"/*/" \;)
17 |
18 | # ? allows override by user using env var
19 | GCC ?= g++
20 | # Variables for the GCC >= 4.7 check. The minor-version flag is also set whenever the major version is above 4, so e.g. GCC 5.4 or 9.3 does not trigger a false warning.
21 | GCC_MAJOR_VERSION_GE_4 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \>= 4)
22 | GCC_MINOR_VERSION_GE_7 := $(shell expr `$(GCC) -dumpversion | cut -f1 -d.` \> 4 \| `$(GCC) -dumpversion | cut -f2 -d.` \>= 7)
23 | GCC_MAC ?= g++
24 |
25 |
26 | # CPP_FILES := $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp)
27 | # CC_FILES := $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) $(wildcard $(SOURCE)/libs/*/*.cc)
28 | # H_FILES := $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(SOURCE)/libs/*/*.h)
29 | CPP_FILES := $(wildcard $(CODEBASE)/*/src/*.cpp) $(wildcard $(CODEBASE)/*/src/libs/*/*.cpp) $(wildcard $(CODEBASE)/*/src/*/*.cpp) $(wildcard $(SOURCE)/*/*.cpp) $(wildcard $(SOURCE)/*.cpp) $(wildcard $(SOURCE)/libs/*/*.cpp)
30 | CC_FILES := $(wildcard $(CODEBASE)/*/src/*.cc) $(wildcard $(CODEBASE)/*/src/libs/*/*.cc) $(wildcard $(CODEBASE)/*/src/*/*.cc) $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc) $(wildcard $(SOURCE)/libs/*/*.cc)
31 | H_FILES := $(wildcard $(CODEBASE)/*/src/*.h) $(wildcard $(CODEBASE)/*/src/libs/*/*.h) $(wildcard $(CODEBASE)/*/src/*/*.h) $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h) $(wildcard $(CODEBASE)/*/src/*.hpp) $(wildcard $(CODEBASE)/*/src/*/*.hpp) $(wildcard $(SOURCE)/*/*.hpp) $(wildcard $(SOURCE)/*.hpp) $(wildcard $(SOURCE)/libs/*/*.h)
32 |
33 | OBJ_FILES := $(CPP_FILES:.cpp=.o) $(CC_FILES:.cc=.o)
34 | OBJ_FILES_FOLDER_TESTING := $(addprefix $(OBJ_TESTING)/,$(OBJ_FILES))
35 | OBJ_FILES_FOLDER_TESTING_EXT := $(addprefix $(OBJ_TESTING_EXT)/,$(OBJ_FILES))
36 | OBJ_FILES_FOLDER_DEBUG := $(addprefix $(OBJ_DEBUG)/,$(OBJ_FILES))
37 | OBJ_FILES_FOLDER_LINUX := $(addprefix $(OBJ_LINUX)/,$(OBJ_FILES))
38 | OBJ_FILES_FOLDER_EXTCIGAR := $(addprefix $(OBJ_EXTCIGAR)/,$(OBJ_FILES))
39 | OBJ_FILES_FOLDER_MAC := $(addprefix $(OBJ_MAC)/,$(OBJ_FILES))
40 |
41 | LIB_DIRS = -L"/usr/local/lib"
42 | CC_LIBS = -static-libgcc -static-libstdc++ -D__cplusplus=201103L
43 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"libs/libdivsufsort-2.0.1/build/include" -I"libs/seqan-library-1.4.2/include"
44 | # INCLUDE = -I"./src/" -I"/usr/include/" -I"src/libs/seqan-library-1.4.2/include"
45 | INCLUDE = -I"./src/" -I"/usr/include/" -I"$(CODEBASE)/seqlib/src/libs/seqan-library-2.0.1/include" -I"$(CODEBASE)/seqlib/src/libs/libdivsufsort-2.0.1-64bit/" $(CODEBASE_SRC_FOLDERS)
46 |
47 | CC_FLAGS_DEBUG = -O3 -g -rdynamic -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native
48 | CC_FLAGS_RELEASE = -DRELEASE_VERSION -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread # -march=native
49 | CC_FLAGS_EXTCIGAR = -DRELEASE_VERSION -DUSE_EXTENDED_CIGAR_FORMAT -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native
50 | CC_FLAGS_NOT_RELEASE = -g -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
51 | CC_FLAGS_NOT_RELEASE_EXT = -g -O3 -DUSE_EXTENDED_CIGAR_FORMAT -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
52 | LD_FLAGS = -static-libgcc -static-libstdc++ -m64 -ffreestanding
53 | # LD_LIBS = -lpthread -lgomp -lm -lz -ldivsufsort64
54 | LD_LIBS = -lpthread -lgomp -lm -lz
55 |
56 |
57 |
58 | all: gcc_version_check linux
59 |
60 | install: /usr/bin/graphmap2
61 | # The release binary is linked by the 'linux' target as $(BIN_LINUX); build it first, then copy it.
62 | /usr/bin/graphmap2: linux
63 | cp $(BIN_LINUX) /usr/bin/graphmap2
64 |
65 | modules:
66 | git submodule update --init --recursive
67 | # git submodule foreach git pull origin master
68 |
69 | testing: $(OBJ_FILES_FOLDER_TESTING)
70 | mkdir -p $(dir $(BIN))
71 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING) $(LD_LIBS)
72 |
73 | obj_test/%.o: %.cc $(H_FILES)
74 | mkdir -p $(dir $@)
75 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $<
76 |
77 | obj_test/%.o: %.cpp $(H_FILES)
78 | mkdir -p $(dir $@)
79 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $<
80 |
81 | testingext: $(OBJ_FILES_FOLDER_TESTING_EXT)
82 | mkdir -p $(dir $(BIN))
83 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING_EXT) $(LD_LIBS)
84 |
85 | obj_testext/%.o: %.cc $(H_FILES)
86 | mkdir -p $(dir $@)
87 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<
88 |
89 | obj_testext/%.o: %.cpp $(H_FILES)
90 | mkdir -p $(dir $@)
91 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<
92 |
93 |
94 |
95 | gcc_version_check:
96 | ifneq ($(GCC_MAJOR_VERSION_GE_4), 1)
97 | $(warning "*** WARNING $(GCC) major version <4 ***")
98 | endif
99 | ifneq ($(GCC_MINOR_VERSION_GE_7), 1)
100 | $(warning "*** WARNING $(GCC) minor version <7 ***")
101 | endif
102 |
103 |
104 | debug: $(OBJ_FILES_FOLDER_DEBUG)
105 | mkdir -p $(dir $(BIN_DEBUG))
106 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_DEBUG) $(OBJ_FILES_FOLDER_DEBUG) $(LD_LIBS)
107 |
108 | obj_debug/%.o: %.cc $(H_FILES)
109 | mkdir -p $(dir $@)
110 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $<
111 |
112 | obj_debug/%.o: %.cpp $(H_FILES)
113 | mkdir -p $(dir $@)
114 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_DEBUG) -o $@ $<
115 |
116 |
117 |
118 | linux: $(OBJ_FILES_FOLDER_LINUX)
119 | mkdir -p $(dir $(BIN_LINUX))
120 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_LINUX) $(LD_LIBS)
121 |
122 | obj_linux/%.o: %.cc $(H_FILES)
123 | mkdir -p $(dir $@)
124 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
125 |
126 | obj_linux/%.o: %.cpp $(H_FILES)
127 | mkdir -p $(dir $@)
128 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
129 |
130 |
131 |
132 | extcigar: $(OBJ_FILES_FOLDER_EXTCIGAR)
133 | mkdir -p $(dir $(BIN_LINUX))
134 | $(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_LINUX) $(OBJ_FILES_FOLDER_EXTCIGAR) $(LD_LIBS)
135 |
136 | obj_extcigar/%.o: %.cc $(H_FILES)
137 | mkdir -p $(dir $@)
138 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $<
139 |
140 | obj_extcigar/%.o: %.cpp $(H_FILES)
141 | mkdir -p $(dir $@)
142 | $(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_EXTCIGAR) -o $@ $<
143 |
144 |
145 |
146 | mac: $(OBJ_FILES_FOLDER_MAC)
147 | mkdir -p $(dir $(BIN_MAC))
148 | $(GCC_MAC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN_MAC) $(OBJ_FILES_FOLDER_MAC) $(LD_LIBS)
149 |
150 | obj_mac/%.o: %.cc $(H_FILES)
151 | mkdir -p $(dir $@)
152 | $(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
153 |
154 | obj_mac/%.o: %.cpp $(H_FILES)
155 | mkdir -p $(dir $@)
156 | $(GCC_MAC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_RELEASE) -o $@ $<
157 |
158 |
159 |
160 | # deps:
161 | # cd libs; cd libdivsufsort-2.0.1; make clean; rm -rf build; ./configure; mkdir build ;cd build; cmake -DBUILD_DIVSUFSORT64:BOOL=ON -DCMAKE_BUILD_TYPE="Release" -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX="/usr/local" .. ; make
162 |
163 |
164 |
165 | clean:
166 | -rm -rf $(OBJ_LINUX) $(BIN_LINUX)
167 |
168 | cleantesting:
169 | -rm -rf $(OBJ_TESTING) $(BIN)
170 |
171 | cleandebug:
172 | -rm -rf $(OBJ_DEBUG) $(BIN_DEBUG)
173 |
174 | cleanlinux:
175 | -rm -rf $(OBJ_LINUX) $(BIN_LINUX)
176 |
177 | cleanextcigar:
178 | -rm -rf $(OBJ_EXTCIGAR) $(BIN_LINUX)
179 |
180 | cleanmac:
181 | -rm -rf $(OBJ_MAC) $(BIN_MAC)
182 |
183 | cleanbin:
184 | -rm -rf bin/
185 | # Clean the Linux, testing, debug and Mac build trees, then remove bin/.
186 | cleanall: clean cleantesting cleandebug cleanmac cleanbin
187 |
188 |
189 |
190 | rebuild: clean all
191 |
192 | rebuilddebug: cleandebug debug
193 |
194 | rebuildlinux: cleanlinux linux
195 |
196 | rebuildtesting: cleantesting testing
197 |
198 | rebuildmac: cleanmac mac
199 |
200 | # divsufsort:
201 | # cd libs; ./build-libdivsufsort.sh
202 |
203 |
--------------------------------------------------------------------------------
/doc/GraphMap-description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/GraphMap-description.pdf
--------------------------------------------------------------------------------
/doc/README-v0.21.md:
--------------------------------------------------------------------------------
1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads
2 |
3 | Preprint of our paper now available on Biorxiv:
4 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719)
5 |
6 | Sequencing data of E. Coli UTI89 generated in-house and used in the paper now available on:
7 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557)
8 |
9 |
10 |
11 | **__Version: 0.21__**
12 | **Update**
13 | Release date: 02 June 2015
14 |
15 | New alignment mode available: anchored alignment.
16 |
17 | Anchored alignment is an alternative to the default semiglobal alignment. It is less sensitive than default semiglobal, but faster and creates alignments around determined homologies (anchors).
18 | This is a very powerful addition to alignment, as it creates highly accurate and confident alignments even in the presence of high error rates.
19 | To run the anchored alignment, use the '-a anchor' option.
20 |
21 | Standard Gotoh alignment is now also available as an alternative to the default Myers' bit-vector alignment. Custom alignment parameters can be specified via commandline.
22 | To use the Gotoh alignment, use '-a gotoh' commandline option.
23 |
24 | Additionally, E-value and mapping quality thresholds can now be applied directly from commandline (-z and -c options).
25 |
26 | More to follow.
27 |
28 |
29 |
30 | **__Version: v0.20b__**
31 | **Update**
32 | Release date: 26 April 2015
33 |
34 | Added the source code.
35 |
36 | To build from source:
37 | ```
38 | make
39 | ```
40 | If the libraries have to be recompiled on your system, type:
41 | ```
42 | make deps
43 | ```
44 | More installation instructions can be found in the INSTALL file.
45 |
46 |
47 | Release date: 02 April 2015
48 | Precompiled binary, built on Ubuntu 10.04 x64.
49 | Tested on Mint 17.1 x64.
50 |
51 | Significantly improved speed and sensitivity.
52 |
53 | Added some important features:
54 | - Mapping quality.
55 | - Sensible alignment score.
56 | - E-value added in reported alignments! Look for a custom ZE parameter in the SAM lines.
57 | - Secondary alignments can now be output as well (use the -Z parameter).
58 |
59 | Addressed several reported issues:
60 | - Output only the first whitespace-separated token in the qname field of the SAM output. Previously the entire read's header was output.
61 | - The same for the rname.
62 | - Reads that are marked unmapped no longer contain any additional mapping information, unlike before.
63 |
64 | Please note that by default, GraphMap will use more memory to allow higher speed and sensitivity.
65 | To run in parsimonious mode (half the memory requirements), please use the -P parameter.
66 |
67 | Note #2: some command line parameters were changed (removed/added) since the last version, but most stayed the same.
68 | In case you are using one of the removed parameters, you will be warned and the process will not run.
69 |
70 |
71 | **__Version: v0.19b__**
72 | Release date: 16 January 2015
73 | Precompiled binary, built on Ubuntu 10.04 x64.
74 | Tested on Mint 17.1 x64.
75 |
76 | **Update**
77 | Compiled a MacOS version too, now can also be found in the bin directory.
78 | Built on MacOS X 10.9.5
79 |
80 | Important updates:
81 | - Better support for circular genomes - use '-C' option if your reference is circular!
82 | - Added a more sensitive mode (though much slower) - check out the '-x' option in the help!
83 | - Better alignments for Illumina reads - again, check out the '-x' option.
84 | - Better dynamic of the AS (alignment score) - value 254 best score, value 0 worst/unmapped.
85 |
86 | To use the normal (fast) mode, simply use the default parameters (nothing is changed, just omit the '-x' option).
87 |
88 |
89 | **__Version: v0.18b__**
90 | Release date: 11 December 2014
91 | Precompiled binary, built on Ubuntu 10.04 x64.
92 | Tested on Mint 17 (Ubuntu 14.04), Ubuntu Server 14.04, Fedora 20 and Gentoo.
93 |
94 | ### Description
95 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data.
96 | It can handle Oxford Nanopore data with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers (namely, compared to BLASR and BWA-MEM).
97 |
98 | GraphMap was designed for ease-of-use: the default parameters can handle a wide range of read lengths and error profiles. This is an important feature for technologies where the error rates and error profiles can vary widely across sequencing runs. In addition, GraphMap allows users to uniformly map read datasets from disparate technologies with high sensitivity and accuracy. While GraphMap is not runtime optimized for short-read data (e.g. compared to Bowtie2), it provides accurate and typically more sensitive mappings for Illumina and Ion Torrent reads.
99 |
100 | Please keep in mind that this is an early development version and we welcome your comments and feedback on GraphMap.
101 |
102 | ### Comparison to other mappers
103 |
104 | Comparison statistics will be uploaded soon.
105 |
106 | ### Usage
107 |
108 | ```
109 | # Process all reads from a given FASTA/FASTQ file with default number of threads:
110 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam
111 |
112 | # Process reads using more sensitive parameters for Illumina data:
113 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam
114 |
115 | # Process reads from a circular genome:
116 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam
117 |
118 | # Limit the number of threads to 8, and load reads in batches of 50MB:
119 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
120 |
121 | # Process only the first 1000 reads:
122 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
123 |
124 | # Process all reads from a given folder.
125 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder
126 |
127 | # Generate only the index.
128 | ./graphmap -I -r escherichia_coli.fa
129 | ```
130 |
131 | ### Contact information
132 |
133 | For additional information, help and bug reports please send an email to one of the following:
134 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg
135 |
--------------------------------------------------------------------------------
/doc/README-v0.22.md:
--------------------------------------------------------------------------------
1 | ## GraphMap - A highly sensitive and accurate mapper for long, error-prone reads
2 | **__Current Version: 0.22__**
3 | Release date: 12 November 2015
4 |
5 | Updates:
6 | - Many tiny bug fixes, mostly related to anchored alignment. It should be slightly more sensitive now.
7 | - Two overlap modes merged from the dev branch: ```-w owler``` (fast, uses a trimmed GraphMap pipeline, reports output in the MHAP format) and ```-w overlapper``` (full GraphMap pipeline including alignment, output in SAM format). For usage - check examples at the bottom.
8 | - GraphMap integration into marginAlign - we forked marginAlign and extended it to support GraphMap alongside LAST and BWA-MEM ([https://github.com/isovic/marginAlign](https://github.com/isovic/marginAlign)). Use parameters ```--graphmap``` or ```--graphmapanchor``` with marginAlign to specify the mapper.
9 |
10 | For more information on overlapping, take a look at [overlap.md](overlap.md).
11 |
12 | GraphMap is also used as an overlapper in a new *de novo* genome assembly project called [Ra](https://github.com/mariokostelac/ra-integrate) ([https://github.com/mariokostelac/ra-integrate](https://github.com/mariokostelac/ra-integrate)).
13 | Ra attempts to create *de novo* assemblies from raw nanopore and PacBio reads without requiring error correction, for which a highly sensitive overlapper is required.
14 |
15 |
16 | ### Quick start on Linux x64
17 | ```
18 | git clone https://github.com/isovic/graphmap.git
19 | cd graphmap
20 | make
21 |
22 | # To align:
23 | ./bin/Linux-x64/graphmap -r reference.fa -d reads.fasta -o output.sam
24 |
25 | # To overlap:
26 | ./bin/Linux-x64/graphmap -w owler -r reads.fasta -d reads.fasta -o output.mhap
27 | ```
28 |
29 | ### Description
30 | GraphMap is a novel mapper targeted at aligning long, error-prone third-generation sequencing data.
31 | It is **designed to handle Oxford Nanopore MinION 1d and 2d reads** with very high sensitivity and accuracy, and also presents a significant improvement over the state-of-the-art for PacBio read mappers.
32 |
33 | GraphMap was also designed for ease-of-use: the **default parameters** can handle a wide range of read lengths and error profiles, including: *Illumina*, *PacBio* and *Oxford Nanopore*.
34 | This is an especially important feature for technologies where the error rates and error profiles can vary widely across, or even within, sequencing runs.
35 |
36 | **The GraphMap algorithm** is structured to achieve high-sensitivity and speed using a five-stage
37 | read-funneling approach. In stage I, GraphMap uses a novel adaptation of gapped spaced seeds to efficiently reduce the search space and get seed hits as a form of coarse alignment. These are then refined in stage II using graph-based vertex-centric processing of seeds to efficiently construct alignment anchors. GraphMap then chains anchors using a kmer
38 | version of longest common subsequence (LCS) construction (stage III), refines
39 | alignments with a form of L1 linear regression (stage IV) and finally evaluates the
40 | remaining candidates to select the best location to reconstruct a final alignment (stage V).
41 | GraphMap computes a BLAST-like E-value as well as a mapping quality for its alignments.
42 |
43 | **Evaluation** on MinION sequencing datasets against short and long-read mappers indicates that GraphMap increases mapping sensitivity by at least 15-80%. GraphMap alignments are the first to demonstrate consensus calling with <1 error in 100,000 bases, variant calling on the human genome with 76% improvement in sensitivity over the next best mapper (BWA-MEM), precise detection of structural variants from 100bp to 4kbp in length and species and strain-specific identification of pathogens using MinION reads.
44 |
45 | Further details about the algorithm, comparison with other mappers and usage applications can be found in the **preprint** of our paper:
46 | [Fast and sensitive mapping of error-prone nanopore sequencing reads with GraphMap](http://biorxiv.org/content/early/2015/06/10/020719)
47 |
48 | **Nanopore sequencing data** of E. Coli UTI89 generated in-house and used in the paper now available on ENA:
49 | [PRJEB9557](http://www.ebi.ac.uk/ena/data/view/PRJEB9557)
50 |
51 | ### Features
52 | - Mapping position agnostic to alignment parameters.
53 | - Consistently very high sensitivity and precision across different error profiles, rates and sequencing technologies even with default parameters.
54 | - Circular genome handling to resolve coverage drops near ends of the genome.
55 | - E-value.
56 | - Meaningful mapping quality.
57 | - Various alignment strategies (semiglobal bit-vector and Gotoh, anchored).
58 |
59 | ### Installation
60 | To build GraphMap from source type:
61 | ```
62 | make
63 | ```
64 | Required libraries are prebuilt for Linux x64 systems.
65 | To rebuild them for other systems, type:
66 | ```
67 | make deps
68 | ```
69 |
70 | You will need a recent GCC/G++ version (>=4.7).
71 |
72 | More installation instructions can be found in the INSTALL file.
73 |
74 |
75 | ### Usage examples
76 | ```
77 | # Align all reads from a given FASTA/FASTQ file with default number of threads using semiglobal bit-vector alignment:
78 | ./graphmap -r escherichia_coli.fa -d reads.fastq -o alignments.sam
79 |
80 | # Align all reads from a given FASTA/FASTQ file using anchored alignment approach:
81 | ./graphmap -a anchor -r escherichia_coli.fa -d reads.fastq -o alignments.sam
82 |
83 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast):
84 | ./graphmap -w owler -r reads.fa -d reads.fa -o overlaps.mhap
85 |
86 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow):
87 | ./graphmap -w overlapper -r reads.fa -d reads.fa -o overlaps.sam
88 |
89 | # Align reads using the Gotoh for semiglobal alignment:
90 | ./graphmap -a gotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam
91 |
92 | # Align reads using Gotoh alignment with anchored approach:
93 | ./graphmap -a anchorgotoh -r escherichia_coli.fa -d reads.fastq -o alignments.sam
94 |
95 | # Process reads from a circular genome:
96 | ./graphmap -C -r escherichia_coli.fa -d reads.fastq -o alignments.sam
97 |
98 | # Threshold the E-value of alignments to 1e-100. Alignments with E-value > 1e-100 will be called unmapped:
99 | ./graphmap -z 1e-100 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
100 |
101 | # Output all similarly good alignments (to within F*num_kmers_of_best_alnmnt) instead of only one best:
102 | ./graphmap -Z -F 0.05 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
103 |
104 | # Limit the number of threads to 8, and load reads in batches of 50MB:
105 | ./graphmap -t 8 -B 50 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
106 |
107 | # Align reads using more sensitive parameters for Illumina data (currently equivalent to "-a gotoh"):
108 | ./graphmap -x illumina -r escherichia_coli.fa -d reads.fastq -o alignments.sam
109 |
110 | # Load all reads in one batch and align only the first 1000 reads:
111 | ./graphmap -B 0 -n 1000 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
112 |
113 | # Process all reads from a given folder.
114 | ./graphmap -r escherichia_coli.fa -D reads_folder -O alignments_folder
115 |
116 | # Generate only the index.
117 | ./graphmap -I -r escherichia_coli.fa
118 |
119 | # Run a debug version of GraphMap (built with "make debug") with verbose SAM output to see various info about the alignment:
120 | ./graphmap-debug -b 3 -r escherichia_coli.fa -d reads.fastq -o alignments.sam
121 |
122 | ```
123 |
124 | ### Contact information
125 |
126 | For additional information, help and bug reports please send an email to one of the following:
127 | ivan.sovic@irb.hr, mile.sikic@fer.hr, nagarajann@gis.a-star.edu.sg
128 |
129 | ### Acknowledgement
130 | This work was supported by the IMaGIN platform (project No. 102 101 0025), through a grant from the Science and Engineering Research Council, funding to the Genome Institute of Singapore from the Agency for Science, Technology and Research (A*STAR), Singapore, and funding from the Croatian Science Foundation (Project no. UIP-11-2013-7353 - Algorithms for Genome Sequence Analysis).
131 |
--------------------------------------------------------------------------------
/doc/changelog.md:
--------------------------------------------------------------------------------
1 | ## GraphMap - ChangeLog
2 |
3 | **__Version 0.5.0 -> 0.5.1__**
4 | Release date: 04 March 2017
5 | - Updated the gindex module for smaller memory consumption when building the index. Index construction is now a bit slower (single thread is used for collecting minimizers), but collection of minimizers is now performed on the fly. Previously, all seeds would be collected first, and then they would be pulled through a minimizer generation function. Now, each seed is pushed into the minimizer queue and if the queue yields a seed which is different than the previous one, it is emplaced on the list.
6 | The memory consumption is still large (similar to index in versions 0.4.x), which is due to 128-bit integer representation of all seeds (seed key, sequence ID and sequence position). This could be reduced further by careful redesign.
7 | The disk version of the index is fully compatible with version 0.5.0.
8 | The reduced memory consumption also directly impacts the Owler mode.
9 |
10 | **__Version 0.4.1 -> 0.5.0__**
11 | Release date: 28 February 2017
12 | - Re-implemented the index. Removed all other indexes that were previously implemented, and cleaned up the code to only use the new index (MinimizerIndex). MinimizerIndex is implemented in a separate repo added to the codebase. It also uses a hash table to store the seeds, however instead of the perfect hash as before, Google's DenseHash is used. Seeds are first compiled in a giant list (each sequence in its space, in parallel), and afterwards the list is sorted (also multithreaded). Basic statistics on seed key distribution are calculated (mean, median, standard deviation). The index also allows thresholding the amount of hits during lookup (keys with a count higher than a user-specified percentil are skipped) which is very significant for large, repetitive genomes. The index can also generate minimizers (also user specified). Index also allows for custom indexing shapes to be defined, and creates the lookup shapes automatically.
13 | - Changed the command line parameters to allow for new features, concretely:
14 | 1. Removed the parameter ```max-hits``` which is now obsolete.
15 | 2. Added parameter ```minimizer-window``` to specify the length of the minimizer window to choose minimizers from. If equal to 1, minimizers won't be used.
16 | 3. Added parameter ```freq-percentil``` to specify the percentile of key occurrences which will be kept. E.g. if 0.99, then 1% of most repetitive keys will be skipped. If 1.0, no filtering will be used.
17 | 4. Added parameter ```fly-index``` which will generate index on the fly and won't store it to disk. If the index already exists on disk, it will be loaded. To completely generate a new index on the fly, use ```--fly-index --rebuild-index```.
18 | 5. Renamed the parameter which was previously known as ```sensitive``` to ```double-index```.
19 | 6. Added a composite parameter called ```-x sensitive``` which will turn off minimizers and key frequency filtering.
20 |
21 | - Fixed an issue with RNA-seq transcriptome mapping, where recall would be lower than expected. There was a bug when checking if alignment is sane - the check would occur *after* the alignment was converted from transcriptome space to genome space, instead still on the transcriptome. This could not have caused false positives, but definitely caused many reads to be unmapped.
22 | - The reimplemented index now fixes the issue of segmentation fault on the human genome.
23 |
24 |
25 |
26 | **__Version 0.4.0 -> 0.4.1__**
27 | Release date: 28 January 2017
28 | - Fixed the SAM headers for transcriptome mapping. In the last version, the headers corresponded to the transcriptome headers, although the alignments are in the genome space.
29 |
30 | **__Version 0.3.2 -> 0.4.0__**
31 | Release date: 22 January 2017
32 | - GraphMap can now accept a GTF file for mapping to a transcriptome. Transcriptome is internally generated using the reference file and the GTF file, and index built from the transcriptome. Reads are then mapped to the transcriptome, and final alignments converted back to the genome coordinate space by introducing 'N' operations at splice sites.
33 | - Transcriptome mapping is only available in anchored alignment modes.
34 | - Updated Edlib to the newest version. Previous version had a bug in the traceback.
35 | - Recent changes in Edlib produced leading and trailing deletions in some cases. This is now handled by removing the deletions and shifting the alignment start position.
36 | - Fixed several (possible) memory leaks and invalid reads/writes. Generating the MD tag in SAM files had an invalid read which for some reason caused strange artifacts in CIGAR strings.
37 |
38 | **__Version 0.3.1 -> 0.3.2__**
39 | Release date: 19 December 2016
40 | - There were segfaults caused by recently-introduced bugs to Edlib. It has since been updated, and this version of GraphMap now includes the fixed version of Edlib.
41 | - There was a memory leak when generating clusters.
42 | - Minor fixes to some syntax.
43 |
44 | **__Version 0.3.0 -> 0.3.1__**
45 | Release date: 12 October 2016
46 | - Important: Fixed MD field issues
47 | - Minor bug fixes: composite command line parameter ```-x illumina``` depended on a parameter which wasn't defined properly, filtered empty SAM lines, etc.
48 |
49 | **__Version 0.22 -> 0.3.0__**
50 | Release date: 15 April 2016
51 | If you are using versions 0.3.x please update to the most recent commit. There were several important memory access issues which are now resolved.
52 | GraphMap's command line has changed significantly between version 0.3.x and 0.2x - although many options remain similar, the usage is incompatible with the older releases due to explicit tool specification.
53 | The first parameter is now mandatory, and specifies whether the **mapping/alignment** (```./graphmap align```) or **overlapping** (```./graphmap owler```) should be used.
54 | **Important change in command line parameters.** The new version is not completely compatible with the previous one. For this reason, the minor version number has changed.
55 | - Changed the version numbering from: ```x.yz``` to ```x.y.z```
56 | - Implemented a new argument parser.
57 | - Fixed a bug with overhanging base (Issue #14), commit: 41ae30b0d8603469c62794cba1960dc42f739d4e
58 | - Fixed the extensions of alignment to read ends when near an overhang (Issue #18).
59 | - Fixed Issue #19 - inconsistent behaviour for parameter ```-F```.
60 | - Cleaned up the code a bit.
61 | - Restructured the code. Majority of the code was extracted from the repository to be used as the codebase for this and other projects. GraphMap's main code is left in this repo, while the rest is linked via git submodules.
62 | - Added support for reading SAM and GFA files as the input sequences. Gzipped versions of all formats are supported as well. By default the format is chosen by the extension of the file (--infmt auto), but can be specified manually.
63 | - Added support for the M5 output format.
64 | - Added the MD field to the SAM output.
65 | - New and better anchor filtering (anchored modes only) using chaining of anchors that passed the LCSk.
66 | - New and better clustering of anchor stretches. This will be used for implementing RNA-seq alignment.
67 | - No need to precompile libraries for your system anymore. Libraries are now included in the source, or in the submodules. To initialize submodules, either clone recursively, or call ```make modules``` once GraphMap repo has been cloned.
68 | - Anchored alignment is now the default one.
69 |
70 | Important command line changes:
71 | - Long argument names are now provided.
72 | - Extended CIGAR format can now be used via commandline through the --extcigar parameter (unlike before, where the code needed to be recompiled).
73 | - By default, GraphMap now uses only one gapped spaced index (previously, two were used by default; one could have been used by specifying the parsimonious mode). The defaults now are the ex parsimonious mode. To use two indexes, specify the parameter: --sensitive
74 | - The ```-w owler``` and ```-w overlapper``` have been moved. The alignment/owler mode is chosen as the first parameter in the commandline now (a "subprogram"; e.g. run ```graphmap owler```). To use the ex ```-w overlapper```, specify ```-x overlap``` instead. This mode has now been simply converted to a composite parameter. There is also a command line parameter ```--overlapper``` which only controls the counting of hits in order to skip self-hits.
75 | - There is now a default E-value filter set at ```1e0```
76 | - There is now a default MAPQ filter set at ```1```
77 | - It is now possible to switch off extension of alignments to read ends (parameter: ```--no-end2end```).
78 | - If the index needs to be rebuilt, it can now be done using a single command line with parameter: ```--rebuild-index```
79 |
--------------------------------------------------------------------------------
/doc/img/anchors-normal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-normal.png
--------------------------------------------------------------------------------
/doc/img/anchors-rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/anchors-rna.png
--------------------------------------------------------------------------------
/doc/img/region_selection-rna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection-rna.png
--------------------------------------------------------------------------------
/doc/img/region_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/doc/img/region_selection.png
--------------------------------------------------------------------------------
/doc/rnaseq.md:
--------------------------------------------------------------------------------
1 | ## Mapping RNA-seq reads
2 |
3 | ### 1. Transcriptome mapping
4 | Since version 0.4.0, GraphMap has support for mapping reads to internally generated transcriptomes.
5 | These features are available on the ```master``` branch of the GraphMap repo.
6 | To use this feature, a ```GTF``` annotations file is needed alongside the reference ```FASTA```.
7 | The goal of this option is to simplify the process for end-users. The final alignments are also automatically transformed back to genomic coordinates, thus completely wrapping the entire process.
8 |
9 |
10 |
11 | To use this feature, simply specify ```--gtf annotations.gtf``` alongside the other command line parameters.
12 |
13 | **Acknowledgements**
14 | Mile Šikić (MS) and Niranjan Nagarajan (NN) proposed the implementation of this approach for RNA-seq mapping. Ivan Krpelnik (IK) implemented the initial version of the transcriptome generator, as well as the conversion utility to convert from transcriptome space back to genome space. IK worked under guidance from Krešimir Križanović (KK) and Ivan Sović (IS). The new methods were embedded in existing GraphMap source code by IS. KK was/is working on evaluation and benchmarking of our RNA-seq methods. Mile Šikić (MS) supervised the project.
15 |
16 | ### 2. Spliced alignments
17 | Support for spliced alignments in GraphMap is a work in progress and currently experimental.
18 | To activate this mode, specify ```-x rnaseq``` alongside the other command line parameters. This feature is available on the ```rna-alpha``` branch. Install and compile in the testing mode as follows:
19 | ```
20 | git clone https://github.com/isovic/graphmap
21 | cd graphmap
22 | git checkout rna-alpha
23 | make modules
24 | make -j 4 testing
25 | ```
26 |
27 | After this, run GraphMap using:
28 | ```
29 | bin/graphmap-not_release align -x rnaseq -r ref.fa -d reads.fastq -o out.sam
30 | ```
31 |
32 | ***Please be aware*** that this is currently a highly experimental feature. It is not production-ready. Implementation may vary significantly from version to version.
33 |
35 |
36 | Here is a short description of the approach we are taking.
37 |
38 | **2.1 Region selection**
39 | The GraphMap [paper](http://www.nature.com/ncomms/2016/160415/ncomms11307/full/ncomms11307.html) describes the region selection process (first step in GraphMap). In short, an array of bins is constructed, where each bin represents a consecutive, non-overlapping region of the reference, where each bin is of size ```read_length / 3```. For each seed of an analyzed read, all hits on the reference are looked-up. For each hit, ```+1``` is added to a bin corresponding to region where the hit falls into. (If a seed has multiple hits in the same region, only one is counted).
40 |
41 |
42 |
43 | Regions are then sorted in descending order of their counts and further processed one by one. Before a region is processed, it is first extended on both ends (by ```read_length```) so that the entire read may fit inside after the alignment.
44 | This approach was shown to be very well suited for detecting secondary alignments, as different regions which might contain similarly good alignments would be processed individually.
45 | In the default alignment mode, this approach can produce pretty sensitive alignments.
46 |
47 | Now consider mapping of RNA-seq reads. In this case, a read can actually be split into several distant regions across a chromosome. Should the same region selection strategy be applied, the bin counts would simply redistribute to different regions. This means that, should the exons have a few good seed hits, we could detect the correct regions and further process them to obtain the spliced alignments. Of course, noise hits will cause trouble (this will be addressed in continuation). For RNA-seq, regions are also sorted by their bin counts and further processed using the Graph Mapping and the LCSk steps.
48 |
49 |
50 |
51 | **2.2. Graph Mapping and LCSk**
52 | For each region, Graph Mapping is performed to obtain anchors (matches between the read and the region). Anchors are filtered using the LCSk method.
53 | These steps are the same as in normal DNA mapping case.
54 |
55 | However, here we add a method of **clustering anchors** after they have been filtered. Clusters are obtained using the classic chaining approach which joins anchors that are close enough, and are nearly on the same diagonal.
56 | Clusters then represent larger matching chunks between a read and a reference.
57 |
58 | For normal DNA mapping, one would ideally (in the absence of structural variants) expect to see one large cluster, such as shown below:
59 |
60 |
61 |
62 | However, in case of RNA-seq mapping (or in presence of structural variants) such a graph might look like something closer to the following figure:
63 |
64 |
65 |
66 | What's more, viewed in such a way, a cluster can actually represent an *exon*!
67 | Now, if we collect all clusters (some of them being possible repeats), we can use this information to create our spliced alignments!
68 |
69 | But, hold on. Since some exons can be separated by a large gap on the reference (much larger than the read), we need to consider other regions simultaneously.
70 |
71 | For this reason, all clusters (represented with their start and end coordinates in both the reference and the read) for all analyzed regions are first collected in a single list.
72 |
73 | Then, the **knapsack** algorithm is applied on the list of clusters.
74 |
75 | **2.3 Knapsack algorithm**
76 | [Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem) is a problem of combinatorial optimization. Given a set of items, each with a weight and a value, the problem it tries to solve is "how to fill a knapsack with items so that the total weight is less than or equal to the given limit, and the value is as large as possible".
77 |
78 | In our case:
79 | - Knapsack is a *read*
80 | - Weight limit is the read length
81 | - An item is a cluster
82 | - Item weight is the length of the cluster in the read coordinate space
83 |
84 | Now, solving the knapsack problem would result in a list of clusters which fill the read the most. Alignment is then performed only on those clusters, and reported as separate SAM lines - one for each cluster (exon).
85 |
86 | **Acknowledgements**
87 | Ivan Sović (IS) proposed this solution for mapping of RNA-seq reads based on the knapsack algorithm. The initial version of the knapsack algorithm was implemented by Antonio Jurić (AJ). It was embedded in existing GraphMap source code by IS. Krešimir Križanović (KK) was/is working on evaluation and benchmarking of our RNA-seq methods and helped guide AJ. Mile Šikić (MS) supervised the project.
88 |
--------------------------------------------------------------------------------
/doc/sam_output.md:
--------------------------------------------------------------------------------
1 | ### Details on the SAM output generated by GraphMap
2 |
3 | Description of special tags in the SAM output:
4 | - **ZE** - The E-value. More accurately - a pessimistic approximation of the E-value obtained by rescoring the generated alignment with scores/penalties for which pre-calculated Gumbel parameters exist. Concretely, scores/penalties are: ```match = 5, mismatch = -4, gap_open = -8, gap_extend = -6```. By default, there is no threshold on the E-value so even weak homologies would be reported, but there is a parameter which provides this functionality (```-z```), e.g.: ```-z 1e0```.
5 | - **ZF** - An internal parameter for quality of alignment calculated using equation (8) in our preprint: (http://biorxiv.org/content/early/2015/06/10/020719). In GraphMap, potential regions for a read are sorted by this parameter, and the primary alignment is the one with the largest ZF value. ZF values for different reads are not mutually comparable.
6 | - **ZQ** - Query (read) length.
7 | - **ZR** - Reference length.
8 | - **H0** - Specified by SAM format as the "number of perfect hits", GraphMap reports here the number of possible mapping positions with the same number of kmer hits.
9 | - **NM** - Edit distance, specified by the SAM format.
10 | - **AS** - Alignment score, specified by the SAM format.
11 |
12 | There are two hidden gems in GraphMap's output, providing more detailed reporting of the alignment process. Compiling GraphMap with ```make testing``` will generate a binary file on path ```bin/graphmap-not_release```. Running this version using parameter ```-b 3``` will generate a more verbose version of the SAM output file:
13 | - **X3** - A string containing very verbose information about the alignment of a particular read.
14 | - **X4** - Measurement of the CPU time spent on major parts of the algorithm, in a human-readable text format.
15 |
--------------------------------------------------------------------------------
/overlap.md:
--------------------------------------------------------------------------------
1 | ## GraphMap Owler - Overlap With Long Erroneous Reads
2 | GraphMap implements two overlap modes:
3 | - ```./graphmap owler``` - fast, uses a trimmed GraphMap pipeline, reports output in MHAP or PAF formats, and
4 | - ```./graphmap align -x overlap``` - full GraphMap pipeline including alignment, output in SAM format.
5 |
6 | Owler mode (Overlap With Long Erroneous Reads) skips the graph-mapping and alignment steps. The full pipeline consists of the following steps:
7 | 1. Construct a gapped spaced index of the reads for only one shape (6-mers, "1111110111111").
8 | 2. For a read, collect all gapped spaced seed hits.
9 | 3. LCSk++.
10 | 4. Filtering seeds reported by LCSk++.
11 | 5. Output overlaps in MHAP-like or PAF format. For details, see below.
12 |
13 | Currently, no seed hits are discarded, which can make overlapping slow on larger or more repetitive datasets, but very sensitive.
14 |
15 | Note that the overlappers are still experimental, and require thorough testing.
16 |
17 | ### Output formats
18 | **MHAP** format is described here: [http://mhap.readthedocs.org/en/latest/quickstart.html#output](http://mhap.readthedocs.org/en/latest/quickstart.html#output).
19 | GraphMap's output uses the same columns, but the meaning of columns 3 and 4 (```Jaccard score``` and ```# shared min-mers``` respectively) is different in our context.
20 | Instead of ```Jaccard score``` the fraction of bases covered by seeds is reported.
21 | Instead of ```# shared min-mers``` the number of seeds which survived filtering is reported.
22 |
23 | GraphMap can also output overlaps to **PAF** format. Specification of the format can be found here: [https://github.com/lh3/miniasm/blob/master/PAF.md](https://github.com/lh3/miniasm/blob/master/PAF.md).
24 |
25 | ### Comparison to other methods
26 | We are working on scripts to benchmark various overlapping tools on simulated and real (later) data.
27 | An initial functioning version can be found here: [https://github.com/isovic/overlap-benchmark](https://github.com/isovic/overlap-benchmark).
28 |
29 | ### Examples
30 | ```
31 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in MHAP format (fast):
32 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.mhap
33 |
34 | # Overlap all reads from a given FASTA/FASTQ file and report overlaps in PAF format:
35 | ./graphmap owler -r reads.fa -d reads.fa -o overlaps.paf -L paf
36 |
37 | # Overlap all reads from a given FASTA/FASTQ in a full GraphMap mode with generating alignments (slow):
38 | ./graphmap align -x overlap -r reads.fa -d reads.fa -o overlaps.sam
39 | ```
40 |
--------------------------------------------------------------------------------
/reproducibility/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lbcb-sci/graphmap2/11815edabdfa21533c26218754d09a8fceda6704/reproducibility/README.md
--------------------------------------------------------------------------------
/reproducibility/run.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import os;
4 | import sys;
5 | import subprocess;
6 |
def execute_command(command):
    """Echo `command` to stderr, then run it through the system shell.

    The command's exit status is not inspected and nothing is returned.
    """
    announcement = 'Executing command: %s\n' % (command)
    sys.stderr.write(announcement)
    subprocess.call(command, shell=True)
10 |
def run_simulations():
    """Run the simulated-data pipeline: alignment, evaluation, then result collection.

    Progress messages go to stderr. Each stage is launched through
    execute_command(), so a failing stage does not stop the following ones.
    """
    log = sys.stderr.write

    # Stage 1: alignment.
    log('Starting the alignment process on simulated data.\n')
    log('Note that this might take a very long time.\n')
    log('E.g. BLAST took 110670 CPU secs in our tests on hg19_chr3 Oxford Nanopore 2d simulated dataset.\n')
    execute_command('aligneval/run-alignment.py')
    log('\n')

    log('Alignment script returned.\n')
    log('\n')

    # Stage 2: evaluation of the produced alignments.
    log('Running the evaluation script.\n')
    execute_command('aligneval/run-evaluation.py')
    log('\n')

    # Stage 3: collect the CSV reports next to this script.
    log('Copying the results to reproducibility/results-simulated folder.\n')
    execute_command('cp aligneval/results/*.csv results-simulated')

    log('Done!\n')
    log('\n')
30 |
def main():
    """Command-line entry point: check the environment and dispatch the subcommand.

    Exit status:
      1 - the 'samscripts' or 'aligneval' dependency folder is missing, or
          the subcommand is not recognised;
      0 - usage was printed (no subcommand, or 'sim' given extra arguments),
          or the 'sim' subcommand ran to completion.
    """
    dependencies_present = os.path.exists('samscripts') and os.path.exists('aligneval')
    if not dependencies_present:
        sys.stderr.write('Please run setup.py first, to install all dependencies. Exiting.\n')
        sys.exit(1)

    if len(sys.argv) < 2:
        sys.stderr.write('Run the alignment and evaluation processes from the GraphMap preprint paper.\n')
        sys.stderr.write('Usage:\n')
        sys.stderr.write('\tsim - Runs alignment on all simulation datasets. This might take quite a while to execute.\n')
        sys.exit(0)

    if sys.argv[1] == 'sim':
        # 'sim' takes no further arguments; anything extra prints its usage.
        if len(sys.argv) != 2:
            sys.stderr.write('Runs alignment on all simulation datasets. This might take quite a while to execute.\n')
            sys.stderr.write('Usage:\n')
            sys.stderr.write('\t%s %s\n' % (sys.argv[0], sys.argv[1]))
            sys.exit(0)

        run_simulations()
        sys.exit(0)

    else:
        sys.stderr.write('ERROR: Unknown subcommand!\n')
        # An error must be reported with a non-zero status so that calling
        # scripts can detect the failed invocation.
        sys.exit(1)
55 |
56 | if __name__ == "__main__":
57 | main();
58 |
--------------------------------------------------------------------------------
/reproducibility/setup.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import os;
4 | import sys;
5 | import subprocess;
6 |
def execute_command(command):
    """Report `command` on stderr and then execute it via the shell.

    The return code of the command is ignored; the function returns None.
    """
    sys.stderr.write('Executing command: %s\n' % (command))
    subprocess.call(command, shell=True)
10 |
def main():
    """Fetch the helper repositories and create the working folders.

    'samscripts' and 'aligneval' are cloned only when the folders are not
    already present; 'aligneval' is additionally set up right after cloning.
    Data/result folders are created if missing.
    """
    if not os.path.exists('samscripts'):
        execute_command('git clone https://github.com/isovic/samscripts.git')

    if not os.path.exists('aligneval'):
        execute_command('git clone https://github.com/isovic/aligneval.git')
        execute_command('cd aligneval; ./setup.py all')

    for folder in ['data/reads', 'data/reference', 'results-simulated', 'results-real']:
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
23 |
24 | if __name__ == "__main__":
25 | main();
26 |
--------------------------------------------------------------------------------
/src/aligner/aligner_base.h:
--------------------------------------------------------------------------------
1 | /*
2 | * aligner_base.h
3 | *
4 | * Created on: Jan 7, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_ALIGNER_ALIGNER_BASE_H_
9 | #define SRC_ALIGNER_ALIGNER_BASE_H_
10 |
11 | #include
12 | #include
13 | #include "aligner_containers.h"
14 | #include "pairwise_penalties.h"
15 |
16 | namespace is {
17 |
18 | class AlignerBase {
19 | public:
20 | virtual ~AlignerBase() { }
21 |
22 | // virtual AlignmentReturnValue Align(const char* q, int64_t qlen, const char* t, int64_t tlen, AlignmentType type) = 0; // Selects the alignment mode based on a parameter.
23 |
24 | virtual AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type) = 0; // Global alignment mode.
25 |
26 | virtual AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0; // Local alignment mode.
27 |
28 | virtual AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) = 0; // Semiglobal alignment mode.
29 |
30 | virtual AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, // Extend alignment mode. Does not necessarily
31 | int32_t bandwidth, int32_t zdrop) = 0; // produce CIGAR,but generate max alignment coords
32 |
33 | virtual std::shared_ptr getResults() = 0;
34 |
35 | };
36 |
37 | } /* namespace is */
38 |
39 | #endif /* SRC_ALIGNER_ALIGNER_BASE_H_ */
40 |
--------------------------------------------------------------------------------
/src/aligner/aligner_containers.h:
--------------------------------------------------------------------------------
1 | /*
2 | * aligner_containers.h
3 | *
4 | * Created on: Jan 7, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_CONTAINERS_H_
9 | #define SRC_CONTAINERS_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 |
16 | #include "sam_parser.h"
17 |
18 | namespace is {
19 |
// Sentinel used as the "no score yet" value: the smallest int64_t plus a margin
// of 10000 (the margin presumably leaves headroom for subtracting penalties).
static constexpr int64_t LARGE_NEGATIVE_INT64 = std::numeric_limits<int64_t>::min() + 10000;
21 |
// Status codes returned by the aligners (strongly typed enum, C++11).
enum class AlignmentReturnValue {
  // Everything went ok.
  OK,
  // Alignment stepped out of the defined band; the result is not optimal.
  Suboptimal,
  // Parameters or their values are invalid.
  InvalidOptions,
  QlenIsZero,
  TlenIsZero,
  WrongEditDist,
  // Default value for an alignment which wasn't performed.
  AlignmentNotPerformed,
  // For features in development.
  NotImplementedYet
};
32 |
// Kind of alignment to perform (strongly typed enum, C++11).
enum class AlignmentType {
  Global,
  Local
};
37 |
// Start and end coordinates of an alignment on the query (q) and on the target (t).
// End positions are inclusive, i.e. they hold the position of the last base.
class AlignmentPosition {
 public:
  // All coordinates start at zero.
  AlignmentPosition() = default;

  AlignmentPosition(int64_t _qstart, int64_t _qend, int64_t _tstart, int64_t _tend)
      : qstart(_qstart), qend(_qend), tstart(_tstart), tend(_tend) { }

  // Copy construction and assignment are plain member-wise copies, so the
  // compiler-generated versions are used instead of hand-written ones.

  int64_t qstart = 0, qend = 0;     // Query alignment start and (inclusive) end.
  int64_t tstart = 0, tend = 0;     // Target alignment start and (inclusive) end.
};
56 |
57 | class AlignmentResult {
58 | public:
59 | AlignmentResult() : score(0), edit_dist(0), position(),
60 | max_score(LARGE_NEGATIVE_INT64),
61 | max_q_pos(-1),
62 | max_t_pos(-1), k(-1), rv(AlignmentReturnValue::AlignmentNotPerformed) {
63 | }
64 |
65 | AlignmentResult(const AlignmentResult& op) :
66 | score(op.score), edit_dist(op.edit_dist),
67 | position(op.position), cigar(op.cigar),
68 | max_score(op.max_score), max_q_pos(op.max_q_pos),
69 | max_t_pos(op.max_t_pos),
70 | k(op.k), rv(op.rv) { // Copy constructor.
71 | }
72 |
73 | ~AlignmentResult() { };
74 |
75 | AlignmentResult& operator=(const AlignmentResult& op) {
76 | score = op.score;
77 | edit_dist = op.edit_dist;
78 | position = op.position;
79 | cigar = op.cigar;
80 | max_score = op.max_score;
81 | max_q_pos = op.max_q_pos;
82 | max_t_pos = op.max_t_pos;
83 | k = op.k;
84 | rv = op.rv;
85 | return *this;
86 | }
87 |
88 | // Alignment results.
89 | int64_t score;
90 | int64_t edit_dist;
91 | is::AlignmentPosition position; // There can be multiple alignments with the same score.
92 | // Only the first position and the corresponding alignment
93 | std::vector cigar; // are reported
94 | int64_t max_score, max_q_pos, max_t_pos; // Maximum score in the alignment, and the coordinates on query and target.
95 | int64_t k; // Value of band k used in the final alignment.
96 | AlignmentReturnValue rv; // Return value of the aligner.
97 | };
98 |
// Selects which edges of the alignment matrix are penalized in global alignment;
// a margin set to true means the corresponding edge is penalized.
//  - top / left: when true, the first row / column is initialized to multiples
//    of the gap extend penalty.
//  - bottom: when false, the maximum of the last row is taken instead of the
//    bottom right corner.
//  - right: when false, the maximum of the last column is taken instead of the
//    bottom right corner.
class GlobalMargins {
 public:
  // By default all four margins are penalized.
  GlobalMargins() : GlobalMargins(true, true, true, true) { }

  GlobalMargins(bool _top, bool _left, bool _bottom, bool _right)
      : top(_top), left(_left), bottom(_bottom), right(_right) { }

  bool top, left, bottom, right;
};
122 |
123 | class AlignmentOptions {
124 | public:
125 | AlignmentOptions() : k(-1),
126 | do_traceback(true) {
127 | }
128 |
129 | int32_t k; // Band for banded alignment. If < 0, banded alignment is turned off.
130 | bool do_traceback; // If traceback is not needed, then there is no need to alocate a large
131 | // matrix to store directions.
132 | GlobalMargins gm;
133 | };
134 |
135 | } /* namespace is */
136 |
137 |
138 |
139 | #endif /* SRC_CONTAINERS_H_ */
140 |
--------------------------------------------------------------------------------
/src/aligner/aligner_ksw2.cc:
--------------------------------------------------------------------------------
1 | #include "aligner_ksw2.h"
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | // #include "ksw2/kseq.h"
8 | #include "aligner_util.hpp"
9 | #include
10 |
11 | // KSEQ_INIT(gzFile, gzread)
12 |
13 | namespace is {
14 |
// Byte -> nucleotide code lookup. Raw values 0..3 map to themselves,
// 'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3; every other byte -> 4.
uint8_t seq_nt4_table[256] = {
  /* 0x00 */ 0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0x10 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0x20 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0x30 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0x40 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,   // 'A', 'C', 'G'
  /* 0x50 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,   // 'T'
  /* 0x60 */ 4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,   // 'a', 'c', 'g'
  /* 0x70 */ 4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,   // 't'
  /* 0x80 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0x90 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xA0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xB0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xC0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xD0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xE0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /* 0xF0 */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
};
33 |
34 | std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) {
35 | return std::shared_ptr(new AlignerKSW2(p, opt));
36 | }
37 |
38 | static void print_aln(const char *tname, const char *qname, ksw_extz_t *ez)
39 | {
40 | printf("%s\t%s\t%d", tname, qname, ez->score);
41 | printf("\t%d\t%d\t%d", ez->max, ez->max_t, ez->max_q);
42 | if (ez->n_cigar > 0) {
43 | int i;
44 | putchar('\t');
45 | for (i = 0; i < ez->n_cigar; ++i)
46 | printf("%d%c", ez->cigar[i]>>4, "MID"[ez->cigar[i]&0xf]);
47 | }
48 | putchar('\n');
49 | }
50 |
51 |
52 | AlignerKSW2::AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt) : p_(p), opt_(opt), result_(nullptr) {
53 |
54 | }
55 |
56 | AlignerKSW2::~AlignerKSW2() {
57 |
58 | }
59 | // Global alignment of qseq (qlen bytes) against tseq (tlen bytes) through KSW2GlobalAlnWrapper_. The outcome is stored in result_ (fetch it with getResults()); the function itself always returns OK.
60 | is::AlignmentReturnValue AlignerKSW2::Global(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, bool type) {
61 | void *km = 0; // Allocator handle handed to KSW2; stays null unless HAVE_KALLOC is defined.
62 | ksw_extz_t ez; // Alignment result.
63 | int w = -1, flag = 0, zdrop = -1; // NOTE(review): negative w/zdrop presumably disable banding and z-drop in KSW2 - confirm against ksw2.h.
64 |
65 | #ifdef HAVE_KALLOC
66 | km = km_init();
67 | #endif
68 |
69 | memset(&ez, 0, sizeof(ksw_extz_t));
70 | // Scoring matrix for a 5-symbol alphabet; p_.match and p_.mismatch are floats and are truncated to int8_t by the casts.
71 | auto mat = GenerateSimpleMatchMatrix((int8_t) p_.match, (int8_t) p_.mismatch, 5);
72 | // In GraphMap definition, penalties are negative. KSW2 expects positive values.
73 | int8_t q = -p_.w[0].v; // Gap open. The intercept component of the affine function.
74 | int8_t e = -p_.w[0].u; // Gap extend. The slope of the affine function.
75 | int8_t q2 = -p_.w[1].v; // NOTE(review): w[1] is read unconditionally, but a default-constructed PiecewisePenalties holds a single piece - p_ must carry at least two.
76 | int8_t e2 = -p_.w[1].u;
77 | // `type` selects the KSW2 kernel inside the wrapper: true -> ksw_extd2_sse, false -> ksw_exts2_sse.
78 | KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, w, zdrop, flag, &ez, type);
79 |
80 | // print_aln("Query", "Target", &ez);
81 |
82 | result_ = std::shared_ptr(new is::AlignmentResult);
83 | result_->score = ez.score;
84 | result_->position = is::AlignmentPosition(0, qlen, 0, tlen); // Reported as covering the whole query and the whole target.
85 | result_->k = -1;
86 | result_->rv = is::AlignmentReturnValue::OK;
87 |
88 | result_->cigar.clear();
89 | std::vector basic_cigar;
90 | for (size_t i=0; i>4)); // NOTE(review): this line is truncated in the listing (gutter line 91 is missing); presumably it copies ez.cigar into basic_cigar - restore from the original file.
92 | }
93 | result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar);
94 | // The edit distance is derived from the converted (extended) CIGAR.
95 | result_->edit_dist = EditDistFromExtCIGAR(result_->cigar);
96 |
97 | // printf ("Converted CIGAR:\n");
98 | // for (size_t i=0; icigar.size(); i++) {
99 | // printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op);
100 | // }
101 | // printf ("\n");
102 | // printf ("Edit distance: %ld\n", result_->edit_dist);
103 | // Release the CIGAR buffer filled in by KSW2, then the allocator itself when kalloc is in use.
104 | kfree(km, ez.cigar);
105 | #ifdef HAVE_KALLOC
106 | km_destroy(km);
107 | #endif
108 | // No failure path exists in this function.
109 | return is::AlignmentReturnValue::OK;
110 | }
111 | // Extension alignment of qseq against tseq with the given bandwidth and zdrop. Stores score, max score and its query/target positions in result_; returns InvalidOptions for null or empty inputs, OK otherwise.
112 | is::AlignmentReturnValue AlignerKSW2::Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop) {
113 | result_ = std::shared_ptr(new is::AlignmentResult); // Reset before validation, so result_ is a fresh object even when InvalidOptions is returned.
114 |
115 | if (qseq == NULL || tseq == NULL || qlen <= 0 || tlen <= 0) {
116 | return is::AlignmentReturnValue::InvalidOptions;
117 | }
118 |
119 | void *km = 0; // Allocator handle handed to KSW2; stays null unless HAVE_KALLOC is defined.
120 | ksw_extz_t ez; // Alignment result.
121 | int flag = KSW_EZ_SCORE_ONLY | KSW_EZ_EXTZ_ONLY; // NOTE(review): going by the flag names no CIGAR is produced here, so result_->cigar presumably ends up empty - confirm against ksw2.h.
122 |
123 | #ifdef HAVE_KALLOC
124 | km = km_init();
125 | #endif
126 |
127 | memset(&ez, 0, sizeof(ksw_extz_t));
128 | // Scoring matrix for a 5-symbol alphabet; p_.match and p_.mismatch are floats and are truncated to int8_t by the casts.
129 | auto mat = GenerateSimpleMatchMatrix((int8_t) p_.match, (int8_t) p_.mismatch, 5);
130 | // In GraphMap definition, penalties are negative. KSW2 expects positive values for affine pieces.
131 | int8_t q = -p_.w[0].v; // Gap open. The intercept component of the affine function.
132 | int8_t e = -p_.w[0].u; // Gap extend. The slope of the affine function.
133 | int8_t q2 = -p_.w[1].v; // NOTE(review): w[1] is read unconditionally, but a default-constructed PiecewisePenalties holds a single piece - p_ must carry at least two.
134 | int8_t e2 = -p_.w[1].u;
135 | // The last argument is fixed to true, so the wrapper always uses ksw_extd2_sse here.
136 | KSW2GlobalAlnWrapper_(km, (const int8_t*) qseq, qlen, (const int8_t*) tseq, tlen, 5, &mat[0], q, e, q2, e2, bandwidth, zdrop, flag, &ez, true);
137 |
138 | // print_aln("Query", "Target", &ez);
139 |
140 | result_->score = ez.score;
141 | result_->position = is::AlignmentPosition(0, qlen, 0, tlen); // Reported as covering the whole query and the whole target.
142 | result_->k = -1;
143 | result_->rv = is::AlignmentReturnValue::OK;
144 | result_->max_score = ez.max; // Best score and where it was reached on query/target, as reported by KSW2.
145 | result_->max_q_pos = ez.max_q;
146 | result_->max_t_pos = ez.max_t;
147 |
148 | result_->cigar.clear();
149 | std::vector basic_cigar;
150 | for (size_t i=0; i>4)); // NOTE(review): this line is truncated in the listing (gutter line 151 is missing); presumably it copies ez.cigar into basic_cigar - restore from the original file.
152 | }
153 | result_->cigar = is::ConvertBasicToExtCIGAR(qseq, qlen, tseq, tlen, basic_cigar);
154 | // The edit distance is derived from the converted (extended) CIGAR.
155 | result_->edit_dist = EditDistFromExtCIGAR(result_->cigar);
156 |
157 | // printf ("Converted CIGAR:\n");
158 | // for (size_t i=0; icigar.size(); i++) {
159 | // printf ("%d%c", result_->cigar[i].count, result_->cigar[i].op);
160 | // }
161 | // printf ("\n");
162 | // printf ("Edit distance: %ld\n", result_->edit_dist);
163 | // Release the CIGAR buffer filled in by KSW2, then the allocator itself when kalloc is in use.
164 | kfree(km, ez.cigar);
165 | #ifdef HAVE_KALLOC
166 | km_destroy(km);
167 | #endif
168 |
169 | return is::AlignmentReturnValue::OK;
170 | }
171 | // Local alignment is not available in this aligner: the arguments are ignored, result_ is left untouched and NotImplementedYet is returned.
172 | is::AlignmentReturnValue AlignerKSW2::Local(const char* q, int64_t qlen, const char* t, int64_t tlen) {
173 | return is::AlignmentReturnValue::NotImplementedYet;
174 | }
175 | // Semiglobal alignment is not available in this aligner: the arguments are ignored, result_ is left untouched and NotImplementedYet is returned.
176 | is::AlignmentReturnValue AlignerKSW2::Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen) {
177 | return is::AlignmentReturnValue::NotImplementedYet;
178 | }
179 | // Returns the result of the most recent Global()/Extend() call; null until one of them has run (the constructor initialises result_ to nullptr).
180 | std::shared_ptr AlignerKSW2::getResults() {
181 | return result_;
182 | }
183 | // Resets *ez, converts both sequences through seq_nt4_table and runs one of two KSW2 kernels: ksw_extd2_sse when `type` is true, ksw_exts2_sse otherwise. qlen/tlen are plain int here although the callers hold int64_t lengths.
184 | void AlignerKSW2::KSW2GlobalAlnWrapper_(void *km,
185 | const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen,
186 | int8_t m, const int8_t *mat,
187 | int8_t q, int8_t e, int8_t q2, int8_t e2,
188 | int w, int zdrop, int flag, ksw_extz_t *ez, bool type) {
189 | int i; // Unused in the active code below.
190 | ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1;
191 | ez->max = 0, ez->mqe = ez->mte = KSW_NEG_INF;
192 | ez->n_cigar = 0;
193 | // Translate raw sequence bytes into the numeric alphabet used by the scoring matrix.
194 | auto qseq = ConvertSeqAlphabet(qseq_, qlen, &seq_nt4_table[0]);
195 | auto tseq = ConvertSeqAlphabet(tseq_, tlen, &seq_nt4_table[0]);
196 |
197 | if (type) {
198 | ksw_extd2_sse(km, qlen, (const uint8_t*) &qseq[0],
199 | tlen, (const uint8_t*) &tseq[0],
200 | m, mat, q, e, q2, e2, w, zdrop, flag, ez);
201 | } else {
202 | int noncan = 9; // NOTE(review): presumably the penalty for non-canonical splice sites - confirm against ksw2.h.
203 | q = 4; // The caller-supplied q, e, q2, zdrop and flag are overwritten with fixed values in this branch; e2 and w are not passed to ksw_exts2_sse at all.
204 | e = 2;
205 | q2 = 32;
206 | zdrop = 200;
207 | flag = 1600; // NOTE(review): magic bit mask (0x640); spell it out with the KSW_EZ_* constants once their values are confirmed.
208 |
209 | ksw_exts2_sse(km, qlen, (const uint8_t*) &qseq[0],
210 | tlen, (const uint8_t*) &tseq[0],
211 | m, mat, q, e, q2, noncan, zdrop, flag, ez);
212 | }
213 |
214 | // const char *algo = "extd2_sse";
215 | // if (strcmp(algo, "extz2_sse") == 0) ksw_extz2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, w, zdrop, flag, ez);
216 | // else if (strcmp(algo, "extd2_sse") == 0) ksw_extd2_sse(km, qlen, (const uint8_t*)&qseq[0], tlen, (const uint8_t*)&tseq[0], m, mat, q, e, q2, e2, w, zdrop, flag, ez);
217 | // // else if (strcmp(algo, "extf2_sse") == 0) ksw_extf2_sse(km, qlen, (uint8_t*)qseq, tlen, (uint8_t*)tseq, mat[0], mat[1], e, w, zdrop, ez);
218 | // else {
219 | // fprintf(stderr, "ERROR: can't find algorithm '%s'\n", algo);
220 | // exit(1);
221 | // }
222 | }
223 |
224 | }
225 |
--------------------------------------------------------------------------------
/src/aligner/aligner_ksw2.h:
--------------------------------------------------------------------------------
1 | /*
2 | * aligner_ksw2.h
3 | *
4 | * Created on: Jan 7, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_ALIGNER_ALIGNER_KSW2_H_
9 | #define SRC_ALIGNER_ALIGNER_KSW2_H_
10 |
11 | #include
12 | #include
13 | #include "aligner_base.h"
14 | #include "aligner_containers.h"
15 | #include "pairwise_penalties.h"
16 | #include "aligner_util.hpp"
17 | #include "ksw2/ksw2.h"
18 |
19 | namespace is {
20 |
21 | class AlignerKSW2;
22 |
23 | std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt);
24 | // KSW2-backed implementation of AlignerBase. Instances are created through createAlignerKSW2(); results of Global()/Extend() are kept in result_ and read back with getResults().
25 | class AlignerKSW2 : public AlignerBase {
26 | public:
27 | friend std::shared_ptr createAlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt);
28 |
29 | ~AlignerKSW2();
30 |
31 | AlignmentReturnValue Global(const char* q, int64_t qlen, const char* t, int64_t tlen, bool type); // Global alignment mode.
32 |
33 | AlignmentReturnValue Local(const char* q, int64_t qlen, const char* t, int64_t tlen); // Local alignment mode.
34 |
35 | AlignmentReturnValue Semiglobal(const char* q, int64_t qlen, const char* t, int64_t tlen); // Semiglobal alignment mode.
36 |
37 | AlignmentReturnValue Extend(const char* qseq, int64_t qlen, const char* tseq, int64_t tlen, int32_t bandwidth, int32_t zdrop); // Extension alignment with an explicit bandwidth and z-drop.
38 |
39 | std::shared_ptr getResults(); // Result of the most recent alignment call.
40 |
41 | protected:
42 | AlignerKSW2(const is::PiecewisePenalties &p, const is::AlignmentOptions &opt); // We don't want users attempting to instantiate manually, even though the class is virtual.
43 |
44 | private:
45 | AlignerKSW2(const AlignerKSW2&) = delete; // No copying.
46 | AlignerKSW2& operator=(const AlignerKSW2&) = delete; // No copying.
47 | AlignerKSW2(AlignerKSW2&&) = delete; // No move constructor.
48 | AlignerKSW2& operator=(const AlignerKSW2&&) = delete; // No move assignment. NOTE(review): the parameter is a const rvalue reference, which is not the usual move-assignment signature (AlignerKSW2&&).
49 | // Shared driver for Global() and Extend(): resets *ez and dispatches to a KSW2 kernel chosen by `type`.
50 | void KSW2GlobalAlnWrapper_(void *km,
51 | const int8_t *qseq_, int qlen, const int8_t *tseq_, int tlen,
52 | int8_t m, const int8_t *mat,
53 | int8_t q, int8_t e, int8_t q2, int8_t e2,
54 | int w, int zdrop, int flag, ksw_extz_t *ez, bool type);
55 |
56 | const is::PiecewisePenalties& p_; // Held by reference: the object passed to the constructor must outlive this aligner.
57 | const is::AlignmentOptions& opt_; // Held by reference as well.
58 | std::shared_ptr result_;
59 | };
60 |
61 | } /* namespace is */
62 |
63 | #endif /* SRC_ALIGNER_ALIGNER_KSW2_H_ */
64 |
--------------------------------------------------------------------------------
/src/aligner/aligner_util.cc:
--------------------------------------------------------------------------------
1 | #include "aligner_util.hpp"
2 | #include "assert.h"
3 |
4 | #include
5 | #include
6 |
7 | namespace is {
8 |
9 | std::vector ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table) {
10 | std::vector ret(seqlen + 33); // 32 for gaba
11 | for (size_t i=0; i ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen,
18 | const char* tseq, int64_t tlen,
19 | const std::vector& basic_cigar) {
20 | std::vector ret;
21 |
22 | int64_t qpos = 0, tpos = 0;
23 | for (size_t i=0; i 0) {
53 | ret.push_back(is::CigarOp(prev_m, curr_count));
54 | }
55 | }
56 | }
57 |
58 | return ret;
59 | }
60 |
61 | int64_t EditDistFromExtCIGAR(const std::vector& extended_cigar) {
62 | int64_t edit_dist = 0;
63 | for (size_t i=0; i ExtractCigarBetweenQueryCoords(const std::vector& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q) { // NOTE(review): the listing is truncated here (gutter lines 64-73 are missing): the rest of EditDistFromExtCIGAR and the start of ExtractCigarBetweenQueryCoords are fused on this line.
74 | std::vector ret;
75 | // ExtractCigarBetweenQueryCoords: walks `cigar` while tracking the query position and collects the (possibly trimmed) operations that overlap the query interval qstart..qend.
76 | int64_t qpos = 0;
77 |
78 | int lengthOfRef = 0; // NOTE(review): accumulated as int but written to int64_t outputs below.
79 | int lengthOfRead = 0;
80 |
81 | for (auto& c: cigar) {
82 | // Only M, =, X, I and S advance the query position.
83 | int64_t qpos_next = (c.op == 'M' || c.op == '=' || c.op == 'X' || c.op == 'I' || c.op == 'S') ? (qpos + c.count) : qpos;
84 |
85 | if (qpos > qend) { break; } // Past the requested interval: stop.
86 |
87 | if (qpos_next < qstart) { // Operation ends before the interval starts: skip it.
88 | qpos = qpos_next;
89 | continue;
90 | }
91 |
92 | int64_t b = 0, e = c.count; // Portion [b, e) of this operation to keep; defaults to the whole operation.
93 |
94 | if (qstart >= qpos && qstart < qpos_next) { b = qstart - qpos; } // Trim the front when the interval starts inside this operation.
95 | if (qend >= qpos && qend < qpos_next) { e = qend - qpos; } // Trim the back when the interval ends inside this operation.
96 |
97 | if ((e - b) > 0) {
98 | ret.emplace_back(is::CigarOp(c.op, (e - b)));
99 |
100 | if (c.op != 'I') { // NOTE(review): everything except 'I' counts toward the reference length, including 'S' and 'H' - confirm this is intended.
101 | lengthOfRef += (e - b);
102 | }
103 | if(c.op != 'D' && c.op != 'N') { // Everything except 'D' and 'N' counts toward the read length.
104 | lengthOfRead += (e - b);
105 | }
106 | }
107 |
108 | qpos = qpos_next;
109 | }
110 | // Both output pointers are dereferenced unconditionally; callers must pass valid pointers.
111 | *cigar_length = lengthOfRef;
112 | *cigar_length_q = lengthOfRead;
113 |
114 | return ret;
115 | }
116 |
117 | std::string CigarToString(const std::vector& cigar) {
118 | std::stringstream ss;
119 | for (size_t i=0; i
5 | #include
6 | #include
7 | #include "sam_parser.h"
8 |
9 | namespace is {
10 | // Builds a flat alphabet_size x alphabet_size scoring matrix (row-major): `match` on the diagonal, `mismatch` elsewhere, and 0 wherever the last symbol is paired with any other symbol. NOTE(review): alphabet_size == 0 makes (alphabet_size - 1) wrap around - callers in this listing always pass 5.
11 | template
12 | std::vector GenerateSimpleMatchMatrix(T match, T mismatch, size_t alphabet_size) {
13 | std::vector matrix(alphabet_size * alphabet_size, mismatch); // Set the mismatch score.
14 | // Goes to "-1" to allow for 'N' bases which should not match to themselves.
15 | for (size_t i=0; i<(alphabet_size - 1); i++) {
16 | matrix[i*alphabet_size + i] = match; // Set the match score.
17 | matrix[i*alphabet_size + alphabet_size - 1] = 0; // Reset the last column to 0.
18 | matrix[(alphabet_size - 1) * alphabet_size + i] = 0; // Reset the last row to 0.
19 | } // The bottom-right cell (last symbol vs. itself) is never touched by the loop and keeps `mismatch`.
20 | return matrix;
21 | }
22 |
23 | std::vector ConvertSeqAlphabet(const int8_t* seq, size_t seqlen, const uint8_t* conv_table);
24 |
25 | std::vector ConvertBasicToExtCIGAR(const char* qseq, int64_t qlen,
26 | const char* tseq, int64_t tlen,
27 | const std::vector& basic_cigar);
28 |
29 | int64_t EditDistFromExtCIGAR(const std::vector& extended_cigar);
30 |
31 | std::vector ExtractCigarBetweenQueryCoords(const std::vector& cigar, int64_t qstart, int64_t qend, int64_t *cigar_length, int64_t *cigar_length_q);
32 |
33 | std::string CigarToString(const std::vector& cigar);
34 |
35 | }
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/src/aligner/anchor_aligner.h:
--------------------------------------------------------------------------------
1 | /*
2 | * anchor_aligner.h
3 | *
4 | * Created on: Aug 23, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_ANCHOR_ALIGNER_H_
9 | #define SRC_ANCHOR_ALIGNER_H_
10 |
11 | #include
12 | #include "aligner_base.h"
13 | #include "containers/results.h"
14 |
15 | #include
16 | #include
17 |
18 | namespace is {
19 |
20 | class AnchorAligner;
21 |
22 | std::shared_ptr createAnchorAligner(std::shared_ptr aligner);
23 | // Plain value type describing one anchor: a start/end pair on the query (qstart, qend) and a start/end pair on the reference (rstart, rend).
24 | class AlignmentAnchor {
25 | public:
26 | AlignmentAnchor() : qstart(0), qend(0), rstart(0), rend(0) { } // All coordinates default to 0.
27 | AlignmentAnchor(int64_t _qstart, int64_t _qend,
28 | int64_t _rstart, int64_t _rend) :
29 | qstart(_qstart), qend(_qend), rstart(_rstart), rend(_rend) { }
30 |
31 | int64_t qstart, qend; // Query coordinates. NOTE(review): whether the end is inclusive is not visible here - confirm against anchor_aligner.cc.
32 | int64_t rstart, rend; // Reference coordinates.
33 | };
34 | // Anchor-guided aligner built on top of a shared pairwise aligner (aligner_). Non-copyable; instances are created through createAnchorAligner().
35 | class AnchorAligner {
36 | public:
37 | friend std::shared_ptr createAnchorAligner(std::shared_ptr aligner);
38 |
39 | ~AnchorAligner();
40 |
41 | std::shared_ptr CreateAlignmentResult(int64_t qstart, int64_t qend, int64_t rstart, int64_t rend, std::vector rez);
42 | // NOTE(review): AlignEdges receives cigar_stack and cigar_queue by value, while AdjustEnds receives pointers to them - confirm the copies are intended.
43 | double AlignEdges(const char *query, const char *ref, int leftRef, int rightRef, int64_t start_position_read, int64_t start_position_ref, int number_of_bases, std::stack cigar_stack, std::deque cigar_queue);
44 | void AdjustEnds(int left_offset_ref, int right_offset_ref, const char *query, const char *ref, int64_t *start_position_ref, int64_t *start_position_read, int number_of_bases, std::stack *cigar_stack, std::deque *cigar_queue, bool type);
45 |
46 | /* Sorts anchors and then performs global alignment between the minimum and maximum anchor coordinates.
47 | */
48 | std::shared_ptr GlobalEndToEnd(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors);
49 |
50 | /* Sorts the anchors, and aligns every neighboring pair of anchors. It does not extend beyond
51 | the ends of the first and last anchor.
52 | */
53 | std::shared_ptr GlobalAnchored(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors, bool type);
54 | std::shared_ptr GlobalAnchoredWithClipping(const char *query, int64_t qlen, const char *ref, int64_t rlen, const std::vector& anchors);
55 |
56 | /* Sorts the anchors, and aligns every neighboring pair of anchors. This extends alignments beyond
57 | the ends of the first and last anchor in an attempt to produce end-to-end alignment.
58 | */
59 | std::shared_ptr GlobalAnchoredWithExtend(int64_t abs_ref_id, std::shared_ptr index, const char *query, int64_t qlen, const char *ref, int64_t rlen,
60 | const std::vector& anchors, int32_t bandwidth, int32_t zdrop, bool type);
61 |
62 | private:
63 | AnchorAligner(const AnchorAligner&) = delete; // No copying.
64 | AnchorAligner& operator=(const AnchorAligner&) = delete; // No copy assignment.
65 | // Private constructor: only the friend factory createAnchorAligner() can build instances.
66 | AnchorAligner(std::shared_ptr aligner);
67 |
68 | const std::shared_ptr aligner_; // Pairwise aligner supplied at construction; the pointer itself cannot be reseated.
69 | };
70 |
71 | }
72 |
73 | #endif
74 |
--------------------------------------------------------------------------------
/src/aligner/pairwise_penalties.h:
--------------------------------------------------------------------------------
1 | #ifndef SRC_ALIGNER_PAIRWISE_PENALTIES_H_
2 | #define SRC_ALIGNER_PAIRWISE_PENALTIES_H_
3 |
4 | #include
5 | #include
6 |
7 | namespace is {
8 |
9 | /* Regular alignment penalties for a single piece Gotoh alignment (one gap-open / gap-extend pair).
10 | Defaults: match = 5, mismatch = -4, gapopen = -8, gapext = -6; penalties are stored as negative values. */
11 | class Penalties {
12 | public:
13 | Penalties() : match(5), mismatch(-4), gapopen(-8), gapext(-6) { }
14 | Penalties(int32_t _match, int32_t _mismatch, int32_t _gapopen, int32_t _gapext) :
15 | match(_match), mismatch(_mismatch), gapopen(_gapopen), gapext(_gapext) { }
16 | int32_t match, mismatch, gapopen, gapext;
17 | };
18 |
19 | /* A helper class for a linear function w(k) = u * k + v. Used for piecewise Gotoh alignment.
20 | Defaults: u = -6.0 (slope), v = -8.0 (intercept). */
21 | class AffinePiece {
22 | public:
23 | AffinePiece() : u(-6.0), v(-8.0) { }
24 | AffinePiece(float _u, float _v) : u(_u), v(_v) { }
25 | // Evaluates the line at k, i.e. returns u * k + v.
26 | inline float calc(int32_t k) const {
27 | return (u * (k) + v);
28 | }
29 |
30 | float u, v; // Line equation parameters: w(k) = u * k + v.
31 | };
32 |
33 | /* Penalties for a multiple affine function alignment: match/mismatch scores plus a list of AffinePiece gap functions in w.
34 | The default constructor creates a single piece (u = -6.0, v = -8.0). */
35 | class PiecewisePenalties {
36 | public:
37 | PiecewisePenalties() : match(5), mismatch(-4), w(std::vector{AffinePiece(-6.0, -8.0)}) { }
38 | // The integer match/mismatch arguments are stored in the float members below; _w is copied.
39 | PiecewisePenalties(int32_t _match, int32_t _mismatch, const std::vector& _w) :
40 | match(_match), mismatch(_mismatch), w(_w) { }
41 | // Returns a one-line, newline-terminated description: match, mismatch and the (u, v) pair of every piece in w.
42 | std::string Verbose() {
43 | std::stringstream ss;
44 | ss << "match = " << match << ", mismatch = " << mismatch << "";
45 | for (int32_t l = 0; l < w.size(); l++) { // NOTE(review): signed index compared against the unsigned w.size().
46 | ss << ", w[" << l << "] = {u = " << w[l].u << ", v = " << w[l].v << "}";
47 | }
48 | ss << "\n";
49 | return ss.str();
50 | }
51 |
52 | float match, mismatch;
53 | std::vector w; // Gap-penalty pieces.
54 | };
55 |
56 | }
57 |
58 | #endif
--------------------------------------------------------------------------------
/src/aligner/sam_parser.cc:
--------------------------------------------------------------------------------
1 | #include "sam_parser.h"
2 |
3 | #include
4 |
5 | namespace is {
6 |
7 | int SplitCigar(const std::string &cigar_str, std::vector& ret) {
8 | ret.clear();
9 | CigarOp op;
10 | // int32_t digit_count = 0;
11 | int64_t pos_ref = 0, pos_query = 0;
12 | const char *first_digit = NULL;
13 | for (size_t i=0; i& split_cigar) {
31 | int64_t len = 0;
32 | for (size_t i=0; i> qname >> flag >> rname >> pos >>
56 | mapq >> cigar_string >> rnext >> pnext >> tlen >> seq >> qual;
57 |
58 | SplitCigar(cigar_string, cigar);
59 |
60 | std::string all_optional;
61 | std::getline(ss, all_optional);
62 | Tokenize_(all_optional, '\t', optional);
63 | return 0;
64 | }
65 | // True when bit 0x4 of `flag` is clear (0x4 is the "segment unmapped" bit in the SAM specification).
66 | bool SamLine::IsMapped() {
67 | return (!(flag & 4));
68 | }
69 | // True when bit 0x10 of `flag` is set (0x10 is the "reverse complemented" bit in the SAM specification).
70 | bool SamLine::IsReverse() {
71 | return ((flag & 16));
72 | }
73 |
74 | // Computes 0-based alignment coordinates from this record: the query span with soft clips removed (q_start, q_end) and the reference span derived from `pos` and the CIGAR (r_start, r_end). Always returns 0.
75 | int SamLine::FindAlignmentPosition(int64_t& q_start, int64_t& q_end,
76 | int64_t& r_start, int64_t& r_end) {
77 | q_start = 0;
78 | q_end = seq.size();
79 |
80 | // Find query alignment start (skip the soft clipped bases).
81 | for (auto& c: cigar) {
82 | if (c.op == 'H') { // Hard clips are stepped over without changing q_start.
83 | continue;
84 | } else if (c.op == 'S') {
85 | q_start += c.count;
86 | } else {
87 | break;
88 | }
89 | }
90 |
91 | // Find query alignment end (skip the soft clipped bases).
92 | for (int64_t i=(cigar.size() - 1); i >= 0; i--) { // Same scan, from the back of the CIGAR.
93 | auto& c = cigar[i];
94 | if (c.op == 'H') {
95 | continue;
96 | } else if (c.op == 'S') {
97 | q_end -= c.count;
98 | } else {
99 | break;
100 | }
101 | }
102 |
103 | // Find reference alignment start. (Convert from 1-based to 0-based).
104 | r_start = pos - 1;
105 |
106 | // Find reference alignment end.
107 | r_end = r_start + CalcReferenceLengthFromCigar(cigar);
108 |
109 | // Do not perform reverse complementing here, we do not know
110 | // the length of the reference.
111 |
112 | return 0;
113 | }
114 | // Splits `str` on `delimiter` into `words`. `words` is cleared first and empty tokens (e.g. from consecutive delimiters) are dropped.
115 | void SamLine::Tokenize_(const std::string& str, const char delimiter, std::vector& words) {
116 | words.clear();
117 | std::stringstream ss(str);
118 | std::string line; // Holds one token at a time.
119 | while(std::getline(ss, line, delimiter)) {
120 | if (line.size() == 0) { continue; }
121 | words.push_back(line);
122 | }
123 | }
124 |
125 | }
126 |
--------------------------------------------------------------------------------
/src/aligner/sam_parser.h:
--------------------------------------------------------------------------------
1 | #ifndef SRC_SAM_PARSER_H_
2 | #define SRC_SAM_PARSER_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | namespace is {
10 | // Character-class helpers for CIGAR operation letters. Each macro may evaluate its argument several times, so pass a plain value without side effects. NOTE(review): 'N' and 'P' are not recognised by any of these macros - confirm that spliced CIGARs never reach them.
11 | #define is_cigar_op(x) (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'D' || x == 'S' || x == 'H')
12 | #define is_cigar_match(x) (x == 'M' || x == '=' || x == 'X')
13 | #define is_cigar_ins(x) (x == 'I')
14 | #define is_cigar_del(x) (x == 'D')
15 | #define is_cigar_soft(x) (x == 'S')
16 | #define is_cigar_hard(x) (x == 'H')
17 | #define is_cigar_ref(x) (x == 'M' || x == '=' || x == 'X' || x == 'D')
18 | #define is_cigar_read(x) (x == 'M' || x == '=' || x == 'X' || x == 'I' || x == 'S')
19 |
20 | // class CigarOp {
21 | // public:
22 | // char op = '-';
23 | // int32_t count = 0;
24 | // int64_t pos_ref = -1; // Relative to the pos_ field of the corresponding SequenceAlignment object. pos_ref starts from zero, eventhough the actuall alignment starts at an arbitrary position on the reference.
25 | // int64_t pos_query = - 1;
26 |
27 | // CigarOp() { }
28 | // CigarOp(char _op, int32_t _count, int64_t _pos_ref, int64_t _pos_query) : op(_op), count(_count), pos_ref(_pos_ref), pos_query(_pos_query) { }
29 |
30 | // };
31 |
/** @brief A container for a single CIGAR operation.
 *
 * `op` holds the operation character (for example 'M', '=', 'X', 'I', 'D',
 * 'S' or 'H') and `count` the length of the run it describes.
 */
class CigarOp {
 public:
  /** @brief Creates an empty operation (op == 0, count == 0). */
  CigarOp() : op(0), count(0) { }

  /** @brief Creates an operation from its character and run length.
   *
   * @param _op    CIGAR operation character.
   * @param _count Run length. Accepted as int64_t so that it matches the
   *               width of the `count` member; the previous int32_t
   *               parameter silently truncated 64-bit lengths passed in by
   *               callers.
   */
  CigarOp(char _op, int64_t _count) : op(_op), count(_count) { }

  // Member-wise copy is all this type needs, so the compiler-generated
  // operations are used.
  CigarOp(const CigarOp& t) = default;
  ~CigarOp() = default;
  CigarOp& operator=(const CigarOp& t) = default;

  /** @brief Formats the operation as "<count><op>", e.g. "10M".
   *
   * Const-qualified so it can be called on const objects and references.
   */
  std::string get() const { std::stringstream ss; ss << count << op; return ss.str(); }

  char op;
  int64_t count;
};
51 |
52 |
53 | int SplitCigar(const std::string &cigar_str, std::vector& ret);
54 | int64_t CalcReferenceLengthFromCigar(const std::vector& split_cigar);
55 | // One record (line) of a SAM file: the eleven mandatory fields as public members, the CIGAR pre-split into operations, and the remaining optional fields kept as raw strings.
56 | class SamLine {
57 | public:
58 | SamLine();
59 | SamLine(const std::string& line);
60 | // SamLine();
61 | // SequenceAlignment::SequenceAlignment(uint32_t _flag, std::string &rname, int64_t pos, int32_t mapq, std::string &cigar_string, std::string &rnext, int64_t pnext, int64_t tlen, std::vector &optional)
62 | // : flag_(flag), rname_(rname), pos_(pos), mapq_(mapq), rnext_(rnext), pnext_(pnext), tlen_(tlen), optional_(optional) {
63 | // SplitCigar(cigar_string, cigar_);
64 | // ProcessOptional();
65 | // }
66 | ~SamLine();
67 |
68 | int ParseLine(const std::string& line); // Fills the members below from one SAM text line.
69 | std::string YieldString();
70 | bool IsMapped(); // Tests bit 0x4 of `flag`.
71 | bool IsReverse(); // Tests bit 0x10 of `flag`.
72 | int FindAlignmentPosition(int64_t& q_start, int64_t& q_end,
73 | int64_t& r_start, int64_t& r_end); // Outputs 0-based query/reference alignment coordinates.
74 |
75 | std::string qname; // Field #1.
76 | uint32_t flag; // Field #2.
77 | std::string rname; // Field #3.
78 | int64_t pos; // Field #4.
79 | int32_t mapq; // Field #5.
80 | // std::string cigar; // Field #6.
81 | std::vector cigar; // Field #6, split into individual operations.
82 | std::string rnext; // Field #7.
83 | int64_t pnext; // Field #8.
84 | int64_t tlen; // Field #9.
85 | std::string seq; // Field #10.
86 | std::string qual; // Field #11.
87 |
88 | // Optional fields in the SAM format:
89 | int64_t as; // Alignment score.
90 | double evalue; // E-value. There is no dedicated field in the SAM format, but GraphMap uses ZE to specify the E-value.
91 | std::vector optional; // Raw values (strings) of optional fields, not explicitly converted to expected values;
92 | // NOTE(review): the visible tail of ParseLine only fills `optional`; where `as` and `evalue` get their values is not shown here.
93 |
94 |
95 | private:
96 | void Tokenize_(const std::string& str, const char delimiter, std::vector& words); // Splits `str` on `delimiter`, skipping empty tokens.
97 |
98 | };
99 |
100 | }
101 |
102 | #endif
103 |
--------------------------------------------------------------------------------
/src/alignment/alignment.h:
--------------------------------------------------------------------------------
1 | /*
2 | * alignment.h
3 | *
4 | * Created on: Jan 17, 2016
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_ALIGNMENT_ALIGNMENT_H_
9 | #define SRC_ALIGNMENT_ALIGNMENT_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 |
20 | #include "sequences/single_sequence.h"
21 | #include "utility/utility_general.h"
22 | #include "program_parameters.h"
23 | #include "utility/utility_conversion-inl.h"
24 | #include "containers/path_graph_entry.h"
25 | #include "libs/edlib.h"
26 | #include "alignment/cigargen.h"
27 | #include "alignment_wrappers.h"
28 | #include "log_system/log_system.h"
29 | #include "containers/region.h"
30 | #include "seqan/basic.h"
31 | #include "seqan/align.h"
32 | #include "seqan/sequence.h"
33 | #include "seqan/stream.h"
34 | #include "utility/evalue.h"
35 | #include "graphmap/transcriptome.h"
36 |
37 |
38 |
39 |
40 | int AlignRegion(const SingleSequence *read, std::shared_ptr index, std::shared_ptr transcriptome, const ProgramParameters *parameters, const EValueParams *evalue_params, bool extend_to_end, PathGraphEntry *region_results);
41 | int SemiglobalAlignment(AlignmentFunctionType AlignmentFunction,
42 | const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters,
43 | const EValueParams *evalue_params, PathGraphEntry *region_results);
44 | int AnchoredAlignmentNew(AlignmentFunctionType AlignmentFunctionNW, AlignmentFunctionType AlignmentFunctionSHW,
45 | const SingleSequence *read, std::shared_ptr index, std::shared_ptr transcriptome, const ProgramParameters *parameters,
46 | const EValueParams *evalue_params, PathGraphEntry *region_results, bool align_end_to_end, bool spliced_alignment);
47 |
48 | void VerboseAlignment(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const AlignmentResults *aln);
49 |
50 | /// Determines the start and end locations for semiglobal alignment, keeping in mind the boundaries of the reference being aligned to. Works with circular alignment as well.
51 | //int GetAlignmentWindowFromRegion(const SingleSequence *read, const Index *index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
52 | // int64_t *win_start, int64_t *win_end, int64_t *win_len);
53 | int GetL1PosInRegion(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
54 | int64_t *l1_start, int64_t *l1_end);
55 |
56 | // Checks if the region is linear or circular. If it's linear, only a pointer to the beginning of the region (in the index) will be returned. Otherwise, a data array will be created containing the
57 | // concatenated region.
58 | // Returns 0 if the region was linear, otherwise 1. Value of 1 means that manual cleanup of ret_data is required, using free().
59 | int GetAlignmentWindowData(const SingleSequence *read, std::shared_ptr index, const ProgramParameters *parameters, const PathGraphEntry *region_results,
60 | int8_t** data, int64_t* data_length, int8_t **pos_of_win_start, int8_t **pos_of_win_end, int64_t* offset_from_ref_start, int64_t* pos_of_ref_end, bool *is_cleanup_required);
61 |
62 | int FindCircularEnd(const std::vector &alignment, int64_t pos_of_ref_end,
63 | int64_t *ret_end_on_aln, int64_t *ret_end_on_read, int64_t *ret_end_on_ref,
64 | int64_t *ret_start_on_aln, int64_t *ret_start_on_read, int64_t *ret_start_on_ref);
65 |
66 | int SplitCircularAlignment(const AlignmentResults *aln, int64_t pos_of_ref_end, int64_t ref_start, int64_t ref_len, AlignmentResults *aln_l, AlignmentResults *aln_r);
67 |
68 |
69 | int CheckAlignmentSane(std::vector &alignment, const SingleSequence* read=NULL,std::shared_ptr index=nullptr, int64_t reference_hit_id=-1, int64_t reference_hit_pos=-1);
70 |
71 | #endif /* SRC_ALIGNMENT_ALIGNMENT_H_ */
72 |
--------------------------------------------------------------------------------
/src/alignment/alignment_wrappers.h:
--------------------------------------------------------------------------------
1 | /*
2 | * local_realignment_generic.h
3 | *
4 | * Created on: Jan 16, 2015
5 | * Author: isovic
6 | */
7 |
8 | #ifndef LOCAL_REALIGNMENT_GENERIC_H_
9 | #define LOCAL_REALIGNMENT_GENERIC_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 |
19 | #include "utility/utility_general.h"
20 | #include "program_parameters.h"
21 | #include "libs/edlib.h"
22 | #include "alignment/cigargen.h"
23 | #include "log_system/log_system.h"
24 | #include "containers/region.h"
25 | #include "seqan/basic.h"
26 | #include "seqan/align.h"
27 | #include "seqan/sequence.h"
28 | #include "seqan/stream.h"
29 |
30 | #define ALIGNMENT_TYPE_SHW 0 /// Gaps at the end are not penalized.
31 | #define ALIGNMENT_TYPE_HW 1 /// Gaps at the beginning and the end are not penalized.
32 | #define ALIGNMENT_TYPE_NW 2 /// Global alignment (gaps at the beginning and the end are penalized).
33 |
34 | #ifndef RELEASE_VERSION
35 | #include "libs/opal.h"
36 | #endif
37 |
38 | typedef int (*AlignmentFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t*, int64_t*, int64_t*, std::vector &);
39 | typedef int (*EditDistanceFunctionType)(const int8_t*, int64_t, const int8_t*, int64_t, int64_t*, int64_t*, int);
40 |
41 | int LocalizeAlignmentPosWithMyers(const int8_t *read_data, int64_t read_length,
42 | const int8_t *reference_data, int64_t reference_length,
43 | int64_t rough_reference_start, int64_t rough_reference_end,
44 | int64_t *ret_alignment_start, int64_t *ret_alignment_end,
45 | int64_t *ret_start_ambiguity, int64_t *ret_end_ambiguity,
46 | int64_t *ret_edit_distance, int64_t *ret_band_width,
47 | bool verbose_debug_output=false);
48 |
49 | int MyersSemiglobalWrapper(const int8_t *read_data, int64_t read_length,
50 | const int8_t *reference_data, int64_t reference_length,
51 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
52 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
53 | int64_t *ret_edit_distance, std::vector &ret_alignment);
54 |
55 | int MyersNWWrapper(const int8_t *read_data, int64_t read_length,
56 | const int8_t *reference_data, int64_t reference_length,
57 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
58 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
59 | int64_t *ret_edit_distance, std::vector &ret_alignment);
60 |
61 | int MyersSHWWrapper(const int8_t *read_data, int64_t read_length,
62 | const int8_t *reference_data, int64_t reference_length,
63 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
64 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
65 | int64_t *ret_edit_distance, std::vector &ret_alignment);
66 |
67 | #ifndef RELEASE_VERSION
68 | int OpalNWWrapper(const int8_t *read_data, int64_t read_length,
69 | const int8_t *reference_data, int64_t reference_length,
70 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
71 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
72 | int64_t *ret_edit_distance, std::vector &ret_alignment);
73 |
74 | int OpalSHWWrapper(const int8_t *read_data, int64_t read_length,
75 | const int8_t *reference_data, int64_t reference_length,
76 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
77 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
78 | int64_t *ret_edit_distance, std::vector &ret_alignment);
79 | #endif
80 |
81 | int SeqAnSemiglobalWrapper(const int8_t *read_data, int64_t read_length,
82 | const int8_t *reference_data, int64_t reference_length,
83 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
84 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
85 | int64_t *ret_edit_distance, std::vector &ret_alignment);
86 | int SeqAnSemiglobalWrapperWithMyersLocalization(const int8_t *read_data, int64_t read_length,
87 | const int8_t *reference_data, int64_t reference_length,
88 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
89 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
90 | int64_t *ret_edit_distance, std::vector &ret_alignment);
91 |
92 | int MyersEditDistanceWrapper(const int8_t *read_data, int64_t read_length,
93 | const int8_t *reference_data, int64_t reference_length,
94 | int64_t *ret_alignment_position_end,
95 | int64_t *ret_edit_distance, EdlibAlignMode edlib_mode_code=EDLIB_MODE_HW);
96 |
97 | int SeqAnNWWrapper(const int8_t *read_data, int64_t read_length,
98 | const int8_t *reference_data, int64_t reference_length,
99 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
100 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
101 | int64_t *ret_edit_distance, std::vector &ret_alignment);
102 | int SeqAnSHWWrapper(const int8_t *read_data, int64_t read_length,
103 | const int8_t *reference_data, int64_t reference_length,
104 | int64_t band_width, int64_t match_score, int64_t mex_score, int64_t mismatch_penalty, int64_t gap_open_penalty, int64_t gap_extend_penalty,
105 | int64_t* ret_alignment_position_start, int64_t *ret_alignment_position_end,
106 | int64_t *ret_edit_distance, std::vector &ret_alignment);
107 |
108 | //int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align, bool is_global, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector &ret_alignment);
109 | int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align &align, int alignment_type, int64_t *ret_start_offset, int64_t *ret_end_offset, int64_t *edit_distance, std::vector &ret_alignment);
110 |
111 | int CheckAlignmentSaneSimple(std::vector &alignment);
112 |
113 |
114 |
115 | #endif /* LOCAL_REALIGNMENT_GENERIC_H_ */
116 |
--------------------------------------------------------------------------------
/src/alignment/cigargen.h:
--------------------------------------------------------------------------------
1 | /*
2 | * cigargen.h
3 | *
4 | * Created on: Aug 28, 2014
5 | * Author: ivan
6 | */
7 |
8 | #ifndef CIGARGEN_H_
9 | #define CIGARGEN_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 |
19 | #include "libs/edlib.h"
20 | #include "utility/utility_general.h"
21 | #include "sequences/sequence_alignment.h"
22 |
/// Operation codes stored in the per-base alignment arrays.
/// EDLIB_M and EDLIB_EQUAL share the value 0, so the two cannot be told apart at runtime.
#define EDLIB_M 0
#define EDLIB_EQUAL 0
#define EDLIB_X 3
#define EDLIB_I 1
#define EDLIB_D 2
#define EDLIB_S 4
#define EDLIB_H 5 /// Not used in GraphMap currently (26.01.2016.)
#define EDLIB_NOP 6
#define EDLIB_N 7 // Large gaps (e.g. splicing sites).
#define EDLIB_P 8 // Padding.

/// Maps an operation code to its basic CIGAR character.
/// Match, equal and mismatch codes all collapse to 'M'; insertion, deletion, soft clip and
/// hard clip map to 'I', 'D', 'S' and 'H'. Every other code (including EDLIB_NOP, EDLIB_N
/// and EDLIB_P) yields the NUL character (0).
inline char EdlibOpToChar(int8_t op) {
  if (op == EDLIB_M || op == EDLIB_EQUAL || op == EDLIB_X) { return 'M'; }
  if (op == EDLIB_I) { return 'I'; }
  if (op == EDLIB_D) { return 'D'; }
  if (op == EDLIB_S) { return 'S'; }
  if (op == EDLIB_H) { return 'H'; }
  return 0;
}
41 |
42 | inline char EdlibOpToCharExtended(int8_t op) {
43 | return (op == EDLIB_EQUAL) ? '=' :
44 | (op == EDLIB_X) ? 'X' :
45 | (op == EDLIB_M) ? 'M' :
46 | (op == EDLIB_I) ? 'I' :
47 | (op == EDLIB_D) ? 'D' :
48 | (op == EDLIB_S) ? 'S' :
49 | (op == EDLIB_H) ? 'H' : 0;
50 | }
51 |
52 | std::string AlignmentToCigar(unsigned char *alignment, int alignmentLength, bool extended_format);
53 | int AlignmentToBasicCigar(unsigned char* alignment, int alignmentLength, char** cigar_);
54 | int AlignmentToExtendedCigar(unsigned char* alignment, int alignmentLength, char** cigar_);
55 | int AlignmentToExtendedCigarArray(unsigned char* alignment, int alignmentLength, std::vector &cigar);
56 | std::string AlignmentToMD(std::vector& alignment, const int8_t *ref_data, int64_t alignment_position_start);
57 |
58 | /// Searches for consecutive EDLIB_I and EDLIB_D (or vice versa) operations, and replaces the overlap with EDLIB_X.
59 | std::vector FixAlignment(unsigned char* alignment, int alignmentLength);
60 | /// In case an alignment has leading/trailing EDLIB_I operations, they will be replaced with EDLIB_S.
61 | int ConvertInsertionsToClipping(unsigned char* alignment, int alignmentLength);
62 | /// Counts the number of leading and trailing clipped bases (or insertions).
63 | int CountClippedBases(unsigned char* alignment, int alignmentLength, int64_t *ret_num_clipped_front, int64_t *ret_num_clipped_back);
64 | /// Sums up the bases on the reference the alignment spans through (EDLIB_M and EDLIB_D operations).
65 | int64_t CalculateReconstructedLength(unsigned char *alignment, int alignmentLength);
66 |
67 | int CalculateAlignmentScore(std::vector& alignment, int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend);
68 |
69 | /// Counts each operation type, and calculates the alignment score as well (while rescoring the alignment with the given scores/penalties).
70 | int CountAlignmentOperations(std::vector &alignment, const int8_t *read_data, const int8_t *ref_data, int64_t reference_hit_id, int64_t alignment_position_start, SeqOrientation orientation,
71 | int64_t match, int64_t mismatch, int64_t gap_open, int64_t gap_extend,
72 | bool skip_leading_and_trailing_insertions,
73 | int64_t *ret_eq, int64_t *ret_x, int64_t *ret_i, int64_t *ret_d, int64_t *ret_alignment_score, int64_t *ret_edit_dist, int64_t *ret_nonclipped_length);
74 | /// Reverses the operations in a CIGAR string.
75 | std::string ReverseCigarString(std::string &cigar);
76 |
77 | std::string PrintAlignmentToString(const unsigned char* query, const int queryLength,
78 | const unsigned char* target, const int targetLength,
79 | const unsigned char* alignment, const int alignmentLength,
80 | const int position, const int modeCode, int row_width=100);
81 |
82 | int GetAlignmentPatterns(const unsigned char* query, const int64_t queryLength,
83 | const unsigned char* target, const int64_t targetLength,
84 | const unsigned char* alignment, const int64_t alignmentLength,
85 | std::string &ret_query, std::string &ret_target, std::string &ret_match_pattern);
86 |
87 | void FixAlignmentLeadingTrailingID(std::vector& alignment, int64_t *ref_start, int64_t *ref_end);
88 |
89 | #endif /* CIGARGEN_H_ */
90 |
--------------------------------------------------------------------------------
/src/alignment/transcriptome_mod.h:
--------------------------------------------------------------------------------
1 | /*
2 | * transcriptome_mod.h
3 | *
4 | * Created on: Jan 5, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_
9 | #define SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_
10 |
11 | //#include "index/index.h"
12 | //#include "index/index_spaced_hash_fast.h"
13 | #include "minimizer_index/minimizer_index.h"
14 | #include "containers/results.h"
15 | #include "program_parameters.h"
16 | #include "graphmap/transcriptome.h"
17 | #include
18 |
19 | int ConvertFromTranscriptomeToGenomeAln(const ProgramParameters *parameters, std::shared_ptr index, std::shared_ptr transcriptome, AlignmentResults *aln);
20 |
21 | #endif /* SRC_ALIGNMENT_TRANSCRIPTOME_MOD_H_ */
22 |
--------------------------------------------------------------------------------
/src/containers/mapping_data.cc:
--------------------------------------------------------------------------------
1 | /*
2 | * mapping_data.cc
3 | *
4 | * Created on: Mar 19, 2015
5 | * Author: isovic
6 | */
7 |
8 | #include "mapping_data.h"
9 |
10 | MappingData::MappingData() {
11 | bins.clear();
12 | intermediate_mappings.clear();
13 | final_mapping_ptrs.clear();
14 |
15 | bin_size = -1;
16 |
17 | num_seeds_over_limit = 0;
18 | num_seeds_with_no_hits = 0;
19 | num_seeds_errors = 0;
20 |
21 | num_similar_mappings = 0;
22 | num_same_mappings = 0;
23 | avg_covered_bases_of_all_mappings = 0;
24 | std_covered_bases_of_all_mappings = 0;
25 | median_covered_bases_of_all_mappings = 0;
26 |
27 | iteration = 0;
28 |
29 | unmapped_reason = std::string("");
30 |
31 | num_region_iterations = 0;
32 | mapping_quality = 0;
33 | metagen_alignment_score = 0;
34 |
35 | time_region_selection = 0.0;
36 | time_mapping = 0.0;
37 | time_alignment = 0.0;
38 | time_region_seed_lookup = 0.0;
39 | time_region_hitsort = 0.0;
40 | time_region_conversion = 0.0;
41 | time_region_alloc = 0.0;
42 | time_region_counting = 0.0;
43 | }
44 |
45 | MappingData::~MappingData() {
46 | clear();
47 | }
48 |
49 | bool MappingData::IsMapped() {  // Returns true as soon as one stored entry reports IsMapped(); otherwise false.
50 | for (int32_t i=0; iIsMapped() == true) { return true; };  // NOTE(review): the loop header and the start of the if-condition are truncated in this listing (source line 51 is missing); which container is iterated cannot be determined from here - restore from the original file.
52 | }
53 | return false;
54 | }
55 |
56 | bool MappingData::IsAligned() {  // Returns true as soon as one stored entry reports IsAligned(); otherwise false.
57 | for (int32_t i=0; iIsAligned() == true) { return true; };  // NOTE(review): the loop header and the start of the if-condition are truncated in this listing (source line 58 is missing); which container is iterated cannot be determined from here - restore from the original file.
59 | }
60 | return false;
61 | }
62 |
63 | std::string MappingData::VerboseMappingDataToString_(const std::vector *mapping_data, std::shared_ptr index, const SingleSequence *read) const {
64 | std::stringstream ss;
65 |
66 | int64_t reference_length = index->get_data().size();
67 | int64_t read_length = read->get_data_length();
68 |
69 | ss << "-----------------------\n";
70 | ss << "--- num_entries = " << mapping_data->size() << "\n";
71 | ss << "--- read id = " << read->get_sequence_absolute_id() << "\n";
72 | ss << "--- read name = " << read->get_header() << "\n";
73 | ss << "--- read_length = " << read_length << "\n";
74 | ss << "--- reference_length = " << reference_length << "\n";
75 |
76 | for (int64_t i = (mapping_data->size() - 1); i >= 0; i--) {
77 | // ss << "--- [" << i << "] ";
78 | ss << "[" << i << "/" << mapping_data->size() << "] ";
79 | int64_t start_location = 0, start_location_raw = 0;
80 |
81 | ss << "local_score_id = " << mapping_data->at(i)->get_mapping_data().local_score_id;
82 | ss << "\n ° " << mapping_data->at(i)->VerboseToString();
83 | ss << "\n ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", fwd_r_id = " << (mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()) << ", region_index = " << mapping_data->at(i)->get_region_data().region_index;
84 | ss << "\n ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\"";
85 | ss << "\n ° Unmapped reason: \"" << unmapped_reason << "\"";
86 | int64_t relative_position = 0;
87 | int64_t absolute_position = 0;
88 | SeqOrientation orientation = kForward;
89 |
90 | ///// TODO: 06.02.2017.
91 | // This chunk below was removed due to the incompatibilities with the new index.
92 | // int64_t reference_id = index->RawPositionConverter(start_location, 0, &absolute_position, &relative_position, &orientation);
93 | int64_t reference_id = mapping_data->at(i)->get_region_data().reference_id;
94 |
95 | int64_t reference_start = mapping_data->at(i)->get_mapping_data().ref_coords.start;
96 | // index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.start, 0, &absolute_position, &reference_start, &orientation);
97 | int64_t reference_end = mapping_data->at(i)->get_mapping_data().ref_coords.end;
98 | // index->RawPositionConverter(mapping_data->at(i)->get_mapping_data().ref_coords.end, 0, &absolute_position, &reference_end, &orientation);
99 |
100 | for (int64_t j = 0; j < mapping_data->at(i)->get_alignments().size(); j++) {
101 | ss << "\n ° Alignment " << j << " / " << mapping_data->at(i)->get_alignments().size();
102 | ss << "\n ° r_id = " << mapping_data->at(i)->get_region_data().reference_id << ", region_index = " << mapping_data->at(i)->get_region_data().region_index << ", region_votes = " << mapping_data->at(i)->get_region_data().region_votes << ", position = " << relative_position << ", r1[" << reference_start << ", " << reference_end << "], " << ((orientation == kForward) ? "forward" : "reverse");
103 | ss << ", sam_NM = " << mapping_data->at(i)->get_alignments()[j].edit_distance << ", sam_AS = " << mapping_data->at(i)->get_alignments()[j].alignment_score << ", sam_evalue = " << mapping_data->at(i)->get_alignments()[j].evalue << ", sam_pos = " << mapping_data->at(i)->get_alignments()[j].ref_start << ", sam_mapq = " << ((int64_t) mapping_data->at(i)->get_alignments()[j].mapping_quality) << ", relative_position = " << relative_position;
104 | ss << "\n ° r_len = " << index->get_reference_lengths()[mapping_data->at(i)->get_region_data().reference_id] << ", l1_l = " << mapping_data->at(i)->get_l1_data().l1_l <<
105 | ", match_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_eq_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
106 | ", error_rate = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops + mapping_data->at(i)->get_alignments()[j].num_d_ops + mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
107 | " (X: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_x_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
108 | ", I = " << ((float) mapping_data->at(i)->get_alignments()[j].num_i_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) <<
109 | ", D: = " << ((float) mapping_data->at(i)->get_alignments()[j].num_d_ops) / ((float) mapping_data->at(i)->get_alignments()[j].nonclipped_length) << ")";
110 |
111 | ss << "\n ° \"" << index->get_headers()[mapping_data->at(i)->get_region_data().reference_id % index->get_num_sequences_forward()] << "\"";
112 | }
113 | ss << "\n-----------";
114 | if (i == 0) {
115 | ss << "\n";
116 | }
117 | ss << "\n";
118 | }
119 |
120 | return ss.str();
121 | }
122 |
123 | std::string MappingData::VerboseFinalMappingsToString(std::shared_ptr index, const SingleSequence *read) const {
124 | return VerboseMappingDataToString_(&final_mapping_ptrs, index, read);
125 | }
126 |
127 | std::string MappingData::VerboseIntermediateMappingsToString(std::shared_ptr index, const SingleSequence *read) const {
128 | return VerboseMappingDataToString_(&intermediate_mappings, index, read);
129 | }
130 |
131 | void MappingData::clear() {
132 | vertices.Clear();
133 | bins.clear();
134 | for (int64_t i = 0; i < intermediate_mappings.size(); i++) {
135 | if (intermediate_mappings[i])
136 | delete intermediate_mappings[i];
137 | intermediate_mappings[i] = NULL;
138 | }
139 | intermediate_mappings.clear();
140 | final_mapping_ptrs.clear();
141 | unmapped_reason = std::string("");
142 | num_region_iterations = 0;
143 | mapping_quality = 0;
144 | metagen_alignment_score = 0;
145 | }
146 |
--------------------------------------------------------------------------------
/src/containers/mapping_data.h:
--------------------------------------------------------------------------------
1 | /*
2 | * mapping_data.h
3 | *
4 | * Created on: Mar 19, 2015
5 | * Author: isovic
6 | */
7 |
8 | #ifndef MAPPING_DATA_H_
9 | #define MAPPING_DATA_H_
10 |
11 | #include "log_system/log_system.h"
12 | #include "program_parameters.h"
13 | #include "utility/utility_general.h"
14 | #include "containers/region.h"
15 | //#include "index/index.h"
16 | //#include "index/index_hash.h"
17 | //#include "index/index_sa.h"
18 | #include "minimizer_index/minimizer_index.h"
19 | #include "containers/vertices.h"
20 | #include "utility/evalue.h"
21 | #include "containers/path_graph_entry.h"
22 |
23 | //#define UNMAPPED_CODE_NO_VALID_GRAPH_PATHS (1 << 0)
24 |
25 | #define MAPPED_CODE_READ_UNPROCESSED_YET (0)
26 | #define MAPPED_CODE_UNIQUE_MAPPING (1 << 0)
27 | #define MAPPED_CODE_MULTIPLE_EQ_MAPPINGS (1 << 1)
28 |
29 | #define ITERATION_RESET_LIMIT ((int64_t) 0x1000000000000000)
30 |
31 |
32 |
/// A single bin together with the value used to rank it.
/// NOTE(review): the exact meaning of bin_id / bin_value is not visible here - confirm with the code that fills them.
struct ChromosomeBin {
  int64_t reference_id = 0;
  int64_t bin_id = 0;
  float bin_value = 0.0f;
};

/// Strict "greater than" ordering on ChromosomeBin::bin_value, i.e. sorting with it yields
/// descending bin values.
struct bins_greater_than_key
{
  /// Returns true when op1 has a strictly larger bin_value than op2.
  /// The call operator is const so the functor can also be invoked through a const
  /// object (for example as the comparison object of an ordered container).
  inline bool operator() (const ChromosomeBin& op1, const ChromosomeBin& op2) const {
    return op1.bin_value > op2.bin_value;
  }
};
47 |
48 | class MappingData {  // Bundles the vertices, bins, mapping entries, counters and timing fields declared below.
49 | public:
50 | MappingData();  // Zeroes the counters/timers and sets bin_size to -1.
51 | ~MappingData();  // Calls clear().
52 |
53 | void clear();  // Deletes every entry of intermediate_mappings and empties the containers; see mapping_data.cc for the fields it resets.
54 |
55 | Vertices vertices;
56 | std::vector bins;  // NOTE(review): template arguments of the std::vector / std::shared_ptr members and parameters are missing in this listing.
57 | std::vector intermediate_mappings;  // Owning: the entries are deleted in clear().
58 | std::vector final_mapping_ptrs; // Do not free the pointers here! Bad design. These point to intermediate_mappings pointers, which will be freed upon destruction.
59 |
60 | int64_t bin_size;  // -1 after construction.
61 | int64_t num_seeds_over_limit;
62 | int64_t num_seeds_with_no_hits;
63 | int64_t num_seeds_errors;
64 | int64_t iteration;
65 |
66 | int64_t num_similar_mappings; // Number of found mapping positions with very similar (estimated) scores. E.g. to within some difference from the top mapping.
67 | int64_t num_same_mappings;
68 | int64_t avg_covered_bases_of_all_mappings;
69 | int64_t std_covered_bases_of_all_mappings;
70 | int64_t median_covered_bases_of_all_mappings;
71 |
72 | std::string unmapped_reason;  // Empty after construction and after clear(); printed in the verbose dumps.
73 |
74 | int64_t num_region_iterations;
75 | int8_t mapping_quality;
76 | int64_t metagen_alignment_score;
77 |
78 | double time_region_selection;  // The time_* fields start at 0.0; clear() does not reset them.
79 | double time_mapping;
80 | double time_alignment;
81 | double time_region_seed_lookup;
82 | double time_region_hitsort;
83 | double time_region_conversion;
84 | double time_region_alloc;
85 | double time_region_counting;
86 |
87 | bool IsMapped();  // True if a stored entry reports IsMapped().
88 | bool IsAligned();  // True if a stored entry reports IsAligned().
89 |
90 | std::string VerboseFinalMappingsToString(std::shared_ptr index, const SingleSequence *read) const;  // Verbose dump of final_mapping_ptrs.
91 | std::string VerboseIntermediateMappingsToString(std::shared_ptr index, const SingleSequence *read) const;  // Verbose dump of intermediate_mappings.
92 |
93 | private:
94 | std::string VerboseMappingDataToString_(const std::vector *mapping_data, std::shared_ptr index, const SingleSequence *read) const;  // Shared formatter behind the two public Verbose* methods.
95 |
96 | };
97 |
98 | #endif /* MAPPING_DATA_H_ */
99 |
--------------------------------------------------------------------------------
/src/containers/range.h:
--------------------------------------------------------------------------------
1 | /*
2 | * range.h
3 | *
4 | * Created on: Jul 16, 2014
5 | * Author: ivan
6 | */
7 |
8 | #ifndef RANGE_H_
9 | #define RANGE_H_
10 |
11 | #include
12 |
/// A pair of int64_t coordinates, start and end; both default to 0.
class Range {
 public:
  /// Creates the range [0, 0].
  Range() : start{0}, end{0} {}

  /// Creates a range with the given coordinates (no ordering is enforced).
  Range(int64_t _start, int64_t _end) : start{_start}, end{_end} {}

  /// Signed difference end - start; negative when end lies before start.
  int64_t dist() const { return end - start; }

  int64_t start{0};
  int64_t end{0};
};
25 |
26 | #endif /* RANGE_H_ */
27 |
--------------------------------------------------------------------------------
/src/containers/raw_alignment.h:
--------------------------------------------------------------------------------
1 | /*
2 | * raw_alignment.h
3 | *
4 | * Created on: Nov 12, 2015
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_CONTAINERS_RAW_ALIGNMENT_H_
9 | #define SRC_CONTAINERS_RAW_ALIGNMENT_H_
10 |
11 | struct RawAlignment {  // NOTE(review): this header uses std::vector, std::string and SeqOrientation without including anything itself; it relies on the including file.
12 | int64_t aln_start = 0;
13 | int64_t aln_end = 0;
14 | std::vector alignment;  // NOTE(review): the element type (template argument) is missing in this listing.
15 | std::string cigar = "*";  // "*" until a CIGAR string is assigned.
16 | std::string md = "*"; /// MD field from SAM output.
17 | SeqOrientation orientation = kForward;
18 | int64_t ref_id = 0;
19 | std::string ref_header = "";
20 | int64_t query_id = 0;
21 | std::string query_header = "";
22 | int64_t eq_ops = 0, x_ops = 0, i_ops = 0, d_ops = 0; /// Counts of CIGAR operations.
23 | int64_t aligned_len = 0; /// Number of aligned bases from the read (not counting clipped bases).
24 | int64_t num_clipped_front = 0; /// Number of clipped bases at the beginning of the read.
25 | int64_t num_clipped_back = 0; /// Number of clipped bases at the end of the read.
26 | };
27 |
28 | #endif /* SRC_CONTAINERS_RAW_ALIGNMENT_H_ */
29 |
--------------------------------------------------------------------------------
/src/containers/region.cc:
--------------------------------------------------------------------------------
1 | /*
2 | * region.cc
3 | *
4 | * Created on: Dec 26, 2014
5 | * Author: isovic
6 | */
7 |
8 | #include "containers/region.h"
9 |
10 | //int CopyLinearRegion(const Index *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset) {
11 | // if (region->is_split == true)
12 | // return 1;
13 | //
14 | // int8_t *data_copy = new int8_t[(region->end - region->start + 1) + 1];
15 | // if (data_copy == NULL) {
16 | // return 3;
17 | // }
18 | //
19 | // memmove((data_copy), &(index_reference->get_data()[region->start]), (region->end - region->start + 1));
20 | //
21 | // data_copy[(region->end - region->start + 1)] = '\0';
22 | //
23 | // *ret_concatenated_data = data_copy;
24 | // *ret_data_length = (region->end - region->start + 1);
25 | // *ret_start_offset = region->start;
26 | //
27 | // return 0;
28 | //}
29 |
30 | int ConcatenateSplitRegion(std::shared_ptr index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end) {
31 | if (region->is_split == false)
32 | return 1;
33 |
34 | int64_t region_length_first = (region->end - region->start + 1);
35 | int64_t region_length_second = (region->split_end - region->split_start + 1);
36 | int64_t region_length_joined = region_length_first + region_length_second;
37 | if (region_length_first <= 0 || region_length_second <= 0 || region_length_joined <= 0)
38 | return 2;
39 |
40 | int8_t *data_copy = new int8_t[region_length_joined + 1];
41 | if (data_copy == NULL) {
42 | return 3;
43 | }
44 |
45 | int64_t start_offset = 0;
46 | int64_t position_of_ref_end = 0;
47 |
48 | // If the main region is at the beginning of the reference. The region is then expanded towards left and right, but on the left it zips back
49 | // to the end of the circular reference.
50 | if (region->start < region->split_start) {
51 | memmove(data_copy, &(index_reference->get_data()[region->split_start]), region_length_second);
52 | memmove((data_copy + region_length_second), &(index_reference->get_data()[region->start]), region_length_first);
53 | position_of_ref_end = region->split_end - region->split_start; // + 1;
54 | start_offset = region->split_start;
55 |
56 | // If the main region is at the end of the reference. The region is then expanded towards left and right, but on the right it zips back
57 | // to the beginning of the circular reference.
58 | } else {
59 | memmove((data_copy), &(index_reference->get_data()[region->start]), region_length_first);
60 | memmove((data_copy + region_length_first), &(index_reference->get_data()[region->split_start]), region_length_second);
61 | position_of_ref_end = region->end - region->start;
62 | start_offset = region->start;
63 |
64 | }
65 |
66 | data_copy[region_length_joined] = '\0';
67 |
68 | *ret_concatenated_data = data_copy;
69 | *ret_data_length = region_length_joined;
70 | *ret_start_offset = start_offset;
71 | *ret_position_of_ref_end = position_of_ref_end;
72 |
73 | return 0;
74 | }
75 |
76 | int GetRegionData(std::shared_ptr index, const Region *region,
77 | int8_t **region_data, int64_t *data_len, int64_t *index_reg_start, int64_t *pos_of_ref_end, bool *is_cleanup_required) {
78 |
79 | if (region->is_split == false) {
80 | *region_data = (int8_t *) (&index->get_data()[0] + region->start);
81 | *data_len = (region->end - region->start);
82 | *index_reg_start = region->start;
83 | *pos_of_ref_end = -1;
84 | *is_cleanup_required = false;
85 |
86 | } else {
87 | ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_start, pos_of_ref_end);
88 | *is_cleanup_required = true;
89 |
90 | }
91 |
92 | return 0;
93 | }
94 |
95 | //int GetRegionDataCopy(const Index *index, const Region *region,
96 | // int8_t **region_data, int64_t *data_len, int64_t *index_reg_pos, int64_t *reg_pos_of_ref_end) {
97 | //
98 | // if (region->is_split == false) {
99 | // CopyLinearRegion(index, region, region_data, data_len, index_reg_pos);
100 | // *reg_pos_of_ref_end = -1;
101 | //
102 | // } else {
103 | // ConcatenateSplitRegion(index, region, region_data, data_len, index_reg_pos, reg_pos_of_ref_end);
104 | //
105 | // }
106 | //
107 | // return 0;
108 | //}
109 |
110 | std::string VerboseRegionAsString(Region ®ion) {
111 | std::stringstream ss;
112 |
113 | ss << "start = " << region.start;
114 | ss << ", end = " << region.end;
115 | ss << ", reference_id = " << region.reference_id;
116 | ss << ", region_index = " << region.region_index;
117 | ss << ", region_votes = " << region.region_votes;
118 | ss << ", is_split = " << ((int) region.is_split);
119 | ss << ", split_start = " << region.split_start;
120 | ss << ", split_end = " << region.split_end;
121 |
122 | return ss.str();
123 | }
124 |
--------------------------------------------------------------------------------
/src/containers/region.h:
--------------------------------------------------------------------------------
1 | /*
2 | * region.h
3 | *
4 | * Created on: Dec 21, 2014
5 | * Author: ivan
6 | */
7 |
8 | #ifndef REGION_H_
9 | #define REGION_H_
10 |
11 | #include
12 | #include
13 | #include
14 | // #include "index/index.h"
15 | #include "minimizer_index/minimizer_index.h"
16 |
/// Describes a stretch of the indexed reference data, optionally made of two parts.
/// start/end delimit the main part; when is_split is true, split_start/split_end delimit the
/// second part (used when the region wraps around a circular reference).
/// reference_id and region_index are -1 until assigned.
struct Region {
  int64_t start{0};
  int64_t end{0};
  int64_t reference_id{-1};
  std::string rname;
  int64_t region_index{-1};
  int64_t region_votes{0};
  bool is_split{false};
  int64_t split_start{0};
  int64_t split_end{0};
};
28 |
29 | //// Creates a copy of the region data from the Index.
30 | //int CopyLinearRegion(const MinimizerIndex *index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset);
31 |
32 | // If the region is split into two parts (that is, if the genome is circular), this function copies both parts into a new data array.
33 | // It is the user's responsibility to free the allocated space using delete[].
34 | int ConcatenateSplitRegion(std::shared_ptr index_reference, const Region *region, int8_t **ret_concatenated_data, int64_t *ret_data_length, int64_t *ret_start_offset, int64_t *ret_position_of_ref_end);
35 |
36 | // Checks if the region is linear or split. If the region is linear, it returns the pointer to the existing part of the Index data and is_cleanup_required is set to false.
37 | // Otherwise, a new data array is allocated and the data is copied from the split parts of the Index.
38 | // If the is_cleanup_required parameter is true, region_data needs to be freed by the user using delete[] (the buffer is allocated with new[]), not free().
39 | int GetRegionData(std::shared_ptr index, const Region *region,
40 | int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end, bool *is_cleanup_required);
41 |
42 | //// Checks if the region is linear or split. It copies the data to a new array, and returns the pointer to the region data.
43 | //// region_data needs to be freed by the user using free().
44 | //int GetRegionDataCopy(const MinimizerIndex *index, const Region *region,
45 | // int8_t **region_data, int64_t *data_len, int64_t *index_pos, int64_t *index_pos_of_ref_end);
46 |
47 | // Simply verbose region's details.
48 | std::string VerboseRegionAsString(Region ®ion);
49 |
50 | #endif /* REGION_H_ */
51 |
--------------------------------------------------------------------------------
/src/containers/results.h:
--------------------------------------------------------------------------------
1 | /*
2 | * results.h
3 | *
4 | * Created on: Jan 16, 2016
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_CONTAINERS_RESULTS_H_
9 | #define SRC_CONTAINERS_RESULTS_H_
10 |
11 | #include
12 | #include
13 | #include "containers/range.h"
14 | #include "utility/utility_general.h"
15 | #include "containers/region.h"
16 |
17 |
18 |
19 | typedef struct Cluster {
20 | public:
21 | Range query;
22 | Range ref;
23 | int32_t num_anchors = 0;
24 | int32_t coverage = 0;
25 | bool valid = false;
26 | SeqOrientation orientation;
27 | Region region;
28 | } Cluster;
29 |
30 | typedef struct MappingResults {  // Outcome of the mapping step for one candidate; every field has a zero/false default.
31 | int64_t lcs_length = 0;
32 | int64_t cov_bases_max = 0;
33 | int64_t cov_bases_query = 0;
34 | int64_t cov_bases_ref = 0;
35 | int64_t num_covering_kmers = 0;
36 | float deviation = 0.0f;
37 | Range query_coords;
38 | Range ref_coords;  // start/end are printed as "r1[start, end]" by MappingData's verbose dump.
39 | bool is_mapped = false;
40 | bool is_reverse = false;
41 | int64_t local_score_id = 0;
42 | std::vector clusters;  // NOTE(review): the element type (template argument) is missing in this listing; presumably Cluster - confirm.
43 |
44 | // int64_t num_same_mappings = 0; // How many mapping positions have exactly the same score.
45 | } MappingResults;
46 |
/// Results of the L1 step. All fields default to zero except l1_k, which defaults to 1.0.
/// NOTE(review): the meaning of the individual l1_* values is not visible here - confirm with the code that fills them.
typedef struct L1Results {
  int64_t l1_l{0};
  double l1_k{1.0f};
  int64_t l1_lmin{0};
  int64_t l1_lmax{0};
  double l1_confidence_abs{0.0};
  double l1_std{0.0};
  int64_t l1_rough_start{0};
  int64_t l1_rough_end{0};
} L1Results;
57 |
58 | typedef struct AlignmentResults {  // Everything recorded about one alignment of a read; all fields carry defaults.
59 | bool is_aligned = false;
60 | bool is_reverse = false; // This should be deprecated and replaced with 'orientation'.
61 | int64_t ref_start = 0; // Starting position of the alignment on the reference. If orientation == kReverse, this assumes that the read should be reverse complemented and the reference stays fwd. ref_start is adjusted accordingly to denote the starting position of the alignment of the reversed read.
62 | int64_t ref_end = 0; // See ref_start. This is the end position of the alignment.
63 | int64_t query_start = 0; // Starting position of the alignment on the read. Everything before this position should be clipped.
64 | int64_t query_end = 0; // Ending position of the alignment on the read. Everything after this position should be clipped.
65 | std::string cigar = "*"; // In case orientation == kReverse, 'cigar' contains the reverse of the 'alignment' operations.
66 | std::string md = "";
67 | int64_t edit_distance = 0;
68 | int64_t alignment_score = 0;
69 | int64_t mapping_quality = 0;
70 | double evalue = 0.0f;
71 | int64_t num_secondary_alns = 0; // How many mapping positions have similar score.
72 |
73 | int64_t raw_pos_start = 0; // Internally, the fwd read is mapped to a reference and its reverse complement (which have been joined in a single massive sequence). The raw_pos_start then holds the absolute coordinate of the alignment in such joined sequence data.
74 | int64_t raw_pos_end = 0; // See raw_pos_start. This is the end position of the alignment in global coordinates.
75 | std::vector raw_alignment; // Holds the alignment in the global coordinate space (between raw_pos_start and raw_pos_end). Cannot be used with the local start/end (presumably ref_start and ref_end) in case the read should be reverse complemented. In this case, the alignment needs to be reversed. NOTE(review): the element type (template argument) is missing in this listing.
76 | std::vector alignment; // Holds the alignment in the local coordinate space (between ref_start and ref_end). If orientation == kForward, alignment == raw_alignment. Otherwise it's the reverse complement.
77 |
78 | SeqOrientation orientation = kForward;
79 | int64_t ref_id = -1;
80 | std::string ref_header = "*";
81 | int64_t ref_len = 0;
82 | int64_t query_id = -1;
83 | std::string query_header = "*";
84 | int64_t query_len = 0;
85 |
86 | int64_t num_eq_ops = 0;  // The op counts and nonclipped_length below feed the match/error rates printed by MappingData's verbose dump.
87 | int64_t num_x_ops = 0;
88 | int64_t num_i_ops = 0;
89 | int64_t num_d_ops = 0;
90 | int64_t nonclipped_length = 0;
91 |
92 | // int8_t *ref_data = NULL;
93 | // int8_t *read_data = NULL;
94 |
95 | // These are parameters of alignment which were used to produce the results.
96 | int32_t aln_mode_code = 0; // Type of alignment which was performed to produce the results stored in this structure.
97 |
98 | int64_t reg_pos_start = 0; // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function.
99 | int64_t reg_pos_end = 0; // Local coordinates of the alignment's start and end positions within the region determined by GetRegionData() function.
100 |
101 | } AlignmentResults;
102 |
103 |
104 |
/// Bookkeeping that accompanies a mapping attempt: the reason a read stayed unmapped
/// ("Not processed." until something else is assigned) and a set of timers that start at 0.0.
typedef struct MappingMetadata {
  std::string unmapped_reason{"Not processed."};

  double time_region_selection{0.0};
  double time_mapping{0.0};
  double time_alignment{0.0};
  double time_region_seed_lookup{0.0};
  double time_region_hitsort{0.0};
  double time_region_conversion{0.0};
  double time_region_alloc{0.0};
  double time_region_counting{0.0};
} MappingMetadata;
120 |
121 | #endif /* SRC_CONTAINERS_RESULTS_H_ */
122 |
--------------------------------------------------------------------------------
/src/containers/score_registry.cc:
--------------------------------------------------------------------------------
1 | /*
2 | * score_registry.cc
3 | *
4 | * Created on: Jul 14, 2014
5 | * Author: ivan
6 | */
7 |
8 | #include "containers/score_registry.h"
9 |
10 | ScoreRegistry::ScoreRegistry() {
11 | scores_id_ = 0;
12 | }
13 |
14 | ScoreRegistry::ScoreRegistry(const Region& region, int64_t scores_id) {
15 | set_region(region);
16 | set_scores_id(scores_id);
17 | }
18 |
19 | ScoreRegistry::~ScoreRegistry() {
20 | Clear();
21 | }
22 |
23 | void ScoreRegistry::Clear() {
24 | // registry_.clear();
25 | registry_entries_.Clear();
26 | scores_id_ = 0;
27 | }
28 |
29 | void ScoreRegistry::Add(Vertices &src_vertices, int64_t vertex_idx) {
30 | // registry_.push_back(vertex_data);
31 | registry_entries_.Add(src_vertices, vertex_idx);
32 | }
33 |
34 | void ScoreRegistry::Register(Vertices &src_vertices, int64_t vertex_idx) {
35 | if (src_vertices.registry_numbers[vertex_idx] < 0) { // || vertex_data.registry_number >= registry_.size()) {
36 | src_vertices.registry_numbers[vertex_idx] = registry_entries_.num_vertices;
37 | registry_entries_.Add(src_vertices, vertex_idx);
38 |
39 | }
40 | else {
41 | // Handle the case where a repeating kmer causes a 'jump' in the middle of an existing long path.
42 | // Edit 07.11.2014.: Because of the condition that a kmer needs to be within l iterations from the
43 | // vertex's path that it want's to extend, the kmer cannot hit it somewhere in the middle of the path.
44 | // It can only occur near the end of the path, and can only cause the path to have a more-or-less
45 | // even/uneven length in the reference and the query. For this reason, I think that forking a path
46 | // is perhaps not a good option, but instead to check for its ratio in query and in reference, and
47 | // choose to extend the path with the new kmer only if the ratio is closer to 1.0f.
48 | // For precaution sake, I'll keep the previous version here in comments.
49 | // if (vertex_data.covered_bases < registry_[vertex_data.registry_number].covered_bases) {
50 | // vertex_data.registry_number = registry_.size();
51 | // registry_.push_back(vertex_data);
52 | // } else {
53 | // registry_[vertex_data.registry_number] = vertex_data;
54 | // }
55 |
56 | int64_t registry_number = src_vertices.registry_numbers[vertex_idx];
57 |
58 | if ((src_vertices.num_kmers[vertex_idx] > registry_entries_.num_kmers[registry_number]) ||
59 | (src_vertices.num_kmers[vertex_idx] <= registry_entries_.num_kmers[registry_number] &&
60 | src_vertices.CalculateSuppress(vertex_idx) < registry_entries_.CalculateSuppress(registry_number))) {
61 |
62 | registry_entries_.CopyValuesFromOut(src_vertices, vertex_idx, registry_number);
63 | }
64 | }
65 | }
66 |
67 | std::string ScoreRegistry::VerboseToString() {
68 | std::stringstream ss;
69 |
70 | ss << "Num scores: " << registry_entries_.num_vertices << std::endl;
71 |
72 | for (int64_t i=0; i
12 | #include
13 | #include
14 | #include
15 | #include "containers/region.h"
16 | #include "sequences/single_sequence.h"
17 | #include "sequences/sequence_file.h"
18 | #include "containers/vertices.h"
19 |
20 | class ScoreRegistry {
21 | public:
22 | ScoreRegistry();
23 | ScoreRegistry(const Region& region, int64_t scores_id);
24 | ~ScoreRegistry();
25 |
26 | /// Empties the registry and sets all values to zero.
27 | void Clear();
28 |
29 | /// Simply appends the data to the end of the registry and updates the top score.
30 | /// No additional checks are performed.
31 | void Add(Vertices &src_vertices, int64_t vertex_idx);
32 |
33 | /// If the data has a registry number >= 0, then the entry with that index will be updated.
34 | /// Otherwise, if registry number < 0 or if the suppress is smaller than the existing one,
35 | /// the new data will only be appended to the end of the registry, and its registry number
36 | /// will be updated.
37 | void Register(Vertices &src_vertices, int64_t vertex_idx);
38 |
39 | // Allocates space for vertices.
40 | void Reserve(int64_t size);
41 |
42 | /// Formats the debug verbose to a std::string.
43 | std::string VerboseToString();
44 | const Region& get_region() const;
45 | void set_region(const Region& region);
46 | int64_t get_scores_id() const;
47 | void set_scores_id(int64_t scoresId);
48 | const Vertices& get_registry_entries() const;
49 | void set_registry_entries(Vertices& registryEntries);
50 |
51 | private:
52 | Vertices registry_entries_;
53 | Region region_;
54 | int64_t scores_id_;
55 | };
56 |
57 | #endif /* SCORE_REGISTRY_H_ */
58 |
--------------------------------------------------------------------------------
/src/containers/vertices.h:
--------------------------------------------------------------------------------
1 | /*
2 | * vertices.h
3 | *
4 | * Created on: Feb 13, 2015
5 | * Author: isovic
6 | */
7 |
8 | #ifndef VERTICES_H_
9 | #define VERTICES_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include "log_system/log_system.h"
16 |
17 |
18 |
19 | // Quite an ugly data structure, but very cache friendly.
20 | class Vertices {
21 | public:
22 | int64_t *timestamps;
23 | int64_t *reference_starts;
24 | int64_t *reference_ends;
25 | int64_t *query_starts;
26 | int64_t *query_ends;
27 | int64_t *num_kmers;
28 | int64_t *covered_bases_queries;
29 | int64_t *covered_bases_references;
30 | int64_t *registry_numbers;
31 |
32 | int64_t num_vertices;
33 | int64_t container_capacity;
34 |
35 | Vertices();
36 | ~Vertices();
37 | void Clear();
38 |
39 | inline int Init(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start,
40 | int64_t query_start, int64_t kmer_length, int64_t registry_number) {
41 | return Set(dest_vertex_idx, timestamp, reference_start, reference_start, query_start, query_start, 1, kmer_length, kmer_length, registry_number);
42 | }
43 |
44 | inline int Set(int64_t dest_vertex_idx, int64_t timestamp, int64_t reference_start,
45 | int64_t reference_end, int64_t query_start, int64_t query_end,
46 | int64_t num_kmer, int64_t covered_bases_query,
47 | int64_t covered_bases_reference, int64_t registry_number) {
48 | if (dest_vertex_idx >= num_vertices || dest_vertex_idx < 0) {
49 | return 1;
50 | }
51 |
52 | timestamps[dest_vertex_idx] = timestamp;
53 | reference_starts[dest_vertex_idx] = reference_start;
54 | reference_ends[dest_vertex_idx] = reference_end;
55 | query_starts[dest_vertex_idx] = query_start;
56 | query_ends[dest_vertex_idx] = query_end;
57 | num_kmers[dest_vertex_idx] = num_kmer;
58 | covered_bases_queries[dest_vertex_idx] = covered_bases_query;
59 | covered_bases_references[dest_vertex_idx] = covered_bases_reference;
60 | registry_numbers[dest_vertex_idx] = registry_number;
61 |
62 | return 0;
63 | }
64 |
65 | inline int Add(int64_t timestamp, int64_t reference_start,
66 | int64_t reference_end, int64_t query_start, int64_t query_end,
67 | int64_t num_kmer, int64_t covered_bases_query,
68 | int64_t covered_bases_reference, int64_t registry_number) {
69 | if (num_vertices >= container_capacity) {
70 | Reserve(container_capacity + capacity_increment_size_);
71 | }
72 |
73 | num_vertices += 1;
74 | Set((num_vertices - 1), timestamp, reference_start, reference_end, query_start, query_end, num_kmer, covered_bases_query, covered_bases_reference, registry_number);
75 |
76 | return 0;
77 | }
78 |
79 | inline int Add(const Vertices &src_vertices, int64_t src_vertex_idx) {
80 | return Add(src_vertices.timestamps[src_vertex_idx],
81 | src_vertices.reference_starts[src_vertex_idx],
82 | src_vertices.reference_ends[src_vertex_idx],
83 | src_vertices.query_starts[src_vertex_idx],
84 | src_vertices.query_ends[src_vertex_idx],
85 | src_vertices.num_kmers[src_vertex_idx],
86 | src_vertices.covered_bases_queries[src_vertex_idx],
87 | src_vertices.covered_bases_references[src_vertex_idx],
88 | src_vertices.registry_numbers[src_vertex_idx]);
89 | }
90 |
91 | void Reserve(int64_t size);
92 | void Resize(int64_t size);
93 |
94 | inline int CopyValuesWithin(int64_t source_idx, int64_t dest_idx) {
95 | if (source_idx >= num_vertices || dest_idx >= num_vertices || source_idx < 0 || dest_idx < 0) {
96 | LogSystem::GetInstance().Error(SEVERITY_INT_WARNING, __FUNCTION__, LogSystem::GetInstance().GenerateErrorMessage(ERR_MEMORY, "When CopyValuesWithin is called. source_idx = %ld, dest_idx = %ld, num_vertices = %ld\n", source_idx, dest_idx, num_vertices));
97 | return 1;
98 | }
99 |
100 | timestamps[dest_idx] = timestamps[source_idx];
101 | reference_starts[dest_idx] = reference_starts[source_idx];
102 | reference_ends[dest_idx] = reference_ends[source_idx];
103 | query_starts[dest_idx] = query_starts[source_idx];
104 | query_ends[dest_idx] = query_ends[source_idx];
105 | num_kmers[dest_idx] = num_kmers[source_idx];
106 | covered_bases_queries[dest_idx] = covered_bases_queries[source_idx];
107 | covered_bases_references[dest_idx] = covered_bases_references[source_idx];
108 | registry_numbers[dest_idx] = registry_numbers[source_idx];
109 |
110 | return 0;
111 | }
112 |
113 | inline int CopyValuesFromOut(Vertices &src_vertices, int64_t src_vertex_idx, int64_t dest_idx) {
114 | return Set(dest_idx,
115 | src_vertices.timestamps[src_vertex_idx],
116 | src_vertices.reference_starts[src_vertex_idx],
117 | src_vertices.reference_ends[src_vertex_idx],
118 | src_vertices.query_starts[src_vertex_idx],
119 | src_vertices.query_ends[src_vertex_idx],
120 | src_vertices.num_kmers[src_vertex_idx],
121 | src_vertices.covered_bases_queries[src_vertex_idx],
122 | src_vertices.covered_bases_references[src_vertex_idx],
123 | src_vertices.registry_numbers[src_vertex_idx]);
124 | }
125 |
126 | inline void EraseValues() {
127 | if (num_vertices <= 0)
128 | return;
129 |
130 | memset(timestamps, -1, num_vertices);
131 | memset(reference_starts, 0, num_vertices);
132 | memset(reference_ends, 0, num_vertices);
133 | memset(query_starts, 0, num_vertices);
134 | memset(query_ends, 0, num_vertices);
135 | memset(num_kmers, 0, num_vertices);
136 | memset(covered_bases_queries, 0, num_vertices);
137 | memset(covered_bases_references, 0, num_vertices);
138 | memset(registry_numbers, -1, num_vertices);
139 | }
140 |
141 | inline float CalculateRatio(int64_t vertex_idx) {
142 | float ratio = 0.0f;
143 | int64_t query_start = query_starts[vertex_idx];
144 | int64_t query_end = query_ends[vertex_idx];
145 | int64_t reference_start = reference_starts[vertex_idx];
146 | int64_t reference_end = reference_ends[vertex_idx];
147 |
148 | int64_t query_distance = (query_end >= query_start) ? (query_end - query_start) : (query_start - query_end);
149 | int64_t ref_distance = (reference_end >= reference_start) ? (reference_end - reference_start) : (reference_start - reference_end);
150 |
151 | if (query_distance != 0)
152 | ratio = ((float) std::min(query_distance, ref_distance)) / ((float) std::max(query_distance, ref_distance));
153 | else
154 | ratio = 1.0f;
155 |
156 | return ratio;
157 | }
158 |
159 | inline float CalculateSuppress(int64_t vertex_idx) {
160 | float ratio = 0.0f, ratio_suppress = 0.0f;
161 |
162 | ratio = CalculateRatio(vertex_idx);
163 |
164 | ratio_suppress = (ratio < 1.0f) ? (1.0f - ratio) : (ratio - 1.0f);
165 |
166 | return ratio_suppress;
167 | }
168 |
169 | inline std::string VerboseToString(int64_t vertex_idx) const {
170 | std::stringstream ret;
171 |
172 | if (vertex_idx < 0 || vertex_idx >= num_vertices) {
173 | ret << "Error with vertex_idx! vertex_idx = " << vertex_idx << ", containter_capacity = " << container_capacity << ", num_vertices = " << num_vertices;
174 | return ret.str();
175 | }
176 |
177 | ret << "timestamp = " << timestamps[vertex_idx];
178 | ret << "; q[" << query_starts[vertex_idx] << ", " << query_ends[vertex_idx] << "]; r[" << reference_starts[vertex_idx]<< ", " <<
179 | reference_ends[vertex_idx] <<
180 | "]; d[" << (query_ends[vertex_idx] - query_starts[vertex_idx]) << ", " << (reference_ends[vertex_idx] - reference_starts[vertex_idx]) <<
181 | "]; length = " << num_kmers[vertex_idx] <<
182 | "; dist_ratio = " << ((double) std::min((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) / ((double) std::max((reference_ends[vertex_idx] - reference_starts[vertex_idx]), (query_ends[vertex_idx] - query_starts[vertex_idx]))) <<
183 | "; cov_bases_query = " << covered_bases_queries[vertex_idx] << "; cov_bases_ref = " << covered_bases_references[vertex_idx] << "; registry_num = " << registry_numbers[vertex_idx];
184 |
185 | return ret.str();
186 | }
187 |
188 | private:
189 | inline int ReallocArray_(int64_t **array_ptr, int64_t size);
190 | int64_t capacity_increment_size_;
191 | };
192 |
193 | #endif /* VERTICES_H_ */
194 |
--------------------------------------------------------------------------------
/src/graphmap/experimental.cc:
--------------------------------------------------------------------------------
1 | /*
2 | * experimental.cc
3 | *
4 | * Created on: Jan 20, 2016
5 | * Author: isovic
6 | */
7 |
8 | #include "graphmap/graphmap.h"
9 |
10 |
--------------------------------------------------------------------------------
/src/graphmap/filter_anchors.h:
--------------------------------------------------------------------------------
1 | /*
2 | * filter_anchors.h
3 | *
4 | * Created on: Mar 22, 2016
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_GRAPHMAP_FILTER_ANCHORS_H_
9 | #define SRC_GRAPHMAP_FILTER_ANCHORS_H_
10 |
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include "sequences/single_sequence.h"
16 | #include "sequences/sequence_file.h"
17 | #include "containers/vertices.h"
18 | #include "program_parameters.h"
19 |
20 | #include "containers/score_registry.h"
21 | #include "utility/utility_general.h"
22 | #include "containers/region.h"
23 | #include "containers/mapping_data.h"
24 | #include "containers/vertices.h"
25 |
/// These are some constants used for filtering shady anchors.
/// TODO: This can be omitted if dynamic programming was used to penalize the anchor distances.
/// Usage: int64_t min_covered_bases = (new_cluster->query.end - new_cluster->query.start + 1) * MIN_CLUSTER_COVERAGE_FACTOR;
#define MIN_CLUSTER_COVERAGE_FACTOR 0.05f
/// Usage: int64_t min_cluster_length = read->get_sequence_length() * MIN_CLUSTER_LENGTH_FACTOR;
#define MIN_CLUSTER_LENGTH_FACTOR 0.03f

using int128_t = __int128;
using uint128_t = unsigned __int128;

/// A hit is packed into one 128-bit value made of four 32-bit fields:
///        d                c                  b                a
///   ref_id << 96 | query_start << 64 | ref_start << 32 | query_id
///
/// The accessors below extract one field each. Macro arguments are
/// parenthesised so that expressions such as `a | b` can be passed safely.
#define get128_qid(x)  ((int32_t) ((x) & 0x0FFFFFFFF))
#define get128_rpos(x) ((int32_t) (((x) >> 32) & 0x0FFFFFFFF))
#define get128_qpos(x) ((int32_t) (((x) >> 64) & 0x0FFFFFFFF))
#define get128_rid(x)  ((int32_t) (((x) >> 96) & 0x0FFFFFFFF))

/// Packs the four fields into a uint128_t. Every field is first truncated to
/// 32 bits: without that, a negative (sign-extended) or over-wide value would
/// spill into the fields stored above it and corrupt them.
#define pack128(qstart,rstart,qid,rid) ((((uint128_t) (uint32_t) (rid)) << 96) | (((uint128_t) (uint32_t) (qstart)) << 64) | (((uint128_t) (uint32_t) (rstart)) << 32) | ((uint128_t) (uint32_t) (qid)))
43 |
44 | struct ClusterAndIndices { // One cluster of anchors together with the indices of the anchors it is built from.
45 | Range query; // presumably the cluster's span on the query; Range is declared in another header — TODO confirm
46 | Range ref; // presumably the cluster's span on the reference — TODO confirm
47 | int32_t num_anchors = 0; // starts at zero; by its name, the number of anchors in the cluster
48 | int32_t coverage = 0; // starts at zero; NOTE(review): looks like a covered-bases count — confirm where it is filled in
49 | std::vector lcskpp_indices; // indices of the member anchors; presumably into the LCSk++ result list — TODO confirm
50 | };
51 |
52 | int64_t CalcScore(int32_t qpos, int32_t rpos, int32_t next_qpos, int32_t next_rpos, double indel_bandwidth_margin, int32_t fwd_length, int32_t dist_aab, int32_t dist_dbm, double *score_gap, double *score_dist); // Declaration only. Takes two (query, reference) position pairs; score_gap and score_dist look like output parameters — confirm against the definition.
53 |
54 | void GetPositionsFromRegistry2(const Vertices& registry_entries, int64_t vertex_id, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); // Declaration only. Presumably writes the query/reference start and end of vertex `vertex_id` through the four pointers — TODO confirm.
55 | void GetPositionsFromRegistry(const Vertices& registry_entries, const std::vector &lcskpp_indices, int64_t lcskpp_id, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); // Declaration only. Same outputs, but the vertex is presumably looked up through lcskpp_indices[lcskpp_id] — TODO confirm.
56 | void GetPositionsFrom128bit(const std::vector &hits, const std::vector &lcskpp_indices, int64_t lcskpp_id, int32_t seed_len, int32_t *qpos_start, int32_t *rpos_start, int32_t *qpos_end, int32_t *rpos_end); // Declaration only. Same outputs, presumably decoded from packed 128-bit hits (see pack128) with seed_len used for the end positions — TODO confirm.
57 |
58 | int FilterAnchorsByDiff(const SingleSequence* read, ScoreRegistry* local_score, const ProgramParameters *parameters, // Declaration only. By its signature it reads lcskpp_indices and fills ret_filtered_lcskpp_indices; the filtering criterion and the meaning of the return code are defined elsewhere.
59 | const std::vector &lcskpp_indices, std::vector &ret_filtered_lcskpp_indices);
60 |
61 | int FilterAnchorsByChaining(const SingleSequence* seq, ScoreRegistry* local_score, const ProgramParameters *parameters, // Declaration only. By its signature it reads lcskpp_indices and fills ret_filtered_lcskpp_indices; ret_cluster_ids is passed by pointer, presumably an optional output — TODO confirm.
62 | const std::vector &lcskpp_indices, double indel_bandwidth_margin, int32_t max_dist, int32_t lookahead_dist_factor, int64_t min_covered_bases, int32_t cluster_size_cutoff,
63 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids);
64 |
65 | int GenerateClusters(int64_t min_num_anchors_in_cluster, int64_t min_cluster_length, int64_t min_cluster_covered_bases, float min_cluster_coverage, std::vector &lcskpp_indices, // Declaration only. Outputs appear to be ret_clusters, ret_filtered_lcskpp_indices and (by pointer) ret_cluster_ids; the min_* arguments look like cluster acceptance thresholds — TODO confirm.
66 | ScoreRegistry* local_score, MappingData* mapping_data,
67 | const SingleSequence* read, const ProgramParameters* parameters, std::vector &ret_clusters,
68 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids);
69 | int GenerateClustersDummy(int64_t min_cluster_length, float min_cluster_coverage, std::vector &lcskpp_indices, // Declaration only. Same outputs as GenerateClusters but with fewer threshold arguments; how the behaviour differs is not visible here.
70 | ScoreRegistry* local_score, MappingData* mapping_data,
71 | const SingleSequence* read, const ProgramParameters* parameters, std::vector &ret_clusters,
72 | std::vector &ret_filtered_lcskpp_indices, std::vector *ret_cluster_ids);
73 |
74 | int VerboseClustersToFile_(std::string out_file, const ScoreRegistry* local_score, const MappingData* mapping_data, std::vector> &indexes, const SingleSequence* read, const ProgramParameters* parameters, const std::vector &clusters); // Declaration only. Presumably a debugging aid that writes the given clusters to the file named by out_file — TODO confirm.
75 |
76 | #endif /* SRC_GRAPHMAP_FILTER_ANCHORS_H_ */
77 |
--------------------------------------------------------------------------------
/src/graphmap/transcriptome.h:
--------------------------------------------------------------------------------
1 | /*
2 | * transcriptome.h
3 | *
4 | * Created on: Feb 6, 2017
5 | * Author: isovic
6 | */
7 |
8 | #ifndef SRC_GRAPHMAP_TRANSCRIPTOME_H_
9 | #define SRC_GRAPHMAP_TRANSCRIPTOME_H_
10 |
11 | #include
12 | #include