├── test_prog
│   ├── query_batch.fasta.gz
│   ├── target_batch.fasta.gz
│   ├── Timer.h
│   ├── README.md
│   ├── Makefile
│   └── test_prog.cpp
├── .gitignore
├── src
│   ├── gasal_header.h
│   ├── res.h
│   ├── interfaces.h
│   ├── ctors.h
│   ├── host_batch.h
│   ├── args_parser.h
│   ├── gasal_kernels.h
│   ├── kernels
│   │   ├── get_tb.h
│   │   ├── ksw_kernel_template.h
│   │   ├── pack_rc_seqs.h
│   │   ├── global.h
│   │   ├── semiglobal_kernel_template.h
│   │   └── local_kernel_template.h
│   ├── gasal.h
│   ├── interfaces.cpp
│   ├── res.cpp
│   ├── gasal_align.h
│   ├── host_batch.cpp
│   ├── args_parser.cpp
│   ├── ctors.cpp
│   ├── __deprecated.cpp
│   └── gasal_align.cu
├── configure.sh
├── Makefile
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/test_prog/query_batch.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nahmedraja/GASAL2/HEAD/test_prog/query_batch.fasta.gz
--------------------------------------------------------------------------------
/test_prog/target_batch.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nahmedraja/GASAL2/HEAD/test_prog/target_batch.fasta.gz
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.cuo
 3 | *.cppo
 4 | *.out
 5 | *.txt
 6 | *.tsv
 7 | *.a
 8 | *~
 9 | *.cproject
10 | *.project
11 | *.sam
12 | *.fa
13 | *.fasta
14 | *.bam
15 | *.swp
16 | lib/*
17 | include/*
18 | .vscode/*
19 | *.log
20 | 
21 | src/\.vscode/
22 | 
23 | *.nvvp
24 | 
25 | *.nvprof
--------------------------------------------------------------------------------
/src/gasal_header.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_HEADER_H__
 2 | #define __GASAL_HEADER_H__
 3 | 
 4 | 
 5 | #include "gasal.h"       // includes cstdlib, cstdint
 6 | #include "args_parser.h" // includes iostream, string, fstream
 7 | #include "gasal_align.h"
 8 | #include "host_batch.h"  // includes cstdio, cstring
 9 | #include "ctors.h"
10 | #include "interfaces.h"
11 | 
12 | 
13 | 
14 | 
15 | #endif
--------------------------------------------------------------------------------
/src/res.h:
--------------------------------------------------------------------------------
 1 | #ifndef __RES_H__
 2 | #define __RES_H__
 3 | 
 4 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params);
 5 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy);
 6 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params);
 7 | 
 8 | void gasal_res_destroy_host(gasal_res_t *res);
 9 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy);
10 | 
11 | 
12 | 
13 | #endif
--------------------------------------------------------------------------------
/src/interfaces.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_INTERFACES_H__
 2 | #define __GASAL_INTERFACES_H__
 3 | 
 4 | #include 
 5 | #include 
 6 | #include 
 7 | 
 8 | // Resizer for the whole gpu_storage in terms of number of sequences
 9 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params);
10 | 
11 | // Operation filler: fills the host_query_op / host_target_op fields of gasal_gpu_storage_t
12 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC);
13 | 
14 | void gasal_set_device(int gpu_select = 0, bool isPrintingProp = true);
15 | #endif
16 | 
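// Usage sketch (hypothetical caller code, not part of the original header):
//
//     gasal_set_device(0, true);                       // pick GPU 0 and print the device properties
//     uint8_t ops[n_alns];                             // one operation_on_seq value per sequence
//     ...
//     gasal_op_fill(&gpu_storage, ops, n_alns, QUERY); // copied into host_query_op
//
// gasal_op_fill() only memcpy()s into the pinned host_query_op / host_target_op
// arrays, so gpu_storage must already hold at least n_alns entries; otherwise
// resize first with gasal_host_alns_resize().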
--------------------------------------------------------------------------------
/src/ctors.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CTORS_H__
 2 | #define __CTORS_H__
 3 | 
 4 | 
 5 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams);
 6 | 
 7 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int max_n_alns, Parameters *params);
 8 | 
 9 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params);
10 | 
11 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params);
12 | 
13 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params);
14 | 
15 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec);
16 | 
17 | #endif
--------------------------------------------------------------------------------
/src/host_batch.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOST_BATCH_H__
 2 | #define __HOST_BATCH_H__
 3 | 
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | #include <cstring> // useful for memcpy, strlen
 7 | 
 8 | // host data structure methods
 9 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset);
10 | void gasal_host_batch_destroy(host_batch_t *res); // destructor
11 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg); // get last item of chain
12 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage); // reset the page chain for reuse
13 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC); // fill the data
14 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC );
15 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC );
16 | void gasal_host_batch_print(host_batch_t *res); // printer
17 | void gasal_host_batch_printall(host_batch_t *res); // printer for the whole linked list
18 | 
19 | 
20 | #endif
--------------------------------------------------------------------------------
/src/args_parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARGS_PARSER_H
 2 | #define ARGS_PARSER_H
 3 | 
 4 | /*
 5 | #include 
 6 | 
 7 | 
 8 | #include "gasal.h"
 9 | */
10 | #include <iostream>
11 | #include <fstream>
12 | #include "gasal.h"
13 | #include <string>
14 | 
15 | 
16 | enum fail_type {
17 | 	NOT_ENOUGH_ARGS,
18 | 	TOO_MANY_ARGS,
19 | 	WRONG_ARG,
20 | 	WRONG_FILES,
21 | 	WRONG_ALGO
22 | };
23 | 
24 | class Parameters{
25 | 
26 | 	public:
27 | 		Parameters(int argc, char** argv);
28 | 		~Parameters();
29 | 		void print();
30 | 		void failure(fail_type f);
31 | 		void help();
32 | 		void parse();
33 | 		void fileopen();
34 | 
35 | 
36 | 
37 | 		int32_t sa;
38 | 		int32_t sb;
39 | 		int32_t gapo;
40 | 		int32_t gape;
41 | 		comp_start start_pos;
42 | 		int print_out;
43 | 		int n_threads;
44 | 		int32_t k_band;
45 | 
46 | 		Bool secondBest;
47 | 
48 | 		bool isPacked;
49 | 		bool isReverseComplement;
50 | 
51 | 		data_source semiglobal_skipping_head;
52 | 		data_source semiglobal_skipping_tail;
53 | 
54 | 		algo_type algo;
55 | 
56 | 		std::string query_batch_fasta_filename;
57 | 		std::string target_batch_fasta_filename;
58 | 
59 | 		std::ifstream query_batch_fasta;
60 | 		std::ifstream target_batch_fasta;
61 | 
62 | 
63 | 	protected:
64 | 
65 | 	private:
66 | 		int argc;
67 | 		char** argv;
68 | };
69 | 
70 | 
71 | #endif
72 | 
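// Usage sketch (hypothetical driver code, not part of the original header):
//
//     Parameters *args = new Parameters(argc, argv);
//     args->parse();    // presumably fills scores, algo, start_pos and the FASTA filenames from argv
//     args->fileopen(); // opens query_batch_fasta / target_batch_fasta as std::ifstream
//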
--------------------------------------------------------------------------------
/test_prog/Timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
 4 | #include <iostream>
 5 | #include <string>
 6 | #include <cstdlib>
 7 | #include <sys/time.h>
 8 | 
 9 | class Timer
10 | {
11 | 	private:
12 | 		struct timeval startTime;
13 | 		struct timeval stopTime;
14 | 		double elapsedTime;
15 | 		std::string name;
16 | 
17 | 	public:
18 | 		Timer(std::string n) { name = n; elapsedTime = 0.0;}
19 | 		Timer() { name = ""; elapsedTime = 0.0;}
20 | 		void Clear() { elapsedTime = 0.0; }
21 | 		void Start() { gettimeofday(&(startTime), NULL); }
22 | 		void Restart()
23 | 		{
24 | 			elapsedTime = 0.0;
25 | 			gettimeofday(&(startTime), NULL);
26 | 		}
27 | 
28 | 		void Pause()
29 | 		{
30 | 			gettimeofday(&(stopTime), NULL);
31 | 
32 | 			elapsedTime += ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0;   // sec to ms
33 | 			elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
34 | 		}
35 | 
36 | 		void Stop()
37 | 		{
38 | 			gettimeofday(&(stopTime), NULL);
39 | 
40 | 			elapsedTime = ( (stopTime).tv_sec - (startTime).tv_sec) * 1000.0;    // sec to ms
41 | 			elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0; // us to ms
42 | 		}
43 | 
44 | 		void Print()
45 | 		{
46 | 			std::cout << name << " : " << elapsedTime << " msec" << std::endl;
47 | 		}
48 | 
49 | 		double GetTime() { return elapsedTime;}
50 | 
51 | };
52 | 
53 | 
54 | #endif
--------------------------------------------------------------------------------
/configure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | cuda_path=$1
 5 | RED='\033[0;31m'
 6 | NC='\033[0m' # No Color
 7 | 
 8 | if [ "$cuda_path" = "" ]; then
 9 | 	echo -e "${RED}Must provide path to CUDA installation directory${NC}"
10 | 	echo -e "${RED}Configuration incomplete${NC}"
11 | 	echo -e "${RED}Exiting${NC}"
12 | 	exit 1
13 | fi
14 | 
15 | cuda_nvcc_path=$cuda_path/bin/nvcc
16 | 
17 | if [ -f "$cuda_nvcc_path" ]; then
18 | 	echo "NVCC found ($cuda_nvcc_path)"
19 | else
20 | 	echo -e "${RED}NVCC not found${NC}"
21 | 	echo -e "${RED}Configuration incomplete${NC}"
22 | 	echo -e "${RED}Exiting${NC}"
23 | 	exit 1
24 | fi
25 | 
26 | 
27 | cuda_lib_path="${cuda_path}/targets/x86_64-linux/lib"
28 | 
29 | 
30 | if [ -d "$cuda_lib_path" ]; then
31 | 	echo "CUDA runtime library found (${cuda_lib_path})"
32 | else
33 | 	echo -e "${RED}CUDA runtime library not found${NC}"
34 | 	echo -e "${RED}Configuration incomplete${NC}"
35 | 	echo -e "${RED}Exiting${NC}"
36 | 	exit 1
37 | fi
38 | 
39 | cuda_runtime_file="${cuda_path}/targets/x86_64-linux/include/cuda_runtime.h"
40 | 
41 | if [ -f "$cuda_runtime_file" ]; then
42 | 	echo "CUDA runtime header file found (${cuda_runtime_file})"
43 | else
44 | 	echo -e "${RED}CUDA runtime header file not found${NC}"
45 | 	echo -e "${RED}Configuration incomplete${NC}"
46 | 	echo -e "${RED}Exiting${NC}"
47 | 	exit 1
48 | fi
49 | 
50 | 
51 | echo "Configuring Makefile..."
52 | 
53 | sed -i "s,NVCC=.*,NVCC=$cuda_nvcc_path,g" Makefile
54 | 
55 | echo "Configuring gasal.h..."
56 | 
57 | sed -i "s,.*cuda_runtime\.h\",\#include \"$cuda_runtime_file\",g" ./src/gasal.h
58 | 
59 | echo "Configuring Makefile of test program..."
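# The next sed points CUDA_LD_LIBRARY in the test program's Makefile at the
# runtime library directory detected above.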
60 | 
61 | sed -i "s,CUDA_LD_LIBRARY=.*,CUDA_LD_LIBRARY=$cuda_lib_path,g" ./test_prog/Makefile
62 | 
63 | #mkdir -p include
64 | 
65 | #cp ./src/gasal.h ./include
66 | 
67 | echo "Done"
68 | 
69 | 
--------------------------------------------------------------------------------
/src/gasal_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_KERNELS_H__
 2 | #define __GASAL_KERNELS_H__
 3 | 
 4 | 
 5 | // Template-meta-programming types construction from Int values.
 6 | // This allows to cut down kernel code at compilation time.
 7 | 
 8 | template <int Val>
 9 | struct Int2Type
10 | {
11 | 	typedef enum {val_ = Val} val__;
12 | };
13 | 
14 | template <typename T, typename U>
15 | struct SameType
16 | {
17 | 	enum { result = 0 };
18 | };
19 | 
20 | template <typename T>
21 | struct SameType<T, T>
22 | {
23 | 	enum { result = 1 };
24 | };
25 | 
26 | #define SAMETYPE(a, b) (SameType<a, b>::result)
27 | 
28 | 
29 | __constant__ int32_t _cudaGapO; /*gap open penalty*/
30 | __constant__ int32_t _cudaGapOE; /*sum of gap open and extension penalties*/
31 | __constant__ int32_t _cudaGapExtend; /*gap extension penalty*/
32 | __constant__ int32_t _cudaMatchScore; /*score for a match*/
33 | __constant__ int32_t _cudaMismatchScore; /*penalty for a mismatch*/
34 | 
35 | #define MINUS_INF SHRT_MIN
36 | 
37 | #define N_VALUE (N_CODE & 0xF)
38 | 
39 | #ifdef N_PENALTY
40 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
41 | 	score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
42 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
43 | 
44 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
45 | 	score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
46 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
47 | 
48 | #else
49 | #define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
50 | 	score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
51 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? 0 : score;\
52 | 
53 | #define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
54 | 	score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
55 | 
56 | #endif
57 | 
58 | #define MAX(a,b) ((a)>(b)?(a):(b))
59 | #define MIN(a,b) ((a)<(b)?(a):(b))
60 | 
61 | 
62 | #define FIND_MAX(curr, gidx) \
63 | 	maxXY_y = (maxHH < curr) ? gidx : maxXY_y;\
64 | 	maxHH = (maxHH < curr) ? curr : maxHH;
65 | 
66 | 
67 | // Kernel files
68 | 
69 | #include "kernels/pack_rc_seqs.h"
70 | 
71 | #include "kernels/global.h"
72 | 
73 | #include "kernels/semiglobal_kernel_template.h"
74 | 
75 | #include "kernels/local_kernel_template.h"
76 | 
77 | #include "kernels/banded.h"
78 | 
79 | #include "kernels/ksw_kernel_template.h"
80 | 
81 | #include "kernels/get_tb.h"
82 | 
83 | #endif
--------------------------------------------------------------------------------
/test_prog/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a test program for GASAL2. The program overlaps the sequence alignment on the GPU with CPU execution. The CPU executes the code for creating a batch of sequences to be aligned on the GPU and printing the alignment results. First compile GASAL with `N_CODE=0x4E`. To compile the test program, run `make`.
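A minimal build sequence might look like the following (`sm_70` is only an illustration; set `GPU_SM_ARCH` to match your own GPU):

```
$ make GPU_SM_ARCH=sm_70 MAX_QUERY_LEN=160 N_CODE=0x4E
$ cd test_prog
$ make
```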
Running the test program with `-h` or `--help` will print the options:
 2 | 
 3 | ```
 4 | $./test_prog.out -h
 5 | 
 6 | Usage: ./test_prog.out [-a] [-b] [-q] [-r] [-s] [-p] [-n] [-y] <query_batch.fasta> <target_batch.fasta>
 7 | Options: -a INT    match score [1]
 8 |          -b INT    mismatch penalty [4]
 9 |          -q INT    gap open penalty [6]
10 |          -r INT    gap extension penalty [1]
11 |          -s        also find the start position
12 |          -t        compute traceback. With this option enabled, "-s" has no effect as start position will always be computed with traceback
13 |          -p        print the alignment results
14 |          -n INT    Number of threads [1]
15 |          -y AL_TYPE Alignment type. Must be "local", "semi_global", "global", "ksw"
16 |          -x HEAD TAIL specifies, for semi-global alignment, what should be skipped for heads and tails of the sequences (NONE, QUERY, TARGET, BOTH)
17 |          -k INT    Band width in case "banded" is selected.
18 |          --help, -h : displays this message.
19 |          --second-best displays second best score (WITHOUT_START only).
20 |          Single-pack multi-Parameters (e.g. -sp) is not supported.
21 | 
22 | ```
23 | 
24 | 
25 | `query_batch.fasta` and `target_batch.fasta` contain the single-line FASTA sequences for the alignment. The sequences in these files are aligned one-to-one, i.e. the first sequence in query_batch.fasta is aligned to the first sequence in target_batch.fasta, the second sequence in query_batch.fasta is aligned to the second sequence in target_batch.fasta, and so on. The directory also contains sample query_batch.fasta and target_batch.fasta files. For the two sample files, use `MAX_QUERY_LEN=160`.
26 | 
27 | To easily demonstrate reversing and complementing sequences independently of each other, one can change the first character of the sequence delimiter `>` in the .fasta files. The test program parses the first character as follows:
28 | 
29 | - Parsing `>` does no operation on the sequence (this is the regular mode),
30 | - Parsing `<` flags the sequence to be reversed,
31 | - Parsing `/` flags the sequence to be complemented,
32 | - Parsing `+` flags the sequence to be reversed and complemented.
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | GPU_SM_ARCH=
 2 | MAX_QUERY_LEN=
 3 | N_CODE=
 4 | N_PENALTY=
 5 | 
 6 | GPU_COMPUTE_ARCH=$(subst sm,compute,$(GPU_SM_ARCH))
 7 | NVCC=/usr/local/cuda-10.1/bin/nvcc
 8 | CC=g++
 9 | SRC_DIR=./src/
10 | OBJ_DIR=./obj/
11 | LIB_DIR=./lib/
12 | INCLUDE_DIR=./include/
13 | 
14 | SOURCES= args_parser.cpp host_batch.cpp ctors.cpp interfaces.cpp res.cpp gasal_align.cu
15 | LOBJS=$(patsubst %,%o,$(SOURCES))
16 | 
17 | LOBJS_PATH=$(addprefix $(OBJ_DIR),$(LOBJS))
18 | VPATH=src:obj:lib
19 | YELLOW=\033[1;33m
20 | NC=\033[0m # No Color
21 | 
22 | ifeq ($(GPU_SM_ARCH),)
23 | error1:
24 | 	@echo "Must specify GPU architecture as sm_xx"
25 | endif
26 | ifeq ($(MAX_QUERY_LEN),)
27 | error2:
28 | 	@echo "Must specify maximum sequence length"
29 | endif
30 | 
31 | ifeq ($(N_CODE),)
32 | error3:
33 | 	@echo "Must specify the code for 'N'"
34 | endif
35 | #ifneq ($(GPU_SM_ARCH),clean)
36 | 
37 | 
38 | 
39 | 
40 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line.
## You might need to add: --compiler-options -fPIC
41 | ## With Debian and clang, use: $(NVCC) -ccbin clang-3.8 --compiler-options -fpie
42 | 
43 | ifeq ($(N_PENALTY),)
44 | %.cuo: %.cu
45 | 	$(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@
46 | 
47 | else
48 | %.cuo: %.cu
49 | 	$(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE),-DN_PENALTY=$(N_PENALTY) -Xptxas -Werror --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@
50 | 
51 | endif
52 | 
53 | 
54 | 
55 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. You might need to add: -fPIC
56 | ifeq ($(N_PENALTY),)
57 | %.cppo: %.cpp
58 | 	$(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -Werror $< -o $(OBJ_DIR)$@
59 | 
60 | else
61 | %.cppo: %.cpp
62 | 	$(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -DN_PENALTY=$(N_PENALTY) -Werror $< -o $(OBJ_DIR)$@
63 | 
64 | endif
65 | 
66 | 
67 | all: clean makedir libgasal.a
68 | 
69 | makedir:
70 | 	@mkdir -p $(OBJ_DIR)
71 | 	@mkdir -p $(LIB_DIR)
72 | 	@mkdir -p $(INCLUDE_DIR)
73 | 	@cp $(SRC_DIR)/*.h $(INCLUDE_DIR)
74 | 	@sed -i "s/MAX_QUERY_LEN=[0-9]\{1,9\}/MAX_QUERY_LEN=$(MAX_QUERY_LEN)/" ./test_prog/Makefile
75 | 
76 | ifeq ($(N_PENALTY),)
77 | libgasal.a: $(LOBJS)
78 | 	ar -csru $(LIB_DIR)$@ $(LOBJS_PATH)
79 | 	@echo ""
80 | 	@echo -e "${YELLOW}WARNING:${NC}\"N_PENALTY\" is not defined"
81 | else
82 | libgasal.a: $(LOBJS)
83 | 	ar -csru $(LIB_DIR)$@ $(LOBJS_PATH)
84 | endif
85 | 
86 | clean:
87 | 	rm -f -r $(OBJ_DIR) $(LIB_DIR) $(INCLUDE_DIR) *~ *.exe *.cppo *.cuo *.txt
88 | 
89 | gasal_align.cuo: gasal.h gasal_kernels.h
90 | 
91 | 
--------------------------------------------------------------------------------
/test_prog/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_LD_LIBRARY=/usr/local/cuda-10.1/targets/x86_64-linux/lib
 2 | ANALYSIS_FILENAME=analysis
 3 | # prefix1 can be optirun in case you need to run it from an optimus-enabled laptop.
 4 | PREFIX1=
 5 | # prefix2 can be nvprof. Preferably use the following: nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof
 6 | PREFIX2=valgrind
 7 | # suffix1 and 2 can be an output file.
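# For example, "make fullrun" runs the 20K-sequence sample set under $(PREFIX2)
# and redirects the program output to golden.log via $(SUFFIX1).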
8 | SUFFIX1=> golden.log 9 | SUFFIX2=> out.log 10 | 11 | PRGM=test_prog.out 12 | 13 | OPTARGS1=-p -y local 14 | OPTARGS2=-p -y local 15 | 16 | 17 | FILES_HUMAN600=reads_600_human_10M.fasta ref_600_human_10M.fasta 18 | FILES_HUMAN300=reads_300_human_10M.fasta ref_300_human_10M.fasta 19 | FILES_HUMAN150=reads_150_human_10M.fasta ref_150_human_10M.fasta 20 | FILES_20K=query_batch.fasta target_batch.fasta 21 | FILES_262K=reads_150.fasta ref_150.fasta 22 | FILES_SHORT=short_query_batch.fasta short_target_batch.fasta 23 | 24 | .cpp.o: 25 | g++ -std=c++11 -g -c -O3 -Wall -Werror -fopenmp -I ../include -o test_prog.o test_prog.cpp 26 | 27 | all: clean test_prog.out 28 | 29 | test_prog.out: test_prog.o 30 | g++ -std=c++11 -O3 -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib test_prog.o -fopenmp -lcudart -lgasal 31 | 32 | clean: 33 | rm -f -r *~ *.exe *.o *.out 34 | 35 | test_prog.o: Timer.h 36 | 37 | 38 | human150: all 39 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN150) $(SUFFIX1) 40 | 41 | human150-2: all 42 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN150) $(SUFFIX2) 43 | 44 | human300: all 45 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN300) $(SUFFIX1) 46 | 47 | human300-2: all 48 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN300) $(SUFFIX2) 49 | 50 | human600: all 51 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN600) $(SUFFIX1) 52 | 53 | human600-2: all 54 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN600) $(SUFFIX2) 55 | 56 | 57 | run: all 58 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_SHORT) $(SUFFIX1) 59 | 60 | run2: all 61 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_SHORT) $(SUFFIX2) 62 | 63 | 64 | fullrun: all 65 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 66 | 67 | fullrun2: all 68 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_20K) $(SUFFIX2) 69 | 70 | 71 | 262k: all 72 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_262K) $(SUFFIX1) 73 | 74 | 262k2: all 75 | $(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_262K) $(SUFFIX2) 76 | 77 | 78 | 79 | 80 | 81 | cuda-memcheck: all 82 | cuda-memcheck ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1) 83 | 84 | cuda-gdb: all 85 | cuda-gdb --args ./test_prog.out -p -y local query_batch.fasta target_batch.fasta 86 | 87 | valgrind: all 88 | valgrind ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 89 | 90 | gdb: all 91 | gdb --args ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta 92 | -------------------------------------------------------------------------------- /src/kernels/get_tb.h: -------------------------------------------------------------------------------- 1 | #ifndef __GET_TB__ 2 | #define __GET_TB__ 3 | 4 | template 5 | __global__ void gasal_get_tb(uint8_t *cigar, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *cigar_offset, uint4 *packed_tb_matrices, gasal_res_t *device_res, int n_tasks) { 6 | 7 | int i, j; 8 | int total_score __attribute__((unused)); 9 | int curr_score __attribute__((unused)); 10 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; 11 | if (tid >= n_tasks) return; 12 | 13 | int offset = cigar_offset[tid]; 14 | 15 | 16 | if (SAMETYPE(T, Int2Type)) { 17 | i = device_res->target_batch_end[tid]; 18 | j = device_res->query_batch_end[tid]; 19 | total_score = device_res->aln_score[tid]; 20 | curr_score = 0; 21 | } else if (SAMETYPE(T, Int2Type)) { 22 | i = target_batch_lens[tid]; 23 | j = 
query_batch_lens[tid]; 24 | } 25 | 26 | 27 | 28 | uint32_t prev_op_to_fill = 0; 29 | 30 | int read_len_8 = query_batch_lens[tid]%8 ? query_batch_lens[tid] + (8 - (query_batch_lens[tid]%8)) : query_batch_lens[tid]; 31 | 32 | int n_ops = 0; 33 | 34 | int prev_tile_no = -1; 35 | 36 | uint4 tile = make_uint4(0, 0, 0, 0); 37 | 38 | int op_select = 3; 39 | 40 | int op_shift = 0; 41 | 42 | 43 | int count = 0; 44 | 45 | uint32_t op_to_fill; 46 | 47 | while ( i >= 0 && j >= 0) { 48 | 49 | 50 | int cell = (((i >> 3) * read_len_8) << 3) + (j << 3) + (i&7); 51 | 52 | 53 | 54 | int tile_no = cell>>5; 55 | 56 | 57 | tile = tile_no != prev_tile_no ? packed_tb_matrices[(tile_no*n_tasks) + tid] : tile; 58 | 59 | prev_tile_no = tile_no; 60 | 61 | int cell_no_in_tile = cell - (tile_no<<5); 62 | 63 | 64 | int reg_no_in_tile = cell_no_in_tile >> 3; 65 | 66 | int cell_no_in_reg = cell_no_in_tile - (reg_no_in_tile << 3); 67 | 68 | uint32_t reg = reg_no_in_tile == 0 ? tile.x : (reg_no_in_tile == 1 ? tile.y : (reg_no_in_tile == 2 ? tile.z : tile.w)); 69 | 70 | 71 | uint32_t cell_op = (reg >> (28 - (cell_no_in_reg << 2))) & 15; 72 | 73 | 74 | uint32_t op = (cell_op >> op_shift) & op_select; 75 | 76 | 77 | 78 | op_to_fill = op == 0 || op_select == 3 ? op : op_shift ; 79 | 80 | op_select = op == 0 || (op == 1 && op_select == 3) ? 3 : 1; 81 | 82 | op_shift = op == 0 || ( op == 1 && op_select == 3) ? 0 : ((op == 2 || op == 3) ? op : op_shift); 83 | 84 | 85 | 86 | 87 | if(count < 63 && op_to_fill == prev_op_to_fill) { 88 | count++; 89 | } else { 90 | if (count > 0) { 91 | uint8_t reg_out = 0; 92 | reg_out |= prev_op_to_fill; 93 | reg_out |= (uint8_t)(count << 2); 94 | cigar[offset++] = reg_out; 95 | n_ops++; 96 | } 97 | count = 1; 98 | } 99 | 100 | if (SAMETYPE(T, Int2Type)) { 101 | curr_score += ((op_to_fill == 2 || op_to_fill == 3) && prev_op_to_fill != op_to_fill) ? -_cudaGapOE : ((op_to_fill == 2 || op_to_fill == 3) ? - _cudaGapExtend : (op_to_fill == 1 ? -_cudaMismatchScore : _cudaMatchScore)); 102 | if (curr_score == total_score) break; 103 | } 104 | 105 | prev_op_to_fill = op_to_fill; 106 | 107 | i = op_to_fill == 0 || op_to_fill == 1 || op_to_fill == 2 ? i - 1 : i; 108 | j = op_to_fill == 0 || op_to_fill == 1 || op_to_fill == 3 ? j - 1 : j; 109 | 110 | 111 | } 112 | 113 | uint8_t reg_out = 0; 114 | reg_out |= prev_op_to_fill; 115 | reg_out |= (uint8_t)(count << 2); 116 | cigar[offset++] = reg_out; 117 | n_ops++; 118 | 119 | if (SAMETYPE(T, Int2Type)) { 120 | while (i >= 0) { 121 | uint32_t reg_out = 0; 122 | uint8_t resd_count = (i+1) <= 63 ? (i+1) : 63; 123 | reg_out |= 2; 124 | reg_out |= (uint8_t)(resd_count << 2); 125 | cigar[offset++] = reg_out; 126 | n_ops++; 127 | i = i - 63; 128 | 129 | } 130 | while (j >= 0) { 131 | uint32_t reg_out = 0; 132 | uint8_t resd_count = (j+1) <= 63 ? 
(j+1) : 63;
133 | 			reg_out |= 3;
134 | 			reg_out |= (uint8_t)(resd_count << 2);
135 | 			cigar[offset++] = reg_out;
136 | 			n_ops++;
137 | 			j = j - 63;
138 | 		}
139 | 	}
140 | 
141 | 
142 | 	if (SAMETYPE(T, Int2Type)) {
143 | 		device_res->target_batch_start[tid] = i;
144 | 		device_res->query_batch_start[tid] = j;
145 | 	}
146 | 	query_batch_lens[tid] = n_ops;
147 | 
148 | 
149 | }
150 | #endif
--------------------------------------------------------------------------------
/src/gasal.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_H__
 2 | #define __GASAL_H__
 3 | 
 4 | 
 5 | #include <stdlib.h>
 6 | #include <stdint.h>
 7 | 
 8 | 
 9 | #include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cuda_runtime.h"
10 | 
11 | #ifndef HOST_MALLOC_SAFETY_FACTOR
12 | #define HOST_MALLOC_SAFETY_FACTOR 5
13 | #endif
14 | 
15 | #define CHECKCUDAERROR(error) \
16 | 		do{\
17 | 			err = error;\
18 | 			if (cudaSuccess != err ) { \
19 | 				fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err,  __LINE__, __FILE__); \
20 | 				exit(EXIT_FAILURE);\
21 | 			}\
22 | 		}while(0)\
23 | 
24 | 
25 | inline int CudaCheckKernelLaunch()
26 | {
27 | 	cudaError err = cudaGetLastError();
28 | 	if ( cudaSuccess != err )
29 | 	{
30 | 		return -1;
31 | 	}
32 | 
33 | 	return 0;
34 | }
35 | 
36 | 
37 | enum comp_start{
38 | 	WITHOUT_START,
39 | 	WITH_START,
40 | 	WITH_TB
41 | };
42 | 
43 | // Generic enum for true/false. Using this instead of bool to generalize templates out of Int values for secondBest.
44 | // Can be used more generically, for example for WITH_/WITHOUT_START.
45 | enum Bool{
46 | 	FALSE,
47 | 	TRUE
48 | };
49 | 
50 | enum data_source{
51 | 	NONE,
52 | 	QUERY,
53 | 	TARGET,
54 | 	BOTH
55 | };
56 | 
57 | enum algo_type{
58 | 	UNKNOWN,
59 | 	GLOBAL,
60 | 	SEMI_GLOBAL,
61 | 	LOCAL,
62 | 	MICROLOCAL,
63 | 	BANDED,
64 | 	KSW
65 | };
66 | 
67 | enum operation_on_seq{
68 | 	FORWARD_NATURAL,
69 | 	REVERSE_NATURAL,
70 | 	FORWARD_COMPLEMENT,
71 | 	REVERSE_COMPLEMENT,
72 | };
73 | 
74 | // data structure of linked list to allow extension of memory on host side.
75 | struct host_batch{
76 | 	uint8_t *data;
77 | 	uint32_t page_size;
78 | 	uint32_t data_size;
79 | 	uint32_t offset;
80 | 	int is_locked;
81 | 	struct host_batch* next;
82 | };
83 | typedef struct host_batch host_batch_t;
84 | 
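// Minimal traversal sketch: the pages form a singly-linked chain, so the bytes
// currently filled across a batch are the sum of data_size over the chain.
// The helper below is hypothetical (not part of the GASAL2 API), shown only to
// illustrate how the chain is meant to be walked.
static inline uint32_t host_batch_filled_bytes(const host_batch_t *p)
{
	uint32_t total = 0;
	for (; p != NULL; p = p->next)
		total += p->data_size;
	return total;
}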
85 | // Data structure to hold results. Can be instantiated for host or device memory (see res.cpp).
86 | struct gasal_res{
87 | 	int32_t *aln_score;
88 | 	int32_t *query_batch_end;
89 | 	int32_t *target_batch_end;
90 | 	int32_t *query_batch_start;
91 | 	int32_t *target_batch_start;
92 | 	uint8_t *cigar;
93 | 	uint32_t *n_cigar_ops;
94 | };
95 | typedef struct gasal_res gasal_res_t;
96 | 
97 | //stream data
98 | typedef struct {
99 | 	uint8_t *unpacked_query_batch;
100 | 	uint8_t *unpacked_target_batch;
101 | 	uint32_t *packed_query_batch;
102 | 	uint32_t *packed_target_batch;
103 | 	uint32_t *query_batch_offsets;
104 | 	uint32_t *target_batch_offsets;
105 | 	uint32_t *query_batch_lens;
106 | 	uint32_t *target_batch_lens;
107 | 
108 | 	uint32_t *host_seed_scores;
109 | 	uint32_t *seed_scores;
110 | 
111 | 	host_batch_t *extensible_host_unpacked_query_batch;
112 | 	host_batch_t *extensible_host_unpacked_target_batch;
113 | 
114 | 	uint8_t *host_query_op;
115 | 	uint8_t *host_target_op;
116 | 	uint8_t *query_op;
117 | 	uint8_t *target_op;
118 | 
119 | 	uint32_t *host_query_batch_offsets;
120 | 	uint32_t *host_target_batch_offsets;
121 | 	uint32_t *host_query_batch_lens;
122 | 	uint32_t *host_target_batch_lens;
123 | 
124 | 	gasal_res_t *host_res; // the results that can be read on host - THE STRUCT IS ON HOST SIDE, ITS CONTENT IS ON HOST SIDE.
125 | 	gasal_res_t *device_cpy; // a struct that contains the pointers to the device side - THE STRUCT IS ON HOST SIDE, but the CONTENT is malloc'd on and points to the DEVICE SIDE
126 | 	gasal_res_t *device_res; // the results that are written on device - THE STRUCT IS ON DEVICE SIDE, ITS CONTENT POINTS TO THE DEVICE SIDE.
127 | 
128 | 	gasal_res_t *host_res_second;
129 | 	gasal_res_t *device_res_second;
130 | 	gasal_res_t *device_cpy_second;
131 | 
132 | 	uint32_t gpu_max_query_batch_bytes;
133 | 	uint32_t gpu_max_target_batch_bytes;
134 | 
135 | 	uint32_t host_max_query_batch_bytes;
136 | 	uint32_t host_max_target_batch_bytes;
137 | 
138 | 	uint32_t gpu_max_n_alns;
139 | 	uint32_t host_max_n_alns;
140 | 	uint32_t current_n_alns;
141 | 
142 | 	uint64_t packed_tb_matrix_size;
143 | 	uint4 *packed_tb_matrices;
144 | 
145 | 
146 | 	cudaStream_t str;
147 | 	int is_free;
148 | 	int id; //this can be useful in cases where a gasal_gpu_storage only contains PARTS of an alignment (like a seed-extension...), to gather results.
149 | 
150 | } gasal_gpu_storage_t;
151 | 
152 | //vector of streams
153 | typedef struct {
154 | 	int n;
155 | 	gasal_gpu_storage_t *a;
156 | }gasal_gpu_storage_v;
157 | 
158 | 
159 | //match/mismatch and gap penalties
160 | typedef struct{
161 | 	int32_t match;
162 | 	int32_t mismatch;
163 | 	int32_t gap_open;
164 | 	int32_t gap_extend;
165 | } gasal_subst_scores;
166 | 
167 | 
168 | #endif
--------------------------------------------------------------------------------
/src/interfaces.cpp:
--------------------------------------------------------------------------------
 1 | #include "gasal.h"
 2 | #include "args_parser.h"
 3 | #include "interfaces.h"
 4 | #include "res.h"
 5 | 
 6 | 
 7 | // Function for general resizing
 8 | template <typename T>
 9 | T* cudaHostRealloc(void *source, int new_size, int old_size)
10 | {
11 | 	cudaError_t err;
12 | 	T* destination = NULL;
13 | 	if (new_size < old_size)
14 | 	{
15 | 		fprintf(stderr, "[GASAL ERROR] cudaHostRealloc: invalid sizes.
New size < old size (%d < %d)", new_size, old_size); 16 | exit(EXIT_FAILURE); 17 | } 18 | CHECKCUDAERROR(cudaHostAlloc(&destination, new_size * sizeof(T), cudaHostAllocMapped)); 19 | //fprintf(stderr, "\ndest=%p\tsrc=%p", destination, source); 20 | CHECKCUDAERROR(cudaMemcpy(destination, source, old_size * sizeof(T), cudaMemcpyHostToHost)); 21 | CHECKCUDAERROR(cudaFreeHost(source)); 22 | return destination; 23 | }; 24 | 25 | // Realloc new fields when more alignments are added. 26 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params) 27 | { 28 | /* // Don't reallocate the extensible batches. They're extensible. 29 | gpu_storage->extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0); 30 | gpu_storage->extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0); 31 | */ 32 | /* // don't realloc gpu-sided batches as they will be taken care of before aligning. 33 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 34 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 35 | */ 36 | 37 | fprintf(stderr, "[GASAL WARNING] Resizing gpu_storage from %d sequences to %d sequences... ", gpu_storage->host_max_n_alns,new_max_alns); 38 | // don't care about realloc'ing gpu-sided fields as they will be taken care of before aligning. 39 | 40 | gpu_storage->host_query_op = cudaHostRealloc((void*) gpu_storage->host_query_op, new_max_alns, gpu_storage->host_max_n_alns); 41 | gpu_storage->host_target_op = cudaHostRealloc((void*) gpu_storage->host_target_op, new_max_alns, gpu_storage->host_max_n_alns); 42 | 43 | if (params->algo == KSW) 44 | gpu_storage->host_seed_scores = cudaHostRealloc(gpu_storage->host_seed_scores, new_max_alns, gpu_storage->host_max_n_alns); 45 | //fprintf(stderr, "_ops done "); 46 | 47 | gpu_storage->host_query_batch_lens = cudaHostRealloc((void*) gpu_storage->host_query_batch_lens, new_max_alns, gpu_storage->host_max_n_alns); 48 | gpu_storage->host_target_batch_lens = cudaHostRealloc((void*) gpu_storage->host_target_batch_lens, new_max_alns, gpu_storage->host_max_n_alns); 49 | //fprintf(stderr, "_lens done "); 50 | 51 | gpu_storage->host_query_batch_offsets = cudaHostRealloc((void*) gpu_storage->host_query_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns); 52 | gpu_storage->host_target_batch_offsets = cudaHostRealloc((void*) gpu_storage->host_target_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns); 53 | //fprintf(stderr, "_offsets done "); 54 | 55 | gasal_res_destroy_host(gpu_storage->host_res); 56 | gpu_storage->host_res = gasal_res_new_host(new_max_alns, params); 57 | gpu_storage->device_cpy = gasal_res_new_device_cpy(new_max_alns, params); 58 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 59 | 60 | if (params->secondBest) 61 | { 62 | gasal_res_destroy_host(gpu_storage->host_res_second); 63 | gpu_storage->host_res_second = gasal_res_new_host(new_max_alns, params); 64 | gpu_storage->device_cpy_second = gasal_res_new_device_cpy(new_max_alns, params); 65 | gpu_storage->device_res_second = gasal_res_new_device(gpu_storage->device_cpy_second); 66 | 67 | } else { 68 | gpu_storage->host_res_second = NULL; 69 | gpu_storage->device_cpy_second = NULL; 70 | gpu_storage->device_res_second = NULL; 71 | } 72 | 73 | //fprintf(stderr, "_res done "); 74 | 75 | gpu_storage->host_max_n_alns = new_max_alns; 76 | 
//gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
77 | 	fprintf(stderr, " done. This can harm performance.\n");
78 | }
79 | 
80 | // operation (Reverse/complement) filler.
81 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC)
82 | {
83 | 	uint8_t *host_op = NULL;
84 | 	switch(SRC)
85 | 	{
86 | 		case QUERY:
87 | 			host_op = (gpu_storage_t->host_query_op);
88 | 			break;
89 | 		case TARGET:
90 | 			host_op = (gpu_storage_t->host_target_op);
91 | 			break;
92 | 		default:
93 | 			break;
94 | 	}
95 | 	memcpy(host_op, data, nbr_seqs_in_stream);
96 | }
97 | 
98 | void gasal_set_device(int gpu_select, bool isPrintingProp)
99 | {
100 | 	/*
101 | 		Select GPU
102 | 	*/
103 | 	if (isPrintingProp)
104 | 	{
105 | 		int num_devices, device;
106 | 		cudaGetDeviceCount(&num_devices);
107 | 		fprintf(stderr, "Found %d GPUs\n", num_devices);
108 | 		if (gpu_select > num_devices-1)
109 | 		{
110 | 			fprintf(stderr, "Error: can't select device %d when only %d devices are available (range from 0 to %d)\n", gpu_select, num_devices, num_devices-1);
111 | 			exit(EXIT_FAILURE);
112 | 		}
113 | 		if (num_devices > 0) {
114 | 			cudaDeviceProp properties;
115 | 			for (device = 0; device < num_devices; device++) {
116 | 				cudaGetDeviceProperties(&properties, device);
117 | 				fprintf(stderr, "\tGPU %d: %s\n", device, properties.name);
118 | 			}
119 | 			cudaGetDeviceProperties(&properties, gpu_select);
120 | 			fprintf(stderr, "Selected device %d : %s\n", gpu_select, properties.name);
121 | 			cudaSetDevice(gpu_select);
122 | 		}
123 | 	} else {
124 | 		// silently select device
125 | 		cudaSetDevice(gpu_select);
126 | 	}
127 | 
128 | }
--------------------------------------------------------------------------------
/src/res.cpp:
--------------------------------------------------------------------------------
 1 | #include "gasal.h"
 2 | 
 3 | #include "args_parser.h"
 4 | 
 5 | #include "res.h"
 6 | 
 7 | 
 8 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params)
 9 | {
10 | 	cudaError_t err;
11 | 	gasal_res_t *res = NULL;
12 | 
13 | 
14 | 	res = (gasal_res_t *)calloc(1, sizeof(gasal_res_t)); // zero-initialized, so pointer fields that are never allocated stay NULL for gasal_res_destroy_host
15 | 
16 | 	if (res == NULL)
17 | 	{
18 | 		fprintf(stderr, "Malloc error on res host\n");
19 | 		exit(1);
20 | 	}
21 | 
22 | 	CHECKCUDAERROR(cudaHostAlloc(&(res->aln_score), max_n_alns * sizeof(int32_t),cudaHostAllocDefault));
23 | 
24 | 
25 | 
26 | 	if (params->algo == GLOBAL) {
27 | 		res->query_batch_start = NULL;
28 | 		res->target_batch_start = NULL;
29 | 		res->query_batch_end = NULL;
30 | 		res->target_batch_end = NULL;
31 | 	/*
32 | 	// Deprecated. For semi-global you now need to know the start and stop positions.
33 | 	} else if (params->algo == SEMI_GLOBAL) {
34 | 		res->host_query_batch_start = NULL;
35 | 		res->host_query_batch_end = NULL;
36 | 		res->query_batch_start = NULL;
37 | 		res->query_batch_end = NULL;
38 | 
39 | 		if (params->start_pos == WITH_START) {
40 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_start),max_n_alns * sizeof(uint32_t)));
41 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_end),max_n_alns * sizeof(uint32_t)));
42 | 
43 | 			CHECKCUDAERROR(cudaMalloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t)));
44 | 			CHECKCUDAERROR(
45 | 			cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
46 | 		} else {
47 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_end),max_n_alns * sizeof(uint32_t)));
48 | 			CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
49 | 			res->host_target_batch_start = NULL;
50 | 			res->target_batch_start = NULL;
51 | 		}
52 | 	*/
53 | 	} else {
54 | 		if (params->start_pos == WITH_START || params->start_pos == WITH_TB) {
55 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_start),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
56 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
57 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
58 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
59 | 
60 | 		} else {
61 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
62 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
63 | 			res->query_batch_start = NULL;
64 | 			res->target_batch_start = NULL;
65 | 		}
66 | 
67 | 	}
68 | 	if (params->start_pos == WITH_TB) {
69 | 		CHECKCUDAERROR(cudaHostAlloc(&(res->n_cigar_ops), max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
70 | 	}
71 | 
72 | 	return res;
73 | }
74 | 
75 | 
76 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy)
77 | {
78 | 	cudaError_t err;
79 | 
80 | 
81 | 
82 | 	// create class storage on device and copy top level class
83 | 	gasal_res_t *d_c;
84 | 	CHECKCUDAERROR(cudaMalloc((void **)&d_c, sizeof(gasal_res_t)));
85 | 	// CHECKCUDAERROR(cudaMemcpy(d_c, res, sizeof(gasal_res_t), cudaMemcpyHostToDevice));
86 | 
87 | 
88 | 
89 | 	// copy pointer to allocated device storage to device class
90 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->aln_score), &(device_cpy->aln_score), sizeof(int32_t*), cudaMemcpyHostToDevice));
91 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_start), &(device_cpy->query_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice));
92 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_start), &(device_cpy->target_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice));
93 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_end), &(device_cpy->query_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice));
94 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_end), &(device_cpy->target_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice));
95 | 
96 | 
97 | 
98 | 
99 | 
100 | 	return d_c;
101 | }
102 | 
103 | 
104 | 
105 | 
106 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params)
107 | {
108 | 	cudaError_t err;
109 | 	gasal_res_t *res;
110 | 
111 | 	res = (gasal_res_t *)calloc(1, sizeof(gasal_res_t)); // zero-initialized, so cigar/n_cigar_ops stay NULL unless allocated (gasal_res_destroy_device checks them)
112 | 
113 | 	CHECKCUDAERROR(cudaMalloc(&(res->aln_score), max_n_alns * sizeof(int32_t)));
114 | 
115 | 	if (params->algo == GLOBAL) {
116 | 		res->query_batch_start = NULL;
117 | 
res->target_batch_start = NULL; 118 | res->query_batch_end = NULL; 119 | res->target_batch_end = NULL; 120 | 121 | } else { 122 | if (params->start_pos == WITH_START || params->start_pos == WITH_TB) { 123 | 124 | CHECKCUDAERROR(cudaMalloc(&(res->query_batch_start),max_n_alns * sizeof(uint32_t))); 125 | CHECKCUDAERROR(cudaMalloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t))); 126 | CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t))); 127 | CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t))); 128 | 129 | } else { 130 | 131 | CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t))); 132 | CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t))); 133 | 134 | res->query_batch_start = NULL; 135 | res->target_batch_start = NULL; 136 | } 137 | 138 | } 139 | return res; 140 | } 141 | 142 | // TODO : make 2 destroys for host and device 143 | void gasal_res_destroy_host(gasal_res_t *res) 144 | { 145 | cudaError_t err; 146 | if (res == NULL) 147 | return; 148 | 149 | 150 | if (res->aln_score != NULL) CHECKCUDAERROR(cudaFreeHost(res->aln_score)); 151 | if (res->query_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_start)); 152 | if (res->target_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_start)); 153 | if (res->query_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_end)); 154 | if (res->target_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_end)); 155 | if (res->n_cigar_ops != NULL) CHECKCUDAERROR(cudaFreeHost(res->n_cigar_ops)); 156 | 157 | free(res); 158 | } 159 | 160 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy) 161 | { 162 | cudaError_t err; 163 | if (device_cpy == NULL || device_res == NULL) 164 | return; 165 | 166 | if (device_cpy->aln_score != NULL) CHECKCUDAERROR(cudaFree(device_cpy->aln_score)); 167 | if (device_cpy->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_start)); 168 | if (device_cpy->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_start)); 169 | if (device_cpy->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_end)); 170 | if (device_cpy->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_end)); 171 | if (device_cpy->cigar != NULL) CHECKCUDAERROR(cudaFree(device_cpy->cigar)); 172 | 173 | 174 | CHECKCUDAERROR(cudaFree(device_res)); 175 | 176 | free(device_cpy); 177 | } 178 | -------------------------------------------------------------------------------- /src/gasal_align.h: -------------------------------------------------------------------------------- 1 | #ifndef __GASAL_ALIGN_H__ 2 | #define __GASAL_ALIGN_H__ 3 | /* #################################################################################### 4 | SEMI_GLOBAL Kernels generation - read from the bottom one, all the way up. 
(the most specialized ones are written before the ones that call them) 5 | #################################################################################### 6 | */ 7 | #define SEMIGLOBAL_KERNEL_CALL(a,s,h,t,b) \ 8 | case t:\ 9 | {\ 10 | gasal_semi_global_kernel, Int2Type, Int2Type, Int2Type, Int2Type><<str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns);\ 11 | break;\ 12 | }\ 13 | 14 | #define SWITCH_SEMI_GLOBAL_TAIL(a,s,h,t,b) \ 15 | case h:\ 16 | switch(t) { \ 17 | SEMIGLOBAL_KERNEL_CALL(a,s,h,NONE,b)\ 18 | SEMIGLOBAL_KERNEL_CALL(a,s,h,QUERY,b)\ 19 | SEMIGLOBAL_KERNEL_CALL(a,s,h,TARGET,b)\ 20 | SEMIGLOBAL_KERNEL_CALL(a,s,h,BOTH,b)\ 21 | }\ 22 | break; 23 | 24 | #define SWITCH_SEMI_GLOBAL_HEAD(a,s,h,t,b) \ 25 | case s:\ 26 | switch(h) { \ 27 | SWITCH_SEMI_GLOBAL_TAIL(a,s,NONE,t,b)\ 28 | SWITCH_SEMI_GLOBAL_TAIL(a,s,QUERY,t,b)\ 29 | SWITCH_SEMI_GLOBAL_TAIL(a,s,TARGET,t,b)\ 30 | SWITCH_SEMI_GLOBAL_TAIL(a,s,BOTH,t,b)\ 31 | } \ 32 | break; 33 | 34 | 35 | /* #################################################################################### 36 | ALGORITHMS Kernels generation. Allows to have a single line written for all kernels calls. The switch-cases are MACRO-generated. 37 | #################################################################################### 38 | */ 39 | 40 | #define SWITCH_SEMI_GLOBAL(a,s,h,t,b) SWITCH_SEMI_GLOBAL_HEAD(a,s,h,t,b) 41 | 42 | #define SWITCH_LOCAL(a,s,h,t,b) \ 43 | case s: {\ 44 | gasal_local_kernel, Int2Type, Int2Type><<str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns); \ 45 | if(s == WITH_TB) {\ 46 | cudaError_t aln_kernel_err = cudaGetLastError();\ 47 | if ( cudaSuccess != aln_kernel_err )\ 48 | {\ 49 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__);\ 50 | exit(EXIT_FAILURE);\ 51 | }\ 52 | gasal_get_tb><<str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns);\ 53 | }\ 54 | break;\ 55 | }\ 56 | 57 | #define SWITCH_GLOBAL(a,s,h,t,b) \ 58 | case s:{\ 59 | gasal_global_kernel><<str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->packed_tb_matrices, actual_n_alns);\ 60 | if(s == WITH_TB) {\ 61 | cudaError_t aln_kernel_err = cudaGetLastError();\ 62 | if ( cudaSuccess != aln_kernel_err )\ 63 | {\ 64 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. 
%d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__);\ 65 | exit(EXIT_FAILURE);\ 66 | }\ 67 | gasal_get_tb><<str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns);\ 68 | }\ 69 | break;\ 70 | }\ 71 | 72 | 73 | #define SWITCH_KSW(a,s,h,t,b) \ 74 | case s:\ 75 | gasal_ksw_kernel><<str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->seed_scores, gpu_storage->device_res, gpu_storage->device_res_second, actual_n_alns);\ 76 | break; 77 | 78 | #define SWITCH_BANDED(a,s,h,t,b) \ 79 | case s:\ 80 | gasal_banded_tiled_kernel<<str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, actual_n_alns, k_band>>3); \ 81 | break; 82 | 83 | /* #################################################################################### 84 | RUN PARAMETERS calls : general call (bottom, should be used), and first level TRUE/FALSE calculation for second best, 85 | then 2nd level WITH / WITHOUT_START switch call (top) 86 | #################################################################################### 87 | */ 88 | 89 | #define SWITCH_START(a,s,h,t,b) \ 90 | case b: \ 91 | switch(s){\ 92 | SWITCH_## a(a,WITH_START,h,t,b)\ 93 | SWITCH_## a(a,WITHOUT_START,h,t,b)\ 94 | SWITCH_## a(a,WITH_TB,h,t,b)\ 95 | } \ 96 | break; 97 | 98 | #define SWITCH_SECONDBEST(a,s,h,t,b) \ 99 | switch(b) { \ 100 | SWITCH_START(a,s,h,t,TRUE)\ 101 | SWITCH_START(a,s,h,t,FALSE)\ 102 | } 103 | 104 | #define KERNEL_SWITCH(a,s,h,t,b) \ 105 | case a:\ 106 | SWITCH_SECONDBEST(a,s,h,t,b)\ 107 | break; 108 | 109 | 110 | /* // Deprecated 111 | void gasal_aln(gasal_gpu_storage_t *gpu_storage, const uint8_t *query_batch, const uint32_t *query_batch_offsets, const uint32_t *query_batch_lens, const uint8_t *target_batch, const uint32_t *target_batch_offsets, const uint32_t *target_batch_lens, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, int32_t *host_aln_score, int32_t *host_query_batch_start, int32_t *host_target_batch_start, int32_t *host_query_batch_end, int32_t *host_target_batch_end, algo_type algo, comp_start start, int32_t k_band); 112 | */ 113 | 114 | void gasal_copy_subst_scores(gasal_subst_scores *subst); 115 | 116 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params); 117 | 118 | inline void gasal_kernel_launcher(int32_t N_BLOCKS, int32_t BLOCKDIM, algo_type algo, comp_start start, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns, int32_t k_band); 119 | 120 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage); 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /src/host_batch.cpp: -------------------------------------------------------------------------------- 1 | #include "gasal.h" 2 | #include "args_parser.h" 3 | #include "interfaces.h" 4 | #include "host_batch.h" 5 | 6 | 7 | 8 | 9 | // Functions for host batches handling. 
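//
// Usage sketch (hypothetical caller code, not part of this file): a batch is
// filled sequence by sequence, and gasal_host_batch_fill() returns the next
// write index, padded with N_CODE bytes so every sequence ends 8-byte aligned:
//
//     uint32_t idx = 0;
//     idx = gasal_host_batch_fill(gpu_storage, idx, seq, seq_len, QUERY);
//
// gasal_host_batch_reset() rewinds data_size/offset/is_locked on every page
// before the stream is reused for the next batch.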
10 | 
11 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset)
12 | {
13 | 	cudaError_t err;
14 | 	host_batch_t *res = (host_batch_t *)calloc(1, sizeof(host_batch_t));
15 | 	CHECKCUDAERROR(cudaHostAlloc(&(res->data), batch_bytes*sizeof(uint8_t), cudaHostAllocDefault));
16 | 	res->page_size = batch_bytes;
17 | 	res->data_size = 0;
18 | 	res->is_locked = 0;
19 | 	res->offset = offset;
20 | 	res->next = NULL;
21 | 	return res;
22 | }
23 | 
24 | void gasal_host_batch_destroy(host_batch_t *res)
25 | {
26 | 	cudaError_t err;
27 | 	if (res==NULL)
28 | 	{
29 | 		fprintf(stderr, "[GASAL ERROR] Trying to free a NULL pointer\n");
30 | 		exit(1);
31 | 	}
32 | 	// recursive function to destroy the whole linked list
33 | 	if (res->next != NULL)
34 | 		gasal_host_batch_destroy(res->next);
35 | 	if (res->data != NULL)
36 | 	{
37 | 		CHECKCUDAERROR(cudaFreeHost(res->data));
38 | 	}
39 | 
40 | 	free(res);
41 | }
42 | 
43 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg)
44 | {
45 | 	return (arg->next == NULL ? arg : gasal_host_batch_getlast(arg->next) );
46 | 
47 | }
48 | 
49 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage)
50 | {
51 | 	// reset all batch idx and data occupation
52 | 	host_batch_t *cur_page = NULL;
53 | 	for(int i = 0; i < 2; i++) {
54 | 
55 | 		switch(i) {
56 | 			case 0:
57 | 				cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
58 | 				break;
59 | 			case 1:
60 | 				cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
61 | 				break;
62 | 			default:
63 | 				break;
64 | 		}
65 | 		while(cur_page != NULL)
66 | 		{
67 | 			cur_page->data_size = 0;
68 | 			cur_page->offset = 0;
69 | 			cur_page->is_locked = 0;
70 | 			cur_page = cur_page->next;
71 | 		}
72 | 	}
73 | 	//fprintf(stderr, "[GASAL INFO] Batch reset.\n");
74 | 
75 | }
76 | 
77 | 
78 | // TODO: make a template... now that you started to go the C++/template way, just stick to it.
79 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC)
80 | {
81 | 	// since query and target are very symmetric here, we use pointers to route the data where it has to,
82 | 	// while keeping the actual memory management 'source-agnostic'.
83 | 
84 | 	host_batch_t *cur_page = NULL;
85 | 	uint32_t *p_batch_bytes = NULL;
86 | 
87 | 	switch(SRC) {
88 | 		case QUERY:
89 | 			cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
90 | 			p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes);
91 | 			break;
92 | 		case TARGET:
93 | 			cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
94 | 			p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes);
95 | 			break;
96 | 		default:
97 | 			break;
98 | 	}
99 | 
100 | 	int nbr_N = 0;
101 | 	while((size+nbr_N)%8)
102 | 		nbr_N++;
103 | 
104 | 	while(cur_page->is_locked)
105 | 		cur_page = cur_page->next;
106 | 
107 | 	if (cur_page->next == NULL && cur_page->page_size - cur_page->data_size < size + nbr_N)
108 | 	{
109 | 		fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes while only %d remain (%s) (block size %d, filled %d bytes).\n Allocating a new block of size %d, total size available reaches %d. Doing this repeatedly slows down the execution.\n",
110 | 				size + nbr_N,
111 | 				cur_page->page_size - cur_page->data_size,
112 | 				(SRC == QUERY ?
"query":"target"), 113 | cur_page->page_size, 114 | cur_page->data_size, 115 | cur_page->page_size * 2, 116 | *p_batch_bytes + cur_page->page_size * 2); 117 | 118 | host_batch_t *res = gasal_host_batch_new(cur_page->page_size * 2, cur_page->offset + cur_page->data_size); 119 | cur_page->next = res; 120 | cur_page->is_locked = 1; 121 | *p_batch_bytes = *p_batch_bytes + cur_page->page_size * 2; 122 | 123 | cur_page = cur_page->next; 124 | //fprintf(stderr, "CREATED: "); gasal_host_batch_print(cur_page); 125 | } 126 | 127 | if (cur_page->next != NULL && cur_page->page_size - cur_page->data_size < size + nbr_N) 128 | { 129 | // re-write offset for the next page to correspond to what has been filled on the current page. 130 | cur_page->next->offset = cur_page->offset + cur_page->data_size; 131 | cur_page->is_locked = 1; 132 | // then, jump to next page 133 | cur_page = cur_page->next; 134 | } 135 | 136 | 137 | if (cur_page->page_size - cur_page->data_size >= size + nbr_N) 138 | { 139 | // fprintf(stderr, "FILL: "); gasal_host_batch_print(cur_page); 140 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 141 | 142 | for(int i = 0; i < nbr_N; i++) 143 | { 144 | cur_page->data[idx + size - cur_page->offset + i] = N_CODE; 145 | } 146 | idx = idx + size + nbr_N; 147 | 148 | cur_page->data_size += size + nbr_N; 149 | //is_done = 1; 150 | } 151 | 152 | return idx; 153 | } 154 | 155 | 156 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC ) 157 | { 158 | return gasal_host_batch_add(gpu_storage, idx, &base, 1, SRC ); 159 | } 160 | 161 | 162 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC ) 163 | { 164 | 165 | // since query and target are very symmetric here, we use pointers to route the data where it has to, 166 | // while keeping the actual memory management 'source-agnostic'. 167 | host_batch_t *cur_page = NULL; 168 | uint32_t *p_batch_bytes = NULL; 169 | 170 | 171 | switch(SRC) { 172 | case QUERY: 173 | cur_page = (gpu_storage->extensible_host_unpacked_query_batch); 174 | p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes); 175 | break; 176 | case TARGET: 177 | cur_page = (gpu_storage->extensible_host_unpacked_target_batch); 178 | p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes); 179 | break; 180 | default: 181 | break; 182 | } 183 | 184 | int is_done = 0; 185 | 186 | while (!is_done) 187 | { 188 | if (*p_batch_bytes >= idx + size && (cur_page->next == NULL || (cur_page->next->offset >= idx + size)) ) 189 | { 190 | 191 | memcpy(&(cur_page->data[idx - cur_page->offset]), data, size); 192 | idx = idx + size; 193 | is_done = 1; 194 | 195 | } else if ((*p_batch_bytes >= idx + size) && (cur_page->next != NULL) && (cur_page->next->offset < idx + size)) { 196 | 197 | cur_page = cur_page->next; 198 | 199 | } else { 200 | fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes at position %d on host memory (%s) while only %d bytes are available. Therefore, allocating %d bytes more on CPU. Repeating this many times can provoke a degradation of performance.\n", 201 | size, 202 | idx, 203 | (SRC == QUERY ? "query":"target"), 204 | *p_batch_bytes, 205 | *p_batch_bytes * 2); 206 | 207 | 208 | *p_batch_bytes += *p_batch_bytes; 209 | 210 | // corner case: if we allocated less than a single sequence length to begin with... it shouldn't be allowed actually, but at least it's caught here. 
211 | 			while (*p_batch_bytes < size) 212 | 				*p_batch_bytes += *p_batch_bytes; 213 | 214 | 			host_batch_t *res = gasal_host_batch_new(*p_batch_bytes, idx); 215 | 216 | 			cur_page->next = res; 217 | 218 | 			cur_page = cur_page->next; 219 | 		} 220 | 	} 221 | 	//gasal_host_batch_printall(gasal_host_batch_getlast(cur_page)); 222 | 	return idx; 223 | } 224 | 225 | 226 | 227 | // this printer displays the metadata of a single page (offset, data size, page size). 228 | void gasal_host_batch_print(host_batch_t *res) 229 | { 230 | 	fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n", 231 | 		res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size); 232 | } 233 | 234 | // this printer walks the chain, making it easy to inspect the whole linked list. 235 | void gasal_host_batch_printall(host_batch_t *res) 236 | { 237 | 	fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n", 238 | 		res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size); 239 | 	if (res->next != NULL) 240 | 	{ 241 | 		fprintf(stderr, "+--->"); 242 | 		gasal_host_batch_printall(res->next); 243 | 	} 244 | } 245 | -------------------------------------------------------------------------------- /src/kernels/ksw_kernel_template.h: -------------------------------------------------------------------------------- 1 | #ifndef __KSW_KERNEL_TEMPLATE__ 2 | #define __KSW_KERNEL_TEMPLATE__ 3 | 4 | 5 | // This old core provides the same result as the current LOCAL core, but lacks some optimizations. Left for historical / comparative purposes. 6 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \ 7 | 		uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence */ \ 8 | 		DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */ \ 9 | 		f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \ 10 | 		h[m] = p[m] + subScore; /*score if rbase is aligned to gbase*/ \ 11 | 		h[m] = max(h[m], f[m]); \ 12 | 		h[m] = max(h[m], 0); \ 13 | 		e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\ 14 | 		h[m] = max(h[m], e); \ 15 | 		maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \ 16 | 		maxHH = (maxHH < h[m]) ? h[m] : maxHH; \ 17 | 		p[m] = h[m-1]; 18 | 19 | 20 | #define PEN_CLIP5 (5) 21 | #define TILE_SIDE (8) 22 | 23 | /* typename meaning : 24 | 	- B is for computing the Second Best Score. Its values are on the enum FALSE(0)/TRUE(1). 25 | 	(sidenote: it's based on an enum instead of a bool in order to generalize its type from its Int value, with the Int2Type meta-programming template) 26 | */ 27 | /* 28 | //! Note from the bwa-gasal2 coder : I failed to understand it, so I copied it. 29 | //! You can say to me... 30 | You cheated not only the game, but yourself. 31 | 32 | You didn't grow. 33 | You didn't improve. 34 | You took a shortcut and gained nothing. 35 | 36 | You experienced a hollow victory. 37 | Nothing was risked and nothing was gained. 38 | 39 | It's sad that you don't know the difference.
40 | */ 41 | 42 | typedef struct { 43 | 	int32_t h, e; 44 | } eh_t; 45 | 46 | template <typename B> 47 | __global__ void gasal_ksw_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint32_t *seed_score, gasal_res_t *device_res, gasal_res_t *device_res_second, int n_tasks) 48 | { 49 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 50 | 	if (tid >= n_tasks) return; 51 | 52 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3; //starting index of the target_batch sequence 53 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence 54 | 	uint32_t qlen = query_batch_lens[tid]; 55 | 	uint32_t tlen = target_batch_lens[tid]; 56 | 	uint32_t query_batch_regs = (qlen >> 3) + 1;//(qlen >> 3) + (qlen & 0b0111 ? 1 : 0);//number of 32-bit words holding query_batch sequence 57 | 	uint32_t target_batch_regs = (tlen >> 3) + 1;//(tlen >> 3) + (tlen & 0b0111 ? 1 : 0);//number of 32-bit words holding target_batch sequence 58 | 	uint32_t h0 = seed_score[tid]; 59 | 	int32_t subScore; 60 | 	uint32_t target_tile_id, target_base_id, query_tile_id, query_base_id; 61 | 	uint32_t gpac, rpac, gbase, rbase; 62 | 	int zdrop = 0; 63 | 64 | 	int o_del = _cudaGapO; 65 | 	int o_ins = _cudaGapO; 66 | 	int e_del = _cudaGapExtend; 67 | 	int e_ins = _cudaGapExtend; 68 | 69 | 	eh_t eh[MAX_QUERY_LEN]; // score array 70 | 	int i, j, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ie, gscore, max_off; 71 | 	for (i = 0; i < MAX_QUERY_LEN; i++) 72 | 	{ 73 | 		eh[i].h = 0; 74 | 		eh[i].e = 0; 75 | 	} 76 | 77 | 	// fill the first row 78 | 	eh[0].h = h0; 79 | 	eh[1].h = h0 > oe_ins ?
h0 - oe_ins : 0; 80 | for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j) 81 | eh[j].h = eh[j - 1].h - e_ins; 82 | 83 | // DP loop 84 | max = h0, max_i = max_j = -1; 85 | max_ie = -1, gscore = -1; 86 | max_off = 0; 87 | beg = 0, end = qlen; 88 | 89 | for (target_tile_id = 0; target_tile_id < target_batch_regs; target_tile_id++) //target_batch sequence in rows 90 | { 91 | gpac = packed_target_batch[packed_target_batch_idx + target_tile_id];//load 8 packed bases from target_batch sequence 92 | 93 | for (target_base_id = 0; target_base_id < TILE_SIDE; target_base_id++) 94 | { 95 | 96 | i = target_tile_id * TILE_SIDE + target_base_id; 97 | 98 | if (i >= tlen) // skip padding 99 | break; 100 | 101 | gbase = (gpac >> (32 - (target_base_id+1)*4 )) & 0x0F; /* get a base from target_batch sequence */ 102 | 103 | int t, f = 0, h1, m = 0, mj = -1; 104 | // compute the first column 105 | if (beg == 0) { 106 | h1 = h0 - (o_del + e_del * (i + 1)); 107 | if (h1 < 0) 108 | h1 = 0; 109 | } else 110 | h1 = 0; 111 | 112 | 113 | for(query_tile_id = 0; (query_tile_id < query_batch_regs); query_tile_id++) 114 | { 115 | rpac = packed_query_batch[packed_query_batch_idx + query_tile_id];//load 8 bases from query_batch sequence 116 | 117 | for(query_base_id = 0; (query_base_id < TILE_SIDE); query_base_id++) 118 | { 119 | j = query_tile_id * TILE_SIDE + query_base_id; 120 | if (j < beg) 121 | continue; 122 | if (j >= end) 123 | break; 124 | 125 | rbase = (rpac >> (32 - (query_base_id+1)*4 )) & 0x0F;//get a base from query_batch sequence 126 | 127 | // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) 128 | // Similar to SSE2-SW, cells are computed in the following order: 129 | // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} 130 | // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape 131 | // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape 132 | eh_t *p = &eh[j]; 133 | int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) 134 | p->h = h1; // set H(i,j-1) for the next row 135 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase); 136 | M = M ? M + subScore : 0; // separating H and M to disallow a cigar like "100M3I3D20M" 137 | h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 138 | h = h > f ? h : f; 139 | h1 = h; // save H(i,j) to h1 for the next column 140 | mj = m > h ? mj : j; // record the position where max score is achieved 141 | m = m > h ? m : h; // m is stored at eh[mj+1] 142 | t = M - oe_del; 143 | t = t > 0 ? t : 0; 144 | e -= e_del; 145 | e = e > t ? e : t; // computed E(i+1,j) 146 | p->e = e; // save E(i+1,j) for the next row 147 | t = M - oe_ins; 148 | t = t > 0 ? t : 0; 149 | f -= e_ins; 150 | f = f > t ? f : t; // computed F(i,j+1) 151 | } 152 | } 153 | eh[end].h = h1; 154 | eh[end].e = 0; 155 | if (j == qlen) { 156 | max_ie = gscore > h1 ? max_ie : i; 157 | gscore = gscore > h1 ? gscore : h1; 158 | } 159 | if (m == 0) 160 | break; 161 | if (m > max) { 162 | max = m, max_i = i, max_j = mj; 163 | max_off = max_off > abs(mj - i) ? 
max_off : abs(mj - i); 164 | } else if (zdrop > 0) { 165 | if (i - max_i > mj - max_j) { 166 | if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) 167 | break; 168 | } else { 169 | if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) 170 | break; 171 | } 172 | } 173 | /* This is defining from where to start the next row and where to end the computation of next row 174 | it skips some of the cells in the beginning and in the end of the row 175 | */ 176 | // update beg and end for the next round 177 | // COULD be done over a constant value... 178 | for (j = beg; (j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) 179 | ; 180 | beg = j; 181 | for (j = end; (j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) 182 | ; 183 | end = j + 2 < qlen ? j + 2 : qlen; 184 | //beg = 0; end = qlen; // uncomment this line for debugging 185 | } 186 | } 187 | 188 | if (gscore <= 0 || gscore <= max - PEN_CLIP5) 189 | { 190 | device_res->aln_score[tid] = max; 191 | device_res->query_batch_end[tid] = max_j + 1; 192 | device_res->target_batch_end[tid] = max_i + 1; 193 | } else { 194 | device_res->aln_score[tid] = gscore; 195 | device_res->query_batch_end[tid] = qlen; 196 | device_res->target_batch_end[tid] = max_ie + 1; 197 | } 198 | 199 | } 200 | 201 | 202 | #endif 203 | 204 | -------------------------------------------------------------------------------- /src/kernels/pack_rc_seqs.h: -------------------------------------------------------------------------------- 1 | #ifndef __KERNEL_SEQPAK__ 2 | #define __KERNEL_SEQPAK__ 3 | 4 | 5 | #define A_PAK ('A'&0x0F) 6 | #define C_PAK ('C'&0x0F) 7 | #define G_PAK ('G'&0x0F) 8 | #define T_PAK ('T'&0x0F) 9 | //#define N_PAK ('N'&0x0F) 10 | 11 | 12 | 13 | __global__ void gasal_pack_kernel(uint32_t* unpacked_query_batch, uint32_t* unpacked_target_batch, uint32_t *packed_query_batch, uint32_t* packed_target_batch, int query_batch_tasks_per_thread, int target_batch_tasks_per_thread, uint32_t total_query_batch_regs, uint32_t total_target_batch_regs) \ 14 | { 15 | 16 | int32_t i; 17 | const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 18 | uint32_t n_threads = gridDim.x * blockDim.x; 19 | for (i = 0; i < query_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1) < total_query_batch_regs); ++i) { 20 | uint32_t *query_addr = &(unpacked_query_batch[(i*n_threads)<<1]); 21 | uint32_t reg1 = query_addr[(tid << 1)]; //load 4 bases of the query sequence from global memory 22 | uint32_t reg2 = query_addr[(tid << 1) + 1]; //load another 4 bases 23 | uint32_t packed_reg = 0; 24 | packed_reg |= (reg1 & 15) << 28; // --- 25 | packed_reg |= ((reg1 >> 8) & 15) << 24; // | 26 | packed_reg |= ((reg1 >> 16) & 15) << 20;// | 27 | packed_reg |= ((reg1 >> 24) & 15) << 16;// | 28 | packed_reg |= (reg2 & 15) << 12; // > pack sequence 29 | packed_reg |= ((reg2 >> 8) & 15) << 8; // | 30 | packed_reg |= ((reg2 >> 16) & 15) << 4; // | 31 | packed_reg |= ((reg2 >> 24) & 15); //---- 32 | uint32_t *packed_query_addr = &(packed_query_batch[i*n_threads]); 33 | packed_query_addr[tid] = packed_reg; //write 8 bases of packed query sequence to global memory 34 | } 35 | 36 | for (i = 0; i < target_batch_tasks_per_thread && (((i*n_threads)<<1) + (tid<<1)) < total_target_batch_regs; ++i) { 37 | uint32_t *target_addr = &(unpacked_target_batch[(i * n_threads)<<1]); 38 | uint32_t reg1 = target_addr[(tid << 1)]; //load 4 bases of the target sequence from global memory 39 | uint32_t reg2 = target_addr[(tid << 1) + 1]; //load another 4 bases 40 | uint32_t packed_reg = 0; 41 | packed_reg |= 
(reg1 & 15) << 28;        // --- 42 | 		packed_reg |= ((reg1 >> 8) & 15) << 24;  //    | 43 | 		packed_reg |= ((reg1 >> 16) & 15) << 20;//    | 44 | 		packed_reg |= ((reg1 >> 24) & 15) << 16;//    | 45 | 		packed_reg |= (reg2 & 15) << 12;        //  > pack sequence 46 | 		packed_reg |= ((reg2 >> 8) & 15) << 8;  //    | 47 | 		packed_reg |= ((reg2 >> 16) & 15) << 4; //    | 48 | 		packed_reg |= ((reg2 >> 24) & 15);      //---- 49 | 		uint32_t *packed_target_addr = &(packed_target_batch[i * n_threads]); 50 | 		packed_target_addr[tid] = packed_reg; //write 8 bases of packed target sequence to global memory 51 | 	} 52 | 53 | } 54 | 55 | 56 | __global__ void gasal_reversecomplement_kernel(uint32_t *packed_query_batch,uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint8_t *query_op, uint8_t *target_op, uint32_t n_tasks) 57 | { 58 | 59 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 60 | 61 | 	if (tid >= n_tasks) return; 62 | 	if (query_op[tid] == 0 && target_op[tid] == 0) return; // if there's nothing to do (op=0, meaning the sequence is Forward Natural), just exit the kernel ASAP. 63 | 64 | 65 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence 66 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence 67 | 	uint32_t read_len = query_batch_lens[tid]; 68 | 	uint32_t ref_len = target_batch_lens[tid]; 69 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch 70 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch 71 | 72 | 	uint32_t query_batch_regs_to_swap = (query_batch_regs >> 1) + (query_batch_regs & 1); // that's (query_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence) 73 | 	uint32_t target_batch_regs_to_swap = (target_batch_regs >> 1) + (target_batch_regs & 1); // that's (target_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start at both ends, and finish at the center of the sequence) 74 | 75 | 76 | 	// variables used depending on target and query: 77 | 78 | 	uint8_t *op = NULL; 79 | 	uint32_t *packed_batch = NULL; 80 | 	uint32_t *batch_regs = NULL; 81 | 	uint32_t *batch_regs_to_swap = NULL; 82 | 	uint32_t *packed_batch_idx = NULL; 83 | 84 | 	// avoid useless code duplication thanks to pointers that route the data flow where it should go, twice. 85 | 	// The kernel is already generic. Later on this can be used to split the kernel into two using templates...
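	// (sketch, an assumption rather than existing code) the template split hinted
	// at above could route at compile time instead of through the switch below:
	//   template <data_source SRC> __global__ void gasal_reversecomplement_kernel(...);
	// with SRC == QUERY or SRC == TARGET picking op/packed_batch statically, so
	// the per-thread switch disappears from the generated code.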
86 | #pragma unroll 2 87 | for (int p = QUERY; p <= TARGET; p++) 88 | { 89 | switch(p) 90 | { 91 | case QUERY: 92 | op = query_op; 93 | packed_batch = packed_query_batch; 94 | batch_regs = &query_batch_regs; 95 | batch_regs_to_swap = &query_batch_regs_to_swap; 96 | packed_batch_idx = &packed_query_batch_idx; 97 | break; 98 | case TARGET: 99 | op = target_op; 100 | packed_batch = packed_target_batch; 101 | batch_regs = &target_batch_regs; 102 | batch_regs_to_swap = &target_batch_regs_to_swap; 103 | packed_batch_idx = &packed_target_batch_idx; 104 | break; 105 | default: 106 | break; 107 | } 108 | 109 | if (*(op + tid) & 0x01) // reverse 110 | { 111 | // deal with N's : read last word, find how many N's, store that number as offset, and pad with that many for the last 112 | uint8_t nbr_N = 0; 113 | for (int j = 0; j < 32; j = j + 4) 114 | { 115 | nbr_N += (((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1) & (0x0F << j)) >> j) == N_CODE); 116 | } 117 | 118 | //printf("KERNEL_DEBUG: nbr_N=%d\n", nbr_N); 119 | 120 | 121 | nbr_N = nbr_N << 2; // we operate on nibbles so we will need to do our shifts 4 bits by 4 bits, so 4*nbr_N 122 | 123 | for (uint32_t i = 0; i < *(batch_regs_to_swap); i++) // reverse all words. There's a catch with the last word (in the middle of the sequence), see final if. 124 | { 125 | /* This is the current operation flow:\ 126 | - Read the first 32-bits word on HEAD 127 | - Combine the reads of 2 last 32-bits words on tail to create the 32-bits word WITHOUT N's 128 | - Swap them 129 | - Write them at the correct places. Remember we're building 32-bits words across two 32-bits words on tail. 130 | So we have to take care of which bits are to be written on tail, too. 131 | 132 | You progress through both heads and tails that way, until you reach the center of the sequence. 133 | When you reach it, you actually don't write one of the words to avoid overwrite. 134 | */ 135 | uint32_t rpac_1 = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head 136 | uint32_t rpac_2 = ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) << (32-nbr_N)) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) >> nbr_N); 137 | 138 | 139 | uint32_t reverse_rpac_1 = 0; 140 | uint32_t reverse_rpac_2 = 0; 141 | 142 | 143 | #pragma unroll 8 144 | for(int k = 28; k >= 0; k = k - 4) // reverse 32-bits word... is pragma-unrolled. 145 | { 146 | reverse_rpac_1 |= ((rpac_1 & (0x0F << k)) >> (k)) << (28-k); 147 | reverse_rpac_2 |= ((rpac_2 & (0x0F << k)) >> (k)) << (28-k); 148 | } 149 | // last swap operated manually, because of its irregular size (32 - 4*nbr_N bits, hence 8 - nbr_N nibbles) 150 | 151 | 152 | uint32_t to_queue_1 = (reverse_rpac_1 << nbr_N) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) & ((1<> (32-nbr_N)); 154 | 155 | 156 | //printf("KERNEL DEBUG: rpac_1 Word before reverse: %x, after: %x, split into %x + %x \n", rpac_1, reverse_rpac_1, to_queue_2, to_queue_1 ); 157 | //printf("KERNEL DEBUG: rpac_2 Word before reverse: %x, after: %x\n", rpac_2, reverse_rpac_2 ); 158 | 159 | 160 | *(packed_batch + *(packed_batch_idx) + i) = reverse_rpac_2; 161 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) = to_queue_1; 162 | if (i!=*(batch_regs_to_swap)-1) 163 | (*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) = to_queue_2; 164 | 165 | 166 | } // end for 167 | } // end if(reverse) 168 | 169 | if (*(op+tid) & 0x02) // complement 170 | { 171 | for (uint32_t i = 0; i < *(batch_regs); i++) // reverse all words. 
There's a catch with the last word (in the middle of the sequence), see final if. 172 | { 173 | uint32_t rpac = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head 174 | uint32_t nucleotide = 0; 175 | 176 | #pragma unroll 8 177 | for(int k = 28; k >= 0; k = k - 4) // complement 32-bits word... is pragma-unrolled. 178 | { 179 | nucleotide = (rpac & (0x0F << k)) >> (k); 180 | switch(nucleotide) 181 | { 182 | case A_PAK: 183 | nucleotide = T_PAK; 184 | break; 185 | case C_PAK: 186 | nucleotide = G_PAK; 187 | break; 188 | case T_PAK: 189 | nucleotide = A_PAK; 190 | break; 191 | case G_PAK: 192 | nucleotide = C_PAK; 193 | break; 194 | default: 195 | break; 196 | } 197 | rpac = (rpac & (0xFFFFFFFF - (0x0F << k))) | nucleotide << k; 198 | } 199 | 200 | //printf("KERNEL DEBUG: Word read : %x, after complement: %x\n", *(packed_batch + *(packed_batch_idx) + i), rpac); 201 | 202 | *(packed_batch + *(packed_batch_idx) + i) = rpac; 203 | 204 | } // end for 205 | } // end if(complement) 206 | 207 | 208 | 209 | } 210 | 211 | return; 212 | } 213 | #endif -------------------------------------------------------------------------------- /src/args_parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "args_parser.h" 5 | 6 | 7 | 8 | Parameters::Parameters(int argc_, char **argv_) { 9 | 10 | 11 | // default values 12 | sa = (1); 13 | sb = (4); 14 | gapo = (6); 15 | gape = (1); 16 | start_pos = (WITHOUT_START); 17 | print_out = (0); 18 | n_threads = (1); 19 | 20 | k_band = (0); 21 | 22 | isPacked = false; 23 | isReverseComplement = false; 24 | 25 | secondBest = FALSE; 26 | 27 | // query head, target head, query tail, target tail 28 | semiglobal_skipping_head = TARGET; 29 | semiglobal_skipping_tail = TARGET; 30 | 31 | algo = (UNKNOWN); 32 | 33 | query_batch_fasta_filename = ""; 34 | target_batch_fasta_filename = ""; 35 | 36 | argc = argc_; 37 | argv = argv_; 38 | 39 | } 40 | 41 | Parameters::~Parameters() { 42 | query_batch_fasta.close(); 43 | target_batch_fasta.close(); 44 | } 45 | 46 | void Parameters::print() { 47 | std::cerr << "sa=" << sa <<" , sb=" << sb <<" , gapo=" << gapo << " , gape="< " << std::endl; 75 | std::cerr << "Options: -a INT match score ["<< sa <<"]" << std::endl; 76 | std::cerr << " -b INT mismatch penalty [" << sb << "]"<< std::endl; 77 | std::cerr << " -q INT gap open penalty [" << gapo << "]" << std::endl; 78 | std::cerr << " -r INT gap extension penalty ["<< gape <<"]" << std::endl; 79 | std::cerr << " -s find the start position" << std::endl; 80 | std::cerr << " -t compute traceback. With this option enabled, \"-s\" has no effect as start position will always be computed with traceback" << std::endl; 81 | std::cerr << " -p print the alignment results" << std::endl; 82 | std::cerr << " -n INT Number of threads ["<< n_threads<<"]" << std::endl; 83 | std::cerr << " -y AL_TYPE Alignment type . Must be \"local\", \"semi_global\", \"global\", \"ksw\" " << std::endl; 84 | std::cerr << " -x HEAD TAIL specifies, for semi-global alignment, wha should be skipped for heads and tails of the sequences. (NONE, QUERY, TARGET, BOTH)" << std::endl; 85 | std::cerr << " -k INT Band width in case \"banded\" is selected." << std::endl; 86 | std::cerr << " --help, -h : displays this message." << std::endl; 87 | std::cerr << " --second-best displays second best score (WITHOUT_START only)." << std::endl; 88 | std::cerr << "Single-pack multi-Parameters (e.g. -sp) is not supported." 
<< std::endl; 89 | std::cerr << " " << std::endl; 90 | } 91 | 92 | 93 | void Parameters::parse() { 94 | 95 | // before testing anything, check if calling for help. 96 | int c; 97 | 98 | std::string arg_next = ""; 99 | std::string arg_cur = ""; 100 | 101 | for (c = 1; c < argc; c++) 102 | { 103 | arg_cur = std::string((const char*) (*(argv + c) ) ); 104 | arg_next = ""; 105 | if (!arg_cur.compare("--help") || !arg_cur.compare("-h")) 106 | { 107 | help(); 108 | exit(0); 109 | } 110 | } 111 | 112 | if (argc < 4) 113 | { 114 | failure(NOT_ENOUGH_ARGS); 115 | } 116 | 117 | for (c = 1; c < argc - 2; c++) 118 | { 119 | arg_cur = std::string((const char*) (*(argv + c) ) ); 120 | if (arg_cur.at(0) == '-' && arg_cur.at(1) == '-' ) 121 | { 122 | if (!arg_cur.compare("--help")) 123 | { 124 | help(); 125 | exit(0); 126 | } 127 | if (!arg_cur.compare("--second-best")) 128 | { 129 | secondBest = TRUE; 130 | } 131 | 132 | } else if (arg_cur.at(0) == '-' ) 133 | { 134 | if (arg_cur.length() > 2) 135 | failure(WRONG_ARG); 136 | char param = arg_cur.at(1); 137 | switch(param) 138 | { 139 | case 'y': 140 | c++; 141 | arg_next = std::string((const char*) (*(argv + c) ) ); 142 | if (!arg_next.compare("local")) 143 | algo = LOCAL; 144 | else if (!arg_next.compare("semi_global")) 145 | algo = SEMI_GLOBAL; 146 | else if (!arg_next.compare("global")) 147 | algo = GLOBAL; 148 | else if (!arg_next.compare("ksw")) 149 | { 150 | algo = KSW; 151 | } 152 | break; 153 | case 'a': 154 | c++; 155 | arg_next = std::string((const char*) (*(argv + c) ) ); 156 | sa = std::stoi(arg_next); 157 | break; 158 | case 'b': 159 | c++; 160 | arg_next = std::string((const char*) (*(argv + c) ) ); 161 | sb = std::stoi(arg_next); 162 | break; 163 | case 'q': 164 | c++; 165 | arg_next = std::string((const char*) (*(argv + c) ) ); 166 | gapo = std::stoi(arg_next); 167 | break; 168 | case 'r': 169 | c++; 170 | arg_next = std::string((const char*) (*(argv + c) ) ); 171 | gape = std::stoi(arg_next); 172 | break; 173 | case 's': 174 | start_pos = WITH_START; 175 | break; 176 | case 't': 177 | start_pos = WITH_TB; 178 | break; 179 | case 'p': 180 | print_out = 1; 181 | break; 182 | case 'n': 183 | c++; 184 | arg_next = std::string((const char*) (*(argv + c) ) ); 185 | n_threads = std::stoi(arg_next); 186 | break; 187 | case 'k': 188 | c++; 189 | arg_next = std::string((const char*) (*(argv + c) ) ); 190 | k_band = std::stoi(arg_next); 191 | break; 192 | case 'x': 193 | c++; 194 | arg_next = std::string((const char*) (*(argv + c) ) ); 195 | if (!arg_next.compare("NONE")) 196 | semiglobal_skipping_head = NONE; 197 | else if (!arg_next.compare("TARGET")) 198 | semiglobal_skipping_head = TARGET; 199 | else if (!arg_next.compare("QUERY")) 200 | semiglobal_skipping_head = QUERY; 201 | else if (!arg_next.compare("BOTH")) 202 | semiglobal_skipping_head = BOTH; 203 | else 204 | { 205 | failure(WRONG_ARG); 206 | } 207 | 208 | c++; 209 | arg_next = std::string((const char*) (*(argv + c) ) ); 210 | if (!arg_next.compare("NONE")) 211 | semiglobal_skipping_tail = NONE; 212 | else if (!arg_next.compare("TARGET")) 213 | semiglobal_skipping_tail = TARGET; 214 | else if (!arg_next.compare("QUERY")) 215 | semiglobal_skipping_tail = QUERY; 216 | else if (!arg_next.compare("BOTH")) 217 | semiglobal_skipping_tail = BOTH; 218 | else 219 | { 220 | failure(WRONG_ARG); 221 | } 222 | break; 223 | 224 | } 225 | 226 | 227 | } else { 228 | failure(WRONG_ARG); 229 | } 230 | } 231 | 232 | 233 | // the last 2 Parameters are the 2 filenames. 
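	// (illustrative example, not in the original source; the binary name is an
	// assumption) a command line this parser accepts, options first and the two
	// FASTA files as the final positional arguments:
	//   ./test_prog -y local -a 1 -b 4 -q 6 -r 1 -p query_batch.fasta target_batch.fasta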
234 | 	query_batch_fasta_filename = std::string( (const char*)  (*(argv + c) ) ); 235 | 	c++; 236 | 	target_batch_fasta_filename = std::string( (const char*)  (*(argv + c) ) ); 237 | 238 | 	// Parameters retrieved successfully, open files. 239 | 	fileopen(); 240 | } 241 | 242 | void Parameters::fileopen() { 243 | 	query_batch_fasta.open(query_batch_fasta_filename, std::ifstream::in); 244 | 	if (!query_batch_fasta) 245 | 		failure(WRONG_FILES); 246 | 247 | 	target_batch_fasta.open(target_batch_fasta_filename); 248 | 	if (!target_batch_fasta) 249 | 		failure(WRONG_FILES); 250 | } 251 | -------------------------------------------------------------------------------- /src/kernels/global.h: -------------------------------------------------------------------------------- 1 | #ifndef __KERNEL_GLOBAL__ 2 | #define __KERNEL_GLOBAL__ 3 | 4 | #define CORE_GLOBAL_COMPUTE() \ 5 | 	uint32_t gbase = (gpac >> l) & 15;\ 6 | 	DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);\ 7 | 	int32_t tmp_hm = p[m] + subScore;\ 8 | 	h[m] = max(tmp_hm, f[m]);\ 9 | 	h[m] = max(h[m], e);\ 10 | 	f[m] = (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ? (tmp_hm - _cudaGapOE) : (f[m] - _cudaGapExtend);\ 11 | 	e = (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ? (tmp_hm - _cudaGapOE) : (e - _cudaGapExtend);\ 12 | 	p[m] = h[m-1];\ 13 | 14 | #define CORE_GLOBAL_COMPUTE_TB(direction_reg) \ 15 | 	uint32_t gbase = (gpac >> l) & 15;\ 16 | 	DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);\ 17 | 	int32_t tmp_hm = p[m] + subScore;\ 18 | 	uint32_t m_or_x = tmp_hm >= p[m] ? 0 : 1;\ 19 | 	h[m] = max(tmp_hm, f[m]);\ 20 | 	h[m] = max(h[m], e);\ 21 | 	direction_reg |= h[m] == tmp_hm ? m_or_x << (28 - ((m - 1) << 2)) : (h[m] == f[m] ? (uint32_t)3 << (28 - ((m - 1) << 2)) : (uint32_t)2 << (28 - ((m - 1) << 2)));\ 22 | 	direction_reg |= (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ? (uint32_t)0 : (uint32_t)1 << (31 - ((m - 1) << 2));\ 23 | 	f[m] = (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ? (tmp_hm - _cudaGapOE) : (f[m] - _cudaGapExtend);\ 24 | 	direction_reg |= (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ? (uint32_t)0 : (uint32_t)1 << (30 - ((m - 1) << 2));\ 25 | 	e = (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ? (tmp_hm - _cudaGapOE) : (e - _cudaGapExtend);\ 26 | 	p[m] = h[m-1];\ 27 | 28 | 29 | 30 | template <typename S> 31 | __global__ void gasal_global_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, uint4 *packed_tb_matrices, int n_tasks) 32 | { 33 | 	int32_t i, j, k, l, m; 34 | 	int32_t u = 0, r = 0; 35 | 	int32_t e; 36 | 	int32_t subScore; 37 | 	int tile_no = 0; 38 | 39 | 	int32_t ridx; 40 | 	short2 HD; 41 | 42 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 43 | 	if (tid >= n_tasks) return; 44 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence 45 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence 46 | 	uint32_t read_len = query_batch_lens[tid]; 47 | 	uint32_t ref_len = target_batch_lens[tid]; 48 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch 49 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ?
1 : 0);//number of 32-bit words holding sequence of target_batch 50 | //-------arrays to save intermediate values---------------- 51 | short2 global[MAX_QUERY_LEN]; 52 | int32_t h[9]; 53 | int32_t f[9]; 54 | int32_t p[9]; 55 | int32_t max_h[9]; 56 | //---------------------------------------------------------- 57 | global[0] = make_short2(0, MINUS_INF); 58 | for (i = 1; i < MAX_QUERY_LEN; i++) { 59 | global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF); 60 | } 61 | 62 | 63 | h[u++] = 0; 64 | p[r++] = 0; 65 | for (i = 0; i < target_batch_regs; i++) { //target_batch sequence in rows, for all WORDS (i=WORD index) 66 | ridx = 0; 67 | for (m = 1; m < 9; m++, u++, r++) { 68 | h[m] = -(_cudaGapO + (_cudaGapExtend*(u))); 69 | f[m] = MINUS_INF; 70 | p[m] = r == 1 ? 0 : -(_cudaGapO + (_cudaGapExtend*(r-1))); 71 | } 72 | register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence 73 | 74 | 75 | for (j = 0; j < query_batch_regs; /*++j*/ j+=1) { //query_batch sequence in columns, for all WORDS (j=WORD index). 76 | 77 | register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 packed bases from query_batch sequence 78 | 79 | //--------------compute a tile of 8x8 cells------------------- 80 | if (SAMETYPE(S, Int2Type)) { 81 | uint4 direction = make_uint4(0,0,0,0); 82 | uint32_t rbase = (rpac >> 28) & 15;//get a base from query_batch sequence 83 | //------------load intermediate values---------------------- 84 | HD = global[ridx]; 85 | h[0] = HD.x; 86 | e = HD.y; 87 | #pragma unroll 8 88 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 89 | CORE_GLOBAL_COMPUTE_TB(direction.x); 90 | } 91 | //--------------save intermediate values------------------------- 92 | HD.x = h[m-1]; 93 | HD.y = e;//max(e, 0); 94 | global[ridx] = HD; 95 | ridx++; 96 | //-------------------------------------------------------------- 97 | //------the last column of DP matrix------------ 98 | if (ridx == read_len) { 99 | for (m = 1; m < 9; m++) { 100 | max_h[m] = h[m]; 101 | 102 | } 103 | } 104 | rbase = (rpac >> 24) & 15;//get a base from query_batch sequence 105 | //------------load intermediate values---------------------- 106 | HD = global[ridx]; 107 | h[0] = HD.x; 108 | e = HD.y; 109 | #pragma unroll 8 110 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 111 | CORE_GLOBAL_COMPUTE_TB(direction.y); 112 | } 113 | //--------------save intermediate values------------------------- 114 | HD.x = h[m-1]; 115 | HD.y = e;//max(e, 0); 116 | global[ridx] = HD; 117 | ridx++; 118 | //-------------------------------------------------------------- 119 | //------the last column of DP matrix------------ 120 | if (ridx == read_len) { 121 | for (m = 1; m < 9; m++) { 122 | max_h[m] = h[m]; 123 | 124 | } 125 | } 126 | rbase = (rpac >> 20) & 15;//get a base from query_batch sequence 127 | //------------load intermediate values---------------------- 128 | HD = global[ridx]; 129 | h[0] = HD.x; 130 | e = HD.y; 131 | #pragma unroll 8 132 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 133 | CORE_GLOBAL_COMPUTE_TB(direction.z); 134 | } 135 | //--------------save intermediate values------------------------- 136 | HD.x = h[m-1]; 137 | HD.y = e;//max(e, 0); 138 | global[ridx] = HD; 139 | ridx++; 140 | //-------------------------------------------------------------- 141 | //------the last column of DP matrix------------ 142 | if (ridx == read_len) { 143 | for (m = 1; m < 9; m++) { 144 | max_h[m] = h[m]; 145 | 146 | } 147 | } 148 | rbase = (rpac >> 16) & 15;//get a base from 
query_batch sequence 149 | //------------load intermediate values---------------------- 150 | HD = global[ridx]; 151 | h[0] = HD.x; 152 | e = HD.y; 153 | #pragma unroll 8 154 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 155 | CORE_GLOBAL_COMPUTE_TB(direction.w); 156 | } 157 | //--------------save intermediate values------------------------- 158 | HD.x = h[m-1]; 159 | HD.y = e;//max(e, 0); 160 | global[ridx] = HD; 161 | ridx++; 162 | //-------------------------------------------------------------- 163 | //------the last column of DP matrix------------ 164 | if (ridx == read_len) { 165 | for (m = 1; m < 9; m++) { 166 | max_h[m] = h[m]; 167 | 168 | } 169 | } 170 | packed_tb_matrices[(tile_no*n_tasks) + tid] = direction; 171 | tile_no++; 172 | 173 | direction = make_uint4(0,0,0,0); 174 | rbase = (rpac >> 12) & 15;//get a base from query_batch sequence 175 | //------------load intermediate values---------------------- 176 | HD = global[ridx]; 177 | h[0] = HD.x; 178 | e = HD.y; 179 | #pragma unroll 8 180 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 181 | CORE_GLOBAL_COMPUTE_TB(direction.x); 182 | } 183 | //--------------save intermediate values------------------------- 184 | HD.x = h[m-1]; 185 | HD.y = e;//max(e, 0); 186 | global[ridx] = HD; 187 | ridx++; 188 | //-------------------------------------------------------------- 189 | //------the last column of DP matrix------------ 190 | if (ridx == read_len) { 191 | for (m = 1; m < 9; m++) { 192 | max_h[m] = h[m]; 193 | 194 | } 195 | } 196 | rbase = (rpac >> 8) & 15;//get a base from query_batch sequence 197 | //------------load intermediate values---------------------- 198 | HD = global[ridx]; 199 | h[0] = HD.x; 200 | e = HD.y; 201 | #pragma unroll 8 202 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 203 | CORE_GLOBAL_COMPUTE_TB(direction.y); 204 | } 205 | //--------------save intermediate values------------------------- 206 | HD.x = h[m-1]; 207 | HD.y = e;//max(e, 0); 208 | global[ridx] = HD; 209 | ridx++; 210 | //-------------------------------------------------------------- 211 | //------the last column of DP matrix------------ 212 | if (ridx == read_len) { 213 | for (m = 1; m < 9; m++) { 214 | max_h[m] = h[m]; 215 | 216 | } 217 | } 218 | rbase = (rpac >> 4) & 15;//get a base from query_batch sequence 219 | //------------load intermediate values---------------------- 220 | HD = global[ridx]; 221 | h[0] = HD.x; 222 | e = HD.y; 223 | #pragma unroll 8 224 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 225 | CORE_GLOBAL_COMPUTE_TB(direction.z); 226 | } 227 | //--------------save intermediate values------------------------- 228 | HD.x = h[m-1]; 229 | HD.y = e;//max(e, 0); 230 | global[ridx] = HD; 231 | ridx++; 232 | //-------------------------------------------------------------- 233 | //------the last column of DP matrix------------ 234 | if (ridx == read_len) { 235 | for (m = 1; m < 9; m++) { 236 | max_h[m] = h[m]; 237 | 238 | } 239 | } 240 | rbase = rpac & 15;//get a base from query_batch sequence 241 | //------------load intermediate values---------------------- 242 | HD = global[ridx]; 243 | h[0] = HD.x; 244 | e = HD.y; 245 | #pragma unroll 8 246 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 247 | CORE_GLOBAL_COMPUTE_TB(direction.w); 248 | } 249 | //--------------save intermediate values------------------------- 250 | HD.x = h[m-1]; 251 | HD.y = e;//max(e, 0); 252 | global[ridx] = HD; 253 | ridx++; 254 | //-------------------------------------------------------------- 255 | //------the last column of DP matrix------------ 256 | if (ridx == read_len) { 
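				// (clarifying note, not in the original) at this point h[1..8] holds the DP
				// cells of the column for the last query base within the current block of 8
				// target rows; it is snapshotted into max_h[] so that, after the loops, the
				// cell belonging to the true last target row (corrected for padding) gives
				// the global alignment score.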
257 | for (m = 1; m < 9; m++) { 258 | max_h[m] = h[m]; 259 | 260 | } 261 | } 262 | packed_tb_matrices[(tile_no*n_tasks) + tid] = direction; 263 | tile_no++; 264 | 265 | } 266 | else{ 267 | for (k = 28; k >= 0; k -= 4) { 268 | uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence 269 | //------------load intermediate values---------------------- 270 | HD = global[ridx]; 271 | h[0] = HD.x; 272 | e = HD.y; 273 | //---------------------------------------------------------- 274 | #pragma unroll 8 275 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 276 | CORE_GLOBAL_COMPUTE(); 277 | } 278 | //--------------save intermediate values------------------------- 279 | HD.x = h[m-1]; 280 | HD.y = e;//max(e, 0); 281 | global[ridx] = HD; 282 | ridx++; 283 | //-------------------------------------------------------------- 284 | //------the last column of DP matrix------------ 285 | if (ridx == read_len) { 286 | for (m = 1; m < 9; m++) { 287 | max_h[m] = h[m]; 288 | 289 | } 290 | } 291 | //---------------------------------------------- 292 | } 293 | } 294 | //------------------------------------------------------------------ 295 | } 296 | 297 | } 298 | 299 | device_res->aln_score[tid] = max_h[8 - ((target_batch_regs << 3) - (ref_len))];//copy the max score to the output array in the GPU mem 300 | 301 | return; 302 | 303 | } 304 | #endif 305 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /src/ctors.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gasal.h" 3 | 4 | #include "args_parser.h" 5 | 6 | #include "host_batch.h" 7 | 8 | #include "res.h" 9 | 10 | #include "ctors.h" 11 | 12 | #include "interfaces.h" 13 | 14 | #include 15 | 16 | 17 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams) { 18 | gasal_gpu_storage_v v; 19 | v.a = (gasal_gpu_storage_t*)calloc(n_streams, sizeof(gasal_gpu_storage_t)); 20 | v.n = n_streams; 21 | return v; 22 | 23 | } 24 | 25 | 26 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int max_n_alns, Parameters *params) { 27 | 28 | cudaError_t err; 29 | int i; 30 | int max_query_len_8 = max_query_len % 8 ? max_query_len + (8 - (max_query_len % 8)) : max_query_len; 31 | int max_target_len_8 = max_target_len % 8 ? max_target_len + (8 - (max_target_len % 8)) : max_target_len; 32 | 33 | int host_max_query_batch_bytes = max_n_alns * max_query_len_8; 34 | int gpu_max_query_batch_bytes = max_n_alns * max_query_len_8; 35 | int host_max_target_batch_bytes = max_n_alns * max_target_len_8; 36 | int gpu_max_target_batch_bytes = max_n_alns * max_target_len_8; 37 | int host_max_n_alns = max_n_alns; 38 | int gpu_max_n_alns = max_n_alns; 39 | 40 | 41 | 42 | for (i = 0; i < gpu_storage_vec->n; i++) { 43 | 44 | gpu_storage_vec->a[i].extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0); 45 | gpu_storage_vec->a[i].extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0); 46 | 47 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 48 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 49 | 50 | 51 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 52 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault)); 53 | uint8_t *no_ops = NULL; 54 | no_ops = (uint8_t*) calloc(host_max_n_alns * sizeof(uint8_t), sizeof(uint8_t)); 55 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, QUERY); 56 | gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, TARGET); 57 | free(no_ops); 58 | 59 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_op), gpu_max_n_alns * sizeof(uint8_t))); 60 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_op), gpu_max_n_alns * sizeof(uint8_t))); 61 | 62 | 63 | 64 | if (params->isPacked) 65 | { 66 | gpu_storage_vec->a[i].packed_query_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_query_batch; 67 | gpu_storage_vec->a[i].packed_target_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_target_batch; 68 | 69 | } else { 70 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 71 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 72 | } 73 | 74 | if (params->algo == KSW) 75 | { 76 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_seed_scores), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 77 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].seed_scores), host_max_n_alns * 
sizeof(uint32_t))); 78 | } else { 79 | gpu_storage_vec->a[i].host_seed_scores = NULL; 80 | gpu_storage_vec->a[i].seed_scores = NULL; 81 | } 82 | 83 | 84 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 85 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 86 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 87 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault)); 88 | 89 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 90 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 91 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 92 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 93 | 94 | 95 | gpu_storage_vec->a[i].host_res = gasal_res_new_host(host_max_n_alns, params); 96 | if(params->start_pos == WITH_TB) CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_res->cigar), gpu_max_query_batch_bytes * sizeof(uint8_t),cudaHostAllocDefault)); 97 | gpu_storage_vec->a[i].device_cpy = gasal_res_new_device_cpy(max_n_alns, params); 98 | gpu_storage_vec->a[i].device_res = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy); 99 | 100 | if (params->secondBest) 101 | { 102 | gpu_storage_vec->a[i].host_res_second = gasal_res_new_host(host_max_n_alns, params); 103 | gpu_storage_vec->a[i].device_cpy_second = gasal_res_new_device_cpy(host_max_n_alns, params); 104 | gpu_storage_vec->a[i].device_res_second = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy_second); 105 | 106 | } else { 107 | gpu_storage_vec->a[i].host_res_second = NULL; 108 | gpu_storage_vec->a[i].device_cpy_second = NULL; 109 | gpu_storage_vec->a[i].device_res_second = NULL; 110 | } 111 | 112 | if (params->start_pos == WITH_TB) { 113 | gpu_storage_vec->a[i].packed_tb_matrix_size = ((uint32_t)ceil(((double)((uint64_t)max_query_len_8*(uint64_t)max_target_len_8))/32)) * gpu_max_n_alns; 114 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_tb_matrices), gpu_storage_vec->a[i].packed_tb_matrix_size * sizeof(uint4))); 115 | } 116 | 117 | 118 | CHECKCUDAERROR(cudaStreamCreate(&(gpu_storage_vec->a[i].str))); 119 | gpu_storage_vec->a[i].is_free = 1; 120 | gpu_storage_vec->a[i].host_max_query_batch_bytes = host_max_query_batch_bytes; 121 | gpu_storage_vec->a[i].host_max_target_batch_bytes = host_max_target_batch_bytes; 122 | gpu_storage_vec->a[i].host_max_n_alns = host_max_n_alns; 123 | gpu_storage_vec->a[i].gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 124 | gpu_storage_vec->a[i].gpu_max_target_batch_bytes = gpu_max_target_batch_bytes; 125 | gpu_storage_vec->a[i].gpu_max_n_alns = gpu_max_n_alns; 126 | gpu_storage_vec->a[i].current_n_alns = 0; 127 | } 128 | } 129 | 130 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params) { 131 | 132 | cudaError_t err; 133 | 134 | int i; 135 | for (i = 0; i < gpu_storage_vec->n; i ++) { 136 | 137 | gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_query_batch); 138 | 
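		// (clarifying note, not in the original) teardown mirrors the allocations in
		// gasal_init_streams: pinned host pages and result structs go first, then the
		// per-alignment op/len/offset buffers, then the raw device batches, and
		// finally the CUDA stream itself.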
gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_target_batch); 139 | 140 | gasal_res_destroy_host(gpu_storage_vec->a[i].host_res); 141 | gasal_res_destroy_device(gpu_storage_vec->a[i].device_res, gpu_storage_vec->a[i].device_cpy); 142 | 143 | if (params->secondBest) 144 | { 145 | gasal_res_destroy_host(gpu_storage_vec->a[i].host_res_second); 146 | gasal_res_destroy_device(gpu_storage_vec->a[i].device_res_second, gpu_storage_vec->a[i].device_cpy_second); 147 | } 148 | 149 | 150 | if (!(params->algo == KSW)) 151 | { 152 | if (gpu_storage_vec->a[i].seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].seed_scores)); 153 | if (gpu_storage_vec->a[i].host_seed_scores != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_seed_scores)); 154 | } 155 | 156 | if (gpu_storage_vec->a[i].query_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_op)); 157 | if (gpu_storage_vec->a[i].target_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_op)); 158 | if (gpu_storage_vec->a[i].host_query_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_op)); 159 | if (gpu_storage_vec->a[i].host_target_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_op)); 160 | 161 | if (gpu_storage_vec->a[i].host_query_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_offsets)); 162 | if (gpu_storage_vec->a[i].host_target_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_offsets)); 163 | if (gpu_storage_vec->a[i].host_query_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_lens)); 164 | if (gpu_storage_vec->a[i].host_target_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_lens)); 165 | if (gpu_storage_vec->a[i].host_res->cigar != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_res->cigar)); 166 | 167 | 168 | 169 | 170 | if (gpu_storage_vec->a[i].unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_query_batch)); 171 | if (gpu_storage_vec->a[i].unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_target_batch)); 172 | if (!(params->isPacked)) 173 | { 174 | if (gpu_storage_vec->a[i].packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_query_batch)); 175 | if (gpu_storage_vec->a[i].packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_target_batch)); 176 | } 177 | 178 | 179 | if (gpu_storage_vec->a[i].query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_offsets)); 180 | if (gpu_storage_vec->a[i].target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_offsets)); 181 | if (gpu_storage_vec->a[i].query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_lens)); 182 | if (gpu_storage_vec->a[i].target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_lens)); 183 | if (gpu_storage_vec->a[i].packed_tb_matrices != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_tb_matrices)); 184 | 185 | if (gpu_storage_vec->a[i].str != NULL)CHECKCUDAERROR(cudaStreamDestroy(gpu_storage_vec->a[i].str)); 186 | } 187 | 188 | 189 | 190 | } 191 | 192 | 193 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec) { 194 | 195 | if(gpu_storage_vec->a != NULL) free(gpu_storage_vec->a); 196 | } 197 | 198 | 199 | 200 | 201 | 
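// --------------------------------------------------------------------------
// (hedged usage sketch, not part of the original source) the intended
// lifecycle of the constructors/destructors above, assuming `params` was
// built by the Parameters arg parser:
//
//   gasal_gpu_storage_v v = gasal_init_gpu_storage_v(2);   // one storage per CUDA stream
//   gasal_init_streams(&v, max_query_len, max_target_len, max_n_alns, params);
//   // ... fill host batches, launch alignments on free streams ...
//   gasal_destroy_streams(&v, params);    // frees per-stream host/device buffers
//   gasal_destroy_gpu_storage_v(&v);      // frees the storage array itself
// --------------------------------------------------------------------------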
// Deprecated 202 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params) { 203 | 204 | cudaError_t err; 205 | // if (gpu_storage->gpu_max_query_batch_bytes % 8) { 206 | // fprintf(stderr, "[GASAL ERROR:] max_query_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_query_batch_bytes % 8); 207 | // exit(EXIT_FAILURE); 208 | // } 209 | // if (gpu_storage->gpu_max_target_batch_bytes % 8) { 210 | // fprintf(stderr, "[GASAL ERROR:] max_target_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_target_batch_bytes % 8); 211 | // exit(EXIT_FAILURE); 212 | // } 213 | 214 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t))); 215 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t))); 216 | 217 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 218 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 219 | 220 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 221 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_max_n_alns * sizeof(uint32_t))); 222 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 223 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t))); 224 | 225 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 226 | 227 | gpu_storage->gpu_max_query_batch_bytes = gpu_max_query_batch_bytes; 228 | gpu_storage->gpu_max_target_batch_bytes = gpu_max_target_batch_bytes; 229 | gpu_storage->gpu_max_n_alns = gpu_max_n_alns; 230 | 231 | } 232 | 233 | // Deprecated 234 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params) { 235 | 236 | cudaError_t err; 237 | 238 | if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch)); 239 | if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch)); 240 | if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch)); 241 | if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch)); 242 | if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets)); 243 | if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets)); 244 | if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens)); 245 | if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens)); 246 | 247 | gasal_res_destroy_device(gpu_storage->device_res,gpu_storage->device_cpy); 248 | if (params->secondBest) 249 | { 250 | gasal_res_destroy_device(gpu_storage->device_res_second, gpu_storage->device_cpy_second); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/__deprecated.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | //GASAL2 blocking alignment function - DEPRECATED 4 | /* 5 | void gasal_aln(gasal_gpu_storage_t *gpu_storage, const uint8_t *query_batch, const uint32_t 
*query_batch_offsets, const uint32_t *query_batch_lens, const uint8_t *target_batch, const uint32_t *target_batch_offsets, const uint32_t *target_batch_lens, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, int32_t *host_aln_score, int32_t *host_query_batch_start, int32_t *host_target_batch_start, int32_t *host_query_batch_end, int32_t *host_target_batch_end, algo_type algo, comp_start start) { 6 | 7 | cudaError_t err; 8 | if (actual_n_alns <= 0) { 9 | fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n"); 10 | exit(EXIT_FAILURE); 11 | } 12 | if (actual_query_batch_bytes <= 0) { 13 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n"); 14 | exit(EXIT_FAILURE); 15 | } 16 | if (actual_target_batch_bytes <= 0) { 17 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n"); 18 | exit(EXIT_FAILURE); 19 | } 20 | 21 | if (actual_query_batch_bytes % 8) { 22 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes); 23 | exit(EXIT_FAILURE); 24 | } 25 | if (actual_target_batch_bytes % 8) { 26 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes); 27 | exit(EXIT_FAILURE); 28 | 29 | } 30 | //--------------if pre-allocated memory is less, allocate more-------------------------- 31 | if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) { 32 | 33 | int i = 2; 34 | while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++; 35 | gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i; 36 | 37 | fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i); 38 | 39 | if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch)); 40 | if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch)); 41 | 42 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t))); 43 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 44 | 45 | 46 | 47 | 48 | } 49 | 50 | if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) { 51 | 52 | int i = 2; 53 | while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++; 54 | gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i; 55 | 56 | fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). 
Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i); 57 | 58 | if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch)); 59 | if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch)); 60 | 61 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t))); 62 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 63 | 64 | 65 | } 66 | 67 | if (gpu_storage->gpu_max_n_alns < actual_n_alns) { 68 | fprintf(stderr, "[GASAL] gpu_max_n_alns(%d) should be >= acutal_n_alns(%d)\n", gpu_storage->gpu_max_n_alns, actual_n_alns); 69 | 70 | int i = 2; 71 | while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++; 72 | gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i; 73 | 74 | fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i); 75 | 76 | 77 | if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets)); 78 | if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets)); 79 | if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens)); 80 | if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens)); 81 | if (gpu_storage->aln_score != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->aln_score)); 82 | if (gpu_storage->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_start)); 83 | if (gpu_storage->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_start)); 84 | if (gpu_storage->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_end)); 85 | if (gpu_storage->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_end)); 86 | 87 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 88 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 89 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 90 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 91 | 92 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->aln_score),gpu_storage->gpu_max_n_alns * sizeof(int32_t))); 93 | if (algo == GLOBAL) { 94 | gpu_storage->query_batch_start = NULL; 95 | gpu_storage->query_batch_end = NULL; 96 | gpu_storage->target_batch_start = NULL; 97 | gpu_storage->target_batch_end = NULL; 98 | } else { 99 | CHECKCUDAERROR( 100 | cudaMalloc(&(gpu_storage->target_batch_end), 101 | gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 102 | if (start == WITH_START) { 103 | CHECKCUDAERROR( 104 | cudaMalloc(&(gpu_storage->target_batch_start), 105 | gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 106 | } else 107 | gpu_storage->target_batch_start = NULL; 108 | if (algo == LOCAL) { 109 | 
CHECKCUDAERROR( 110 | cudaMalloc(&(gpu_storage->query_batch_end), 111 | gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 112 | if (start == WITH_START) { 113 | CHECKCUDAERROR( 114 | cudaMalloc(&(gpu_storage->query_batch_start), 115 | gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 116 | } else 117 | gpu_storage->query_batch_start = NULL; 118 | } else { 119 | gpu_storage->query_batch_start = NULL; 120 | gpu_storage->query_batch_end = NULL; 121 | } 122 | } 123 | 124 | 125 | 126 | } 127 | //------------------------------------------------------------------------------------------- 128 | 129 | //------------------------copy sequence batches from CPU to GPU--------------------------- 130 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->unpacked_query_batch, query_batch, actual_query_batch_bytes, cudaMemcpyHostToDevice)); 131 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->unpacked_target_batch, target_batch, actual_target_batch_bytes, cudaMemcpyHostToDevice)); 132 | //---------------------------------------------------------------------------------------- 133 | 134 | uint32_t BLOCKDIM = 128; 135 | uint32_t N_BLOCKS = (actual_n_alns + BLOCKDIM - 1) / BLOCKDIM; 136 | 137 | int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*BLOCKDIM*N_BLOCKS)); 138 | int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*BLOCKDIM*N_BLOCKS)); 139 | 140 | //launch packing kernel 141 | gasal_pack_kernel<<>> ((uint32_t*)(gpu_storage->unpacked_query_batch), 142 | (uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, 143 | query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4); 144 | cudaError_t pack_kernel_err = cudaGetLastError(); 145 | if ( cudaSuccess != pack_kernel_err ) 146 | { 147 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. 
%d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err, __LINE__, __FILE__); 148 | exit(EXIT_FAILURE); 149 | } 150 | 151 | //----------------------copy sequence offsets and lengths from CPU to GPU-------------------------------------- 152 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->query_batch_lens, query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice)); 153 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->target_batch_lens, target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice)); 154 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->query_batch_offsets, query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice)); 155 | CHECKCUDAERROR(cudaMemcpy(gpu_storage->target_batch_offsets, target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice)); 156 | //------------------------------------------------------------------------------------------------------------------------ 157 | 158 | //--------------------------------------launch alignment kernels-------------------------------------------------------------- 159 | if(algo == LOCAL) { 160 | if (start == WITH_START) { 161 | gasal_local_with_start_kernel<<>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, 162 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, 163 | gpu_storage->query_batch_end, gpu_storage->target_batch_end, gpu_storage->query_batch_start, 164 | gpu_storage->target_batch_start, actual_n_alns); 165 | } else { 166 | gasal_local_kernel<<>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, 167 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, 168 | gpu_storage->query_batch_end, gpu_storage->target_batch_end, actual_n_alns, LOCAL); 169 | } 170 | } else if (algo == SEMI_GLOBAL) { 171 | if (start == WITH_START) { 172 | gasal_semi_global_with_start_kernel<<>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, 173 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, gpu_storage->target_batch_end, 174 | gpu_storage->target_batch_start, actual_n_alns); 175 | } else { 176 | gasal_semi_global_kernel<<>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, 177 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, gpu_storage->target_batch_end, 178 | actual_n_alns); 179 | } 180 | 181 | } else if (algo == GLOBAL) { 182 | gasal_global_kernel<<>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, 183 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, actual_n_alns); 184 | } 185 | else { 186 | fprintf(stderr, "[GASAL ERROR:] Algo type invalid\n"); 187 | exit(EXIT_FAILURE); 188 | } 189 | //----------------------------------------------------------------------------------------------------------------------- 190 | cudaError_t aln_kernel_err = cudaGetLastError(); 191 | if ( cudaSuccess != aln_kernel_err ) 192 | { 193 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. 
%d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__); 194 | exit(EXIT_FAILURE); 195 | } 196 | 197 | //------------------------copy alignment results from GPU to CPU-------------------------------------- 198 | if (host_aln_score != NULL && gpu_storage->aln_score != NULL) CHECKCUDAERROR(cudaMemcpy(host_aln_score, gpu_storage->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost)); 199 | else { 200 | fprintf(stderr, "[GASAL ERROR:] The *host_aln_score input can't be NULL\n"); 201 | exit(EXIT_FAILURE); 202 | } 203 | if (host_query_batch_start != NULL && gpu_storage->query_batch_start != NULL) CHECKCUDAERROR(cudaMemcpy(host_query_batch_start, gpu_storage->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost)); 204 | if (host_target_batch_start != NULL && gpu_storage->target_batch_start != NULL) CHECKCUDAERROR(cudaMemcpy(host_target_batch_start, gpu_storage->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost)); 205 | if (host_query_batch_end != NULL && gpu_storage->query_batch_end != NULL) CHECKCUDAERROR(cudaMemcpy(host_query_batch_end, gpu_storage->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost)); 206 | if (host_target_batch_end != NULL && gpu_storage->target_batch_end != NULL) CHECKCUDAERROR(cudaMemcpy(host_target_batch_end, gpu_storage->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost)); 207 | //------------------------------------------------------------------------------------------------------ 208 | 209 | } 210 | 211 | */ 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GASAL2 - GPU-accelerated DNA alignment library 2 | GASAL2 is an easy-to-use CUDA library for DNA/RNA sequence alignment algorithms. Currently it supports different kind of alignments: 3 | - local alignment 4 | - semi-global alignment 5 | - global alignment 6 | - tile-based banded alignment. 7 | 8 | It can also reverse and, or complement any sequences independently before alignment, and report second-best scores for certain alignment types. 9 | 10 | It is an extension of GASAL (https://github.com/nahmedraja/GASAL) and allows full overlapping of CPU and GPU execution. 11 | 12 | ## List of new features: 13 | - **Added traceback computation. The ouput is in CIGAR format** 14 | - **GASAL2 can now compute all types of semi-global alignments** 15 | - **Added expandable memory management on host side. The batches of query and target sequences are automatically enlarged if the required memory becomes larger than the allocated memory** 16 | - **Added kernel to reverse-complement sequences.** 17 | - **Cleaned up, inconsistencies fixed, and a small optimization has been added (around 9% speedup with exact same result)** 18 | 19 | 20 | ## Changes in user interface: 21 | - Changed the interface of `gasal_init_streams()` function 22 | - The user now has to provide `MAX_QUERY_LEN` instead of `MAX_SEQ_LEN` during compilation 23 | 24 | ## Requirements 25 | A Linux platform with CUDA toolkit 8 or higher is required, along with usual build environment for C and C++ code. GASAL2 has been tested over NVIDIA GPUs with compute capabilities of 2.0, 3.5 and 5.0. Although lower versions of the CUDA framework might work, they have not been tested. 
26 | 
27 | ## Compiling GASAL2 
28 | The library can be compiled with the following two commands: 
29 | 
30 | ```bash 
31 | $ ./configure.sh <path to CUDA installation directory> 
32 | $ make GPU_SM_ARCH=<GPU SM architecture> MAX_QUERY_LEN=<maximum query length> N_CODE=<code for "N" base> [N_PENALTY=<penalty for aligning "N" against any other base>] 
33 | ``` 
34 | 
35 | `N_PENALTY` is optional and if it is not specified then GASAL2 considers "N" as an ordinary base having the same match/mismatch scores as A, C, G or T. As a result of these commands, *include* and *lib* directories will be created containing various `.h` files and `libgasal.a`, respectively. The user needs to include `gasal_header.h` in the code and link it with `libgasal.a` during compilation. Also, the CUDA runtime library has to be linked by adding the `-lcudart` flag. The path to the CUDA runtime library must also be specified while linking as *-L <path to the CUDA runtime library>*. 
36 | 
37 | ## Using GASAL2 
38 | 
39 | ### Initialization 
40 | To use the GASAL2 alignment functions, the match/mismatch scores and gap open/extension penalties first need to be passed on to the GPU. Assign the match/mismatch scores and gap open/extension penalties to the members of the `gasal_subst_scores` struct: 
41 | 
42 | ```C 
43 | typedef struct{ 
44 | int32_t match; 
45 | int32_t mismatch; 
46 | int32_t gap_open; 
47 | int32_t gap_extend; 
48 | }gasal_subst_scores; 
49 | ``` 
50 | 
51 | The values are passed to the GPU by calling the `gasal_copy_subst_scores()` function: 
52 | 
53 | ```C 
54 | void gasal_copy_subst_scores(gasal_subst_scores *subst); 
55 | ``` 
56 | 
57 | A vector of `gasal_gpu_storage_t` is created with the following function: 
58 | 
59 | ```C 
60 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams); 
61 | ``` 
62 | 
63 | With the help of `n_streams`, the user specifies the number of outstanding GPU alignment kernel launches to be performed. The return type is `gasal_gpu_storage_v`: 
64 | 
65 | ```C 
66 | typedef struct{ 
67 | int n; 
68 | gasal_gpu_storage_t *a; 
69 | }gasal_gpu_storage_v; 
70 | ``` 
71 | 
72 | with `n = n_streams` and `a` being a pointer to the array. An element of the array holds the required data structures of a stream. To destroy the vector the following function is used: 
73 | 
74 | ```C 
75 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec); 
76 | ``` 
77 | 
78 | The streams in the vector are initialized by calling: 
79 | 
80 | ```C 
81 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec, int max_query_len, int max_target_len, int max_n_alns, Parameters *params); 
82 | ``` 
83 | 
84 | In GASAL2, the sequences to be aligned are contained in two batches. A sequence in query_batch is aligned to a sequence in target_batch. A *batch* is a concatenation of sequences. *The length of a sequence must be a multiple of 8*. Hence, if the length of a sequence is not a multiple of 8, `N's` are added at the end of the sequence. We call these redundant bases *pad bases*. Note that the pad bases are always "N's", irrespective of whether `N_PENALTY` is defined or not. The `gasal_init_streams()` function allocates the memory required by a stream. With the help of *max_batch_bytes*, the user specifies the expected maximum size (in bytes) of the sequences in the two batches. *host_max_batch_bytes* are pre-allocated on the CPU. Similarly, *gpu_max_batch_bytes* are pre-allocated on the GPU. *max_n_alns* is the expected maximum number of sequences in a batch. If the actual required GPU memory is more than the pre-allocated memory, GASAL2 automatically allocates more memory. 
85 | 
86 | Most GASAL2 functions operate with a Parameters object. This object holds all the information about the selected alignment options, in particular the alignment type, the penalty values used when opening or extending gaps, etc. The Parameters object is filled like this: 
87 | 
88 | ```C 
89 | Parameters *args; 
90 | args = new Parameters(0, NULL); 
91 | 
92 | args->algo = <LOCAL|SEMI_GLOBAL|GLOBAL|KSW>; 
93 | args->start_pos = <WITHOUT_START|WITH_START|WITH_TB>; //`WITHOUT_START` computes only the score and end-position. `WITH_START` computes the start-position along with the score and end-position. `WITH_TB` computes the score, start-position, end-position and traceback in CIGAR format. 
94 | args->isReverseComplement = <true|false>; //whether to reverse-complement the query sequence. 
95 | args->semiglobal_skipping_head = <QUERY|TARGET|BOTH|NONE>; //ignore gaps at the beginning of QUERY|TARGET|BOTH|NONE in semi-global alignment. 
96 | args->semiglobal_skipping_tail = <QUERY|TARGET|BOTH|NONE>; //ignore gaps at the end of QUERY|TARGET|BOTH|NONE in semi-global alignment. 
97 | args->secondBest = <TRUE|FALSE>; //whether to compute the second-best score in the local and semi-global algorithms. The start-position (WITH_START) and traceback (WITH_TB) are only computed for the best score. 
98 | 
99 | ``` 
100 | 
101 | 
102 | To free up the allocated memory the following function is used: 
103 | 
104 | ```C 
105 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params); 
106 | ``` 
107 | 
108 | The `gasal_init_streams()` and `gasal_destroy_streams()` functions internally use `cudaMalloc()`, `cudaMallocHost()`, `cudaFree()` and `cudaFreeHost()`. These CUDA API functions are expensive, so `gasal_init_streams()` and `gasal_destroy_streams()` should preferably be called only once in the program. You will find all these functions in the file `ctors.cpp`.
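Putting the pieces above together, a minimal end-to-end initialization might look like the following sketch (the scores, the stream count, the size limits and the `LOCAL`/`WITH_START` choices are illustrative values, not defaults):

```C
gasal_subst_scores sub_scores = {1, 4, 6, 1}; // match, mismatch, gap_open, gap_extend
gasal_copy_subst_scores(&sub_scores);         // pass the scores and penalties to the GPU

Parameters *args = new Parameters(0, NULL);
args->algo = LOCAL;
args->start_pos = WITH_START;

gasal_gpu_storage_v gpu_storage_vec = gasal_init_gpu_storage_v(2); // 2 streams
gasal_init_streams(&gpu_storage_vec, 160, 1000, 10000, args);      // max_query_len, max_target_len, max_n_alns

// ... fill the batches and launch the alignments (see the following sections) ...

gasal_destroy_streams(&gpu_storage_vec, args);
gasal_destroy_gpu_storage_v(&gpu_storage_vec);
```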
109 | 
110 | 
111 | ### Input data preparation 
112 | The `gasal_gpu_storage_t` struct in `gasal.h` holds the data structures of a stream. In the following, we only show those members of `gasal_gpu_storage_t` which should be accessed by the user. The other fields should not be modified manually; the user should rely on the dedicated functions for complex operations. 
113 | 
114 | ```C 
115 | typedef struct{ 
116 | ... 
117 | uint8_t *host_query_op; 
118 | uint8_t *host_target_op; 
119 | ... 
120 | uint32_t *host_query_batch_offsets; 
121 | uint32_t *host_target_batch_offsets; 
122 | uint32_t *host_query_batch_lens; 
123 | uint32_t *host_target_batch_lens; 
124 | uint32_t host_max_query_batch_bytes; 
125 | uint32_t host_max_target_batch_bytes; 
126 | gasal_res_t *host_res; 
127 | gasal_res_t *host_res_second; 
128 | uint32_t host_max_n_alns; 
129 | uint32_t current_n_alns; 
130 | int is_free; 
131 | ... 
132 | } gasal_gpu_storage_t; 
133 | ``` 
134 | 
135 | 
136 | 
137 | To align the sequences, the user first needs to check the availability of a stream. If `is_free` is 1, the user can use the current stream to perform the alignment on the GPU. 
138 | To do this, the user must fill in the sequences with the following function: 
139 | 
140 | ```C 
141 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC); 
142 | 
143 | ``` 
144 | 
145 | This function takes a sequence and its length, and appends it to the data structure. It also adds the necessary pad bases to ensure that the sequence has a length which is a multiple of 8. Moreover, it takes care of allocating more memory if there is not enough room when adding the sequence. `SRC` is either `QUERY` or `TARGET`, depending upon which batch to fill. When executed, this function returns the offset to be filled by the user in `host_target_batch_offsets` or `host_query_batch_offsets`. The user also has to fill `host_target_batch_lens` or `host_query_batch_lens` with the original lengths of the sequences, i.e. the lengths without pad bases. **The offset values include pad bases, whereas the lengths are without pad bases**. The number of elements in the offset and length arrays must be equal. The offset values allow the user to express the mode of pairwise alignment, i.e. one-to-one, one-to-all, one-to-many etc., between the query and target sequences. `current_n_alns` must be incremented appropriately to reflect the current number of alignments. `host_max_n_alns` is initially set equal to `max_n_alns` in the `gasal_init_streams()` function. If `current_n_alns` exceeds `host_max_n_alns`, the user must call the following function to reallocate the host offset, length and result arrays: 
146 | 
147 | ```C 
148 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params); 
149 | 
150 | ``` 
151 | 
152 | where `new_max_alns` is the new value of `host_max_n_alns`. 
153 | 
154 | 
155 | One can also use `gasal_host_batch_addbase` to add a single base to a batch. This takes care of memory reallocation if needed, but does not take care of padding, so it has to be used carefully.
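For illustration, a typical filling loop is sketched below (modelled on the example program in `test_prog`; `gpu_storage_vec` and `args` come from the initialization sketch above, while `n_seqs_to_align`, `query_seqs` and `target_seqs` are hypothetical user-side variables, e.g. `std::vector<std::string>` containers filled from FASTA files):

```C
gasal_gpu_storage_t *gpu_storage = &(gpu_storage_vec.a[0]); // a stream with is_free == 1
uint32_t query_idx = 0, target_idx = 0;

for (uint32_t j = 0; j < n_seqs_to_align; j++) {
	gpu_storage->current_n_alns++;
	if (gpu_storage->current_n_alns > gpu_storage->host_max_n_alns)
		gasal_host_alns_resize(gpu_storage, gpu_storage->host_max_n_alns * 2, args);

	// record where each sequence starts; these offsets include the pad bases
	gpu_storage->host_query_batch_offsets[j] = query_idx;
	gpu_storage->host_target_batch_offsets[j] = target_idx;

	// append the sequences; the returned index already accounts for the padding
	query_idx = gasal_host_batch_fill(gpu_storage, query_idx, query_seqs[j].c_str(), query_seqs[j].size(), QUERY);
	target_idx = gasal_host_batch_fill(gpu_storage, target_idx, target_seqs[j].c_str(), target_seqs[j].size(), TARGET);

	// store the original (un-padded) lengths
	gpu_storage->host_query_batch_lens[j] = query_seqs[j].size();
	gpu_storage->host_target_batch_lens[j] = target_seqs[j].size();
}
```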
156 | 
157 | 
158 | The list of pre-processing operations (nothing, reverse, complement, reverse-complement) that have to be applied to a batch of sequences can be loaded into the gpu_storage with the function `gasal_op_fill`. Its code is in `interfaces.cpp`. It fills `host_query_op` or `host_target_op` with an array of size `host_max_n_alns`, where each value is a value of the `operation_on_seq` enumeration (in gasal.h): 
159 | ```C 
160 | enum operation_on_seq{ 
161 | FORWARD_NATURAL, 
162 | REVERSE_NATURAL, 
163 | FORWARD_COMPLEMENT, 
164 | REVERSE_COMPLEMENT, 
165 | }; 
166 | ``` 
167 | By default, no operations are done on the sequences (that is, the `host_query_op` and `host_target_op` arrays are initialized to 0, which is the value of FORWARD_NATURAL). 
168 | 
169 | 
170 | ### Alignment launching 
171 | To launch the alignment, the following function is used: 
172 | 
173 | ```C 
174 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params); 
175 | ``` 
176 | 
177 | The `actual_query_batch_bytes` and `actual_target_batch_bytes` specify the size of the two batches (in bytes) including the pad bases. `actual_n_alns` is the number of alignments to be performed. GASAL2 internally sets `is_free` to 0 after launching the alignment kernel on the GPU. From a performance perspective, if the average lengths of the sequences in *query_batch* and *target_batch* are not the same, then the shorter sequences should be placed in *query_batch*. For example, in the case of read mappers, the read sequences are contained in query_batch and the genome sequences in target_batch. 
178 | 
179 | The `gasal_aln_async()` function returns immediately after launching the alignment kernel on the GPU. The user can perform other tasks instead of waiting for the kernel to finish. To test whether the alignment on the GPU is finished, the following function is called: 
180 | 
181 | ```C 
182 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage); 
183 | ``` 
184 | 
185 | If the function returns 0, the alignment on the GPU is finished and the output arrays contain valid results. Moreover, `is_free` is set to 1 by GASAL2. Thus, the current stream can be used for the alignment of another batch of sequences. The function returns `-1` if the results are not ready. It returns `-2` if the function is called on a stream in which no alignment has been launched, i.e. `is_free == 1`.
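Under the same assumptions as the filling sketch above (`query_idx` and `target_idx` hold the byte counts returned by the last `gasal_host_batch_fill()` calls), a launch-and-poll cycle might look like this:

```C
// launch; query_idx and target_idx are the batch sizes in bytes, including pad bases
gasal_aln_async(gpu_storage, query_idx, target_idx, n_seqs_to_align, args); // sets is_free to 0

int status;
while ((status = gasal_is_aln_async_done(gpu_storage)) == -1) {
	// results not ready yet: overlap other CPU work here instead of blocking
}
// status == 0: the results are valid and the stream is free again (is_free == 1)
// status == -2 would mean no alignment was launched on this stream
```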
186 | 
187 | 
188 | ### Alignment results 
189 | The structure `gasal_res_t` holds the results of the alignment and can be accessed manually. Its fields are the following: 
190 | 
191 | ```C 
192 | struct gasal_res{ 
193 | int32_t *aln_score; 
194 | int32_t *query_batch_end; 
195 | int32_t *target_batch_end; 
196 | int32_t *query_batch_start; 
197 | int32_t *target_batch_start; 
198 | uint8_t *cigar; 
199 | uint32_t *n_cigar_ops; 
200 | }; 
201 | typedef struct gasal_res gasal_res_t; 
202 | ``` 
203 | The output of the alignments is stored in the `aln_score`, `query_batch_end`, `target_batch_end`, `query_batch_start`, `target_batch_start`, `cigar` and `n_cigar_ops` arrays, within the `host_res` structure inside the `gasal_gpu_storage_t` structure. `cigar` is a byte array which contains the traceback information, in CIGAR format, of all the alignments performed. The lower 2 bits of a byte indicate the CIGAR operation: 
204 | 
205 | ``` 
206 | 0 = match 
207 | 1 = mismatch 
208 | 2 = deletion 
209 | 3 = insertion 
210 | ``` 
211 | The upper 6 bits store the count of the operation in the lower two bits. The traceback information of an alignment in the `cigar` array is stored in the reverse direction. `host_query_batch_offsets` contains the offset of an alignment in the `cigar` array. `n_cigar_ops` contains the number of bytes in the `cigar` array encoding the traceback information of an alignment.
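Based on this encoding, the CIGAR of alignment `j` can be decoded by walking its bytes backwards, as in the following sketch (all variable names are illustrative):

```C
static const char op_char[4] = {'M', 'X', 'D', 'I'}; // match, mismatch, deletion, insertion

uint32_t offset = gpu_storage->host_query_batch_offsets[j]; // offset of alignment j in the cigar array
uint32_t n_ops  = gpu_storage->host_res->n_cigar_ops[j];

// entries are stored in reverse direction, so walk backwards to print the CIGAR left-to-right
for (int k = (int)n_ops - 1; k >= 0; k--) {
	uint8_t b = gpu_storage->host_res->cigar[offset + k];
	printf("%u%c", b >> 2, op_char[b & 3]); // upper 6 bits = count, lower 2 bits = operation
}
```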
212 | 
213 | In the case of the second-best result, the same applies to the fields in `host_res_second`. But the start-position and traceback are only computed for the best score. Therefore, only `host_res_second->aln_score`, `host_res_second->query_batch_end` and `host_res_second->target_batch_end` are valid for the second-best result. 
214 | 
215 | 
216 | 
217 | 
218 | ## Example 
219 | The `test_prog` directory contains an example program which uses GASAL2 for sequence alignment on the GPU. See the README in that directory for instructions about running the program. 
220 | 
221 | ## Citing GASAL2 
222 | GASAL2 is published in BMC Bioinformatics: 
223 | 
224 | N. Ahmed, J. Lévy, S. Ren, H. Mushtaq, K. Bertels and Z. Al-ars, __GASAL2: a GPU accelerated sequence alignment library for high-throughput NGS data__, *BMC Bioinformatics* 20, 520 (2019) doi: [10.1186/s12859-019-3086-9](https://doi.org/10.1186/s12859-019-3086-9). 
225 | 
226 | ## Problems and suggestions 
227 | For any issues and suggestions contact Nauman Ahmed at nahmed@uet.edu.pk. 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
-------------------------------------------------------------------------------- 
/src/kernels/semiglobal_kernel_template.h: 
-------------------------------------------------------------------------------- 
1 | #ifndef __KERNEL_SEMIGLOBAL__ 
2 | #define __KERNEL_SEMIGLOBAL__ 
3 | 
4 | 
5 | #define CORE_COMPUTE_SEMIGLOBAL_DEPRECATED() \ 
6 | uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence*/\ 
7 | DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);/*check the equality of rbase and gbase*/\ 
8 | /*int32_t curr_hm_diff = h[m] - _cudaGapOE;*/\ 
9 | f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/*whether to introduce or extend a gap in query_batch sequence*/\ 
10 | h[m] = p[m] + subScore;/*score if gbase is aligned to rbase*/\ 
11 | h[m] = max(h[m], f[m]);\ 
12 | e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence*/\ 
13 | /*prev_hm_diff=curr_hm_diff;*/\ 
14 | h[m] = max(h[m], e);\ 
15 | p[m] = h[m-1]; 
16 | 
17 | #define CORE_COMPUTE_SEMIGLOBAL() \ 
18 | uint32_t gbase = (gpac >> l) & 15; /* get a base from target_batch sequence */ \ 
19 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */\ 
20 | register int32_t curr_hm_diff = h[m] - _cudaGapOE;\ 
21 | f[m] = max(curr_hm_diff, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */\ 
22 | curr_hm_diff = p[m] + subScore;/* score if rbase is aligned to gbase */\ 
23 | curr_hm_diff = max(curr_hm_diff, f[m]);\ 
24 | e = max(prev_hm_diff, e - _cudaGapExtend);/* whether to introduce or extend a gap in target_batch sequence */\ 
25 | curr_hm_diff = max(curr_hm_diff, e);\ 
26 | h[m] = curr_hm_diff;\ 
27 | p[m] = prev_hm_diff + _cudaGapOE;\ 
28 | prev_hm_diff=curr_hm_diff - _cudaGapOE; 
29 | 
30 | 
31 | 
32 | /* typename meanings: 
33 | T : algorithm type. Unused at the moment for semi_global as only semi_global type is run in this kernel. Can be used to create several types of computing cores, for example. 
34 | S : WITH_ or WITHOUT_ Start. 
35 | HEAD : set to QUERY, TARGET, BOTH or NONE. Tells which HEAD (prefix) is allowed to be ignored. 
36 | TAIL : set to QUERY, TARGET, BOTH or NONE. Tells which TAIL (suffix) is allowed to be ignored. 
37 | */ 
38 | 
39 | template <typename T, typename S, typename B, typename HEAD, typename TAIL> 
40 | __global__ void gasal_semi_global_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks) 
41 | { 
42 | 
43 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID 
44 | if (tid >= n_tasks) return; 
45 | 
46 | int32_t i, j, k, l, m; 
47 | int32_t e; 
48 | 
49 | int32_t maxHH = MINUS_INF;//initialize the maximum score to -infinity 
50 | int32_t subScore; 
51 | int32_t ridx, gidx; 
52 | short2 HD; 
53 | short2 initHD = make_short2(0, 0); 
54 | uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence 
55 | uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence 
56 | uint32_t read_len = query_batch_lens[tid]; 
57 | uint32_t ref_len = target_batch_lens[tid]; 
58 | uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch 
59 | uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ?
1 : 0);//number of 32-bit words holding sequence of target_batch 60 | 61 | int32_t maxXY_y __attribute__((unused)) ; 62 | int32_t maxXY_x __attribute__((unused)) ; 63 | maxXY_x = ref_len; 64 | maxXY_y = read_len; 65 | 66 | 67 | int32_t maxHH_second __attribute__((unused)); // __attribute__((unused)) to avoid raising errors at compilation. most template-kernels don't use these. 68 | //int32_t prev_maxHH_second __attribute__((unused)); 69 | int32_t maxXY_x_second __attribute__((unused)); 70 | int32_t maxXY_y_second __attribute__((unused)); 71 | maxHH_second = MINUS_INF; 72 | //prev_maxHH_second = 0; 73 | maxXY_x_second = ref_len; 74 | maxXY_y_second = read_len; 75 | 76 | //-------arrays to save intermediate values---------------- 77 | short2 global[MAX_QUERY_LEN]; 78 | int32_t h[9]; 79 | int32_t f[9]; 80 | int32_t p[9]; 81 | //------------------------------------------------------- 82 | int32_t u __attribute__((unused)) ; // this variable may not be used in some cases, depending on which kernel is generated. 83 | int32_t r __attribute__((unused)) ; 84 | 85 | 86 | 87 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 88 | { 89 | for (i = 0; i < MAX_QUERY_LEN; i++) 90 | { 91 | global[i] = initHD; 92 | } 93 | } else { 94 | global[0] = make_short2(0, MINUS_INF); 95 | for (i = 1; i < MAX_QUERY_LEN; i++) 96 | { 97 | global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF); 98 | } 99 | } 100 | 101 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 102 | { 103 | u = 0; 104 | r = 0; 105 | h[u++] = 0; 106 | p[r++] = 0; 107 | } 108 | 109 | for (i = 0; i < target_batch_regs; i++) 110 | { //target_batch sequence in rows 111 | gidx = i << 3; 112 | ridx = 0; 113 | 114 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 115 | { 116 | for (m = 0; m < 9; m++) 117 | { 118 | h[m] = 0; 119 | f[m] = MINUS_INF; 120 | p[m] = 0; 121 | } 122 | } else { 123 | for (m = 1; m < 9; m++, u++, r++) 124 | { 125 | h[m] = -(_cudaGapO + (_cudaGapExtend*(u-1))); 126 | f[m] = MINUS_INF; 127 | p[m] = r == 1 ? 0 : -(_cudaGapO + (_cudaGapExtend*(r-1))); 128 | } 129 | } 130 | 131 | 132 | register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence 133 | 134 | for (j = 0; j < query_batch_regs; /*++j*/ j+=1) //query_batch sequence in columns 135 | { 136 | register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 packed bases from query_batch sequence 137 | 138 | //--------------compute a tile of 8x8 cells------------------- 139 | for (k = 28; k >= 0; k -= 4) 140 | { 141 | uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence 142 | //------------load intermediate values---------------------- 143 | HD = global[ridx]; 144 | h[0] = HD.x; 145 | e = HD.y; 146 | //---------------------------------------------------------- 147 | int32_t prev_hm_diff = h[0] - _cudaGapOE; 148 | #pragma unroll 8 149 | for (l = 28, m = 1; m < 9; l -= 4, m++) 150 | { 151 | CORE_COMPUTE_SEMIGLOBAL(); 152 | } 153 | //--------------save intermediate values------------------------- 154 | HD.x = h[m-1]; 155 | HD.y = e; 156 | global[ridx] = HD; 157 | ridx++; 158 | 159 | //------the last line of DP matrix------------ 160 | if (SAMETYPE(TAIL, Int2Type) || SAMETYPE(TAIL, Int2Type)) 161 | { 162 | if (ridx == read_len) 163 | { 164 | //----find the maximum and the corresponding end position----------- 165 | for (m = 1; m < 9; m++) 166 | { 167 | maxXY_y = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? 
gidx + (m-1) : maxXY_y; 168 | maxHH = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? h[m] : maxHH; 169 | 170 | if (SAMETYPE(B, Int2Type)) 171 | { 172 | bool override_second = (h[m] > maxHH_second && h[m] < maxHH && (gidx + m - 1) < ref_len); 173 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second; 174 | maxHH_second = (override_second) ? h[m] : maxHH_second; 175 | } 176 | } 177 | } // endif(ridx == read_len) 178 | } 179 | 180 | } // endfor() computing tile 181 | } // endfor() on query words 182 | } // endfor() on targt words 183 | 184 | 185 | if (SAMETYPE(TAIL, Int2Type) || SAMETYPE(TAIL, Int2Type)) 186 | { 187 | for (m = 0; m < MAX_QUERY_LEN; m++) 188 | { 189 | int32_t score_tmp = global[m].x; 190 | if (score_tmp > maxHH && m < read_len) 191 | { 192 | maxXY_x = m; 193 | maxHH = score_tmp; 194 | } 195 | if (SAMETYPE(B, Int2Type)) 196 | { 197 | bool override_second = (score_tmp > maxHH_second && score_tmp < maxHH && m < ref_len); 198 | maxXY_x_second = (override_second) ? m : maxXY_x_second; 199 | maxHH_second = (override_second) ? score_tmp : maxHH_second; 200 | } 201 | 202 | } 203 | /* if the X position has been updated and is not on the bottom line, then the max score is actually on the rightmost column. 204 | * Then, update the Y position to be on the rightmost column. 205 | */ 206 | if (maxXY_x != ref_len) 207 | maxXY_y = read_len; 208 | 209 | if (SAMETYPE(B, Int2Type)) 210 | { 211 | if (maxXY_x_second != ref_len) 212 | maxXY_y_second = read_len; 213 | } 214 | } 215 | 216 | device_res->aln_score[tid] = maxHH;//copy the max score to the output array in the GPU mem 217 | device_res->target_batch_end[tid] = maxXY_y;//copy the end position on the target_batch sequence to the output array in the GPU mem 218 | device_res->query_batch_end[tid] = maxXY_x;//copy the end position on the target_batch sequence to the output array in the GPU mem 219 | 220 | if (SAMETYPE(B, Int2Type)) 221 | { 222 | device_res_second->aln_score[tid] = maxHH_second; 223 | device_res_second->target_batch_end[tid] = maxXY_y_second; 224 | device_res_second->query_batch_end[tid] = maxXY_x_second; 225 | } 226 | 227 | if (SAMETYPE(S, Int2Type)) 228 | { 229 | 230 | /*------------------Now to find the start position-----------------------*/ 231 | 232 | uint32_t reverse_query_batch[(MAX_QUERY_LEN>>3)];//array to hold the reverse query_batch sequence 233 | uint32_t reverse_target_batch[(MAX_QUERY_LEN>>3)];//array to hold the reverse query_batch sequence 234 | uint32_t reverse_query_batch_reg; 235 | uint32_t reverse_target_batch_reg; 236 | 237 | for (i = 0; i < (MAX_QUERY_LEN>>3); i++) { 238 | reverse_query_batch[i] = 0; 239 | } 240 | for (i = 0; i < (MAX_QUERY_LEN>>3); i++) { 241 | reverse_target_batch[i] = 0; 242 | } 243 | 244 | //--------reverse query_batch sequence-------------------- 245 | for (i = read_len - 1, k = 0; i >= 0; i--, k++) { 246 | uint32_t orig_query_batch_reg = i >> 3; 247 | uint32_t orig_symbol_pos = (((orig_query_batch_reg + 1) << 3) - i) - 1; 248 | reverse_query_batch_reg = k >> 3; 249 | uint32_t reverse_symbol_pos = (((reverse_query_batch_reg + 1) << 3) - k) - 1; 250 | uint32_t orig_symbol = 0; 251 | orig_symbol = (packed_query_batch[packed_query_batch_idx + orig_query_batch_reg] >> (orig_symbol_pos << 2)) & 15; 252 | reverse_query_batch[reverse_query_batch_reg] |= (orig_symbol << (reverse_symbol_pos << 2)); 253 | } 254 | //--------------------------------------------------- 255 | 256 | 257 | //--------reverse target_batch sequence-------------------- 258 | for (i = ref_len - 1, k = 0; i >= 
0; i--, k++) { 259 | uint32_t orig_target_batch_reg = i >> 3; 260 | uint32_t orig_symbol_pos = (((orig_target_batch_reg + 1) << 3) - i) - 1; 261 | reverse_target_batch_reg = k >> 3; 262 | uint32_t reverse_symbol_pos = (((reverse_target_batch_reg + 1) << 3) - k) - 1; 263 | uint32_t orig_symbol = 0; 264 | orig_symbol = (packed_target_batch[packed_target_batch_idx + orig_target_batch_reg] >> (orig_symbol_pos << 2)) & 15; 265 | reverse_target_batch[reverse_target_batch_reg] |= (orig_symbol << (reverse_symbol_pos << 2)); 266 | } 267 | //--------------------------------------------------- 268 | 269 | int32_t gend_pos = maxXY_y;//end position on target_batch sequence 270 | int32_t fwd_score = maxHH;//the computed score 271 | 272 | //the index of 32-bit word containing the end position on target_batch sequence 273 | int32_t gend_reg = (target_batch_regs - ((gend_pos >> 3) + 1)) > 0 ? (target_batch_regs - ((gend_pos >> 3) + 1)) - 1 : (target_batch_regs - ((gend_pos >> 3) + 1)); 274 | 275 | maxHH = MINUS_INF; 276 | maxXY_y = 0; 277 | 278 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 279 | { 280 | for (i = 0; i < MAX_QUERY_LEN; i++) 281 | { 282 | global[i] = initHD; 283 | } 284 | } else { 285 | global[0] = make_short2(0, MINUS_INF); 286 | for (i = 1; i < MAX_QUERY_LEN; i++) 287 | { 288 | global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF); 289 | } 290 | } 291 | 292 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 293 | { 294 | u = 0; 295 | r = 0; 296 | h[u++] = 0; 297 | p[r++] = 0; 298 | } 299 | 300 | //------starting from the gend_reg, align the sequences in the reverse direction and exit if the max score >= fwd_score------ 301 | for (i = gend_reg; i < target_batch_regs && maxHH < fwd_score; i++) { //target_batch sequence in rows 302 | gidx = i << 3; 303 | ridx = 0; 304 | if (SAMETYPE(HEAD, Int2Type) || SAMETYPE(HEAD, Int2Type)) 305 | { 306 | for (m = 0; m < 9; m++) 307 | { 308 | h[m] = 0; 309 | f[m] = MINUS_INF; 310 | p[m] = 0; 311 | } 312 | } else { 313 | for (m = 1; m < 9; m++, u++, r++) 314 | { 315 | h[m] = -(_cudaGapO + (_cudaGapExtend*(u-1))); 316 | f[m] = MINUS_INF; 317 | p[m] = r == 1 ? 
0 : -(_cudaGapO + (_cudaGapExtend*(r-1))); 
318 | } 
319 | } 
320 | 
321 | register uint32_t gpac =reverse_target_batch[i];//load 8 packed bases from target_batch sequence 
322 | 
323 | for (j = 0; j < query_batch_regs && maxHH < fwd_score;j+=1) { //query_batch sequence in columns 
324 | register uint32_t rpac =reverse_query_batch[j];//load 8 packed bases from query_batch sequence 
325 | //--------------compute a tile of 8x8 cells------------------- 
326 | for (k = 28; k >= 0; k -= 4) { 
327 | uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence 
328 | //------------load intermediate values---------------------- 
329 | HD = global[ridx]; 
330 | h[0] = HD.x; 
331 | e = HD.y; 
332 | //-------------------------------------------------------- 
333 | int32_t prev_hm_diff = h[0] - _cudaGapOE; 
334 | #pragma unroll 8 
335 | for (l = 28, m = 1; m < 9; l -= 4, m++) { 
336 | CORE_COMPUTE_SEMIGLOBAL(); 
337 | } 
338 | //------------save intermediate values---------------------- 
339 | HD.x = h[m-1]; 
340 | HD.y = e; 
341 | global[ridx] = HD; 
342 | ridx++; 
343 | 
344 | //------the last line of DP matrix------------ 
345 | if (SAMETYPE(TAIL, Int2Type) || SAMETYPE(TAIL, Int2Type)) 
346 | { 
347 | if (ridx == read_len) 
348 | { 
349 | //----find the maximum and the corresponding end position----------- 
350 | for (m = 1; m < 9; m++) 
351 | { 
352 | maxXY_y = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? gidx + (m-1) : maxXY_y; 
353 | maxHH = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? h[m] : maxHH; 
354 | } 
355 | } // endif(ridx == read_len) 
356 | } 
357 | } // endfor() computing tile 
358 | } // endfor() on query words 
359 | } // endfor() on target words 
360 | 
361 | 
362 | if (SAMETYPE(TAIL, Int2Type) || SAMETYPE(TAIL, Int2Type)) 
363 | { 
364 | for (m = 0; m < MAX_QUERY_LEN; m++) 
365 | { 
366 | int32_t score_tmp = global[m].x; 
367 | if (score_tmp > maxHH && m < read_len) 
368 | { 
369 | maxXY_x = m; 
370 | maxHH = score_tmp; 
371 | } 
372 | } 
373 | /* if the X position has been updated and is not on the bottom line, then the max score is actually on the rightmost column. 
374 | * Then, update the Y position to be on the rightmost column. 
375 | */ 
376 | if (maxXY_x != ref_len) 
377 | maxXY_y = read_len; 
378 | } 
379 | 
380 | device_res->target_batch_start[tid] = (ref_len - 1) - maxXY_y;//copy the start position on target_batch sequence to the output array in the GPU mem 
381 | device_res->query_batch_start[tid] = (read_len - 1) - maxXY_x;//copy the start position on query_batch sequence to the output array in the GPU mem 
382 | 
383 | 
384 | } // endif(SAMETYPE(START, Int2Type())) 
385 | 
386 | return; 
387 | 
388 | } 
389 | #endif 
390 | 
-------------------------------------------------------------------------------- 
/test_prog/test_prog.cpp: 
-------------------------------------------------------------------------------- 
1 | 
2 | 
3 | #include "../include/gasal_header.h" 
4 | 
5 | 
6 | #include <string> 
7 | #include <vector> 
8 | #include <math.h> 
9 | #include <omp.h> 
10 | #include "Timer.h" 
11 | 
12 | #define NB_STREAMS 2 
13 | 
14 | //#define STREAM_BATCH_SIZE (262144) 
15 | // this gives each stream HALF of the sequences. 
16 | //#define STREAM_BATCH_SIZE ceil((double)target_seqs.size() / (double)(2)) 
17 | 
18 | #define STREAM_BATCH_SIZE 5000//ceil((double)target_seqs.size() / (double)(2 * 2)) 
19 | 
20 | 
21 | #define DEBUG 
22 | 
23 | #define MAX(a,b) (a>b ?
a : b) 
24 | 
25 | //#define GPU_SELECT 0 
26 | 
27 | 
28 | int main(int argc, char **argv) { 
29 | 
30 | //gasal_set_device(GPU_SELECT); 
31 | 
32 | Parameters *args; 
33 | args = new Parameters(argc, argv); 
34 | args->parse(); 
35 | args->print(); 
36 | 
37 | int print_out = args->print_out; 
38 | int n_threads = args->n_threads; 
39 | 
40 | //--------------copy substitution scores to GPU-------------------- 
41 | gasal_subst_scores sub_scores; 
42 | 
43 | sub_scores.match = args->sa; 
44 | sub_scores.mismatch = args->sb; 
45 | sub_scores.gap_open = args->gapo; 
46 | sub_scores.gap_extend = args->gape; 
47 | 
48 | gasal_copy_subst_scores(&sub_scores); 
49 | 
50 | //------------------------------------------------------------------- 
51 | 
52 | 
53 | std::vector<std::string> query_seqs; 
54 | std::vector<std::string> target_seqs; 
55 | std::vector<std::string> query_headers; 
56 | std::vector<std::string> target_headers; 
57 | std::string query_batch_line, target_batch_line; 
58 | 
59 | int total_seqs = 0; 
60 | uint32_t maximum_sequence_length = 0; 
61 | uint32_t target_seqs_len = 0; 
62 | uint32_t query_seqs_len = 0; 
63 | std::cerr << "Loading files...." << std::endl; 
64 | 
65 | /* 
66 | Reads FASTA files and fill the corresponding buffers. 
67 | FASTA files contain sequences that are usually on separate lines. 
68 | The file reader detects a '>' then concatenates all the following lines into one sequence, until the next '>' or EOF. 
69 | See more about FASTA format : https://en.wikipedia.org/wiki/FASTA_format 
70 | */ 
71 | 
72 | int seq_begin=0; 
73 | 
74 | std::vector<uint8_t> query_mod; 
75 | std::vector<uint8_t> target_mod; 
76 | std::vector<uint32_t> query_id; 
77 | std::vector<uint32_t> target_id; 
78 | 
79 | char line_starts[5] = "></+"; // the four modifier characters listed below, '\0'-terminated 
80 | 
81 | /* 
82 | * The first character of a FASTA header line acts as a modifier selecting the operation to apply to the sequence: 
83 | * - '>' translates to 0b00 (0) = Forward, natural 
84 | * - '<' translates to 0b01 (1) = Reverse, natural 
85 | * - '/' translates to 0b10 (2) = Forward, complemented 
86 | * - '+' translates to 0b11 (3) = Reverse, complemented 
87 | * No protection is done, so any other number will only have its two first bytes counted as above. 
88 | */ 
89 | 
90 | while (getline(args->query_batch_fasta, query_batch_line) && getline(args->target_batch_fasta, target_batch_line)) { 
91 | 
92 | //load sequences from the files 
93 | char *q = NULL; 
94 | char *t = NULL; 
95 | q = strchr(line_starts, (int) (query_batch_line[0])); 
96 | t = strchr(line_starts, (int) (target_batch_line[0])); 
97 | 
98 | /* 
99 | t and q are pointers to the first occurrence of the first read character in the line_starts array, 
100 | so if I compare the address of these pointers with the address of line_starts, then... 
101 | I can get which character was found, so which modifier is required. 
102 | */ 
103 | 
104 | if (q != NULL && t != NULL) { 
105 | total_seqs++; 
106 | 
107 | query_mod.push_back((uint8_t) (q-line_starts)); 
108 | query_id.push_back(total_seqs); 
109 | 
110 | target_mod.push_back((uint8_t)(t-line_starts)); 
111 | target_id.push_back(total_seqs); 
112 | 
113 | query_headers.push_back(query_batch_line.substr(1)); 
114 | target_headers.push_back(target_batch_line.substr(1)); 
115 | 
116 | if (seq_begin == 2) { 
117 | // a sequence was already being read. Now it's done, so we should find its length.
118 | target_seqs_len += (target_seqs.back()).length(); 119 | query_seqs_len += (query_seqs.back()).length(); 120 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 121 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 122 | } 123 | seq_begin = 1; 124 | 125 | } else if (seq_begin == 1) { 126 | query_seqs.push_back(query_batch_line); 127 | target_seqs.push_back(target_batch_line); 128 | seq_begin=2; 129 | } else if (seq_begin == 2) { 130 | query_seqs.back() += query_batch_line; 131 | target_seqs.back() += target_batch_line; 132 | } else { // should never happen but always put an else, for safety... 133 | seq_begin = 0; 134 | std::cerr << "Batch1 and target_batch files should be fasta having same number of sequences" << std::endl; 135 | exit(EXIT_FAILURE); 136 | } 137 | } 138 | 139 | 140 | 141 | // Check maximum sequence length one more time, to check the last read sequence: 142 | target_seqs_len += (target_seqs.back()).length(); 143 | query_seqs_len += (query_seqs.back()).length(); 144 | maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length); 145 | maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length); 146 | int maximum_sequence_length_query = MAX((query_seqs.back()).length(), 0); 147 | 148 | #ifdef DEBUG 149 | std::cerr << "[TEST_PROG DEBUG]: "; 150 | std::cerr << "Size of read batches are: query=" << query_seqs_len << ", target=" << target_seqs_len << ". maximum_sequence_length=" << maximum_sequence_length << std::endl; 151 | #endif 152 | 153 | 154 | // transforming the _mod into a char* array (to be passed to GASAL, which deals with C types) 155 | uint8_t *target_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 156 | uint8_t *query_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) ); 157 | uint32_t *target_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 158 | uint32_t *query_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) ); 159 | 160 | for (int i = 0; i < total_seqs; i++) 161 | { 162 | query_seq_mod[i] = query_mod.at(i); 163 | query_seq_id[i] = query_id.at(i); 164 | } 165 | 166 | #ifdef DEBUG 167 | std::cerr << "[TEST_PROG DEBUG]: query, mod@id="; 168 | for (int i = 0; i < total_seqs; i++) 169 | { 170 | if ((query_seq_mod[i]) > 0) 171 | std::cerr << +(query_seq_mod[i]) << "@" << query_seq_id[i] << "| "; 172 | } 173 | 174 | std::cerr << std::endl; 175 | #endif 176 | 177 | for (int i = 0; i < total_seqs; i++) 178 | { 179 | target_seq_mod[i] = target_mod.at(i); 180 | target_seq_id[i] = target_id.at(i); 181 | } 182 | 183 | int *thread_seqs_idx = (int*)malloc(n_threads*sizeof(int)); 184 | int *thread_n_seqs = (int*)malloc(n_threads*sizeof(int)); 185 | int *thread_n_batchs = (int*)malloc(n_threads*sizeof(int)); 186 | double *thread_misc_time = (double*)calloc(n_threads, sizeof(double)); 187 | 188 | int thread_batch_size = (int)ceil((double)total_seqs/n_threads); 189 | int n_seqs_alloc = 0; 190 | for (int i = 0; i < n_threads; i++){//distribute the sequences among the threads equally 191 | thread_seqs_idx[i] = n_seqs_alloc; 192 | if (n_seqs_alloc + thread_batch_size < total_seqs) thread_n_seqs[i] = thread_batch_size; 193 | else thread_n_seqs[i] = total_seqs - n_seqs_alloc; 194 | thread_n_batchs[i] = (int)ceil((double)thread_n_seqs[i]/(STREAM_BATCH_SIZE)); 195 | n_seqs_alloc += thread_n_seqs[i]; 196 | } 197 | 198 | std::cerr << "Processing..." 
<< std::endl; 199 | 200 | Timer total_time; 201 | total_time.Start(); 202 | omp_set_num_threads(n_threads); 203 | gasal_gpu_storage_v *gpu_storage_vecs = (gasal_gpu_storage_v*)calloc(n_threads, sizeof(gasal_gpu_storage_v)); 204 | for (int z = 0; z < n_threads; z++) { 205 | gpu_storage_vecs[z] = gasal_init_gpu_storage_v(NB_STREAMS);// creating NB_STREAMS streams per thread 206 | 207 | /* 208 | About memory sizes: 209 | The required memory is the total size of the batch + its padding, divided by the number of streams. 210 | The worst case would be that every sequence has to be padded with 7 'N', since they must have a length multiple of 8. 211 | Even though the memory can be dynamically expanded both for Host and Device, it is advised to start with a memory large enough so that these expansions rarely occur (for better performance.) 212 | Modifying the factor '1' in front of each size lets you see how GASAL2 expands the memory when needed. 213 | */ 214 | /* 215 | // For exemple, this is exactly the memory needed to allocate to fit all sequences is a single GPU BATCH. 216 | gasal_init_streams(&(gpu_storage_vecs[z]), 217 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 218 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 219 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 220 | 1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 221 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), // maximum number of alignments is bigger on target than on query side. 222 | ceil((double)target_seqs.size() / (double)(NB_STREAMS)), 223 | args); 224 | */ 225 | //initializing the streams by allocating the required CPU and GPU memory 226 | // note: the calculations of the detailed sizes to allocate could be done on the library side (to hide it from the user's perspective) 227 | gasal_init_streams(&(gpu_storage_vecs[z]), (maximum_sequence_length_query + 7) , //TODO: remove maximum_sequence_length_query 228 | (maximum_sequence_length + 7) , 229 | STREAM_BATCH_SIZE, //device 230 | args); 231 | } 232 | #ifdef DEBUG 233 | std::cerr << "[TEST_PROG DEBUG]: "; 234 | std::cerr << "size of host_unpack_query is " << (query_seqs_len +7*total_seqs) / (NB_STREAMS) << std::endl ; 235 | #endif 236 | 237 | #pragma omp parallel 238 | { 239 | int n_seqs = thread_n_seqs[omp_get_thread_num()];//number of sequences allocated to this thread 240 | int curr_idx = thread_seqs_idx[omp_get_thread_num()];//number of sequences allocated to this thread 241 | int seqs_done = 0; 242 | int n_batchs_done = 0; 243 | 244 | struct gpu_batch{ //a struct to hold data structures of a stream 245 | gasal_gpu_storage_t *gpu_storage; //the struct that holds the GASAL2 data structures 246 | int n_seqs_batch;//number of sequences in the batch (<= (target_seqs.size() / NB_STREAMS)) 247 | int batch_start;//starting index of batch 248 | }; 249 | 250 | #ifdef DEBUG 251 | std::cerr << "[TEST_PROG DEBUG]: "; 252 | std::cerr << "Number of gpu_batch in gpu_batch_arr : " << gpu_storage_vecs[omp_get_thread_num()].n << std::endl; 253 | std::cerr << "[TEST_PROG DEBUG]: "; 254 | std::cerr << "Number of gpu_storage_vecs in a gpu_batch : " << omp_get_thread_num()+1 << std::endl; 255 | #endif 256 | 257 | gpu_batch gpu_batch_arr[gpu_storage_vecs[omp_get_thread_num()].n]; 258 | 259 | for(int z = 0; z < gpu_storage_vecs[omp_get_thread_num()].n; z++) { 260 | gpu_batch_arr[z].gpu_storage = &(gpu_storage_vecs[omp_get_thread_num()].a[z]); 261 | 262 | } 263 | 264 | if 
264 | if (n_seqs > 0) {
265 | while (n_batchs_done < thread_n_batchs[omp_get_thread_num()]) { // Loop on streams
266 | int gpu_batch_arr_idx = 0;
267 | //------------checking the availability of a "free" stream-----------------
268 | while(gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n && (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->is_free != 1) {
269 | gpu_batch_arr_idx++;
270 | }
271 | 
272 | if (seqs_done < n_seqs && gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {
273 | uint32_t query_batch_idx = 0;
274 | uint32_t target_batch_idx = 0;
275 | unsigned int j = 0;
276 | //-----------Create a batch of sequences to be aligned on the GPU. The batch contains at most (STREAM_BATCH_SIZE) sequences-----------------------
277 | 
278 | 
279 | for (int i = curr_idx; seqs_done < n_seqs && j < (STREAM_BATCH_SIZE); i++, j++, seqs_done++)
280 | {
281 | 
282 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns++ ;
283 | 
284 | if(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns > gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns)
285 | {
286 | gasal_host_alns_resize(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns * 2, args);
287 | }
288 | 
289 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j] = query_batch_idx;
290 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_offsets[j] = target_batch_idx;
291 | 
292 | /*
293 | All the filling is moved to the library side, to take care of the memory size and expansions (when needed).
294 | The function gasal_host_batch_fill takes care of how to fill, how much to pad with 'N', and how to deal with memory.
295 | It's the same function for query and target; you only need to set the final flag to either QUERY or TARGET. This avoids code duplication.
296 | The way the host memory is filled changes the current _idx (it's increased by the size and by the padding). That's why it's returned by the function.
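A minimal usage sketch (hypothetical 4-base sequence, same call pattern as below):

    uint32_t idx = 0;
    idx = gasal_host_batch_fill(gpu_storage, idx, "ACGT", 4, QUERY);
    // idx is now 8, not 4: the sequence is padded with 'N' up to the next
    // multiple of 8 bytes, and the returned offset accounts for that padding,
    // so the next fill starts on a properly aligned boundary.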
297 | */ 298 | 299 | query_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 300 | query_batch_idx, 301 | query_seqs[i].c_str(), 302 | query_seqs[i].size(), 303 | QUERY); 304 | 305 | target_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 306 | target_batch_idx, 307 | target_seqs[i].c_str(), 308 | target_seqs[i].size(), 309 | TARGET); 310 | 311 | 312 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_lens[j] = query_seqs[i].size(); 313 | (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_lens[j] = target_seqs[i].size(); 314 | 315 | } 316 | 317 | #ifdef DEBUG 318 | std::cerr << "[TEST_PROG DEBUG]: "; 319 | std::cerr << "Stream " << gpu_batch_arr_idx << ": j = " << j << ", seqs_done = " << seqs_done <<", query_batch_idx=" << query_batch_idx << " , target_batch_idx=" << target_batch_idx << std::endl; 320 | #endif 321 | 322 | // Here, we fill the operations arrays for the current batch to be processed by the stream 323 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_seq_mod + seqs_done - j, j, QUERY); 324 | gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, target_seq_mod + seqs_done - j, j, TARGET); 325 | 326 | 327 | gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch = j; 328 | uint32_t query_batch_bytes = query_batch_idx; 329 | uint32_t target_batch_bytes = target_batch_idx; 330 | gpu_batch_arr[gpu_batch_arr_idx].batch_start = curr_idx; 331 | curr_idx += (STREAM_BATCH_SIZE); 332 | 333 | //---------------------------------------------------------------------------------------------------- 334 | //-----------------calling the GASAL2 non-blocking alignment function--------------------------------- 335 | 336 | gasal_aln_async(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_batch_bytes, target_batch_bytes, gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch, args); 337 | gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns = 0; 338 | //--------------------------------------------------------------------------------- 339 | } 340 | 341 | 342 | //-------------------------------print alignment results---------------------------------------- 343 | 344 | gpu_batch_arr_idx = 0; 345 | while (gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {//loop through all the streams and print the results 346 | //of the finished streams. 
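// CIGAR decoding note (this is what the loop below assumes): each byte of
// host_res->cigar packs one run, with the operation in the low 2 bits
// (0=M, 1=X, 2=D, 3=I) and the run length in the remaining high bits (value >> 2).
// Entries are stored in reverse order, so the loop walks from n_cigar_ops-1 down
// to 0, merging adjacent runs that share the same operation. For example, the
// entries {count=3,op=0} then {count=2,op=2}, read in that order, print as "3M2D".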
347 | if (gasal_is_aln_async_done(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage) == 0) {
348 | int j = 0;
349 | if(print_out) {
350 | #pragma omp critical
351 | for (int i = gpu_batch_arr[gpu_batch_arr_idx].batch_start; j < gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch; i++, j++) {
352 | 
353 | std::cout << "query_name=" << query_headers[i] ;
354 | std::cout << "\ttarget_name=" << target_headers[i] ;
355 | std::cout << "\tscore=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->aln_score[j] ;
356 | 
357 | 
358 | /// WARNING : INEQUALITY ON ENUM: CAN BREAK IF ENUM ORDER IS CHANGED
359 | if ((args->start_pos == WITH_START || args->start_pos == WITH_TB)
360 | && ((args->algo == SEMI_GLOBAL && (args->semiglobal_skipping_head != NONE || args->semiglobal_skipping_tail != NONE))
361 | || args->algo > SEMI_GLOBAL))
362 | {
363 | std::cout << "\tquery_batch_start=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_start[j];
364 | std::cout << "\ttarget_batch_start=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_start[j];
365 | }
366 | 
367 | if (args->algo != GLOBAL)
368 | {
369 | std::cout << "\tquery_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_end[j];
370 | std::cout << "\ttarget_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_end[j] ;
371 | }
372 | 
373 | 
374 | 
375 | if (args->secondBest)
376 | {
377 | std::cout << "\t2nd_score=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->aln_score[j] ;
378 | std::cout << "\t2nd_query_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->query_batch_end[j];
379 | std::cout << "\t2nd_target_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->target_batch_end[j] ;
380 | }
381 | 
382 | if (args->start_pos == WITH_TB) {
383 | std::cout << "\tCIGAR=";
384 | int u;
385 | int offset = (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j];
386 | int n_cigar_ops = (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->n_cigar_ops[j];
387 | int last_op = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + n_cigar_ops - 1]) & 3;
388 | int count = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + n_cigar_ops - 1]) >> 2;
389 | for (u = n_cigar_ops - 2; u >= 0 ; u--){
390 | int curr_op = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) & 3;
391 | if (curr_op == last_op) {
392 | count += ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) >> 2;
393 | } else {
394 | char op;
395 | switch (last_op) {
396 | case 0: op = 'M';
397 | break;
398 | case 1: op = 'X';
399 | break;
400 | case 2: op = 'D';
401 | break;
402 | case 3: op = 'I';
403 | break;
404 | default: op = 'E';
405 | break;
406 | 
407 | }
408 | std::cout << count << op;
409 | count = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) >> 2;
410 | 
411 | }
412 | last_op = curr_op;
413 | 
414 | }
415 | char op;
416 | switch (last_op) {
417 | case 0: op = 'M';
418 | break;
419 | case 1: op = 'X';
420 | break;
421 | case 2: op = 'D';
422 | break;
423 | case 3: op = 'I';
424 | break;
425 | default: op = 'E'; break; // as in the first switch, so op can never be read uninitialized
426 | }
427 | std::cout << count << op;
428 | }
429 | std::cout << std::endl;
430 | }
431 | }
432 | n_batchs_done++;
433 | }
434 | gpu_batch_arr_idx++;
435 | }
436 | }
437 | }
438 | 
439 | 
440 | }
441 | for (int z = 0; z < n_threads; z++) {
442 | 
gasal_destroy_streams(&(gpu_storage_vecs[z]), args); 443 | gasal_destroy_gpu_storage_v(&(gpu_storage_vecs[z])); 444 | } 445 | free(gpu_storage_vecs); 446 | total_time.Stop(); 447 | /* 448 | string algorithm = al_type; 449 | string start_type[2] = {"without_start", "with_start"}; 450 | al_type += "_"; 451 | al_type += start_type[start_pos==WITH_START]; 452 | */ 453 | double av_misc_time = 0.0; 454 | for (int i = 0; i < n_threads; ++i){ 455 | av_misc_time += (thread_misc_time[i]/n_threads); 456 | } 457 | std::cerr << std::endl << "Done" << std::endl; 458 | fprintf(stderr, "Total execution time (in milliseconds): %.3f\n", total_time.GetTime()); 459 | delete args; // closes the files 460 | //free(args); // closes the files 461 | } 462 | -------------------------------------------------------------------------------- /src/gasal_align.cu: -------------------------------------------------------------------------------- 1 | #include "gasal.h" 2 | #include "args_parser.h" 3 | #include "res.h" 4 | #include "gasal_align.h" 5 | #include "gasal_kernels.h" 6 | #include "host_batch.h" 7 | 8 | 9 | 10 | inline void gasal_kernel_launcher(int32_t N_BLOCKS, int32_t BLOCKDIM, algo_type algo, comp_start start, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns, int32_t k_band, data_source semiglobal_skipping_head, data_source semiglobal_skipping_tail, Bool secondBest) 11 | { 12 | switch(algo) 13 | { 14 | 15 | KERNEL_SWITCH(LOCAL, start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest); 16 | KERNEL_SWITCH(SEMI_GLOBAL, start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest); // MACRO that expands all 32 semi-global kernels 17 | KERNEL_SWITCH(GLOBAL, start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest); 18 | KERNEL_SWITCH(KSW, start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest); 19 | KERNEL_SWITCH(BANDED, start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest); 20 | default: 21 | break; 22 | 23 | } 24 | 25 | } 26 | 27 | 28 | //GASAL2 asynchronous (a.k.a non-blocking) alignment function 29 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params) { 30 | 31 | cudaError_t err; 32 | if (actual_n_alns <= 0) { 33 | fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n"); 34 | exit(EXIT_FAILURE); 35 | } 36 | if (actual_query_batch_bytes <= 0) { 37 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n"); 38 | exit(EXIT_FAILURE); 39 | } 40 | if (actual_target_batch_bytes <= 0) { 41 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n"); 42 | exit(EXIT_FAILURE); 43 | } 44 | 45 | if (actual_query_batch_bytes % 8) { 46 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes); 47 | exit(EXIT_FAILURE); 48 | } 49 | if (actual_target_batch_bytes % 8) { 50 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes); 51 | exit(EXIT_FAILURE); 52 | } 53 | 54 | if (actual_query_batch_bytes > gpu_storage->host_max_query_batch_bytes) { 55 | fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes(%d) > host_max_query_batch_bytes(%d)\n", actual_query_batch_bytes, gpu_storage->host_max_query_batch_bytes); 56 | exit(EXIT_FAILURE); 57 | } 58 | 59 | if (actual_target_batch_bytes > gpu_storage->host_max_target_batch_bytes) { 60 | fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes(%d) 
> host_max_target_batch_bytes(%d)\n", actual_target_batch_bytes, gpu_storage->host_max_target_batch_bytes); 61 | exit(EXIT_FAILURE); 62 | } 63 | 64 | if (actual_n_alns > gpu_storage->host_max_n_alns) { 65 | fprintf(stderr, "[GASAL ERROR:] actual_n_alns(%d) > host_max_n_alns(%d)\n", actual_n_alns, gpu_storage->host_max_n_alns); 66 | exit(EXIT_FAILURE); 67 | } 68 | 69 | //--------------if pre-allocated memory is less, allocate more-------------------------- 70 | if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) { 71 | 72 | int i = 2; 73 | while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++; 74 | 75 | fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i); 76 | 77 | gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i; 78 | 79 | if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch)); 80 | if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch)); 81 | 82 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t))); 83 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t))); 84 | 85 | if (params->start_pos==WITH_TB){ 86 | fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated HOST memory for CIGAR (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on the host (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i); 87 | if (gpu_storage->host_res->cigar != NULL)CHECKCUDAERROR(cudaFreeHost(gpu_storage->host_res->cigar)); 88 | CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage->host_res->cigar), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t),cudaHostAllocDefault)); 89 | } 90 | 91 | } 92 | 93 | if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) { 94 | 95 | int i = 2; 96 | while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++; 97 | 98 | fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). 
Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i); 99 | 100 | gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i; 101 | 102 | if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch)); 103 | if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch)); 104 | 105 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t))); 106 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t))); 107 | 108 | 109 | } 110 | 111 | if (gpu_storage->gpu_max_n_alns < actual_n_alns) { 112 | 113 | int i = 2; 114 | while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++; 115 | 116 | fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i); 117 | 118 | gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i; 119 | 120 | if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets)); 121 | if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets)); 122 | if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens)); 123 | if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens)); 124 | 125 | if (gpu_storage->seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->seed_scores)); 126 | 127 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 128 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 129 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 130 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 131 | 132 | CHECKCUDAERROR(cudaMalloc(&(gpu_storage->seed_scores), gpu_storage->gpu_max_n_alns * sizeof(uint32_t))); 133 | 134 | gasal_res_destroy_device(gpu_storage->device_res, gpu_storage->device_cpy); 135 | gpu_storage->device_cpy = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params); 136 | gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy); 137 | 138 | if (params->secondBest) 139 | { 140 | gasal_res_destroy_device(gpu_storage->device_res_second, gpu_storage->device_cpy_second); 141 | gpu_storage->device_cpy_second = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params); 142 | gpu_storage->device_res_second = gasal_res_new_device(gpu_storage->device_cpy_second); 143 | } 144 | 145 | } 146 | //------------------------------------------ 147 | 148 | //------------------------launch copying of sequence batches from CPU to GPU--------------------------- 149 | 150 | // here you can track the evolution of your data structure processing with the printer: gasal_host_batch_printall(current); 151 | 152 | host_batch_t *current = gpu_storage->extensible_host_unpacked_query_batch; 153 
| while (current != NULL)
154 | {
155 | //gasal_host_batch_printall(current);
156 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_query_batch[current->offset]),
157 | current->data,
158 | current->data_size,
159 | cudaMemcpyHostToDevice,
160 | gpu_storage->str ) );
161 | 
162 | current = current->next;
163 | }
164 | 
165 | current = gpu_storage->extensible_host_unpacked_target_batch;
166 | while (current != NULL)
167 | {
168 | CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_target_batch[current->offset]),
169 | current->data,
170 | current->data_size,
171 | cudaMemcpyHostToDevice,
172 | gpu_storage->str ) );
173 | 
174 | current = current->next;
175 | }
176 | 
177 | //-----------------------------------------------------------------------------------------------------------
178 | // TODO: Adjust the block size depending on the kernel execution.
179 | 
180 | uint32_t BLOCKDIM = 128;
181 | uint32_t N_BLOCKS = (actual_n_alns + BLOCKDIM - 1) / BLOCKDIM;
182 | 
183 | int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
184 | int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
185 | 
186 | 
187 | //-------------------------------------------launch packing kernel
188 | 
189 | 
190 | if (!(params->isPacked))
191 | {
192 | gasal_pack_kernel<<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>((uint32_t*)(gpu_storage->unpacked_query_batch),
193 | (uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch,
194 | query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4);
195 | cudaError_t pack_kernel_err = cudaGetLastError();
196 | if ( cudaSuccess != pack_kernel_err )
197 | {
198 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err, __LINE__, __FILE__);
199 | exit(EXIT_FAILURE);
200 | }
201 | }
202 | 
203 | 
204 | // We could reverse-complement before packing, but we would get 2x more read-writes to memory.
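// Packed layout produced by gasal_pack_kernel, as consumed by the alignment kernels:
// every 8 input bytes become one uint32_t holding eight 4-bit base codes, most
// significant nibble first. This is why batch offsets are divided by 8 (offset >> 3)
// and why batch sizes must be multiples of 8 bytes. A kernel then extracts base k of
// a word w with (w >> (28 - 4*k)) & 15, the (gpac >> l) & 15 pattern of the kernel templates.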
205 | 
206 | //----------------------launch copying of sequence offsets and lengths from CPU to GPU--------------------------------------
207 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_lens, gpu_storage->host_query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
208 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_lens, gpu_storage->host_target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
209 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_offsets, gpu_storage->host_query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
210 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_offsets, gpu_storage->host_target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
211 | 
212 | // if needed copy seed scores
213 | if (params->algo == KSW)
214 | {
215 | if (gpu_storage->seed_scores == NULL)
216 | {
217 | fprintf(stderr, "seed_scores == NULL\n");
218 | 
219 | }
220 | if (gpu_storage->host_seed_scores == NULL)
221 | {
222 | fprintf(stderr, "host_seed_scores == NULL\n");
223 | }
224 | if (gpu_storage->seed_scores == NULL || gpu_storage->host_seed_scores == NULL)
225 | exit(EXIT_FAILURE);
226 | 
227 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->seed_scores, gpu_storage->host_seed_scores, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
228 | }
229 | --------------------------------------------------------------------------------------------------------------------------
230 | 
231 | //----------------------launch copying of sequence operations (reverse/complement) from CPU to GPU--------------------------
232 | if (params->isReverseComplement)
233 | {
234 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_op, gpu_storage->host_query_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
235 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_op, gpu_storage->host_target_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice, gpu_storage->str));
236 | //--------------------------------------launch reverse-complement kernel------------------------------------------------------
237 | gasal_reversecomplement_kernel<<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
238 | gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->query_op, gpu_storage->target_op, actual_n_alns);
239 | cudaError_t reversecomplement_kernel_err = cudaGetLastError();
240 | if ( cudaSuccess != reversecomplement_kernel_err )
241 | {
242 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. 
%d in file %s\n", cudaGetErrorString(reversecomplement_kernel_err), reversecomplement_kernel_err, __LINE__, __FILE__); 243 | exit(EXIT_FAILURE); 244 | } 245 | 246 | } 247 | 248 | //--------------------------------------launch alignment kernels-------------------------------------------------------------- 249 | gasal_kernel_launcher(N_BLOCKS, BLOCKDIM, params->algo, params->start_pos, gpu_storage, actual_n_alns, params->k_band, params->semiglobal_skipping_head, params->semiglobal_skipping_tail, params->secondBest); 250 | 251 | //if (params->start_pos == WITH_TB) { 252 | 253 | // The output of the kernel: gpu_storage->unpacked_query_batch = cigar, gpu_storage->query_batch_lens = n_cigar_ops 254 | //gasal_get_tbalgo>><<str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns); 255 | //} 256 | 257 | //----------------------------------------------------------------------------------------------------------------------- 258 | cudaError_t aln_kernel_err = cudaGetLastError(); 259 | if ( cudaSuccess != aln_kernel_err ) 260 | { 261 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err, __LINE__, __FILE__); 262 | exit(EXIT_FAILURE); 263 | } 264 | 265 | //------------------------0launch the copying of alignment results from GPU to CPU-------------------------------------- 266 | if (gpu_storage->host_res->aln_score != NULL && gpu_storage->device_cpy->aln_score != NULL) 267 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->aln_score, gpu_storage->device_cpy->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 268 | 269 | if (gpu_storage->host_res->query_batch_start != NULL && gpu_storage->device_cpy->query_batch_start != NULL) 270 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_start, gpu_storage->device_cpy->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 271 | 272 | if (gpu_storage->host_res->target_batch_start != NULL && gpu_storage->device_cpy->target_batch_start != NULL) 273 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_start, gpu_storage->device_cpy->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 274 | 275 | if (gpu_storage->host_res->query_batch_end != NULL && gpu_storage->device_cpy->query_batch_end != NULL) 276 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_end, gpu_storage->device_cpy->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 277 | 278 | if (gpu_storage->host_res->target_batch_end != NULL && gpu_storage->device_cpy->target_batch_end != NULL) 279 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_end, gpu_storage->device_cpy->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 280 | if (params->start_pos == WITH_TB) { 281 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->cigar, gpu_storage->unpacked_query_batch, actual_query_batch_bytes * sizeof(uint8_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 282 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->n_cigar_ops, gpu_storage->query_batch_lens, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str)); 283 | } 284 | 
//-----------------------------------------------------------------------------------------------------------------------
285 | 
286 | 
287 | // filtering on params->secondBest is not really needed: without secondBest, all the second-best pointers are NULL, so the guarded copies below are skipped anyway.
288 | if (params->secondBest)
289 | {
290 | if (gpu_storage->host_res_second->aln_score != NULL && gpu_storage->device_cpy_second->aln_score != NULL)
291 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->aln_score, gpu_storage->device_cpy_second->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
292 | 
293 | if (gpu_storage->host_res_second->query_batch_start != NULL && gpu_storage->device_cpy_second->query_batch_start != NULL)
294 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->query_batch_start, gpu_storage->device_cpy_second->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
295 | 
296 | if (gpu_storage->host_res_second->target_batch_start != NULL && gpu_storage->device_cpy_second->target_batch_start != NULL)
297 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->target_batch_start, gpu_storage->device_cpy_second->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
298 | 
299 | if (gpu_storage->host_res_second->query_batch_end != NULL && gpu_storage->device_cpy_second->query_batch_end != NULL)
300 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->query_batch_end, gpu_storage->device_cpy_second->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
301 | 
302 | if (gpu_storage->host_res_second->target_batch_end != NULL && gpu_storage->device_cpy_second->target_batch_end != NULL)
303 | CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->target_batch_end, gpu_storage->device_cpy_second->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
304 | }
305 | 
306 | gpu_storage->is_free = 0; //set the availability of current stream to false
307 | }
308 | 
309 | 
310 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage)
311 | {
312 | cudaError_t err;
313 | if(gpu_storage->is_free == 1) return -2;//if no work is launched in this stream, return -2
314 | err = cudaStreamQuery(gpu_storage->str);//check to see if the stream is finished
315 | if (err != cudaSuccess ) {
316 | if (err == cudaErrorNotReady) return -1;
317 | else{
318 | fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. 
%d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__); 319 | exit(EXIT_FAILURE); 320 | } 321 | } 322 | gasal_host_batch_reset(gpu_storage); 323 | gpu_storage->is_free = 1; 324 | gpu_storage->current_n_alns = 0; 325 | return 0; 326 | } 327 | 328 | 329 | void gasal_copy_subst_scores(gasal_subst_scores *subst){ 330 | 331 | cudaError_t err; 332 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapO, &(subst->gap_open), sizeof(int32_t), 0, cudaMemcpyHostToDevice)); 333 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapExtend, &(subst->gap_extend), sizeof(int32_t), 0, cudaMemcpyHostToDevice)); 334 | int32_t gapoe = (subst->gap_open + subst->gap_extend); 335 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapOE, &(gapoe), sizeof(int32_t), 0, cudaMemcpyHostToDevice)); 336 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMatchScore, &(subst->match), sizeof(int32_t), 0, cudaMemcpyHostToDevice)); 337 | CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMismatchScore, &(subst->mismatch), sizeof(int32_t), 0, cudaMemcpyHostToDevice)); 338 | return; 339 | } 340 | 341 | -------------------------------------------------------------------------------- /src/kernels/local_kernel_template.h: -------------------------------------------------------------------------------- 1 | #ifndef __LOCAL_KERNEL_TEMPLATE__ 2 | #define __LOCAL_KERNEL_TEMPLATE__ 3 | 4 | 5 | // This old core provides the same result as the currently LOCAL core, but lacks some optimization. Left for historical / comparative purposes. 6 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \ 7 | uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence */ \ 8 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */ \ 9 | f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \ 10 | h[m] = p[m] + subScore; /*score if rbase is aligned to gbase*/ \ 11 | h[m] = max(h[m], f[m]); \ 12 | h[m] = max(h[m], 0); \ 13 | e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\ 14 | h[m] = max(h[m], e); \ 15 | maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \ 16 | maxHH = (maxHH < h[m]) ? h[m] : maxHH; \ 17 | p[m] = h[m-1]; 18 | 19 | #define CORE_LOCAL_COMPUTE() \ 20 | uint32_t gbase = (gpac >> l) & 15;\ 21 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \ 22 | int32_t tmp_hm = p[m] + subScore; \ 23 | h[m] = max(tmp_hm, f[m]); \ 24 | h[m] = max(h[m], e); \ 25 | h[m] = max(h[m], 0); \ 26 | f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \ 27 | e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \ 28 | maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \ 29 | maxHH = (maxHH < h[m]) ? h[m] : maxHH; \ 30 | p[m] = h[m-1]; \ 31 | 32 | #define CORE_LOCAL_COMPUTE_START() \ 33 | uint32_t gbase = (gpac >> l) & 15;\ 34 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \ 35 | int32_t tmp_hm = p[m] + subScore; \ 36 | h[m] = max(tmp_hm, f[m]); \ 37 | h[m] = max(h[m], e); \ 38 | h[m] = max(h[m], 0); \ 39 | f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \ 40 | e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \ 41 | maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \ 42 | maxHH = (maxHH < h[m]) ? h[m] : maxHH; \ 43 | p[m] = h[m-1]; \ 44 | 45 | #define CORE_LOCAL_COMPUTE_TB(direction_reg) \ 46 | uint32_t gbase = (gpac >> l) & 15;\ 47 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \ 48 | int32_t tmp_hm = p[m] + subScore; \ 49 | uint32_t m_or_x = tmp_hm >= p[m] ? 
45 | #define CORE_LOCAL_COMPUTE_TB(direction_reg) \
46 | uint32_t gbase = (gpac >> l) & 15;\
47 | DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \
48 | int32_t tmp_hm = p[m] + subScore; \
49 | uint32_t m_or_x = tmp_hm >= p[m] ? 0 : 1;\
50 | h[m] = max(tmp_hm, f[m]); \
51 | h[m] = max(h[m], e); \
52 | h[m] = max(h[m], 0); \
53 | direction_reg |= h[m] == tmp_hm ? m_or_x << (28 - ((m - 1) << 2)) : (h[m] == f[m] ? (uint32_t)3 << (28 - ((m - 1) << 2)) : (uint32_t)2 << (28 - ((m - 1) << 2)));\
54 | direction_reg |= (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ? (uint32_t)0 : (uint32_t)1 << (31 - ((m - 1) << 2));\
55 | f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \
56 | direction_reg |= (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ? (uint32_t)0 : (uint32_t)1 << (30 - ((m - 1) << 2));\
57 | e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \
58 | maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
59 | maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
60 | p[m] = h[m-1]; \
61 | 
62 | 
63 | 
64 | 
65 | /* typename meaning :
66 | - T is the algorithm type (LOCAL, MICROLOCAL)
67 | - S is WITH_ or WITHOUT_START
68 | - B is for computing the Second Best Score. Its values are in the enum FALSE(0)/TRUE(1).
69 | (sidenote: it's based on an enum instead of a bool in order to generalize its type from its Int value, with the Int2Type meta-programming template)
70 | */
71 | template <typename T, typename S, typename B>
72 | __global__ void gasal_local_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks)
73 | {
74 | const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
75 | if (tid >= n_tasks) return;
76 | 
77 | int32_t i, j, k, m, l;
78 | int32_t e;
79 | 
80 | int32_t maxHH = 0; //initialize the maximum score to zero
81 | int32_t maxXY_y = 0;
82 | 
83 | int32_t prev_maxHH = 0;
84 | int32_t maxXY_x = 0;
85 | 
86 | int tile_no = 0;
87 | 
88 | 
89 | int32_t maxHH_second __attribute__((unused)); // __attribute__((unused)) to avoid raising errors at compilation. most template-kernels don't use these.
90 | int32_t prev_maxHH_second __attribute__((unused));
91 | int32_t maxXY_x_second __attribute__((unused));
92 | int32_t maxXY_y_second __attribute__((unused));
93 | maxHH_second = 0;
94 | prev_maxHH_second = 0;
95 | maxXY_x_second = 0;
96 | maxXY_y_second = 0;
97 | 
98 | 
99 | int32_t subScore;
100 | 
101 | int32_t ridx, gidx;
102 | short2 HD;
103 | short2 initHD = make_short2(0, 0);
104 | 
105 | uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3; //starting index of the target_batch sequence
106 | uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
107 | uint32_t read_len = query_batch_lens[tid];
108 | uint32_t ref_len = target_batch_lens[tid];
109 | uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding query_batch sequence
110 | uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding target_batch sequence
111 | //-----arrays for saving intermediate values------
112 | short2 global[MAX_QUERY_LEN];
113 | int32_t h[9];
114 | int32_t f[9];
115 | int32_t p[9];
116 | --------------------------------------------
117 | 
118 | for (i = 0; i < MAX_QUERY_LEN; i++) {
119 | global[i] = initHD;
120 | }
121 | 
122 | for (i = 0; i < target_batch_regs; i++) { //target_batch sequence in rows
123 | for (m = 0; m < 9; m++) {
124 | h[m] = 0;
125 | f[m] = 0;
126 | p[m] = 0;
127 | }
128 | 
129 | register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence
130 | gidx = i << 3;
131 | ridx = 0;
132 | 
133 | for (j = 0; j < query_batch_regs; j+=1) { //query_batch sequence in columns
134 | register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 bases from query_batch sequence
135 | 
136 | //--------------compute a tile of 8x8 cells-------------------
137 | if (SAMETYPE(S, Int2Type<WITH_TB>)) {
138 | uint4 direction = make_uint4(0, 0, 0, 0);
139 | uint32_t rbase = (rpac >> 28) & 15;//get a base from query_batch sequence
140 | HD = global[ridx];
141 | h[0] = HD.x;
142 | e = HD.y;
143 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
144 | CORE_LOCAL_COMPUTE_TB(direction.x);
145 | if (SAMETYPE(B, Int2Type<TRUE>))
146 | {
147 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
148 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
149 | maxHH_second = (override_second) ? h[m] : maxHH_second;
150 | }
151 | }
152 | HD.x = h[m-1];
153 | HD.y = e;
154 | global[ridx] = HD;
155 | //---------------------------------------------
156 | 
157 | 
158 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
159 | 
160 | if (SAMETYPE(B, Int2Type<TRUE>))
161 | {
162 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
163 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
164 | }
165 | prev_maxHH = max(maxHH, prev_maxHH);
166 | ridx++;
167 | 
168 | rbase = (rpac >> 24) & 15;//get a base from query_batch sequence
169 | HD = global[ridx];
170 | h[0] = HD.x;
171 | e = HD.y;
172 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
173 | CORE_LOCAL_COMPUTE_TB(direction.y);
174 | if (SAMETYPE(B, Int2Type<TRUE>))
175 | {
176 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
177 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
178 | maxHH_second = (override_second) ? h[m] : maxHH_second;
179 | }
180 | }
181 | HD.x = h[m-1];
182 | HD.y = e;
183 | global[ridx] = HD;
184 | //---------------------------------------------
185 | 
186 | 
187 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
188 | 
189 | if (SAMETYPE(B, Int2Type<TRUE>))
190 | {
191 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
192 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
193 | }
194 | prev_maxHH = max(maxHH, prev_maxHH);
195 | ridx++;
196 | 
197 | 
198 | rbase = (rpac >> 20) & 15;//get a base from query_batch sequence
199 | HD = global[ridx];
200 | h[0] = HD.x;
201 | e = HD.y;
202 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
203 | CORE_LOCAL_COMPUTE_TB(direction.z);
204 | if (SAMETYPE(B, Int2Type<TRUE>))
205 | {
206 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
207 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
208 | maxHH_second = (override_second) ? h[m] : maxHH_second;
209 | }
210 | }
211 | HD.x = h[m-1];
212 | HD.y = e;
213 | global[ridx] = HD;
214 | //---------------------------------------------
215 | 
216 | 
217 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
218 | 
219 | if (SAMETYPE(B, Int2Type<TRUE>))
220 | {
221 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
222 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
223 | }
224 | prev_maxHH = max(maxHH, prev_maxHH);
225 | ridx++;
226 | 
227 | 
228 | rbase = (rpac >> 16) & 15;//get a base from query_batch sequence
229 | HD = global[ridx];
230 | h[0] = HD.x;
231 | e = HD.y;
232 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
233 | CORE_LOCAL_COMPUTE_TB(direction.w);
234 | if (SAMETYPE(B, Int2Type<TRUE>))
235 | {
236 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
237 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
238 | maxHH_second = (override_second) ? h[m] : maxHH_second;
239 | }
240 | }
241 | HD.x = h[m-1];
242 | HD.y = e;
243 | global[ridx] = HD;
244 | //---------------------------------------------
245 | 
246 | 
247 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
248 | 
249 | if (SAMETYPE(B, Int2Type<TRUE>))
250 | {
251 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
252 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
253 | }
254 | prev_maxHH = max(maxHH, prev_maxHH);
255 | ridx++;
256 | 
257 | packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
258 | tile_no++;
259 | 
260 | 
261 | direction = make_uint4(0,0,0,0);
262 | rbase = (rpac >> 12) & 15;//get a base from query_batch sequence
263 | HD = global[ridx];
264 | h[0] = HD.x;
265 | e = HD.y;
266 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
267 | CORE_LOCAL_COMPUTE_TB(direction.x);
268 | if (SAMETYPE(B, Int2Type<TRUE>))
269 | {
270 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
271 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
272 | maxHH_second = (override_second) ? h[m] : maxHH_second;
273 | }
274 | }
275 | HD.x = h[m-1];
276 | HD.y = e;
277 | global[ridx] = HD;
278 | //---------------------------------------------
279 | 
280 | 
281 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
282 | 
283 | if (SAMETYPE(B, Int2Type<TRUE>))
284 | {
285 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
286 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
287 | }
288 | prev_maxHH = max(maxHH, prev_maxHH);
289 | ridx++;
290 | 
291 | rbase = (rpac >> 8) & 15;//get a base from query_batch sequence
292 | HD = global[ridx];
293 | h[0] = HD.x;
294 | e = HD.y;
295 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
296 | CORE_LOCAL_COMPUTE_TB(direction.y);
297 | if (SAMETYPE(B, Int2Type<TRUE>))
298 | {
299 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
300 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
301 | maxHH_second = (override_second) ? h[m] : maxHH_second;
302 | }
303 | }
304 | HD.x = h[m-1];
305 | HD.y = e;
306 | global[ridx] = HD;
307 | //---------------------------------------------
308 | 
309 | 
310 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
311 | 
312 | if (SAMETYPE(B, Int2Type<TRUE>))
313 | {
314 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
315 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
316 | }
317 | prev_maxHH = max(maxHH, prev_maxHH);
318 | ridx++;
319 | 
320 | 
321 | rbase = (rpac >> 4) & 15;//get a base from query_batch sequence
322 | HD = global[ridx];
323 | h[0] = HD.x;
324 | e = HD.y;
325 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
326 | CORE_LOCAL_COMPUTE_TB(direction.z);
327 | if (SAMETYPE(B, Int2Type<TRUE>))
328 | {
329 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
330 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
331 | maxHH_second = (override_second) ? h[m] : maxHH_second;
332 | }
333 | }
334 | HD.x = h[m-1];
335 | HD.y = e;
336 | global[ridx] = HD;
337 | //---------------------------------------------
338 | 
339 | 
340 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
341 | 
342 | if (SAMETYPE(B, Int2Type<TRUE>))
343 | {
344 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
345 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
346 | }
347 | prev_maxHH = max(maxHH, prev_maxHH);
348 | ridx++;
349 | 
350 | 
351 | rbase = rpac & 15;//get a base from query_batch sequence
352 | HD = global[ridx];
353 | h[0] = HD.x;
354 | e = HD.y;
355 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
356 | CORE_LOCAL_COMPUTE_TB(direction.w);
357 | if (SAMETYPE(B, Int2Type<TRUE>))
358 | {
359 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
360 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
361 | maxHH_second = (override_second) ? h[m] : maxHH_second;
362 | }
363 | }
364 | HD.x = h[m-1];
365 | HD.y = e;
366 | global[ridx] = HD;
367 | //---------------------------------------------
368 | 
369 | 
370 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
371 | 
372 | if (SAMETYPE(B, Int2Type<TRUE>))
373 | {
374 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
375 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
376 | }
377 | prev_maxHH = max(maxHH, prev_maxHH);
378 | ridx++;
379 | 
380 | packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
381 | tile_no++;
382 | 
383 | 
384 | 
385 | }
386 | else {
387 | for (k = 28; k >= 0; k -= 4) {
388 | uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
389 | //-----load intermediate values--------------
390 | HD = global[ridx];
391 | h[0] = HD.x;
392 | e = HD.y;
393 | 
394 | #pragma unroll 8
395 | for (l = 28, m = 1; m < 9; l -= 4, m++) {
396 | CORE_LOCAL_COMPUTE();
397 | if (SAMETYPE(B, Int2Type<TRUE>))
398 | {
399 | bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
400 | maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
401 | maxHH_second = (override_second) ? h[m] : maxHH_second;
402 | }
403 | }
404 | 
405 | //----------save intermediate values------------
406 | HD.x = h[m-1];
407 | HD.y = e;
408 | global[ridx] = HD;
409 | //---------------------------------------------
410 | 
411 | 
412 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
413 | 
414 | if (SAMETYPE(B, Int2Type<TRUE>))
415 | {
416 | maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
417 | prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
418 | }
419 | prev_maxHH = max(maxHH, prev_maxHH);
420 | ridx++;
421 | //-------------------------------------------------------
422 | 
423 | }
424 | }
425 | }
426 | }
427 | 
428 | device_res->aln_score[tid] = maxHH;//copy the max score to the output array in the GPU mem
429 | device_res->query_batch_end[tid] = maxXY_x;//copy the end position on query_batch sequence to the output array in the GPU mem
430 | device_res->target_batch_end[tid] = maxXY_y;//copy the end position on target_batch sequence to the output array in the GPU mem
431 | 
432 | if (SAMETYPE(B, Int2Type<TRUE>))
433 | {
434 | device_res_second->aln_score[tid] = maxHH_second;
435 | device_res_second->query_batch_end[tid] = maxXY_x_second;
436 | device_res_second->target_batch_end[tid] = maxXY_y_second;
437 | }
438 | 
439 | 
440 | /*------------------Now to find the start position-----------------------*/
441 | if (SAMETYPE(S, Int2Type<WITH_START>))
442 | {
443 | 
444 | int32_t rend_pos = maxXY_x;//end position on query_batch sequence
445 | int32_t gend_pos = maxXY_y;//end position on target_batch sequence
446 | int32_t fwd_score = maxHH;// the computed score
447 | 
448 | //the index of the 32-bit word containing the end position on query_batch sequence
449 | int32_t rend_reg = ((rend_pos >> 3) + 1) < query_batch_regs ? ((rend_pos >> 3) + 1) : query_batch_regs;
450 | //the index of the 32-bit word containing the end position on target_batch sequence
451 | int32_t gend_reg = ((gend_pos >> 3) + 1) < target_batch_regs ? ((gend_pos >> 3) + 1) : target_batch_regs;
452 | 
453 | 
454 | 
455 | packed_query_batch_idx += (rend_reg - 1);
456 | packed_target_batch_idx += (gend_reg - 1);
457 | 
458 | 
459 | maxHH = 0;
460 | prev_maxHH = 0;
461 | maxXY_x = 0;
462 | maxXY_y = 0;
463 | 
464 | for (i = 0; i < MAX_QUERY_LEN; i++) {
465 | global[i] = initHD;
466 | }
467 | //------starting from the gend_reg and rend_reg, align the sequences in the reverse direction and exit if the max score >= fwd_score------
468 | gidx = ((gend_reg << 3) + 8) - 1;
469 | for (i = 0; i < gend_reg && maxHH < fwd_score; i++) {
470 | for (m = 0; m < 9; m++) {
471 | h[m] = 0;
472 | f[m] = 0;
473 | p[m] = 0;
474 | }
475 | register uint32_t gpac =packed_target_batch[packed_target_batch_idx - i];//load 8 packed bases from target_batch sequence
476 | gidx = gidx - 8;
477 | ridx = (rend_reg << 3) - 1;
478 | int32_t global_idx = 0;
479 | for (j = 0; j < rend_reg && maxHH < fwd_score; j+=1) {
480 | register uint32_t rpac =packed_query_batch[packed_query_batch_idx - j];//load 8 packed bases from query_batch sequence
481 | //--------------compute a tile of 8x8 cells-------------------
482 | for (k = 0; k <= 28 && maxHH < fwd_score; k += 4) {
483 | uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
484 | //----------load intermediate values--------------
485 | HD = global[global_idx];
486 | h[0] = HD.x;
487 | e = HD.y;
488 | 
489 | 
490 | #pragma unroll 8
491 | for (l = 0, m = 1; l <= 28; l += 4, m++) {
492 | CORE_LOCAL_COMPUTE_START();
493 | }
494 | 
495 | //------------save intermediate values----------------
496 | HD.x = h[m-1];
497 | HD.y = e;
498 | global[global_idx] = HD;
499 | //----------------------------------------------------
500 | maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//start position on query_batch sequence corresponding to current maximum score
501 | prev_maxHH = max(maxHH, prev_maxHH);
502 | ridx--;
503 | global_idx++;
504 | }
505 | }
506 | }
507 | 
508 | device_res->query_batch_start[tid] = maxXY_x;//copy the start position on query_batch sequence to the output array in the GPU mem
509 | device_res->target_batch_start[tid] = maxXY_y;//copy the start position on target_batch sequence to the output array in the GPU mem
510 | 
511 | }
512 | 
513 | 
514 | 
515 | 
516 | return;
517 | 
518 | 
519 | }
520 | #endif
521 | 
--------------------------------------------------------------------------------