├── test_prog
    ├── query_batch.fasta.gz
    ├── target_batch.fasta.gz
    ├── Timer.h
    ├── README.md
    ├── Makefile
    └── test_prog.cpp
├── .gitignore
├── src
    ├── gasal_header.h
    ├── res.h
    ├── interfaces.h
    ├── ctors.h
    ├── host_batch.h
    ├── args_parser.h
    ├── gasal_kernels.h
    ├── kernels
    │   ├── get_tb.h
    │   ├── ksw_kernel_template.h
    │   ├── pack_rc_seqs.h
    │   ├── global.h
    │   ├── semiglobal_kernel_template.h
    │   └── local_kernel_template.h
    ├── gasal.h
    ├── interfaces.cpp
    ├── res.cpp
    ├── gasal_align.h
    ├── host_batch.cpp
    ├── args_parser.cpp
    ├── ctors.cpp
    ├── __deprecated.cpp
    └── gasal_align.cu
├── configure.sh
├── Makefile
├── LICENSE
└── README.md


/test_prog/query_batch.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nahmedraja/GASAL2/HEAD/test_prog/query_batch.fasta.gz


--------------------------------------------------------------------------------
/test_prog/target_batch.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nahmedraja/GASAL2/HEAD/test_prog/target_batch.fasta.gz


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.cuo
 3 | *.cppo
 4 | *.out
 5 | *.txt
 6 | *.tsv
 7 | *.a
 8 | *~
 9 | *.cproject
10 | *.project
11 | *.sam
12 | *.fa
13 | *.fasta
14 | *.bam
15 | *.swp
16 | lib/*
17 | include/*
18 | .vscode/*
19 | *.log
20 | 
21 | src/\.vscode/
22 | 
23 | *.nvvp
24 | 
25 | *.nvprof
26 | 


--------------------------------------------------------------------------------
/src/gasal_header.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_HEADER_H__
 2 | #define __GASAL_HEADER_H__
 3 | 
 4 | 
 5 | #include "gasal.h"		// include cstdlib, cstdint
 6 | #include "args_parser.h" // include iostream, string, fstream
 7 | #include "gasal_align.h"
 8 | #include "host_batch.h"  // include cstdio, cstring
 9 | #include "ctors.h"
10 | #include "interfaces.h"
11 | 
12 | 
13 | 
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/res.h:
--------------------------------------------------------------------------------
 1 | #ifndef __RES_H__
 2 | #define __RES_H__
 3 | 
 4 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params);
 5 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy);
 6 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params);
 7 | 
 8 | void gasal_res_destroy_host(gasal_res_t *res);
 9 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy);
10 | 
11 | 
12 | 
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/interfaces.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_INTERFACES_H__
 2 | #define __GASAL_INTERFACES_H__
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | 
 8 | // Resizer for the whole gpu_storage in terms of number of sequences
 9 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params);
10 | 
11 | // operation filler method (field in the gasal_gpu_storage_t field)
12 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC);
13 | 
14 | void gasal_set_device(int gpu_select = 0, bool isPrintingProp = true);
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/ctors.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CTORS_H__
 2 | #define __CTORS_H__
 3 | 
 4 | 
 5 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams);
 6 | 
 7 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec,  int max_query_len, int max_target_len, int max_n_alns,  Parameters *params);
 8 | 
 9 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params);
10 | 
11 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params);
12 | 
13 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params);
14 | 
15 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec);
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/src/host_batch.h:
--------------------------------------------------------------------------------
 1 | #ifndef __HOST_BACTH_H__
 2 | #define __HOST_BACTH_H__
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h> // useful for memcpy, strlen
 7 | 
 8 | // host data structure methods
 9 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset);
10 | void gasal_host_batch_destroy(host_batch_t *res); 																		// destructor
11 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg); 	
12 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage);															// get last item of chain
13 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC); 	// fill the data
14 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC );
15 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC );
16 | void gasal_host_batch_print(host_batch_t *res); 																		// printer 
17 | void gasal_host_batch_printall(host_batch_t *res);																		// printer for the whole linked list
18 | 
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/src/args_parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARGS_PARSER_H
 2 | #define ARGS_PARSER_H
 3 | 
 4 | /*
 5 | #include <stdint.h>
 6 | 
 7 | 
 8 | #include "gasal.h"
 9 | */
10 | #include <fstream>
11 | #include <iostream>
12 | #include "gasal.h"
13 | #include <string.h>
14 | 
15 | 
16 | enum fail_type {
17 |     NOT_ENOUGH_ARGS,
18 |     TOO_MANY_ARGS,
19 |     WRONG_ARG,
20 |     WRONG_FILES,
21 |     WRONG_ALGO
22 | };
23 | 
24 | class Parameters{
25 | 
26 |     public: 
27 |         Parameters(int argc, char** argv);
28 |         ~Parameters();
29 |         void print();
30 |         void failure(fail_type f);
31 |         void help();
32 |         void parse();
33 |         void fileopen();
34 | 
35 | 
36 | 
37 |         int32_t sa;
38 |         int32_t sb;
39 |         int32_t gapo;
40 |         int32_t gape;
41 |         comp_start start_pos; 
42 |         int print_out;
43 |         int n_threads;
44 |         int32_t k_band;
45 | 
46 |         Bool secondBest;
47 | 
48 |         bool isPacked;
49 |         bool isReverseComplement;
50 | 
51 |         data_source semiglobal_skipping_head;
52 |         data_source semiglobal_skipping_tail;
53 | 
54 |         algo_type algo;
55 | 
56 |         std::string query_batch_fasta_filename;
57 |         std::string target_batch_fasta_filename;
58 | 
59 |         std::ifstream query_batch_fasta;
60 |         std::ifstream target_batch_fasta;
61 | 
62 | 
63 |     protected:
64 | 
65 |     private:
66 |         int argc;
67 |         char** argv;
68 | };
69 | 
70 | 
71 | #endif
72 | 


--------------------------------------------------------------------------------
/test_prog/Timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
 4 | #include<string.h>
 5 | #include<stdlib.h>
 6 | #include<assert.h>
 7 | #include<sys/time.h>
 8 | 
 9 | class Timer
10 | {
11 |     private:
12 |         struct timeval startTime;
13 |         struct timeval stopTime;
14 |         double elapsedTime;
15 |         std::string name;
16 | 
17 |     public:
18 |         Timer(std::string n) { name = n; elapsedTime = 0.0;}
19 | 	Timer() { name = ""; elapsedTime = 0.0;}
20 |         void Clear() { elapsedTime = 0.0; }
21 |         void Start() { gettimeofday(&(startTime), NULL); }
22 |         void Restart()
23 |         {
24 |             elapsedTime = 0.0;
25 |             gettimeofday(&(startTime), NULL);
26 |         }
27 | 
28 |         void Pause()
29 |         {
30 |             gettimeofday(&(stopTime), NULL);
31 | 
32 |             elapsedTime +=  ( (stopTime).tv_sec  - (startTime).tv_sec) * 1000.0;      // sec to ms
33 |             elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0;   // us to ms
34 |         }
35 | 
36 |         void Stop()
37 |         {
38 |             gettimeofday(&(stopTime), NULL);
39 | 
40 |             elapsedTime =  ( (stopTime).tv_sec  - (startTime).tv_sec) * 1000.0;      // sec to ms
41 |             elapsedTime += ( (stopTime).tv_usec - (startTime).tv_usec) / 1000.0;   // us to ms
42 |         }
43 | 
44 |         void Print()
45 |         {
46 |             std::cout << name << " : " <<  elapsedTime << " msec"   << std::endl;
47 |         }
48 | 
49 |         double GetTime() { return elapsedTime;}
50 | 
51 | };
52 | 
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/configure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | cuda_path=$1
 5 | RED='\033[0;31m'
 6 | NC='\033[0m' # No Color
 7 | 
 8 | if [ "$cuda_path" = "" ]; then
 9 |   echo -e "${RED}Must provide path to CUDA installation directory${NC}"
10 |   echo -e "${RED}Configuration incomplete${NC}"
11 |   echo -e "${RED}Exiting${NC}"	
12 |   exit 1	
13 | fi	
14 | 
15 | cuda_nvcc_path=$cuda_path/bin/nvcc
16 | 
17 | if [ -f $cuda_nvcc_path ]; then
18 |  echo "NVCC found ($cuda_nvcc_path)"
19 | else
20 |   echo -e "${RED}NVCC not found${NC}"
21 |   echo -e "${RED}Configuration incomplete${NC}"
22 |   echo -e "${RED}Exiting${NC}"	
23 |   exit 1	
24 | fi	
25 | 
26 | 
27 | cuda_lib_path="${cuda_path}/targets/x86_64-linux/lib"
28 | 
29 | 
30 | if [ -d $cuda_lib_path ]; then
31 |  echo "CUDA runtime library found (${cuda_lib_path})"
32 | else
33 |   echo -e "${RED}CUDA runtime library not found${NC}" 
34 |   echo -e "${RED}Configuration incomplete${NC}"
35 |   echo -e "${RED}Exiting${NC}"
36 |   exit 1	
37 | fi
38 | 
39 | cuda_runtime_file="${cuda_path}/targets/x86_64-linux/include/cuda_runtime.h"
40 | 
41 | if [ -f $cuda_runtime_file ]; then
42 |  echo  "CUDA runtime header file found (${cuda_runtime_file})"
43 | else
44 |   echo -e "${RED}CUDA runtime header file not found${NC}"
45 |   echo -e "${RED}Configuration incomplete${NC}"
46 |   echo -e "${RED}Exiting${NC}"
47 |   exit 1	
48 | fi
49 | 
50 | 
51 | echo "Configuring Makefile..."
52 | 
53 | sed  -i "s,NVCC=.*,NVCC=$cuda_nvcc_path,g" Makefile 
54 | 
55 | echo "Configuring gasal.h..."
56 | 
57 | sed  -i "s,.*cuda_runtime\.h\",\#include \"$cuda_runtime_file\",g" ./src/gasal.h
58 | 
59 | echo "Configuring Makefile of test program..."
60 | 
61 | sed  -i "s,CUDA_LD_LIBRARY=.*,CUDA_LD_LIBRARY=$cuda_lib_path,g" ./test_prog/Makefile 
62 | 
63 | #mkdir -p include
64 | 
65 | #cp ./src/gasal.h ./include
66 | 
67 | echo "Done"
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/src/gasal_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GASAL_KERNELS_H__
 2 | #define __GASAL_KERNELS_H__
 3 | 
 4 | 
 5 | // Template-meta-programming types construction from Int values
 6 | // This allows to cut down kernel code at compilation time.
 7 | 
 8 | template <int Val>
 9 | struct Int2Type
10 | {
11 | 	typedef enum {val_ = Val} val__;
12 | };
13 | 
14 | template<typename X, typename Y>
15 | struct SameType
16 | {
17 |    enum { result = 0 };
18 | };
19 | 
20 | template<typename T>
21 | struct SameType<T, T>
22 | {
23 |    enum { result = 1 };
24 | };
25 | 
26 | #define SAMETYPE(a, b) (SameType<a,b>::result)
27 | 
28 | 
29 | __constant__ int32_t _cudaGapO; /*gap open penalty*/
30 | __constant__ int32_t _cudaGapOE; /*sum of gap open and extension penalties*/
31 | __constant__ int32_t _cudaGapExtend; /*sum of gap extend*/
32 | __constant__ int32_t _cudaMatchScore; /*score for a match*/
33 | __constant__ int32_t _cudaMismatchScore; /*penalty for a mismatch*/
34 | 
35 | #define MINUS_INF SHRT_MIN
36 | 
37 | #define N_VALUE (N_CODE & 0xF)
38 | 
39 | #ifdef N_PENALTY
40 | 	#define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
41 | 		score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
42 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
43 | 
44 | 	#define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
45 | 		score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
46 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? -N_PENALTY : score;\
47 | 
48 | #else
49 | 	#define DEV_GET_SUB_SCORE_LOCAL(score, rbase, gbase) \
50 | 		score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
51 | 	score = ((rbase == N_VALUE) || (gbase == N_VALUE)) ? 0 : score;\
52 | 
53 | 	#define DEV_GET_SUB_SCORE_GLOBAL(score, rbase, gbase) \
54 | 		score = (rbase == gbase) ?_cudaMatchScore : -_cudaMismatchScore;\
55 | 
56 | #endif
57 | 
58 | #define MAX(a,b) ((a)>(b)?(a):(b))
59 | #define MIN(a,b) ((a)<(b)?(a):(b))
60 | 
61 | 
62 | #define FIND_MAX(curr, gidx) \
63 | 	maxXY_y = (maxHH < curr) ? gidx : maxXY_y;\
64 | maxHH = (maxHH < curr) ? curr : maxHH;
65 | 
66 | 
67 | // Kernel files
68 | 
69 | #include "kernels/pack_rc_seqs.h"
70 | 
71 | #include "kernels/global.h"
72 | 
73 | #include "kernels/semiglobal_kernel_template.h"
74 | 
75 | #include "kernels/local_kernel_template.h"
76 | 
77 | #include "kernels/banded.h"
78 | 
79 | #include "kernels/ksw_kernel_template.h"
80 | 
81 | #include "kernels/get_tb.h"
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/test_prog/README.md:
--------------------------------------------------------------------------------
 1 | This directory conatins a test program for GASAL2. The program overlaps the sequence alignment on the GPU with CPU execution. The CPU executes the code for creating a batch of sequences to be aligned on the GPU and printing the alignment results. First compile GASAL with `N_CODE=0x4E`. To compile the test program run `make`. Running the test program with `-h` or `--help` will print the options:
 2 | 
 3 | ```
 4 | $./test_prog.out -h
 5 | 
 6 | Usage: ./test_prog.out [-a] [-b] [-q] [-r] [-s] [-p] [-n] [-y] <query_batch.fasta> <target_batch.fasta>
 7 | Options: -a INT    match score [1]
 8 |          -b INT    mismatch penalty [4]
 9 |          -q INT    gap open penalty [6]
10 |          -r INT    gap extension penalty [1]
11 |          -s        also find the start position
12 | 	 -t        compute traceback. With this option enabled, "-s" has no effect as start position will always be computed with traceback
13 |          -p        print the alignment results
14 |          -n INT    Number of threads [1]
15 |          -y AL_TYPE       Alignment type . Must be "local", "semi_global", "global", "ksw"
16 |          -x HEAD TAIL     specifies, for semi-global alignment, wha should be skipped for heads and tails of the sequences. (NONE, QUERY, TARGET, BOTH)
17 |          -k INT    Band width in case "banded" is selected.
18 |          --help, -h : displays this message.
19 |          --second-best   displays second best score (WITHOUT_START only).
20 | Single-pack multi-Parameters (e.g. -sp) is not supported.
21 | 
22 | ```
23 | 
24 | 
25 | `query_batch.fasta` and `target_batch.fasta` contain the single-line FASTA sequences for the alignment. The sequences in these files are aligned one-to-one, i.e. the first sequence in query_batch.fasta is aligned to the first sequence in target_batch.fasta, the second sequence in query_batch.fasta is aligned to the second sequence in target_batch.fasta, and so on. The directory also contains sample query_batch.fasta and target_batch.fasta files. For the two sample files use `MAX_QUERY_LEN=160`.
26 | 
27 | In order to demonstrate easily the possibilities of reverse-complementing independently, one can change the first character of the sequence delimiter `>` in the .fasta files. The test program parses the first character as the following :
28 | 
29 | - Parsing `>` does no operation on the sequence (this is the regular mode),
30 | - Parsing `<` flags the sequence to be reversed,
31 | - Parsing `/` flags the sequence to be complemented,
32 | - Parsing `+` flags the sequence to be reversed and complemented.
33 | 
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | GPU_SM_ARCH=
 2 | MAX_QUERY_LEN=
 3 | N_CODE=
 4 | N_PENALTY=
 5 | 
 6 | GPU_COMPUTE_ARCH=$(subst sm,compute,$(GPU_SM_ARCH))
 7 | NVCC=/usr/local/cuda-10.1/bin/nvcc
 8 | CC=g++
 9 | SRC_DIR=./src/
10 | OBJ_DIR=./obj/
11 | LIB_DIR=./lib/
12 | INCLUDE_DIR=./include/
13 | 
14 | SOURCES=  args_parser.cpp host_batch.cpp ctors.cpp interfaces.cpp res.cpp gasal_align.cu 
15 | LOBJS=$(patsubst %,%o,$(SOURCES))
16 | 
17 | LOBJS_PATH=$(addprefix $(OBJ_DIR),$(LOBJS))
18 | VPATH=src:obj:lib
19 | YELLOW=\033[1;33m
20 | NC=\033[0m # No Color
21 | 
22 | ifeq ($(GPU_SM_ARCH),)
23 | error1:
24 | 	@echo "Must specify GPU architecture as sm_xx"
25 | endif
26 | ifeq ($(MAX_QUERY_LEN),)
27 | error2:
28 | 	@echo "Must specify maximum sequence length"
29 | endif
30 | 
31 | ifeq ($(N_CODE),)
32 | error3:
33 | 	@echo "Must specify the code for 'N'"
34 | endif
35 | #ifneq ($(GPU_SM_ARCH),clean)
36 | 
37 | 
38 | 
39 | 
40 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. You might need to add: --compiler-options -fPIC 
41 | ## With Debian and clang, use: $(NVCC) -ccbin clang-3.8 --compiler-options -fpie
42 | 
43 | ifeq ($(N_PENALTY),)
44 | %.cuo: %.cu
45 | 	$(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE) -Xptxas -Werror  --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@
46 | 	
47 | else
48 | %.cuo: %.cu
49 | 	$(NVCC) -c -g -O3 -std=c++11 -Xcompiler -Wall,-DMAX_QUERY_LEN=$(MAX_QUERY_LEN),-DN_CODE=$(N_CODE),-DN_PENALTY=$(N_PENALTY) -Xptxas -Werror  --gpu-architecture=$(GPU_COMPUTE_ARCH) --gpu-code=$(GPU_SM_ARCH) -lineinfo --ptxas-options=-v --default-stream per-thread $< -o $(OBJ_DIR)$@
50 | 	
51 | endif
52 | 
53 | 
54 | 
55 | ## If your computer ships gcc-5.3.1 (at least for CUDA 8.0), this is the regular line. You might need to add: -fPIC 
56 | ifeq ($(N_PENALTY),)
57 | %.cppo: %.cpp
58 | 	$(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -Werror $< -o $(OBJ_DIR)$@
59 | 	
60 | else
61 | %.cppo: %.cpp
62 | 	$(CC) -c -g -O3 -std=c++11 -Wall -DMAX_QUERY_LEN=$(MAX_QUERY_LEN) -DN_CODE=$(N_CODE) -DN_PENALTY=$(N_PENALTY) -Werror $< -o $(OBJ_DIR)$@
63 | 	
64 | endif
65 | 
66 | 
67 | all: clean makedir libgasal.a
68 | 
69 | makedir:
70 | 	@mkdir -p $(OBJ_DIR)
71 | 	@mkdir -p $(LIB_DIR)
72 | 	@mkdir -p $(INCLUDE_DIR)
73 | 	@cp $(SRC_DIR)/*.h $(INCLUDE_DIR)
74 | 	@sed  -i "s/MAX_QUERY_LEN=[0-9]\{1,9\}/MAX_QUERY_LEN=$(MAX_QUERY_LEN)/" ./test_prog/Makefile
75 | 	 
76 | ifeq ($(N_PENALTY),)
77 | libgasal.a: $(LOBJS)
78 | 	ar -csru $(LIB_DIR)$@ $(LOBJS_PATH)
79 | 	@echo ""
80 | 	@echo -e "${YELLOW}WARNING:${NC}\"N_PENALTY\" is not defined"
81 | else
82 | libgasal.a: $(LOBJS)
83 | 	ar -csru $(LIB_DIR)$@ $(LOBJS_PATH)
84 | endif
85 | 	
86 | clean:
87 | 	rm -f -r $(OBJ_DIR) $(LIB_DIR) $(INCLUDE_DIR)  *~ *.exe *.cppo *.cuo *.txt *~
88 | 
89 | gasal_align.cuo: gasal.h gasal_kernels.h
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/test_prog/Makefile:
--------------------------------------------------------------------------------
 1 | CUDA_LD_LIBRARY=/usr/local/cuda-10.1/targets/x86_64-linux/lib
 2 | ANALYSIS_FILENAME=analysis
 3 | # prefix1 can be optirun in case you need to run it from an optimus-enabled laptop.
 4 | PREFIX1=
 5 | #i prefix2 can be nvprof. use preferably the following : nvprof --profile-api-trace none -s -f -o /tmp/.nvprof/$(ANALYSIS_FILENAME).nvprof
 6 | PREFIX2=valgrind
 7 | #suffix1 and 2 can be an output file.
 8 | SUFFIX1=> golden.log
 9 | SUFFIX2=> out.log
10 | 
11 | PRGM=test_prog.out
12 |  
13 | OPTARGS1=-p -y local
14 | OPTARGS2=-p -y local
15 | 
16 | 
17 | FILES_HUMAN600=reads_600_human_10M.fasta ref_600_human_10M.fasta
18 | FILES_HUMAN300=reads_300_human_10M.fasta ref_300_human_10M.fasta
19 | FILES_HUMAN150=reads_150_human_10M.fasta ref_150_human_10M.fasta
20 | FILES_20K=query_batch.fasta target_batch.fasta
21 | FILES_262K=reads_150.fasta ref_150.fasta
22 | FILES_SHORT=short_query_batch.fasta short_target_batch.fasta
23 | 
24 | .cpp.o:
25 | 	g++ -std=c++11 -g -c -O3 -Wall -Werror -fopenmp -I ../include  -o test_prog.o test_prog.cpp
26 | 
27 | all: clean test_prog.out
28 | 
29 | test_prog.out: test_prog.o
30 | 	g++ -std=c++11 -O3 -o test_prog.out -L$(CUDA_LD_LIBRARY) -L../lib test_prog.o -fopenmp -lcudart -lgasal
31 | 
32 | clean:
33 | 	rm -f -r *~ *.exe *.o *.out
34 | 	
35 | test_prog.o: Timer.h
36 |  
37 | 
38 | human150: all
39 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN150) $(SUFFIX1)
40 | 
41 | human150-2: all
42 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN150) $(SUFFIX2)
43 | 
44 | human300: all
45 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN300) $(SUFFIX1)
46 | 
47 | human300-2: all
48 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN300) $(SUFFIX2)
49 | 
50 | human600: all
51 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_HUMAN600) $(SUFFIX1)
52 | 
53 | human600-2: all
54 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_HUMAN600) $(SUFFIX2)
55 | 
56 | 
57 | run: all
58 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_SHORT) $(SUFFIX1)
59 | 
60 | run2: all
61 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_SHORT) $(SUFFIX2)
62 | 
63 | 
64 | fullrun: all
65 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1)
66 | 
67 | fullrun2: all
68 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_20K) $(SUFFIX2)
69 | 
70 | 
71 | 262k: all
72 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS1) $(FILES_262K) $(SUFFIX1)
73 | 
74 | 262k2: all
75 | 	$(PREFIX1) $(PREFIX2) ./$(PRGM) $(OPTARGS2) $(FILES_262K) $(SUFFIX2)
76 | 
77 | 
78 | 
79 | 
80 | 
81 | cuda-memcheck: all
82 | 	cuda-memcheck ./$(PRGM) $(OPTARGS1) $(FILES_20K) $(SUFFIX1)
83 | 
84 | cuda-gdb: all
85 | 	cuda-gdb --args ./test_prog.out -p -y local query_batch.fasta target_batch.fasta
86 | 
87 | valgrind: all
88 | 	valgrind ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta
89 | 
90 | gdb: all
91 | 	gdb --args ./test_prog.out -p -y local short_query_batch.fasta short_target_batch.fasta
92 | 


--------------------------------------------------------------------------------
/src/kernels/get_tb.h:
--------------------------------------------------------------------------------
  1 | #ifndef __GET_TB__
  2 | #define __GET_TB__
  3 | 
  4 | template <typename T>
  5 | __global__ void gasal_get_tb(uint8_t *cigar, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *cigar_offset, uint4 *packed_tb_matrices, gasal_res_t *device_res, int n_tasks) {
  6 | 
  7 | 	int i, j;
  8 | 	int total_score __attribute__((unused));
  9 | 	int curr_score __attribute__((unused));
 10 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;
 11 | 	if (tid >= n_tasks) return;
 12 | 
 13 | 	int offset = cigar_offset[tid];
 14 | 
 15 | 
 16 | 	if (SAMETYPE(T, Int2Type<LOCAL>)) {
 17 | 		i = device_res->target_batch_end[tid];
 18 | 		j = device_res->query_batch_end[tid];
 19 | 		total_score = device_res->aln_score[tid];
 20 | 		curr_score = 0;
 21 | 	} else if (SAMETYPE(T, Int2Type<GLOBAL>)) {
 22 | 		i = target_batch_lens[tid];
 23 | 		j = query_batch_lens[tid];
 24 | 	}
 25 | 
 26 | 
 27 | 
 28 | 	uint32_t prev_op_to_fill = 0;
 29 | 
 30 | 	int read_len_8 = query_batch_lens[tid]%8 ? query_batch_lens[tid] + (8 - (query_batch_lens[tid]%8)) : query_batch_lens[tid];
 31 | 
 32 | 	int n_ops = 0;
 33 | 
 34 | 	int prev_tile_no = -1;
 35 | 
 36 | 	uint4 tile = make_uint4(0, 0, 0, 0);
 37 | 
 38 | 	int op_select = 3;
 39 | 
 40 | 	int op_shift = 0;
 41 | 
 42 | 
 43 | 	int count = 0;
 44 | 
 45 | 	uint32_t op_to_fill;
 46 | 
 47 | 	while ( i >= 0 && j >= 0) {
 48 | 
 49 | 
 50 | 		int cell = (((i >> 3) * read_len_8) << 3) + (j << 3) + (i&7);
 51 | 
 52 | 
 53 | 
 54 | 		int tile_no = cell>>5;
 55 | 
 56 | 
 57 | 		tile = tile_no != prev_tile_no ? packed_tb_matrices[(tile_no*n_tasks) + tid] : tile;
 58 | 
 59 | 		prev_tile_no = tile_no;
 60 | 
 61 | 		int cell_no_in_tile = cell - (tile_no<<5);
 62 | 
 63 | 
 64 | 		int reg_no_in_tile = cell_no_in_tile >> 3;
 65 | 
 66 | 		int cell_no_in_reg = cell_no_in_tile - (reg_no_in_tile << 3);
 67 | 
 68 | 		uint32_t reg = reg_no_in_tile == 0 ? tile.x : (reg_no_in_tile == 1 ? tile.y : (reg_no_in_tile == 2 ? tile.z : tile.w));
 69 | 
 70 | 
 71 | 		uint32_t cell_op = (reg >> (28 - (cell_no_in_reg << 2))) & 15;
 72 | 
 73 | 
 74 | 		uint32_t op = (cell_op >> op_shift) & op_select;
 75 | 
 76 | 
 77 | 
 78 | 		op_to_fill = op == 0 || op_select == 3 ? op : op_shift ;
 79 | 
 80 | 		op_select = op == 0 || (op == 1 && op_select == 3) ? 3 : 1;
 81 | 
 82 | 		op_shift = op == 0 || ( op == 1 && op_select == 3) ? 0 : ((op == 2 || op == 3) ?  op : op_shift);
 83 | 
 84 | 
 85 | 
 86 | 
 87 | 		if(count < 63  &&  op_to_fill == prev_op_to_fill) {
 88 | 			count++;
 89 | 		} else {
 90 | 			if (count > 0) {
 91 | 				uint8_t reg_out = 0;
 92 | 				reg_out |= prev_op_to_fill;
 93 | 				reg_out |= (uint8_t)(count << 2);
 94 | 				cigar[offset++] = reg_out;
 95 | 				n_ops++;
 96 | 			}
 97 | 			count = 1;
 98 | 		}
 99 | 
100 | 		if (SAMETYPE(T, Int2Type<LOCAL>)) {
101 | 			curr_score += ((op_to_fill == 2 || op_to_fill == 3) && prev_op_to_fill != op_to_fill) ? -_cudaGapOE : ((op_to_fill == 2 || op_to_fill == 3) ? - _cudaGapExtend : (op_to_fill == 1 ? -_cudaMismatchScore : _cudaMatchScore));
102 | 			if (curr_score == total_score) break;
103 | 		}
104 | 
105 | 		prev_op_to_fill = op_to_fill;
106 | 
107 | 		i = op_to_fill == 0 || op_to_fill == 1 || op_to_fill == 2 ? i - 1 : i;
108 | 		j = op_to_fill == 0 || op_to_fill == 1 || op_to_fill == 3 ? j - 1 : j;
109 | 
110 | 
111 | 	}
112 | 
113 | 	uint8_t reg_out = 0;
114 | 	reg_out |= prev_op_to_fill;
115 | 	reg_out |= (uint8_t)(count << 2);
116 | 	cigar[offset++] = reg_out;
117 | 	n_ops++;
118 | 
119 | 	if (SAMETYPE(T, Int2Type<GLOBAL>)) {
120 | 		while (i >= 0) {
121 | 			uint32_t reg_out = 0;
122 | 			uint8_t resd_count = (i+1) <= 63 ? (i+1) : 63;
123 | 			reg_out |= 2;
124 | 			reg_out |= (uint8_t)(resd_count << 2);
125 | 			cigar[offset++] = reg_out;
126 | 			n_ops++;
127 | 			i = i - 63;
128 | 
129 | 		}
130 | 		while (j >= 0) {
131 | 			uint32_t reg_out = 0;
132 | 			uint8_t resd_count = (j+1) <= 63 ? (j+1) : 63;
133 | 			reg_out |= 3;
134 | 			reg_out |= (uint8_t)(resd_count << 2);
135 | 			cigar[offset++] = reg_out;
136 | 			n_ops++;
137 | 			j = j - 63;
138 | 		}
139 | 	}
140 | 
141 | 
142 | 	if (SAMETYPE(T, Int2Type<LOCAL>)) {
143 | 		device_res->target_batch_start[tid] = i;
144 | 		device_res->query_batch_start[tid] = j;
145 | 	}
146 | 	query_batch_lens[tid] = n_ops;
147 | 
148 | 
149 | }
150 | #endif
151 | 


--------------------------------------------------------------------------------
/src/gasal.h:
--------------------------------------------------------------------------------
  1 | #ifndef __GASAL_H__
  2 | #define __GASAL_H__
  3 | 
  4 | 
  5 | #include <stdlib.h>
  6 | #include <stdint.h>
  7 | 
  8 | 
  9 | #include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cuda_runtime.h"
 10 | 
 11 | #ifndef HOST_MALLOC_SAFETY_FACTOR
 12 | #define HOST_MALLOC_SAFETY_FACTOR 5
 13 | #endif
 14 | 
 15 | #define CHECKCUDAERROR(error) \
 16 | 		do{\
 17 | 			err = error;\
 18 | 			if (cudaSuccess != err ) { \
 19 | 				fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err,  __LINE__, __FILE__); \
 20 | 				exit(EXIT_FAILURE);\
 21 | 			}\
 22 | 		}while(0)\
 23 | 
 24 | 
 25 | inline int CudaCheckKernelLaunch()
 26 | {
 27 | 	cudaError err = cudaGetLastError();
 28 | 	if ( cudaSuccess != err )
 29 | 	{
 30 | 		return -1;
 31 | 	}
 32 | 
 33 | 	return 0;
 34 | }
 35 | 
 36 | 
 37 | enum comp_start{
 38 | 	WITHOUT_START,
 39 | 	WITH_START,
 40 | 	WITH_TB
 41 | };
 42 | 
 43 | // Generic enum for ture/false. Using this instead of bool to generalize templates out of Int values for secondBest. 
 44 | // Can be usd more generically, for example for WITH_/WITHOUT_START.
 45 | enum Bool{
 46 | 	FALSE,
 47 | 	TRUE
 48 | };
 49 | 
 50 | enum data_source{
 51 | 	NONE,
 52 | 	QUERY,
 53 | 	TARGET,
 54 | 	BOTH
 55 | };
 56 | 
 57 | enum algo_type{
 58 | 	UNKNOWN,
 59 | 	GLOBAL,
 60 | 	SEMI_GLOBAL,
 61 | 	LOCAL,
 62 | 	MICROLOCAL,
 63 | 	BANDED,
 64 | 	KSW
 65 | };
 66 | 
 67 | enum operation_on_seq{
 68 | 	FORWARD_NATURAL,
 69 | 	REVERSE_NATURAL,
 70 | 	FORWARD_COMPLEMENT,
 71 | 	REVERSE_COMPLEMENT,
 72 | };
 73 | 
 74 | // data structure of linked list to allow extension of memory on host side.
 75 | struct host_batch{
 76 | 	uint8_t *data;
 77 | 	uint32_t page_size;
 78 | 	uint32_t data_size;
 79 | 	uint32_t offset;
 80 | 	int is_locked;
 81 | 	struct host_batch* next;
 82 | };
 83 | typedef struct host_batch host_batch_t;
 84 | 
 85 | // Data structure to hold results. Can be instantiated for host or device memory (see res.cpp)
 86 | struct gasal_res{
 87 | 	int32_t *aln_score;
 88 | 	int32_t *query_batch_end;
 89 | 	int32_t *target_batch_end;
 90 | 	int32_t *query_batch_start;
 91 | 	int32_t *target_batch_start;
 92 | 	uint8_t *cigar;
 93 | 	uint32_t *n_cigar_ops;
 94 | };
 95 | typedef struct gasal_res gasal_res_t;
 96 | 
 97 | //stream data
 98 | typedef struct {
 99 | 	uint8_t *unpacked_query_batch;
100 | 	uint8_t *unpacked_target_batch;
101 | 	uint32_t *packed_query_batch;
102 | 	uint32_t *packed_target_batch;
103 | 	uint32_t *query_batch_offsets;
104 | 	uint32_t *target_batch_offsets;
105 | 	uint32_t *query_batch_lens;
106 | 	uint32_t *target_batch_lens;
107 | 
108 | 	uint32_t *host_seed_scores;
109 | 	uint32_t *seed_scores;
110 | 	
111 | 	host_batch_t *extensible_host_unpacked_query_batch;
112 | 	host_batch_t *extensible_host_unpacked_target_batch;
113 | 
114 | 	uint8_t *host_query_op;
115 | 	uint8_t *host_target_op;
116 | 	uint8_t *query_op;
117 | 	uint8_t *target_op;
118 | 
119 | 	uint32_t *host_query_batch_offsets;
120 | 	uint32_t *host_target_batch_offsets;
121 | 	uint32_t *host_query_batch_lens;
122 | 	uint32_t *host_target_batch_lens;
123 | 
124 | 	gasal_res_t *host_res; // the results that can be read on host - THE STRUCT IS ON HOST SIDE, ITS CONTENT IS ON HOST SIDE.
125 | 	gasal_res_t *device_cpy; // a struct that contains the pointers to the device side - THE STRUCT IS ON HOST SIDE, but the CONTENT is malloc'd on and points to the DEVICE SIDE
126 | 	gasal_res_t *device_res; // the results that are written on device - THE STRUCT IS ON DEVICE SIDE, ITS CONTENT POINTS TO THE DEVICE SIDE.
127 | 
128 | 	gasal_res_t *host_res_second; 
129 | 	gasal_res_t *device_res_second; 
130 | 	gasal_res_t *device_cpy_second;
131 | 
132 | 	uint32_t gpu_max_query_batch_bytes;
133 | 	uint32_t gpu_max_target_batch_bytes;
134 | 
135 | 	uint32_t host_max_query_batch_bytes;
136 | 	uint32_t host_max_target_batch_bytes;
137 | 	
138 | 	uint32_t gpu_max_n_alns;
139 | 	uint32_t host_max_n_alns;
140 | 	uint32_t current_n_alns;
141 | 
142 | 	uint64_t packed_tb_matrix_size;
143 | 	uint4 *packed_tb_matrices;
144 | 
145 | 
146 | 	cudaStream_t str;
147 | 	int is_free;
148 | 	int id; //this can be useful in cases where a gasal_gpu_storage only contains PARTS of an alignment (like a seed-extension...), to gather results.
149 | 
150 | } gasal_gpu_storage_t;
151 | 
152 | //vector of streams
153 | typedef struct {
154 | 	int n;
155 | 	gasal_gpu_storage_t *a;
156 | }gasal_gpu_storage_v;
157 | 
158 | 
159 | //match/mismatch and gap penalties
160 | typedef struct{
161 | 	int32_t match;
162 | 	int32_t mismatch;
163 | 	int32_t gap_open;
164 | 	int32_t gap_extend;
165 | } gasal_subst_scores;
166 | 
167 | 
168 | #endif
169 | 


--------------------------------------------------------------------------------
/src/interfaces.cpp:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | #include "args_parser.h"
  3 | #include "interfaces.h"
  4 | #include "res.h"
  5 | 
  6 | 
  7 | // Function for general resizing
  8 | template <typename T>
  9 | T* cudaHostRealloc(void *source, int new_size, int old_size) 
 10 | {
 11 | 	cudaError_t err;
 12 | 	T* destination = NULL;
 13 | 	if (new_size < old_size)
 14 | 	{
 15 | 		fprintf(stderr, "[GASAL ERROR] cudoHostRealloc: invalid sizes. New size < old size (%d < %d)", new_size, old_size);
 16 | 		exit(EXIT_FAILURE);
 17 | 	}
 18 | 	CHECKCUDAERROR(cudaHostAlloc(&destination, new_size * sizeof(T), cudaHostAllocMapped));
 19 | 	//fprintf(stderr, "\ndest=%p\tsrc=%p", destination, source);
 20 | 	CHECKCUDAERROR(cudaMemcpy(destination, source, old_size * sizeof(T), cudaMemcpyHostToHost));
 21 | 	CHECKCUDAERROR(cudaFreeHost(source));
 22 | 	return destination;
 23 | };
 24 | 
 25 | // Realloc new fields when more alignments are added. 
 26 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params)
 27 | {
 28 | 	/*  // Don't reallocate the extensible batches. They're extensible.
 29 | 		gpu_storage->extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0);
 30 | 		gpu_storage->extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0);
 31 | 	*/
 32 | 	/*  // don't realloc gpu-sided batches as they will be taken care of before aligning.
 33 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t)));
 34 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t)));
 35 | 	*/
 36 | 
 37 | 	fprintf(stderr, "[GASAL WARNING] Resizing gpu_storage from %d sequences to %d sequences... ", gpu_storage->host_max_n_alns,new_max_alns);
 38 | 	// don't care about realloc'ing gpu-sided fields as they will be taken care of before aligning.
 39 | 
 40 | 	gpu_storage->host_query_op =  cudaHostRealloc<uint8_t>((void*) gpu_storage->host_query_op, new_max_alns, gpu_storage->host_max_n_alns);
 41 | 	gpu_storage->host_target_op =  cudaHostRealloc<uint8_t>((void*) gpu_storage->host_target_op, new_max_alns, gpu_storage->host_max_n_alns);
 42 | 	
 43 | 	if (params->algo == KSW)
 44 | 		gpu_storage->host_seed_scores = cudaHostRealloc<uint32_t>(gpu_storage->host_seed_scores, new_max_alns, gpu_storage->host_max_n_alns);
 45 | 	//fprintf(stderr, "_ops done ");
 46 | 
 47 | 	gpu_storage->host_query_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
 48 | 	gpu_storage->host_target_batch_lens = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_lens, new_max_alns, gpu_storage->host_max_n_alns);
 49 | 	//fprintf(stderr, "_lens done ");
 50 | 
 51 | 	gpu_storage->host_query_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_query_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
 52 | 	gpu_storage->host_target_batch_offsets = cudaHostRealloc<uint32_t>((void*) gpu_storage->host_target_batch_offsets, new_max_alns, gpu_storage->host_max_n_alns);
 53 | 	//fprintf(stderr, "_offsets done ");
 54 | 	
 55 | 	gasal_res_destroy_host(gpu_storage->host_res);
 56 | 	gpu_storage->host_res = gasal_res_new_host(new_max_alns, params);
 57 | 	gpu_storage->device_cpy = gasal_res_new_device_cpy(new_max_alns, params);
 58 | 	gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy);
 59 | 
 60 | 	if (params->secondBest)
 61 | 	{	
 62 | 		gasal_res_destroy_host(gpu_storage->host_res_second);
 63 | 		gpu_storage->host_res_second = gasal_res_new_host(new_max_alns, params);
 64 | 		gpu_storage->device_cpy_second = gasal_res_new_device_cpy(new_max_alns, params);
 65 | 		gpu_storage->device_res_second = gasal_res_new_device(gpu_storage->device_cpy_second);
 66 | 
 67 | 	} else {
 68 | 		gpu_storage->host_res_second = NULL;
 69 | 		gpu_storage->device_cpy_second = NULL;
 70 | 		gpu_storage->device_res_second = NULL;
 71 | 	}
 72 | 	
 73 | 	//fprintf(stderr, "_res done ");
 74 | 
 75 | 	gpu_storage->host_max_n_alns = new_max_alns;
 76 | 	//gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
 77 | 	fprintf(stderr, " done. This can harm performance.\n");
 78 | }
 79 | 
 80 | // operation (Reverse/complement) filler.
 81 | void gasal_op_fill(gasal_gpu_storage_t *gpu_storage_t, uint8_t *data, uint32_t nbr_seqs_in_stream, data_source SRC)
 82 | {
 83 | 	uint8_t *host_op = NULL;
 84 | 	switch(SRC)
 85 | 	{
 86 | 		case QUERY:
 87 | 			host_op = (gpu_storage_t->host_query_op);
 88 | 		break;
 89 | 		case TARGET:
 90 | 			host_op = (gpu_storage_t->host_target_op);
 91 | 		break;
 92 | 		default:
 93 | 		break;
 94 | 	}
 95 | 	memcpy(host_op, data, nbr_seqs_in_stream);
 96 | }
 97 | 
 98 | void gasal_set_device(int gpu_select, bool isPrintingProp)
 99 | {
100 | 	/* 
101 | 	Select GPU
102 | 	*/
103 | 	if (isPrintingProp)
104 | 	{
105 | 		int num_devices, device;
106 | 		cudaGetDeviceCount(&num_devices);
107 | 		fprintf(stderr, "Found %d GPUs\n", num_devices);
108 | 		if (gpu_select  > num_devices-1)
109 | 		{
110 | 			fprintf(stderr, "Error: can't select device %d when only %d devices are selected (range from 0 to %d)\n", gpu_select, num_devices, num_devices-1);
111 | 			exit(EXIT_FAILURE);
112 | 		}
113 | 		if (num_devices > 0) {
114 | 			cudaDeviceProp properties;
115 | 			for (device = 0; device < num_devices; device++) {
116 | 					cudaGetDeviceProperties(&properties, device);
117 | 					fprintf(stderr, "\tGPU %d: %s\n", device, properties.name);
118 | 			}
119 | 			cudaGetDeviceProperties(&properties, gpu_select);
120 | 			fprintf(stderr, "Selected device %d : %s\n", gpu_select, properties.name);
121 | 			cudaSetDevice(gpu_select);
122 | 		}
123 | 	} else {
124 | 		// silently select device
125 | 		cudaSetDevice(gpu_select);
126 | 	}
127 | 
128 | }
129 | 


--------------------------------------------------------------------------------
/src/res.cpp:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | 
  3 | #include "args_parser.h"
  4 | 
  5 | #include "res.h"
  6 | 
  7 | 
  8 | gasal_res_t *gasal_res_new_host(uint32_t max_n_alns, Parameters *params)
  9 | {
 10 | 	cudaError_t err;
 11 | 	gasal_res_t *res = NULL;
 12 | 
 13 | 
 14 | 	res = (gasal_res_t *)malloc(sizeof(gasal_res_t));
 15 | 
 16 | 	CHECKCUDAERROR(cudaHostAlloc(&(res->aln_score), max_n_alns * sizeof(int32_t),cudaHostAllocDefault));
 17 | 	
 18 | 	
 19 | 	if(res ==NULL)
 20 | 	{
 21 | 		fprintf(stderr,  "Malloc error on res host ");
 22 | 		exit(1);
 23 | 	}
 24 | 
 25 | 
 26 | 	if (params->algo == GLOBAL) {
 27 | 		res->query_batch_start = NULL;
 28 | 		res->target_batch_start = NULL;
 29 | 		res->query_batch_end = NULL;
 30 | 		res->target_batch_end = NULL;
 31 | 		/*
 32 | 		// Deprecated. For semi-global you now need to know the start and stop positions.
 33 | 		} else if (params->algo == SEMI_GLOBAL) {
 34 | 		res->host_query_batch_start = NULL;
 35 | 		res->host_query_batch_end = NULL;
 36 | 		res->query_batch_start = NULL;
 37 | 		res->query_batch_end = NULL;
 38 | 
 39 | 		if (params->start_pos == WITH_START) {
 40 | 		CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_start),max_n_alns * sizeof(uint32_t)));
 41 | 		CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_end),max_n_alns * sizeof(uint32_t)));
 42 | 
 43 | 		CHECKCUDAERROR(cudaMalloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t)));
 44 | 		CHECKCUDAERROR(
 45 | 		cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
 46 | 		} else {
 47 | 		CHECKCUDAERROR(cudaHostAlloc(&(res->host_target_batch_end),max_n_alns * sizeof(uint32_t)));
 48 | 		CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
 49 | 		res->host_target_batch_start = NULL;
 50 | 		res->target_batch_start = NULL;
 51 | 		}
 52 | 		 */
 53 | 	} else {
 54 | 		if (params->start_pos == WITH_START || params->start_pos == WITH_TB) {
 55 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_start),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 56 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 57 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 58 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 59 | 
 60 | 		} else {
 61 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 62 | 			CHECKCUDAERROR(cudaHostAlloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 63 | 			res->query_batch_start = NULL;
 64 | 			res->target_batch_start = NULL;
 65 | 		}
 66 | 
 67 | 	}
 68 | 	if (params->start_pos == WITH_TB) {
 69 | 		CHECKCUDAERROR(cudaHostAlloc(&(res->n_cigar_ops), max_n_alns * sizeof(uint32_t),cudaHostAllocDefault));
 70 | 	}
 71 | 
 72 | 	return res;
 73 | }
 74 | 
 75 | 
 76 | gasal_res_t *gasal_res_new_device(gasal_res_t *device_cpy)
 77 | {
 78 | 	cudaError_t err;
 79 | 
 80 | 
 81 | 	
 82 |     // create class storage on device and copy top level class
 83 |     gasal_res_t *d_c;
 84 |     CHECKCUDAERROR(cudaMalloc((void **)&d_c, sizeof(gasal_res_t)));
 85 | 	//    CHECKCUDAERROR(cudaMemcpy(d_c, res, sizeof(gasal_res_t), cudaMemcpyHostToDevice));
 86 | 
 87 | 
 88 | 
 89 |     // copy pointer to allocated device storage to device class
 90 |     CHECKCUDAERROR(cudaMemcpy(&(d_c->aln_score), &(device_cpy->aln_score), sizeof(int32_t*), cudaMemcpyHostToDevice));
 91 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_start), &(device_cpy->query_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice));
 92 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_start), &(device_cpy->target_batch_start), sizeof(int32_t*), cudaMemcpyHostToDevice));
 93 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->query_batch_end), &(device_cpy->query_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice));
 94 | 	CHECKCUDAERROR(cudaMemcpy(&(d_c->target_batch_end), &(device_cpy->target_batch_end), sizeof(int32_t*), cudaMemcpyHostToDevice));
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 	return d_c;
101 | }
102 | 
103 | 
104 | 
105 | 
106 | gasal_res_t *gasal_res_new_device_cpy(uint32_t max_n_alns, Parameters *params)
107 | {
108 | 	cudaError_t err;
109 | 	gasal_res_t *res;
110 | 
111 | 	res = (gasal_res_t *)malloc(sizeof(gasal_res_t));
112 | 
113 | 	CHECKCUDAERROR(cudaMalloc(&(res->aln_score), max_n_alns * sizeof(int32_t)));
114 | 
115 | 	if (params->algo == GLOBAL) {
116 | 		res->query_batch_start = NULL;
117 | 		res->target_batch_start = NULL;
118 | 		res->query_batch_end = NULL;
119 | 		res->target_batch_end = NULL;
120 | 
121 | 	} else {
122 | 		if (params->start_pos == WITH_START || params->start_pos == WITH_TB) {
123 | 
124 | 			CHECKCUDAERROR(cudaMalloc(&(res->query_batch_start),max_n_alns * sizeof(uint32_t)));
125 | 			CHECKCUDAERROR(cudaMalloc(&(res->target_batch_start),max_n_alns * sizeof(uint32_t)));
126 | 			CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t)));
127 | 			CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
128 | 		
129 | 		} else {
130 | 		
131 | 			CHECKCUDAERROR(cudaMalloc(&(res->query_batch_end),max_n_alns * sizeof(uint32_t)));
132 | 			CHECKCUDAERROR(cudaMalloc(&(res->target_batch_end),max_n_alns * sizeof(uint32_t)));
133 | 		
134 | 			res->query_batch_start = NULL;
135 | 			res->target_batch_start = NULL;
136 | 		}
137 | 
138 | 	}
139 | 	return res;
140 | }
141 | 
142 | // TODO : make 2 destroys for host and device
143 | void gasal_res_destroy_host(gasal_res_t *res) 
144 | {
145 | 	cudaError_t err;
146 | 	if (res == NULL)
147 | 		return;
148 | 
149 | 
150 | 	if (res->aln_score != NULL) CHECKCUDAERROR(cudaFreeHost(res->aln_score));
151 | 	if (res->query_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_start));
152 | 	if (res->target_batch_start != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_start));
153 | 	if (res->query_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->query_batch_end));
154 | 	if (res->target_batch_end != NULL) CHECKCUDAERROR(cudaFreeHost(res->target_batch_end));
155 | 	if (res->n_cigar_ops != NULL) CHECKCUDAERROR(cudaFreeHost(res->n_cigar_ops));
156 | 	
157 | 	free(res);
158 | }
159 | 
160 | void gasal_res_destroy_device(gasal_res_t *device_res, gasal_res_t *device_cpy) 
161 | {
162 | 	cudaError_t err;
163 | 	if (device_cpy == NULL || device_res == NULL)
164 | 		return;
165 | 
166 | 	if (device_cpy->aln_score != NULL) CHECKCUDAERROR(cudaFree(device_cpy->aln_score));
167 | 	if (device_cpy->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_start));
168 | 	if (device_cpy->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_start));
169 | 	if (device_cpy->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->query_batch_end));
170 | 	if (device_cpy->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(device_cpy->target_batch_end));
171 | 	if (device_cpy->cigar != NULL) CHECKCUDAERROR(cudaFree(device_cpy->cigar));
172 | 	
173 | 
174 | 	CHECKCUDAERROR(cudaFree(device_res));
175 | 	
176 | 	free(device_cpy);
177 | }
178 | 


--------------------------------------------------------------------------------
/src/gasal_align.h:
--------------------------------------------------------------------------------
  1 | #ifndef __GASAL_ALIGN_H__
  2 | #define __GASAL_ALIGN_H__
  3 | /*  ####################################################################################
  4 |     SEMI_GLOBAL Kernels generation - read from the bottom one, all the way up. (the most specialized ones are written before the ones that call them)
  5 |     ####################################################################################
  6 | */
  7 | #define SEMIGLOBAL_KERNEL_CALL(a,s,h,t,b) \
  8 | 	case t:\
  9 | 		{\
 10 | 		gasal_semi_global_kernel<Int2Type<a>, Int2Type<s>, Int2Type<b>, Int2Type<h>, Int2Type<t>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns);\
 11 | 		break;\
 12 | 		}\
 13 | 
 14 | #define SWITCH_SEMI_GLOBAL_TAIL(a,s,h,t,b) \
 15 | 	case h:\
 16 | 	switch(t) { \
 17 | 		SEMIGLOBAL_KERNEL_CALL(a,s,h,NONE,b)\
 18 | 		SEMIGLOBAL_KERNEL_CALL(a,s,h,QUERY,b)\
 19 | 		SEMIGLOBAL_KERNEL_CALL(a,s,h,TARGET,b)\
 20 | 		SEMIGLOBAL_KERNEL_CALL(a,s,h,BOTH,b)\
 21 | 	}\
 22 | 	break;
 23 | 
 24 | #define SWITCH_SEMI_GLOBAL_HEAD(a,s,h,t,b) \
 25 | 	case s:\
 26 | 	switch(h) { \
 27 | 		SWITCH_SEMI_GLOBAL_TAIL(a,s,NONE,t,b)\
 28 | 		SWITCH_SEMI_GLOBAL_TAIL(a,s,QUERY,t,b)\
 29 | 		SWITCH_SEMI_GLOBAL_TAIL(a,s,TARGET,t,b)\
 30 | 		SWITCH_SEMI_GLOBAL_TAIL(a,s,BOTH,t,b)\
 31 | 	} \
 32 | 	break;
 33 | 
 34 | 
 35 | /*  ####################################################################################
 36 |     ALGORITHMS Kernels generation. Allows to have a single line written for all kernels calls. The switch-cases are MACRO-generated.
 37 |     #################################################################################### 
 38 | */
 39 | 
 40 | #define SWITCH_SEMI_GLOBAL(a,s,h,t,b) SWITCH_SEMI_GLOBAL_HEAD(a,s,h,t,b)
 41 | 
 42 | #define SWITCH_LOCAL(a,s,h,t,b) \
 43 | 		case s: {\
 44 | 			gasal_local_kernel<Int2Type<LOCAL>, Int2Type<s>, Int2Type<b>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->device_res_second, gpu_storage->packed_tb_matrices, actual_n_alns); \
 45 | 			if(s == WITH_TB) {\
 46 | 				cudaError_t aln_kernel_err = cudaGetLastError();\
 47 | 				if ( cudaSuccess != aln_kernel_err )\
 48 | 				{\
 49 | 					fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err,  __LINE__, __FILE__);\
 50 | 					exit(EXIT_FAILURE);\
 51 | 				}\
 52 | 				gasal_get_tb<Int2Type<LOCAL>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns);\
 53 | 			}\
 54 | 			break;\
 55 | 		}\
 56 | 
 57 | #define SWITCH_GLOBAL(a,s,h,t,b) \
 58 | 		case s:{\
 59 | 			gasal_global_kernel<Int2Type<s>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, gpu_storage->packed_tb_matrices, actual_n_alns);\
 60 | 			if(s == WITH_TB) {\
 61 | 				cudaError_t aln_kernel_err = cudaGetLastError();\
 62 | 				if ( cudaSuccess != aln_kernel_err )\
 63 | 				{\
 64 | 					fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err,  __LINE__, __FILE__);\
 65 | 					exit(EXIT_FAILURE);\
 66 | 				}\
 67 | 				gasal_get_tb<Int2Type<GLOBAL>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns);\
 68 | 			}\
 69 | 			break;\
 70 | 		}\
 71 | 
 72 | 
 73 | #define SWITCH_KSW(a,s,h,t,b) \
 74 |     case s:\
 75 |         gasal_ksw_kernel<Int2Type<b>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->seed_scores, gpu_storage->device_res, gpu_storage->device_res_second, actual_n_alns);\
 76 |     break;
 77 | 
 78 | #define SWITCH_BANDED(a,s,h,t,b) \
 79 |     case s:\
 80 |         gasal_banded_tiled_kernel<<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->device_res, actual_n_alns,  k_band>>3); \
 81 | break;
 82 | 
 83 | /*  ####################################################################################
 84 |     RUN PARAMETERS calls : general call (bottom, should be used), and first level TRUE/FALSE calculation for second best, 
 85 |     then 2nd level WITH / WITHOUT_START switch call (top)
 86 |     ####################################################################################
 87 | */
 88 | 
 89 | #define SWITCH_START(a,s,h,t,b) \
 90 |     case b: \
 91 |     switch(s){\
 92 |         SWITCH_## a(a,WITH_START,h,t,b)\
 93 |         SWITCH_## a(a,WITHOUT_START,h,t,b)\
 94 |         SWITCH_## a(a,WITH_TB,h,t,b)\
 95 |     } \
 96 |     break;
 97 | 
 98 | #define SWITCH_SECONDBEST(a,s,h,t,b) \
 99 |     switch(b) { \
100 |         SWITCH_START(a,s,h,t,TRUE)\
101 |         SWITCH_START(a,s,h,t,FALSE)\
102 |     }
103 | 
104 | #define KERNEL_SWITCH(a,s,h,t,b) \
105 |     case a:\
106 |         SWITCH_SECONDBEST(a,s,h,t,b)\
107 |     break;
108 | 
109 | 
110 | /* // Deprecated
111 | void gasal_aln(gasal_gpu_storage_t *gpu_storage, const uint8_t *query_batch, const uint32_t *query_batch_offsets, const uint32_t *query_batch_lens, const uint8_t *target_batch, const uint32_t *target_batch_offsets, const uint32_t *target_batch_lens,   const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, int32_t *host_aln_score, int32_t *host_query_batch_start, int32_t *host_target_batch_start, int32_t *host_query_batch_end, int32_t *host_target_batch_end,  algo_type algo, comp_start start, int32_t k_band);
112 | */
113 | 
114 | void gasal_copy_subst_scores(gasal_subst_scores *subst);
115 | 
116 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params);
117 | 
118 | inline void gasal_kernel_launcher(int32_t N_BLOCKS, int32_t BLOCKDIM, algo_type algo, comp_start start, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns, int32_t k_band);
119 | 
120 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage);
121 | 
122 | #endif
123 | 


--------------------------------------------------------------------------------
/src/host_batch.cpp:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | #include "args_parser.h"
  3 | #include "interfaces.h"
  4 | #include "host_batch.h"
  5 | 
  6 | 
  7 | 
  8 | 
  9 | // Functions for host batches handling. 
 10 | 
 11 | host_batch_t *gasal_host_batch_new(uint32_t batch_bytes, uint32_t offset)
 12 | {
 13 | 	cudaError_t err;
 14 | 	host_batch_t *res = (host_batch_t *)calloc(1, sizeof(host_batch_t));
 15 | 	CHECKCUDAERROR(cudaHostAlloc(&(res->data), batch_bytes*sizeof(uint8_t), cudaHostAllocDefault));
 16 | 	res->page_size = batch_bytes;
 17 | 	res->data_size = 0;
 18 | 	res->is_locked = 0;
 19 | 	res->offset = offset;
 20 | 	res->next = NULL;
 21 | 	return res;
 22 | }
 23 | 
 24 | void gasal_host_batch_destroy(host_batch_t *res)
 25 | {
 26 | 	cudaError_t err;
 27 | 	if (res==NULL)
 28 | 	{
 29 | 		fprintf(stderr, "[GASAL ERROR] Trying to free a NULL pointer\n");
 30 | 		exit(1);
 31 | 	}
 32 | 	// recursive function to destroy all the linked listgasal_res_destroy_host
 33 | 	if (res->next != NULL)
 34 | 		gasal_host_batch_destroy(res->next);
 35 | 	if (res->data != NULL) 
 36 | 	{
 37 | 		CHECKCUDAERROR(cudaFreeHost(res->data));
 38 | 	}
 39 | 	
 40 | 	free(res);
 41 | }
 42 | 
 43 | host_batch_t *gasal_host_batch_getlast(host_batch_t *arg)
 44 | {
 45 | 	return (arg->next == NULL ? arg : gasal_host_batch_getlast(arg->next) );
 46 | 	
 47 | }
 48 | 
 49 | void gasal_host_batch_reset(gasal_gpu_storage_t *gpu_storage)
 50 | {
 51 | 	// reset all batch idx and data occupation
 52 | 	host_batch_t *cur_page = NULL;
 53 | 	for(int i = 0; i < 2; i++) {
 54 | 
 55 | 		switch(i) {
 56 | 			case 0:
 57 | 				cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
 58 | 			break;
 59 | 			case 1:
 60 | 				cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
 61 | 			break;
 62 | 			default:
 63 | 			break;
 64 | 		}
 65 | 		while(cur_page != NULL)
 66 | 		{
 67 | 			cur_page->data_size = 0;
 68 | 			cur_page->offset = 0;
 69 | 			cur_page->is_locked = 0;
 70 | 			cur_page = cur_page->next;
 71 | 		}
 72 | 	}
 73 | 	//fprintf(stderr, "[GASAL INFO] Batch reset.\n");
 74 | 
 75 | }
 76 | 
 77 | 
 78 | // TODO: make a template... now that you started to go the C++/template way, just stick to it.
 79 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC)
 80 | {
 81 | 	// since query and target are very symmetric here, we use pointers to route the data where it has to, 
 82 | 	// while keeping the actual memory management 'source-agnostic'.
 83 | 
 84 | 	host_batch_t *cur_page = NULL;
 85 | 	uint32_t *p_batch_bytes = NULL;
 86 | 
 87 | 	switch(SRC) {
 88 | 		case QUERY:
 89 | 			cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
 90 | 			p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes);
 91 | 		break;
 92 | 		case TARGET:
 93 | 			cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
 94 | 			p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes);
 95 | 		break;
 96 | 		default:
 97 | 		break;
 98 | 	}
 99 | 	
100 | 	int nbr_N = 0;
101 | 	while((size+nbr_N)%8)
102 | 		nbr_N++;
103 | 
104 | 	while(cur_page->is_locked)
105 | 		cur_page = cur_page->next;
106 | 
107 | 	if (cur_page->next == NULL && cur_page->page_size - cur_page->data_size < size + nbr_N)
108 | 	{
109 | 		fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes while only %d remain (%s) (block size %d, filled %d bytes).\n                 Allocating a new block of size %d, total size available reaches %d. Doing this repeadtedly slows down the execution.\n",
110 | 				size + nbr_N,
111 | 				cur_page->page_size - cur_page->data_size,
112 | 				(SRC == QUERY ? "query":"target"),
113 | 				cur_page->page_size,
114 | 				cur_page->data_size,
115 | 				cur_page->page_size * 2,
116 | 				*p_batch_bytes + cur_page->page_size * 2);
117 | 		
118 | 		host_batch_t *res = gasal_host_batch_new(cur_page->page_size * 2, cur_page->offset + cur_page->data_size);
119 | 		cur_page->next = res;
120 | 		cur_page->is_locked = 1;
121 | 		*p_batch_bytes = *p_batch_bytes + cur_page->page_size * 2;
122 | 
123 | 		cur_page = cur_page->next;
124 | 		//fprintf(stderr, "CREATED: "); gasal_host_batch_print(cur_page);
125 | 	}
126 | 	
127 | 	if (cur_page->next != NULL && cur_page->page_size - cur_page->data_size < size + nbr_N)
128 | 	{
129 | 		// re-write offset for the next page to correspond to what has been filled on the current page.
130 | 		cur_page->next->offset = cur_page->offset + cur_page->data_size;
131 | 		cur_page->is_locked = 1;
132 | 		// then, jump to next page
133 | 		cur_page = cur_page->next;
134 | 	}
135 | 
136 | 
137 | 	if (cur_page->page_size - cur_page->data_size >= size + nbr_N)
138 | 	{
139 | 		// fprintf(stderr, "FILL: "); gasal_host_batch_print(cur_page);
140 | 		memcpy(&(cur_page->data[idx - cur_page->offset]), data, size);
141 | 
142 | 		for(int i = 0; i < nbr_N; i++)
143 | 		{
144 | 			cur_page->data[idx + size - cur_page->offset + i] = N_CODE;
145 | 		}
146 | 		idx = idx + size + nbr_N;
147 | 
148 | 		cur_page->data_size += size + nbr_N;
149 | 		//is_done = 1;
150 | 	}
151 | 
152 | 	return idx;
153 | }
154 | 
155 | 
156 | uint32_t gasal_host_batch_addbase(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char base, data_source SRC )
157 | {	 
158 |     return gasal_host_batch_add(gpu_storage, idx, &base, 1, SRC );
159 | }
160 | 
161 | 
162 | uint32_t gasal_host_batch_add(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char *data, uint32_t size, data_source SRC )
163 | {	
164 | 
165 | 	// since query and target are very symmetric here, we use pointers to route the data where it has to, 
166 | 	// while keeping the actual memory management 'source-agnostic'.
167 | 	host_batch_t *cur_page = NULL;
168 | 	uint32_t *p_batch_bytes = NULL;
169 | 	
170 | 
171 | 	switch(SRC) {
172 | 		case QUERY:
173 | 			cur_page = (gpu_storage->extensible_host_unpacked_query_batch);
174 | 			p_batch_bytes = &(gpu_storage->host_max_query_batch_bytes);
175 | 		break;
176 | 		case TARGET:
177 | 			cur_page = (gpu_storage->extensible_host_unpacked_target_batch);
178 | 			p_batch_bytes = &(gpu_storage->host_max_target_batch_bytes);
179 | 		break;
180 | 		default:
181 | 		break;
182 | 	}
183 | 
184 | 	int is_done = 0;
185 | 
186 | 	while (!is_done)
187 | 	{
188 | 		if (*p_batch_bytes >= idx + size && (cur_page->next == NULL || (cur_page->next->offset >= idx + size)) )
189 | 		{
190 | 
191 | 			memcpy(&(cur_page->data[idx - cur_page->offset]), data, size);
192 | 			idx = idx + size;
193 | 			is_done = 1;
194 | 
195 | 		} else if ((*p_batch_bytes >= idx + size) && (cur_page->next != NULL) && (cur_page->next->offset < idx + size)) {
196 | 		
197 | 			cur_page = cur_page->next;
198 | 
199 | 		} else {
200 | 			fprintf(stderr,"[GASAL WARNING:] Trying to write %d bytes at position %d on host memory (%s) while only  %d bytes are available. Therefore, allocating %d bytes more on CPU. Repeating this many times can provoke a degradation of performance.\n",
201 | 					size,
202 | 					idx,
203 | 					(SRC == QUERY ? "query":"target"),
204 | 					*p_batch_bytes,
205 | 					*p_batch_bytes * 2);
206 | 			
207 | 	
208 | 			*p_batch_bytes += *p_batch_bytes;
209 | 
210 | 			// corner case: if we allocated less than a single sequence length to begin with... it shouldn't be allowed actually, but at least it's caught here.
211 | 			while (*p_batch_bytes < size)
212 | 				*p_batch_bytes += *p_batch_bytes;
213 | 
214 | 			host_batch_t *res = gasal_host_batch_new(*p_batch_bytes, idx);
215 | 	
216 | 			cur_page->next = res;
217 | 			
218 | 			cur_page = cur_page->next;
219 | 		}
220 | 	}
221 | 	//gasal_host_batch_printall(gasal_host_batch_getlast(cur_page));
222 | 	return idx;
223 | }
224 | 
225 | 
226 | 
227 | // this printer displays the whole sequence. It is heavy and shouldn't be called when you have more than a couple sequences.
228 | void gasal_host_batch_print(host_batch_t *res) 
229 | {
230 | 	fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n", 
231 | 					res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
232 | }
233 | 
234 | // this printer allows to see the linked list easily.
235 | void gasal_host_batch_printall(host_batch_t *res)
236 | {	
237 | 	fprintf(stderr, "[GASAL PRINT] Page data: offset=%d, next_offset=%d, data size=%d, page size=%d\n", 
238 | 					res->offset, (res->next != NULL? res->next->offset : -1), res->data_size, res->page_size);
239 | 	if (res->next != NULL)
240 | 	{
241 | 		fprintf(stderr, "+--->");
242 | 		gasal_host_batch_printall(res->next);
243 | 	}
244 | }
245 | 


--------------------------------------------------------------------------------
/src/kernels/ksw_kernel_template.h:
--------------------------------------------------------------------------------
  1 | #ifndef __KSW_KERNEL_TEMPLATE__
  2 | #define __KSW_KERNEL_TEMPLATE__
  3 | 
  4 | 
  5 | // This old core provides the same result as the currently LOCAL core, but lacks some optimization. Left for historical / comparative purposes.
  6 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \
  7 |     uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence */ \
  8 |     DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */ \
  9 |     f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \
 10 |     h[m] = p[m] + subScore; /*score if rbase is aligned to gbase*/ \
 11 |     h[m] = max(h[m], f[m]); \
 12 |     h[m] = max(h[m], 0); \
 13 |     e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\
 14 |     h[m] = max(h[m], e); \
 15 |     maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
 16 |     maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
 17 |     p[m] = h[m-1];
 18 | 
 19 | 
 20 | #define PEN_CLIP5 (5)
 21 | #define TILE_SIDE (8)
 22 | 
 23 | /* typename meaning : 
 24 |     - B is for computing the Second Best Score. Its values are on enum FALSE(0)/TRUE(1).
 25 |     (sidenote: it's based on an enum instead of a bool in order to generalize its type from its Int value, with Int2Type meta-programming-template)
 26 | */
 27 | /* 
 28 |     //! Note from the bwa-gasal2 coder : I failed to understand it, so I copied it.
 29 |     //! You can say to me...
 30 |     You cheated not only the game, but yourself.
 31 | 
 32 |     You didn't grow.
 33 |     You didn't improve.
 34 |     You took a shortcut and gained nothing.
 35 | 
 36 |     You experienced a hollow victory.
 37 |     Nothing was risked and nothing was gained.
 38 | 
 39 |     It's sad that you don't know the difference.
 40 | */
 41 | 
 42 | typedef struct {
 43 |    int32_t h, e;
 44 | } eh_t;
 45 | 
 46 | template <typename B>
 47 | __global__ void gasal_ksw_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch,  uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint32_t *seed_score, gasal_res_t *device_res, gasal_res_t *device_res_second, int n_tasks)
 48 | {
 49 |     const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 50 |     if (tid >= n_tasks) return;
 51 | 
 52 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3; //starting index of the target_batch sequence
 53 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
 54 | 	uint32_t qlen = query_batch_lens[tid];
 55 | 	uint32_t tlen = target_batch_lens[tid];
 56 | 	uint32_t query_batch_regs = (qlen >> 3) + 1;//(qlen >> 3) + (qlen & 0b0111 ? 1 : 0);//number of 32-bit words holding query_batch sequence
 57 | 	uint32_t target_batch_regs = (tlen >> 3) + 1;//(tlen >> 3) + (tlen & 0b0111 ? 1 : 0);//number of 32-bit words holding target_batch sequence
 58 |     uint32_t h0 = seed_score[tid];
 59 |     int32_t subScore;
 60 |     uint32_t target_tile_id, target_base_id, query_tile_id, query_base_id;
 61 |     uint32_t gpac, rpac, gbase, rbase;
 62 |     int zdrop = 0;
 63 | 
 64 |     int o_del = _cudaGapO;
 65 |     int o_ins = _cudaGapO;
 66 |     int e_del = _cudaGapExtend;
 67 |     int e_ins = _cudaGapExtend;
 68 | 
 69 |     eh_t eh[MAX_QUERY_LEN] ; // score array
 70 |     int i, j, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ie, gscore, max_off;
 71 |     for (i = 0; i < MAX_QUERY_LEN; i++)
 72 |     {
 73 |         eh[i].h = 0;
 74 |         eh[i].e = 0;
 75 |     }
 76 | 
 77 |     // fill the first row
 78 |     eh[0].h = h0;
 79 |     eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0;
 80 |     for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j)
 81 |         eh[j].h = eh[j - 1].h - e_ins;
 82 |    
 83 |     // DP loop
 84 |     max = h0, max_i = max_j = -1;
 85 |     max_ie = -1, gscore = -1;
 86 |     max_off = 0;
 87 |     beg = 0, end = qlen;
 88 |     
 89 |     for (target_tile_id = 0; target_tile_id < target_batch_regs; target_tile_id++) //target_batch sequence in rows
 90 |     {
 91 |         gpac = packed_target_batch[packed_target_batch_idx + target_tile_id];//load 8 packed bases from target_batch sequence
 92 | 
 93 |         for (target_base_id = 0; target_base_id < TILE_SIDE; target_base_id++)
 94 |         {
 95 | 
 96 |             i = target_tile_id * TILE_SIDE + target_base_id;
 97 | 
 98 |             if (i >= tlen) // skip padding
 99 |                 break;
100 |             
101 |             gbase = (gpac >> (32 - (target_base_id+1)*4 )) & 0x0F; /* get a base from target_batch sequence */
102 | 
103 |             int t, f = 0, h1, m = 0, mj = -1;
104 |             // compute the first column
105 |             if (beg == 0) {
106 |                 h1 = h0 - (o_del + e_del * (i + 1));
107 |                 if (h1 < 0)
108 |                 h1 = 0;
109 |             } else
110 |                 h1 = 0;
111 |             
112 |             
113 |             for(query_tile_id = 0; (query_tile_id < query_batch_regs); query_tile_id++)
114 |             {
115 |                 rpac = packed_query_batch[packed_query_batch_idx + query_tile_id];//load 8 bases from query_batch sequence
116 | 
117 |                 for(query_base_id = 0; (query_base_id < TILE_SIDE); query_base_id++)
118 |                 {
119 |                     j = query_tile_id * TILE_SIDE + query_base_id;
120 |                     if (j < beg)
121 |                         continue;      
122 |                     if (j >= end)
123 |                         break;
124 | 
125 |                     rbase = (rpac >> (32 - (query_base_id+1)*4 )) & 0x0F;//get a base from query_batch sequence
126 | 
127 |                     // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
128 |                     // Similar to SSE2-SW, cells are computed in the following order:
129 |                     //   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
130 |                     //   E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
131 |                     //   F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
132 |                     eh_t *p = &eh[j];
133 |                     int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
134 |                     p->h = h1;          // set H(i,j-1) for the next row
135 |                     DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);
136 |                     M = M ? M + subScore : 0;          // separating H and M to disallow a cigar like "100M3I3D20M"
137 |                     h = M > e ? M : e;   // e and f are guaranteed to be non-negative, so h>=0 even if M<0
138 |                     h = h > f ? h : f;
139 |                     h1 = h;             // save H(i,j) to h1 for the next column
140 |                     mj = m > h ? mj : j; // record the position where max score is achieved
141 |                     m = m > h ? m : h;   // m is stored at eh[mj+1]
142 |                     t = M - oe_del;
143 |                     t = t > 0 ? t : 0;
144 |                     e -= e_del;
145 |                     e = e > t ? e : t;   // computed E(i+1,j)
146 |                     p->e = e;           // save E(i+1,j) for the next row
147 |                     t = M - oe_ins;
148 |                     t = t > 0 ? t : 0;
149 |                     f -= e_ins;
150 |                     f = f > t ? f : t;   // computed F(i,j+1)
151 |                 }
152 |             }
153 |             eh[end].h = h1;
154 |             eh[end].e = 0;
155 |             if (j == qlen) {
156 |                 max_ie = gscore > h1 ? max_ie : i;
157 |                 gscore = gscore > h1 ? gscore : h1;
158 |             }
159 |             if (m == 0)
160 |                 break;
161 |             if (m > max) {
162 |                 max = m, max_i = i, max_j = mj;
163 |                 max_off = max_off > abs(mj - i) ? max_off : abs(mj - i);
164 |             } else if (zdrop > 0) {
165 |                 if (i - max_i > mj - max_j) {
166 |                 if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop)
167 |                     break;
168 |                 } else {
169 |                 if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop)
170 |                     break;
171 |                 }
172 |             }
173 |             /* This is defining from where to start the next row and where to end the computation of next row
174 |                 it skips some of the cells in the beginning and in the end of the row
175 |             */
176 |             // update beg and end for the next round
177 |             // COULD be done over a constant value...
178 |             for (j = beg; (j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
179 |                 ;
180 |             beg = j;
181 |             for (j = end; (j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
182 |                 ;
183 |             end = j + 2 < qlen ? j + 2 : qlen;
184 |             //beg = 0; end = qlen; // uncomment this line for debugging
185 |         }
186 |     }
187 | 
188 |     if (gscore <= 0 || gscore <= max - PEN_CLIP5)
189 |     {
190 |         device_res->aln_score[tid] = max;
191 |         device_res->query_batch_end[tid] = max_j + 1;
192 |         device_res->target_batch_end[tid] = max_i + 1;
193 |     } else {
194 |         device_res->aln_score[tid] = gscore;
195 |         device_res->query_batch_end[tid] = qlen;
196 |         device_res->target_batch_end[tid] = max_ie + 1;
197 |     }
198 | 
199 | }
200 | 
201 | 
202 | #endif
203 | 
204 | 


--------------------------------------------------------------------------------
/src/kernels/pack_rc_seqs.h:
--------------------------------------------------------------------------------
  1 | #ifndef __KERNEL_SEQPAK__
  2 | #define __KERNEL_SEQPAK__
  3 | 
  4 | 
  5 | #define A_PAK ('A'&0x0F)
  6 | #define C_PAK ('C'&0x0F)
  7 | #define G_PAK ('G'&0x0F)
  8 | #define T_PAK ('T'&0x0F)
  9 | //#define N_PAK ('N'&0x0F)
 10 | 
 11 | 
 12 | 
 13 | __global__ void gasal_pack_kernel(uint32_t* unpacked_query_batch, uint32_t* unpacked_target_batch, uint32_t *packed_query_batch, uint32_t* packed_target_batch, int query_batch_tasks_per_thread, int target_batch_tasks_per_thread, uint32_t total_query_batch_regs, uint32_t total_target_batch_regs) \
 14 | {
 15 | 
 16 | 	int32_t i;
 17 | 	const int32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 18 | 	uint32_t n_threads = gridDim.x * blockDim.x;
 19 | 	for (i = 0; i < query_batch_tasks_per_thread &&  (((i*n_threads)<<1) + (tid<<1) < total_query_batch_regs); ++i) {
 20 | 		uint32_t *query_addr = &(unpacked_query_batch[(i*n_threads)<<1]);
 21 | 		uint32_t reg1 = query_addr[(tid << 1)]; //load 4 bases of the query sequence from global memory
 22 | 		uint32_t reg2 = query_addr[(tid << 1) + 1]; //load  another 4 bases
 23 | 		uint32_t packed_reg = 0;
 24 | 		packed_reg |= (reg1 & 15) << 28;        // ---
 25 | 		packed_reg |= ((reg1 >> 8) & 15) << 24; //    |
 26 | 		packed_reg |= ((reg1 >> 16) & 15) << 20;//    |
 27 | 		packed_reg |= ((reg1 >> 24) & 15) << 16;//    |
 28 | 		packed_reg |= (reg2 & 15) << 12;        //     > pack sequence
 29 | 		packed_reg |= ((reg2 >> 8) & 15) << 8;  //    |
 30 | 		packed_reg |= ((reg2 >> 16) & 15) << 4; //    |
 31 | 		packed_reg |= ((reg2 >> 24) & 15);      //----
 32 | 		uint32_t *packed_query_addr = &(packed_query_batch[i*n_threads]);
 33 | 		packed_query_addr[tid] = packed_reg; //write 8 bases of packed query sequence to global memory
 34 | 	}
 35 | 
 36 | 	for (i = 0; i < target_batch_tasks_per_thread &&  (((i*n_threads)<<1) + (tid<<1)) < total_target_batch_regs; ++i) {
 37 | 		uint32_t *target_addr = &(unpacked_target_batch[(i * n_threads)<<1]);
 38 | 		uint32_t reg1 = target_addr[(tid << 1)]; //load 4 bases of the target sequence from global memory
 39 | 		uint32_t reg2 = target_addr[(tid << 1) + 1]; //load  another 4 bases
 40 | 		uint32_t packed_reg = 0;
 41 | 		packed_reg |= (reg1 & 15) << 28;        // ---
 42 | 		packed_reg |= ((reg1 >> 8) & 15) << 24; //    |
 43 | 		packed_reg |= ((reg1 >> 16) & 15) << 20;//    |
 44 | 		packed_reg |= ((reg1 >> 24) & 15) << 16;//    |
 45 | 		packed_reg |= (reg2 & 15) << 12;        //     > pack sequence
 46 | 		packed_reg |= ((reg2 >> 8) & 15) << 8;  //    |
 47 | 		packed_reg |= ((reg2 >> 16) & 15) << 4; //    |
 48 | 		packed_reg |= ((reg2 >> 24) & 15);      //----
 49 | 		uint32_t *packed_target_addr = &(packed_target_batch[i * n_threads]);
 50 | 		packed_target_addr[tid] = packed_reg; //write 8 bases of packed target sequence to global memory
 51 | 	}
 52 | 
 53 | }
 54 | 
 55 | 
 56 | __global__ void	gasal_reversecomplement_kernel(uint32_t *packed_query_batch,uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, uint8_t *query_op, uint8_t *target_op, uint32_t  n_tasks)
 57 | {
 58 | 
 59 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 60 | 
 61 | 	if (tid >= n_tasks) return;
 62 | 	if (query_op[tid] == 0 && target_op[tid] == 0) return;		// if there's nothing to do (op=0, meaning sequence is Forward Natural), just exit the kernel ASAP. 
 63 | 
 64 | 
 65 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence
 66 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
 67 | 	uint32_t read_len = query_batch_lens[tid];
 68 | 	uint32_t ref_len = target_batch_lens[tid];
 69 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch
 70 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch
 71 | 
 72 | 	uint32_t query_batch_regs_to_swap = (query_batch_regs >> 1) + (query_batch_regs & 1); // that's (query_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start a both ends, and finish at the center of the sequence)
 73 | 	uint32_t target_batch_regs_to_swap = (target_batch_regs >> 1) + (target_batch_regs & 1); // that's (target_batch_regs / 2) + 1 if it's odd, + 0 otherwise. Used for reverse (we start a both ends, and finish at the center of the sequence)
 74 | 
 75 | 
 76 | 	// variables used dependent on target and query: 
 77 | 
 78 | 	uint8_t *op = NULL;
 79 | 	uint32_t *packed_batch = NULL;
 80 | 	uint32_t *batch_regs = NULL;
 81 | 	uint32_t *batch_regs_to_swap = NULL;
 82 | 	uint32_t *packed_batch_idx = NULL;
 83 | 
 84 | 	// avoid useless code duplicate thanks to pointers to route the data flow where it should be, twice.
 85 | 	// The kernel is already generic. Later on this can be used to split the kernel into two using templates...
 86 | 	#pragma unroll 2
 87 | 	for (int p = QUERY; p <= TARGET; p++)
 88 | 	{
 89 | 		switch(p)
 90 | 		{
 91 | 			case QUERY:
 92 | 				op = query_op;
 93 | 				packed_batch = packed_query_batch;
 94 | 				batch_regs = &query_batch_regs;
 95 | 				batch_regs_to_swap = &query_batch_regs_to_swap;
 96 | 				packed_batch_idx = &packed_query_batch_idx;
 97 | 				break;
 98 | 			case TARGET:
 99 | 				op = target_op;
100 | 				packed_batch = packed_target_batch;
101 | 				batch_regs = &target_batch_regs;
102 | 				batch_regs_to_swap = &target_batch_regs_to_swap;
103 | 				packed_batch_idx = &packed_target_batch_idx;
104 | 				break;
105 | 			default:
106 | 			break;
107 | 		}
108 | 
109 | 		if (*(op + tid) & 0x01) // reverse
110 | 		{
111 | 			// deal with N's : read last word, find how many N's, store that number as offset, and pad with that many for the last 
112 | 			uint8_t nbr_N = 0;
113 | 			for (int j = 0; j < 32; j = j + 4)
114 | 			{
115 | 				nbr_N += (((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1) & (0x0F << j)) >> j) == N_CODE);
116 | 			}
117 | 			
118 | 			//printf("KERNEL_DEBUG: nbr_N=%d\n", nbr_N);
119 | 
120 | 
121 | 			nbr_N = nbr_N << 2; // we operate on nibbles so we will need to do our shifts 4 bits by 4 bits, so 4*nbr_N
122 | 
123 | 			for (uint32_t i = 0; i < *(batch_regs_to_swap); i++) // reverse all words. There's a catch with the last word (in the middle of the sequence), see final if.
124 | 			{
125 | 				/* This  is the current operation flow:\
126 | 					- Read the first 32-bits word on HEAD
127 | 					- Combine the reads of 2 last 32-bits words on tail to create the 32-bits word WITHOUT N's 
128 | 					- Swap them 
129 | 					- Write them at the correct places. Remember we're building 32-bits words across two 32-bits words on tail. 
130 | 					So we have to take care of which bits are to be written on tail, too.
131 | 
132 | 				You progress through both heads and tails that way, until you reach the center of the sequence. 
133 | 				When you reach it, you actually don't write one of the words to avoid overwrite.
134 | 				*/
135 | 				uint32_t rpac_1 = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
136 | 				uint32_t rpac_2 = ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) << (32-nbr_N)) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) >> nbr_N);
137 | 
138 | 
139 | 				uint32_t reverse_rpac_1 = 0;
140 | 				uint32_t reverse_rpac_2 = 0;
141 | 
142 | 
143 | 				#pragma unroll 8
144 | 				for(int k = 28; k >= 0; k = k - 4)		// reverse 32-bits word... is pragma-unrolled. 
145 | 				{
146 | 					reverse_rpac_1 |= ((rpac_1 & (0x0F << k)) >> (k)) << (28-k);
147 | 					reverse_rpac_2 |= ((rpac_2 & (0x0F << k)) >> (k)) << (28-k);
148 | 				}
149 | 				// last swap operated manually, because of its irregular size (32 - 4*nbr_N bits, hence 8 - nbr_N nibbles)
150 | 
151 | 
152 | 				uint32_t to_queue_1 = (reverse_rpac_1 << nbr_N) | ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) & ((1<<nbr_N) - 1));
153 | 				uint32_t to_queue_2 = ((*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) & (0xFFFFFFFF - ((1<<nbr_N) - 1))) | (reverse_rpac_1 >> (32-nbr_N));
154 | 
155 | 			
156 | 				//printf("KERNEL DEBUG: rpac_1 Word before reverse: %x, after: %x, split into %x + %x \n", rpac_1, reverse_rpac_1, to_queue_2, to_queue_1 );
157 | 				//printf("KERNEL DEBUG: rpac_2 Word before reverse: %x, after: %x\n", rpac_2, reverse_rpac_2 );
158 | 
159 | 
160 | 				*(packed_batch + *(packed_batch_idx) + i) = reverse_rpac_2;
161 | 				(*(packed_batch + *(packed_batch_idx) + *(batch_regs)-1 - i)) = to_queue_1;
162 | 				if (i!=*(batch_regs_to_swap)-1)
163 | 					(*(packed_batch + *(packed_batch_idx) + *(batch_regs)-2 - i)) = to_queue_2;
164 | 
165 | 
166 | 			} // end for
167 | 		} // end if(reverse)
168 | 
169 | 		if (*(op+tid) & 0x02) // complement
170 | 		{
171 | 			for (uint32_t i = 0; i < *(batch_regs); i++) // reverse all words. There's a catch with the last word (in the middle of the sequence), see final if.
172 | 			{
173 | 				uint32_t rpac = *(packed_batch + *(packed_batch_idx) + i); //load 8 packed bases from head
174 | 				uint32_t nucleotide = 0;
175 | 
176 | 				#pragma unroll 8
177 | 				for(int k = 28; k >= 0; k = k - 4)		// complement 32-bits word... is pragma-unrolled. 
178 | 				{
179 | 					nucleotide = (rpac & (0x0F << k)) >> (k);
180 | 					switch(nucleotide)
181 | 					{
182 | 						case A_PAK:
183 | 							nucleotide = T_PAK;
184 | 							break;
185 | 						case C_PAK:
186 | 							nucleotide = G_PAK;
187 | 							break;
188 | 						case T_PAK:
189 | 							nucleotide = A_PAK;
190 | 							break;
191 | 						case G_PAK:
192 | 							nucleotide = C_PAK;
193 | 							break;
194 | 						default:
195 | 							break;
196 | 					}
197 | 					rpac = (rpac & (0xFFFFFFFF - (0x0F << k))) | nucleotide << k;
198 | 				}
199 | 
200 | 				//printf("KERNEL DEBUG: Word read : %x, after complement: %x\n", *(packed_batch + *(packed_batch_idx) + i), rpac);
201 | 
202 | 				*(packed_batch + *(packed_batch_idx) + i) = rpac;
203 | 
204 | 			} // end for
205 | 		} // end if(complement)
206 | 
207 | 
208 | 
209 | 	}
210 | 
211 | 	return;
212 | }
213 | #endif


--------------------------------------------------------------------------------
/src/args_parser.cpp:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <iostream>
  3 | 
  4 | #include "args_parser.h"
  5 | 
  6 | 
  7 | 
  8 | Parameters::Parameters(int argc_, char **argv_) {
  9 | 
 10 | 
 11 |     // default values
 12 |     sa = (1);
 13 |     sb = (4);
 14 |     gapo = (6);
 15 |     gape = (1);
 16 |     start_pos = (WITHOUT_START); 
 17 |     print_out = (0);
 18 |     n_threads = (1);
 19 |     
 20 |     k_band = (0);
 21 | 
 22 |     isPacked = false;
 23 |     isReverseComplement = false;
 24 | 
 25 |     secondBest = FALSE;
 26 | 
 27 |     // query head, target head, query tail, target tail
 28 |     semiglobal_skipping_head = TARGET;
 29 |     semiglobal_skipping_tail = TARGET;
 30 | 
 31 |     algo = (UNKNOWN);
 32 | 
 33 |     query_batch_fasta_filename = "";
 34 |     target_batch_fasta_filename = "";
 35 | 
 36 |     argc = argc_;
 37 |     argv = argv_;
 38 | 
 39 | }
 40 | 
 41 | Parameters::~Parameters() {
 42 |     query_batch_fasta.close();
 43 |     target_batch_fasta.close();
 44 | }
 45 | 
 46 | void Parameters::print() {
 47 |     std::cerr <<  "sa=" << sa <<" , sb=" << sb <<" , gapo=" <<  gapo << " , gape="<<gape << std::endl;
 48 |     std::cerr <<  "start_pos=" << start_pos <<" , print_out=" << print_out <<" , n_threads=" <<  n_threads << std::endl;
 49 |     std::cerr <<  "semiglobal_skipping_head=" << semiglobal_skipping_head <<" , semiglobal_skipping_tail=" << semiglobal_skipping_tail <<" , algo=" <<  algo << std::endl;
 50 |     std::cerr <<  std::boolalpha << "isPacked = " << isPacked  << " , secondBest = " << secondBest << std::endl;
 51 |     std::cerr <<  "query_batch_fasta_filename=" << query_batch_fasta_filename <<" , target_batch_fasta_filename=" << target_batch_fasta_filename << std::endl;
 52 | }
 53 | 
 54 | void Parameters::failure(fail_type f) {
 55 |     switch(f)
 56 |     {
 57 |             case NOT_ENOUGH_ARGS:
 58 |                 std::cerr << "Not enough Parameters. Required: -y AL_TYPE file1.fasta file2.fasta. See help (--help, -h) for usage. " << std::endl;
 59 |             break;
 60 |             case WRONG_ARG:
 61 |                 std::cerr << "Wrong argument. See help (--help, -h) for usage. " << std::endl;
 62 |             break;
 63 |             case WRONG_FILES:
 64 |                 std::cerr << "File error: either a file doesn't exist, or cannot be opened." << std::endl;
 65 |             break;
 66 | 
 67 |             default:
 68 |             break;
 69 |     }
 70 |     exit(1);
 71 | }
 72 | 
 73 | void Parameters::help() {
 74 |             std::cerr << "Usage: ./test_prog.out [-a] [-b] [-q] [-r] [-s] [-t] [-p] [-n] [-y] <query_batch.fasta> <target_batch.fasta>" << std::endl;
 75 |             std::cerr << "Options: -a INT    match score ["<< sa <<"]" << std::endl;
 76 |             std::cerr << "         -b INT    mismatch penalty [" << sb << "]"<< std::endl;
 77 |             std::cerr << "         -q INT    gap open penalty [" << gapo << "]" << std::endl;
 78 |             std::cerr << "         -r INT    gap extension penalty ["<< gape <<"]" << std::endl;
 79 |             std::cerr << "         -s        find the start position" << std::endl;
 80 |             std::cerr << "         -t        compute traceback. With this option enabled, \"-s\" has no effect as start position will always be computed with traceback" << std::endl;
 81 |             std::cerr << "         -p        print the alignment results" << std::endl;
 82 |             std::cerr << "         -n INT    Number of threads ["<< n_threads<<"]" << std::endl;
 83 |             std::cerr << "         -y AL_TYPE       Alignment type . Must be \"local\", \"semi_global\", \"global\", \"ksw\" "  << std::endl;
 84 | 	    std::cerr << "         -x HEAD TAIL     specifies, for semi-global alignment, wha should be skipped for heads and tails of the sequences. (NONE, QUERY, TARGET, BOTH)" << std::endl;
 85 |             std::cerr << "         -k INT    Band width in case \"banded\" is selected."  << std::endl;
 86 |             std::cerr << "         --help, -h : displays this message." << std::endl;
 87 |             std::cerr << "         --second-best   displays second best score (WITHOUT_START only)." << std::endl;
 88 |             std::cerr << "Single-pack multi-Parameters (e.g. -sp) is not supported." << std::endl;
 89 |             std::cerr << "		  "  << std::endl;
 90 | }
 91 | 
 92 | 
 93 | void Parameters::parse() {
 94 | 
 95 |     // before testing anything, check if calling for help.
 96 |     int c;
 97 |         
 98 |     std::string arg_next = "";
 99 |     std::string arg_cur = "";
100 | 
101 |     for (c = 1; c < argc; c++)
102 |     {
103 |         arg_cur = std::string((const char*) (*(argv + c) ) );
104 |         arg_next = "";
105 |         if (!arg_cur.compare("--help") || !arg_cur.compare("-h"))
106 |         {
107 |             help();
108 |             exit(0);
109 |         }
110 |     }
111 | 
112 |     if (argc < 4)
113 |     {
114 |         failure(NOT_ENOUGH_ARGS);
115 |     }
116 | 
117 |     for (c = 1; c < argc - 2; c++)
118 |     {
119 |         arg_cur = std::string((const char*) (*(argv + c) ) );
120 |         if (arg_cur.at(0) == '-' && arg_cur.at(1) == '-' )
121 |         {
122 |             if (!arg_cur.compare("--help"))
123 |             {
124 |                 help();
125 |                 exit(0);
126 |             }
127 |             if (!arg_cur.compare("--second-best"))
128 |             {
129 |                 secondBest = TRUE;
130 |             }
131 | 
132 |         } else if (arg_cur.at(0) == '-' )
133 |         {
134 |             if (arg_cur.length() > 2)
135 |                 failure(WRONG_ARG);
136 |             char param = arg_cur.at(1);
137 |             switch(param)
138 |             {
139 |                 case 'y':
140 |                     c++;
141 |                     arg_next = std::string((const char*) (*(argv + c) ) );
142 |                     if (!arg_next.compare("local"))
143 |                         algo = LOCAL;
144 |                     else if (!arg_next.compare("semi_global"))
145 |                         algo = SEMI_GLOBAL;
146 |                     else if (!arg_next.compare("global"))
147 |                         algo = GLOBAL;
148 |                     else if (!arg_next.compare("ksw"))
149 |                     {
150 |                         algo = KSW;
151 |                     }
152 |                 break;
153 |                 case 'a':
154 |                     c++;
155 |                     arg_next = std::string((const char*) (*(argv + c) ) );
156 |                     sa = std::stoi(arg_next);
157 |                 break;
158 |                 case 'b':
159 |                     c++;
160 |                     arg_next = std::string((const char*) (*(argv + c) ) );
161 |                     sb = std::stoi(arg_next);
162 |                 break;
163 |                 case 'q':
164 |                     c++;
165 |                     arg_next = std::string((const char*) (*(argv + c) ) );
166 |                     gapo = std::stoi(arg_next);
167 |                 break;
168 |                 case 'r':
169 |                     c++;
170 |                     arg_next = std::string((const char*) (*(argv + c) ) );
171 |                     gape = std::stoi(arg_next);
172 |                 break;
173 |                 case 's':
174 |                     start_pos = WITH_START;
175 |                 break;
176 |                 case 't':
177 |                 	start_pos = WITH_TB;
178 |                 	break;
179 |                 case 'p':
180 |                     print_out = 1;
181 |                 break;
182 |                 case 'n':
183 |                     c++;
184 |                     arg_next = std::string((const char*) (*(argv + c) ) );
185 |                     n_threads = std::stoi(arg_next);
186 |                 break;
187 |                 case 'k':
188 |                     c++;
189 |                     arg_next = std::string((const char*) (*(argv + c) ) );
190 |                     k_band = std::stoi(arg_next);
191 |                 break;
192 |                 case 'x':
193 |                     c++;
194 |                     arg_next = std::string((const char*) (*(argv + c) ) );
195 |                     if (!arg_next.compare("NONE"))
196 |                         semiglobal_skipping_head = NONE;
197 |                     else if (!arg_next.compare("TARGET"))
198 |                         semiglobal_skipping_head = TARGET;
199 |                     else if (!arg_next.compare("QUERY"))
200 |                         semiglobal_skipping_head = QUERY;
201 |                     else if (!arg_next.compare("BOTH"))
202 |                         semiglobal_skipping_head = BOTH;
203 |                     else 
204 |                     {
205 |                         failure(WRONG_ARG);
206 |                     }
207 | 
208 |                     c++;
209 |                     arg_next = std::string((const char*) (*(argv + c) ) );
210 |                     if (!arg_next.compare("NONE"))
211 |                         semiglobal_skipping_tail = NONE;
212 |                     else if (!arg_next.compare("TARGET"))
213 |                         semiglobal_skipping_tail = TARGET;
214 |                     else if (!arg_next.compare("QUERY"))
215 |                         semiglobal_skipping_tail = QUERY;
216 |                     else if (!arg_next.compare("BOTH"))
217 |                         semiglobal_skipping_tail = BOTH;
218 |                     else 
219 |                     {
220 |                         failure(WRONG_ARG);
221 |                     }
222 |                 break;
223 | 
224 |             }
225 | 
226 |             
227 |         } else {
228 |             failure(WRONG_ARG);
229 |         }
230 |     }
231 | 
232 | 
233 |     // the last 2 Parameters are the 2 filenames.
234 |     query_batch_fasta_filename = std::string( (const char*)  (*(argv + c) ) );
235 |     c++;
236 |     target_batch_fasta_filename = std::string( (const char*) (*(argv + c) ) );
237 | 
238 |     // Parameters retrieved successfully, open files.
239 |     fileopen();
240 | }
241 | 
242 | void Parameters::fileopen() {
243 |     query_batch_fasta.open(query_batch_fasta_filename, std::ifstream::in);
244 |     if (!query_batch_fasta)
245 |         failure(WRONG_FILES);
246 | 
247 |     target_batch_fasta.open(target_batch_fasta_filename);
248 |     if (!target_batch_fasta)
249 |         failure(WRONG_FILES);
250 | }
251 | 


--------------------------------------------------------------------------------
/src/kernels/global.h:
--------------------------------------------------------------------------------
  1 | #ifndef __KERNEL_GLOBAL__
  2 | #define __KERNEL_GLOBAL__
  3 | 
  4 | #define CORE_GLOBAL_COMPUTE() \
  5 | 		uint32_t gbase = (gpac >> l) & 15;\
  6 | 		DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);\
  7 | 		int32_t tmp_hm = p[m] + subScore;\
  8 | 		h[m] = max(tmp_hm, f[m]);\
  9 | 		h[m] = max(h[m], e);\
 10 | 		f[m] =  (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ?  (tmp_hm - _cudaGapOE) : (f[m] - _cudaGapExtend);\
 11 | 		e =  (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ?  (tmp_hm - _cudaGapOE) : (e - _cudaGapExtend);\
 12 | 		p[m] = h[m-1];\
 13 | 
 14 | #define CORE_GLOBAL_COMPUTE_TB(direction_reg) \
 15 | 		uint32_t gbase = (gpac >> l) & 15;\
 16 | 		DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);\
 17 | 		int32_t tmp_hm = p[m] + subScore;\
 18 | 		uint32_t m_or_x = tmp_hm >= p[m] ? 0 : 1;\
 19 | 		h[m] = max(tmp_hm, f[m]);\
 20 | 		h[m] = max(h[m], e);\
 21 | 		direction_reg |= h[m] == tmp_hm ? m_or_x << (28 - ((m - 1) << 2)) : (h[m] == f[m] ? (uint32_t)3 << (28 - ((m - 1) << 2)) : (uint32_t)2 << (28 - ((m - 1) << 2)));\
 22 | 		direction_reg |= (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ?  (uint32_t)0 : (uint32_t)1 << (31 - ((m - 1) << 2));\
 23 | 		f[m] =  (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ?  (tmp_hm - _cudaGapOE) : (f[m] - _cudaGapExtend);\
 24 | 		direction_reg|= (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ?  (uint32_t)0 : (uint32_t)1 << (30 - ((m - 1) << 2));\
 25 | 		e =  (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ?  (tmp_hm - _cudaGapOE) : (e - _cudaGapExtend);\
 26 | 		p[m] = h[m-1];\
 27 | 
 28 | 
 29 | 
 30 | template <typename S>
 31 | __global__ void gasal_global_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch,  uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, uint4 *packed_tb_matrices, int n_tasks)
 32 | {
 33 | 	int32_t i, j, k, l, m;
 34 | 	int32_t u = 0, r = 0;
 35 | 	int32_t e;
 36 | 	int32_t subScore;
 37 | 	int tile_no = 0;
 38 | 
 39 | 	int32_t ridx;
 40 | 	short2 HD;
 41 | 
 42 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 43 | 	if (tid >= n_tasks) return;
 44 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence
 45 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
 46 | 	uint32_t read_len = query_batch_lens[tid];
 47 | 	uint32_t ref_len = target_batch_lens[tid];
 48 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch
 49 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch
 50 | 	//-------arrays to save intermediate values----------------
 51 | 	short2 global[MAX_QUERY_LEN];
 52 | 	int32_t h[9];
 53 | 	int32_t f[9];
 54 | 	int32_t p[9];
 55 | 	int32_t max_h[9];
 56 | 	//----------------------------------------------------------
 57 | 	global[0] = make_short2(0, MINUS_INF);
 58 | 	for (i = 1; i < MAX_QUERY_LEN; i++) {
 59 | 		global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF);
 60 | 	}
 61 | 
 62 | 
 63 | 	h[u++] = 0;
 64 | 	p[r++] = 0;
 65 | 	for (i = 0; i < target_batch_regs; i++) { //target_batch sequence in rows, for all WORDS (i=WORD index)
 66 | 		ridx = 0;
 67 | 		for (m = 1; m < 9; m++, u++, r++) {
 68 | 			h[m] = -(_cudaGapO + (_cudaGapExtend*(u))); 
 69 | 			f[m] = MINUS_INF; 
 70 | 			p[m] = r == 1 ? 0 : -(_cudaGapO + (_cudaGapExtend*(r-1)));
 71 | 		}
 72 | 		register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence
 73 | 
 74 | 
 75 | 		for (j = 0; j < query_batch_regs; /*++j*/ j+=1) { //query_batch sequence in columns, for all WORDS (j=WORD index).
 76 | 
 77 | 			register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 packed bases from query_batch sequence
 78 | 
 79 | 			//--------------compute a tile of 8x8 cells-------------------
 80 | 			if (SAMETYPE(S, Int2Type<WITH_TB>)) {
 81 | 				uint4 direction = make_uint4(0,0,0,0);
 82 | 				uint32_t rbase = (rpac >> 28) & 15;//get a base from query_batch sequence
 83 | 				//------------load intermediate values----------------------
 84 | 				HD = global[ridx];
 85 | 				h[0] = HD.x;
 86 | 				e = HD.y;
 87 | #pragma unroll 8
 88 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
 89 | 					CORE_GLOBAL_COMPUTE_TB(direction.x);
 90 | 				}
 91 | 				//--------------save intermediate values-------------------------
 92 | 				HD.x = h[m-1];
 93 | 				HD.y = e;//max(e, 0);
 94 | 				global[ridx] = HD;
 95 | 				ridx++;
 96 | 				//--------------------------------------------------------------
 97 | 				//------the last column of DP matrix------------
 98 | 				if (ridx == read_len) {
 99 | 					for (m = 1; m < 9; m++) {
100 | 						max_h[m] = h[m];
101 | 
102 | 					}
103 | 				}
104 | 				rbase = (rpac >> 24) & 15;//get a base from query_batch sequence
105 | 				//------------load intermediate values----------------------
106 | 				HD = global[ridx];
107 | 				h[0] = HD.x;
108 | 				e = HD.y;
109 | #pragma unroll 8
110 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
111 | 					CORE_GLOBAL_COMPUTE_TB(direction.y);
112 | 				}
113 | 				//--------------save intermediate values-------------------------
114 | 				HD.x = h[m-1];
115 | 				HD.y = e;//max(e, 0);
116 | 				global[ridx] = HD;
117 | 				ridx++;
118 | 				//--------------------------------------------------------------
119 | 				//------the last column of DP matrix------------
120 | 				if (ridx == read_len) {
121 | 					for (m = 1; m < 9; m++) {
122 | 						max_h[m] = h[m];
123 | 
124 | 					}
125 | 				}
126 | 				rbase = (rpac >> 20) & 15;//get a base from query_batch sequence
127 | 				//------------load intermediate values----------------------
128 | 				HD = global[ridx];
129 | 				h[0] = HD.x;
130 | 				e = HD.y;
131 | #pragma unroll 8
132 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
133 | 					CORE_GLOBAL_COMPUTE_TB(direction.z);
134 | 				}
135 | 				//--------------save intermediate values-------------------------
136 | 				HD.x = h[m-1];
137 | 				HD.y = e;//max(e, 0);
138 | 				global[ridx] = HD;
139 | 				ridx++;
140 | 				//--------------------------------------------------------------
141 | 				//------the last column of DP matrix------------
142 | 				if (ridx == read_len) {
143 | 					for (m = 1; m < 9; m++) {
144 | 						max_h[m] = h[m];
145 | 
146 | 					}
147 | 				}
148 | 				rbase = (rpac >> 16) & 15;//get a base from query_batch sequence
149 | 				//------------load intermediate values----------------------
150 | 				HD = global[ridx];
151 | 				h[0] = HD.x;
152 | 				e = HD.y;
153 | #pragma unroll 8
154 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
155 | 					CORE_GLOBAL_COMPUTE_TB(direction.w);
156 | 				}
157 | 				//--------------save intermediate values-------------------------
158 | 				HD.x = h[m-1];
159 | 				HD.y = e;//max(e, 0);
160 | 				global[ridx] = HD;
161 | 				ridx++;
162 | 				//--------------------------------------------------------------
163 | 				//------the last column of DP matrix------------
164 | 				if (ridx == read_len) {
165 | 					for (m = 1; m < 9; m++) {
166 | 						max_h[m] = h[m];
167 | 
168 | 					}
169 | 				}
170 | 				packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
171 | 				tile_no++;
172 | 
173 | 				direction = make_uint4(0,0,0,0);
174 | 				rbase = (rpac >> 12) & 15;//get a base from query_batch sequence
175 | 				//------------load intermediate values----------------------
176 | 				HD = global[ridx];
177 | 				h[0] = HD.x;
178 | 				e = HD.y;
179 | #pragma unroll 8
180 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
181 | 					CORE_GLOBAL_COMPUTE_TB(direction.x);
182 | 				}
183 | 				//--------------save intermediate values-------------------------
184 | 				HD.x = h[m-1];
185 | 				HD.y = e;//max(e, 0);
186 | 				global[ridx] = HD;
187 | 				ridx++;
188 | 				//--------------------------------------------------------------
189 | 				//------the last column of DP matrix------------
190 | 				if (ridx == read_len) {
191 | 					for (m = 1; m < 9; m++) {
192 | 						max_h[m] = h[m];
193 | 
194 | 					}
195 | 				}
196 | 				rbase = (rpac >> 8) & 15;//get a base from query_batch sequence
197 | 				//------------load intermediate values----------------------
198 | 				HD = global[ridx];
199 | 				h[0] = HD.x;
200 | 				e = HD.y;
201 | #pragma unroll 8
202 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
203 | 					CORE_GLOBAL_COMPUTE_TB(direction.y);
204 | 				}
205 | 				//--------------save intermediate values-------------------------
206 | 				HD.x = h[m-1];
207 | 				HD.y = e;//max(e, 0);
208 | 				global[ridx] = HD;
209 | 				ridx++;
210 | 				//--------------------------------------------------------------
211 | 				//------the last column of DP matrix------------
212 | 				if (ridx == read_len) {
213 | 					for (m = 1; m < 9; m++) {
214 | 						max_h[m] = h[m];
215 | 
216 | 					}
217 | 				}
218 | 				rbase = (rpac >> 4) & 15;//get a base from query_batch sequence
219 | 				//------------load intermediate values----------------------
220 | 				HD = global[ridx];
221 | 				h[0] = HD.x;
222 | 				e = HD.y;
223 | #pragma unroll 8
224 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
225 | 					CORE_GLOBAL_COMPUTE_TB(direction.z);
226 | 				}
227 | 				//--------------save intermediate values-------------------------
228 | 				HD.x = h[m-1];
229 | 				HD.y = e;//max(e, 0);
230 | 				global[ridx] = HD;
231 | 				ridx++;
232 | 				//--------------------------------------------------------------
233 | 				//------the last column of DP matrix------------
234 | 				if (ridx == read_len) {
235 | 					for (m = 1; m < 9; m++) {
236 | 						max_h[m] = h[m];
237 | 
238 | 					}
239 | 				}
240 | 				rbase = rpac & 15;//get a base from query_batch sequence
241 | 				//------------load intermediate values----------------------
242 | 				HD = global[ridx];
243 | 				h[0] = HD.x;
244 | 				e = HD.y;
245 | #pragma unroll 8
246 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
247 | 					CORE_GLOBAL_COMPUTE_TB(direction.w);
248 | 				}
249 | 				//--------------save intermediate values-------------------------
250 | 				HD.x = h[m-1];
251 | 				HD.y = e;//max(e, 0);
252 | 				global[ridx] = HD;
253 | 				ridx++;
254 | 				//--------------------------------------------------------------
255 | 				//------the last column of DP matrix------------
256 | 				if (ridx == read_len) {
257 | 					for (m = 1; m < 9; m++) {
258 | 						max_h[m] = h[m];
259 | 
260 | 					}
261 | 				}
262 | 				packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
263 | 				tile_no++;
264 | 
265 | 			}
266 | 			else{
267 | 			for (k = 28; k >= 0; k -= 4) {
268 | 				uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
269 | 				//------------load intermediate values----------------------
270 | 				HD = global[ridx];
271 | 				h[0] = HD.x;
272 | 				e = HD.y;
273 | 				//----------------------------------------------------------
274 | 				#pragma unroll 8
275 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
276 | 					CORE_GLOBAL_COMPUTE();
277 | 				}
278 | 				//--------------save intermediate values-------------------------
279 | 				HD.x = h[m-1];
280 | 				HD.y = e;//max(e, 0);
281 | 				global[ridx] = HD;
282 | 				ridx++;
283 | 				//--------------------------------------------------------------
284 | 				//------the last column of DP matrix------------
285 | 				if (ridx == read_len) {
286 | 					for (m = 1; m < 9; m++) {
287 | 						max_h[m] = h[m];
288 | 
289 | 					}
290 | 				}
291 | 				//----------------------------------------------
292 | 			}
293 | 		}
294 | 			//------------------------------------------------------------------
295 | 		}
296 | 
297 | 	}
298 | 	
299 | 	device_res->aln_score[tid] = max_h[8 - ((target_batch_regs << 3) - (ref_len))];//copy the max score to the output array in the GPU mem
300 | 
301 | 	return;
302 | 
303 | }
304 | #endif
305 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/ctors.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "gasal.h"
  3 | 
  4 | #include "args_parser.h"
  5 | 
  6 | #include "host_batch.h"
  7 | 
  8 | #include "res.h"
  9 | 
 10 | #include "ctors.h"
 11 | 
 12 | #include "interfaces.h"
 13 | 
 14 | #include <cmath>
 15 | 
 16 | 
 17 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams) {
 18 | 	gasal_gpu_storage_v v;
 19 | 	v.a = (gasal_gpu_storage_t*)calloc(n_streams, sizeof(gasal_gpu_storage_t));
 20 | 	v.n = n_streams;
 21 | 	return v;
 22 | 
 23 | }
 24 | 
 25 | 
 26 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec,  int max_query_len, int max_target_len, int max_n_alns,  Parameters *params) {
 27 | 
 28 | 	cudaError_t err;
 29 | 	int i;
 30 | 	int max_query_len_8 = max_query_len % 8 ? max_query_len + (8 - (max_query_len % 8)) : max_query_len;
 31 | 	int max_target_len_8 = max_target_len % 8 ? max_target_len + (8 - (max_target_len % 8)) : max_target_len;
 32 | 
 33 | 	int host_max_query_batch_bytes = max_n_alns * max_query_len_8;
 34 | 	int gpu_max_query_batch_bytes = max_n_alns * max_query_len_8;
 35 | 	int host_max_target_batch_bytes =  max_n_alns * max_target_len_8;
 36 | 	int gpu_max_target_batch_bytes =  max_n_alns * max_target_len_8;
 37 | 	int host_max_n_alns = max_n_alns;
 38 | 	int gpu_max_n_alns = max_n_alns;
 39 | 
 40 | 
 41 | 
 42 | 	for (i = 0; i < gpu_storage_vec->n; i++) {
 43 | 
 44 | 		gpu_storage_vec->a[i].extensible_host_unpacked_query_batch = gasal_host_batch_new(host_max_query_batch_bytes, 0);
 45 | 		gpu_storage_vec->a[i].extensible_host_unpacked_target_batch = gasal_host_batch_new(host_max_target_batch_bytes, 0);
 46 | 
 47 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t)));
 48 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t)));
 49 | 
 50 | 
 51 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault));
 52 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_op), host_max_n_alns * sizeof(uint8_t), cudaHostAllocDefault));
 53 | 		uint8_t *no_ops = NULL;
 54 | 		no_ops = (uint8_t*) calloc(host_max_n_alns * sizeof(uint8_t), sizeof(uint8_t));
 55 | 		gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, QUERY);
 56 | 		gasal_op_fill(&(gpu_storage_vec->a[i]), no_ops, host_max_n_alns, TARGET);
 57 | 		free(no_ops);
 58 | 
 59 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_op), gpu_max_n_alns * sizeof(uint8_t)));
 60 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_op), gpu_max_n_alns * sizeof(uint8_t)));
 61 | 
 62 | 
 63 | 
 64 | 		if (params->isPacked)
 65 | 		{
 66 | 			gpu_storage_vec->a[i].packed_query_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_query_batch;
 67 | 			gpu_storage_vec->a[i].packed_target_batch = (uint32_t *) gpu_storage_vec->a[i].unpacked_target_batch;
 68 | 
 69 | 		} else {
 70 | 			CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t)));
 71 | 			CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t)));
 72 | 		}
 73 | 
 74 | 		if (params->algo == KSW)
 75 | 		{
 76 | 			CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_seed_scores), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault));
 77 | 			CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].seed_scores), host_max_n_alns * sizeof(uint32_t)));
 78 | 		} else {
 79 | 			gpu_storage_vec->a[i].host_seed_scores = NULL;
 80 | 			gpu_storage_vec->a[i].seed_scores = NULL;
 81 | 		}
 82 | 
 83 | 
 84 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault));
 85 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_lens), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault));
 86 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_query_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault));
 87 | 		CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_target_batch_offsets), host_max_n_alns * sizeof(uint32_t), cudaHostAllocDefault));
 88 | 
 89 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_lens), gpu_max_n_alns * sizeof(uint32_t)));
 90 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_lens), gpu_max_n_alns * sizeof(uint32_t)));
 91 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t)));
 92 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t)));
 93 | 		
 94 | 
 95 | 		gpu_storage_vec->a[i].host_res = gasal_res_new_host(host_max_n_alns, params);
 96 | 		if(params->start_pos == WITH_TB) CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage_vec->a[i].host_res->cigar), gpu_max_query_batch_bytes * sizeof(uint8_t),cudaHostAllocDefault));
 97 | 		gpu_storage_vec->a[i].device_cpy = gasal_res_new_device_cpy(max_n_alns,  params);
 98 | 		gpu_storage_vec->a[i].device_res = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy);
 99 | 
100 | 		if (params->secondBest)
101 | 		{	
102 | 			gpu_storage_vec->a[i].host_res_second = gasal_res_new_host(host_max_n_alns,  params);
103 | 			gpu_storage_vec->a[i].device_cpy_second = gasal_res_new_device_cpy(host_max_n_alns, params);
104 | 			gpu_storage_vec->a[i].device_res_second = gasal_res_new_device(gpu_storage_vec->a[i].device_cpy_second);
105 | 
106 | 		} else {
107 | 			gpu_storage_vec->a[i].host_res_second = NULL;
108 | 			gpu_storage_vec->a[i].device_cpy_second = NULL;
109 | 			gpu_storage_vec->a[i].device_res_second = NULL;
110 | 		}
111 | 
112 | 		if (params->start_pos == WITH_TB) {
113 | 			gpu_storage_vec->a[i].packed_tb_matrix_size = ((uint32_t)ceil(((double)((uint64_t)max_query_len_8*(uint64_t)max_target_len_8))/32)) * gpu_max_n_alns;
114 | 			CHECKCUDAERROR(cudaMalloc(&(gpu_storage_vec->a[i].packed_tb_matrices), gpu_storage_vec->a[i].packed_tb_matrix_size * sizeof(uint4)));
115 | 		}
116 | 
117 | 
118 | 		CHECKCUDAERROR(cudaStreamCreate(&(gpu_storage_vec->a[i].str)));
119 | 		gpu_storage_vec->a[i].is_free = 1;
120 | 		gpu_storage_vec->a[i].host_max_query_batch_bytes = host_max_query_batch_bytes;
121 | 		gpu_storage_vec->a[i].host_max_target_batch_bytes = host_max_target_batch_bytes;
122 | 		gpu_storage_vec->a[i].host_max_n_alns = host_max_n_alns;
123 | 		gpu_storage_vec->a[i].gpu_max_query_batch_bytes = gpu_max_query_batch_bytes;
124 | 		gpu_storage_vec->a[i].gpu_max_target_batch_bytes = gpu_max_target_batch_bytes;
125 | 		gpu_storage_vec->a[i].gpu_max_n_alns = gpu_max_n_alns;
126 | 		gpu_storage_vec->a[i].current_n_alns = 0;
127 | 	}
128 | }
129 | 
130 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params) {
131 | 
132 | 	cudaError_t err;
133 | 
134 | 	int i;
135 | 	for (i = 0; i < gpu_storage_vec->n; i ++) {
136 | 		
137 | 		gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_query_batch);
138 | 		gasal_host_batch_destroy(gpu_storage_vec->a[i].extensible_host_unpacked_target_batch);
139 | 
140 | 		gasal_res_destroy_host(gpu_storage_vec->a[i].host_res);
141 | 		gasal_res_destroy_device(gpu_storage_vec->a[i].device_res, gpu_storage_vec->a[i].device_cpy);
142 | 
143 | 		if (params->secondBest)
144 | 		{
145 | 			gasal_res_destroy_host(gpu_storage_vec->a[i].host_res_second);
146 | 			gasal_res_destroy_device(gpu_storage_vec->a[i].device_res_second, gpu_storage_vec->a[i].device_cpy_second);
147 | 		}
148 | 
149 | 
150 | 		if (!(params->algo == KSW))
151 | 		{
152 | 			if (gpu_storage_vec->a[i].seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].seed_scores));
153 | 			if (gpu_storage_vec->a[i].host_seed_scores != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_seed_scores));
154 | 		}
155 | 
156 | 		if (gpu_storage_vec->a[i].query_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_op));
157 | 		if (gpu_storage_vec->a[i].target_op != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_op));
158 | 		if (gpu_storage_vec->a[i].host_query_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_op));
159 | 		if (gpu_storage_vec->a[i].host_target_op != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_op));
160 | 
161 | 		if (gpu_storage_vec->a[i].host_query_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_offsets));
162 | 		if (gpu_storage_vec->a[i].host_target_batch_offsets != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_offsets));
163 | 		if (gpu_storage_vec->a[i].host_query_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_query_batch_lens));
164 | 		if (gpu_storage_vec->a[i].host_target_batch_lens != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_target_batch_lens));
165 | 		if (gpu_storage_vec->a[i].host_res->cigar != NULL) CHECKCUDAERROR(cudaFreeHost(gpu_storage_vec->a[i].host_res->cigar));
166 | 
167 | 
168 | 
169 | 
170 | 		if (gpu_storage_vec->a[i].unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_query_batch));
171 | 		if (gpu_storage_vec->a[i].unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].unpacked_target_batch));
172 | 		if (!(params->isPacked))
173 | 		{
174 | 			if (gpu_storage_vec->a[i].packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_query_batch));
175 | 			if (gpu_storage_vec->a[i].packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_target_batch));
176 | 		}
177 | 
178 | 
179 | 		if (gpu_storage_vec->a[i].query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_offsets));
180 | 		if (gpu_storage_vec->a[i].target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_offsets));
181 | 		if (gpu_storage_vec->a[i].query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].query_batch_lens));
182 | 		if (gpu_storage_vec->a[i].target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].target_batch_lens));
183 | 		if (gpu_storage_vec->a[i].packed_tb_matrices != NULL) CHECKCUDAERROR(cudaFree(gpu_storage_vec->a[i].packed_tb_matrices));
184 | 
185 | 		if (gpu_storage_vec->a[i].str != NULL)CHECKCUDAERROR(cudaStreamDestroy(gpu_storage_vec->a[i].str));
186 | 	}
187 | 
188 | 
189 | 
190 | }
191 | 
192 | 
193 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec) {
194 | 
195 | 	if(gpu_storage_vec->a != NULL) free(gpu_storage_vec->a);
196 | }
197 | 
198 | 
199 | 
200 | 
201 | // Deprecated
202 | void gasal_gpu_mem_alloc(gasal_gpu_storage_t *gpu_storage, int gpu_max_query_batch_bytes, int gpu_max_target_batch_bytes, int gpu_max_n_alns, Parameters *params) {
203 | 
204 | 	cudaError_t err;
205 | 	//	if (gpu_storage->gpu_max_query_batch_bytes % 8) {
206 | 	//		fprintf(stderr, "[GASAL ERROR:] max_query_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_query_batch_bytes % 8);
207 | 	//		exit(EXIT_FAILURE);
208 | 	//	}
209 | 	//	if (gpu_storage->gpu_max_target_batch_bytes % 8) {
210 | 	//		fprintf(stderr, "[GASAL ERROR:] max_target_batch_bytes=%d is not a multiple of 8\n", gpu_storage->gpu_max_target_batch_bytes % 8);
211 | 	//		exit(EXIT_FAILURE);
212 | 	//	}
213 | 
214 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_max_query_batch_bytes * sizeof(uint8_t)));
215 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_max_target_batch_bytes * sizeof(uint8_t)));
216 | 
217 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_max_query_batch_bytes/8) * sizeof(uint32_t)));
218 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_max_target_batch_bytes/8) * sizeof(uint32_t)));
219 | 
220 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_max_n_alns * sizeof(uint32_t)));
221 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_max_n_alns * sizeof(uint32_t)));
222 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_max_n_alns * sizeof(uint32_t)));
223 | 	CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_max_n_alns * sizeof(uint32_t)));
224 | 
225 | 	gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy);
226 | 
227 | 	gpu_storage->gpu_max_query_batch_bytes = gpu_max_query_batch_bytes;
228 | 	gpu_storage->gpu_max_target_batch_bytes = gpu_max_target_batch_bytes;
229 | 	gpu_storage->gpu_max_n_alns = gpu_max_n_alns;
230 | 
231 | }
232 | 
233 | // Deprecated
234 | void gasal_gpu_mem_free(gasal_gpu_storage_t *gpu_storage, Parameters *params) {
235 | 
236 | 	cudaError_t err;
237 | 
238 | 	if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch));
239 | 	if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch));
240 | 	if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch));
241 | 	if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch));
242 | 	if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets));
243 | 	if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets));
244 | 	if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens));
245 | 	if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens));
246 | 	
247 | 	gasal_res_destroy_device(gpu_storage->device_res,gpu_storage->device_cpy);
248 | 	if (params->secondBest)
249 | 	{
250 | 		gasal_res_destroy_device(gpu_storage->device_res_second, gpu_storage->device_cpy_second);
251 | 	}
252 | }
253 | 


--------------------------------------------------------------------------------
/src/__deprecated.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | //GASAL2 blocking alignment function - DEPRECATED
  4 | /*
  5 | void gasal_aln(gasal_gpu_storage_t *gpu_storage, const uint8_t *query_batch, const uint32_t *query_batch_offsets, const uint32_t *query_batch_lens, const uint8_t *target_batch, const uint32_t *target_batch_offsets, const uint32_t *target_batch_lens, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, int32_t *host_aln_score, int32_t *host_query_batch_start, int32_t *host_target_batch_start, int32_t *host_query_batch_end, int32_t *host_target_batch_end,  algo_type algo, comp_start start) {
  6 | 
  7 | 	cudaError_t err;
  8 | 	if (actual_n_alns <= 0) {
  9 | 			fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n");
 10 | 			exit(EXIT_FAILURE);
 11 | 		}
 12 | 		if (actual_query_batch_bytes <= 0) {
 13 | 			fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n");
 14 | 			exit(EXIT_FAILURE);
 15 | 		}
 16 | 		if (actual_target_batch_bytes <= 0) {
 17 | 			fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n");
 18 | 			exit(EXIT_FAILURE);
 19 | 		}
 20 | 
 21 | 		if (actual_query_batch_bytes % 8) {
 22 | 			fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes);
 23 | 			exit(EXIT_FAILURE);
 24 | 		}
 25 | 		if (actual_target_batch_bytes % 8) {
 26 | 			fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes);
 27 | 			exit(EXIT_FAILURE);
 28 | 
 29 | 		}
 30 | 	//--------------if pre-allocated memory is less, allocate more--------------------------
 31 | 	if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) {
 32 | 
 33 | 		int i = 2;
 34 | 		while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++;
 35 | 		gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i;
 36 | 
 37 | 		fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i);
 38 | 
 39 | 		if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch));
 40 | 		if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch));
 41 | 
 42 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t)));
 43 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t)));
 44 | 
 45 | 
 46 | 
 47 | 
 48 | 	}
 49 | 
 50 | 	if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) {
 51 | 
 52 | 		int i = 2;
 53 | 		while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++;
 54 | 		gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i;
 55 | 
 56 | 		fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i);
 57 | 
 58 | 		if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch));
 59 | 		if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch));
 60 | 
 61 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t)));
 62 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t)));
 63 | 
 64 | 
 65 | 	}
 66 | 
 67 | 	if (gpu_storage->gpu_max_n_alns < actual_n_alns) {
 68 | 		fprintf(stderr, "[GASAL] gpu_max_n_alns(%d) should be >= acutal_n_alns(%d)\n", gpu_storage->gpu_max_n_alns, actual_n_alns);
 69 | 
 70 | 		int i = 2;
 71 | 		while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++;
 72 | 		gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i;
 73 | 
 74 | 		fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on  GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i);
 75 | 
 76 | 
 77 | 		if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets));
 78 | 		if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets));
 79 | 		if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens));
 80 | 		if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens));
 81 | 		if (gpu_storage->aln_score != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->aln_score));
 82 | 		if (gpu_storage->query_batch_start != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_start));
 83 | 		if (gpu_storage->target_batch_start != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_start));
 84 | 		if (gpu_storage->query_batch_end != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_end));
 85 | 		if (gpu_storage->target_batch_end != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_end));
 86 | 
 87 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
 88 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
 89 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
 90 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
 91 | 
 92 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->aln_score),gpu_storage->gpu_max_n_alns * sizeof(int32_t)));
 93 | 		if (algo == GLOBAL) {
 94 | 			gpu_storage->query_batch_start = NULL;
 95 | 			gpu_storage->query_batch_end = NULL;
 96 | 			gpu_storage->target_batch_start = NULL;
 97 | 			gpu_storage->target_batch_end = NULL;
 98 | 		} else {
 99 | 			CHECKCUDAERROR(
100 | 					cudaMalloc(&(gpu_storage->target_batch_end),
101 | 							gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
102 | 			if (start == WITH_START) {
103 | 				CHECKCUDAERROR(
104 | 						cudaMalloc(&(gpu_storage->target_batch_start),
105 | 								gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
106 | 			} else
107 | 				gpu_storage->target_batch_start = NULL;
108 | 			if (algo == LOCAL) {
109 | 				CHECKCUDAERROR(
110 | 						cudaMalloc(&(gpu_storage->query_batch_end),
111 | 								gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
112 | 				if (start == WITH_START) {
113 | 					CHECKCUDAERROR(
114 | 							cudaMalloc(&(gpu_storage->query_batch_start),
115 | 									gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
116 | 				} else
117 | 					gpu_storage->query_batch_start = NULL;
118 | 			} else {
119 | 				gpu_storage->query_batch_start = NULL;
120 | 				gpu_storage->query_batch_end = NULL;
121 | 			}
122 | 		}
123 | 
124 | 
125 | 
126 | 	}
127 | 	//-------------------------------------------------------------------------------------------
128 | 
129 | 	//------------------------copy sequence batches from CPU to GPU---------------------------
130 | 	CHECKCUDAERROR(cudaMemcpy(gpu_storage->unpacked_query_batch, query_batch, actual_query_batch_bytes, cudaMemcpyHostToDevice));
131 | 	CHECKCUDAERROR(cudaMemcpy(gpu_storage->unpacked_target_batch, target_batch, actual_target_batch_bytes, cudaMemcpyHostToDevice));
132 | 	//----------------------------------------------------------------------------------------
133 | 
134 |     uint32_t BLOCKDIM = 128;
135 |     uint32_t N_BLOCKS = (actual_n_alns + BLOCKDIM - 1) / BLOCKDIM;
136 | 
137 |     int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
138 |     int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
139 | 
140 |     //launch packing kernel
141 |     gasal_pack_kernel<<<N_BLOCKS, BLOCKDIM>>>   ((uint32_t*)(gpu_storage->unpacked_query_batch),
142 |     						(uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch,
143 |     					    query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4);
144 |     cudaError_t pack_kernel_err = cudaGetLastError();
145 |     if ( cudaSuccess != pack_kernel_err )
146 |     {
147 |     	 fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err,  __LINE__, __FILE__);
148 |          exit(EXIT_FAILURE);
149 |     }
150 | 
151 |     //----------------------copy sequence offsets and lengths from CPU to GPU--------------------------------------
152 |     CHECKCUDAERROR(cudaMemcpy(gpu_storage->query_batch_lens, query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice));
153 |     CHECKCUDAERROR(cudaMemcpy(gpu_storage->target_batch_lens, target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice));
154 |     CHECKCUDAERROR(cudaMemcpy(gpu_storage->query_batch_offsets, query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice));
155 |     CHECKCUDAERROR(cudaMemcpy(gpu_storage->target_batch_offsets, target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice));
156 |     //------------------------------------------------------------------------------------------------------------------------
157 | 
158 |     //--------------------------------------launch alignment kernels--------------------------------------------------------------
159 |     if(algo == LOCAL) {
160 |     	if (start == WITH_START) {
161 | 			gasal_local_with_start_kernel<<<N_BLOCKS, BLOCKDIM>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
162 | 					gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score,
163 | 					gpu_storage->query_batch_end, gpu_storage->target_batch_end,  gpu_storage->query_batch_start,
164 |     				gpu_storage->target_batch_start, actual_n_alns);
165 |     	} else {
166 |     		gasal_local_kernel<<<N_BLOCKS, BLOCKDIM>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
167 |     				gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score,
168 |     				gpu_storage->query_batch_end, gpu_storage->target_batch_end, actual_n_alns, LOCAL);
169 |     	}
170 |     } else if (algo == SEMI_GLOBAL) {
171 |     	if (start == WITH_START) {
172 |     		gasal_semi_global_with_start_kernel<<<N_BLOCKS, BLOCKDIM>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
173 |     				gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, gpu_storage->target_batch_end,
174 |     				gpu_storage->target_batch_start, actual_n_alns);
175 |     	} else {
176 |     		gasal_semi_global_kernel<<<N_BLOCKS, BLOCKDIM>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
177 |     				gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, gpu_storage->target_batch_end,
178 |     				actual_n_alns);
179 |     	}
180 | 
181 |     } else if (algo == GLOBAL) {
182 |     	gasal_global_kernel<<<N_BLOCKS, BLOCKDIM>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
183 |     			gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->aln_score, actual_n_alns);
184 |     }
185 |     else {
186 |     	fprintf(stderr, "[GASAL ERROR:] Algo type invalid\n");
187 |     	exit(EXIT_FAILURE);
188 |     }
189 |     //-----------------------------------------------------------------------------------------------------------------------
190 |     cudaError_t aln_kernel_err = cudaGetLastError();
191 |     if ( cudaSuccess != aln_kernel_err )
192 |     {
193 |     	fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err,  __LINE__, __FILE__);
194 |     	exit(EXIT_FAILURE);
195 |     }
196 | 
197 |     //------------------------copy alignment results from GPU to CPU--------------------------------------
198 |     if (host_aln_score != NULL && gpu_storage->aln_score != NULL) CHECKCUDAERROR(cudaMemcpy(host_aln_score, gpu_storage->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost));
199 |     else {
200 |     	fprintf(stderr, "[GASAL ERROR:] The *host_aln_score input can't be NULL\n");
201 |     	exit(EXIT_FAILURE);
202 |     }
203 |     if (host_query_batch_start != NULL && gpu_storage->query_batch_start != NULL) CHECKCUDAERROR(cudaMemcpy(host_query_batch_start, gpu_storage->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost));
204 |     if (host_target_batch_start != NULL && gpu_storage->target_batch_start != NULL) CHECKCUDAERROR(cudaMemcpy(host_target_batch_start, gpu_storage->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost));
205 |     if (host_query_batch_end != NULL && gpu_storage->query_batch_end != NULL) CHECKCUDAERROR(cudaMemcpy(host_query_batch_end, gpu_storage->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost));
206 |     if (host_target_batch_end != NULL && gpu_storage->target_batch_end != NULL) CHECKCUDAERROR(cudaMemcpy(host_target_batch_end, gpu_storage->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost));
207 |     //------------------------------------------------------------------------------------------------------
208 | 
209 | }
210 | 
211 | */
212 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GASAL2 - GPU-accelerated DNA alignment library
  2 | GASAL2 is an easy-to-use CUDA library for DNA/RNA sequence alignment algorithms. Currently it supports different kind of alignments:
  3 |  - local alignment
  4 |  - semi-global alignment
  5 |  - global alignment
  6 |  - tile-based banded alignment.
  7 |  
  8 | It can also reverse and, or complement any sequences independently before alignment, and report second-best scores for certain alignment types.
  9 | 
 10 | It is an extension of GASAL (https://github.com/nahmedraja/GASAL) and allows full overlapping of CPU and GPU execution. 
 11 | 
 12 | ## List of new features:
 13 | - **Added traceback computation. The ouput is in CIGAR format**
 14 | - **GASAL2 can now compute all types of semi-global alignments**
 15 | - **Added expandable memory management on host side. The batches of query and target sequences are automatically enlarged if the required memory becomes larger than the allocated memory** 
 16 | - **Added kernel to reverse-complement sequences.**
 17 | - **Cleaned up, inconsistencies fixed, and a small optimization has been added (around 9% speedup with exact same result)** 
 18 | 
 19 | 
 20 | ## Changes in user interface:
 21 | - Changed the interface of `gasal_init_streams()` function
 22 | - The user now has to provide `MAX_QUERY_LEN` instead of `MAX_SEQ_LEN` during compilation
 23 | 
 24 | ## Requirements
 25 | A Linux platform with CUDA toolkit 8 or higher is required, along with usual build environment for C and C++ code. GASAL2 has been tested over NVIDIA GPUs with compute capabilities of 2.0, 3.5 and 5.0. Although lower versions of the CUDA framework might work, they have not been tested.
 26 | 
 27 | ## Compiling GASAL2
 28 | The library can be compiled with the following two commands:
 29 | 
 30 | ```bash
 31 | $ ./configure.sh <path to cuda installation directory>
 32 | $ make GPU_SM_ARCH=<GPU SM architecture> MAX_QUERY_LEN=<maximum query length> N_CODE=<code for "N", e.g. 0x4E if the bases are represented by ASCII characters> [N_PENALTY=<penalty for aligning "N" against any other base>]
 33 | ```
 34 | 
 35 | `N_PENALTY` is optional and if it is not specified then GASAL2 considers "N" as an ordinary base having the same match/mismatch scores as for A, C, G or T. As a result of these commands, *include* and *lib* directories will be created containing various `.h` files and `libgasal.a`, respectively. The user needs to include `gasal_header.h` in the code and link it with `libgasal.a` during compilation. Also, the CUDA runtime library has to be linked by adding `-lcudart` flag. The path to the CUDA runtime library must also be specfied while linking as *-L <path to CUDA lib64 directory>*.
 36 | 
 37 | ## Using GASAL2
 38 | 
 39 | ### Initialization
 40 | To use GASAL2  alignment functions, first the match/mismatach scores and gap open/extension penalties need to be passed on to the GPU. Assign the values match/mismatach scores and gap open/extension penalties to the members of `gasal_subst_scores` struct:
 41 | 
 42 | ```C
 43 | typedef struct{
 44 | 	int32_t match;
 45 | 	int32_t mismatch;
 46 | 	int32_t gap_open;
 47 | 	int32_t gap_extend;
 48 | }gasal_subst_scores;
 49 | ```
 50 | 
 51 | The values are passed to the GPU by calling `gasal_copy_subst_scores()` function:
 52 | 
 53 | ```C
 54 | void gasal_copy_subst_scores(gasal_subst_scores *subst);
 55 | ```
 56 | 
 57 | A vector of `gasal_gpu_storage_t` is created a the following function:
 58 | 
 59 | ```C
 60 | gasal_gpu_storage_v gasal_init_gpu_storage_v(int n_streams);
 61 | ```
 62 | 
 63 | With the help of `n_streams`, the user specifies the number of outstanding GPU alignment kernel launches to be performed. The return type is `gasal_gpu_storage_v`:
 64 | 
 65 | ```C
 66 | typedef struct{
 67 | 	int n;
 68 | 	gasal_gpu_storage_t *a;
 69 | }gasal_gpu_storage_v;
 70 | ```
 71 | 
 72 | with `n = n_streams` and `a` being a pointer to the array. An element of the array holds the required data structurea of a stream. To destroy the vector the following function is used:
 73 | 
 74 | ```C
 75 | void gasal_destroy_gpu_storage_v(gasal_gpu_storage_v *gpu_storage_vec);
 76 | ```
 77 | 
 78 | The streams in the vector are initialized by calling:
 79 | 
 80 | ```C
 81 | void gasal_init_streams(gasal_gpu_storage_v *gpu_storage_vec,  int max_query_len, int max_target_len, int max_n_alns,  Parameters *params);
 82 | ```
 83 | 
 84 | In GASAL2, the sequences to be aligned are conatined in two batches. A sequence in query_batch is aligned to sequence in target_batch. A *batch* is a concatenation of sequences. *The length of a sequence must be a multiple of 8*. Hence, if a sequence is not a multiple of 8, `N's` are added at the end of sequence. We call these redundant bases as *Pad bases*. Note that the pad bases are always "N's" irrespective of whether `N_PENALTY` is defined or not. The `gasal_init_streams()` function alloctes the memory required by a stream. With the help of *max_batch_bytes*, the user specifies the expected maxumum size(in bytes) of sequences in the two batches. *host_max_batch_bytes* are pre-allocated on the CPU. Smilarly, *gpu_max_batch_bytes* are pre-allocated on the GPU. *max_n_alns* is the expected maximum number of sequences in a batch. If the actual required GPU memory is more than the pre-allocated memory, GASAL2 automatically allocates more memory. 
 85 | 
 86 | Most GASAL2 functions operate with a Parameters object. This object holds all the informations about the alignment options selected. In particular, the alignment type, the default values when opening or extending gaps, etc. The Parameters object is filled like this:
 87 | 
 88 | ```C
 89 | Parameters *args;
 90 | args = new Parameters(0, NULL);
 91 | 
 92 | args->algo = <LOCAL|GLOBAL|SEMI_GLOBAL>; 
 93 | args->start_pos = <WITHOUT_START|WITH_START|WITH_TB>; //`WITHOUT_START` computes only the score and end-position. `WITH_START` computes the start-position with score and end-position. `WITH_TB` computes the score, start-position, end-position and traceback in CIGAR format.
 94 | args->isReverseComplement = <TRUE|FALSE>; //whether to reverse-complement the query sequence.
 95 | args->semiglobal_skipping_head = <QUERY|TARGET|BOTH|NONE>; //ignore gaps at the begining of QUERY|TARGET|BOTH|NONE in semi alignment-global.
 96 | args->semiglobal_skipping_tail = <QUERY|TARGET|BOTH|NONE>; //ignore gaps at the end of QUERY|TARGET|BOTH|NONE in semi alignment-global.
 97 | args->secondBest = <TRUE|FALSE>; //whether to compute the second best score in local and semi-global algo. But the start-position(WITH_START) and traceback(WITH_TRACEBACK) is only computMarched with the best score.
 98 | 
 99 | ```
100 | 
101 | 
102 | To free up the allocated memory the following function is used:
103 | 
104 | ```C
105 | void gasal_destroy_streams(gasal_gpu_storage_v *gpu_storage_vec, Parameters *params);
106 | ```
107 | 
108 | The `gasal_init_streams()` and `gasal_destroy_streams()` internally use `cudaMalloc()`, `cudaMallocHost()`, `cudaFree()` and `cudaFreeHost()` functions. These CUDA API functions are time expensive. Therefore, `gasal_init_streams()` and `gasal_destroy_streams()` should be preferably called only once in the program. You will find all these functions in the file `ctors.cpp`.
109 | 
110 | 
111 | ### Input data preparation
112 | The `gasal_gpu_storage_t` in `gasal.h` holds the data structures for a stream. In the following we only show those members of `gasal_gpu_storage_t` which should be accessed by the user. Other fields should not be modified manually and the user should rely on dedicated functions for complex operations.
113 | 
114 | ```C
115 | typedef struct{
116 | 	...
117 | 	uint8_t *host_query_op;
118 | 	uint8_t *host_target_op;
119 | 	...
120 | 	uint32_t *host_query_batch_offsets;
121 | 	uint32_t *host_target_batch_offsets;
122 | 	uint32_t *host_query_batch_lens;
123 | 	uint32_t *host_target_batch_lens;
124 | 	uint32_t host_max_query_batch_bytes;
125 | 	uint32_t host_max_target_batch_bytes;
126 | 	gasal_res_t *host_res;
127 | 	gasal_res_t *host_res_second; 
128 | 	uint32_t host_max_n_alns;
129 | 	uint32_t current_n_alns;
130 | 	int is_free;
131 | 	...
132 | } gasal_gpu_storage_t;
133 | ```
134 | 
135 | 
136 | 
137 | To align the sequences the user first need to check the availability of a stream. If `is_free` is  1, the user can use the current stream to perform the alignment on the GPU. 
138 | To do this, the user must fill the sequences with the following function.
139 | 
140 | ```C
141 | uint32_t gasal_host_batch_fill(gasal_gpu_storage_t *gpu_storage, uint32_t idx, const char* data, uint32_t size, data_source SRC);
142 | 
143 | ```
144 | 
145 | This function takes a sequence and its length, and append it in the data structure. It also adds the neccessary padding bases to ensure the sequence has a length which is a multiple of 8. Moreover, it takes care of allocating more memory if there is not enough room when adding the sequence. `SRC` is either `QUERY` or `TARGET`, depending upon which batch to fill. When executed, this function returns the offset to be filled by the user in `host_target_batch_offsets` or `host_query_batch_offsets`. The user also has to fill `host_target_batch_lens` or `host_query_batch_lens` with original length of sequences, i.e. length without pad bases. **The offset values include pad bases, whereas lengths are without pad bases**. The number of elements in offset and length arrays must be equal. The offset values allows the user to express the mode of pairwise alignment, i.e. one-to-one, one-to-all or one-to-many etc., between the query and traget sequences. The `current_n_alns` must appropriately be incremented to show the current number of alignments. `host_max_n_alns` is initially set equal to `max_n_alns` in `gasal_init_streams()` function. If the 'current_n_alns' exceeds `host_max_n_alns`, the user must call the following funnction to reallocate host offset, lengths and results arrays.March
146 | 
147 | ```C
148 | void gasal_host_alns_resize(gasal_gpu_storage_t *gpu_storage, int new_max_alns, Parameters *params); 
149 | 
150 | ```
151 | 
152 | where `new_max_alns` is the new value of `host_max_n_alns`.
153 | 
154 | 
155 | One can also use the `gasal_host_batch_addbase` to add a single base to the sequence. This takes care of memory reallocation if needed, but does not take care of padding, so this has to be used carefully.
156 | 
157 | 
158 | The the list of pre-processing operation (nothing, reverse, complement, reverse-complement) that has to be done on the batch of sequence can be loaded into the gpu_storage with the function `gasal_op_fill`. Its code is in `interfaces.cpp`. It fills `host_query_op` and `host_query_op` with an array of size `host_max_n_alns` where each value is the value of the enumeration of `operation_on_seq` (in gasal.h):
159 | ```C
160 | enum operation_on_seq{
161 | 	FORWARD_NATURAL,
162 | 	REVERSE_NATURAL,
163 | 	FORWARD_COMPLEMENT,
164 | 	REVERSE_COMPLEMENT,
165 | };
166 | ```
167 | By default, no operations are done on the sequences (that is, the fields `host_query_op` and `host_target_op` arrays are initialized to 0, which is the value of FORWARD_NATURAL).
168 | March
169 | 
170 | ### Alignment launching
171 | To launch the alignment, the following function is used:
172 | 
173 | ```C
174 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params)
175 | ```
176 | 
177 | The `actual_query_batch_bytes` and `actual_target_batch_bytes` specify the size of the two batches (in bytes) including the pad bases. `actual_n_alns` is the number of alignments to be performed. GASAL2 internally sets `is_free` to 0 after launching the alignment kernel on the GPU. From the performance prespective, if the average lengths of the sequences in *query_batch* and *target_batch* are not same, then the shorter sequences should be placed in *query_batch*. Fo rexample, in case of read mappers the read sequences are conatined in query_batch and the genome sequences in target_batch.
178 | 
179 | The `gasal_aln_async()` function returns immediately after launching the alignment kernel on the GPU. The user can perform other tasks instead of waiting for the kernel to finish.To test whether the alignment on GPU is finished, the following function is called:
180 | 
181 | ```
182 | int gasal_is_aln_async_done(gasal_gpu_storage *gpu_storage);
183 | ```
184 | 
185 | If the function returns 0 the alignment on the GPU is finished and the output arrays contain valid results. Moreover, `is_free` is set to 1 by GASAL2. Thus, the current stream can be used for the alignment of another batch of sequences. The function returns `-1` if the results are not ready. It returns `-2` if the function is called on a stream in which no alignment has been launced, i.e. `is_free == 1`.
186 | 
187 | 
188 | ### Alignment results
189 | The structure `gasal_res_t` holds the results of the alignment and can be accessed manually. Its fields are the following:
190 | 
191 | ```C
192 | struct gasal_res{
193 | 	int32_t *aln_score;
194 | 	int32_t *query_batch_end;
195 | 	int32_t *target_batch_end;
196 | 	int32_t *query_batch_start;
197 | 	int32_t *target_batch_start;
198 | 	uint8_t *cigar;
199 | 	uint32_t *n_cigar_ops;
200 | };
201 | typedef struct gasal_res gasal_res_t;
202 | ```
203 | The output of alignments are stored in `aln_score`, `query_batch_end`, `target_batch_end`, `query_batch_start`, and `target_batch_start`, `cigar` and `n_cigar_ops` arrays, within the `host_res` structure inside the `gasal_gpu_storage` structure. `cigar` is a byte array which contains the traceback information in CIGAR format of all the alignments performed . The lower 2 bits of a byte indicate the CIGAR operation:
204 | 
205 | ```
206 | 0 = match
207 | 1 = mismatch
208 | 2 = deletion
209 | 3 = insertion
210 | ```
211 | The upper 6 bits store the count of the operation in the lower two bits. The traceback information of an alignment in the `cigar` array is in the reverse direction. `host_query_batch_offsets` conatins the offset of an alignment in the `cigar` array. The `n_cigar_ops` contains number of bytes in the cigar array encoding the traceback information of an alignment.
212 | 
213 | In case of second-best result, the same applies with the fields in `host_res_secondbest`. But the start-position and traceback( is only computed with the best score. Therefore, only `host_res_secondbest->aln_score`, `host_res_secondbest->query_batch_end` and `host_res_secondbest->target_batch_end` are valid for second-best result. 
214 | 
215 | 
216 | 
217 | 
218 | ## Example
219 | The `test_prog` directory conatins an example program which uses GASAL2 for sequence alignment on GPU. See the README in the directory for the instructions about running the program.
220 | 
221 | ## Citing GASAL2
222 | GASAL2 is published in BMC Bioinformatics:
223 | 
224 | N. Ahmed, J. Lévy, S. Ren, H. Mushtaq, K. Bertels and Z. Al-ars, __GASAL2: a GPU accelerated sequence alignment library for high-throughput NGS data__, *BMC Bioinformatics* 20, 520 (2019) doi: [10.1186/s12859-019-3086-9](https://doi.org/10.1186/s12859-019-3086-9).
225 | 
226 | ## Problems and suggestions
227 | For any issues and suugestions contact Nauman Ahmed at nahmed@uet.edu.pk.
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 


--------------------------------------------------------------------------------
/src/kernels/semiglobal_kernel_template.h:
--------------------------------------------------------------------------------
  1 | #ifndef __KERNEL_SEMIGLOBAL__
  2 | #define __KERNEL_SEMIGLOBAL__
  3 | 
  4 | 
  5 | #define CORE_COMPUTE_SEMIGLOBAL_DEPRECATED() \
  6 | 	uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence*/\
  7 | 	DEV_GET_SUB_SCORE_GLOBAL(subScore, rbase, gbase);/*check the equality of rbase and gbase*/\
  8 | 	/*int32_t curr_hm_diff = h[m] - _cudaGapOE;*/\
  9 | 	f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/*whether to introduce or extend a gap in query_batch sequence*/\
 10 | 	h[m] = p[m] + subScore;/*score if gbase is aligned to rbase*/\
 11 | 	h[m] = max(h[m], f[m]);\
 12 | 	e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence*/\
 13 | 	/*prev_hm_diff=curr_hm_diff;*/\
 14 | 	h[m] = max(h[m], e);\
 15 | 	p[m] = h[m-1];
 16 | 
 17 | #define CORE_COMPUTE_SEMIGLOBAL() \
 18 |     uint32_t gbase = (gpac >> l) & 15; /* get a base from target_batch sequence */ \
 19 |     DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */\
 20 |     register int32_t curr_hm_diff = h[m] - _cudaGapOE;\
 21 |     f[m] = max(curr_hm_diff, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */\
 22 |     curr_hm_diff = p[m] + subScore;/* score if rbase is aligned to gbase */\
 23 |     curr_hm_diff = max(curr_hm_diff, f[m]);\
 24 |     e = max(prev_hm_diff, e - _cudaGapExtend);/* whether to introduce or extend a gap in target_batch sequence */\
 25 |     curr_hm_diff = max(curr_hm_diff, e);\
 26 |     h[m] = curr_hm_diff;\
 27 |     p[m] = prev_hm_diff + _cudaGapOE;\
 28 |     prev_hm_diff=curr_hm_diff - _cudaGapOE;
 29 | 
 30 | 
 31 | 
 32 | /* typename meanings:
 33 | 	T : algorithm type. Unused at the moment for semi_global as only semi_global type is run in this kernel. Can be used to create several types of computing cores, for example.
 34 | 	S : WITH_ or WITHOUT_ Start.
 35 | 	HEAD : set to QUERY, TARGET, BOTH or NONE. Tells which HEAD (prefix) is allowed to be ignored.
 36 | 	TAIL : set to QUERY, TARGET, BOTH or NONE. Tells which TAIL (suffix) is allowed to be ignored.
 37 | */
 38 | 
 39 | template <typename T, typename S, typename B, typename HEAD, typename TAIL>
 40 | __global__ void gasal_semi_global_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch, uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks)
 41 | {
 42 | 
 43 | 	const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 44 | 	if (tid >= n_tasks) return;
 45 | 
 46 | 	int32_t i, j, k, l, m;
 47 | 	int32_t e;
 48 | 	
 49 | 	int32_t maxHH =  MINUS_INF;//initialize the maximum score to -infinity
 50 | 	int32_t subScore;
 51 | 	int32_t ridx, gidx;
 52 | 	short2 HD;
 53 | 	short2 initHD = make_short2(0, 0);
 54 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3;//starting index of the target_batch sequence
 55 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
 56 | 	uint32_t read_len = query_batch_lens[tid];
 57 | 	uint32_t ref_len = target_batch_lens[tid];
 58 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding sequence of query_batch
 59 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding sequence of target_batch
 60 | 
 61 | 	int32_t maxXY_y __attribute__((unused)) ;
 62 | 	int32_t maxXY_x __attribute__((unused)) ;
 63 | 	maxXY_x = ref_len;
 64 | 	maxXY_y = read_len;
 65 | 
 66 | 	
 67 |     int32_t maxHH_second __attribute__((unused)); // __attribute__((unused)) to avoid raising errors at compilation. most template-kernels don't use these.
 68 |     //int32_t prev_maxHH_second __attribute__((unused)); 
 69 |     int32_t maxXY_x_second __attribute__((unused));
 70 |     int32_t maxXY_y_second __attribute__((unused));
 71 |     maxHH_second = MINUS_INF;
 72 |     //prev_maxHH_second = 0;
 73 |     maxXY_x_second = ref_len;
 74 |     maxXY_y_second = read_len;
 75 | 
 76 | 	//-------arrays to save intermediate values----------------
 77 | 	short2 global[MAX_QUERY_LEN];
 78 | 	int32_t h[9];
 79 | 	int32_t f[9];
 80 | 	int32_t p[9];
 81 | 	//-------------------------------------------------------
 82 | 	int32_t u __attribute__((unused)) ;	// this variable may not be used in some cases, depending on which kernel is generated.
 83 | 	int32_t r __attribute__((unused)) ;
 84 | 
 85 | 
 86 | 
 87 | 	if (SAMETYPE(HEAD, Int2Type<QUERY>) || SAMETYPE(HEAD, Int2Type<BOTH>))
 88 | 	{
 89 | 		for (i = 0; i < MAX_QUERY_LEN; i++)
 90 | 		{
 91 | 			global[i] = initHD;
 92 | 		}
 93 | 	} else {
 94 | 		global[0] = make_short2(0, MINUS_INF);
 95 | 		for (i = 1; i < MAX_QUERY_LEN; i++)
 96 | 		{
 97 | 			global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF);
 98 | 		}
 99 | 	}
100 | 
101 | 	if (SAMETYPE(HEAD, Int2Type<QUERY>) || SAMETYPE(HEAD, Int2Type<NONE>))
102 | 	{
103 | 		u = 0;
104 | 		r = 0;
105 | 		h[u++] = 0;
106 | 		p[r++] = 0;
107 | 	}
108 | 
109 | 	for (i = 0; i < target_batch_regs; i++) 
110 | 	{ //target_batch sequence in rows
111 | 		gidx = i << 3;
112 | 		ridx = 0;
113 | 
114 | 		if (SAMETYPE(HEAD, Int2Type<TARGET>) || SAMETYPE(HEAD, Int2Type<BOTH>))
115 | 		{
116 | 			for (m = 0; m < 9; m++) 
117 | 			{
118 | 				h[m] = 0;
119 | 				f[m] = MINUS_INF;
120 | 				p[m] = 0;
121 | 			}
122 | 		} else {
123 | 			for (m = 1; m < 9; m++, u++, r++)
124 | 			{
125 | 				h[m] = -(_cudaGapO + (_cudaGapExtend*(u-1))); 
126 | 				f[m] = MINUS_INF; 
127 | 				p[m] = r == 1 ? 0 : -(_cudaGapO + (_cudaGapExtend*(r-1)));
128 | 			}
129 | 		}
130 | 
131 | 
132 | 		register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence
133 | 
134 | 		for (j = 0; j < query_batch_regs; /*++j*/ j+=1) //query_batch sequence in columns
135 | 		{ 
136 | 			register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 packed bases from query_batch sequence
137 | 
138 | 			//--------------compute a tile of 8x8 cells-------------------
139 | 			for (k = 28; k >= 0; k -= 4) 
140 | 			{
141 | 				uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
142 | 				//------------load intermediate values----------------------
143 | 				HD = global[ridx];
144 | 				h[0] = HD.x;
145 | 				e = HD.y;
146 | 				//----------------------------------------------------------
147 | 				int32_t prev_hm_diff = h[0] - _cudaGapOE;
148 | 				#pragma unroll 8
149 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) 
150 | 				{
151 | 					CORE_COMPUTE_SEMIGLOBAL();
152 | 				}
153 | 				//--------------save intermediate values-------------------------
154 | 				HD.x = h[m-1];
155 | 				HD.y = e;
156 | 				global[ridx] = HD;
157 | 				ridx++;
158 | 
159 | 				//------the last line of DP matrix------------
160 | 				if (SAMETYPE(TAIL, Int2Type<TARGET>) || SAMETYPE(TAIL, Int2Type<BOTH>)) 
161 | 				{ 
162 | 					if (ridx == read_len) 
163 | 					{
164 | 						//----find the maximum and the corresponding end position-----------
165 | 						for (m = 1; m < 9; m++)
166 | 						{
167 | 							maxXY_y = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? gidx + (m-1) : maxXY_y;
168 | 							maxHH = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? h[m] : maxHH;
169 | 
170 | 							if (SAMETYPE(B, Int2Type<TRUE>))
171 | 							{
172 | 								bool override_second = (h[m] > maxHH_second && h[m] < maxHH && (gidx + m - 1) < ref_len);
173 | 								maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second; 
174 | 								maxHH_second = (override_second) ? h[m] : maxHH_second;
175 | 							}
176 | 						}
177 | 					} // endif(ridx == read_len)
178 | 				}
179 | 
180 | 			} // endfor() computing tile
181 | 		} // endfor() on query words
182 | 	} // endfor() on targt words
183 | 
184 | 
185 | 	if (SAMETYPE(TAIL, Int2Type<QUERY>) || SAMETYPE(TAIL, Int2Type<BOTH>)) 
186 | 	{
187 | 		for (m = 0; m < MAX_QUERY_LEN; m++)
188 | 		{
189 | 			int32_t score_tmp = global[m].x;
190 | 			if (score_tmp > maxHH && m < read_len)
191 | 			{
192 | 				maxXY_x = m;
193 | 				maxHH = score_tmp;
194 | 			}
195 | 			if (SAMETYPE(B, Int2Type<TRUE>))
196 | 			{
197 | 				bool override_second = (score_tmp > maxHH_second && score_tmp < maxHH && m < ref_len);
198 | 				maxXY_x_second = (override_second) ? m : maxXY_x_second; 
199 | 				maxHH_second = (override_second) ? score_tmp : maxHH_second;
200 | 			}
201 | 
202 | 		}
203 | 		/* if the X position has been updated and is not on the bottom line, then the max score is actually on the rightmost column.
204 | 		 * Then, update the Y position to be on the rightmost column.
205 | 		 */
206 | 		if (maxXY_x != ref_len)
207 | 			maxXY_y = read_len;
208 | 
209 | 		if (SAMETYPE(B, Int2Type<TRUE>))
210 | 		{
211 | 			if (maxXY_x_second != ref_len)
212 | 				maxXY_y_second = read_len;
213 | 		}
214 | 	}
215 | 
216 | 	device_res->aln_score[tid] = maxHH;//copy the max score to the output array in the GPU mem
217 | 	device_res->target_batch_end[tid] =  maxXY_y;//copy the end position on the target_batch sequence to the output array in the GPU mem
218 | 	device_res->query_batch_end[tid] =  maxXY_x;//copy the end position on the target_batch sequence to the output array in the GPU mem
219 | 
220 | 	if (SAMETYPE(B, Int2Type<TRUE>))
221 | 	{
222 | 		device_res_second->aln_score[tid] = maxHH_second;
223 | 		device_res_second->target_batch_end[tid] =  maxXY_y_second;
224 | 		device_res_second->query_batch_end[tid] =  maxXY_x_second;
225 | 	}
226 | 
227 | 	if (SAMETYPE(S, Int2Type<WITH_START>))
228 | 	{
229 | 
230 | 		/*------------------Now to find the start position-----------------------*/
231 | 
232 | 		uint32_t reverse_query_batch[(MAX_QUERY_LEN>>3)];//array to hold the reverse query_batch sequence
233 | 		uint32_t reverse_target_batch[(MAX_QUERY_LEN>>3)];//array to hold the reverse query_batch sequence
234 | 		uint32_t reverse_query_batch_reg;
235 | 		uint32_t reverse_target_batch_reg;
236 | 
237 | 		for (i = 0; i < (MAX_QUERY_LEN>>3); i++) {
238 | 			reverse_query_batch[i] = 0;
239 | 		}
240 | 		for (i = 0; i < (MAX_QUERY_LEN>>3); i++) {
241 | 			reverse_target_batch[i] = 0;
242 | 		}
243 | 
244 | 		//--------reverse query_batch sequence--------------------
245 | 		for (i = read_len - 1, k = 0; i >= 0; i--, k++) {
246 | 			uint32_t orig_query_batch_reg = i >> 3;
247 | 			uint32_t orig_symbol_pos = (((orig_query_batch_reg + 1) << 3) - i) - 1;
248 | 			reverse_query_batch_reg = k >> 3;
249 | 			uint32_t reverse_symbol_pos = (((reverse_query_batch_reg + 1) << 3) - k) - 1;
250 | 			uint32_t orig_symbol = 0;
251 | 			orig_symbol = (packed_query_batch[packed_query_batch_idx + orig_query_batch_reg] >> (orig_symbol_pos << 2)) & 15;
252 | 			reverse_query_batch[reverse_query_batch_reg] |= (orig_symbol << (reverse_symbol_pos << 2));
253 | 		}
254 | 		//---------------------------------------------------
255 | 
256 | 
257 | 		//--------reverse target_batch sequence--------------------
258 | 		for (i = ref_len - 1, k = 0; i >= 0; i--, k++) {
259 | 			uint32_t orig_target_batch_reg = i >> 3;
260 | 			uint32_t orig_symbol_pos = (((orig_target_batch_reg + 1) << 3) - i) - 1;
261 | 			reverse_target_batch_reg = k >> 3;
262 | 			uint32_t reverse_symbol_pos = (((reverse_target_batch_reg + 1) << 3) - k) - 1;
263 | 			uint32_t orig_symbol = 0;
264 | 			orig_symbol = (packed_target_batch[packed_target_batch_idx + orig_target_batch_reg] >> (orig_symbol_pos << 2)) & 15;
265 | 			reverse_target_batch[reverse_target_batch_reg] |= (orig_symbol << (reverse_symbol_pos << 2));
266 | 		}
267 | 		//---------------------------------------------------
268 | 
269 | 		int32_t gend_pos = maxXY_y;//end position on target_batch sequence
270 | 		int32_t fwd_score = maxHH;//the computed score
271 | 
272 | 		//the index of 32-bit word containing the end position on target_batch sequence
273 | 		int32_t gend_reg = (target_batch_regs - ((gend_pos >> 3) + 1)) > 0 ? (target_batch_regs - ((gend_pos >> 3) + 1)) - 1 : (target_batch_regs - ((gend_pos >> 3) + 1));
274 | 
275 | 		maxHH = MINUS_INF;
276 | 		maxXY_y = 0;
277 | 
278 | 		if (SAMETYPE(HEAD, Int2Type<QUERY>) || SAMETYPE(HEAD, Int2Type<BOTH>))
279 | 		{
280 | 			for (i = 0; i < MAX_QUERY_LEN; i++)
281 | 			{
282 | 				global[i] = initHD;
283 | 			}
284 | 		} else {
285 | 			global[0] = make_short2(0, MINUS_INF);
286 | 			for (i = 1; i < MAX_QUERY_LEN; i++)
287 | 			{
288 | 				global[i] = make_short2(-(_cudaGapO + (_cudaGapExtend*(i))), MINUS_INF);
289 | 			}
290 | 		}
291 | 
292 | 		if (SAMETYPE(HEAD, Int2Type<QUERY>) || SAMETYPE(HEAD, Int2Type<NONE>))
293 | 		{
294 | 			u = 0;
295 | 			r = 0;
296 | 			h[u++] = 0;
297 | 			p[r++] = 0;
298 | 		}
299 | 
300 | 		//------starting from the gend_reg, align the sequences in the reverse direction and exit if the max score >= fwd_score------
301 | 		for (i = gend_reg; i < target_batch_regs && maxHH < fwd_score; i++) { //target_batch sequence in rows
302 | 			gidx = i << 3;
303 | 			ridx = 0;
304 | 			if (SAMETYPE(HEAD, Int2Type<TARGET>) || SAMETYPE(HEAD, Int2Type<BOTH>))
305 | 			{
306 | 				for (m = 0; m < 9; m++) 
307 | 				{
308 | 					h[m] = 0;
309 | 					f[m] = MINUS_INF;
310 | 					p[m] = 0;
311 | 				}
312 | 			} else {
313 | 				for (m = 1; m < 9; m++, u++, r++)
314 | 				{
315 | 					h[m] = -(_cudaGapO + (_cudaGapExtend*(u-1))); 
316 | 					f[m] = MINUS_INF; 
317 | 					p[m] = r == 1 ? 0 : -(_cudaGapO + (_cudaGapExtend*(r-1)));
318 | 				}
319 | 			}
320 | 
321 | 			register uint32_t gpac =reverse_target_batch[i];//load 8 packed bases from target_batch sequence
322 | 
323 | 			for (j = 0; j < query_batch_regs && maxHH < fwd_score;j+=1) { //query_batch sequence in columns
324 | 				register uint32_t rpac =reverse_query_batch[j];//load 8 packed bases from target_batch sequence
325 | 				//--------------compute a tile of 8x8 cells-------------------
326 | 				for (k = 28; k >= 0; k -= 4) {
327 | 					uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
328 | 					//------------load intermediate values----------------------
329 | 					HD = global[ridx];
330 | 					h[0] = HD.x;
331 | 					e = HD.y;
332 | 					//--------------------------------------------------------
333 | 					int32_t prev_hm_diff = h[0] - _cudaGapOE;
334 | 					#pragma unroll 8
335 | 					for (l = 28, m = 1; m < 9; l -= 4, m++) {
336 | 						CORE_COMPUTE_SEMIGLOBAL();
337 | 					}
338 | 					//------------save intermediate values----------------------
339 | 					HD.x = h[m-1];
340 | 					HD.y = e;
341 | 					global[ridx] = HD;
342 | 					ridx++;
343 | 
344 | 					//------the last line of DP matrix------------
345 | 					if (SAMETYPE(TAIL, Int2Type<TARGET>) || SAMETYPE(TAIL, Int2Type<BOTH>)) 
346 | 					{ 
347 | 						if (ridx == read_len) 
348 | 						{
349 | 							//----find the maximum and the corresponding end position-----------
350 | 							for (m = 1; m < 9; m++) 
351 | 							{
352 | 								maxXY_y = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? gidx + (m-1) : maxXY_y;
353 | 								maxHH = (h[m] > maxHH && (gidx + m - 1) < ref_len) ? h[m] : maxHH;
354 | 							}
355 | 						} // endif(ridx == read_len)
356 | 					}
357 | 				} // endfor() computing tile
358 | 			} // endfor() on query words
359 | 		} // endfor() on target words
360 | 		
361 | 
362 | 		if (SAMETYPE(TAIL, Int2Type<QUERY>) || SAMETYPE(TAIL, Int2Type<BOTH>)) 
363 | 		{
364 | 			for (m = 0; m < MAX_QUERY_LEN; m++)
365 | 			{
366 | 				int32_t score_tmp = global[m].x;
367 | 				if (score_tmp > maxHH && m < read_len)
368 | 				{
369 | 					maxXY_x = m;
370 | 					maxHH = score_tmp;
371 | 				}
372 | 			}
373 | 			/* if the X position has been updated and is not on the bottom line, then the max score is actually on the rightmost column.
374 | 			* Then, update the Y position to be on the rightmost column.
375 | 			*/
376 | 			if (maxXY_x != ref_len)
377 | 				maxXY_y = read_len;
378 | 		}
379 | 
380 | 		device_res->target_batch_start[tid] = (ref_len - 1) - maxXY_y;//copy the start position on target_batch sequence to the output array in the GPU mem
381 | 		device_res->query_batch_start[tid] = (read_len - 1) - maxXY_x;//copy the start position on target_batch sequence to the output array in the GPU mem
382 | 
383 | 
384 | 	} // endif(SAMETYPE(START, Int2Type<WITH_START>()))
385 | 
386 | 	return;
387 | 
388 | }
389 | #endif
390 | 


--------------------------------------------------------------------------------
/test_prog/test_prog.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #include "../include/gasal_header.h"
  4 | 
  5 | 
  6 | #include <vector>
  7 | #include <unistd.h>
  8 | #include <math.h>
  9 | #include <omp.h>
 10 | #include "Timer.h"
 11 | 
 12 | #define NB_STREAMS 2
 13 | 
 14 | //#define STREAM_BATCH_SIZE (262144)
 15 | // this gives each stream HALF of the sequences.
 16 | //#define STREAM_BATCH_SIZE ceil((double)target_seqs.size() / (double)(2))
 17 | 
 18 | #define STREAM_BATCH_SIZE 5000//ceil((double)target_seqs.size() / (double)(2 * 2))
 19 | 
 20 | 
 21 | #define DEBUG
 22 | 
 23 | #define MAX(a,b) (a>b ? a : b)
 24 | 
 25 | //#define GPU_SELECT 0
 26 | 
 27 | 
 28 | int main(int argc, char **argv) {
 29 | 
 30 | 	//gasal_set_device(GPU_SELECT);
 31 | 
 32 | 	Parameters *args;
 33 | 	args = new Parameters(argc, argv);
 34 | 	args->parse();
 35 | 	args->print();
 36 | 
 37 | 	int print_out = args->print_out;
 38 | 	int n_threads = args->n_threads;
 39 | 
 40 | 	//--------------copy substitution scores to GPU--------------------
 41 | 	gasal_subst_scores sub_scores;
 42 | 
 43 | 	sub_scores.match = args->sa;
 44 | 	sub_scores.mismatch = args->sb;
 45 | 	sub_scores.gap_open = args->gapo;
 46 | 	sub_scores.gap_extend = args->gape;
 47 | 
 48 | 	gasal_copy_subst_scores(&sub_scores);
 49 | 
 50 | 	//-------------------------------------------------------------------
 51 | 
 52 | 
 53 | 	std::vector<std::string> query_seqs;
 54 | 	std::vector<std::string> target_seqs;
 55 | 	std::vector<std::string> query_headers;
 56 | 	std::vector<std::string> target_headers;
 57 | 	std::string query_batch_line, target_batch_line;
 58 | 
 59 | 	int total_seqs = 0;
 60 | 	uint32_t maximum_sequence_length = 0;
 61 | 	uint32_t target_seqs_len = 0;
 62 | 	uint32_t query_seqs_len = 0;
 63 | 	std::cerr << "Loading files...." << std::endl;
 64 | 
 65 | 	/*
 66 | 		Reads FASTA files and fill the corresponding buffers.
 67 | 		FASTA files contain sequences that are usually on separate lines.
 68 | 		The file reader detects a '>' then concatenates all the following lines into one sequence, until the next '>' or EOF.
 69 | 		See more about FASTA format : https://en.wikipedia.org/wiki/FASTA_format
 70 | 	*/
 71 | 	
 72 | 	int seq_begin=0;
 73 | 
 74 | 	std::vector<uint8_t> query_mod;
 75 | 	std::vector<uint8_t> target_mod;
 76 | 	std::vector<uint32_t> query_id;
 77 | 	std::vector<uint32_t> target_id;
 78 | 
 79 | 	char line_starts[5] = "></+";
 80 | 	/* The information of reverse-complementing is simulated by changing the first character of the sequence.
 81 | 	 * This is not explicitly FASTA-compliant, although regular FASTA files will simply be interpreted as Forward-Natural direction.
 82 | 	 * From the header of every sequence:
 83 | 	 * - '>' translates to 0b00 (0) = Forward, natural
 84 | 	 * - '<' translates to 0b01 (1) = Reverse, natural
 85 | 	 * - '/' translates to 0b10 (2) = Forward, complemented
 86 | 	 * - '+' translates to 0b11 (3) = Reverse, complemented
 87 | 	 * No protection is done, so any other number will only have its two first bytes counted as above.	 
 88 | 	 */
 89 | 
 90 | 	while (getline(args->query_batch_fasta, query_batch_line) && getline(args->target_batch_fasta, target_batch_line)) { 
 91 | 
 92 | 		//load sequences from the files
 93 | 		char *q = NULL;
 94 | 		char *t = NULL;
 95 | 		q = strchr(line_starts, (int) (query_batch_line[0]));
 96 | 		t = strchr(line_starts, (int) (target_batch_line[0]));
 97 | 
 98 | 		/*  
 99 | 			t and q are pointers to the first occurence of the first read character in the line_starts array.
100 | 			so if I compare the address of these pointers with the address of the pointer to line_start, then...
101 | 			I can get which character was found, so which modifier is required. 
102 | 		*/
103 | 
104 | 		if (q != NULL && t != NULL) {
105 | 			total_seqs++;
106 | 
107 | 			query_mod.push_back((uint8_t) (q-line_starts));
108 | 			query_id.push_back(total_seqs);
109 | 
110 | 			target_mod.push_back((uint8_t)(t-line_starts));
111 | 			target_id.push_back(total_seqs);
112 | 
113 | 			query_headers.push_back(query_batch_line.substr(1));
114 | 			target_headers.push_back(target_batch_line.substr(1));
115 | 
116 | 			if (seq_begin == 2) {
117 | 				// a sequence was already being read. Now it's done, so we should find its length.
118 | 				target_seqs_len += (target_seqs.back()).length();
119 | 				query_seqs_len += (query_seqs.back()).length();
120 | 				maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length);
121 | 				maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length);
122 | 			}
123 | 			seq_begin = 1;
124 | 			
125 | 		} else if (seq_begin == 1) {
126 | 			query_seqs.push_back(query_batch_line);
127 | 			target_seqs.push_back(target_batch_line);
128 | 			seq_begin=2;
129 | 		} else if (seq_begin == 2) {
130 | 			query_seqs.back() += query_batch_line;
131 | 			target_seqs.back() += target_batch_line;
132 | 		} else { // should never happen but always put an else, for safety...
133 | 			seq_begin = 0;
134 | 			std::cerr << "Batch1 and target_batch files should be fasta having same number of sequences" << std::endl;
135 | 			exit(EXIT_FAILURE);
136 | 		}
137 | 	}
138 | 
139 | 
140 | 
141 | 	// Check maximum sequence length one more time, to check the last read sequence:
142 | 	target_seqs_len += (target_seqs.back()).length();
143 | 	query_seqs_len += (query_seqs.back()).length();
144 | 	maximum_sequence_length = MAX((target_seqs.back()).length(), maximum_sequence_length);
145 | 	maximum_sequence_length = MAX((query_seqs.back()).length(), maximum_sequence_length);
146 | 	int maximum_sequence_length_query = MAX((query_seqs.back()).length(), 0);
147 | 
148 | 	#ifdef DEBUG
149 | 		std::cerr << "[TEST_PROG DEBUG]: ";
150 | 		std::cerr << "Size of read batches are: query=" << query_seqs_len << ", target=" << target_seqs_len << ". maximum_sequence_length=" << maximum_sequence_length << std::endl;
151 | 	#endif
152 | 
153 | 
154 | 	// transforming the _mod into a char* array (to be passed to GASAL, which deals with C types)
155 | 	uint8_t *target_seq_mod = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) );
156 | 	uint8_t *query_seq_mod  = (uint8_t*) malloc(total_seqs * sizeof(uint8_t) );
157 | 	uint32_t *target_seq_id = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) );
158 | 	uint32_t *query_seq_id  = (uint32_t*) malloc(total_seqs * sizeof(uint32_t) );
159 | 
160 | 	for (int i = 0; i < total_seqs; i++)
161 | 	{
162 | 		query_seq_mod[i] = query_mod.at(i);
163 | 		query_seq_id[i] = query_id.at(i);
164 | 	}
165 | 
166 | #ifdef DEBUG
167 | 	std::cerr << "[TEST_PROG DEBUG]: query, mod@id=";
168 | 	for (int i = 0; i < total_seqs; i++)
169 | 	{
170 | 		if ((query_seq_mod[i]) > 0)
171 | 			std::cerr << +(query_seq_mod[i]) << "@" << query_seq_id[i] << "| ";
172 | 	}
173 | 	
174 | 	std::cerr << std::endl;
175 | #endif
176 | 
177 | 	for (int i = 0; i < total_seqs; i++)
178 | 	{
179 | 		target_seq_mod[i] = target_mod.at(i);
180 | 		target_seq_id[i] = target_id.at(i);
181 | 	}
182 | 
183 | 	int *thread_seqs_idx = (int*)malloc(n_threads*sizeof(int));
184 | 	int *thread_n_seqs = (int*)malloc(n_threads*sizeof(int));
185 | 	int *thread_n_batchs = (int*)malloc(n_threads*sizeof(int));
186 | 	double *thread_misc_time = (double*)calloc(n_threads, sizeof(double));
187 | 
188 | 	int thread_batch_size = (int)ceil((double)total_seqs/n_threads);
189 | 	int n_seqs_alloc = 0;
190 | 	for (int i = 0; i < n_threads; i++){//distribute the sequences among the threads equally
191 | 		thread_seqs_idx[i] = n_seqs_alloc;
192 | 		if (n_seqs_alloc + thread_batch_size < total_seqs) thread_n_seqs[i] = thread_batch_size;
193 | 		else thread_n_seqs[i] = total_seqs - n_seqs_alloc;
194 | 		thread_n_batchs[i] = (int)ceil((double)thread_n_seqs[i]/(STREAM_BATCH_SIZE));
195 | 		n_seqs_alloc += thread_n_seqs[i];
196 | 	}
197 | 
198 | 	std::cerr << "Processing..." << std::endl;
199 | 
200 | 	Timer total_time;
201 | 	total_time.Start();
202 | 	omp_set_num_threads(n_threads);
203 | 	gasal_gpu_storage_v *gpu_storage_vecs =  (gasal_gpu_storage_v*)calloc(n_threads, sizeof(gasal_gpu_storage_v));
204 | 	for (int z = 0; z < n_threads; z++) {
205 | 		gpu_storage_vecs[z] = gasal_init_gpu_storage_v(NB_STREAMS);// creating NB_STREAMS streams per thread
206 | 
207 | 		/* 
208 | 			About memory sizes:
209 | 			The required memory is the total size of the batch + its padding, divided by the number of streams. 
210 | 			The worst case would be that every sequence has to be padded with 7 'N', since they must have a length multiple of 8.
211 | 			Even though the memory can be dynamically expanded both for Host and Device, it is advised to start with a memory large enough so that these expansions rarely occur (for better performance.)
212 | 			Modifying the factor '1' in front of each size lets you see how GASAL2 expands the memory when needed.
213 | 		*/
214 | 		/*
215 | 		// For exemple, this is exactly the memory needed to allocate to fit all sequences is a single GPU BATCH.
216 | 		gasal_init_streams(&(gpu_storage_vecs[z]), 
217 | 							1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 
218 | 							1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) , 
219 | 							1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS)) ,
220 | 							1 * ceil((double)(query_seqs_len +7*total_seqs) / (double)(NB_STREAMS))  , 
221 | 							ceil((double)target_seqs.size() / (double)(NB_STREAMS)), // maximum number of alignments is bigger on target than on query side.
222 | 							ceil((double)target_seqs.size() / (double)(NB_STREAMS)), 
223 | 							args);
224 | 		*/		
225 | 		//initializing the streams by allocating the required CPU and GPU memory
226 | 		// note: the calculations of the detailed sizes to allocate could be done on the library side (to hide it from the user's perspective)
227 | 		gasal_init_streams(&(gpu_storage_vecs[z]), (maximum_sequence_length_query + 7) , //TODO: remove maximum_sequence_length_query
228 | 						(maximum_sequence_length + 7) ,
229 | 						 STREAM_BATCH_SIZE, //device
230 | 						 args);
231 | 	}
232 | 	#ifdef DEBUG
233 | 		std::cerr << "[TEST_PROG DEBUG]: ";
234 | 		std::cerr << "size of host_unpack_query is " << (query_seqs_len +7*total_seqs) / (NB_STREAMS) << std::endl ;
235 | 	#endif
236 | 
237 | 	#pragma omp parallel
238 | 	{
239 | 	int n_seqs = thread_n_seqs[omp_get_thread_num()];//number of sequences allocated to this thread
240 | 	int curr_idx = thread_seqs_idx[omp_get_thread_num()];//number of sequences allocated to this thread
241 | 	int seqs_done = 0;
242 | 	int n_batchs_done = 0;
243 | 
244 | 	struct gpu_batch{ //a struct to hold data structures of a stream
245 | 			gasal_gpu_storage_t *gpu_storage; //the struct that holds the GASAL2 data structures
246 | 			int n_seqs_batch;//number of sequences in the batch (<= (target_seqs.size() / NB_STREAMS))
247 | 			int batch_start;//starting index of batch
248 | 	};
249 | 
250 | 	#ifdef DEBUG
251 | 		std::cerr << "[TEST_PROG DEBUG]: ";
252 | 		std::cerr << "Number of gpu_batch in gpu_batch_arr : " << gpu_storage_vecs[omp_get_thread_num()].n << std::endl;
253 | 		std::cerr << "[TEST_PROG DEBUG]: ";
254 | 		std::cerr << "Number of gpu_storage_vecs in a gpu_batch : " << omp_get_thread_num()+1 << std::endl;
255 | 	#endif
256 | 
257 | 	gpu_batch gpu_batch_arr[gpu_storage_vecs[omp_get_thread_num()].n];
258 | 
259 | 	for(int z = 0; z < gpu_storage_vecs[omp_get_thread_num()].n; z++) {
260 | 		gpu_batch_arr[z].gpu_storage = &(gpu_storage_vecs[omp_get_thread_num()].a[z]);
261 | 
262 | 	}
263 | 
264 | 	if (n_seqs > 0) {
265 | 		while (n_batchs_done < thread_n_batchs[omp_get_thread_num()]) { // Loop on streams
266 | 			int gpu_batch_arr_idx = 0;
267 | 			//------------checking the availability of a "free" stream"-----------------
268 | 			while(gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n && (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->is_free != 1) {
269 | 				gpu_batch_arr_idx++;
270 | 			}
271 | 
272 | 			if (seqs_done < n_seqs && gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {
273 | 				uint32_t query_batch_idx = 0;
274 | 				uint32_t target_batch_idx = 0;
275 | 				unsigned int j = 0;
276 | 				//-----------Create a batch of sequences to be aligned on the GPU. The batch contains (target_seqs.size() / NB_STREAMS) number of sequences-----------------------
277 | 
278 | 
279 | 				for (int i = curr_idx; seqs_done < n_seqs && j < (STREAM_BATCH_SIZE); i++, j++, seqs_done++)
280 | 				{
281 | 
282 | 					gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns++ ;
283 | 
284 | 					if(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns > gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns)
285 | 					{
286 | 						gasal_host_alns_resize(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->host_max_n_alns * 2, args);
287 | 					}
288 | 
289 | 					(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j] = query_batch_idx;
290 | 					(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_offsets[j] = target_batch_idx;
291 | 
292 | 					/*
293 | 						All the filling is moved on the library size, to take care of the memory size and expansions (when needed).
294 | 						The function gasal_host_batch_fill takes care of how to fill, how much to pad with 'N', and how to deal with memory. 
295 | 						It's the same function for query and target, and you only need to set the final flag to either ; this avoides code duplication.
296 | 						The way the host memory is filled changes the current _idx (it's increased by size, and by the padding). That's why it's returned by the function.
297 | 					*/
298 | 
299 | 					query_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 
300 | 									query_batch_idx, 
301 | 									query_seqs[i].c_str(), 
302 | 									query_seqs[i].size(),
303 | 									QUERY);
304 | 
305 | 					target_batch_idx = gasal_host_batch_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, 
306 | 									target_batch_idx, 
307 | 									target_seqs[i].c_str(), 
308 | 									target_seqs[i].size(),
309 | 									TARGET);
310 | 
311 | 					
312 | 					(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_lens[j] = query_seqs[i].size();
313 | 					(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_target_batch_lens[j] = target_seqs[i].size();
314 | 
315 | 				}
316 | 
317 | 				#ifdef DEBUG
318 | 					std::cerr << "[TEST_PROG DEBUG]: ";
319 | 					std::cerr << "Stream " << gpu_batch_arr_idx << ": j = " << j << ", seqs_done = " << seqs_done <<", query_batch_idx=" << query_batch_idx << " , target_batch_idx=" << target_batch_idx << std::endl;
320 | 				#endif
321 | 
322 | 				// Here, we fill the operations arrays for the current batch to be processed by the stream
323 | 				gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_seq_mod + seqs_done - j, j, QUERY);
324 | 				gasal_op_fill(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, target_seq_mod + seqs_done - j, j, TARGET);
325 | 
326 | 
327 | 				gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch = j;
328 | 				uint32_t query_batch_bytes = query_batch_idx;
329 | 				uint32_t target_batch_bytes = target_batch_idx;
330 | 				gpu_batch_arr[gpu_batch_arr_idx].batch_start = curr_idx;
331 | 				curr_idx += (STREAM_BATCH_SIZE);
332 | 
333 | 				//----------------------------------------------------------------------------------------------------
334 | 				//-----------------calling the GASAL2 non-blocking alignment function---------------------------------
335 | 
336 | 				gasal_aln_async(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage, query_batch_bytes, target_batch_bytes, gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch, args);
337 | 				gpu_batch_arr[gpu_batch_arr_idx].gpu_storage->current_n_alns = 0;
338 | 				//---------------------------------------------------------------------------------
339 | 			}
340 | 
341 | 
342 | 			//-------------------------------print alignment results----------------------------------------
343 | 		
344 | 			gpu_batch_arr_idx = 0;
345 | 			while (gpu_batch_arr_idx < gpu_storage_vecs[omp_get_thread_num()].n) {//loop through all the streams and print the results
346 | 																					//of the finished streams.
347 | 				if (gasal_is_aln_async_done(gpu_batch_arr[gpu_batch_arr_idx].gpu_storage) == 0) {
348 | 					int j = 0;
349 | 					if(print_out) {
350 | 					#pragma omp critical
351 | 					for (int i = gpu_batch_arr[gpu_batch_arr_idx].batch_start; j < gpu_batch_arr[gpu_batch_arr_idx].n_seqs_batch; i++, j++) {
352 | 						
353 | 						std::cout << "query_name=" << query_headers[i] ;
354 | 						std::cout << "\ttarget_name=" << target_headers[i] ;
355 | 						std::cout << "\tscore=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->aln_score[j] ;
356 | 						
357 | 						
358 | 						/// WARNING : INEQUALITY ON ENUM: CAN BREAK IF ENUM ORDER IS CHANGED
359 | 						if ((args->start_pos == WITH_START || args->start_pos == WITH_TB)
360 | 							&& ((args->algo == SEMI_GLOBAL && (args->semiglobal_skipping_head != NONE || args->semiglobal_skipping_head != NONE))
361 | 								|| args->algo > SEMI_GLOBAL))
362 | 						{
363 | 							std::cout << "\tquery_batch_start=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_start[j];
364 | 							std::cout << "\ttarget_batch_start=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_start[j];
365 | 						}
366 | 						
367 | 						if (args->algo != GLOBAL)
368 | 						{
369 | 							std::cout << "\tquery_batch_end="  << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->query_batch_end[j];
370 | 							std::cout << "\ttarget_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->target_batch_end[j] ;
371 | 						}
372 | 
373 | 	
374 | 
375 | 						if (args->secondBest)
376 | 						{
377 | 							std::cout << "\t2nd_score=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->aln_score[j] ;
378 | 							std::cout << "\t2nd_query_batch_end="  << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->query_batch_end[j];
379 | 							std::cout << "\t2nd_target_batch_end=" << (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res_second->target_batch_end[j] ;
380 | 						}
381 | 
382 | 						if (args->start_pos == WITH_TB) {
383 | 							std::cout << "\tCIGAR=";
384 | 							int u;
385 | 							int offset = (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_query_batch_offsets[j];
386 | 							int n_cigar_ops = (gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->n_cigar_ops[j];
387 | 							int last_op = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + n_cigar_ops - 1]) & 3;
388 | 							int count = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + n_cigar_ops - 1]) >> 2;
389 | 							for (u = n_cigar_ops - 2; u >= 0 ; u--){
390 | 								int curr_op = ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) & 3;
391 | 								if (curr_op == last_op) {
392 | 									count +=  ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) >> 2;
393 | 								} else {
394 | 									char op;
395 | 									switch (last_op) {
396 | 									case 0: op = 'M';
397 | 									break;
398 | 									case 1: op = 'X';
399 | 									break;
400 | 									case 2: op = 'D';
401 | 									break;
402 | 									case 3: op = 'I';
403 | 									break;
404 | 									default: op = 'E';
405 | 									break;
406 | 
407 | 									}
408 | 									std::cout << count << op;
409 | 									count =  ((gpu_batch_arr[gpu_batch_arr_idx].gpu_storage)->host_res->cigar[offset + u]) >> 2;
410 | 
411 | 								}
412 | 								last_op = curr_op;
413 | 
414 | 							}
415 | 							char op;
416 | 							switch (last_op) {
417 | 							case 0: op = 'M';
418 | 							break;
419 | 							case 1: op = 'X';
420 | 							break;
421 | 							case 2: op = 'D';
422 | 							break;
423 | 							case 3: op = 'I';
424 | 							break;
425 | 
426 | 							}
427 | 							std::cout << count << op;
428 | 						}
429 | 						std::cout << std::endl;
430 | 					}
431 | 					}
432 | 					n_batchs_done++;
433 | 				}
434 | 				gpu_batch_arr_idx++;
435 | 			}
436 | 		}
437 | 	}
438 | 
439 | 
440 | 	}
441 | 	for (int z = 0; z < n_threads; z++) {
442 | 		gasal_destroy_streams(&(gpu_storage_vecs[z]), args);
443 | 		gasal_destroy_gpu_storage_v(&(gpu_storage_vecs[z]));
444 | 	}
445 | 	free(gpu_storage_vecs);
446 | 	total_time.Stop();
447 | 	/*
448 | 	string algorithm = al_type;
449 | 	string start_type[2] = {"without_start", "with_start"};
450 | 	al_type += "_";
451 | 	al_type += start_type[start_pos==WITH_START];
452 | 	*/
453 | 	double av_misc_time = 0.0;
454 | 	for (int i = 0; i < n_threads; ++i){
455 | 		av_misc_time += (thread_misc_time[i]/n_threads);
456 | 	}
457 | 	std::cerr << std::endl << "Done" << std::endl;
458 | 	fprintf(stderr, "Total execution time (in milliseconds): %.3f\n", total_time.GetTime());
459 | 	delete args; // closes the files
460 | 	//free(args); // closes the files
461 | }
462 | 


--------------------------------------------------------------------------------
/src/gasal_align.cu:
--------------------------------------------------------------------------------
  1 | #include "gasal.h"
  2 | #include "args_parser.h"
  3 | #include "res.h"
  4 | #include "gasal_align.h"
  5 | #include "gasal_kernels.h"
  6 | #include "host_batch.h"
  7 | 
  8 | 
  9 | 
 10 | inline void gasal_kernel_launcher(int32_t N_BLOCKS, int32_t BLOCKDIM, algo_type algo, comp_start start, gasal_gpu_storage_t *gpu_storage, int32_t actual_n_alns, int32_t k_band, data_source semiglobal_skipping_head, data_source semiglobal_skipping_tail, Bool secondBest)
 11 | {
 12 | 	switch(algo)
 13 | 	{
 14 | 		
 15 | 		KERNEL_SWITCH(LOCAL,		start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest);
 16 | 		KERNEL_SWITCH(SEMI_GLOBAL,  start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest);		// MACRO that expands all 32 semi-global kernels
 17 | 		KERNEL_SWITCH(GLOBAL,		start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest);
 18 | 		KERNEL_SWITCH(KSW,			start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest);
 19 | 		KERNEL_SWITCH(BANDED,		start, semiglobal_skipping_head, semiglobal_skipping_tail, secondBest);
 20 | 		default:
 21 | 		break;
 22 | 
 23 | 	}
 24 | 
 25 | }
 26 | 
 27 | 
 28 | //GASAL2 asynchronous (a.k.a non-blocking) alignment function
 29 | void gasal_aln_async(gasal_gpu_storage_t *gpu_storage, const uint32_t actual_query_batch_bytes, const uint32_t actual_target_batch_bytes, const uint32_t actual_n_alns, Parameters *params) {
 30 | 
 31 | 	cudaError_t err;
 32 | 	if (actual_n_alns <= 0) {
 33 | 		fprintf(stderr, "[GASAL ERROR:] actual_n_alns <= 0\n");
 34 | 		exit(EXIT_FAILURE);
 35 | 	}
 36 | 	if (actual_query_batch_bytes <= 0) {
 37 | 		fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes <= 0\n");
 38 | 		exit(EXIT_FAILURE);
 39 | 	}
 40 | 	if (actual_target_batch_bytes <= 0) {
 41 | 		fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes <= 0\n");
 42 | 		exit(EXIT_FAILURE);
 43 | 	}
 44 | 
 45 | 	if (actual_query_batch_bytes % 8) {
 46 | 		fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes=%d is not a multiple of 8\n", actual_query_batch_bytes);
 47 | 		exit(EXIT_FAILURE);
 48 | 	}
 49 | 	if (actual_target_batch_bytes % 8) {
 50 | 		fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes=%d is not a multiple of 8\n", actual_target_batch_bytes);
 51 | 		exit(EXIT_FAILURE);
 52 | 	}
 53 | 
 54 | 	if (actual_query_batch_bytes > gpu_storage->host_max_query_batch_bytes) {
 55 | 				fprintf(stderr, "[GASAL ERROR:] actual_query_batch_bytes(%d) > host_max_query_batch_bytes(%d)\n", actual_query_batch_bytes, gpu_storage->host_max_query_batch_bytes);
 56 | 				exit(EXIT_FAILURE);
 57 | 	}
 58 | 
 59 | 	if (actual_target_batch_bytes > gpu_storage->host_max_target_batch_bytes) {
 60 | 			fprintf(stderr, "[GASAL ERROR:] actual_target_batch_bytes(%d) > host_max_target_batch_bytes(%d)\n", actual_target_batch_bytes, gpu_storage->host_max_target_batch_bytes);
 61 | 			exit(EXIT_FAILURE);
 62 | 	}
 63 | 
 64 | 	if (actual_n_alns > gpu_storage->host_max_n_alns) {
 65 | 			fprintf(stderr, "[GASAL ERROR:] actual_n_alns(%d) > host_max_n_alns(%d)\n", actual_n_alns, gpu_storage->host_max_n_alns);
 66 | 			exit(EXIT_FAILURE);
 67 | 	}
 68 | 
 69 | 	//--------------if pre-allocated memory is less, allocate more--------------------------
 70 | 	if (gpu_storage->gpu_max_query_batch_bytes < actual_query_batch_bytes) {
 71 | 
 72 | 		int i = 2;
 73 | 		while ( (gpu_storage->gpu_max_query_batch_bytes * i) < actual_query_batch_bytes) i++;
 74 | 
 75 | 		fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated GPU memory (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i);
 76 | 
 77 | 		gpu_storage->gpu_max_query_batch_bytes = gpu_storage->gpu_max_query_batch_bytes * i;
 78 | 
 79 | 		if (gpu_storage->unpacked_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_query_batch));
 80 | 		if (gpu_storage->packed_query_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_query_batch));
 81 | 
 82 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_query_batch), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t)));
 83 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_query_batch), (gpu_storage->gpu_max_query_batch_bytes/8) * sizeof(uint32_t)));
 84 | 
 85 | 		if (params->start_pos==WITH_TB){
 86 | 			fprintf(stderr, "[GASAL WARNING:] actual_query_batch_bytes(%d) > Allocated HOST memory for CIGAR (gpu_max_query_batch_bytes=%d). Therefore, allocating %d bytes on the host (gpu_max_query_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes, gpu_storage->gpu_max_query_batch_bytes*i, gpu_storage->gpu_max_query_batch_bytes*i);
 87 | 			if (gpu_storage->host_res->cigar != NULL)CHECKCUDAERROR(cudaFreeHost(gpu_storage->host_res->cigar));
 88 | 			CHECKCUDAERROR(cudaHostAlloc(&(gpu_storage->host_res->cigar), gpu_storage->gpu_max_query_batch_bytes * sizeof(uint8_t),cudaHostAllocDefault));
 89 | 		}
 90 | 
 91 | 	}
 92 | 
 93 | 	if (gpu_storage->gpu_max_target_batch_bytes < actual_target_batch_bytes) {
 94 | 
 95 | 		int i = 2;
 96 | 		while ( (gpu_storage->gpu_max_target_batch_bytes * i) < actual_target_batch_bytes) i++;
 97 | 		
 98 | 		fprintf(stderr, "[GASAL WARNING:] actual_target_batch_bytes(%d) > Allocated GPU memory (gpu_max_target_batch_bytes=%d). Therefore, allocating %d bytes on GPU (gpu_max_target_batch_bytes=%d). Performance may be lost if this is repeated many times.\n", actual_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes, gpu_storage->gpu_max_target_batch_bytes*i, gpu_storage->gpu_max_target_batch_bytes*i);
 99 | 
100 | 		gpu_storage->gpu_max_target_batch_bytes = gpu_storage->gpu_max_target_batch_bytes * i;
101 | 
102 | 		if (gpu_storage->unpacked_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->unpacked_target_batch));
103 | 		if (gpu_storage->packed_target_batch != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->packed_target_batch));
104 | 
105 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->unpacked_target_batch), gpu_storage->gpu_max_target_batch_bytes * sizeof(uint8_t)));
106 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->packed_target_batch), (gpu_storage->gpu_max_target_batch_bytes/8) * sizeof(uint32_t)));
107 | 
108 | 
109 | 	}
110 | 
111 | 	if (gpu_storage->gpu_max_n_alns < actual_n_alns) {
112 | 
113 | 		int i = 2;
114 | 		while ( (gpu_storage->gpu_max_n_alns * i) < actual_n_alns) i++;
115 | 		
116 | 		fprintf(stderr, "[GASAL WARNING:] actual_n_alns(%d) > gpu_max_n_alns(%d). Therefore, allocating memory for %d alignments on  GPU (gpu_max_n_alns=%d). Performance may be lost if this is repeated many times.\n", actual_n_alns, gpu_storage->gpu_max_n_alns, gpu_storage->gpu_max_n_alns*i, gpu_storage->gpu_max_n_alns*i);
117 | 
118 | 		gpu_storage->gpu_max_n_alns = gpu_storage->gpu_max_n_alns * i;
119 | 
120 | 		if (gpu_storage->query_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_offsets));
121 | 		if (gpu_storage->target_batch_offsets != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_offsets));
122 | 		if (gpu_storage->query_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->query_batch_lens));
123 | 		if (gpu_storage->target_batch_lens != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->target_batch_lens));
124 | 
125 | 		if (gpu_storage->seed_scores != NULL) CHECKCUDAERROR(cudaFree(gpu_storage->seed_scores));
126 | 
127 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
128 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_lens), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
129 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->query_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
130 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->target_batch_offsets), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
131 | 
132 | 		CHECKCUDAERROR(cudaMalloc(&(gpu_storage->seed_scores), gpu_storage->gpu_max_n_alns * sizeof(uint32_t)));
133 | 
134 | 		gasal_res_destroy_device(gpu_storage->device_res, gpu_storage->device_cpy);
135 | 		gpu_storage->device_cpy = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params);
136 | 		gpu_storage->device_res = gasal_res_new_device(gpu_storage->device_cpy);
137 | 
138 | 		if (params->secondBest)
139 | 		{
140 | 			gasal_res_destroy_device(gpu_storage->device_res_second, gpu_storage->device_cpy_second);
141 | 			gpu_storage->device_cpy_second = gasal_res_new_device_cpy(gpu_storage->gpu_max_n_alns, params);
142 | 			gpu_storage->device_res_second = gasal_res_new_device(gpu_storage->device_cpy_second);
143 | 		}
144 | 
145 | 	}
146 | 	//------------------------------------------
147 | 
148 | 	//------------------------launch copying of sequence batches from CPU to GPU---------------------------
149 | 
150 | 	// here you can track the evolution of your data structure processing with the printer: gasal_host_batch_printall(current);
151 | 
152 | 	host_batch_t *current = gpu_storage->extensible_host_unpacked_query_batch;
153 | 	while (current != NULL)
154 | 	{
155 | 		//gasal_host_batch_printall(current);
156 | 		CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_query_batch[current->offset]), 
157 | 										current->data, 
158 | 										current->data_size,
159 | 										cudaMemcpyHostToDevice, 
160 | 										gpu_storage->str ) );
161 | 
162 | 		current = current->next;
163 | 	}
164 | 
165 | 	current = gpu_storage->extensible_host_unpacked_target_batch;
166 | 	while (current != NULL)
167 | 	{
168 | 		CHECKCUDAERROR(cudaMemcpyAsync( &(gpu_storage->unpacked_target_batch[current->offset]), 
169 | 										current->data, 
170 | 										current->data_size,
171 | 										cudaMemcpyHostToDevice, 
172 | 										gpu_storage->str ) );
173 | 
174 | 		current = current->next;
175 | 	}
176 | 
177 | 	//-----------------------------------------------------------------------------------------------------------
178 | 	// TODO: Adjust the block size depending on the kernel execution.
179 | 	
180 |     uint32_t BLOCKDIM = 128;
181 |     uint32_t N_BLOCKS = (actual_n_alns + BLOCKDIM - 1) / BLOCKDIM;
182 | 
183 |     int query_batch_tasks_per_thread = (int)ceil((double)actual_query_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
184 |     int target_batch_tasks_per_thread = (int)ceil((double)actual_target_batch_bytes/(8*BLOCKDIM*N_BLOCKS));
185 | 
186 | 
187 |     //-------------------------------------------launch packing kernel
188 | 
189 | 
190 | 	if (!(params->isPacked))
191 | 	{
192 | 		gasal_pack_kernel<<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>((uint32_t*)(gpu_storage->unpacked_query_batch),
193 | 		(uint32_t*)(gpu_storage->unpacked_target_batch), gpu_storage->packed_query_batch, gpu_storage->packed_target_batch,
194 | 		query_batch_tasks_per_thread, target_batch_tasks_per_thread, actual_query_batch_bytes/4, actual_target_batch_bytes/4);
195 | 		cudaError_t pack_kernel_err = cudaGetLastError();
196 | 		if ( cudaSuccess != pack_kernel_err )
197 | 		{
198 | 		fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(pack_kernel_err), pack_kernel_err,  __LINE__, __FILE__);
199 | 		exit(EXIT_FAILURE);
200 | 		}
201 | 	}
202 |     
203 | 
204 | 	// We could reverse-complement before packing, but we would get 2x more read-writes to memory.
205 | 
206 |     //----------------------launch copying of sequence offsets and lengths from CPU to GPU--------------------------------------
207 |     CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_lens, gpu_storage->host_query_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
208 |     CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_lens, gpu_storage->host_target_batch_lens, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice,  gpu_storage->str));
209 |     CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_batch_offsets, gpu_storage->host_query_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice,  gpu_storage->str));
210 | 	CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_batch_offsets, gpu_storage->host_target_batch_offsets, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice,  gpu_storage->str));
211 | 	
212 | 	// if needed copy seed scores
213 | 	if (params->algo == KSW)
214 | 	{
215 | 		if (gpu_storage->seed_scores == NULL)
216 | 		{
217 | 			fprintf(stderr, "seed_scores == NULL\n");
218 | 			
219 | 		}
220 | 		if (gpu_storage->host_seed_scores == NULL)
221 | 		{
222 | 			fprintf(stderr, "host_seed_scores == NULL\n");
223 | 		}
224 | 		if (gpu_storage->seed_scores == NULL || gpu_storage->host_seed_scores == NULL)
225 | 			exit(EXIT_FAILURE);
226 | 
227 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->seed_scores, gpu_storage->host_seed_scores, actual_n_alns * sizeof(uint32_t), cudaMemcpyHostToDevice, gpu_storage->str));
228 | 	}
229 |     //--------------------------------------------------------------------------------------------------------------------------
230 | 
231 | 	//----------------------launch copying of sequence operations (reverse/complement) from CPU to GPU--------------------------
232 | 	if (params->isReverseComplement)
233 | 	{
234 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->query_op, gpu_storage->host_query_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice,  gpu_storage->str));
235 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->target_op, gpu_storage->host_target_op, actual_n_alns * sizeof(uint8_t), cudaMemcpyHostToDevice,  gpu_storage->str));	
236 | 		//--------------------------------------launch reverse-complement kernel------------------------------------------------------
237 | 		gasal_reversecomplement_kernel<<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->packed_query_batch, gpu_storage->packed_target_batch, gpu_storage->query_batch_lens,
238 | 			gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->target_batch_offsets, gpu_storage->query_op, gpu_storage->target_op, actual_n_alns);
239 | 		cudaError_t reversecomplement_kernel_err = cudaGetLastError();
240 | 		if ( cudaSuccess != reversecomplement_kernel_err )
241 | 		{
242 | 			 fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(reversecomplement_kernel_err), reversecomplement_kernel_err,  __LINE__, __FILE__);
243 | 			 exit(EXIT_FAILURE);
244 | 		}
245 | 	
246 | 	}
247 | 	
248 |     //--------------------------------------launch alignment kernels--------------------------------------------------------------
249 | 	gasal_kernel_launcher(N_BLOCKS, BLOCKDIM, params->algo, params->start_pos, gpu_storage, actual_n_alns, params->k_band, params->semiglobal_skipping_head, params->semiglobal_skipping_tail, params->secondBest);
250 | 
251 | 	//if (params->start_pos == WITH_TB) {
252 | 
253 | 		// The output of the kernel: gpu_storage->unpacked_query_batch = cigar, gpu_storage->query_batch_lens = n_cigar_ops
254 | 		//gasal_get_tb<Int2Type<params->algo>><<<N_BLOCKS, BLOCKDIM, 0, gpu_storage->str>>>(gpu_storage->unpacked_query_batch, gpu_storage->query_batch_lens, gpu_storage->target_batch_lens, gpu_storage->query_batch_offsets, gpu_storage->packed_tb_matrices, gpu_storage->device_res, gpu_storage->current_n_alns);
255 | 	//}
256 | 
257 |         //-----------------------------------------------------------------------------------------------------------------------
258 |     cudaError_t aln_kernel_err = cudaGetLastError();
259 |     if ( cudaSuccess != aln_kernel_err )
260 |     {
261 |     	fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(aln_kernel_err), aln_kernel_err,  __LINE__, __FILE__);
262 |     	exit(EXIT_FAILURE);
263 |     }
264 | 
265 |     //------------------------0launch the copying of alignment results from GPU to CPU--------------------------------------
266 |     if (gpu_storage->host_res->aln_score != NULL && gpu_storage->device_cpy->aln_score != NULL) 
267 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->aln_score, gpu_storage->device_cpy->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
268 |     
269 | 	if (gpu_storage->host_res->query_batch_start != NULL && gpu_storage->device_cpy->query_batch_start != NULL) 
270 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_start, gpu_storage->device_cpy->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
271 |     
272 | 	if (gpu_storage->host_res->target_batch_start != NULL && gpu_storage->device_cpy->target_batch_start != NULL) 
273 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_start, gpu_storage->device_cpy->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
274 |     
275 | 	if (gpu_storage->host_res->query_batch_end != NULL && gpu_storage->device_cpy->query_batch_end != NULL) 
276 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->query_batch_end, gpu_storage->device_cpy->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
277 |     
278 | 	if (gpu_storage->host_res->target_batch_end != NULL && gpu_storage->device_cpy->target_batch_end != NULL) 
279 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->target_batch_end, gpu_storage->device_cpy->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
280 | 	if (params->start_pos == WITH_TB) {
281 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->cigar, gpu_storage->unpacked_query_batch, actual_query_batch_bytes * sizeof(uint8_t), cudaMemcpyDeviceToHost, gpu_storage->str));
282 | 		CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res->n_cigar_ops, gpu_storage->query_batch_lens, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
283 | 	}
284 | 	//-----------------------------------------------------------------------------------------------------------------------
285 | 	
286 | 
287 | 	// not really needed to filter with params->secondBest, since all the pointers will be null and non-initialized.
288 | 	if (params->secondBest)
289 | 	{	
290 | 		if (gpu_storage->host_res_second->aln_score != NULL && gpu_storage->device_cpy_second->aln_score != NULL) 
291 | 			CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->aln_score, gpu_storage->device_cpy_second->aln_score, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
292 |     
293 | 		if (gpu_storage->host_res_second->query_batch_start != NULL && gpu_storage->device_cpy_second->query_batch_start != NULL) 
294 | 			CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->query_batch_start, gpu_storage->device_cpy_second->query_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
295 | 		
296 | 		if (gpu_storage->host_res_second->target_batch_start != NULL && gpu_storage->device_cpy_second->target_batch_start != NULL) 
297 | 			CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->target_batch_start, gpu_storage->device_cpy_second->target_batch_start, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
298 | 		
299 | 		if (gpu_storage->host_res_second->query_batch_end != NULL && gpu_storage->device_cpy_second->query_batch_end != NULL) 
300 | 			CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->query_batch_end, gpu_storage->device_cpy_second->query_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
301 | 		
302 | 		if (gpu_storage->host_res_second->target_batch_end != NULL && gpu_storage->device_cpy_second->target_batch_end != NULL) 
303 | 			CHECKCUDAERROR(cudaMemcpyAsync(gpu_storage->host_res_second->target_batch_end, gpu_storage->device_cpy_second->target_batch_end, actual_n_alns * sizeof(int32_t), cudaMemcpyDeviceToHost, gpu_storage->str));
304 | 	}
305 | 
306 |     gpu_storage->is_free = 0; //set the availability of current stream to false
307 | }
308 | 
309 | 
310 | int gasal_is_aln_async_done(gasal_gpu_storage_t *gpu_storage) 
311 | {
312 | 	cudaError_t err;
313 | 	if(gpu_storage->is_free == 1) return -2;//if no work is launced in this stream, return -2
314 | 	err = cudaStreamQuery(gpu_storage->str);//check to see if the stream is finished
315 | 	if (err != cudaSuccess ) {
316 | 		if (err == cudaErrorNotReady) return -1;
317 | 		else{
318 | 			fprintf(stderr, "[GASAL CUDA ERROR:] %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err,  __LINE__, __FILE__);
319 | 			exit(EXIT_FAILURE);
320 | 		}
321 | 	}
322 | 	gasal_host_batch_reset(gpu_storage);
323 | 	gpu_storage->is_free = 1;
324 | 	gpu_storage->current_n_alns = 0;
325 | 	return 0;
326 | }
327 | 
328 | 
329 | void gasal_copy_subst_scores(gasal_subst_scores *subst){
330 | 
331 | 	cudaError_t err;
332 | 	CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapO, &(subst->gap_open), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
333 | 	CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapExtend, &(subst->gap_extend), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
334 | 	int32_t gapoe = (subst->gap_open + subst->gap_extend);
335 | 	CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaGapOE, &(gapoe), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
336 | 	CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMatchScore, &(subst->match), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
337 | 	CHECKCUDAERROR(cudaMemcpyToSymbol(_cudaMismatchScore, &(subst->mismatch), sizeof(int32_t), 0, cudaMemcpyHostToDevice));
338 | 	return;
339 | }
340 | 
341 | 


--------------------------------------------------------------------------------
/src/kernels/local_kernel_template.h:
--------------------------------------------------------------------------------
  1 | #ifndef __LOCAL_KERNEL_TEMPLATE__
  2 | #define __LOCAL_KERNEL_TEMPLATE__
  3 | 
  4 | 
  5 | // This old core provides the same result as the currently LOCAL core, but lacks some optimization. Left for historical / comparative purposes.
  6 | #define CORE_LOCAL_DEPRECATED_COMPUTE() \
  7 | 		uint32_t gbase = (gpac >> l) & 15;/*get a base from target_batch sequence */ \
  8 | 		DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase);/* check equality of rbase and gbase */ \
  9 | 		f[m] = max(h[m]- _cudaGapOE, f[m] - _cudaGapExtend);/* whether to introduce or extend a gap in query_batch sequence */ \
 10 | 		h[m] = p[m] + subScore; /*score if rbase is aligned to gbase*/ \
 11 | 		h[m] = max(h[m], f[m]); \
 12 | 		h[m] = max(h[m], 0); \
 13 | 		e = max(h[m - 1] - _cudaGapOE, e - _cudaGapExtend);/*whether to introduce or extend a gap in target_batch sequence */\
 14 | 		h[m] = max(h[m], e); \
 15 | 		maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
 16 | 		maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
 17 | 		p[m] = h[m-1];
 18 | 
 19 | #define CORE_LOCAL_COMPUTE() \
 20 | 		uint32_t gbase = (gpac >> l) & 15;\
 21 | 		DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \
 22 | 		int32_t tmp_hm = p[m] + subScore; \
 23 | 		h[m] = max(tmp_hm, f[m]); \
 24 | 		h[m] = max(h[m], e); \
 25 | 		h[m] = max(h[m], 0); \
 26 | 		f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \
 27 | 		e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \
 28 | 		maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
 29 | 		maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
 30 | 		p[m] = h[m-1]; \
 31 | 
 32 | #define CORE_LOCAL_COMPUTE_START() \
 33 | 		uint32_t gbase = (gpac >> l) & 15;\
 34 | 		DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \
 35 | 		int32_t tmp_hm = p[m] + subScore; \
 36 | 		h[m] = max(tmp_hm, f[m]); \
 37 | 		h[m] = max(h[m], e); \
 38 | 		h[m] = max(h[m], 0); \
 39 | 		f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \
 40 | 		e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \
 41 | 		maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
 42 | 		maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
 43 | 		p[m] = h[m-1]; \
 44 | 
 45 | #define CORE_LOCAL_COMPUTE_TB(direction_reg) \
 46 | 		uint32_t gbase = (gpac >> l) & 15;\
 47 | 		DEV_GET_SUB_SCORE_LOCAL(subScore, rbase, gbase) \
 48 | 		int32_t tmp_hm = p[m] + subScore; \
 49 | 		uint32_t m_or_x = tmp_hm >= p[m] ? 0 : 1;\
 50 | 		h[m] = max(tmp_hm, f[m]); \
 51 | 		h[m] = max(h[m], e); \
 52 | 		h[m] = max(h[m], 0); \
 53 | 		direction_reg |= h[m] == tmp_hm ? m_or_x << (28 - ((m - 1) << 2)) : (h[m] == f[m] ? (uint32_t)3 << (28 - ((m - 1) << 2)) : (uint32_t)2 << (28 - ((m - 1) << 2)));\
 54 | 		direction_reg |= (tmp_hm - _cudaGapOE) > (f[m] - _cudaGapExtend) ?  (uint32_t)0 : (uint32_t)1 << (31 - ((m - 1) << 2));\
 55 | 		f[m] = max(tmp_hm- _cudaGapOE, f[m] - _cudaGapExtend); \
 56 | 		direction_reg |= (tmp_hm - _cudaGapOE) > (e - _cudaGapExtend) ?  (uint32_t)0 : (uint32_t)1 << (30 - ((m - 1) << 2));\
 57 | 		e = max(tmp_hm- _cudaGapOE, e - _cudaGapExtend); \
 58 | 		maxXY_y = (maxHH < h[m]) ? gidx + (m-1) : maxXY_y; \
 59 | 		maxHH = (maxHH < h[m]) ? h[m] : maxHH; \
 60 | 		p[m] = h[m-1]; \
 61 | 
 62 | 
 63 | 
 64 | 
 65 | /* typename meaning : 
 66 |     - T is the algorithm type (LOCAL, MICROLOCAL)
 67 |     - S is WITH_ or WIHTOUT_START
 68 |     - B is for computing the Second Best Score. Its values are on enum FALSE(0)/TRUE(1).
 69 |     (sidenote: it's based on an enum instead of a bool in order to generalize its type from its Int value, with Int2Type meta-programming-template)
 70 | */
 71 | template <typename T, typename S, typename B>
 72 | __global__ void gasal_local_kernel(uint32_t *packed_query_batch, uint32_t *packed_target_batch,  uint32_t *query_batch_lens, uint32_t *target_batch_lens, uint32_t *query_batch_offsets, uint32_t *target_batch_offsets, gasal_res_t *device_res, gasal_res_t *device_res_second, uint4 *packed_tb_matrices, int n_tasks)
 73 | {
 74 |     const uint32_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;//thread ID
 75 | 	if (tid >= n_tasks) return;
 76 | 
 77 | 	int32_t i, j, k, m, l;
 78 | 	int32_t e;
 79 | 
 80 |     int32_t maxHH = 0; //initialize the maximum score to zero
 81 | 	int32_t maxXY_y = 0; 
 82 | 
 83 |     int32_t prev_maxHH = 0;
 84 |     int32_t maxXY_x = 0;
 85 | 
 86 | 	int tile_no = 0;
 87 | 
 88 | 
 89 |     int32_t maxHH_second __attribute__((unused)); // __attribute__((unused)) to avoid raising errors at compilation. most template-kernels don't use these.
 90 |     int32_t prev_maxHH_second __attribute__((unused)); 
 91 |     int32_t maxXY_x_second __attribute__((unused));
 92 |     int32_t maxXY_y_second __attribute__((unused));
 93 |     maxHH_second = 0;
 94 |     prev_maxHH_second = 0;
 95 |     maxXY_x_second = 0;
 96 |     maxXY_y_second = 0;
 97 | 
 98 | 
 99 | 	int32_t subScore;
100 | 
101 | 	int32_t ridx, gidx;
102 | 	short2 HD;
103 | 	short2 initHD = make_short2(0, 0);
104 | 	
105 | 	uint32_t packed_target_batch_idx = target_batch_offsets[tid] >> 3; //starting index of the target_batch sequence
106 | 	uint32_t packed_query_batch_idx = query_batch_offsets[tid] >> 3;//starting index of the query_batch sequence
107 | 	uint32_t read_len = query_batch_lens[tid];
108 | 	uint32_t ref_len = target_batch_lens[tid];
109 | 	uint32_t query_batch_regs = (read_len >> 3) + (read_len&7 ? 1 : 0);//number of 32-bit words holding query_batch sequence
110 | 	uint32_t target_batch_regs = (ref_len >> 3) + (ref_len&7 ? 1 : 0);//number of 32-bit words holding target_batch sequence
111 | 	//-----arrays for saving intermediate values------
112 | 	short2 global[MAX_QUERY_LEN];
113 | 	int32_t h[9];
114 | 	int32_t f[9];
115 | 	int32_t p[9];
116 | 	//--------------------------------------------
117 |     
118 |     for (i = 0; i < MAX_QUERY_LEN; i++) {
119 |         global[i] = initHD;
120 |     }
121 | 
122 | 	for (i = 0; i < target_batch_regs; i++) { //target_batch sequence in rows
123 | 		for (m = 0; m < 9; m++) {
124 |             h[m] = 0;
125 |             f[m] = 0;
126 |             p[m] = 0;
127 |         }
128 | 		
129 | 		register uint32_t gpac =packed_target_batch[packed_target_batch_idx + i];//load 8 packed bases from target_batch sequence
130 | 		gidx = i << 3;
131 | 		ridx = 0;
132 | 		
133 | 		for (j = 0; j < query_batch_regs; j+=1) { //query_batch sequence in columns
134 | 			register uint32_t rpac =packed_query_batch[packed_query_batch_idx + j];//load 8 bases from query_batch sequence
135 | 
136 |             //--------------compute a tile of 8x8 cells-------------------
137 | 			if (SAMETYPE(S, Int2Type<WITH_TB>)) {
138 | 				uint4 direction = make_uint4(0, 0, 0, 0);
139 | 				uint32_t rbase = (rpac >> 28) & 15;//get a base from query_batch sequence
140 | 				HD = global[ridx];
141 | 				h[0] = HD.x;
142 | 				e = HD.y;
143 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
144 | 					CORE_LOCAL_COMPUTE_TB(direction.x);
145 | 					if (SAMETYPE(B, Int2Type<TRUE>))
146 | 					{
147 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
148 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
149 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
150 | 					}
151 | 				}
152 | 				HD.x = h[m-1];
153 | 				HD.y = e;
154 | 				global[ridx] = HD;
155 | 				//---------------------------------------------
156 | 
157 | 
158 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
159 | 
160 | 				if (SAMETYPE(B, Int2Type<TRUE>))
161 | 				{
162 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
163 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
164 | 				}
165 | 				prev_maxHH = max(maxHH, prev_maxHH);
166 | 				ridx++;
167 | 
168 | 				rbase = (rpac >> 24) & 15;//get a base from query_batch sequence
169 | 				HD = global[ridx];
170 | 				h[0] = HD.x;
171 | 				e = HD.y;
172 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
173 | 					CORE_LOCAL_COMPUTE_TB(direction.y);
174 | 					if (SAMETYPE(B, Int2Type<TRUE>))
175 | 					{
176 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
177 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
178 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
179 | 					}
180 | 				}
181 | 				HD.x = h[m-1];
182 | 				HD.y = e;
183 | 				global[ridx] = HD;
184 | 				//---------------------------------------------
185 | 
186 | 
187 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
188 | 
189 | 				if (SAMETYPE(B, Int2Type<TRUE>))
190 | 				{
191 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
192 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
193 | 				}
194 | 				prev_maxHH = max(maxHH, prev_maxHH);
195 | 				ridx++;
196 | 
197 | 
198 | 				rbase = (rpac >> 20) & 15;//get a base from query_batch sequence
199 | 				HD = global[ridx];
200 | 				h[0] = HD.x;
201 | 				e = HD.y;
202 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
203 | 					CORE_LOCAL_COMPUTE_TB(direction.z);
204 | 					if (SAMETYPE(B, Int2Type<TRUE>))
205 | 					{
206 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
207 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
208 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
209 | 					}
210 | 				}
211 | 				HD.x = h[m-1];
212 | 				HD.y = e;
213 | 				global[ridx] = HD;
214 | 				//---------------------------------------------
215 | 
216 | 
217 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
218 | 
219 | 				if (SAMETYPE(B, Int2Type<TRUE>))
220 | 				{
221 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
222 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
223 | 				}
224 | 				prev_maxHH = max(maxHH, prev_maxHH);
225 | 				ridx++;
226 | 
227 | 
228 | 				rbase = (rpac >> 16) & 15;//get a base from query_batch sequence
229 | 				HD = global[ridx];
230 | 				h[0] = HD.x;
231 | 				e = HD.y;
232 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
233 | 					CORE_LOCAL_COMPUTE_TB(direction.w);
234 | 					if (SAMETYPE(B, Int2Type<TRUE>))
235 | 					{
236 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
237 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
238 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
239 | 					}
240 | 				}
241 | 				HD.x = h[m-1];
242 | 				HD.y = e;
243 | 				global[ridx] = HD;
244 | 				//---------------------------------------------
245 | 
246 | 
247 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
248 | 
249 | 				if (SAMETYPE(B, Int2Type<TRUE>))
250 | 				{
251 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
252 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
253 | 				}
254 | 				prev_maxHH = max(maxHH, prev_maxHH);
255 | 				ridx++;
256 | 
257 | 				packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
258 | 				tile_no++;
259 | 
260 | 
261 | 				direction = make_uint4(0,0,0,0);
262 | 				rbase = (rpac >> 12) & 15;//get a base from query_batch sequence
263 | 				HD = global[ridx];
264 | 				h[0] = HD.x;
265 | 				e = HD.y;
266 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
267 | 					CORE_LOCAL_COMPUTE_TB(direction.x);
268 | 					if (SAMETYPE(B, Int2Type<TRUE>))
269 | 					{
270 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
271 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
272 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
273 | 					}
274 | 				}
275 | 				HD.x = h[m-1];
276 | 				HD.y = e;
277 | 				global[ridx] = HD;
278 | 				//---------------------------------------------
279 | 
280 | 
281 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
282 | 
283 | 				if (SAMETYPE(B, Int2Type<TRUE>))
284 | 				{
285 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
286 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
287 | 				}
288 | 				prev_maxHH = max(maxHH, prev_maxHH);
289 | 				ridx++;
290 | 
291 | 				rbase = (rpac >> 8) & 15;//get a base from query_batch sequence
292 | 				HD = global[ridx];
293 | 				h[0] = HD.x;
294 | 				e = HD.y;
295 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
296 | 					CORE_LOCAL_COMPUTE_TB(direction.y);
297 | 					if (SAMETYPE(B, Int2Type<TRUE>))
298 | 					{
299 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
300 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
301 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
302 | 					}
303 | 				}
304 | 				HD.x = h[m-1];
305 | 				HD.y = e;
306 | 				global[ridx] = HD;
307 | 				//---------------------------------------------
308 | 
309 | 
310 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
311 | 
312 | 				if (SAMETYPE(B, Int2Type<TRUE>))
313 | 				{
314 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
315 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
316 | 				}
317 | 				prev_maxHH = max(maxHH, prev_maxHH);
318 | 				ridx++;
319 | 
320 | 
321 | 				rbase = (rpac >> 4) & 15;//get a base from query_batch sequence
322 | 				HD = global[ridx];
323 | 				h[0] = HD.x;
324 | 				e = HD.y;
325 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
326 | 					CORE_LOCAL_COMPUTE_TB(direction.z);
327 | 					if (SAMETYPE(B, Int2Type<TRUE>))
328 | 					{
329 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
330 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
331 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
332 | 					}
333 | 				}
334 | 				HD.x = h[m-1];
335 | 				HD.y = e;
336 | 				global[ridx] = HD;
337 | 				//---------------------------------------------
338 | 
339 | 
340 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
341 | 
342 | 				if (SAMETYPE(B, Int2Type<TRUE>))
343 | 				{
344 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
345 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
346 | 				}
347 | 				prev_maxHH = max(maxHH, prev_maxHH);
348 | 				ridx++;
349 | 
350 | 
351 | 				rbase = rpac & 15;//get a base from query_batch sequence
352 | 				HD = global[ridx];
353 | 				h[0] = HD.x;
354 | 				e = HD.y;
355 | 				for (l = 28, m = 1; m < 9; l -= 4, m++) {
356 | 					CORE_LOCAL_COMPUTE_TB(direction.w);
357 | 					if (SAMETYPE(B, Int2Type<TRUE>))
358 | 					{
359 | 						bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
360 | 						maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
361 | 						maxHH_second = (override_second) ? h[m] : maxHH_second;
362 | 					}
363 | 				}
364 | 				HD.x = h[m-1];
365 | 				HD.y = e;
366 | 				global[ridx] = HD;
367 | 				//---------------------------------------------
368 | 
369 | 
370 | 				maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
371 | 
372 | 				if (SAMETYPE(B, Int2Type<TRUE>))
373 | 				{
374 | 					maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
375 | 					prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
376 | 				}
377 | 				prev_maxHH = max(maxHH, prev_maxHH);
378 | 				ridx++;
379 | 
380 | 				packed_tb_matrices[(tile_no*n_tasks) + tid] = direction;
381 | 				tile_no++;
382 | 
383 | 
384 | 
385 | 			}
386 | 			else {
387 | 				for (k = 28; k >= 0; k -= 4) {
388 | 					uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
389 | 					//-----load intermediate values--------------
390 | 					HD = global[ridx];
391 | 					h[0] = HD.x;
392 | 					e = HD.y;
393 | 
394 | #pragma unroll 8
395 | 					for (l = 28, m = 1; m < 9; l -= 4, m++) {
396 | 						CORE_LOCAL_COMPUTE();
397 | 						if (SAMETYPE(B, Int2Type<TRUE>))
398 | 						{
399 | 							bool override_second = (maxHH_second < h[m]) && (maxHH > h[m]);
400 | 							maxXY_y_second = (override_second) ? gidx + (m-1) : maxXY_y_second;
401 | 							maxHH_second = (override_second) ? h[m] : maxHH_second;
402 | 						}
403 | 					}
404 | 
405 | 					//----------save intermediate values------------
406 | 					HD.x = h[m-1];
407 | 					HD.y = e;
408 | 					global[ridx] = HD;
409 | 					//---------------------------------------------
410 | 
411 | 
412 | 					maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//end position on query_batch sequence corresponding to current maximum score
413 | 
414 | 					if (SAMETYPE(B, Int2Type<TRUE>))
415 | 					{
416 | 						maxXY_x_second = (prev_maxHH_second < maxHH) ? ridx : maxXY_x_second;
417 | 						prev_maxHH_second = max(maxHH_second, prev_maxHH_second);
418 | 					}
419 | 					prev_maxHH = max(maxHH, prev_maxHH);
420 | 					ridx++;
421 | 					//-------------------------------------------------------
422 | 
423 | 				}
424 | 			}
425 |         }
426 | 	}
427 | 
428 | 	device_res->aln_score[tid] = maxHH;//copy the max score to the output array in the GPU mem
429 | 	device_res->query_batch_end[tid] = maxXY_x;//copy the end position on query_batch sequence to the output array in the GPU mem
430 | 	device_res->target_batch_end[tid] = maxXY_y;//copy the end position on target_batch sequence to the output array in the GPU mem
431 | 
432 |     if (SAMETYPE(B, Int2Type<TRUE>))
433 |     {
434 |         device_res_second->aln_score[tid] = maxHH_second;
435 |         device_res_second->query_batch_end[tid] = maxXY_x_second;
436 |         device_res_second->target_batch_end[tid] = maxXY_y_second;
437 |     }
438 | 
439 | 
440 |     /*------------------Now to find the start position-----------------------*/
441 |     if (SAMETYPE(S, Int2Type<WITH_START>))
442 |     {
443 | 
444 |         int32_t rend_pos = maxXY_x;//end position on query_batch sequence
445 |         int32_t gend_pos = maxXY_y;//end position on target_batch sequence
446 |         int32_t fwd_score = maxHH;// the computed score
447 | 
448 |         //the index of 32-bit word containing the end position on query_batch sequence
449 |         int32_t rend_reg = ((rend_pos >> 3) + 1) < query_batch_regs ? ((rend_pos >> 3) + 1) : query_batch_regs;
450 |         //the index of 32-bit word containing to end position on target_batch sequence
451 |         int32_t gend_reg = ((gend_pos >> 3) + 1) < target_batch_regs ? ((gend_pos >> 3) + 1) : target_batch_regs;
452 |         
453 | 
454 | 
455 |         packed_query_batch_idx += (rend_reg - 1);
456 |         packed_target_batch_idx += (gend_reg - 1);
457 | 
458 | 
459 |         maxHH = 0;
460 |         prev_maxHH = 0;
461 |         maxXY_x = 0;
462 |         maxXY_y = 0;
463 | 
464 |         for (i = 0; i < MAX_QUERY_LEN; i++) {
465 |             global[i] = initHD;
466 |         }
467 |         //------starting from the gend_reg and rend_reg, align the sequences in the reverse direction and exit if the max score >= fwd_score------
468 |         gidx = ((gend_reg << 3) + 8) - 1;
469 |         for (i = 0; i < gend_reg && maxHH < fwd_score; i++) {
470 |             for (m = 0; m < 9; m++) {
471 |                 h[m] = 0;
472 |                 f[m] = 0;
473 |                 p[m] = 0;
474 |             }
475 |             register uint32_t gpac =packed_target_batch[packed_target_batch_idx - i];//load 8 packed bases from target_batch sequence
476 |             gidx = gidx - 8;
477 |             ridx = (rend_reg << 3) - 1;
478 |             int32_t global_idx = 0;
479 |             for (j = 0; j < rend_reg && maxHH < fwd_score; j+=1) {
480 |                 register uint32_t rpac =packed_query_batch[packed_query_batch_idx - j];//load 8 packed bases from query_batch sequence
481 |                 //--------------compute a tile of 8x8 cells-------------------
482 |                 for (k = 0; k <= 28 && maxHH < fwd_score; k += 4) {
483 |                     uint32_t rbase = (rpac >> k) & 15;//get a base from query_batch sequence
484 |                     //----------load intermediate values--------------
485 |                     HD = global[global_idx];
486 |                     h[0] = HD.x;
487 |                     e = HD.y;
488 | 
489 | 
490 |                     #pragma unroll 8
491 |                     for (l = 0, m = 1; l <= 28; l += 4, m++) {
492 |                             CORE_LOCAL_COMPUTE_START();
493 |                     }
494 |                     
495 |                     //------------save intermediate values----------------
496 |                     HD.x = h[m-1];
497 |                     HD.y = e;
498 |                     global[global_idx] = HD;
499 |                     //----------------------------------------------------
500 |                     maxXY_x = (prev_maxHH < maxHH) ? ridx : maxXY_x;//start position on query_batch sequence corresponding to current maximum score
501 |                     prev_maxHH = max(maxHH, prev_maxHH);
502 |                     ridx--;
503 |                     global_idx++;
504 |                 }
505 |             }
506 |         }
507 | 
508 |         device_res->query_batch_start[tid] = maxXY_x;//copy the start position on query_batch sequence to the output array in the GPU mem
509 |         device_res->target_batch_start[tid] = maxXY_y;//copy the start position on target_batch sequence to the output array in the GPU mem
510 | 
511 |     }
512 | 
513 | 
514 | 
515 | 
516 | 	return;
517 | 
518 | 
519 | }
520 | #endif
521 | 


--------------------------------------------------------------------------------